% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/preprocess_data.R
\name{preprocess_data}
\alias{preprocess_data}
\title{Preprocess data prior to running machine learning}
\usage{
preprocess_data(
  dataset,
  outcome_colname,
  method = c("center", "scale"),
  remove_var = "nzv",
  collapse_corr_feats = TRUE,
  to_numeric = TRUE,
  group_neg_corr = TRUE,
  prefilter_threshold = 1
)
}
\arguments{
\item{dataset}{Dataframe with an outcome variable and other columns as features.}

\item{outcome_colname}{Column name as a string of the outcome variable.}

\item{method}{Methods to preprocess the data, described in
\code{\link[caret:preProcess]{caret::preProcess()}} (default: \code{c("center","scale")}, use \code{NULL} for
no normalization).}

\item{remove_var}{Whether to remove variables with near-zero variance
(\code{'nzv'}; default), zero variance (\code{'zv'}), or none (\code{NULL}).}

\item{collapse_corr_feats}{Whether to keep only one of perfectly correlated
features.}

\item{to_numeric}{Whether to change features to numeric where possible.}

\item{group_neg_corr}{Whether to group negatively correlated features
together (e.g. \code{c(0,1)} and \code{c(1,0)}).}

\item{prefilter_threshold}{Remove features which have non-zero & non-NA
values in N rows or fewer (default: 1). Set this to -1 to keep all columns at
this step. This step will also be skipped if \code{to_numeric} is set to
\code{FALSE}.}
}
\value{
Named list including:
\itemize{
\item \code{dat_transformed}: Preprocessed data.
\item \code{grp_feats}: If features were grouped together, a named list of the features corresponding to each group.
\item \code{removed_feats}: Any features that were removed during preprocessing (e.g. because there was zero variance or near-zero variance for those features).
}

If the \code{progressr} package is installed, a progress bar with time elapsed
and estimated time to completion can be displayed.
}
\description{
Function to preprocess your data for input into \code{\link[=run_ml]{run_ml()}}.
}
\section{More details}{


See the \href{http://www.schlosslab.org/mikropml/articles/preprocess.html}{preprocessing vignette}
for more details.

Note that if any values in \code{outcome_colname} contain spaces, they will be
converted to underscores for compatibility with \code{caret}.
}

\examples{
preprocess_data(mikropml::otu_small, "dx")

# the function can show a progress bar if you have the progressr package installed
## optionally, specify the progress bar format
progressr::handlers(progressr::handler_progress(
  format = ":message :bar :percent | elapsed: :elapsed | eta: :eta",
  clear = FALSE,
  show_after = 0
))
## tell progressor to always report progress
\dontrun{
progressr::handlers(global = TRUE)
## run the function and watch the live progress updates
dat_preproc <- preprocess_data(mikropml::otu_small, "dx")
}
}
\author{
Zena Lapp, \email{zenalapp@umich.edu}

Kelly Sovacool, \email{sovacool@umich.edu}
}
