% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/CBDA.pipeline.R
\name{CBDA.pipeline}
\alias{CBDA.pipeline}
\title{Training/Learning Step for Compressive Big Data Analytics - LONI PIPELINE}
\usage{
CBDA.pipeline(job_id, Ytemp, Xtemp, label = "CBDA_package_test",
  alpha = 0.2, Kcol_min = 5, Kcol_max = 15, Nrow_min = 30,
  Nrow_max = 50, misValperc = 0, M = 3000, N_cores = 1, top = 1000,
  workspace_directory = setwd(tempdir()), max_covs = 100, min_covs = 5,
  algorithm_list = c("SL.glm", "SL.xgboost", "SL.glmnet", "SL.svm",
  "SL.randomForest", "SL.bartMachine"))
}
\arguments{
\item{job_id}{This is the ID for the job generator in the LONI pipeline interface}

\item{Ytemp}{This is the output variable (vector) in the original Big Data}

\item{Xtemp}{This is the input variable (matrix) in the original Big Data}

\item{label}{This is the label appended to RData workspaces generated within the CBDA calls}

\item{alpha}{Percentage of the Big Data to hold out for Validation}

\item{Kcol_min}{Lower bound for the percentage of features-columns sampling (used for the Feature Sampling Range - FSR)}

\item{Kcol_max}{Upper bound for the percentage of features-columns sampling (used for the Feature Sampling Range - FSR)}

\item{Nrow_min}{Lower bound for the percentage of cases-rows sampling (used for the Case Sampling Range - CSR)}

\item{Nrow_max}{Upper bound for the percentage of cases-rows sampling (used for the Case Sampling Range - CSR)}

\item{misValperc}{Percentage of missing values to introduce in BigData (used just for testing, to mimic real cases).}

\item{M}{Number of the BigData subsets on which to perform Knockoff Filtering and SuperLearner feature mining}

\item{N_cores}{Number of Cores to use in the parallel implementation (default is set to 1 core)}

\item{top}{Top predictions to select out of the M (must be < M, optimal ~0.1*M)}

\item{workspace_directory}{Directory where the results and workspaces are saved (set by default to tempdir())}

\item{max_covs}{Top features to display and include in the Validation Step where nested models are tested}

\item{min_covs}{Minimum number of top features to include in the initial model
for the Validation Step (it must be greater than 2)}

\item{algorithm_list}{List of algorithms/wrappers used by the SuperLearner.
By default it is set to the following list:
algorithm_list <- c("SL.glm","SL.xgboost",
"SL.glmnet","SL.svm","SL.randomForest","SL.bartMachine")}
}
\value{
CBDA object with validation results and 3 RData workspaces
}
\description{
The CBDA.pipeline() function comprises all the input specifications to run a set of M subsamples
 from the Big Data [Xtemp, Ytemp]. We assume that the Big Data is already clean and harmonized.
 This version 1.0.0 is fully tested ONLY on continuous features Xtemp and binary outcome Ytemp.
}
