% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/bvs.R
\name{bvs}
\alias{bvs}
\title{High dimensional Bayesian variable selection using nonlocal priors}
\usage{
bvs(X, resp, prep = TRUE, fixed_cols = NULL, eff_size = 0.7,
  family = c("logistic", "survival"), hselect = TRUE, r = 1, tau = 0.25,
  niter, mod_prior = c("unif", "beta"), inseed = NULL, ncpu = 4,
  cplng = F)
}
\arguments{
\item{X}{The \code{n} times \code{p} design matrix. The columns should
represent genes and rows represent the observations. The column names are
used as gene names so they should not be left as \code{NULL}. For logistic
regression, \code{X} should NOT contain a vector of \code{1}'s representing
the intercept as it will be added automatically.}

\item{resp}{For logistic regression models it is the binary response
vector. For Cox proportional hazard models this is a two column matrix
where the first column contains survival time vector and the second column
is the censoring status for each observation.}

\item{prep}{A logical value determining if the preprocessing step should
be performed on the design matrix or not. That step contains removing
columns that have \code{NA}'s or all their elements are equal to 0, along
with standardizing non-binary columns. This step is recommended and thus the
default value is \code{TRUE}.}

\item{fixed_cols}{A vector of indices showing those columns of the design
matrix that are not supposed to enter the selection procedure. These
columns are always in the final selected model. Note that if any of these
columns contain \code{NA}, they will be removed.}

\item{eff_size}{This is the expected effect size in the model for a
standardized design matrix, which is basically the coefficient value that is
expected to occur the most based on some prior knowledge.}

\item{family}{Determines the type of data analysis. \code{logistic} is for
binary outcome data where logistic regression modeling is used, whereas
\code{survival} is for survival outcome data using Cox proportional
hazard model.}

\item{hselect}{A boolean variable indicating whether the automatic procedure
for hyperparameter selection should be run or not. The default value is
\code{TRUE}. Note that in this setting, \code{r} is always chosen to be 1.}

\item{r}{The parameter \code{r} of the iMOM prior, when no automatic
procedure for hyperparameter selection is done. As a result, this is
relevant only when \code{hselect = FALSE}, otherwise it is ignored.}

\item{tau}{The parameter \code{tau} of the iMOM prior, when no automatic
procedure for hyperparameter selection is done. As a result, this is
relevant only when \code{hselect = FALSE}, otherwise it is ignored.}

\item{niter}{Number of iterations. For binary outcome data, this
determines the number of MCMC iterations per CPU. For survival outcome data
this is the number of iterations per temperature schedule in the stochastic
search algorithm.}

\item{mod_prior}{Type of prior used for the model space. \code{unif} is
for a uniform binomial and \code{beta} is for a beta binomial prior. In the
former case, both hyper parameters in the beta prior are equal to \code{1},
but in the latter case those two hyper parameters are chosen as explained in
the reference papers. The default choice for this variable is the uniform
prior.}

\item{inseed}{The input seed for making the parallel processing
reproducible. This parameter is ignored in logistic regression models when
\code{cplng = FALSE}. The default value is \code{NULL} which means that each
time the search for model space is started from different starting points.
In case it is set to a number, it initializes the RNG for the first task and
subsequent tasks to get separate substreams, using L'Ecuyer algorithm as
described in doMPI package.}

\item{ncpu}{This is the number of cpus used in parallel processing. For
logistic regression models this is the number of parallel coupled chains
run at the same time. For survival outcome data this is the number of cpus
doing stochastic search at the same time to increase the number of visited
models.}

\item{cplng}{This parameter is only used in logistic regression models, and
indicates whether the coupling algorithm for MCMC output should be performed.}
}
\value{
It returns a list containing different objects that depend on the
family of the model and the coupling flag for logistic regression models.
The following describes the objects in the output list based on different
combinations of those two input arguments.\cr \cr
\strong{1) } \code{family = logistic && cplng = FALSE}
\item{num_vis_models}{Number of unique models visited throughout the search
of the model space.}
\item{max_prob}{Maximum unnormalized probability among all visited models.}
\item{HPM}{The indices of the model with highest posterior
probability among all visited models, with respect to the columns in
the output \code{des_mat}. The names of the selected columns can be checked
using \code{gene_names}. The corresponding design matrix is also one
of the outputs that can be checked in \code{des_mat}.}
\item{beta_hat}{The coefficient vector for the selected model. The first
component is always for the intercept.}
\item{MPM}{The indices of the median probability model. According to the
paper by Barbieri and Berger (2004), this is defined to be the model
consisting of those variables whose posterior inclusion probability is at
least 1/2. The order of columns is the same as explained for \code{HPM}.}
\item{max_prob_vec}{A \code{100} by \code{1} vector of unnormalized
probabilities of the first 100 models with highest posterior probability
among all visited models.}
\item{max_models}{A list of length 100 containing top 100 models
corresponding to \code{max_prob_vec} vector. Each entry of this list
contains the indices of covariates for the model with posterior probability
reported in the corresponding entry in \code{max_prob_vec}. Note that the
intercept is always used along with the selected columns and in calculating
the probabilities in \code{max_prob_vec}.}
\item{inc_probs}{A vector of length \code{p+1} containing the posterior
inclusion probability for each covariate in the design matrix. The order of
columns is with respect to processed design matrix, \code{des_mat}.}
\item{des_mat}{The design matrix used in the analysis where fixed columns
are moved to the beginning of the matrix and if \code{prep=TRUE}, the
columns containing \code{NA} are all removed. The reported indices in
selected models are all with respect to the columns of this matrix.}
\item{gene_names}{Names of the genes extracted from the design matrix.}
\item{r}{The hyperparameter for iMOM prior density function, calculated
using the proposed algorithm for the given dataset.}
\item{tau}{The hyperparameter for iMOM prior density function, calculated
using the proposed algorithm for the given dataset.}
\strong{2) } \code{family = logistic && cplng = TRUE}
\item{cpl_percent}{Shows what percentage of pairs of chains are coupled.}
\item{margin_probs}{A \code{k} by \code{1} vector of marginal probabilities
where element \code{i} shows the maximum marginal probability of the
data under the maximum model for the \eqn{i^{th}} pair of chains. \code{k}
is the number of paired chains which is the same as number of CPUs.}
\item{chains}{A \code{k} by \code{p} binary matrix, where each row is the
model for the \eqn{i^{th}} pair of chains. Note that the index of nonzero
elements are not necessarily in the same order as the input design matrix,
\code{X}, depending on existence of fixed columns in selection procedure.
As a result, always match the indices to the columns of the design matrix
that is reported as an output in \code{des_mat}.}
\item{cpl_flags}{A \code{k} by \code{1} binary vector, showing which pairs
are coupled (= \code{1}) and which are not (= \code{0}).}
\item{beta_hat}{A \code{k} by \code{(p+1)} matrix where each row is the
estimated coefficient vector for the model in the corresponding row of the
\code{chains} output.}
\item{uniq_models}{A list showing unique models with the indices of the
included covariates at each model.}
\item{freq}{Frequency of each of the unique models. It is used to find
the highest frequency model.}
\item{probs}{Unnormalized probability of each of the unique models.}
\item{des_mat}{The design matrix used in the analysis where fixed columns
are moved to the beginning of the matrix and if \code{prep=TRUE}, the
columns containing \code{NA} are all removed. The reported indices in
selected models are all with respect to the columns of this matrix.}
\item{gene_names}{Names of the genes extracted from the design matrix.}
\item{r}{The hyperparameter for iMOM prior density function, calculated
using the proposed algorithm for the given dataset.}
\item{tau}{The hyperparameter for iMOM prior density function, calculated
using the proposed algorithm for the given dataset.}
\strong{3) } \code{family = survival}
\item{num_vis_models}{Number of visited models during the whole process.}
\item{max_prob}{The unnormalized probability of the maximum model among
all visited models.}
\item{HPM}{The indices of the model with highest posterior
probability among all visited models, with respect to the columns in
\code{des_mat}. As a result, always look at the names of the selected
columns using \code{gene_names}. The corresponding design matrix is one of
the outputs that can be checked in \code{des_mat}.}
\item{MPM}{The indices of the median probability model. According to the
paper by Barbieri and Berger (2004), this is defined to be the model
consisting of those variables whose posterior inclusion probability is at
least 1/2. The order of columns is the same as explained for \code{HPM}.}
\item{max_prob_vec}{A \code{100} by \code{1} vector of unnormalized
probabilities of the first 100 models with highest posterior probability
among all visited models.}
\item{max_models}{A list of length 100 containing top 100 models
corresponding to \code{max_prob_vec} vector. Each entry of this list
contains the indices of covariates for the model with posterior probability
reported in the corresponding entry in \code{max_prob_vec}.}
\item{inc_probs}{A \code{p} by \code{1} vector containing the posterior
inclusion probability for each covariate in the design matrix. The order of
columns is with respect to processed design matrix, \code{des_mat}.}
\item{des_mat}{The design matrix used in the analysis where fixed columns
are moved to the beginning of the matrix and if \code{prep=TRUE}, the
columns containing \code{NA} are all removed. The reported indices in
selected models are all with respect to the columns of this matrix.}
\item{start_models}{A \code{k} by \code{3} matrix showing the starting model
for each worker CPU. Obviously \code{k} is equal to the number of CPUs.}
\item{gene_names}{Names of the genes extracted from the design matrix.}
\item{r}{The hyperparameter for iMOM prior density function, calculated
using the proposed algorithm for the given dataset.}
\item{tau}{The hyperparameter for iMOM prior density function, calculated
using the proposed algorithm for the given dataset.}
}
\description{
This function performs Bayesian variable selection for high
dimensional design matrix using iMOM prior for non-zero coefficients. It
also performs adaptive hyperparameter selection for iMOM prior. Cleaning
the data in a preprocessing step before any data analysis is optional and
left to the user. This function is for binary and survival time response
datasets. In the former, MCMC is used to search in the model space while for
the latter a stochastic search does that job. This function has the option
to do all the mentioned tasks in a parallel fashion, exploiting hundreds of
CPUs. It is highly recommended to use a cluster for this purpose. This
function also supports having fixed columns in the design matrix which are
always in the final model and do not enter the selection procedure. Clinical
variables such as age, gender or stage of cancer are some examples. For the
output, it reports necessary measurements that are common in Bayesian
variable selection algorithms. They include Highest Posterior Probability
model, median probability model and posterior inclusion probability for each
of the covariates in the design matrix.
}
\examples{
### Simulating Logistic Regression Data
n <- 200
p <- 40
set.seed(123)
Sigma <- diag(p)
full <- matrix(c(rep(0.5, p*p)), ncol=p)
Sigma <- full + 0.5*Sigma
cholS <- chol(Sigma)
Beta <- c(-1.9,1.3,2.2)
X <- matrix(rnorm(n*p), ncol=p)
X <- X\%*\%cholS
beta <- numeric(p)
beta[c(1:length(Beta))] <- Beta
XB <- X\%*\%beta
probs <- as.vector(exp(XB)/(1+exp(XB)))
y <- rbinom(n,1,probs)
colnames(X) <- paste("gene_",c(1:p),sep="")

### Running 'bvs' function without coupling and with hyperparameter selection
### procedure
bout <- bvs(X, y, family = "logistic", mod_prior = "beta", niter = 50)

### Highest Posterior Model
bout$HPM

### Estimated Coefficients:
bout$beta_hat

### Number of Visited Models:
bout$num_vis_models
}
\references{
Nikooienejad, A., Wang, W., and Johnson, V. E. (2016). Bayesian
variable selection for binary outcomes in high dimensional genomic studies
using nonlocal priors. Bioinformatics, 32(9), 1338-1345.\cr\cr
Nikooienejad, A., Wang, W., and Johnson, V. E. (2017). Bayesian Variable
Selection in High Dimensional Survival Time Cancer Genomic Datasets using
Nonlocal Priors. arXiv preprint, arXiv:1712.02964. \cr\cr
Johnson, V. E. (1998). A coupling-regeneration scheme for
diagnosing convergence in Markov chain Monte Carlo algorithms. Journal of
the American Statistical Association, 93(441), 238-248.\cr\cr
Shin, M., Bhattacharya, A., and Johnson, V. E. (2017). Scalable Bayesian
variable selection using nonlocal prior densities in ultrahigh dimensional
settings. Statistica Sinica.\cr\cr
Johnson, V. E., and Rossell, D. (2010). On the use of non-local prior
densities in Bayesian hypothesis tests. Journal of the Royal Statistical
Society: Series B (Statistical Methodology), 72(2), 143-170.\cr\cr
Barbieri, M. M., and Berger, J. O. (2004). Optimal predictive model
selection. The Annals of Statistics, 32(3), 870-897.
}
\seealso{
\code{\link{ModProb}}, \code{\link{CoefEst}}
}
\author{
Amir Nikooienejad
}
