% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/linda.R
\name{linda}
\alias{linda}
\title{Linear (Lin) Model for Differential Abundance (DA) Analysis of High-dimensional Compositional Data}
\description{
The function implements a simple, robust and highly scalable approach to tackle
the compositional effects in differential abundance analysis of high-dimensional compositional data. 
It fits linear regression models on the centered log2-ratio transformed data, identifies a bias term due to the transformation
and compositional effect, and corrects the bias using the mode of the regression coefficients.
It can also fit mixed-effect models for the analysis of correlated data.
}
\usage{
linda(
  feature.dat,
  meta.dat,
  formula,
  feature.dat.type = c('count', 'proportion'),
  prev.filter = 0,
  mean.abund.filter = 0, 
  max.abund.filter = 0,
  is.winsor = TRUE,
  outlier.pct = 0.03,
  adaptive = TRUE,
  zero.handling = c('pseudo-count', 'imputation'),
  pseudo.cnt = 0.5,
  corr.cut = 0.1,
  p.adj.method = "BH",
  alpha = 0.05,
  n.cores = 1, 
  verbose = TRUE
)
}
\arguments{

\item{feature.dat}{a matrix of counts/proportions, row - features (OTUs, genes, etc.), column - samples.}

\item{meta.dat}{a data frame containing the sample meta data. If there are NAs, the corresponding samples
 will be removed in the analysis.}

\item{formula}{a character string for the formula. The formula should conform to that used by \code{lm} (independent 
data) or \code{lmer} (correlated data).
 For example: \code{formula = '~x1*x2+x3+(1|id)'}. At least one fixed effect is required.}

\item{feature.dat.type}{the type of the feature data. It could be "count" or "proportion".}

\item{prev.filter}{the prevalence (percentage of non-zeros) cutoff, under which the features will be filtered out. The default is 0.}

\item{mean.abund.filter}{the mean relative abundance cutoff, under which the features will be filtered out. The default is 0.}

\item{max.abund.filter}{the max relative abundance cutoff, under which the features will be filtered out. The default is 0.}

\item{is.winsor}{a logical value indicating whether winsorization should be performed to replace outliers (high values).
 The default is TRUE.}

\item{outlier.pct}{the expected percentage of outliers. These outliers will be winsorized. The default is 0.03.}


\item{adaptive}{a logical value indicating whether the approach to handle zeros (pseudo-count or imputation)
will be determined based on the correlations between the log(sequencing depth) and the explanatory variables
in \code{formula} when \code{feature.dat.type} is 'count'. If TRUE and the correlation p-value for any explanatory variable
is smaller than or equal to \code{corr.cut}, the imputation approach will be used; otherwise, the pseudo-count approach will be used.}


\item{zero.handling}{a character string of 'pseudo-count' or 'imputation' indicating the zero handling method
used when \code{feature.dat.type} is 'count'. If 'pseudo-count', \code{pseudo.cnt} will be added to each value in \code{feature.dat}.
If 'imputation', zeros are imputed using the formula in the referenced paper; basically,
zeros are imputed with values proportional to the sequencing depth. When \code{feature.dat.type} is 'proportion',
this parameter will be ignored and zeros will be imputed by half of the minimum for each feature.}

\item{pseudo.cnt}{a positive numeric value for the pseudo-count to be added if \code{zero.handling}
is 'pseudo-count'. Default is 0.5. }

\item{corr.cut}{a numerical value between 0 and 1, indicating the significance level used for determining
the zero-handling approach when \code{adaptive} is TRUE. Default is 0.1.}

\item{p.adj.method}{a character string indicating the p-value adjustment approach for 
addressing multiple testing. See R function \code{p.adjust}. Default is 'BH'.}

\item{alpha}{a numerical value between 0 and 1 indicating the significance level 
for declaring differential features. Default is 0.05.}


\item{n.cores}{a positive integer. If \code{n.cores > 1} and \code{formula} specifies a mixed-effect model,
the computation will be run in parallel using \code{n.cores} cores. Default is 1.}

\item{verbose}{a logical value indicating whether the trace information should be printed out.}

}
\value{
A list with the elements
\item{variables}{a vector of variable names of all fixed effects in \code{formula}. For example: \code{formula = '~x1*x2+x3+(1|id)'}.
Suppose \code{x1} and \code{x2} are numerical, and \code{x3} is a categorical variable of three levels: a, b and c.
Then the elements of \code{variables} would be \code{('x1', 'x2', 'x3b', 'x3c', 'x1:x2')}.}
\item{bias}{a numeric vector; each element corresponds to one variable in \code{variables};
the estimated bias of the regression coefficients due to the compositional effect.}
\item{output}{a list of data frames with columns 'baseMean', 'log2FoldChange', 'lfcSE', 'stat', 'pvalue', 'padj', 'reject',
 'df'; \code{names(output)} is equal to \code{variables}; the rows of each data frame correspond to features.
 Note: if there are features being excluded due to filtering, the number of rows of the output data frame
 will not be equal to the number of rows of \code{feature.dat}. Features are identified by the row names.
 If the row names of \code{feature.dat} are NULL, then \code{1 : nrow(feature.dat)} is set as the row names of \code{feature.dat}.
 \describe{
 \item{baseMean:}{ 2 to the power of the intercept coefficients (normalized by one million)}
 \item{log2FoldChange:}{ bias-corrected coefficients}
 \item{lfcSE:}{ standard errors of the coefficients}
 \item{stat:}{ \code{log2FoldChange / lfcSE}}
 \item{pvalue:}{ \code{2 * pt(-abs(stat), df)}}
 \item{padj:}{ \code{p.adjust(pvalue, method = p.adj.method)}}
 \item{reject:}{ \code{padj <= alpha}}
 \item{df:}{ degrees of freedom. The number of samples minus the number of explanatory variables (intercept included) for
 fixed-effect models; estimates from R package \code{lmerTest} with Satterthwaite method of approximation for mixed-effect models.}
 }}
\item{feature.dat.use}{the actual feature table used in the differential analysis after filtering, winsorization and zero handling.}
\item{meta.dat.use}{the meta data used in the abundance analysis (only variables in \code{formula} are stored; samples that have NAs
 are removed; numerical variables are scaled).}
}

\examples{

data(throat.otu.tab)
data(throat.tree)
data(throat.meta)

comm <- t(throat.otu.tab)
meta.dat <- throat.meta

# For count data
linda.obj <- linda(comm, meta.dat, formula = '~SmokingStatus+Sex', feature.dat.type = 'count', 
           prev.filter = 0.2, is.winsor = TRUE, outlier.pct = 0.03,
           p.adj.method = "BH", alpha = 0.1
   )

rownames(linda.obj$output[[1]])[which(linda.obj$output[[1]]$reject)]

linda.plot(linda.obj, c('SmokingStatusSmoker', 'Sexmale'),
           titles = c('Smoke: n v.s. y', 'Sex: female v.s. male'), alpha = 0.1, lfc.cut = 1,
           legend = TRUE, directory = NULL, width = 11, height = 8)
           
# For proportion data   
comm.p <- t(t(comm) / colSums(comm))
linda.obj <- linda(comm.p, meta.dat, formula = '~SmokingStatus+Sex', 
           feature.dat.type = 'proportion', 
           prev.filter = 0.2, is.winsor = TRUE, outlier.pct = 0.03,
           p.adj.method = "BH", alpha = 0.1
   )

# For mixed effects model; demonstration only, the dataset does not have repeated measurements.
\dontrun{
linda.obj <- linda(comm, meta.dat, formula = '~SmokingStatus+Sex+(1|PatientID)', 
           feature.dat.type = 'count', 
           prev.filter = 0.2, is.winsor = TRUE, outlier.pct = 0.03,
           p.adj.method = "BH", alpha = 0.1)
} 


}
\references{
Huijuan Zhou, Kejun He, Jun Chen, and Xianyang Zhang (2022). LinDA: Linear Models for Differential Abundance
Analysis of Microbiome Compositional Data. Genome Biology, 23, 95.
}
\author{
Huijuan Zhou,
Jun Chen,
Xianyang Zhang

}
