% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/stat-poly-eq.R
\name{stat_poly_eq}
\alias{stat_poly_eq}
\title{Equation, p-value, \eqn{R^2}, AIC and BIC of fitted polynomial}
\usage{
stat_poly_eq(
  mapping = NULL,
  data = NULL,
  geom = "text_npc",
  position = "identity",
  ...,
  formula = NULL,
  method = "lm",
  method.args = list(),
  n.min = 2L,
  fit.seed = NA,
  eq.with.lhs = TRUE,
  eq.x.rhs = NULL,
  small.r = getOption("ggpmisc.small.r", default = FALSE),
  small.p = getOption("ggpmisc.small.p", default = FALSE),
  CI.brackets = c("[", "]"),
  rsquared.conf.level = 0.95,
  coef.digits = 3,
  coef.keep.zeros = TRUE,
  decreasing = getOption("ggpmisc.decreasing.poly.eq", FALSE),
  rr.digits = 2,
  f.digits = 3,
  p.digits = 3,
  label.x = "left",
  label.y = "top",
  hstep = 0,
  vstep = NULL,
  output.type = NULL,
  na.rm = FALSE,
  orientation = NA,
  parse = NULL,
  show.legend = FALSE,
  inherit.aes = TRUE
)
}
\arguments{
\item{mapping}{The aesthetic mapping, usually constructed with
\code{\link[ggplot2]{aes}}. Only needs to be set at the layer level if you
are overriding the plot defaults.}

\item{data}{A layer specific dataset, only needed if you want to override the
plot defaults.}

\item{geom}{The geometric object to use to display the data}

\item{position}{The position adjustment to use for overlapping points on this
layer.}

\item{...}{other arguments passed on to \code{\link[ggplot2]{layer}}. This
can include aesthetics whose values you want to set, not map. See
\code{\link[ggplot2]{layer}} for more details.}

\item{formula}{a formula object. Using aesthetic names \code{x} and \code{y}
instead of original variable names.}

\item{method}{function or character If character, "lm", "rlm", "lqs", "gls",
"ma", "sma", or the name of a model fit function are accepted, possibly
followed by the fit function's \code{method} argument separated by a colon
(e.g. \code{"rlm:M"}). If a function is different to \code{lm()},
\code{rlm()}, \code{lqs()}, \code{gls()}, \code{ma}, \code{sma}, it must
have formal parameters named \code{formula}, \code{data}, \code{weights},
and \code{method}. See Details.}

\item{method.args}{named list with additional arguments. Not \code{data}
or \code{weights} which are always passed through aesthetic mappings.}

\item{n.min}{integer Minimum number of distinct values in the explanatory
variable (on the rhs of formula) for fitting to be attempted.}

\item{fit.seed}{RNG seed argument passed to
\code{\link[base:Random]{set.seed}()}. Defaults to \code{NA}, indicating
that \code{set.seed()} should not be called.}

\item{eq.with.lhs}{If \code{character} the string is pasted to the front of
the equation label before parsing or a \code{logical} (see note).}

\item{eq.x.rhs}{\code{character} this string will be used as replacement for
\code{"x"} in the model equation when generating the label before parsing
it.}

\item{small.r, small.p}{logical Flags to switch use of lower case r and p for
coefficient of determination and p-value.}

\item{CI.brackets}{character vector of length 2. The opening and closing
brackets used for the CI label.}

\item{rsquared.conf.level}{numeric Confidence level for the returned
confidence interval. Set to NA to skip CI computation.}

\item{coef.digits, f.digits}{integer Number of significant digits to use for
the fitted coefficients and F-value.}

\item{coef.keep.zeros}{logical Keep or drop trailing zeros when formatting
the fitted coefficients and F-value.}

\item{decreasing}{logical It specifies the order of the terms in the
returned character string; in increasing (default) or decreasing powers.}

\item{rr.digits, p.digits}{integer Number of digits after the decimal point to
use for \eqn{R^2} and P-value in labels. If \code{Inf}, use exponential
notation with three decimal places.}

\item{label.x, label.y}{\code{numeric} with range 0..1 "normalized parent
coordinates" (npc units) or character if using \code{geom_text_npc()} or
\code{geom_label_npc()}. If using \code{geom_text()} or \code{geom_label()}
numeric in native data units. If too short they will be recycled.}

\item{hstep, vstep}{numeric in npc units, the horizontal and vertical step
used between labels for different groups.}

\item{output.type}{character One of "expression", "LaTeX", "text",
"markdown" or "numeric".}

\item{na.rm}{a logical indicating whether NA values should be stripped before
the computation proceeds.}

\item{orientation}{character Either "x" or "y" controlling the default for
\code{formula}. The letter indicates the aesthetic considered the
explanatory variable in the model fit.}

\item{parse}{logical Passed to the geom. If \code{TRUE}, the labels will be
parsed into expressions and displayed as described in \code{?plotmath}.
Default is \code{TRUE} if \code{output.type = "expression"} and
\code{FALSE} otherwise.}

\item{show.legend}{logical. Should this layer be included in the legends?
\code{FALSE}, the default, never includes, \code{NA} includes if any
aesthetics are mapped, and \code{TRUE} always includes.}

\item{inherit.aes}{If \code{FALSE}, overrides the default aesthetics, rather
than combining with them. This is most useful for helper functions that
define both data and aesthetics and shouldn't inherit behaviour from the
default plot specification, e.g. \code{\link[ggplot2]{borders}}.}
}
\value{
A data frame, with a single row per group and columns as described
  under \strong{Computed variables}. In cases when the number of observations
  is less than \code{n.min} a data frame with no rows or columns is returned,
  and rendered as an empty/invisible plot layer.
}
\description{
\code{stat_poly_eq} fits a polynomial, by default with \code{stats::lm()},
but alternatively using robust, resistant or generalized least squares. Major
axis regression and segmented linear regression are also supported. Using the
fitted model it generates several labels including the fitted model equation,
p-value, F-value, coefficient of determination (\eqn{R^2}) and its confidence
interval, 'AIC', 'BIC', number of observations and method name, if available.
}
\details{
This statistic can be used to automatically annotate a plot with
  \eqn{R^2}, adjusted \eqn{R^2}, the fitted model equation, \eqn{P}, and
  other parameters from a fitted model. It supports linear regression and
  polynomial fits with \code{\link[stats]{lm}()}, segmented linear regression
  with package 'segmented' and major axis and standardized major axis
  regression with package 'smatr', robust and resistant regression with
  packages 'MASS' and 'robustbase'. The list is not exhaustive, and depends
  on the availability of methods for the model fit objects. Lack of methods
  or explicit support results in individual parameters and matching labels
  being set to NA. As some model fitting results can depend on the RNG,
  \code{fit.seed} if different to \code{NA} is used as argument in a call to
  \code{\link[base:Random]{set.seed}()} immediately ahead of model fitting.

  While strings for \eqn{R^2}, adjusted \eqn{R^2}, \eqn{F}, and \eqn{P}
  annotations are returned for all valid linear models, a character string
  for the fitted model is returned only for polynomials (see below). When
  not generated automatically, the equation can still be assembled by the
  user within the call to \code{\link[ggplot2]{aes}()}. In addition, a label
  for the confidence interval of \eqn{R^2}, based on values computed with
  function \code{\link[confintr]{ci_rsquared}} from package 'confintr' is
  returned when possible.

  Model formulas can use \code{poly()} or be defined algebraically including
  the intercept indicated by \code{+1}, \code{-1}, \code{+0} or implicit. If
  defined using \code{poly()} the argument \code{raw = TRUE} must be passed.
  The \code{model formula} is checked, and if not recognized as a polynomial
  with no missing terms and terms ordered by increasing powers, no equation
  label is generated. Thus, as the value returned for \code{eq.label} can be
  \code{NA}, the default aesthetic mapping to \emph{label} is \eqn{R^2}.

  By default, the character strings are generated as suitable for parsing
  into R's plotmath expressions. However, LaTeX (use TikZ device in R),
  markdown (use package 'ggtext') and plain text are also supported, as well
  as returning numeric values for user-generated text labels. The argument of
  \code{parse} is set automatically based on \code{output.type}, but if you
  assemble labels that need parsing from \code{numeric} output, the default
  needs to be overridden.

  This statistic only generates annotation labels, the predicted values/line
  need to be added to the plot as a separate layer using
  \code{\link{stat_poly_line}} (or \code{\link[ggplot2]{stat_smooth}}).
  Passing the same arguments in \code{stat_poly_line()} and in
  \code{stat_poly_eq()} to parameters \code{method} and \code{formula}, and
  if used also to \code{method.args} ensures that the plotted curve and
  equation are consistent. Thus, it is best to save these arguments as named
  objects and pass them as arguments to the two statistics.

  A ggplot statistic receives as \code{data} a data frame that is not the one
  passed as argument by the user, but instead a data frame with the variables
  mapped to aesthetics. \code{stat_poly_eq()} mimics how
  \code{\link[ggplot2]{stat_smooth}()} works. Thus, the model formula should
  be defined based on the names of aesthetics \code{x} and \code{y}, not the
  names of the variables in the data. Before fitting the model, data are
  split based on groupings created by any other
  mappings present in a plot panel: \emph{fitting is done separately for each
  group in each plot panel}.

  With method \code{"lm"}, singularity results in terms being dropped with a
  message if more numerous than can be fitted with a singular (exact) fit. In
  this case or if the model results in a perfect fit due to a low number of
  observations, estimates for various parameters are \code{NaN} or \code{NA}.
  When this is the case the corresponding labels are set to
  \code{character(0L)} and thus not visible in the plot. With methods other
  than \code{"lm"}, the model fit functions simply fail in case of
  singularity, e.g., singular fits are not implemented in
  \code{\link[MASS]{rlm}()}.

  A requirement for a minimum number of observations with distinct values in
  the explanatory variable can be set through parameter \code{n.min}. The
  default \code{n.min = 2L} is the smallest suitable for method \code{"lm"}
  but too small for method \code{"rlm"} for which \code{n.min = 3L} is
  needed. Anyway, model fits with very few observations are of little
  interest and using larger values of \code{n.min} than the default is
  usually wise. This can be useful as when this threshold is not reached
  an empty data frame is returned resulting in an empty plot layer.
}
\note{
For backward compatibility a logical is accepted as argument for
  \code{eq.with.lhs}. If \code{TRUE}, the default is used, either
  \code{"x"} or \code{"y"}, depending on the argument passed to \code{formula}.
  However, \code{"x"} or \code{"y"} can be substituted by providing a
  suitable replacement character string through \code{eq.x.rhs}.
  Parameter \code{orientation} is redundant as it only affects the default
  for \code{formula} but is included for consistency with
  \code{ggplot2::stat_smooth()}.

  R option \code{OutDec} is obeyed based on its value at the time the plot
  is rendered, i.e., displayed or printed. Set \code{options(OutDec = ",")}
  for languages like Spanish or French.
}
\section{User-defined methods}{
 User-defined functions can be passed as
  argument to \code{method}. The requirements are 1) that the signature is
  similar to that of function \code{lm()} (with parameters \code{formula},
  \code{data}, \code{weights} and any other arguments passed by name through
  \code{method.args}) and 2) that the value returned by the function is an
  object of a class such as \code{"lm"} for which \code{coef()} and similar
  query methods are available or for empty plot layer output, an atomic
  \code{NA} value.

  When possible, i.e., nearly always, the \code{formula} used to build
  the equation label is extracted from the returned fitted model object.
  Most fitted model objects returned follow the example of \code{lm()} and
  include the model formula fitted. Thus, this model formula can safely
  differ from the argument passed to parameter \code{formula} in the call
  to \code{stat_poly_eq()}.
  Thus, user-defined methods can implement any or all of \code{method}
  selection, model \code{formula} selection, dynamically adjusted
  \code{method.args} and conditional skipping of labelling on a by group
  basis.
}

\section{Aesthetics}{
 \code{stat_poly_eq()} understands \code{x} and \code{y},
  to be referenced in the \code{formula} and \code{weight} passed as argument
  to parameter \code{weights}. All three must be mapped to \code{numeric}
  variables. In addition, the aesthetics understood by the geom
  (\code{"text_npc"} is the default) are understood and grouping respected.

  If the model formula includes a transformation of \code{x}, a
  matching argument should be passed to parameter \code{eq.x.rhs}
  as its default value \code{"x"} will not reflect the applied
  transformation. In plots, transformation should never be applied to the
  left hand side of the model formula, but instead in the mapping of the
  variable within \code{aes}, as otherwise plotted observations and fitted
  curve will not match. In this case it may be necessary to also pass
  a matching argument to parameter \code{eq.with.lhs}.
}

\section{Computed variables}{

If the model fit function used returns \code{NA} or no value,
the label is set to \code{character(0L)}. The position of the columns in
the data frame can change between package versions, extract values always
by name.

For all \code{output.type} arguments the following values are returned.
\describe{
  \item{x,npcx}{x position}
  \item{y,npcy}{y position}
  \item{coefs}{fitted coefficients, named numeric vector as a list member}
  \item{r.squared, rr.confint.level, rr.confint.low, rr.confint.high, adj.r.squared, f.value, f.df1, f.df2, p.value, AIC, BIC, n, knots, knots.se}{numeric values, from the model fit object}
  \item{grp.label}{Set according to mapping in \code{aes}.}
  \item{knots}{list containing a numeric vector of knot or "psi" \emph{x}-value for linear splines}
  \item{fm.method}{name of method used, character}
  \item{fm.class}{most derived class of the fitted model object, character}
  \item{fm.formula.chr}{formatted model formula, character}}

If \code{output.type} is not \code{"numeric"} the returned tibble contains in
addition to those above the columns listed below, each containing a single
character string. The markup used depends on the value of \code{output.type}.
\describe{
  \item{eq.label}{equation for the fitted polynomial as a character string to be parsed or \code{NA}}
  \item{rr.label}{\eqn{R^2} of the fitted model as a character string to be parsed}
  \item{adj.rr.label}{Adjusted \eqn{R^2} of the fitted model as a character string to be parsed}
  \item{rr.confint.label}{Confidence interval for \eqn{R^2} of the fitted model as a character string to be parsed}
  \item{f.value.label}{F value and degrees of freedom for the fitted model as a whole.}
  \item{p.value.label}{P-value for the F-value above.}
  \item{AIC.label}{AIC for the fitted model.}
  \item{BIC.label}{BIC for the fitted model.}
  \item{n.label}{Number of observations used in the fit.}
  \item{knots.label}{The knots or change points in segmented regression.}
  \item{grp.label}{Set according to mapping in \code{aes}.}
  \item{method.label}{Set according to the \code{method} used.}}

If \code{output.type} is \code{"numeric"} the returned tibble contains columns
listed below in addition to the base ones. If the model fit function used
does not return a value, the variable is set to \code{NA_real_}.
\describe{
  \item{coef.ls}{list containing the "coefficients" matrix from the summary of the fit object}
  \item{b_0.constant}{TRUE if the polynomial is forced through the origin}
  \item{b_i}{One or more columns with the coefficient estimates}}

To explore the computed values returned for a given input we suggest the use
of \code{\link[gginnards]{geom_debug}} as shown in the last examples below.
}

\examples{
# generate artificial data
set.seed(4321)
x <- 1:100
y <- (x + x^2 + x^3) + rnorm(length(x), mean = 0, sd = mean(x^3) / 4)
y <- y / max(y)
my.data <- data.frame(x = x, y = y,
                      group = c("A", "B"),
                      y2 = y * c(1, 2) + c(0, 0.1),
                      w = sqrt(x))

# give a name to a formula
formula <- y ~ poly(x, 3, raw = TRUE)

# using defaults
ggplot(my.data, aes(x, y)) +
  geom_point() +
  stat_poly_line() +
  stat_poly_eq()

# no weights
ggplot(my.data, aes(x, y)) +
  geom_point() +
  stat_poly_line(formula = formula) +
  stat_poly_eq(formula = formula)

# other labels
ggplot(my.data, aes(x, y)) +
  geom_point() +
  stat_poly_line(formula = formula) +
  stat_poly_eq(use_label("eq"), formula = formula)

# other labels
ggplot(my.data, aes(x, y)) +
  geom_point() +
  stat_poly_line(formula = formula) +
  stat_poly_eq(use_label("eq"), formula = formula, decreasing = TRUE)

ggplot(my.data, aes(x, y)) +
  geom_point() +
  stat_poly_line(formula = formula) +
  stat_poly_eq(use_label("eq", "R2"), formula = formula)

ggplot(my.data, aes(x, y)) +
  geom_point() +
  stat_poly_line(formula = formula) +
  stat_poly_eq(use_label("R2", "R2.CI", "P", "method"), formula = formula)

ggplot(my.data, aes(x, y)) +
  geom_point() +
  stat_poly_line(formula = formula) +
  stat_poly_eq(use_label("R2", "F", "P", "n", sep = "*\"; \"*"),
               formula = formula)

# grouping
ggplot(my.data, aes(x, y2, color = group)) +
  geom_point() +
  stat_poly_line(formula = formula) +
  stat_poly_eq(formula = formula)

# rotation
ggplot(my.data, aes(x, y)) +
  geom_point() +
  stat_poly_line(formula = formula) +
  stat_poly_eq(formula = formula, angle = 90)

# label location
ggplot(my.data, aes(x, y)) +
  geom_point() +
  stat_poly_line(formula = formula) +
  stat_poly_eq(formula = formula, label.y = "bottom", label.x = "right")

ggplot(my.data, aes(x, y)) +
  geom_point() +
  stat_poly_line(formula = formula) +
  stat_poly_eq(formula = formula, label.y = 0.1, label.x = 0.9)

# modifying the explanatory variable within the model formula
# modifying the response variable within aes()
# eq.x.rhs and eq.with.lhs defaults must be overridden!!
formula.trans <- y ~ I(x^2)
ggplot(my.data, aes(x, y + 1)) +
  geom_point() +
  stat_poly_line(formula = formula.trans) +
  stat_poly_eq(use_label("eq"),
               formula = formula.trans,
               eq.x.rhs = "~x^2",
               eq.with.lhs = "y + 1~~`=`~~")

# using weights
ggplot(my.data, aes(x, y, weight = w)) +
  geom_point() +
  stat_poly_line(formula = formula) +
  stat_poly_eq(formula = formula)

# no weights, 4 digits for R square
ggplot(my.data, aes(x, y)) +
  geom_point() +
  stat_poly_line(formula = formula) +
  stat_poly_eq(formula = formula, rr.digits = 4)

# manually assemble and map a specific label using paste() and aes()
ggplot(my.data, aes(x, y)) +
  geom_point() +
  stat_poly_line(formula = formula) +
  stat_poly_eq(aes(label =  paste(after_stat(rr.label),
                                  after_stat(n.label), sep = "*\", \"*")),
               formula = formula)

# manually assemble and map a specific label using sprintf() and aes()
ggplot(my.data, aes(x, y)) +
  geom_point() +
  stat_poly_line(formula = formula) +
  stat_poly_eq(aes(label =  sprintf("\%s*\" with \"*\%s*\" and \"*\%s",
                                    after_stat(rr.label),
                                    after_stat(f.value.label),
                                    after_stat(p.value.label))),
               formula = formula)

# x on y regression
ggplot(my.data, aes(x, y)) +
  geom_point() +
  stat_poly_line(formula = formula, orientation = "y") +
  stat_poly_eq(use_label("eq", "adj.R2"),
               formula = x ~ poly(y, 3, raw = TRUE))

# conditional user specified label
ggplot(my.data, aes(x, y2, color = group)) +
  geom_point() +
  stat_poly_line(formula = formula) +
  stat_poly_eq(aes(label =  ifelse(after_stat(adj.r.squared) > 0.96,
                                   paste(after_stat(adj.rr.label),
                                         after_stat(eq.label),
                                         sep = "*\", \"*"),
                                   after_stat(adj.rr.label))),
               rr.digits = 3,
               formula = formula)

# geom = "text"
ggplot(my.data, aes(x, y)) +
  geom_point() +
  stat_poly_line(formula = formula) +
  stat_poly_eq(geom = "text", label.x = 100, label.y = 0, hjust = 1,
               formula = formula)

# Inspecting the returned data using geom_debug()
# This provides a quick way of finding out the names of the variables that
# are available for mapping to aesthetics with after_stat().

gginnards.installed <- requireNamespace("gginnards", quietly = TRUE)

if (gginnards.installed)
  library(gginnards)

if (gginnards.installed)
  ggplot(my.data, aes(x, y)) +
    geom_point() +
    stat_poly_line(formula = formula) +
    stat_poly_eq(formula = formula,
                 geom = "debug")

if (gginnards.installed)
  ggplot(my.data, aes(x, y)) +
    geom_point() +
    stat_poly_line(formula = formula) +
    stat_poly_eq(formula = formula,
                 geom = "debug",
                 output.type = "numeric")

# names of the variables
if (gginnards.installed)
  ggplot(my.data, aes(x, y)) +
    geom_point() +
    stat_poly_line(formula = formula) +
    stat_poly_eq(formula = formula,
                 geom = "debug",
                 dbgfun.data = colnames)

# only data$eq.label
if (gginnards.installed)
  ggplot(my.data, aes(x, y)) +
    geom_point() +
    stat_poly_line(formula = formula) +
    stat_poly_eq(formula = formula,
                 geom = "debug",
                 output.type = "expression",
                 dbgfun.data = function(x) {x[["eq.label"]]})

# only data$eq.label
if (gginnards.installed)
  ggplot(my.data, aes(x, y)) +
    geom_point() +
    stat_poly_line(formula = formula) +
    stat_poly_eq(formula = formula,
                 geom = "debug",
                 output.type = "text",
                 dbgfun.data = function(x) {x[["eq.label"]]})

}
\references{
Originally written as an answer to question 7549694 at
  Stackoverflow but enhanced based on suggestions from users and my own
  needs.
}
\seealso{
This statistic fits a model with function \code{\link[stats]{lm}()}
  as default, or with several other functions returning objects of class
  \code{"lm"} or objects of classes for which the common R fitted-model-object
  extraction/query methods are available. Consult the documentation of these
  functions for the details and additional arguments that can be passed to
  them by name through parameter \code{method.args}. User-defined
  model-fitting functions are also supported.

  Please, see the articles in the
  \href{https://docs.r4photobiology.info/ggpmisc/}{online documentation}
  for additional use examples and guidance.

  For quantile regression \code{\link{stat_quant_eq}()} should be used
  instead of \code{stat_poly_eq()} while for model II or major axis
  regression with package 'lmodel2' \code{\link{stat_ma_eq}()} should be
  used. For methods not supported by these three statistics, such as
  non-linear models, statistics \code{\link{stat_fit_glance}()} and
  \code{\link{stat_fit_tidy}()} can be used but require the user to create
  character strings from numeric values and map them to aesthetic
  \code{label}.

Other ggplot statistics for linear and polynomial regression: 
\code{\link{stat_poly_line}()}
}
\concept{ggplot statistics for linear and polynomial regression}
