#' Compare two numerical data subsets
#'
#' Uses the Welch rank-test (a robust alternative to the classical t-test,
#' with better resistance to outliers and asymmetry) to compare the
#' distributions of two subsets of the same numerical variable.  The
#' result characterizes the subsets in terms of their median values,
#' and a small p-value (traditionally less than 0.05) implies
#' significant distributional differences between the two subsets.
#'
#' Missing `xVar` values are removed from both subsets before the pooled
#' ranks are computed, and an error is raised if either subset has no
#' non-missing values left.  The test itself is `stats::t.test` applied
#' to the pooled ranks, so any error it raises (e.g., fewer than two
#' observations in a subset) is passed on to the caller.
#'
#' @param DF data frame containing `xVar`
#' @param xVar name (a single character string) of the numerical variable
#' whose subsets are to be compared
#' @param indexA record index defining the first subset of `xVar` values;
#' this may be a vector of row positions, a logical vector with one element
#' per record, or a character vector of row names
#' @param indexB record index defining the second subset of `xVar` values,
#' in any of the forms accepted for `indexA`
#' (default NULL means the second subset is all records not contained in
#' the first)
#' @param cLevel confidence level for the test (default = 0.95); a single
#' number strictly between 0 and 1 that is passed to `stats::t.test` as
#' `conf.level`.  It only influences the confidence interval computed by
#' the test, which is not part of the returned vector.
#'
#' @return a named vector with these 5 elements:
#'   * nA the number of non-missing records in the first `xVar` subset
#'   * nB the number of non-missing records in the second `xVar` subset
#'   * medianA the median `xVar` value in the first subset
#'   * medianB the median `xVar` value in the second subset
#'   * pValue the p-value returned by the Welch rank test
#' @export
#'
#' @examples
#' x <- seq(-1, 1, length = 200)
#' a <- rep(c("a", "b"), 100)
#' DF <- data.frame(numVar = x, setVar = a)
#' indexA <- which(DF$setVar == "a")
#' WelchRankTest(DF, "numVar", indexA)  # No difference in distribution
#' offset <- rep(c(0, 0.2), 100)
#' DF$numVar2 <- x + offset
#' WelchRankTest(DF, "numVar2", indexA) # Significant difference
#' WelchRankTest(DF, "numVar2", DF$setVar == "a") # Same, logical index
#' xMod <- x
#' xMod[indexA[1:4]] <- x[indexA[1:4]] + 10
#' DF$numVar3 <- xMod
#' WelchRankTest(DF, "numVar3", indexA) # No difference even with outliers
#' stats::t.test(DF[indexA, "numVar3"], DF[-indexA, "numVar3"]) # Compare t-test
WelchRankTest <- function(DF, xVar, indexA, indexB = NULL, cLevel = 0.95){
  #
  #  Validate arguments; scalar checks use && so that vector-valued
  #  xVar or cLevel arguments are rejected rather than silently accepted
  #
  stopifnot("DF must be a data frame"= is.data.frame(DF))
  stopifnot("xVar not found in DF"=
              length(xVar) == 1 && xVar %in% colnames(DF))
  stopifnot("cLevel not between 0 and 1"=
              is.numeric(cLevel) && length(cLevel) == 1 &&
              !is.na(cLevel) && cLevel > 0 && cLevel < 1)
  #
  #  Extract the variable as a plain vector; [[ ]] returns a vector for
  #  data frames and for subclasses whose [ method does not drop
  #
  xValues <- DF[[xVar]]
  #
  #  Resolve the record indices to row positions.  Indexing a position
  #  vector named by the row names handles positive, negative, logical,
  #  and row-name indices uniformly.
  #
  rowIds <- seq_len(nrow(DF))
  names(rowIds) <- rownames(DF)
  rowsA <- unname(rowIds[indexA])
  #
  if (is.null(indexB)){
    #
    #  Complement of the first subset, computed on resolved positions;
    #  negating indexA directly is only valid for positive row positions
    #  (a logical index would be coerced to 0/-1 and drop only row 1)
    #
    rowsB <- setdiff(unname(rowIds), rowsA)
  } else {
    rowsB <- unname(rowIds[indexB])
  }
  x <- xValues[rowsA]
  y <- xValues[rowsB]
  #
  #  Remove missing values (this also discards records selected by NA or
  #  out-of-range index entries)
  #
  x <- x[!is.na(x)]
  stopifnot("indexA subset contains no non-missing values"= length(x) > 0)
  y <- y[!is.na(y)]
  stopifnot("indexB subset contains no non-missing values"= length(y) > 0)
  #
  #  Create pooled ranks: rank the combined sample, then split the ranks
  #  back into the two subsets
  #
  Nx <- length(x)
  Ny <- length(y)
  zRanks <- rank(c(x, y))
  xRanks <- zRanks[seq_len(Nx)]
  yRanks <- zRanks[Nx + seq_len(Ny)]
  #
  #  Apply the Welch t-test to the ranks (var.equal defaults to FALSE)
  #
  testResult <- stats::t.test(xRanks, yRanks, conf.level = cLevel)
  pValue <- testResult$p.value
  #
  outVec <- c(Nx, Ny, stats::median(x), stats::median(y), pValue)
  names(outVec) <- c("nA", "nB", "medianA", "medianB", "pValue")
  return(outVec)
}

