#### PLFIT ####

#' Power-law fit (PLFIT) Algorithm
#'
#' This function implements the PLFIT algorithm as described by *Clauset et al.* to determine the value of \eqn{\hat k}. It minimizes the Kolmogorov-Smirnov (KS) distance between the empirical cumulative distribution function and the fitted power law.
#'
#' \deqn{D_{n,k} := \sup_{y \ge 1} |\frac{1}{k-1} \sum_{i=1}^{k-1} I (\frac{X_{(i)}}{X_{(k)}} > y) - y^{-\hat{\alpha}_{n,k}^H}|}
#'
#'  The above equation, as described by *Nair et al.*, is implemented in this function with an Empirical CDF instead of the empirical survival function, which is mathematically equivalent since they are both complements of each other.
#'
#' \deqn{D_{n,k} :=
#'\sup_{y \ge 1}
#'|
#'  \underbrace{
#'    \frac{1}{k-1}
#'    \sum_{i=1}^{k-1}
#'    I(\frac{X_{(i)}}{X_{(k)}} \le y)
#'  }_{\text{Empirical CDF}}
#'-
#'  \underbrace{
#'    (1 - y^{-\hat{\alpha}_{n,k}})
#'  }_{\text{Theoretical CDF}}|}
#'
#'  \deqn{\hat k = \text{argmin} (D_{n,k})}
#'
#' For every candidate \eqn{k} between \code{kmin} and \code{kmax}, the
#' \eqn{k-1} largest observations are divided by \eqn{X_{(k)}}, the exponent is
#' estimated as the reciprocal of the mean of their logarithms, and the
#' distance is evaluated at those \eqn{k-1} scaled observations.
#'
#' @param data A numeric vector of i.i.d. observations. Non-positive values
#'   are dropped with a warning before fitting; at least three positive
#'   observations are required.
#' @param kmax Maximum number of top-order statistics. If kmax = -1, then kmax=(n-1) where n is the number of positive observations in the dataset.
#'   Any other value must satisfy \code{kmin <= kmax <= n}, otherwise an error
#'   is raised.
#' @param kmin Minimum number of top-order statistics to start with. Must be
#'   a numeric scalar >= 2.
#' @param na.rm Logical. If \code{TRUE}, missing values (\code{NA}) are removed
#'   before analysis. Defaults to \code{FALSE}, in which case missing values
#'   in \code{data} raise an error.
#'
#' @returns A named list containing the results of the PLFIT algorithm:
#' \itemize{
#'   \item{\code{k_hat}:} The optimal number of top-order statistics \eqn{\hat{k}}.
#'   \item{\code{alpha_hat}:} The estimated power-law exponent \eqn{\hat{\alpha}} corresponding to \eqn{\hat{k}}.
#'   \item{\code{xmin_hat}:} The minimum value \eqn{x_{\min} = X_{(\hat{k})}} above which the power law is fitted.
#'   \item{\code{ks_distance}:} The minimum Kolmogorov-Smirnov distance \eqn{D_{n,k}} found.
#' }
#'
#' @export
#'
#' @examples
#'
#' xmin <- 1
#' alpha <- 2
#' r <- runif(800, 0, 1)
#' x <- (xmin * r^(-1/(alpha)))
#' plfit_values <- plfit(data = x, kmax = -1, kmin = 2)
#'
#' @references
#'
#' Clauset, A., Shalizi, C. R., & Newman, M. E. (2009). Power-law distributions in empirical data. \emph{SIAM Review}, \bold{51}(4), 661-703. \doi{10.1137/070710111}
#'
#' Nair, J., Wierman, A., & Zwart, B. (2022). \emph{The Fundamentals of Heavy Tails: Properties, Emergence, and Estimation}. Cambridge University Press. (pp. 227-229) \doi{10.1017/9781009053730}
#'
plfit <- function(data, kmax = -1, kmin = 2, na.rm = FALSE) {

  if (!is.numeric(data) || !is.null(dim(data))) {
    stop("`data` must be a numeric vector.")
  }

  # Resolve missing values first: every later check compares `data` with 0,
  # and an NA in those comparisons would make `if ()` fail with a cryptic
  # "missing value where TRUE/FALSE needed" error.
  if (anyNA(data)) {
    if (!isTRUE(as.logical(na.rm))) {
      stop("`data` contains missing values. Use `na.rm = TRUE` to remove them.")
    }
    data <- data[!is.na(data)]
    if (length(data) <= 1) {
      stop("Removing NAs resulted in a data vector with length <= 1. Data must be a vector with length > 1")
    }
  }

  if (all(data <= 0)) {
    stop("`data` must contain positive values for power-law fitting.")
  }

  if (!is.numeric(kmin) || length(kmin) != 1 || is.na(kmin) || kmin < 2) {
    stop("`kmin` must be a numeric scalar >= 2.")
  }

  if (!is.numeric(kmax) || length(kmax) != 1 || is.na(kmax)) {
    stop("`kmax` must be a numeric scalar.")
  }

  if (any(data <= 0)) {
    warning("`data` has non-positive values. They will be excluded from calculations.")
    data <- data[data > 0]
  }

  # The sample size must be taken AFTER filtering: it drives both the
  # minimum-size guard and the default `kmax`, and every k <= kmax is used to
  # index the sorted positive observations.
  n <- length(data)

  if (n < 3) {
    stop("Not enough data to fit.")
  }

  if (kmax == -1) {
    kmax <- n - 1
  }

  # Reject ranges that would make `kmin:kmax` count downwards or index past
  # the end of the sorted data.
  if (kmax < kmin || kmax > n) {
    stop("`kmax` must be -1 or a number between `kmin` and the number of positive observations.")
  }

  x <- sort(data, decreasing = TRUE)
  alphas <- rep(NA_real_, kmax)
  ks_distances <- rep(NA_real_, kmax)

  for (k in kmin:kmax) {

    # Number of observations strictly above the candidate threshold.
    # k1 >= 1 is guaranteed because kmin >= 2.
    k1 <- k - 1

    # We normalize the data to be >= 1 (Eq. 9.17 note 5)
    # by dividing by xmin (current_xmin, which is X_(k))
    current_xmin <- x[k]
    scaled <- x[1:k1] / current_xmin

    # Reciprocal of the mean log-excess; non-finite logs (e.g. from infinite
    # observations) are left out of the mean.
    log_s <- log(scaled)
    xi_est <- mean(log_s[is.finite(log_s)])

    # All top observations tied with the threshold: the mean log is exactly
    # zero, so the reciprocal is reported as Inf explicitly.
    if (xi_est == 0) {
      current_alpha <- Inf
    } else {
      current_alpha <- 1 / xi_est
    }
    alphas[k] <- current_alpha

    # Empirical CDF at the ascending scaled observations
    ecdf_vals <- (1:k1) / k1

    # Theoretical CDF
    scaled <- sort(scaled, decreasing = FALSE)
    theoretical_cdf_vals <- pareto_cdf(scaled, xmin = 1, alpha = current_alpha)

    ks_distances[k] <- max(abs(ecdf_vals - theoretical_cdf_vals))
  }

  # Entries below kmin stay NA and are ignored by which.min().
  k_hat <- which.min(ks_distances)

  return(list(
    k_hat = k_hat,
    alpha_hat = alphas[k_hat],
    xmin_hat = x[k_hat],
    ks_distance = ks_distances[k_hat]
  ))
}

