#' Frequentist Changepoint Detection Methods
#'
#' @description
#' Implementation of frequentist methods for changepoint detection:
#' CUSUM, PELT, Binary Segmentation, and Wild Binary Segmentation.
#'
#' @name frequentist_methods
#' @noRd
NULL


#' @noRd
segment_cost <- function(data, start, end, type = "both") {
  if (end < start) return(Inf)
  segment <- if (is.matrix(data)) data[start:end, , drop = FALSE] else data[start:end]
  n <- if (is.matrix(segment)) nrow(segment) else length(segment)
  if (n < 2) return(Inf)
  if (is.matrix(segment)) cost_multivariate(segment, type) else cost_univariate(segment, type)
}

#' @noRd
cost_univariate <- function(x, type) {
  n <- length(x)
  switch(type,
         mean = sum((x - mean(x))^2),
         variance = {
           v <- var(x); if (is.na(v) || v <= 0) v <- .Machine$double.eps
           n * (log(2 * pi * v) + 1)
         },
         var = {
           v <- var(x); if (is.na(v) || v <= 0) v <- .Machine$double.eps
           n * (log(2 * pi * v) + 1)
         },
         both = {
           v <- var(x); if (is.na(v) || v <= 0) v <- .Machine$double.eps
           n * (log(2 * pi * v) + 1)
         },
         trend = {
           t <- seq_len(n); fit <- lm.fit(cbind(1, t), x)
           sum(fit$residuals^2)
         },
         distribution = {
           n * log(var(x) + .Machine$double.eps)
         }
  )
}

#' @noRd
cost_multivariate <- function(x, type) {
  n <- nrow(x)
  d <- ncol(x)
  switch(type,
         mean = sum(scale(x, scale = FALSE)^2),
         variance = {
           S <- cov(x); det_val <- det(S)
           if (det_val <= 0) det_val <- .Machine$double.eps
           n * (d * log(2 * pi) + log(det_val) + d)
         },
         both = {
           S <- cov(x); det_val <- det(S)
           if (det_val <= 0) det_val <- .Machine$double.eps
           n * (d * log(2 * pi) + log(det_val) + d)
         },
         {
           S <- cov(x); det_val <- det(S)
           if (det_val <= 0) det_val <- .Machine$double.eps
           n * log(det_val)
         }
  )
}

#' @noRd
sum_segment_costs <- function(data, changepoints, type) {
  n <- if (is.matrix(data)) nrow(data) else length(data)
  boundaries <- c(0, sort(changepoints), n)
  total <- 0
  for (i in seq_len(length(boundaries) - 1)) {
    total <- total + segment_cost(data, boundaries[i] + 1, boundaries[i + 1], type)
  }
  total
}


#' @noRd
compute_penalty <- function(penalty, n, d = 1) {
  if (is.numeric(penalty)) {
    return(penalty)
  }
  
  p <- switch(as.character(d),
              "1" = 2,
              d + d * (d + 1) / 2
  )
  
  switch(penalty,
         BIC = p * log(n),
         AIC = 2 * p,
         MBIC = p * log(n) + 2 * log(log(n)),
         MDL = p * log(n) / 2,
         adaptive = 2 * log(log(n + 3)) + p,
         none = 0,
         p * log(n)
  )
}

#' @noRd
compute_penalty_enhanced <- function(penalty, n, type) {
  if (is.numeric(penalty)) return(penalty)
  
  p <- switch(type,
              mean = 1,
              variance = 1,
              var = 1,
              2)
  
  switch(penalty,
         BIC = p * log(n),
         MBIC = (p + 1) * log(n),
         AIC = 2 * p,
         SIC = p * log(n),
         HQ = 2 * p * log(log(n)),
         p * log(n))
}


#' Winsorize data at specified probability level
#' @noRd
winsorize <- function(x, prob = 0.05) {
  if (prob <= 0) return(x)
  q <- quantile(x, c(prob, 1 - prob), na.rm = TRUE)
  pmax(pmin(x, q[2]), q[1])
}

#' Qn scale estimator (Rousseeuw & Croux)
#' @noRd
Qn_scale <- function(x) {
  robust_Qn(x)
}

#' @noRd
robust_Qn <- function(x) {
  n <- length(x)
  if (n < 2) return(0)
  
  if (n < 10) {
    return(mad(x, constant = 1.4826))
  }
  
  if (n > 500) {
    idx <- sample(n, 500)
    x <- x[idx]
    n <- 500
  }
  
  diffs <- abs(outer(x, x, "-"))
  diffs <- diffs[lower.tri(diffs)]
  
  h <- floor(n/2) + 1
  k <- choose(h, 2)
  k <- min(k, length(diffs))
  
  if (k < 1) return(mad(x, constant = 1.4826))
  
  Qn <- sort(diffs, partial = k)[k]
  
  2.2219 * Qn
}

#' Tukey biweight rho function
#' @noRd
tukey_biweight_rho <- function(u, k = 4.685) {
  ifelse(abs(u) <= k,
         (k^2 / 6) * (1 - (1 - (u/k)^2)^3),
         k^2 / 6)
}

#' Tukey biweight psi function (derivative of rho)
#' @noRd
tukey_biweight_psi <- function(u, k = 4.685) {
  ifelse(abs(u) <= k,
         u * (1 - (u/k)^2)^2,
         0)
}

#' Tukey biweight weight function
#' @noRd
tukey_biweight_weight <- function(u, k = 4.685) {
  ifelse(abs(u) <= k,
         (1 - (u/k)^2)^2,
         0)
}

#' Huber loss function
#' @noRd
huber_loss <- function(r, k = 1.345) {
  ifelse(abs(r) <= k,
         r^2 / 2,
         k * abs(r) - k^2 / 2)
}

#' Huber weight function
#' @noRd
huber_weight <- function(r, k = 1.345) {
  ifelse(abs(r) <= k, 1, k / pmax(abs(r), 1e-10))
}

#' Get robust parameters based on robustness level
#' @noRd
get_robust_params <- function(robust_level, data) {
  
  if (is.null(robust_level)) robust_level <- "none"
  if (isTRUE(robust_level)) robust_level <- "moderate"
  if (isFALSE(robust_level)) robust_level <- "none"
  
  if (identical(robust_level, "auto")) {
    robust_level <- tryCatch({
      detect_contamination_level(data)
    }, error = function(e) "moderate")
  }
  
  params <- switch(robust_level,
                   "none" = list(
                     winsor_prob = 0, estimator = "mle", 
                     description = "Standard (no robustness)"
                   ),
                   "mild" = list(
                     winsor_prob = 0.05, estimator = "huber", huber_k = 1.345, 
                     scale_estimator = "mad",
                     description = "Mild robustness (5% winsor, Huber k=1.345)"
                   ),
                   "moderate" = list(
                     winsor_prob = 0.10, estimator = "huber", huber_k = 1.0, 
                     scale_estimator = "Qn",
                     description = "Moderate robustness (10% winsor, Huber k=1.0)"
                   ),
                   "aggressive" = list(
                     winsor_prob = 0.15, estimator = "tukey", huber_k = 4.685, 
                     scale_estimator = "Qn",
                     description = "Aggressive robustness (15% winsor, Tukey)"
                   ),
                   list(
                     winsor_prob = 0.10, estimator = "huber", huber_k = 1.0, 
                     scale_estimator = "Qn",
                     description = "Moderate robustness (fallback)"
                   )
  )
  
  params$level <- robust_level
  params
}

#' Auto-detect contamination level
#' @noRd
detect_contamination_level <- function(data) {
  data_clean <- data[!is.na(data) & is.finite(data)]
  n <- length(data_clean)
  
  if (n < 10) return("mild")
  
  med_val <- median(data_clean)
  mad_val <- mad(data_clean, constant = 1.4826)
  sd_val <- sd(data_clean)
  
  if (mad_val < 1e-10 || sd_val < 1e-10) return("none")
  
  efficiency_ratio <- mad_val / sd_val
  
  q1 <- quantile(data_clean, 0.25)
  q3 <- quantile(data_clean, 0.75)
  iqr <- q3 - q1
  
  if (iqr < 1e-10) iqr <- 2 * mad_val
  
  lower <- q1 - 1.5 * iqr
  upper <- q3 + 1.5 * iqr
  
  outliers <- data_clean < lower | data_clean > upper
  outlier_prop <- mean(outliers)
  
  
  if (efficiency_ratio > 0.85 && outlier_prop < 0.01) {
    return("none")
  }
  
  if (outlier_prop > 0.15 || efficiency_ratio < 0.4) {
    return("aggressive")
  }
  
  if (outlier_prop > 0.05 || efficiency_ratio < 0.6) {
    return("moderate")
  }
  
  return("mild")
}

#' Robust location and scale estimation
#' @noRd
robust_location_scale <- function(x, k = 1.345, max_iter = 10, tol = 1e-6) {
  n <- length(x)
  
  mu <- median(x)
  sigma <- mad(x, constant = 1.4826)
  if (sigma < .Machine$double.eps * 100) {
    sigma <- sd(x)
  }
  if (sigma < .Machine$double.eps * 100) {
    return(list(mu = mu, sigma = .Machine$double.eps * 100))
  }
  
  for (iter in 1:max_iter) {
    mu_old <- mu
    sigma_old <- sigma
    
    r <- (x - mu) / sigma
    w <- ifelse(abs(r) <= k, 1, k / abs(r))
    mu <- sum(w * x) / sum(w)
    
    sigma <- mad(x - mu, constant = 1.4826)
    if (sigma < .Machine$double.eps * 100) sigma <- sigma_old
    
    if (abs(mu - mu_old) < tol * sigma_old &&
        abs(sigma - sigma_old) < tol * sigma_old) {
      break
    }
  }
  
  list(mu = mu, sigma = sigma, iterations = iter)
}

#' Estimate AR(1) coefficient robustly
#' @noRd
estimate_ar1_robust <- function(x, robust = FALSE) {
  n <- length(x)
  if (n < 10) return(0)
  
  if (robust) {
    x1 <- x[1:(n-1)]
    x2 <- x[2:n]
    
    x1_c <- x1 - median(x1)
    x2_c <- x2 - median(x2)
    
    mad1 <- mad(x1_c, constant = 1.4826)
    mad2 <- mad(x2_c, constant = 1.4826)
    
    if (mad1 < 1e-10 || mad2 < 1e-10) return(0)
    
    x1_w <- pmax(pmin(x1_c / mad1, 3), -3)
    x2_w <- pmax(pmin(x2_c / mad2, 3), -3)
    
    rho <- median(x1_w * x2_w) / 0.4549
    
  } else {
    x_centered <- x - mean(x)
    numerator <- sum(x_centered[1:(n-1)] * x_centered[2:n])
    denominator <- sum(x_centered^2)
    
    if (denominator < 1e-10) return(0)
    rho <- numerator / denominator
  }
  
  max(-0.99, min(0.99, rho))
}

#' Merge nearby changepoints
#' @noRd
merge_nearby_changepoints <- function(cpts, threshold) {
  if (length(cpts) <= 1) return(cpts)
  
  cpts <- sort(cpts)
  merged <- cpts[1]
  
  for (i in 2:length(cpts)) {
    if (cpts[i] - merged[length(merged)] > threshold) {
      merged <- c(merged, cpts[i])
    } else {
      merged[length(merged)] <- round((merged[length(merged)] + cpts[i]) / 2)
    }
  }
  
  as.integer(merged)
}


#' CUSUM Changepoint Detection
#'
#' @description
#' Detects changepoints using the Cumulative Sum (CUSUM) statistic.
#' Foundational method for detecting changes in mean.
#'
#' @param data Numeric vector
#' @param type Type of change ("mean", "variance", "both")
#' @param threshold Detection threshold (alarm when statistic exceeds this)
#' @param mode "offline" for retrospective or "online" for sequential
#' @param mu0 Target mean under null hypothesis (for online mode)
#' @param sigma Known standard deviation (if NULL, estimated from data)
#' @param ... Additional arguments
#'
#' @return List with changepoints and statistics
#'
#' @examples
#' data <- c(rnorm(50), rnorm(50, mean = 2))
#' result <- cusum(data)
#'
#' @export
cusum <- function(data, type = "mean", threshold = 4, mode = "offline",
                  mu0 = NULL, sigma = NULL, ...) {
  detect_cusum(data, type, threshold, mode, mu0, sigma, ...)
}

#' @noRd
detect_cusum <- function(data, type = "mean", threshold = 4, mode = "offline",
                         mu0 = NULL, sigma = NULL, ...) {
  n <- length(data)
  
  if (is.null(mu0)) mu0 <- mean(data[1:min(30, n/4)])
  if (is.null(sigma)) sigma <- sd(data[1:min(30, n/4)])
  if (sigma <= 0) sigma <- 1
  
  z <- (data - mu0) / sigma
  
  if (mode == "offline") {
    result <- cusum_offline(z, type)
    
    if (result$max_stat > threshold) {
      changepoints <- result$changepoint
      n_changepoints <- 1
    } else {
      changepoints <- integer(0)
      n_changepoints <- 0
    }
    
    list(
      changepoints = changepoints,
      n_changepoints = n_changepoints,
      statistic = result$cusum,
      max_stat = result$max_stat,
      threshold = threshold,
      information_criterion = NULL
    )
    
  } else {
    cusum_online(z, threshold, type)
  }
}

#' @noRd
cusum_offline <- function(z, type = "mean") {
  n <- length(z)
  
  if (type == "mean") {
    S <- cumsum(z)
    
    cusum_stat <- numeric(n)
    for (k in 1:(n-1)) {
      factor <- sqrt(k * (n - k) / n)
      if (factor > 0) {
        cusum_stat[k] <- abs(S[k] - k * S[n] / n) / factor
      }
    }
    cusum_stat[n] <- 0
    
  } else if (type == "variance") {
    z2 <- z^2
    S <- cumsum(z2 - 1)
    cusum_stat <- abs(S)
    
  } else {
    S_mean <- cumsum(z)
    S_var <- cumsum(z^2 - 1)
    
    cusum_stat <- numeric(n)
    for (k in 1:(n-1)) {
      factor <- sqrt(k * (n - k) / n)
      if (factor > 0) {
        stat_mean <- abs(S_mean[k] - k * S_mean[n] / n) / factor
        stat_var <- abs(S_var[k] - k * S_var[n] / n) / factor
        cusum_stat[k] <- max(stat_mean, stat_var)
      }
    }
    cusum_stat[n] <- 0
  }
  
  max_idx <- which.max(cusum_stat)
  
  list(
    cusum = cusum_stat,
    changepoint = max_idx,
    max_stat = cusum_stat[max_idx]
  )
}

#' @noRd
cusum_online <- function(z, threshold, type = "mean") {
  n <- length(z)
  
  S_plus <- numeric(n)
  S_minus <- numeric(n)
  alarm <- FALSE
  alarm_time <- NA
  
  for (t in seq_len(n)) {
    S_plus[t] <- max(0, (if (t > 1) S_plus[t-1] else 0) + z[t])
    S_minus[t] <- max(0, (if (t > 1) S_minus[t-1] else 0) - z[t])
    
    if (max(S_plus[t], S_minus[t]) > threshold && !alarm) {
      alarm <- TRUE
      alarm_time <- t
    }
  }
  
  list(
    changepoints = if (!is.na(alarm_time)) alarm_time else integer(0),
    n_changepoints = as.integer(!is.na(alarm_time)),
    S_plus = S_plus,
    S_minus = S_minus,
    alarm = alarm,
    alarm_time = alarm_time,
    threshold = threshold,
    information_criterion = NULL
  )
}


#' Detect changepoints using PELT algorithm
#'
#' @description
#' Wrapper for \code{\link{detect_pelt}} for compatibility.
#'
#' @inheritParams detect_pelt
#' @return List with changepoints and diagnostics
#' @seealso \code{\link{detect_pelt}}
#' @export
pelt <- function(data, type = "both", penalty = "MBIC", min_segment = 2,
                 robust = FALSE, correct_ar = FALSE, merge_close = NULL, ...) {
  detect_pelt(data, type, penalty, min_segment, robust, correct_ar, merge_close, ...)
}

#' Detect changepoints using enhanced PELT algorithm
#'
#' @description
#' Pruned Exact Linear Time (PELT) algorithm for optimal changepoint detection
#' with enhancements for autocorrelated data and numerical stability.
#'
#' @param data Numeric vector or matrix. For matrices, rows are observations.
#' @param type Type of change to detect: \code{"mean"}, \code{"var"}, or \code{"both"}.
#' @param penalty Penalty type (\code{"MBIC"}, \code{"BIC"}, \code{"AIC"}) or numeric value.
#' @param min_segment Minimum segment length (default: 2).
#' @param robust Logical or character. FALSE for standard, TRUE for moderate robustness,
#'   or one of "mild", "moderate", "aggressive", "auto" for specific levels.
#' @param correct_ar Logical. Apply pre-whitening for autocorrelated data. Default: FALSE.
#' @param merge_close Integer or NULL. Merge changepoints within this distance. Default: NULL.
#' @param ... Additional arguments (currently unused).
#'
#' @return A list with components:
#' \describe{
#'   \item{changepoints}{Integer vector of detected changepoint locations}
#'   \item{n_changepoints}{Number of changepoints detected}
#'   \item{cost}{Final cost value}
#'   \item{ar_coefficient}{Estimated AR(1) coefficient (if \code{correct_ar = TRUE})}
#'   \item{prewhitened}{Logical indicating if pre-whitening was applied}
#'   \item{robust}{Logical indicating if robust estimation was used}
#'   \item{robust_level}{Character indicating robustness level used}
#'   \item{information_criterion}{Reserved for future use}
#' }
#'
#' @details
#' This implementation includes enhancements over standard PELT:
#'
#' \strong{Pre-whitening for autocorrelated data:} When \code{correct_ar = TRUE},
#' the function estimates the AR(1) coefficient and transforms the data via
#' \eqn{y'_t = y_t - \rho y_{t-1}} before applying PELT.
#'
#' \strong{Configurable robustness:} The \code{robust} parameter accepts:
#' \itemize{
#'   \item \code{FALSE}: Standard MLE-based estimation
#'   \item \code{TRUE} or \code{"moderate"}: 10\% winsorization, Huber k=1.0, Qn scale
#'   \item \code{"mild"}: 5\% winsorization, Huber k=1.345, MAD scale
#'   \item \code{"aggressive"}: 15\% winsorization, Tukey biweight, Qn scale
#'   \item \code{"auto"}: Automatically select based on data characteristics
#' }
#'
#' @references
#' Killick, R., Fearnhead, P., and Eckley, I. A. (2012). Optimal detection of
#' changepoints with a linear computational cost. JASA, 107(500), 1590-1598.
#'
#' @examples
#'
#' data <- c(rnorm(100, 0), rnorm(100, 3))
#' result <- detect_pelt(data, type = "mean")
#'
#'
#' data_contaminated <- c(rnorm(100, 0), rnorm(100, 3))
#' data_contaminated[sample(200, 10)] <- rnorm(10, 0, 10)
#' result_robust <- detect_pelt(data_contaminated, robust = TRUE)
#'
#'
#' result_auto <- detect_pelt(data_contaminated, robust = "auto")
#'
#' @export
detect_pelt <- function(data, type = "both", penalty = "MBIC", min_segment = 2,
                        robust = FALSE, correct_ar = FALSE, merge_close = NULL,
                        ...) {
  
  if (is.matrix(data) && ncol(data) == 1) {
    data <- as.vector(data)
  }
  
  n <- if (is.matrix(data)) nrow(data) else length(data)
  d <- if (is.matrix(data)) ncol(data) else 1
  
  if (n < 2) {
    stop("Data must have at least 2 observations.")
  }
  
  if (n < 2 * min_segment) {
    return(list(
      changepoints = integer(0),
      n_changepoints = 0L,
      cost = NA_real_,
      ar_coefficient = NA_real_,
      prewhitened = FALSE,
      robust = FALSE,
      information_criterion = NULL
    ))
  }
  
  if (d == 1) {
    result <- pelt_univariate_enhanced(data, type, penalty, min_segment,
                                       robust, correct_ar)
  } else {
    result <- pelt_multivariate_enhanced(data, type, penalty, min_segment)
  }
  
  if (!is.null(merge_close) && merge_close > 0 && length(result$changepoints) > 1) {
    result$changepoints <- merge_nearby_changepoints(result$changepoints, merge_close)
    result$n_changepoints <- length(result$changepoints)
  }
  
  result
}

#' @noRd
pelt_univariate_enhanced <- function(data, type, penalty, minseglen,
                                     robust = FALSE, correct_ar = FALSE) {
  n <- length(data)
  
  ar_coef <- NA_real_
  
  if (correct_ar && n > 30) {
    ar_coef <- estimate_ar1_robust(data, robust = TRUE)
    if (abs(ar_coef) > 0.1) {
      data <- data[2:n] - ar_coef * data[1:(n-1)]
      n <- length(data)
    }
  }
  
  robust_level <- if (identical(robust, "auto")) {
    detect_contamination_level(data)
  } else if (isTRUE(robust)) {
    "moderate"
  } else if (isFALSE(robust)) {
    "none"
  } else {
    robust
  }
  
  if (robust_level == "none") {
    result <- pelt_standard_core(data, type, penalty, minseglen)
    result$robust_level <- "none"
    result$robust <- FALSE
    
  } else {
    robust_params <- get_robust_params(robust_level, data)
    result <- pelt_robust_core_configurable(data, type, penalty, minseglen, robust_params)
    result$robust_level <- robust_params$level
    result$robust_description <- robust_params$description
    result$robust <- TRUE
  }
  
  if (correct_ar && !is.na(ar_coef) && abs(ar_coef) > 0.1) {
    result$changepoints <- result$changepoints + 1L
  }
  
  result$ar_coefficient <- ar_coef
  result$prewhitened <- correct_ar && !is.na(ar_coef) && abs(ar_coef) > 0.1
  
  result
}

#' @noRd
pelt_standard_core <- function(data, type, penalty, minseglen) {
  valid_idx <- which(!is.na(data) & is.finite(data))
  if (length(valid_idx) < 2 * minseglen) {
    return(list(
      changepoints = integer(0),
      n_changepoints = 0L,
      cost = NA_real_,
      information_criterion = NULL,
      note = "Insufficient valid data points"
    ))
  }
  
  data_clean <- data[valid_idx]
  n <- length(data_clean)
  has_missing <- length(valid_idx) < length(data)
  
  cumsum_x <- c(0, cumsum(data_clean))
  cumsum_x2 <- c(0, cumsum(data_clean^2))
  global_mean <- mean(data_clean)
  cumsum_x3 <- c(0, cumsum((data_clean - global_mean)^2))
  
  var_floor <- var(data_clean) / 100
  
  cost_mean <- function(start, end) {
    seg_n <- end - start
    if (seg_n < 1) return(Inf)
    x <- cumsum_x[end + 1] - cumsum_x[start + 1]
    x2 <- cumsum_x2[end + 1] - cumsum_x2[start + 1]
    x2 - (x * x) / seg_n
  }
  
  cost_var <- function(start, end) {
    seg_n <- end - start
    if (seg_n < 2) return(Inf)
    x3 <- cumsum_x3[end + 1] - cumsum_x3[start + 1]
    sigsq <- x3 / seg_n
    sigsq <- max(sigsq, var_floor)
    seg_n * (log(2 * pi) + log(sigsq) + 1)
  }
  
  cost_meanvar <- function(start, end) {
    seg_n <- end - start
    if (seg_n < 2) return(Inf)
    x <- cumsum_x[end + 1] - cumsum_x[start + 1]
    x2 <- cumsum_x2[end + 1] - cumsum_x2[start + 1]
    sigsq <- (x2 - (x * x) / seg_n) / seg_n
    sigsq <- max(sigsq, var_floor)
    seg_n * (log(2 * pi) + log(sigsq) + 1)
  }
  
  costfn <- switch(type,
                   mean = cost_mean,
                   variance = cost_var,
                   var = cost_var,
                   cost_meanvar)
  
  result <- pelt_algorithm(data_clean, costfn, penalty, minseglen, type)
  
  if (length(result$changepoints) > 0 && has_missing) {
    result$changepoints <- valid_idx[result$changepoints]
  }
  
  result
}

#' @noRd
pelt_robust_core_configurable <- function(data, type, penalty, minseglen, robust_params) {
  valid_idx <- which(!is.na(data) & is.finite(data))
  if (length(valid_idx) < 2 * minseglen) {
    return(list(
      changepoints = integer(0),
      n_changepoints = 0L,
      cost = NA_real_,
      information_criterion = NULL,
      winsorized = FALSE,
      note = "Insufficient valid data points"
    ))
  }
  
  data_clean <- data[valid_idx]
  n <- length(data_clean)
  has_missing <- length(valid_idx) < length(data)
  
  if (robust_params$winsor_prob > 0) {
    data_w <- winsorize(data_clean, robust_params$winsor_prob)
  } else {
    data_w <- data_clean
  }
  
  global_med <- median(data_w, na.rm = TRUE)
  
  if (robust_params$scale_estimator == "Qn") {
    global_scale <- Qn_scale(data_w)
  } else if (robust_params$scale_estimator == "mad") {
    global_scale <- mad(data_w, constant = 1.4826)
  } else {
    global_scale <- sd(data_w)
  }
  
  if (global_scale < 1e-10) global_scale <- sd(data_w)
  if (global_scale < 1e-10) global_scale <- 1
  
  scale_floor <- max(global_scale / 100, .Machine$double.eps * 100)
  
  k_const <- robust_params$huber_k
  estimator_type <- robust_params$estimator
  
  loss_fn <- if (estimator_type == "tukey") {
    function(r) tukey_biweight_rho(r, k_const)
  } else if (estimator_type == "huber") {
    function(r) huber_loss(r, k_const)
  } else {
    function(r) r^2 / 2
  }
  
  weight_fn <- if (estimator_type == "tukey") {
    function(r) tukey_biweight_weight(r, k_const)
  } else if (estimator_type == "huber") {
    function(r) huber_weight(r, k_const)
  } else {
    function(r) rep(1, length(r))
  }
  
  cost_robust <- function(start, end) {
    seg_n <- end - start
    if (seg_n < 2) return(Inf)
    
    segment <- data_w[(start + 1):end]
    
    mu <- median(segment)
    sigma <- if (robust_params$scale_estimator == "Qn") {
      Qn_scale(segment)
    } else {
      mad(segment, constant = 1.4826)
    }
    sigma <- max(sigma, scale_floor)
    
    for (iter in 1:10) {
      r <- (segment - mu) / sigma
      w <- weight_fn(r)
      sum_w <- sum(w)
      if (sum_w > 1e-10) {
        mu_new <- sum(w * segment) / sum_w
        if (abs(mu_new - mu) < 1e-6 * sigma) break
        mu <- mu_new
      }
    }
    
    if (type == "mean") {
      residuals <- (segment - mu) / sigma
      2 * sum(loss_fn(residuals)) 
    } else if (type == "var" || type == "variance") {
      sigma_seg <- if (robust_params$scale_estimator == "Qn") {
        Qn_scale(segment - mu)
      } else {
        mad(segment - mu, constant = 1.4826)
      }
      sigma_seg <- max(sigma_seg, scale_floor)
      seg_n * log(sigma_seg)
    } else {
      residuals <- (segment - mu) / sigma
      seg_n * log(sigma) + sum(loss_fn(residuals))
    }
  }
  
  result <- pelt_algorithm(data_w, cost_robust, penalty, minseglen, type)
  
  if (length(result$changepoints) > 0 && has_missing) {
    result$changepoints <- valid_idx[result$changepoints]
  }
  
  result$winsorized <- TRUE
  result$winsor_prob <- robust_params$winsor_prob
  result
}

#' Core PELT algorithm
#' @noRd
pelt_algorithm <- function(data, costfn, penalty, minseglen, type) {
  n <- length(data)
  
  pen <- compute_penalty_enhanced(penalty, n, type)
  
  F <- rep(Inf, n + 1)
  F[1] <- -pen
  cp <- rep(0L, n + 1)
  
  candidates <- 0L
  
  for (t in minseglen:n) {
    costs <- sapply(candidates, function(s) {
      if (t - s >= minseglen) {
        F[s + 1] + costfn(s, t) + pen
      } else {
        Inf
      }
    })
    
    if (length(costs) > 0) {
      best_idx <- which.min(costs)
      F[t + 1] <- costs[best_idx]
      cp[t + 1] <- candidates[best_idx]
      
      keep <- costs <= F[t + 1] + pen
      candidates <- candidates[keep]
    }
    
    candidates <- c(candidates, t - minseglen + 1L)
    candidates <- candidates[candidates >= 0]
  }
  
  changepoints <- integer(0)
  t <- n
  while (t > 0) {
    tau <- cp[t + 1]
    if (tau > 0) changepoints <- c(tau, changepoints)
    t <- tau
  }
  
  list(
    changepoints = as.integer(changepoints),
    n_changepoints = length(changepoints),
    cost = F[n + 1],
    information_criterion = NULL
  )
}

#' @noRd
pelt_multivariate_enhanced <- function(data, type, penalty, minseglen) {
  n <- nrow(data)
  d <- ncol(data)
  
  pen <- compute_penalty_enhanced(penalty, n, "both") * d
  
  mv_cost <- function(start, end) {
    if (end <= start) return(Inf)
    seg_n <- end - start
    if (seg_n < d + 1) return(Inf)
    
    segment <- data[(start + 1):end, , drop = FALSE]
    S <- cov(segment)
    det_val <- det(S)
    if (det_val <= 0) det_val <- .Machine$double.eps
    seg_n * (d * log(2 * pi) + log(det_val) + d)
  }
  
  F <- rep(Inf, n + 1)
  F[1] <- -pen
  cp <- rep(0L, n + 1)
  
  for (t in minseglen:n) {
    for (s in 0:(t - minseglen)) {
      cost <- F[s + 1] + mv_cost(s, t) + pen
      if (cost < F[t + 1]) {
        F[t + 1] <- cost
        cp[t + 1] <- s
      }
    }
  }
  
  changepoints <- integer(0)
  t <- n
  while (t > 0) {
    tau <- cp[t + 1]
    if (tau > 0) changepoints <- c(tau, changepoints)
    t <- tau
  }
  
  list(
    changepoints = as.integer(changepoints),
    n_changepoints = length(changepoints),
    cost = F[n + 1],
    ar_coefficient = NA_real_,
    prewhitened = FALSE,
    robust = FALSE,
    information_criterion = NULL
  )
}


#' Binary Segmentation Changepoint Detection
#'
#' @description
#' Recursive binary segmentation for changepoint detection.
#' Greedy algorithm that recursively splits at the best changepoint.
#'
#' @param data Numeric vector or matrix
#' @param type Type of change to detect
#' @param penalty Penalty for adding changepoints
#' @param min_segment Minimum segment length
#' @param n_changepoints Maximum number of changepoints to detect
#' @param threshold Significance threshold
#' @param ... Additional arguments
#'
#' @return List with changepoints
#'
#' @examples
#' data <- c(rnorm(100), rnorm(100, mean = 2))
#' result <- binary_segmentation(data)
#'
#' @export
binary_segmentation <- function(data, type = "both", penalty = "BIC",
                                min_segment = 2, n_changepoints = "multiple",
                                threshold = NULL, ...) {
  detect_binseg(data, type, penalty, min_segment, n_changepoints, threshold, ...)
}

#' @noRd
detect_binseg <- function(data, type = "both", penalty = "BIC",
                          min_segment = 2, n_changepoints = "multiple",
                          threshold = NULL, ...) {
  n <- if (is.matrix(data)) nrow(data) else length(data)
  d <- if (is.matrix(data)) ncol(data) else 1
  
  pen <- compute_penalty(penalty, n, d)
  
  max_cp <- if (is.numeric(n_changepoints)) {
    n_changepoints
  } else if (n_changepoints == "single") {
    1
  } else {
    floor(n / (2 * min_segment))
  }
  
  binseg_recursive <- function(start, end, current_cps) {
    if (length(current_cps) >= max_cp) return(current_cps)
    if (end - start + 1 < 2 * min_segment) return(current_cps)
    
    best_gain <- -Inf
    best_split <- NULL
    
    full_cost <- segment_cost(data, start, end, type)
    
    for (k in (start + min_segment - 1):(end - min_segment)) {
      cost_left <- segment_cost(data, start, k, type)
      cost_right <- segment_cost(data, k + 1, end, type)
      
      gain <- full_cost - cost_left - cost_right - pen
      
      if (gain > best_gain) {
        best_gain <- gain
        best_split <- k
      }
    }
    
    if (!is.null(threshold)) {
      if (best_gain < threshold) return(current_cps)
    } else {
      if (best_gain < 0) return(current_cps)
    }
    
    if (!is.null(best_split)) {
      current_cps <- c(current_cps, best_split)
      current_cps <- binseg_recursive(start, best_split, current_cps)
      current_cps <- binseg_recursive(best_split + 1, end, current_cps)
    }
    
    return(current_cps)
  }
  
  changepoints <- binseg_recursive(1, n, integer(0))
  changepoints <- sort(unique(changepoints))
  
  list(
    changepoints = changepoints,
    n_changepoints = length(changepoints),
    information_criterion = list(
      BIC = sum_segment_costs(data, changepoints, type) +
        (length(changepoints) + 1) * 2 * log(n),
      AIC = sum_segment_costs(data, changepoints, type) +
        (length(changepoints) + 1) * 2 * 2
    )
  )
}


#' Wild Binary Segmentation
#'
#' @description
#' Detects multiple changepoints using the Wild Binary Segmentation algorithm.
#' Uses random intervals to improve detection in long time series.
#'
#' @param data Numeric vector or matrix of time series data
#' @param type Type of change to detect: "mean", "variance", or "both"
#' @param penalty Penalty for model complexity: "BIC", "AIC", "MBIC", or numeric
#' @param min_segment Minimum segment length
#' @param n_changepoints Expected number of changepoints: "single", "multiple", or integer
#' @param M Number of random intervals to draw
#' @param threshold Detection threshold for CUSUM statistic. If NULL, automatically determined
#' @param ... Additional arguments
#'
#' @return A list with:
#'   \item{changepoints}{Vector of detected changepoint locations}
#'   \item{n_changepoints}{Number of changepoints detected}
#'   \item{information_criterion}{BIC value for the segmentation}
#'
#' @references
#' Fryzlewicz, P. (2014). Wild Binary Segmentation for multiple change-point detection.
#' Annals of Statistics, 42(6), 2243-2281.
#'
#' @examples
#' data <- c(rnorm(100), rnorm(100, mean = 2), rnorm(100))
#' result <- wild_binary_segmentation(data)
#'
#' @export
wild_binary_segmentation <- function(data, type = "both", penalty = "BIC",
                                     min_segment = 2, n_changepoints = "multiple",
                                     M = 5000, threshold = NULL, ...) {
  detect_wbs(data, type, penalty, min_segment, n_changepoints, M, threshold, ...)
}

#' @noRd
detect_wbs <- function(data, type = "both", penalty = "BIC",
                       min_segment = 2, n_changepoints = "multiple",
                       M = 5000, threshold = NULL, ...) {
  
  if (is.matrix(data) && ncol(data) == 1) data <- as.vector(data)
  n <- length(data)
  
  if (n < 2 * min_segment) {
    return(list(
      changepoints = integer(0),
      n_changepoints = 0L,
      information_criterion = NULL
    ))
  }
  
  
  if (type == "mean") {
    sigma_est <- mad(data, constant = 1.4826, na.rm = TRUE)
  } else {
    diffs <- diff(data)
    sigma_diff <- mad(diffs, constant = 1.4826) / sqrt(2)
    sigma_mad <- mad(data, constant = 1.4826)
    sigma_est <- if (sigma_diff > 0.1 * sigma_mad && sigma_diff < 10 * sigma_mad) {
      sigma_diff
    } else {
      sigma_mad
    }
  }
  
  if (sigma_est < 1e-10) sigma_est <- sd(data)
  if (sigma_est < 1e-10) sigma_est <- 1
  
  if (is.null(threshold)) {
    if (type == "mean") {
      threshold <- sqrt(2 * log(n))
    } else {
      threshold <- 0.8 * sqrt(2 * log(n))
    }
  }
  
  cumsum_data <- c(0, cumsum(data))
  
  compute_cusum_interval <- function(s, e) {
    if (e - s + 1 < 2 * min_segment) return(list(best_k = NA, best_stat = -Inf))
    
    len <- e - s + 1
    Cs <- cumsum_data[s]
    Ce <- cumsum_data[e + 1]
    
    best_stat <- -Inf
    best_k <- NA
    
    for (k in (s + min_segment - 1):(e - min_segment)) {
      n_left <- k - s + 1
      n_right <- e - k
      
      Ck <- cumsum_data[k + 1]
      
      mean_diff <- (Ck - Cs)/n_left - (Ce - Ck)/n_right
      stat <- sqrt(n_left * n_right / len) * abs(mean_diff) / sigma_est
      
      if (stat > best_stat) {
        best_stat <- stat
        best_k <- k
      }
    }
    list(best_k = best_k, best_stat = best_stat)
  }
  
  generate_hybrid_intervals <- function(n, M, min_seg) {
    min_len <- 2 * min_seg
    if (n <= min_len) return(matrix(c(1, n), nrow = 1))
    
    intervals <- matrix(NA, nrow = 0, ncol = 2)
    
    generate_seeded <- function(s, e, depth = 0) {
      if (depth > 12) return(NULL)
      len <- e - s + 1
      if (len < min_len) return(NULL)
      
      result <- matrix(c(s, e), nrow = 1)
      mid <- s + floor(len / 2)
      
      if (mid - s + 1 >= min_len) {
        left <- generate_seeded(s, mid, depth + 1)
        if (!is.null(left)) result <- rbind(result, left)
      }
      if (e - mid >= min_len) {
        right <- generate_seeded(mid + 1, e, depth + 1)
        if (!is.null(right)) result <- rbind(result, right)
      }
      result
    }
    
    seeded <- generate_seeded(1, n)
    if (!is.null(seeded)) intervals <- rbind(intervals, seeded)
    
    n_random <- max(M - nrow(intervals), floor(M * 0.7))
    
    if (n_random > 0) {
      random_int <- matrix(NA, nrow = n_random, ncol = 2)
      for (i in seq_len(n_random)) {
        s <- sample.int(n - min_len + 1, 1)
        e <- sample((s + min_len - 1):n, 1)
        random_int[i, ] <- c(s, e)
      }
      intervals <- rbind(intervals, random_int)
    }
    
    intervals <- unique(intervals)
    if (!any(intervals[,1] == 1 & intervals[,2] == n)) {
      intervals <- rbind(intervals, c(1, n))
    }
    intervals
  }
  
  all_intervals <- generate_hybrid_intervals(n, M, min_segment)
  
  max_cp_limit <- if (is.numeric(n_changepoints)) n_changepoints else floor(n / min_segment)
  
  
  if (type == "mean") {
    wbs_recursive_mean <- function(start, end, current_cps, depth = 0) {
      if (length(current_cps) >= max_cp_limit) return(current_cps)
      if (end - start + 1 < 2 * min_segment) return(current_cps)
      if (depth > 50) return(current_cps)
      
      valid_idx <- which(all_intervals[, 1] >= start & all_intervals[, 2] <= end &
                           (all_intervals[, 2] - all_intervals[, 1] + 1) >= 2 * min_segment)
      
      if (length(valid_idx) == 0) {
        intervals_to_use <- matrix(c(start, end), nrow = 1)
      } else {
        intervals_to_use <- all_intervals[valid_idx, , drop = FALSE]
      }
      
      best_stat <- -Inf
      best_k <- NA
      
      for (i in seq_len(nrow(intervals_to_use))) {
        res <- compute_cusum_interval(intervals_to_use[i, 1], intervals_to_use[i, 2])
        if (!is.na(res$best_stat) && res$best_stat > best_stat) {
          best_stat <- res$best_stat
          best_k <- res$best_k
        }
      }
      
      if (best_stat > threshold && !is.na(best_k)) {
        current_cps <- c(current_cps, best_k)
        current_cps <- wbs_recursive_mean(start, best_k, current_cps, depth + 1)
        current_cps <- wbs_recursive_mean(best_k + 1, end, current_cps, depth + 1)
      }
      return(current_cps)
    }
    
    found_cps <- wbs_recursive_mean(1, n, integer(0))
    changepoints <- sort(unique(found_cps))
    
  } else {
    all_candidates <- list()
    
    wbs_recursive_both <- function(start, end, depth = 0) {
      if (end - start + 1 < 2 * min_segment) return(NULL)
      if (depth > 50) return(NULL)
      
      valid_idx <- which(all_intervals[, 1] >= start & all_intervals[, 2] <= end &
                           (all_intervals[, 2] - all_intervals[, 1] + 1) >= 2 * min_segment)
      
      if (length(valid_idx) == 0) {
        intervals_to_use <- matrix(c(start, end), nrow = 1)
      } else {
        intervals_to_use <- all_intervals[valid_idx, , drop = FALSE]
      }
      
      best_stat <- -Inf
      best_k <- NA
      
      for (i in seq_len(nrow(intervals_to_use))) {
        res <- compute_cusum_interval(intervals_to_use[i, 1], intervals_to_use[i, 2])
        if (!is.na(res$best_stat) && res$best_stat > best_stat) {
          best_stat <- res$best_stat
          best_k <- res$best_k
        }
      }
      
      if (!is.na(best_k) && best_stat > threshold) {
        all_candidates[[length(all_candidates) + 1]] <<- list(
          position = best_k,
          statistic = best_stat
        )
        wbs_recursive_both(start, best_k, depth + 1)
        wbs_recursive_both(best_k + 1, end, depth + 1)
      }
      NULL
    }
    
    wbs_recursive_both(1, n)
    
    if (length(all_candidates) == 0) {
      changepoints <- integer(0)
    } else {
      positions <- sapply(all_candidates, `[[`, "position")
      statistics <- sapply(all_candidates, `[[`, "statistic")
      
      ord <- order(statistics, decreasing = TRUE)
      positions <- positions[ord]
      unique_pos <- unique(positions)
      
      compute_bic <- function(cps) {
        cost <- sum_segment_costs(data, cps, type)
        n_params <- 2 * (length(cps) + 1)
        cost + n_params * log(n)
      }
      
      selected <- integer(0)
      best_bic <- compute_bic(selected)
      
      for (cp in unique_pos) {
        if (length(selected) >= max_cp_limit) break
        if (length(selected) > 0 && any(abs(selected - cp) < min_segment)) next
        
        test_cps <- sort(c(selected, cp))
        test_bic <- compute_bic(test_cps)
        
        if (test_bic < best_bic) {
          selected <- test_cps
          best_bic <- test_bic
        }
      }
      
      changepoints <- sort(unique(selected))
    }
  }
  
  total_cost <- sum_segment_costs(data, changepoints, type)
  n_params <- (length(changepoints) + 1) * (if (type == "both") 2 else 1)
  
  list(
    changepoints = as.integer(changepoints),
    n_changepoints = length(changepoints),
    information_criterion = list(
      BIC = total_cost + n_params * log(n)
    )
  )
}

#' @noRd
generate_random_intervals <- function(n, M, min_segment) {
  intervals <- matrix(NA, nrow = M, ncol = 2)
  min_len <- 2 * min_segment
  if (n <= min_len) return(matrix(c(1, n), nrow = 1))
  
  for (i in seq_len(M)) {
    s <- sample(1:(n - min_len + 1), 1)
    e <- sample((s + min_len - 1):n, 1)
    intervals[i, ] <- c(s, e)
  }
  rbind(unique(intervals), c(1, n))
}


#' @noRd
detect_ensemble <- function(data, type = "both", penalty = "BIC",
                            min_segment = 2,
                            methods = c("pelt", "binseg", "wbs"),
                            min_agreement = 2, tolerance = 5, ...) {
  results <- lapply(methods, function(m) {
    tryCatch({
      switch(m,
             pelt = detect_pelt(data, type, penalty, min_segment, ...),
             binseg = detect_binseg(data, type, penalty, min_segment, "multiple", ...),
             wbs = detect_wbs(data, type, penalty, min_segment, "multiple", ...),
             cusum = detect_cusum(data, type, 4, "offline", ...)
      )
    }, error = function(e) list(changepoints = integer(0)))
  })
  names(results) <- methods
  
  all_cp <- unlist(lapply(results, `[[`, "changepoints"))
  
  if (length(all_cp) == 0) {
    return(list(
      changepoints = integer(0),
      n_changepoints = 0,
      individual_results = results,
      information_criterion = NULL
    ))
  }
  
  consensus <- find_consensus_changepoints(all_cp, tolerance, min_agreement)
  
  list(
    changepoints = consensus,
    n_changepoints = length(consensus),
    individual_results = results,
    information_criterion = NULL
  )
}

#' @noRd
find_consensus_changepoints <- function(changepoints, tolerance, min_agreement) {
  changepoints <- changepoints[!is.na(changepoints)]
  if (length(changepoints) == 0) return(integer(0))
  
  cp <- sort(changepoints)
  
  if (length(cp) == 1) {
    if (min_agreement <= 1) {
      return(as.integer(cp))
    } else {
      return(integer(0))
    }
  }
  
  clusters <- list()
  current_cluster <- c(cp[1])
  
  for (i in 2:length(cp)) {
    if (cp[i] - cp[i-1] <= tolerance) {
      current_cluster <- c(current_cluster, cp[i])
    } else {
      clusters <- c(clusters, list(current_cluster))
      current_cluster <- c(cp[i])
    }
  }
  clusters <- c(clusters, list(current_cluster))
  
  consensus <- sapply(clusters, function(cl) {
    if (length(cl) >= min_agreement) {
      round(median(cl))
    } else {
      NA
    }
  })
  
  consensus <- consensus[!is.na(consensus)]
  as.integer(consensus)
}