#' Deep Learning Methods for Changepoint Detection
#'
#' State-of-the-art neural network approaches for detecting regime changes.
#' These methods excel at capturing complex, nonlinear patterns.
#'
#' @name deeplearning-methods
#' @noRd
#' @references
#' Deldari, S., et al. (2021). Time Series Change Point Detection with
#'   Self-Supervised Contrastive Predictive Coding
#'
#' Truong, C., et al. (2020). Selective Review of Offline Change Point
#'   Detection Methods
#'
#' Aminikhanghahi, S., and Cook, D. J. (2017). A Survey of Methods for
#'   Time Series Change Point Detection
NULL

#' Verify that the deep learning backend is usable
#'
#' Checks that the optional keras and tensorflow packages are installed and
#' that a TensorFlow constant can actually be created. Aborts with an
#' actionable message otherwise.
#'
#' @return `TRUE`, invisibly, when every check passes.
#' @noRd
check_dl_deps <- function() {
  required <- c("keras", "tensorflow")
  installed <- vapply(required, requireNamespace, logical(1), quietly = TRUE)
  absent <- required[!installed]

  if (length(absent) > 0) {
    cli::cli_abort(c(
      "Deep learning methods require additional packages.",
      "x" = "Not installed: {.pkg {absent}}.",
      "i" = "Please install the keras and tensorflow packages.",
      "i" = "Then run keras::install_keras() to set up the backend."
    ))
  }

  # Packages being installed does not imply a working Python backend, so
  # force one trivial TensorFlow operation.
  tryCatch({
    tensorflow::tf$constant(1)
  }, error = function(e) {
    # Chain the original condition so the real cause stays visible.
    cli::cli_abort(
      c(
        "TensorFlow not properly configured.",
        "i" = "Run keras::install_keras() to set up TensorFlow."
      ),
      parent = e
    )
  })

  invisible(TRUE)
}

#' Slice a series into fixed-length sliding windows
#'
#' @param data Vector to window.
#' @param window_size Number of observations per window.
#' @param step Offset between the starts of consecutive windows.
#' @param normalize If `TRUE`, standardise all windows jointly with the mean
#'   and standard deviation of the whole window matrix. Scaling is skipped
#'   when that standard deviation is not positive.
#' @return List with `X` (one window per row), `indices` (position in `data`
#'   of the last element of each window), `mean` and `sd` (the statistics
#'   computed for normalisation, or 0 and 1 when `normalize` is `FALSE`).
#' @noRd
create_windows <- function(data, window_size, step = 1, normalize = TRUE) {
  n_windows <- (length(data) - window_size) %/% step + 1

  if (n_windows < 1) {
    cli::cli_abort("Data too short for window size {window_size}")
  }

  # First and last position of every window, computed in one shot.
  first_pos <- (seq_len(n_windows) - 1) * step + 1
  last_pos <- first_pos + window_size - 1

  # positions[i, j] is the index in `data` of the j-th element of window i.
  positions <- outer(first_pos, seq_len(window_size) - 1, `+`)
  X <- matrix(0, nrow = n_windows, ncol = window_size)
  X[] <- data[as.vector(positions)]

  center <- 0
  spread <- 1
  if (normalize) {
    center <- mean(X)
    spread <- sd(X)
    if (spread > 0) {
      X <- (X - center) / spread
    }
  }

  list(X = X, indices = last_pos, mean = center, sd = spread)
}

#' Build a labelled sliding-window dataset
#'
#' Produces every length-`window_size` window of `data` (stride 1) together
#' with a 0/1 label that is 1 when a known changepoint lies within
#' `window_size %/% 4` positions of the window centre
#' (`start + window_size %/% 2`).
#'
#' @param data Vector holding the series.
#' @param true_changepoints Positions of known changepoints.
#' @param window_size Number of observations per window.
#' @return List with `X` (one unnormalised window per row) and `y` (numeric
#'   0/1 label per window).
#' @noRd
create_seq2seq_dataset <- function(data, true_changepoints, window_size) {
  n <- length(data)
  n_windows <- n - window_size + 1

  # Without this guard a short series reaches matrix() with a non-positive
  # row count and fails with an unrelated-looking error.
  if (n_windows < 1) {
    cli::cli_abort("Data too short for window size {window_size}")
  }

  starts <- seq_len(n_windows)
  positions <- outer(starts, seq_len(window_size) - 1, `+`)
  X <- matrix(0, nrow = n_windows, ncol = window_size)
  X[] <- data[as.vector(positions)]

  centers <- starts + window_size %/% 2
  tolerance <- window_size %/% 4
  y <- vapply(
    centers,
    function(center) {
      as.numeric(any(abs(true_changepoints - center) <= tolerance))
    },
    numeric(1)
  )

  list(X = X, y = y)
}

#' Autoencoder-based Changepoint Detection
#'
#' Trains an autoencoder on sliding windows of the series and flags windows
#' whose reconstruction error exceeds a threshold. Flagged windows are
#' mapped to their centre position and nearby detections are merged.
#'
#' @param data Numeric vector of time series data. Must contain at least
#'   `2 * window_size` observations.
#' @param window_size Size of sliding window (default: 50)
#' @param latent_dim Dimension of latent space (default: 10)
#' @param hidden_dims Hidden layer dimensions (default: c(32, 16))
#' @param epochs Maximum number of training epochs (default: 100). Training
#'   stops early after 10 epochs without improvement in the training loss.
#' @param batch_size Batch size (default: 32)
#' @param threshold Reconstruction-error threshold. If `NULL`, the larger of
#'   the `1 - contamination` quantile and `mean + 3 * sd` of the
#'   reconstruction errors is used.
#' @param contamination Expected proportion of anomalous windows, used for
#'   the quantile part of the automatic threshold (default: 0.1)
#' @param variational Use Variational Autoencoder (default: FALSE)
#' @param verbose Show training progress (default: FALSE)
#'
#' @return List with:
#'   \item{changepoints}{Detected changepoint locations}
#'   \item{reconstruction_error}{Per-window reconstruction error}
#'   \item{threshold}{Threshold used for detection}
#'   \item{time_indices}{Position in `data` of the last element of each
#'     window}
#'   \item{model}{Trained Keras model}
#'   \item{history}{Object returned by the training call}
#'
#' @export
#'
#' @examples
#' \dontrun{
#' data <- c(rnorm(100), rnorm(100, mean = 3), rnorm(100))
#' result <- autoencoder_detect(data, window_size = 30)
#' # Changepoints are reported at window centres (window end - 30 %/% 2).
#' plot(result$time_indices - 15, result$reconstruction_error, type = "l")
#' abline(v = result$changepoints, col = "red")
#' }
autoencoder_detect <- function(data,
                               window_size = 50,
                               latent_dim = 10,
                               hidden_dims = c(32, 16),
                               epochs = 100,
                               batch_size = 32,
                               threshold = NULL,
                               contamination = 0.1,
                               variational = FALSE,
                               verbose = FALSE) {
  check_dl_deps()

  n <- length(data)
  if (n < window_size * 2) {
    cli::cli_abort("Data length ({n}) too short for window size ({window_size})")
  }

  windowed <- create_windows(data, window_size, step = 1, normalize = TRUE)
  window_ends <- windowed$indices
  # Add a trailing channel axis: (windows, window_size, 1).
  inputs <- array(windowed$X, dim = c(nrow(windowed$X), window_size, 1))

  builder <- if (variational) build_vae else build_autoencoder
  model <- builder(window_size, latent_dim, hidden_dims)

  keras::compile(
    model,
    optimizer = keras::optimizer_adam(learning_rate = 0.001),
    loss = "mse"
  )

  stop_early <- keras::callback_early_stopping(
    monitor = "loss",
    patience = 10,
    restore_best_weights = TRUE
  )

  if (verbose) cli::cli_inform("Training autoencoder...")

  # The network is trained to reproduce its own input.
  history <- keras::fit(
    model,
    inputs, inputs,
    epochs = epochs,
    batch_size = batch_size,
    validation_split = 0.1,
    callbacks = list(stop_early),
    verbose = if (verbose) 1 else 0
  )

  reconstructed <- predict(model, inputs, verbose = 0)

  # Mean squared error per window (first array dimension).
  recon_error <- apply((inputs - reconstructed)^2, 1, mean)

  if (is.null(threshold)) {
    quantile_cut <- quantile(recon_error, 1 - contamination)
    sigma_cut <- mean(recon_error) + 3 * sd(recon_error)
    threshold <- max(quantile_cut, sigma_cut)
  }

  flagged <- which(recon_error > threshold)

  # Report each flagged window at its centre rather than its end.
  flagged_times <- window_ends[flagged] - window_size %/% 2

  changepoints <- if (length(flagged_times) > 0) {
    cluster_detections(flagged_times, min_gap = window_size)
  } else {
    integer(0)
  }

  list(
    changepoints = changepoints,
    reconstruction_error = recon_error,
    threshold = threshold,
    time_indices = window_ends,
    model = model,
    history = history
  )
}

#' Build a 1-D convolutional autoencoder
#'
#' The encoder applies one convolution + max-pooling stage per entry of
#' `hidden_dims`, then a dense bottleneck of size `latent_dim`. The decoder
#' expands back with strided transposed convolutions and finishes with a
#' dense layer that yields exactly `input_dim` outputs.
#'
#' @param input_dim Window length; input and output shape is
#'   `(input_dim, 1)`.
#' @param latent_dim Units in the bottleneck layer (named `"latent"`).
#' @param hidden_dims Filter counts, one per encoder stage. Must be
#'   non-empty, and `input_dim` must be at least `2^length(hidden_dims)`.
#' @return An uncompiled Keras model.
#' @noRd
build_autoencoder <- function(input_dim, latent_dim, hidden_dims) {
  n_stages <- length(hidden_dims)
  # Temporal length and filter count the decoder starts from.
  bottleneck_len <- input_dim %/% (2^n_stages)
  bottleneck_filters <- hidden_dims[n_stages]

  # A zero-length bottleneck would request a dense layer with 0 units and
  # fail deep inside the backend; report it up front instead.
  if (n_stages < 1 || bottleneck_len < 1) {
    cli::cli_abort(c(
      "Cannot build the autoencoder for this window size.",
      "i" = "{.arg hidden_dims} must be non-empty and {.arg input_dim} ({input_dim}) must be at least {2^n_stages}."
    ))
  }

  input_layer <- keras::layer_input(shape = c(input_dim, 1))

  x <- input_layer
  for (units in hidden_dims) {
    x <- x %>%
      keras::layer_conv_1d(
        filters = units,
        kernel_size = 3,
        padding = "same",
        activation = "relu"
      ) %>%
      keras::layer_max_pooling_1d(pool_size = 2, padding = "same")
  }

  x <- x %>%
    keras::layer_flatten() %>%
    keras::layer_dense(units = latent_dim, activation = "relu", name = "latent")

  x <- x %>%
    keras::layer_dense(units = bottleneck_len * bottleneck_filters) %>%
    keras::layer_reshape(target_shape = c(bottleneck_len, bottleneck_filters))

  for (i in rev(seq_along(hidden_dims))) {
    x <- x %>%
      keras::layer_conv_1d_transpose(
        filters = hidden_dims[i],
        kernel_size = 3,
        strides = 2,
        padding = "same",
        activation = "relu"
      )
  }

  # Difference between the requested length and the decoder's current
  # length; the tail is trimmed by that amount. The dense layer below then
  # maps whatever length remains back to exactly `input_dim`.
  length_gap <- abs(input_dim - keras::k_int_shape(x)[[2]])
  x <- x %>%
    keras::layer_conv_1d(filters = 1, kernel_size = 3, padding = "same") %>%
    keras::layer_cropping_1d(cropping = c(0, length_gap))

  output_layer <- keras::layer_dense(
    keras::layer_flatten()(x),
    units = input_dim
  ) %>%
    keras::layer_reshape(target_shape = c(input_dim, 1))

  keras::keras_model(input_layer, output_layer)
}

#' Build a dense variational autoencoder
#'
#' The encoder flattens the window and passes it through one dense layer per
#' entry of `hidden_dims`, then emits `z_mean` and `z_log_var`. A latent
#' sample is drawn from those and fed to a separate decoder model that
#' mirrors the encoder and reshapes its output to `(input_dim, 1)`. A KL
#' term is attached to the returned model with `add_loss()`.
#'
#' @param input_dim Window length; input and output shape is
#'   `(input_dim, 1)`.
#' @param latent_dim Size of the latent vector.
#' @param hidden_dims Units of the encoder dense layers (the decoder uses
#'   them in reverse order).
#' @return An uncompiled Keras model carrying the extra KL loss.
#' @noRd
build_vae <- function(input_dim, latent_dim, hidden_dims) {
  input_layer <- keras::layer_input(shape = c(input_dim, 1))
  
  # Encoder: flatten, then a stack of ReLU dense layers.
  x <- input_layer %>% keras::layer_flatten()
  for (units in hidden_dims) {
    x <- x %>% keras::layer_dense(units = units, activation = "relu")
  }
  
  # Two parallel heads parameterise the latent distribution.
  z_mean <- x %>% keras::layer_dense(units = latent_dim, name = "z_mean")
  z_log_var <- x %>% keras::layer_dense(units = latent_dim, name = "z_log_var")
  
  # Reparameterisation: z = mean + exp(0.5 * log_var) * epsilon, with
  # epsilon drawn by k_random_normal().
  # NOTE(review): `c(batch, dim)` combines a tensor with an R integer;
  # confirm k_random_normal() accepts that shape on the installed keras.
  sampling <- function(args) {
    z_mean <- args[[1]]
    z_log_var <- args[[2]]
    batch <- keras::k_shape(z_mean)[1]
    dim <- keras::k_int_shape(z_mean)[[2]]
    epsilon <- keras::k_random_normal(shape = c(batch, dim))
    z_mean + keras::k_exp(0.5 * z_log_var) * epsilon
  }
  
  z <- keras::layer_lambda(f = sampling)(list(z_mean, z_log_var))
  
  # Decoder is built as its own model on a fresh latent-sized input and then
  # applied to the sampled z.
  decoder_input <- keras::layer_input(shape = c(latent_dim))
  x <- decoder_input
  for (units in rev(hidden_dims)) {
    x <- x %>% keras::layer_dense(units = units, activation = "relu")
  }
  decoder_output <- x %>%
    keras::layer_dense(units = input_dim) %>%
    keras::layer_reshape(target_shape = c(input_dim, 1))
  
  decoder <- keras::keras_model(decoder_input, decoder_output)
  
  output_layer <- decoder(z)
  
  vae <- keras::keras_model(input_layer, output_layer)
  
  # KL term, averaged over the last axis (axis = -1L), registered on the
  # model in addition to whatever loss is given at compile time.
  kl_loss <- -0.5 * keras::k_mean(1 + z_log_var - keras::k_square(z_mean) -
                                    keras::k_exp(z_log_var), axis = -1L)
  vae$add_loss(kl_loss)
  
  vae
}

#' TCN-based Changepoint Detection
#'
#' Uses Temporal Convolutional Networks with dilated causal convolutions.
#' With `true_changepoints` a window classifier is trained and every window
#' is scored; without them the work is delegated to a next-value predictor
#' whose large prediction errors mark changepoints.
#'
#' @param data Numeric vector of time series data
#' @param true_changepoints Optional vector of known changepoints for
#'   supervised training. If NULL, uses unsupervised approach.
#' @param window_size Size of input window (default: 64)
#' @param n_filters Number of convolutional filters (default: 64)
#' @param kernel_size Kernel size for convolutions (default: 3)
#' @param dilations Dilation rates (default: c(1, 2, 4, 8, 16))
#' @param dropout Dropout rate (default: 0.2)
#' @param epochs Training epochs (default: 50)
#' @param threshold Probability above which a window is flagged in the
#'   supervised mode (default: 0.5). The unsupervised mode thresholds the
#'   prediction error at its 0.9 quantile instead.
#' @param verbose Show training progress (default: FALSE)
#'
#' @return In supervised mode, a list with `changepoints`, `probabilities`
#'   (one per window), `time_indices` (centre position of each window) and
#'   `model`. In unsupervised mode, the list returned by the unsupervised
#'   detector (`changepoints`, `prediction_error`, `threshold`,
#'   `time_indices`, `model`).
#'
#' @export
#'
#' @references
#' Bai, S., Kolter, J. Z., and Koltun, V. (2018). An Empirical Evaluation
#' of Generic Convolutional and Recurrent Networks for Sequence Modeling
tcn_detect <- function(data,
                       true_changepoints = NULL,
                       window_size = 64,
                       n_filters = 64,
                       kernel_size = 3,
                       dilations = c(1, 2, 4, 8, 16),
                       dropout = 0.2,
                       epochs = 50,
                       threshold = 0.5,
                       verbose = FALSE) {
  check_dl_deps()

  if (is.null(true_changepoints)) {
    return(tcn_detect_unsupervised(data, window_size, n_filters, kernel_size,
                                   dilations, dropout, epochs, threshold, verbose))
  }

  n <- length(data)
  if (n < window_size) {
    cli::cli_abort("Data length ({n}) must be >= window_size ({window_size})")
  }

  # Labels come from the seq2seq helper (one per stride-1 window).
  y <- create_seq2seq_dataset(data, true_changepoints, window_size)$y

  # Train and score on the SAME normalised windows. With step = 1 these are
  # the same windows, in the same order, as the ones the labels refer to;
  # training on raw values and predicting on normalised ones would feed the
  # network a different input scale at inference time.
  windows <- create_windows(data, window_size, normalize = TRUE)
  X <- array(windows$X, dim = c(nrow(windows$X), window_size, 1))

  model <- build_tcn(window_size, n_filters, kernel_size, dilations, dropout)

  model %>% keras::compile(
    optimizer = keras::optimizer_adam(learning_rate = 0.001),
    loss = "binary_crossentropy",
    metrics = c("accuracy")
  )

  # Up-weight the positive class by the negative/positive ratio.
  n_pos <- sum(y)
  n_neg <- length(y) - n_pos
  class_weight <- list("0" = 1.0, "1" = n_neg / max(n_pos, 1))

  if (verbose) cli::cli_inform("Training TCN...")

  model %>% keras::fit(
    X, y,
    epochs = epochs,
    batch_size = 32,
    validation_split = 0.2,
    class_weight = class_weight,
    verbose = if (verbose) 1 else 0
  )

  probs <- model %>% predict(X, verbose = 0)
  probs <- as.numeric(probs)

  window_centers <- windows$indices - window_size %/% 2

  peaks <- which(probs > threshold)
  if (length(peaks) > 0) {
    changepoints <- cluster_detections(window_centers[peaks],
                                       min_gap = window_size %/% 2)
  } else {
    changepoints <- integer(0)
  }

  list(
    changepoints = changepoints,
    probabilities = probs,
    time_indices = window_centers,
    model = model
  )
}

#' Build a TCN window classifier
#'
#' Stacks one residual block per dilation rate. Each block is two causal
#' dilated convolutions, each followed by batch normalisation and spatial
#' dropout, added to a skip connection and passed through ReLU. The head is
#' global average pooling, a 32-unit dense layer, dropout and a sigmoid
#' output.
#'
#' @param input_length Window length; input shape is `(input_length, 1)`.
#' @param n_filters Filters per convolution.
#' @param kernel_size Convolution kernel size.
#' @param dilations Dilation rate of each residual block.
#' @param dropout Dropout rate used in the blocks and the head.
#' @return An uncompiled Keras model with a single sigmoid output.
#' @noRd
build_tcn <- function(input_length, n_filters, kernel_size, dilations, dropout) {
  input_layer <- keras::layer_input(shape = c(input_length, 1))

  x <- input_layer

  for (d in dilations) {
    residual <- x

    x <- x %>%
      keras::layer_conv_1d(
        filters = n_filters,
        kernel_size = kernel_size,
        dilation_rate = d,
        padding = "causal",
        activation = "relu"
      ) %>%
      keras::layer_batch_normalization() %>%
      keras::layer_spatial_dropout_1d(rate = dropout) %>%
      keras::layer_conv_1d(
        filters = n_filters,
        kernel_size = kernel_size,
        dilation_rate = d,
        padding = "causal",
        activation = "relu"
      ) %>%
      keras::layer_batch_normalization() %>%
      keras::layer_spatial_dropout_1d(rate = dropout)

    # Project the skip path with a 1x1 convolution when its channel count
    # differs from the block output, so the two can be added.
    if (keras::k_int_shape(residual)[[3]] != n_filters) {
      residual <- residual %>%
        keras::layer_conv_1d(filters = n_filters, kernel_size = 1)
    }

    # `activation` must be passed by name: the first positional argument of
    # layer_activation() is the object the layer is applied to.
    x <- keras::layer_add(list(x, residual)) %>%
      keras::layer_activation(activation = "relu")
  }

  output_layer <- x %>%
    keras::layer_global_average_pooling_1d() %>%
    keras::layer_dense(units = 32, activation = "relu") %>%
    keras::layer_dropout(rate = dropout) %>%
    keras::layer_dense(units = 1, activation = "sigmoid")

  keras::keras_model(input_layer, output_layer)
}

#' Unsupervised TCN detection via next-value prediction
#'
#' Trains a stack of causal dilated convolutions to predict the observation
#' that follows each window, then flags positions whose squared prediction
#' error exceeds the 0.9 quantile of all errors.
#'
#' @param data Numeric vector; must be longer than `window_size`.
#' @param window_size Number of observations fed to the predictor.
#' @param n_filters,kernel_size,dilations,dropout Architecture settings for
#'   the convolution stack.
#' @param epochs Training epochs.
#' @param threshold Accepted for signature compatibility with the caller;
#'   not used here (the error quantile is the threshold).
#' @param verbose Show training progress.
#' @return List with `changepoints`, `prediction_error`, `threshold` (the
#'   error quantile used), `time_indices` (position of each predicted
#'   observation) and `model`.
#' @noRd
tcn_detect_unsupervised <- function(data, window_size, n_filters, kernel_size,
                                    dilations, dropout, epochs, threshold, verbose) {
  n <- length(data)
  n_samples <- n - window_size

  # Each sample needs a full window plus one following observation.
  if (n_samples < 1) {
    cli::cli_abort("Data length ({n}) must be > window_size ({window_size})")
  }

  # Row i holds data[i .. i + window_size - 1]; its target is the next value.
  positions <- outer(seq_len(n_samples), seq_len(window_size) - 1, `+`)
  X <- matrix(0, nrow = n_samples, ncol = window_size)
  X[] <- data[as.vector(positions)]
  y <- as.numeric(data[window_size + seq_len(n_samples)])

  X_mean <- mean(X)
  X_sd <- sd(X)
  # A constant series has zero spread; dividing by it would turn every
  # input into NaN, so fall back to centring only.
  scale_by <- if (isTRUE(X_sd > 0)) X_sd else 1
  X <- (X - X_mean) / scale_by
  y <- (y - X_mean) / scale_by

  X_3d <- array(X, dim = c(n_samples, window_size, 1))

  input_layer <- keras::layer_input(shape = c(window_size, 1))

  x <- input_layer
  for (d in dilations) {
    x <- x %>%
      keras::layer_conv_1d(
        filters = n_filters,
        kernel_size = kernel_size,
        dilation_rate = d,
        padding = "causal",
        activation = "relu"
      ) %>%
      keras::layer_spatial_dropout_1d(rate = dropout)
  }

  output_layer <- x %>%
    keras::layer_flatten() %>%
    keras::layer_dense(units = 32, activation = "relu") %>%
    keras::layer_dense(units = 1)

  model <- keras::keras_model(input_layer, output_layer)

  model %>% keras::compile(
    optimizer = keras::optimizer_adam(learning_rate = 0.001),
    loss = "mse"
  )

  if (verbose) cli::cli_inform("Training TCN predictor...")

  model %>% keras::fit(
    X_3d, y,
    epochs = epochs,
    batch_size = 32,
    validation_split = 0.1,
    verbose = if (verbose) 1 else 0
  )

  y_pred <- model %>% predict(X_3d, verbose = 0)
  pred_error <- (y - as.numeric(y_pred))^2

  # Flag the worst 10% of predictions.
  error_threshold <- quantile(pred_error, 1 - 0.1)
  peaks <- which(pred_error > error_threshold)

  if (length(peaks) > 0) {
    # Sample i predicts the observation at position i + window_size.
    changepoints <- cluster_detections(peaks + window_size, min_gap = window_size)
  } else {
    changepoints <- integer(0)
  }

  list(
    changepoints = changepoints,
    prediction_error = pred_error,
    threshold = error_threshold,
    time_indices = (window_size + 1):n,
    model = model
  )
}

#' Transformer-based Changepoint Detection
#'
#' Implements a transformer architecture inspired by TCDformer for
#' time series changepoint detection using self-attention mechanisms.
#' With `true_changepoints` a window classifier is trained; otherwise a
#' transformer autoencoder is trained and its min-max scaled reconstruction
#' error is used as the per-window score.
#'
#' @param data Numeric vector of time series data
#' @param true_changepoints Optional vector of known changepoints
#' @param window_size Input window size (default: 128)
#' @param d_model Model dimension (default: 64)
#' @param n_heads Number of attention heads (default: 4)
#' @param n_layers Number of transformer layers (default: 2)
#' @param d_ff Feed-forward dimension (default: 256)
#' @param dropout Dropout rate (default: 0.1)
#' @param epochs Training epochs (default: 50)
#' @param threshold Score above which a window is flagged (default: 0.5)
#' @param verbose Show progress (default: FALSE)
#'
#' @return List with `changepoints`, `probabilities` (one score per
#'   window), `time_indices` (centre position of each window) and `model`.
#'
#' @export
#'
#' @references
#' Wu, H., et al. (2023). TimesNet: Temporal 2D-Variation Modeling
#'
#' Zhou, H., et al. (2021). Informer: Efficient Transformer for Long
#'   Sequence Time-Series Forecasting
transformer_detect <- function(data,
                               true_changepoints = NULL,
                               window_size = 128,
                               d_model = 64,
                               n_heads = 4,
                               n_layers = 2,
                               d_ff = 256,
                               dropout = 0.1,
                               epochs = 50,
                               threshold = 0.5,
                               verbose = FALSE) {
  check_dl_deps()

  n <- length(data)

  if (n < window_size) {
    cli::cli_abort("Data length ({n}) must be >= window_size ({window_size})")
  }

  windows <- create_windows(data, window_size, normalize = TRUE)
  n_samples <- nrow(windows$X)
  window_centers <- windows$indices - window_size %/% 2
  X_3d <- array(windows$X, dim = c(n_samples, window_size, 1))

  if (!is.null(true_changepoints)) {
    # A window is positive when a known changepoint lies within a quarter
    # window of its centre.
    tolerance <- window_size %/% 4
    y <- vapply(
      window_centers,
      function(center) {
        as.numeric(any(abs(true_changepoints - center) <= tolerance))
      },
      numeric(1)
    )

    # Only the model that is actually trained gets built.
    model <- build_transformer(window_size, d_model, n_heads, n_layers, d_ff, dropout)

    model %>% keras::compile(
      optimizer = keras::optimizer_adam(learning_rate = 0.0001),
      loss = "binary_crossentropy",
      metrics = c("accuracy")
    )

    n_pos <- sum(y)
    n_neg <- length(y) - n_pos
    class_weight <- list("0" = 1.0, "1" = n_neg / max(n_pos, 1))

    if (verbose) cli::cli_inform("Training transformer...")

    model %>% keras::fit(
      X_3d, y,
      epochs = epochs,
      batch_size = min(32, n_samples),
      validation_split = 0.2,
      class_weight = class_weight,
      verbose = if (verbose) 1 else 0
    )

    probs <- model %>% predict(X_3d, verbose = 0)
    probs <- as.numeric(probs)
  } else {
    model <- build_transformer_ae(window_size, d_model, n_heads, n_layers, d_ff, dropout)

    model %>% keras::compile(
      optimizer = keras::optimizer_adam(learning_rate = 0.0001),
      loss = "mse"
    )

    if (verbose) cli::cli_inform("Training transformer autoencoder...")

    model %>% keras::fit(
      X_3d, X_3d,
      epochs = epochs,
      batch_size = min(32, n_samples),
      validation_split = 0.1,
      verbose = if (verbose) 1 else 0
    )

    X_pred <- model %>% predict(X_3d, verbose = 0)
    recon_error <- apply((X_3d - X_pred)^2, 1, mean)

    # Min-max scale to [0, 1) so `threshold` is comparable across modes;
    # the small constant avoids 0/0 when every error is equal.
    probs <- (recon_error - min(recon_error)) / (max(recon_error) - min(recon_error) + 1e-10)
  }

  peaks <- which(probs > threshold)
  if (length(peaks) > 0) {
    changepoints <- cluster_detections(window_centers[peaks],
                                       min_gap = window_size %/% 4)
  } else {
    changepoints <- integer(0)
  }

  list(
    changepoints = changepoints,
    probabilities = probs,
    time_indices = window_centers,
    model = model
  )
}

#' Build a transformer window classifier
#'
#' Projects each time step to `d_model` features, adds a fixed sinusoidal
#' positional encoding, applies `n_layers` blocks of self-attention and a
#' position-wise feed-forward network (each with a residual connection and
#' layer normalisation), then pools over time and emits one sigmoid score.
#'
#' @param input_length Window length; input shape is `(input_length, 1)`.
#' @param d_model Feature width inside the transformer blocks.
#' @param n_heads Number of attention heads; each head uses
#'   `d_model %/% n_heads` key dimensions.
#' @param n_layers Number of transformer blocks.
#' @param d_ff Hidden width of the feed-forward sub-layer.
#' @param dropout Dropout rate for attention, feed-forward and head.
#' @return An uncompiled Keras model with a single sigmoid output.
#' @noRd
build_transformer <- function(input_length, d_model, n_heads, n_layers, d_ff, dropout) {
  # input_length x d_model table: sine on even feature columns, cosine on
  # odd ones, with wavelengths growing geometrically across columns.
  sinusoid_table <- function(len, width) {
    pos <- seq_len(len) - 1
    col <- seq_len(width) - 1
    tab <- outer(pos, col, function(p, j) p / 10000^(2 * (j %/% 2) / width))
    is_even <- col %% 2 == 0
    tab[, is_even] <- sin(tab[, is_even])
    tab[, !is_even] <- cos(tab[, !is_even])
    tab
  }

  input_layer <- keras::layer_input(shape = c(input_length, 1))

  x <- input_layer %>%
    keras::layer_dense(units = d_model)

  # Without position information the attention blocks followed by global
  # average pooling cannot distinguish the order of time steps. The table
  # gets a leading axis of size 1 so it broadcasts over the batch.
  pos_encoding <- keras::k_constant(
    array(sinusoid_table(input_length, d_model),
          dim = c(1, input_length, d_model))
  )
  x <- keras::layer_lambda(f = function(t) t + pos_encoding)(x)

  for (l in seq_len(n_layers)) {
    attn_output <- x %>%
      keras::layer_multi_head_attention(
        num_heads = n_heads,
        key_dim = d_model %/% n_heads,
        dropout = dropout
      )(., .)

    x <- keras::layer_add(list(x, attn_output))
    x <- keras::layer_layer_normalization()(x)

    ff_output <- x %>%
      keras::layer_dense(units = d_ff, activation = "relu") %>%
      keras::layer_dropout(rate = dropout) %>%
      keras::layer_dense(units = d_model)

    x <- keras::layer_add(list(x, ff_output))
    x <- keras::layer_layer_normalization()(x)
  }

  output_layer <- x %>%
    keras::layer_global_average_pooling_1d() %>%
    keras::layer_dense(units = d_ff %/% 4, activation = "relu") %>%
    keras::layer_dropout(rate = dropout) %>%
    keras::layer_dense(units = 1, activation = "sigmoid")

  keras::keras_model(input_layer, output_layer)
}

#' Build a transformer autoencoder
#'
#' Projects each time step to `d_model` features, runs `n_layers` encoder
#' blocks followed by `n_layers` identically structured decoder blocks, and
#' maps every time step back to a single value.
#'
#' @param input_length Window length; input and output shape is
#'   `(input_length, 1)`.
#' @param d_model Feature width inside the blocks.
#' @param n_heads Number of attention heads; each head uses
#'   `d_model %/% n_heads` key dimensions.
#' @param n_layers Number of blocks in each of the two stacks.
#' @param d_ff Hidden width of the feed-forward sub-layer.
#' @param dropout Accepted for signature parity with the classifier builder;
#'   not used by this model.
#' @return An uncompiled Keras model.
#' @noRd
build_transformer_ae <- function(input_length, d_model, n_heads, n_layers, d_ff, dropout) {
  head_dim <- d_model %/% n_heads

  # One block: self-attention and a feed-forward network, each wrapped in a
  # residual connection followed by layer normalisation.
  transformer_block <- function(h) {
    self_attention <- keras::layer_multi_head_attention(
      num_heads = n_heads,
      key_dim = head_dim
    )
    attended <- self_attention(h, h)
    h <- keras::layer_layer_normalization()(keras::layer_add(list(h, attended)))

    expanded <- keras::layer_dense(h, units = d_ff, activation = "relu")
    projected <- keras::layer_dense(expanded, units = d_model)
    keras::layer_layer_normalization()(keras::layer_add(list(h, projected)))
  }

  input_layer <- keras::layer_input(shape = c(input_length, 1))
  x <- keras::layer_dense(input_layer, units = d_model)

  # Encoder stack first, then a decoder stack of the same shape.
  for (stack in c("encoder", "decoder")) {
    for (l in seq_len(n_layers)) {
      x <- transformer_block(x)
    }
  }

  output_layer <- keras::layer_dense(x, units = 1)

  keras::keras_model(input_layer, output_layer)
}

#' Contrastive Predictive Coding for Changepoint Detection
#'
#' Encodes every sliding window with a small dense encoder and flags
#' positions where the Euclidean distance between the encodings of
#' consecutive windows is unusually large.
#'
#' Note that this function contains no training step: the encoder is used
#' with its initial weights, and `n_negative`, `prediction_steps` and
#' `epochs` are accepted but currently unused.
#'
#' @param data Numeric vector
#' @param window_size Window size (default: 64)
#' @param encoding_dim Encoding dimension (default: 32)
#' @param n_negative Number of negative samples (default: 10; currently
#'   unused)
#' @param prediction_steps Future steps to predict (default: 5; currently
#'   unused)
#' @param epochs Training epochs (default: 100; currently unused)
#' @param threshold Detection threshold for representation distance. If
#'   `NULL`, `mean + 2 * sd` of the distances is used.
#' @param verbose Show progress
#'
#' @return List with `changepoints`, `encodings` (one row per window),
#'   `encoding_distance` (distance between consecutive windows),
#'   `threshold` and `time_indices` (centre position of each window).
#'
#' @export
#'
#' @references
#' Oord, A. v. d., Li, Y., and Vinyals, O. (2018). Representation Learning
#' with Contrastive Predictive Coding
cpc_detect <- function(data,
                       window_size = 64,
                       encoding_dim = 32,
                       n_negative = 10,
                       prediction_steps = 5,
                       epochs = 100,
                       threshold = NULL,
                       verbose = FALSE) {
  check_dl_deps()

  encoder <- keras::keras_model_sequential() %>%
    keras::layer_dense(units = 64, activation = "relu",
                       input_shape = c(window_size)) %>%
    keras::layer_dense(units = encoding_dim, activation = "linear")

  # The previous GRU context network was declared with
  # shape = c(NULL, encoding_dim), which R collapses to a rank-1 shape that
  # a recurrent layer cannot consume, and its output was never used. It has
  # been dropped until a training loop that needs it exists.

  windows <- create_windows(data, window_size, normalize = TRUE)
  X <- windows$X
  n_windows <- nrow(X)

  if (verbose) cli::cli_inform("Computing CPC encodings...")

  encodings <- encoder %>% predict(X, verbose = 0)

  # Euclidean distance between the encodings of consecutive windows. With a
  # single window there is nothing to compare.
  if (n_windows < 2) {
    encoding_dist <- numeric(0)
  } else {
    step_change <- encodings[-1, , drop = FALSE] -
      encodings[-n_windows, , drop = FALSE]
    encoding_dist <- sqrt(rowSums(step_change^2))
  }

  if (is.null(threshold)) {
    threshold <- mean(encoding_dist) + 2 * sd(encoding_dist)
  }

  window_centers <- windows$indices - window_size %/% 2

  # Distance k compares windows k and k + 1; attribute it to the later one.
  peaks <- which(encoding_dist > threshold) + 1

  if (length(peaks) > 0) {
    changepoints <- cluster_detections(window_centers[peaks],
                                       min_gap = window_size)
  } else {
    changepoints <- integer(0)
  }

  list(
    changepoints = changepoints,
    encodings = encodings,
    encoding_distance = encoding_dist,
    threshold = threshold,
    time_indices = window_centers
  )
}

#' Merge nearby detections into single changepoints
#'
#' Sorts and de-duplicates the detections, starts a new group whenever the
#' gap to the previous detection exceeds `min_gap`, and represents each
#' group by its rounded median.
#'
#' @param detections Vector of detected positions.
#' @param min_gap Largest gap between neighbours that still keeps them in
#'   the same group.
#' @return One position per group; `integer(0)` for empty input.
#' @noRd
cluster_detections <- function(detections, min_gap = 10) {
  if (length(detections) == 0) {
    return(integer(0))
  }

  ordered <- sort(unique(detections))
  if (length(ordered) == 1) {
    return(ordered)
  }

  # TRUE marks the first member of each group; the running sum of those
  # marks is the group number of every detection.
  starts_group <- c(TRUE, diff(ordered) > min_gap)
  groups <- unname(split(ordered, cumsum(starts_group)))

  unlist(lapply(groups, function(members) round(median(members))))
}

#' Ensemble Deep Learning Detection
#'
#' Runs several deep learning detectors on the same series and keeps the
#' changepoints that enough of them agree on. Detections from all methods
#' are pooled and clustered; a cluster is kept when at least
#' `min_agreement` methods report a changepoint within 2% of the series
#' length of it. A method that errors is reported with a warning and
#' contributes no changepoints.
#'
#' @param data Numeric vector
#' @param methods Vector of methods to use. Recognised values are
#'   `"autoencoder"`, `"tcn"`, `"transformer"` and `"cpc"`; unrecognised
#'   names are skipped with a warning. Defaults to the first three.
#' @param min_agreement Minimum number of methods that must agree
#' @param ... Additional arguments passed unchanged to every selected
#'   method
#'
#' @return List with `changepoints` (the consensus), `individual_results`
#'   (one entry per method that was run) and `agreement_counts` (a table of
#'   how often each exact position was reported).
#'
#' @export
ensemble_dl_detect <- function(data,
                               methods = c("autoencoder", "tcn", "transformer"),
                               min_agreement = 2,
                               ...) {
  known_methods <- c("autoencoder", "tcn", "transformer", "cpc")
  results <- list()
  all_cps <- list()

  for (method in methods) {
    # An unknown name would otherwise fall through switch() and vanish
    # without any feedback.
    if (!method %in% known_methods) {
      cli::cli_warn("Unknown method {.val {method}} skipped.")
      next
    }

    cli::cli_inform("Running {method}...")

    result <- tryCatch({
      switch(method,
             "autoencoder" = autoencoder_detect(data, ...),
             "tcn" = tcn_detect(data, ...),
             "transformer" = transformer_detect(data, ...),
             "cpc" = cpc_detect(data, ...)
      )
    }, error = function(e) {
      reason <- conditionMessage(e)
      cli::cli_warn("Method {method} failed: {reason}")
      list(changepoints = integer(0))
    })

    results[[method]] <- result
    all_cps[[method]] <- result$changepoints
  }

  all_cp_vec <- unlist(all_cps)
  if (length(all_cp_vec) == 0) {
    consensus_cps <- integer(0)
  } else {
    tolerance <- length(data) * 0.02
    clustered <- cluster_detections(all_cp_vec, min_gap = tolerance)

    # For each candidate, count the methods with a detection close to it.
    cp_counts <- vapply(clustered, function(cp) {
      sum(vapply(all_cps, function(cps) {
        any(abs(cps - cp) <= tolerance)
      }, logical(1)))
    }, integer(1))

    consensus_cps <- clustered[cp_counts >= min_agreement]
  }

  list(
    changepoints = consensus_cps,
    individual_results = results,
    agreement_counts = table(unlist(all_cps))
  )
}