#' Generate Automated Data Quality Audit Report
#'
#' Executes the complete anomaly detection pipeline (scoring, flagging,
#' feature-importance reason codes for the top records) and renders a PDF,
#' HTML, or DOCX report with visualizations and a prioritized audit listing.
#'
#' Besides the report itself, the function writes
#' \code{<filename>_anomalies.csv} to \code{output_dir} whenever at least one
#' record is flagged, and copies the package logo (if found) into
#' \code{output_dir}. For \code{output_format = "docx"} the intermediate
#' \code{<filename>.pdf} is also left in \code{output_dir}.
#'
#' @param data A data frame containing the data to be audited.
#' @param filename Character string for the output file (without extension).
#'   Default is "dq_audit_report".
#' @param output_dir Character string specifying the directory for the output file.
#'   If NULL (default), uses tempdir(). The directory is created if it does not
#'   exist. Users should specify a directory explicitly for production use.
#' @param output_format Character string indicating the output format.
#'   Options: "pdf" (default), "html", or "docx" (for editable Word document).
#'   Note: PDF format provides the best color rendering for heat map tables.
#'   DOCX format is generated by first creating a PDF, then converting its
#'   LaTeX source to DOCX with pandoc.
#' @param method Character string indicating the anomaly detection method.
#'   Passed to \code{score_anomaly()}. Default is "iforest".
#' @param contamination Numeric value between 0 and 1. Passed to
#'   \code{score_anomaly()} and \code{flag_top_anomalies()}. Default is 0.05.
#' @param top_n Integer indicating the number of top anomalous records to display
#'   in the prioritized audit listing. Default is 100.
#' @param id_cols Character vector of column names to exclude from scoring.
#'   Passed on via \code{score_anomaly()}.
#' @param exclude_cols Character vector of additional columns to exclude.
#'   Passed on via \code{score_anomaly()}.
#' @param ground_truth_col Character string naming a column with ground truth labels.
#'   If provided, benchmarking metrics will be included in the report.
#' @param ... Additional arguments passed to \code{score_anomaly()}.
#'
#' @return Invisibly returns the path to the generated report file.
#'
#' @export
#'
#' @examples
#' \donttest{
#' data <- data.frame(
#'   patient_id = 1:50,
#'   age = rnorm(50, 50, 15),
#'   cost = rnorm(50, 10000, 5000),
#'   gender = sample(c("M", "F"), 50, replace = TRUE)
#' )
#' # Generate HTML report (fastest, no LaTeX/pandoc required)
#' generate_audit_report(data, filename = "my_audit", output_format = "html",
#'                        output_dir = tempdir())
#' }
generate_audit_report <- function(data, filename = "dq_audit_report",
                                  output_dir = NULL, output_format = "pdf",
                                  method = "iforest", contamination = 0.05,
                                  top_n = 100, id_cols = NULL, exclude_cols = NULL,
                                  ground_truth_col = NULL, ...) {

  # ---- Validate inputs ----
  if (!is.data.frame(data)) {
    stop("data must be a data frame")
  }

  # Guard length/type first so NULL or vector input gets the intended message
  # instead of an obscure "argument is of length zero" error from `if`.
  if (!is.character(output_format) || length(output_format) != 1L ||
      !output_format %in% c("pdf", "html", "docx")) {
    stop("output_format must be 'pdf', 'html', or 'docx'")
  }

  # Fail fast: without pandoc the DOCX conversion cannot succeed, so there is
  # no point in scoring the data and rendering the intermediate PDF first.
  if (output_format == "docx" && !rmarkdown::pandoc_available()) {
    stop("pandoc is required for DOCX conversion but is not available. Please install pandoc.")
  }

  if (is.null(output_dir)) {
    output_dir <- tempdir()
  }

  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE)
  }
  if (!dir.exists(output_dir)) {
    stop("Could not create output directory: ", output_dir)
  }
  # Work with an absolute path from here on. pandoc_convert() runs in the
  # directory of its input file by default, so a relative output path would
  # otherwise be resolved against the wrong directory.
  output_dir <- normalizePath(output_dir, winslash = "/", mustWork = TRUE)

  # ---- Score and flag ----
  message("Scoring anomalies...")
  scored_data <- score_anomaly(
    data = data,
    method = method,
    contamination = contamination,
    ground_truth_col = ground_truth_col,
    id_cols = id_cols,
    exclude_cols = exclude_cols,
    ...
  )

  message("Flagging top anomalies...")
  flagged_data <- flag_top_anomalies(scored_data, contamination = contamination)

  benchmark_metrics <- attr(scored_data, "benchmark_metrics")
  metadata <- attr(scored_data, "metadata")

  # ---- Top anomalies and reason codes ----
  # Keep the row positions of the top records so the reason codes can be
  # written back to exactly those rows. Looking rows up by score value (as a
  # match() on anomaly_score would) sends every tied score to the same row.
  ranked_rows <- order(dplyr::desc(flagged_data$anomaly_score))
  top_indices <- utils::head(ranked_rows, top_n)
  top_anomalies <- dplyr::slice(flagged_data, top_indices)

  # Feature importance is only computed for the records shown in the report
  # (much faster on large datasets), limited to 10 columns via max_cols.
  message("Calculating feature importance for top ", top_n, " anomalies...")

  # Reason columns exist on the full dataset regardless; they stay NA for
  # every record outside the top listing.
  flagged_data$reason_feature <- NA_character_
  flagged_data$reason_value <- NA_character_
  flagged_data$reason_code <- NA_character_
  flagged_data$reason_deviation <- NA_real_

  if (nrow(top_anomalies) > 0) {
    top_anomalies <- calculate_feature_importance(top_anomalies, metadata, max_cols = 10)
    # Only copy back when the helper returned one row per input row;
    # otherwise positions can no longer be trusted.
    if (nrow(top_anomalies) == length(top_indices)) {
      flagged_data$reason_feature[top_indices] <- top_anomalies$reason_feature
      flagged_data$reason_value[top_indices] <- top_anomalies$reason_value
      flagged_data$reason_code[top_indices] <- top_anomalies$reason_code
      flagged_data$reason_deviation[top_indices] <- top_anomalies$reason_deviation
    }
  }

  # ---- Export flagged records ----
  # Treat NA flags as "not flagged"; a bare `== TRUE` subset would emit
  # all-NA rows for them.
  is_flagged <- !is.na(flagged_data$is_anomaly) & flagged_data$is_anomaly == TRUE
  anomalous_records <- flagged_data[is_flagged, ]
  if (nrow(anomalous_records) > 0) {
    csv_file <- file.path(output_dir, paste0(filename, "_anomalies.csv"))
    utils::write.csv(anomalous_records, file = csv_file, row.names = FALSE)
    message("Anomalous records saved to: ", csv_file)
  }

  # ---- Pick the (up to) two highest-variance numeric columns for plots ----
  numeric_cols <- metadata$numeric_cols
  numeric_cols <- numeric_cols[numeric_cols %in% names(flagged_data)]
  top_numeric_cols <- character(0)
  if (length(numeric_cols) > 0) {
    variances <- vapply(
      numeric_cols,
      function(col) stats::var(flagged_data[[col]], na.rm = TRUE),
      numeric(1)
    )
    # sort() drops NA variances (e.g. all-NA or single-row columns); head()
    # then returns only names that exist, never padding with NA.
    top_numeric_cols <- as.character(
      utils::head(names(sort(variances, decreasing = TRUE)), 2)
    )
  }

  # ---- Locate or create the Rmd template ----
  template_path <- system.file("rmarkdown", "templates", "audit_report",
                               "skeleton", "skeleton.Rmd", package = "autoFlagR")

  if (template_path == "" || !file.exists(template_path)) {
    # Installed template not found: write the built-in fallback to a temp dir
    template_dir <- tempfile("autoFlagR_template")
    dir.create(template_dir, recursive = TRUE)
    on.exit(unlink(template_dir, recursive = TRUE), add = TRUE)
    template_path <- file.path(template_dir, "skeleton.Rmd")
    create_rmd_template(template_path, output_format)
  }

  message("Generating report...")

  # Render from a private copy so intermediates never touch the installed
  # template; removed on every exit path (including errors and DOCX return).
  temp_rmd <- tempfile(fileext = ".Rmd")
  on.exit(unlink(temp_rmd), add = TRUE)
  writeLines(readLines(template_path), temp_rmd)

  # ---- Logo ----
  # Black-background logo for PDF/DOCX, transparent one for HTML
  uses_latex <- output_format == "pdf" || output_format == "docx"
  logo_filename <- if (uses_latex) "autoFlagR_hex_black.png" else "autoFlagR_hex.png"

  logo_path <- system.file("extdata", logo_filename, package = "autoFlagR")

  # Copy the logo next to the report if the package ships it
  logo_in_output <- NULL
  logo_available <- FALSE
  if (logo_path != "" && file.exists(logo_path)) {
    logo_in_output <- file.path(output_dir, logo_filename)

    same_file <- normalizePath(logo_path, mustWork = FALSE) ==
      normalizePath(logo_in_output, mustWork = FALSE)

    if (same_file) {
      logo_available <- TRUE
    } else {
      # The tryCatch value is the availability flag; assigning inside the
      # handler would only set a handler-local variable.
      logo_available <- tryCatch({
        if (file.exists(logo_in_output)) {
          file.remove(logo_in_output)
        }
        file.copy(logo_path, logo_in_output, overwrite = TRUE)
        file.exists(logo_in_output)
      }, error = function(e) {
        FALSE
      })
    }
  }

  # ---- LaTeX header (colortbl and the other packages the tables need) ----
  header_lines <- c(
    "\\usepackage{xcolor}",
    "\\usepackage{array}",
    "\\usepackage{colortbl}",
    "\\usepackage{longtable}",
    "\\usepackage{multirow}",
    "\\usepackage{wrapfig}",
    "\\usepackage{float}",
    "\\usepackage{booktabs}",
    "\\usepackage{fancyhdr}",
    "\\usepackage{graphicx}",
    "\\usepackage{tikz}",
    "\\pagestyle{fancy}",
    "\\fancyhead[L]{\\textcolor[rgb]{0.16,0.50,0.73}{\\textbf{Data Quality Audit Report}}}",
    "\\fancyhead[R]{\\textcolor[rgb]{0.17,0.24,0.31}{\\today}}",
    "\\fancyfoot[C]{\\textcolor[rgb]{0.17,0.24,0.31}{\\thepage}}",
    "\\renewcommand{\\headrulewidth}{0.5pt}",
    "\\renewcommand{\\footrulewidth}{0.5pt}",
    "\\usepackage{titlesec}",
    "\\titleformat{\\section}{\\Large\\bfseries\\color[rgb]{0.16,0.50,0.73}}{}{0em}{}[\\titlerule]",
    "\\titleformat{\\subsection}{\\large\\bfseries\\color[rgb]{0.17,0.24,0.31}}{}{0em}{}"
  )

  # Add logo to the title block if available (only for PDF/DOCX)
  if (logo_available && uses_latex) {
    # Absolute, forward-slash path so LaTeX finds the logo from any directory
    logo_abs_path <- normalizePath(logo_in_output, winslash = "/", mustWork = FALSE)
    logo_abs_path <- gsub("\\\\", "/", logo_abs_path)
    # NOTE(review): spaces are escaped as "\ "; confirm this is accepted by
    # \includegraphics on the targeted LaTeX distributions.
    logo_abs_path <- gsub(" ", "\\ ", logo_abs_path, fixed = TRUE)

    # Logo on the left, title on the right (inline, not a separate page).
    # The subtitle is already appended to @title by rmarkdown.
    header_lines <- c(
      header_lines,
      "\\makeatletter",
      "\\renewcommand{\\@maketitle}{%",
      "  \\noindent",
      "  \\begin{minipage}{0.25\\textwidth}",
      paste0("    \\includegraphics[width=\\textwidth,keepaspectratio]{", logo_abs_path, "}"),
      "  \\end{minipage}",
      "  \\hfill",
      "  \\begin{minipage}{0.7\\textwidth}",
      "    {\\huge\\bfseries\\@title\\par}",
      "    \\vspace{0.5cm}",
      "    {\\large\\@author\\par}",
      "    \\vspace{0.3cm}",
      "    {\\large\\@date\\par}",
      "  \\end{minipage}",
      "  \\vspace{1.5cm}",
      "  \\par",
      "}",
      "\\makeatother"
    )
  }

  temp_header <- tempfile(fileext = ".tex")
  on.exit(unlink(temp_header), add = TRUE)
  writeLines(header_lines, temp_header)

  # ---- Shared render configuration ----
  # keep_tex = TRUE because the DOCX path converts the LaTeX source.
  build_pdf_format <- function() {
    rmarkdown::pdf_document(
      toc = TRUE,
      toc_depth = 2,
      number_sections = TRUE,
      fig_caption = TRUE,
      highlight = "tango",
      keep_tex = TRUE,
      latex_engine = "pdflatex",
      includes = rmarkdown::includes(in_header = temp_header)
    )
  }

  render_params <- list(
    flagged_data = flagged_data,
    top_anomalies = top_anomalies,
    metadata = metadata,
    benchmark_metrics = benchmark_metrics,
    method = method,
    contamination = contamination,
    top_n = top_n,
    top_numeric_cols = top_numeric_cols
  )

  render_report <- function(format, extension) {
    rmarkdown::render(
      input = temp_rmd,
      output_format = format,
      output_dir = output_dir,
      output_file = paste0(filename, ".", extension),
      params = render_params,
      quiet = TRUE
    )
  }

  # ---- DOCX: render PDF (keeping .tex), then convert the LaTeX source ----
  if (output_format == "docx") {
    message("Generating PDF first, then converting to DOCX...")
    pdf_file <- render_report(build_pdf_format(), "pdf")

    message("Converting LaTeX source to DOCX...")
    docx_file <- file.path(output_dir, paste0(filename, ".docx"))
    tex_file <- sub("\\.pdf$", ".tex", pdf_file)

    if (!file.exists(tex_file)) {
      stop("LaTeX source file not found. Cannot convert to DOCX.")
    }

    # Put the logo next to the .tex file so pandoc can resolve it
    if (logo_available && !is.null(logo_in_output) && file.exists(logo_in_output)) {
      logo_for_pandoc <- file.path(dirname(tex_file), basename(logo_in_output))
      if (normalizePath(logo_in_output, mustWork = FALSE) !=
          normalizePath(logo_for_pandoc, mustWork = FALSE)) {
        file.copy(logo_in_output, logo_for_pandoc, overwrite = TRUE)
      }
    }

    tryCatch(
      rmarkdown::pandoc_convert(
        input = tex_file,
        to = "docx",
        output = docx_file,
        options = NULL
      ),
      error = function(e) {
        stop("Failed to convert LaTeX to DOCX: ", conditionMessage(e),
             "\nPlease ensure pandoc is installed and can convert LaTeX to DOCX.")
      }
    )

    unlink(tex_file)

    message("Report generated: ", docx_file)
    return(invisible(docx_file))
  }

  # ---- PDF / HTML ----
  if (output_format == "pdf") {
    custom_output <- build_pdf_format()

    # Also place the logo beside the temp Rmd, where LaTeX is compiled
    if (logo_available && !is.null(logo_in_output) && file.exists(logo_in_output)) {
      temp_logo <- file.path(dirname(temp_rmd), basename(logo_in_output))
      if (normalizePath(logo_in_output, mustWork = FALSE) !=
          normalizePath(temp_logo, mustWork = FALSE)) {
        tryCatch({
          file.copy(logo_in_output, temp_logo, overwrite = TRUE)
        }, error = function(e) {
          # The header already references the logo by absolute path
          warning("Could not copy logo to temp directory, will use absolute path")
        })
      }
    }
  } else {
    # For HTML, no logo needed
    custom_output <- rmarkdown::html_document(
      toc = TRUE,
      toc_depth = 2,
      number_sections = TRUE,
      fig_caption = TRUE
    )
  }

  output_file <- render_report(custom_output, output_format)

  message("Report generated: ", output_file)

  invisible(output_file)
}

#' Create R Markdown Template
#'
#' Writes the default R Markdown template for audit reports to
#' \code{template_path}. The template expects the parameters
#' \code{flagged_data}, \code{top_anomalies}, \code{metadata},
#' \code{benchmark_metrics}, \code{method}, \code{contamination},
#' \code{top_n} and \code{top_numeric_cols} to be supplied at render time.
#'
#' @param template_path Path where the template should be created. The parent
#'   directory must already exist.
#' @param output_format Output format ("pdf", "html", or "docx"). "pdf" and
#'   "docx" produce a template with \code{output: pdf_document}; any other
#'   value produces \code{output: html_document}.
#'
#' @return No return value, called for side effects. Creates an R Markdown
#'   template file at the specified path.
#'
#' @export
create_rmd_template <- function(template_path, output_format = "pdf") {

  # DOCX reports are produced from the PDF/LaTeX route, so they share the
  # pdf_document YAML header.
  yaml_output <- if (output_format == "pdf" || output_format == "docx") {
    'pdf_document'
  } else {
    'html_document'
  }

  # NOTE: the template is a single-quoted R string, so it must not contain
  # single quotes. Chunks that cat() markdown use results="asis" so headings
  # and bullets are rendered rather than shown as verbatim console output.
  template_content <- paste0('---
title: "Data Quality Audit Report"
author: "autoFlagR Package"
date: "`r Sys.Date()`"
output: ', yaml_output, '
params:
  flagged_data: NULL
  top_anomalies: NULL
  metadata: NULL
  benchmark_metrics: NULL
  method: "iforest"
  contamination: 0.05
  top_n: 100
  top_numeric_cols: NULL
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)
library(autoFlagR)
library(ggplot2)
library(dplyr)
library(gt)
```

# Executive Summary

## Key Metrics

- **Total Records Processed:** `r nrow(params$flagged_data)`
- **Total Records Flagged:** `r sum(params$flagged_data$is_anomaly, na.rm = TRUE)`
- **Anomaly Rate:** `r sprintf("%.2f%%", 100 * mean(params$flagged_data$is_anomaly, na.rm = TRUE))`
- **Detection Method:** `r toupper(params$method)`
- **Contamination Parameter:** `r params$contamination`

```{r benchmark-metrics, eval=!is.null(params$benchmark_metrics), results="asis"}
if (!is.null(params$benchmark_metrics)) {
  cat("## Benchmarking Results\n\n")
  cat("- **AUC-ROC:** ", sprintf("%.4f", params$benchmark_metrics$auc_roc), "\n")
  cat("- **AUC-PR:** ", sprintf("%.4f", params$benchmark_metrics$auc_pr), "\n")
  cat("- **Contamination Rate:** ", sprintf("%.2f%%", 100 * params$benchmark_metrics$contamination_rate), "\n\n")
  
  if (length(params$benchmark_metrics$top_k_recall) > 0) {
    cat("### Top-K Recall\n\n")
    for (k_name in names(params$benchmark_metrics$top_k_recall)) {
      k_value <- gsub("top_", "", k_name)
      recall <- params$benchmark_metrics$top_k_recall[[k_name]]
      cat("- **Top ", k_value, " Recall:** ", sprintf("%.2f%%", 100 * recall), "\n")
    }
    cat("\n")
  }
}
```

# Anomaly Score Distribution

```{r score-distribution}
ggplot(params$flagged_data, aes(x = anomaly_score)) +
  geom_histogram(bins = 50, fill = "steelblue", alpha = 0.7, color = "black") +
  geom_vline(xintercept = attr(params$flagged_data, "anomaly_threshold"),
             color = "red", linetype = "dashed", linewidth = 1) +
  labs(title = "Distribution of Anomaly Scores",
       x = "Anomaly Score",
       y = "Frequency") +
  theme_minimal()
```

# Prioritized Audit Listing

The following table shows the top `r params$top_n` most anomalous records, sorted by anomaly score (highest first).

```{r top-anomalies-table}
# Select key columns for display
display_cols <- c()
if (length(params$metadata$id_cols) > 0) {
  display_cols <- c(display_cols, params$metadata$id_cols[1])
}
display_cols <- c(display_cols, "anomaly_score")
if (length(params$top_numeric_cols) > 0) {
  display_cols <- c(display_cols, params$top_numeric_cols)
}

# Create display table
if (length(display_cols) > 0 && all(display_cols %in% names(params$top_anomalies))) {
  display_table <- params$top_anomalies %>%
    dplyr::select(dplyr::all_of(display_cols)) %>%
    dplyr::mutate(anomaly_score = round(anomaly_score, 4))
  
  # Format as gt table
  gt_table <- display_table %>%
    gt::gt() %>%
    gt::fmt_number(columns = "anomaly_score", decimals = 4) %>%
    gt::tab_header(title = paste("Top", params$top_n, "Most Anomalous Records"))
  
  # Return the table as the chunk value so knitr renders it for the
  # current output format (an explicit print would emit raw HTML text)
  gt_table
} else {
  # Fallback to simple table: anomaly_score plus the first few columns
  fallback_cols <- unique(c("anomaly_score", utils::head(names(params$top_anomalies), 5)))
  fallback_rows <- utils::head(params$top_anomalies[, fallback_cols, drop = FALSE], 20)
  knitr::kable(fallback_rows,
               caption = paste("Top", nrow(fallback_rows), "Most Anomalous Records"))
}
```

# Bivariate Anomaly Visualization

```{r bivariate-plot, eval=length(params$top_numeric_cols) >= 2}
if (length(params$top_numeric_cols) >= 2) {
  col1 <- params$top_numeric_cols[1]
  col2 <- params$top_numeric_cols[2]
  
  # .data[[name]] looks columns up by string, so non-syntactic column
  # names (spaces, symbols) work as well
  ggplot(params$flagged_data, aes(x = .data[[col1]], y = .data[[col2]])) +
    geom_point(aes(color = is_anomaly, alpha = is_anomaly), size = 1) +
    scale_color_manual(values = c("FALSE" = "gray70", "TRUE" = "red"),
                       labels = c("FALSE" = "Normal", "TRUE" = "Anomalous"),
                       name = "Status") +
    scale_alpha_manual(values = c("FALSE" = 0.3, "TRUE" = 0.8), guide = "none") +
    labs(title = paste("Anomaly Detection:", col1, "vs", col2),
         subtitle = "Red points indicate flagged anomalies",
         x = col1,
         y = col2) +
    theme_minimal() +
    theme(legend.position = "bottom")
}
```

# Variable Distribution Comparison

```{r distribution-comparison, eval=length(params$top_numeric_cols) > 0, results="asis"}
if (length(params$top_numeric_cols) > 0) {
  for (col in utils::head(params$top_numeric_cols, 2)) {
    cat("## ", col, "\n\n")
    
    normal_data <- params$flagged_data[!params$flagged_data$is_anomaly, col, drop = TRUE]
    anomaly_data <- params$flagged_data[params$flagged_data$is_anomaly, col, drop = TRUE]
    
    plot_data <- data.frame(
      value = c(normal_data, anomaly_data),
      group = rep(c("Normal", "Anomalous"), 
                  c(length(normal_data), length(anomaly_data)))
    )
    
    p <- ggplot(plot_data, aes(x = value, fill = group)) +
      geom_histogram(alpha = 0.6, position = "identity", bins = 30) +
      labs(title = paste("Distribution of", col, "by Anomaly Status"),
           x = col,
           y = "Frequency",
           fill = "Status") +
      theme_minimal()
    
    print(p)
    cat("\n\n")
  }
}
```

# Technical Appendix

## Algorithm Details

- **Method:** `r toupper(params$method)`
- **Contamination Parameter:** `r params$contamination`
- **Anomaly Threshold:** `r sprintf("%.4f", attr(params$flagged_data, "anomaly_threshold"))`
- **Package Version:** `r packageVersion("autoFlagR")`
- **Report Generation Date:** `r Sys.Date()`

## Columns Included in Scoring

### Numeric Columns
`r paste(params$metadata$numeric_cols, collapse = ", ")`

### Categorical Columns
`r if (length(params$metadata$categorical_cols) > 0) {
  paste(params$metadata$categorical_cols, collapse = ", ")
} else {
  "None"
}`

### Excluded Columns
`r if (length(params$metadata$excluded_cols) > 0) {
  paste(params$metadata$excluded_cols, collapse = ", ")
} else {
  "None"
}`

---

*Report generated by autoFlagR: AI-Driven Anomaly Detection for Data Quality*
')

  writeLines(template_content, template_path)
  invisible(NULL)
}

