#' Natural Language Processing for Meta Analysis
#'
#' The \pkg{MetaNLP} package provides methods to quickly transform a
#' CSV-file with titles and abstracts to an R data frame that can be
#' used for automatic title-abstract screening using machine learning.
#'
#'
#' @import methods
#' @name MetaNLP
"_PACKAGE"


#' Create a data frame with document-term matrix
#'
#' \code{MetaNLP} is the base class of the package \pkg{MetaNLP}. An object
#' is initialized by passing the path to a CSV file. The object holds a data
#' frame whose column names are the words occurring in the titles and
#' abstracts and whose cells contain the word frequencies for each paper.
#'
#' @rdname MetaNLP
setClass("MetaNLP", slots = c(data_frame = "data.frame"))

#' @param file Either the path to the CSV file or a data frame containing the
#' abstracts
#' @param bounds An integer vector of length 2. The first value specifies
#' the minimum number of appearances of a word to become a column of the word
#' count matrix, the second value specifies the maximum number.
#' Defaults to \code{c(2, Inf)}. Note that the bounds are with respect to
#' the potentially weighted entries of the matrix.
#' @param word_length An integer vector of length 2. The first value specifies
#' the minimum number of characters of a word to become a column of the word
#' count matrix, the second value specifies the maximum number.
#' Defaults to \code{c(3, Inf)}.
#' @param language The language for lemmatization and stemming. Supported
#' languages are \code{english}, \code{french}, \code{german}, \code{russian} and
#' \code{spanish}. For non-english languages make sure that the CSV file
#' which is processed has the correct encoding.
#' @param weighting A weighting function for the entries of the document-term matrix.
#' Default is "frequency", other options are "binary" and "tf-idf".
#' @param ... Additional arguments passed on to \code{read.csv2}, e.g. when
#' "," should be used as a separator or when the encoding should be changed.
#' Only used when \code{file} is a path. See \link[utils]{read.table}.
#' @return An object of class \code{MetaNLP}
#'
#' @details
#' An object of class \code{MetaNLP} contains a slot data_frame where
#' the document-term matrix is stored as a data frame. The first column
#' \code{id_} holds the paper IDs; if a decision column was supplied, a column
#' \code{decision_} follows. The remaining columns are the word stems in
#' alphabetical order.
#' The CSV file must have a column \code{ID} to identify each paper, a column
#' \code{title} with the corresponding titles of the papers and a column
#' \code{abstract} which contains the abstracts (column names are matched
#' case-insensitively). Rows with a missing or empty title or abstract are
#' removed with a warning. If the CSV stores training data,
#' a column \code{decision} should exist, indicating whether an abstract
#' is included in the meta analysis. This column does not need to exist, because
#' there is no decision for test data yet. Allowed values in this column are
#' either "yes" and "no" or "include" and "exclude" or "maybe". The value "maybe"
#' is handled as a "yes"/"include". Internally, every value containing
#' "incl", "yes" or "maybe" (ignoring case) is mapped to "include" and every
#' other value to "exclude".
#'
#' @examples
#' path <- system.file("extdata", "test_data.csv", package = "MetaNLP", mustWork = TRUE)
#' obj <- MetaNLP(path)
#'
#' @note
#' To ensure correct processing of the data when there are special characters
#' (e.g. "é" or "ü"), make sure that the CSV file is correctly encoded
#' as \code{UTF-8}.
#' The stemming algorithm makes use of the C libstemmer library generated by
#' Snowball. When german texts are stemmed, umlauts are replaced by their
#' non-umlaut equivalent, so "ä" becomes "a" etc.
#'
#' @rdname MetaNLP
#' @export
MetaNLP <- function(file,
                    bounds      = c(2, Inf),
                    word_length = c(3, Inf),
                    language    = "english",
                    weighting   = "frequency",
                    ...) {
  # match language
  language <- match.arg(language, c("english", "french", "german",
                                    "russian", "spanish"), several.ok = FALSE)
  # get the lemmatization dictionary in the correct language; non-english
  # dictionaries are looked up by name in the package namespace
  if (language != "english") {
    lexicon <- get0(language, envir = asNamespace("MetaNLP"))
  } else {
    lexicon <- lexicon::hash_lemmas
  }

  # match weighting function
  weighting <- match.arg(weighting, c("frequency", "binary", "tf-idf"))
  fn <- switch(weighting,
               "frequency" = tm::weightTf,
               "binary"    = tm::weightBin,
               "tf-idf"    = tm::weightTfIdf)

  # load file
  if (is.character(file)) {
    data <- utils::read.csv2(file, header = TRUE, ...)
  } else {
    data <- as.data.frame(file)
  }

  # make column names lower case
  names(data) <- tolower(names(data))

  # check that all the necessary columns exist
  if (any(c(is.null(data$id), is.null(data$title), is.null(data$abstract)))) {
    stop("The columns 'id', 'title' and 'abstract' must exist!")
  }

  # ensure UTF-8 encoding and replace all non-convertible bytes by a space
  data$title <- iconv(data$title, to = "UTF-8", sub = " ")
  data$abstract <- iconv(data$abstract, to = "UTF-8", sub = " ")

  # only keep rows whose title and abstract are neither NA nor empty;
  # the mask is computed once and never contains NA (is.na() is checked first)
  is_blank <- function(x) is.na(x) | x == ""
  incomplete <- is_blank(data$abstract) | is_blank(data$title)
  n_exclude <- sum(incomplete)
  data <- data[!incomplete, , drop = FALSE]
  if (n_exclude > 0) {
    warning(paste(n_exclude, "row(s) was/were removed due to missing values!"))
  }

  # the tm transformations emit warnings on this corpus type, which are
  # deliberately silenced here
  temp <- suppressWarnings(
    # paste title and abstract of each paper and convert to lower case
    tolower(paste(data$title, data$abstract)) |>
      tm::VectorSource() |>
      # create corpus object
      tm::Corpus() |>
      # remove special characters
      tm::tm_map(tm::content_transformer(replaceSpecialChars),
                 language = language) |>
      # strip white space
      tm::tm_map(tm::stripWhitespace) |>
      # lemmatization of the words
      tm::tm_map(textstem::lemmatize_strings, dictionary = lexicon) |>
      # only use word stems
      tm::tm_map(tm::stemDocument, language = language) |>
      # create matrix
      tm::DocumentTermMatrix(control = list(wordLengths = word_length,
                                            weighting   = fn)) |>
      as.matrix() |>
      as.data.frame()
  )

  # only choose word stems whose (weighted) column sum lies within the bounds;
  # drop = FALSE keeps a data frame even if only a single stem remains
  term_totals <- colSums(temp)
  keep <- term_totals >= bounds[1] & term_totals <= bounds[2]
  temp <- temp[, keep, drop = FALSE]

  # order by column name
  temp <- temp[, order(names(temp)), drop = FALSE]

  if (!is.null(data$decision)) {
    # use grepl to ensure that words like "included" or "Inclusion" are treated
    # correctly
    is_include <- grepl("incl|yes|maybe", data$decision, ignore.case = TRUE)
    decision <- ifelse(is_include, "include", "exclude")

    # add columns containing the ids of the papers and the belonging decisions
    res <- cbind("id_" = data$id, "decision_" = decision, temp)
  } else {
    res <- cbind("id_" = data$id, temp)
  }

  new("MetaNLP", data_frame = res)
}


# Build the one-line summary "MetaNLP<nrow=..,ncol=..>" for a MetaNLP object.
# The string is returned, not written to the console.
setMethod("print", signature("MetaNLP"),
          function(x) {
            dims <- dim(x@data_frame)
            sprintf("MetaNLP<nrow=%i,ncol=%i>", dims[1], dims[2])
          })

# Display a MetaNLP object by writing the summary string produced by its
# print method to the console.
setMethod("show", signature("MetaNLP"),
          function(object) {
            summary_line <- print(object)
            cat(summary_line)
          })


#' Create bar plot from MetaNLP-object
#'
#' This method creates a bar plot from a MetaNLP object, displaying the most
#' frequent word stems.
#'
#' @param x A MetaNLP object to plot
#' @param y not used
#' @param n Number of bars. If fewer word stems are available, all of them
#' are displayed.
#' @param decision Stratify bar plot by decision. Default is no stratification.
#' @param stop_words Boolean to decide whether stop words shall be included in
#' the summary. \code{stop_words = TRUE} means, that stop words are included.
#' @param ... Additional parameters for \code{delete_stop_words} (e.g. language
#' of the stop words).
#'
#' @examples
#' path <- system.file("extdata", "test_data.csv", package = "MetaNLP", mustWork = TRUE)
#' obj <- MetaNLP(path)
#' plt <- plot(obj)
#'
#' @note
#' Note that "most frequent" here refers to the entries
#' of the document-term matrix. If "binary" or "tf-idf" weighting was chosen,
#' the displayed values are in terms of the weighted entries.
#'
#' @return The return value of \code{\link[graphics]{barplot}}, i.e. the
#' midpoints of the drawn bars.
#'
#' @importFrom graphics barplot
#'
#' @export
setMethod("plot", signature("MetaNLP", y = "missing"),
          function(x,  y = NULL, n = 10,
                   decision = c("total", "include", "exclude"),
                   stop_words = FALSE,
                   ...) {

            # delete stop words
            if (!stop_words) {
              data <- delete_stop_words(x, ...)@data_frame
            } else {
              data <- x@data_frame
            }

            dec <- match.arg(decision)
            # check whether decision column exists and filter data
            if (dec != "total") {
              if (is.null(data$decision_)) {
                warning("Column decision_ does not exist. Bar plot is created
                          by using the whole document-term matrix.")
              } else {
                data <- data[data$decision_ == dec, , drop = FALSE]
              }
            }

            # remove the non-word columns by name, so that only word stem
            # columns remain (no positional dropping, which would discard
            # actual word stems)
            data$id_ <- NULL
            data$decision_ <- NULL

            # get the n most frequent words; never index past the number of
            # available word stems, as this would introduce NA bars
            word_totals <- sort(colSums(data), decreasing = TRUE)
            n_bars <- min(n, length(word_totals))
            # reverse so that the most frequent word is drawn at the top
            total <- rev(word_totals[seq_len(n_bars)])

            # create bar plot
            graphics::barplot(total,
                              col = "#4A90E2",
                              horiz = TRUE,
                              border = NA,
                              xlim = c(0, max(total) * 1.1),
                              xlab = "Number of appearances",
                              main = "Most frequent words",
                              las = 2,
                              cex.names = min(0.7, 8 / length(total)),
                              cex.axis = 0.8)
          })


