### ctrdata package
### utility functions

## variable definitions
#
# EUCTR definitions
countriesEUCTR <- c(
  "AT", "BE", "BG", "HR", "CY", "CZ", "DK", "EE", "FI", "FR",
  "DE", "GR", "HU", "IE", "IT", "LV", "LT", "LU", "MT", "NL",
  "PL", "PT", "RO", "SK", "SE", "SI", "ES", "GB", "IS", "LI",
  "NO", "3RD")
#
# regexpr
# - EudraCT e.g. 2010-022945-52
regEuctr <- "[0-9]{4}-[0-9]{6}-[0-9]{2}"
# - CTGOV
regCtgov <- "NCT[0-9]{8}"
# - regIsrctn
# FIXME check if first digit is always non zero
regIsrctn <- "[1-9][0-9]{7}"
#
# register list
registerList <- c("EUCTR", "CTGOV", "ISRCTN")


#' Check and prepare nodbi connection object for ctrdata
#'
#' @param con A \link[nodbi]{src} connection object, as obtained with
#'  nodbi::\link[nodbi]{src_mongo}() or nodbi::\link[nodbi]{src_sqlite}()
#'
#' @keywords internal
#'
#' @importFrom nodbi src_sqlite
#' @importFrom utils capture.output
#'
#' @return Connection object as list, with collection
#'  element under root
#'
ctrDb <- function(
  con = nodbi::src_sqlite(
    collection = "ctrdata_auto_generated")) {

  ## sqlite
  if (inherits(con, "src_sqlite")) {

    if (is.null(con$collection)) {
      stop("In src_sqlite(), a parameter 'collection' needs to specify ",
           "the name of a table, such as src_sqlite(collection = 'test'), ",
           "for package ctrdata to work with other nosql databases.",
           call. = FALSE)
    }

    # check
    if (!RSQLite::dbIsValid(con$con)) {
      con <- nodbi::src_sqlite(dbname = con$dbname,
                               collection = con$collection)
    }

    # add database as element under root
    con <- c(con,
             "db" = con$dbname,
             "ctrDb" = TRUE)

    # print warning from nodbi::src_sqlite()
    if (grepl(":memory:", con$dbname)) {
      warning("Database not persisting,\ncopy to persistant database like ",
              "this:\n\nRSQLite::sqliteCopyDatabase(",
              "\n  from = <your in-memory-database-object>$con,",
              "\n  to = RSQLite::dbConnect(RSQLite::SQLite(),",
              "\n                          dbname = 'local_file.db'))\n",
              call. = FALSE,
              noBreaks. = FALSE,
              immediate. = TRUE)
    }

    ## return
    return(structure(con,
                     class = c("src_sqlite", "docdb_src")))
  }

  ## mongo
  if (inherits(con, "src_mongo")) {

    # rights may be insufficient to call info(),
    # hence this workaround that should always
    # work and be stable to retrieve name of
    # collection in the mongo connection
    # suppress... for reconnect info from mongolite
    coll <- suppressMessages(utils::capture.output(con$con)[1])
    coll <- sub("^.*'(.*)'.*$", "\\1", coll)

    # add collection as element under root
    con <- c(con,
             "collection" = coll,
             "ctrDb" = TRUE)

    ## return
    return(structure(con,
                     class = c("src_mongo", "docdb_src")))
  }

  ## unprepared for other nodbi adapters so far
  stop("Please specify in parameter 'con' a database connection. ",
       "crdata supports so far only src_mongo() and src_sqlite().",
       call. = FALSE)

} # end ctrDb


#' Open advanced search pages of register(s) or execute search in browser
#'
#' @param url of search results page to show in the browser.
#'   May be the output of \link{ctrGetQueryUrl} or from \link{dbQueryHistory}.
#'
#' @param register Register(s) to open. Either "EUCTR" or "CTGOV" or a vector of
#'   both. Default is to open both registers' advanced search pages. To open the
#'   browser with a previous search, the output of \link{ctrGetQueryUrl}
#'   or one row from \link{dbQueryHistory} can be used.
#'
#' @param copyright (Optional) If set to \code{TRUE}, opens copyright pages of
#'   register(s).
#'
#' @param ... May include the deprecated \code{input} parameter.
#'
#' @export
#'
#' @return Is always true, invisibly.
#'
#' @examples
#' \dontrun{
#' ctrOpenSearchPagesInBrowser(
#'  "https://www.clinicaltrialsregister.eu/ctr-search/search?query=cancer")
#'
#' # for this example, the clipboard has to
#' # contain the URL from a search in a register
#' ctrOpenSearchPagesInBrowser(
#'  ctrGetQueryUrl())
#'
#' # open the last query that was
#' # loaded into the database
#' db <- nodbi::src_sqlite(
#'   collection = "previously_created"
#' )
#' ctrOpenSearchPagesInBrowser(
#'   dbQueryHistory(con = db))
#' }
#'
ctrOpenSearchPagesInBrowser <- function(
  url = "",
  register = "",
  copyright = FALSE,
  ...) {

  ## FIXME migrate from previously used parameter "input"
  tmp <- list(...)
  tmp <- tmp[["input"]]
  if (length(tmp)) {
    url <- tmp
    warning("Parameter 'input' is deprecated, use 'url' instead.",
            call. = FALSE)
  }

  ## check combination of arguments to select action

  # - open all registers if no parameter is specified
  if (all(register == "") && all(url == "")) {
    sapply(
      c("https://www.clinicaltrialsregister.eu/ctr-search/search",
        "https://clinicaltrials.gov/ct2/search/advanced",
        "https://www.isrctn.com/editAdvancedSearch"),
      function(u) utils::browseURL(u))
  }

  # - open copyright or similar pages
  if (copyright) {
    sapply(
      c("https://www.clinicaltrialsregister.eu/disclaimer.html",
        "https://clinicaltrials.gov/ct2/about-site/terms-conditions#Use",
        "https://www.isrctn.com/page/faqs#usingISRCTN"),
      function(u) utils::browseURL(u))
  }

  # - open from url, or query and register
  if (is.atomic(url) && url != "") {
    url <- ctrGetQueryUrl(url = url, register = register)
  }

  # - get from a data frame, such as from
  #   ctrQueryHistoryInDb() or ctrGetQueryUrl()
  if (is.data.frame(url) &&
      all(substr(names(url), 1, 6) == "query-")) {
    nr <- nrow(url)
    if (nr > 1L) warning("Using last query",
                         call. = FALSE, immediate. = TRUE)
    register  <- url[nr, "query-register"]
    url <- url[nr, "query-term"]
  }

  # - open from url and register
  if (is.atomic(url) && url != "" && register != "") {
    url <- switch(
      register,
      "EUCTR" = paste0("https://www.clinicaltrialsregister.eu/ctr-search/search?", url),
      "CTGOV" = paste0("https://clinicaltrials.gov/ct2/results?", url),
      "ISRCTN" = paste0("https://www.isrctn.com/search?", url))
    utils::browseURL(url = url)
    return(url)
  }

  # return
  invisible(NULL)
}
# end ctrOpenSearchPagesInBrowser


#' Extract query parameters and register name from input or from
#' clipboard into which the URL of a register search was copied
#'
#' @param url URL such as from the browser address bar.
#' If not specified, clipboard contents will be checked for
#' a suitable URL. Can also contain a query term such as from
#' \link{dbQueryHistory}()["query-term"]
#'
#' @param register Optional name of register (i.e., "EUCTR" or
#' "CTGOV") in case url is a query term
#'
#' @return A string of query parameters that can be used to retrieve data
#' from the register.
#'
#' @export
#'
#' @return A data frame with column names query term and register name
#' that can directly be used in \link{ctrLoadQueryIntoDb} and in
#' \link{ctrOpenSearchPagesInBrowser}
#'
#' @examples
#'
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#'
#' # user now copies into the clipboard the URL from
#' # the address bar of the browser that shows results
#' # from a query in one of the trial registers
#' ctrLoadQueryIntoDb(
#'   ctrGetQueryUrl(),
#'   con = db
#' )
#' }
#'
#' @importFrom clipr read_clip
#'
ctrGetQueryUrl <- function(
  url = "",
  register = "") {
  #
  # check parameters expectations
  if (!is.atomic(url) || !is.atomic(register) ||
      is.null(url) || is.null(register) ||
      !inherits(url, "character") || !inherits(register, "character") ||
      length(url) != 1L || length(register) != 1L ||
      is.na(url) || is.na(register)) {
    stop("ctrGetQueryUrl(): 'url' and / or 'register' ",
         "is not a single character string, url: '",
         deparse(url), "', register: '", deparse(register), "'",
         call. = FALSE)
  }
  #
  # if no parameter specified,
  # check clipboard contents
  if (nchar(url) == 0L) {
    url <- suppressWarnings(
      clipr::read_clip(
        allow_non_interactive = TRUE)
    )
    if (is.null(url) || (length(url) != 1L) || (nchar(url) == 0L)) {
      stop("ctrGetQueryUrl(): no clinical trial register ",
           "search URL found in parameter 'url' or in clipboard.",
           call. = FALSE)
    }
    message("* Using clipboard content as register query URL: ", url)
  }
  #
  #
  if (register != "" && grepl("^http", url)) {
    warning("Full URL but also 'register' specified; ",
            "continuing with register = ''", immediate. = TRUE)
    register <- ""
  }
  #
  # identify domain and register short name
  if (register == "") {
    register <- switch(
      sub("^https://[w]{0,3}[.]?([a-zA-Z.]+)/.*", "\\1", url),
      "clinicaltrialsregister.eu" = "EUCTR",
      "clinicaltrials.gov" = "CTGOV",
      "isrctn.com" = "ISRCTN",
      "NONE")
  }
  #
  outdf <- function(qt, reg) {
    qt <- utils::URLdecode(qt)
    message("* Found search query from ", reg, ": ", qt)
    data.frame(
      `query-term` = qt,
      `query-register` = reg,
      check.names = FALSE,
      stringsAsFactors = FALSE)
  }
  # identify query term per register
  #
  if (register == "EUCTR") {
    # search result page
    queryterm <- sub(".*/ctr-search/search[?](.*)", "\\1", url)
    # single trial page
    queryterm <- sub(paste0(".*/ctr-search/trial/(", regEuctr, ")/.*"),
                     "\\1", queryterm)
    # remove any intrapage anchor, e.g. #tableTop
    queryterm <- sub("#.+$", "", queryterm)
    # sanity correction for naked terms
    queryterm <- sub(
      "(^|&|[&]?\\w+=\\w+&)([ a-zA-Z0-9+-]+)($|&\\w+=\\w+)",
      "\\1query=\\2\\3", queryterm)
    # check if url was for results of single trial
    if (grepl(".*/results$", url)) {
      queryterm <- paste0(queryterm, "&resultsstatus=trials-with-results")
    }
    #
    return(outdf(queryterm, register))
  }
  #
  if (register == "CTGOV") {
    # single trial page
    queryterm <- sub(paste0(".*/ct2/show/(", regCtgov, ")([?][a-z]+.*|$)"),
                     "\\1", url)
    # inform user
    if (grepl("[?][a-z]+=\\w+", url, perl = TRUE) &
        grepl(paste0("^", regCtgov, "$"), queryterm)) {
      message("* Note: 'url' shows a single trial (and is returned by the ",
              "function) but also had search parameters: If interested in ",
              "search results, click 'Return to List' in browser and use ",
              "this as 'url'.")
    }
    # search results page
    queryterm <- sub(".*/ct2/results[?](.*)", "\\1", queryterm)
    # other results page
    queryterm <- sub("(.*)&Search[a-zA-Z]*=(Search|Find)[a-zA-Z+]*",
                     "\\1", queryterm)
    # remove empty parameters
    queryterm <- gsub("[a-z_0-9]+=&", "", queryterm)
    queryterm <- sub("&[a-z_0-9]+=$", "", queryterm)
    # correct naked terms
    queryterm <- sub(
      # "(^|&|[&]?\\w+=\\w+&)(\\w+|[NCT0-9-]+)($|&\\w+=\\w+)",
      "(^|&|[&]?\\w+=\\w+&)(\\w+|[a-zA-z0-9+-.:]+)($|&\\w+=\\w+)",
      "\\1term=\\2\\3", queryterm)
    #
    return(outdf(queryterm, register))
  }
  #
  if (register == "ISRCTN") {
    # single trial page
    queryterm <- sub(paste0("^.*/ISRCTN(", regIsrctn, ")$"),
                     "ISRCTN\\1", url)
    # search results page
    queryterm <- sub(".*/search[?](.*)", "\\1", queryterm)
    # remove unnecessary parameter
    queryterm <- sub("&searchType=[a-z]+-search", "", queryterm)
    # correct naked terms
    queryterm <- sub(
      "(^|&|[&]?\\w+=\\w+&)(\\w+|[ a-zA-Z0-9+-]+)($|&\\w+=\\w+)",
      "\\1q=\\2\\3", queryterm)
    #
    return(outdf(queryterm, register))
  }
  #
  # default / NONE
  warning("ctrGetQueryUrl(): no clinical trial register ",
          "search URL found in parameter 'url' or in clipboard.",
          call. = FALSE, immediate. = TRUE)
  #
  return(invisible(NULL))
}
# end ctrGetQueryUrl


#' Import from clipboard the URL of a search in one of the registers
#'
#' @inheritParams ctrGetQueryUrl
#'
#' @return A string of query parameters that can be used to retrieve data
#' from the register.
#'
#' @export
#'
#' @return A data frame with column names query term and register name
#' that can directly be used in \link{ctrLoadQueryIntoDb} and in
#' \link{ctrOpenSearchPagesInBrowser}
#'
#' @examples
#'
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#'
#' # user now copies into the clipboard the URL from
#' # the address bar of the browser that shows results
#' # from a query in one of the trial registers
#' ctrLoadQueryIntoDb(
#'   ctrGetQueryUrlFromBrowser(),
#'   con = db
#' )
#' }
#'
#' @importFrom clipr read_clip
#'
ctrGetQueryUrlFromBrowser <- function(
  url = "",
  register = "") {

  # deprecate
  .Deprecated(new = "ctrGetQueryUrl")

  # defer call
  ctrGetQueryUrl(url = url, register = register)

}
# end ctrGetQueryUrlFromBrowser


#' Find synonyms of an active substance
#'
#' An active substance can be identified by a recommended international
#' nonproprietary name, a trade or product name, or a company code(s).
#'
#' At this time, this function uses the register ClinicalTrials.Gov to
#' detect which substances were also searched for.
#'
#' @param activesubstance An active substance, in an atomic character vector
#'
#' @return A character vector of the active substance (input parameter) and
#'  synonyms, if any were found
#'
#' @importFrom xml2 read_html
#' @importFrom rvest html_node html_table
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' ctrFindActiveSubstanceSynonyms(
#'   activesubstance = "imatinib"
#' )
#' }
#'
ctrFindActiveSubstanceSynonyms <- function(activesubstance = "") {

  # check parameters
  if ((length(activesubstance) != 1) ||
      !is.character(activesubstance) ||
      (nchar(activesubstance) == 0)) {
    stop("ctrFindActiveSubstanceSynonyms(): ",
         "activesubstance should be a single string.",
         call. = FALSE)
  }

  # initialise output variable
  as <- activesubstance

  # check and set proxy if needed to access internet
  setProxy()

  # getting synonyms
  ctgovfirstpageurl <-
    paste0("https://clinicaltrials.gov/ct2/results/details?term=",
           activesubstance)
  tmp <- xml2::read_html(x = utils::URLencode(ctgovfirstpageurl))

  # extract from table "Terms and Synonyms Searched:"
  tmp <- rvest::html_node(
    tmp, xpath =
      '//*[@id="searchdetail"]//table[1]')
  tmp <- rvest::html_table(tmp, fill = TRUE)
  asx <- tmp[[1]]
  asx <- asx[!grepl(
    paste0("(more|synonyms|terms|", as, "|",
           paste0(unlist(strsplit(as, " "), use.names = FALSE),
                  collapse = "|"), ")"), asx,
    ignore.case = TRUE)]

  # prepare and return output
  as <- c(as, asx)
  as <- unique(as)
  return(as)
}
# end ctrFindActiveSubstanceSynonyms


#' Show the history of queries that were loaded into a database
#'
#' @inheritParams ctrDb
#'
#' @return A data frame with columns: query-timestamp, query-register,
#'  query-records (note: this is the number of records loaded when last
#'  executing \link{ctrLoadQueryIntoDb}, not the total record number) and
#'  query-term, and with one row for each \link{ctrLoadQueryIntoDb}
#'  loading trial records in this collection.
#'
#' @param verbose If \code{TRUE}, prints additional information
#' (default \code{FALSE}).
#'
#' @importFrom nodbi docdb_query
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#' dbQueryHistory(
#'   con = db
#' )
#' }
#'
dbQueryHistory <- function(con, verbose = FALSE) {

  ## check database connection
  if (is.null(con$ctrDb)) con <- ctrDb(con = con)

  # debug
  if (verbose) message("Running dbQueryHistory ...")

  hist <- nodbi::docdb_query(
    src = con,
    key = con$collection,
    query = '{"_id": {"$eq": "meta-info"}}',
    fields = '{"queries": 1}')

  # check if meeting expectations
  if (is.null(hist) ||
      nrow(hist) == 0L) {
    #
    message("No history found in expected format.")
    #
    # return (class data.frame is expected)
    return(invisible(data.frame(NULL)))
    #
  }

  # access data frame of queries
  hist <- hist[["queries"]][[1]]

  # inform user
  if (verbose) {

    message("Number of queries in history of \"",
            con$collection, "\": ", nrow(hist))

    # total number of records in collection to inform user
    countall <- length(nodbi::docdb_query(
      src = con,
      key = con$collection,
      query =  '{"_id": {"$ne": "meta-info"}}',
      fields = '{"_id": 1}')[["_id"]])

    message("Number of records in collection \"",
            con$collection, "\": ", countall)

  }

  # return
  return(hist)

}
# end ctrQueryHistoryInDb


#' Find names of fields in the database collection
#'
#' Given part of the name of a field of interest to the user, this
#' function returns the full field names as found in the database.
#'
#' For fields in EUCTR (protocol- and results-related information),
#' see also the register's documentation at
#' \url{https://eudract.ema.europa.eu/result.html}.
#'
#' For fields in CTGOV (protocol-related information), see also
#' the register's definitions at
#' \url{https://prsinfo.clinicaltrials.gov/definitions.html}.
#'
#' Note: Generating a list of fields with this function may take
#' some time, and may involve running a mapreduce function if using
#' a MongoDB server. If the user is not not authorized to run
#' such a function, random documents are sampled to generate a
#' list of fields.
#'
#' @param namepart A plain string (can include a regular expression,
#' including Perl-style) to be searched for among all field names
#' (keys) in the database.
#'
#' @param verbose If \code{TRUE}, prints additional information
#' (default \code{FALSE}).
#'
#' @importFrom nodbi docdb_query
#'
#' @inheritParams ctrDb
#'
#' @return Vector of names of found field(s) in alphabetical
#' order (that is, not by register or field frequency)
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#' dbFindFields(
#'   nampepart = "date",
#'   con = db
#' )
#' }
#'
dbFindFields <- function(namepart = "",
                         con,
                         verbose = FALSE) {

  ## sanity checks
  if (!is.atomic(namepart)) stop("'namepart' should be atomic.", call. = FALSE)
  if (length(namepart) > 1) stop("'namepart' should have one element.", call. = FALSE)
  if (namepart == "")       stop("Empty 'namepart' parameter.", call. = FALSE)

  ## check database connection
  if (is.null(con$ctrDb)) con <- ctrDb(con = con)

  ## check if cache for list of keys in collection exists,
  # otherwise create new environment as session cache
  if (!exists(".dbffenv")) {
    .dbffenv <- new.env(parent = emptyenv())
  }

  ## check if cache environment has entry for the database
  if (exists(x = paste0(con$db, "/", con$collection),
             envir = .dbffenv)) {

    # if true, get keys list from cache
    keyslist <- get(x = paste0(con$db, "/", con$collection),
                    envir = .dbffenv)

    # informing user
    message("Using cache of fields.")

  } else {

    # inform user
    message("Finding fields in database (may take some time)")

    ## using storage backend- specific methods, since
    ## no canonical way was found yet to retrieve
    ## field / key names

    ## - method for mongodb
    if (inherits(con, "src_mongo")) {

      # try mapreduce to get all keys
      keyslist <- try({
        con$con$mapreduce(
          map = "function() {
      obj = this;
      return searchInObj(obj, '');
      function searchInObj(obj, pth) {
         for(var key in obj) {
            if(typeof obj[key] == 'object' && obj[key] !== null) {
               if(pth != '') {pth = pth + '.'}
                  searchInObj(obj[key], pth + key);
            }else{
               key = pth + '.' + key;
               key = key.replace(/[.][0-9]+[.]/g, '.');
               key = key.replace(/[.][0-9]+$/, '');
               key = key.replace(/[.][.]+/g, '.');
               key = key.replace(/[.]$/g, '');
               key = key.replace(/^[.]/, '');
               emit(key, 1);
      }}}}",
      reduce = "function(id, counts) {return Array.sum(counts)}"
      # extract and keep only "_id" = first column, with keys
        )[["_id"]]},
      silent = TRUE)

      # if mapreduce does not work or is not permitted, revert to guessing
      if (inherits(keyslist, "try-error")) {

        warning("Mongo server returned: ", as.character(keyslist),
                "Using alternative method (extracting keys from ",
                "sample documents, may be incomplete).",
                call. = FALSE, immediate. = TRUE)

        # get 2 random documents, one for each register EUCTR and CTGOV,
        # if in collection, and retrieve keys from documents
        keyslist <- c(
          "", # avoid empty vector
          names(con$con$find(
            query = '{"_id": { "$regex": "^NCT[0-9]{8}", "$options": ""} }',
            limit = 1L)),
          names(con$con$find(
            query = '{"_id": { "$regex": "^[0-9]{4}-[0-9]{6}", "$options": ""} }',
            limit = 1L)))

      } # end if error with mapreduce
    } # end if src_mongo

    ## - method for sqlite
    if (inherits(con, "src_sqlite")) {

      # uses special function parameter for
      # src_sqlite query method: listfields
      keyslist <- c("", # avoid empty vector
                    nodbi::docdb_query(
                      src = con,
                      key = con$collection,
                      query = "",
                      listfields = TRUE))

    }

    ## store keyslist to environment (cache)
    if (length(keyslist) > 1) {
      assign(x = paste0(con$db, "/", con$collection),
             value = keyslist,
             envir = .dbffenv)
      message("Field names cached for this session.")
    }

  } # end get cached list or generate new list

  ## inform user of unexpected situation
  if ((length(keyslist) == 0) || all(keyslist == "")) {
    warning("No keys could be extracted, please check database ",
            "and contents: ", con$db, "/", con$collection, call. = FALSE)
  }

  ## now do the actual search and find for key name parts
  fields <- keyslist[grepl(pattern = namepart, x = keyslist,
                           ignore.case = TRUE, perl = TRUE)]

  # clean empty entries and exclude _id for consistency
  # since different approaches above return _id or not
  fields <- fields[fields != "_id" & fields != ""]
  if (!length(fields)) fields <- ""

  # return the match(es)
  return(sort(fields))

} # end dbFindFields


#' Deduplicate records to provide unique clinical trial identifiers
#'
#' If records for a clinical trial are found from more than one register, the
#' record from EUCTR is returned. The function currently relies on CTGOV
#' recording other identifiers such as the EudraCT number in the field "Other
#' IDs".
#'
#' @param preferregister A vector of the sequence of preference for registers
#' from which to generate unique _id's, default
#' \code{c("EUCTR", "CTGOV", "ISRCTN")}
#'
#' @inheritParams dfFindUniqueEuctrRecord
#'
#' @param verbose If set to \code{TRUE}, prints out information about numbers
#' of records found at subsequent steps when searching for duplicates
#'
#' @importFrom nodbi docdb_query
#'
#' @inheritParams ctrDb
#'
#' @return A vector with strings of keys ("_id" in the database) that
#'   represent non-duplicate trials.
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#' dbFindIdsUniqueTrials(
#'   con = db
#' )
#' }
#'
dbFindIdsUniqueTrials <- function(
  preferregister = c("EUCTR", "CTGOV", "ISRCTN"),
  prefermemberstate = "GB",
  include3rdcountrytrials = TRUE,
  con,
  verbose = TRUE) {

  # parameter checks
  if (!all(preferregister %in% registerList)) {
    stop("'preferregister' not known: ", preferregister, call. = FALSE)
  }
  if (length(prefermemberstate) != 1L |
      !any(prefermemberstate == countriesEUCTR)) {
    stop("'prefermemberstate' not known: ", prefermemberstate, call. = FALSE)
  }
  # complete if preferregister does not have all
  preferregister <- unique(preferregister)
  preferregister <- union(preferregister, registerList)

  # objective: create a vector of database record identifiers (_id)
  # that represent unique records of clinical trials, based on user's
  # preferences for selecting the preferred from any multiple records

  ## check database connection
  if (is.null(con$ctrDb)) con <- ctrDb(con = con)

  # inform user
  message("Searching for duplicate trials... ")
  message(" - Getting trial ids...", appendLF = FALSE)

  # fields for database query
  fields <- c(
    "ctrname",
    # euctr
    "a2_eudract_number",
    "a52_us_nct_clinicaltrialsgov_registry_number",
    "trialInformation.usctnIdentifier",
    "a51_isrctn_international_standard_randomised_controlled_trial_number",
    "trialInformation.isrctnIdentifier",
    "a41_sponsors_protocol_code_number",
    # ctgov
    "id_info",
    # isrctn
    "externalRefs",
    "isrctn"
  )

  # get identifiers
  listofIds <- try(suppressMessages(suppressWarnings(
    dbGetFieldsIntoDf(
      fields = fields,
      con = con,
      verbose = FALSE,
      stopifnodata = FALSE)
  )),
  silent = TRUE
  )

  # error check
  if (inherits(listofIds, "try-error") ||
      !length(listofIds) || !nrow(listofIds)) {
    stop("No records found, check collection '", con$collection, "'",
         call. = FALSE)
  }

  # inform user
  message("\b\b\b, ", nrow(listofIds), " found in collection")

  # copy attributes
  attribsids <- attributes(listofIds)

  # target fields for further steps in this function
  fields <- c(
    "_id",
    "ctrname",
    # euctr
    "a2_eudract_number",
    "a52_us_nct_clinicaltrialsgov_registry_number",
    "trialInformation.usctnIdentifier",
    "a51_isrctn_international_standard_randomised_controlled_trial_number",
    "trialInformation.isrctnIdentifier",
    "a41_sponsors_protocol_code_number",
    # ctgov
    "id_info.secondary_id",
    "id_info.org_study_id",
    "id_info.nct_id",
    "id_info.nct_alias",
    "id_info.secondary_id",
    "id_info.secondary_id",
    "id_info.org_study_id",
    # isrctn
    "externalRefs.eudraCTNumber",
    "externalRefs.clinicalTrialsGovNumber",
    "isrctn",
    "externalRefs.protocolSerialNumber"
  )

  # add any missing columns
  missFields <- setdiff(fields, names(listofIds))
  if (length(missFields)) {
    missCols <- matrix(nrow = nrow(listofIds), ncol = length(missFields))
    missCols <- data.frame(missCols)
    names(missCols) <- missFields
    listofIds <- cbind(listofIds, missCols)
  }

  # replicate columns to make data frame fit subsequent steps
  listofIds <- listofIds[, fields, drop = FALSE]

  # rename columns for content mangling, needs to
  # correspond to columns and sequence in fields
  # for mapping identifiers across registers
  names(listofIds) <- c(
    "_id", "ctrname",
    # euctr
    "euctr.1", "ctgov.1a", "ctgov.1b", "isrctn.1a", "isrctn.1b", "sponsor.1",
    # ctgov
    "euctr.2a", "euctr.2b", "ctgov.2a", "ctgov.2b", "isrctn.2",
    "sponsor.2a", "sponsor.2b",
    # isrctn
    "euctr.3", "ctgov.3", "isrctn.3", "sponsor.3"
  )

  # keep only relevant content
  # - in certain raw value columns
  colsToMangle <- list(
    c("ctgov.1a", regCtgov),
    c("ctgov.1b", regCtgov),
    c("ctgov.2a", regCtgov),
    c("ctgov.2b", regCtgov),
    c("isrctn.1a", regIsrctn),
    c("isrctn.1b", regIsrctn),
    c("isrctn.2", regIsrctn),
    c("isrctn.3", regIsrctn),
    c("euctr.1", regEuctr),
    c("euctr.2a", regEuctr),
    c("euctr.2b", regEuctr),
    c("euctr.3", regEuctr)
  )
  # - do mangling; prerequisite is
  #   that each of the columns holds
  #   a single character vector,
  #   possibly collapsed with " / "
  invisible(sapply(
    colsToMangle,
    function(ctm) {
      colMangled <- regmatches(
        listofIds[[ ctm[[1]] ]],
        regexec(ctm[[2]], listofIds[[ ctm[[1]] ]]))
      colMangled[!lengths(colMangled)] <- ""
      listofIds[[ ctm[[1]] ]] <<- unlist(colMangled)
    }))
  # - merge columns for register ids and sponsor ids
  for (reg in c(registerList, "SPONSOR")) {
    listofIds[[reg]] <- apply(listofIds[
      , grepl(paste0("^", reg, "[.][0-9]"), names(listofIds),
              ignore.case = TRUE), drop = FALSE], MARGIN = 1,
      function(r) gsub("^ ?/ | / ?$", "",
                       paste0(na.omit(unique(r)), collapse = " / ")))
  }
  # - delete raw columns
  listofIds <- listofIds[
    , c("_id", "ctrname", registerList, "SPONSOR"), drop = FALSE]

  # inform user
  message(" - Finding duplicates among registers' and sponsor ids...")

  # find duplicates
  colsToCheck <- match(c(preferregister, "SPONSOR"), names(listofIds))
  outSet <- NULL
  for (i in seq_along(preferregister)) {

    # to be added
    tmp <- listofIds[
      listofIds[["ctrname"]] == preferregister[i], , drop = FALSE]
    row.names(tmp) <- NULL

    # check if second etc. set has identifiers
    # in the previously rbind'ed sets
    if (i > 1L && nrow(tmp)) {

      # check for duplicates
      dupes <- mapply(
        function(c1, c2) {
          tmpIs <- intersect(
            unlist(strsplit(c1, " / ")),
            unlist(strsplit(c2, " / ")))
          if (length(tmpIs)) {
            # map found intersecting names back
            # to the rows of the input data frame
            grepl(paste0(tmpIs, collapse = "|"), c1)
          } else {
            rep(FALSE, times = length(c1))
          }},
        tmp[, colsToCheck, drop = FALSE],
        outSet[, colsToCheck, drop = FALSE]
      )
      # keep uniques
      tmp <- tmp[rowSums(dupes) == 0L, , drop = FALSE]
      rm(dupes)
    }

    # add to output set
    outSet <- rbind(outSet, tmp,
                    make.row.names = FALSE,
                    stringsAsFactors = FALSE)
  }
  rm(tmp)

  # keep necessary columns
  listofIds <- outSet[, c("_id", "EUCTR", "ctrname")]
  names(listofIds)[2] <- "a2_eudract_number"
  rm(outSet)

  # find unique, preferred country version of euctr
  listofIds <- dfFindUniqueEuctrRecord(
    df = listofIds,
    prefermemberstate = prefermemberstate,
    include3rdcountrytrials = include3rdcountrytrials)

  # count
  countIds <- table(listofIds[["ctrname"]])

  # prepare output
  listofIds <- listofIds[["_id"]]

  # copy attributes
  attributes(listofIds) <- attribsids[grepl("^ctrdata-", names(attribsids))]

  # avoid returning list() if none found
  if (length(listofIds) == 0) {
    listofIds <- character()
  }

  # inform user
  message(" - Keeping ", paste0(countIds, collapse = " / "), " records",
          " from ", paste0(names(countIds), collapse = " / "))
  message(
    "= Returning keys (_id) of ", length(listofIds),
    " records in collection \"", con$collection, "\"")

  # return
  return(listofIds)

}
# end dbFindIdsUniqueTrials


#' Create data frame by extracting specified fields from database collection
#'
#' With this convenience function, fields in the database are retrieved
#' into an R data frame. Note that fields within the record of a trial
#' can be hierarchical and structured, that is, nested.
#'
#' With both src_sqlite and src_mongo, the function returns a list of data
#' for a field that includes nested content; use function
#' \link{dfTrials2Long} followed by \link{dfName2Value} to
#' extract desired nested variables.
#'
#' For more sophisticated data retrieval from the database, see vignette
#' examples and other packages to query mongodb such as mongolite.
#'
#' @param fields Vector of one or more strings, with names of the sought fields.
#' See function \link{dbFindFields} for how to find names of fields.
#' Regular expressions are possible. "item.subitem" notation is supported.
#'
#' @param stopifnodata Stops with an error (\code{TRUE}, default) or with
#' a warning (\code{FALSE}) if the sought field is empty in all,
#' or not available in any of the records in the database collection.
#'
#' @param verbose Printing additional information if set to \code{TRUE};
#' default is \code{FALSE}.
#'
#' @inheritParams ctrDb
#'
#' @return A data frame with columns corresponding to the sought fields.
#' Note: a column for the record _id will always be included.
#' Each column can be either a simple data type (numeric, character, date)
#' or a list (see example below): For complicated lists, use function
#' \link{dfTrials2Long} followed by function \link{dfName2Value} to
#' extract values for nested variables.
#' The maximum number of rows of the returned data frame is equal to,
#' or less than the number of records of trials in the database.
#'
#' @importFrom nodbi docdb_query
#' @importFrom stats na.omit
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#'
#' # access fields that are nested within another field
#' # and can have multiple values with the other field
#' dbGetFieldsIntoDf(
#'   "b1_sponsor.b31_and_b32_status_of_the_sponsor",
#'   con = db
#' )[1,]
#' #                 _id b1_sponsor.b31_and_b32_status_of_the_sponsor
#' # 1 2004-000015-25-GB                  Non-commercial / Commercial
#'
#' # access fields that include a list of values
#' # which are printed as comma separated values
#' dbGetFieldsIntoDf(
#'   "keyword",
#'   con = db
#' )[1,]
#'
#' #           _id                                 keyword
#' # 1 NCT00129259 T1D, type 1 diabetes, juvenile diabetes
#'
#' str(.Last.value)
#' # 'data.frame':	1 obs. of  2 variables:
#' # $ _id    : chr "NCT00129259"
#' # $ keyword:List of 1
#' # ..$ : chr  "T1D" "type 1 diabetes" "juvenile diabetes"
#'
#' }
#'
dbGetFieldsIntoDf <- function(fields = "",
                              con, verbose = FALSE,
                              stopifnodata = TRUE) {

  # check parameters
  if (!is.vector(fields) |
      class(fields) != "character") {
    stop("Input should be a vector of strings of field names.", call. = FALSE)
  }

  # remove NA, NULL if included in fields
  fields <- fields[!is.null(fields) & !is.na(fields)]

  # remove _id if included in fields
  fields <- fields["_id" != fields]

  # check if valid fields
  if (any(fields == "") | (length(fields) == 0)) {
    stop("'fields' contains empty elements; ",
         "please provide a vector of strings of field names. ",
         "Function dbFindFields() can be used to find field names. ",
         call. = FALSE)
  }

  ## check database connection
  if (is.null(con$ctrDb)) con <- ctrDb(con = con)

  # get all ids to enable Reduce which would fail
  # due to holes from NULLs from the merge step
  dft <- nodbi::docdb_query(
    src = con,
    key = con$collection,
    query = '{}',
    fields = paste0('{"_id": 1}'))
  dft <- dft[dft[["_id"]] != "meta-info", "_id", drop = FALSE]

  # initialise output
  nFields <- length(fields)

  # iterate over fields so that we can
  # use a custom function to merge results
  result <- lapply(
    seq_len(nFields),
    function(i) {
      #
      item <- fields[i]
      message("     \r", i, appendLF = FALSE)
      #
      query <- '{"_id": {"$ne": "meta-info"}}'
      if (verbose) message("DEBUG: field: ", item)
      #
      tmpItem <- try({

        # execute query
        dfi <- nodbi::docdb_query(
          src = con,
          key = con$collection,
          query = query,
          fields = paste0('{"_id": 1, "', item, '": 1}'))

        # leave try() early if no results
        if (!nrow(dfi) || ncol(dfi) == 1L) simpleError(message = "")

        # remove any rows without index variable
        dfi <- dfi[!is.na(dfi[["_id"]]), , drop = FALSE]

        # simplify by extracting recursively any requested subitem
        itemSegments <- strsplit(item, "[.]")[[1]]
        itemSegments <- setdiff(itemSegments, names(dfi))
        for (iS in itemSegments) {
          if ((length(names(dfi[[2]])) == 1L) && (iS == names(dfi[[2]]))) {
            dfi[[2]] <- dfi[[2]][[iS]]
          } else {
            tn <- sapply(dfi[[2]], names)
            if (length(unique(tn)) == 1L && (iS == tn[1]))
              dfi[[2]] <- sapply(dfi[[2]], "[[", 1)
          }
        }

        # simplify by expanding a resulting data frame
        if (length(unique(names(dfi[[2]]))) > 1L) {
          item <- paste0(item, ".", names(dfi[[2]]))
          dfi <- cbind("_id" = dfi[["_id"]], as.data.frame(dfi[[2]]))
          emptyCols <- sapply(dfi, function(c) all(is.na(c)))
          emptyCols <- seq_along(emptyCols)[emptyCols]
          if (length(emptyCols)) dfi <- dfi[, -emptyCols, drop = FALSE]
          if (length(emptyCols)) item <- item[-(emptyCols - 1L)]
        }

        # name result set
        names(dfi) <- c("_id", item)

        # create NA output from template
        dfo <- dft

        # simplify by processing columns
        for (c in seq_len(ncol(dfi))[-1]) {

          # special case: column is one-column data frame
          if (is.data.frame(dfi[[c]]) && (ncol(dfi[[c]]) == 1L) &&
              (nrow(dfi[[c]]) == nrow(dfi))) dfi[[c]] <-
              dfi[[c]][, 1, drop = TRUE]

          # simplify at row level, replaces NULL with NA
          if (!is.data.frame(dfi[[c]]) &&
              !any(sapply(dfi[[c]], class) == "data.frame")) {
            dfi[[c]] <- sapply(dfi[[c]], function(i) {
              l <- length(i)
              if (l == 0L) i <- NA
              if (l == 1L) i <- i[1]
              if (l >= 2L) {
                if (all(sapply(i, is.character))) {
                  i } else {i <- list(i) }}
              i}, USE.NAMES = FALSE, simplify = TRUE)}

          # simplify vectors in cells by collapsing
          # (compatibility with previous version)
          if (all(sapply(dfi[[c]], function(r) is.na(r)[1] | is.character(r))) &&
              any(sapply(dfi[[c]], function(r) length(r) > 1L))) {
            dfi[[c]] <- sapply(dfi[[c]], function(i) paste0(i, collapse = " / "))
          }

          # type results
          if (typeof(dfi[[c]]) == "character") dfi[[c]] <-
              typeField(dfi[, c(1, c), drop = FALSE])[, 2, drop = TRUE]

          # add a column into copy of NA template
          dfo[[c]] <- switch(
            class(dfi[[c]]),
            "Date" = as.Date(NA),
            "numeric" = as.numeric(NA),
            "character" = as.character(NA),
            "data.frame" = NA,
            "integer" = as.integer(NA),
            "list" = NA,
            "logical" = as.logical(NA),
            NA
          )

        } # for process

        # add NA where dfi has no data to avoid NULL when merge'ing
        names(dfo) <- names(dfi)
        dfi <- rbind(dfo[!(dfo[["_id"]] %in% dfi[["_id"]]), , drop = FALSE], dfi)

      },
      silent = TRUE) # tmpItem try

      # inform user
      if (inherits(tmpItem, "try-error") ||
          !nrow(dfi) || (ncol(dfi) == 1L)) {

        # try-error occurred or no data retrieved
        if (stopifnodata) {
          stop("No data could be extracted for '", item,
               "'. \nUse dbGetFieldsIntoDf(stopifnodata = ",
               "FALSE) to ignore this. ", call. = FALSE)
        } else {
          message("* No data: '", item, "'")
          # create empty data set
          dfi <- cbind(dft, NA)
          names(dfi) <- c("_id", fields[i])
        } # stopifnodata
      } # if

      # add to result
      dfi

    }) # end lapply
  message("")

  # bring result lists into data frame, by record _id
  result <- Reduce(function(...) merge(..., all = TRUE, by = "_id"), result)

  # prune rows without _id
  result <- result[!is.na(result[["_id"]]), , drop = FALSE]

  # remove rows with only NAs; try because
  # is.na may fail for complex cells
  onlyNas <- try({apply(result[, -1, drop = FALSE], 1,
                  function(r) all(is.na(r)))}, silent = TRUE)
  if (!inherits(onlyNas, "try-error")) {
    result <- result[!onlyNas, , drop = FALSE]
  } else {
    message("Could not remove rows with only NAs")
  }

  # inform user
  if (is.null(result) || !nrow(result)) {
    warning("No records with values for any specified field. ",
            call. = FALSE)
    return(NULL)
  }

  # sort, add meta data and return
  return(
    addMetaData(
      result[order(result[["_id"]]), , drop = FALSE],
      con = con))
}
# end dbGetFieldsIntoDf


#' Extract information of interest (e.g., endpoint)
#' from long data frame of protocol- or result-related
#' trial information as returned by \link{dfTrials2Long}
#'
#' @param df A data frame with four columns (_id,
#'  identifier, name, value) as returned by
#'  \link{dfTrials2Long}
#'
#' @param valuename A character string for the name of the variable
#'  from which to extract information for the variable of interest
#'
#' @param wherename A character string to identify the variable
#'  of interest
#'
#' @param wherevalue A character string with the value of interest
#'  for the variable of interest
#'
#' @return A data frame with columns _id, identifier,
#'  name, value that only includes the values of interest,
#'  where value are strings unless all value elements
#'  are numbers.
#'
#' @export
#' @examples
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#' df <- ctrdata::dbGetFieldsIntoDf(
#' fields = c(
#'   # ctgov - typical results fields
#'   "clinical_results.baseline.analyzed_list.analyzed.count_list.count",
#'   "clinical_results.baseline.group_list.group",
#'   "clinical_results.baseline.analyzed_list.analyzed.units",
#'   "clinical_results.outcome_list.outcome",
#'   "study_design_info.allocation",
#'   # euctr - typical results fields
#'   "trialInformation.fullTitle",
#'   "subjectDisposition.recruitmentDetails",
#'   "baselineCharacteristics.baselineReportingGroups.baselineReportingGroup",
#'   "endPoints.endPoint",
#'   "trialChanges.hasGlobalInterruptions",
#'   "subjectAnalysisSets",
#'   "adverseEvents.seriousAdverseEvents.seriousAdverseEvent"
#'   ), con = dbc
#' )
#' # convert to long
#' reslong <- ctrdata::dfTrials2Long(
#'   df = df
#' )
#' # get values for endpoint of interest, duration of response
#' ctrdata::dfValue2Name(
#'   df = df,
#'   valuename = paste0(
#'     "endPoints.endPoint.*armReportingGroup.tendencyValues.tendencyValue.value|",
#'     "clinical_results.*category.measurement_list.measurement.value|",
#'     "clinical_results.*outcome.measure.units|endPoints.endPoint.unit"
#'   ),
#'   wherename = "clinical_results.*outcome.measure.title|endPoints.endPoint.title",
#'   wherevalue = "duration of response"
#' )
#' }
#'
dfName2Value <- function(df, valuename = "",
                         wherename = "", wherevalue = "") {

  # check parameters
  if (valuename == "") {
    stop("'valuename' must be specified.",
         call. = FALSE)
  }
  if (!identical(names(df),
                 c("_id", "identifier", "name", "value"))) {
    stop("'df' does not seem to come from dfTrials2Long()",
         call. = FALSE)
  }

  # indices of valuename
  indexVnames <- which(grepl(valuename, df[["name"]],
                             perl = TRUE, ignore.case = TRUE))
  if (!length(indexVnames)) stop("No rows found for 'valuename' = ", valuename)

  # if no where... are specified, just
  # return rows where name corresponds
  # to valuename
  if (wherename == "" & wherevalue == "") {

    # get relevant rows
    out <- df[indexVnames, , drop = FALSE]

  } else {# if where... are specified, continue

    # get where... indices per trial
    indexRows <- which(
      grepl(wherename, df[["name"]], perl = TRUE, ignore.case = TRUE) &
        grepl(wherevalue, df[["value"]], perl = TRUE, ignore.case = TRUE))
    if (!length(indexRows)) stop("No rows found for 'wherename' and 'wherevalue'")

    # get trial ids and identifiers for where...
    indexCases <- df[indexRows, c("_id", "identifier"), drop = FALSE]

    # get output iterate over trials
    out <- apply(
      indexCases, 1,
      function(i) {
        ids <- Reduce(
          intersect, list(
            # trial id
            which(grepl(i[["_id"]], df[["_id"]], fixed = TRUE)),
            # identifier to match starting from left and
            # do not match e.g. 22 for identifier 2
            which(grepl(paste0("^", i[["identifier"]], "([.]|$)"),
                        df[["identifier"]])),
            # indices of sought valuename
            indexVnames
          ))
        # return value
        if (length(ids)) df[ids, ]
      }
    )

    # bind into data frame
    out <- do.call(
      rbind,
      c(out, stringsAsFactors = FALSE, make.row.names = FALSE))

  } # if where...

  # value column is character
  # try to convert it to numeric
  tmp <- suppressWarnings(
    as.numeric(out[["value"]])
  )
  # use if converted ok
  if (all(is.na(tmp) == is.na(out[["value"]]))) {
    out["value"] <- tmp
  }
  # remove any duplicates such as
  # from duplicate where... criteria
  out <- unique(out)

  # return
  return(out)

} # end dfName2Value


#' Extract trial information into long format
#'
#' The function works with procotol- and results-
#' related information. It converts lists and other
#' values into individual rows of a long data frame.
#' From the resulting data frame, values of interest
#' can then be selected (e.g. select an outcome
#' and its analysis by the identifier of the measure
#' which has "Hazard Ratio" in its name, see
#' \link{dfName2Value}).
#'
#' @param df Data frame with columns including
#'  the trial identifier (\code{_id}) and
#'  one or more variables as obtained from
#'  \link{dbGetFieldsIntoDf}
#'
#' @return A data frame with the four columns:
#'  _id, identifier, name, value
#'
#' @importFrom stringi stri_extract_all_charclass
#' @importFrom stringi stri_extract_first
#' @importFrom stringi stri_replace_first
#'
#' @export
#'
#' @examples
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#' df <- dbGetFieldsIntoDf(
#'   fields = c(
#'     "clinical_results.outcome_list.outcome"),
#'   con = db
#' )
#' dfTrials2Long(
#'   df = df
#' )
#' }
dfTrials2Long <- function(df) {

  # check parameters
  if (!any("_id" == names(df)) ||
      ncol(df) == 1L) stop(
        "Missing _id column or other variables in 'df'",
        call. = FALSE
      )
  if (any(c("identifier", "name", "value") %in% names(df))) stop(
    "Unexpected columns; 'df' should not come from dfTrials2Long",
    call. = FALSE
  )

  # helper function
  flattenDf <- function(x) {
    while (any(vapply(x, is.list, logical(1L)))) {
      x <- lapply(x, function(x) if (is.list(x)) x else list(x))
      x <- unlist(x, recursive = FALSE, use.names = TRUE)
    }
    x
  }

  # to add a first row in the next step,
  # columns that are not compatible with
  # adding a row are converted to character
  conv <- sapply(df, class) == "Date"
  conv <- seq_len(ncol(df))[conv]
  for (c in conv) df[, c] <- as.character(df[, c, drop = TRUE])

  # add a first row to df to hold item name
  # which otherwise is not available in apply
  df <- rbind(
    names(df),
    df)

  # iterative unnesting, by column
  out <- lapply(
    df[, -match("_id", names(df)), drop = FALSE],
    function(cc) {
      message(". ", appendLF = FALSE)
      # get item name as added in first row
      tn <- cc[[1]]
      # and by element in column
      lapply(cc[-1], function(c) {
        x <- unlist(flattenDf(c))
        if (!is.null(names(x))) tn <- names(x)
        if (is.null(x)) x <- NA
        data.frame(
          "name" = tn,
          "value" = x,
          check.names = FALSE,
          stringsAsFactors = FALSE,
          row.names = NULL)
      })})

  # add _id to list elements and
  # simplify into data frames
  tmpNames <- df[-1, "_id", drop = TRUE]
  out <- lapply(
    out, function(e) {
      message(". ", appendLF = FALSE)
      names(e) <- tmpNames
      # duplicate e to force generating
      # names in the later rbind step
      do.call(rbind, c(e, e, stringsAsFactors = FALSE))
    })

  # combine lists into data frame
  out <- do.call(rbind, c(out, stringsAsFactors = FALSE))
  message(". ", appendLF = FALSE)

  # remove rows where value is NA
  out <- out[!is.na(out[["value"]]), , drop = FALSE]

  # process row.names such as "clinical_results.NCT00082758.73"
  # to to obtain "clinical_results" as part of variable name
  names <- stringi::stri_replace_first(
    str = row.names(out), replacement = "",
    regex = c(paste0(".(", regCtgov, "|", regEuctr, "-[3A-Z]+)[.0-9]*")))

  # generate new data frame with target columns and order
  out <- data.frame(
    # process row.names to obtain trial id
    "_id" = stringi::stri_extract_first(
      str = row.names(out),
      regex = c(paste0(regCtgov, "|", regEuctr, "-[3A-Z]+"))),
    "identifier" = NA,
    "name" = out[["name"]],
    "value" = out[["value"]],
    check.names = FALSE,
    row.names = NULL,
    stringsAsFactors = FALSE)
  message(". ", appendLF = FALSE)

  # generate variable names from processed row names
  # and name unless the same is as already in name
  out[["name"]] <- ifelse(
    out[["name"]] == names,
    out[["name"]],
    paste0(names, ".0.", out[["name"]]))

  # name can have from 0 to about 6 number groups, get all
  # and concatenate to oid-like string such as "1.2.2.1.4"
  out[["identifier"]] <- vapply(
    stringi::stri_extract_all_regex(out[["name"]], "[0-9]+([.]|$)"),
    function(i) paste0(gsub("[.]", "", i), collapse = "."), character(1L))
  out[["identifier"]] [out[["identifier"]] == "NA"] <- "0"
  message(". ", appendLF = FALSE)

  # remove numbers from variable name
  out[["name"]] <- gsub(
    "[0-9]+([.])|[0-9]+$|[.]?@attributes", "\\1", out[["name"]], perl = TRUE)

  # remove any double separators
  out[["name"]] <- gsub("[.]+", ".", out[["name"]], perl = TRUE)

  # remove double rows from duplicating e above
  out <- unique(out)

  # inform
  message("\nTotal ", nrow(out), " rows, ",
          length(unique(out[["name"]])),
          " unique names of variables")

  # output
  return(out)

} # end dfTrials2Long


#' Extract named element(s) from list(s) into long-format
#' data frame
#'
#' The function uses a name (key) to extract an element
#' from a list in a data.frame such as obtained with
#' \link{dbGetFieldsIntoDf}. This helps to simplify
#' working with nested lists and with complex structures.
#'
#' @param df A data frame
#' @param list.key A list of pairs of list names and
#'  key names, where the list name corresponds to the
#'  name of a column in \code{df} that holds a list and
#'  the name of the key identifies the element to be
#'  extracted. See example.
#'
#' @return A data frame in long format with columns
#'  name (identifying the full path in the data frame,
#'  "<list>.<key>"), _id (of the trial record), value
#'  (of name per _id), item (number of value of name
#'  per _id).
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#' df <- dbGetFieldsIntoDf(
#'   fields = c(
#'     "endPoints.endPoint",
#'     "subjectDisposition.postAssignmentPeriods"),
#'   con = db
#' )
#' dfListExtractKey(
#'   df = df,
#'   list.key = list(
#'       c("endPoints.endPoint",
#'         "^title"),
#'       c("subjectDisposition.postAssignmentPeriods",
#'         "arms.arm.type.value")
#' ))
#' }
#'
dfListExtractKey <- function(
  df,
  list.key =
    list(c("endPoints.endPoint", "^title")
    )) {

  # deprecate
  .Deprecated(new = "dfName2Value")

  # check
  if (!any("_id" == names(df))) {
    stop("Data frame 'df' lacks '_id' column.",
         call. = FALSE)
  }

  # helper function to extract from
  # a named vector elements by name
  extractKey <- function(flattenedList, key) {

    # find element by key
    selected <- grepl(key,
                      names(flattenedList),
                      ignore.case = TRUE)


    # extract value for key
    extracted <- flattenedList[selected]

    # if key is not found, return a value
    # e.g. missing value (NA) or empty string ("")
    # please change as wanted for later processing
    if (length(extracted) == 0) extracted <- NA

    # return
    return(extracted)
  }

  # dots needs to be defined because passing
  # it in .Internal(mapply()) is not enough
  out <- lapply(
    list.key,
    function(k)
      lapply(df[[k[1]]],
             # k[1] = "endPoints.endPoint" identifies
             # the column in data frame with the list
             function(l) extractKey(
               unlist(l, recursive = TRUE, use.names = TRUE), k[2])
             # k[2] = "^title" identifies the key in the sublist
      ))

  out <- sapply(seq_along(list.key), function(li) {

    tmp <- out[[li]]

    tmp <- sapply(

      seq_along(tmp),
      function(ii) {

        data.frame(
          name = gsub("[-0-9]*$", "", # trailing number
                      gsub("[^a-zA-Z0-9_.-]", "",
                           paste0(list.key[[li]], collapse = "."))),
          "_id" = df[["_id"]][[ii]],
          value = tmp[[ii]],
          item = seq_along(tmp[[ii]]),
          row.names = NULL,
          stringsAsFactors = FALSE,
          check.names = FALSE)
      }, simplify = FALSE)

    do.call(rbind, tmp)

  }, simplify = FALSE)

  # return
  do.call(rbind, c(out, stringsAsFactors = FALSE, make.row.names = FALSE))

} # end dfListExtractKey


#' Merge two variables into one, optionally map values to new levels
#'
#' @param df A \link{data.frame} in which there are two variables (columns)
#' to be merged into one.
#' @param colnames A vector of length two with names of the two columns
#' that hold the variables to be merged. See \link{colnames} for how to
#' obtain the names of columns of a data frame.
#' @param levelslist A list with one slice each for a new value to be
#' used for a vector of old values (optional).
#' @param ... for deprecated \code{varnames} parameter (will be removed)
#'
#' @return A vector of strings
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' statusvalues <- list(
#'   "ongoing" = c("Recruiting", "Active", "Ongoing",
#'                 "Active, not recruiting", "Enrolling by invitation"),
#'   "completed" = c("Completed", "Prematurely Ended", "Terminated"),
#'   "other" = c("Withdrawn", "Suspended",
#'               "No longer available", "Not yet recruiting"))
#'
#' dfMergeTwoVariablesRelevel(
#'   df = result,
#'   colnames = c("Recruitment", "x5_trial_status"),
#'   levelslist = statusvalues)
#' }
#'
dfMergeTwoVariablesRelevel <- function(
  df = NULL,
  colnames = "",
  levelslist = NULL,
  ...) {

  # check parameters

  # FIXME migrate from previously
  # used parameter "varnames"
  tmp <- match.call()
  tmp <- tmp["varnames"]
  tmp <- as.list(tmp)[[1]]
  if (length(tmp) == 3 && colnames == "") {
    colnames <- unlist(as.list(tmp[-1], use.names = FALSE))
    warning("Parameter varnames is deprecated, use colnames instead.",
            call. = FALSE)
  }

  # other checks
  if (class(df) != "data.frame") {
    stop("Need a data frame as input.", call. = FALSE)
  }
  if (length(colnames) != 2) {
    stop("Please provide exactly two column names.", call. = FALSE)
  }

  # find variables in data frame and merge
  tmp <- match(colnames, names(df))
  df <- df[, tmp, drop = FALSE]

  # bind as ...
  if (class(df[, 1]) == class(df[, 2]) &&
      class(df[, 1]) != "character") {
    # check
    if (nrow(na.omit(df[!vapply(df[, 1, drop = TRUE], is.null, logical(1L)) &
                        !vapply(df[, 2, drop = TRUE], is.null, logical(1L)), ,
                        drop = FALSE]))) {
      warning("Some rows had values for both columns, used first",
              noBreaks. = TRUE, immediate. = TRUE)
    }
    # values, with first having
    # priority over the second
    tmp <- ifelse(is.na(tt <- df[, 1]), df[, 2], df[, 1])
  } else {
    # check
    if (nrow(df[(!is.na(df[, 1]) & df[, 1] != "") &
                (!is.na(df[, 2]) & df[, 2] != ""), , drop = FALSE])) {
      warning("Some rows had values for both columns, concatenated",
              noBreaks. = TRUE, immediate. = TRUE)
    }
    # strings, concatenated
    tmp <- paste0(
      ifelse(is.na(tt <- as.character(df[, 1])), "", tt),
      ifelse(is.na(tt <- as.character(df[, 2])), "", tt))
  }

  # type where possible
  if (class(df[, 1]) == class(df[, 2]) &&
      class(df[, 1]) != "character") {
    mode(tmp) <- mode(df[, 1])
    class(tmp) <- class(df[, 1])
  }

  # relevel if specified
  if (!is.null(levelslist)) {

    # check
    if (class(levelslist) != "list") {
      stop("Need list for parameter 'levelslist'.", call. = FALSE)
    }

    # helper function to collapse factor levels into the first
    refactor <- function(x, collapselevels, levelgroupname) {
      levels(x) [match(collapselevels, levels(x))] <- levelgroupname
      return(x)
    }

    # convert result to factor as this is needed for helper function
    tmp <- as.factor(tmp)

    # apply helperfunction to elements of the list
    for (i in seq_len(length(levelslist))) {
      tmp <- refactor(tmp, unlist(levelslist[i], use.names = FALSE),
                      attr(levelslist[i], "names"))
    }

    # convert factor back into string vector
    tmp <- as.character(tmp)

  }

  # check and inform user
  if (length(tt <- unique(tmp)) > 3L) {
    message("Unique values returned (first three): ",
            paste(tt[1L:3L], collapse = ", "))
  } else {
    message("Unique values returned: ",
            paste(tt, collapse = ", "))
  }

  # return
  return(tmp)
}
# end dfMergeTwoVariablesRelevel


#' Select single trial record from records of different EU Member States
#'
#' The EUCTR provides one record per trial per EU Member State in which the
#' trial is conducted. For all trials conducted in more than one Member State,
#' this function returns only one record per trial.
#'
#' Note: To deduplicate trials from different registers (EUCTR and CTGOV),
#' please first use function \code{\link{dbFindIdsUniqueTrials}}.
#'
#' @param df A data frame created from the database that includes the columns
#'   "_id" and "a2_eudract_number", for example created with function
#'   dbGetFieldsIntoDf(c("_id", "a2_eudract_number")).
#' @param prefermemberstate Code of single EU Member State for which records
#' should returned. If not available, a record for GB or lacking this, any
#' other record for the trial will be returned. For a list of codes of EU
#'   Member States, please see vector \code{countriesEUCTR}. Alternatively,
#'   "3RD" will lead to return a Third Country record of a trial, if available.
#' @param include3rdcountrytrials A logical value if trials should be retained
#'   that are conducted exclusively in third countries, that is, outside
#'   the European Union.
#'
#' @return A data frame as subset of \code{df} corresponding to the sought
#'   records.
#'
#' @keywords internal
#
dfFindUniqueEuctrRecord <- function(
  df = NULL,
  prefermemberstate = "GB",
  include3rdcountrytrials = TRUE) {

  # check parameters
  if (class(df) != "data.frame") {
    stop("Parameter df is not a data frame.", call. = FALSE)
  }
  #
  if (is.null(df[["_id"]]) ||
      is.null(df["a2_eudract_number"])) {
    stop('Data frame does not include "_id"',
         ' and "a2_eudract_number" columns.',
         call. = FALSE)
  }
  #
  if (nrow(df) == 0) {
    stop("Data frame does not contain records (0 rows).",
         call. = FALSE)
  }
  #
  if (!(prefermemberstate %in% countriesEUCTR)) {
    stop("Value specified for prefermemberstate does not match",
         " one of the recognised codes: ",
         paste(sort(countriesEUCTR), collapse = ", "),
         call. = FALSE)
  }

  # notify it mismatching parameters
  if (prefermemberstate == "3RD" & !include3rdcountrytrials) {
    warning("Preferred EUCTR version set to 3RD country trials, but ",
            "'include3rdcountrytrials' was FALSE, setting it to TRUE.",
            call. = FALSE,
            noBreaks. = FALSE,
            immediate. = FALSE)
    include3rdcountrytrials <- TRUE
  }

  # count total
  totalEuctr <- unique(df[["a2_eudract_number"]])
  totalEuctr <- na.omit(totalEuctr[totalEuctr != ""])
  totalEuctr <- length(totalEuctr)

  # as a first step, handle 3rd country trials e.g. 2010-022945-52-3RD
  # if retained, these trials would count as record for a trial
  if (!include3rdcountrytrials) {
    df <- df[!grepl("-3RD", df[["_id"]]), ]
  }

  # count number of records by eudract number
  tbl <- table(df[["_id"]], df[["a2_eudract_number"]])
  tbl <- as.matrix(tbl)
  # nms has names of all records
  nms <- dimnames(tbl)[[1]]

  # nrs has eudract numbers for which is there more than 1 record
  nrs <- colSums(tbl)
  nrs <- nrs[nrs > 1]
  nrs <- names(nrs)

  # nst is a list of nrs trials of a logical vector along nms
  # that indicates if the indexed record belongs to the trial
  nms2 <- substr(nms, 1, 14)
  nst <- lapply(nrs, function(x) nms2 %in% x)

  # helper function to find the Member State version
  removeMSversions <- function(indexofrecords) {
    # given a vector of records (nnnn-nnnnnnn-nn-MS) of a single trial, this
    # returns all those _ids of records that do not correspond to the preferred
    # Member State record, based on the user's choices and defaults.
    # Function uses prefermemberstate, nms from the caller environment
    recordnames <- nms[indexofrecords]
    #
    # fnd should be only a single string, may need to be checked
    if (sum(fnd <- grepl(prefermemberstate, recordnames)) != 0) {
      result <- recordnames[!fnd]
      return(result)
    }
    #
    if (sum(fnd <- grepl("GB", recordnames)) != 0) {
      result <- recordnames[!fnd]
      return(result)
    }
    #
    # default is to list all but first record
    # the listed records are the duplicates
    # 3RD country trials would be listed first
    # hence selected, which is not desirable
    # unless chosen as prefermemberstate
    return(rev(sort(recordnames))[-1])
  }

  # finds per trial the desired record;
  # uses prefermemberstate and nms
  result <- lapply(nst,
                   function(x) removeMSversions(x))
  result <- unlist(result, use.names = FALSE)

  # eleminate the unwanted EUCTR records
  df <- df[!(df[["_id"]] %in% result), ]

  # also eliminate the meta-info record
  df <- df[!(df[["_id"]] == "meta-info"), ]

  # inform user about changes to data frame
  if (length(nms) > (tmp <- length(result))) {
    message(
      " - ", tmp,
      " EUCTR _id were not preferred EU Member State record for ",
      totalEuctr, " trials")
  }

  # return
  return(df)

}
# end dfFindUniqueEuctrRecord


#' Change type of field based on name of field
#'
#' @param dfi a data frame of columns _id, fieldname
#'
#' @keywords internal
#' @noRd
#'
typeField <- function(dfi) {

  # check
  if (ncol(dfi) != 2L) {
    stop("Expect data frame with two columns, _id and a field.",
         call. = FALSE)
  }

  # clean up input
  # - if NA as string, change to NA
  dfi[grepl("^N/?A$|^ND$", dfi[, 2]), 2] <- NA
  # - give Month Year also a Day to work with as.Date
  dfi[, 2] <- sub("^([a-zA-Z]+) ([0-9]{4})$", "\\1 15, \\2", dfi[, 2])
  # - convert html entities
  if (any(grepl("&[a-z]+;", dfi[, 2]))) dfi[, 2] <- sapply(
    dfi[, 2], function(i) xml2::xml_text(xml2::read_html(charToRaw(i))))

  # for date time conversion
  lct <- Sys.getlocale("LC_TIME")
  Sys.setlocale("LC_TIME", "C")
  on.exit(Sys.setlocale("LC_TIME", lct))

  # main typing functions
  ctrDate      <- function() as.Date(dfi[, 2], format = "%Y-%m-%d")
  ctrDateUs    <- function() as.Date(dfi[, 2], format = "%b %e, %Y")
  ctrDateCtr   <- function() as.Date(dfi[, 2], format = "%Y-%m-%d %H:%M:%S")
  ctrDateTime  <- function() as.Date(dfi[, 2], format = "%Y-%m-%dT%H:%M:%S")
  ctrYesNo     <- function() vapply(dfi[, 2], FUN = function(x) switch(x, "Yes" = TRUE, "No" = FALSE, NA), logical(1L))
  ctrFalseTrue <- function() vapply(dfi[, 2], FUN = function(x) switch(x, "true" = TRUE, "false" = FALSE, NA), logical(1L))
  ctrInt       <- function() vapply(dfi[, 2], FUN = function(x) as.integer(x = x), integer(1L))

  # selective typing
  tmp <- try({
    switch(
      EXPR = names(dfi)[2],
      #
      #
      # dates
      #
      # - intern
      "record_last_import" = ctrDateCtr(),
      # - EUCTR
      "n_date_of_ethics_committee_opinion"     = ctrDate(),
      "n_date_of_competent_authority_decision" = ctrDate(),
      "p_date_of_the_global_end_of_the_trial"  = ctrDate(),
      "x6_date_on_which_this_record_was_first_entered_in_the_eudract_database" = ctrDate(),
      "firstreceived_results_date"             = ctrDate(),
      "trialInformation.primaryCompletionDate" = ctrDate(),
      "trialInformation.globalEndOfTrialDate"  = ctrDateTime(),
      "trialInformation.recruitmentStartDate"  = ctrDateTime(),
      # - CTGOV
      "start_date"              = ctrDateUs(),
      "primary_completion_date" = ctrDateUs(),
      "completion_date"         = ctrDateUs(),
      "study_first_posted"      = ctrDateUs(),
      "results_first_posted"    = ctrDateUs(),
      "last_update_posted"      = ctrDateUs(),
      # - ISRCTN
      "participants.recruitmentStart" = ctrDateTime(),
      "participants.recruitmentEnd"   = ctrDateTime(),
      "trialDesign.overallStartDate"  = ctrDateTime(),
      "trialDesign.overallEndDate"    = ctrDateTime(),
      #
      # factors
      #
      # - EUCTR Yes / No / Information not present in EudraCT
      "d21_imp_to_be_used_in_the_trial_has_a_marketing_authorisation" = ctrYesNo(),
      "e13_condition_being_studied_is_a_rare_disease" = ctrYesNo(),
      #
      "e61_diagnosis"         = ctrYesNo(),
      "e62_prophylaxis"       = ctrYesNo(),
      "e63_therapy"           = ctrYesNo(),
      "e64_safety"            = ctrYesNo(),
      "e65_efficacy"          = ctrYesNo(),
      "e66_pharmacokinetic"   = ctrYesNo(),
      "e67_pharmacodynamic"   = ctrYesNo(),
      "e68_bioequivalence"    = ctrYesNo(),
      "e69_dose_response"     = ctrYesNo(),
      "e610_pharmacogenetic"  = ctrYesNo(),
      "e611_pharmacogenomic"  = ctrYesNo(),
      "e612_pharmacoeconomic" = ctrYesNo(),
      "e613_others"           = ctrYesNo(),
      #
      "e71_human_pharmacology_phase_i"         = ctrYesNo(),
      "e711_first_administration_to_humans"    = ctrYesNo(),
      "e712_bioequivalence_study"              = ctrYesNo(),
      "e713_other"                             = ctrYesNo(),
      "e72_therapeutic_exploratory_phase_ii"   = ctrYesNo(),
      "e73_therapeutic_confirmatory_phase_iii" = ctrYesNo(),
      "e74_therapeutic_use_phase_iv"           = ctrYesNo(),
      #
      "e81_controlled"      = ctrYesNo(),
      "e811_randomised"     = ctrYesNo(),
      "e812_open"           = ctrYesNo(),
      "e813_single_blind"   = ctrYesNo(),
      "e814_double_blind"   = ctrYesNo(),
      "e815_parallel_group" = ctrYesNo(),
      "e816_cross_over"     = ctrYesNo(),
      "e817_other"          = ctrYesNo(),
      #
      "e83_the_trial_involves_single_site_in_the_member_state_concerned"    = ctrYesNo(),
      "e84_the_trial_involves_multiple_sites_in_the_member_state_concerned" = ctrYesNo(),
      "e85_the_trial_involves_multiple_member_states"                       = ctrYesNo(),
      "e861_trial_being_conducted_both_within_and_outside_the_eea"          = ctrYesNo(),
      "e862_trial_being_conducted_completely_outside_of_the_eea"            = ctrYesNo(),
      "e87_trial_has_a_data_monitoring_committee"                           = ctrYesNo(),
      #
      "f11_trial_has_subjects_under_18"            = ctrYesNo(),
      "f111_in_utero"                              = ctrYesNo(),
      "f112_preterm_newborn_infants_up_to_gestational_age__37_weeks" = ctrYesNo(),
      "f113_newborns_027_days"                     = ctrYesNo(),
      "f114_infants_and_toddlers_28_days23_months" = ctrYesNo(),
      "f115_children_211years"                     = ctrYesNo(),
      "f116_adolescents_1217_years"                = ctrYesNo(),
      "f12_adults_1864_years"                      = ctrYesNo(),
      "f13_elderly_65_years"                       = ctrYesNo(),
      "f21_female"                                 = ctrYesNo(),
      "f22_male"                                   = ctrYesNo(),
      "f31_healthy_volunteers"                     = ctrYesNo(),
      "f32_patients"                               = ctrYesNo(),
      "f33_specific_vulnerable_populations"        = ctrYesNo(),
      "f331_women_of_childbearing_potential_not_using_contraception_" = ctrYesNo(),
      "f332_women_of_childbearing_potential_using_contraception"      = ctrYesNo(),
      "f333_pregnant_women"      = ctrYesNo(),
      "f334_nursing_women"       = ctrYesNo(),
      "f335_emergency_situation" = ctrYesNo(),
      "f336_subjects_incapable_of_giving_consent_personally" = ctrYesNo(),
      #
      # - CTGOV
      "has_expanded_access"            = ctrYesNo(),
      "oversight_info.has_dmc"         = ctrYesNo(),
      "eligibility.healthy_volunteers" = ctrYesNo(),
      #
      # - ISRCTN
      "trialDescription.acknowledgment" = ctrFalseTrue(),
      "results.biomedRelated"           = ctrFalseTrue(),
      #
      # numbers
      #
      # - EUCTR
      "e824_number_of_treatment_arms_in_the_trial" = ctrInt(),
      "e891_in_the_member_state_concerned_years"   = ctrInt(),
      "e891_in_the_member_state_concerned_months"  = ctrInt(),
      "e891_in_the_member_state_concerned_days"    = ctrInt(),
      "e892_in_all_countries_concerned_by_the_trial_years"  = ctrInt(),
      "e892_in_all_countries_concerned_by_the_trial_months" = ctrInt(),
      "e892_in_all_countries_concerned_by_the_trial_days"   = ctrInt(),
      "e841_number_of_sites_anticipated_in_member_state_concerned" = ctrInt(),
      "f11_number_of_subjects_for_this_age_range"   = ctrInt(),
      "f1111_number_of_subjects_for_this_age_range" = ctrInt(),
      "f1121_number_of_subjects_for_this_age_range" = ctrInt(),
      "f1131_number_of_subjects_for_this_age_range" = ctrInt(),
      "f1141_number_of_subjects_for_this_age_range" = ctrInt(),
      "f1151_number_of_subjects_for_this_age_range" = ctrInt(),
      "f1161_number_of_subjects_for_this_age_range" = ctrInt(),
      "f121_number_of_subjects_for_this_age_range"  = ctrInt(),
      "f131_number_of_subjects_for_this_age_range"  = ctrInt(),
      "f41_in_the_member_state"          = ctrInt(),
      "f421_in_the_eea"                  = ctrInt(),
      "f422_in_the_whole_clinical_trial" = ctrInt(),
      #
      # - CTGOV
      "number_of_arms" = ctrInt(),
      "enrollment"     = ctrInt(),
      #
      # - ISRCTN
      "participants.targetEnrolment"      = ctrInt(),
      "participants.totalTarget"          = ctrInt(),
      "participants.totalFinalEnrolment"  = ctrInt(),
      "externalRefs.protocolSerialNumber" = ctrInt(),
      #
      # TODO results-related variables
      "trialInformation.analysisForPrimaryCompletion" = ctrFalseTrue()
      #
    )
  },
  silent = TRUE)

  # prepare output
  if (!inherits(tmp, "try-error") &&
      !is.null(unlist(tmp, use.names = FALSE))) {

    # need to construct new data frame, because
    # replacing columns with posixct does not work
    dfn <- names(dfi)
    dfi <- data.frame(dfi[["_id"]],
                      tmp,
                      stringsAsFactors = FALSE)
    names(dfi) <- dfn

  }

  # return
  return(dfi)

} # end typeField


#' Annotate ctrdata function return values
#'
#' @param x object to be annotated
#'
#' @inheritParams ctrDb
#'
#' @keywords internal
#' @noRd
#'
addMetaData <- function(x, con) {

  # add metadata
  attr(x, "ctrdata-dbname")         <- con$db
  attr(x, "ctrdata-table")          <- con$collection
  attr(x, "ctrdata-dbqueryhistory") <- dbQueryHistory(
    con = con,
    verbose = FALSE)

  # return annotated object
  return(x)

} # end addMetaData


#' Function to set proxy
#'
#' @importFrom curl ie_proxy_info
#'
#' @keywords internal
#' @noRd
#'
setProxy <- function() {

  # only act if environment
  # variable is not already set
  if (Sys.getenv("https_proxy") == "") {

    # works under windows only
    p <- curl::ie_proxy_info()$Proxy

    if (!is.null(p)) {

      # used by httr and curl
      Sys.setenv(https_proxy = p)

    }
  }
} # end setproxy


#' Convenience function to install a minimal cygwin environment under MS
#' Windows, including perl, sed and php
#'
#' Alternatively and in case of difficulties, download and run the cygwin
#' setup yourself as follows: \code{cygwinsetup.exe --no-admin --quiet-mode
#' --verbose --upgrade-also --root c:/cygwin --site
#' http://www.mirrorservice.org/sites/sourceware.org/pub/cygwin/ --packages
#' perl,php-jsonc,php-simplexml}
#'
#' @export
#'
#' @param force Set to \code{TRUE} to force updating and overwriting an existing
#'   installation in \code{c:\\cygwin}
#' @param proxy Specify any proxy to be used for downloading via http, e.g.
#'   "host_or_ip:port". \code{installCygwinWindowsDoInstall} may detect and use
#'   the proxy configuration used in MS Windows to use an automatic proxy
#'   configuration script. Authenticated proxies are not supported at this time.
#'
installCygwinWindowsDoInstall <- function(
  force = FALSE,
  proxy = "") {

  # checks
  if (.Platform$OS.type != "windows") {
    stop(
      "This function is only for MS Windows operating systems.",
      call. = FALSE)
  }
  #
  if (!force & dir.exists("c:\\cygwin")) {
    message("cygwin is already installed in c:\\cygwin. ",
            "To re-install, use force = TRUE.")
    # exit function after testing
    return(installCygwinWindowsTest(verbose = TRUE))
  }

  # define installation command
  installcmd <- paste0(
    "--no-admin --quiet-mode --upgrade-also --no-shortcuts --prune-install ",
    "--root c:/cygwin ",
    "--site http://www.mirrorservice.org/sites/sourceware.org/pub/cygwin/ ",
    "--packages perl,php-simplexml,php-json")

  # create R session temporary directory
  tmpfile <- paste0(tempdir(), "/cygwin_inst")
  dir.create(tmpfile)
  dstfile <- paste0(tmpfile, "/cygwinsetup.exe")

  # generate download url
  tmpurl <- ifelse(
    grepl("x64", utils::win.version()),
    "setup-x86_64.exe",
    "setup-x86.exe")
  tmpurl <- paste0("https://cygwin.org/", tmpurl)

  # inform user
  message("Attempting cygwin download using ",
          tmpurl, " ...")

  # check and set proxy if needed to access internet
  setProxy()

  # download.file uses the proxy configured in the system
  tmpdl <- try({
    utils::download.file(
      url = tmpurl,
      destfile = dstfile,
      quiet = FALSE,
      mode = "wb")
  }, silent = TRUE)

  # check
  if (!file.exists(dstfile) ||
      file.size(dstfile) < (5 * 10 ^ 5) ||
      (inherits(tmpdl, "try-error"))) {
    stop("Failed, please download manually and install with:\n",
         tmpurl, " ", installcmd,
         call. = FALSE)
  }

  # proxy handling
  if (proxy != "") {
    # manual setting overriding
    proxy <- paste0(" --proxy ", proxy)
    message("Setting cygwin proxy install argument to: ",
            proxy, ", based on provided parameter.")
  } else {
    # detect proxy
    proxy <- curl::ie_proxy_info()$Proxy
    if (!is.null(proxy)) {
      message("Setting cygwin proxy install argument to: ",
              proxy, ", based on system settings.")
      proxy <- paste0(" --proxy ", proxy)
    }
  }

  # execute cygwin setup command
  system(paste0(dstfile, " ", installcmd,
                " --local-package-dir ", tmpfile, " ", proxy))

  # return cygwin installation test
  return(installCygwinWindowsTest(verbose = TRUE))

}
# end installCygwinWindowsDoInstall


#' Convenience function to test for working cygwin installation
#'
#' @param verbose If \code{TRUE}, prints confirmatory
#'  message (default \code{FALSE})
#'
#' @return Information if cygwin can be used, \code{TRUE}
#'  or \code{FALSE}, or NULL if not under MS Windows
#'
#' @keywords internal
#' @noRd
#
installCygwinWindowsTest <- function(verbose = FALSE) {
  #
  if (.Platform$OS.type != "windows") {
    message("Function installCygwinWindowsTest() is ",
            "only for MS Windows operating systems.")
    return(invisible(NULL))
  }
  #
  tmpcygwin <- try({
    suppressWarnings(
      system(
        paste0("cmd.exe /c ",
               rev(Sys.glob("c:\\cygw*\\bin\\bash.exe"))[1],
               " --version"),
        intern = TRUE,
        ignore.stderr = TRUE
      ))},
    silent = TRUE)
  #
  if (!inherits(tmpcygwin, "try-error") &
      (length(tmpcygwin) > 5L)) {
    if (verbose) message("cygwin seems to work correctly")
    return(invisible(TRUE))
  } else {
    message(
      "cygwin is not available, ctrLoadQueryIntoDb() will not work.\n",
      "Consider calling ctrdata::installCygwinWindowsDoInstall()")
    return(invisible(FALSE))
  }
}
# end installCygwinWindowsTest


#' Check availability of binaries installed locally
#'
#' @param commandtest Command to be used for testing
#' the availability of the binary, e.g. "php -v".
#' Note internal quotes need to be escaped, e.g.
#' \code{installFindBinary('php -r
#' \"simplexml_load_string(\'\');\"')}.
#' See R/onload.R for tested binaries.
#'
#' @param verbose Set to \code{TRUE} to see printed
#' return value of \code{commandtest}
#'
#' @return A logical if executing commandtest
#' returned an error or not
#'
#' @keywords internal
#' @noRd
#
installFindBinary <- function(commandtest = NULL, verbose = FALSE) {
  #
  if (is.null(commandtest)) {
    stop("Empty argument: commandtest",
         call. = FALSE)
  }
  #
  if (.Platform$OS.type == "windows") {
    commandtest <-
      paste0(rev(Sys.glob("c:\\cygw*\\bin\\bash.exe"))[1],
             " --login -c ",
             shQuote(commandtest))
  }
  #
  if (verbose) print(commandtest)
  #
  commandresult <- try(
    suppressWarnings(
      system(commandtest,
             intern = TRUE,
             ignore.stderr =
               ifelse(.Platform$OS.type == "windows",
                      FALSE, TRUE))),
    silent = TRUE
  )
  #
  commandreturn <- ifelse(
    inherits(commandresult, "try-error") ||
      grepl("error|not found", tolower(paste(commandresult, collapse = " "))) ||
      (!is.null(attr(commandresult, "status")) &&
         (attr(commandresult, "status") != 0)),
    FALSE, TRUE)
  #
  if (!commandreturn) {
    # warning(commandtest, " not found.",
    #         call. = FALSE,
    #         immediate. = FALSE)
  } else {
    if (interactive()) {
      message(". ", appendLF = FALSE)
    }
  }
  #
  if (verbose) {
    print(commandresult)
  }
  #
  return(commandreturn)
  #
}
# end installFindBinary


#' checkBinary
#'
#' @param b Vector of pre-defined binaries to be tested
#'
#' @keywords internal
#' @noRd
#'
#' @return Logical, \code{TRUE} if all binaries ok
#'
checkBinary <- function(b = NULL) {

  # check actions and user infos
  actionsInfos <- list(
    "notworking" = c("nonexistingbinarytested",
                     "nonexistingbinarytested not found"),
    "php" = c("php --version",
              "php not found, ctrLoadQueryIntoDb() will not work "),
    "phpxml" = c("php -r 'simplexml_load_string(\"\");'",
                 "php xml not found, ctrLoadQueryIntoDb() will not work "),
    "phpjson" = c("php -r 'json_encode(\"<foo>\");'",
                  "php json not found, ctrLoadQueryIntoDb() will not work "),
    "sed" = c("echo x | sed s/x/y/",
              "sed not found, ctrLoadQueryIntoDb() will not work "),
    "perl" = c("perl -V:osname",
               "perl not found, ctrLoadQueryIntoDb() will not work ")
  )

  # check private environment and create if not found
  if (!exists(x = ".dbffenv", mode = "environment")) {
    .dbffenv <- new.env(parent = emptyenv())
  }

  # if input empty, just check all except test
  if (is.null(b)) b <- names(actionsInfos)[-1]

  # do check
  out <- sapply(X = b, function(bi) {

    # check input
    actionsInfo <- actionsInfos[[bi]]
    if (is.null(actionsInfo)) stop("Unknown binary to check: ", bi, call. = FALSE)

    # previously checked and successful?
    checked <- exists(x = paste0("bin_check_", bi), envir = .dbffenv)
    if (checked) checked <- get(x = paste0("bin_check_", bi), envir = .dbffenv)
    if (checked) return(TRUE)

    # continue to check binary
    ok <- installFindBinary(commandtest = actionsInfo[1])
    if (!ok) message("\n", actionsInfo[2], appendLF = FALSE)

    # store check to private environment
    assign(x = paste0("bin_check_", bi), value = ok, envir = .dbffenv)

    # return
    ok

  })

  # inform user
  if (!all(out)) message(
    "\nTo install command line binaries needed for the function ",
    "ctrLoadQueryIntoDb() of package ctrdata, see recommendations at ",
    "https://github.com/rfhb/ctrdata#",
    "2-command-line-tools-perl-sed-cat-and-php-52-or-higher",
    "\nAfter installation, detach and load package ctrdata again, ",
    "or restart the R session.\n")

  # return single value since
  # all tests need to be ok
  invisible(all(out))

}
