#' Experimental fuzzy join function
#'
#' \code{fuzzy_join} uses record linkage methods to match observations between two datasets where no perfect key fields exist.  For each row in x, \code{fuzzy_join} finds the closest row(s) in y. The distance is a weighted average of the string distances defined in \code{method} over multiple columns.
#'
#' @param x The master data.frame
#' @param y The using data.frame
#' @param exact Character vector specifying variables on which to match exactly. 
#' @param fuzzy Character vector specifying columns on which to match in a fuzzy way
#' @param gen Name of new variable with the distance between matched observations. Default to "distance".
#' @param suffixes A character vector of length 2 specifying  suffix of overlapping columns. Defaut to ".x" and ".y".
#' @param which With \code{which = TRUE}, returns a three columns data.tables where he first column corresponds to \code{x}'s row number, the second column corresponds to \code{y}'s row number and the third column corresponds to the score of the match. Default is \code{FALSE}, which returns a join with the rows in y.
#' @param w Numeric vector of the same length as \code{fuzzy} specifying the weights to use when summing across different column of \code{fuzzy}. Default to \code{rep(1, length(fuzzy))}.
#' @param na.score Numeric that specifies the distance between NA and another string. Default to 1/3
#' @param method See the \code{\link[stringdist]{stringdist}} documentation. Default to \code{"jw"}
#' @param p See  the \code{\link[stringdist]{stringdist}} documentation. Default to \code{0.1}
#' @param ... Other arguments to pass to \code{stringdist}. See the \code{\link[stringdist]{stringdist}} documentation.
#' @examples
#' library(stringdist)
#' library(dplyr)
#' x <- data_frame(a = c("france", "franc"), b = c("arras", "dijon"))
#' y <- data_frame(a = c("franc", "france"), b = c("arvars", "dijjon"))
#' fuzzy_join(x, y, fuzzy = c("a", "b"))
#' fuzzy_join(x, y, fuzzy = c("a", "b"), w = c(0.9, 0.1))
#' fuzzy_join(x,y, fuzzy = c("a", "b"), w = c(0, 0.9))
#' x <- data_frame(a = c(1, 1), b = c("arras", "dijon"))
#' y <- data_frame(a = c(1, 1), b = c("arvars", "dijjon"))
#' fuzzy_join(x, y, exact = "a", fuzzy = "b")
#' x <- data_frame(a = c(1, 2), b = c("arras", "dijon"))
#' y <- data_frame(a = c(1, 1), b = c("arvars", "dijjon"))
#' fuzzy_join(x, y, exact = "a", fuzzy = "b")
#' @details Typically, \code{x} is a dataset with dirty names, while \code{y} is the dataset with true names. When \code{exact} is specified, rows without matches are returned with distance NA.
#' @export
fuzzy_join <- function(x, y, exact = NULL, fuzzy = NULL, gen = "distance", suffixes = c(".x",".y"), which = FALSE, w = rep(1, length(fuzzy)), na.score = 1/3, method = "jw", p = 0.1, ...){

  try_require("stringdist")

  if (gen %in% union(names(x), names(y))) stop(gen, "already exists")
  if (!(length(w)==length(fuzzy))){
    stop("fuzzy and w must have the same length)")
  }
  #if (length(exact.or.NA)){
  #  condition <- build_condition(exact.or.NA, x)
  #} else{
  #  condition <-  NULL
  #}

  index.x <- tempname(c(names(x), names(y)))
  index.y <- tempname(c(names(x), names(y), index.x))
  w <- w/sum(w)

  ## create unique identifiers 
  x <- ungroup(x)
  y <- ungroup(y)
  tx <- mutate_(x, .dots = setNames(list(~row_number()), "x"))
  ty <- mutate_(y, .dots = setNames(list(~row_number()), "y"))

  # remove duplicates with respect to key columns in x and y
  ans.x <- mutate_(tx, .dots = setNames(list(~ group_indices_(tx, .dots = c(exact, fuzzy))), index.x))
  ans.y <- mutate_(ty, .dots = setNames(list(~ group_indices_(ty, .dots = c(exact, fuzzy))), index.y))

  ans.x <- select_(ans.x, .dots =  c(exact, fuzzy, index.x, "x"))
  ans.y <- select_(ans.y, .dots =  c(exact, fuzzy, index.y, "y"))
  ans.x = distinct_(ans.x, .dots = c(exact, fuzzy))
  ans.y = distinct_(ans.y, .dots = c(exact, fuzzy))
  merge.x <- select_(ans.x, .dots = c(index.x, "x"))
  merge.y <- select_(ans.y, .dots = c(index.y, "y"))

  # exact matching
  exact.matched <- suppressMessages(inner_join(ans.x, ans.y, on = c(exact, fuzzy)))
  exact.matched <- select_(exact.matched, .dots = interp(~index.x, index.x = as.name(index.x)), interp(~index.y, index.y = as.name(index.y)))
  exact.matched <- mutate_(exact.matched, .dots = setNames(list(~0), gen))
  length <- n_distinct(exact.matched[[index.x]])
  message(paste(length,"rows of x are exactly matched on all variables"))
  ans.x <- suppressMessages(anti_join(ans.x, ans.y, on = c(exact, fuzzy)))

  # fuzzy matching
  ans.y <- as.list(select_(ans.y, .dots = c(exact, ~everything())))
  setDT(ans.y)
  setkeyv(ans.y, exact)
  result <- lapply(seq_len(nrow(ans.x)), function(i){
    c(ans.x[[index.x]][i], score_row(l = ans.x[i,], index.y = index.y, ans.y = ans.y, exact = exact, fuzzy = fuzzy, w = w, method = method, p = p, na.score = na.score, ...))
    })
  result <- simplify2array(result, higher = FALSE)
  result <- t(result)
  result <- as.data.frame(result)

  # append exact and fuzzy matching
  result <- setNames(result, c(index.x, index.y, gen))
  out <- bind_rows(exact.matched, result)
  # add back duplicated
  out <- suppressMessages(left_join(out, merge.x, on = index.x))
  out <- suppressMessages(left_join(out, merge.y, on = index.y))
  out <- select_(out, .dots = c("x","y", gen))
  # if which = FALSE, output rows instead of row index
  if (!which){
    common_names = intersect(names(x), names(y))
    out <- suppressMessages(left_join(tx, out, by = "x"))
    out <- suppressMessages(left_join(out, ty, by = "y"))
    out <- select_(out, ~-one_of("x", "y")) 
    out <- select_(out, .dots = c(gen, ~everything()))
  }
  out
}


score_row <- function(l, index.y, ans.y, exact = NULL, fuzzy = NULL, w = rep(1, length(fuzzy)), ...){
  # binary search
  if (length(exact)){
    ans.y <- ans.y[as.list(l[exact]), nomatch = 0]
  }
  if (nrow(ans.y)==0){
    return(c(NA, NA))
  } 
  #if (!is.null(condition.exact.or.NA) && condition.exact.or.NA!=""){
  #  expression <- parse(text = condition.exact.or.NA)
  #  ans.y <- eval(substitute(ans.y[v,], list(v = expression)))
  #} 
  #if (nrow(ans.y)==0){
  #  	return(c(NA, NA))
  #} 
  tempv <- rep(0, nrow(ans.y))
  for (i in seq_along(fuzzy)){
	  tempv <- tempv + w[i]*stringdist2(l[[fuzzy[i]]], ans.y[[fuzzy[i]]], ...)
  }
  index <- which.min(tempv)
  return(c(ans.y[[index.y]][index], tempv[index]))
}

  

stringdist2 <- function(x, y, na.score,  ...){
  out <- stringdist::stringdist(x,y, ...)
  out[is.na(x)] <- na.score
  out[is.na(y)] <- na.score
  out
}


#build_condition <- function(exact.or.NA, ans.x){
#  condition <- NULL
#  for (i in seq_along(exact.or.NA)){
#    condition <- paste0(condition, 
#    	ifelse(
#    		!is.na(ans.x[[exact.or.NA[i]]]),
#    		paste0("&(", exact.or.NA[i], " == ", ans.x[[exact.or.NA[i]]], " | is.na(", exact.or.NA[i],"))"),
#    		"")
#    	)
#  }
#  str_replace(condition, fixed("&"), "")
#}
