% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/record_group.R
\name{record_group}
\alias{record_group}
\title{Multistage deterministic record linkage}
\usage{
record_group(df, sn = NULL, criteria, sub_criteria = NULL,
  data_source = NULL, group_stats = FALSE, display = TRUE,
  to_s4 = FALSE)
}
\arguments{
\item{df}{\code{data.frame}. One or more datasets appended together.}

\item{sn}{Unique numerical record identifier. Optional.}

\item{criteria}{Column names of attributes to match. Records with matching values in these columns are grouped together.}

\item{sub_criteria}{Matching sub-criteria. Additional matching conditions for each stage (\code{criteria}).}

\item{data_source}{Unique dataset identifier. Useful when \code{df} contains data from multiple sources.}

\item{group_stats}{If \code{TRUE}, output will include additional columns with useful stats for each record group.}

\item{display}{If \code{TRUE}, status messages are printed on screen.}

\item{to_s4}{If \code{TRUE}, changes the returned output to a \code{\link[=pid-class]{pid}} object.}
}
\value{
\code{data.frame} (\code{\link[=pid-class]{pid}} objects if \code{to_s4} is \code{TRUE})

\itemize{
\item \code{sn} - unique record identifier as provided
\item \code{pid | .Data} - unique group identifier
\item \code{pid_cri} - matched criteria for each record in the group
\item \code{pid_dataset} - data sources in each group
\item \code{pid_total} - number of records in each group
}

\code{pid} objects will be the default output from the next release.
}
\description{
Group matching records from one or more datasets.
}
\details{
Record grouping occurs in stages of matching \code{criteria}.

Records are matched in two ways: an exact match - the equivalent of \code{(==)}, or matching a range of numeric values.
An example of range matching is matching a date give or take 5 days, or matching an age give or take 2 years.
To do this, create a \code{\link{number_line}} object based on the range of values, and assign the actual value to \code{gid}.
Then use the \code{\link{number_line}} as a \code{sub_criteria}.

A match at each stage is considered more relevant than those at subsequent stages.
Therefore, \code{criteria} should be listed in order of decreasing relevance or certainty.

\code{sub_criteria} can be used to force additional matching conditions at each stage.
If \code{sub_criteria} is not \code{NULL}, only records with matching \code{criteria} and \code{sub_criteria} values are grouped together.
If a record has missing values for any \code{criteria}, it's skipped at that stage, and another attempt is made at the next stage.
If all \code{criteria} values are missing, that record is assigned a unique group ID.

When a \code{data_source} identifier is included,
\code{pid_dataset} is included in the output. This lists the source of every record in each record group.
}
\examples{
# Packages attached for the examples below
library(dplyr)
library(tidyr)

# Single-stage grouping: five records sharing three distinct forenames
three_people <- data.frame(forename=c("Obinna","James","Ojay","James","Obinna"),
                           stringsAsFactors = FALSE)

# Group records on `forename`; with `to_s4 = TRUE` the result is a `pid` object
three_people$pids_a <- record_group(three_people, criteria= forename, to_s4 = TRUE)
three_people

# To handle missing or unknown data, recode missing or unknown values to NA or "".
# Here the 1st and 4th forenames are set to NA before regrouping.
three_people$forename[c(1,4)] <- NA
three_people$pids_b <- record_group(three_people, criteria= forename, to_s4 =TRUE)
three_people

# Load and print the `staff_records` example dataset
data(staff_records); staff_records

# Range matching
# Start from the `sex` column and add an illustrative `age` for each of the 7 records
dob <- staff_records["sex"]
dob$age <- c(30,28,40,25,25,29,27)

# age range: from age to age + 20 years, with the actual age stored as `gid`
dob$range_a <- number_line(dob$age, dob$age+20, gid=dob$age)
# Group on `sex`, with `range_a` supplied as the sub-criteria
dob$pids_a <- record_group(dob, criteria = sex, sub_criteria = list(s1a="range_a"), to_s4 = TRUE)
dob[c("sex","age","range_a","pids_a")]

# age range: age +- 20 years
dob$range_b <- number_line(dob$age-20, dob$age+20, gid=dob$age)
dob$pids_b <- record_group(dob, criteria = sex, sub_criteria = list(s1a="range_b"), to_s4 = TRUE)
dob[c("sex","age","range_b","pids_b")]

# The number_line column is passed directly as the `criteria` here
dob$pids_c <- record_group(dob, criteria = range_b, to_s4 = TRUE)
dob[c("age","range_b","pids_c")]


# Multistage record grouping
# `criteria` lists `forename` first, then `surname`; `sex` is passed as `data_source`
staff_records$pids_a <- record_group(staff_records, sn = r_id, criteria = c(forename, surname),
                                     data_source = sex, display = FALSE, to_s4 = TRUE)
staff_records

# Add `sex` to the second stage (`cri`) to be more certain
# `surname` and `sex` are combined into a single `cri_2` column, separated by "-"
staff_records <- unite(staff_records, cri_2, c(surname, sex), sep ="-")
# Same call as before but with positional `sn` and `criteria`, and `dataset` as `data_source`
staff_records$pids_b <- record_group(staff_records, r_id, c(forename, cri_2),
                                     data_source = dataset, display = FALSE, to_s4 = TRUE)
staff_records

# Using sub-criteria
# Load and print the `missing_staff_id` example dataset
data(missing_staff_id); missing_staff_id

# `criteria` lists `staff_id` then `age`; the 4th positional argument is `sub_criteria`.
# NOTE(review): the `s2a` name presumably ties these columns to the second stage (`age`)
# -- confirm against the `sub_criteria` naming convention.
missing_staff_id$pids <- record_group(missing_staff_id, r_id, c(staff_id, age),
list(s2a=c("initials","hair_colour","branch_office")), data_source = source_1, to_s4 = TRUE)

missing_staff_id

}
\seealso{
\code{\link{episode_group}} and \code{\link{number_line}}
}
