% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/read_naaccr.R
\name{read_naaccr_plain}
\alias{read_naaccr_plain}
\alias{read_naaccr}
\alias{read_naaccr_xml_plain}
\alias{read_naaccr_xml}
\title{Read NAACCR records from a file}
\usage{
read_naaccr_plain(
  input,
  version = NULL,
  format = NULL,
  keep_fields = NULL,
  skip = 0,
  nrows = Inf,
  buffersize = 10000,
  encoding = getOption("encoding")
)

read_naaccr(
  input,
  version = NULL,
  format = NULL,
  keep_fields = NULL,
  keep_unknown = FALSE,
  skip = 0,
  nrows = Inf,
  buffersize = 10000,
  encoding = getOption("encoding"),
  ...
)

read_naaccr_xml_plain(
  input,
  version = NULL,
  format = NULL,
  keep_fields = NULL,
  as_text = FALSE,
  encoding = getOption("encoding")
)

read_naaccr_xml(
  input,
  version = NULL,
  format = NULL,
  keep_fields = NULL,
  keep_unknown = FALSE,
  as_text = FALSE,
  encoding = getOption("encoding"),
  ...
)
}
\arguments{
\item{input}{Either a string with a file name (containing no \code{\\n}
character), a \code{\link[base:connections]{connection}} object, or the text records
themselves as a character vector.}

\item{version}{An integer specifying the NAACCR format version for parsing
the records. Use this or \code{format}, not both. If both \code{version}
and \code{format} are \code{NULL} (default), the most recent NAACCR format
will be used.}

\item{format}{A \code{\link{record_format}} object for parsing the records.}

\item{keep_fields}{Character vector of XML field names to keep in the
dataset. If \code{NULL} (default), all columns are kept.}

\item{skip}{An integer specifying the number of lines of the data file to
skip before beginning to read data.}

\item{nrows}{A number specifying the maximum number of records to read.
\code{Inf} (the default) means "all records."}

\item{buffersize}{Maximum number of lines to read at one time.}

\item{encoding}{String giving the input's encoding. See the 'Encoding'
section of \code{\link[base:connections]{file}} in the \pkg{base} package.
For \code{read_naaccr_xml} and \code{read_naaccr_xml_plain}, this is a
\emph{backup} encoding. If the XML document includes an encoding
specification, that will be used. Otherwise, \code{encoding} will be used.}

\item{keep_unknown}{Logical indicating whether values of "unknown" should be
kept as a level in the factor (\code{TRUE}) or replaced with \code{NA}
(\code{FALSE}, the default).}

\item{...}{Additional arguments passed on to \code{\link{as.naaccr_record}}.}

\item{as_text}{Logical indicating (if \code{TRUE}) that \code{input} is a
character string containing XML or (if \code{FALSE}) it is the path to a
file with XML content.}
}
\value{
For \code{read_naaccr} and \code{read_naaccr_xml}, a \code{data.frame} of
  the records.
  The columns included depend on the NAACCR \code{\link{record_format}} version.
  Columns are atomic vectors; there are too many to describe them all.

  For \code{read_naaccr_plain} and \code{read_naaccr_xml_plain}, a
  \code{data.frame} based on the \code{record_format} specified by either the
  \code{version} or \code{format} argument.
  The names of the columns will be those in the format's \code{name} column.
  All columns are character vectors.
}
\description{
Read and parse cancer incidence records according to a NAACCR format from
either fixed-width files (\code{read_naaccr} and \code{read_naaccr_plain})
or XML documents (\code{read_naaccr_xml} and \code{read_naaccr_xml_plain}).
}
\details{
\code{read_naaccr} and \code{read_naaccr_xml} return data sets suited for
analysis in R.
\code{read_naaccr_plain} and \code{read_naaccr_xml_plain} return data sets
with the unchanged record values.

Anyone who wants to analyze the records in R should use \code{read_naaccr}
or \code{read_naaccr_xml}.
In the returned \code{\link{naaccr_record}}, columns are of appropriate
classes, coded values are replaced with factors, and unknowns are replaced
with \code{NA}.

\code{read_naaccr_plain} and \code{read_naaccr_xml_plain} are "format strict"
ways to read incidence records.
All values returned are the literal character values from the records.
The only processing done is that leading and trailing whitespace is trimmed.
This is useful if the values will be passed to other software that expects
the plain NAACCR values.

For \code{read_naaccr_plain} and \code{read_naaccr}, if the \code{version}
and \code{format} arguments are left \code{NULL}, the default format is
version 18. This was the last format to be used for fixed-width files.
}
\note{
Some of the parameter text was shamelessly copied from the
  \code{\link[utils]{read.table}} and \code{\link[utils]{read.fwf}} help
  pages.
}
\examples{
  # This file has synthetic abstract records
  incfile <- system.file(
    "extdata", "synthetic-naaccr-18-abstract.txt",
    package = "naaccr"
  )
  fields <- c("ageAtDiagnosis", "sex", "sequenceNumberCentral")
  # Plain reader: returns the literal character values from the records
  read_naaccr_plain(incfile, version = 18, keep_fields = fields)
  # Parsed reader: returns columns converted for analysis in R
  recs <- read_naaccr(incfile, version = 18, keep_fields = fields)
  recs
  # Note sequenceNumberCentral has been split in two: a number and a flag
  summary(recs[["sequenceNumberCentral"]])
  summary(recs[["sequenceNumberCentralFlag"]])
}
\references{
North American Association of Central Cancer Registries (October 2018).
 Standards for Cancer Registries Volume II: Data Standards and Data Dictionary.
 Twenty-first edition.
 \url{http://datadictionary.naaccr.org/}.

 North American Association of Central Cancer Registries (April 2019).
 NAACCR XML Data Exchange Standard. Version 1.4.
 \url{https://www.naaccr.org/xml-data-exchange-standard/}.
}
\seealso{
\code{\link{naaccr_record}}
}
