% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dfm_select.R, R/fcm_select.R
\name{dfm_select}
\alias{dfm_select}
\alias{dfm_remove}
\alias{dfm_keep}
\alias{fcm_select}
\alias{fcm_remove}
\alias{fcm_keep}
\title{Select features from a dfm or fcm}
\usage{
dfm_select(
  x,
  pattern = NULL,
  selection = c("keep", "remove"),
  valuetype = c("glob", "regex", "fixed"),
  case_insensitive = TRUE,
  min_nchar = NULL,
  max_nchar = NULL,
  verbose = quanteda_options("verbose")
)

dfm_remove(x, ...)

dfm_keep(x, ...)

fcm_select(
  x,
  pattern = NULL,
  selection = c("keep", "remove"),
  valuetype = c("glob", "regex", "fixed"),
  case_insensitive = TRUE,
  verbose = quanteda_options("verbose"),
  ...
)

fcm_remove(x, ...)

fcm_keep(x, ...)
}
\arguments{
\item{x}{the \link{dfm} or \link{fcm} object whose features will be selected}

\item{pattern}{a character vector, list of character vectors, \link{dictionary},
or collocations object.  See \link{pattern} for details.}

\item{selection}{whether to \code{keep} or \code{remove} the features}

\item{valuetype}{the type of pattern matching: \code{"glob"} for "glob"-style
wildcard expressions; \code{"regex"} for regular expressions; or \code{"fixed"} for
exact matching. See \link{valuetype} for details.}

\item{case_insensitive}{logical; if \code{TRUE}, ignore case when matching a
\code{pattern} or \link{dictionary} values}

\item{min_nchar, max_nchar}{optional numerics specifying the minimum and
maximum length in characters for features to be removed or kept; defaults are
\code{NULL} for no limits.  These are applied after (and hence, in addition
to) any selection based on pattern matches.}

\item{verbose}{if \code{TRUE}, print a message about how many patterns were
removed}

\item{...}{used only for passing arguments from \code{dfm_remove},
\code{dfm_keep}, \code{fcm_remove}, or \code{fcm_keep} to \code{dfm_select}
or \code{fcm_select}. Cannot include \code{selection}.}
}
\value{
A \link{dfm} or \link{fcm} object, after the feature selection has
been applied.

For compatibility with earlier versions, when \code{pattern} is a
\link{dfm} object and \code{selection = "keep"}, then this will be
equivalent to calling \code{\link[=dfm_match]{dfm_match()}}.  In this case, the following
settings are always used: \code{case_insensitive = FALSE}, and
\code{valuetype = "fixed"}.  This functionality is deprecated, however, and
you should use \code{\link[=dfm_match]{dfm_match()}} instead.
}
\description{
This function selects or removes features from a \link{dfm} or \link{fcm},
based on feature name matches with \code{pattern}.  The most common usages
are to eliminate features from a dfm already constructed, such as stopwords,
or to select only terms of interest from a dictionary.
}
\details{
\code{dfm_remove} and \code{fcm_remove} are simply convenience
wrappers for calling \code{dfm_select} and \code{fcm_select} with
\code{selection = "remove"}.

\code{dfm_keep} and \code{fcm_keep} are simply convenience wrappers for
calling \code{dfm_select} and \code{fcm_select} with \code{selection = "keep"}.
}
\note{
This function selects features based on their labels.  To select
features based on the values of the document-feature matrix, use
\code{\link[=dfm_trim]{dfm_trim()}}.
}
\examples{
# build a dfm from two short texts, keeping the original case (tolower = FALSE)
dfmat <- tokens(c("My Christmas was ruined by your opposition tax plan.",
               "Does the United_States or Sweden have more progressive taxation?")) \%>\%
    dfm(tolower = FALSE)
# a dictionary whose values serve as selection patterns; France and blahblah
# do not occur in the texts above
dict <- dictionary(list(countries = c("United_States", "Sweden", "France"),
                        wordsEndingInY = c("by", "my"),
                        notintext = "blahblah"))
# select by dictionary values, using the default selection and valuetype
dfm_select(dfmat, pattern = dict)
# same selection, but with case-sensitive matching
dfm_select(dfmat, pattern = dict, case_insensitive = FALSE)
# keep, then remove, features matching regular expressions
dfm_select(dfmat, pattern = c("s$", ".y"), selection = "keep", valuetype = "regex")
dfm_select(dfmat, pattern = c("s$", ".y"), selection = "remove", valuetype = "regex")
# keep, then remove, features that exactly match English stopwords
dfm_select(dfmat, pattern = stopwords("english"), selection = "keep", valuetype = "fixed")
dfm_select(dfmat, pattern = stopwords("english"), selection = "remove", valuetype = "fixed")

# select based on character length
dfm_select(dfmat, min_nchar = 5)

# remove English stopwords from a dfm using the dfm_remove wrapper
dfmat <- dfm(tokens(c("This is a document with lots of stopwords.",
                      "No if, and, or but about it: lots of stopwords.")))
dfmat
dfm_remove(dfmat, stopwords("english"))
# remove English stopwords from an fcm built from tokens without punctuation
toks <- tokens(c("this contains lots of stopwords",
                 "no if, and, or but about it: lots"),
               remove_punct = TRUE)
fcmat <- fcm(toks)
fcmat
fcm_remove(fcmat, stopwords("english"))
}
\seealso{
\code{\link[=dfm_match]{dfm_match()}}
}
\keyword{dfm}
