% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/textstat_simil.R
\name{textstat_simil}
\alias{textstat_simil}
\alias{textstat_dist}
\title{Similarity and distance computation between documents or features}
\usage{
textstat_simil(
  x,
  y = NULL,
  selection = NULL,
  margin = c("documents", "features"),
  method = c("correlation", "cosine", "jaccard", "ejaccard", "dice", "edice", "hamann",
    "simple matching"),
  min_simil = NULL,
  ...
)

textstat_dist(
  x,
  y = NULL,
  selection = NULL,
  margin = c("documents", "features"),
  method = c("euclidean", "manhattan", "maximum", "canberra", "minkowski"),
  p = 2,
  ...
)
}
\arguments{
\item{x, y}{\link[quanteda:dfm]{dfm} objects; \code{y} is an optional target matrix
matching \code{x} in the margin on which the similarity or distance will be
computed.}

\item{selection}{(deprecated - use \code{y} instead).}

\item{margin}{identifies the margin of the dfm on which similarity or
distance will be computed:  \code{"documents"} for documents or
\code{"features"} for word/term features.}

\item{method}{character; the method identifying the similarity or distance
measure to be used; see Details.}

\item{min_simil}{numeric; a threshold for the similarity values; values
below this threshold will not be returned.}

\item{...}{unused}

\item{p}{The power of the Minkowski distance.}
}
\value{
A sparse matrix from the \pkg{Matrix} package that will be symmetric
unless \code{y} is specified.
}
\description{
These functions compute matrices of distances and similarities between
documents or features from a \link[quanteda:dfm]{dfm} and return a matrix of
similarities or distances in a sparse format.  These methods are fast and
robust because they operate directly on the sparse \link[quanteda:dfm]{dfm}
objects. The output can easily be coerced to an ordinary matrix, a data.frame
of pairwise comparisons, or a \link[stats:dist]{dist} format.
}
\details{
\code{textstat_simil} options are: \code{"correlation"} (default),
\code{"cosine"}, \code{"jaccard"}, \code{"ejaccard"}, \code{"dice"},
\code{"edice"}, \code{"simple matching"}, and \code{"hamann"}.

\code{textstat_dist} options are: \code{"euclidean"} (default),
\code{"manhattan"}, \code{"maximum"}, \code{"canberra"},
and \code{"minkowski"}.
}
\note{
If you want to compute similarity on a "normalized" dfm object
(controlling for variable document lengths, for methods such as correlation
for which different document lengths matter), then wrap the input dfm in
\code{\link[quanteda:dfm_weight]{dfm_weight}(x, "prop")}.
}
\section{Conversion to other data types}{

The output objects from \code{textstat_simil()} and \code{textstat_dist()} can be
transformed easily into a list format using
\code{\link[=as.list.textstat_proxy]{as.list()}}, which returns a list for each unique
element of the second of the pairs, a data.frame using
\code{\link[=as.data.frame.textstat_proxy]{as.data.frame()}}, which returns pairwise
scores, \code{as.dist()} for a \link[stats:dist]{dist} object,
or \code{as.matrix()} to convert it into an ordinary matrix.
}

\examples{
# similarities for documents
library("quanteda")
dfmat <- corpus_subset(data_corpus_inaugural, Year > 2000) \%>\%
    tokens(remove_punct = TRUE) \%>\%
    tokens_remove(stopwords("english")) \%>\%
    dfm()
(tstat1 <- textstat_simil(dfmat, method = "cosine", margin = "documents"))
as.matrix(tstat1)
as.list(tstat1)
as.list(tstat1, diag = TRUE)

# min_simil
(tstat2 <- textstat_simil(dfmat, method = "cosine", margin = "documents", min_simil = 0.6))
as.matrix(tstat2)

# similarities for specific documents
textstat_simil(dfmat, dfmat["2017-Trump", ], margin = "documents")
textstat_simil(dfmat, dfmat["2017-Trump", ], method = "cosine", margin = "documents")
textstat_simil(dfmat, dfmat[c("2009-Obama", "2013-Obama"), ], margin = "documents")

# compute some term similarities
tstat3 <- textstat_simil(dfmat, dfmat[, c("fair", "health", "terror")], method = "cosine",
                         margin = "features")
head(as.matrix(tstat3), 10)
as.list(tstat3, n = 6)


# distances for documents
(tstat4 <- textstat_dist(dfmat, margin = "documents"))
as.matrix(tstat4)
as.list(tstat4)
as.dist(tstat4)

# distances for specific documents
textstat_dist(dfmat, dfmat["2017-Trump", ], margin = "documents")
(tstat5 <- textstat_dist(dfmat, dfmat[c("2009-Obama" , "2013-Obama"), ], margin = "documents"))
as.matrix(tstat5)
as.list(tstat5)

\dontrun{
# plot a dendrogram after converting the object into distances
plot(hclust(as.dist(tstat4)))
}
}
\seealso{
\code{\link[=as.list.textstat_proxy]{as.list.textstat_proxy()}}, \code{\link[=as.data.frame.textstat_proxy]{as.data.frame.textstat_proxy()}},
\code{\link[stats:dist]{stats::as.dist()}}
}
