% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/PsychWordVec.R
\name{data_transform}
\alias{data_transform}
\title{Transform plain text data of word vectors into a compressed ".RData" file}
\usage{
data_transform(
  file.load,
  file.save,
  sep = " ",
  header = "auto",
  encoding = "auto",
  compress = "bzip2",
  compress.level = 9,
  verbose = TRUE
)
}
\arguments{
\item{file.load}{File name of raw data (must be plain text).

Data must be in this format (values separated by \code{sep}):

cat 0.001 0.002 0.003 0.004 0.005 ... 0.300

dog 0.301 0.302 0.303 0.304 0.305 ... 0.600}

\item{file.save}{File name of to-be-saved R data (must be .RData).}

\item{sep}{Column separator. Defaults to \code{" "}.}

\item{header}{Is the 1st row a header (e.g., meta-information such as "2000000 300")?
Defaults to \code{"auto"}, which automatically determines whether there is a header.
If \code{TRUE}, then the 1st row will be dropped.}

\item{encoding}{File encoding. Defaults to \code{"auto"}
(using \code{\link[vroom:vroom_lines]{vroom::vroom_lines()}} to read the file quickly).
If set to any other value (e.g., \code{"UTF-8"}),
then \code{\link[base:readLines]{readLines()}} is used to read the file,
which is much slower than \code{vroom}.}

\item{compress}{Compression method for the saved file. Defaults to \code{"bzip2"}.

Options include:
\itemize{
  \item \code{1} or \code{"gzip"}: modest file size (fastest)
  \item \code{2} or \code{"bzip2"}: small file size (fast)
  \item \code{3} or \code{"xz"}: minimized file size (slow)
}}

\item{compress.level}{Compression level from \code{0} (none) to \code{9}
(maximal compression for minimal file size). Defaults to \code{9}.}

\item{verbose}{Print information to the console? Defaults to \code{TRUE}.}
}
\value{
A \code{data.table} (of new class \code{wordvec}) with two variables: \code{word} and \code{vec}.
}
\description{
Transform plain text data of word vectors into a compressed ".RData" file.

\emph{Speed}: In total (preprocess + compress + save),
it can process about 30000 words/min
with the slowest settings (\code{compress="xz"}, \code{compress.level=9})
on a modern computer (HP ProBook 450, Windows 11, Intel i7-1165G7 CPU, 32GB RAM).
}
\section{Download}{

Download pre-trained word vectors data (\code{.RData}):
\url{https://psychbruce.github.io/WordVector_RData.pdf}
}

\examples{
\dontrun{
# please first manually download plain text data of word vectors
# e.g., from: https://fasttext.cc/docs/en/crawl-vectors.html

# the text file must be on your disk
# the following code cannot run unless you have the file
library(bruceR)
set.wd()
data_transform(file.load="cc.zh.300.vec",   # plain text file
               file.save="cc.zh.300.vec.RData",  # RData file
               header=TRUE, compress="xz")  # of minimal size
}

}
\seealso{
\code{\link{data_wordvec_load}}

\code{\link{data_wordvec_normalize}}

\code{\link{data_wordvec_reshape}}

\code{\link{data_wordvec_subset}}
}
