% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils.R
\name{tokenize_transform_vec_docs}
\alias{tokenize_transform_vec_docs}
\title{String tokenization and transformation ( vector of documents )}
\usage{
tokenize_transform_vec_docs(object = NULL, as_token = FALSE,
  to_lower = FALSE, to_upper = FALSE, utf_locale = "",
  remove_char = "", remove_punctuation_string = FALSE,
  remove_punctuation_vector = FALSE, remove_numbers = FALSE,
  trim_token = FALSE, split_string = FALSE,
  split_separator = " \\r\\n\\t.,;:()?!//", remove_stopwords = FALSE,
  language = "english", min_num_char = 1, max_num_char = Inf,
  stemmer = NULL, min_n_gram = 1, max_n_gram = 1, skip_n_gram = 1,
  skip_distance = 0, n_gram_delimiter = " ", concat_delimiter = NULL,
  path_2folder = "", threads = 1, vocabulary_path_file = NULL,
  verbose = FALSE)
}
\arguments{
\item{object}{a character string vector of documents}

\item{as_token}{if TRUE then the output of the function is a list of (split) tokens. Otherwise it is a vector of character strings (sentences)}

\item{to_lower}{either TRUE or FALSE. If TRUE the character string will be converted to lower case}

\item{to_upper}{either TRUE or FALSE. If TRUE the character string will be converted to upper case}

\item{utf_locale}{the language specific locale to use in case that either the \emph{to_lower} or the \emph{to_upper} parameter is TRUE and the text file language is other than English. For instance, if the language of a text file is Greek then the \emph{utf_locale} parameter should be \emph{'el_GR.UTF-8'} ( \emph{language_country.encoding} ). A wrong utf-locale does not raise an error; however, the runtime of the function increases.}

\item{remove_char}{a character string with specific characters that should be removed from the text file. If the \emph{remove_char} is "" then no removal of characters takes place}

\item{remove_punctuation_string}{either TRUE or FALSE. If TRUE then the punctuation of the character string will be removed (applies before the split function)}

\item{remove_punctuation_vector}{either TRUE or FALSE. If TRUE then the punctuation of the vector of the character strings will be removed (after the string split has taken place)}

\item{remove_numbers}{either TRUE or FALSE. If TRUE then any numbers in the character string will be removed}

\item{trim_token}{either TRUE or FALSE. If TRUE then the string will be trimmed (left and/or right)}

\item{split_string}{either TRUE or FALSE. If TRUE then the character string will be split using the \emph{split_separator} as delimiter. The user can also specify multiple delimiters.}

\item{split_separator}{a character string specifying the character delimiter(s)}

\item{remove_stopwords}{either TRUE, FALSE or a character vector of user defined stop words. If TRUE then by using the \emph{language} parameter the corresponding stop words vector will be loaded.}

\item{language}{a character string which defaults to english. If the \emph{remove_stopwords} parameter is TRUE then the corresponding stop words vector will be loaded. Available languages
are \emph{afrikaans}, \emph{arabic}, \emph{armenian}, \emph{basque}, \emph{bengali}, \emph{breton}, \emph{bulgarian}, \emph{catalan},
\emph{croatian}, \emph{czech}, \emph{danish}, \emph{dutch}, \emph{english}, \emph{estonian},
\emph{finnish}, \emph{french}, \emph{galician}, \emph{german}, \emph{greek}, \emph{hausa}, \emph{hebrew}, \emph{hindi}, \emph{hungarian},
\emph{indonesian}, \emph{irish}, \emph{italian}, \emph{latvian}, \emph{marathi},
\emph{norwegian}, \emph{persian}, \emph{polish}, \emph{portuguese}, \emph{romanian}, \emph{russian}, \emph{slovak}, \emph{slovenian},
\emph{somalia}, \emph{spanish}, \emph{swahili}, \emph{swedish}, \emph{turkish}, \emph{yoruba}, \emph{zulu}}

\item{min_num_char}{an integer specifying the minimum number of characters to keep. If the \emph{min_num_char} is greater than 1 then only character strings with at least that many characters will be returned}

\item{max_num_char}{an integer specifying the maximum number of characters to keep. The \emph{max_num_char} should be less than or equal to \emph{Inf} (in this function the Inf value translates to a word-length of 1000000000)}

\item{stemmer}{a character string specifying the stemming method. The available method is the \emph{porter2_stemmer}. See details for more information.}

\item{min_n_gram}{an integer specifying the minimum number of n-grams. The minimum value of min_n_gram is 1.}

\item{max_n_gram}{an integer specifying the maximum number of n-grams. The minimum value of max_n_gram is 1.}

\item{skip_n_gram}{an integer specifying the number of skip-n-grams. The minimum value of skip_n_gram is 1. The skip_n_gram gives the (max.) n-grams using the \emph{skip_distance} parameter. If \emph{skip_n_gram} is greater than 1 then both \emph{min_n_gram} and \emph{max_n_gram} should be set to 1.}

\item{skip_distance}{an integer specifying the skip distance between the words. The minimum value for the skip distance is 0, in which case simple n-grams will be returned.}

\item{n_gram_delimiter}{a character string specifying the n-gram delimiter (applies to both n-gram and skip-n-gram cases)}

\item{concat_delimiter}{either NULL or a character string specifying the delimiter to use in order to concatenate the end-vector of character strings to a single character string (recommended in case that the end-vector should be saved to a file)}

\item{path_2folder}{a character string specifying the path to the folder where the file(s) will be saved}

\item{threads}{an integer specifying the number of cores to run in parallel}

\item{vocabulary_path_file}{either NULL or a character string specifying the output path to a file where the vocabulary should be saved once the text is tokenized}

\item{verbose}{either TRUE or FALSE. If TRUE then information will be printed out}
}
\value{
a character vector
}
\description{
String tokenization and transformation ( vector of documents )
}
\details{
It is memory efficient to give a \emph{path_2folder} in case that a big file should be saved, rather than return the vector of all character strings in the R-session.

The \emph{skip-grams} are a generalization of n-grams in which the components (typically words) need not be consecutive in the text under consideration, but may leave gaps that are skipped over. They provide one way of overcoming the \emph{data sparsity problem} found with conventional n-gram analysis.

Many character string pre-processing functions (such as the \emph{utf-locale} or the \emph{split-string} function) are based on the \emph{boost} library (\url{http://www.boost.org/}).

Stemming of the English language is done using the porter2-stemmer; for details see \url{https://github.com/smassung/porter2_stemmer}

The list of stop-words in the available languages was downloaded from the following link, \url{https://github.com/6/stopwords-json}
}
\examples{

library(textTinyR)

token_doc_vec = c("CONVERT to lower", "remove.. punctuation11234", "trim token and split ")

res = tokenize_transform_vec_docs(object = token_doc_vec, to_lower = TRUE, split_string = TRUE)
}
