% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/phonetise.R
\name{phonetise}
\alias{phonetise}
\alias{phonetize}
\title{Tokenise IPA strings}
\usage{
phonetise(
  strings,
  multi = NULL,
  regex = NULL,
  split = TRUE,
  sep = " ",
  sanitise = TRUE,
  ignore_stress = TRUE,
  ignore_tone = TRUE,
  diacritics = FALSE,
  affricates = FALSE,
  v_sequences = FALSE,
  prenasalised = FALSE,
  all_multi = FALSE,
  sanitize = sanitise
)

phonetize(
  strings,
  multi = NULL,
  regex = NULL,
  split = TRUE,
  sep = " ",
  sanitise = TRUE,
  ignore_stress = TRUE,
  ignore_tone = TRUE,
  diacritics = FALSE,
  affricates = FALSE,
  v_sequences = FALSE,
  prenasalised = FALSE,
  all_multi = FALSE,
  sanitize = sanitise
)
}
\arguments{
\item{strings}{A character vector of strings in IPA (for example, phonetic
transcriptions of words).}

\item{multi}{A character vector of one or more multi-character phones as
strings.}

\item{regex}{A string with a regular expression to match several
multi-character phones.}

\item{split}{If set to \code{TRUE} (the default), the tokenised strings are split
into phones (i.e. the output is a vector with one element per phone). If
set to \code{FALSE}, the string is not split and the phones are separated with
the character defined in \code{sep}.}

\item{sep}{A character to be used as the separator of the phones if \code{split = FALSE} (default is \verb{ }, space).}

\item{sanitise}{Whether to remove all non-IPA characters (\code{TRUE} by default).}

\item{ignore_stress}{If \code{TRUE} (the default), stress marks are not parsed.}

\item{ignore_tone}{If \code{TRUE} (the default), tone marks and tone letters are not parsed.}

\item{diacritics}{If set to \code{TRUE}, parses all valid diacritics as part of
the previous character (\code{FALSE} by default).}

\item{affricates}{If set to \code{TRUE}, parses homorganic stop + fricative
sequences as affricates (\code{FALSE} by default).}

\item{v_sequences}{If set to \code{TRUE}, collapses vowel sequences (\code{FALSE} by
default).}

\item{prenasalised}{If set to \code{TRUE}, parses prenasalised consonants as such
(\code{FALSE} by default).}

\item{all_multi}{If set to \code{TRUE}, \code{diacritics}, \code{affricates}, \code{v_sequences}
and \code{prenasalised} are all set to \code{TRUE} (\code{FALSE} by default).}

\item{sanitize}{Alias of \code{sanitise}.}
}
\value{
A list of phonetised strings.
}
\description{
\code{phonetise()} tokenises strings of IPA symbols (like phonetic transcriptions
of words) into individual "phones". The output is a list.
}
\examples{
# using unicode escapes for CRAN policy
ipa <- c("p\u02B0a\u0303k\u02B0", "t\u02B0um\u0325", "\u025Bk\u02B0\u026F")
ph <- c("p\u02B0", "t\u02B0", "k\u02B0", "a\u0303", "m\u0325")

phonetise(ipa, multi = ph)

ph_2 <- ph[4:5]

# Match any character followed by <\u02B0> with ".\u02B0".
phonetise(ipa, multi = ph_2, regex = ".\u02B0")

# Same result.
phonetise(ipa, regex = ".(\u0303|\u0325|\u02B0)")

# Don't split strings and use "." as separator
phonetise(ipa, multi = ph, split = FALSE, sep = ".")

}
