% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/prep.R
\name{prep_dataset}
\alias{prep_dataset}
\alias{read_dataset}
\title{Prepare kraken report, k-mer statistics, UMI data}
\usage{
prep_dataset(
  fa1,
  kraken_report,
  kraken_out,
  fa2 = NULL,
  cb_and_umi = function(sequence_id, read1, read2) {
     list(substring(read1, 1L, 16L),
    substring(read1, 17L, 28L))
 },
  ranks = c("G", "S"),
  kmer_len = 35L,
  min_frac = 0.5,
  exclude = "9606",
  threads = 10L,
  overwrite = TRUE,
  odir = NULL
)

read_dataset(dir)
}
\arguments{
\item{fa1, fa2}{The path to microbiome fasta 1 and 2 file (returned by
\code{\link[=extract_kraken_reads]{extract_kraken_reads()}}).}

\item{kraken_report}{The path to kraken report file.}

\item{kraken_out}{The path of microbiome output file. Usually should be
filtered with \code{\link[=extract_kraken_output]{extract_kraken_output()}}.}

\item{cb_and_umi}{A function takes sequence id, read1, read2 and return a
list of 2 corresponding to cell barcode and UMI respectively., each should
have the same length of the input.}

\item{ranks}{Taxa ranks to analyze.}

\item{kmer_len}{Kraken kmer length. Default: \code{35L}, which is the default kmer
size of kraken2.}

\item{min_frac}{Minimum fraction of kmers directly assigned to taxid to use
read. Reads with \verb{<=min_frac} of the k-mers map inside the taxon's lineage
are also discarded.}

\item{exclude}{A character of taxid to exclude, for \code{SAHMI}, the host taxid.
Reads with any k-mers mapped to the \code{exclude} are discarded.}

\item{threads}{Number of threads to use.}

\item{overwrite}{A bool indicates whether to overwrite the files in \code{odir}.}

\item{odir}{A string of directory to save the results.}

\item{dir}{A string of directory containing the files returned by
\code{prep_dataset}.}
}
\value{
A list of three polars \link[polars:DataFrame_class]{DataFrame}:
\itemize{
\item kreport: Used by \code{\link[=slsd]{slsd()}}.
\item kmer: Used by \code{\link[=blsd]{blsd()}}.
\item umi: Used by \code{\link[=taxa_counts]{taxa_counts()}}.
}
}
\description{
Three elements returned by this function:
\itemize{
\item \code{kreport}: Used by \code{\link[=slsd]{slsd()}}.
\item \code{kmer}: Used by \code{\link[=blsd]{blsd()}}. The function count the number of k-mers and
unique k-mers assigned to a taxon across barcodes. The cell barcode
and unique molecular identifier (UMI) are used to identify unique
barcodes and reads.  Data is reported for taxa of pre-specified ranks
(default genus + species) taking into account all subsequently higher
resolution ranks. The output is a table of barcodes, taxonomic IDs,
number of k-mers, and number of unique k-mers.
\item \code{umi}: Used by \code{\link[=taxa_counts]{taxa_counts()}}.
}
}
\examples{
# for sequence from `umi-tools`, we can use following function
cb_and_umi <- function(sequence_id, read1, read2) {
    out <- lapply(
        strsplit(sequence_id, "_", fixed = TRUE),
        `[`, 2:3
    )
    lapply(1:2, function(i) {
        vapply(out, function(o) as.character(.subset2(o, i)), character(1L))
    })
}

\dontrun{
# 1. `fa1` and `fa2` should be the output of `extract_kraken_reads()`
# 2. `kraken_report` should be the output of `blit::kraken2()`
# 3. `kraken_out` should be the output of `extract_kraken_output()`
# 4. `dir`: you may want to specify the output directory since this process 
#           is time-consuming
sahmi_dataset <- prep_dataset(
    fa1 = "kraken_microbiome_reads.fa",
    # if you have paired sequence, please also specify `fa2`,
    # !!! Also pay attention to the file name of `fa1` (add suffix `_1`)
    # if you use paired reads.
    # fa2 = "kraken_microbiome_reads_2.fa",
    kraken_report = "kraken_report.txt",
    kraken_out = "kraken_microbiome_output.txt",
    odir = NULL
)
# you may want to prepare all datasets for subsequent workflows.
# `paths` should be the output directory for each sample from
# `blit::kraken2()`, `extract_kraken_output()` and `extract_kraken_reads()`.
sahmi_datasets <- lapply(paths, function(dir) {
    prep_dataset(
        fa1 = file.path(dir, "kraken_microbiome_reads.fa"),
        # fa2 = file.path(dir, "kraken_microbiome_reads_2.fa"),
        kraken_report = file.path(dir, "kraken_report.txt"),
        kraken_out = file.path(dir, "kraken_microbiome_output.txt"),
        odir = dir
    )
})
}
}
\seealso{
\url{https://github.com/sjdlabgroup/SAHMI}
}
