% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/loadSQM.R
\name{loadSQM}
\alias{loadSQM}
\title{Load a SqueezeMeta project into R}
\usage{
loadSQM(
  project_path,
  tax_mode = "prokfilter",
  trusted_functions_only = FALSE,
  engine = "data.table"
)
}
\arguments{
\item{project_path}{character, project directory generated by SqueezeMeta, or zip file generated by \code{sqm2zip.py}.}

\item{tax_mode}{character, which taxonomic classification should be loaded? SqueezeMeta applies the identity thresholds described in \href{https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4005636/}{Luo \emph{et al.}, 2014}. Use \code{allfilter} for applying the minimum identity threshold to all taxa, \code{prokfilter} for applying the threshold to Bacteria and Archaea, but not to Eukaryotes, and \code{nofilter} for applying no thresholds at all (default \code{prokfilter}).}

\item{trusted_functions_only}{logical. If \code{TRUE}, only highly trusted functional annotations (best hit + best average) will be considered when generating aggregated function tables. If \code{FALSE}, best hit annotations will be used (default \code{FALSE}). Will only have an effect if the \code{project_dir/results/tables} is not already present.}

\item{engine}{character. Engine used to load the ORFs and contigs tables. Either \code{data.frame} or \code{data.table} (significantly faster if your project is large). Default \code{data.table}.}
}
\value{
SQM object containing the parsed project.
}
\description{
This function takes the path to a project directory generated by \href{https://github.com/jtamames/SqueezeMeta}{SqueezeMeta} (whose name is specified in the \code{-p} parameter of the SqueezeMeta.pl script) and parses the results into a SQM object. Alternatively, it can load the project data from a zip file produced by \code{sqm2zip.py}.
}
\section{Prerequisites}{

{
Run \href{https://github.com/jtamames/SqueezeMeta}{SqueezeMeta}! An example call for running it would be:

        \code{/path/to/SqueezeMeta/scripts/SqueezeMeta.pl}\cr
        \code{-m coassembly -f fastq_dir -s samples_file -p project_dir}
}
}

\section{The SQM object structure}{

{
The SQM object is a nested list which contains the following information:
\tabular{lllllll}{
\bold{lvl1}         \tab \bold{lvl2}               \tab \bold{lvl3}          \tab \bold{type}             \tab \bold{rows/names} \tab \bold{columns} \tab \bold{data}        \cr
\bold{$orfs}        \tab \bold{$table}             \tab                      \tab \emph{dataframe}        \tab orfs              \tab misc. data     \tab misc. data         \cr
                    \tab \bold{$abund}             \tab                      \tab \emph{numeric matrix}   \tab orfs              \tab samples        \tab abundances (reads) \cr
                    \tab \bold{$bases}             \tab                      \tab \emph{numeric matrix}   \tab orfs              \tab samples        \tab abundances (bases) \cr
                    \tab \bold{$cov}               \tab                      \tab \emph{numeric matrix}   \tab orfs              \tab samples        \tab coverages          \cr
                    \tab \bold{$cpm}               \tab                      \tab \emph{numeric matrix}   \tab orfs              \tab samples        \tab covs. / 10^6 reads \cr
                    \tab \bold{$tpm}               \tab                      \tab \emph{numeric matrix}   \tab orfs              \tab samples        \tab tpm                \cr
                    \tab \bold{$seqs}              \tab                      \tab \emph{character vector} \tab orfs              \tab (n/a)          \tab sequences          \cr
                    \tab \bold{$tax}               \tab                      \tab \emph{character matrix} \tab orfs              \tab tax. ranks     \tab taxonomy           \cr
\bold{$contigs}     \tab \bold{$table}             \tab                      \tab \emph{dataframe}        \tab contigs           \tab misc. data     \tab misc. data         \cr
                    \tab \bold{$abund}             \tab                      \tab \emph{numeric matrix}   \tab contigs           \tab samples        \tab abundances (reads) \cr
                    \tab \bold{$bases}             \tab                      \tab \emph{numeric matrix}   \tab contigs           \tab samples        \tab abundances (bases) \cr
                    \tab \bold{$cov}               \tab                      \tab \emph{numeric matrix}   \tab contigs           \tab samples        \tab coverages          \cr
                    \tab \bold{$cpm}               \tab                      \tab \emph{numeric matrix}   \tab contigs           \tab samples        \tab covs. / 10^6 reads \cr
                    \tab \bold{$tpm}               \tab                      \tab \emph{numeric matrix}   \tab contigs           \tab samples        \tab tpm                \cr
                    \tab \bold{$seqs}              \tab                      \tab \emph{character vector} \tab contigs           \tab (n/a)          \tab sequences          \cr
                    \tab \bold{$tax}               \tab                      \tab \emph{character matrix} \tab contigs           \tab tax. ranks     \tab taxonomies         \cr
                    \tab \bold{$bins}              \tab                      \tab \emph{character matrix} \tab contigs           \tab bin. methods   \tab bins               \cr
$bins               \tab \bold{$table}             \tab                      \tab \emph{dataframe}        \tab bins              \tab misc. data     \tab misc. data         \cr
                    \tab \bold{$length}            \tab                      \tab \emph{numeric vector}   \tab bins              \tab (n/a)          \tab length             \cr
                    \tab \bold{$abund}             \tab                      \tab \emph{numeric matrix}   \tab bins              \tab samples        \tab abundances (reads) \cr
                    \tab \bold{$percent}           \tab                      \tab \emph{numeric matrix}   \tab bins              \tab samples        \tab abundances (reads) \cr
                    \tab \bold{$bases}             \tab                      \tab \emph{numeric matrix}   \tab bins              \tab samples        \tab abundances (bases) \cr
                    \tab \bold{$cov}               \tab                      \tab \emph{numeric matrix}   \tab bins              \tab samples        \tab coverages          \cr
                    \tab \bold{$cpm}               \tab                      \tab \emph{numeric matrix}   \tab bins              \tab samples        \tab covs. / 10^6 reads \cr
                    \tab \bold{$tax}               \tab                      \tab \emph{character matrix} \tab bins              \tab tax. ranks     \tab taxonomy           \cr
\bold{$taxa}        \tab \bold{$superkingdom}      \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab superkingdoms     \tab samples        \tab abundances (reads) \cr
                    \tab                           \tab \bold{$percent}      \tab \emph{numeric matrix}   \tab superkingdoms     \tab samples        \tab percentages        \cr
                    \tab \bold{$phylum}            \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab phyla             \tab samples        \tab abundances (reads) \cr
                    \tab                           \tab \bold{$percent}      \tab \emph{numeric matrix}   \tab phyla             \tab samples        \tab percentages        \cr
                    \tab \bold{$class}             \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab classes           \tab samples        \tab abundances (reads) \cr
                    \tab                           \tab \bold{$percent}      \tab \emph{numeric matrix}   \tab classes           \tab samples        \tab percentages        \cr
                    \tab \bold{$order}             \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab orders            \tab samples        \tab abundances (reads) \cr
                    \tab                           \tab \bold{$percent}      \tab \emph{numeric matrix}   \tab orders            \tab samples        \tab percentages        \cr
                    \tab \bold{$family}            \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab families          \tab samples        \tab abundances (reads) \cr
                    \tab                           \tab \bold{$percent}      \tab \emph{numeric matrix}   \tab families          \tab samples        \tab percentages        \cr
                    \tab \bold{$genus}             \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab genera            \tab samples        \tab abundances (reads) \cr
                    \tab                           \tab \bold{$percent}      \tab \emph{numeric matrix}   \tab genera            \tab samples        \tab percentages        \cr
                    \tab \bold{$species}           \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab species           \tab samples        \tab abundances (reads) \cr
                    \tab                           \tab \bold{$percent}      \tab \emph{numeric matrix}   \tab species           \tab samples        \tab percentages        \cr
\bold{$functions}   \tab \bold{$KEGG}              \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab KEGG ids          \tab samples        \tab abundances (reads) \cr
                    \tab                           \tab \bold{$bases}        \tab \emph{numeric matrix}   \tab KEGG ids          \tab samples        \tab abundances (bases) \cr
                    \tab                           \tab \bold{$cov}          \tab \emph{numeric matrix}   \tab KEGG ids          \tab samples        \tab coverages          \cr
                    \tab                           \tab \bold{$cpm}          \tab \emph{numeric matrix}   \tab KEGG ids          \tab samples        \tab covs. / 10^6 reads \cr
                    \tab                           \tab \bold{$tpm}          \tab \emph{numeric matrix}   \tab KEGG ids          \tab samples        \tab tpm                \cr
                    \tab                           \tab \bold{$copy_number}  \tab \emph{numeric matrix}   \tab KEGG ids          \tab samples        \tab avg. copies        \cr
                    \tab \bold{$COG}               \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab COG ids           \tab samples        \tab abundances (reads) \cr
                    \tab                           \tab \bold{$bases}        \tab \emph{numeric matrix}   \tab COG ids           \tab samples        \tab abundances (bases) \cr
                    \tab                           \tab \bold{$cov}          \tab \emph{numeric matrix}   \tab COG ids           \tab samples        \tab coverages          \cr
                    \tab                           \tab \bold{$cpm}          \tab \emph{numeric matrix}   \tab COG ids           \tab samples        \tab covs. / 10^6 reads \cr
                    \tab                           \tab \bold{$tpm}          \tab \emph{numeric matrix}   \tab COG ids           \tab samples        \tab tpm                \cr
                    \tab                           \tab \bold{$copy_number}  \tab \emph{numeric matrix}   \tab COG ids           \tab samples        \tab avg. copies        \cr
                    \tab \bold{$PFAM}              \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab PFAM ids          \tab samples        \tab abundances (reads) \cr
                    \tab                           \tab \bold{$bases}        \tab \emph{numeric matrix}   \tab PFAM ids          \tab samples        \tab abundances (bases) \cr
                    \tab                           \tab \bold{$cov}          \tab \emph{numeric matrix}   \tab PFAM ids          \tab samples        \tab coverages          \cr
                    \tab                           \tab \bold{$cpm}          \tab \emph{numeric matrix}   \tab PFAM ids          \tab samples        \tab covs. / 10^6 reads \cr
                    \tab                           \tab \bold{$tpm}          \tab \emph{numeric matrix}   \tab PFAM ids          \tab samples        \tab tpm                \cr
                    \tab                           \tab \bold{$copy_number}  \tab \emph{numeric matrix}   \tab PFAM ids          \tab samples        \tab avg. copies        \cr
\bold{$total_reads} \tab                           \tab                      \tab \emph{numeric vector}   \tab samples           \tab (n/a)          \tab total reads        \cr
\bold{$misc}        \tab \bold{$project_name}      \tab                      \tab \emph{character vector} \tab (empty)           \tab (n/a)          \tab project name       \cr
                    \tab \bold{$samples}           \tab                      \tab \emph{character vector} \tab (empty)           \tab (n/a)          \tab samples            \cr
                    \tab \bold{$tax_names_long}    \tab \bold{$superkingdom} \tab \emph{character vector} \tab short names       \tab (n/a)          \tab full names         \cr
                    \tab                           \tab \bold{$phylum}       \tab \emph{character vector} \tab short names       \tab (n/a)          \tab full names         \cr
                    \tab                           \tab \bold{$class}        \tab \emph{character vector} \tab short names       \tab (n/a)          \tab full names         \cr
                    \tab                           \tab \bold{$order}        \tab \emph{character vector} \tab short names       \tab (n/a)          \tab full names         \cr
                    \tab                           \tab \bold{$family}       \tab \emph{character vector} \tab short names       \tab (n/a)          \tab full names         \cr
                    \tab                           \tab \bold{$genus}        \tab \emph{character vector} \tab short names       \tab (n/a)          \tab full names         \cr
                    \tab                           \tab \bold{$species}      \tab \emph{character vector} \tab short names       \tab (n/a)          \tab full names         \cr
                    \tab \bold{$tax_names_short}   \tab                      \tab \emph{character vector} \tab full names        \tab (n/a)          \tab short names        \cr
                    \tab \bold{$KEGG_names}        \tab                      \tab \emph{character vector} \tab KEGG ids          \tab (n/a)          \tab KEGG names         \cr
                    \tab \bold{$KEGG_paths}        \tab                      \tab \emph{character vector} \tab KEGG ids          \tab (n/a)          \tab KEGG hiararchy     \cr
                    \tab \bold{$COG_names}         \tab                      \tab \emph{character vector} \tab COG ids           \tab (n/a)          \tab COG names          \cr
                    \tab \bold{$COG_paths}         \tab                      \tab \emph{character vector} \tab COG ids           \tab (n/a)          \tab COG hierarchy      \cr
                    \tab \bold{$ext_annot_sources} \tab                      \tab \emph{character vector} \tab COG ids           \tab (n/a)          \tab external databases \cr
}
If external databases for functional classification were provided to SqueezeMeta via the \code{-extdb} argument, the corresponding abundance (reads and bases), coverages, tpm and copy number profiles will be present in \code{SQM$functions} (e.g. results for the CAZy database would be present in \code{SQM$functions$CAZy}). Additionally, the extended names of the features present in the external database will be present in \code{SQM$misc} (e.g. \code{SQM$misc$CAZy_names}).
}
}

\examples{
\dontrun{
## (outside R)
## Run SqueezeMeta on the test data.
 /path/to/SqueezeMeta/scripts/SqueezeMeta.pl -p Hadza -f raw -m coassembly -s test.samples
## Now go into R.
library(SQMtools)
Hadza = loadSQM("Hadza") # Where Hadza is the path to the SqueezeMeta output directory.
}

data(Hadza) # We will illustrate the structure of the SQM object on the test data
# Which are the ten most abundant KEGG IDs in our data?
topKEGG = names(sort(rowSums(Hadza$functions$KEGG$tpm), decreasing=TRUE))[1:11]
topKEGG = topKEGG[topKEGG!="Unclassified"]
# Which functions do those KEGG IDs represent?
Hadza$misc$KEGG_names[topKEGG]
# What is the relative abundance of the Gammaproteobacteria class across samples?
Hadza$taxa$class$percent["Gammaproteobacteria",]
# Which information is stored in the orf, contig and bin tables?
colnames(Hadza$orfs$table)
colnames(Hadza$contigs$table)
colnames(Hadza$bins$table)
# What is the GC content distribution of my metagenome?
boxplot(Hadza$contigs$table[,"GC perc"]) # Not weighted by contig length or abundance!
}
