% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/CheckGeno.R
\name{CheckGeno}
\alias{CheckGeno}
\title{Check Genotype Matrix}
\usage{
CheckGeno(
  GenoM,
  quiet = FALSE,
  Plot = FALSE,
  Return = "GenoM",
  Strict = TRUE,
  DumPrefix = c("F0", "M0")
)
}
\arguments{
\item{GenoM}{the genotype matrix.}

\item{quiet}{suppress messages.}

\item{Plot}{display the plots of \code{\link{SnpStats}}.}

\item{Return}{either 'GenoM' to return the cleaned-up genotype matrix, or
'excl' to return a list with excluded SNPs and individuals (see Value).}

\item{Strict}{Exclude any individuals genotyped for <5\% of SNPs, and any SNPs
genotyped for <5\% of individuals (TRUE); this was the unavoidable default
up to version 2.4.1. Otherwise (FALSE) the only exclusions are (very nearly)
monomorphic SNPs, SNPs scored for fewer than 2 individuals, and individuals
scored for fewer than 2 SNPs.}

\item{DumPrefix}{length 2 vector, to check if these don't occur among
genotyped individuals.}
}
\value{
If \code{Return='excl'} a list with, if any are found:
 \item{ExcludedSnps}{SNPs scored for <10\% of individuals; automatically
   excluded when running \code{\link{sequoia}}}
 \item{ExcludedSnps-mono}{monomorphic (fixed) SNPs; automatically excluded
   when running \code{\link{sequoia}}. This includes nearly-fixed SNPs with
   MAF \eqn{= 1/(2N)}. Column numbers are \emph{after} removal of
   \code{ExcludedSnps}, if any.}
 \item{ExcludedIndiv}{Individuals scored for <5\% of SNPs; these cannot be
   reliably included during pedigree reconstruction. Individual call rate is
   calculated after removal of 'Excluded SNPs'}
 \item{Snps-LowCallRate}{SNPs scored for 10\% -- 50\% of individuals; strongly
   recommended to be filtered out}
 \item{Indiv-LowCallRate}{individuals scored for <50\% of SNPs; strongly
   recommended to be filtered out}

When \code{Return='excl'} the return is \code{\link{invisible}}, i.e. a check
is run and warnings or errors are always displayed, but nothing may be
returned.
}
\description{
Check that the provided genotype matrix is in the correct
  format, and check for low call rate samples and SNPs.
}
\section{Thresholds}{
 Appropriate call rate thresholds for SNPs and
  individuals depend on the total number of SNPs, distribution of call rates,
  genotyping errors, and the proportion of candidate parents that are SNPd
  (sibship clustering is more prone to false positives). Note that filtering
  first on SNP call rate tends to keep more individuals in.
}

\examples{
# Simulate 400 SNPs for the example pedigree, with call rates drawn
# uniformly between 0.2 and 0.8 (400 values, matching nSnp), so that the
# checks below have something to report.
GenoM <- SimGeno(Ped_HSg5, nSnp=400, CallRate = runif(400, 0.2, 0.8))
# the quick way: get the cleaned-up genotype matrix back directly
GenoM.checked <- CheckGeno(GenoM, Return="GenoM")

# the user supervised way: get the list of exclusions and apply them by hand
Excl <- CheckGeno(GenoM, Return = "excl")
GenoM.orig <- GenoM   # make a 'backup' copy
# list elements only exist if any such SNPs or individuals were found,
# hence the check on names(Excl) before each step
if ("ExcludedSnps" \%in\% names(Excl))
  GenoM <- GenoM[, -Excl[["ExcludedSnps"]]]
# column numbers of the monomorphic SNPs are counted after the removal
# above, so this step must come second
if ("ExcludedSnps-mono" \%in\% names(Excl))
  GenoM <- GenoM[, -Excl[["ExcludedSnps-mono"]]]
# individuals are dropped by ID (row name) rather than by row number
if ("ExcludedIndiv" \%in\% names(Excl))
  GenoM <- GenoM[!rownames(GenoM) \%in\% Excl[["ExcludedIndiv"]], ]

# warning about SNPs scored for <50\% of individuals?
# note: this is not necessarily a problem, and sometimes unavoidable.
# call rate per SNP (column): entries equal to -9 are counted as not scored
SnpCallRate <- apply(GenoM, MARGIN=2,
                     FUN = function(x) sum(x!=-9)) / nrow(GenoM)
hist(SnpCallRate, breaks=50, col="grey")
# keep only SNPs with a call rate above 0.6
GenoM <- GenoM[, SnpCallRate > 0.6]

# to filter out low call rate individuals: (also not necessarily a problem)
# call rate per individual (row), calculated on the SNPs that remain
IndivCallRate <- apply(GenoM, MARGIN=1,
                       FUN = function(x) sum(x!=-9)) / ncol(GenoM)
hist(IndivCallRate, breaks=50, col="grey")
# IDs of individuals with a call rate above 0.8; GenoM is not subset here
GoodSamples <- rownames(GenoM)[ IndivCallRate > 0.8]

}
\seealso{
\code{\link{SnpStats}} to calculate SNP call rates;
  \code{\link{CalcOHLLR}} to count the number of SNPs scored in both focal
  individual and parent.
}
