% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/do_cumulative_htrx.R
\name{do_cumulative_htrx}
\alias{do_cumulative_htrx}
\alias{do_cumulative_htrx_step1}
\alias{extend_haps}
\alias{make_cumulative_htrx}
\title{Cumulative HTRX on long haplotypes}
\usage{
do_cumulative_htrx(
  data_nosnp,
  hap1,
  hap2 = hap1,
  train_proportion = 0.5,
  sim_times = 10,
  featurecap = 40,
  usebinary = 1,
  randomorder = TRUE,
  fixorder = NULL,
  method = "simple",
  criteria = "BIC",
  gain = TRUE,
  runparallel = FALSE,
  mc.cores = 6,
  rareremove = FALSE,
  rare_threshold = 0.001,
  dataseed = 1:sim_times,
  tenfoldseed = 123,
  returnall = FALSE,
  max_int = NULL,
  verbose = FALSE
)

do_cumulative_htrx_step1(
  data_nosnp,
  hap1,
  hap2 = hap1,
  train_proportion = 0.5,
  featurecap = 40,
  usebinary = 1,
  randomorder = TRUE,
  fixorder = NULL,
  method = "simple",
  criteria = "BIC",
  splitseed = 123,
  gain = TRUE,
  runparallel = FALSE,
  mc.cores = 6,
  rareremove = FALSE,
  rare_threshold = 0.001,
  max_int = NULL,
  verbose = FALSE
)

extend_haps(
  data_nosnp,
  featuredata,
  train,
  featurecap = dim(featuredata)[2],
  usebinary = 1,
  gain = TRUE,
  runparallel = FALSE,
  mc.cores = 6,
  verbose = FALSE
)

make_cumulative_htrx(
  hap1,
  hap2 = hap1,
  featurename,
  rareremove = FALSE,
  rare_threshold = 0.001,
  max_int = NULL
)
}
\arguments{
\item{data_nosnp}{a data frame with outcome (the outcome must be the first column)
and fixed covariates (for example, sex, age and the first 18 PCs)
and without SNPs or haplotypes.}

\item{hap1}{a data frame of the SNPs' genotype of the first genome. The genotype of a SNP for each individual is either 0 (reference allele) or 1 (alternative allele).}

\item{hap2}{a data frame of the SNPs' genotype of the second genome.
The genotype of a SNP for each individual is either 0 (reference allele) or 1 (alternative allele).
By default, hap2=hap1, representing haploid data.}

\item{train_proportion}{a positive number between 0 and 1 giving
the proportion of the training dataset when splitting data into 2 folds.
By default, train_proportion=0.5.}

\item{sim_times}{an integer giving the number of simulations in step 1 (see details).
By default, sim_times=10.}

\item{featurecap}{a positive integer which manually sets the maximum number of independent features.
By default, featurecap=40.}

\item{usebinary}{a non-negative number representing different models.
Use linear model if usebinary=0,
use logistic regression model via fastglm if usebinary=1 (by default),
and use logistic regression model via glm if usebinary>1.}

\item{randomorder}{logical. If randomorder=TRUE (default),
use random order of all the SNPs to add SNPs in cumulative HTRX.}

\item{fixorder}{a vector of the fixed order of SNPs to be added in cumulative HTRX.
This only works by setting randomorder=FALSE. Otherwise, fixorder=NULL (default).
The length of "fixorder" can be smaller than the total number of SNPs,
i.e. users can specify the order of some instead of all of the SNPs.}

\item{method}{the method used for data splitting, either "simple" (default) or "stratified".}

\item{criteria}{the information criteria for model selection, either "BIC" (default) or "AIC".}

\item{gain}{logical. If gain=TRUE (default), report the variance explained in addition to fixed covariates;
otherwise, report the total variance explained by all the variables.}

\item{runparallel}{logical. Use parallel programming based on "mclapply" function or not.
Note that for Windows users, "mclapply" doesn't work, so please set runparallel=FALSE (default).}

\item{mc.cores}{an integer giving the number of cores used for parallel programming.
By default, mc.cores=6.
This only works when runparallel=TRUE.}

\item{rareremove}{logical. Remove rare SNPs and haplotypes or not. By default, rareremove=FALSE.}

\item{rare_threshold}{a numeric threshold below which a haplotype or SNP is removed.
This only works when rareremove=TRUE. By default, rare_threshold=0.001.}

\item{dataseed}{a vector of seeds, one for each simulation in step 1 (see details).
The length of dataseed must be the same as sim_times.
By default, dataseed=1:sim_times.}

\item{tenfoldseed}{a positive integer specifying the seed used to
split data for 10-fold cross validation. By default, tenfoldseed=123.}

\item{returnall}{logical. If returnall=TRUE, return all the candidate models and
the variance explained in each of the 10 test sets for these candidate models.
If returnall=FALSE (default), only return the best candidate model
and the variance explained in each of the 10 test sets by this model.}

\item{max_int}{a positive integer which specifies the maximum number of SNPs that can interact.
If no value is given, interactions between all the SNPs will be considered.}

\item{verbose}{logical. If verbose=TRUE, print out the inference steps. By default, verbose=FALSE.}

\item{splitseed}{a positive integer giving the seed that a single simulation in step 1 (see details) uses.}

\item{featuredata}{a data frame of the feature data, e.g. haplotype data created by HTRX or SNPs.
These features exclude all the data in data_nosnp, and will be selected using 2-step cross-validation.}

\item{train}{a vector of the indexes of the training data.}

\item{featurename}{a character vector giving the names of features (haplotypes).}
}
\value{
\code{\link{do_cumulative_htrx}} returns a list containing the best model selected,
 and the out-of-sample variance explained in each test set.
 If returnall=TRUE, this function also returns all the candidate models,
 and the out-of-sample variance explained in each test set by each candidate model.

\code{\link{do_cumulative_htrx_step1}} returns a list of three candidate models selected by a single simulation.

\code{\link{extend_haps}} returns a character vector of the names of the selected features.

\code{\link{make_cumulative_htrx}} returns a data frame of the haplotype matrix.
}
\description{
Two-step cross-validation used to select the best HTRX model for longer haplotypes,
i.e. haplotypes that include at least 7 single nucleotide polymorphisms (SNPs).
}
\details{
Longer haplotypes are important for discovering interactions.
However, there are \ifelse{html}{\out{3<sup>k</sup>}}{\eqn{3^k}}-1 haplotypes in HTRX
if the region contains k SNPs,
making HTRX (\code{\link{do_cv}}) unrealistic to apply to regions with large numbers of SNPs.
To address this issue, we proposed "cumulative HTRX" (\code{\link{do_cumulative_htrx}})
that enables HTRX to run on longer haplotypes,
 i.e. haplotypes which include at least 7 SNPs (the threshold we recommend).
 There are 2 steps to implement cumulative HTRX.

Step 1: extend haplotypes and select candidate models.

(1) Randomly sample a subset (50\%) of data,
using stratified sampling when the outcome is binary.
This subset is used for all the analysis in (2) and (3);

(2) Start with L randomly chosen SNPs from the entire k SNPs,
and keep the top M haplotypes that are chosen from the forward regression.
Then add another SNP to the M haplotypes to create 3M+2 haplotypes.
There are 3M haplotypes obtained by adding "0", "1" or "X" to the previous M haplotypes,
as well as 2 bases of the added SNP, i.e. "XX...X0" and "XX...X1"
(as "X" was implicitly used in the previous step).
The top M haplotypes from them are then selected using forward regression.
Repeat this process until obtaining M haplotypes which include k-1 SNPs;

(3) Add the last SNP to create 3M+2 haplotypes.
Afterwards, start from a model with fixed covariates (e.g. 18 PCs, sex and age),
perform forward regression on the training set,
and select s models with the lowest BIC to enter the candidate model pool;

(4) Repeat (1)-(3) B times, and select all the different models
in the candidate model pool as the candidate models.

Step 2: select the best model using 10-fold cross-validation.

(1) Randomly split the whole data into 10 groups with approximately equal sizes,
using stratified sampling when the outcome is binary;

(2) In each of the 10 folds, use a different group as the test dataset,
and take the remaining groups as the training dataset.
Then, fit all the candidate models on the training dataset,
and use these fitted models to compute the additional variance explained by features
(out-of-sample \ifelse{html}{\out{R<sup>2</sup>}}{\eqn{R^2}}) in the test dataset.
Finally, select the candidate model with the biggest
average out-of-sample \ifelse{html}{\out{R<sup>2</sup>}}{\eqn{R^2}} as the best model.

Function \code{\link{do_cumulative_htrx_step1}} implements Step 1 (1)-(3) described above.
Function \code{\link{extend_haps}} is used to select haplotypes in Step 1 (2) described above.
Function \code{\link{make_cumulative_htrx}} is used to generate the haplotype data
(by adding a new SNP into the haplotypes) from M haplotypes to 3M+2 haplotypes,
which is also described in Step 1 (2)-(3).
}
\examples{
## use dataset "example_hap1", "example_hap2" and "example_data_nosnp"
## "example_hap1" and "example_hap2" are
## both genomes of 8 SNPs for 5,000 individuals (diploid data)
## "example_data_nosnp" is a simulated dataset
## which contains the outcome (binary), sex, age and 18 PCs

## visualise the covariates data
## we will use only the first two covariates: sex and age in the example
head(HTRX::example_data_nosnp)

## visualise the genotype data for the first genome
head(HTRX::example_hap1)

## we perform cumulative HTRX on all the 8 SNPs using 2-step cross-validation
## to compute additional variance explained by haplotypes
## If the data is haploid, please set hap2=HTRX::example_hap1
## If you want to compute total variance explained, please set gain=FALSE
## For Linux/macOS users, we recommend setting runparallel=TRUE
\donttest{
cumu_htrx_results <- do_cumulative_htrx(HTRX::example_data_nosnp[1:500,1:3],
                                        HTRX::example_hap1[1:500,],
                                        HTRX::example_hap2[1:500,],
                                        train_proportion=0.5,sim_times=5,
                                        featurecap=10,usebinary=1,
                                        randomorder=TRUE,method="stratified",
                                        criteria="BIC",gain=TRUE,
                                        runparallel=FALSE,verbose=TRUE)
}
#This result would be more precise when setting larger sim_times and featurecap
}
\references{
Barrie W, Yang Y, Attfield K E, et al. Genetic risk for Multiple Sclerosis originated in Pastoralist Steppe populations. bioRxiv (2022).

Efron, B. Bootstrap Methods: Another Look at the Jackknife. Ann. Stat. 7, 1-26 (1979).

Kass, R. E. & Wasserman, L. A Reference Bayesian Test for Nested Hypotheses and its Relationship to the Schwarz Criterion. J. Am. Stat. Assoc. 90, 928-934 (1995).
}
