% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/core_functions.R
\name{create_features_df}
\alias{create_features_df}
\title{Create Data Frame of Features for Driver Gene Prioritization}
\usage{
create_features_df(
  annovar_csv_path,
  scna_df,
  phenolyzer_annotated_gene_list_path,
  batch_analysis = FALSE,
  prep_phenolyzer_input = FALSE,
  log2_ratio_threshold = 0.25,
  gene_overlap_threshold = 25,
  MCR_overlap_threshold = 25,
  hotspot_threshold = 5L,
  log2_hom_loss_threshold = -1,
  verbose = TRUE,
  na.string = "."
)
}
\arguments{
\item{annovar_csv_path}{path to 'ANNOVAR' csv output file}

\item{scna_df}{the SCNA segments data frame. Must contain: \describe{
\item{chr}{chromosome the segment is located in}
\item{start}{start position of the segment}
\item{end}{end position of the segment}
\item{log2ratio}{\ifelse{html}{\out{log<sub>2</sub>}}{\eqn{log_2}} ratio of the segment}
}}

\item{phenolyzer_annotated_gene_list_path}{path to 'phenolyzer'
"annotated_gene_list" file}

\item{batch_analysis}{boolean to indicate whether to perform batch analysis
(\code{TRUE}) or personalized analysis (\code{FALSE}, default). If \code{TRUE},
a column named 'tumor_id' should be present in both the ANNOVAR csv and the SCNA
table.}

\item{prep_phenolyzer_input}{boolean to indicate whether or not to create
a vector of genes for use as the input of 'phenolyzer' (default = \code{FALSE}).
If \code{TRUE}, the features data frame is not created and instead the vector
of gene symbols (union of all genes for which scores are available) is
returned.}

\item{log2_ratio_threshold}{the \ifelse{html}{\out{log<sub>2</sub>}}{\eqn{log_2}}
ratio threshold for keeping high-confidence SCNA events (default = 0.25)}

\item{gene_overlap_threshold}{the percentage threshold for the overlap between
a segment and a transcript (default = 25). This means that a transcript is
assigned the segment's SCNA event only if the segment overlaps the transcript
by more than this threshold.}

\item{MCR_overlap_threshold}{the percentage threshold for the overlap between
a gene and a minimal common region (MCR) (default = 25). This means that a gene
is assigned the SCNA density of the MCR only if the gene overlaps the MCR by
more than this threshold.}

\item{hotspot_threshold}{to determine hotspot genes, the (integer) threshold
for the minimum number of cases with a certain mutation in COSMIC (default = 5)}

\item{log2_hom_loss_threshold}{to determine double-hit events, the
\ifelse{html}{\out{log<sub>2</sub>}}{\eqn{log_2}} threshold for identifying
homozygous loss events (default = -1).}

\item{verbose}{boolean controlling verbosity (default = \code{TRUE})}

\item{na.string}{string that was used to indicate when a score is not available
during annotation with ANNOVAR (default = ".")}
}
\value{
If \code{prep_phenolyzer_input=FALSE} (default), a data frame of
features for prioritizing cancer driver genes (\code{gene_symbol} as
the first column and 26 other columns containing features). If
\code{prep_phenolyzer_input=TRUE}, the function returns a vector of gene symbols
(union of all gene symbols for which scores are available) to be used as the
input for performing 'phenolyzer' analysis.

The features data frame contains the following columns:
\describe{
\item{gene_symbol}{HGNC gene symbol}
\item{metaprediction_score}{the maximum metapredictor (coding) impact score for the gene}
\item{noncoding_score}{the maximum non-coding PHRED-scaled CADD score for the gene}
\item{scna_score}{SCNA proxy score. SCNA density (SCNA/Mb) of the minimal common region (MCR) in which the gene is located}
\item{hotspot_double_hit}{boolean indicating whether the gene is a hotspot gene (indication of oncogenes) or subject to double-hit (indication of tumor-suppressor genes)}
\item{phenolyzer_score}{'phenolyzer' score for the gene}
\item{hsa03320}{boolean indicating whether or not the gene takes part in this KEGG pathway}
\item{hsa04010}{boolean indicating whether or not the gene takes part in this KEGG pathway}
\item{hsa04020}{boolean indicating whether or not the gene takes part in this KEGG pathway}
\item{hsa04024}{boolean indicating whether or not the gene takes part in this KEGG pathway}
\item{hsa04060}{boolean indicating whether or not the gene takes part in this KEGG pathway}
\item{hsa04066}{boolean indicating whether or not the gene takes part in this KEGG pathway}
\item{hsa04110}{boolean indicating whether or not the gene takes part in this KEGG pathway}
\item{hsa04115}{boolean indicating whether or not the gene takes part in this KEGG pathway}
\item{hsa04150}{boolean indicating whether or not the gene takes part in this KEGG pathway}
\item{hsa04151}{boolean indicating whether or not the gene takes part in this KEGG pathway}
\item{hsa04210}{boolean indicating whether or not the gene takes part in this KEGG pathway}
\item{hsa04310}{boolean indicating whether or not the gene takes part in this KEGG pathway}
\item{hsa04330}{boolean indicating whether or not the gene takes part in this KEGG pathway}
\item{hsa04340}{boolean indicating whether or not the gene takes part in this KEGG pathway}
\item{hsa04350}{boolean indicating whether or not the gene takes part in this KEGG pathway}
\item{hsa04370}{boolean indicating whether or not the gene takes part in this KEGG pathway}
\item{hsa04510}{boolean indicating whether or not the gene takes part in this KEGG pathway}
\item{hsa04512}{boolean indicating whether or not the gene takes part in this KEGG pathway}
\item{hsa04520}{boolean indicating whether or not the gene takes part in this KEGG pathway}
\item{hsa04630}{boolean indicating whether or not the gene takes part in this KEGG pathway}
\item{hsa04915}{boolean indicating whether or not the gene takes part in this KEGG pathway}
}
}
\description{
Create Data Frame of Features for Driver Gene Prioritization
}
\examples{
\donttest{
path2annovar_csv <- system.file("extdata/example.hg19_multianno.csv",
                                package = "driveR")
path2phenolyzer_out <- system.file("extdata/example.annotated_gene_list",
                                   package = "driveR")
features_df <- create_features_df(annovar_csv_path = path2annovar_csv,
                                  scna_df = example_scna_table,
                                  phenolyzer_annotated_gene_list_path = path2phenolyzer_out)
}

}
\seealso{
\code{\link{prioritize_driver_genes}} for prioritizing cancer driver genes
}
