#######################################################################
# Note that this file can be run directly in R.
#######################################################################

#
# EXAMPLE SESSION FOR INFERRING GENETIC NETWORKS
# 

# for details see:
#
# Schaefer, J., and K. Strimmer. 2005a.
# An empirical Bayes approach to inferring large-scale gene
# association networks. Bioinformatics 21:754-764.
# 
# Schaefer, J., and K. Strimmer. 2005b. Learning large-scale
# graphical Gaussian models from genomic data.
# Proceedings of CNET 2004, Aveiro, Pt. (AIP)
# 
# Schaefer, J., and K. Strimmer. 2005c.  A shrinkage approach
# to large-scale covariance estimation and implications for
# functional genomics.  Submitted to SAGMB.
#



# load GeneTS library
library(GeneTS)

#######################################################################

# GET DATA:

# As an example we select 42 genes from the Caulobacter data set
data(caulobacter)

# test for periodicity and keep the genes flagged as significant
# by the FDR control at level Q = 0.05
pval.caulobacter <- fisher.g.test(caulobacter)
fdr.conservative <- fdr.control(pval.caulobacter, Q = 0.05)
data.matrix <- caulobacter[, fdr.conservative$significant]

# remove some unknown ORFs, identified by their column position among
# the selected genes
unknown.orfs <- c(7, 22, 27)  # 06901, 04476, 02688

# the positions above are only meaningful if they exist in the selection
stopifnot(max(unknown.orfs) <= ncol(data.matrix))

# size the mask from the data instead of hard-coding 45 columns, so that
# it can neither be silently recycled nor be longer than the matrix
keep <- rep(TRUE, ncol(data.matrix))
keep[unknown.orfs] <- FALSE
data.matrix <- data.matrix[, keep]


#######################################################################

# THE DATA:

# The normalized data must be available in time series format: a matrix
# with one *column* per gene and one *row* per individual measurement
# (time point).

# the example data lives in "data.matrix"
data.matrix

# dimensions: 11 time points (rows) by 42 genes (columns)
dim(data.matrix)

# number of nodes = number of columns (genes)
num.nodes <- ncol(data.matrix)

# node.labels: one display label per column of the data matrix, in order
# NOTE(review): "#04700" occurs twice (positions 10 and 34) and "#4480" has
# only four digits - confirm against the gene annotation.
node.labels <- c(
  "CheA", "CheR", "CheD", "ABC transporter", "hfaA", "#06446",
  "#02759", "peptidase (M23/M37)", "#03144", "#04700", "fljO 1", "fljK",
  "fljN", "#4480", "flbT", "LexA", "fljM 1", "fljO 2",
  "#08039", "#04977", "#02998 (5-repeat)", "#02058 (S-transferase)",
  "fljM 2", "#02730", "divK", "orfA", "#03649", "DnaA",
  "bacA", "#01232 (regulator)", "fljL", "#05886 (GGDEF)", "McpH", "#04700",
  "#01720", "neuB", "#02930", "#03170", "cheW", "#01459 (receptor)",
  "CtrA", "fliJ"
)


#######################################################################

# INFER GRAPHICAL GAUSSIAN MODEL:


###
### Step 1: Estimate partial correlation matrix
###

# There are many options for estimating the partial correlations;
# we recommend using a shrinkage estimator (Schaefer and Strimmer 2005c).

pcor.shrink <- ggm.estimate.pcor(data.matrix, method = "shrinkage")

# Other possibilities include the three estimators of partial correlation
# employed in Schaefer and Strimmer (2005a,b):

#pcor.pi1 <- ggm.estimate.pcor(data.matrix, method = "observed.pcor")
#pcor.pi2 <- ggm.estimate.pcor(data.matrix, method = "partial.bagged.cor", R=10000)
#pcor.pi3 <- ggm.estimate.pcor(data.matrix, method = "bagged.pcor")


# the estimate that the following steps operate on
inferred.pcor <- pcor.shrink


###
### Step 2: Assign p-values, q-values, empirical posterior probabilities to each edge
###

test.results <- ggm.test.edges(inferred.pcor, fA.type = "nonparametric")

# show first 20 edges (head() does not pad with NA rows if there are fewer)
head(test.results, 20)

# parameters of the mixture distribution used to compute p-values etc.
# (stored as "mixture.fit" rather than "c", so that the name of the base
# function c() is not reused for a data object)
mixture.fit <- cor.fit.mixture(sm2vec(inferred.pcor), fA.type = "nonparametric")
mixture.fit$eta0
mixture.fit$kappa



###
### Step 3: Decide which edges to include in the network
###


# Criterion 1: FDR cutoff Q = 0.05, i.e. edges with q-value <= 0.05
significant1.idx <- test.results$qval <= 0.05
num.significant.1 <- sum(significant1.idx)  # number of significant edges
test.results[significant1.idx, ]            # list significant edges


# Criterion 2: "local fdr" cutoff 0.2, i.e. edges with prob > 0.80
significant2.idx <- test.results$prob > 0.80
num.significant.2 <- sum(significant2.idx)  # number of significant edges
test.results[significant2.idx, ]            # list significant edges



#######################################################################

# PLOT GRAPHICAL GAUSSIAN MODEL:

# Note: this requires the "graph" and "Rgraphviz" packages from www.bioconductor.org

# generate graph object with all significant edges
# (those selected by the prob > 0.80 criterion)
gr <- ggm.make.graph(test.results[significant2.idx, ], num.nodes)
gr

# print vector of edge weights
show.edge.weights(gr)

# plot network
# (dev.new() opens the default graphics device of the current platform
# instead of insisting on the X11 device)
dev.new(width = 12, height = 9)
ggm.plot.graph(gr, node.labels, show.edge.labels = FALSE)

# with partial correlations as edge labels
dev.new(width = 12, height = 9)
ggm.plot.graph(gr, node.labels)


#######################################################################

# GENERATE RANDOM GRAPHICAL GAUSSIAN MODEL:

# generate random network with 20 nodes and 10 percent edges (=19 edges)
true.pcor2 <- ggm.simulate.pcor(20, 0.1)

# convert to edge list, keeping the first 19 rows
test.results2 <- ggm.test.edges(true.pcor2, eta0 = 0.9, kappa = 1000)[1:19, ]
test.results2

# plot network
# (dev.new() opens the default graphics device of the current platform
# instead of insisting on the X11 device)
gr2 <- ggm.make.graph(test.results2, 20)
gr2
dev.new(width = 8, height = 8)
ggm.plot.graph(gr2)


#######################################################################

# SIMULATE RANDOM GRAPHICAL GAUSSIAN MODEL, GENERATE DATA,
# AND RE-ESTIMATE PARTIAL CORRELATIONS:

# random network with 40 nodes and 5 percent edges
sim.pcor <- ggm.simulate.pcor(40, 0.05)

# data set with 40 observations simulated from that network
m.sim <- ggm.simulate.data(40, sim.pcor)

# simple estimate of the partial correlations ...
estimated.pcor <- partial.cor(m.sim)

# ... compared with the true model (sum of squared differences)
sum((sim.pcor - estimated.pcor)^2)

# a slightly better estimate ...
estimated.pcor.2 <- ggm.estimate.pcor(m.sim, method = "bagged.pcor")
sum((sim.pcor - estimated.pcor.2)^2)

# this is even better
estimated.pcor.3 <- ggm.estimate.pcor(m.sim, method = "shrinkage")
sum((sim.pcor - estimated.pcor.3)^2)

