#
#
# Sims4NewCar.R
#
# Andrew Brown
# 4 Feb 2016
#
# Here, I am carrying out further simulation studies to compare the "old" CAR
# model to the "generalized" CAR model on simulated gene pathway data. This
# is to address the comments from the referees of our submission to Statistica
# Sinica. The original simulations were done in SimPathways4NewCAR_Mod3Diff.R.
# I'm using that file for reference, including loading the image(s) that I 
# created from that simulation.
#
# As a general note, it's probably worth mentioning that "good" starting values
# make a big difference in the convergence of the chains.

rm(list= ls(all= TRUE)); gc()


################################################################################
################################################################################

# Simulated gene set data with gene pathways.
# Here I'm simualting data following that done in Efron and Tibshirani (2007)
# to examine the behavior using gene pathways.

expr <- vector(length= 1000)  # The vector of simulated expression values



# 10 subjects, 1000 genes. Genes 11-20, 111-130, ..., 411-430 define the gene 
# pathways. The second (111 - 130) and last (411 - 430) will be differentially
# expressed paths (with non-zero means), while the others will be null.
ctrl.means <- rep(0, times= 1000)
trt.means <- rep(0, times= 1000)
trt.means[111:130] <- 2.5
trt.means[411:430] <- -1.5

expr.ctrl <- matrix(nrow= 1000, ncol= 5)
expr.trt <- expr.ctrl

library(MASS)
expr.ctrl[] <- rnorm(5000)
expr.trt[] <- t(mvrnorm(5, mu= trt.means, 
                        Sigma= diag(rep(1, times= length(trt.means)))))


# This is the Nxn matrix of observed values
X <- cbind(expr.ctrl, expr.trt)


test.stat <- function(x, y) {
  # Return the t test statistic based on vectors x and y
  info = t.test(x, y, alternative= "two.sided", var.equal= TRUE)
  
  # Use inv. probit tranform to get z_i from t_i (as suggested by Efron, 2010)
  stat <- qnorm(pt(info$statistic, df= info$parameter))
  
  return(stat)
  #return(info$statistic)
  
}  # End function test.stat


z.stats <- vector(length= 1000)
for (i in 1:1000){
  
  z.stats[i] <- test.stat(X[i,1:5], X[i,6:10])
  
}  # End loop over i


################################################################################
################################################################################
################################################################################


# (Tentative) Plan: 
# 1. Look at gCAR model with d = 1 for the pathways, and also look at original 
# CAR model with discarding the isolated points. 
#
# 2. Look at ordinary CAR with all points forced into model via inappropriate
# nbhds. Also consider gCAR with inappropriate neighborhoods. This is where
# EDA, diagnostics, model seleciton can be presented.
#
# 3. Study gCAR by varying d and including \rho, looking at the estimated
# dependence in the data
#
#
#

################################################################################
################################################################################
################################################################################
################################################################################
################################################################################
################################################################################

# Look down around line 600 for where I permuted the genes so that the common
# pathways are not adjacent. Then I compared using the "right" and "wrong"
# neighborhood structures.

#### Move pathways around so that they are not adjacent, and consider the 
# pathway dependence vs. ar-type dependence using the full dataset


# 11 - 30, 111 - 130, 211-230, 311-330, 411-430

# Permute and assin labels to capture the dependence

# P2 and P5 are the active cases
lab.stats <- data.frame(stats= z.stats, Path= rep("I", times= length(z.stats)),
                        stringsAsFactors= FALSE)
lab.stats$Path[11:30] <- "P1"
lab.stats$Path[111:130] <- "P2"
lab.stats$Path[211:230] <- "P3"
lab.stats$Path[311:330] <- "P4"
lab.stats$Path[411:430] <- "P5"

p.lab.stats <- lab.stats[sample(1:length(lab.stats$stats)), ]  # PERMUTED 
# labeled statistics
p.z.stats <- p.lab.stats[, 1]  



############## 1a. gCAR using pathways as neighborhoods, d = 1 #################
rgt.adj <- matrix(nrow= length(z.stats), ncol= length(z.stats))  # the right
# adjacency structure
rgt.adj[] <- 0

rgt.adj[which(p.lab.stats$Path == "P1"), which(p.lab.stats$Path == "P1")] <- 1
rgt.adj[which(p.lab.stats$Path == "P2"), which(p.lab.stats$Path == "P2")] <- 1
rgt.adj[which(p.lab.stats$Path == "P3"), which(p.lab.stats$Path == "P3")] <- 1
rgt.adj[which(p.lab.stats$Path == "P4"), which(p.lab.stats$Path == "P4")] <- 1
rgt.adj[which(p.lab.stats$Path == "P5"), which(p.lab.stats$Path == "P5")] <- 1
diag(rgt.adj) <- 0  # Can't be a neighbor with yourself!



W <- rgt.adj
diff <- 1 


# The eigenvalue computation is only necessary if we include rho ~= 0 in the 
# model. D_w is needed, regardless, so we calculate it.
# Calculate the scaled adjacency matrix and find its eigenvalues. This involves
# calculating D_w^(-.5) and taking D_w^(-.5) %*% W %*% D_w^(-.5).
d.w <- W
d.w[] <- 0

num.nabors<- vector(length= dim(W)[1])
for (i in 1:dim(W)[1]){
  
  num.nabors[i] <- sum(W[i, ])
  
}
rm(i)

diag(d.w) <- num.nabors + diff  

# Scale the adjacency matrix. 
w.star <- W
for (i in 1:dim(w.star)[1]){
  
  for (j in 1:dim(w.star)[2]) {
    
    w.star[i,j] <- w.star[i,j]/(sqrt(num.nabors[i]+diff)*sqrt(num.nabors[j]+diff))
    
  }
  
}  # End loop


lambdas <- eigen(w.star, symmetric= TRUE, only.values= TRUE)$values

rho.low <- 1/min(lambdas)
rho.high <- 1/max(lambdas)


# Choose initial values and run the chains in parallel to check convergence
library(doSNOW)

# Open a cluster of cores for parallel processing
machines<- rep("localhost", times= 3)
cl<- makeCluster(machines, type= "SOCK")
registerDoSNOW(cl)

# Store the initial values in 3-vectors
init.s2<- c(1.25, 1, 0.25)
init.mu<- list(i.mu1= rnorm(length(p.z.stats), sd= 2), 
               i.mu2= rep(1.5, times= length(p.z.stats)), 
               i.mu3= rnorm(length(p.z.stats)))
init.p<- c(.8, .5, .6)
#init.om <- c(7, 1, 6); init.psi <- c(3, 1, 4)
init.xi<- c(0.5, 0.4, .75)  
init.rho<- c(.99*rho.high, .91, .93*rho.high)  


source("Model3GibbsSampler.R")


# After permuting and assigning nbhd according to adjacency, I called the
# post. draws with the "correct" neighborhood structure car.rgtAdj.d1LH
car.rgtAdj.d1LH <- foreach(chain= 1:3, .inorder= FALSE) %dopar% {
  
  
  # Make sure the diff argument agrees with your calculations above!
  #   post.draws<- Model3GS.LH(W= rgt.adj, y= p.z.stats, p.hyper= 50, burn.in= 5000,
  #                            n.sample= 2000,
  #                            mu.init= init.mu[[chain]], p.init= init.p[chain],
  #                            sig2.init= init.s2[chain], xi.init= init.xi[chain],
  #                            rho.init= init.rho[chain], rho.low= rho.low,
  #                            rho.high= rho.high, d.w= d.w, diff= 1)
  
  
  # Something about the LH sampler was messing up how the independent (isolated)
  # cases are being estiamted. It's odd...
  
  post.draws<- Model3GS(W= rgt.adj, y= p.z.stats, p.hyper= 50, burn.in= 5000,
                        mu.init= init.mu[[chain]], p.init= init.p[chain],
                        sig2.init= init.s2[chain], xi.init= init.xi[chain],
                        rho.init= init.rho[chain], diff= 1)
  
  names(post.draws)<- paste(names(post.draws), chain, sep= ".chain")
  
  return(post.draws)
  
}  # End parallel for loop
stopCluster(cl)



# Trace plots
plot(car.rgtAdj.d1LH[[1]]$sig2, type= "l", lwd= 2)
lines(car.rgtAdj.d1LH[[2]]$sig2, col= "red", lty= 2, lwd= 2)
lines(car.rgtAdj.d1LH[[3]]$sig2, col= "blue", lty= 3, lwd= 2)

plot(car.rgtAdj.d1LH[[1]]$sig2*car.rgtAdj.d1LH[[1]]$xi, type= "l", lwd= 2)
lines(car.rgtAdj.d1LH[[2]]$sig2*car.rgtAdj.d1LH[[2]]$xi, col= "red", lty= 2, lwd= 2)
lines(car.rgtAdj.d1LH[[3]]$sig2*car.rgtAdj.d1LH[[3]]$xi, col= "blue", lty= 3, lwd= 2)

plot(car.rgtAdj.d1LH[[1]]$p, type= "l", lwd= 2)
lines(car.rgtAdj.d1LH[[2]]$p, col= "red", lty= 2, lwd= 2)
lines(car.rgtAdj.d1LH[[3]]$p, col= "blue", lty= 3, lwd= 2)


# Combine the chains
rgtAdj.d1LH.draws <- list(sig2= c(car.rgtAdj.d1LH[[1]]$sig2.chain1, 
                                  car.rgtAdj.d1LH[[2]]$sig2.chain2, 
                                  car.rgtAdj.d1LH[[3]]$sig2.chain3), 
                          tau2= c(car.rgtAdj.d1LH[[1]]$xi.chain1*car.rgtAdj.d1LH[[1]]$sig2, 
                                  car.rgtAdj.d1LH[[1]]$xi.chain1*car.rgtAdj.d1LH[[2]]$sig2, 
                                  car.rgtAdj.d1LH[[1]]$xi.chain1*car.rgtAdj.d1LH[[3]]$sig2), 
                          gam= rbind(car.rgtAdj.d1LH[[1]]$gam.chain1, 
                                     car.rgtAdj.d1LH[[2]]$gam.chain2, 
                                     car.rgtAdj.d1LH[[3]]$gam.chain3), 
                          p= c(car.rgtAdj.d1LH[[1]]$p.chain1, 
                               car.rgtAdj.d1LH[[2]]$p.chain2, 
                               car.rgtAdj.d1LH[[3]]$p.chain3),
                          rho= c(car.rgtAdj.d1LH[[1]]$rho.chain1, 
                                 car.rgtAdj.d1LH[[2]]$rho.chain2, 
                                 car.rgtAdj.d1LH[[3]]$rho.chain3), 
                          mu= rbind(car.rgtAdj.d1LH[[1]]$mu.chain1, 
                                    car.rgtAdj.d1LH[[2]]$mu.chain2, 
                                    car.rgtAdj.d1LH[[3]]$mu.chain3))


save.image("RevSims4NewCar_WSpace_020416.Rdata")  # <- 2/4/16 is when I started 
# this






########### 1b: CAR with isolated points discarded #############################


# Eliminate isolated points
z.red <- p.lab.stats[p.lab.stats$Path != "I", ]

colnames(rgt.adj) <- p.lab.stats$Path
rownames(rgt.adj) <- p.lab.stats$Path

red.adj <- rgt.adj[which(p.lab.stats$Path != "I"), 
                   which(p.lab.stats$Path != "I")]  # This is the reduced 
# adjacency matrix


W <- red.adj
diff <- 0  # With the "everybody has a neighbor" structure, we take d = 0


# The eigenvalue computation is only necessary if we include rho ~= 0 in the 
# model. D_w is needed, regardless, so we calculate it.
# Calculate the scaled adjacency matrix and find its eigenvalues. This involves
# calculating D_w^(-.5) and taking D_w^(-.5) %*% W %*% D_w^(-.5).
d.w <- W
d.w[] <- 0

num.nabors<- vector(length= dim(W)[1])
for (i in 1:dim(W)[1]){
  
  num.nabors[i] <- sum(W[i, ])
  
}
rm(i)

diag(d.w) <- num.nabors + diff  

# Scale the adjacency matrix.
w.star <- W
for (i in 1:dim(w.star)[1]){
  
  for (j in 1:dim(w.star)[2]) {
    
    w.star[i,j] <- w.star[i,j]/(sqrt(num.nabors[i]+diff)*sqrt(num.nabors[j]+diff))
    
  }
  
}  

lambdas <- eigen(w.star, symmetric= TRUE, only.values= TRUE)$values

rho.low <- 1/min(lambdas)
rho.high <- 1/max(lambdas)


# Choose initial values and run the chains in parallel to check convergence
library(doSNOW)

# Open a cluster of cores for parallel processing
machines<- rep("localhost", times= 3)
cl<- makeCluster(machines, type= "SOCK")
registerDoSNOW(cl)

# Store the initial values in 3-vectors
init.s2<- c(var(z.red[, 1]), 1, 1)  # <- The initial variance estimates seem
# to make a huge difference in terms of the convergence of the chain. 
init.mu<- list(i.mu1= z.red[, 1], 
               i.mu2= rnorm(length(z.red[, 1]), sd= 0.25), 
               i.mu3= rnorm(length(z.red[, 1]), sd= 0.05))
init.p<- c(.1, .15, .12)
#init.om <- c(7, 1, 6); init.psi <- c(3, 1, 4)
init.xi<- c(10, 20, 20)  
init.rho<- c(.95*rho.high, .99*rho.high, .97*rho.high)  


source("Model3GibbsSampler.R")

# After permuting and assigning nbhd according to adjacency, I called the
# post. draws with the "correct" neighborhood structure car.rgtAdj.d1LH
car.redAdj.d0 <- foreach(chain= 1:3, .inorder= FALSE) %dopar% {
  
  
  # Make sure the diff argument agrees with your calculations above!
  #
  # Oddly, the "original" Gibbs sampler seems to work better than the LH 
  # step when using the smaller adjacency matrix. Is this due to sparsity
  # and the lack of it?  Hmmm....
    post.draws<- Model3GS(W= red.adj, y= z.red[ ,1], p.hyper= 50, burn.in= 5000,
                             mu.init= init.mu[[chain]], p.init= init.p[chain],
                             sig2.init= init.s2[chain], xi.init= init.xi[chain],
                             rho.init= init.rho[chain], diff= 0)
  
  names(post.draws)<- paste(names(post.draws), chain, sep= ".chain")
  
  return(post.draws)
  
}  # End parallel for loop
stopCluster(cl)

# Create the tau2 draws since Model3GS returns only xi and sig2
# for (i in 1:3){
#   
car.redAdj.d0[[i]]$tau2 <- car.redAdj.d0[[i]]$sig2*
  car.redAdj.d0[[i]]$xi
#   
# }


# Trace plots
plot(car.redAdj.d0[[1]]$sig2, type= "l", lwd= 2)
lines(car.redAdj.d0[[2]]$sig2, col= "red", lty= 2, lwd= 2)
lines(car.redAdj.d0[[3]]$sig2, col= "blue", lty= 3, lwd= 2)

plot(car.redAdj.d0[[1]]$tau2, type= "l", lwd= 2)
lines(car.redAdj.d0[[2]]$tau2, col= "red", lty= 2, lwd= 2)
lines(car.redAdj.d0[[3]]$tau2, col= "blue", lty= 3, lwd= 2)

plot(car.redAdj.d0[[1]]$p, type= "l", lwd= 2)
lines(car.redAdj.d0[[2]]$p, col= "red", lty= 2, lwd= 2)
lines(car.redAdj.d0[[3]]$p, col= "blue", lty= 3, lwd= 2)


# Combine the chains
redAdj.d0.draws <- list(sig2= c(car.redAdj.d0[[1]]$sig2, 
                                car.redAdj.d0[[2]]$sig2, 
                                car.redAdj.d0[[3]]$sig2), 
                        xi= c(car.redAdj.d0[[1]]$tau2, 
                              car.redAdj.d0[[2]]$tau2, 
                              car.redAdj.d0[[3]]$tau2), 
                        gam= rbind(car.redAdj.d0[[1]]$gam, 
                                   car.redAdj.d0[[2]]$gam, 
                                   car.redAdj.d0[[3]]$gam), 
                        p= c(car.redAdj.d0[[1]]$p, 
                             car.redAdj.d0[[2]]$p, 
                             car.redAdj.d0[[3]]$p),
                        rho= c(car.redAdj.d0[[1]]$rho, 
                               car.redAdj.d0[[2]]$rho, 
                               car.redAdj.d0[[3]]$rho), 
                        mu= rbind(car.redAdj.d0[[1]]$mu, 
                                  car.redAdj.d0[[2]]$mu, 
                                  car.redAdj.d0[[3]]$mu))



#save.image("RevSims4NewCar_WSpace_020416.Rdata")  

########### Compare 1a and 1b via densities, error rates, ROC, etc. ############

# Densities
hist(rgtAdj.d1LH.draws$p, breaks= 75, freq= FALSE)
hist(rgtAdj.d1LH.draws$sig2, breaks= 75, freq= FALSE)
hist(rgtAdj.d1LH.draws$tau2, breaks= 75, freq= FALSE)  # xi = tau2 (misnamed)
hist(rgtAdj.d1LH.draws$rho, breaks= 75, freq= FALSE)


hist(redAdj.d0.draws$p, breaks= 75, freq= FALSE)
hist(redAdj.d0.draws$sig2, breaks= 75, freq= FALSE)
hist(redAdj.d0.draws$xi, breaks= 75, freq= FALSE)  # xi = tau2 (misnamed)
hist(redAdj.d0.draws$rho, breaks= 75, freq= FALSE)


# Superimpose the estimated densities from with and without isolated cases
dev.new(width= 15, height= 3)
par(mfrow= c(1,3))
plot(density(rgtAdj.d1LH.draws$p), type= "l", lwd= 2, xlim= c(0,1), xlab= "p", 
     ylab= "", main= "", yaxt="n", ylim= c(0,12))
mtext(expression(paste(pi, "(p|y)")), side= 2, line= 1)
lines(density(redAdj.d0.draws$p), lwd= 3, lty= 2)
legend(x= 0.0, y= 12, bty= "n", lty= c(1,2), lwd= 2, legend= c("All Cases", 
                                                               "Cases Excluded"), cex= 1.2)


plot(density(rgtAdj.d1LH.draws$sig2), type= "l", lwd= 2, xlab= expression(sigma^2), 
     xlim= c(0,2), main= "", yaxt= "n", ylab= "", ylim= c(0,10))
mtext(expression(paste(pi, "(", sigma^2, "|y)")), side= 2, line= 1)
lines(density(redAdj.d0.draws$sig2), lwd= 3, lty= 2)
legend(x= 0.0, y= 10, bty= "n", lty= c(1,2), lwd= 2, legend= c("All Cases", 
                                                               "Cases Excluded"), cex= 1.2)


plot(density(rgtAdj.d1LH.draws$tau2), type= "l", lwd= 2, xlab= expression(tau^2), 
     xlim= c(0,8), ylab= "", main= "", yaxt= "n", ylim= c(0,4))
mtext(expression(paste(pi, "(", tau^2, "|y)")), side= 2, line= 1)
lines(density(redAdj.d0.draws$xi), lwd= 3, lty= 2)
legend(x= 0, y= 4, bty= "n", lty= c(1,2), lwd= 2, legend= c("All Cases", 
                                                            "Cases Excluded"), cex= 1.2)



######### Incl. probs with isolated points removed
ps.red.d0<- matrix(nrow= 2000, ncol= length(z.red[ ,1]))
for (samp in 1:2000) {
  
  for (indx in 1:length(z.red[ ,1])){
    
    num <- (1-redAdj.d0.draws$p[samp])*exp((-1/redAdj.d0.draws$sig2[samp])*
                                             (z.red[indx,1] - redAdj.d0.draws$mu[samp,
                                                                                 indx])^2)
    
    denom <- num + (redAdj.d0.draws$p[samp])*exp((-1/redAdj.d0.draws$sig2[samp])*
                                                   z.red[indx,1]^2)
    
    ps.red.d0[samp, indx]<-  num/denom
    
  }  # End for loop on indx
  rm(num)
  
}  # End for loop on samp



####### Incl. probs with the gCAR (d=1)
ps.rgt.d1<- matrix(nrow= 2000, ncol= length(p.z.stats))
for (samp in 1:2000) {
  
  for (indx in 1:length(p.z.stats)){
    
    num <- (1-rgtAdj.d1LH.draws$p[samp])*exp((-1/rgtAdj.d1LH.draws$sig2[samp])*
                                               (p.z.stats[indx] - rgtAdj.d1LH.draws$mu[samp,
                                                                                       indx])^2)
    
    denom <- num + (rgtAdj.d1LH.draws$p[samp])*exp((-1/rgtAdj.d1LH.draws$sig2[samp])*
                                                     p.z.stats[indx]^2)
    
    ps.rgt.d1[samp, indx]<-  num/denom
    
  }  # End for loop on indx
  rm(num)
  
}  # End for loop on samp




# Look @ error rates for both
gam.probs.rgt.d1LH <- apply(ps.rgt.d1, 2, mean)
id.pts.rgtAdj.d1LH <- (gam.probs.rgt.d1LH > .95)*1


#  # 11 - 30, 111 - 130, 211-230, 311-330, 411-430
active.p.z.stats <- rep(0, times= length(p.z.stats))
#active[c(21:40, 81:100)] <- 1  # Indices change after
# filtering out
active.p.z.stats[c(which(p.lab.stats$Path == "P2"), 
                   which(p.lab.stats$Path == "P5"))] <- 1

sum(id.pts.rgtAdj.d1LH*(1-active.p.z.stats))/sum(id.pts.rgtAdj.d1LH)  # true fdr
sum((1-id.pts.rgtAdj.d1LH)*active.p.z.stats)/sum(1-id.pts.rgtAdj.d1LH)  # fnr
(sum(id.pts.rgtAdj.d1LH*(1-active.p.z.stats)) + 
    sum((1-id.pts.rgtAdj.d1LH)*active.p.z.stats))/1000 # Error rate




# CAR with isolated removed

gam.probs.wrg.d0 <- apply(ps.red.d0, 2, mean)
id.pts.wrg.d0 <- (gam.probs.wrg.d0 > .95)*1


#  # 11 - 30, 111 - 130, 211-230, 311-330, 411-430
active.p.z.stats <- rep(0, times= length(z.red[, 1]))
#active[c(21:40, 81:100)] <- 1  # Indices change after
# filtering out
active.p.z.stats[c(which(z.red$Path == "P2"), 
                   which(z.red$Path == "P5"))] <- 1

sum(id.pts.wrg.d0*(1-active.p.z.stats))/sum(id.pts.wrg.d0)  # true fdr
sum((1-id.pts.wrg.d0)*active.p.z.stats)/sum(1-id.pts.wrg.d0)  # fnr
(sum(id.pts.wrg.d0*(1-active.p.z.stats)) + 
    sum((1-id.pts.wrg.d0)*active.p.z.stats))/length(z.red[, 1]) # Error rate



############ plot incl. probs vs. test stats for both models

##### p_j vs. y_j, all cases
old.par <- par()
par(mfrow= c(1,2))
plot(jitter(p.z.stats[p.lab.stats$Path == "I"], fa= 50), 
     jitter(gam.probs.rgt.d1LH[p.lab.stats$Path == "I"], fa= 50), pch= 20, 
     cex= 0.25, ylim= c(0,1), xlim= c(-5, 5), xlab= expression('y'[j]), 
     ylab= expression('p'[j]))
points(jitter(p.z.stats[p.lab.stats$Path == "P1"], fa= 50), 
       jitter(gam.probs.rgt.d1LH[p.lab.stats$Path == "P1"], fa= 50), 
       pch= 5, cex= 0.75)
points(jitter(p.z.stats[p.lab.stats$Path == "P2"], fa= 50), 
       jitter(gam.probs.rgt.d1LH[p.lab.stats$Path == "P2"], fa= 50), 
       pch= 10, cex= 0.75)
points(jitter(p.z.stats[p.lab.stats$Path == "P3"], fa= 50), 
       jitter(gam.probs.rgt.d1LH[p.lab.stats$Path == "P3"], fa= 50), 
       pch= 15, cex= 0.75)
points(jitter(p.z.stats[p.lab.stats$Path == "P4"], fa= 50), 
       jitter(gam.probs.rgt.d1LH[p.lab.stats$Path == "P4"], fa= 50), 
       pch= 17, cex= 0.75)
points(jitter(p.z.stats[p.lab.stats$Path == "P5"], fa= 50), 
       jitter(gam.probs.rgt.d1LH[p.lab.stats$Path == "P5"], fa= 50), 
       pch= 25, cex= 0.75)
abline(h= 0.95, lwd= 2, lty= 2)
legend(x= -4, y= 0.3, pch= c(20, 5, 10, 15, 17,25), 
       legend= c("Isolated points", "P1", "P2 (active)", "P3", "P4", "P5 (active)"),
       cex= 0.75, pt.cex= 0.75, bty= "n")


##### p_j vs. y_j, cases removed
plot(jitter(z.red[z.red$Path == "P1", 1], fa= 50), 
     jitter(gam.probs.wrg.d0[z.red$Path == "P1"], fa= 50), pch= 5, 
     cex= 0.75, ylim= c(0,1), xlim= c(-4, 4), xlab= expression('y'[j]), 
     ylab= expression('p'[j]))
points(jitter(z.red[z.red$Path == "P2", 1], fa= 50), 
       jitter(gam.probs.wrg.d0[z.red$Path == "P2"], fa= 50), 
       pch= 10, cex= 0.75)
points(jitter(z.red[z.red$Path == "P3", 1], fa= 50), 
       jitter(gam.probs.wrg.d0[z.red$Path == "P3"], fa= 50), 
       pch= 15, cex= 0.75)
points(jitter(z.red[z.red$Path == "P4", 1], fa= 50), 
       jitter(gam.probs.wrg.d0[z.red$Path == "P4"], fa= 50), 
       pch= 17, cex= 0.75)
points(jitter(z.red[z.red$Path == "P5", 1], fa= 50), 
       jitter(gam.probs.wrg.d0[z.red$Path == "P5"], fa= 50), 
       pch= 25, cex= 0.75)
abline(h= 0.95, lwd= 2, lty= 2)
legend(x= -4, y= 0.3, pch= c(5, 10, 15, 17,25), 
       legend= c("P1", "P2 (active)", "P3", "P4", "P5 (active)"),
       cex= 0.75, pt.cex= 0.75, bty= "n")
