---
title: "ContinuousData"
author: "Zhiwen Tan"
output: 
  rmarkdown::html_vignette:
    toc: true
    number_sections: false
vignette: >
  %\VignetteIndexEntry{ContinuousData}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

## Introduction

`BCClong` is an R package for performing Bayesian Consensus Clustering (BCC) model for clustering continuous, discrete and categorical longitudinal data, which are commonly seen in many clinical studies. This document gives a tour of BCClong package.

see `help(package = "BCClong")` for more information and references provided by `citation("BCClong")`

To download **BCClong**, use the following commands:

``` r
require("devtools")
devtools::install_github("ZhiwenT/BCClong", build_vignettes = TRUE)
library("BCClong")
```
To list all functions available in this package:

```r
ls("package:BCClong")
```

## Components

Currently, there are 5 function in this package which are __*BCC.multi*__, 
__*BayesT*__, __*model.selection.criteria*__, __*traceplot*__, __*trajplot*__.

__*BCC.multi*__ function performs clustering on mixed-type (continuous, discrete and
categorical) longitudinal markers using Bayesian consensus clustering method
with MCMC sampling and provide a summary statistics for the computed model. This function will take in a data set and multiple parameters and output a BCC model with summary statistics. 

__*BayesT*__ function assess the model goodness of fit by calculate the
discrepancy measure T(\bm{\y}, \bm{\Theta}) with following steps
  (a) Generate T.obs based on the MCMC samples
  (b) Generate T.rep based on the posterior distribution of the parameters
  (c) Compare  T.obs and T.rep, and calculate the P values.

__*model.selection.criteria*__ function calculates DIC and WAIC for the fitted model
__*traceplot*__ function visualize the MCMC chain for model parameters
__*trajplot*__ function plot the longitudinal trajectory of features by local and global clustering

## Pre-process (Setting up)

In this example, the `epileptic.qol` data set from `joinrRML` package was used. The variables used here include `anxiety score`, `depress score` and `AEP score`. All of the variables are continuous.

```{r, warning=F, message=F, fig.height= 5, fig.width= 8, fig.align='center', fig.cap= "Spaghtti plot for each marker"}
library(BCClong)
library(joineRML)
library(ggplot2)
library(cowplot)
# convert days to months
epileptic.qol$time_month <- epileptic.qol$time/30.25 		
# Sort by ID and time
epileptic.qol <- epileptic.qol[order(epileptic.qol$id,epileptic.qol$time_month),]  

## Make Spaghetti Plots to Visualize
p1 <- ggplot(data =epileptic.qol, aes(x =time_month, y = anxiety, group = id))+
	 	 geom_point() + geom_line() +
		 geom_smooth(method = "loess", size = 1.5,group =1,se = FALSE, span=2) +
		theme(legend.position = "none",
			plot.title = element_text(size = 20, face = "bold"),
			axis.text=element_text(size=20),
			axis.title=element_text(size=20),
			axis.text.x = element_text(angle = 0 ),
			strip.text.x = element_text(size = 20, angle = 0),
			strip.text.y = element_text(size = 20,face="bold")) +
		xlab("Time (months)") + ylab("anxiety")
p2 <- ggplot(data =epileptic.qol, aes(x =time_month, y = depress, group = id))+
	 	 geom_point() +
		 geom_line() +
		 geom_smooth(method = "loess", size = 1.5,group =1,se = FALSE, span=2) +
		theme(legend.position = "none",
			plot.title = element_text(size = 20, face = "bold"),
			axis.text=element_text(size=20),
			axis.title=element_text(size=20),
			axis.text.x = element_text(angle = 0 ),
			strip.text.x = element_text(size = 20, angle = 0),
			strip.text.y = element_text(size = 20,face="bold")) +
		xlab("Time (months)") + ylab("depress")
p3 <- ggplot(data =epileptic.qol, aes(x =time_month, y = aep, group = id))+
	  	geom_point() +
		 geom_line() +
		 geom_smooth(method = "loess", size = 1.5,group =1,se = FALSE, span=2) +
		theme(legend.position = "none",
			plot.title = element_text(size = 20, face = "bold"),
			axis.text=element_text(size=20),
			axis.title=element_text(size=20),
			axis.text.x = element_text(angle = 0 ),
			strip.text.x = element_text(size = 20, angle = 0),
			strip.text.y = element_text(size = 20,face="bold")) +
		xlab("Time (months)") + ylab("aep")

plot_grid(p1,NULL,p2,NULL,p3,NULL,labels=c("(A)","", "(B)","","(C)",""), nrow = 1,
		align = "v", rel_widths = c(1,0.1,1,0.1,1,0.1))

epileptic.qol$anxiety_scale <- scale(epileptic.qol$anxiety)
epileptic.qol$depress_scale <- scale(epileptic.qol$depress)
epileptic.qol$aep_scale <- scale(epileptic.qol$aep)
dat <- epileptic.qol
```

## Choose Best Number Of Clusters

We can compute the mean adjusted adherence to determine the number of clusters using the code below. Since this program takes a long time to run, this chunk of code will not run in this tutorial file.

```r
# computed the mean adjusted adherence to determine the number of clusters
set.seed(20220929)
alpha.adjust <- NULL
DIC <- WAIC <- NULL
for (k in 1:5){
  fit.BCC <- BCC.multi (
    mydat = list(dat$anxiety_scale,dat$depress_scale,dat$aep_scale),
    dist = c("gaussian"),
    id = list(dat$id),
    time = list(dat$time),
    formula =list(y ~ time +  (1 |id)),
    num.cluster = k,
    initials= NULL,
    burn.in = 1000,
    thin = 10,
    per = 100,
    max.iter = 2000)
   alpha.adjust <- c(alpha.adjust, fit.BCC$alpha.adjust)
   res <- model.selection.criteria(fit.BCC, fast_version=0)
   DIC <- c(DIC,res$DIC)
   WAIC <- c(WAIC,res$WAIC)}
   
   num.cluster <- 1:5
par(mfrow=c(1,3))
plot(num.cluster[2:5], alpha.adjust, type="o",cex.lab=1.5,cex.axis=1.5,cex.main=1.5,lwd=2,
	xlab="Number of Clusters",
	ylab="mean adjusted adherence",main="mean adjusted adherence")
plot(num.cluster, DIC, type="o",cex=1.5, cex.lab=1.5,cex.axis=1.5,cex.main=1.5,lwd=2,
	xlab="Number of Clusters",ylab="DIC",main="DIC")
plot(num.cluster, WAIC, type="o",cex=1.5, cex.lab=1.5,cex.axis=1.5,cex.main=1.5,lwd=2,
	xlab="Number of Clusters",ylab="WAIC",main="WAIC")
```

## Fit BCC Model Using BCC.multi Function

Here, We used gaussian distribution for all three markers. The number of clusters was set to 2 because it has highest mean adjusted adherence. All hyper parameters were set to default.

For the purpose of this tutorial, the MCMC iteration will be set to a small number to minimize the compile time and the result will be read from the pre-compiled RData file using `data(epil1), data(epil1)` and `data(epil1)`

```r
# Fit the final model with the number of cluster 2 (largest mean adjusted adherence)
fit.BCC2 <-  BCC.multi (
    mydat = list(dat$anxiety_scale,dat$depress_scale,dat$aep_scale),
    dist = c("gaussian"),
    id = list(dat$id),
    time = list(dat$time),
  formula =list(y ~ time + (1|id)),
  num.cluster = 2,
  burn.in = 10, 			# number of samples discarded
  thin = 1, 				# thinning
  per = 10, 				# output information every "per" iteration
  max.iter = 30) 			# maximum number of iteration

fit.BCC2b <-  BCC.multi (
    mydat = list(dat$anxiety_scale,dat$depress_scale,dat$aep_scale),
    dist = c("gaussian"),
    id = list(dat$id),
    time = list(dat$time),
  formula =list(y ~ time + (1 + time|id)),
  num.cluster = 2,
  burn.in = 10,
  thin = 1,
  per = 10,
  max.iter = 30)

fit.BCC2c <-  BCC.multi (
    mydat = list(dat$anxiety_scale,dat$depress_scale,dat$aep_scale),
    dist = c("gaussian"),
    id = list(dat$id),
    time = list(dat$time),
  formula =list(y ~ time + time2 + (1 + time|id)),
  num.cluster = 2,
  burn.in = 10,
  thin = 1,
  per = 10,
  max.iter = 30)
```
Load the pre-compiled results

```{r, warning=F, message=F}
data(epil1)
data(epil2)
data(epil3)
fit.BCC2 <- epil1
fit.BCC2b <- epil2
fit.BCC2c <- epil3
fit.BCC2b$cluster.global <- factor(fit.BCC2b$cluster.global,
	labels=c("Cluster 1","Cluster 2"))
table(fit.BCC2$cluster.global, fit.BCC2b$cluster.global)

fit.BCC2c$cluster.global <- factor(fit.BCC2c$cluster.global,
	labels=c("Cluster 1","Cluster 2"))
table(fit.BCC2$cluster.global, fit.BCC2c$cluster.global)
```

## Printing Summary Statistics for key model parameters
To print the BCC model

```r
print(fit.BCC2)
```

To print the summary statistics for all parameters 

```r
summary(fit.BCC2)
```

To print the proportion \pi for each cluster (mean, sd, 2.5% and 97.5% percentile)
geweke statistics (geweke.stat) between -2 and 2 suggests the parameters converge

```r
fit.BCC2$summary.stat$PPI
```
The code below prints out all major parameters

```{r, warning=F, message=F}
summary(fit.BCC2)
```

## Visualize Clusters

Generic plot can be used on BCC object, all relevant plots will be generate one by one using return key

```r
plot(fit.BCC2)
```

We can also use the __*traceplot*__ function to plot the MCMC process and the __*trajplot*__ function to plot the trajectory for each feature.

```{r, warning=F, message=F, fig.height=5, fig.width=8, fig.align='center'}
#=====================================================#
# Trace-plot for key model parameters
#=====================================================#
traceplot(fit=fit.BCC2, parameter="PPI",ylab="pi",xlab="MCMC samples")
traceplot(fit=fit.BCC2, parameter="ALPHA",ylab="alpha",xlab="MCMC samples")
traceplot(fit=fit.BCC2,cluster.indx = 1, feature.indx=1,parameter="GA",ylab="GA",xlab="MCMC samples")
traceplot(fit=fit.BCC2,cluster.indx = 1, feature.indx=2,parameter="GA",ylab="GA",xlab="MCMC samples")
traceplot(fit=fit.BCC2,cluster.indx = 1, feature.indx=3,parameter="GA",ylab="GA",xlab="MCMC samples")
traceplot(fit=fit.BCC2,cluster.indx = 2, feature.indx=1,parameter="GA",ylab="GA",xlab="MCMC samples")
traceplot(fit=fit.BCC2,cluster.indx = 2, feature.indx=2,parameter="GA",ylab="GA",xlab="MCMC samples")
traceplot(fit=fit.BCC2,cluster.indx = 2, feature.indx=3,parameter="GA",ylab="GA",xlab="MCMC samples")
```


```{r, warning=F, message=F, fig.width=12, fig.height=6, fig.align='center'}
#=====================================================#
# Trajectory plot for features
#=====================================================#
gp1 <- trajplot(fit=fit.BCC2,feature.ind=1,
			which.cluster = "local.cluster",
			title= bquote(paste("Local Clustering (",hat(alpha)[1] ==.(round(fit.BCC2$alpha[1],2)),")")),
			xlab="time (months)",ylab="anxiety",color=c("#00BA38", "#619CFF"))
gp2 <- trajplot(fit=fit.BCC2,feature.ind=2,
			which.cluster = "local.cluster",
			title= bquote(paste("Local Clustering (",hat(alpha)[2] ==.(round(fit.BCC2$alpha[2],2)),")")),
			xlab="time (months)",ylab="depress",color=c("#00BA38", "#619CFF"))
gp3 <- trajplot(fit=fit.BCC2,feature.ind=3,
			which.cluster = "local.cluster",
			title= bquote(paste("Local Clustering (",hat(alpha)[3] ==.(round(fit.BCC2$alpha[3],2)),")")),
			xlab="time (months)",ylab="aep",color=c("#00BA38", "#619CFF"))
gp4 <- trajplot(fit=fit.BCC2,feature.ind=1,
			which.cluster = "global.cluster",
			title="Global Clustering",xlab="time (months)",ylab="anxiety",color=c("#00BA38", "#619CFF"))
gp5 <- trajplot(fit=fit.BCC2,feature.ind=2,
			which.cluster = "global.cluster",
			title="Global Clustering",xlab="time (months)",ylab="depress",color=c("#00BA38", "#619CFF"))
gp6 <- trajplot(fit=fit.BCC2,feature.ind=3,
			which.cluster = "global.cluster",
			title="Global Clustering",
			xlab="time (months)",ylab="aep",color=c("#00BA38", "#619CFF"))
library(cowplot)
plot_grid(gp1, gp2,gp3,gp4,gp5,gp6,
          labels=c("(A)", "(B)", "(C)", "(D)", "(E)", "(F)"),
	    ncol = 3,   align = "v" )
plot_grid(gp1,NULL,gp2,NULL,gp3,NULL,
	    gp4,NULL,gp5,NULL,gp6,NULL,
		labels=c("(A)","", "(B)","","(C)","","(D)","","(E)","","(F)",""), nrow = 2,
		align = "v", rel_widths = c(1,0.1,1,0.1,1,0.1))
```

## Posterior Check

The __*BayesT*__ function will be used for posterior check. Here we used the pre-compiled results, un-comment the line `res <- BayesT(fit=fit.BCC2)` to try your own. The pre-compiled data file can be attached using `data("conRes")` For this function, the p-value between 0.3 to 0.7 was consider reasonable. In the scatter plot, the data pints should be evenly distributed around y = x. 

```{r, message=F, warning=F, fig.height=5, fig.width=7, fig.align='center'}
#res <- BayesT(fit=fit.BCC2)
data("conRes")
res <- conRes
plot(log(res$T.obs),log(res$T.rep),xlim=c(8.45,8.7), cex=1.5,
	ylim=c(8.45,8.7),xlab="Observed T statistics (in log scale)", ylab = "Predicted T statistics (in log scale)")
abline(0,1,lwd=2,col=2)

p.value <- sum(res$T.rep > res$T.obs)/length(res$T.rep)
p.value

fit.BCC2$cluster.global <- factor(fit.BCC2$cluster.global,labels=c("Cluster 1","Cluster 2"))
boxplot(fit.BCC2$postprob ~ fit.BCC2$cluster.global,ylim=c(0,1),xlab="",ylab="Posterior Cluster Probability")
```

## Package References

[Tan, Z., Shen, C., Lu, Z. (2022) BCClong: an R package for performing Bayesian Consensus Clustering model for clustering continuous, discrete and categorical longitudinal data.](https://github.com/ZhiwenT/BCClong)

```{r}
sessionInfo()
```