params <-
list(EVAL = TRUE)

## ---- echo = FALSE------------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  message = FALSE,
  warning = FALSE,
  eval = if (isTRUE(exists("params"))) params$EVAL else FALSE
)


## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
  echo = TRUE,
  dpi = 100,
  fig.asp = 0.8,
  fig.width = 6,
  out.width = "60%",
  fig.align = "center")
library(mvgam)
library(ggplot2)
library(dplyr)
# A custom ggplot2 theme
theme_set(theme_classic(base_size = 12, base_family = 'serif') +
            theme(axis.line.x.bottom = element_line(colour = "black",
                                                    size = 1),
                  axis.line.y.left = element_line(colour = "black",
                                                  size = 1)))
options(ggplot2.discrete.colour = c("#A25050",
                                    "#00008b",
                                    'darkred',
                                    "#010048"),
        ggplot2.discrete.fill = c("#A25050",
                                  "#00008b",
                                  'darkred',
                                  "#010048"))


## -----------------------------------------------------------------------------
set.seed(999)
# Simulate observations for species 1, which shows a declining trend and 0.7 detection probability
data.frame(site = 1,
           # five replicates per year; six years
           replicate = rep(1:5, 6),
           time = sort(rep(1:6, 5)),
           species = 'sp_1',
           # true abundance declines nonlinearly
           truth = c(rep(28, 5),
                     rep(26, 5),
                     rep(23, 5),
                     rep(16, 5),
                     rep(14, 5),
                     rep(14, 5)),
           # observations are taken with detection prob = 0.7
           obs = c(rbinom(5, 28, 0.7),
                   rbinom(5, 26, 0.7),
                   rbinom(5, 23, 0.7),
                   rbinom(5, 15, 0.7),
                   rbinom(5, 14, 0.7),
                   rbinom(5, 14, 0.7))) %>%
  # add 'series' information, which is an identifier of site, replicate and species
  dplyr::mutate(series = paste0('site_', site,
                                '_', species,
                                '_rep_', replicate),
                time = as.numeric(time),
                # add a 'cap' variable that defines the maximum latent N to 
                # marginalize over when estimating latent abundance; in other words
                # how large do we realistically think the true abundance could be?
                cap = 100) %>%
  dplyr::select(- replicate) -> testdat

# Now add another species that has a different temporal trend and a smaller 
# detection probability (0.45 for this species)
testdat = testdat %>%
  dplyr::bind_rows(data.frame(site = 1,
                              replicate = rep(1:5, 6),
                              time = sort(rep(1:6, 5)),
                              species = 'sp_2',
                              truth = c(rep(4, 5),
                                        rep(7, 5),
                                        rep(15, 5),
                                        rep(16, 5),
                                        rep(19, 5),
                                        rep(18, 5)),
                              obs = c(rbinom(5, 4, 0.45),
                                      rbinom(5, 7, 0.45),
                                      rbinom(5, 15, 0.45),
                                      rbinom(5, 16, 0.45),
                                      rbinom(5, 19, 0.45),
                                      rbinom(5, 18, 0.45))) %>%
                     dplyr::mutate(series = paste0('site_', site,
                                                   '_', species,
                                                   '_rep_', replicate),
                                   time = as.numeric(time),
                                   cap = 50) %>%
                     dplyr::select(-replicate))


## -----------------------------------------------------------------------------
testdat$species <- factor(testdat$species,
                          levels = unique(testdat$species))
testdat$series <- factor(testdat$series,
                         levels = unique(testdat$series))


## -----------------------------------------------------------------------------
dplyr::glimpse(testdat)
head(testdat, 12)


## -----------------------------------------------------------------------------
testdat %>%
  # each unique combination of site*species is a separate process
  dplyr::mutate(trend = as.numeric(factor(paste0(site, species)))) %>%
  dplyr::select(trend, series) %>%
  dplyr::distinct() -> trend_map
trend_map


## ----include = FALSE, results='hide'------------------------------------------
mod <- mvgam(
  # the observation formula sets up linear predictors for
  # detection probability on the logit scale
  formula = obs ~ species - 1,
  
  # the trend_formula sets up the linear predictors for 
  # the latent abundance processes on the log scale
  trend_formula = ~ s(time, by = trend, k = 4) + species,
  
  # the trend_map takes care of the mapping
  trend_map = trend_map,
  
  # nmix() family and data
  family = nmix(),
  data = testdat,
  
  # priors can be set in the usual way
  priors = c(prior(std_normal(), class = b),
             prior(normal(1, 1.5), class = Intercept_trend)),
  samples = 1000)


## ----eval = FALSE-------------------------------------------------------------
## mod <- mvgam(
##   # the observation formula sets up linear predictors for
##   # detection probability on the logit scale
##   formula = obs ~ species - 1,
## 
##   # the trend_formula sets up the linear predictors for
##   # the latent abundance processes on the log scale
##   trend_formula = ~ s(time, by = trend, k = 4) + species,
## 
##   # the trend_map takes care of the mapping
##   trend_map = trend_map,
## 
##   # nmix() family and data
##   family = nmix(),
##   data = testdat,
## 
##   # priors can be set in the usual way
##   priors = c(prior(std_normal(), class = b),
##              prior(normal(1, 1.5), class = Intercept_trend)),
##   samples = 1000)


## -----------------------------------------------------------------------------
code(mod)


## -----------------------------------------------------------------------------
summary(mod)


## -----------------------------------------------------------------------------
loo(mod)


## -----------------------------------------------------------------------------
plot(mod, type = 'smooths', trend_effects = TRUE)


## -----------------------------------------------------------------------------
marginaleffects::plot_predictions(mod, condition = 'species',
                 type = 'detection') +
  ylab('Pr(detection)') +
  ylim(c(0, 1)) +
  theme_classic() +
  theme(legend.position = 'none')


## -----------------------------------------------------------------------------
hc <- hindcast(mod, type = 'latent_N')

# Function to plot latent abundance estimates vs truth
plot_latentN = function(hindcasts, data, species = 'sp_1'){
  all_series <- unique(data %>%
                         dplyr::filter(species == !!species) %>%
                         dplyr::pull(series))
  
  # Grab the first replicate that represents this series
  # so we can get the true simulated values
  series <- as.numeric(all_series[1])
  truths <- data %>%
    dplyr::arrange(time, series) %>%
    dplyr::filter(series == !!levels(data$series)[series]) %>%
    dplyr::pull(truth)
  
  # In case some replicates have missing observations,
  # pull out predictions for ALL replicates and average over them
  hcs <- do.call(rbind, lapply(all_series, function(x){
    ind <- which(names(hindcasts$hindcasts) %in% as.character(x))
    hindcasts$hindcasts[[ind]]
  }))
  
  # Calculate posterior empirical quantiles of predictions
  pred_quantiles <- data.frame(t(apply(hcs, 2, function(x) 
    quantile(x, probs = c(0.05, 0.2, 0.3, 0.4, 
                          0.5, 0.6, 0.7, 0.8, 0.95)))))
  pred_quantiles$time <- 1:NROW(pred_quantiles)
  pred_quantiles$truth <- truths
  
  # Grab observations
  data %>%
    dplyr::filter(series %in% all_series) %>%
    dplyr::select(time, obs) -> observations
  
  # Plot
  ggplot(pred_quantiles, aes(x = time, group = 1)) +
    geom_ribbon(aes(ymin = X5., ymax = X95.), fill = "#DCBCBC") + 
    geom_ribbon(aes(ymin = X30., ymax = X70.), fill = "#B97C7C") +
    geom_line(aes(x = time, y = truth),
              colour = 'black', linewidth = 1) +
    geom_point(aes(x = time, y = truth),
               shape = 21, colour = 'white', fill = 'black',
               size = 2.5) +
    geom_jitter(data = observations, aes(x = time, y = obs),
                width = 0.06, 
                shape = 21, fill = 'darkred', colour = 'white', size = 2.5) +
    labs(y = 'Latent abundance (N)',
         x = 'Time',
         title = species)
}


## -----------------------------------------------------------------------------
plot_latentN(hc, testdat, species = 'sp_1')
plot_latentN(hc, testdat, species = 'sp_2')


## -----------------------------------------------------------------------------
# Date link
load(url('https://github.com/doserjef/spAbundance/raw/main/data/dataNMixSim.rda'))
data.one.sp <- dataNMixSim

# Pull out observations for one species
data.one.sp$y <- data.one.sp$y[1, , ]

# Abundance covariates that don't change across repeat sampling observations
abund.cov <- dataNMixSim$abund.covs[, 1]
abund.factor <- as.factor(dataNMixSim$abund.covs[, 2])

# Detection covariates that can change across repeat sampling observations
# Note that `NA`s are not allowed for covariates in mvgam, so we randomly
# impute them here
det.cov <- dataNMixSim$det.covs$det.cov.1[,]
det.cov[is.na(det.cov)] <- rnorm(length(which(is.na(det.cov))))
det.cov2 <- dataNMixSim$det.covs$det.cov.2
det.cov2[is.na(det.cov2)] <- rnorm(length(which(is.na(det.cov2))))


## -----------------------------------------------------------------------------
mod_data <- do.call(rbind,
                    lapply(1:NROW(data.one.sp$y), function(x){
                      data.frame(y = data.one.sp$y[x,],
                                 abund_cov = abund.cov[x],
                                 abund_fac = abund.factor[x],
                                 det_cov = det.cov[x,],
                                 det_cov2 = det.cov2[x,],
                                 replicate = 1:NCOL(data.one.sp$y),
                                 site = paste0('site', x))
                    })) %>%
  dplyr::mutate(species = 'sp_1',
                series = as.factor(paste0(site, '_', species, '_', replicate))) %>%
  dplyr::mutate(site = factor(site, levels = unique(site)),
                species = factor(species, levels = unique(species)),
                time = 1,
                cap = max(data.one.sp$y, na.rm = TRUE) + 20)


## -----------------------------------------------------------------------------
NROW(mod_data)
dplyr::glimpse(mod_data)
head(mod_data)


## -----------------------------------------------------------------------------
mod_data %>%
  # each unique combination of site*species is a separate process
  dplyr::mutate(trend = as.numeric(factor(paste0(site, species)))) %>%
  dplyr::select(trend, series) %>%
  dplyr::distinct() -> trend_map

trend_map %>%
  dplyr::arrange(trend) %>%
  head(12)


## ----include = FALSE, results='hide'------------------------------------------
mod <- mvgam(
  # effects of covariates on detection probability;
  # here we use penalized splines for both continuous covariates
  formula = y ~ s(det_cov, k = 3) + s(det_cov2, k = 3),
  
  # effects of the covariates on latent abundance;
  # here we use a penalized spline for the continuous covariate and
  # hierarchical intercepts for the factor covariate
  trend_formula = ~ s(abund_cov, k = 3) +
    s(abund_fac, bs = 're'),
  
  # link multiple observations to each site
  trend_map = trend_map,
  
  # nmix() family and supplied data
  family = nmix(),
  data = mod_data,
  
  # standard normal priors on key regression parameters
  priors = c(prior(std_normal(), class = 'b'),
             prior(std_normal(), class = 'Intercept'),
             prior(std_normal(), class = 'Intercept_trend'),
             prior(std_normal(), class = 'sigma_raw_trend')),
  
  # use Stan's variational inference for quicker results
  algorithm = 'meanfield',
  
  # no need to compute "series-level" residuals
  residuals = FALSE,
  samples = 1000)


## ----eval=FALSE---------------------------------------------------------------
## mod <- mvgam(
##   # effects of covariates on detection probability;
##   # here we use penalized splines for both continuous covariates
##   formula = y ~ s(det_cov, k = 4) + s(det_cov2, k = 4),
## 
##   # effects of the covariates on latent abundance;
##   # here we use a penalized spline for the continuous covariate and
##   # hierarchical intercepts for the factor covariate
##   trend_formula = ~ s(abund_cov, k = 4) +
##     s(abund_fac, bs = 're'),
## 
##   # link multiple observations to each site
##   trend_map = trend_map,
## 
##   # nmix() family and supplied data
##   family = nmix(),
##   data = mod_data,
## 
##   # standard normal priors on key regression parameters
##   priors = c(prior(std_normal(), class = 'b'),
##              prior(std_normal(), class = 'Intercept'),
##              prior(std_normal(), class = 'Intercept_trend'),
##              prior(std_normal(), class = 'sigma_raw_trend')),
## 
##   # use Stan's variational inference for quicker results
##   algorithm = 'meanfield',
## 
##   # no need to compute "series-level" residuals
##   residuals = FALSE,
##   samples = 1000)


## -----------------------------------------------------------------------------
summary(mod, include_betas = FALSE)


## -----------------------------------------------------------------------------
marginaleffects::avg_predictions(mod, type = 'detection')


## -----------------------------------------------------------------------------
abund_plots <- plot(conditional_effects(mod,
                                        type = 'link',
                                        effects = c('abund_cov',
                                                    'abund_fac')),
                    plot = FALSE)


## -----------------------------------------------------------------------------
abund_plots[[1]] +
  ylab('Expected latent abundance')


## -----------------------------------------------------------------------------
abund_plots[[2]] +
  ylab('Expected latent abundance')


## -----------------------------------------------------------------------------
det_plots <- plot(conditional_effects(mod,
                                      type = 'detection',
                                      effects = c('det_cov',
                                                  'det_cov2')),
                  plot = FALSE)


## -----------------------------------------------------------------------------
det_plots[[1]] +
  ylab('Pr(detection)')
det_plots[[2]] +
  ylab('Pr(detection)')


## -----------------------------------------------------------------------------
fivenum_round = function(x)round(fivenum(x, na.rm = TRUE), 2)

marginaleffects::plot_predictions(mod, 
                 newdata = marginaleffects::datagrid(det_cov = unique,
                                    det_cov2 = fivenum_round),
                 by = c('det_cov', 'det_cov2'),
                 type = 'detection') +
  theme_classic() +
  ylab('Pr(detection)')