%\VignetteEngine{knitr::knitr} %\VignetteIndexEntry{Examples in statistical ecology} %\VignetteDepends{tramME, gamm4, gamlss.dist, survival, mgcv, glmmTMB, xtable, multcomp} \documentclass[11pt]{article} \usepackage{graphicx} \usepackage[margin=0.8in]{geometry} \usepackage[margin=0.5in, font=small]{caption} \usepackage{amsfonts,amstext,amsmath,amssymb,amsthm} \usepackage[utf8]{inputenc} \usepackage[comma,authoryear]{natbib} \usepackage{xcolor} \usepackage[colorlinks=true, urlcolor=black, linkcolor=blue, citecolor=blue]{hyperref} \usepackage{booktabs} \usepackage{doi} \usepackage[inline]{enumitem} \renewcommand{\tt}[1]{\texttt{#1}} \newcommand{\todo}[1]{\textbf{\color{red} TODO: #1}} \newcommand{\note}[1]{\textbf{\color{orange} NOTE: #1}} \newcommand{\code}[1]{\texttt{#1}} \newcommand{\pkg}[1]{\texttt{#1}} \newcommand{\CRANpkg}[1]{\href{https://CRAN.R-project.org/package=#1}{\pkg{#1}}}% % Math notation \newcommand{\IP}{{\mathbb{P}}} \newcommand{\IR}{{\mathbb{R}}} \newcommand{\IE}{{\mathbb{E}}} \newcommand{\U}{{\mathbf{U}}} \newcommand{\X}{{\boldsymbol{X}}} \newcommand{\bH}{{\boldsymbol{H}}} \newcommand{\bS}{{\boldsymbol{S}}} \newcommand{\x}{{\boldsymbol{x}}} \newcommand{\Y}{{\boldsymbol{Y}}} \newcommand{\y}{{\boldsymbol{y}}} \newcommand{\z}{{\boldsymbol{z}}} % \newcommand{\bcB}{{\boldsymbol{\mathcal{B}}}} % \newcommand{\bcb}{{\boldsymbol{\mathcal{b}}}} \newcommand{\ba}{{\boldsymbol{a}}} \newcommand{\bb}{{\boldsymbol{b}}} \newcommand{\bbeta}{{\boldsymbol{\beta}}} \newcommand{\bgamma}{{\boldsymbol{\gamma}}} \newcommand{\bdelta}{{\boldsymbol{\delta}}} \newcommand{\blambda}{{\boldsymbol{\lambda}}} \newcommand{\bvartheta}{{\boldsymbol{\vartheta}}} \newcommand{\bGamma}{{\boldsymbol{\Gamma}}} \newcommand{\bSigma}{{\boldsymbol{\Sigma}}{}} \newcommand{\vect}[1]{\boldsymbol{#1}} \newcommand{\cN}{{\mathcal{N}}} \newcommand{\cL}{{\mathcal{L}}} \newcommand{\0}{{\mathbf{0}}} \newcommand{\T}{^{\top}} \renewcommand{\d}{\mathsf{\,d}} \usepackage{accents} \newcommand{\ubar}[1]{\underaccent{\bar}{#1}} %%%%% Section titles, table, figure, equation labels %% Sections with A, B, C etc \let\thesectionWithoutS\thesection %% <- store old definition \renewcommand\thesection{S\thesectionWithoutS} %% equation, figure, table references prepended with S \let\theequationWithoutS\theequation %% <- store old definition \renewcommand\theequation{S\theequationWithoutS} \let\thefigureWithoutS\thefigure %% <- store old definition \renewcommand\thefigure{S\thefigureWithoutS} \let\thetableWithoutS\thetable %% <- store old definition \renewcommand\thetable{S\thetableWithoutS} %%%%%% Column header formatting in xtables %% https://stackoverflow.com/a/33237779/10826854 \usepackage{array} \newcolumntype{R}[1]{>{\raggedleft\let\newline\\\arraybackslash\hspace{0pt}}p{#1}} \setlength{\parskip}{0.5em} \renewcommand{\baselinestretch}{1.2} % \newcommand{\address}[1]{ % \addvspace{\baselineskip}\noindent % \textbf{Affiliations:} \\ % \emph{#1}} % \newcommand{\email}[1]{\href{mailto:#1}{\normalfont\texttt{#1}}} <>= knitr::opts_chunk$set(size = "small", prompt = TRUE, comment = NA, out.width=".9\\linewidth") knitr::knit_hooks$set( document = function(x) {sub('\\usepackage[]{color}', '\\usepackage{xcolor}', x, fixed = TRUE)} ) oldpar <- par(no.readonly = TRUE) ## NOTE: for setting back at the end oldopt <- options() options(prompt = "R> ", continue = "+ ") options(width = 80, digits = 3) ## Dependencies library("tramME") library("survival") library("mgcv") library("glmmTMB") library("xtable") library("gamm4") @ \title{\textbf{Online 
Appendix:}\\ Flexible Regression for Correlated and Censored Ecological Data
with Mixed-Effects Additive Transformation Models }
\author{B\'alint Tam\'asi \\ {\small \url{Balint.Tamasi@uzh.ch}}
  \and Torsten Hothorn \\ {\small \url{Torsten.Hothorn@R-project.org}} }
\date{16/3/2023}
\begin{document}
\maketitle
\abstract{
This vignette serves as the online appendix for the manuscript
\emph{``Flexible Regression for Correlated and Censored Ecological Data with
Mixed-Effects Additive Transformation Models''}. %% \citet{Tamasi_2022}.
It presents details on estimation and inference in the titular model class,
four example analyses that
% use mixed-effects additive transformation models to reanalyze
use these models to reanalyze ecological phenomena from recently published
studies, as well as a small simulation exercise to verify the correctness of
the implementation. }

\section{Estimation and inference}\label{sec:inference}

In this section, we give a brief summary of maximum likelihood estimation and
inference in mixed-effects additive transformation models.
The model can be written in its general form as
%
\begin{align}
\IP\left(Y_{i}\leq y \mid \x_{i}, \z_{i}, \bgamma \right) &=
  F\left(h(y) + f_{1}(x_{1i}) + f_{2}(x_{2i}, x_{3i}) + \dots +
  \x_{+i}\T\bbeta + \z_{i}\T\bgamma\right) \qquad
  \bgamma \sim \cN_{q}(\0, \bSigma), \label{eq:memodel}
\end{align}
%
where $F(\cdot)$ is the inverse link function and $h(y)$ is the baseline
transformation function.
Each smooth term ($f_{j}(\cdot)$) can be represented with the help of basis
functions as $f_{j}(x_{ji}) = \bb_{j}(x_{ji})\T\bdelta_{j}$.
To prevent overfitting, penalties of the general form
$\lambda_{j}\bdelta_{j}\T\bS_{j}\bdelta_{j}$ can be added to the model.
$\lambda_{j}$ is the smoothing parameter that sets the amount of penalty
imposed on the $j$th term and thus controls the smoothness of $f_{j}(\cdot)$,
while the penalty matrix $\bS_{j}$ defines how exactly the parameters
$\bdelta_{j}$ are penalized.
The log-likelihood of the model in Equation~\ref{eq:memodel} can be formulated
with the help of appropriate penalty terms for the smooth effects as well as
the random effects
%
\begin{align}
\ell(\bvartheta, \bbeta, \bSigma, \blambda) &=
  \ell^{\star}(\bvartheta, \bbeta, \bgamma, \bdelta)
  - \frac{1}{2} \sum_{j=1}^{J}\lambda_{j}\bdelta_{j}\T\bS_{j}\bdelta_{j}
  - \frac{1}{2} \bgamma\T \bSigma^{-1}\bgamma + c, \label{eq:loglik}
\end{align}
%
where $\ell^{\star}(\bvartheta, \bbeta, \bgamma, \bdelta) =
\sum_{i=1}^{N}\ell_{i}^{\star}(\bvartheta, \bbeta, \bgamma, \bdelta)$ is the
unpenalized or conditional (on the vector of random effects and penalized
coefficients) log-likelihood, which can be written as the sum of the
log-likelihood contributions of the individual observations, assuming
conditional independence.
$\blambda$ denotes the vector of smoothing parameters corresponding to the
smooth terms $f_{1}(\cdot), f_{2}(\cdot), \dots, f_{J}(\cdot)$, and
$\bdelta = \left(\bdelta_{1}\T, \bdelta_{2}\T, \dots, \bdelta_{J}\T\right)\T$.
The covariance matrix of the stacked vector of random effects ($\bSigma$) is
positive definite and typically has a block structure, which can be leveraged
in the calculations.
The term $c$ is a constant, which does not depend on the parameters (either
fixed or random) of the model.
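To make the structure of Equation~\ref{eq:loglik} concrete, the two penalty
terms translate into a few lines of \textsf{R}.
The following sketch assumes hypothetical inputs that are not part of the
\pkg{tramME} interface: a vector \code{lambda} of smoothing parameters, lists
\code{delta} and \code{S} of the corresponding coefficient blocks and penalty
matrices, a random effects vector \code{gamma} with covariance matrix
\code{Sigma}, and the unpenalized value \code{ll\_star}.
%
<<penalty-sketch, eval = FALSE>>=
## Smooth penalty: 1/2 * sum_j lambda_j * t(delta_j) %*% S_j %*% delta_j
pen_smooth <- 0.5 * sum(mapply(
  function(lj, dj, Sj) lj * c(crossprod(dj, Sj %*% dj)),
  lambda, delta, S))
## Random effects penalty: 1/2 * t(gamma) %*% solve(Sigma) %*% gamma
pen_ranef <- 0.5 * c(crossprod(gamma, solve(Sigma, gamma)))
## Penalized log-likelihood, up to the constant c
ll <- ll_star - pen_smooth - pen_ranef
@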
The conditional log-likelihood contributions of the observations, given that
they are exactly observed, can be written as
%
\begin{align*}
\ell_{i}^{\star}(\bvartheta, \bbeta, \bgamma, \bdelta) &=
  \log F'\left(h(y_{i}) + f_{1}(x_{1i}) + f_{2}(x_{2i}, x_{3i}) + \dots +
  \x_{+i}\T\bbeta + \z_{i}\T\bgamma\right) + \log h'(y_{i})
\end{align*}
%
where $F'(\cdot)$ and $h'(\cdot)$ are the derivative functions of $F(\cdot)$
and $h(\cdot)$, respectively.
Because we directly parameterize the cumulative distribution function of the
outcome in Equation~\ref{eq:memodel}, it is straightforward to express the
log-likelihood contributions of randomly censored observations:
%
\begin{align}
\ell_{i}^{\star}(\bvartheta, \bbeta, \bgamma, \bdelta) &=
  \log\left(\IP\left(Y_{i}\leq \bar{y}_{i} \mid \x_{i}, \z_{i}, \bgamma \right)
  - \IP\left(Y_{i}\leq \ubar{y}_{i} \mid \x_{i}, \z_{i}, \bgamma \right)
  \right), \label{eq:ic}
\end{align}
%
for an interval-censored observation on $\ubar{y}_{i}\leq y_{i} < \bar{y}_{i}$.
The contribution of left-censored ($-\infty < y_{i} < \bar{y}_{i}$) and
right-censored ($\ubar{y}_{i} \leq y_{i} < \infty$) observations can be
expressed by setting the second part of this expression to 0 and the first
part to 1, respectively.

Likelihood-based inference in the family of mixed-effects transformation
models is presented by \citet{Tamasi_Hothorn_2021}.
The estimation procedure described there is directly applicable in the
setting with penalized smooth additive terms, utilizing their mixed model
representation \citep{Ruppert_2003, Wand_smoothing_2003}.
In the case of some smoothing bases, a reparameterization is necessary to fit
into a mixed model-based estimation framework (\citealp[Appendix]{Wood_2004}
and \citealp[Chapter~8]{Fahrmeir_2013}).
The marginal log-likelihood function of the model can be efficiently
evaluated by integrating over the vector of random effects using the Laplace
approximation (see Section~\ref{sec:compute} for details).
In the mixed-effects representation, the smoothing parameters can be
estimated as variance components of the mixed model via maximum likelihood or
restricted maximum likelihood (REML).

Frequentist inference for the penalized smooth terms can be based on the
joint covariance matrix estimate of fixed and random parameters using the
generalized delta method \citep{Kristensenetal_2016}.
This estimate approximates the mean squared error of the random effects
predictions \citep{Zheng_Cadigan_2021}, and it is also in agreement with
results derived in a Bayesian setting for independent hierarchical models
\citep{Kass_Steffey_1989}.

The effective degrees of freedom (EDF) of the individual smooth terms give an
approximation of the effective number of parameters used in modeling these
penalized smooth functions.
Following \citet{Wood_2016} and \citet[][Section~6.11]{Wood_gam_2017}, the
EDFs can be estimated by summing the appropriate elements of the diagonal of
the matrix $V\mathcal{I}$, where $V$ is the joint variance-covariance matrix
of all unpenalized (i.e., fixed-effects) and penalized (random) parameters.
The matrix $\mathcal{I}$ is the negative Hessian of the \emph{unpenalized}
log-likelihood ($\ell^{\star}(\bvartheta, \bbeta, \bgamma, \bdelta)$ in
Equation~\ref{eq:loglik}), evaluated at the vector of parameter estimates.
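This EDF calculation translates directly into matrix algebra.
In the following sketch, \code{V} stands for the joint covariance matrix of
all unpenalized and penalized parameters, \code{I} for the negative Hessian
of the unpenalized log-likelihood at the estimates, and \code{idx} for the
positions of the coefficients of a given smooth term; all three are
hypothetical placeholders rather than objects exposed by the \pkg{tramME}
interface.
%
<<edf-sketch, eval = FALSE>>=
## EDF of a single smooth term: sum of the corresponding diagonal
## elements of V %*% I
edf_j <- sum(diag(V %*% I)[idx])
@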
\section{Computational details}\label{sec:compute}

\pkg{tramME} offers several methods to approximate the baseline
transformation function ($h(y)$ in Equation~\ref{eq:memodel}) of additive
mixed-effects transformation models \citep[see][for the available
options]{Tamasi_Hothorn_2021}.
Similarly to \citet{Hothornetal_2018}, the \emph{Bernstein polynomial basis}
\citep{Farouki_2012} is used as a general and flexible parameterization for
continuous outcomes:
%
\begin{align*}
h(y) &= \ba_{\text{Bs},M}(y)\T\bvartheta = \frac{1}{M+1}\sum_{m=0}^{M}\vartheta_{m}
  \binom{M}{m}\tilde y^{m}(1-\tilde y)^{M-m},
\end{align*}
%
where $\ba_{\text{Bs},M}(y)$ denotes the set of order $M$ polynomials in
Bernstein form, and $\tilde y$ equals $y$ scaled to the interval $[0, 1]$.
In practice, the exact order of the polynomials in Bernstein form usually
does not have a large effect on the results beyond a minimal value.
\citet{Hothorn_mlt_2020} discusses the choice of order in fixed effects-only
transformation models and recommends choosing $M$ between 5 and 10 (a short
numerical sketch of this basis is given below).
In the application of the main article, we used an order of 6 as the default.

Likelihood-based estimation and inference in mixed-effects additive
transformation models is implemented by the \textsf{R} add-on package
\pkg{tramME} \citep{Tamasi_Hothorn_2021}.
The package utilizes existing implementations of linear transformation models
\citep[\pkg{mlt} and \pkg{tram} packages by][]{Hothorn_mlt_2020} for the user
interface and to set up inputs for the estimation procedure.
The penalized smooth terms are defined with the help of the \pkg{mgcv}
package \citep{Wood_gam_2017}, which provides functionality to reformulate
these terms to their mixed-effects representations as described by
\citet[Appendix]{Wood_2004}.
\pkg{tramME} uses the Template Model Builder package
\citep[\pkg{TMB},][]{Kristensenetal_2016} to evaluate the log-likelihood via
Laplace approximation and to calculate its gradients with automatic
differentiation \citep[\pkg{glmmTMB}, by][uses the same underlying
techniques]{Brooks_2017}.
Consequently, model estimation and inference in \pkg{tramME} are fast and
efficient.
The user interface of \pkg{tramME} follows standard model specification
conventions of the most popular \textsf{R} packages (\pkg{lme4},
\citealp{Bates_lme4_2015}, \pkg{mgcv}, \citealp{Wood_gam_2017}) to define
mixed-effects additive transformation models.
Thus, the transition from a \texttt{gam}, \texttt{lmer}, or \texttt{glmer}
model is very simple.
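As a quick illustration of the Bernstein parameterization of $h(y)$ discussed
above, the basis can be evaluated in a few lines of base \textsf{R}.
This is only a sketch with a hypothetical coefficient vector \code{theta} of
length $M + 1$; in \pkg{tramME}, the basis setup (including monotonicity
constraints on $h$) is handled internally by the \pkg{mlt} machinery.
%
<<bernstein-sketch, eval = FALSE>>=
## Bernstein basis of order M, evaluated on y scaled to [0, 1]
M <- 6
ytilde <- seq(0, 1, length.out = 101)
B <- sapply(0:M, function(m) choose(M, m) * ytilde^m * (1 - ytilde)^(M - m))
## h(y) as in the display above, given a coefficient vector theta
h <- c(B %*% theta) / (M + 1)
@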
As an example of this interface, a mixed-effects additive parametric
proportional hazards model can be defined with the function call
%
<>=
## tramME is available from CRAN:
## install.packages("tramME")
library("tramME")
mc1 <- CoxphME(                ### conditional proportional hazards
  Time ~                       ### response time intervals
    Insects +                  ### fixed effects
    Habitat + Landscape +
    s(Temperature, k = 20) +   ### non-linear terms, as in mgcv
    s(Elevation100, k = 20) +
    (1 | PlotID),              ### random intercept, as in lme4
  data = carrion,              ### data
  log_first = TRUE,            ### log(Time) before modeling
  order = 6                    ### order of the Bernstein polynomial
)
@
%

\section{The rat carrion decomposition experiment}

<>=
carrion <- read.csv("carrion.csv")
carrion$Time <- with(carrion, Surv(time1, time2, type = "interval2"))
carrion$Time_rc <- with(carrion, Surv(ifelse(is.finite(time2), time2, time1),
                                      event = is.finite(time2)))
carrion$Insects <- factor(carrion$Insects, labels = c("no", "yes"))
carrion$Habitat <- factor(carrion$Habitat)
carrion$Habitat <- relevel(carrion$Habitat, ref = "forest")
carrion$Landscape <- factor(carrion$Landscape)
carrion$Landscape <- relevel(carrion$Landscape, ref = "seminatural")
carrion$PlotID <- factor(carrion$PlotID)
@

\begin{figure}[!ht]
\centering
<>=
par(mfrow = c(1, 2))
## -- Plot 1
par(cex = 0.9, mar = c(5, 4, 2, 1))
idx <- 140:155
dpl <- carrion$Time[idx]
plot(0, type = "n", ylim = range(idx) + c(-0.5, 0.5), xlim = c(0, 120),
     ylab = "ID", xlab = "Follow-up time (days)", yaxt = "n")
abline(h = idx, v = seq(0, 120, by = 20), col = "lightgrey", lty = 3)
axis(2, at = idx, las = 2)
ic <- dpl[, 3] > 0
segments(y0 = idx[ic], x0 = dpl[ic, 1], x1 = dpl[ic, 2], lwd = 2)
arrows(y0 = idx[!ic], x0 = dpl[!ic, 1], x1 = 123, length = 0.1, lwd = 2)
## -- Plot 2
par(cex = 0.9, mar = c(5, 4, 2, 1))
dpl <- carrion$Time[carrion$Time[, 3] > 0] ## exclude right censored
plot(ecdf(dpl[, 2] - dpl[, 1]), xlab = "Interval length (days)", main = NULL,
     verticals = TRUE, las = 1, ylab = "Probability")
grid()
@
\caption{
Censoring in the carrion decomposition dataset of
\protect\citet{Englmeier2022}.
\emph{Left panel}: Examples of censoring types. Segments with arrows denote
right-censored observations; other segments show interval-censored outcomes.
\emph{Right panel}: The distribution of the interval lengths of
interval-censored observations.
}\label{fig:carrion-cens}
\end{figure}

\begin{figure}[!ht]
\centering
<>=
sv <- survfit(Time ~ Insects, data = carrion)
par(cex = 0.8, mar = c(4, 4.2, 2, 1), las = 1)
plot(sv, lty = c(1, 2), lwd = 2, xlab = "", ylab = "P(T>t)")
title(xlab = "Follow-up time (days)", line = 2.3)
grid()
legend("topright", c("Insect = no", "Insect = yes"), lty = c(1, 2), lwd = 2,
       bty = "n")
@
\caption{Non-parametric survival probability estimates associated with the
decomposition times in groups where insect access was and was not allowed in
the carrion decomposition experiment.}
\label{fig:carrion-surv}
\end{figure}

Our first example presents the reanalysis of the carrion decomposition data
by \citet{Englmeier2022}.
In the original study, the authors analyzed the environmental factors that
affect the decomposition process of small rodent carrion using data from an
experiment, in which they placed rat carcasses in different environments and
recorded the time until complete decomposition.
As we describe in the main article, the outcome variable (time until complete
decomposition) is \emph{interval censored} due to the discrete follow-up.
Figure~\ref{fig:carrion-cens} presents some examples of the censored outcomes
in the dataset.
Segments with arrows denote right-censored observations, i.e., cases where
complete decomposition did not occur by the time of the last visit.
As these examples demonstrate, the sites were visited at regular intervals,
although the interval lengths show some variability.
The survival curves for specimens with and without the presence of insects
are shown in Figure~\ref{fig:carrion-surv}.
Based on this marginal view of the data, i.e., without adjusting for other
important factors and ignoring cluster-level heterogeneity, the presence of
insects accelerated the decomposition process.
The 25\textsuperscript{th}, 50\textsuperscript{th} and 75\textsuperscript{th}
percentiles in the cases where insects were present were
\Sexpr{paste(round(quantile(sv, c(0.25, 0.5, 0.75))$quantile[2, ], 3), collapse = ", ")} %$
days, respectively, while, without insects, the same quantiles amounted to
\Sexpr{paste(round(quantile(sv, c(0.25, 0.5, 0.75))$quantile[1, ], 3), collapse = ", ")} %$
days.

The main environmental variables we include in our model are the indicator
for the presence of insects (\texttt{Insects}), local (\texttt{Habitat}) and
regional (\texttt{Landscape}) land use types, the average temperature on each
experimental plot measured with thermologgers (\texttt{Temperature}), and the
elevational gradient, which serves as a surrogate for the long-term
macroclimate (\texttt{Elevation100}).
The plot-level unobserved sources of variability were modeled by including
random intercepts (grouping variable: \texttt{PlotID}).
The outcome variable (\texttt{Time}) is a predefined \code{Surv} object that
encodes the interval-censored decomposition times.
%
<>=
head(carrion$Time, 10)
@

We estimate the following proportional-hazards mixed-effects additive
transformation model for the decomposition intervals
%
<>=
dcmp <- CoxphME(Time ~ Insects + Habitat + Landscape + s(Temperature, k = 20) +
                  s(Elevation100, k = 20) + (1 | PlotID),
                data = carrion, log_first = TRUE, order = 6)
summary(dcmp)
@
%
The smooth terms can be evaluated and plotted with
%
<>=
plot(smooth_terms(dcmp), panel.first = grid())
@
%
As the results in Figure~\ref{fig:carrion-smooth} show, the effects of
temperature and elevation look fairly linear.

\begin{figure}[!ht]
\centering
<>=
par(mar = c(4, 4, 1, 1))
<>
@
\caption{Smooth effects of the continuous variables in the model of the
decomposition times.}\label{fig:carrion-smooth}
\end{figure}

As a general check of the appropriateness of the estimated model, we can
evaluate the \emph{marginal} distribution function or, as it is more commonly
done in survival analysis, the marginal survivor function of the outcomes at
the observations in our dataset by integrating over the random effects
numerically.
%
\begin{align*}
\widehat S(t_{i}\mid\x_{i}) &= \widehat \IP(T_{i} > t_{i}\mid\x_{i})
  = \int_{-\infty}^{+\infty}\widehat\IP(T_{i} > t_{i}\mid\x_{i},\gamma)
  \phi(\gamma)\d\gamma.
\end{align*}
%
$\widehat S(t_{i}\mid\x_{i})$ denotes the fitted survivor function, which is
straightforward to calculate from the mixed-effects additive transformation
model, due to its fully parametric approximation of the outcome distribution,
and $\phi(\gamma)$ denotes the density of the estimated random intercept
distribution.
By evaluating $-\log(\widehat S(t_{i}\mid\x_{i}))$ at the observations in our
dataset, we get the Cox-Snell residuals
\citep[Chapter~11]{Klein_Moeschberger_2003}, which are unit exponentially
distributed under the correct model.
Interval-censored outcomes pose a technical difficulty in assessing the
Cox-Snell residuals.
In this situation, we can either evaluate the marginal survivor function at
the upper and lower bounds of the censoring intervals and assess the
interval-censored version of the residuals, or we can apply the adjustment
proposed by \citet{Farrington_2000}, which replaces these intervals with
expected values under the unit exponential distribution.
To assess the distribution of the residuals, we estimate the cumulative
hazard function of the Cox-Snell residuals, which should be close to a
straight line with unit slope through the origin under the correct model.
Figure~\ref{fig:resid} presents the distributions of the Cox-Snell residuals
using the two approaches to handling interval censoring.
Neither of these plots signals serious departures from the unit exponential
distribution, which supports the appropriateness of our regression model.
Because the marginal Cox-Snell residuals are not independent in our case,
these plots only provide a crude visual check of the model fit.

<>=
## From Surv object to a matrix with left and right censoring values
get_bounds <- function(x) {
  stopifnot(is.Surv(x))
  lb <- x[, 1]
  ub <- x[, 2]
  ty <- x[, 3]
  ub[ty == 2] <- lb[ty == 2]
  lb[ty == 2] <- -Inf
  ub[ty == 0] <- Inf
  ub[ty == 1] <- lb[ty == 1]
  cbind(lb, ub)
}
## Evaluate the marginal survivor function
marginalize <- function(model, data, type = "survivor",
                        lower = -Inf, upper = Inf) {
  ## fun(y|x,g) * density(g)
  joint <- function(re, nd, mod, type) {
    nd <- nd[rep(1, length(re)), ]
    nd[[variable.names(mod, "grouping")]] <- seq(nrow(nd)) ## to take vector-valued REs
    pr <- predict(mod, newdata = nd, ranef = re, type = type) *
      dnorm(re, 0, sd = sqrt(varcov(mod)[[1]][1, 1]))
    c(pr)
  }
  ## integrate out the random effects row by row
  res <- parallel::mclapply(split(data, seq(nrow(data))), function(nd) {
    out <- integrate(joint, lower = lower, upper = upper, nd = nd,
                     mod = model, type = type)
    if (out$message == "OK") return(out$value)
    return(NA)
  }, mc.cores = 8)
  unlist(res)
}
## Cox-Snell residuals
## type:
## ic:  Evaluate the cumulative hazard at the left and right censoring values
##      and return residuals as interval-censored.
## adj: Adjusted CS residuals (Farrington, 2000):
##      replace the intervals with the expected values under the unit
##      exponential distribution on these intervals (assuming a correct model fit)
CSresid <- function(model, data = model.frame(model),
                    type = c("both", "adj", "ic")) {
  type <- match.arg(type)
  rv <- variable.names(model, "response")
  if (!is.Surv(data[[rv]])) data[[rv]] <- Surv(data[[rv]])
  bns <- get_bounds(data[[rv]])
  stopifnot(all(is.finite(bns[,1])))
  data[[rv]] <- bns[, 1]
  Sl <- marginalize(model, data, type = "survivor")
  Su <- numeric(length = nrow(bns))
  ie <- bns[,1] == bns[,2]
  Su[ie] <- Sl[ie]
  Su[ir <- is.infinite(bns[,2])] <- 0
  ii <- !(ie | ir)
  data[[rv]] <- bns[, 2]
  Su[ii] <- marginalize(model, data[ii, ], type = "survivor")
  res_ic <- Surv(-log(Sl), -log(Su), type = "interval2")
  if (type == "ic") return(res_ic)
  res <- numeric(length = nrow(bns))
  res[ie] <- -log(Sl[ie])
  res[ir] <- 1 - log(Sl[ir])
  res[ii] <- (Sl[ii] * (1 - log(Sl[ii])) - Su[ii] * (1 - log(Su[ii]))) /
    (Sl[ii] - Su[ii])
  if (type == "adj") return(res)
  data.frame(IC = res_ic, adjusted = res)
}

if (!file.exists("CS-resid.rda")) {
  resids <- CSresid(dcmp, type = "both")
  save(resids, file = "CS-resid.rda")
} else {
  load("CS-resid.rda")
}
@

\begin{figure}[!ht]
\centering
<>=
rf_cens <- survfit(IC ~ 1, data = resids) ## Turnbull NPMLE for interval-censored
rf_adj <- survfit(Surv(adjusted) ~ 1, data = resids) ## KM for adjusted
par(mfrow = c(1, 2), cex = 0.9, mar = c(4, 4, 2, 1))
plot(rf_cens$time, -log(rf_cens$surv), type = "s", lwd = 1, las = 1,
     xlab = "Cox-Snell residuals", ylab = "Cumulative hazard")
abline(0, 1, lwd = 2, lty = 2)
grid()
blx <- grconvertX(0.10, from = "nfc", to = "user")
bly <- grconvertY(0.98, from = "nfc", to = "user")
text(blx, bly, labels = "A", xpd = TRUE, cex = 1.2)
plot(rf_adj$time, rf_adj$cumhaz, type = "s", lwd = 1, las = 1,
     xlab = "Cox-Snell residuals", ylab = "Cumulative hazard")
abline(0, 1, lwd = 2, lty = 2)
grid()
blx <- grconvertX(0.10, from = "nfc", to = "user")
bly <- grconvertY(0.98, from = "nfc", to = "user")
text(blx, bly, labels = "B", xpd = TRUE, cex = 1.2)
@
\caption{Cox-Snell residuals of the carrion decomposition model.
\emph{Panel A}: Treating the residuals as interval-censored and estimating
the cumulative hazard function using the Turnbull non-parametric maximum
likelihood estimator.
\emph{Panel B}: Using the adjustment proposed by
\protect\citet{Farrington_2000}.
The dashed lines correspond to the unit exponential distribution.
}\label{fig:resid}
\end{figure}

We can relax the assumption of proportional hazards by allowing for
time-dependent covariate effects.
A transformation model with time-dependent effects for the \texttt{Insects}
indicator can be estimated as
%
<>=
dcmp2 <- CoxphME(Time | Insects ~ Habitat + Landscape + s(Temperature, k = 20) +
                   s(Elevation100, k = 20) + (1 | PlotID),
                 data = carrion, log_first = TRUE, order = 6)
@
%
In Figure~\ref{fig:non-ph}, we compare the effect estimates of the presence
of insects (on the log-hazard scale) from the proportional hazards and
non-proportional hazards (time-varying effects) models.
According to these results, the proportional hazards assumption seems
plausible.
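Because the proportional hazards model is nested in the time-varying effects
model, the two fits could also be contrasted with a likelihood ratio test.
A minimal sketch, assuming the \code{anova} method for \pkg{tramME} models
applies to this pair of fits:
%
<<ph-lrt-sketch, eval = FALSE>>=
## Likelihood ratio test of the proportional hazards assumption:
## dcmp (proportional effect) is nested in dcmp2 (time-varying effect)
anova(dcmp, dcmp2)
@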
\begin{figure}[!ht]
\centering
<>=
cf <- coef(dcmp2, with_baseline = TRUE)
vc <- vcov(dcmp2, pargroup = "baseline")
cf <- cf[grep("Insectsyes", names(cf), fixed = TRUE)]
idx <- grep("Insectsyes", colnames(vc), fixed = TRUE)
vc <- vc[idx, idx]
ns <- 200
nd <- model.frame(dcmp2)[rep(1, ns), ]
nd[[variable.names(dcmp2, "response")]] <- seq(1, 100, length.out = ns)
X <- model.matrix(dcmp2, data = nd, type = "Y", simplify = TRUE)$Ye
idx <- grep("Insectsyes", colnames(X), fixed = TRUE)
X <- X[, idx]
ci <- confint(multcomp::glht(multcomp::parm(cf, vc), linfct = X),
              calpha = multcomp::univariate_calpha(), level = 0.95)$confint
## use multcomp::adjusted_calpha() for multiplicity adjustments
par(cex = 0.8, mar = c(4, 4, 2, 1), las = 1)
plot(nd$Time, ci[, 1], type = "l", col = 1, lwd = 2, ylim = c(-1, 3),
     panel.first = {grid(); abline(h = 0, lwd = 2, col = "lightgrey")},
     ylab = "Log-hazard ratio", xlab = "")
title(xlab = "Time (days)", line = 2.3)
polygon(c(nd$Time, rev(nd$Time)), c(ci[, 2], rev(ci[, 3])),
        border = NA, col = grey(0.5, 0.2))
ci2 <- confint(dcmp, parm = "Insectsyes", estimate = TRUE)
ci2 <- ci2[rep(1, ns), ]
matlines(nd$Time, ci2, col = 1, lwd = c(1, 1, 2), lty = c(3, 3, 2))
legend("bottomright", c("Time-varying effect", "Proportional effect"),
       lty = 1:2, lwd = 2, bty = "n", cex = 0.9)
@
\caption{The effect of the presence of insects on the decomposition process
from the proportional hazards and non-proportional hazards
models.}\label{fig:non-ph}
\end{figure}

\section{\emph{E. coli} concentrations in streams with different grazing
periods}\label{sec:ecoli}

\citet{Hulvey_2021} compare the concentration levels of \emph{Escherichia
coli} bacteria (most probable number, MPN) in streams under three different
rotational grazing regimes.
In the additive mixed model specifications they estimated, within-year
variability was modeled as a function of the day of year (DOY) with cubic
regression splines, while between-year and location-level variability were
captured by random intercepts for pasture-specific year effects and separate
stream effects.
Note that although the cyclic version of the cubic regression splines
(\texttt{bs = "cc"} in \pkg{mgcv} and \pkg{tramME}) \citep{pkg:tramME} would
be more appropriate for modeling the within-year trend, the original article
used \texttt{bs = "cr"} and hence we also stick with this basis in our
reanalysis.

<>=
ecoli <- read.csv("Hulvey2021.csv")
fs <- c("treatment", "stream", "pasture", "cattle", "rotation")
ecoli[fs] <- lapply(ecoli[fs], factor)
@

As a first step, we replicate the results of all model variants that
\citet{Hulvey_2021} investigated in the original article with the \textsf{R}
package \pkg{gamm4} \citep{pkg:gamm4}.
In a second step, we fit the \emph{same} models using the software
implementation of additive mixed transformation models in package
\pkg{tramME}, that is, using a linear transformation function (function
\code{tramME::LmME}).
We expect identical results in steps one and two, even though the two
implementations rely on completely distinct code bases.
Thus, these results are only interesting from a quality assurance point of
view.
In the last step, we relax the normal distributional assumption by allowing a
nonlinear transformation function (\code{tramME::BoxCoxME} function) and
evaluate how the model fits change.
We are primarily interested in potential changes in model interpretation
induced by a shift from a normal to a distribution-free model.
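Before running all specifications in a loop, the three steps can be
illustrated on a single specification.
A sketch using the simplest (null) specification, mirroring the loop below:
%
<<ecoli-steps-sketch, eval = FALSE>>=
## Step 1: GAMM with gamm4
m_gamm <- gamm4(log10(ecoli_MPN) ~ s(DOY, bs = "cr"), data = ecoli,
                random = ~ (1 | year:stream:pasture) + (1 | stream),
                REML = FALSE)
## Step 2: the same normal model as an additive transformation model
m_lm <- LmME(log10(ecoli_MPN) ~ s(DOY, bs = "cr") +
               (1 | year:stream:pasture) + (1 | stream), data = ecoli)
## Step 3: relaxing normality with a flexible transformation function
m_bc <- BoxCoxME(log10(ecoli_MPN) ~ s(DOY, bs = "cr") +
                   (1 | year:stream:pasture) + (1 | stream), data = ecoli)
c(gamm4 = logLik(m_gamm$mer), LmME = logLik(m_lm), BoxCoxME = logLik(m_bc))
@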
As Table~\ref{tbl:ecoli-res} shows, we managed to reproduce the \pkg{gamm4}
results with \pkg{tramME}.
Moreover, relaxing the distributional assumption of the normal linear model
resulted in better model fits in terms of in-sample log-likelihood values.

<>=
## specifications w/o random effects
mf <- c(log10(ecoli_MPN) ~ treatment + cattle + s(DOY, bs = "cr", by = treatment),
        log10(ecoli_MPN) ~ treatment + cattle + s(DOY, bs = "cr"),
        log10(ecoli_MPN) ~ treatment + s(DOY, bs = "cr", by = treatment),
        log10(ecoli_MPN) ~ cattle + s(DOY, bs = "cr"),
        log10(ecoli_MPN) ~ treatment + s(DOY, bs = "cr"),
        log10(ecoli_MPN) ~ s(DOY, bs = "cr"))
names(mf) <- paste("Model", c(1:5, "Null"))
ecoli_res <- data.frame(matrix(NA, nrow = length(mf), ncol = 3))
colnames(ecoli_res) <- c("gamm", "LmME", "BoxCoxME")
rownames(ecoli_res) <- names(mf)
for (i in seq_along(mf)) {
  m_gamm <- gamm4(mf[[i]], data = ecoli,
                  random = ~ (1 | year:stream:pasture) + (1 | stream),
                  REML = FALSE)
  ecoli_res$gamm[i] <- logLik(m_gamm$mer)
  mf2 <- update(mf[[i]], . ~ . + (1 | year:stream:pasture) + (1 | stream))
  m_LmME <- LmME(mf2, data = ecoli)
  if (m_LmME$opt$convergence == 0)
    ecoli_res$LmME[i] <- logLik(m_LmME)
  m_BCME <- BoxCoxME(mf2, data = ecoli)
  if (m_BCME$opt$convergence == 0)
    ecoli_res$BoxCoxME[i] <- logLik(m_BCME)
}
@

\begin{table}[!ht]
\centering
\caption{Log-likelihood values of the fitted models presented by
\protect\citet[\emph{GAMM}]{Hulvey_2021}, replicated as mixed-effects
additive transformation models assuming conditional normality
(\emph{Additive normal transformation model}) and extended as flexible
(non-normal) mixed-effects additive transformation models (\emph{Additive
non-normal transformation model}).}
\label{tbl:ecoli-res}
<>=
names(ecoli_res) <- c("\\multicolumn{1}{m{2cm}}{\\centering GAMM}",
  "\\multicolumn{1}{m{4cm}}{\\centering Additive normal transformation model}",
  "\\multicolumn{1}{m{4cm}}{\\centering Additive non-normal transformation model}")
print(xtable(ecoli_res, align = c("l", "R{2cm}", rep("R{4cm}", 2))),
      floating = FALSE, booktabs = TRUE,
      sanitize.colnames.function = function(x){x})
@
\end{table}

Let us focus on the most complicated specification, Model~1,
%
<>=
update(mf[[1]], . ~ . + (1 | year:stream:pasture) + (1 | stream))
@
%
\noindent and compare the effect estimates from the normal model to its
non-normal counterpart.
But first, notice that by changing the transformation from
$h(y) = \vartheta_{0} + \vartheta_{1}y$ to $h(y) = \ba(y)\T\bvartheta$, we
change the scale on which the coefficients and the smooth terms are
interpreted.
In the normal additive mixed model, the coefficient of a fixed effect
captures the change in the expectation of the outcome when increasing the
respective predictor by one unit (keeping everything else unchanged).
In the non-normal transformation model with $\Phi$ as the inverse link, the
coefficients capture similar effects but on a latent scale defined by the
transformation $h(Y)$.
To cast the effect estimates from the two models to a common scale, we can
calculate the \emph{probabilistic indices} \citep[PI,][]{Thas_2012}.
To simplify the notation, we first focus on the simple, fixed effects-only
case with a single predictor:
%
\begin{align*}
\IP(Y\leq y \mid \X = x) &= \Phi\left(h(y) - \beta x \right)
\end{align*}
%
The PI is the probability that one outcome ($Y^{\star}$) is larger than the
other ($Y$), given the same covariate values ($\X$) except for one, which is
larger by one unit ($\X^{\star}$).
In our simplified example, this means
%
\begin{align*}
\IP\left(Y < Y^{\star} \mid \X = x, \X^{\star} = x+1\right) &=
  \IP\left(h(Y) < h(Y^{\star}) \mid \X = x, \X^{\star} = x+1\right) \\
&= \IP\left(\frac{h(Y) - h(Y^{\star}) + \beta}{\sqrt{2}} <
  \frac{\beta}{\sqrt{2}} \right) \\
&= \Phi\left(\frac{\beta}{\sqrt{2}}\right).
\end{align*}
%
The third line follows from the fact that, in a transformation model with
$\Phi(\cdot)$ as the inverse link, $h(Y)$ and $h(Y^{\star})$ are independent,
normally distributed random variables with unit variance and a mean
difference of $\beta$.
Notice that the PI does not depend on the transformation function.
When random effects are present in the model, the PI is conditional on the
cluster.
In the case of transformation models with non-linear additive terms, the
probabilistic index is a function of the covariate.
In the simplest form of an additive transformation model with probit link,
we have
%
\begin{align*}
\IP(Y\leq y \mid \X = x) &= \Phi\left(h(y) - f(x) \right)
\end{align*}
%
and the PI is
%
\begin{align*}
PI(x) = \IP\left(Y < Y^{\star} \mid \X = x, \X^{\star} = x+1\right)
  = \Phi\left(\frac{f(x+1) - f(x)}{\sqrt{2}} \right).
\end{align*}

By transforming the effect estimates to the probability scale,
Figure~\ref{fig:ecoli-nonnorm} compares the smooth terms from the normal and
non-normal versions of Model~1, while the first two blocks of
Table~\ref{tbl:ecoli-fes} contrast the fixed effects estimates.
The results are very close to each other, which suggests that the original
log-normal model is actually appropriate.
As a built-in visual normality check, we can compare the fitted
transformation functions of the normal and non-normal transformation models.
In Figure~\ref{fig:ecoli-trafo}, the linear function corresponds to the
conditional normal distribution.
This result further confirms the appropriateness of the normal additive model
in this specific example.

<>=
plot2cis <- function(x, y, col = c(1, 2),
                     fill = c(grey(0.1, 0.25), rgb(1, 0, 0, 0.25)),
                     xlabs = NULL, ylabs = NULL, mains = NULL, ...) {
  stopifnot(length(x) == length(y))
  for (ii in seq_along(x)) {
    plot(0, type = "n",
         xlab = if (is.null(xlabs)) colnames(x[[ii]])[1] else xlabs[ii],
         ylab = if (is.null(ylabs)) colnames(x[[ii]])[2] else ylabs[ii],
         main = if (is.null(mains)) NULL else mains[ii],
         xlim = range(x[[ii]][, 1], y[[ii]][, 1]),
         ylim = range(x[[ii]][, 2:4], y[[ii]][, 2:4]),
         panel.first = grid(), ...)
lines(x[[ii]][, 1], x[[ii]][, 2], col = col[1]) lines(y[[ii]][, 1], y[[ii]][, 2], col = col[2]) polygon(c(x[[ii]][, 1], rev(x[[ii]][, 1])), c(x[[ii]][, 3], rev(x[[ii]][, 4])), border = NA, col = fill[1]) polygon(c(y[[ii]][, 1], rev(y[[ii]][, 1])), c(y[[ii]][, 3], rev(y[[ii]][, 4])), border = NA, col = fill[2]) } } ## NOTE: currently only w/ pnorm inverse link smooth2ci <- function(sm, PI = FALSE, ilink = "pnorm") { PIfun <- switch(ilink, pnorm = function(x) pnorm(x / sqrt(2)), stop("No other inverse links are available atm.")) lapply(sm, function(x) { out <- matrix(0, nrow = nrow(x), ncol = 4) colnames(out) <- c(colnames(x)[c(1, ncol(x)-1)], "lwr", "upr") out[, 1] <- x[, 1] se <- rev(x)[, 1] yy <- rev(x)[, 2] out[, 2] <- yy out[, 3:4] <- yy + qnorm(0.975) * se %o% c(-1, 1) if (PI) out[, 2:4] <- PIfun(out[, 2:4]) out }) } ## NOTE: this function assumes the smooth term: s(DOY, by = "treatment") diff_smooth <- function(mod, DOY = NULL, n = 100) { XZ_sm <- function(XZ) { X <- XZ$X X[, attr(XZ$X, "type") != "sm"] <- 0 Z_ <- Matrix::t(XZ$Zt)[, attr(XZ$Zt, "type") == "sm", drop = FALSE] Z <- tramME:::nullTMatrix(nrow = nrow(Z_), ncol = length(mod$param$gamma)) Z[, attr(mod$param$gamma, "type") == "sm"] <- Z_ list(X = X, Z = Z) } mf <- model.frame(mod, drop.unused.levels = TRUE) if (is.null(DOY)) DOY <- seq(range(mf$DOY)[1], range(mf$DOY)[2], length.out = n) nd_ <- expand.grid(DOY = DOY, treatment = factor(levels(mf$treatment), levels = levels(mf$treatment))) nd <- mf[rep(1, nrow(nd_)), ] nd[colnames(nd_)] <- nd_ XZ <- model.matrix(mod, data = nd, type = c("X", "Zt"), keep_sign = FALSE) XZ1 <- XZ_sm(XZ) nd$DOY <- nd$DOY + 1 XZ <- model.matrix(mod, data = nd, type = c("X", "Zt"), keep_sign = FALSE) XZ2 <- XZ_sm(XZ) XZ <- list(X = XZ2$X - XZ1$X, Z = tramME:::as_dgTMatrix(XZ2$Z - XZ1$Z)) pr <- predict(mod$tmb_obj, newdata = XZ, scale = "lp") split(data.frame(DOY = nd_$DOY, df = pr$pred, se = pr$se), nd$treatment) } @ <>= fm1 <- log10(ecoli_MPN) ~ treatment + cattle + s(DOY, bs = "cr", by = treatment) + (1 | year:stream:pasture) + (1 | stream) ecoli_m1 <- LmME(fm1, data = ecoli) ecoli_m1_bc <- BoxCoxME(fm1, data = ecoli) @ \begin{figure}[!ht] \centering <>= par(mfrow = c(1, 3), cex = 0.8, mar = c(4, 4, 2, 1), las = 1) plot2cis(smooth2ci(diff_smooth(ecoli_m1), PI = TRUE), smooth2ci(diff_smooth(ecoli_m1_bc), PI = TRUE), mains = paste("treatment =", levels(ecoli$treatment)), ylabs = rep("PI", 3)) legend("topright", c("normal", "non-normal"), col = c(1, 2), lty = 1, bty = "n", cex = 0.9) @ \caption{The comparison of the smooth terms from the normal and non-normal (probit link) mixed-effects additive transformation models (specification Model~1). 
}\label{fig:ecoli-nonnorm}
\end{figure}

\begin{figure}[!ht]
\centering
<>=
nd <- model.frame(ecoli_m1_bc)[rep(1, 100), ]
nd[[1]] <- do.call(seq, c(as.list(range(log10(ecoli$ecoli_MPN))),
                          length.out = 100))
Y <- model.matrix(ecoli_m1_bc, data = nd, type = "Y")$Ye
b <- coef(ecoli_m1_bc, with_baseline = TRUE)[1:7]
vc <- vcov(ecoli_m1_bc, pargroup = "baseline")
ci <- confint(multcomp::glht(multcomp::parm(b, vc), linfct = Y),
              calpha = multcomp::univariate_calpha())$confint
par(mar = c(4, 4, 1, 1), cex = 0.8, las = 1)
plot(0, type = "n", xlim = range(nd[[1]]), ylim = range(ci),
     xlab = variable.names(ecoli_m1_bc, "response"), ylab = "h(y)",
     panel.first = grid(), xaxs = "i", yaxs = "i")
lines(nd[[1]], ci[, 1], lwd = 2)
polygon(c(nd[[1]], rev(nd[[1]])), c(ci[, 2], rev(ci[, 3])),
        col = grey(0.2, 0.2), border = FALSE)
b2 <- coef(ecoli_m1, with_baseline = TRUE)
abline(b2[1:2], lwd = 2, lty = 2)
legend("topleft", c("Non-normal", "Normal"), lwd = 2, lty = 1:2, bty = "n")
@
\caption{Baseline transformation functions from the normal and non-normal
mixed-effects additive transformation models.}\label{fig:ecoli-trafo}
\end{figure}

The outcome variable (MPN per 100~ml) was measured with the Quanti-Tray
System, which can detect \emph{E. coli} concentrations up to a maximum of
2,419.6~MPN without dilution.
This means that there is an effective upper detection limit on the outcome,
i.e., the \Sexpr{sum(ecoli$ecoli_MPN == 2419.6)} observations with a value of
2,419.6 are \emph{right censored}.
The authors of the original article mention this fact, but they do not take
it into account in the subsequent analyses.
Because random censoring can be easily handled in \pkg{tramME}, we will rerun
the model taking the upper limit into account.
%
<>=
fm1c <- update(fm1, Surv(log10(ecoli_MPN), event = ecoli_MPN < 2419.6) ~ .)
ecoli_m1_cens <- BoxCoxME(fm1c, data = ecoli)
summary(ecoli_m1_cens)
@
%
The fitted non-linear terms are compared to the original (normal linear)
estimates in Figure~\ref{fig:ecoli-cens} and the fixed effects are presented
in the third block of Table~\ref{tbl:ecoli-fes}.

\begin{table}[!ht]
\centering
\caption{
Estimates of the parametric fixed-effects terms on the \emph{probability
scale} (PI:~probabilistic index) from the normal, non-normal and non-normal
(with censoring taken into account) models, respectively.
}\label{tbl:ecoli-fes}
<>=
cis <- lapply(list(ecoli_m1, ecoli_m1_bc, ecoli_m1_cens), function(x) {
  ci <- confint(x, pargroup = "shift", estimate = TRUE)
  ci <- pnorm(ci[, c(3, 1, 2)] / sqrt(2))
  ci <- formatC(ci, format = "f", digits = 2)
  cbind(ci[, 1], apply(ci[, 2:3], 1, function(x) paste(x, collapse = "---")))
})
tbl <- do.call("cbind", cis)
rownames(tbl) <- c("treatment = medium", "treatment = short",
                   "cattle = present")
add <- list()
add$pos <- list(0)
add$command <- paste(c("& \\multicolumn{2}{c}{Normal} &",
  "\\multicolumn{2}{c}{Non-normal} &",
  "\\multicolumn{2}{c}{Non-normal, censored} \\\\\n",
  "\\cmidrule(lr){2-3} \\cmidrule(lr){4-5}", "\\cmidrule(lr){6-7}",
  rep("& PI & 95\\% CI ", 3), "\\\\\n"), collapse = " ")
print(xtable(tbl, align = "lrrrrrr"), include.colnames = FALSE,
      floating = FALSE, add.to.row = add,
      sanitize.text.function = function(x) x, booktabs = TRUE)
@
\end{table}

\begin{figure}[!ht]
\centering
<>=
par(mfrow = c(1, 3), cex = 0.8, mar = c(4, 4, 2, 1), las = 1)
plot2cis(smooth2ci(diff_smooth(ecoli_m1), PI = TRUE),
         smooth2ci(diff_smooth(ecoli_m1_cens), PI = TRUE),
         mains = paste("treatment =", levels(ecoli$treatment)),
         ylabs = rep("PI", 3))
legend("topright", c("normal", "non-normal, censored"), col = c(1, 2),
       lty = 1, bty = "n", cex = 0.9)
@
\caption{
The comparison of the smooth terms from the original model (normal linear)
and the non-normal (probit link) extension in which censoring is also taken
into account.
}\label{fig:ecoli-cens}
\end{figure}

Because the transformation model approximates the conditional distribution of
the outcome, in theory, we do not even have to take the base 10 logarithm of
the \emph{E. coli} most probable numbers (MPN) on the left-hand side of the
model formula.
\pkg{tramME} should be able to approximate the \emph{most likely
transformation}.
%
<>=
f_nontr <- update(fm1, Surv(ecoli_MPN, event = ecoli_MPN < 2419.6) ~ .)
ecoli_nontr <- BoxCoxME(f_nontr, data = ecoli, log_first = TRUE)
summary(ecoli_nontr)
@
%
Notice that we set \texttt{log\_first = TRUE} in the function call to take
the natural logarithm of the outcome before setting up the Bernstein bases.
This usually helps the approximation in the case of positive right-skewed
outcomes.
With this, we basically estimate the same model as the original, but with the
natural instead of the base-10 logarithm.
Because of this difference, the log-likelihood values are also different, but
the fixed effects and variance components parameter estimates, as well as the
smooth terms, are essentially the same as in the case of the model
\texttt{ecoli\_m1\_cens}.

In summary, after bringing the estimates to the same scale, the results of
the additive mixed-effects model did not change much in this specific example
by switching to the transformation model approach.
The originally applied base 10 logarithm falls very close to the fitted
``most likely transformation'', i.e., taking the logarithm of the outcome was
sufficient to achieve (close) conditional normality.
This can be verified by comparing the baseline transformation functions of
the normal and non-normal models, which can also serve as a visual check on
conditional normality.
Moreover, the number of censored outcomes was relatively small in the sample,
so taking the censoring properly into account did not result in large
differences, either.
However, as the example demonstrated, transformation models are flexible
enough to accommodate these properties of the response of interest
(non-normality and censoring) automatically, without the need to apply ad hoc
transformations or to implement new estimation procedures.
In this sense, \code{tramME::BoxCoxME} provides a simple way of checking the
impact of the more restrictive assumptions hard-wired in \code{gamm4::gamm4}
on model interpretation and of handling censoring properly in the estimation
procedure.

\section{Sea urchin removal experiment}

\citet{Andrew_1993} analyzed the percentage cover of filamentous algae under
four sea urchin removal treatments (Control/33\%/66\%/Removal).
The algae colonization was measured on five quadrats located on several
larger patches, so there is a clear grouped structure in the data.
\citet{Douma_2019} reanalyzed the data as a demonstration of the use of
mixed-effects models for zero-inflated beta regression.
Here, we fit mixed-effects transformation models to the data and compare the
results to zero-inflated mixed-model estimates obtained from \texttt{glmmTMB}
\citep{Brooks_2017,pkg:glmmTMB}.
Figure~\ref{fig:algae-data} presents the empirical cumulative distribution
functions of the outcome under the four treatments.
Note the large number of zeros, especially in the control group.

<>=
andrew <- read.csv("andrew.csv",
                   colClasses = c(QUAD = "factor", PATCH = "factor"))
andrew$TREAT <- factor(andrew$TREAT,
                       labels = c("control", "removal", "0.33", "0.66"))
andrew$TREAT <- factor(andrew$TREAT,
                       levels = c("control", "0.33", "0.66", "removal"))
andrew$pALGAE <- andrew$ALGAE / 100
## summary(andrew)
ecdfs <- lapply(split(andrew, andrew$TREAT), function(x) ecdf(x$pALGAE))
@

\begin{figure}[!ht]
\centering
<>=
x <- seq(0, 1, length.out = 100)
plot(x, ecdfs[[1]](x), type = "s", xlab = "Algae cover proportion",
     ylab = "ECDF", lty = 1, lwd = 2, xlim = c(0, 1), ylim = c(0, 1),
     las = 1, panel.first = grid())
for (ii in 2:4) {
  lines(x, ecdfs[[ii]](x), type = "s", lty = ii, lwd = 2)
}
legend("bottomright", levels(andrew$TREAT), lty = 1:4, lwd = 2, bty = "n",
       cex = 0.9)
@
\caption{Empirical CDFs of the algae cover proportions under the four
treatments.}\label{fig:algae-data}
\end{figure}

First, we fit a zero-inflated beta regression model with random intercepts
for the patches.
The probability of observing zero values is allowed to vary with the
treatment.
%
<>=
urchin_zib <- glmmTMB(pALGAE ~ TREAT + (1 | PATCH), ziformula = ~ TREAT,
                      data = andrew, family = beta_family())
@

As an alternative to the traditional beta regression approach, we estimate a
mixed-effects continuous outcome logistic regression.
%
<>=
urchin_tram <- ColrME(
  Surv(pALGAE, pALGAE > 0, type = "left") ~ TREAT + (1 | PATCH),
  bounds = c(-0.1, 1), support = c(-0.1, 1),
  data = andrew, order = 6)
summary(urchin_tram)
@
%
To allow for a jump in the conditional CDF of the outcome, we expand its
bounds and treat the zero observations as left-censored.
This way, we can place a point mass on zero, i.e., introduce a jump at 0 (see
Figure~\ref{fig:cdf-jump}).
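The coding of the zeros can be inspected directly.
A small illustration of how the left-censored response above is constructed
(the first argument is the observed proportion, the second flags exact
observations):
%
<<left-cens-sketch, eval = FALSE>>=
## Zeros become left-censored ("at most zero"), positive values stay exact
head(with(andrew, Surv(pALGAE, pALGAE > 0, type = "left")))
@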
\begin{figure}[!ht]
\centering
<>=
par(mar = c(4, 4, 1, 1))
layout(mat = matrix(1:3, nrow = 1), widths = c(45, 10, 45))
nd <- model.frame(urchin_tram)[rep(21, 100), ]
nd[[1]] <- seq(-0.1, 1, length.out = 100)
ccdf_e <- predict(urchin_tram, newdata = nd, type = "distribution",
                  ranef = "zero")
plot(nd[[1]], ccdf_e, type = "l", ylim = c(0, 1), xlim = c(-0.1, 1),
     panel.first = grid(), xlab = "y", ylab = expression(F[Y] * "(y)"))
idx <- which(nd[[1]] < 0)
lines(nd[[1]][idx], ccdf_e[idx], col = 2, lwd = 3)
par(mar = c(1, 1, 1, 1))
plot(0:1, 0:1, type = "n", yaxt = "n", xaxt = "n", xlab = "", ylab = "",
     bty = "n")
arrows(x0 = 0, x1 = 1, y0 = 0.5, length = 0.1, lwd = 3)
nd[[1]] <- seq(0, 1, length.out = 100)
ccdf <- predict(urchin_tram, newdata = nd, type = "distribution",
                ranef = "zero")
par(mar = c(4, 4, 1, 1))
plot(nd[[1]], ccdf, type = "l", ylim = c(0, 1), xlim = c(-0.1, 1), lwd = 2,
     panel.first = grid(), xlab = "y", ylab = expression(F[Y] * "(y)"))
points(0, 0, pch = 21, cex = 1, lwd = 2)
points(0, ccdf[1], pch = 20, cex = 1, lwd = 2)
segments(x0 = -0.1, x1 = -0.01, y0 = 0, lwd = 2)
@
\caption{Visual demonstration of how a discrete jump is introduced in the CDF
by extending the support and treating the edge cases as
censored.}\label{fig:cdf-jump}
\end{figure}

Because the zero-inflated beta model is a mixture of two models, the
interpretation of its results is cumbersome.
It is not clear which parameters, or combinations of parameters, one needs to
inspect to contrast the effects of the various treatments.
Moreover, extra steps are needed to calculate the marginal effects of the
covariates.
In contrast, the mixed-effects transformation model only contains a single
set of fixed effects parameters and their interpretation is straightforward:
For example, the odds of observing higher proportions of algae cover under
the 33\% removal treatment are about $\exp(-\widehat{\beta}_{0.33}) = $
\Sexpr{formatC(exp(-coef(urchin_tram)[1]), format = 'f', digits = 2)} times
higher than in the control group.

To assess the fits of the two models, we can marginalize the conditional
distributions by integrating over the random effects numerically, and compare
against the ECDFs.
As Figure~\ref{fig:algae-mcdf1} shows, both models overestimate the
dispersion in the control group.
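Returning to the single-parameter interpretation above, an interval estimate
on the odds scale can be obtained with the same sign flip.
A sketch, assuming the first shift coefficient is named \code{TREAT0.33} and
noting that the transformation reverses the roles of the interval bounds:
%
<<urchin-or-sketch, eval = FALSE>>=
## Odds ratio of higher algae cover, 33% removal vs control, with 95% CI;
## exp(-x) is decreasing, so lower and upper bounds swap roles
exp(-confint(urchin_tram, parm = "TREAT0.33", estimate = TRUE))
@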
<>= ## Numerical integration to get an estimate of the marginal distribution marginalize.zib.glmmTMB <- function(model, data, type = c("distribution", "density"), lower = -Inf, upper = Inf) { type <- match.arg(type) ## for a single data point joint <- function(re, nd, mod, type) { mu <- plogis(predict(mod, newdata = nd, type = "link", re.form = NA) + re) mu <- ifelse(mu == 0, .Machine$double.eps, mu) mu <- ifelse(mu == 1, 1 - .Machine$double.eps, mu) sig <- predict(mod, newdata = nd, type = "disp") nu <- predict(mod, newdata = nd, type = "zprob") out <- sapply(mu, function(m) { switch(type, distribution = gamlss.dist::pBEZI(nd[[1]], mu = m, sigma = sig, nu = nu), density = gamlss.dist::dBEZI(nd[[1]], mu = m, sigma = sig, nu = nu), stop("Unknown function!")) }) out * dnorm(re, mean = 0, sd = sqrt(VarCorr(mod)$cond[[1]][1])) } ## integrate out the random effects row by row res <- parallel::mclapply(split(data, seq(nrow(data))), function(nd) { out <- integrate(joint, lower = lower, upper = upper, nd = nd, mod = model, type = type) if (out$message == "OK") return(out$value) return(NA) }, mc.cores = 8) unlist(res) } marginalize.tramME <- function(model, data, type = "survivor", add = 0, lower = -Inf, upper = Inf) { ## fun(y|x,g) * density(g) joint <- function(re, nd, mod, type) { nd <- nd[rep(1, length(re)), ] rv <- variable.names(mod, "response") nd[[rv]] <- nd[[rv]] + add nd[[variable.names(mod, "grouping")]] <- seq(nrow(nd)) ## to take vector-valued REs pr <- predict(mod, newdata = nd, ranef = re, type = type) * dnorm(re, 0, sd = sqrt(varcov(mod)[[1]][1, 1])) c(pr) } ## integrate out the random effects row by row res <- parallel::mclapply(split(data, seq(nrow(data))), function(nd) { out <- integrate(joint, lower = lower, upper = upper, nd = nd, mod = model, type = type) if (out$message == "OK") return(out$value) return(NA) }, mc.cores = 8) unlist(res) } nd <- expand.grid(pALGAE = seq(0, 1, length.out = 100), TREAT = factor(levels(andrew$TREAT), levels = c("control", "0.33", "0.66", "removal")), PATCH = andrew$PATCH[1], KEEP.OUT.ATTRS = FALSE) colnames(nd)[1] <- variable.names(urchin_tram, "response") if (!file.exists("urchin.rda")) { ## estimate the glmmTMB model urchin_zib <- glmmTMB(pALGAE ~ TREAT + (1 | PATCH), ziformula = ~ TREAT, data = andrew, family = beta_family()) ## save results in a single df urchin_res <- nd colnames(urchin_res)[1] <- "pALGAE" urchin_res$tramME_shift <- marginalize.tramME(urchin_tram, data = nd, type = "distribution") urchin_res$zib <- marginalize.zib.glmmTMB(urchin_zib, data = nd, type = "distribution") urchin_zib_ll <- data.frame(ll = logLik(urchin_zib), np = length(urchin_zib$obj$par)) } else{ load("urchin.rda") } @ \begin{figure}[!ht] \centering <>= multiplot <- function(dfs, yn, xn = colnames(dfs[[1]])[1], ecdfs = NULL, cols = 1:(length(yn)+!is.null(ecdfs)), ltys = rep(1, length(yn)+!is.null(ecdfs)), lwds = rep(1, length(yn)+!is.null(ecdfs)), mains = NULL, ...) { if (!is.null(ecdfs)) stopifnot(length(dfs) == length(ecdfs)) for (i in seq_along(dfs)) { col <- if (is.null(ecdfs)) cols else cols[-length(cols)] lty <- if (is.null(ecdfs)) ltys else ltys[-length(ltys)] lwd <- if (is.null(ecdfs)) ltys else lwds[-length(lwds)] main <- if (is.null(mains)) names(dfs)[i] else mains[i] matplot(dfs[[i]][[xn]], dfs[[i]][yn], type = "l", col = col, lty = lty, lwd = lwd, panel.first = grid(), ..., main = main) if (!is.null(ecdfs)) { lines(dfs[[i]][[xn]], ecdfs[[i]](dfs[[i]][[xn]]), type = "s", lty = rev(ltys)[1], col = rev(cols)[1], lwd = rev(lwds)[1], ...) 
    }
  }
}
cols <- c("#E16A86", "#00AD9A")
layout(mat = matrix(c(1:4, rep(5, 4)), nrow = 2, byrow = TRUE),
       heights = c(7, 1))
par(mar = c(4, 4, 3, 1), las = 1)
multiplot(split(urchin_res, urchin_res$TREAT), yn = c("tramME_shift", "zib"),
          ecdfs = ecdfs, mains = paste("treatment =", levels(urchin_res$TREAT)),
          cols = c(cols, 1), lwds = c(rep(2, 2), 1),
          xlab = "pALGAE", ylab = "prob", xlim = c(0, 1))
par(mar = c(0, 0, 0, 0))
plot.new()
legend("center", c("Linear transformation model", "Zero-inflated beta",
                   "ECDF"),
       col = c(cols, 1), lty = 1, bty = "n", lwd = c(rep(2, 2), 1),
       horiz = TRUE)
@
\caption{Fitted marginal distributions of algae cover proportion from the
zero-inflated beta regression and the mixed-effects transformation model,
respectively.
The step functions show the empirical cumulative distribution functions in
the four treatment groups.}\label{fig:algae-mcdf1}
\end{figure}

Systematic differences in the outcome variability across the treatment groups
occur in many situations \citep{Douma_2019}.
By modeling the dispersion separately, we can incorporate such differences
into the beta regression model.
%
<>=
urchin_zib_disp <- glmmTMB(pALGAE ~ TREAT + (1 | PATCH), ziformula = ~ TREAT,
                           dispformula = ~ TREAT, data = andrew,
                           family = beta_family())
@

In the mixed-effects linear transformation model, we stratify by treatment
group to allow for separate transformation functions.
%
<>=
urchin_tram_strat <- ColrME(
  Surv(pALGAE, pALGAE > 0, type = "left") | 0 + TREAT ~ 1 + (1 | PATCH),
  bounds = c(-0.1, 1), support = c(-0.1, 1),
  data = andrew, order = 6,
  control = optim_control(iter.max = 1e3, eval.max = 1e3, rel.tol = 1e-9))
summary(urchin_tram_strat)
@

<>=
if (!("tramME_strat" %in% colnames(urchin_res))) {
  ## fit the glmm
  urchin_zib_disp <- glmmTMB(pALGAE ~ TREAT + (1 | PATCH),
                             ziformula = ~ TREAT, dispformula = ~ TREAT,
                             data = andrew, family = beta_family())
  ## marginalize
  urchin_res$tramME_strat <- marginalize.tramME(urchin_tram_strat, data = nd,
                                                type = "distribution")
  urchin_res$zib_disp <- marginalize.zib.glmmTMB(urchin_zib_disp, data = nd,
                                                 type = "distribution")
}
if (nrow(urchin_zib_ll) == 1) {
  urchin_zib_ll <- rbind(urchin_zib_ll,
                         c(logLik(urchin_zib_disp),
                           length(urchin_zib_disp$obj$par)))
}
if (!file.exists("urchin.rda")) {
  save(urchin_res, urchin_zib_ll, file = "urchin.rda")
}
@

As Figure~\ref{fig:algae-mcdf2} illustrates, the two models fit the data much
better.
However, the cost of this flexibility is that we cannot reduce the group
comparisons to inference on a small set of parameters anymore.

\begin{figure}[!ht]
\centering
<>=
cols <- c("#E16A86", "#00AD9A")
layout(mat = matrix(c(1:4, rep(5, 4)), nrow = 2, byrow = TRUE),
       heights = c(7, 1))
par(mar = c(4, 4, 3, 1), las = 1)
multiplot(split(urchin_res, urchin_res$TREAT),
          yn = c("tramME_strat", "zib_disp"), ecdfs = ecdfs,
          mains = paste("treatment =", levels(urchin_res$TREAT)),
          cols = c(cols, 1), lwds = c(rep(2, 2), 1),
          xlab = "pALGAE", ylab = "prob", xlim = c(0, 1))
par(mar = c(0, 0, 0, 0))
plot.new()
legend("center", c("Stratified linear transformation model",
                   "Zero-inflated beta with dispersion model", "ECDF"),
       col = c(cols, 1), lty = 1, bty = "n", lwd = c(rep(2, 2), 1),
       horiz = TRUE)
@
\caption{Fitted marginal distributions of algae cover proportion from the
zero-inflated beta regression with dispersion model and the stratified
mixed-effects transformation model, respectively.
The step functions show the empirical cumulative distribution functions in
the four treatment groups.}\label{fig:algae-mcdf2}
\end{figure}

Figures~\ref{fig:algae-mcdf1} and~\ref{fig:algae-mcdf2} demonstrate the
flexibility of the distribution-free approach of transformation models
compared to the parametric alternative.
This is also reflected in the log-likelihood values (Table~\ref{tbl:ll-algae}).

\begin{table}[!ht]
\centering
\caption{Log-likelihood values of the four model specifications for the sea
urchin removal experiment.
}\label{tbl:ll-algae}
<>=
lls <- data.frame(c(urchin_zib_ll$ll[1], urchin_zib_ll$ll[2],
                    logLik(urchin_tram), logLik(urchin_tram_strat)),
                  c(urchin_zib_ll$np[1], urchin_zib_ll$np[2],
                    length(urchin_tram$tmb_obj$par),
                    length(urchin_tram_strat$tmb_obj$par)))
rownames(lls) <- c("Zero-inflated beta without dispersion model",
                   "Zero-inflated beta with dispersion model",
                   "Linear transformation model",
                   "Stratified linear transformation model")
colnames(lls) <- c("$\\log\\mathcal{L}$", "Number of parameters")
print(xtable(lls, align = "lrr", digits = c(0, 2, 0)),
      sanitize.text.function = function(x) x, booktabs = TRUE,
      floating = FALSE)
@
\end{table}

In summary, although the shift-scale beta regression model is not a special
case of a transformation model and one thus cannot expect identical results
with a specific parameterization of \code{tramME::ColrME}, the simpler
transformation model (with one instead of two linear predictors) produced a
better model fit (when comparing the in-sample log-likelihoods).

\section{Mosquito control trial}

<>=
AGO <- read.csv("Juarez2021.csv")
factors <- c("Community", "HouseID", "Year", "Income", "Placement")
AGO[factors] <- lapply(AGO[factors], factor)
AGO$AEAfemale <- as.integer(AGO$AEAfemale)
@

\citet{Juarez_2021} presented the results of a cluster randomized crossover
trial that assessed the efficacy of the Autocidal Gravid Ovitrap (AGO) as a
tool against the mosquito species \emph{Aedes aegypti}.
The outcome of interest was the number of female mosquitoes collected on glue
boards that were placed either inside or outside of the selected houses in
various neighborhoods.
Within-year patterns in mosquito counts as well as the coverage of the
treatment in different areas were modeled with non-linear smooths, while
unobserved household- and community-level effects were captured by nested
random effects.
The original article presented the results of a conditional Poisson and a
negative binomial model.
We reproduce these results with \pkg{gamm4}, and also estimate a
mixed-effects additive transformation model for count data with an ``expit''
inverse link function.
A detailed exposition of count transformation models is given by
\citet{Siegfried_Hothorn_2020}.
For fitting such a model, we will use the following custom-made
\texttt{CotramME} model class implementing the likelihood for count data via
interval censoring \citep{Siegfried_Hothorn_2020}, which is currently not
part of the \pkg{tramME} package.
%
<>=
## Additive count transformation model
## See ?cotram::cotram for the documentation
CotramME <- function(formula, data,
                     method = c("logit", "cloglog", "loglog", "probit"),
                     log_first = TRUE, plus_one = log_first, prob = 0.9, ...)
{
  method <- match.arg(method)
  rv <- all.vars(formula)[1]
  stopifnot(is.integer(data[[rv]]), all(data[[rv]] >= 0))
  data[[rv]] <- data[[rv]] + as.integer(plus_one)
  sup <- c(-0.5 + log_first, quantile(data[[rv]], prob = prob))
  bou <- c(-0.9 + log_first, Inf)
  data[[rv]] <- as.Surv(R(data[[rv]], bounds = bou))
  fc <- match.call()
  fc[[1L]] <- switch(method, logit = quote(ColrME), cloglog = quote(CoxphME),
                     loglog = quote(LehmannME), probit = quote(BoxCoxME))
  fc$method <- NULL
  fc$plus_one <- NULL
  fc$prob <- NULL
  fc$log_first <- log_first
  fc$bounds <- bou
  fc$support <- sup
  fc$data <- data
  out <- eval(fc, parent.frame())
  out$call$data <- match.call()$data
  class(out) <- c("CotramME", class(out))
  out
}
mosquito_tram <- CotramME(AEAfemale ~ Year + Income*Placement + s(Week) +
                            s(CovRate200) + (1|HouseID) + (1|Community),
                          offset = -log(daystrapping), data = AGO,
                          method = "logit", order = 5, log_first = TRUE,
                          prob = 0.9)
@

<<>>=
if (file.exists("mosquito_models.rda")) {
  load("mosquito_models.rda")
} else {
  ## GAMMs
  mosquito_pois <- gamm4(AEAfemale ~ offset(log(daystrapping)) + Year +
                           Income*Placement + s(Week) + s(CovRate200),
                         random = ~ (1|HouseID) + (1|Community),
                         data = AGO, family = poisson)
  mosquito_nb <- gamm4(AEAfemale ~ offset(log(daystrapping)) + Year +
                         Income*Placement + s(Week) + s(CovRate200),
                       random = ~ (1|HouseID) + (1|Community),
                       data = AGO, family = negative.binomial(1))
  ## additive count transformation model
  CotramME <- function(formula, data,
                       method = c("logit", "cloglog", "loglog", "probit"),
                       log_first = TRUE, plus_one = log_first, prob = 0.9,
                       ...) {
    method <- match.arg(method)
    rv <- all.vars(formula)[1]
    stopifnot(is.integer(data[[rv]]), all(data[[rv]] >= 0))
    data[[rv]] <- data[[rv]] + as.integer(plus_one)
    sup <- c(-0.5 + log_first, quantile(data[[rv]], prob = prob))
    bou <- c(-0.9 + log_first, Inf)
    data[[rv]] <- as.Surv(R(data[[rv]], bounds = bou))
    fc <- match.call()
    fc[[1L]] <- switch(method, logit = quote(ColrME),
                       cloglog = quote(CoxphME), loglog = quote(LehmannME),
                       probit = quote(BoxCoxME))
    fc$method <- NULL
    fc$plus_one <- NULL
    fc$prob <- NULL
    fc$log_first <- log_first
    fc$bounds <- bou
    fc$support <- sup
    fc$data <- data
    out <- eval(fc, parent.frame())
    out$call$data <- match.call()$data
    class(out) <- c("CotramME", class(out))
    out
  }
  mosquito_tram <- CotramME(AEAfemale ~ Year + Income*Placement + s(Week) +
                              s(CovRate200) + (1|HouseID) + (1|Community),
                            offset = -log(daystrapping), data = AGO,
                            method = "logit", order = 5, log_first = TRUE,
                            prob = 0.9)
  stopifnot(mosquito_tram$opt$convergence == 0)
}
@

Table~\ref{tbl:mosquito-ll} compares the log-likelihood values of the three
model versions. In terms of in-sample model fit, as measured by the
log-likelihood value, both the negative binomial and the transformation model
perform much better than the Poisson GAMM. The results suggest a slight
further improvement in the model fit when we relax the conditional
distribution assumption of the negative binomial GAMM and follow the
distribution-free transformation model approach.
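Before turning to the comparison, it is worth spelling out the key idea behind
\code{CotramME}: a count $y$ enters the likelihood as the interval-censored
observation $(y - 1, y]$, so that its likelihood contribution is
$F(h(y)) - F(h(y - 1))$ \citep{Siegfried_Hothorn_2020}. The following sketch
(not evaluated, with made-up counts and the bounds implied by
\code{log\_first = TRUE}) illustrates the representation that
\code{as.Surv(R(...))} creates inside \code{CotramME}:
%
<<eval=FALSE>>=
## Illustration only: counts represented as interval-censored observations
y <- as.integer(c(0, 2, 5)) + 1L     ## the plus_one shift used by CotramME
as.Surv(R(y, bounds = c(0.1, Inf)))  ## intervals (y - 1, y], left ends
                                     ## clipped at the lower bound
@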
\begin{table}[!ht]
\centering
\caption{Log-likelihood values of the fitted Poisson and negative binomial
  GAMMs reproduced from \protect\citet{Juarez_2021} along with the
  log-likelihood of an additive transformation model for count
  data.}\label{tbl:mosquito-ll}
<<>>=
if (!file.exists("mosquito_models.rda")) {
  ll_mosquito <- c("Poisson GAMM" = logLik(mosquito_pois$mer),
                   "Negative binomial GAMM" = logLik(mosquito_nb$mer),
                   "Additive count transformation model" =
                     logLik(mosquito_tram))
  ll_mosquito <- data.frame(ll_mosquito)
  names(ll_mosquito) <- "Log-likelihood"
}
print(xtable(ll_mosquito, align = "lr"), floating = FALSE, booktabs = TRUE)
@
\end{table}

We will now concentrate on comparing the estimates from the negative binomial
and the count transformation models. Note that the scales on which the
parameters are interpreted are different in the two models: while in the
negative binomial model the parametric and smooth terms affect the log of the
conditional mean of the outcome, in the transformation model with ``logit''
link (i.e., ``expit'' inverse link), they are interpreted on the log-odds
scale. Unlike in the example application of Section~\ref{sec:ecoli}, we cannot
easily transform the negative binomial parameters to the probability scale.
Although the magnitudes of the effect estimates of the two models are not
directly comparable, their directions, significance and the general shapes of
the smooths can be contrasted.

<<>>=
if (!file.exists("mosquito_models.rda")) {
  sums_mosquito <- list(
    nb = summary(mosquito_nb$gam),
    tram = summary(mosquito_tram)
  )
}
@

Figure~\ref{fig:mosquito-smooth} compares the smooth estimates of the GAMM
from \pkg{gamm4} and the transformation model from \pkg{tramME}. Although the
within-year time patterns (\texttt{s(Week)}) from the two models are almost
identical (on different scales), the smooth estimates of the coverage rate
(\texttt{s(CovRate200)}) differ markedly. The general shapes of the smooths
are similar, but the negative binomial GAMM penalizes this term more strongly,
which is also reflected in the EDFs: \Sexpr{round(sums_mosquito$nb$edf[2], 2)}
and \Sexpr{round(sums_mosquito$tram$smooth[2, 1], 2)} for the negative
binomial and count transformation models, respectively.
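These EDF values can be read off the two model summaries directly; a minimal
sketch (not evaluated, with component names following the summary objects used
above):
%
<<eval=FALSE>>=
## Effective degrees of freedom of the smooth terms
summary(mosquito_nb$gam)$edf   ## EDFs of the mgcv smooths in the GAMM
summary(mosquito_tram)$smooth  ## EDF column for the tramME smooth terms
@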
\begin{figure}[!ht]
\centering
<<>>=
if (!file.exists("mosquito_models.rda")) {
  nd <- AGO[rep(1, 100), ]
  nd$Week <- seq(min(AGO$Week), max(AGO$Week), length.out = 100)
  nd$CovRate200 <- seq(min(AGO$CovRate200), max(AGO$CovRate200),
                       length.out = 100)
  pr <- predict(mosquito_nb$gam, type = "terms",
                terms = c("s(Week)", "s(CovRate200)"), newdata = nd,
                se.fit = TRUE)
  sm_mosquito_nb <- lapply(colnames(pr[[1]]), function(n) {
    ci <- pr$fit[, n] + qnorm(0.975) * pr$se.fit[, n] %o% c(-1, 1)
    out <- data.frame(cbind(pr$fit[, n], ci))
    colnames(out) <- c("fit", "lwr", "upr")
    out
  })
  names(sm_mosquito_nb) <- colnames(pr[[1]])
  sm_mosquito_nb[[1]]$x <- nd$Week
  sm_mosquito_nb[[2]]$x <- nd$CovRate200
  sm_mosquito_tram <- smooth_terms(mosquito_tram)
}
plotsm <- function(sm, xlab, ylab) {
  plot(sm$x, sm$fit, type = "l", xlim = range(sm$x),
       ylim = range(sm[, c("lwr", "upr")]), xlab = xlab, ylab = ylab,
       panel.first = grid())
  polygon(c(sm$x, rev(sm$x)), c(sm$lwr, rev(sm$upr)), border = NA,
          col = grey(0.5, 0.25))
}
par(mfrow = c(2, 2), mar = c(4, 4, 3, 1), cex = 0.8, las = 1)
plotsm(sm_mosquito_nb[[1]], "Week", "s(Week)")
plotsm(sm_mosquito_nb[[2]], "CovRate200", "s(CovRate200)")
mtext("Negative binomial model", side = 3, line = -2, outer = TRUE)
sm <- sm_mosquito_tram
for (i in seq_along(sm_mosquito_tram)) {
  sm[[i]][, 2] <- -sm[[i]][, 2]
  plot(sm[i], panel.first = grid())
}
mtext("Count transformation model", side = 3, line = -23, outer = TRUE)
@
\caption{Smooth terms from the negative binomial and transformation models of
  the \emph{A. aegypti} counts. The dashed lines and the grey areas denote the
  95\% confidence intervals.}\label{fig:mosquito-smooth}
\end{figure}

Because the parametric and smooth terms of the two models are defined on
different scales, the magnitudes of the effect estimates are not directly
comparable. As Table~\ref{tbl:mosquito-fe} shows, the directions of the
effects match and neither model finds evidence that the main effect of middle
income is different from zero.

\begin{table}
\centering
\caption{Point estimates and 95\% confidence intervals of the parametric
  fixed-effects terms from the negative binomial and count transformation
  models of the mosquito control data by \protect\citet{Juarez_2021}. Note
  that the scales of the parameters are different and the effect sizes are not
  directly comparable.
}\label{tbl:mosquito-fe}
<<>>=
if (!file.exists("mosquito_models.rda")) {
  formatCI <- function(x, digits = 2) {
    fx <- formatC(x, format = "f", digits = digits)
    fx <- matrix(paste0("$", ifelse(c(x) > 0, paste0("\\phantom{-}", fx), fx),
                        "$"), ncol = 2)
    apply(fx, 1, paste, collapse = " ---")
  }
  b_nb <- sums_mosquito$nb$p.table[-1, 1]
  se_nb <- sums_mosquito$nb$p.table[-1, 2]
  ci_nb <- b_nb + qnorm(0.975) * se_nb %o% c(-1, 1)
  ci_nb <- formatCI(ci_nb)
  b_tr <- -sums_mosquito$tram$coef[, 1]
  se_tr <- sums_mosquito$tram$coef[, 2]
  ci_tr <- b_tr + qnorm(0.975) * se_tr %o% c(-1, 1)
  ci_tr <- formatCI(ci_tr)
  ci_mosquito <- data.frame(paste0("$", formatC(b_nb, format = "f",
                                                digits = 2), "$"),
                            ci_nb,
                            paste0("$", formatC(b_tr, format = "f",
                                                digits = 2), "$"),
                            ci_tr)
  rownames(ci_mosquito) <- c("Year = 2018", "Income = middle",
                             "Placement = out",
                             "Income = middle \\& Placement = out")
  save(sm_mosquito_nb, sm_mosquito_tram, ll_mosquito, ci_mosquito,
       sums_mosquito, file = "mosquito_models.rda")
}
add <- list()
add$pos <- list(0)
add$command <- paste(c("& \\multicolumn{2}{c}{Negative binomial} &",
                       "\\multicolumn{2}{c}{Count transformation} \\\\\n",
                       "\\cmidrule(lr){2-3} \\cmidrule(lr){4-5}",
                       rep("& $\\widehat\\beta$ & 95\\% CI ", 2), "\\\\\n"),
                     collapse = " ")
print(xtable(ci_mosquito, align = c("@{}l", rep("r", 3), "r@{}")),
      include.colnames = FALSE, floating = FALSE, add.to.row = add,
      sanitize.text.function = function(x) x, booktabs = TRUE)
@
\end{table}

Again, the models compared in this example are not nested, and it is therefore
hard to compare them directly. The transformation model leads to a similar
model interpretation as the model based on the negative binomial distribution.
Model uncertainty was larger in the transformation model, at least for the
nonlinear effect of \code{CovRate200}, and thus one might wonder whether the
stricter distributional assumption led to an overconfident model
interpretation.

\section{Simulation experiments}\label{sec:sim}

The goals of the following simulation experiments are threefold.
%
\begin{enumerate*}
\item First, we want to evaluate the loss of model performance when using the
  flexible, distribution-free mixed-effects additive transformation model
  approach in a scenario where a simple normal linear additive mixed model is
  appropriate.
\item Second, we are interested in comparing the performances of the normal
  and non-normal models under a highly non-normal data-generating process.
\item Finally, we investigate the performance of the non-normal
  transformation model in the case of interval-censored data.
\end{enumerate*}

We consider a normal and a non-normal data-generating process for simulating
grouped data with non-linear covariate effects. The conditional distributions
corresponding to both of these can be written in the general transformation
model form:
%
\begin{align*}
\IP(Y\leq y\mid x_{1}, x_{2}, \gamma_{j}) &=
  \Phi\left(h(y) - \beta x_{1} - f(x_{2}) - \gamma_{j}\right) \qquad
  \gamma_{\left\{1:J\right\}}\overset{\text{i.i.d.}}{\sim}\cN(0,\sigma^{2}),
\end{align*}
%
where $\Phi$ denotes the CDF of the standard Gaussian distribution,
$\beta x_{1}$ is the parametric fixed-effect term, $f(x_{2})$ is a non-linear
term and $\gamma_{j}$ is the random intercept of group $j=1\dots J$ with
standard deviation $\sigma$. The data are simulated by numerically inverting
the conditional distribution function.
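Concretely, with $U$ drawn from the standard uniform distribution, inverting
the model equation yields
%
\begin{align*}
Y &= h^{-1}\left(\Phi^{-1}(U) + \beta x_{1} + f(x_{2}) + \gamma_{j}\right),
\end{align*}
%
which is exactly the recipe the \code{gen\_smpl()} function below implements.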
%
<<>>=
##' @param n Numeric vector, number of observations in each group
##' @param beta Numeric, effect size
##' @param sfun Function with a numeric argument, non-linear smooth shift term
##' @param sigma Numeric, SD of random intercepts
##' @param hinv Function with a numeric argument, inverse transformation function
##' @param link Function with a single numeric argument on [0, 1], link function
##' @param scale Numeric, optional scaling constant on the transformation scale
##' @param seed Seed for the random number generator
##' @param two_sets Logical; generate both an estimation and a test sample
gen_smpl <- function(n, beta, sfun, sigma, hinv, link, scale = 1, seed = NULL,
                     two_sets = TRUE) {
  ## -- setting up the seed
  if (!exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE))
    runif(1)
  if (is.null(seed))
    RNGstate <- get(".Random.seed", envir = .GlobalEnv)
  else {
    R.seed <- get(".Random.seed", envir = .GlobalEnv)
    set.seed(seed)
    RNGstate <- structure(seed, kind = as.list(RNGkind()))
    on.exit(assign(".Random.seed", R.seed, envir = .GlobalEnv))
  }
  if (two_sets) n <- rep(n, 2)
  x1 <- runif(sum(n))
  x2 <- runif(sum(n))
  gr <- factor(rep(seq_along(n), n))
  re <- rep(rnorm(length(n), mean = 0, sd = sigma), n)
  lp <- x1 * beta + sfun(x2) + re
  y <- hinv((link(runif(sum(n))) + lp) * scale)
  if (two_sets) {
    n <- sum(n) / 2
    out <- list(est = data.frame(x1 = x1[1:n], x2 = x2[1:n], gr = gr[1:n],
                                 y = y[1:n]),
                test = data.frame(x1 = tail(x1, n), x2 = tail(x2, n),
                                  gr = tail(gr, n), y = tail(y, n)))
  } else {
    out <- data.frame(x1 = x1, x2 = x2, gr = gr, y = y)
  }
  out
}
@

For the baseline transformations and the non-linear shift term we set the
following functional forms:
%
\begin{align*}
h_{1}(y) &= y, \qquad
h_{2}(y) = \Phi^{-1}\left(F_{\chi^{2}_{3}}(y)\right), \qquad
f(x) = \xi \sin(\pi x),
\end{align*}
%
with $\Phi^{-1}(\cdot)$ the quantile function of the standard normal
distribution and $F_{\chi^{2}_{3}}(\cdot)$ the distribution function of the
$\chi^{2}$-distribution with 3 degrees of freedom. By using the linear
function $h_{1}(y)$ as the transformation function, we assume a normal
additive mixed model. In contrast, setting $h_{2}(y)$ introduces
non-normality. In this case, the conditional distribution implied by the
data-generating mechanism is $\chi^{2}$ ($\text{df}=3$) when the linear
predictor is zero. In the simulations, we use $\beta=\sqrt{2}\Phi^{-1}(0.7)$
so that the effect size of the parametric fixed-effect term is $0.7$ on the
probabilistic index scale in both the normal and the non-normal case (see
Section~\ref{sec:ecoli} for details). Values of $x_{1}$ and $x_{2}$ are drawn
from the uniform distribution on the unit interval for each observation
separately. The parameters $\sigma$ and $\xi$ are set to values that result
in equal variances for the parametric ($\beta x_{1}$), the non-linear
($f(x_{2})$) and the random-effects terms of the linear predictor on the
transformation scale.
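As a quick numerical sanity check (not part of the original analysis), the
implied probabilistic index and the equal-variance calibration can be
verified by simulation; the constants below mirror those defined in the next
chunk:
%
<<eval=FALSE>>=
## Not evaluated: check the implied PI and the variance calibration
b  <- qnorm(0.7) * sqrt(2)                      ## beta
pnorm(b / sqrt(2))                              ## probabilistic index: 0.7
xi <- 2 * b * sqrt(1/12) / sqrt(2 - 16 / pi^2)  ## scale of the sine term
x  <- runif(1e6)
var(b * x)             ## parametric term, theoretical value b^2 / 12
var(xi * sin(pi * x))  ## smooth term, calibrated to b^2 / 12
(sqrt(1/12) * b)^2     ## random-intercept variance, also b^2 / 12
@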
<<>>=
## ==== Simulation inputs
b <- qnorm(0.7) * sqrt(2) ## PI = 0.7, SD of the parametric part is
                          ## sqrt(1/12) * b = 0.2141 on the LP scale
sf <- function(x) 2 * b * sqrt(1/12) * sin(x * pi) / sqrt(2 - 16 / pi^2)
                          ## same SD as the parametric term on the LP scale
sig <- sqrt(1/12) * b     ## same RE SD on the LP scale
hi1 <- function(x) x
hi2 <- function(x) qchisq(pnorm(x), df = 3)
lnk <- qnorm
## ==== Helper function to extract results
get_res <- function(res, what, which = NULL, simplify = TRUE) {
  out <- lapply(res, function(x) {
    if (!is.null(which)) x <- x[which]
    sapply(x, function(xx) {
      if (length(xx) == 1 && is.na(xx)) NA
      else {
        if (is.function(what)) return(what(xx))
        xx[[what]]
      }
    })
  })
  if (simplify) {
    out <- do.call("rbind", out)
    if (!is.function(what))
      colnames(out) <- paste0(names(res[[1]]), "_", what)
  }
  out
}
@

For the third question, to generate outcomes that are censored to intervals
of a given length from continuous data, we define the following function:
%
<<>>=
make_intcens <- function(x, length = 1) {
  Surv(floor(x / length) * length, ceiling(x / length) * length,
       type = "interval2")
}
@

In all scenarios, we simulate moderate-sized datasets with 100
subjects/groups ($J=100$) and four observations per group. We repeat all
simulations 500 times. Our primary focus in these simulation experiments is
on inference about the fixed-effects parameter $\beta$ and the general
predictive performance of the model variants evaluated through the
out-of-sample log-likelihood. For the out-of-sample log-likelihood
calculations, we integrate over the random intercepts in the likelihood
(i.e., the exponential of Equation~\ref{eq:loglik}) of a new set of
observations and evaluate it at the fitted values of the remaining parameters
(the coefficients corresponding to the baseline transformation, the fixed
effect and the smooth term, as well as the variance component and smoothing
parameters).

\subsection{Normal data-generating process}

To investigate our first question, we draw samples from the (conditionally)
normal additive mixed model (i.e., $h(y)=h_{1}(y)=y$) and fit a mixed-effects
additive transformation model (with standard normal inverse link), as well as
the correctly specified normal additive mixed model, to it.
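In the simulation chunks below, this out-of-sample evaluation boils down to a
single call per fitted model. A schematic sketch, where \code{m} and
\code{test} are placeholders for a fitted model and the evaluation sample:
%
<<eval=FALSE>>=
## Not evaluated: out-of-sample log-likelihood of a fitted tramME model;
## the random intercepts are integrated out, all remaining parameters are
## held at their fitted values
logLik(m, newdata = test, type = "fix_smooth")
@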
<<>>=
if (!file.exists("sim1.rda")) {
  seeds <- 301:800
  system.time({
    sims1 <- parallel::mclapply(seeds, function(s) {
      smpl <- gen_smpl(rep(4, 100), beta = b, sfun = sf, sigma = sig,
                       hinv = hi1, link = lnk, seed = s, two_sets = TRUE)
      m1 <- LmME(y ~ x1 + s(x2) + (1 | gr), data = smpl$est)
      if (m1$opt$convergence == 0) {
        lmm <- list(ll = logLik(m1),
                    lloos = logLik(m1, newdata = smpl$test,
                                   type = "fix_smooth"),
                    beta = coef(m1), sigma = sqrt(varcov(m1)$gr[1,1]),
                    theta = m1$param$beta[attr(m1$param$beta, "type") == "bl"],
                    beta_se = sqrt(vcov(m1, parm = "x1")[1,1]))
      } else {
        lmm <- NA
      }
      m2 <- BoxCoxME(y ~ x1 + s(x2) + (1 | gr), data = smpl$est)
      if (m2$opt$convergence == 0) {
        bcm <- list(ll = logLik(m2),
                    lloos = logLik(m2, newdata = smpl$test,
                                   type = "fix_smooth"),
                    beta = coef(m2), sigma = sqrt(varcov(m2)$gr[1,1]),
                    theta = m2$param$beta[attr(m2$param$beta, "type") == "bl"],
                    beta_se = sqrt(vcov(m2, parm = "x1")[1,1]),
                    intercept = mean(sf(smpl$est$x2)))
      } else {
        bcm <- NA
      }
      list(LmME = lmm, BoxCoxME = bcm)
    }, mc.cores = 6)
  })
  save(sims1, file = "sim1.rda")
} else {
  load("sim1.rda")
}
## -- non-convergence
rowSums(sapply(sims1, function(x)
  sapply(x, function(xx) c(length(xx) == 1 && is.na(xx)))))
@

\begin{figure}
\centering
<<>>=
par(cex = 0.8, mar = c(4, 4.2, 2, 1), las = 1)
boxplot(pnorm(get_res(sims1, "beta") / sqrt(2)) - 0.7, ylab = "PI",
        names = c("Normal", "Non-normal"), boxwex = 0.5)
grid(nx = NA, ny = NULL)
abline(h = 0, lwd = 2, lty = 2, col = 2)
@
\caption{Bias of the coefficient of the parametric fixed term in the case of
  the conditionally normal data-generating process. The results are presented
  on the probabilistic index scale. ``Normal'' corresponds to the results
  from the normal additive mixed model, while ``Non-normal'' denotes the
  distribution-free mixed-effects additive transformation
  model.}\label{fig:sim1-bias}
\end{figure}

According to Figure~\ref{fig:sim1-bias}, both the correctly specified
additive model and the flexible transformation model result in unbiased
estimates of the $\beta$ parameter, with no visible additional variability in
the case of the more general, distribution-free approach. The coverage rates
of the 95\%~Wald confidence intervals from the two models are very close to
their nominal levels.
%
<<>>=
cvi <- get_res(sims1, what = function(x) {
  ci <- x$beta + x$beta_se * qnorm(0.975) * c(-1, 1)
  (ci[1] <= b) && (b < ci[2])
})
(cvr <- colMeans(cvi, na.rm = TRUE)) ## Coverage rate
sqrt(cvr * (1 - cvr) / nrow(cvi))    ## Monte Carlo SE
@

The Monte Carlo distributions of the out-of-sample log-likelihood are
presented in Figure~\ref{fig:sim1-pred}. For each simulated estimation
dataset, we generated an independent evaluation dataset of the same size and
structure, and calculated the log-likelihood of this new dataset given the
parameters fitted to the estimation sample. The predictive performances of
the two models are very close to each other, with just a slight reduction in
the log-likelihood values for the more complex transformation model.
\begin{figure}
\centering
<<>>=
par(cex = 0.8, mar = c(4, 4.2, 2, 1), las = 1)
boxplot(get_res(sims1, "lloos"), ylab = "Out-of-sample log-likelihood",
        names = c("Normal", "Non-normal"), boxwex = 0.5)
grid(nx = NA, ny = NULL)
@
\caption{Predictive performance of the normal additive mixed model and the
  mixed-effects additive transformation model evaluated by the out-of-sample
  log-likelihood of an independently generated
  dataset.}\label{fig:sim1-pred}
\end{figure}

To visually assess how well the mixed-effects additive transformation model
approximates the conditionally normal response distribution, we plot the
fitted baseline transformation functions ($\widehat h(y)$) and compare them
to the true function $h(y)=y$ (Figure~\ref{fig:sim1-baseline}).

\begin{figure}
\centering
<<>>=
par(cex = 0.8, mar = c(4, 4.2, 2, 1), las = 1)
th <- get_res(sims1, "theta", "BoxCoxME", simplify = FALSE)
th <- do.call("cbind", th)
## NOTE: to compare with the true transformation function (identity) we need
## to adjust the fitted baseline transformation with the expectation of the
## smooth term
ic <- get_res(sims1, "intercept", "BoxCoxME", simplify = FALSE)
ic <- sapply(ic, `[`, 1)
dat <- gen_smpl(rep(4, 100), beta = b, sfun = sf, sigma = sig, hinv = hi1,
                link = lnk, seed = 1, two_sets = FALSE)
bcm <- BoxCoxME(y ~ x1 + s(x2) + (1 | gr), data = dat, nofit = TRUE)
nd <- dat[rep(1, 100), ]
nd[[variable.names(bcm, "response")]] <- seq(-1, 2, length.out = 100)
mm <- model.matrix(bcm$model$ctm$bases$response, data = nd)
tr <- mm %*% th + matrix(rep(ic, each = 100), nrow = 100)
matplot(nd$y, tr, type = "l", col = grey(0.1, 0.1), lty = 1, ylab = "h(y)",
        xlab = "y", panel.first = grid())
abline(0, 1, lwd = 2, col = 2)
@
\caption{Fitted baseline transformation functions from the mixed-effects
  additive transformation model under the normal data-generating mechanism.
  The red line shows the true function.}\label{fig:sim1-baseline}
\end{figure}

\subsection{Non-normal data-generating process}

In many applied regression problems, the assumption that the outcome is
normally distributed, conditionally on the predictors, is violated. Normal
additive mixed models are still often used in these cases, mainly because the
normal linear model can be remarkably robust to some violations of its
assumptions \citep{Schielzeth_etal_2020}. In the next simulation experiment,
we compare the normal additive mixed model and the non-normal mixed-effects
transformation model in terms of inference for the parametric fixed term and
out-of-sample predictive performance under a non-normal, skewed
data-generating process. As an additional goal, we also investigate the
performance of the transformation approach when the outcome variable is
interval-censored, i.e., none of the values are observed exactly, only up to
an interval.

For each iteration of the Monte Carlo simulations, we generate a dataset with
exactly observed outcomes (from the conditionally non-normal data-generating
process) and fit the normal additive model and the non-normal transformation
model to it. We then create the censored version of the same variable by
binning its values into intervals (of length two in our simulations) and
define the corresponding \code{Surv} object.
%
<<>>=
dat <- gen_smpl(rep(4, 100), beta = b, sfun = sf, sigma = sig, hinv = hi2,
                link = lnk, seed = 1, two_sets = FALSE)
head(dat$y)                           ## exact values
head(make_intcens(dat$y, length = 2)) ## interval-censored version
@
%
In the next step, we refit the transformation model using the
interval-censored data, and compare its results to the additive mixed model
and the transformation model with exact responses. To ensure the
comparability of the out-of-sample predictions, we transform the response to
its interval-censored version only in the estimation sample, not in the
evaluation sample.

<<>>=
nd <- data.frame(x2 = seq(0, 1, length.out = 100))
if (!file.exists("sim2.rda")) {
  seeds <- 1001:1500
  system.time({
    sims2 <- parallel::mclapply(seeds, function(s) {
      smpl <- gen_smpl(rep(4, 100), beta = b, sfun = sf, sigma = sig,
                       hinv = hi2, link = lnk, seed = s, two_sets = TRUE)
      m1 <- LmME(y ~ x1 + s(x2) + (1 | gr), data = smpl$est)
      if (m1$opt$convergence == 0) {
        lmm <- list(ll = logLik(m1),
                    lloos = logLik(m1, newdata = smpl$test,
                                   type = "fix_smooth"),
                    beta = coef(m1), sigma = sqrt(varcov(m1)$gr[1,1]),
                    beta_se = sqrt(vcov(m1, parm = "x1")[1,1]))
      } else {
        lmm <- NA
      }
      m2 <- BoxCoxME(y ~ x1 + s(x2) + (1 | gr), data = smpl$est)
      if (m2$opt$convergence == 0) {
        sm <- smooth_terms(m2, newdata = nd)[[1]][, 2] + mean(sf(smpl$est$x2))
        bcm <- list(ll = logLik(m2),
                    lloos = logLik(m2, newdata = smpl$test,
                                   type = "fix_smooth"),
                    beta = coef(m2), sigma = sqrt(varcov(m2)$gr[1,1]),
                    beta_se = sqrt(vcov(m2, parm = "x1")[1,1]),
                    smooth = sm)
      } else {
        bcm <- NA
      }
      smpl$est$y <- make_intcens(smpl$est$y, length = 2)
      m3 <- BoxCoxME(y ~ x1 + s(x2) + (1 | gr), data = smpl$est)
      if (m3$opt$convergence == 0) {
        sm <- smooth_terms(m3, newdata = nd)[[1]][, 2] + mean(sf(smpl$est$x2))
        icm <- list(ll = logLik(m3),
                    lloos = logLik(m3, newdata = smpl$test,
                                   type = "fix_smooth"),
                    beta = coef(m3), sigma = sqrt(varcov(m3)$gr[1,1]),
                    beta_se = sqrt(vcov(m3, parm = "x1")[1,1]),
                    smooth = sm)
      } else {
        icm <- NA
      }
      list(LmME = lmm, BoxCoxME = bcm, BoxCoxME_ic = icm)
    }, mc.cores = 8)
  })
  save(sims2, file = "sim2.rda")
} else {
  load("sim2.rda")
}
## -- non-convergence
rowSums(sapply(sims2, function(x)
  sapply(x, function(xx) c(length(xx) == 1 && is.na(xx)))))
@

Figure~\ref{fig:sim2-bias} shows the bias of the estimates of the coefficient
$\beta$ for the two models with exact responses and for the transformation
model under interval-censored data. In the case of the (misspecified) normal
additive mixed model, there is a sign of a slight downward bias, while the
transformation model gives unbiased estimates of the parametric fixed-effects
coefficient, irrespective of whether it uses the exact observations or their
interval-censored version. The coverages of the 95\% Wald confidence
intervals are close to their nominal levels in all cases.
%
<<>>=
cvi <- get_res(sims2, what = function(x) {
  ci <- x$beta + x$beta_se * qnorm(0.975) * c(-1, 1)
  (ci[1] <= b) && (b < ci[2])
})
(cvr <- colMeans(cvi, na.rm = TRUE)) ## Coverage rate
sqrt(cvr * (1 - cvr) / nrow(cvi))    ## Monte Carlo SE
@

\begin{figure}
\centering
<<>>=
par(cex = 0.8, mar = c(4, 4.2, 2, 1), las = 1)
boxplot(pnorm(get_res(sims2, "beta") / sqrt(2)) - 0.7, ylab = "PI",
        names = c("Normal", "Non-normal", "Non-normal (IC)"), boxwex = 0.5)
grid(nx = NA, ny = NULL)
abline(h = 0, lwd = 2, lty = 2, col = 2)
@
\caption{Bias of the coefficient of the parametric fixed term in the case of
  the conditionally non-normal data-generating process. The results are
  presented on the probabilistic index scale.
  ``Normal'' corresponds to the results from the normal additive mixed model,
  ``Non-normal'' denotes the distribution-free mixed-effects additive
  transformation model and ``Non-normal (IC)'' shows the results of the
  transformation model under interval-censoring.}\label{fig:sim2-bias}
\end{figure}

The differences between the normal and the distribution-free approaches are
more marked in their out-of-sample predictive performances. As
Figure~\ref{fig:sim2-pred} shows, the normal additive mixed model performs
poorly compared to the transformation model, even in the interval-censored
case, in which the outcomes in the estimation sample are not observed
exactly.

\begin{figure}
\centering
<<>>=
par(cex = 0.8, mar = c(4, 4.2, 2, 1), las = 1)
boxplot(get_res(sims2, "lloos"), ylab = "Out-of-sample log-likelihood",
        names = c("Normal", "Non-normal", "Non-normal (IC)"), boxwex = 0.5)
grid(nx = NA, ny = NULL)
@
\caption{Predictive performance of the normal additive mixed model and the
  mixed-effects additive transformation model with exact (``Non-normal'') and
  interval-censored (``Non-normal (IC)'') observations evaluated by the
  out-of-sample log-likelihood of an independently generated
  dataset.}\label{fig:sim2-pred}
\end{figure}

Figure~\ref{fig:sim2-smooth} presents the fitted smooth non-linear terms from
the non-normal transformation model in the exactly observed and
interval-censored scenarios. As the results show, the model is able to
recover the non-linear term even when the information on the outcome is
limited to intervals.

\begin{figure}
\centering
<<>>=
par(mfrow = c(1, 2), las = 1, mar = c(4, 5, 1, 1), cex = 0.8)
sms <- get_res(sims2, what = "smooth", which = c("BoxCoxME", "BoxCoxME_ic"),
               simplify = FALSE)
sm1 <- sapply(sms, function(x) x[, 1])
matplot(nd$x2, sm1, type = "l", col = grey(0.1, 0.1), lty = 1,
        ylab = expression(hat(f) * "(x)"), xlab = "x", panel.first = grid())
lines(nd$x2, sf(nd$x2), col = 2, lwd = 2)
sm2 <- sapply(sms, function(x) x[, 2])
matplot(nd$x2, sm2, type = "l", col = grey(0.1, 0.1), lty = 1,
        ylab = expression(hat(f) * "(x)"), xlab = "x", panel.first = grid())
lines(nd$x2, sf(nd$x2), col = 2, lwd = 2)
@
\caption{Fitted non-linear effects from the mixed-effects additive
  transformation model with exact observations (\emph{left panel}) and
  interval-censored observations (\emph{right panel}) under the non-normal
  data-generating mechanism. The red curves show the true
  function.}\label{fig:sim2-smooth}
\end{figure}

In summary, the results of the simulation experiments presented above suggest
that, at least in the scenarios considered, the cost of using the
distribution-free transformation model framework in mixed-effects additive
modeling is low, while it can improve inference and predictive performance
when the distributional assumptions of the available alternatives are not
met. Moreover, our results confirm that the model provides reliable estimates
when the outcome variable is censored.
\clearpage

<<>>=
if (file.exists("packages.bib")) file.remove("packages.bib")
pkgversion <- function(pkg) {
  pkgbib(pkg)
  packageDescription(pkg)$Version
}
pkgbib <- function(pkg) {
  x <- citation(package = pkg, auto = TRUE)[[1]]
  b <- toBibtex(x)
  b[1] <- paste("@Manual{pkg:", pkg, ",", sep = "")
  if (is.na(b["url"])) {
    b[length(b)] <- paste(" URL = {http://CRAN.R-project.org/package=",
                          pkg, "}", sep = "")
    b <- c(b, "}")
  }
  cat(b, sep = "\n", file = "packages.bib", append = TRUE)
}
pkg <- function(pkg) {
  vrs <- try(pkgversion(pkg))
  if (inherits(vrs, "try-error")) return(NA)
  paste("\\\\pkg{", pkg, "} \\\\citep[version~", vrs, ",][]{pkg:", pkg, "}",
        sep = "")
}
pkgs <- c("gamm4", "tramME", "glmmTMB")
out <- sapply(pkgs, pkg)
x <- readLines("packages.bib")
for (p in pkgs)
  x <- gsub(paste("\\{", p, ":", sep = ""),
            paste("\\{\\\\pkg{", p, "}:", sep = ""), x)
cat(x, sep = "\n", file = "packages.bib", append = FALSE)
@

\bibliographystyle{plainnat}
\bibliography{ref,packages}

\clearpage

<<>>=
sessionInfo()
@

\end{document}