\documentclass[shortnames,nojss]{jss} % onecolumn

% \VignetteIndexEntry{gamboostLSS Tutorial}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Setup LaTeX
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}

\usepackage{booktabs}
\usepackage{multirow}

\usepackage{amsmath,amssymb,amsfonts}
\usepackage{bm}
\usepackage{dsfont}

\usepackage{pdflscape,rotating}

\usepackage{algorithm}

%%% Needed for RCS
\usepackage{ulem}      % use this for sout
\normalem              % set \emph to \textit again

\shortcites{borghi_who,bmc_growth,schmid2013beta,villarini2009}

%%% RCS System
\def\rcsmark#1{\colorbox{yellow}{#1}}
\def\rcscom#1{\noindent\newline\vspace*{0.5cm}\colorbox{yellow}{\parbox{\textwidth}{#1}}\vspace*{0.5cm}}
\def\rcsdel#1{{\color{red}\sout{#1}}} % delete
\def\rcsdelp#1{{\color{red}$<$DEL$>$#1$<$/DEL$>$}} % delete paragraph
\def\rcsadd#1{{\color{blue}#1}} % added

%%% Used for diamond at the end of the case study parts
\DeclareSymbolFont{extraup}{U}{zavm}{m}{n}
\DeclareMathSymbol{\varheart}{\mathalpha}{extraup}{86}
\DeclareMathSymbol{\vardiamond}{\mathalpha}{extraup}{87}

%% do not \usepackage{Sweave}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Setup R
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

<<init, echo = FALSE, results = hide>>=
options(prompt = "R> ", continue = "+  ", width = 76, useFancyQuotes = FALSE)
repos <- "http://cran.at.r-project.org"
## check if a current version of gamboostLSS is installed
suppressWarnings(pd <- packageDescription("gamboostLSS"))
if (all(is.na(pd))){
    install.packages("gamboostLSS", repos = repos)
    pd <- packageDescription("gamboostLSS")
} else {
    if (compareVersion(pd$Version, "1.2-0") < 0){
        warning(sQuote("gamboostLSS"), " (1.2-0 or newer) is being installed!")
        install.packages("gamboostLSS", repos = repos)
    }
}

## for the exact reproduction of the plots R2BayesX >= 0.3.2 is needed
suppressWarnings(if (!require("R2BayesX"))
                     install.packages("R2BayesX", repos = repos))

## remove (possibly) altered versions of "india" from working environment
suppressWarnings(rm("india"))
require("gamboostLSS")

## make graphics directory if not existing
if (!file.exists("graphics"))
    dir.create("graphics")
if (!file.exists("cvrisk"))
    dir.create("cvrisk")
@

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Title Information
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\title{\pkg{gamboostLSS}: An \proglang{R} Package for Model Building and
  Variable Selection in the GAMLSS Framework}
\Plaintitle{gamboostLSS: An R Package for Model Building and Variable Selection
  in the GAMLSS Framework}
\Shorttitle{\pkg{gamboostLSS}: Model Building and Variable Selection for GAMLSS}
\author{Benjamin Hofner\\FAU Erlangen-Nürnberg \And
        Andreas Mayr\\FAU Erlangen-Nürnberg \And
        Matthias Schmid\\University of Bonn}
\Plainauthor{Benjamin Hofner, Andreas Mayr, Matthias Schmid}

\Abstract{ \emph{This vignette is a slightly modified version of
    \citet{Hofner:gamboostLSS:2015} which appeared in the
    \textbf{Journal of Statistical Software}. Please cite that article when
    using the package \pkg{gamboostLSS} in your work.}\\[1em]

  Generalized additive models for location, scale and shape are a
  flexible class of regression models that allow one to model multiple parameters of
  a distribution function, such as the mean and the standard deviation,
  simultaneously. With the \proglang{R} package \pkg{gamboostLSS}, we provide a
  boosting method to fit these models. Variable selection and model choice are
  naturally available within this regularized regression framework. To introduce
  and illustrate the \proglang{R} package \pkg{gamboostLSS} and its
  infrastructure, we use a data set on stunted growth in India. In addition to
  the specification and application of the model itself, we present a variety of
  convenience functions, including methods for tuning parameter selection,
  prediction and visualization of results. The package \pkg{gamboostLSS} is
  available from CRAN (\url{http://cran.r-project.org/package=gamboostLSS}).}

\Keywords{additive models, prediction intervals, high-dimensional data}

\Address{
  Benjamin Hofner \& Andreas Mayr\\
  Department of Medical Informatics, Biometry and Epidemiology\\
  Friedrich-Alexander-Universität Erlangen-Nürnberg\\
  Waldstra{\ss}e 6\\
  91054 Erlangen, Germany\\
  E-mail: \email{benjamin.hofner@fau.de},\\
  \hphantom{E-mail: }\email{andreas.mayr@fau.de}\\
  URL: \url{http://www.imbe.med.uni-erlangen.de/cms/benjamin_hofner.html},\\
  \hphantom{URL: }\url{http://www.imbe.med.uni-erlangen.de/ma/A.Mayr/}\\

  Matthias Schmid\\
  Department of Medical Biometry, Informatics and Epidemiology\\
  University of Bonn\\
  Sigmund-Freud-Stra{\ss}e 25\\
  53105 Bonn\\
  E-mail: \email{matthias.schmid@imbie.uni-bonn.de}\\
  URL: \url{http://www.imbie.uni-bonn.de}
}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Begin Document
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{document}
\SweaveOpts{concordance=FALSE}
\SweaveOpts{echo = TRUE, results = verbatim, prefix.string=graphics/fig}
\SweaveOpts{width = 5, height = 5}
\SweaveOpts{keep.source = TRUE}

\maketitle

\section{Introduction}
\label{sec:intro}

Generalized additive models for location, scale and shape (GAMLSS) are a
flexible statistical method to analyze the relationship between a response
variable and a set of predictor variables. Introduced by \citet{rs}, GAMLSS are
an extension of the classical GAM (generalized additive model) approach
\citep{hastietib}. The main difference between GAMs and GAMLSS is that GAMLSS do
not only model the conditional mean of the outcome distribution (location) but
also \textit{several} of its parameters, including scale and shape parameters (hence
the extension ``LSS''). In Gaussian regression, for example, the density of the
outcome variable $Y$ conditional on the predictors $\mathbf{X}$ may depend on
the mean parameter $\mu$, and an additional scale parameter $\sigma$, which
corresponds to the standard deviation of $Y| \mathbf{X}$. Instead of assuming
$\sigma$ to be fixed, as in classical GAMs, the Gaussian GAMLSS regresses both
parameters on the predictor variables,
\begin{eqnarray}
\label{Gaussian:mu}
\mu = \E(y \, | \,  \mathbf{X}) & = & \eta_\mu = \beta_{\mu,0} + \sum_j f_{\mu,j}(x_j), \\
\label{Gaussian:sigma}
\log(\sigma) = \log ( \sqrt{\VAR(y \, | \,  \mathbf{X})}) & = &
\eta_{\sigma} = \beta_{\sigma,0} +\sum_j f_{\sigma, j}(x_j),
\end{eqnarray}
where $\eta_\mu$ and $\eta_\sigma$ are \emph{additive predictors} with parameter
specific intercepts $\beta_{\mu,0}$ and $\beta_{\sigma,0}$, and functions
$f_{\mu,j}(x_j)$ and $f_{\sigma, j}(x_j)$, which represent the effects of
predictor $x_j$ on $\mu$ and $\sigma$, respectively. In this notation, the
functional terms $f(\cdot)$ can denote various types of effects (e.g., linear,
smooth, random).

In our case study, we will analyze the prediction of stunted growth for children
in India via a Gaussian GAMLSS. The response variable is a stunting score, which
is commonly used to relate the growth of a child to a reference population in
order to assess effects of malnutrition in early childhood. In our analysis, we
model the expected value ($\mu$) of this stunting score and also its variability
($\sigma$) via smooth effects for mother- or child-specific predictors, as well
as a spatial effect to account for the region of India where the child is
growing up. This way, we are able to construct point predictors (via
$\eta_{\mu}$) and additionally child-specific prediction intervals (via
$\eta_{\mu}$ and $\eta_{\sigma}$) to evaluate the individual risk of stunted
growth.

In recent years, due to their versatile nature, GAMLSS have been used to address
research questions in a variety of fields. Applications involving GAMLSS range
from the normalization of complementary DNA microarray data
\citep{khondoker2009} and the analysis of flood frequencies
\citep{villarini2009} to the development of rainfall models
\citep{serinaldi2012} and stream-flow forecasting models \citep{ogtrop2011}. The
most prominent application of GAMLSS is the estimation of centile curves, e.g.,
for reference growth charts \citep{onis2006child, borghi_who, bmc_growth}. The
use of GAMLSS in this context has been recommended by the World Health
Organization \citep[see][and the references therein]{rigby:smoothing:2014}.

Classical estimation of a GAMLSS is based on backfitting-type Gauss-Newton
algorithms with AIC-based selection of relevant predictors. This strategy is
implemented in the \proglang{R} \citep{R:3.1.0} package \pkg{gamlss}
\citep{rs,gamlss:jss:2007,pkg:gamlss:4.3-0}, which provides a great variety of
functions for estimation, hyper-parameter selection, variable selection and
hypothesis testing in the GAMLSS framework.

In this article we present the \proglang{R} package \pkg{gamboostLSS}
\citep{pkg:gamboostLSS:1.2-0}, which is designed as an alternative to
\pkg{gamlss} for high-dimensional data settings where variable selection is of
major importance. Specifically, \pkg{gamboostLSS} implements the
\emph{gamboostLSS} algorithm, which is a new fitting method for GAMLSS that was
recently introduced by \cite{mayretal}. The \emph{gamboostLSS} algorithm uses
the same optimization criterion as the Gauss-Newton type algorithms implemented
in the package \pkg{gamlss} (namely, the log-likelihood of the model under
consideration) and hence fits the same type of statistical model. In contrast to
\pkg{gamlss}, however, the \pkg{gamboostLSS} package operates within the
component-wise gradient boosting framework for model fitting and variable
selection \citep{BuehlmannYu2003,BuhlmannHothorn07}. As demonstrated in
\cite{mayretal}, replacing Gauss-Newton optimization by boosting techniques
leads to a considerable increase in flexibility: Apart from being able to fit
basically any type of GAMLSS, \pkg{gamboostLSS} implements an efficient
mechanism for variable selection and model choice. As a consequence,
\pkg{gamboostLSS} is a convenient alternative to the AIC-based variable
selection methods implemented in \pkg{gamlss}. The latter methods can be
unstable, especially when it comes to selecting possibly different sets of
variables for multiple distribution parameters. Furthermore, model fitting via
\emph{gamboostLSS} is also possible for high-dimensional data with more
candidate variables than observations ($p > n$), where the classical fitting
methods become infeasible.

The \pkg{gamboostLSS} package is a comprehensive implementation of the most
important issues and aspects related to the use of the \emph{gamboostLSS}
algorithm. The package is available on CRAN
(\url{http://cran.r-project.org/package=gamboostLSS}). Current development
versions are hosted on GitHub (\url{https://github.com/hofnerb/gamboostLSS}). As
will be demonstrated in this paper, the package provides a large number of
response distributions \citep[e.g., distributions for continuous data, count
data and survival data, including \textit{all} distributions currently available
in the \pkg{gamlss} framework; see][]{pkg:gamlss.dist:4.3-0}. Moreover, users of
\pkg{gamboostLSS} can choose among many different possibilities for modeling
predictor effects. These include linear effects, smooth effects and trees, as
well as spatial and random effects, and interaction terms.

After starting with a toy example (Section~\ref{sec:toy-example}) for
illustration, we will provide a brief theoretical overview of GAMLSS and
component-wise gradient boosting (Section~\ref{sec:boost-gamlss-models}). In
Section~\ref{sec:india}, we will introduce the \code{india} data set, which is
shipped with the \proglang{R} package \pkg{gamboostLSS}. We present the
infrastructure of \pkg{gamboostLSS}, discuss model comparison methods and model
tuning, and will show how the package can be used to build regression models in
the GAMLSS framework (Section~\ref{sec:package}). In particular, we will give a
step-by-step introduction to \pkg{gamboostLSS} by fitting a flexible GAMLSS
model to the \code{india} data. In addition, we will present a variety of
convenience functions, including methods for the selection of tuning parameters,
prediction and the visualization of results (Section~\ref{sec:methods}).

\section[A toy example]{A toy example}
\label{sec:toy-example}

Before we discuss the theoretical aspects of the \emph{gamboostLSS} algorithm
and the details of the implementation, we present a short, illustrative toy
example. This highlights the ease of use of the \pkg{gamboostLSS} package in
simple modeling situations. Before we start, we load the package
<<load_package>>=
library("gamboostLSS")
@
Note that \pkg{gamboostLSS} 1.2-0 or newer is needed. We simulate data
from a heteroscedastic normal distribution, i.e., both the mean and the variance
depend on covariates:
<<simulate_data>>=
set.seed(1907)
n <- 150
x1 <- rnorm(n)
x2 <- rnorm(n)
x3 <- rnorm(n)
toydata <- data.frame(x1 = x1, x2 = x2, x3 = x3)
toydata$y <- rnorm(n, mean = 1 + 2 * x1 - x2,
                      sd = exp(0.5 - 0.25 * x1 + 0.5 * x3))
@
Next we fit a linear model for location, scale and shape to the simulated data
<<modelfitting>>=
lmLSS <- glmboostLSS(y ~ x1 + x2 + x3, data = toydata)
@
and extract the coefficients using \code{coef(lmLSS)}. When we add the offset
(i.e., the starting values of the fitting algorithm) to the intercept, we obtain
<<coefficients2>>=
coef(lmLSS, off2int = TRUE)
@
Usually, model fitting involves additional tuning steps, which are skipped here
for the sake of simplicity (see Section~\ref{sec:model_tuning} for details).
Nevertheless, the coefficients coincide well with the true effects, which are
$\beta_\mu = (1, 2, -1, 0)$ and $\beta_\sigma = (0.5, - 0.25, 0, 0.5)$. To get a
graphical display, we \code{plot} the resulting model
<<coef_paths, eval = false>>=
par(mfrow = c(1, 2), mar = c(4, 4, 2, 5))
plot(lmLSS, off2int = TRUE)
@

\setkeys{Gin}{width = 0.9\textwidth}
\begin{figure}[ht!]
  \centering
<<plot_coef_paths, echo = FALSE,  fig = TRUE, width = 10, height = 4.5>>=
<<coef_paths>>
@
\caption{Coefficient paths for linear LSS models, which depict the
    change of the coefficients over the iterations of the algorithm.}
\end{figure}

To extract fitted values for the mean, we use the function \code{fitted()} with
the argument \code{parameter = "mu"}. The results are very similar to the true values:
<<>>=
muFit <- fitted(lmLSS, parameter = "mu")
rbind(muFit, truth = 1 + 2 * x1 - x2)[, 1:5]
@
The same can be done for the standard deviation, but we need to make sure that
we apply the response function (here $\exp(\eta)$) to the fitted values by
additionally using the option \code{type = "response"}:
<<>>=
sigmaFit <- fitted(lmLSS, parameter = "sigma", type = "response")[, 1]
rbind(sigmaFit, truth = exp(0.5 - 0.25 * x1 + 0.5 * x3))[, 1:5]
@

For new observations stored in a data set \code{newData} we could use
\code{predict(lmLSS, newdata = newData)} essentially in the same way. As
presented in Section~\ref{sec:methods}, the complete distribution could also be
depicted as marginal prediction intervals via the function \code{predint()}.
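As a small sketch, the following constructs a hypothetical data set \code{newData}
(the values are made up for illustration) and extracts predictions for both
distribution parameters on the response scale:
\begin{Sinput}
R> newData <- data.frame(x1 = c(-1, 0, 1), x2 = 0, x3 = 0)
R> predict(lmLSS, newdata = newData, parameter = "mu", type = "response")
R> predict(lmLSS, newdata = newData, parameter = "sigma", type = "response")
\end{Sinput}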

\section[Boosting GAMLSS models]{Boosting GAMLSS models}
\label{sec:boost-gamlss-models}

\emph{GamboostLSS} is an algorithm to fit GAMLSS models via component-wise
gradient boosting \citep{mayretal} adapting an earlier strategy by
\cite{schmidetal}. While the concept of boosting emerged from the field of
supervised machine learning, boosting algorithms are nowadays often applied as
flexible alternative to estimate and select predictor effects in statistical
regression models \citep[statistical boosting,][]{mayr_boosting_part1}. The key
idea of statistical boosting is to iteratively fit the different predictors with
simple regression functions (base-learners) and combine the estimates to an
additive predictor. In case of gradient boosting, the base-learners are fitted
to the negative gradient of the loss function; this procedure can be described
as gradient descent in function space \citep{BuhlmannHothorn07}. For GAMLSS, we
use the negative log-likelihood as loss function. Hence, the negative gradient
of the loss function equals the (positive) gradient of the log-likelihood. To
avoid confusion we directly use the gradient of the log-likelihood in the
remainder of the article.

To adapt the standard boosting algorithm to fit additive predictors for all
distribution parameters of a GAMLSS we extended the component-wise fitting to
multiple parameter dimensions: In each iteration, \emph{gamboostLSS} calculates
the partial derivatives of the log-likelihood function $l(y, \bm{\theta})$ with
respect to each of the additive predictors $\eta_{\theta_k}$, $k=1,\ldots,K$.
The predictors are related to the parameter vector $\bm{\theta} =
(\theta_k)^\top_{k = 1,\ldots,K}$ via parameter-specific link functions $g_k$,
$\theta_k = g_k^{-1}(\eta_{\theta_k})$. Typically, we have at most $K = 4$
distribution parameters \citep{rs}, but in principle more are possible. The
predictors are updated successively in each iteration, while the current
estimates of the other distribution parameters are used as offset values. A
schematic representation of the updating process of \emph{gamboostLSS} with four
parameters in iteration $m+1$ looks as follows:

\begin{alignat*}{8}
   \frac{\partial}{\partial \eta_{\mu}}\, l(&y, \hat{\mu}^{[m]}&,& \hat{\sigma}^{[m]}&,& \hat{\nu}^{[m]}&,&
  \hat{\tau}^{[m]}&)
  &\quad \stackrel{\rm update}{\longrightarrow}&  \hat{\eta}_\mu^{[\emph{m+1}]}    \Longrightarrow  \hat{\mu}^{[\emph{m+1}]} \ ,& \\
%
   \frac{\partial}{\partial \eta_{\sigma}}\, l(&y, \hat{\mu}^{[\emph{m+1}]}&,& \hat{\sigma}^{[m]}&,& \hat{\nu}^{[m]}&,&
  \hat{\tau}^{[m]}&)
  &\quad \stackrel{\rm update }{\longrightarrow} \quad &  \hat{\eta}_\sigma^{[\emph{m+1}]}    \Longrightarrow  \hat{\sigma}^{[\emph{m+1}]} \ ,& \\
%
   \frac{\partial}{\partial \eta_{\nu}}\, l(&y, \hat{\mu}^{[\emph{m+1}]}&,& \hat{\sigma}^{[\emph{m+1}]}&,&
  \hat{\nu}^{[m]}&,& \hat{\tau}^{[m]}&)
  &\quad \stackrel{\rm update }{\longrightarrow}&  \hat{\eta}_\nu^{[\emph{m+1}]}    \Longrightarrow  \hat{\nu}^{[\emph{m+1}]} \ ,  \\
%
   \frac{\partial}{\partial \eta_{\tau}}\, l(&y, \hat{\mu}^{[\emph{m+1}]}&,& \hat{\sigma}^{[\emph{m+1}]}&,&
  \hat{\nu}^{[\emph{m+1}]}&,& \hat{\tau}^{[m]}&)
  &\quad \stackrel{\rm update}{\longrightarrow}& \hat{\eta}_\tau^{[\emph{m+1}]} \Longrightarrow
  \hat{\tau}^{[\emph{m+1}]} \ .
\end{alignat*}


The algorithm hence circles through the different parameter dimensions: in every
dimension, it carries out one boosting iteration, updates the corresponding
additive predictor and includes the new prediction in the loss function for the
next dimension.

As in classical statistical boosting, inside each boosting iteration only the
best fitting base-learner is included in the update. Typically, each
base-learner corresponds to one component of $\mathbf{X}$, and in every boosting
iteration only a small proportion (a typical value of the \textit{step-length}
is 0.1) of the fit of the selected base-learner is added to the current additive
predictor $\eta^{[m]}_{\theta_k}$. This procedure effectively leads to
data-driven variable selection which is controlled by the stopping iterations
$\bm{m}_{\text{stop}} = (m_{\text{stop},1},...,m_{\text{stop},K})^{\top}$: Each
additive predictor $\eta_{\theta_k}$ is updated until the corresponding stopping
iteration $m_{\text{stop},k}$ is reached. If $m$ is greater than
$m_{\text{stop},k}$, the $k$th distribution parameter dimension is no longer
updated. Predictor variables that have never been selected up to iteration
$m_{\text{stop},k}$ are effectively excluded from the resulting model. The
vector $\bm{m}_{\text{stop}}$ is a tuning parameter that can, for example, be
determined using multi-dimensional cross-validation (see
Section~\ref{sec:model_tuning} for details). A discussion of model comparison
methods and diagnostic checks can be found in
Section~\ref{sec:model_complexity}. The complete \emph{gamboostLSS} algorithm
can be found in Appendix~\ref{algorithm} and is described in detail in
\cite{mayretal}.

\paragraph{Scalability of boosting algorithms} One of the main advantages of
boosting algorithms in practice, besides the automated variable selection, is
their applicability in situations with more variables than observations ($p >
n$). Despite the growing model complexity, the run time of boosting algorithms
for GAMs increases only linearly with the number of base-learners
\citep{BuehlmannYu2007}. An evaluation of computing times for up to $p = 10000$
predictors can be found in \cite{BinMueSch2012}. In the case of boosting GAMLSS, the
computational complexity additionally increases with the number of distribution
parameters $K$. For an example of the performance of \emph{gamboostLSS} in the case
of $p > n$, see the simulation studies provided in \cite{mayretal}. To speed up
computations for the tuning of the algorithm via cross-validation or resampling,
\pkg{gamboostLSS} incorporates parallel computing (see
Section~\ref{sec:model_tuning}).

\section{Childhood malnutrition in India}
\label{sec:india}

Eradicating extreme poverty and hunger is one of the Millennium Development
Goals that all 193 member states of the United Nations have agreed to achieve by
the year 2015. Yet, even in democratic, fast-growing emerging countries like
India, which is one of the biggest global economies, malnutrition of children is
still a severe problem in some parts of the population. Childhood malnutrition
in India, however, is not necessarily a consequence of extreme poverty but can
also be linked to low educational levels of parents and cultural factors
\citep{india_nut}. Following a bulletin of the World Health Organization, growth
assessment is the best available way to define the health and nutritional status
of children \citep{who}. Stunted growth is defined as a reduced growth rate
compared to a standard population and is considered the first consequence of
malnutrition of the mother during pregnancy, or malnutrition of the child during
the first months after birth. Stunted growth is often measured via a $Z$ score
that compares the anthropometric measures of the child with a reference
population:
\begin{eqnarray*}
Z_i = \frac{\text{AI}_i - \text{MAI}}{s}
\end{eqnarray*}
In our case, the individual anthropometric indicator ($\text{AI}_i$) will be the
height of the child $i$, while MAI and $s$ are the median and the standard
deviation of the height of children in a reference population. This $Z$ score
will be denoted as \textit{stunting score} in the following. Negative values of
the score indicate that the child's growth is below the expected growth of a
child with normal nutrition.

The stunting score will be the outcome (response) variable in our application:
we analyze the relationship of the mother's and the child's body mass index
(BMI) and age with stunted growth resulting from malnutrition in early
childhood. Furthermore, we will investigate regional differences by also including
the district of India in which the child is growing up. The aim of the
analysis is both to explain the underlying structure in the data and to
develop a prediction model for children growing up in India. A prediction rule,
based also on regional differences, could help to increase awareness for the
individual risk of a child to suffer from stunted growth due to malnutrition.
For an in-depth analysis on the multi-factorial nature of child stunting in
India, based on boosted quantile regression, see \citet{Fenske:2011:JASA}, and
\citet{fenske2013plos}.

The data set that we use in this analysis is based on the Standard Demographic
and Health Survey, 1998-99, on malnutrition of children in India, which can be
downloaded after registration from \url{http://www.measuredhs.com}. For
illustrative purposes, we use a random subset of \Sexpr{nrow(india)}
observations from the original data (approximately 12\%) and only a (very small)
subset of variables. For details on the data set and the data source see the
help file of the \code{india} data set in the \pkg{gamboostLSS} package and
\citet{FahrmeirKneib:2011}.

\paragraph{Case study: Childhood malnutrition in India}

First of all we load the data sets \code{india} and \code{india.bnd} into the
workspace. The first data set includes the outcome and \Sexpr{ncol(india) - 2}
explanatory variables. The latter data set consists of a special boundary file
containing the neighborhood structure of the districts in India.

<<load_package_and_data>>=
data("india")
data("india.bnd")
names(india)
@

The outcome variable \code{stunting} is depicted with its spatial structure in
Figure~\ref{fig:india}. An overview of the data set can be found in
Table~\ref{tab:summary_india}. One can clearly see a trend towards malnutrition
in the data set as even the 75\% quantile of the stunting score is below zero.
\hfill $\vardiamond$

\setkeys{Gin}{width = 0.45\textwidth}
\begin{figure}[ht!]
  \centering
<<india_stunting1, echo = FALSE,  fig = TRUE, width = 5, height = 4.5>>=
library("R2BayesX")
## plot mean
FUN <- mean
india_agg <- data.frame(mcdist = names(tapply(india$stunting, india$mcdist, FUN,
                                              na.rm = TRUE)),
                        stunting = tapply(india$stunting, india$mcdist, FUN,
                                          na.rm = TRUE))
par(mar = c(1, 0, 2, 0))
plotmap(india.bnd, india_agg, pos = "bottomright",
        range = c(-5.1, 1.50), main = "Mean", mar.min = NULL)
@
\hfill
<<india_stunting2, echo = FALSE,  fig = TRUE, width = 5, height = 4.5>>=
## plot sd
FUN <- sd
india_agg <- data.frame(mcdist = names(tapply(india$stunting, india$mcdist, FUN,
                                              na.rm = TRUE)),
                        stunting = tapply(india$stunting, india$mcdist, FUN,
                                          na.rm = TRUE))
## remove missing values (= no variation)
india_agg <- india_agg[complete.cases(india_agg),]
par(mar = c(1, 0, 2, 0))
plotmap(india.bnd, india_agg, pos = "bottomright",
        range = c(0, 4.00), main = "Standard deviation")
@
\caption{Spatial structure of stunting in India. The raw mean per district is
  given in the left figure, ranging from dark blue (low stunting score), to dark
  red (higher scores). The right figure depicts the standard deviation of the
  stunting score in the district, ranging from dark blue (no variation) to dark
  red (maximal variability). Dashed regions represent regions without
  data.}\label{fig:india}
\end{figure}

\begin{table}[h!]
  \centering
  \begin{tabular}{llrrrrrr}
<<summary, results = tex, echo = false>>=
out <- data.frame(Description = c("Stunting", "BMI (child)", "Age (child; months)",
                                 "BMI (mother)", "Age (mother; years)"),
                  Variable = paste0("\\code{", c(names(india)[1:5]),"}"),
                  "Min." = apply(india[, 1:5], 2, min),
                  "25\\% Quantile" = apply(india[, 1:5], 2, quantile, p = 0.25),
                  "Median" = apply(india[, 1:5], 2, median),
                  "Mean" = apply(india[, 1:5], 2, mean),
                  "75\\% Quantile" = apply(india[, 1:5], 2, quantile, p = 0.75),
                  "Max." = apply(india[, 1:5], 2, max))
names(out)[c(4, 7)] <- c("25\\% Qu.", "75\\% Qu.")
names(out)[1:2] <- ""
out[, 3:8] <- round(out[, 3:8], 2)
cat("\\toprule \n")
cat(paste(colnames(out), collapse = " & "), "\\\\ \n \\cmidrule{3-8}", "\n")
out <- apply(out, 1, function(x) cat(paste(x, collapse = " & "), " \\\\ \n"))
cat("\\bottomrule \n")
@
  \end{tabular}
  \caption{Overview of \code{india} data.\label{tab:summary_india}}
\end{table}
\pagebreak[3]

\section[The Package gamboostLSS]{The package \pkg{gamboostLSS}}
\label{sec:package}
The \emph{gamboostLSS} algorithm is implemented in the publicly available
\proglang{R} add-on package \pkg{gamboostLSS} \citep{pkg:gamboostLSS:1.2-0}. The
package makes use of the fitting algorithms and some of the infrastructure of
\pkg{mboost}
\citep{BuhlmannHothorn07,Hothorn+Buehlmann+Kneib+Schmid+Hofner:mboost:2010,pkg:mboost:2.4-0}.
Furthermore, many naming conventions and features are implemented in analogy to
\pkg{mboost}. By relying on the \pkg{mboost} package, \pkg{gamboostLSS}
incorporates a wide range of base-learners and hence offers a great flexibility
when it comes to the types of predictor effects on the parameters of a GAMLSS
distribution. In addition to making the infrastructure available for GAMLSS,
\pkg{mboost} constitutes a well-tested, mature software package in the back end.
For the users of \pkg{mboost}, \pkg{gamboostLSS} offers the advantage of
providing a drastically increased number of possible distributions to be fitted
by boosting.

As a consequence of this partial dependency on \pkg{mboost}, we recommend that
users of \pkg{gamboostLSS} familiarize themselves with \pkg{mboost} before using
\pkg{gamboostLSS}. To make this tutorial self-contained, we briefly explain all
relevant features here as well. However, a dedicated hands-on tutorial is available
for an applied introduction to \pkg{mboost} \citep{Hofner:mboost:2014}.

\subsection{Model fitting}\label{sec:model-fitting}

Linear models can be fitted using the function \code{glmboostLSS()}; for all
kinds of structured additive models, the function \code{gamboostLSS()} can be
used. The function calls are as follows:

\begin{Sinput}
R> glmboostLSS(formula, data = list(), families = GaussianLSS(),
+              control = boost_control(), weights = NULL, ...)
R> gamboostLSS(formula, data = list(), families = GaussianLSS(),
+              control = boost_control(), weights = NULL, ...)
\end{Sinput}

Note that here and in the remainder of the paper we sometimes focus on the most
relevant (or most interesting) arguments of a function only. Further arguments
might exist. Thus, for a complete list of arguments and their description we
refer the reader to the respective help file.

The \code{formula} can consist of a single \code{formula} object, yielding the
same candidate model for all distribution parameters. For example,
\begin{Sinput}
R> glmboostLSS(y ~ x1 + x2 + x3, data = toydata)
\end{Sinput}
specifies linear models with predictors \code{x1} to \code{x3} for all GAMLSS
parameters (here $\mu$ and $\sigma$ of the Gaussian distribution). As an
alternative, one can also use a named list to specify different candidate models
for different parameters, e.g.,
\begin{Sinput}
R> glmboostLSS(list(mu = y ~ x1 + x2, sigma = y ~ x1 + x3), data = toydata)
\end{Sinput}
fits a linear model with predictors \code{x1} and \code{x2} for the \code{mu}
component and a linear model with predictors \code{x1} and \code{x3} for the
\code{sigma} component. As for all \proglang{R} functions with a formula
interface, one must specify the data set to be used (argument \code{data}).
Additionally, \code{weights} can be specified for weighted regression. Instead
of specifying the argument \code{family} as in \pkg{mboost} and other modeling
packages, the user needs to specify the argument \code{families}, which
basically consists of a list of sub-families, i.e., one family for each of the
GAMLSS distribution parameters. These sub-families define the parameters of the
GAMLSS distribution to be fitted. Details are given in the next section.
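For example, to fit the same linear predictors under the three-parameter Student's
$t$ distribution instead of the default Gaussian, one could use (a sketch based on
the toy data from Section~\ref{sec:toy-example}):
\begin{Sinput}
R> glmboostLSS(y ~ x1 + x2 + x3, data = toydata, families = StudentTLSS())
\end{Sinput}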

The initial number of boosting iterations as well as the step-lengths
($\nu_{\text{sl}}$; see Appendix~\ref{algorithm}) are specified via the function
\code{boost\_control()} with the same arguments as in \pkg{mboost}. However, in
order to give the user the possibility to choose different values for each
additive predictor (corresponding to the different parameters of a GAMLSS), they
can be specified via a vector or list. Preferably a \emph{named} vector or list
should be used, where the names correspond to the names of the sub-families. For
example, one can specify:

\begin{Sinput}
R> boost_control(mstop = c(mu = 100, sigma = 200),
+                nu = c(mu = 0.2, sigma = 0.01))
\end{Sinput}

Specifying a single value for the stopping iteration \code{mstop} or the
step-length \code{nu} results in equal values for all sub-families. The defaults
are \code{mstop = 100} for the initial number of boosting iterations and \code{nu
  = 0.1} for the step-length. Additionally, the user can specify if status
information should be printed by setting \code{trace = TRUE} in
\code{boost_control}. Note that the argument \code{nu} can also refer to one of
the GAMLSS distribution parameters in some families (and is also used in
\pkg{gamlss} as the name of a distribution parameter). In \code{boost\_control},
however, \code{nu} always represents the step-length $\nu_{\text{sl}}$.
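As a further illustration (a sketch), a single value for \code{mstop} and
\code{nu} applies to all sub-families; the following call additionally switches
on the printing of status information and, for the two-parameter Gaussian family,
is equivalent to \code{mstop = c(mu = 50, sigma = 50)} and
\code{nu = c(mu = 0.1, sigma = 0.1)}:
\begin{Sinput}
R> boost_control(mstop = 50, nu = 0.1, trace = TRUE)
\end{Sinput}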

\subsection{Distributions}\label{sec:distributions}

Some GAMLSS distributions are directly implemented in the \proglang{R} add-on
package \pkg{gamboostLSS} and can be specified via the \code{families} argument
in the fitting functions \code{gamboostLSS()} and \code{glmboostLSS()}. An
overview of the implemented families is given in
Table~\ref{tab:gamboostlss_families}. The parametrization of the negative
binomial distribution, the log-logistic distribution and the $t$ distribution in
boosted GAMLSS models is given in \citet{mayretal}. The derivation of boosted
beta regression, another special case of GAMLSS, can be found in
\cite{schmid2013beta}. In our case study we will use the default
\code{GaussianLSS()} family to model childhood malnutrition in India. The
resulting object of the family looks as follows:

<<families>>=
str(GaussianLSS(), 1)
@

We obtain a list of class \code{"families"} with two sub-families, one for the
$\mu$ parameter of the distribution and another one for the $\sigma$ parameter.
Each of the sub-families is of type \code{"boost_family"} from package
\pkg{mboost}. Attributes specify the name and the quantile function
(\code{"qfun"}) of the distribution.

In addition to the families implemented in the \pkg{gamboostLSS} package, there
are many more possible GAMLSS distributions available in the \pkg{gamlss.dist}
package \citep{pkg:gamlss.dist:4.3-0}. In order to make our boosting approach
available for these distributions as well, we provide an interface to
automatically convert available distributions of \pkg{gamlss.dist} to objects of
class \code{"families"} to be usable in the boosting framework via the function
\code{as.families()}. As input, either a character string naming the
\code{"gamlss.family"} or the family function itself is required. The function
\code{as.families()} then automatically constructs a \code{"families"} object
for the \pkg{gamboostLSS} package. To use for example the gamma family as
parametrized in \pkg{gamlss.dist}, one can simply use \code{as.families("GA")}
and plug this into the fitting algorithms of \pkg{gamboostLSS}:

\begin{Sinput}
R> gamboostLSS(y ~ x, families = as.families("GA"))
\end{Sinput}

\begin{landscape}
  \begin{table}[h!]
    \centering
    \begin{tabular}{lllllllp{0.5\textwidth}}
      \toprule
      & &  Name & Response & $\mu$ & $\sigma$  & $\nu$  & Note\\
      \cmidrule{2-8}

      \multicolumn{8}{l}{\textbf{Continuous response}} \\
      & Gaussian & \code{GaussianLSS()} & cont.  & id & log & & \\
      & Student's $t$  & \code{StudentTLSS()} & cont.   & id & log & log & The 3rd parameter is denoted by \code{df} (degrees of freedom). \\
      \cmidrule{2-8}

      \multicolumn{8}{l}{\textbf{Continuous non-negative response}} \\
      & Gamma & \code{GammaLSS()} & cont. $>0$  & log & log & & \\
      \cmidrule{2-8}

      \multicolumn{8}{l}{\textbf{Fractions and bounded continuous response}} \\
      & Beta & \code{BetaLSS()} & $\in (0,1)$  & logit & log & & The 2nd parameter is denoted by \code{phi}.\\
      \cmidrule{2-8}

      \multicolumn{8}{l}{\textbf{Models for count data}} \\
      & Negative binomial & \code{NBinomialLSS()} & count  & log & log &   & For over-dispersed count data.\\
      & Zero inflated Poisson & \code{ZIPoLSS()} & count  & log & logit & & For zero-inflated count data; the 2nd parameter is the probability parameter of the zero mixture component.\\
      & Zero inflated neg. binomial & \code{ZINBLSS()} & count  & log & log & logit & For over-dispersed and zero-inflated count data; the 3rd parameter is the probability parameter of the zero mixture component.\\
      \cmidrule{2-8}

      \multicolumn{8}{l}{\textbf{Survival models} \citep[accelerated failure time models; see, e.g.,][]{klein03}} \\
      & Log-normal & \code{LogNormalLSS()} & cont. $>0$ & id & log &&
      \multirow{3}{8cm}{All three families assume that the data are subject to
        right-censoring. Therefore the response must be a \code{Surv()} object.}\\
      & Weibull & \code{WeibullLSS()} & cont. $>0$ & id & log &&  \\
      & Log-logistic & \code{LogLogLSS()} & cont. $>0$ & id & log &&\\
      \bottomrule
  \end{tabular}
  \caption{Overview of \code{"families"} that are implemented in
    \pkg{gamboostLSS}. For every distribution parameter the  corresponding
    link-function is displayed (id = identity link).} \label{tab:gamboostlss_families}
\end{table}
\end{landscape}

With this interface, it is possible to apply boosting for any distribution
implemented in \pkg{gamlss.dist} and for all new distributions that will be
added in the future. Note that one can also fit censored or truncated
distributions by using \code{gen.cens()} \citep[from package
\pkg{gamlss.cens}; see][]{pkg:gamlss.cens:4.2-7} or \code{gen.trun()} \citep[from
package \pkg{gamlss.tr}; see][]{pkg:gamlss.tr:4.2.7}, respectively. An overview of
common GAMLSS distributions is given in Appendix~\ref{sec:additional-families}.
Minor differences in the model fit when applying a pre-specified distribution
(e.g., \code{GaussianLSS()}) and the transformation of the corresponding
distribution from \pkg{gamlss.dist} (e.g., \code{as.families("NO")}) can be
explained by possibly different offset values.
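As a rough sketch (not run) of the censored case, one could generate a
right-censored version of the normal distribution with \code{gen.cens()} and pass
it to \code{as.families()}; here we assume that the generated family function is
called \code{NOrc} and that \code{y}, \code{cens} and \code{x} are hypothetical
variables in a data set \code{dat}, with the response given as a \code{Surv()}
object:
\begin{Sinput}
R> library("survival")
R> library("gamlss.cens")
R> gen.cens(NO, type = "right")  ## assumed to create the family function NOrc()
R> gamboostLSS(Surv(y, cens) ~ bbs(x), data = dat,
+              families = as.families(NOrc()))
\end{Sinput}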

\subsection{Base-learners}\label{sec:base-learners}

For the base-learners, which carry out the fitting of the gradient vectors using
the covariates, the \pkg{gamboostLSS} package completely depends on the
infrastructure of \pkg{mboost}. Hence, every base-learner which is available in
\pkg{mboost} can also be applied to fit GAMLSS distributions via
\pkg{gamboostLSS}. The choice of base-learners is crucial for the application of
the \emph{gamboostLSS} algorithm, as they define the type(s) of effect(s) that
covariates will have on the predictors of the GAMLSS distribution parameters.
See \citet{Hofner:mboost:2014} for details and application notes on the
base-learners.

The available base-learners include simple linear models for \textit{linear}
effects and penalized regression splines \citep[\textit{P}-splines,
][]{Eilers1996} for \textit{non-linear} effects. \textit{Spatial} or other
\textit{bivariate} effects can be incorporated by setting up a bivariate tensor
product extension of P-splines for two continuous variables \citep{kneibetal}.
Another way to include spatial effects is the adaptation of Markov random fields
for modeling a neighborhood structure \citep{sobotka12} or radial basis
functions \citep{Hofner:Dissertation:2011}. \textit{Constrained} effects such as
monotonic or cyclic effects can be specified as well
\citep{Hofner:monotonic:2011,Hofner:constrained:2014}. \textit{Random} effects
can be taken into account by using ridge-penalized base-learners for fitting
categorical grouping variables such as random intercepts or slopes \citep[see
supplementary material of][]{kneibetal}.
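To give an impression of how such base-learners can be combined, the following
sketch (not run) specifies a linear effect, a P-spline effect, a random intercept
and a bivariate spatial effect; the variables \code{x1}, \code{x2}, \code{id},
\code{lon}, \code{lat} and the data set \code{d} are hypothetical:
\begin{Sinput}
R> gamboostLSS(y ~ bols(x1) +            # linear effect
+                  bbs(x2) +             # smooth non-linear (P-spline) effect
+                  brandom(id) +         # random intercept for grouping factor id
+                  bspatial(lon, lat),   # bivariate tensor-product P-spline
+              data = d, families = GaussianLSS())
\end{Sinput}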

\paragraph{Case study (\textit{cont'd}): Childhood malnutrition in India}
\hfill\newline First, we are going to set up and fit our model. Usually, one
could use \code{bmrf(mcdist, bnd = india.bnd)} to specify the spatial
base-learner using a Markov random field. However, as it is relatively
time-consuming to compute the neighborhood matrix from the boundary file and as
we need it several times, we pre-compute it once. Note that \pkg{R2BayesX}
\citep{pkg:R2BayesX:0.3-1} needs to be loaded in order to use this function:

<<compute_neighborhood, results=hide>>=
library("R2BayesX")
neighborhood <- bnd2gra(india.bnd)
@

The other effects can be directly specified without further care. We use smooth
effects for the age (\code{mage}) and BMI (\code{mbmi}) of the mother and smooth
effects for the age (\code{cage}) and BMI (\code{cbmi}) of the child. Finally,
we specify the spatial effect for the district in India where mother and child
live (\code{mcdist}).

We set the options

<<options>>=
ctrl <- boost_control(trace = TRUE, mstop = c(mu = 1269, sigma = 84))
@

and fit the boosting model

<<modeling, eval = false>>=
mod_nonstab <- gamboostLSS(stunting ~ bbs(mage) + bbs(mbmi) +
                             bbs(cage) + bbs(cbmi) +
                             bmrf(mcdist, bnd = neighborhood),
                           data = india,
                           families = GaussianLSS(),
                           control = ctrl)
@

<<run_modeling, echo = FALSE, results = hide>>=
## abbreviate output to use less manuscript space
out <- capture.output(
<<modeling>>
)
out <- out[c(1:5, 33:35)]
out[3:5] <- c("", "(...)", "")
@
<<print_modeling_results, echo = FALSE>>=
writeLines(out)
@

We specified the initial number of boosting iterations as \code{mstop = c(mu =
  \Sexpr{mstop(mod_nonstab)["mu"]}, sigma =
  \Sexpr{mstop(mod_nonstab)["sigma"]})}, i.e., we used
$\Sexpr{mstop(mod_nonstab)["mu"]}$ boosting iterations for the $\mu$ parameter
and only $\Sexpr{mstop(mod_nonstab)["sigma"]}$ for the $\sigma$ parameter. This
means that we cycle between the $\mu$ and $\sigma$ parameters until we have
computed $\Sexpr{mstop(mod_nonstab)["sigma"]}$ update steps in both sub-models.
Subsequently, we update only the $\mu$ model and leave the $\sigma$ model
unchanged. The selection of these tuning parameters will be
discussed in the next section. \hfill $\vardiamond$\\

Instead of optimizing the gradients per GAMLSS parameter in each boosting
iteration, one can potentially stabilize the estimation further by standardizing
the gradients in each step. Details and an explanation are given in
Appendix~\ref{sec:stab-ngrad}.

\paragraph{Case study (\textit{cont'd}): Childhood malnutrition in India}
\label{page:stabilization}

We now refit the model with the built-in median absolute deviation (MAD)
stabilization by setting \code{stabilization = "MAD"} in the definition of the
families:

<<modeling2, eval = FALSE>>=
mod <- gamboostLSS(stunting ~ bbs(mage) + bbs(mbmi) +
                     bbs(cage) + bbs(cbmi) +
                     bmrf(mcdist, bnd = neighborhood),
                   data = india,
                   families = GaussianLSS(stabilization = "MAD"),
                   control = ctrl)
@

<<run_modeling2, echo = FALSE, results = hide>>=
## abbreviate output to use less manuscript space
out <- capture.output(
<<modeling2>>
)
out <- out[c(1:5, 33:35)]
out[3:5] <- c("", "(...)", "")
@
<<print_modeling2_results, echo = FALSE>>=
writeLines(out)
@

One can clearly see that the stabilization changes the model and reduces the
intermediate and final risks. \hfill $\vardiamond$

\subsection[Model complexity and diagnostic checks]{Model complexity and
  diagnostic checks}\label{sec:model_complexity}

Measuring the complexity of a GAMLSS is a crucial issue for model building and
parameter tuning, especially with regard to the determination of optimal
stopping iterations for gradient boosting (see next section). In the GAMLSS
framework, valid measures of the complexity of a fitted model are even more
important than in classical regression, since variable selection and model
choice have to be carried out in \emph{several} additive predictors within the
same model.

In the original work by \cite{rs}, the authors suggested evaluating AIC-type
information criteria to measure the complexity of a GAMLSS. Regarding the
complexity of a classical boosting fit with one predictor, AIC-type measures are
available for a limited number of distributions
\citep[see][]{BuhlmannHothorn07}. Still, there is no commonly accepted approach
to measure the degrees of freedom of a boosting fit, even in the classical
framework with only one additive predictor. This is mostly due to the
algorithmic nature of gradient boosting, which results in regularized model fits
for which complexity is difficult to evaluate \citep{Hast:comment:2007}. As a
consequence, the problem of deriving valid (and easy-to-compute) complexity
measures for boosting remains largely unsolved
\citep[Sec.~4]{Kneib:comment:2014}.

In view of these considerations, and because it is not possible to use the
original information criteria specified for GAMLSS in the \emph{gamboostLSS}
framework, \cite{mayretal} suggested using cross-validated estimates of the
empirical risk (i.e., of the predicted log-likelihood) to measure the complexity
of \emph{gamboostLSS} fits. Although this strategy is computationally expensive
and might be affected by the properties of the used cross-validation technique,
it is universally applicable to all \pkg{gamboostLSS} families and does not rely
on possibly biased estimators of the effective degrees of freedom. We therefore
decided to implement various resampling procedures in the function
\code{cvrisk()} to estimate model complexity of a \code{gamboostLSS} fit via
cross-validated empirical risks (see next section).

A related problem is to derive valid diagnostic checks to compare different
families or link functions. For the original GAMLSS method, \cite{rs} proposed
to base diagnostic checks on normalized quantile residuals. In the boosting
framework, however, residual checks are generally difficult to derive because
boosting algorithms result in regularized fits that reflect the trade-off
between bias and variance of the effect estimators. As a consequence, residuals
obtained from boosting fits usually contain a part of the remaining structure of
the predictor effects, rendering an unbiased comparison of competing model
families via residual checks a highly difficult issue. While it is of course
possible to compute residuals from \code{gamboostLSS} models, valid comparisons
of competing models are more conveniently obtained by considering estimates of
the predictive risk.

\paragraph{Case study (\textit{cont'd}): Childhood malnutrition in India}
To extract the empirical risk in the last boosting iteration (i.e., in the last
step) of the model which was fitted with stabilization (see
Page~\pageref{page:stabilization}) one can use
<<>>=
emp_risk <- risk(mod, merge = TRUE)
tail(emp_risk, n = 1)
@
and compare it to the risk of the non-stabilized model
<<>>=
emp_risk_nonstab <- risk(mod_nonstab, merge = TRUE)
tail(emp_risk_nonstab, n = 1)
@
In this case, the stabilized model has a lower (in-bag) risk than the
non-stabilized model. Note that usually both models should be tuned before the
empirical risk is compared. Here the comparison merely shows that the risk of the
stabilized model decreases more quickly.

To compare the risk on new data sets, i.e., the predictive risk, one could
combine all data in one data set and use weights that equal zero for the new
data. Let us fit the model only on a random subset of 2000 observations. To
extract the risk for observations with zero weights, we need to additionally set
\code{risk = "oobag"}.
<<>>=
weights <- sample(c(rep(1, 2000), rep(0, 2000)))
mod_subset <- update(mod, weights = weights, risk = "oobag")
@
Note that we could also specify the model anew via
<<eval = FALSE>>=
mod_subset <- gamboostLSS(stunting ~ bbs(mage) + bbs(mbmi) +
                     bbs(cage) + bbs(cbmi) +
                     bmrf(mcdist, bnd = neighborhood),
                   data = india,
                   weights = weights,
                   families = GaussianLSS(),
                   control = boost_control(mstop = c(mu = 1269, sigma = 84),
                                           risk = "oobag"))
@
To refit the non-stabilized model we use
<<>>=
mod_nonstab_subset <- update(mod_nonstab,
                             weights = weights, risk = "oobag")
@
Now we extract the predictive risks, which are computed on the 2000 ``new''
observations:
<<>>=
tail(risk(mod_subset, merge = TRUE), 1)
tail(risk(mod_nonstab_subset, merge = TRUE), 1)
@
Again, the stabilized model has a lower predictive risk.\hfill $\vardiamond$

\subsection{Model tuning: Early stopping to prevent overfitting}\label{sec:model_tuning}

As for other component-wise boosting algorithms, the most important tuning
parameter of the \emph{gamboostLSS} algorithm is the stopping iteration
$\bm{m}_{\text{stop}}$ (here a $K$-dimensional vector). In some low-dimensional
settings it might be convenient to let the algorithm run until convergence
(i.e., use a large number of iterations for each of the $K$ distribution
parameters). In these cases, as they are optimizing the same likelihood,
boosting should converge to the same model as \pkg{gamlss} -- at least when the
same penalties are used for smooth effects.

However, in most settings, where the application of boosting is favorable, it is
crucial that the algorithm is not run until convergence but some sort of early
stopping is applied \citep{Mayr:mstop:2012}. Early stopping results in shrunken
effect estimates, which has the advantage that predictions become more stable
since the variance of the estimates is reduced. Another advantage of early
stopping is that \emph{gamboostLSS} has an intrinsic mechanism for data-driven
variable selection, since only the best-fitting base-learner is updated in each
boosting iteration. Hence, the stopping iteration $m_{\text{stop},k}$ controls not
only the amount of shrinkage applied to the effect estimates but also
the complexity of the model for the distribution parameter~$\theta_k$.

To find the optimal complexity, the resulting model should be evaluated
regarding the predictive risk on a large grid of stopping values by
cross-validation or resampling methods, using the function \code{cvrisk()}. In
case of \emph{gamboostLSS}, the predictive risk is computed as the negative log
likelihood of the out-of-bag sample. The search for the optimal
$\bm{m}_{\text{stop}}$ based on resampling is far more complex than for standard
boosting algorithms. Different stopping iterations can be chosen for the
parameters, thus allowing for different levels of complexity in each sub-model
(\emph{multi-dimensional} early stopping). In the package \pkg{gamboostLSS} a
multi-dimensional grid can be easily created utilizing the function
\code{make.grid()}.

In most cases, the $\mu$ parameter is of greatest interest in a GAMLSS
model and thus more care should be taken to accurately model this parameter.
\citet{pkg:gamlss:4.3-0}, the inventors of GAMLSS, stated on the help page for
the function \code{gamlss()}: ``Respect the parameter hierarchy when you are
fitting a model. For example a good model for $\mu$ should be fitted before a
model for $\sigma$ is fitted.'' Consequently, we provide an option
\code{dense_mu_grid} in the \code{make.grid()} function that allows to have a
finer grid for (a subset of) the $\mu$ parameter. Thus, we can better tune the
complexity of the model for $\mu$, which helps to avoid over- or underfitting of
the mean without relying too much on the grid. Details and explanations are given
in the following paragraphs.

\paragraph{Case study (\textit{cont'd}): Childhood malnutrition in India}

We first set up a grid for \code{mstop} values starting at $20$ and going up to
$500$ in $10$ steps that are equidistant on a logarithmic scale:

<<setup_grid1>>=
grid <- make.grid(max = c(mu = 500, sigma = 500), min = 20,
                  length.out = 10, dense_mu_grid = FALSE)
@

Additionally, we can use the \code{dense\_mu\_grid} option to create a dense
grid for $\mu$. This means that we compute the risk for all iterations
$m_{\text{stop},\mu}$ with $m_{\text{stop},\mu} \geq m_{\text{stop},\sigma}$, and
not only for the values on the sparse grid:

<<setup_grid2_code, eval = false>>=
densegrid <- make.grid(max = c(mu = 500, sigma = 500), min = 20,
                       length.out = 10, dense_mu_grid = TRUE)
plot(densegrid, pch = 20, cex = 0.2,
     xlab = "Number of boosting iterations (mu)",
     ylab = "Number of boosting iterations (sigma)")
abline(0,1)
points(grid, pch = 20, col = "red")
@

A comparison and an illustration of the sparse and the dense grids can be found
in Figure~\ref{fig:grid} (left). Red dots refer to all possible combinations of
$m_{\text{stop},\mu}$ and $m_{\text{stop},\sigma}$ on the sparse grid, whereas
the black lines refer to the additional combinations when a dense grid is used.
For a given $m_{\text{stop},\sigma}$, all iterations $m_{\text{stop},\mu} \geq
m_{\text{stop},\sigma}$ (i.e., below the bisecting line) can be computed without
additional computing time. For example, if we fit a model with \code{mstop =
  c(mu = 30, sigma = 15)}, all $m_{\text{stop}}$ combinations on the red path
(Figure~\ref{fig:grid}, right) are computed. Until the point where
$m_{\text{stop}, \mu} = m_{\text{stop}, \sigma}$, we move along the bisecting
line. Then we stop increasing $m_{\text{stop}, \sigma}$ and increase
$m_{\text{stop}, \mu}$ only, i.e., we start moving along a horizontal line.
Thus, all iterations on this horizontal line are computed anyway. Note that it
is quite expensive to move from the computed model to one with \code{mstop =
  c(mu = 30, sigma = 16)}. One cannot simply increase $m_{\text{stop},\sigma}$
by 1 but needs to go along the black path depicted in Figure~\ref{fig:grid}
(right). As the dense grid does not increase the run time (or does so only
marginally), we recommend always using this option, which is also the default.

\setkeys{Gin}{width = 0.45\textwidth}
\begin{figure}[h!]
  \centering
<<setup_grid2, fig = true, echo = false, results = hide>>=
par(mar = c(4, 4, 0.1, 0.1))
<<setup_grid2_code>>
@
\hfill
<<iterations, fig = true, echo = false, results = hide>>=
par(mar = c(4, 4, 0.1, 0.1))
plot(1:30, type = "n", xlab = "Number of boosting iterations (mu)",
     ylab = "Number of boosting iterations (sigma)")
j <- 0
for (i in 0:29) {
    lines(c(i, i + 1), c(j, j), col = "red", lwd = 2)
    points(i, j, col = "red", pch = 20, cex = 1.3)
    if (j < 15) {
        lines(c(i + 1, i + 1), c(j, j + 1), col = "red", lwd = 2)
        points(i + 1, j, col = "red", pch = 20, cex = 1.3)
        j <- j + 1
    }
}
points(30, 15, col = "red", pch = 20, cex = 1.3)
lines(c(16, 16), c(15, 16), lwd = 2)
lines(c(16, 30), c(16, 16), lwd = 2)
points(c(16:30), rep(16, 15), col = "black", pch = 20, cex = 1.3)
abline(0,1)
lines(c(-1, 15), c(15, 15), lty = "dotted")
lines(c(15, 15), c(-1, 15), lty = "dotted")
legend("topleft", legend = c("mstop = c(mu = 30, sigma = 15)", "mstop = c(mu = 30, sigma = 16)"),
       lty = "solid", pch = 20, cex = 1.1, col = c("red", "black"), bty = "n")
@
\caption{\emph{Left:} Comparison between sparse grid (red) and dense $\mu$ grid
  (black horizontal lines in addition to the sparse red grid). \emph{Right:}
  Example of the path of the iteration counts.}\label{fig:grid}
\end{figure}

The \code{dense\_mu\_grid} option also works for asymmetric grids (e.g.,
\code{make.grid(max = c(mu = 100, sigma = 200))}) and for more than two
parameters (e.g., \code{make.grid(max = c(mu = 100, sigma = 200, nu = 20))}).
For an example in the latter case see the help file of \code{make.grid()}.

Now we use the dense grid for cross-validation (or subsampling to be more
precise). The computation of the cross-validated risk using \code{cvrisk()}
takes more than one hour on a 64-bit Ubuntu machine using 2 cores.

<<cvrisk, eval = FALSE, results = hide>>=
cores <- ifelse(grepl("linux|apple", R.Version()$platform), 2, 1)
if (!file.exists("cvrisk/cvr_india.Rda")) {
    set.seed(1907)
    folds <- cv(model.weights(mod), type = "subsampling")
    densegrid <- make.grid(max = c(mu = 5000, sigma = 500), min = 20,
                           length.out = 10, dense_mu_grid = TRUE)
    cvr <- cvrisk(mod, grid = densegrid, folds = folds, mc.cores = cores)
    save("cvr", file = "cvrisk/cvr_india.Rda", compress = "xz")
}
@

By using more computing cores or a larger computer cluster, the speed can easily
be increased. The usage of \code{cvrisk()} is practically identical to that
of \code{cvrisk()} from package \pkg{mboost}. See \citet{Hofner:mboost:2014} for
details on parallelization and grid computing. As Windows does not support
addressing multiple cores from \proglang{R}, we use only one core on Windows,
whereas two cores are used on Unix-based systems. We then load the pre-computed
results of the cross-validated risk:

<<load_cvrisk, echo = TRUE, results = hide, eval = FALSE>>=
load("cvrisk/cvr_india.Rda")
@
\vspace{-2em} \hfill $\vardiamond$

\subsection{Methods to extract and display results}\label{sec:methods}

In order to work with the results, methods to extract information both from
boosting models and the corresponding cross-validation results have been
implemented. Fitted \pkg{gamboostLSS} models (i.e., objects of type
\code{"mboostLSS"}) are lists of \code{"mboost"} objects. The most important
distinction from the methods implemented in \pkg{mboost} is the widespread
occurrence of the additional argument \code{parameter}, which enables the user
to apply a function to all parameters of a fitted GAMLSS model or only to one
(or more) specific parameters.

Most importantly, one can extract the coefficients of a fitted model
(\code{coef()}) or plot the effects (\code{plot()}). Different versions of both
functions are available for linear GAMLSS models (i.e., models of class
\code{"glmboostLSS"}) and for non-linear GAMLSS models (e.g., models with
P-splines). Additionally, the user can extract the risk for all iterations using
the function \code{risk()}. Selected base-learners can be extracted using
\code{selected()}. Fitted values and predictions can be obtained by
\code{fitted()} and \code{predict()}. For details and examples, see the
corresponding help files and \citet{Hofner:mboost:2014}. Furthermore, a special
function for marginal prediction intervals is available (\code{predint()})
together with a dedicated plot function (\code{plot.predint()}).
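
For illustration, a few typical calls that make use of the \code{parameter}
argument are sketched below (using the model \code{mod} from the case study;
output omitted):

<<methods_sketch, eval = FALSE>>=
coef(mod, parameter = "mu")          ## coefficients of the mu submodel only
selected(mod, parameter = "sigma")   ## base-learners selected for sigma
risk(mod, parameter = "mu")          ## risk at each iteration for mu
@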

For cross-validation results (objects of class \code{"cvriskLSS"}), there exists
a function to extract the estimated optimal number of boosting iterations
(\code{mstop()}). The results can also be plotted using a special \code{plot()}
function. Hence, convergence and overfitting behavior can be visually inspected.

In order to increase or reduce the number of boosting steps to the appropriate
number (as, e.g., obtained by cross-validation techniques) one can use the
function \code{mstop()}. If we want to reduce our model, for example, to 10
boosting steps for the \code{mu} parameter and 20 steps for the \code{sigma}
parameter we can use
\begin{Sinput}
R> mstop(mod) <- c(10, 20)
\end{Sinput}
This directly alters the object \code{mod}. Instead of specifying a vector with
separate values for each sub-family one can also use a single value, which then
is used for each sub-family (see Section~\ref{sec:model-fitting}).
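
For instance, the following call (a sketch, not evaluated here) sets the number
of boosting iterations to 100 for both distribution parameters:
\begin{Sinput}
R> mstop(mod) <- 100
\end{Sinput}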

\paragraph{Case study (\textit{cont'd}): Childhood malnutrition in India}

We first inspect the cross-validation results (see Figure~\ref{fig:cvr}):

<<crossvalidation_code, eval = false>>=
plot(cvr)
@

\setkeys{Gin}{width = 0.5\textwidth}
\begin{figure}[h!]
  \centering
<<crossvalidation, fig = true, results = hide, echo = false, eval = FALSE>>=
<<crossvalidation_code>>
@
\includegraphics{fig-crossvalidation.pdf}
\caption{Cross-validated risk. Darker color represents higher predictive risks.
  The optimal combination of stopping iterations is indicated by dashed red
  lines.}\label{fig:cvr}
\end{figure}

If the optimal stopping iteration is close to the boundary of the grid one
should re-run the cross-validation procedure with different \code{max} values
for the grid and/or more grid points. This is not the case here
(Figure~\ref{fig:cvr}). To extract the optimal stopping iteration one can now
use
\begin{Sinput}
R> mstop(cvr)
   mu sigma 
 1269    84 
\end{Sinput}

<<check_initial_assignement, echo = FALSE, results = hide, eval = FALSE>>=
## for the purpose of the tutorial we started already with the optimal number of
## boosting steps. Check if this is really true:
if (!isTRUE(all.equal(mstop(mod), mstop(cvr))))
    warning("Check mstop(mod) throughout the manuscript.")
@

To use the optimal model, i.e., the model with the optimal numbers of boosting
iterations from the cross-validation, we set the model to these values:

<<subset, results = hide, eval = FALSE>>=
mstop(mod) <- mstop(cvr)
@

In the next step, the \code{plot()} function can be used to plot the partial
effects. A partial effect is the effect of a certain predictor only, i.e., all
other model components are ignored for the plot. Thus, the reference level of
the plot is arbitrary and even the actual size of the effect might not be
interpretable; only changes and hence the functional form are meaningful. If no
further arguments are specified, all \emph{selected} base-learners are plotted:

<<effects>>=
par(mfrow = c(2, 5))
plot(mod)
@

Specific base-learners can be plotted using the argument \code{which} (to specify
the base-learner) and the argument \code{parameter} (to specify the parameter,
e.g., \code{"mu"}). Partial matching is used for \code{which}, i.e., one can
specify a sub-string of the base-learners' names. Consequently, all matching
base-learners are selected. Alternatively, one can specify an integer which
indicates the number of the effect in the model formula. Thus

<<smooth_effects_code, eval = FALSE, height = 8>>=
par(mfrow = c(2, 4), mar = c(5.1, 4.5, 4.1, 1.1))
plot(mod, which = "bbs", type = "l")
@

plots \emph{all} P-spline base-learners, irrespective of whether they were
selected. The partial effects in Figure~\ref{fig:smooth_effects} can be interpreted
as follows: The age of the mother seems to have a minor impact on stunting for
both the mean effect and the effect on the standard deviation. With increasing
BMI of the mother, the stunting score increases, i.e., the child is better
nourished. At the same time the variability increases until a BMI of roughly 25
and then decreases again. The age of the child has a negative effect until the
age of approximately 1.5 years (18 months). The variability increases over the
complete range of age. The BMI of the child has a negative effect on stunting,
with the lowest variability at a BMI of approximately 16. While all other effects
can be interpreted quite easily, this effect is more difficult to interpret.
Usually, one would expect that a child that suffers from malnutrition also has a
small BMI. However, the height of the child enters the calculation of the BMI in
the denominator, which means that a lower stunting score (i.e., small height)
should lead on average to higher BMI values if the weight of a child is fixed.

\setkeys{Gin}{width = 0.99\textwidth}
\begin{figure}[h!]
  \centering
<<smooth_effects, fig = true, results = hide, echo = false, width = 8, height = 4>>=
par(cex.axis = 1.3, cex.lab = 1.3, mar = c(4, 5, 2, 1))
<<smooth_effects_code>>
@
\caption{Smooth partial effects of the estimated model with the rescaled
  outcome. The effects for \code{sigma} are estimated and plotted on the
  log-scale (see Equation~\ref{Gaussian:sigma}), i.e., we plot the predictor
  against $\log(\hat{\sigma})$.}\label{fig:smooth_effects}
\end{figure}

If we want to plot the effects of all P-spline base-learners for the $\mu$
parameter, we can use

<<smooth_effects_2>>=
plot(mod, which = "bbs", parameter = "mu")
@

Instead of specifying (sub-)strings for the two arguments one could use integer
values in both cases. For example,

<<smooth_effects_3>>=
plot(mod, which = 1:4, parameter = 1)
@

results in the same plots.

Prediction intervals for new observations can be easily constructed by computing
the quantiles of the conditional GAMLSS distribution. This is done by plugging
the estimates of the distribution parameters (e.g., $\hat{\mu}(x_{\text{new}}),
\hat{\sigma}(x_{\text{new}})$ for a new observation $x_{\text{new}}$) into the
quantile function \citep{mayretal}.

Marginal prediction intervals, which reflect the effect of a single predictor on
the quantiles (keeping all other variables fixed), can be used to illustrate the
combined effect of this variable on various distribution parameters and hence
the shape of the distribution. For illustration purposes we plot the influence
of the children's BMI via \code{predint()}. To obtain marginal prediction
intervals, the function uses a grid for the variable of interest, while fixing
all other variables at their mean (continuous variables) or mode (categorical
variables).

<<marginal_prediction_plot_code, fig = false, results = hide>>=
plot(predint(mod, pi = c(0.8, 0.9), which = "cbmi"),
     lty = 1:3, lwd = 3, xlab =  "BMI (child)",
     ylab = "Stunting score")
@

To additionally highlight observations from Greater Mumbai, we use
<<greater_mumbai, fig = false, results = hide>>=
points(stunting ~ cbmi, data = india, pch = 20,
       col = rgb(1, 0, 0, 0.5), subset = mcdist == "381")
@


\setkeys{Gin}{width = 0.45\textwidth}
\begin{figure}[h!]
  \centering
<<marginal_prediction_plot, fig = true, results = hide, echo = false>>=
<<marginal_prediction_plot_code>>
<<greater_mumbai>>
@
\caption{80\% (dashed) and 90\% (dotted) marginal prediction intervals for the
  BMI of the children in the district of Greater Mumbai (which is the region
  with the most observations). For all other variables we used average values
  (i.e., a child with average age, and a mother with average age and BMI). The
  solid line corresponds to the median prediction (which equals the mean for
  symmetric distributions such as the Gaussian distribution). Observations from
  Greater Mumbai are highlighted in red.}\label{fig:pred_interval}
\end{figure}

The resulting marginal prediction intervals are displayed in
Figure~\ref{fig:pred_interval}. For the interpretation and evaluation of
prediction intervals, see \cite{MayrPI}.

For the spatial \code{bmrf()} base-learner some extra work is needed to plot
the effect(s). First, we obtain the (partial) predicted values per region using
either \code{fitted()} or \code{predict()}:
<<spatial_effects_code1, eval = false>>=
fitted_mu <- fitted(mod, parameter = "mu", which = "mcdist",
                    type = "response")
fitted_sigma <- fitted(mod, parameter = "sigma", which = "mcdist",
                       type = "response")
@

In the case of \code{bmrf()} base-learners we then need to aggregate the fitted
values of multiple observations within one region before we can plot them.
Here, one could also plot the coefficients, which constitute the effect
estimates per region (a small sketch follows the code below). Note that this
interpretation is not possible for other bivariate or spatial base-learners
such as \code{bspatial()} or \code{brad()}:

<<spatial_effects_code2, eval = false>>=
fitted_mu <- tapply(fitted_mu, india$mcdist, FUN = mean)
fitted_sigma <- tapply(fitted_sigma, india$mcdist, FUN = mean)
plotdata <- data.frame(region = names(fitted_mu),
                       mu = fitted_mu, sigma = fitted_sigma)
par(mfrow = c(1, 2), mar = c(1, 0, 2, 0))
plotmap(india.bnd, plotdata[, c(1, 2)], range = c(-0.62, 0.82),
        main = "Mean", pos = "bottomright", mar.min = NULL)
plotmap(india.bnd, plotdata[, c(1, 3)], range = c(0.75, 1.1),
        main = "Standard deviation", pos = "bottomright", mar.min = NULL)
@
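
As mentioned above, for \code{bmrf()} base-learners one could alternatively
extract the region-wise coefficients directly, for example as in the following
sketch (not evaluated):

<<spatial_coef_sketch, eval = FALSE>>=
## sketch: region-wise effect estimates of the spatial base-learner for mu
coef_mu_region <- coef(mod, parameter = "mu", which = "mcdist")
@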

\setkeys{Gin}{width = 0.9\textwidth}
\begin{figure}[h!]
  \centering
<<spatial_effects, fig = true, results = hide, echo = false, width = 10, height = 4.5>>=
<<spatial_effects_code1>>
<<spatial_effects_code2>>
@
\caption{Spatial partial effects of the estimated model. Dashed regions
  represent regions without data. Note that effect estimates for these regions
  exist and could be extracted. \label{fig:spatial_effects}}
\end{figure}

Figure~\ref{fig:spatial_effects} (left) shows a clear spatial pattern of
stunting. While children in the southern regions like Tamil Nadu and Kerala as
well as in the north-eastern regions around Assam and Arunachal Pradesh seem to
have a smaller risk for stunted growth, the central regions in the north of
India, especially Bihar, Uttar Pradesh and Rajasthan seem to be the most
problematic in terms of stunting due to malnutrition. Since we have also modeled
the scale of the distribution, we can gain much richer information concerning
the regional distribution of stunting: the regions in the south, which seem to
be less affected by stunting, also have a lower partial effect with respect to
the expected standard deviation (Figure~\ref{fig:spatial_effects}, right),
i.e., a reduced standard deviation compared to the average region. This means
that not only is the expected stunting score higher on average (i.e., children
are better nourished), but the distribution in these regions is also narrower.
This leads to smaller prediction intervals for children living in these areas.
In contrast, the regions around Bihar in the central north, where India shares
a border with Nepal, not only seem to have larger problems with stunted growth
but also show a positive partial effect with respect to the scale parameter of
the conditional distribution. This leads to larger prediction intervals, which
could imply a greater risk of very small values of the stunting score for an
individual child in that region. On the other hand, the larger size of the
interval also leaves room for higher values and could reflect larger
differences between different parts of the population. \hfill $\vardiamond$

\section{Summary}
\label{sec:summary}

The GAMLSS model class has developed into one of the most flexible tools in
statistical modeling, as it can tackle nearly any regression setting of practical
relevance. Boosting algorithms, on the other hand, are one of the most flexible
estimation and prediction tools in the toolbox of a modern statistician
\citep{mayr_boosting_part1}.

In this paper, we have presented the \proglang{R} package \pkg{gamboostLSS},
which provides the first implementation of a boosting algorithm for GAMLSS.
As a combination of boosting and GAMLSS, \pkg{gamboostLSS} combines a powerful
machine learning tool with the world of statistical modeling
\citep{Breiman2001:TwoCultures}, offering the advantage of intrinsic model
choice and variable selection in potentially high-dimensional data situations.
The package also combines the advantages of both \pkg{mboost} (with a
well-established, well-tested modular structure in the back-end) and
\pkg{gamlss} (which implements a large number of families that are available
via conversion with the \code{as.families()} function).

While the implementation in the \proglang{R} package \pkg{gamlss} (provided by
the inventors of GAMLSS) must be seen as the gold standard for fitting GAMLSS,
the \pkg{gamboostLSS} package offers a flexible alternative, which can be
advantageous, amongst others, in the following data settings: (i) models with a
large number of coefficients, where classical estimation approaches become
infeasible; (ii) data situations where variable selection is of great interest;
(iii) models where a greater flexibility regarding the effect types is needed,
e.g., when spatial, smooth, random, or constrained effects should be included
and selected at the same time.

\section*{Acknowledgments}

We thank the editors and the two anonymous referees for their valuable comments
that helped to greatly improve the manuscript. We gratefully acknowledge the
help of Nora Fenske and Thomas Kneib, who provided code to prepare the data and
also gave valuable input on the package \pkg{gamboostLSS}. We thank Mikis
Stasinopoulos for his support in implementing \code{as.families} and Thorsten
Hothorn for his great work on \pkg{mboost}. The work of Matthias Schmid and
Andreas Mayr was supported by the Deutsche Forschungsgemeinschaft (DFG), grant
SCHM-2966/1-1, and the Interdisciplinary Center for Clinical Research (IZKF) of
the Friedrich-Alexander University Erlangen-N{\"u}rnberg, project J49.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Bibliography
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\bibliography{bib}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Appendix
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\clearpage
\appendix

\section[The gamboostLSS algorithm]{The \emph{gamboostLSS} algorithm}\label{algorithm}

Let $\bm{\theta} = (\theta_k)_{k = 1,\ldots,K}$ be the vector of distribution
parameters of a GAMLSS, where $\theta_k = g_k^{-1}(\eta_{\theta_k})$ with
parameter-specific link functions $g_k$ and additive predictor
$\eta_{\theta_k}$. The \emph{gamboostLSS} algorithm \citep{mayretal} cycles
through the different distribution parameters $\theta_k,\, k=1, \ldots, K,$ and
fits all base-learners $h(\cdot)$ separately to the negative partial derivatives
of the loss function, which in the GAMLSS context are the partial derivatives of
the log-likelihood with respect to the additive predictors $\eta_{\theta_k}$,
i.e., $\frac{\partial}{\partial \eta_{\theta_k}} l(\bm{y}, \bm{\theta})$.


\begin{enumerate}
\item[] \textbf{Initialize}
  \begin{enumerate}
  \item[(1)] Set the iteration counter $m := 0$. Initialize the additive
    predictors $\hat{\eta}_{\theta_{k,i}}^{[m]},\, k = 1, \ldots, K,\, i=1,
    \ldots, n,$ with offset values, e.g., $\hat{\eta}_{\theta_{k,i}}^{[0]} \equiv
    \underset{c}{\operatorname{argmax}} \sum_{i=1}^n l(y_i, \theta_{k,i} = c)$.
  \item[(2)] For each distribution parameter $\theta_k$, $k=1,\ldots, K$,
    specify a set of base-learners, i.e., for parameter $\theta_k$ define
    $h_{k,1}(\cdot),\ldots,h_{k,p_k}(\cdot)$, where $p_k$ is the cardinality of
    the set of base-learners specified for $\theta_k$.
  \end{enumerate}
\item[] \textbf{Boosting in multiple dimensions}
  \begin{enumerate}
  \item[(3)] \textbf{Start} a new boosting iteration: increase $m$ by 1 and set
    $k := 0$.
  \item[(4)]
    \begin{enumerate}
    \item[(a)] Increase $k$ by 1. \\
      \textbf{If} $m > m_{\text{stop},k}$ proceed to
      step 4(e).\\
      \textbf{Else} compute the partial derivative $\frac{\partial}{\partial
        \eta_{\theta_k}} l(y,\bm{\theta})$ and plug in the current estimates
      $\hat{\bm{\theta}}_i^{[m-1]} = \left( \hat{\theta}_{1,i}^{[m-1]}, \ldots,
        \hat{\theta}_{K,i}^{[m-1]} \right) = \left(
        g^{-1}_1(\hat{\eta}_{\theta_{1,i}}^{[m-1]}), \ldots,
        g^{-1}_K(\hat{\eta}_{\theta_{K,i}}^{[m-1]}) \right)$:
      \begin{equation*}
        u^{[m-1]}_{k,i} = \left. \frac{\partial}{\partial \eta_{\theta_k}} l(y_i,
          \bm{\theta})\right|_{\bm{\theta} = \hat{\bm{\theta}}_i^{[m-1]}} ,\, i = 1,\ldots, n.
      \end{equation*}
    \item[(b)] \textbf{Fit} each of the base-learners contained in the set of
      base-learners specified for the parameter $\theta_k$ in step (2) to the
      gradient vector $\bm{u}^{[m-1]}_k$.
    \item[(c)] \textbf{Select} the base-learner $j^*$ that best fits the
      partial-derivative vector according to the least-squares criterion, i.e.,
      select the base-learner $h_{k,j^*}$ defined by
      \begin{equation*}
        j^* = \underset{1 \leq j \leq p_k}{\operatorname{argmin}}\sum_{i=1}^n (u_{k,i}^{[m-1]} - h_{k,j}(\cdot))^2 \ .
      \end{equation*}
    \item[(d)] \textbf{Update} the additive predictor $\eta_{\theta_k}$ as
      follows:
      \begin{equation*}
        \hat{\eta}_{\theta_k}^{[m-1]} := \hat{\eta}_{\theta_k}^{[m-1]} + \nu_{\text{sl}} \cdot h_{k,j^*}(\cdot)\ ,
      \end{equation*}
      where $\nu_{\text{sl}}$ is a small step-length ($0 <
        \nu_{\text{sl}} \ll 1$).
    \item[(e)] Set $\hat{\eta}_{\theta_k}^{[m]} :=
      \hat{\eta}_{\theta_k}^{[m-1]}$.
    \item[(f)] \textbf{Iterate} steps 4(a) to 4(e) for $k=2,\ldots, K$.
    \end{enumerate}
  \end{enumerate}

\item[] \textbf{Iterate}
  \begin{enumerate}
  \item[(5)] Iterate steps 3 and 4 until  $m > m_{\text{stop},k}$ for all
    $k=1,\ldots, K$.
  \end{enumerate}
\end{enumerate}
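
To make the updating scheme more concrete, the following minimal \proglang{R}
sketch mimics the cyclic updates for a Gaussian GAMLSS with simple linear
base-learners. It is purely illustrative (toy data and plain \code{lm()} fits
instead of penalized base-learners) and is \emph{not} the implementation used
in \pkg{gamboostLSS}:

<<algorithm_sketch, eval = FALSE>>=
## illustrative sketch of the cyclic gamboostLSS updates (Gaussian case with
## identity link for mu and log link for sigma); NOT the package implementation
set.seed(1)
n <- 500
x1 <- runif(n); x2 <- runif(n)
y <- 1 + 2 * x1 + rnorm(n, sd = exp(-1 + 1.5 * x2))
X <- cbind(x1, x2)                  ## one simple linear base-learner per column
nu <- 0.1                           ## step length nu_sl
m_stop <- c(mu = 200, sigma = 100)  ## parameter-specific stopping iterations
eta_mu <- rep(mean(y), n)           ## offset for mu (identity link)
eta_sigma <- rep(log(sd(y)), n)     ## offset for sigma (log link)
for (m in seq_len(max(m_stop))) {
    if (m <= m_stop["mu"]) {        ## step (4) for theta_1 = mu
        u <- (y - eta_mu) / exp(2 * eta_sigma)         ## gradient w.r.t. eta_mu
        fits <- lapply(1:ncol(X), function(j) lm(u ~ X[, j]))
        best <- which.min(sapply(fits, deviance))      ## least-squares selection
        eta_mu <- eta_mu + nu * fitted(fits[[best]])   ## update eta_mu
    }
    if (m <= m_stop["sigma"]) {     ## step (4) for theta_2 = sigma
        u <- -1 + (y - eta_mu)^2 / exp(2 * eta_sigma)  ## gradient w.r.t. eta_sigma
        fits <- lapply(1:ncol(X), function(j) lm(u ~ X[, j]))
        best <- which.min(sapply(fits, deviance))
        eta_sigma <- eta_sigma + nu * fitted(fits[[best]])
    }
}
@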

\section{Data pre-processing and stabilization of gradients}\label{sec:stab-ngrad}

As the \emph{gamboostLSS} algorithm updates the parameter estimates in turn by
optimizing the gradients, it is important that these are comparable for
all GAMLSS parameters. Consider for example the standard Gaussian distribution
where the gradients of the log-likelihood with respect to $\eta_{\mu}$ and
$\eta_{\sigma}$ are
\begin{equation*}
   \frac{\partial }{\partial \eta_{\mu}}\, l(y_i, g_{\mu}^{-1}(\eta_{\mu}), \hat{\sigma}) =
  \frac{ y_i - \eta_{\mu i}}{\hat{\sigma}_i^2},
\end{equation*}
with identity link, i.e., $g_{\mu}^{-1}(\eta_{\mu}) = \eta_{\mu}$, and
\begin{equation*}
  \frac{\partial }{\partial \eta_{\sigma}}\, l(y_i, \hat{\mu}, g_{\sigma}^{-1}(\eta_{\sigma})) =
  -1 + \frac{(y_i - \hat{\mu}_i)^2}{\exp(2\eta_{\sigma i})},
\end{equation*}
with log link, i.e., $g_{\sigma}^{-1}(\eta_{\sigma}) = \exp(\eta_{\sigma})$.

For small values of $\hat{\sigma}_i$, the gradient vector for $\mu$ will hence
inevitably become huge, while for large variances it will become very small. As
the base-learners are directly fitted to this gradient vector, this will have a
dramatic effect on convergence speed. Due to imbalances regarding the range of
$\frac{\partial }{\partial \eta_{\mu}} l(y_i, \mu, \sigma)$ and $\frac{\partial
}{\partial \eta_{\sigma}} l(y_i, \mu, \sigma)$, a potential bias might be
induced when the algorithm becomes so unstable that it does not converge to the
optimal solution (or converges very slowly).

Consequently, one can use standardized gradients, where in \textbf{each step}
the gradient is divided by its median absolute deviation, i.e., it is divided by
\begin{equation}
  \label{eq:MAD}
  \text{MAD} = \text{median}_i(|u_{k,i} - \text{median}_j(u_{k,j})|),
\end{equation}
where $u_{k,i}$ is the $i$th element of the gradient vector for the $k$th
GAMLSS parameter in the current boosting step. If weights are specified
(explicitly or implicitly, as for cross-validation) a weighted median is used.
MAD-stabilization can be activated
by setting the argument \code{stabilization} to \code{"MAD"} in the fitting
families (see example on p.~\pageref{page:stabilization}). Using
\code{stabilization = "none"} explicitly switches off the stabilization. As this
is the current default, this is only needed for clarity.
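
As a small numerical illustration of the stabilization in
Equation~\ref{eq:MAD}, consider the following toy sketch (not the internal
package code):

<<mad_sketch, eval = FALSE>>=
u <- c(-0.2, 0.1, 5, -3, 0.4)       ## toy gradient vector
MAD <- median(abs(u - median(u)))   ## median absolute deviation
u / MAD                             ## MAD-stabilized gradient
@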

Another way to improve convergence speed might be to standardize the response
variable (and/or to use a larger step size $\nu_{\text{sl}}$). This is
especially useful if the range of the response differs strongly from the range
of the negative gradients. Both the built-in stabilization and the
standardization of the response are not always advisable but need to be
carefully considered given the data at hand. If convergence speed is slow or if
the
negative gradient even starts to become unstable, one should consider one or
both options to stabilize the fit. To judge the impact of these methods one can
run the \code{gamboostLSS} algorithm using different options and compare the
results via cross-validated predictive risks (see
Sections~\ref{sec:model_complexity} and~\ref{sec:model_tuning}).

\pagebreak

\section{Additional Families}\label{sec:additional-families}

Table~\ref{tab:gamlss_families} gives an overview of common, additional GAMLSS
distributions and GAMLSS distributions with a different parametrization than in
\pkg{gamboostLSS}. For a comprehensive overview see the distribution tables
available at \url{www.gamlss.org} and the documentation of the
\pkg{gamlss.dist} package \citep{pkg:gamlss.dist:4.3-0}. Note that
\pkg{gamboostLSS} works only for distributions with at least two parameters,
while \pkg{gamlss.dist} also implements a few one-parametric distributions. In
the latter case, the \code{as.families()} function will construct a
corresponding \code{"boost\_family"} object, which one can use as \code{family}
in \pkg{mboost} (corresponding advice is given in a warning message).
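
For instance, a GAMLSS based on the Box-Cox $t$ distribution from
Table~\ref{tab:gamlss_families} could be specified as in the following sketch
(\code{y}, \code{x1}, \code{x2}, and \code{dat} are placeholders):

<<as_families_sketch, eval = FALSE>>=
## sketch: use a gamlss.dist family via as.families()
mod_BCT <- gamboostLSS(y ~ x1 + x2, data = dat,
                       families = as.families("BCT"),
                       control = boost_control(mstop = 100))
@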

\begin{landscape}
  \begin{table}[h!]
    \centering
    \begin{tabular}{llllllllp{0.5\textwidth}}
      \toprule
      & &  Name & Response & $\mu$ & $\sigma$  & $\nu$  & $\tau$  & Note\\
      \cmidrule{2-9}

      \multicolumn{9}{l}{\textbf{Continuous response}} \\
      & Generalized $t$  & \code{GT} & cont.   & id & log & log& log & \\
      & Box-Cox $t$  & \code{BCT} & cont.   & id & log & id& log& \\
      & Gumbel & \code{GU} & cont.  & id & log & & & For moderately skewed data.\\
      & Reverse Gumbel & \code{RG} & cont.  & id & log & & &  Extreme value distribution.\\
      \cmidrule{2-9}

      \multicolumn{9}{l}{\textbf{Continuous non-negative response} (without censoring)} \\
      & Gamma & \code{GA} & cont. $>0$  & log & log & & & Also implemented as
      \code{GammaLSS()}$\,^{a,b}$.\\
      & Inverse Gamma & \code{IGAMMA} & cont. $>0$  & log & log & & & \\
      & Zero-adjusted Gamma & \code{ZAGA} & cont. $\geq 0$    & log & log& logit
      & & Gamma, additionally allowing for zeros.\\
      & Inverse Gaussian & \code{IG} & cont. $>0$   & log & log & & & \\
      & Log-normal & \code{LOGNO} & cont. $>0$  & log & log & & & For positively skewed data.\\
      & Box-Cox Cole and Green & \code{BCCG} & cont. $>0$  & id & log & id & & For positively and negatively skewed data.\\
      & Pareto & \code{PARETO2} & cont. $>0$  & log & log & & & \\
      & Box-Cox power exponential & \code{BCPE} & cont. $>0$  & id & log & id & log & Recommended for child growth centiles.\\
      \cmidrule{2-9}

      \multicolumn{9}{l}{\textbf{Fractions and bounded continuous response}} \\
      & Beta & \code{BE} & $\in (0,1)$  & logit & logit & & & Also implemented
      as \code{BetaLSS()}$\,^{a,c}$.\\
      & Beta inflated & \code{BEINF} & $\in [0,1]$  & logit & logit & log &log
      & Beta, additionally allowing for zeros and ones.\\
      \cmidrule{2-9}

      \multicolumn{9}{l}{\textbf{Models for count data}} \\
      & Beta binomial & \code{BB} & count  & logit & log &  &  &   \\
      & Negative binomial & \code{NBI} & count  & log & log &  &  & For
      over-dispersed count data; also implemented as
      \code{NBinomialLSS()}$\,^{a,d}$.\\
      \bottomrule
    \end{tabular}
    \caption{\label{tab:gamlss_families} Overview of common, additional GAMLSS distributions that can be used via
      \code{as.families()} in \pkg{gamboostLSS}. For every modeled
      distribution parameter, the corresponding link-function is displayed. $^a$
      The parametrizations of the distribution functions in \pkg{gamboostLSS}
      and \pkg{gamlss.dist} differ with respect to the variance. $\,^b$
      \code{GammaLSS(mu, sigma)} has $\VAR(y|x) =
      \text{\code{mu}}^2/\text{\code{sigma}}$, and \code{as.families(GA)(mu,
        sigma)} has $\VAR(y|x) = \text{\code{sigma}}^2\cdot \text{\code{mu}}^2$.
      $\,^c$ \code{BetaLSS(mu, phi)} has $\VAR(y|x) = \text{\code{mu}}\cdot(1-
      \text{\code{mu}}) \cdot(1 + \text{\code{phi}})^{-1}$, and
      \code{as.families(BE)(mu, sigma)} has $\VAR(y|x) =
      \text{\code{mu}}\cdot(1- \text{\code{mu}}) \cdot\text{\code{sigma}}^2$.
      $\,^d$ \code{NBinomialLSS(mu, sigma)} has $\VAR(y|x) = \text{\code{mu}}+
      1/\text{\code{sigma}}\cdot\text{\code{mu}}^2$,
      and \code{as.families(NBI)(mu, sigma)} has $\VAR(y|x) =
      \text{\code{mu}}+\text{\code{sigma}}\cdot\text{\code{mu}}^2$.}
  \end{table}
\end{landscape}

\end{document}