\documentclass{chapman} %%% copy Sweave.sty definitions %%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE %\usepackage{Sweave} \RequirePackage[T1]{fontenc} \RequirePackage{graphicx,ae,fancyvrb} \IfFileExists{upquote.sty}{\RequirePackage{upquote}}{} \usepackage{relsize} \DefineVerbatimEnvironment{Sinput}{Verbatim}{} \DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier, fontshape=it, fontsize=\relsize{-1}} \DefineVerbatimEnvironment{Scode}{Verbatim}{} \newenvironment{Schunk}{}{} %%% environment for raw output \newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{} \DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier, fontshape=it, fontsize=\small} \rawSinput } %%% environment for labeled output \newcommand{\nextcaption}{} \newcommand{\SchunkLabel}{ \renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption} \end{figure} } \DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline} \DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline, samepage = true, fontfamily=courier, fontshape=it, fontsize=\relsize{-1}} } %%% S code with line numbers \DefineVerbatimEnvironment{Sinput} {Verbatim} { %% numbers=left } \newcommand{\numberSinput}{ \DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left} } \newcommand{\rawSinput}{ \DefineVerbatimEnvironment{Sinput}{Verbatim}{} } %%% R / System symbols \newcommand{\R}{\textsf{R}} \newcommand{\rR}{{R}} \renewcommand{\S}{\textsf{S}} \newcommand{\SPLUS}{\textsf{S-PLUS}} \newcommand{\rSPLUS}{{S-PLUS}} \newcommand{\SPSS}{\textsf{SPSS}} \newcommand{\EXCEL}{\textsf{Excel}} \newcommand{\ACCESS}{\textsf{Access}} \newcommand{\SQL}{\textsf{SQL}} %%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}} %%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}} %%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}} %%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}} \newcommand{\Rpackage}[1]{\index{#1 package@{\fontseries{b}\selectfont #1} package} {\fontseries{b}\selectfont #1}} \newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}} \newcommand{\Robject}[1]{\texttt{#1}} \newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}} \newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}} \newcommand{\Roperator}[1]{\texttt{#1}} \newcommand{\Rarg}[1]{\texttt{#1}} \newcommand{\Rlevel}[1]{\texttt{#1}} %%% other symbols \newcommand{\file}[1]{\hbox{\rm\texttt{#1}}} %%\newcommand{\stress}[1]{\index{#1}\textit{#1}} \newcommand{\stress}[1]{\textit{#1}} \newcommand{\booktitle}[1]{\textit{#1}} %%' %%% Math symbols \usepackage{amstext} \usepackage{amsmath} \newcommand{\E}{\mathsf{E}} \newcommand{\Var}{\mathsf{Var}} \newcommand{\Cov}{\mathsf{Cov}} \newcommand{\Cor}{\mathsf{Cor}} \newcommand{\x}{\mathbf{x}} \newcommand{\y}{\mathbf{y}} \renewcommand{\a}{\mathbf{a}} \newcommand{\W}{\mathbf{W}} \newcommand{\C}{\mathbf{C}} \renewcommand{\H}{\mathbf{H}} \newcommand{\X}{\mathbf{X}} \newcommand{\B}{\mathbf{B}} \newcommand{\V}{\mathbf{V}} \newcommand{\I}{\mathbf{I}} \newcommand{\D}{\mathbf{D}} \newcommand{\bS}{\mathbf{S}} \newcommand{\N}{\mathcal{N}} \renewcommand{\L}{L} \renewcommand{\P}{\mathsf{P}} \newcommand{\K}{\mathbf{K}} \newcommand{\m}{\mathbf{m}} \newcommand{\argmin}{\operatorname{argmin}\displaylimits} \newcommand{\argmax}{\operatorname{argmax}\displaylimits} \newcommand{\bx}{\mathbf{x}} \newcommand{\bbeta}{\mathbf{\beta}} %%% links \usepackage{hyperref} \hypersetup{% pdftitle = {A Handbook of Statistical Analyses Using R (3rd Edition)}, pdfsubject = {Book}, pdfauthor = {Torsten Hothorn and Brian S. Everitt}, colorlinks = {black}, linkcolor = {black}, citecolor = {black}, urlcolor = {black}, hyperindex = {true}, linktocpage = {true}, } %%% captions & tables %% <FIXME>: conflics with figure definition in chapman.cls %%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption} %% </FIMXE> \usepackage{longtable} \usepackage[figuresright]{rotating} %%% R symbol in chapter 1 \usepackage{wrapfig} %%% Bibliography \usepackage[round,comma]{natbib} \renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}} \citeindexfalse %%% texi2dvi complains that \newblock is undefined, hm... \def\newblock{\hskip .11em plus .33em minus .07em} %%% Example sections \newcounter{exercise}[chapter] \setcounter{exercise}{0} \newcommand{\exercise}{\stepcounter{exercise} \item{Ex.~\arabic{chapter}.\arabic{exercise} }} %% URLs \newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}} %%% for manual corrections %\renewcommand{\baselinestretch}{2} %%% plot sizes \setkeys{Gin}{width=0.95\textwidth} %%% color \usepackage{color} %%% hyphenations \hyphenation{drop-out} \hyphenation{mar-gi-nal} %%% new bidirectional quotes need \usepackage[utf8]{inputenc} %\usepackage{setspace} \definecolor{sidebox_todo}{rgb}{1,1,0.2} \newcommand{\todo}[1]{ \hspace{0pt}% \marginpar{% \fcolorbox{black}{sidebox_todo}{% \parbox{\marginparwidth} { \raggedright\sffamily\footnotesize{TODO: #1}% } }% } } \begin{document} %% Title page \title{A Handbook of Statistical Analyses Using \R{} --- 3rd Edition} \author{Torsten Hothorn and Brian S. Everitt} \maketitle %%\VignetteIndexEntry{Chapter Survival Analysis} %%\VignetteDepends{survival,coin,partykit} \setcounter{chapter}{10} \SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE} <<setup, echo = FALSE, results = hide>>= rm(list = ls()) s <- search()[-1] s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices", "package:utils", "package:datasets", "package:methods", "Autoloads"), s)] if (length(s) > 0) sapply(s, detach, character.only = TRUE) if (!file.exists("tables")) dir.create("tables") if (!file.exists("figures")) dir.create("figures") set.seed(290875) options(prompt = "R> ", continue = "+ ", width = 63, # digits = 4, show.signif.stars = FALSE, SweaveHooks = list(leftpar = function() par(mai = par("mai") * c(1, 1.05, 1, 1)), bigleftpar = function() par(mai = par("mai") * c(1, 1.7, 1, 1)))) HSAURpkg <- require("HSAUR3") if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR3")) rm(HSAURpkg) ### </FIXME> hm, R-2.4.0 --vanilla seems to need this a <- Sys.setlocale("LC_ALL", "C") ### </FIXME> book <- TRUE refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM", "DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA", "MDS", "CA"), 1:18) ch <- function(x) { ch <- refs[which(refs[,1] == x),] if (book) { return(paste("Chapter~\\\\ref{", ch[1], "}", sep = "")) } else { return(paste("Chapter~", ch[2], sep = "")) } } if (file.exists("deparse.R")) source("deparse.R") setHook(packageEvent("lattice", "attach"), function(...) { lattice.options(default.theme = function() standard.theme("pdf", color = FALSE)) }) @ \pagestyle{headings} <<singlebook, echo = FALSE>>= book <- FALSE @ <<SA-setup, echo = FALSE, results = hide>>= x <- library("survival") x <- library("coin") x <- library("partykit") @ \chapter[Survival Analysis]{Survival Analysis: \\ Glioma Treatment and \\ Breast Cancer Survival \label{SA}} \section{Introduction} \section{Survival Analysis} \section{Analysis Using \R{}} \subsection{Glioma Radioimmunotherapy} \begin{figure} \begin{center} <<SA-glioma-KM, echo = TRUE, fig = TRUE, height = 4>>= data("glioma", package = "coin") library("survival") layout(matrix(1:2, ncol = 2)) g3 <- subset(glioma, histology == "Grade3") plot(survfit(Surv(time, event) ~ group, data = g3), main = "Grade III Glioma", lty = c(2, 1), ylab = "Probability", xlab = "Survival Time in Month", legend.text = c("Control", "Treated"), legend.bty = "n") g4 <- subset(glioma, histology == "GBM") plot(survfit(Surv(time, event) ~ group, data = g4), main = "Grade IV Glioma", ylab = "Probability", lty = c(2, 1), xlab = "Survival Time in Month", xlim = c(0, max(glioma$time) * 1.05)) @ \caption{Survival times comparing treated and control patients. \label{SA-glioma-plot}} \end{center} \end{figure} Figure~\ref{SA-glioma-plot} leads to the impression that patients treated with the novel radioimmunotherapy survive longer, regardless of the tumor type. In order to assess if this informal finding is reliable, we may perform a log-rank test via \index{Log-rank test} <<SA-glioma-logrank, echo = TRUE>>= survdiff(Surv(time, event) ~ group, data = g3) @ which indicates that the survival times are indeed different in both groups. However, the number of patients is rather limited and so it might be dangerous to rely on asymptotic tests. As shown in \Sexpr{ch("CI")}, conditioning on the data and computing the distribution of the test statistics without additional assumptions are one alternative. The function \Rcmd{surv\_test} from package \Rpackage{coin} \citep{HSAUR:Hothorn:2006:AmStat,PKG:coin} can be used to compute an exact conditional test answering the question whether the survival times differ for grade III patients. For all possible permutations of the groups on the censored response variable, the test statistic is computed and the fraction of whose being greater than the observed statistic defines the exact $p$-value: <<SA-glioma-exact, echo = TRUE>>= library("coin") logrank_test(Surv(time, event) ~ group, data = g3, distribution = "exact") @ which, in this case, confirms the above results. The same exercise can be performed for patients with grade IV glioma <<SA-glioma-g4, echo = TRUE>>= logrank_test(Surv(time, event) ~ group, data = g4, distribution = "exact") @ which shows a difference as well. However, it might be more appropriate to answer the question whether the novel therapy is superior for both groups of tumors simultaneously. This can be implemented by \stress{stratifying}, or \stress{blocking}, with respect to tumor grading: <<SA-glioma-hist, echo = TRUE>>= logrank_test(Surv(time, event) ~ group | histology, data = glioma, distribution = approximate(10000)) @ Here, we need to approximate the exact conditional distribution since the exact distribution is hard to compute. The result supports the initial impression implied by Figure~\ref{SA-glioma-plot}. \subsection{Breast Cancer Survival} Before fitting a Cox model to the \Robject{GBSG2} data, we again derive a Kaplan-Meier estimate of the survival function of the data, here stratified with respect to whether a patient received hormonal therapy or not (see Figure~\ref{SA-GBSG2-plot}). \begin{figure} \begin{center} <<SA-GBSG2-plot, echo = TRUE, fig = TRUE, width = 6, height = 5>>= data("GBSG2", package = "TH.data") plot(survfit(Surv(time, cens) ~ horTh, data = GBSG2), lty = 1:2, mark.time = FALSE, ylab = "Probability", xlab = "Survival Time in Days") legend(250, 0.2, legend = c("yes", "no"), lty = c(2, 1), title = "Hormonal Therapy", bty = "n") @ \caption{Kaplan-Meier estimates for breast cancer patients who either received hormonal therapy or not. \label{SA-GBSG2-plot}} \end{center} \end{figure} Fitting a Cox model follows roughly the same rules as shown for linear models in \Sexpr{ch("MLR")} with the exception that the response variable is again coded as a \Rclass{Surv} object. For the \Robject{GBSG2} data, the model is fitted via <<SA-GBSG2-coxph, echo = TRUE>>= GBSG2_coxph <- coxph(Surv(time, cens) ~ ., data = GBSG2) @ and the results as given by the \Rcmd{summary} method are given in Figure~\ref{GBSG2-coxph-summary}. Since we are especially interested in the relative risk for patients who underwent hormonal therapy, we can compute an estimate of the relative risk and a corresponding confidence interval via <<SA-GBSG2-coxph-ci, echo = TRUE>>= ci <- confint(GBSG2_coxph) exp(cbind(coef(GBSG2_coxph), ci))["horThyes",] @ This result implies that patients treated with hormonal therapy had a lower risk and thus survived longer compared to women who were not treated this way. \renewcommand{\nextcaption}{\R{} output of the \Rcmd{summary} method for \Robject{GBSG2\_coxph}. \label{GBSG2-coxph-summary}} \SchunkLabel <<GBSG2-coxph-summary, echo = TRUE>>= summary(GBSG2_coxph) @ \SchunkRaw Model checking and model selection for proportional hazards models are complicated by the fact that easy-to-use residuals, such as those discussed in \Sexpr{ch("MLR")} for linear regression models, are not available, but several possibilities do exist. A check of the proportional hazards assumption can be done by looking at the parameter estimates $\beta_1, \dots, \beta_q$ over time. We can safely assume proportional hazards when the estimates don't vary much over time. %' The null hypothesis of constant regression coefficients can be tested, both globally as well as for each covariate, by using the \Rcmd{cox.zph} function <<SA-GBSG2-zph, echo = TRUE>>= GBSG2_zph <- cox.zph(GBSG2_coxph) GBSG2_zph @ There seems to be some evidence of time-varying effects, \index{Time-varying effects} especially for age and tumor grading. A graphical representation of the estimated regression coefficient over time is shown in Figure~\ref{SA-GBSG2-zph-plot}. We refer to \cite{HSAUR:TherneauGrambsch2000} for a detailed theoretical description of these topics. \begin{figure} \begin{center} <<SA-GBSG2-zph-plot, echo = TRUE, fig = TRUE, width = 7, height = 7>>= plot(GBSG2_zph, var = "age") @ \caption{Estimated regression coefficient for \Robject{age} depending on time for the \Robject{GBSG2} data. \label{SA-GBSG2-zph-plot}} \end{center} \end{figure} \begin{figure} \begin{center} <<SA-GBSG2-Martingal, echo = TRUE, fig = TRUE, height = 4>>= layout(matrix(1:3, ncol = 3)) res <- residuals(GBSG2_coxph) plot(res ~ age, data = GBSG2, ylim = c(-2.5, 1.5), pch = ".", ylab = "Martingale Residuals") abline(h = 0, lty = 3) plot(res ~ pnodes, data = GBSG2, ylim = c(-2.5, 1.5), pch = ".", ylab = "") abline(h = 0, lty = 3) plot(res ~ log(progrec), data = GBSG2, ylim = c(-2.5, 1.5), pch = ".", ylab = "") abline(h = 0, lty = 3) @ \caption{Martingale residuals for the \Robject{GBSG2} data. \label{SA-GBSG2-mart-plot}} \end{center} \end{figure} The tree-structured regression models applied to continuous and binary responses in \Sexpr{ch("RP")} are applicable to censored responses in survival analysis as well. Such a simple prognostic model with only a few terminal nodes might be helpful for relating the risk to certain subgroups of patients. Both \Rcmd{rpart} and the \Rcmd{ctree} function from package \Rpackage{partykit} can be applied to the GBSG2 data, where the conditional trees of the latter select cutpoints based on log-rank statistics \index{Conditional tree} <<SA-GBSG2-ctree, echo = TRUE>>= GBSG2_ctree <- ctree(Surv(time, cens) ~ ., data = GBSG2) @ and the \Rcmd{plot} method applied to this tree produces the graphical representation in Figure~\ref{SA-GBSG2-ctree-plot}. The number of positive lymph nodes (\Robject{pnodes}) is the most important variable in the tree, corresponding to the $p$-value associated with this variable in Cox's %%'s regression; see Figure~\ref{GBSG2-coxph-summary}. Women with not more than three positive lymph nodes who have undergone hormonal therapy seem to have the best prognosis whereas a large number of positive lymph nodes and a small value of the progesterone receptor indicates a bad prognosis. \begin{figure} \begin{center} <<SA-GBSG2-ctree-plot, echo = TRUE, fig = TRUE, width = 8>>= plot(GBSG2_ctree) @ \caption{Conditional inference tree for the \Robject{GBSG2} data with the survival function, estimated by Kaplan-Meier, shown for every subgroup of patients identified by the tree. \label{SA-GBSG2-ctree-plot}} \end{center} \end{figure} \bibliographystyle{LaTeXBibTeX/refstyle} \bibliography{LaTeXBibTeX/HSAUR} \end{document}