## ----include=FALSE, cache=FALSE-----------------------------------------------
library("knitr")
knitr::opts_knit$set(self.contained = FALSE)
knitr::opts_chunk$set(tidy = TRUE, collapse=TRUE, comment = "#>",
                      tidy.opts=list(blank=FALSE, width.cutoff=55))

## ----fig.width=4.5, fig.height=3.5, fig.cap="Lee (2008) data"-----------------
library("RDHonest")
## plot 50-bin averages in for observations 50 at most points
## away from the cutoff. See Figure 1.
RDScatter(voteshare~margin, data=lee08, subset=abs(lee08$margin)<=50,
          avg=50, propdotsize=FALSE, xlab="Margin of victory",
          ylab="Vote share in next election")

## ----fig.width=4.5, fig.height=3.5, fig.cap="Oreopoulos (2006) data"----------
## see Figure 2
f2 <- RDScatter(log(earnings)~yearat14, data=cghs, cutoff=1947,
                avg=Inf, xlab="Year aged 14", ylab="Log earnings",
                propdotsize=TRUE)
## Adjust size of dots if they are too big
f2 + ggplot2::scale_size_area(max_size = 4)

## -----------------------------------------------------------------------------
CVb(0, alpha=0.05) ## Usual critical value
CVb(1/2, alpha=0.05)

## Tabulate critical values for different bias levels
CVb(0:5, alpha=0.1)

## -----------------------------------------------------------------------------
r0 <- RDHonest(voteshare~margin, data=lee08, kern="triangular", M=0.1, h=8)
print(r0)

## -----------------------------------------------------------------------------
RDHonest(voteshare ~ margin, data=lee08, kern="triangular",
         M=0.1, opt.criterion="MSE")
## Choose bws optimal for length of CI
RDHonest(voteshare ~ margin, data=lee08, kern="triangular", M=0.1,
         opt.criterion="FLCI")

## -----------------------------------------------------------------------------
## Data-driven choice of M
RDHonest(voteshare ~ margin, data=lee08)

## -----------------------------------------------------------------------------
## Replicate Table 2, column (10) in Kolesar and Rothe (2018)
RDHonest(log(earnings) ~ yearat14, cutoff=1947,
         data=cghs, kern="uniform", M=0.04, opt.criterion="FLCI", sclass="H")

## -----------------------------------------------------------------------------
## Replicate Table 2, column (6), run local linear regression (order=1)
## with a uniform kernel (other kernels are not implemented for RDHonestBME)
RDHonestBME(log(earnings) ~ yearat14, cutoff=1947,
            data=cghs, h=3, order=1)

## -----------------------------------------------------------------------------
## Initial estimate of treatment effect for optimal bandwidth calculations
r <- RDHonest(log(cn) | retired ~ elig_year, data=rcp, kern="triangular",
              M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", T0=0)
## Use it to compute optimal bandwidth
RDHonest(log(cn) | retired ~ elig_year, data=rcp, kern="triangular",
         M=c(0.001, 0.002), opt.criterion="MSE", sclass="H",
         T0=r$coefficients$estimate)

## -----------------------------------------------------------------------------
## Data-driven choice of M
RDHonest(log(cn) | retired ~ elig_year, data=rcp, kern="triangular",
         opt.criterion="MSE", sclass="H", T0=r$coefficients$estimate)

## -----------------------------------------------------------------------------
## No covariates
rn <- RDHonest(mortHS ~ povrate, data=headst)
## Use Percent attending school aged 14-17, urban, black,
## and their interaction as covariates.
rc <- RDHonest(mortHS ~ povrate | urban*black + sch1417, data=headst)
rn
rc

## -----------------------------------------------------------------------------
ci_len <- c(rc$coefficients$conf.high-rc$coefficients$conf.low,
            rn$coefficients$conf.high-rn$coefficients$conf.low)
100 * (1 - ci_len[1]/ci_len[2])

## -----------------------------------------------------------------------------
RDHonest(log(cn) | retired ~ elig_year | education, data=rcp,
         T0=r$coefficients$estimate)

## ----fig.width=4.5, fig.height=3.5, fig.cap="Battistin et al (2009) data"-----
## see Figure 3
f3 <- RDScatter(log(cn)~elig_year, data=rcp, cutoff=0, avg=Inf,
                xlab="Years to eligibility",
                ylab="Log consumption of non-durables", propdotsize=TRUE,
                subset=abs(elig_year)<15)
## Adjust size of dots if they are too big
f3 + ggplot2::scale_size_area(max_size = 5)

## -----------------------------------------------------------------------------
dd <- data.frame()
## Collapse data by running variable
for (j in unique(cghs$yearat14)) {
    ix <- cghs$yearat14==j
    df <- data.frame(y=mean(log(cghs$earnings[ix])), x=j,
                     weights=sum(ix),
                     sigma2=var(log(cghs$earnings[ix]))/sum(ix))
    dd <- rbind(dd, df)
}

## -----------------------------------------------------------------------------
s0 <- RDHonest(log(earnings)~yearat14, cutoff=1947, data=cghs)
## keep same bandwidth
s1 <- RDHonest(y~x, cutoff=1947, data=dd, weights=weights,
               sigmaY2=sigma2, se.method="supplied.var",
               h=s0$coefficients$bandwidth)
## Results are identical:
s0
s1

## -----------------------------------------------------------------------------
r0 <- RDHonest(log(cn)|retired~elig_year, data=rcp, h=7)
dd <- data.frame(x=sort(unique(rcp$elig_year)), y=NA, d=NA, weights=NA,
                 sig11=NA, sig12=NA, sig21=NA, sig22=NA)
for (j in seq_len(NROW(dd))) {
    ix <- rcp$elig_year==dd$x[j]
    Y <- cbind(log(rcp$cn[ix]), rcp$retired[ix])
    dd[j, -1] <- c(colMeans(Y), sum(ix), as.vector(var(Y))/sum(ix))
}
r1 <- RDHonest(y|d~x, data=dd, weights=weights,
               sigmaY2=sig11, T0=0, sigmaYD=sig21,
               sigmaD2=sig22, h=7,
               se.method="supplied.var")
## Outputs match up to numerical precision
max(abs(r0$coefficients[2:11]-r1$coefficients[2:11]))

## -----------------------------------------------------------------------------
## make fake clusters
set.seed(42)
clusterid <- sample(1:50, NROW(lee08), replace=TRUE)
sc <- RDHonest(voteshare~margin, data=lee08, se.method="EHW",
               clusterid=clusterid, M=0.14, h=7)
## Since clusters are unrelated to outcomes, not clustering
## should yield similar standard errors
sn <- RDHonest(voteshare~margin, data=lee08, se.method="EHW",
               M=0.14, h=7)
sc
sn

## -----------------------------------------------------------------------------
r1 <- RDHonest(voteshare ~ margin, data=lee08, M=0.1, se.method="nn")
### Only use three point-average for averages of a 100 points closest to cutoff,
### and report results separately for points above and below cutoff
RDSmoothnessBound(r1, s=100, separate=TRUE, multiple=FALSE, sclass="T")

## Pool estimates based on observations below and above cutoff, and
## use three-point averages over the entire support of the running variable
RDSmoothnessBound(r1, s=100, separate=FALSE, multiple=TRUE, sclass="H")

## -----------------------------------------------------------------------------
r1 <- RDHonest(voteshare ~ margin, data=lee08, kern="optimal", M=0.1,
               opt.criterion="FLCI",
               se.method="nn")$coefficients
r2 <- RDHonest(voteshare ~ margin, data=lee08, kern="triangular", M=0.1,
               opt.criterion="FLCI", se.method="nn",
               sclass="T")$coefficients
r1$conf.high-r1$conf.low
r2$conf.high-r2$conf.low

## -----------------------------------------------------------------------------
## Specify we're interested in inference at x0=20,
## and drop observations below cutoff
RDHonest(voteshare ~ margin, data=lee08, subset=margin>0,
         cutoff=20, kern="uniform",
         opt.criterion="MSE", sclass="H", point.inference=TRUE)