# pine cones.R
# Yvonne Buckley, 18/07/2014
# Fits a model to predict number of cones based on plot (location), age and
# height, or basal area. See Coutts, Caplat et al. 2012 for more detail on a
# larger data-set and its analysis. At the end of the script I give an example
# of how to fit and test an offset in a GLM.
# INPUTS: pine_cones.csv
# OUTPUTS: none
# Remember to set a working directory using File menu and Change Dir on R-GUI
# to where you have saved the data
# The MASS library is needed for fitting negative binomial GLMs
library(MASS)
# Read the raw cone count data; use TRUE, not T (T is reassignable in R)
cone_num <- read.csv("pine_cones.csv", header = TRUE)
# Quick look at the variables and the first few rows
names(cone_num)
head(cone_num)
# Year is a sampling occasion, not a quantity, so treat it as a factor
cone_num$year <- as.factor(cone_num$year)
# plot the data: pairwise scatterplots of all variables in the data frame
plot(cone_num)
# You can see that height, age and basal area are all correlated so only want
# to use one in the analysis. Height was difficult to measure in the field,
# on some trees that were measured twice there was several meters difference
# between measures (both up and down), meaning we can have little confidence
# in any of the heights measured. Age was easy to measure (by counting annual
# growth nodes) and was measured in all plots so there is little missing data.
# Basal area was easy to measure but some of the data are missing.
# So it seems logical to use age as the covariate.
# Scatterplot of cone counts against tree age for all trees
plot(n_cone ~ age, data = cone_num, bty = "l", pch = 16,
     xlab = "tree age, years", ylab = "No. cones", cex.lab = 1.5,
     main = "All trees")
# We can see there are lots of 0's, particularly at low ages. We could fit a
# zero-inflated model, but we know why at least some of these 0's occur, they
# occur because the trees are too young.
# We would not expect a 2 year old tree to ever produce cones. 2009 was a
# particularly good coning year (mast year) and it appears as though trees
# younger than 13 did not produce cones that year. This also supports previous
# work in the same population that shows trees start to produce cones at
# around age 12 (Buckley et al. 2005).
# So we exclude all trees that were 12 or younger because these 0's are not
# interesting and do not tell us anything except very young trees do not produce
# cones (which we already knew).
# Keep only trees aged 13 or more; the structural zeros below that age are
# uninformative
cn13up <- subset(cone_num, age >= 13)
plot(n_cone ~ age, data = cn13up, bty = "l", pch = 16,
     xlab = "tree age, years", ylab = "No. cones", cex.lab = 1.5,
     main = "Trees 13+ years")
# Start with a classical linear regression of these data.
# I will use one year (2009) to avoid pseudo-replication due to multiple
# observations on the same individuals. To analyse both years simultaneously
# you need to use mixed effects models (ch 13).
# Lets look at how the homoscedasticity of residuals and normality assumptions
# perform
# Restrict to the 2009 mast year to avoid repeated measures on the same trees
cones09 <- subset(cn13up, year == 2009)
# Classical linear regression of cone count on age
lm1 <- lm(n_cone ~ age, data = cones09)
summary(lm1)
# The residual range (min -392, max 582) is already a warning sign of
# over-dispersion
## reproduces Fig. 6
par(mfrow = c(1, 2))
plot(n_cone ~ age, data = cones09, bty = "l", pch = 16,
     xlab = "tree age, years", ylab = "No. cones", cex.lab = 1.2,
     main = "(a) Trees 13+ years, 2009")
abline(coef(lm1), lwd = 2)
hist(resid(lm1), xlab = "Residuals", cex.lab = 1.2,
     main = "(b) Histogram of errors")
# Variance grows with the fitted values and the residual histogram is not
# normal; the standard diagnostic plots confirm both problems
par(mfrow = c(2, 2))
plot(lm1)
# Not too good! There is a "shot-gun" pattern in the residuals (errors
# increasing with the fitted values - plots 1 and 3) and poor performance in
# the Normal Q-Q plot, deviations from Normality are apparent.
# Log transforming count data is often recommended to improve model fit and
# normalise residuals, homogenise variance etc.
# As the log of zero cannot be taken your options are either to remove the zeros
# or add a small constant (1) to all values
# Drop the zero counts; the produce/don't-produce decision could instead be
# modelled separately as a binary (Binomial) response
cones09_pos <- subset(cones09, n_cone > 0)
# Log-linear regression: given that a tree produces cones, how many?
lm2 <- lm(log(n_cone) ~ age, data = cones09_pos)
par(mfrow = c(1, 1))
plot(log(n_cone) ~ age, data = cones09_pos, bty = "l", pch = 16,
     xlab = "tree age, years", ylab = "Log(no. cones)", cex.lab = 1.5,
     main = "Trees 13+ years, 2009, no zeros")
# Overlay the fitted line across the observed age range
new_dat <- data.frame(age = 13:25)
lines(new_dat$age, predict(lm2, new_dat), lwd = 2)
# Diagnostics for the zero-free, log-transformed model
par(mfrow = c(2, 2))
plot(lm2)
# this gives us some new problems, normality assumptions are still not
# appropriate and we appear to have over-corrected the dispersion problem
# with a reverse shotgun pattern now apparent
# as an alternative approach to dropping the zeros we go back to the full 2009
# data set and add a small amount, 1, to the observations before logging
# Alternative to dropping zeros: add a constant (1) to every count, then log
lm3 <- lm(log(n_cone + 1) ~ age, data = cones09)
par(mfrow = c(1, 1))
plot(log(n_cone + 1) ~ age, data = cones09, bty = "l", pch = 16,
     xlab = "tree age, years", ylab = "Log(No. cones+1)", cex.lab = 1.5,
     main = "Trees 13+ years, 2009")
abline(coef(lm3), lwd = 2)
# Diagnostics for the log(y + 1) model
par(mfrow = c(2, 2))
plot(lm3)
# The transformation has not solved our assumption problems.
# Patterns are similar to model lm2 where we removed zeros and log transformed
# GLM APPROACH
# we start with a simple analysis by fitting a glm with a Poisson error
# distribution (popular choice for count data)
# Poisson GLM with the default log link, the standard first try for counts
m1 <- glm(n_cone ~ age, data = cones09, family = "poisson")
summary(m1)
# There is serious over dispersion in this model (residual deviance /degrees of
# freedom much greater than 1). Thus there is more variance in the error than
# is accounted for by the Poisson distribution.
# It may be that the zeros in our data are over-inflating the variance, we
# could model these counts as a zero-inflated process or remove them and model
# the probability of producing cones as a separate process. Here I will remove
# the zeros and model the counts again as a poisson process and see if this
# helps with correcting overdispersion
# Refit the Poisson GLM using only the positive cone counts
m2 <- glm(n_cone ~ age, data = cones09_pos, family = "poisson")
summary(m2)
# removing the zeros has reduced the over dispersion from 190 to 173 but it is
# still a big problem
# It could be our error distribution is wrong. we have some good evidence to
# suggest the error distribution might be negative binomial (Coutts et al. 2011)
# so we repeat the process with the negative binomial error distribution
# we need a special function for this which is in the MASS library
# Negative binomial GLM (MASS::glm.nb) with a log link, positive counts only
m3_nb <- glm.nb(n_cone ~ age, link = log, data = cones09_pos)
summary(m3_nb)
#This looks much better, our dispersion is now 48/40=1.2 which is close to 1
# the AIC is much less than when we used the Poisson distribution (534 for
# m3_nb versus 6915 for Poisson m2) also the parameter estimates and conclusions
# change as now the intercept is no longer significantly different to zero
# whereas in the poisson model with an equivalent data set (m2)
# it was positive and highly significantly different to zero
# If we'd really like to keep the zero cone counts in the data set we can try
# using the complete 2009 data set and see if the NegBin model is still
# adequate
# Try the NegBin model on the full 2009 data set, zeros included
m4_nb <- glm.nb(n_cone ~ age, link = log, data = cones09)
warnings()
# the model does not converge, so those zeros really are a problem.
# At this point you might be satisfied to model just the positive cone counts
# with the NegBin model (m3_nb)or you might try to model the full 2009 data set
# with a zero-inflated model
## ALTERNATIVE APPROACH - using quasiPoisson
# instead of using a neg. bin. model we might choose to use the quasi-poisson
# distribution this allows the variance to increase faster than the mean by
# estimating a dispersion parameter the dispersion parameter is not constrained
# to be equal to one (as for Poisson)
# Quasi-Poisson GLM: the dispersion parameter is estimated, not fixed at 1
m5_qp <- glm(n_cone ~ age, data = cones09_pos, family = "quasipoisson")
summary(m5_qp)
# the parameter estimates of the quasi poisson model are identical to the
# poisson model (m2) as quasi models do not have a log-likelihood technically
# they don't have an AIC either...
# there are work-arounds for this if you search the R help lists. Also see the
# library AICcmodavg
# PLOTS of predictions
# reproduces Fig. 8
# The best model of cone counts for the 2009 positive cone counts
# (cones09_pos) was a negative binomial model with a log link function (m3_nb)
par(mfrow = c(1, 2), bty = "l", lwd = 2, pch = 16, cex.lab = 1.5)
# Fitted values on the linear-predictor (log) scale at the observed ages
predict(m3_nb)
# Prediction grid spanning the observed range of ages
new_age <- data.frame(age = min(cones09_pos$age):max(cones09_pos$age))
fits <- predict(m3_nb, newdata = new_age)
plot(log(n_cone) ~ age, data = cones09_pos, xlab = "Tree age",
     ylab = "Log(no. cones)", main = "(a) Linear predictor")
# Neg Bin model m3_nb linear predictor
lines(new_age$age, fits)
# Poisson model m2 linear predictor
lines(new_age$age, predict(m2, newdata = new_age, type = "link"), lty = 3)
# lines(new_age$age, predict(m5_qp, new_age, type="link"), lty=2)
# Quasi model predictions identical to poisson
legend("bottomright", c("neg.bin", "poisson"), lty = c(1, 3), bty = "n",
       lwd = 2)
plot(n_cone ~ age, data = cones09_pos, xlab = "Tree age",
     ylab = "No. cones", main = "(b) Counts")
# Neg Bin model m3_nb predictions back-transformed to the count scale
lines(new_age$age, exp(fits))
# Poisson model m2 predictions on count scale
lines(new_age$age, predict(m2, newdata = new_age, type = "response"), lty = 3)
# lines(new_age$age, predict(m5_qp, new_age, type="response"), lty=2)
# Quasi model predictions identical to poisson
legend("topleft", c("neg.bin", "poisson"), lty = c(1, 3), bty = "n", lwd = 2)
# Here is how you fit and test an off-set
# Refit the quasi-Poisson model so the offset comparison below sits alongside
# its no-offset counterpart (same call as earlier in the script)
m5_qp <- glm(n_cone ~ age, data = cones09_pos, family = "quasipoisson")
summary(m5_qp)
# An offset fixes part of the linear predictor at a known slope (implicitly 1
# here), letting us test an a priori 1:1 cones-age relationship
os_m5_qp <- glm(n_cone ~ age + offset(age), data = cones09_pos,
                family = "quasipoisson")
summary(os_m5_qp)
# There is an implicit "x1" for the slope in the offset(age) part of the model,
# if I wanted to fit an offset with a slope=2 I would do "offset(age*2)"
# Note that if you compare the estimated slope for the offset with the no
# offset models you'll see exactly what's going on, the parameter estimate for
# the slope is re-calculated taking the offset slope into account.
# if the new parameter value for the slope was close to 0 and non-significant
# then you would have good evidence that the offset slope is a relationship
# which is supported. In the no offset model the p value on slope tests whether
# it is different to zero, in the offset model the slope parameter estimate is
# a difference from the offset slope so the p-value can be interpreted
# as testing to see if the estimated slope is different to the offset slope.
# Offset with the slope fixed at 2; the estimated age coefficient now
# measures departure from that assumed slope
os_m5_qp2 <- glm(n_cone ~ age + offset(2 * age), data = cones09_pos,
                 family = "quasipoisson")
summary(os_m5_qp2)
# here you can see the effect of having a slope of 2 as the offset, the
# parameter estimate changes again to account for the offset.