# Core data manipulation and modeling packages
library(tidyverse) # for data manipulation and visualization
library(pscl) # for zero-inflated models
library(MASS) # for negative binomial models
library(performance) # for model comparison
library(DHARMa) # for model diagnostics
library(gridExtra) # for arranging plotsChapter 6: Zero-Inflated Models for Ecological Count Data
6.1 The Mystery of the Missing Organisms: Understanding Excess Zeros in Ecological Data
Context and Challenges
In the world of ecological research, not all zeros are created equal. Imagine you’re a field biologist conducting a survey of woodpeckers in forest fragments. You meticulously visit 100 different forest patches, but in many of these patches, you find zero woodpeckers. But why? Are the woodpeckers truly absent, or are they just exceptionally hard to detect?
- Structural Zeros: Locations where the species cannot exist
- A marine fish species cannot live in a desert
- Tropical birds are absent from arctic regions
- Specific habitat requirements prevent species occurrence
- Sampling Zeros: Locations where the species could exist but was not detected
- The species is rare
- Sampling methods were inadequate
- Bad timing or luck during the survey
Learning Objectives
By the end of this chapter, you will:
- Understand the difference between structural and sampling zeros
- Learn how to model count data with excess zeros
- Apply zero-inflated and hurdle models to ecological datasets
- Interpret complex model outputs that account for zero-generating processes
Connection to Previous Chapters
In our journey through generalized linear models, we’ve already explored:
- Poisson regression for basic count data
- Negative binomial regression to handle overdispersion
- Beta regression for proportion data
Zero-inflated models are our next tool for handling the complexities of ecological count data, extending our modeling capabilities to account for the nuanced ways zeros can appear in our datasets.
6.2 Preparing for the Analysis: Required R Packages
6.3 Simulating Zero-Inflated Ecological Data
Let’s create a realistic scenario to demonstrate zero-inflated models. We’ll simulate woodpecker counts across forest fragments:
set.seed(123)
n <- 200 # Number of forest fragments
# Predictors
fragment_size <- runif(n, 1, 50) # Forest fragment size (hectares)
isolation <- runif(n, 0.1, 15) # Distance to nearest forest patch (km)
understory <- runif(n, 0, 100) # Understory density score
# Zero-inflation process
# Smaller, more isolated fragments more likely to have structural zeros
zero_prob <- plogis(2 - 0.1 * fragment_size + 0.2 * isolation)
# Generate structural zeros indicator
structural_zero <- rbinom(n, 1, zero_prob)
# Count process for non-structural zeros
lambda <- exp(0.5 + 0.04 * fragment_size - 0.1 * isolation + 0.01 * understory)
# Generate counts
woodpecker_count <- rep(0, n)
non_zero_indices <- which(structural_zero == 0)
woodpecker_count[non_zero_indices] <- rnbinom(
length(non_zero_indices),
mu = lambda[non_zero_indices],
size = 1.5 # Dispersion parameter
)
# Create data frame
bird_data <- data.frame(
woodpecker_count = woodpecker_count,
fragment_size = fragment_size,
isolation = isolation,
understory = understory
)- Created realistic predictor variables for forest fragments
- Simulated a zero-inflation process based on fragment characteristics
- Generated woodpecker counts that reflect real-world complexity
6.4 Exploring Zero-Inflated Data
# Proportion of zeros
prop_zeros <- mean(bird_data$woodpecker_count == 0)
# Plot count distribution
ggplot(bird_data, aes(x = woodpecker_count)) +
geom_histogram(binwidth = 1, fill = "forestgreen", color = "black") +
theme_minimal() +
labs(
x = "Number of woodpeckers",
y = "Frequency",
title = "Distribution of Woodpecker Counts",
subtitle = paste0("Proportion of zeros: ", round(prop_zeros * 100, 1), "%")
)# Summary statistics
cat("Mean woodpecker count:", mean(bird_data$woodpecker_count), "\n")Mean woodpecker count: 2.815
cat("Variance of woodpecker count:", var(bird_data$woodpecker_count), "\n")Variance of woodpecker count: 43.17666
6.5 Fitting and Comparing Models
# Poisson model
poisson_model <- glm(
woodpecker_count ~ fragment_size + isolation + understory,
family = poisson,
data = bird_data
)
# Negative binomial model
nb_model <- MASS::glm.nb(
woodpecker_count ~ fragment_size + isolation + understory,
data = bird_data
)
# Zero-Inflated Poisson (ZIP)
zip_model <- zeroinfl(
woodpecker_count ~ fragment_size + isolation + understory |
fragment_size + isolation,
dist = "poisson",
data = bird_data
)
# Zero-Inflated Negative Binomial (ZINB)
zinb_model <- zeroinfl(
woodpecker_count ~ fragment_size + isolation + understory |
fragment_size + isolation,
dist = "negbin",
data = bird_data
)
# Model comparison
model_list <- list(
Poisson = poisson_model,
"Negative Binomial" = nb_model,
"Zero-Inflated Poisson" = zip_model,
"Zero-Inflated Negative Binomial" = zinb_model
)
# Calculate AIC values
aic_table <- data.frame(
Model = names(model_list),
AIC = sapply(model_list, function(x) {
if (inherits(x, "zeroinfl")) AIC(x) else AIC(x)
}),
stringsAsFactors = FALSE
)
aic_table$Delta_AIC <- aic_table$AIC - min(aic_table$AIC)
aic_table <- aic_table[order(aic_table$AIC), ]
print(aic_table) Model AIC
Zero-Inflated Negative Binomial Zero-Inflated Negative Binomial 544.5875
Negative Binomial Negative Binomial 583.3021
Zero-Inflated Poisson Zero-Inflated Poisson 639.1324
Poisson Poisson 977.7515
Delta_AIC
Zero-Inflated Negative Binomial 0.00000
Negative Binomial 38.71461
Zero-Inflated Poisson 94.54490
Poisson 433.16406
- Lower AIC indicates better model fit
- Delta AIC shows the relative difference between models
- A difference of less than 2 suggests models are similarly supported
6.6 Diagnostic Plots
# Simplified diagnostic approach for zero-inflated models
# Robust handling of residual calculations
# Extract key components from the zero-inflated model
observed_counts <- zinb_model$y
# Predict probabilities
zero_probs <- predict(zinb_model, type = "zero")
count_means <- predict(zinb_model, type = "count")
# Set up simulation
set.seed(123)
n_sim <- 250
n_obs <- length(observed_counts)
# Simulate data manually
simulate_zeroinfl_data <- function() {
# Simulate structural zeros
is_zero <- rbinom(n_obs, 1, zero_probs)
# Simulate counts
counts <- rnbinom(n_obs, mu = count_means, size = zinb_model$theta)
# Apply structural zeros
counts[is_zero == 1] <- 0
return(counts)
}
# Simulate multiple datasets
simulated_data <- replicate(n_sim, simulate_zeroinfl_data())
# Robust residual calculation function
safe_calc_dharma_residuals <- function(observed, simulated) {
# Ensure valid calculations
ecdf_vals <- sapply(1:length(observed), function(j) {
# Handle potential edge cases
sim_col <- simulated[j, ]
# Avoid division by zero or invalid probabilities
if (all(is.na(sim_col)) || length(unique(sim_col)) < 2) {
return(NA)
}
# Calculate empirical CDF
cdf_val <- mean(sim_col <= observed[j], na.rm = TRUE)
# Ensure value is between 0 and 1
cdf_val <- max(0.001, min(0.999, cdf_val))
return(cdf_val)
})
# Convert to normalized residuals, handling potential NAs
residuals <- qnorm(ecdf_vals)
residuals[is.na(residuals)] <- 0
return(residuals)
}
# Calculate residuals
dharma_residuals <- safe_calc_dharma_residuals(observed_counts, simulated_data)
# Set up a 2x2 plot layout
par(mfrow=c(2,2))
# 1. QQ Plot with robust handling
tryCatch({
qqnorm(dharma_residuals, main = "QQ Plot of Residuals")
qqline(dharma_residuals, col = 2)
}, error = function(e) {
plot(1:10, 1:10, type = "n", main = "QQ Plot Failed",
xlab = "Theoretical Quantiles", ylab = "Sample Quantiles")
text(5, 5, "Could not generate QQ plot", col = "red")
})
# 2. Residuals vs. Predicted
plot(count_means, dharma_residuals,
xlab = "Predicted Counts",
ylab = "Normalized Residuals",
main = "Residuals vs. Predicted")
abline(h = 0, col = 2, lty = 2)
# 3. Variance Check
sim_vars <- apply(simulated_data, 2, var, na.rm = TRUE)
obs_var <- var(observed_counts, na.rm = TRUE)
hist(sim_vars,
main = "Distribution of Simulated Variances",
xlab = "Variance",
xlim = range(c(sim_vars, obs_var), na.rm = TRUE))
abline(v = obs_var, col = 2, lwd = 2)
# 4. Zero Inflation Check
zero_count_obs <- sum(observed_counts == 0)
zero_count_sim <- apply(simulated_data, 2, function(x) sum(x == 0, na.rm = TRUE))
hist(zero_count_sim,
main = "Distribution of Zero Counts in Simulations",
xlab = "Number of Zeros")
abline(v = zero_count_obs, col = 2, lwd = 2)# Additional diagnostics
cat("Observed data variance:", obs_var, "\n")Observed data variance: 43.17666
cat("Mean of simulated variances:", mean(sim_vars, na.rm = TRUE), "\n")Mean of simulated variances: 52.99586
cat("Proportion of simulations with lower variance:",
mean(sim_vars < obs_var, na.rm = TRUE), "\n")Proportion of simulations with lower variance: 0.384
cat("\nObserved zero count:", zero_count_obs, "\n")
Observed zero count: 134
cat("Mean simulated zero count:", mean(zero_count_sim, na.rm = TRUE), "\n")Mean simulated zero count: 133.908
cat("Proportion of simulations with fewer zeros:",
mean(zero_count_sim < zero_count_obs, na.rm = TRUE), "\n")Proportion of simulations with fewer zeros: 0.496
6.7 Interpreting the Zero-Inflated Model
# Extract and interpret coefficients
count_coefs <- coef(zinb_model, model = "count")
zero_coefs <- coef(zinb_model, model = "zero")
# Create interpretation tables
count_table <- data.frame(
Component = "Count Model",
Variable = names(count_coefs),
Coefficient = count_coefs,
Exp_Coefficient = exp(count_coefs)
)
zero_table <- data.frame(
Component = "Zero-Inflation Model",
Variable = names(zero_coefs),
Coefficient = zero_coefs,
Exp_Coefficient = exp(zero_coefs)
)
# Combine and print
rbind(count_table, zero_table) Component Variable Coefficient Exp_Coefficient
(Intercept) Count Model (Intercept) 0.06172480 1.0636696
fragment_size Count Model fragment_size 0.04037800 1.0412043
isolation Count Model isolation -0.13514589 0.8735885
understory Count Model understory 0.01986213 1.0200607
(Intercept)1 Zero-Inflation Model (Intercept) 1.69554477 5.4496139
fragment_size1 Zero-Inflation Model fragment_size -0.10761820 0.8979704
isolation1 Zero-Inflation Model isolation 0.26669228 1.3056386
- Count Model: How predictors affect woodpecker abundance in suitable habitats
- Zero-Inflation Model: How predictors affect the probability of a structural zero
6.8 Visualizing Model Predictions
# Create prediction grid for fragment size
pred_grid <- data.frame(
fragment_size = seq(min(bird_data$fragment_size),
max(bird_data$fragment_size),
length.out = 100),
isolation = mean(bird_data$isolation),
understory = mean(bird_data$understory)
)
# Predict different components
pred_grid$expected_mean <- predict(zinb_model, newdata = pred_grid, type = "response")
pred_grid$zero_prob <- predict(zinb_model, newdata = pred_grid, type = "zero")
pred_grid$expected_count <- predict(zinb_model, newdata = pred_grid, type = "count")
# Plot
p1 <- ggplot(pred_grid, aes(x = fragment_size, y = zero_prob)) +
geom_line(color = "red") +
labs(
title = "Probability of Structural Zero",
x = "Fragment Size (ha)",
y = "Probability"
) +
theme_minimal()
p2 <- ggplot(pred_grid, aes(x = fragment_size, y = expected_count)) +
geom_line(color = "blue") +
labs(
title = "Expected Count (When Not Zero)",
x = "Fragment Size (ha)",
y = "Expected Count"
) +
theme_minimal()
p3 <- ggplot(pred_grid, aes(x = fragment_size, y = expected_mean)) +
geom_line(color = "green") +
labs(
title = "Overall Expected Mean",
x = "Fragment Size (ha)",
y = "Expected Count"
) +
theme_minimal()
gridExtra::grid.arrange(p1, p2, p3, ncol = 3)6.9 Exercise: Analyzing Butterfly Survey Data
# Simulate butterfly survey data
set.seed(456)
n <- 150
# Predictors
flower_diversity <- runif(n, 0, 10)
habitat_area <- runif(n, 1, 50)
elevation <- runif(n, 0, 1000)
# Zero-inflation process
zero_prob <- plogis(1 - 0.2 * flower_diversity + 0.1 * habitat_area)
# Structural zeros
structural_zero <- rbinom(n, 1, zero_prob)
# Count process
lambda <- exp(0.5 + 0.15 * flower_diversity + 0.05 * habitat_area - 0.001 * elevation)
# Generate counts
butterfly_count <- rep(0, n)
non_zero_indices <- which(structural_zero == 0)
butterfly_count[non_zero_indices] <- rnbinom(
length(non_zero_indices),
mu = lambda[non_zero_indices],
size = 1.5
)
butterfly_data <- data.frame(
butterfly_count = butterfly_count,
flower_diversity = flower_diversity,
habitat_area = habitat_area,
elevation = elevation
)
# Tasks for students:
# 1. Explore the data distribution
# 2. Fit zero-inflated and standard count models
# 3. Compare models using AIC
# 4. Interpret the best model
# 5. Visualize predictions- Calculate the proportion of zeros and compare to expected Poisson distribution
- Try different model specifications
- Use AIC for model selection
- Create visualizations that show both zero-inflation and count components
6.10 Key Takeaways
- Zero-inflated models handle datasets with more zeros than expected
- Two types of zeros: structural and sampling
- Models can separate processes generating zeros and counts
- Zero-inflated Negative Binomial often performs best for ecological data
- Careful model selection and interpretation is crucial
6.11 Additional Resources
- Zuur et al. (2009). Mixed Effects Models and Extensions in Ecology with R
- Martin et al. (2005). “Zero Tolerance Ecology”
- Potts & Elith (2006). Comparing Species Abundance Models