Calculate fitted values from simulated posterior distribution for multiple factors in R

Question

I am trying to simulate data from a liner mixed model in R using the sim() function from the arm package.

Here is a subset of my data.

> dput(data)
structure(list(id = c(989001026313185, 989001026313143, 989001026313517, 
989001004605135, 989001026313424, 989001006700707, 989001005924838, 
989001004605145, 989001006700794, 989001026313151, 989001006700633, 
989001005924953, 989001005924862, 989001026313498, 989001026313255, 
989001026313461, 989001026313160, 989001026313256, 989001026313322, 
989001026313172, 989001026313388, 989001026313378, 989001026313357, 
989001005924856, 989001026313343, 989001026313417, 989001026313369, 
989001026313152, 985121031803859, 989001026313369, 989001006700560, 
989001026313370, 989001006700537, 989001026313424, 989001026313463, 
989001030278434), sex = c("F", "F", "F", "F", "F", "F", "F", 
"F", "F", "F", "F", "F", "F", "F", "M", "M", "M", "M", "M", "F", 
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "M", "F", 
"F", "F", "M"), size = c(52, 75, 71.5, 90, 80.6, 67.6, 86.5, 
91.6, 103.1, 77, 70.8, 103.2, 88, 85.2, 50, 51.2, 45.4, 46, 50, 
95.6, 85.8, 82, 87.4, 74.6, 74, 93.9, 91, 92.5, 88.3, 92, 92.6, 
54.2, 60, 80.2, 94, 51.9), loc = c("loc1", "loc1", "loc1", "loc1", 
"loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc1", 
"loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc2", 
"loc2", "loc2", "loc2", "loc2", "loc2", "loc2", "loc2", "loc2", 
"loc2", "loc2", "loc2", "loc3", "loc3", "loc3", "loc3", "loc3"
), output = c(-12.2, -13.9, -14.1, -13.1, -11.7, -11.6, -11.6, 
-12.6, -12.6, -13.3, -13.3, -13, -13.4, -13.9, -13.8, -11.9, 
-13.5, -11.5, -10.7, -15.1, -11.7, -13.9, -12.9, -14.6, -14.3, 
-14.9, -13.8, -13.7, -11.8, -14.1, -12.2, -12.5, -12.2, -11.6, 
-11.9, -11)), class = "data.frame", row.names = c(NA, -36L))

I ran a LMM to understand the influence of location (factor, 3 levels: loc1, loc2, loc3), sex (factor, 2 levels: M and F) and size (continuous in cm) on the output:

mod <- lme4::lmer(output ~ sex + size + location + (1|id)

I then drew 1000 random values from the joint posterior ditribution of the model parameters.

nsim <- 1000
bsim_mod <- sim(mod, n.sim = nsim)

I would like to plot the results now by visualising the credible interval of the bsim_mod object and the observations.

While I can do that for the different size range in females (52 to 104 cm) and males (45 to 55 cm) I am struggling on how to also include the location information in the code below.

length(seq(52,104,1)) # number sizes for females = 53
length(seq(45,55,1)) # number sizes for males = 11

## create sex, size, vectors
sex_vec <- factor(c(rep("F", 53), rep("M", 11)), levels = c("F","M"))
size_vec <- c(seq(52,104,1), seq(45,55,1))

## obtain 1000 simulated values from the posterior distribution for the size range in males and females
newdat <- data.frame(sex_vec,size_vec)
colnames(newdat) <- c("sex", "size")
Xmat <- model.matrix(~sex + size, data = newdat)
fitmat <- matrix(ncol = 1000, nrow = nrow(newdat))
for(i in 1:1000) fitmat[,i] <- Xmat %*% bsim_mod@fixef[i,]
newdat$lower <- apply(fitmat, 1, quantile, prob = 0.025)
newdat$postmean <- apply(fitmat, 1, mean)
newdat$upper <- apply(fitmat, 1, quantile, prob = 0.975)

## Now simulate for the mean size only
newdat_mean <- data.frame(sex = factor(c("F", "M"), levels = levels(data$sex)),
                               size = c(83, 50)
colnames(newdat_mean) <- c("sex", "size")
Xmat_mean <- model.matrix(~sex + size, data = newdat_mean)
fitmat_mean <- matrix(ncol = 1000, nrow = nrow(newdat_mean))

for(i in 1:1000) fitmat_mean[,i] <- Xmat_mean %*% bsim_mod@fixef[i,]
newdat_mean$lower <- apply(fitmat_mC_meanDW, 1, quantile, prob = 0.025)
newdat_mean$postmean <- apply(fitmat_mean, 1, mean)
newdat_mean$upper <- apply(fitmat_mean, 1, quantile, prob = 0.975)

The goal would be to simulate the values but also taking into account that the samples were taken in three different locations.

DaveArmstrong · Accepted Answer · 2025-03-25 18:40:36Z

1

I would probably do it like this:

library(lme4)
library(arm)
library(ggplot2)
library(dplyr)

mod <- lme4::lmer(output ~ sex + size + loc + (1|id), data=data)

nsim <- 1000
bsim_mod <- sim(mod, n.sim = nsim)

To generate the data for the simulation you could group by sex and location and then create a sequence of sizes of the range of sizes in each location-sex combination. You can also save the mean size in each location for use later.

newdat <- data %>% 
  group_by(sex, loc) %>% 
  reframe(mean_size = round(mean(size)), 
          size = seq(floor(min(size)), ceiling(max(size)), by=1))

You can generate the predictions with a product of the model matrix and the simulated coefficients, we can do it without a loop.

Xmat <- model.matrix(~sex + size + loc, data = newdat)
fitmat <- Xmat %*% t(bsim_mod@fixef)

You can summarize the posterior predictions and add them back into the newdat object.

fitdat <- t(apply(fitmat, 1, \(x)c(mean(x), quantile(x, c(.025,.975)))))
colnames(fitdat) <- c("postmean", "lower", "upper")
newdat <- cbind(newdat, fitdat)

If you need a data frame with just he mean sizes in each location-gender combination, you can now get it by filtering the object you just created, rather than redoing everything.

newdat_mean <- newdat %>%
  group_by(sex, loc) %>% 
  filter(mean_size == size)

Then, you could make the graph.

ggplot(newdat, aes(x=size, y=postmean, colour=sex, fill=sex)) + 
  geom_ribbon(aes(ymin=lower, ymax=upper), colour="transparent", alpha=.15) + 
  geom_line() + 
  facet_wrap(~loc) + 
  theme_bw()

Figure whth predicted values for sex and size by location

data <- structure(list(id = c(989001026313185, 989001026313143, 989001026313517, 
989001004605135, 989001026313424, 989001006700707, 989001005924838, 
989001004605145, 989001006700794, 989001026313151, 989001006700633, 
989001005924953, 989001005924862, 989001026313498, 989001026313255, 
989001026313461, 989001026313160, 989001026313256, 989001026313322, 
989001026313172, 989001026313388, 989001026313378, 989001026313357, 
989001005924856, 989001026313343, 989001026313417, 989001026313369, 
989001026313152, 985121031803859, 989001026313369, 989001006700560, 
989001026313370, 989001006700537, 989001026313424, 989001026313463, 
989001030278434), sex = c("F", "F", "F", "F", "F", "F", "F", 
"F", "F", "F", "F", "F", "F", "F", "M", "M", "M", "M", "M", "F", 
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "M", "F", 
"F", "F", "M"), size = c(52, 75, 71.5, 90, 80.6, 67.6, 86.5, 
91.6, 103.1, 77, 70.8, 103.2, 88, 85.2, 50, 51.2, 45.4, 46, 50, 
95.6, 85.8, 82, 87.4, 74.6, 74, 93.9, 91, 92.5, 88.3, 92, 92.6, 
54.2, 60, 80.2, 94, 51.9), loc = c("loc1", "loc1", "loc1", "loc1", 
"loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc1", 
"loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc2", 
"loc2", "loc2", "loc2", "loc2", "loc2", "loc2", "loc2", "loc2", 
"loc2", "loc2", "loc2", "loc3", "loc3", "loc3", "loc3", "loc3"
), output = c(-12.2, -13.9, -14.1, -13.1, -11.7, -11.6, -11.6, 
-12.6, -12.6, -13.3, -13.3, -13, -13.4, -13.9, -13.8, -11.9, 
-13.5, -11.5, -10.7, -15.1, -11.7, -13.9, -12.9, -14.6, -14.3, 
-14.9, -13.8, -13.7, -11.8, -14.1, -12.2, -12.5, -12.2, -11.6, 
-11.9, -11)), class = "data.frame", row.names = c(NA, -36L))

^{Created on 2025-03-25 with reprex v2.1.1.9000}

answered Mar 25 at 18:40

DaveArmstrong

22.5k2 gold badges16 silver badges30 bronze badges

Sign up to request clarification or add additional context in comments.

2 Comments

smok Mar 26 at 10:03

Hi Dave, This works perfectly! Thank you for your help. One question I had was why the fitmat <- Xmat %*% t(bsim_mod@fixef) line is equivalent to the for loop approach I used in the question? Wouldn't the t(bsim_mod@fixef) approach compute predictions for a single fixed effects estimate, while the for loop approach computes predictions for multiple simulated fixed effects estimates?

DaveArmstrong Mar 26 at 16:37

@smok The matrix multiplication approach will calculate predictions for multiple effects. If Xmat is an n x k matrix, where n is the number of different combinations for which you want to calculate predictions and k is the number of parameters in the model, and bsim_mod@fixef is an R x k matrix where R is the number of simulations, the resulting fitmat = Xmat %*% t(bsim_mod@fixef) will be an n x R matrix of predicted values. Summarizing across the columns of fitmat in each row summarises the posterior distribution of the fixed effects for each combination.

Collectives™ on Stack Overflow

Calculate fitted values from simulated posterior distribution for multiple factors in R

1 Answer 1

2 Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

2 Comments

Your Answer

Sign up or log in

Post as a guest

Related