Add a column with distribution plot of every variable in a summary table

Question

Is there a nice way to add a column with the distribution plot of every variable?

For example here:

# attach the vtable package (st() presumably comes from it -- confirm)
library(vtable)
# summary table of the built-in iris data; the question asks how to add a
# distribution-plot column to a table like this one
st(iris)

Not necessary to use this package. Open to suggestions

This seems very complicated to do, why not show the summary stats in the distribution plots themselves? :) — Dion Groothof
– Dion Groothof, Commented Jan 16, 2022 at 19:57
Also, the skim()-function from package skimr could be an option if plotting histograms is sufficient. — user14692575
– user14692575, Commented Jan 16, 2022 at 20:24

Dion Groothof · Accepted Answer · 2022-01-17 17:20:36Z

Even though I am aware that this might be much easier to do with ggplot, I am always eager to see whether I can achieve a similar result with base R plotting tools. I will make use of the iris data in this example.

We first need to identify which columns of our data.frame are numeric.

# Logical vector with one entry per column of iris: TRUE if the column is
# numeric. is.numeric() is TRUE for both double and integer columns, whereas
# inherits(x, 'numeric') is FALSE for integer columns. unname() keeps the
# result a plain positional vector, as plot_densities() indexes it by position.
numeric_cols <- unname(vapply(iris, is.numeric, logical(1L)))

Then, we may select some arbitrary colours for densities. Here, I picked three colours which correspond to the number of levels of iris$Species.

# line colours, one per density curve
my_cols <- c('blue4', 'darkorange', '#00b0a4')
# same colour at 20% opacity, used to fill the area under a curve
adj_col <- \(x) adjustcolor(x, alpha.f = 0.2)
# transparent counterpart of every entry of my_cols, in the same order
my_transp_cols <- vapply(my_cols, adj_col, character(1L), USE.NAMES = FALSE)

Now we need to plot the densities. The function that is given below (i.e. plot_densities) has an option to either provide marginal densities or densities conditional on some factor variable. If you would like to obtain the densities conditional on some factor variable, simply set include_factor to TRUE and pass the factor variable of interest to the factor argument.

## Plot one kernel density panel per numeric column of a data frame.
##
## Arguments:
##   DF             data frame; rows with any missing value are dropped first.
##   columns        logical vector with one entry per column of DF; TRUE marks
##                  the columns to plot (they must be numeric).
##   include_factor FALSE: one marginal density per panel.
##                  TRUE: one density per level of `factor`, overlaid in the
##                  same panel.
##   factor         name (or index) of the grouping column; required when
##                  include_factor is TRUE.
##   n_panels       number of side-by-side panels per page (default 4).
##   cols           line colours; defaults to the script-level my_cols.
##   fill_cols      fill colours; defaults to the script-level my_transp_cols.
##
## Each panel's legend shows "mean (SD)" rounded to one decimal. The function
## is called for its plotting side effect and returns NULL invisibly. The
## margin/clipping settings and the layout are reset when it exits.
plot_densities <- \(DF, columns, include_factor = FALSE, factor,
                    n_panels = 4L, cols = my_cols,
                    fill_cols = my_transp_cols) {
  stopifnot(
    is.data.frame(DF),
    is.logical(columns),
    length(columns) == ncol(DF),
    length(n_panels) == 1L,
    n_panels >= 1L
  )
  if (include_factor && missing(factor)) {
    stop("'factor' must be supplied when include_factor = TRUE",
         call. = FALSE)
  }
  ## drop = FALSE keeps a one-column DF a data frame
  DF <- DF[complete.cases(DF), , drop = FALSE]
  ## positions of the columns to plot; iterating over these (rather than over
  ## 1..number of TRUEs) also works when a non-numeric column comes first
  num_idx <- which(columns)
  if (length(num_idx) == 0L) {
    return(invisible(NULL))
  }
  ## "mean (SD)" formatting: one decimal, trailing zero kept
  r <- \(x) format(round(x, 1L), nsmall = 1L)

  ## setting up plotting device; wide right margin leaves room for the legend,
  ## xpd = TRUE lets the legend be drawn outside the plot region
  layout(matrix(seq_len(n_panels), ncol = n_panels))
  old_par <- par(mar = c(5, 4, 4, 8) + 0.1, xpd = TRUE)
  on.exit({
    par(old_par)
    layout(1)
  }, add = TRUE)

  if (include_factor) {
    ## densities per factor level; unused levels would give empty subsets
    grp <- droplevels(as.factor(DF[[factor]]))
    lvls <- levels(grp)
    no_of_levels <- length(lvls)
    ## recycle the palettes if there are more levels than colours
    line_cols <- rep_len(cols, no_of_levels)
    area_cols <- rep_len(fill_cols, no_of_levels)
    for (i in num_idx) {
      ## values of column i split by level (list is in level order)
      by_level <- split(DF[[i]], grp)
      dens <- lapply(by_level, density)
      ## common axis limits so every level fits in the same panel
      x_lim <- range(vapply(dens, \(d) range(d$x), numeric(2L)))
      y_lim <- c(0, max(vapply(dens, \(d) max(d$y), numeric(1L))))
      means <- vapply(by_level, \(v) r(mean(v, na.rm = TRUE)), character(1L))
      SDs <- vapply(by_level, \(v) r(sd(v, na.rm = TRUE)), character(1L))
      ## first level opens the panel, remaining levels are added to it
      plot(dens[[1L]], main = '',
           las = 1, col = line_cols[1L], xlab = '',
           xlim = x_lim, ylim = y_lim, bty = 'n')
      title(main = names(DF)[i], xpd = TRUE, adj = 1)
      for (j in seq_len(no_of_levels)) {
        if (j > 1L) {
          lines(dens[[j]], col = line_cols[j])
        }
        polygon(dens[[j]], density = -1L, col = area_cols[j])
      }
      ## add legend to the plot
      legend('topright', paste0(lvls, ': ', means, ' (', SDs, ')'),
             fill = area_cols, bty = 'n',
             inset = c(-0.5, 0.1))
    }
  } else {
    ## marginal densities, one column per panel
    for (i in num_idx) {
      values <- DF[[i]]
      dens <- density(values)
      plot(dens,
           las = 1, main = names(DF)[i], col = cols[1L], xlab = '',
           bty = 'n')
      polygon(dens, density = -1L, col = fill_cols[1L])
      ## add legend to the plot
      legend('topright',
             paste0(names(DF)[i], ': ', r(mean(values, na.rm = TRUE)),
                    ' (', r(sd(values, na.rm = TRUE)), ')'),
             fill = fill_cols[1L], bty = 'n',
             inset = c(-0.5, 0.1))
    }
  }
  invisible(NULL)
}

We can save the output as a .pdf file. If you change the layout of the plotting device, then you will also have to adjust the width and height a little to make it fit your specific situation.

# marginal densities: one panel per numeric column of iris
marginal_pdf <- 'my_directory/my_plot.pdf' # change my_directory
pdf(file = marginal_pdf, width = 13, height = 4)
plot_densities(DF = iris, columns = numeric_cols)
dev.off()

# conditional densities: the same panels, split by Species
conditional_pdf <- 'my_directory/my_plot2.pdf' # change my_directory
pdf(file = conditional_pdf, width = 13, height = 4)
plot_densities(
  DF = iris,
  columns = numeric_cols,
  include_factor = TRUE,
  factor = 'Species'
)
dev.off()

I usually make .pdf files of my plots and then use this online converting tool to convert them to .png files.

I have shown the mean (SD) in the figure legends, but you may choose to show any statistic you like. Just replace the calls to mean() and sd() inside the function with the statistics of your interest.

Output

Marginal densities

Conditional densities

Note: use function(x) instead of \(x) if you use a version of R <4.1.0

Collectives™ on Stack Overflow

Add a column with distribution plot of every variable in a summary table

1 Answer 1

Comments

Your Answer

Linked

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

Comments

Your Answer

Sign up or log in

Post as a guest

Linked

Related