Creating a group by loop using either single or multiple variables from a list in R

Question

I am trying to write a loop that iterates over a list of grouping specifications (each either a single variable or several variables) and sums a column for each one. Essentially, I want to paste an entry from the list into the group_by() function so that it recognises the entry as column names.

i.e. I want a list of grouping var/s like this, which I can amend depending on which variables or combinations I want to group by to aggregate another column

grp_by_list <- list("phase","phase,region","region")

which I can then use in a function like this:

# Collects one summary table per entry of grp_by_list, keyed by that entry.
grp_tabs <- list()

for(i in 1:length(grp_by_list)){
 # each result is stored under its grouping string, e.g. "phase" or "phase,region"
 grp_tabs[[paste0(grp_by_list[[i]])]] <- table %>%
 # NOTE: group_by() is handed the value returned by paste(), i.e. a character
 # string, rather than column names -- this is the step the question is about
 group_by(paste(grp_by_list[[i]]))%>%
 # sum the total column and count the rows of whatever groups were formed
 summarise(total = sum(total),
 n = n())}

It's the group_by() and pasting I am having a problem with, it works for the single variables but it doesn't like where I have two variables separated by a comma. However if I just type in the two variables separated by a comma (as shown below), that works fine. Does anyone know how to achieve this please?

Rather than getting an error, I am just getting output tables that all look identical where no grouping has taken place, it just gives me the sum and n for all data.

Some dummy data:

# Dummy data: ten rows, each with a phase label, a region label and a total
dummy_data <- tibble(
  phase = c("first", "second", "third", "second", "third",
            "first", "third", "second", "first", "first"),
  region = c("south", "east", "west", "north", "south",
             "east", "south", "west", "north", "west"),
  total = c(12, 33, 45, 63, 45, 67, 10, 20, 29, 56)
)

Within the list being looped over there will be multiple tables, but each iteration produces just one table (sometimes grouped by two columns, which is where I have the problem). So one table would be grouped by phase and summed, another would be grouped by phase and region and summed, and so on.
– KatChristiansen, Commented Oct 13 at 12:14

jay.sf · Accepted Answer · 2025-10-13 12:43:44Z

In base R I'd use strsplit,

> grp_by_list <- list("phase", "phase,region", "region")
> 
> lapply(grp_by_list, \(g) {
+   g <- strsplit(g, ',')[[1]]
+   aggregate(reformulate(g, 'total'), dummy_data, \(x) c(sum=sum(x), n=length(x)))
+ })
[[1]]
   phase total.sum total.n
1  first       164       4
2 second       116       3
3  third       100       3

[[2]]
   phase region total.sum total.n
1  first   east        67       1
2 second   east        33       1
3  first  north        29       1
4 second  north        63       1
5  first  south        12       1
6  third  south        55       2
7  first   west        56       1
8 second   west        20       1
9  third   west        45       1

[[3]]
  region total.sum total.n
1   east       100       2
2  north        92       2
3  south        67       3
4   west       121       3

or a proper list in the first place if possible:

> grp_by_list1 <- list("phase", c("phase", "region"),"region")
> 
> lapply(grp_by_list1, \(g) {
+   aggregate(reformulate(g, 'total'), dummy_data, \(x) c(sum=sum(x), n=length(x)))
+ })
[[1]]
   phase total.sum total.n
1  first       164       4
2 second       116       3
3  third       100       3

[[2]]
   phase region total.sum total.n
1  first   east        67       1
2 second   east        33       1
3  first  north        29       1
4 second  north        63       1
5  first  south        12       1
6  third  south        55       2
7  first   west        56       1
8 second   west        20       1
9  third   west        45       1

[[3]]
  region total.sum total.n
1   east       100       2
2  north        92       2
3  south        67       3
4   west       121       3

Same logic should be applicable across packages including dplyr or the like.

The reformulate creates a formula, see:

> reformulate(c("phase", "region"), 'total')
total ~ phase + region

langtang · Accepted Answer · 2025-10-13 13:03:38Z

You can simply strsplit on "," and pass to syms():

# For every grouping string, build one summary table of dummy_data.
lapply(grp_by_list, function(gb) {
  # "phase,region" -> list of symbols `phase`, `region`
  grp_syms <- syms(strsplit(gb, ",")[[1]])
  # splice the symbols into group_by() as if they had been typed by hand
  grouped <- group_by(dummy_data, !!!grp_syms)
  # sum of total and row count for each group
  summarise(grouped, total = sum(total), n = n())
})

Output:

[[1]]
# A tibble: 3 × 3
  phase  total     n
  <chr>  <dbl> <int>
1 first    164     4
2 second   116     3
3 third    100     3

[[2]]
# A tibble: 9 × 4
# Groups:   phase [3]
  phase  region total     n
  <chr>  <chr>  <dbl> <int>
1 first  east      67     1
2 first  north     29     1
3 first  south     12     1
4 first  west      56     1
5 second east      33     1
6 second north     63     1
7 second west      20     1
8 third  south     55     2
9 third  west      45     1

[[3]]
# A tibble: 4 × 3
  region total     n
  <chr>  <dbl> <int>
1 east     100     2
2 north     92     2
3 south     67     3
4 west     121     3

Ifeanyi Idiaye · Accepted Answer · 2025-10-13 12:26:20Z

Here is a solution:

grp_by_list <- list("phase", "phase,region", "region")

# flatten the list into a plain character vector
grp_by_list_unl <- unlist(grp_by_list)

# break every entry apart at the commas, giving one column name per element
grp_by_list_unl_split <- unlist(strsplit(grp_by_list_unl, split = ","))

# phase table: walk the column names and summarise whenever "phase" comes up
for (i in grp_by_list_unl_split) {
  # anything other than the phase column is skipped
  if (i != "phase") {
    next
  }
  # group by the column named in i, aggregate, then drop the grouping
  phase_tab <- ungroup(
    summarize(
      group_by(dummy_data, .data[[i]]),
      total = sum(total),
      n = n()
    )
  )
}

phase_tab

# A tibble: 3 × 3
  phase  total     n
  <chr>  <dbl> <int>
1 first    164     4
2 second   116     3
3 third    100     3


# region table: same scan of the column names, summarising on "region"
for (i in grp_by_list_unl_split) {
  # anything other than the region column is skipped
  if (i != "region") {
    next
  }
  # group by the column named in i, aggregate, then drop the grouping
  region_tab <- ungroup(
    summarize(
      group_by(dummy_data, .data[[i]]),
      total = sum(total),
      n = n()
    )
  )
}

region_tab

# A tibble: 4 × 3
  region total     n
  <chr>  <dbl> <int>
1 east     100     2
2 north     92     2
3 south     67     3
4 west     121     3

This way, you have two tables grouped by the column names in the list.

lailaps · Accepted Answer · 2025-10-14 07:01:43Z

A minimal code fix in {dplyr} is to pass the result of strsplit() directly to the .by argument of summarise(). I also retained the part of your code that names each list element after the grouping that was applied.

library(dplyr)

# Example data; the object name `table` is kept from the question's code.
table <- tibble(
  phase = c("first", "second", "third", "second", "third",
            "first", "third", "second", "first", "first"),
  region = c("south", "east", "west", "north", "south",
             "east", "south", "west", "north", "west"),
  total = c(12, 33, 45, 63, 45, 67, 10, 20, 29, 56)
)

# Each entry names one grouping: a single column, or several columns
# separated by commas.
grp_by_list <- list("phase", "phase,region", "region")

# One summary table per grouping, stored under the grouping string.
grp_tabs <- list()

# seq_along() instead of 1:length(): with an empty list the loop body is
# simply never run, whereas 1:0 would try to index elements 1 and 0.
for (i in seq_along(grp_by_list)) {
  grp_key <- grp_by_list[[i]]
  # "phase,region" -> c("phase", "region"); fixed = TRUE treats the comma
  # as a literal separator rather than a regular expression
  grp_cols <- strsplit(grp_key, ",", fixed = TRUE)[[1]]
  grp_tabs[[grp_key]] <- table |>
    # all_of() marks grp_cols as a vector of column names and errors if
    # any of them is missing from the data
    summarise(total = sum(total), n = n(), .by = all_of(grp_cols))
}

> grp_tabs
$phase
# A tibble: 3 × 3
  phase  total     n
  <chr>  <dbl> <int>
1 first    164     4
2 second   116     3
3 third    100     3

$`phase,region`
# A tibble: 9 × 4
  phase  region total     n
  <chr>  <chr>  <dbl> <int>
1 first  south     12     1
2 second east      33     1
3 third  west      45     1
4 second north     63     1
5 third  south     55     2
6 first  east      67     1
7 second west      20     1
8 first  north     29     1
9 first  west      56     1

$region
# A tibble: 4 × 3
  region total     n
  <chr>  <dbl> <int>
1 south     67     3
2 east     100     2
3 west     121     3
4 north     92     2

Two more options if you want to keep the order introduced by grouping

# Variant that keeps the row order produced by group_by().
# seq_along() instead of 1:length(): an empty list gives zero iterations.
for (i in seq_along(grp_by_list)) {
  grp_key <- grp_by_list[[i]]
  # "phase,region" -> c("phase", "region")
  grp_cols <- strsplit(grp_key, ",", fixed = TRUE)[[1]]
  grp_tabs[[grp_key]] <- table |>
    group_by(across(all_of(grp_cols))) |>
    # .groups = "drop" returns an ungrouped tibble; without it a
    # multi-column grouping stays grouped by all but the last column and
    # summarise() prints a message about it
    summarise(total = sum(total), n = n(), .groups = "drop")
}
# or with lapply

# Returns an unnamed list with one summary table per grouping string.
lapply(grp_by_list, \(x) {
  # "phase,region" -> c("phase", "region")
  grp_cols <- strsplit(x, ",", fixed = TRUE)[[1]]
  table |>
    group_by(across(all_of(grp_cols))) |>
    # .groups = "drop" returns an ungrouped tibble instead of one that is
    # still grouped by all but the last grouping column
    summarise(total = sum(total), n = n(), .groups = "drop")
})

Collectives™ on Stack Overflow

Creating a group by loop using either single or multiple variables from a list in R

4 Answers 4

Comments

Comments

Comments

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

4 Answers 4

Comments

Comments

Comments

Comments

Your Answer

Sign up or log in

Post as a guest

Related