3

I am trying to write a loop that iterates over a list of grouping specifications (each one a single variable or a combination of variables) and, for each of them, groups the data and sums a column. Essentially, I want to paste an entry from the list into the group_by() function so that it recognises the columns.

i.e. I want a list of grouping var/s like this, which I can amend depending on which variables or combinations I want to group by to aggregate another column

grp_by_list <- list("phase","phase,region","region")

which I can then use in a function like this:

grp_tabs <- list()
for(i in 1:length(grp_by_list)){
 grp_tabs[[paste0(grp_by_list[[i]])]] <- table %>%
 group_by(paste(grp_by_list[[i]]))%>%
 summarise(total = sum(total),
 n = n())}

It's the group_by() call and the pasting that I am having a problem with: it works for the single variables, but it doesn't like the entries where I have two variables separated by a comma. However, if I just type the two variables into group_by() directly, separated by a comma (e.g. group_by(phase, region)), that works fine. Does anyone know how to achieve this, please?

Rather than getting an error, I am just getting output tables that all look identical where no grouping has taken place, it just gives me the sum and n for all data.

Some dummy data:

dummy_data <- tibble(phase=c("first","second","third","second","third","first","third","second","first","first"),              region=c("south","east","west","north","south","east","south","west","north","west"),
total=c(12,33,45,63,45,67,10,20,29,56))
2
  • Are you trying to get out 2 summarized tables from one? Commented Oct 13 at 12:04
  • within the loop list there will be multiple tables, but for each iteration just one table (but grouped by two columns sometimes which is where I have the problem). So one table would be grouped by phase and summing etc, one table would be grouped by phase and region and summed etc... Commented Oct 13 at 12:14

4 Answers 4

4

In base R I'd use strsplit,

> grp_by_list <- list("phase", "phase,region", "region")
> 
> lapply(grp_by_list, \(g) {
+   # Split a spec such as "phase,region" into c("phase", "region").
+   g <- strsplit(g, ',')[[1]]
+   # reformulate() builds total ~ <grouping columns>; aggregate() then returns
+   # the sum and the row count of `total` for every group.
+   aggregate(reformulate(g, 'total'), dummy_data, \(x) c(sum=sum(x), n=length(x)))
+ })
[[1]]
   phase total.sum total.n
1  first       164       4
2 second       116       3
3  third       100       3

[[2]]
   phase region total.sum total.n
1  first   east        67       1
2 second   east        33       1
3  first  north        29       1
4 second  north        63       1
5  first  south        12       1
6  third  south        55       2
7  first   west        56       1
8 second   west        20       1
9  third   west        45       1

[[3]]
  region total.sum total.n
1   east       100       2
2  north        92       2
3  south        67       3
4   west       121       3

or a proper list in the first place if possible:

> grp_by_list1 <- list("phase", c("phase", "region"),"region")
> 
> lapply(grp_by_list1, \(g) {
+   # g is already a character vector of column names, so no splitting is needed.
+   aggregate(reformulate(g, 'total'), dummy_data, \(x) c(sum=sum(x), n=length(x)))
+ })
[[1]]
   phase total.sum total.n
1  first       164       4
2 second       116       3
3  third       100       3

[[2]]
   phase region total.sum total.n
1  first   east        67       1
2 second   east        33       1
3  first  north        29       1
4 second  north        63       1
5  first  south        12       1
6  third  south        55       2
7  first   west        56       1
8 second   west        20       1
9  third   west        45       1

[[3]]
  region total.sum total.n
1   east       100       2
2  north        92       2
3  south        67       3
4   west       121       3

Same logic should be applicable across packages including dplyr or the like.

The reformulate creates a formula, see:

> reformulate(c("phase", "region"), 'total')
total ~ phase + region
Sign up to request clarification or add additional context in comments.

Comments

3

You can simply strsplit on "," and pass to syms():

lapply(grp_by_list, function(gb) {
  # Split the comma-separated spec and turn each column name into a symbol.
  grouping_syms <- syms(strsplit(gb, ",")[[1]])
  # Splice the symbols into group_by() as if they had been typed by hand.
  grouped <- group_by(dummy_data, !!!grouping_syms)
  # One row per group: the sum of `total` and the number of rows.
  summarise(grouped, total = sum(total), n = n())
})

Output:

[[1]]
# A tibble: 3 × 3
  phase  total     n
  <chr>  <dbl> <int>
1 first    164     4
2 second   116     3
3 third    100     3

[[2]]
# A tibble: 9 × 4
# Groups:   phase [3]
  phase  region total     n
  <chr>  <chr>  <dbl> <int>
1 first  east      67     1
2 first  north     29     1
3 first  south     12     1
4 first  west      56     1
5 second east      33     1
6 second north     63     1
7 second west      20     1
8 third  south     55     2
9 third  west      45     1

[[3]]
# A tibble: 4 × 3
  region total     n
  <chr>  <dbl> <int>
1 east     100     2
2 north     92     2
3 south     67     3
4 west     121     3

Comments

2

Here is a solution:

grp_by_list <- list("phase","phase,region","region")

# flatten the list into a character vector
grp_by_list_unl <- unlist(grp_by_list)

# split every element on commas and keep each column name only once, so the
# loop below does not rebuild the same table for repeated names
grp_by_list_unl_split <- unique(unlist(strsplit(grp_by_list_unl, split = ",")))

# loop through the column names and build the table grouped by "phase"
for (i in grp_by_list_unl_split) {
  if (i == "phase") {
    phase_tab <- dummy_data |>
      # .data[[i]] looks the column up by the name stored in i
      group_by(.data[[i]]) |>
      summarize(total = sum(total), n = n()) |>
      ungroup()
  }
}

phase_tab

# A tibble: 3 × 3
  phase  total     n
  <chr>  <dbl> <int>
1 first    164     4
2 second   116     3
3 third    100     3


# region table
for (i in grp_by_list_unl_split) {
  # Skip every column name except "region".
  if (i != "region") {
    next
  }
  # Group by the column named in i, then total the values and count the rows.
  by_region <- group_by(dummy_data, .data[[i]])
  region_tab <- ungroup(summarize(by_region, total = sum(total), n = n()))
}

region_tab

# A tibble: 4 × 3
  region total     n
  <chr>  <dbl> <int>
1 east     100     2
2 north     92     2
3 south     67     3
4 west     121     3

This way, you have two tables grouped by the column names in the list.

Comments

1

A minimal code fix in {dplyr} is to pass the groups obtained from strsplit() directly to the .by argument of summarise(). I also kept the part of your code that names each list element after the grouping applied.

library(dplyr)

table <- tibble(phase=c("first","second","third","second","third","first","third","second","first","first"),              region=c("south","east","west","north","south","east","south","west","north","west"),
                     total=c(12,33,45,63,45,67,10,20,29,56))

grp_by_list <- list("phase","phase,region","region")

grp_tabs <- list()

# Build one summary table per grouping spec; each list element is named after
# its spec string (e.g. "phase,region").
# seq_along() is used so that an empty grp_by_list yields zero iterations.
for (i in seq_along(grp_by_list)) {
  # Split the comma-separated spec into a character vector of column names.
  grp_cols <- strsplit(grp_by_list[[i]], ",")[[1]]
  grp_tabs[[grp_by_list[[i]]]] <- table |>
    # all_of() marks grp_cols as an external vector of column names.
    summarise(total = sum(total), n = n(), .by = all_of(grp_cols))
}

> grp_tabs
$phase
# A tibble: 3 × 3
  phase  total     n
  <chr>  <dbl> <int>
1 first    164     4
2 second   116     3
3 third    100     3

$`phase,region`
# A tibble: 9 × 4
  phase  region total     n
  <chr>  <chr>  <dbl> <int>
1 first  south     12     1
2 second east      33     1
3 third  west      45     1
4 second north     63     1
5 third  south     55     2
6 first  east      67     1
7 second west      20     1
8 first  north     29     1
9 first  west      56     1

$region
# A tibble: 4 × 3
  region total     n
  <chr>  <dbl> <int>
1 south     67     3
2 east     100     2
3 west     121     3
4 north     92     2

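Two more options if you want to keep the order introduced by grouping (group_by() returns the groups sorted by the grouping columns, whereas .by keeps them in order of first appearance):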

# Same summaries via group_by(); each list element is named after its spec.
# seq_along() is used so that an empty grp_by_list yields zero iterations.
for (i in seq_along(grp_by_list)) {
  # Split the comma-separated spec into a character vector of column names.
  grp_cols <- strsplit(grp_by_list[[i]], ",")[[1]]
  grp_tabs[[grp_by_list[[i]]]] <- table |>
    group_by(across(all_of(grp_cols))) |>
    # .groups = "drop" returns an ungrouped tibble; without it the
    # two-column result would stay grouped by its first column.
    summarise(total = sum(total), n = n(), .groups = "drop")
}
# or with lapply

lapply(grp_by_list, \(x) {
  # Column names encoded in the comma-separated spec, e.g. "phase,region".
  grp_cols <- strsplit(x, ",")[[1]]
  # .groups = "drop" returns an ungrouped tibble; without it the two-column
  # result would stay grouped by its first column.
  table |>
    group_by(across(all_of(grp_cols))) |>
    summarise(total = sum(total), n = n(), .groups = "drop")
})

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.