4

I have some trouble with the which.min function inside a dplyr pipe. I have a cumbersome solution (*), and I'm looking for a more compact and elegant way to do this.

  1. reproducible example
library(dplyr)

# Example data: three numeric columns with scattered NAs; row 2 is entirely NA.
data <- data.frame(
  s1 = c(10, NA, 5, NA, NA),
  s2 = c(8, NA, NA, 4, 20),
  s3 = c(NA, NA, 2, NA, 10)
)
data
#>   s1 s2 s3
#> 1 10  8 NA
#> 2 NA NA NA
#> 3  5 NA  2
#> 4 NA  4 NA
#> 5 NA 20 10
  1. Min value:

here with min(x,na.rm=TRUE) I could extract the min value

# Row-wise minimum over s1, s2, s3, ignoring NAs.
# For a row where all three are NA, min() warns and returns Inf.
data %>%
  rowwise() %>%
  mutate(Min_s = min(c(s1, s2, s3), na.rm = TRUE))
#> Warning: There was 1 warning in `mutate()`.
#> ℹ In argument: `Min_s = min(c(s1, s2, s3), na.rm = TRUE)`.
#> ℹ In row 2.
#> Caused by warning in `min()`:
#> ! no non-missing arguments to min; returning Inf
#> # A tibble: 5 × 4
#> # Rowwise: 
#>      s1    s2    s3 Min_s
#>   <dbl> <dbl> <dbl> <dbl>
#> 1    10     8    NA     8
#> 2    NA    NA    NA   Inf
#> 3     5    NA     2     2
#> 4    NA     4    NA     4
#> 5    NA    20    10    10
  1. extracting variable containing min val:

Here I'm having trouble extracting which variable contains the min value

# Row-wise position of the minimum among s1, s2, s3.
# This fails on the all-NA row: which.min() returns a zero-length integer
# there, but mutate() needs a size-1 result for every row.
data %>%
  rowwise() %>%
  mutate(which_s = which.min(c(s1, s2, s3)))
#> Error in `mutate()`:
#> ℹ In argument: `which_s = which.min(c(s1, s2, s3))`.
#> ℹ In row 2.
#> Caused by error:
#> ! `which_s` must be size 1, not 0.
#> ℹ Did you mean: `which_s = list(which.min(c(s1, s2, s3)))` ?

# Solution (*)
# Guard the all-NA row explicitly: return NA there, otherwise the position of
# the minimum among s1, s2, s3.
data %>%
  rowwise() %>%
  mutate(
    which_s = if (all(is.na(c(s1, s2, s3)))) {
      NA
    } else {
      which.min(c(s1, s2, s3))
    }
  )
#> # A tibble: 5 × 4
#> # Rowwise: 
#>      s1    s2    s3 which_s
#>   <dbl> <dbl> <dbl>   <int>
#> 1    10     8    NA       2
#> 2    NA    NA    NA      NA
#> 3     5    NA     2       3
#> 4    NA     4    NA       2
#> 5    NA    20    10       3

Created on 2024-11-07 with reprex v2.1.0

1
  • 4
    The pipe does not like it that which.min() returns a zero length integer vector if you pass a vector of all NAs to it. So you can define a function that returns NA instead: which.min.na <- function(x) if (all(is.na(x))) NA else which.min(x). Then just replace which.min with which.min.na in your first attempt above that originally threw an error. Commented Nov 7, 2024 at 13:23

3 Answers 3

7

In your second row, which.min() returns integer(0) for the column which_s, and that is why the code cannot run without errors.

Instead, you could first store the results in a list, and then unnest (don't forget to enable keep_empty argument in unnest)

# which.min() yields integer(0) for the all-NA row, so store each row's result
# in a list column first and then unnest it; keep_empty = TRUE keeps that row
# (as NA) instead of dropping it.
# unnest() comes from tidyr, which is not attached by library(dplyr), so it is
# called with an explicit namespace.
data %>%
    rowwise() %>%
    mutate(which_s = list(which.min(c(s1, s2, s3)))) %>%
    tidyr::unnest(which_s, keep_empty = TRUE)

which gives

# A tibble: 5 × 4
     s1    s2    s3 which_s
  <dbl> <dbl> <dbl>   <int>
1    10     8    NA       2
2    NA    NA    NA      NA
3     5    NA     2       3
4    NA     4    NA       2
5    NA    20    10       3
Sign up to request clarification or add additional context in comments.

Comments

2

Without using rowwise(), you could do this in either base R or a single mutate() step using purrr::pmap_chr():

Base R:

# For each row: the name of the column holding the smallest value, or NA when
# the whole row is missing.
data$min_base <- unlist(apply(
  data, 1,
  \(row) if (all(is.na(row))) NA else names(data)[which.min(row)]
))

dplyr/purrr

library(dplyr)

# For each row of s1:s3, store the name of the column with the smallest value
# in `min_dplyr` (NA when the whole row is missing).
data <- data %>%
  mutate(min_dplyr = purrr::pmap_chr(select(., s1:s3), \(...) {
    # pmap() passes each row's values as named arguments, so c(...) is a named
    # vector and the winning name can be read from it directly, independent of
    # where s1:s3 sit inside `data`.
    vals <- c(...)
    # pmap_chr() expects a length-1 character result, hence the typed NA.
    if (all(is.na(vals))) NA_character_ else names(vals)[which.min(vals)]
  }))

Output:

#   s1 s2 s3 min_base min_dplyr
# 1 10  8 NA       s2        s2
# 2 NA NA NA     <NA>      <NA>
# 3  5 NA  2       s3        s3
# 4 NA  4 NA       s2        s2
# 5 NA 20 10       s3        s3

Note that among these answers, the base R custom function by @friede is substantially faster, followed by this base R approach:

# Stack the example rows 1e5 times to get a data set large enough to time.
# NOTE(review): assumes `data` holds only the numeric columns s1:s3 at this
# point (i.e. before min_base / min_dplyr were added) -- confirm.
bigdata <- data[rep(seq_len(nrow(data)), 1e5), ]

# Compare the approaches from the answers on the enlarged data.
microbenchmark::microbenchmark(
  rowwise = bigdata %>%
    rowwise() %>%
    mutate(which_s = list(which.min(c(s1, s2, s3)))) %>%
    tidyr::unnest(which_s, keep_empty = TRUE),
  base = unlist(apply(bigdata, 1, \(x) ifelse(all(is.na(x)), NA, names(bigdata)[which.min(x)]))),
  pmap = bigdata %>%
    mutate(min_dplyr = purrr::pmap_chr(select(., s1:s3), \(...) {
      ifelse(all(is.na(c(...))), NA, colnames(bigdata)[which.min(c(...))])
    })),
  # row.which.min() takes `.names` and `tm` (it has no `...`), so the call has
  # to use those argument names.
  custom_row.which.min = row.which.min(bigdata, .names = TRUE, tm = "first")
)

#                 expr       min       lq      mean    median        uq       max neval cld
#              rowwise 3730.8131 4512.870 6018.3180 4985.6024 5913.5166 53501.838   100 a  
#                 base 2419.1913 3162.745 4309.7700 3557.7805 4427.4588 32814.209   100  b 
#                 pmap 3837.8870 4593.846 6091.5265 5203.0391 5984.0412 22015.418   100 a  
# custom_row.which.min  108.4075  147.695  221.7602  168.5267  240.6043  1419.106   100   c

Comments

2

I sometimes miss a good row.which.min function. This is far from good and not harmonised to work (well) with {dplyr}-language, but might help here.

v0

# Row-wise which.min() for a data frame.
#
# For every row of `.data[.cols]`, finds the column holding the smallest
# non-missing value. Rows in which every selected column is NA yield NA.
#
# Arguments:
#   .data   A data frame whose selected columns are numeric.
#   .cols   Columns to consider; anything accepted by `.data[.cols]`.
#           Defaults to all columns.
#   .names  If FALSE (default) return positions within the selected columns;
#           if TRUE return the corresponding column names.
#   tm      Ties method passed to max.col(): "first", "last" or "random".
#
# Returns an integer vector (or a character vector when `.names = TRUE`) with
# one element per row of `.data`.
row.which.min <- \(.data, .cols, .names = FALSE, tm = "first") {
  if (missing(.cols)) .cols <- names(.data)
  x <- .data[.cols]

  # Rows with at least one non-missing value. ncol(x) rather than
  # length(.cols), so logical or negative column selections count correctly.
  has_value <- rowSums(is.na(x)) < ncol(x)

  # Typed NA: a logical NA index would be recycled by `[` below and could
  # return a result of the wrong length when every row is all-NA.
  pos <- rep(NA_integer_, nrow(x))

  if (any(has_value)) {
    # Negate so that max.col() locates the minimum; missing values are pushed
    # to -Inf so they can never be selected.
    neg <- -as.matrix(x[has_value, , drop = FALSE])
    neg[is.na(neg)] <- -Inf
    pos[has_value] <- max.col(neg, ties.method = tm)
  }

  # Positions refer to the selected columns, so names must come from `x`,
  # not from the full `.data`.
  if (!.names) pos else names(x)[pos]
}

giving

> df0 = data.frame(s1=c(10,NA,5,NA,NA),s2=c(8,NA,NA,4,20),s3=c(NA,NA,2,NA,10))
> row.which.min(df0, .names = TRUE)
[1] "s2" NA   "s3" "s2" "s3"

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.