Replace each row based on the string match and column value

Question

I have a dataset like this

# Example data: a numeric Score column followed by three character columns
# whose values are one of "0/0", "0/1", "1/0" or "1/1".
library(data.table)  # data.table() is called below, so the package must be attached

dt <- data.table(
  Score = c(0.33, 0.34, 0.3, -0.22, 0.232),
  Id2 = c("0/0", "0/1", "1/0", "0/0", "0/0"),
  Kps = c("0/1", "0/0", "1/1", "0/1", "0/0"),
  Inr = c("0/0", "0/1", "1/1", "0/0", "0/1")
)

I need to replace the values in each row based on the Score column, as follows:

If "0/0" or "1/1" then Score * 2
If "1/0" or "0/1" then Score

Usually, it can be done by using the base function like this

# Overwrites the whole Id2 column with twice the Score, regardless of the
# string each row of Id2 previously held.
dt$Id2 <- 2 * dt$Score

But here I have to consider each row individually, and I have around 1000 columns, so it seems this can only be done with a loop.

The expected output

Score  Id2    Kps    Inr 
0.330  0.66   0.330  0.66
0.340  0.340  0.68  0.340
0.300  0.300  0.6   0.6
-0.220 -0.44 -0.22 -0.44
0.232  0.464 0.464  0.232

Any suggestions?

akrun · Accepted Answer · 2023-01-07 21:11:47Z

As the input is data.table, here is one approach with data.table

library(data.table)

# Recode every column except the first (Score) in place:
#   "0/0" or "1/1" -> Score * 2
#   "1/0" or "0/1" -> Score
# fcase() yields NA for any value that matches neither set.
dt[
  ,
  (names(dt)[-1]) := lapply(.SD, function(code) {
    fcase(
      code %chin% c("0/0", "1/1"), Score * 2,
      code %chin% c("1/0", "0/1"), Score
    )
  }),
  .SDcols = -1
]

-output

> dt
    Score    Id2    Kps    Inr
1:  0.330  0.660  0.330  0.660
2:  0.340  0.340  0.680  0.340
3:  0.300  0.300  0.600  0.600
4: -0.220 -0.440 -0.220 -0.440
5:  0.232  0.464  0.464  0.232

Another option is to make use of a named vector

# Lookup table: code string -> multiplier applied to Score.
keyval <- c("0/0" = 2, "1/1" = 2, "1/0" = 1, "0/1" = 1)

# Index the lookup table by each column's strings and scale Score by the
# result; every column except the first (Score) is replaced in place.
dt[
  ,
  (names(dt)[-1]) := lapply(.SD, function(code) Score * keyval[code]),
  .SDcols = -1
]

-output

> dt
    Score    Id2    Kps    Inr
1:  0.330  0.660  0.330  0.660
2:  0.340  0.340  0.680  0.340
3:  0.300  0.300  0.600  0.600
4: -0.220 -0.440 -0.220 -0.440
5:  0.232  0.464  0.464  0.232

Or count the 0s in each string to derive the multiplier

library(stringr)
# Strings with exactly one "0" ("0/1", "1/0") keep Score; all others
# ("0/0", "1/1") get Score * 2. The multiplier must be parenthesised:
# `Score * 1 + cond` parses as `(Score * 1) + cond`, i.e. Score + 1.
dt[, (names(dt)[-1]) := lapply(.SD, \(x) Score * (1 +
   (str_count(x, "0") != 1))), .SDcols = -1]
> dt
    Score    Id2    Kps    Inr
1:  0.330  0.660  0.330  0.660
2:  0.340  0.340  0.680  0.340
3:  0.300  0.300  0.600  0.600
4: -0.220 -0.440 -0.220 -0.440
5:  0.232  0.464  0.464  0.232

MarBlo · Accepted Answer · 2023-01-07 18:52:14Z

2

Here is a tidyverse-style solution. It uses a data.frame and first reshapes it into long format. The different conditions are then implemented with case_when.

Finally, pivot_wider brings it back to wide format.

library(tidyverse)

# Same example data as in the question, but as a plain data.frame.
dt <- data.frame(
  Score = c(0.33, 0.34, 0.3, -0.22, 0.232),
  Id2 = c("0/0", "0/1", "1/0", "0/0", "0/0"),
  Kps = c("0/1", "0/0", "1/1", "0/1", "0/0"),
  Inr = c("0/0", "0/1", "1/1", "0/0", "0/1")
)

# 1. Stack every non-Score column into name/value pairs.
# 2. Recode each value from its string and the row's Score
#    (values matching neither set become NA).
# 3. Spread the recoded values back into one column per original name.
dt |>
  pivot_longer(-Score) |>
  mutate(
    value = case_when(
      value %in% c("0/0", "1/1") ~ Score * 2,
      value %in% c("1/0", "0/1") ~ Score
    )
  ) |>
  pivot_wider(names_from = name, values_from = value)
#> # A tibble: 5 × 4
#>    Score    Id2    Kps    Inr
#>    <dbl>  <dbl>  <dbl>  <dbl>
#> 1  0.33   0.66   0.33   0.66 
#> 2  0.34   0.34   0.68   0.34 
#> 3  0.3    0.3    0.6    0.6  
#> 4 -0.22  -0.44  -0.22  -0.44 
#> 5  0.232  0.464  0.464  0.232

answered Jan 7, 2023 at 18:52

MarBlo

4,544 reputation · 1 gold badge · 18 silver badges · 30 bronze badges

2 Comments

Darren Tsai Over a year ago

Using %in% is more concise: case_when(value %in% c("0/0", "1/1") ~ Score * 2, TRUE ~ Score)

MarBlo Over a year ago

@Darren Tsai yes, this is right.

Darren Tsai · Accepted Answer · 2023-01-07 19:36:58Z

2

With dplyr::across(), you can apply a function across multiple columns. It supports tidy selections so that you can cleverly select variables based on their names or properties.

library(dplyr)

# For every column except Score: double the Score where the string is
# "0/0" or "1/1", otherwise keep the Score unchanged.
dt %>%
  mutate(
    across(
      -Score,
      function(code) ifelse(code %in% c("0/0", "1/1"), Score * 2, Score)
    )
  )

#     Score    Id2    Kps    Inr
# 1:  0.330  0.660  0.330  0.660
# 2:  0.340  0.340  0.680  0.340
# 3:  0.300  0.300  0.600  0.600
# 4: -0.220 -0.440 -0.220 -0.440
# 5:  0.232  0.464  0.464  0.232

A tricky way

# The membership test yields TRUE/FALSE, which arithmetic treats as 1/0,
# so the multiplier is 2 for "0/0" / "1/1" and 1 for everything else.
dt %>%
  mutate(
    across(
      -Score,
      function(code) Score * (1 + (code %in% c("0/0", "1/1")))
    )
  )

edited Jan 7, 2023 at 19:36

answered Jan 7, 2023 at 19:28

Darren Tsai

36.6k reputation · 6 gold badges · 27 silver badges · 58 bronze badges

Comments

zx8754 · Accepted Answer · 2023-01-10 14:26:39Z

Using vectorised (elementwise) multiplication on a matrix:

# As in @akrun's answer, a named vector maps each code string to its
# multiplier, which avoids ifelse/case/switch.
keyval <- c("0/0" = 2, "1/1" = 2, "1/0" = 1, "0/1" = 1)

# Look up the multiplier for every cell of the non-Score columns, multiply
# by the first column (recycled down each column), and reshape the flat
# result back into a matrix with one column per recoded variable.
matrix(
  keyval[as.matrix(dt[, -1])] * dt[[1]],
  ncol = ncol(dt) - 1
)
#        [,1]   [,2]   [,3]
# [1,]  0.660  0.330  0.660
# [2,]  0.340  0.680  0.340
# [3,]  0.300  0.600  0.600
# [4,] -0.440 -0.220 -0.440
# [5,]  0.464  0.464  0.232

Benchmark using a bigger dataset:

library(data.table)  # data.table() is called below, so the package must be attached
library(dplyr)
library(tidyr)

# Bigger data: n rows, one numeric Score column plus n character columns
# filled with randomly sampled code strings.
n <- 1000
set.seed(1)
dt <- data.table(cbind(
  Score = runif(n),
  data.frame(matrix(
    sample(c("0/0", "0/1", "1/0", "1/1"), n * n, replace = TRUE),
    ncol = n
  ))
))

The matrix approach gives roughly a 3.5-8x speed-up over the dplyr versions (by median time):

# Time the three approaches on the same `dt`.
# `keyval` is the named lookup vector defined earlier.
m <- microbenchmark::microbenchmark(
  # Named-vector lookup on the character matrix, scaled by the first column.
  matrix = {
    matrix(keyval[ as.matrix(dt[, -1 ]) ] * dt[[ 1 ]], ncol = ncol(dt) - 1)
  },
  # Reshape to long format, recode with case_when, reshape back to wide.
  dplyr1 = {
    dt |> 
      pivot_longer(-Score) |> 
      mutate(value = case_when(
        value == '0/0' | value == "1/1" ~ Score *2,
        value == '1/0' | value == "0/1" ~ Score 
      )) |> 
      pivot_wider(names_from = name, values_from = value)
  },
  # Column-wise recode with across() using the logical-as-number trick.
  dplyr2 = {
    dt %>%
      mutate(across(-Score, ~ Score * (.x %in% c("0/0", "1/1") + 1)))
  })

# Report timings as ratios rather than absolute durations.
print(m, unit = "relative")
# Unit: relative
#    expr      min       lq     mean   median       uq      max neval
#  matrix 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000   100
#  dplyr1 7.697692 8.468686 7.279598 8.071069 7.652855 3.423847   100
#  dplyr2 3.862794 3.708899 3.399736 3.560082 3.687698 2.096620   100

Collectives™ on Stack Overflow

Replace each row based on the string match and column value

4 Answers 4

Comments

2 Comments

Comments

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

4 Answers 4

Comments

2 Comments

Comments

Comments

Your Answer

Sign up or log in

Post as a guest

Related