5

I have a data frame that looks like this:

# Example genotype table: one row per variant site, Ref/Alt alleles in
# columns 3-4 and one "a/b" genotype call per sample in columns 5-10.
# header = TRUE is required: the first line of the text holds the column
# names, and the rest of the question addresses columns as df$Ref / df$Alt.
df <- read.table(text = "chr     pos Ref Alt D045313 D045314 D045135 D045136 D045137 D045138
Chr1 1462191   T   C     1/1     0/1     1/1     0/0     1/1     1/1
Chr1 1463534   G   C     0/0     1/1     0/0     0/1     0/0     0/0
Chr1 1463881   T   A     0/1     0/0     1/1     0/0     1/1     1/1
Chr1 1464091   G   A     0/0     0/0     1/1     0/0     1/1     1/1
Chr1 1464651   T   C     1/1     0/0     1/1     0/1    1/1     1/1",
  header = TRUE, stringsAsFactors = FALSE)

The expected result:

  chr     pos Ref Alt D045313 D045314 D045135 D045136 D045137 D045138
Chr1 1462191   T   C     C/C     T/C     C/C     T/T     C/C     C/C
Chr1 1463534   G   C     G/G     C/C     G/G     G/C     G/G     G/G
Chr1 1463881   T   A     T/A     T/T     A/A     T/T     A/A     A/A
Chr1 1464091   G   A     G/G     G/G     A/A     G/G     A/A     A/A
Chr1 1464651   T   C     C/C     T/T     C/C     T/C    C/C     C/C

The replacements should follow this rule: in df[5:10], every "0" should be replaced by the character in df$Ref and every "1" by the character in df$Alt of the same row. I checked the question "Replace specific characters in a variable in data frame in R", but it didn't work for my situation. I would appreciate any help.

2
  • 1
    This question is the reverse operation of question here Commented Aug 4, 2015 at 18:53
  • 1
    It's okay. I find some similarities. Commented Aug 4, 2015 at 18:54

3 Answers 3

4

Creating data:

# Build the example genotype table. The first line of the text is the header;
# leading whitespace on the data lines is ignored by read.table().
# Arguments are spelled out in full (header, TRUE/FALSE) rather than relying
# on partial argument matching and the reassignable T/F shortcuts.
df <- read.table(text = "chr     pos Ref Alt D045313 D045314 D045135 D045136 D045137 D045138
                 Chr1 1462191   T   C     1/1     0/1     1/1     0/0     1/1     1/1
                 Chr1 1463534   G   C     0/0     1/1     0/0     0/1     0/0     0/0
                 Chr1 1463881   T   A     0/1     0/0     1/1     0/0     1/1     1/1
                 Chr1 1464091   G   A     0/0     0/0     1/1     0/0     1/1     1/1
                 Chr1 1464651   T   C     1/1     0/0     1/1     0/1    1/1     1/1",
  header = TRUE, stringsAsFactors = FALSE)

Using gsub:

# Transposing puts each site's (row's) genotype calls into a column of its
# own, so Map() can pair every site with its own Ref and Alt allele.
calls_by_site <- as.data.frame(t(df[5:10]))
swap_alleles <- function(calls, ref, alt) {
  # "0" -> reference allele first, then "1" -> alternate allele.
  gsub("1", alt, gsub("0", ref, calls))
}
recoded <- Map(swap_alleles, calls_by_site, df$Ref, df$Alt)
# Stack the per-site vectors back into a sites x samples matrix and write it
# over the genotype columns.
df[5:10] <- do.call("rbind", recoded)
df
  chr     pos Ref Alt D045313 D045314 D045135 D045136 D045137 D045138
1 Chr1 1462191   T   C     C/C     T/C     C/C     T/T     C/C     C/C
2 Chr1 1463534   G   C     G/G     C/C     G/G     G/C     G/G     G/G
3 Chr1 1463881   T   A     T/A     T/T     A/A     T/T     A/A     A/A
4 Chr1 1464091   G   A     G/G     G/G     A/A     G/G     A/A     A/A
5 Chr1 1464651   T   C     C/C     T/T     C/C     T/C     C/C     C/C
Sign up to request clarification or add additional context in comments.

1 Comment

Simple and fast: it took only 15.61 sec on my real data.
4

Using data.table

library(data.table)

# setDT() turns df into a data.table by reference (no copy is made).
# Ref and Alt are grouping columns as well as chr and pos: inside j each of
# them is then a single value for the group, which is what gsub() needs
# (it only ever uses the first element of `replacement`), and .SD contains
# just the genotype columns. Grouping by (chr, pos) alone would mix alleles
# whenever the same position appears on more than one row.
setDT(df)[, lapply(.SD, function(x) gsub("0", Ref, gsub("1", Alt, x))),
            by = .(chr, pos, Ref, Alt)]

#    chr     pos Ref Alt D045313 D045314 D045135 D045136 D045137 D045138
#1: Chr1 1462191   T   C     C/C     T/C     C/C     T/T     C/C     C/C
#2: Chr1 1463534   G   C     G/G     C/C     G/G     G/C     G/G     G/G
#3: Chr1 1463881   T   A     T/A     T/T     A/A     T/T     A/A     A/A
#4: Chr1 1464091   G   A     G/G     G/G     A/A     G/G     A/A     A/A
#5: Chr1 1464651   T   C     C/C     T/T     C/C     T/C     C/C     C/C

Using dplyr

library(dplyr)

# mutate_each() and funs() are deprecated; across() is the current way to
# apply one function to a selection of columns (needs dplyr >= 1.0.0).
# rowwise() makes Ref and Alt single values inside the lambda, which gsub()
# requires because it does not vectorise over `replacement`.
df %>%
  rowwise() %>%
  mutate(across(matches("^D04."), ~ gsub("0", Ref, gsub("1", Alt, .x)))) %>%
  ungroup() # drop the row-wise grouping so later verbs see whole columns

#   chr     pos Ref Alt D045313 D045314 D045135 D045136 D045137 D045138
#1 Chr1 1462191   T   C     C/C     T/C     C/C     T/T     C/C     C/C
#2 Chr1 1463534   G   C     G/G     C/C     G/G     G/C     G/G     G/G
#3 Chr1 1463881   T   A     T/A     T/T     A/A     T/T     A/A     A/A
#4 Chr1 1464091   G   A     G/G     G/G     A/A     G/G     A/A     A/A
#5 Chr1 1464651   T   C     C/C     T/T     C/C     T/C     C/C     C/C

Another option

library(dplyr)
library(tidyr)

# Reshape to one row per (site, sample), recode the call, reshape back.
# pivot_longer()/pivot_wider() replace the superseded gather()/spread()
# (needs tidyr >= 1.0.0). The row-wise grouping is removed before widening
# so that pivot_wider() works on a plain tibble.
df %>%
  pivot_longer(
    -c(chr, pos, Ref, Alt),
    names_to = "key",
    values_to = "value"
  ) %>%
  rowwise() %>%
  mutate(value = gsub("0", Ref, gsub("1", Alt, value))) %>%
  ungroup() %>%
  pivot_wider(names_from = key, values_from = value)

#Source: local data frame [5 x 10]

#   chr     pos Ref Alt D045313 D045314 D045135 D045136 D045137 D045138
#1 Chr1 1462191   T   C     C/C     T/C     C/C     T/T     C/C     C/C
#2 Chr1 1463534   G   C     G/G     C/C     G/G     G/C     G/G     G/G
#3 Chr1 1463881   T   A     T/A     T/T     A/A     T/T     A/A     A/A
#4 Chr1 1464091   G   A     G/G     G/G     A/A     G/G     A/A     A/A
#5 Chr1 1464651   T   C     C/C     T/T     C/C     T/C     C/C     C/C

base R option using apply

# apply() converts its input to a single-type matrix, so only the character
# columns (Ref, Alt and the genotype calls, i.e. columns 3:10) are passed
# through it. chr/pos/Ref/Alt are bound back on from the original table so
# that pos keeps its numeric type instead of becoming formatted text.
# as.data.frame() makes the column indexing behave the same if df has been
# converted to a data.table earlier.
cbind(
  as.data.frame(df)[1:4],
  t(apply(
    as.matrix(as.data.frame(df)[3:10]), 1,
    # x[1:2] are Ref and Alt; the remaining elements are the genotype calls.
    function(x) gsub("0", x["Ref"], gsub("1", x["Alt"], x[-(1:2)]))
  )),
  stringsAsFactors = FALSE
)

#   chr     pos Ref Alt D045313 D045314 D045135 D045136 D045137 D045138
#1 Chr1 1462191   T   C     C/C     T/C     C/C     T/T     C/C     C/C
#2 Chr1 1463534   G   C     G/G     C/C     G/G     G/C     G/G     G/G
#3 Chr1 1463881   T   A     T/A     T/T     A/A     T/T     A/A     A/A
#4 Chr1 1464091   G   A     G/G     G/G     A/A     G/G     A/A     A/A
#5 Chr1 1464651   T   C     C/C     T/T     C/C     T/C     C/C     C/C

6 Comments

The data.table solution took 143.22 sec and the dplyr solution took 488.27 sec; the others are still being tested.
The third solution ran in 547.4 sec, and the last one took 16.83 sec.
It is interesting that the apply solution is very fast. For row-wise operations, apply is probably the better choice here.
@akrun yes, I am wondering why dt approach is comparatively very slow!
I think for rowwise operations, data.table won't be adding much value as you are grouping by two variables to make it rowwise
|
2

Here is a function that you can use to fill in the values in this case and be able to change for future cases too.

#' Replace allele codes in genotype columns with each row's Ref/Alt letters
#'
#' Every column of `df` that is not listed in `reference_cols` is treated as
#' a genotype column. In each row, matches of `First` are replaced by that
#' row's `Ref` value and then matches of `Second` by that row's `Alt` value.
#'
#' @param df A data frame containing columns named "Ref" and "Alt".
#' @param reference_cols Names of the columns that must be left untouched.
#' @param First Pattern (regular expression, as in `gsub()`) replaced by Ref.
#' @param Second Pattern (regular expression, as in `gsub()`) replaced by Alt.
#' @return `df` with its genotype columns rewritten as character vectors.
convert_val <- function(df,
                        reference_cols = c("chr", "pos", "Ref", "Alt"),
                        First = "0", Second = "1") {
  stopifnot(is.data.frame(df), all(c("Ref", "Alt") %in% names(df)))

  # [[ ]] always yields a plain vector (also for tibbles); as.character()
  # covers factor columns.
  ref <- as.character(df[["Ref"]])
  alt <- as.character(df[["Alt"]])

  # gsub() is not vectorised over `replacement`, so substitute row by row.
  # vapply() guarantees an unnamed character vector, including for 0 rows.
  morph <- function(calls) {
    calls <- as.character(calls)
    vapply(
      seq_along(calls),
      function(i) gsub(Second, alt[i], gsub(First, ref[i], calls[i])),
      character(1)
    )
  }

  nums <- which(!names(df) %in% reference_cols)
  for (j in nums) {
    df[[j]] <- morph(df[[j]])
  }
  df
}

# Apply the converter to the example data frame; the returned data frame is
# the one shown in the commented output below.
convert_val(df)
#    chr     pos Ref Alt D045313 D045314 D045135 D045136 D045137 D045138
# 1 Chr1 1462191   T   C     C/C     T/C     C/C     T/T     C/C     C/C
# 2 Chr1 1463534   G   C     G/G     C/C     G/G     G/C     G/G     G/G
# 3 Chr1 1463881   T   A     T/A     T/T     A/A     T/T     A/A     A/A
# 4 Chr1 1464091   G   A     G/G     G/G     A/A     G/G     A/A     A/A
# 5 Chr1 1464651   T   C     C/C     T/T     C/C     T/C     C/C     C/C

In the future, you can change the First and Second arguments for the internal function morph to whatever the new values to look for are (default is "0" and "1"). Or if your column names change, you can adjust the line reference_cols.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.