0

I have the following portion of my dataset:

structure(list(domain = c("A1BG_-_-_0", "A1BG_-_-_1", "A1BG_-_-_2", 
"A1BG_-_-_3", "A1BG_-_-_4", "A1BG_143228_143228_0", "A1BG_143228_143228_1", 
"A1BG_143228_143228_2", "A1BG_143228_143228_3", "A1CF_-_-_0"), 
    chr = c("19", "19", "19", "19", "19", "19", "19", "19", "19", 
    "10"), positions = c("(58858387..58858395,58858718..58858719)", 
    "(58858998..58859006,58861735..58862017,58862756..58862766)", 
    "(58863018..58863053,58863648..58863673)", "(58863913..58863921,58864293..58864303)", 
    "(58864552..58864563,58864657..58864693,58864769..58864803)", 
    "(58858719..58858998)", "(58862766..58863018)", "(58863673..58863913)", 
    "(58864303..58864552)", "(52566488..52566640,52569653..52569717)"
    ), length = c(11L, 303L, 62L, 20L, 84L, 280L, 253L, 241L, 
    250L, 218L)), class = "data.frame", row.names = c(NA, -10L
))

The positions column holds one or more start..stop ranges, separated by commas and wrapped in parentheses.

Additionally, I have a dataset of locations (portion is shown):

structure(list(VarID = 1:9, chr = c(19L, 19L, 19L, 19L, 19L, 
19L, 19L, 19L, 10L), position = c(58864801, 58863673, 58863673, 58863673, 
58863673, 58863673, 58863673, 58863041, 52569689)), class = "data.frame", row.names = c(NA, 
-9L))

I would like to add a column to the second dataset that specifies the domain to which each VarID belongs.

My desired output is:

structure(list(VarID = 1:9, chr = c(19L, 19L, 19L, 19L, 19L, 
19L, 19L, 19L, 10L), position = c(58864801, 58863673, 58863673, 
58863673, 58863673, 58863673, 58863673, 58863041, 52569689), 
    domain = c("A1BG_-_-_4", "A1BG_-_-_2", "A1BG_-_-_2", "A1BG_-_-_2", 
    "A1BG_-_-_2", "A1BG_-_-_2", "A1BG_-_-_2", "A1BG_-_-_2", "A1CF_-_-_0"
    )), row.names = c(NA, -9L), class = "data.frame")

Specifically, I'm having trouble getting gsub to parse the positions column so that I can eventually check whether or not a position falls within a start..stop range.

4
  • Are these string columns in both datasets? Commented Jul 23, 2019 at 3:45
  • No, just in the first one. However, the chr column is common to both. Commented Jul 23, 2019 at 3:46
  • It would be better if you provided the dput of the example so that it becomes easier to work with. Commented Jul 23, 2019 at 3:47
  • I've edited the post to include the dputs. Commented Jul 23, 2019 at 4:00

2 Answers 2

2

Try foverlaps from data.table:

library(data.table)

# Domain table: one row per domain. `positions` holds one or more
# comma-separated `start..end` ranges wrapped in parentheses.
dtt.domain <- fread(text = '      domain chr                                                  positions length
            A1BG_-_-_0  19                    (58858387..58858395,58858718..58858719)     11
    A1BG_-_-_1  19 (58858998..58859006,58861735..58862017,58862756..58862766)    303
    A1BG_-_-_2  19                    (58863018..58863053,58863648..58863673)     62
    A1BG_-_-_3  19                    (58863913..58863921,58864293..58864303)     20
    A1BG_-_-_4  19 (58864552..58864563,58864657..58864693,58864769..58864803)     84
    A1BG_143228_143228_0  19                                       (58858719..58858998)    280
    A1BG_143228_143228_1  19                                       (58862766..58863018)    253
    A1BG_143228_143228_2  19                                       (58863673..58863913)    241
    A1BG_143228_143228_3  19                                       (58864303..58864552)    250
    A1CF_-_-_0  10                    (52566488..52566640,52569653..52569717)    218')
# Variant table: one row per VarID with its chromosome and position.
dtt.var <- fread(text = 'VarID chr position
      1  19 58864801
    2  19 58863673
    3  19 58863673
    4  19 58863673
    5  19 58863673
    6  19 58863673
    7  19 58863673
    8  19 58863041
    9  10 52569689')

# Strip the parentheses and expand to one row per (domain, chr, range).
# unlist() rather than [[1]]: if a (domain, chr) pair occurs on more than one
# row, [[1]] would keep only the ranges of the first row in that group.
dtt.domain2 <- dtt.domain[, .(
    region = unlist(strsplit(
        gsub('\\(|\\)', '', positions), ',', fixed = TRUE))),
    by = .(domain, chr)]

# Split each "start..end" string into two numeric columns.
dtt.domain2[, c('start', 'end') := tstrsplit(
    region, '..', fixed = TRUE, type.convert = TRUE)]

# foverlaps() needs the lookup table keyed, with the interval columns
# (start, end) as the last two key columns.
setkeyv(dtt.domain2, c('chr', 'start', 'end'))

# Represent every variant position as a zero-width interval so it can be
# matched against the domain ranges.
dtt.var[, `:=`(start = position, end = position)]

# mult = 'first' returns a single domain per variant. A position lying on a
# boundary shared by two ranges (e.g. 58863673 ends a range of A1BG_-_-_2 and
# starts A1BG_143228_143228_2) is assigned to the first matching range only.
res <- foverlaps(dtt.var, dtt.domain2, mult = 'first')
res[, .(VarID, chr, position, domain)]
#    VarID chr position     domain
# 1:     1  19 58864801 A1BG_-_-_4
# 2:     2  19 58863673 A1BG_-_-_2
# 3:     3  19 58863673 A1BG_-_-_2
# 4:     4  19 58863673 A1BG_-_-_2
# 5:     5  19 58863673 A1BG_-_-_2
# 6:     6  19 58863673 A1BG_-_-_2
# 7:     7  19 58863673 A1BG_-_-_2
# 8:     8  19 58863041 A1BG_-_-_2
# 9:     9  10 52569689 A1CF_-_-_0
Sign up to request clarification or add additional context in comments.

Comments

1

Another option uses dplyr and tidyr. Using gsub, we remove the opening and closing round brackets (()) and convert chr to integer. We then bring positions into separate rows by splitting on "," and separate the start and end index into different columns based on "..". The result is then left_joined with the locations dataset, and we filter to keep only the rows where the location falls within the range.

library(dplyr)
library(tidyr)

# `df` is the domain table and `locations` the variant table.
# NOTE(review): this pipeline assumes `locations` has the columns `VarID`,
# `chrno` and `loc`; the dput shown in the question names them `chr` and
# `position`, so rename them (or adjust the join/filter) before running.
df %>%
   mutate(positions = gsub("[()]", "", positions),
          chr = as.integer(chr)) %>%
   # One row per "start..end" range.
   separate_rows(positions, sep = ",") %>%
   # convert = TRUE turns start/end into numbers. Without it they stay
   # character and the filter below would compare `loc` to them as strings,
   # which is only correct when all values have the same number of digits.
   separate(positions, c("start", "end"), sep = "\\.\\.", convert = TRUE) %>%
   left_join(locations, by = c("chr" = "chrno")) %>%
   # Half-open interval (start, end]: a position equal to a range's start is
   # not matched, so a position on a boundary shared by two ranges
   # (e.g. 58863673) is assigned only to the range that it ends.
   filter(loc > start & loc <= end) %>%
   arrange(VarID) %>%
   dplyr::select(VarID, chr, loc, domain)


#  VarID chr      loc     domain
#1     1  19 58864801 A1BG_-_-_4
#2     2  19 58863673 A1BG_-_-_2
#3     3  19 58863673 A1BG_-_-_2
#4     4  19 58863673 A1BG_-_-_2
#5     5  19 58863673 A1BG_-_-_2
#6     6  19 58863673 A1BG_-_-_2
#7     7  19 58863673 A1BG_-_-_2
#8     8  19 58863041 A1BG_-_-_2
#9     9  10 52569689 A1CF_-_-_0

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.