Extracting numeric from string and checking condition

Question

I have the following portion of my dataset:

structure(list(domain = c("A1BG_-_-_0", "A1BG_-_-_1", "A1BG_-_-_2", 
"A1BG_-_-_3", "A1BG_-_-_4", "A1BG_143228_143228_0", "A1BG_143228_143228_1", 
"A1BG_143228_143228_2", "A1BG_143228_143228_3", "A1CF_-_-_0"), 
    chr = c("19", "19", "19", "19", "19", "19", "19", "19", "19", 
    "10"), positions = c("(58858387..58858395,58858718..58858719)", 
    "(58858998..58859006,58861735..58862017,58862756..58862766)", 
    "(58863018..58863053,58863648..58863673)", "(58863913..58863921,58864293..58864303)", 
    "(58864552..58864563,58864657..58864693,58864769..58864803)", 
    "(58858719..58858998)", "(58862766..58863018)", "(58863673..58863913)", 
    "(58864303..58864552)", "(52566488..52566640,52569653..52569717)"
    ), length = c(11L, 303L, 62L, 20L, 84L, 280L, 253L, 241L, 
    250L, 218L)), class = "data.frame", row.names = c(NA, -10L
))

The positions column holds one or more start..stop ranges, separated by commas.

Additionally, I have a dataset of locations (portion is shown):

structure(list(VarID = 1:9, chr = c(19L, 19L, 19L, 19L, 19L, 
19L, 19L, 19L, 10L), position = c(58864801, 58863673, 58863673, 58863673, 
58863673, 58863673, 58863673, 58863041, 52569689)), class = "data.frame", row.names = c(NA, 
-9L))

I would like to add a column to the second dataset that specifies the domain to which each VarID belongs.

My desired output is:

structure(list(VarID = 1:9, chr = c(19L, 19L, 19L, 19L, 19L, 
19L, 19L, 19L, 10L), position = c(58864801, 58863673, 58863673, 
58863673, 58863673, 58863673, 58863673, 58863041, 52569689), 
    domain = c("A1BG_-_-_4", "A1BG_-_-_2", "A1BG_-_-_2", "A1BG_-_-_2", 
    "A1BG_-_-_2", "A1BG_-_-_2", "A1BG_-_-_2", "A1BG_-_-_2", "A1CF_-_-_0"
    )), row.names = c(NA, -9L), class = "data.frame")

Specifically, I'm having trouble getting the gsub to work that will eventually allow me to query whether or not a position is within the start..stop range.

No, just in the first one. However, the chr column is common to both. — stats134711
– stats134711, Commented Jul 23, 2019 at 3:46
It would be better if you provided the dput of the example, as that would make it easier to help — akrun
– akrun, Commented Jul 23, 2019 at 3:47

mt1022 · Accepted Answer · 2019-07-23 04:05:00Z

Try foverlaps from data.table:

library(data.table)

# Domain table: one row per domain, with all of its ranges packed into a
# single "(start..end,start..end,...)" string.
dtt.domain <- fread(text = "      domain chr                                                  positions length
            A1BG_-_-_0  19                    (58858387..58858395,58858718..58858719)     11
    A1BG_-_-_1  19 (58858998..58859006,58861735..58862017,58862756..58862766)    303
    A1BG_-_-_2  19                    (58863018..58863053,58863648..58863673)     62
    A1BG_-_-_3  19                    (58863913..58863921,58864293..58864303)     20
    A1BG_-_-_4  19 (58864552..58864563,58864657..58864693,58864769..58864803)     84
    A1BG_143228_143228_0  19                                       (58858719..58858998)    280
    A1BG_143228_143228_1  19                                       (58862766..58863018)    253
    A1BG_143228_143228_2  19                                       (58863673..58863913)    241
    A1BG_143228_143228_3  19                                       (58864303..58864552)    250
    A1CF_-_-_0  10                    (52566488..52566640,52569653..52569717)    218")

# Variant table: one row per VarID with its chromosome and position.
dtt.var <- fread(text = "VarID chr position
      1  19 58864801
    2  19 58863673
    3  19 58863673
    4  19 58863673
    5  19 58863673
    6  19 58863673
    7  19 58863673
    8  19 58863041
    9  10 52569689")

# Strip the parentheses, then split on commas so that every start..end range
# of a domain becomes its own row (grouped per domain/chr pair).
dtt.domain2 <- dtt.domain[
  ,
  .(region = strsplit(gsub("\\(|\\)", "", positions), ",", fixed = TRUE)[[1]]),
  by = .(domain, chr)
]

# Split "start..end" into two columns; type.convert = TRUE makes them numeric.
dtt.domain2[
  ,
  c("start", "end") := tstrsplit(region, "..", fixed = TRUE, type.convert = TRUE)
]

# foverlaps() needs the lookup table keyed, with the interval columns last.
setkey(dtt.domain2, chr, start, end)

# Represent each variant as a zero-width interval [position, position].
dtt.var[, c("start", "end") := .(position, position)]

# mult = "first" keeps a single matching range per variant.
res <- foverlaps(dtt.var, dtt.domain2, mult = "first")
res[, list(VarID, chr, position, domain)]
#    VarID chr position     domain
# 1:     1  19 58864801 A1BG_-_-_4
# 2:     2  19 58863673 A1BG_-_-_2
# 3:     3  19 58863673 A1BG_-_-_2
# 4:     4  19 58863673 A1BG_-_-_2
# 5:     5  19 58863673 A1BG_-_-_2
# 6:     6  19 58863673 A1BG_-_-_2
# 7:     7  19 58863673 A1BG_-_-_2
# 8:     8  19 58863041 A1BG_-_-_2
# 9:     9  10 52569689 A1CF_-_-_0

Ronak Shah · Accepted Answer · 2019-07-23 06:30:04Z

Another option using dplyr and tidyr. Using gsub we remove the opening and closing round brackets (()) and convert chr to integer. We then bring positions into separate rows by splitting on "," and separate the start and end index into different columns based on "..". The result is then left-joined with the locations dataset, and the rows are filtered to keep only those whose position falls within the range.

library(dplyr)
library(tidyr)

# `df` is the domain table and `locations` the variant table from the
# question (columns VarID, chr, position).
df %>%
  # Strip the surrounding parentheses and make chr an integer so it has the
  # same type as the integer chr column of `locations`.
  # NOTE(review): as.integer() would turn non-numeric chromosome labels
  # (e.g. "X") into NA -- confirm the real data only has numeric labels.
  mutate(
    positions = gsub("[()]", "", positions),
    chr = as.integer(chr)
  ) %>%
  # One row per start..end range.
  separate_rows(positions, sep = ",") %>%
  # convert = TRUE makes start/end numeric; without it they stay character
  # and the range check below would compare numbers as strings.
  separate(positions, c("start", "end"), sep = "\\.\\.", convert = TRUE) %>%
  # Pair every range with every variant on the same chromosome.
  # NOTE: dplyr >= 1.1 warns about a many-to-many join here; that is expected.
  left_join(locations, by = "chr") %>%
  # Half-open on the start side so a position sitting on a shared boundary
  # (e.g. 58863673) is assigned only to the range that ends there. A position
  # exactly equal to a range's start is therefore not matched to that range.
  filter(position > start & position <= end) %>%
  arrange(VarID) %>%
  dplyr::select(VarID, chr, position, domain)


#  VarID chr position     domain
#1     1  19 58864801 A1BG_-_-_4
#2     2  19 58863673 A1BG_-_-_2
#3     3  19 58863673 A1BG_-_-_2
#4     4  19 58863673 A1BG_-_-_2
#5     5  19 58863673 A1BG_-_-_2
#6     6  19 58863673 A1BG_-_-_2
#7     7  19 58863673 A1BG_-_-_2
#8     8  19 58863041 A1BG_-_-_2
#9     9  10 52569689 A1CF_-_-_0

Collectives™ on Stack Overflow

Extracting numeric from string and checking condition

2 Answers 2

Comments

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

2 Answers 2

Comments

Comments

Your Answer

Sign up or log in

Post as a guest

Related