7

I have the following process done using dplyr without any problem:

library(tidyverse)
# Example input: three rows, seven columns. The columns are assembled as an
# ordinary data frame and the tibble class vector is attached afterwards, which
# yields the same object as spelling out every attribute via structure().
my_dplyr_dat <- data.frame(
  chrn = c("chr20", "chr6", "chr5"),
  start = c(52447674L, 12962440L, 66453982L),
  end = c(52447689L, 12962455L, 66453997L),
  motif_name_binned = c(
    "ZNF263/MA0528.1/Jaspar.instid_chr20:52447338-52447738.bin22",
    "Klf12/MA0742.1/Jaspar.instid_chr6:12962360-12962760.bin6",
    "Hoxc9/MA0485.1/Jaspar.instid_chr5:66453806-66454206.bin12"
  ),
  motif_score = c(6.728401, -0.979777, 6.091471),
  strand = c("+", "+", "+"),
  read_count = c(0L, 0L, 0L),
  # Keep the text columns as character vectors on every R version.
  stringsAsFactors = FALSE
)
class(my_dplyr_dat) <- c("tbl_df", "tbl", "data.frame")

That looks like this:

# A tibble: 3 x 7
   chrn    start      end                                           motif_name_binned motif_score strand read_count
  <chr>    <int>    <int>                                                       <chr>       <dbl>  <chr>      <int>
1 chr20 52447674 52447689 ZNF263/MA0528.1/Jaspar.instid_chr20:52447338-52447738.bin22    6.728401      +          0
2  chr6 12962440 12962455    Klf12/MA0742.1/Jaspar.instid_chr6:12962360-12962760.bin6   -0.979777      +          0
3  chr5 66453982 66453997   Hoxc9/MA0485.1/Jaspar.instid_chr5:66453806-66454206.bin12    6.091471      +          0

The main task I wish to achieve is to split the motif_name_binned column, using a regex, into 3 columns c('motif', 'inst', 'binno'). With the tidyverse (extract() from tidyr plus select() from dplyr) it can be done this way:

# Split motif_name_binned into three new columns, one per regex capture group:
#   motif - everything before ".instid_"
#   inst  - the text between ".instid_" and ".bin"
#   binno - the digits after ".bin" (kept as character)
# extract() drops the source column by default (remove = TRUE) and inserts the
# new columns where it used to be, so a separate select() step is not needed.
my_dplyr_dat %>%
  extract(
    motif_name_binned,
    into = c("motif", "inst", "binno"),
    regex = "^(.*?\\/.*?)\\.instid_(.*?)\\.bin(\\d+)"
  )

Which produces this:

# A tibble: 3 x 9
   chrn    start      end                  motif                    inst binno motif_score strand read_count
* <chr>    <int>    <int>                  <chr>                   <chr> <chr>       <dbl>  <chr>      <int>
1 chr20 52447674 52447689 ZNF263/MA0528.1/Jaspar chr20:52447338-52447738    22    6.728401      +          0
2  chr6 12962440 12962455  Klf12/MA0742.1/Jaspar  chr6:12962360-12962760     6   -0.979777      +          0
3  chr5 66453982 66453997  Hoxc9/MA0485.1/Jaspar  chr5:66453806-66454206    12    6.091471      +          0

How can I do it with data.table?

This is the original data in data.table format I have (i.e. before string extraction etc):

library(data.table)
# Example input as a data.table: three rows, seven columns.
# The table is built with the data.table() constructor instead of a hand-written
# structure(..., class = c("data.table", "data.frame")) call. An object created
# through structure() lacks a valid internal self-reference, so the first `:=`
# on it emits an "Invalid .internal.selfref detected" warning and has to take a
# shallow copy before it can add columns by reference.
my_data_table <- data.table(
  chrn = c("chr20", "chr6", "chr5"),
  start = c(52447674L, 12962440L, 66453982L),
  end = c(52447689L, 12962455L, 66453997L),
  motif_name_binned = c(
    "ZNF263/MA0528.1/Jaspar.instid_chr20:52447338-52447738.bin22",
    "Klf12/MA0742.1/Jaspar.instid_chr6:12962360-12962760.bin6",
    "Hoxc9/MA0485.1/Jaspar.instid_chr5:66453806-66454206.bin12"
  ),
  motif_score = c(6.728401, -0.979777, 6.091471),
  strand = c("+", "+", "+"),
  read_count = c(0L, 0L, 0L)
)

Which looks like this:

    chrn    start      end                                           motif_name_binned motif_score strand read_count
1: chr20 52447674 52447689 ZNF263/MA0528.1/Jaspar.instid_chr20:52447338-52447738.bin22    6.728401      +          0
2:  chr6 12962440 12962455    Klf12/MA0742.1/Jaspar.instid_chr6:12962360-12962760.bin6   -0.979777      +          0
3:  chr5 66453982 66453997   Hoxc9/MA0485.1/Jaspar.instid_chr5:66453806-66454206.bin12    6.091471      +          0
1
  • We should both delete our comments. They don't really help future users. Commented Nov 16, 2017 at 2:59

1 Answer 1

5

We use gsub to join the three captured groups with a unique delimiter character (`$`), and then use tstrsplit to split on that character into 3 columns.

# Step 1: gsub rewrites every motif_name_binned value as "motif$inst$binno"
# (the three regex capture groups joined by a literal "$"); tstrsplit then
# splits on that "$" and `:=` adds the pieces to my_data_table by reference.
my_data_table[, c("motif", "inst", "binno") := tstrsplit(
  gsub(
    "^(.*?\\/.*?)\\.instid_(.*?)\\.bin(\\d+)",
    "\\1$\\2$\\3",
    motif_name_binned
  ),
  "$",
  fixed = TRUE
)]
# Step 2: return all columns except the original combined one. my_data_table
# itself still contains motif_name_binned afterwards.
my_data_table[, !"motif_name_binned", with = FALSE]
#    chrn    start      end motif_score strand read_count                  motif                    inst binno
#1: chr20 52447674 52447689    6.728401      +          0 ZNF263/MA0528.1/Jaspar chr20:52447338-52447738    22
#2:  chr6 12962440 12962455   -0.979777      +          0  Klf12/MA0742.1/Jaspar  chr6:12962360-12962760     6
#3:  chr5 66453982 66453997    6.091471      +          0  Hoxc9/MA0485.1/Jaspar  chr5:66453806-66454206    12
Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.