0
library(plyr)
library(stringr)



# Example data ----
# One identifier column plus two free-text columns; each text cell contains at
# least one code of the form "ABB" followed by six digits.
examp <- data.frame(
  id_info = c(
    "123", "3464", "7156", "3171", "5299", "4541", "4956", "9926",
    "8418", "1392", "9080", "6455", "2423", "9101", "7807", "5195",
    "7827", "365", "9062", "5558", "239", "8700", "6995", "9853"
  ),
  filterme1 = c(
    "ABB123460sadjasd", "ABB123461asjdjs", "ABB123462ranogvmg",
    "ABB123463dkfohsd", "ABB123464fff///sss", "ABB123465jfsdf",
    "ABB123466 sdf", "ABB123467 sdf", "ABB123468 fff///sss",
    "ABB123469 ty", "ABB123470 fff///sss", "ABB123471 dfs",
    "ABB123472 ", "ABB123473 gt", "ABB123474 y",
    "ABB123475 f", "ABB123476 gfgABB123462", "ABB123477 dsd",
    "ABB123478 re", "ABB123479 fgh", "ABB123480 tu",
    "ABB123481 yu", "ABB123482 dfg", "ABB123483 s"
  ),
  filterme2 = c(
    "sadjasdABB123460", "asjdjsABB123461", "ranogvmgABB123462",
    "dkfohsdABB123463", "fff///sssABB123464", "jfsdfABB123465",
    "sdfABB123466", "sdfABB123467", "fff///sssABB123468",
    "tyABB123469", "fff///sssABB123470", "dfsABB123471",
    "ABB123472", "gtABB123473", "yABB123474",
    "fABB123475", "ABB123462gfgABB123476", "dsdABB123477",
    "reABB123478", "fghABB123479", "tuABB123480",
    "yuABB123481", "dfgABB123482", "sABB123483"
  )
)

# id_info is an identifier, so it should be treated as a factor even though
# its values look numeric. Inspect the column types to check.
str(examp)

I want to extract an element from strings. The element should start with "ABB" and be followed by 6 digits.

# Pull every code of the form "ABB" + 6 digits out of filterme1. The column is
# converted to character first so the call also works when it is a factor.
examp_str <- str_extract_all(
  as.character(examp$filterme1),
  pattern = "ABB[0-9]{6}"
)
length(examp_str)
#thanks for the help with the expression

Below is my attempt to use my stringr function throughout the dataframe.

# Collect, for every row, all "ABB" + 6-digit codes found in any column other
# than the id column. str_extract_all() works on character vectors, so it is
# applied column by column and the per-column results are then regrouped by
# row. The result is a list with one (possibly empty) character vector per
# row, named by id_info.
filter_cols <- setdiff(names(examp), "id_info")
matches_by_col <- lapply(
  examp[filter_cols],
  function(column) {
    str_extract_all(as.character(column), pattern = "ABB[0-9]{6}")
  }
)
examp_str_big <- lapply(seq_len(nrow(examp)), function(i) {
  # Gather the matches of row i from every scanned column, in column order.
  found <- as.character(
    unlist(lapply(matches_by_col, `[[`, i), use.names = FALSE)
  )
  # Name the matches filterme1, filterme2, ... so they can serve as column
  # names once the list is reshaped into a data frame. sprintf() is used
  # because it returns character(0) for a row without matches.
  names(found) <- sprintf("filterme%d", seq_along(found))
  found
})
names(examp_str_big) <- as.character(examp$id_info)

Once I create a list, I want to try and put it all back in a data frame. I found the link here about putting lists of unknown lengths into data frames, but I am not sure if I could use this or not.

# Number of entries in each element of the list. lengths() always returns an
# integer vector, unlike sapply(), whose return type depends on its input.
indx <- lengths(examp_str_big)

# Pad every element with NA up to the longest length, then stack the padded
# elements row-wise so that each list element becomes one row.
res <- as.data.frame(
  do.call(rbind, lapply(examp_str_big, `length<-`, max(indx)))
)

# Use the names of the longest element as the column names.
colnames(res) <- names(examp_str_big[[which.max(indx)]])
str(res)

So my desired end result would look like this:

id_info filterme1   filterme2   filterme3   filterme4
123     ABB123460               ABB123460   
3464    ABB123461               ABB123461   
7156    ABB123462               ABB123462   
3171    ABB123463               ABB123463   
5299    ABB123464               ABB123464   
4541    ABB123465               ABB123465   
4956    ABB123466               ABB123466   
9926    ABB123467               ABB123467   
8418    ABB123468               ABB123468   
1392    ABB123469               ABB123469   
9080    ABB123470               ABB123470   
6455    ABB123471               ABB123471   
2423    ABB123472               ABB123472   
9101    ABB123473               ABB123473   
7807    ABB123474               ABB123474   
5195    ABB123475               ABB123475   
7827    ABB123476   ABB123462   ABB123462   ABB123476
365      ABB123477              ABB123477   
9062    ABB123478               ABB123478   
5558    ABB123479               ABB123479   
239     ABB123480               ABB123480   
8700    ABB123481               ABB123481   
6995    ABB123482               ABB123482   
9853    ABB123483               ABB123483   

My actual dataset is longer and has more "filterme" columns. Any help would be greatly appreciated. If there is another smarter way to accomplish this goal, I would love to hear it.

Thanks.

4
  • This pattern will match ABB followed by 7 digits as you mentioned: 'ABB[0-9]{7}' Commented Apr 28, 2016 at 23:02
  • Yes that expression does work, Thank you. Do you have any ideas for the second part of the question? Commented Apr 28, 2016 at 23:32
  • You may want to look at unnest from tidyr package. It provides the result in long format. Commented Apr 29, 2016 at 1:09
  • I posted one answer below. See if it works / helps. Commented Apr 29, 2016 at 1:23

2 Answers 2

1

Here is one approach (based on your original data frame examp, which I assume is read with stringsAsFactors = FALSE):

library(stringr)
# Extract all occurrences of the pattern from filterme1 and keep them as a
# list column (one character vector of matches per row).
# NOTE: 6 digits are used because the sample data has no 7-digit example;
# change {6} to {7} if needed.
examp$pattern <- str_extract_all(as.character(examp$filterme1), "ABB[0-9]{6}")

# Append one column per match position. Each row's matches are padded with NA
# up to the largest number of matches found in any row; indexing a character
# vector past its end yields NA, so x[seq_len(maxlength)] does the padding.
# The matches are used as plain character vectors so that the new columns are
# ordinary character columns rather than list columns.
maxlength <- max(lengths(examp$pattern))
examp <- cbind(
  examp,
  as.data.frame(
    do.call(
      rbind,
      lapply(examp$pattern, function(x) x[seq_len(maxlength)])
    ),
    stringsAsFactors = FALSE
  )
)

# This results in a wider data frame with all found patterns appended
# as new columns (V1, V2, ...).

examp

   id_info              filterme1             filterme2              pattern        V1
1      123       ABB123460sadjasd      sadjasdABB123460            ABB123460 ABB123460
2     3464        ABB123461asjdjs       asjdjsABB123461            ABB123461 ABB123461
3     7156      ABB123462ranogvmg     ranogvmgABB123462            ABB123462 ABB123462
4     3171       ABB123463dkfohsd      dkfohsdABB123463            ABB123463 ABB123463
5     5299     ABB123464fff///sss    fff///sssABB123464            ABB123464 ABB123464
6     4541         ABB123465jfsdf        jfsdfABB123465            ABB123465 ABB123465
7     4956          ABB123466 sdf          sdfABB123466            ABB123466 ABB123466
8     9926          ABB123467 sdf          sdfABB123467            ABB123467 ABB123467
9     8418    ABB123468 fff///sss    fff///sssABB123468            ABB123468 ABB123468
10    1392           ABB123469 ty           tyABB123469            ABB123469 ABB123469
11    9080    ABB123470 fff///sss    fff///sssABB123470            ABB123470 ABB123470
12    6455          ABB123471 dfs          dfsABB123471            ABB123471 ABB123471
13    2423             ABB123472              ABB123472            ABB123472 ABB123472
14    9101           ABB123473 gt           gtABB123473            ABB123473 ABB123473
15    7807            ABB123474 y            yABB123474            ABB123474 ABB123474
16    5195            ABB123475 f            fABB123475            ABB123475 ABB123475
17    7827 ABB123476 gfgABB123462 ABB123462gfgABB123476 ABB123476, ABB123462 ABB123476
18     365          ABB123477 dsd          dsdABB123477            ABB123477 ABB123477
19    9062           ABB123478 re           reABB123478            ABB123478 ABB123478
20    5558          ABB123479 fgh          fghABB123479            ABB123479 ABB123479
21     239           ABB123480 tu           tuABB123480            ABB123480 ABB123480
22    8700           ABB123481 yu           yuABB123481            ABB123481 ABB123481
23    6995          ABB123482 dfg          dfgABB123482            ABB123482 ABB123482
24    9853            ABB123483 s            sABB123483            ABB123483 ABB123483
          V2
1         NA
2         NA
3         NA
4         NA
5         NA
6         NA
7         NA
8         NA
9         NA
10        NA
11        NA
12        NA
13        NA
14        NA
15        NA
16        NA
17 ABB123462
18        NA
19        NA
20        NA
21        NA
22        NA
23        NA
24        NA

In this case, only two new columns are added since there are a maximum of two occurrences of the pattern (even modified to 6 above) in the provided sample data.

EDIT: Adding code that matches pattern across multiple columns (in this case filterme1 and filterme2):

library(tidyr)
# Paste the filterme columns into a single "filterme" column (joined with
# "_", the default separator) while keeping the originals, then extract the
# codes from the combined text.
examp <- unite(
  examp,
  filterme,
  filterme1, filterme2,
  sep = "_",
  remove = FALSE
)
examp$pattern <- str_extract_all(examp$filterme, "ABB[0-9]{6}")

At this point, you can run the rest of the code above AFTER the line where examp$pattern is assigned.

Sign up to request clarification or add additional context in comments.

8 Comments

Thank you for the quick response. When I run this code, I get an error: Error in FUN(X[[i]], ...) : object 'maxlength' not found
So sorry...one of the lines of code I wrote is missing. Edited.
this works great if I wanted the function to be applied to only the first column of my data. I do not see that it is able to apply str_extract_all in the second column, or (n) columns in a data frame.
You mean, the pattern must be extracted from all (or multiple columns) of a row and then this code run on that extracted list? You can look at unite in tidyr to make things easier for you. You can unite multiple columns into one (make sure to use remove = FALSE) and then run this same code. You can say examp <- unite(examp, allFilterme, filterme1, filterme2, ...., remove = FALSE). Only small modification needed.
Ah, so essentially put this dataframe in long format and then apply this function? I will give it a go.
|
0

We can also use the lengths function

 # Matches per row of filterme1, as a list of character vectors.
 lst <- str_extract_all(examp$filterme1, "ABB[0-9]{6}")
 # Pad every vector with NA to the longest length and stack them row-wise.
 m1 <- do.call(
   rbind,
   lapply(lst, function(found) `length<-`(found, max(lengths(lst))))
 )
 # Add one "patternN" column per match position.
 examp[paste0("pattern", seq_len(ncol(m1)))] <- m1
 examp
#   id_info              filterme1             filterme2  pattern1  pattern2
#1      123       ABB123460sadjasd      sadjasdABB123460 ABB123460      <NA>
#2     3464        ABB123461asjdjs       asjdjsABB123461 ABB123461      <NA>
#3     7156      ABB123462ranogvmg     ranogvmgABB123462 ABB123462      <NA>
#4     3171       ABB123463dkfohsd      dkfohsdABB123463 ABB123463      <NA>
#5     5299     ABB123464fff///sss    fff///sssABB123464 ABB123464      <NA>
#6     4541         ABB123465jfsdf        jfsdfABB123465 ABB123465      <NA>
#7     4956          ABB123466 sdf          sdfABB123466 ABB123466      <NA>
#8     9926          ABB123467 sdf          sdfABB123467 ABB123467      <NA>
#9     8418    ABB123468 fff///sss    fff///sssABB123468 ABB123468      <NA>
#10    1392           ABB123469 ty           tyABB123469 ABB123469      <NA>
#11    9080    ABB123470 fff///sss    fff///sssABB123470 ABB123470      <NA>
#12    6455          ABB123471 dfs          dfsABB123471 ABB123471      <NA>
#13    2423             ABB123472              ABB123472 ABB123472      <NA>
#14    9101           ABB123473 gt           gtABB123473 ABB123473      <NA>
#15    7807            ABB123474 y            yABB123474 ABB123474      <NA>
#16    5195            ABB123475 f            fABB123475 ABB123475      <NA>
#17    7827 ABB123476 gfgABB123462 ABB123462gfgABB123476 ABB123476 ABB123462
#18     365          ABB123477 dsd          dsdABB123477 ABB123477      <NA>
#19    9062           ABB123478 re           reABB123478 ABB123478      <NA>
#20    5558          ABB123479 fgh          fghABB123479 ABB123479      <NA>
#21     239           ABB123480 tu           tuABB123480 ABB123480      <NA>
#22    8700           ABB123481 yu           yuABB123481 ABB123481      <NA>
#23    6995          ABB123482 dfg          dfgABB123482 ABB123482      <NA>
#24    9853            ABB123483 s            sABB123483 ABB123483      <NA>

2 Comments

This works, thanks. However, I would like to apply the str_extract_all function to all the other columns in the data frame. The example has only 2 columns, but my larger data set has 16.
@KathleenBrannen You can loop through the columns and apply the same. i.e lapply(examp, function(x) {lst <- str_extract_all(x, 'ABB[0-9]{6}');...

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.