2

I have a dataframe df which contains a single column GO. Each row in df contains either one term or multiple terms (separated by ;) and each term has a specific format - it starts with either P, C or F and is followed by a : and then the actual term.

# Example input: one GO annotation string per row. Terms within a row are
# separated by "; " and an empty string marks a row without any annotation.
df <- data.frame(
  GO = c(
    "C:mitochondrion; C:kinetoplast",
    "",
    "F:calmodulin binding; C:cytoplasm; C:axoneme",
    "",
    "P:cilium movement; P:inner dynein arm assembly; C:axoneme",
    "",
    "F:calcium ion binding"
  )
)


                                                         GO
1                            C:mitochondrion; C:kinetoplast
2                                                          
3              F:calmodulin binding; C:cytoplasm; C:axoneme
4                                                          
5 P:cilium movement; P:inner dynein arm assembly; C:axoneme
6                                                          
7                                     F:calcium ion binding

I want to split this column into three columns BP, CC, MF based on whether the terms start with a P, C or an F respectively. Also I want the three columns to have only the terms and not the other identifiers (P, C, F and :).

This is what I want my new dataframe to look like:

                                          BP                         CC                  MF
1                                            mitochondrion; kinetoplast                    
2                                                                                          
3                                                    cytoplasm; axoneme  calmodulin binding
4                                                                                          
5 cilium movement; inner dynein arm assembly                    axoneme                    
6                                                                                          
7                                                                       calcium ion binding

3 Answers 3

2

A tidyverse approach to achieve your desired result may look like so:

library(tidyr)
library(dplyr)

# Split the GO column into BP / CC / MF columns, one output row per input row.
df %>%
  # Row identifier so the split terms can be regrouped per original row.
  mutate(id = row_number()) %>%
  separate_rows(GO, sep = ";\\s") %>%
  # extra = "merge" keeps a term intact when the term itself contains a ":";
  # fill = "right" leaves `item` as NA for empty strings without a warning.
  separate(
    GO,
    into = c("category", "item"),
    sep = ":",
    extra = "merge",
    fill = "right"
  ) %>%
  # Anything that is not C / P / F (here: the empty rows) goes to the
  # placeholder category "foo". That keeps the ids of empty rows in the
  # pivoted result; the placeholder column is dropped by select() below.
  mutate(
    category = recode(category, C = "CC", P = "BP", F = "MF", .default = "foo")
  ) %>%
  replace_na(list(item = "")) %>%
  group_by(id, category) %>%
  summarise(items = paste(item, collapse = "; "), .groups = "drop") %>%
  pivot_wider(names_from = category, values_from = items, values_fill = "") %>%
  select(BP, CC, MF)
#> # A tibble: 7 × 3
#>   BP                                           CC                          MF   
#>   <chr>                                        <chr>                       <chr>
#> 1 ""                                           "mitochondrion; kinetoplas… ""   
#> 2 ""                                           ""                          ""   
#> 3 ""                                           "cytoplasm; axoneme"        "cal…
#> 4 ""                                           ""                          ""   
#> 5 "cilium movement; inner dynein arm assembly" "axoneme"                   ""   
#> 6 ""                                           ""                          ""   
#> 7 ""                                           ""                          "cal…
Sign up to request clarification or add additional context in comments.

Comments

1

Here is one more:

  1. Create an identifier with row_number
  2. Use separate_rows to place each item in a single row
  3. Use str_detect in case_when to prepare the column names
  4. Remove the beginnings of the items, e.g. 'C:', 'F:' and 'P:'
  5. Group and collapse to one row
  6. Get distinct values and remove NA
  7. Apply pivot_wider and select the columns
library(tidyverse)

# Reshape GO into BP / CC / MF columns. Rows whose GO string is empty get no
# category (NA) and are removed by na.omit(), so they are absent from the
# result.
df %>%
  mutate(row = row_number()) %>%
  separate_rows(GO, sep = "; ") %>%
  # Anchor on the start of the term so that a "C:", "F:" or "P:" occurring
  # inside a term name cannot decide the category.
  mutate(names = case_when(str_detect(GO, "^C:") ~ "CC",
                           str_detect(GO, "^F:") ~ "MF",
                           str_detect(GO, "^P:") ~ "BP",
                           TRUE ~ NA_character_)) %>%
  # Strip only the leading category prefix; colons inside the term are kept.
  mutate(GO = str_remove(GO, "^[CFP]:")) %>%
  group_by(row, names) %>%
  mutate(b_x = paste(GO, collapse = "; ")) %>%
  distinct(b_x) %>%
  na.omit() %>%
  pivot_wider(
    names_from = names,
    values_from = b_x
  ) %>%
  ungroup() %>%
  select(BP, CC, MF)

  BP                                         CC                         MF                 
  <chr>                                      <chr>                      <chr>              
1 NA                                         mitochondrion; kinetoplast NA                 
2 NA                                         cytoplasm; axoneme         calmodulin binding 
3 cilium movement; inner dynein arm assembly axoneme                    NA                 
4 NA                                         NA                         calcium ion binding

Comments

1

Another possible solution:

library(tidyverse)

# Split GO into BP / CC / MF columns while keeping one row per input row.
df %>%
  rownames_to_column("id") %>%
  separate_rows(GO, sep = "; ") %>%
  # extra = "merge" keeps terms that contain a ":" themselves in one piece;
  # fill = "right" gives NA in `value` for the empty strings.
  separate(
    GO,
    into = c("name", "value"),
    sep = ":",
    extra = "merge",
    fill = "right"
  ) %>%
  # Drop the rows produced by empty GO strings; they are restored by the join.
  filter(complete.cases(.)) %>%
  # Relies on pivot_wider()'s default names_from = name / values_from = value;
  # values_fn = list gathers all terms of one category into a list cell.
  pivot_wider(id_cols = id, values_fn = list) %>%
  rowwise() %>%
  mutate(across(-id, ~ str_c(.x, collapse = "; "))) %>%
  ungroup() %>%
  # Joining onto every id brings back the rows that had no terms at all.
  left_join(data.frame(id = as.character(seq_len(nrow(df)))), .) %>%
  # Lambda form instead of passing "" through across()'s `...`, which is
  # deprecated since dplyr 1.1.0.
  mutate(across(everything(), ~ replace_na(.x, ""))) %>%
  select(BP = P, CC = C, MF = F)

#> Joining, by = "id"
#>                                           BP                         CC
#> 1                                            mitochondrion; kinetoplast
#> 2                                                                      
#> 3                                                    cytoplasm; axoneme
#> 4                                                                      
#> 5 cilium movement; inner dynein arm assembly                    axoneme
#> 6                                                                      
#> 7                                                                      
#>                    MF
#> 1                    
#> 2                    
#> 3  calmodulin binding
#> 4                    
#> 5                    
#> 6                    
#> 7 calcium ion binding

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.