2

I have an XML like:

    <Trait ID="4711" Type="Disease">
    <!--  each phenotype -->
    <Name>
    <ElementValue Type="Preferred">Breast-ovarian cancer, familial 1</ElementValue>
    <XRef ID="Breast-ovarian+cancer%2C+familial+1/7865" DB="Genetic Alliance"/>
    </Name>
    <Name>
    <ElementValue Type="Alternate">BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1</ElementValue>
    <XRef Type="MIM" ID="604370" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0001" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0002" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0003" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0004" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0005" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0006" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0007" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0008" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0009" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0010" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0011" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0012" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0013" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0014" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0015" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0016" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0017" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0018" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0019" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0020" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0021" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0022" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0023" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0024" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0025" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0026" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0027" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0028" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0029" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0030" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0031" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0032" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0033" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0034" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0035" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0036" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0037" DB="OMIM"/>
    </Name>
    <Name>
    <ElementValue Type="Alternate">OVARIAN CANCER, SUSCEPTIBILITY TO</ElementValue>
    <XRef Type="Allelic variant" ID="602667.0001" DB="OMIM"/>
    </Name>
    <Name>
    <ElementValue Type="Alternate">BREAST CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1</ElementValue>
    <XRef ID="604370" DB="OMIM"/>
    </Name>
    <Name>
    <ElementValue Type="Alternate">Breast cancer, familial 1</ElementValue>
    </Name>
    <Name>
    <ElementValue Type="Alternate">Breast-ovarian cancer, familial 1 and 2</ElementValue>
    <XRef ID="GTR000310494" DB="Laboratory of Genetics,HUSLAB"/>
    </Name>
    <Name>
    <ElementValue Type="Alternate">BRCA1 Gene Mutation</ElementValue>
    <XRef ID="GTR000501743" DB="Myriad Genetic Laboratories,Myriad Genetic Laboratories, Inc."/>
    </Name>
</Trait>

And I want to parse the XML to a data frame like:

enter image description here

Im tring to use the r package XML, but the problem that i have more XREF attribute values than names of trait values. I can solve this with a for loop but that is often not the "R way". I was wondering if, there are is a simpler solution? (e.g. with xpath query).

I'm trying something like this:

x <- do.call(rbind, xpathApply(xml_1, "//TraitSet/Trait[@ID='4711']/Name", function(node) {
  
  trait <- xmlValue(node[["ElementValue"]])
  
  xp <- "//TraitSet/Trait[@ID='4711']/Name/XRef"
  DB <- sapply(c("ID","DB"), function(x) xpathSApply(xmltop, '//TraitSet/Trait/Name/XRef', xmlGetAttr, x))
  if (is.null(DB)) DB <- NA
  
  data.frame(trait, DB, stringsAsFactors = FALSE)
  
}))

but the records are incorrectly multiplied.

I will appreciate help! Thanks

1 Answer 1

1

This is a solution I think with xml2 and a quick and verbose tidy verse approach.

Not always easy to have a compact code when importing nested xml data.


library(xml2)
library(dplyr)
library(tidyr)
library(purrr)

# read the xml file
te <- xml2::read_xml("~/Desktop/test_so.xml")

# to move from node to node
cursor <- xml2::xml_find_all(te, ".//Name")

# i <- 1L
seq_along(cursor) %>% 
  # map to move along cursor
  purrr::map_df(function(i){
    print(i)
    x <- cursor[i]
    # first part is Name and Type (Alternate/Disease/Preferred)
    dplyr::tibble(Type = xml2::xml_attr(xml_find_all(x, './/ElementValue'), 'Type'),
         Trait = xml2::xml_text(x)) -> temp
    
    # second part is not always here so test if it exists before, then
    # if it exists, extract and compact it (nest)
    if (!is.na(xml2::xml_text(xml2::xml_find_first(x, './/XRef')))){
      Details <- dplyr::tibble(
         DB = xml2::xml_find_all(x, './/XRef') %>% 
           xml2::xml_attr('DB'),
         ID = xml2::xml_find_all(x, './/XRef') %>% 
           xml2::xml_attr('ID'),
         Type_ = xml2::xml_find_all(x, './/XRef') %>% 
           xml2::xml_attr('Type')) %>% tidyr::nest(data = c('DB', 'ID', 'Type_'))
      } else {
      # if it doesn't exist, fill with an empty df and compact it too (nest)
      Details <- dplyr::tibble(DB = NA, ID = NA, Type_ = NA) %>% 
        tidyr::nest(data = c('DB', 'ID', 'Type_'))
      }
    # add this new "df" column to temporary object
    temp <- temp %>% dplyr::mutate(Details = Details$data)
    # return
    temp
  }) -> te2

result <- te2 %>% 
  unnest(Details)
# A tibble: 44 × 5
   Type      Trait                                                 DB               ID                                       Type_          
   <chr>     <chr>                                                 <chr>            <chr>                                    <chr>          
 1 Preferred Breast-ovarian cancer, familial 1                     Genetic Alliance Breast-ovarian+cancer%2C+familial+1/7865 NA             
 2 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             604370                                   MIM            
 3 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0001                              Allelic variant
 4 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0002                              Allelic variant
 5 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0003                              Allelic variant
 6 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0004                              Allelic variant
 7 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0005                              Allelic variant
 8 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0006                              Allelic variant
 9 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0007                              Allelic variant
10 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0008                              Allelic variant
# … with 34 more rows
>
Sign up to request clarification or add additional context in comments.

2 Comments

Its an excelent resolution @Guillaume ! This is very readable. Thank you!
thanks for the feedback. This code and your case will helps me too when I have to do the same kind of things at work.

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.