Extract data from xml to data frame

Question

I would be very grateful if someone could show me how to extract data from xml to R. Below is an example of 1 compound from my xml file but the real file contains several hundred such compounds. I know there are several similar questions posted but so far I have not been able to develop the previous answers to suit my requirements. For example, I can use

# Parse the file, select every <isotope> node anywhere in the document,
# and flatten those nodes (mz, abundance) into a data frame.
doc <- xmlParse("isotope information.xml")
isotope_nodes <- getNodeSet(doc, "//isotope")
xmlToDataFrame(isotope_nodes, colClasses = c("character", "numeric"))

to extract a VERY long list of "mz" and "abundance" values but these are of no use unless they are linked to the relevant compound and sample etc. Also this method does not seem to work if I try further up the tree, I think part of the reason is because of the different types of information and/or spaces in the names?

Any help much appreciated. I am new to R and had not heard of xPath until I started on this file!

<?xml version="1.0" encoding="utf-8"?>
<compounds>
  <compound identifier="24.24_355.2087m/z" retentionTime="24.2409">
    <statistics>
      <anova>0.0013522641768629606</anova>
      <maxFoldChange>18.444703223432118</maxFoldChange>
      <mean lowest="Group A" highest="Group B" />
    </statistics>
    <condition name="Group A">
      <sample name="ACU_S1_D1_MSonly" normalizedAbundance="0.16176030585271">
        <adduct charge="2">
          <isotope>
            <mz>355.131459235488</mz>
            <abundance>0.115052197015018</abundance>
          </isotope>
          <isotope>
            <mz>355.704849713088</mz>
            <abundance>0</abundance>
          </isotope>
        </adduct>
      </sample>
      <sample name="ACU_S4_D1_MSonly" normalizedAbundance="0.648153833258576">
        <adduct charge="2">
          <isotope>
            <mz>355.210174560547</mz>
            <abundance>0.45734640955925</abundance>
          </isotope>
          <isotope>
            <mz>355.704849713088</mz>
            <abundance>0</abundance>
          </isotope>
        </adduct>
      </sample>
      <sample name="ACU_S7_D1_MSonly" normalizedAbundance="0">
        <adduct charge="2">
          <isotope>
            <mz>355.206065493636</mz>
            <abundance>0</abundance>
          </isotope>
          <isotope>
            <mz>355.704849713088</mz>
            <abundance>0</abundance>
          </isotope>
        </adduct>
      </sample>
      <sample name="ACU_S9_D1_MSonly" normalizedAbundance="0">
        <adduct charge="2">
          <isotope>
            <mz>355.206065493636</mz>
            <abundance>0</abundance>
          </isotope>
          <isotope>
            <mz>355.704849713088</mz>
            <abundance>0</abundance>
          </isotope>
        </adduct>
      </sample>
      <sample name="ACU_S10_D1_MSonly" normalizedAbundance="1.40543741447065">
        <adduct charge="2">
          <isotope>
            <mz>355.222929359468</mz>
            <abundance>0.998472798001696</abundance>
          </isotope>
          <isotope>
            <mz>355.785247802734</mz>
            <abundance>0.00450361325390688</abundance>
          </isotope>
        </adduct>
      </sample>
      <sample name="ACU_S11_D1_MSonly" normalizedAbundance="0">
        <adduct charge="2">
          <isotope>
            <mz>355.206065493636</mz>
            <abundance>0</abundance>
          </isotope>
          <isotope>
            <mz>355.704849713088</mz>
            <abundance>0</abundance>
          </isotope>
        </adduct>
      </sample>
      <sample name="ACU_S14_D1_MSonly" normalizedAbundance="0">
        <adduct charge="2">
          <isotope>
            <mz>355.206065493636</mz>
            <abundance>0</abundance>
          </isotope>
          <isotope>
            <mz>355.704849713088</mz>
            <abundance>0</abundance>
          </isotope>
        </adduct>
      </sample>
      <sample name="ACU_S17_D1_MSonly" normalizedAbundance="0">
        <adduct charge="2">
          <isotope>
            <mz>355.206065493636</mz>
            <abundance>0</abundance>
          </isotope>
          <isotope>
            <mz>355.704849713088</mz>
            <abundance>0</abundance>
          </isotope>
        </adduct>
      </sample>
    </condition>
    <condition name="Group B">
      <sample name="ACU_S2_D1_MSonly" normalizedAbundance="8.08281443709004">
        <adduct charge="2">
          <isotope>
            <mz>355.217085869147</mz>
            <abundance>6.34168970755279</abundance>
          </isotope>
          <isotope>
            <mz>355.720179758869</mz>
            <abundance>1.01208656740541</abundance>
          </isotope>
        </adduct>
      </sample>
      <sample name="ACU_S3_D1_MSonly" normalizedAbundance="1.74468788905785">
        <adduct charge="2">
          <isotope>
            <mz>355.236865028724</mz>
            <abundance>1.25719554540164</abundance>
          </isotope>
          <isotope>
            <mz>355.704849713088</mz>
            <abundance>0</abundance>
          </isotope>
        </adduct>
      </sample>
      <sample name="ACU_S5_D1_MSonly" normalizedAbundance="1.20519908118674">
        <adduct charge="2">
          <isotope>
            <mz>355.221413778655</mz>
            <abundance>0.693123193025995</abundance>
          </isotope>
          <isotope>
            <mz>355.704849713088</mz>
            <abundance>0</abundance>
          </isotope>
        </adduct>
      </sample>
      <sample name="ACU_S6_D1_MSonly" normalizedAbundance="11.8264838326202">
        <adduct charge="2">
          <isotope>
            <mz>355.208446325351</mz>
            <abundance>5.67846393951768</abundance>
          </isotope>
          <isotope>
            <mz>355.712529790798</mz>
            <abundance>0.718700468540192</abundance>
          </isotope>
        </adduct>
      </sample>
      <sample name="ACU_S12_D1_MSonly" normalizedAbundance="6.62039336582067">
        <adduct charge="2">
          <isotope>
            <mz>355.195225774627</mz>
            <abundance>4.80023810084345</abundance>
          </isotope>
          <isotope>
            <mz>355.704849713088</mz>
            <abundance>0</abundance>
          </isotope>
        </adduct>
      </sample>
      <sample name="ACU_S13_D1_MSonly" normalizedAbundance="9.10340543014277">
        <adduct charge="2">
          <isotope>
            <mz>355.231293658837</mz>
            <abundance>8.75476514173928</abundance>
          </isotope>
          <isotope>
            <mz>355.73683673041</mz>
            <abundance>1.118534732035</abundance>
          </isotope>
        </adduct>
      </sample>
      <sample name="ACU_S15_D1_MSonly" normalizedAbundance="0">
        <adduct charge="2">
          <isotope>
            <mz>355.206065493636</mz>
            <abundance>0</abundance>
          </isotope>
          <isotope>
            <mz>355.704849713088</mz>
            <abundance>0</abundance>
          </isotope>
        </adduct>
      </sample>
      <sample name="ACU_S16_D1_MSonly" normalizedAbundance="2.27851790546988">
        <adduct charge="2">
          <isotope>
            <mz>355.242192813064</mz>
            <abundance>1.25391817825056</abundance>
          </isotope>
          <isotope>
            <mz>355.704849713088</mz>
            <abundance>0</abundance>
          </isotope>
        </adduct>
      </sample>
    </condition>
  </compound>

UPDATE TO ORIGINAL POST: Hi again, and many thanks for your initial help. Using both XML and xml2, I have tried to elaborate on the answers to get the data frame that I need, but I am still struggling, so I am adding more information...

I have ascertained the structure of the xml document as being:

# load necessary package(s)
library(XML)

# parse the xml file into an R object called xmlfile
# xmlParse() builds an internal (C-level) document, which is what the class
# output noted below corresponds to; plain xmlTreeParse() defaults to
# useInternalNodes = FALSE and would return an R-level "XMLDocument" instead
xmlfile <- xmlParse("QI isotope information.xml")


# check that the xmlfile object is recognised as an xml class
class(xmlfile) # the output should be: "XMLInternalDocument" "XMLAbstractDocument"

# find the root of the xml file
xmltop <- xmlRoot(xmlfile)
class(xmltop) #  "XMLInternalElementNode" "XMLInternalNode"        "XMLAbstractNode"
xmlName(xmltop) # "compounds"
xmlSize(xmltop) # 4278

# the root of the xmlfile is "compounds" and it has 4278 children
# to view the content of the first child use:
xmltop[[1]]

# this contains all of the information from a unique compound identifier:
# <compound identifier="106.16_603.4571m/z" retentionTime="106.16268333333333">
#  <statistics>
#    <anova>1.1102230246251565E-16</anova>
#    <maxFoldChange>321.93091917042375</maxFoldChange>
#    <mean lowest="D9" highest="D1"/>
#  </statistics>
#  <condition name="D1">
#    <sample name="ACU_S1_D1_MSonly" normalizedAbundance="2016.23926856296">
#      <adduct charge="1">
#        <isotope>
#          <mz>603.509454467435</mz>
#          <abundance>1017.28655636311</abundance>
#        </isotope>
#        <isotope>
#          <mz>604.51484984744</mz>
#          <abundance>346.272257983685</abundance>
#        </isotope>
#        <isotope>
#          <mz>605.519216627667</mz>
#          <abundance>64.8701884746552</abundance>
#        </isotope>
#      </adduct>
#    </sample>
# N.B. this list is repeated for each sample name, in this case n=64 samples

xmlSize(xmltop[[1]]) # gives the number of child nodes of the first compound (not of the root), in this case n=5
xmlSApply(xmltop[[1]], xmlName) # gives the names of these 5 nodes
#  statistics    condition    condition    condition    condition 
# "statistics"  "condition"  "condition"  "condition"  "condition" 
xmlSApply(xmltop[[1]], as.list) # applies as.list to each of these 5 child nodes

xmltop[[1]][[1]] # takes you to the statistics output:
# <statistics>
#  <anova>1.1102230246251565E-16</anova>
#   <maxFoldChange>321.93091917042375</maxFoldChange>
#   <mean lowest="D9" highest="D1"/>
# </statistics>

xmltop[[1]][[2]] # takes you to the "condition" level, i.e. condition name="D1"

xmltop[[1]][[2]][[1]] # takes you to the "sample" level, i.e. sample name="ACU_S1_D1_MSonly"

xmltop[[1]][[2]][[2]] # takes you to the "sample" level number 2, i.e. sample name="ACU_S2_D1_MSonly"

xmltop[[1]][[2]][[1]][[1]] # takes you to the "adduct" level, i.e. adduct charge="1"

xmltop[[1]][[2]][[1]][[1]][[1]] # takes you to the "isotope" level, which includes m/z and abundance

# incrementing the last-but-one index number takes you to each isotope of that
# adduct; the final index picks the child of the isotope (1 = mz, 2 = abundance)
# for example:

xmltop[[1]][[2]][[1]][[1]][[1]][[1]] # <mz>603.509454467435</mz> 
xmltop[[1]][[2]][[1]][[1]][[1]][[2]] # <abundance>1017.28655636311</abundance> 
xmltop[[1]][[2]][[1]][[1]][[2]][[1]] # <mz>604.51484984744</mz> 
xmltop[[1]][[2]][[1]][[1]][[2]][[2]] # <abundance>346.272257983685</abundance>
xmltop[[1]][[2]][[1]][[1]][[3]][[1]] # <mz>605.519216627667</mz>  
xmltop[[1]][[2]][[1]][[1]][[3]][[2]] # <abundance>64.8701884746552</abundance>
xmltop[[1]][[2]][[1]][[1]][[4]][[1]] # NULL (the adduct shown above has only 3 isotopes)
xmltop[[1]][[2]][[1]][[1]][[4]][[2]] # NULL

I am not interested in the statistics section but I would like to create a data frame where the str output would be something like:

# > str(mydata) # returns a summary of the type/ format of each column
# 'data.frame': n obs. of  n variables:
# $ compound : Factor w/ n levels 
# $ retention_time :
# $ condition : Factor w/ 4 levels "D1","D3","D6","D9":
# $ sample_name  : Factor w/ 16 levels "ACU_S1_D1","ACU_S2_D1...: 
# $ isotope_mz : num
# $ isotope_abundance : num

My final aim is to be able to extract the abundance of each isotope_mz for every one of the 64 samples. In fact, knowing the condition is not important because this can be determined from sample_name.

N.B. the xml file I am working with is 150 MB and has >4000 compounds x 64 samples, and each compound has between 1 and 4 isotopes whose mz and abundance I need. In addition to the 'R' approach requested here, I have also searched for and tried numerous xml converters, but none of them are able to decipher the structure of this xml file.

Can you give an example of the kind of result you need? Also you can elaborate on what you mean by "does not seem to work" (what doesn't it do?) and "further up the tree", giving examples. — LarsH
– LarsH, Commented Jul 25, 2016 at 21:53
I would use XML::xmlToList() then parse the list as you see fit. Also, you're missing </compounds> at the bottom of your example xml file -- it will not load without it. — dayne
– dayne, Commented Jul 26, 2016 at 1:06
Thank you for the questions. I have added to the original post by including some of my progress so far and a better description of my aims. — Jatin
– Jatin, Commented Jul 31, 2016 at 10:55
Unfortunately as_list does not work with this file: > mylist <- as_list("QI isotope information") Error in UseMethod("as_list") : no applicable method for 'as_list' applied to an object of class "character" — Jatin
– Jatin, Commented Jul 31, 2016 at 20:45

dayne · Accepted Answer · 2016-07-26 02:38:14Z

1

Something like this should work:

library(XML)
library(data.table)

mylist <- xmlToList("isotope information.xml")
# The example file holds one compound; repeat it to mimic a multi-compound file.
mylist <- c(mylist, mylist, mylist)

# Return the children of a parsed node (a named list) whose name equals `tag`.
# Sibling elements share one name (several "condition", "sample" and
# "isotope" entries), and `$` / `[[` by name only ever reach the first one.
children_named <- function(node, tag) {
  node[names(node) == tag]
}

# Build one row per isotope for the x-th compound.
#
# x          index of the compound inside `compounds`
# compounds  list as returned by xmlToList(); defaults to the global `mylist`
#
# Returns a data.table with columns compound_id, sample_id, mz and abundance
# (mz and abundance converted to numeric); samples without isotopes
# contribute no rows.
xtract <- function(x, compounds = mylist) {
  compound <- compounds[[x]]
  compound_id <- compound[[".attrs"]][["identifier"]]

  # Rows for a single <sample>: every isotope of every adduct it contains.
  sample_rows <- function(smp) {
    isotopes <- unlist(
      lapply(children_named(smp, "adduct"), children_named, tag = "isotope"),
      recursive = FALSE, use.names = FALSE
    )
    if (length(isotopes) == 0L) {
      return(NULL) # rbindlist() skips NULL entries
    }
    data.table(
      compound_id = compound_id,
      sample_id = smp[[".attrs"]][["name"]],
      mz = as.numeric(
        vapply(isotopes, function(iso) iso[["mz"]], character(1))
      ),
      abundance = as.numeric(
        vapply(isotopes, function(iso) iso[["abundance"]], character(1))
      )
    )
  }

  # Rows for a single <condition>: all of its samples stacked.
  condition_rows <- function(condition) {
    rbindlist(lapply(children_named(condition, "sample"), sample_rows))
  }

  rbindlist(lapply(children_named(compound, "condition"), condition_rows))
}

rbindlist(lapply(seq_along(mylist), xtract))
# One row per isotope of every sample of every compound, with columns
# compound_id, sample_id, mz, abundance. For the example file (1 compound,
# 16 samples x 2 isotopes, repeated 3 times above) that is 96 rows; e.g. the
# first row is 24.24_355.2087m/z / ACU_S1_D1_MSonly / mz 355.131459235488 /
# abundance 0.115052197015018.

edited Jul 26, 2016 at 2:38

answered Jul 26, 2016 at 1:23

dayne

7,8547 gold badges42 silver badges59 bronze badges

Sign up to request clarification or add additional context in comments.

Comments

vincentmajor · Accepted Answer · 2016-07-26 01:48:23Z

I personally prefer xml2 so here is an answer using that. I'm sure it could be improved but it will give you a list with length equal to the number of compounds and each element of the list will be the compound identifier and a data.frame of mz and abundance columns.

library(xml2)
x <- read_xml(conn) # given in question
# xml_structure(x) # If you want to look at the structure

# Collect all compound nodes once; their attributes and their isotopes are
# then read from the same node set so the i-th entries always belong together.
compounds <- xml_find_all(x, "//compound")
a <- xml_attrs(compounds)

# One list element per compound: the identifier (retention time is assumed
# not to be needed) and a data.frame of that compound's mz / abundance values.
output <- vector("list", length(compounds))
for (i in seq_along(compounds)) {
  y <- compounds[[i]]
  output[[i]] <- list(
    a[[i]]["identifier"],
    # The leading "." makes the XPath relative to `y`; a bare "//isotope/..."
    # is absolute and would match every isotope in the whole document.
    data.frame(
      mz = xml_double(xml_find_all(y, ".//isotope/mz")),
      abundance = xml_double(xml_find_all(y, ".//isotope/abundance"))
    )
  )
}

OUTPUT:

> output
[[1]]
[[1]][[1]]
         identifier 
"24.24_355.2087m/z" 

[[1]][[2]]
         mz   abundance
1  355.1315 0.115052197
2  355.7048 0.000000000
...
31 355.2422 1.253918178
32 355.7048 0.000000000

Collectives™ on Stack Overflow

Extract data from xml to data frame

2 Answers 2

Comments

OUTPUT:

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

2 Answers 2

Comments

OUTPUT:

Comments

Your Answer

Sign up or log in

Post as a guest

Related