3

I have the following XML tree

library("XML")
library("RCurl")
# Download the sample XML document, save it locally, and parse it into an
# internal document that XPath queries can be run against.
url <- "https://doc-0s-9c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/rk8a2gr7rl8e8s8j0luiak0cahtcjnak/1459080000000/07495711428163271540/*/0BzmnaOABaMIgTEl6SnRUdU9Eb2M?e=download"
bin <- getURL(url)
con <- file("reference.xml", open = "wb")
writeBin(bin, con)
close(con)
# The argument is spelled out in full rather than relying on partial
# argument matching.
OperationList <- xmlTreeParse("reference.xml", useInternalNodes = TRUE)

I am able to get one dataframe for plan name and one for operation name.

# Collect the text of every plan name and, separately, of every operation
# name, each into its own single-column data frame. Indexing the parsed
# document with an XPath string yields the matching nodes, and xmlValue
# extracts the text of each one.
planname <- data.frame(sapply(OperationList["//subgroups/OperationGroup/subgroups/OperationGroup/operations/OperationHeader/plans/PlanHeader/name"], xmlValue))
operationanme <- data.frame(sapply(OperationList["//subgroups/OperationGroup/subgroups/OperationGroup/operations/OperationHeader/name"], xmlValue))

However, combining them into one data frame (i.e., flattening the XML tree) does not work.

I tried multiple approaches (see below for each attempt and the error message it produced), but nothing has worked so far. Thanks for pointing out the errors I made.

xmlToDataFrame function

# Attempt 1: let xmlToDataFrame() convert the whole parsed document in one call.
Operation.df1 <-  xmlToDataFrame(OperationList)

duplicate subscripts for columns

xmlToDF function

as per https://hopstat.wordpress.com/2014/01/14/faster-xml-conversion-to-data-frames/

require(XML)
#' Convert repeated XML records into a data frame
#'
#' Every node matched by `xpath` is treated as one record (row), and every
#' distinct child-element name found across those records becomes a column.
#' Records that lack a field keep `NA` in that column.
#'
#' @param doc A parsed XML document, or any input `xmlParse()` accepts when
#'   `isXML = FALSE`.
#' @param xpath XPath expression selecting the record nodes.
#' @param isXML If `FALSE`, `doc` is parsed with `xmlParse()` first.
#' @param usewhich If `TRUE`, rows are addressed by position (via `which()`)
#'   rather than by logical mask when the result is filled in.
#' @param verbose If `TRUE`, print each field name as it is extracted.
#' @return A data frame with one row per matched node and one column per
#'   field. It has zero rows and/or zero columns when nothing matches.
xmlToDF <- function(doc, xpath, isXML = TRUE, usewhich = TRUE, verbose = TRUE) {
  if (!isXML) {
    doc <- xmlParse(doc)
  }

  # One entry per record: the names of that record's child elements.
  nodeset <- getNodeSet(doc, xpath)
  var.names <- lapply(nodeset, names)

  # Every field name that occurs in at least one record.
  fields <- unique(unlist(var.names))

  # Values of each field across the whole document, in document order.
  # Values are matched to rows by order, so this assumes that a record
  # contains any given field at most once.
  dl <- lapply(fields, function(x) {
    if (verbose) {
      print(paste0("  ", x))
    }
    xpathSApply(doc, paste0(xpath, "/", x), xmlValue)
  })

  # Logical record-by-field matrix: does record i contain field j?
  # The dimensions are set explicitly so that a single field, a single
  # record, or no match at all still produce a correctly shaped matrix.
  has.field <- vapply(
    var.names,
    function(x) fields %in% x,
    logical(length(fields))
  )
  name.mat <- matrix(
    as.vector(has.field),
    nrow = length(var.names),
    ncol = length(fields),
    byrow = TRUE
  )

  df <- data.frame(matrix(NA, nrow = nrow(name.mat), ncol = ncol(name.mat)))
  names(df) <- as.character(fields)

  # Fill each column only in the rows whose record has that field.
  # seq_len() skips the loop entirely when there are no fields.
  for (icol in seq_len(ncol(name.mat))) {
    rep.rows <- name.mat[, icol]
    if (usewhich) {
      rep.rows <- which(rep.rows)
    }
    df[rep.rows, icol] <- dl[[icol]]
  }

  df
}

# Attempt 2: run xmlToDF() on the names of the nested operation groups.
# NOTE(review): this path starts with a single "/" (anchored at the document
# root), whereas the queries that work earlier start with "//" — confirm that
# it matches any node at all.
Operation.df2 <- xmlToDF(OperationList,
  xpath = "/subgroups/OperationGroup/subgroups/OperationGroup/name")

Error in name.mat[, icol] : subscript out of bounds

rbind & xpathApply

require(XML)

# Attempt 3: build one small data frame per matched node, then stack them.
# NOTE(review): the XPath starts with a single "/" (anchored at the document
# root) and selects the `name` elements themselves, yet the callback looks for
# a "name" child and an "operations" subtree beneath each match — presumably
# the path should use "//" and stop at OperationGroup; confirm against the XML.
Operation.df3 <- xpathApply(OperationList,
  "/subgroups/OperationGroup/subgroups/OperationGroup/name",
  function(node) {
  region <- xmlValue(node[["name"]])
  xp <- "./operations/OperationHeader/name"
  operation <- xpathSApply(node, xp, xmlValue)
  if (is.null(operation)) operation <- NA
  data.frame(region, operation, stringsAsFactors = FALSE)
})
do.call(rbind, Operation.df3  )

gives a NULL

xmlToList and plyr

# Attempt 4: turn the whole document into a nested list, then ask plyr to
# build one data frame per top-level element and stack them.
library(XML)
library(plyr)
OperationList2 <- xmlToList(OperationList)
Operation.df4 <- ldply(OperationList2, data.frame)

This gives me: arguments imply differing number of rows: 1, 0

xmlToList, plyr and data.table

require(data.table)
# Attempt 5a: bind the elements of the nested list row-wise with data.table
# and convert the result to a plain data frame.
Operation.df41 <- data.frame(rbindlist(OperationList2))

Item 1 of list input is not a data.frame, data.table or list

# Attempt 5b: the same row-wise bind, keeping the data.table result as is.
Operation.df42 <-  rbindlist(OperationList2)

Item 1 of list input is not a data.frame, data.table or list

# Attempt 5c: flatten the nested list into one vector and wrap it in a matrix.
# matrix() is given neither nrow nor ncol here, so every value ends up in a
# single column.
Operation.df43 <- data.frame(matrix(unlist(OperationList2),
      byrow = TRUE), stringsAsFactors = FALSE)

only one column

# Attempt 5d: convert each top-level element of the nested list into its own
# data frame, keeping strings as character vectors.
Operation.df44 <- lapply(OperationList2, data.frame,
  stringsAsFactors = FALSE) 

arguments imply differing number of rows: 1, 0

# Attempt 5e: stack the per-element data frames from the previous step.
Operation.df45 <- rbind.fill(Operation.df44)

Using a function in a loop

Convert (possibly malformed) xml into Data Frame in R

# Return the text of every node matching `tag` in `OperationList`, joined
# into one "; "-separated string, or NA when no node matches.
xp <- function(OperationList, tag) {
  values <- xpathSApply(OperationList, tag, xmlValue)

  # Nothing matched: signal the gap with NA.
  if (length(values) == 0) {
    return(NA)
  }

  # One or more matches: collapse them into a single string.
  paste0(values, collapse = "; ")
}

# Attempt 6: visit each nested OperationGroup, summarise it with xp(), and
# stack the per-group rows.
z <- getNodeSet(OperationList, "//subgroups/OperationGroup/subgroups/OperationGroup")
n <-length(z)
# NOTE(review): `notices` is preallocated here but never filled below.
notices <-vector("list",n)
for(i in 1:n)
{
  # Operation.df5 is assigned a stand-alone document built from node i ...
  Operation.df5<-xmlDoc(z[[i]])
  # ... and is then indexed with [[i]] as if it were a list of results, which
  # matches the quoted "externalptr is not subsettable" error.
  # NOTE(review): `z2` is not defined anywhere in this snippet.
  Operation.df5[[i]] <- data.frame(
    region = xp(z2, "//name"),
    operation = xp(z2, "//operations/OperationHeader/name"),
    stringsAsFactors=FALSE)
  free(Operation.df5)  
}
do.call("rbind", Operation.df5)

object of type 'externalptr' is not subsettable

With getNodeSet

# Attempt 7: loop over the OperationGroup nodes and append one row per node.
# NOTE(review): the node set is only used for its length; the loop body
# indexes OperationList (the document itself) with [[i]], which matches the
# quoted "No method for subsetting an XMLInternalDocument" error.
for (i in 1:length(getNodeSet(OperationList, "//subgroups/OperationGroup"))) 
{
  if (i==1) {
    # First iteration: create the result from the first row.
    foo<-xmlSApply(OperationList[[i]], xmlValue)
    Operation.df6 <-data.frame(t(foo), stringsAsFactors=FALSE)
  }
  else {
    # Later iterations: append one more row to the result.
    foo<-xmlSApply(OperationList[[i]], xmlValue)
    tmp<-data.frame(t(foo), stringsAsFactors=FALSE)
    Operation.df6 <-rbind(Operation.df6, tmp)
  }
}

No method for subsetting an XMLInternalDocument with integer

Please help! What am I missing?

0

1 Answer 1

4

For each OperationHeader get its name (opName) and all the plans' names (plan) creating a list of one component per OperationHeader. Finally rbind the components together:

# For every OperationHeader, pair its own name with each of its plan names,
# giving one two-column character matrix per operation. xpathApply() always
# returns a list; xpathSApply() may instead simplify equally sized results
# into a single matrix, which the rbind step below could not take apart.
L <- xpathApply(OperationList, "//OperationHeader", function(x)
           cbind(opName = xmlValue(x[["name"]]),
                 plan = xpathSApply(x, "plans/PlanHeader/name", xmlValue)
           )
     )

# Stack the per-operation matrices into one matrix.
do.call("rbind", L)

giving:

     opName    plan         
[1,] "State A" "Target Plan"
[2,] "State A" "Revision"   
[3,] "State B" "Target Plan"
[4,] "Avgh"    "Target Plan"
[5,] "Alaska"  "Target Plan"
[6,] "Alaska"  "Revision"  
Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.