1

The other day I answered my own question about how to loop through a dataframe to get all combinations of my variables in sets of 2, 3, 4, and 5, calculate a scale score, and assess some psychometrics.

It works, but it's really slow for my actual use case of all 616,645 combinations of 20 variables in sets of 2-10.

I don't have much experience running in parallel, but I think a possible solution is to use the foreach and doParallel packages as described in this SO answer. Unfortunately, I am not quite seeing how to adapt this idea to my use case.

Here is my actual code with a much smaller toy example that only takes seconds to run:

library(gtools)
library(OptimalCutpoints)

# new packages to run loop in parallel
library(foreach)
library(doParallel)
registerDoParallel(detectCores())  

# create fake data
  df <- data.frame(class=sample(0:1, 50, replace=T),
                   v01=sample(0:3, 50, replace=T),
                   v02=sample(0:3, 50, replace=T),
                   v03=sample(0:3, 50, replace=T),
                   v04=sample(0:3, 50, replace=T),
                   v05=sample(0:3, 50, replace=T))

# combinations
  dfoc <- as.data.frame(NULL)
  ri <- 1

  # I think the outer loop should somehow use 
  #    foreach(i=2:(length(df)-1)) %:%
  # and then the inner loop use
  #    foreach(r=1:nrow(p)) %dopar%
  # but I'm not sure of the assignment in either case
  # I want to build dfoc each iteration

  for (i in 2:(length(df)-1)) {  
    p <- combinations(n = length(df)-1, r = i, v = names(df[2:(length(df))]))
    for (r in 1:nrow(p)) {
      keep <- c("class", p[r,])
      v <- keep[-1]
      df_ <- df[, keep]
      df_$T <- rowSums(df_[,2:length(keep)])
      oc <- summary(optimal.cutpoints(X = "T", 
                                      status = "class",
                                      tag.healthy = 0,
                                      methods = "SpEqualSe",
                                      data = df_,
                                      control = control.cutpoints(),
                                      ci.fit = TRUE,
                                      conf.level = 0.95, 
                                      trace = FALSE))
      dfoc[ri,1] <- i                                    # number vars in set
      dfoc[ri,2] <- r                                    # permutation number
      dfoc[ri,3] <- paste(v, collapse=",")               # var names in set
      dfoc[ri,4] <- oc$p.table$Global$SpEqualSe[[1]][1]     # cutoff
      dfoc[ri,5] <- oc$p.table$Global$SpEqualSe[[1]][2]     # sen
      dfoc[ri,6] <- oc$p.table$Global$SpEqualSe[[1]][3]     # spe
      dfoc[ri,7] <- oc$p.table$Global$SpEqualSe[[1]][4]     # ppv
      dfoc[ri,8] <- oc$p.table$Global$SpEqualSe[[1]][5]     # npv
      dfoc[ri,9] <- oc$p.table$Global$SpEqualSe[[1]][2,2]   # sen l95
      dfoc[ri,10] <- oc$p.table$Global$SpEqualSe[[1]][2,3]  # sen u95
      dfoc[ri,11] <- oc$p.table$Global$SpEqualSe[[1]][3,2]  # spe l95
      dfoc[ri,12] <- oc$p.table$Global$SpEqualSe[[1]][3,3]  # spe u95
      dfoc[ri,13] <- oc$p.table$Global$SpEqualSe[[1]][4,2]  # ppv l95
      dfoc[ri,14] <- oc$p.table$Global$SpEqualSe[[1]][4,3]  # ppv u95
      dfoc[ri,15] <- oc$p.table$Global$SpEqualSe[[1]][5,2]  # npv l95
      dfoc[ri,16] <- oc$p.table$Global$SpEqualSe[[1]][5,3]  # npv u95
      dfoc[ri,17] <- oc$p.table$Global$AUC_CI               # auc
      ri <- ri+1
      remove(df_)
      remove(keep)
      remove(v)
      remove(oc)
    }
  }

2 Answers 2

0

This approach reduced the runtime for my actual use case (>600k combinations) from 2+ days to 2+ hours.

# packages  
  library(gtools)
  library(OptimalCutpoints)
  library(foreach)
  library(doParallel)
  registerDoParallel(detectCores())  

# create fake data
  df <- data.frame(class=sample(0:1, 50, replace=T),
                   v01=sample(0:3, 50, replace=T),
                   v02=sample(0:3, 50, replace=T),
                   v03=sample(0:3, 50, replace=T),
                   v04=sample(0:3, 50, replace=T),
                   v05=sample(0:3, 50, replace=T))

# combinations
  dfoc <- as.data.frame(NULL)
  ri <- 1

# outer function
  outer <- function(s, d) {
    p <- combinations(n = length(d)-1, r = s, v = names(d[2:(length(d))]))
    return(p)
  } 

# inner function
  combo <- function(i, r, p, d) {
    keep <- c("class", p[r,])
    v <- keep[-1]
    d_ <- d[, keep]
    d_$T <- rowSums(d_[,2:length(keep)])
    oc <- summary(optimal.cutpoints(X = "T", 
                                    status = "class",
                                    tag.healthy = 0,
                                    methods = "SpEqualSe",
                                    data = d_,
                                    control = control.cutpoints(),
                                    ci.fit = TRUE,
                                    conf.level = 0.95, 
                                    trace = FALSE))
    dfoc[ri,1] <- i                                    # number vars in set
    dfoc[ri,2] <- r                                    # permutation number
    dfoc[ri,3] <- paste(v, collapse=",")               # var names in set
    dfoc[ri,4] <- oc$p.table$Global$SpEqualSe[[1]][1]     # cutoff
    dfoc[ri,5] <- oc$p.table$Global$SpEqualSe[[1]][2]     # sen
    dfoc[ri,6] <- oc$p.table$Global$SpEqualSe[[1]][3]     # spe
    dfoc[ri,7] <- oc$p.table$Global$SpEqualSe[[1]][4]     # ppv
    dfoc[ri,8] <- oc$p.table$Global$SpEqualSe[[1]][5]     # npv
    dfoc[ri,9] <- oc$p.table$Global$SpEqualSe[[1]][2,2]   # sen l95
    dfoc[ri,10] <- oc$p.table$Global$SpEqualSe[[1]][2,3]  # sen u95
    dfoc[ri,11] <- oc$p.table$Global$SpEqualSe[[1]][3,2]  # spe l95
    dfoc[ri,12] <- oc$p.table$Global$SpEqualSe[[1]][3,3]  # spe u95
    dfoc[ri,13] <- oc$p.table$Global$SpEqualSe[[1]][4,2]  # ppv l95
    dfoc[ri,14] <- oc$p.table$Global$SpEqualSe[[1]][4,3]  # ppv u95
    dfoc[ri,15] <- oc$p.table$Global$SpEqualSe[[1]][5,2]  # npv l95
    dfoc[ri,16] <- oc$p.table$Global$SpEqualSe[[1]][5,3]  # npv u95
    dfoc[ri,17] <- oc$p.table$Global$AUC_CI               # auc
    ri <- ri+1
    remove(d_)
    remove(keep)
    remove(v)
    remove(oc)
    return(dfoc)
  }

# parallel
  system.time(
  y <- foreach(s=2:5) %do% {
    p <- outer(s, df)
    x <- foreach(r=1:nrow(p), .combine=rbind) %dopar% combo(s, r, p, df)
  }
  )

# convert to data frame
  finalDF <- do.call(rbind.data.frame, y)
Sign up to request clarification or add additional context in comments.

Comments

0

A colleague showed me how to vectorize and simplify:

## packages
library(gtools)
library(OptimalCutpoints)
library(foreach)
library(doParallel)
registerDoParallel(detectCores())

## create fake data
df <- data.frame(class=sample(0:1, 50, replace=T),
                 v01=sample(0:3, 50, replace=T),
                 v02=sample(0:3, 50, replace=T),
                 v03=sample(0:3, 50, replace=T),
                 v04=sample(0:3, 50, replace=T),
                 v05=sample(0:3, 50, replace=T))

## all combinations in one data frame
## 2:5 is the number of items
combos <- do.call(rbind, lapply(2:5, function(s) {
  data.frame(
    NItems = s,
    Vars = apply(combinations(
    n = length(df)-1,
    r = s,
    v = names(df[2:(length(df))])
  ), 1, paste, collapse = ","), stringsAsFactors=FALSE)
}))

## function
combo <- function(p, d) {
  keep <- c("class", unlist(strsplit(p[1, "Vars"], ",")))
  v <- keep[-1]
  d_ <- d[, keep]
  d_$T <- rowSums(d_[,2:length(keep)])
  oc <- summary(optimal.cutpoints(X = "T",
                                  status = "class",
                                  tag.healthy = 0,
                                  methods = "SpEqualSe",
                                  data = d_,
                                  control = control.cutpoints(),
                                  ci.fit = TRUE,
                                  conf.level = 0.95,
                                  trace = FALSE))

  out <- oc$p.table$Global$SpEqualSe[[1]]

  data.frame(
    cutoff = out[1],     # cutoff
    sen = out[2],     # sen
    spe = out[3],     # spe
    ppv = out[4],     # ppv
    npv = out[5],     # npv
    senl95 = out[2,2],   # sen l95
    senu95 = out[2,3],  # sen u95
    spel95 = out[3,2],  # spe l95
    speu95 = out[3,3],  # spe u95
    ppvl95 = out[4,2],  # ppv l95
    ppvu95 = out[4,3],  # ppv u95
    npvl95 = out[5,2],  # npv l95
    npvu95 = out[5,3],  # npv u95
    auc = oc$p.table$Global$AUC_CI, # auc
    stringsAsFactors = FALSE)
}


## not parallel
system.time(
  y <- foreach(r=1:nrow(combos), .combine=rbind) %do% combo(combos[r, , drop=FALSE], df)
)
finalDF <- cbind(combos, y)

## parallel
system.time(
  y2 <- foreach(r=1:nrow(combos), .combine=rbind) %dopar% combo(combos[r, , drop=FALSE], df)
)
finalDF2 <- cbind(combos, y2)

## test equal
all.equal(y, y2)

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.