0

In R, I have two dataframes. The first is called d and looks like this:

# Sample of the occurrence table `d`, rebuilt from dput() output:
# 4 rows and 13 columns. `depth` is stored as numeric here, while
# `datecollected`, `resname`, `originalscientificname` and `collectioncode`
# are factors. The original row names (7216, 7217, 7218, 11980) are kept.
d <- structure(list(id = c(384923059L, 384923060L, 384923061L, 386269528L
), decimalLatitude = c(46.08, 48.73333, 46.35, 58.16), decimalLongitude = c(-55.40333, 
-52.96667, -52.73333, -61.088), datecollected = structure(c(2L, 
3L, 2L, 1L), .Label = c("2015-08-20 12:00:00+02", "2015-11-19 12:00:00+01", 
"2015-11-27 12:00:00+01"), class = "factor"), institutioncode = c("ARC", 
"ARC", "ARC", "DFOCENARC"), individualcount = c(NA_real_, NA_real_, 
NA_real_, NA_real_), depth = c(93, 95, 166, 216), resname = structure(c(1L, 
1L, 1L, 2L), .Label = c("Atlantic Reference Centre Museum of Canadian Atlantic Organisms - Invertebrates and Fishes Data", 
"DFO Central and Arctic Multi-species Stock Assessment Surveys"
), class = "factor"), originalscientificname = structure(c(1L, 
1L, 1L, 1L), .Label = "Mallotus villosus", class = "factor"), 
    collectioncode = structure(c(1L, 1L, 1L, 2L), .Label = c("ARC", 
    "DFOSurvey_Modified Standard (14\") Campelen"), class = "factor"), 
    year = c(2015, 2015, 2015, 2015), month = c(11, 11, 11, 8
    ), day = c(19, 27, 19, 20)), row.names = c(7216L, 7217L, 
7218L, 11980L), class = "data.frame")

str(c) reveals (this output describes the second dataframe, c, which is defined further down)

'data.frame':   50 obs. of  6 variables:
 $ layer_name: num  0.506 1.556 2.668 3.856 5.14 ...
 $ raw_min   : num  0 1.03 2.11 3.26 4.5 ...
 $ raw_max   : num  1.03 2.11 3.26 4.5 5.84 ...
 $ bin_min   : int  0 1 2 3 5 6 7 9 11 13 ...
 $ bin_max   : int  0 1 2 4 5 6 8 10 12 15 ...
 $ layer_no  : int  1 2 3 4 5 6 7 8 9 10 ...

The second dataframe is called c

# Depth-layer lookup table `c`, rebuilt from dput() output: 50 rows, one per
# layer. Columns: `layer_name` (double), `raw_min` / `raw_max` (double bounds),
# `bin_min` / `bin_max` (integer bounds) and `layer_no` (1:50).
# The integer bin bounds are whole numbers, so a fractional depth that falls
# between one row's bin_max and the next row's bin_min (e.g. 4.5, between
# bin_max = 4 and bin_min = 5) matches no row when compared against them.
c <- structure(list(layer_name = c(0.505760014, 1.55585527, 2.66768169, 
3.85627985, 5.14036131, 6.5430336, 8.09251881, 9.82275009, 11.7736797, 
13.9910383, 16.525322, 19.4298019, 22.757616, 26.5583, 30.8745613, 
35.7402039, 41.1800232, 47.211895, 53.8506355, 61.1128387, 69.0216827, 
77.6111603, 86.9294281, 97.0413132, 108.030281, 120, 133.075821, 
147.40625, 163.164459, 180.549927, 199.789963, 221.141174, 244.890625, 
271.356384, 300.887512, 333.862823, 370.688477, 411.793854, 457.62561, 
508.639893, 565.292297, 628.026001, 697.258667, 773.368286, 856.678955, 
947.447876, 1045.85425, 1151.99121, 1265.86145, 1387.37695), 
    raw_min = c(0, 1.030807642, 2.11176848, 3.26198077, 4.49832058, 
    5.841697455, 7.317776205, 8.95763445, 10.7982149, 12.882359, 
    15.25818015, 17.97756195, 21.09370895, 24.657958, 28.71643065, 
    33.3073826, 38.46011355, 44.1959591, 50.53126525, 57.4817371, 
    65.0672607, 73.3164215, 82.2702942, 91.98537065, 102.5357971, 
    114.0151405, 126.5379105, 140.2410355, 155.2853545, 171.857193, 
    190.169945, 210.4655685, 233.0158995, 258.1235045, 286.121948, 
    317.3751675, 352.27565, 391.2411655, 434.709732, 483.1327515, 
    536.966095, 596.659149, 662.642334, 735.3134765, 815.0236205, 
    902.0634155, 996.651063, 1098.92273, 1208.92633, 1326.6192
    ), raw_max = c(1.030807642, 2.11176848, 3.26198077, 4.49832058, 
    5.841697455, 7.317776205, 8.95763445, 10.7982149, 12.882359, 
    15.25818015, 17.97756195, 21.09370895, 24.657958, 28.71643065, 
    33.3073826, 38.46011355, 44.1959591, 50.53126525, 57.4817371, 
    65.0672607, 73.3164215, 82.2702942, 91.98537065, 102.5357971, 
    114.0151405, 126.5379105, 140.2410355, 155.2853545, 171.857193, 
    190.169945, 210.4655685, 233.0158995, 258.1235045, 286.121948, 
    317.3751675, 352.27565, 391.2411655, 434.709732, 483.1327515, 
    536.966095, 596.659149, 662.642334, 735.3134765, 815.0236205, 
    902.0634155, 996.651063, 1098.92273, 1208.92633, 1326.6192, 
    1387), bin_min = c(0L, 1L, 2L, 3L, 5L, 6L, 7L, 9L, 11L, 13L, 
    16L, 18L, 21L, 25L, 29L, 33L, 39L, 44L, 51L, 58L, 65L, 73L, 
    82L, 92L, 103L, 114L, 127L, 140L, 155L, 172L, 190L, 211L, 
    233L, 258L, 286L, 317L, 352L, 391L, 435L, 483L, 537L, 597L, 
    663L, 735L, 815L, 902L, 997L, 1099L, 1209L, 1327L), bin_max = c(0L, 
    1L, 2L, 4L, 5L, 6L, 8L, 10L, 12L, 15L, 17L, 20L, 24L, 28L, 
    32L, 38L, 43L, 50L, 57L, 64L, 72L, 81L, 91L, 102L, 113L, 
    126L, 139L, 154L, 171L, 189L, 210L, 232L, 257L, 285L, 316L, 
    351L, 390L, 434L, 482L, 536L, 596L, 662L, 734L, 814L, 901L, 
    996L, 1098L, 1208L, 1326L, 1387L), layer_no = 1:50), class = "data.frame", row.names = c(NA, 
-50L))

str(d) reveals (this output describes the first dataframe, d, in its full 15-row form)

'data.frame':   15 obs. of  13 variables:
 $ id                    : int  384923059 384923060 384923061 386269528 386270555 386270577 386270682 386272010 386272026 386272096 ...
 $ decimalLatitude       : num  46.1 48.7 46.4 58.2 61.6 ...
 $ decimalLongitude      : num  -55.4 -53 -52.7 -61.1 -69.7 ...
 $ datecollected         : Factor w/ 13219 levels "","1854-07-02 12:00:00+00:17:30",..: 13218 13219 13218 13208 13209 13209 13210 13211 13212 13212 ...
 $ institutioncode       : chr  "ARC" "ARC" "ARC" "DFOCENARC" ...
 $ individualcount       : num  NA NA NA NA NA NA NA NA NA NA ...
 $ depth                 : num  93 95 166 216 289 227 149 223 440 451 ...
 $ resname               : Factor w/ 39 levels "Arctic Marine Fish Museum Specimens",..: 3 3 3 14 14 14 14 14 14 14 ...
 $ originalscientificname: Factor w/ 2 levels "Mallotus catervarius",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ collectioncode        : Factor w/ 98 levels "","12.190","14.102",..: 35 35 35 59 58 58 58 58 58 58 ...
 $ year                  : num  2015 2015 2015 2015 2015 ...
 $ month                 : num  11 11 11 8 8 8 8 9 9 9 ...
 $ day                   : num  19 27 19 20 28 28 29 9 10 10 ...

What I would like to do is, for each row in d, find the row of c where d$depth is >= c$bin_min & <= c$bin_max, and then add the corresponding value of c$layer_name to d$depth_layer. Where d$depth is NA, d$depth_layer should also be NA.

In the example dataframes above, the resulting dataframe d would look like this:

id  depth   depth_layer
1   1       1.55585527
2   5       5.14036131
3   NA      NA
4   6       6.5430336
5   3       3.85627985

I have tried to create a for loop

# Attempted loop. It does not work as written:
#  - the comparison against c$bin_min / c$bin_max produces one logical value
#    per row of c, but if() needs a single TRUE/FALSE;
#  - that condition is NA whenever d$depth[i] cannot be compared (a missing
#    depth, or a factor column), which if() rejects;
#  - the assignment in the TRUE branch uses the whole c$layer_name column
#    instead of the value from the matching row.
for (i in 1:nrow(d)){
    if (d$depth[i] >= c$bin_min & d$depth[i] <= c$bin_max) {
      # Assigns every layer name to a single element, not just the matching one.
      d$depth_layer[i] <- c$layer_name
    } else {
      d$depth_layer[i] <- NA
    }
}

but it gave the following error:

‘>=’ not meaningful for factors
‘<=’ not meaningful for factors
the condition has length > 1 and only the first element will be used
Error in if (d$depth[i] >= c$bin_min & d$depth[i] <=  : missing value where TRUE/FALSE needed

I'm not sure how to fix the loop. Any help is gratefully received.

ADDITIONAL

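As pointed out by @forestfanjoe in the comments, one of my variables (d$depth) was a factor. After converting it to numeric, both @forestfanjoe's and @TinglTanglBob's solutions worked. (Note that calling as.numeric() directly on a factor returns its internal integer level codes; use as.numeric(as.character(d$depth)) to recover the original depth values.)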

2
  • 1
    The error message suggests that one of your variables is a factor, rather than numeric. Could you add the output from str(c) and str(d)? Also if you add dput(c) and dput(d), this will make your question more easily reproducible. Commented Oct 3, 2018 at 13:01
  • @forestfanjoe - you are absolutely right about the factor. str(d) revealed that d$depth is the culprit. I should have thought about this... thanks for pointing it out nicely. Once I changed d$depth to as.numeric, both your and @TinglTanglBob solutions worked. I'll add the str and the dput for both original dataframes (rather than the cut-down versions I made to ask the question) to the post - it may help others who stumble across this. Commented Oct 3, 2018 at 13:45

2 Answers 2

2

Does this work for you?

# Minimal example data: five records, one of them with a missing depth.
d <- data.frame(id = 1:5, depth = c(1, 5, NA, 6, 3))

# Lookup table with one row per layer. The values are listed row by row in
# the order bin_min, bin_max, layer_no, layer_name, hence byrow = TRUE.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
c <- data.frame(matrix(data = c(
  0, 0, 1, 0.505760014,
  1, 1, 2, 1.55585527,
  2, 2, 3, 2.66768169,
  3, 4, 4, 3.85627985,
  5, 5, 5, 5.14036131,
  6, 6, 6, 6.5430336,
  7, 8, 7, 8.09251881
), ncol = 4, byrow = TRUE))
names(c) <- c("bin_min", "bin_max", "layer_no", "layer_name")

# Look up the layer name for a single depth value.
#
# d_temp: one depth value (numeric, may be NA).
# layers: lookup table with columns bin_min, bin_max and layer_name.
#         Defaults to the data frame named `c` in the calling script, so
#         existing calls of the form check_depth(x) keep working.
#
# Returns the layer_name of the first row whose [bin_min, bin_max] range
# contains d_temp, or NA_real_ when d_temp is NA or no row matches.
# Always returning a length-one numeric keeps sapply()/vapply() results a
# plain numeric vector instead of a list.
check_depth <- function(d_temp, layers = c) {
  if (is.na(d_temp)) {
    return(NA_real_)
  }
  hits <- which(layers$bin_min <= d_temp & layers$bin_max >= d_temp)
  if (length(hits) == 0L) {
    # Depth lies outside every bin (or in a gap between two bins).
    return(NA_real_)
  }
  # If several bins match, the first one is taken.
  layers$layer_name[hits[1L]]
}


# Apply the lookup to every depth. vapply() with a numeric(1) template
# guarantees a numeric vector (or a clear error) rather than silently
# returning a list when a result has an unexpected length.
d$depth_layer <- vapply(d$depth, check_depth, numeric(1))

d

output for d

> d
  id depth depth_layer
1  1     1    1.555855
2  2     5    5.140361
3  3    NA          NA
4  4     6    6.543034
5  5     3    3.856280
Sign up to request clarification or add additional context in comments.

Comments

1

Similar to TinglTanglBob's solution:

# Example records, parsed from inline text; the first non-blank line holds
# the column names. TRUE is spelled out because T can be reassigned.
d <- read.table(
text = 
"
id  depth
1   1
2   5
3   NA
4   6
5   3
", header = TRUE)

# Example layer lookup table, parsed from inline text; the first non-blank
# line holds the column names. TRUE is spelled out because T can be reassigned.
c <- read.table(
text = "
bin_min  bin_max  layer_no  layer_name
0        0        1         0.505760014
1        1        2         1.55585527
2        2        3         2.66768169
3        4        4         3.85627985
5        5        5         5.14036131
6        6        6         6.5430336
7        8        7         8.09251881                
", header = TRUE)

If you need this to be a for loop:

# Start with every layer missing; rows with an NA depth, or a depth that no
# bin contains, simply keep this NA.
d$depth_layer <- rep(NA_real_, nrow(d))

# seq_len() yields an empty sequence for a zero-row d, whereas 1:nrow(d)
# would iterate over c(1, 0).
for (i in seq_len(nrow(d))) {
  if (is.na(d$depth[i])) {
    next
  }
  rw <- which(d$depth[i] >= c$bin_min & d$depth[i] <= c$bin_max)
  if (length(rw) > 0L) {
    # Take the first matching bin so the assigned value always has length one.
    d$depth_layer[i] <- c$layer_name[rw[1L]]
  }
}

You could also try vapply:

# Look up the layer for each depth. Every branch returns exactly one numeric
# value, as the numeric(1) template requires: NA_real_ for a missing depth or
# a depth that no bin contains, otherwise the first matching layer name.
d$depth_layer <- vapply(
  d$depth,
  function(x) {
    if (is.na(x)) {
      return(NA_real_)
    }
    rw <- which(x >= c$bin_min & x <= c$bin_max)
    if (length(rw) == 0L) {
      return(NA_real_)
    }
    c$layer_name[rw[1L]]
  },
  numeric(1)
)

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.