Random sampling in R with set of groups that are within a group

Question

I am working with R.

I have a data set that looks like this...

# Example data: one condition ("1") containing 8 categories, each category
# forming a contiguous chunk of 5 rows (40 rows in total).
structure(
  list(
    # Every row belongs to condition "1".
    Condition = rep("1", 40),
    # Each category label is repeated for 5 consecutive rows.
    category = rep(
      c("work", "people", "class", "beach",
        "park", "house", "street", "internet"),
      each = 5
    ),
    # One line per category chunk, in the same order as `category`.
    Value = c(
      7.36, 7.92, 7.66, 6.92, 4.76,  # work
      2.82, 3.18, 2.1, 8.28, 7.26,   # people
      5.16, 5.72, 7.12, 7.14, 5.06,  # class
      5.14, 3.34, 4.74, NA, NA,      # beach
      3.42, 3.87, 5.3, 4.26, 4.46,   # park
      5.1, 3.76, 10.4, 3.38, 4.86,   # house
      4.14, 4.24, 4.68, 5.18, 4.46,  # street
      8.38, 3.92, 4.14, 4.78, 2.94   # internet
    )
  ),
  row.names = c(NA, -40L),
  class = c("tbl_df", "tbl", "data.frame")
)

As you can see, each word in the category column is repeated 5 times. Each of these "chunks" of five rows forms a group nested within condition 1. I need a random sample of 4 whole chunks, which gives a total of 20 observations in the Value column.

I expect something like this...

Condition     category     Value 
   1             people     #
   1             people     #
   1             people     #
   1             people     ...
   1             people
   1             street
   1             street
   1             street
   1             street
   1             street
   1             park
   1             park 
   1             park
   1             park
   1             park
   1             class
   1             class
   1             class
   1             class
   1             class

Any help would be great. Thanks!

Yuriy Saraykin · Accepted Answer · 2021-06-08 17:55:10Z

tidyverse

set.seed(1)
library(tidyverse)

# Sample whole chunks rather than individual rows:
#   1. collapse every (Condition, category) chunk into one row whose `data`
#      list-column holds that chunk's remaining columns,
#   2. draw 4 of those rows at random,
#   3. expand the list-column back into the original rows.
df %>%
  group_nest(Condition, category) %>%
  sample_n(size = 4) %>%
  unnest(cols = data)
#> # A tibble: 20 x 3
#>    Condition category Value
#>    <chr>     <chr>    <dbl>
#>  1 1         beach     5.14
#>  2 1         beach     3.34
#>  3 1         beach     4.74
#>  4 1         beach    NA   
#>  5 1         beach    NA   
#>  6 1         internet  8.38
#>  7 1         internet  3.92
#>  8 1         internet  4.14
#>  9 1         internet  4.78
#> 10 1         internet  2.94
#> 11 1         work      7.36
#> 12 1         work      7.92
#> 13 1         work      7.66
#> 14 1         work      6.92
#> 15 1         work      4.76
#> 16 1         class     5.16
#> 17 1         class     5.72
#> 18 1         class     7.12
#> 19 1         class     7.14
#> 20 1         class     5.06

Created on 2021-06-08 by the reprex package (v2.0.0)

data.table

set.seed(1)

library(data.table)
library(magrittr)

# Draw 4 distinct category labels and keep every row belonging to them.
# Rows are filtered directly instead of being nested into list-columns and
# unlisted again: unlisting without a `by` recycles the 4 category labels
# across the 20 values, which pairs values with the wrong category.
# NOTE: setDT() converts `df` to a data.table by reference.
setDT(df) %>%
  .[category %in% sample(unique(category), 4)] %>%
  .[order(Condition, category)]
#>     Condition category Value
#>  1:         1    beach  5.14
#>  2:         1    beach  3.34
#>  3:         1    beach  4.74
#>  4:         1    beach    NA
#>  5:         1    beach    NA
#>  6:         1 internet  8.38
#>  7:         1 internet  3.92
#>  8:         1 internet  4.14
#>  9:         1 internet  4.78
#> 10:         1 internet  2.94
#> 11:         1   people  2.82
#> 12:         1   people  3.18
#> 13:         1   people  2.10
#> 14:         1   people  8.28
#> 15:         1   people  7.26
#> 16:         1     work  7.36
#> 17:         1     work  7.92
#> 18:         1     work  7.66
#> 19:         1     work  6.92
#> 20:         1     work  4.76

Created on 2021-06-08 by the reprex package (v2.0.0)

ktiu · Accepted Answer · 2021-06-08 17:43:39Z

If I understand you correctly, you want

# Split the data into one data frame per category, draw 4 of those data
# frames at random, and stack the chosen ones back into a single table.
dplyr::bind_rows(
  sample(
    split(your_data, your_data$category),
    4
  )
)

returning

# A tibble: 20 x 3
   Condition category Value
   <chr>     <chr>    <dbl>
 1 1         house     5.1
 2 1         house     3.76
 3 1         house    10.4
 4 1         house     3.38
 5 1         house     4.86
 6 1         class     5.16
 7 1         class     5.72
 8 1         class     7.12
 9 1         class     7.14
10 1         class     5.06
11 1         internet  8.38
12 1         internet  3.92
13 1         internet  4.14
14 1         internet  4.78
15 1         internet  2.94
16 1         work      7.36
17 1         work      7.92
18 1         work      7.66
19 1         work      6.92
20 1         work      4.76

Collectives™ on Stack Overflow

Random sampling in R with set of groups that are within a group

2 Answers 2

Comments

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

2 Answers 2

Comments

Comments

Your Answer

Sign up or log in

Post as a guest

Related