0

I have a tricky problem which I will do my best to explain.

I have two dataframes, the important columns being uID (a unique identifier) and b (a value indicating the genotype). In one dataframe, delsFB, each uID appears only once (note: uIDs are not necessarily consecutive). In the other, delsOB, each uID can appear a variable number of times (ranging from 1 to about 100 occurrences) on different rows. I need to match the uIDs in delsFB to those in delsOB and update the b column of delsFB with a new value.

The workflow:

  1. For each row of delsFB, look for rows in delsOB with the same uID.
  2. If any of the matching rows in delsOB have the b value KP and another contains CP, enter the value NP into delsFB$b.
  3. Otherwise, if any rows contain KP but none contain CP, enter the value KP into delsFB$b.
  4. If any rows contain CP but none contain KP, enter the value CP into delsFB$b.
  5. Any other situations should return NP as well.

I hope that made sense, and thanks very much for any assistance. Here are two small example dataframes made with dput:

delsFB:

structure(list(chrom = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L), pos_c = c(28522L, 29000L, 29211L, 29423L, 
29462L, 29552L, 29766L, 29791L, 29885L, 29946L, 29986L, 30157L, 
30587L, 30871L), uID = c(24L, 33L, 44L, 56L, 62L, 68L, 76L, 80L, 
86L, 91L, 92L, 101L, 141L, 161L), pos_k = c(28523L, 29011L, 29221L, 
29427L, 29462L, 29551L, 29766L, 29790L, 29887L, 29946L, 29953L, 
30123L, 30557L, 30841L), seq_c = structure(c(3L, 3L, 2L, 3L, 
3L, 2L, 2L, 3L, 1L, 1L, 2L, 2L, 1L, 3L), .Label = c("A", "C", 
"G"), class = "factor"), seq_sk = structure(c(3L, 3L, 2L, 3L, 
3L, 2L, 2L, 3L, 1L, 1L, 2L, 2L, 1L, 3L), .Label = c("A", "C", 
"G"), class = "factor"), type_c = c(complex(real=0, imaginary=NA), 
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
complex(real=0, imaginary=NA)), type_k = structure(c(1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "d", class = "factor"), 
    Var_len = c(2L, 3L, 7L, 6L, 3L, 2L, 2L, 2L, 3L, 34L, 2L, 
    2L, 4L, 2L), reads_all = c(20L, 25L, 27L, 18L, 17L, 12L, 
    26L, 26L, 29L, 27L, 16L, 26L, 47L, 42L), deletions = c(0L, 
    0L, 5L, 1L, 15L, 0L, 0L, 0L, 3L, 1L, 2L, 1L, 0L, 1L), insertions = c(0L, 
    1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), A = c(0L, 
    0L, 0L, 0L, 0L, 0L, 1L, 0L, 24L, 25L, 1L, 1L, 47L, 0L), C = c(0L, 
    0L, 20L, 1L, 0L, 11L, 25L, 0L, 2L, 1L, 11L, 22L, 0L, 1L), 
    T = c(0L, 1L, 2L, 0L, 2L, 1L, 0L, 0L, 0L, 0L, 2L, 2L, 0L, 
    0L), G = c(20L, 24L, 0L, 16L, 0L, 0L, 0L, 26L, 0L, 0L, 0L, 
    0L, 0L, 40L), k = c(0L, 0L, 5L, 1L, 15L, 0L, 0L, 0L, 3L, 
    1L, 2L, 1L, 0L, 1L), c = c(20L, 24L, 20L, 16L, 0L, 11L, 25L, 
    26L, 24L, 25L, 11L, 22L, 47L, 40L), SNPreads = c(20L, 24L, 
    25L, 17L, 15L, 11L, 25L, 26L, 27L, 26L, 13L, 23L, 47L, 41L
    ), b = structure(c(1L, 1L, 3L, 1L, 2L, 1L, 1L, 1L, NA, 1L, 
    3L, 1L, 1L, 1L), .Label = c("CP", "KP", "NP"), class = "factor")), .Names = c("chrom", 
"pos_c", "uID", "pos_k", "seq_c", "seq_sk", "type_c", "type_k", 
"Var_len", "reads_all", "deletions", "insertions", "A", "C", 
"T", "G", "k", "c", "SNPreads", "b"), class = "data.frame", row.names = c(NA, 
-14L))

delsOB:

structure(list(chrom = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), pos_c = c(28523L, 29001L, 29002L, 29212L, 29213L, 
29214L, 29215L, 29216L, 29217L, 29424L, 29425L, 29426L, 29427L, 
29428L, 29463L, 29464L, 29553L, 29767L, 29792L, 29886L, 29887L, 
29947L, 29948L, 29949L, 29950L, 29951L, 29952L, 29953L, 29954L, 
29955L, 29956L, 29957L, 29958L, 29959L, 29960L, 29961L, 29962L, 
29963L, 29964L, 29965L, 29966L, 29967L, 29968L, 29969L, 29970L, 
29971L, 29972L, 29973L, 29974L, 29975L, 29976L, 29977L, 29978L, 
29979L, 29987L, 30158L, 30588L, 30589L, 30590L, 30872L), uID = c(24L, 
33L, 33L, 44L, 44L, 44L, 44L, 44L, 44L, 56L, 56L, 56L, 56L, 56L, 
62L, 62L, 68L, 76L, 80L, 86L, 86L, 91L, 91L, 91L, 91L, 91L, 91L, 
91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 
91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 
91L, 92L, 101L, 141L, 141L, 141L, 161L), pos_k = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "-", class = "factor"), 
    seq_c = structure(c(1L, 4L, 1L, 1L, 4L, 3L, 3L, 4L, 4L, 1L, 
    2L, 1L, 2L, 4L, 1L, 1L, 3L, 4L, 4L, 4L, 4L, 2L, 1L, 3L, 3L, 
    3L, 2L, 1L, 4L, 3L, 1L, 3L, 4L, 4L, 1L, 2L, 3L, 4L, 2L, 1L, 
    1L, 4L, 2L, 4L, 2L, 4L, 3L, 3L, 4L, 2L, 1L, 1L, 4L, 2L, 1L, 
    1L, 4L, 2L, 3L, 1L), .Label = c("A", "C", "G", "T"), class = "factor"), 
    seq_sk = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L), .Label = "-", class = "factor"), 
    type_c = c(complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA), 
    complex(real=0, imaginary=NA), complex(real=0, imaginary=NA)
    ), type_k = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L), .Label = "d", class = "factor"), 
    Var_len = c(2L, 3L, 3L, 7L, 7L, 7L, 7L, 7L, 7L, 6L, 6L, 6L, 
    6L, 6L, 3L, 3L, 2L, 2L, 2L, 3L, 3L, 34L, 34L, 34L, 34L, 34L, 
    34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 
    34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 
    34L, 34L, 34L, 34L, 2L, 2L, 4L, 4L, 4L, 2L), reads_all = c(20L, 
    24L, 21L, 27L, 27L, 27L, 27L, 28L, 28L, 16L, 16L, 16L, 16L, 
    16L, 17L, 17L, 12L, 26L, 26L, 29L, 29L, 27L, 27L, 19L, 20L, 
    17L, 17L, 15L, 15L, 15L, 15L, 14L, 14L, 14L, 14L, 14L, 14L, 
    14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 
    14L, 14L, 14L, 14L, 14L, 16L, 26L, 47L, 45L, 45L, 42L), deletions = c(18L, 
    0L, 1L, 25L, 24L, 24L, 21L, 20L, 20L, 13L, 13L, 13L, 13L, 
    12L, 0L, 0L, 11L, 21L, 25L, 23L, 22L, 7L, 9L, 9L, 9L, 9L, 
    9L, 9L, 9L, 9L, 8L, 8L, 9L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 
    9L, 9L, 9L, 9L, 9L, 9L, 8L, 8L, 8L, 8L, 8L, 9L, 8L, 10L, 
    20L, 44L, 44L, 44L, 38L), insertions = c(0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 
    0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), A = c(2L, 0L, 
    0L, 0L, 1L, 1L, 1L, 2L, 0L, 1L, 0L, 2L, 1L, 1L, 0L, 0L, 0L, 
    0L, 0L, 0L, 2L, 3L, 16L, 1L, 0L, 1L, 0L, 3L, 4L, 4L, 4L, 
    3L, 1L, 2L, 4L, 0L, 2L, 0L, 0L, 1L, 0L, 3L, 0L, 1L, 0L, 1L, 
    1L, 1L, 0L, 2L, 4L, 3L, 1L, 0L, 3L, 4L, 0L, 1L, 0L, 3L), 
    C = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 1L, 2L, 0L, 1L, 
    0L, 16L, 0L, 0L, 3L, 0L, 4L, 1L, 16L, 0L, 1L, 0L, 1L, 8L, 
    2L, 0L, 0L, 2L, 0L, 1L, 0L, 1L, 3L, 1L, 2L, 2L, 0L, 0L, 0L, 
    3L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 2L, 2L, 1L, 
    0L, 1L, 1L), T = c(0L, 24L, 19L, 1L, 2L, 2L, 4L, 6L, 5L, 
    1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 2L, 0L, 2L, 3L, 1L, 2L, 6L, 
    2L, 5L, 0L, 1L, 1L, 2L, 1L, 1L, 3L, 3L, 1L, 2L, 2L, 1L, 2L, 
    3L, 4L, 2L, 2L, 3L, 3L, 4L, 2L, 1L, 5L, 3L, 2L, 1L, 3L, 1L, 
    1L, 0L, 2L, 0L, 0L, 0L), G = c(0L, 0L, 1L, 1L, 0L, 0L, 1L, 
    0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 16L, 0L, 0L, 1L, 0L, 1L, 
    0L, 0L, 2L, 9L, 1L, 0L, 0L, 1L, 0L, 0L, 2L, 0L, 1L, 0L, 1L, 
    0L, 2L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 4L, 0L, 0L, 0L, 
    2L, 0L, 4L, 0L, 0L, 0L, 0L, 0L, 0L), k = c(18L, 0L, 1L, 25L, 
    24L, 24L, 21L, 20L, 20L, 13L, 13L, 13L, 13L, 12L, 0L, 0L, 
    11L, 21L, 25L, 23L, 22L, 7L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
    9L, 8L, 8L, 9L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
    9L, 9L, 9L, 8L, 8L, 8L, 8L, 8L, 9L, 8L, 10L, 20L, 44L, 44L, 
    44L, 38L), c = c(2L, 24L, 0L, 0L, 2L, 0L, 1L, 6L, 5L, 1L, 
    2L, 2L, 1L, 3L, 0L, 0L, 0L, 2L, 0L, 2L, 3L, 16L, 16L, 2L, 
    9L, 1L, 8L, 3L, 1L, 0L, 4L, 2L, 3L, 3L, 4L, 3L, 0L, 1L, 2L, 
    1L, 0L, 2L, 3L, 3L, 1L, 4L, 1L, 4L, 5L, 1L, 4L, 3L, 3L, 1L, 
    3L, 4L, 2L, 0L, 0L, 3L), SNPreads = c(20L, 24L, 1L, 25L, 
    26L, 24L, 22L, 26L, 25L, 14L, 15L, 15L, 14L, 15L, 0L, 0L, 
    11L, 23L, 25L, 25L, 25L, 23L, 25L, 11L, 18L, 10L, 17L, 12L, 
    10L, 9L, 12L, 10L, 12L, 11L, 12L, 11L, 9L, 10L, 11L, 10L, 
    9L, 11L, 12L, 12L, 10L, 13L, 10L, 12L, 13L, 9L, 12L, 11L, 
    12L, 9L, 13L, 24L, 46L, 44L, 44L, 41L), b = structure(c(3L, 
    1L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 
    3L, 3L, 3L, 3L, 3L, 4L, 2L, 4L, 4L, 3L, 2L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 
    3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("CP", 
    "HP", "KP", "NP"), class = "factor")), .Names = c("chrom", 
"pos_c", "uID", "pos_k", "seq_c", "seq_sk", "type_c", "type_k", 
"Var_len", "reads_all", "deletions", "insertions", "A", "C", 
"T", "G", "k", "c", "SNPreads", "b"), class = "data.frame", row.names = c(NA, 
-60L))

EDIT: Sorry, forgot expected output - for these example datasets, the b column in delsFB should be updated to: 24 KP, 33 CP, 44 KP, 56 KP, 62 NP, 68 KP, 76 KP, 80 KP, 86 KP, 91 KP, 92 KP, 101 KP, 141 KP, 161 KP.

Note that currently, e.g. uID 91 has 34 instances in delsOB, but it still only needs one KP to be considered type KP. Ideally the rule would depend somewhat on length e.g. if there are 34 instances, at least 50% of them have to be KP for b=KP, but if there are only ~5 instances, it must be 80% of them. However this may make the problem too complicated.

1
  • can you explain step 2 in your workflow? Commented May 3, 2016 at 16:06

1 Answer 1

1

One way to approach this may be to loop through the unique values in delsOB, check your desired conditions, and then update delsFB. The following code should do this, and can be modified to include additional conditions (such as changing the result based on the number of instances).

# Save delsFB to a new data frame so results can be compared afterward.
delsFB_new <- delsFB

# If b is a factor, make sure every label we may assign is a valid level;
# assigning an unknown level would otherwise yield NA with only a warning.
if (is.factor(delsFB_new$b)) {
  levels(delsFB_new$b) <- union(levels(delsFB_new$b), c("CP", "KP", "NP"))
}

# Loop through the unique uID values in delsOB.
for (unique_id in unique(delsOB$uID)) {
  # Genotype calls recorded in delsOB for the current uID.
  current_b <- delsOB$b[delsOB$uID %in% unique_id]
  has_kp <- "KP" %in% current_b
  has_cp <- "CP" %in% current_b

  # KP only -> "KP"; CP only -> "CP"; both present or neither -> "NP".
  # `&&` is the scalar, short-circuiting operator intended for `if`.
  if (has_kp && !has_cp) {
    result <- "KP"
  } else if (has_cp && !has_kp) {
    result <- "CP"
  } else {
    result <- "NP"
  }

  # Insert the result into delsFB_new$b. Assigning through a logical mask is
  # a no-op when the uID does not occur in delsFB, whereas the
  # `df[mask, ]$b <- value` form stops with an error on a zero-row selection.
  delsFB_new$b[delsFB_new$uID %in% unique_id] <- result
}

EDIT: Here is a second version that uses conditional logic for the additional check for "KP" proportion based on the number of instances.

# Save delsFB to a new data frame so results can be compared afterward.
delsFB_new <- delsFB

# If b is a factor, make sure every label we may assign is a valid level;
# assigning an unknown level would otherwise yield NA with only a warning.
if (is.factor(delsFB_new$b)) {
  levels(delsFB_new$b) <- union(levels(delsFB_new$b), c("CP", "KP", "NP"))
}

# Thresholds for the "KP" proportion check: groups with at most
# `small_group_max` rows need `small_group_share` of them to be "KP",
# larger groups need `large_group_share`.
small_group_max <- 7
small_group_share <- 0.8
large_group_share <- 0.5

# Loop through the unique uID values in delsOB.
for (unique_id in unique(delsOB$uID)) {
  # Genotype calls recorded in delsOB for the current uID.
  current_b <- delsOB$b[delsOB$uID %in% unique_id]
  has_kp <- "KP" %in% current_b
  has_cp <- "CP" %in% current_b

  # Assign desired value to result based on the conditions.
  # `&&` is the scalar, short-circuiting operator intended for `if`.
  if (has_kp && has_cp) {
    result <- "NP"
  } else if (has_kp) {
    # Require a minimum proportion of "KP" values, according to the number
    # of instances. NA calls count towards the total but not towards "KP".
    n_total <- length(current_b)
    n_kp <- sum(current_b == "KP", na.rm = TRUE)
    min_share <- if (n_total <= small_group_max) {
      small_group_share
    } else {
      large_group_share
    }
    # Falling short of the threshold must yield an explicit "NP"; leaving
    # `result` unset here would reuse the value from the previous uID (or
    # fail with "object 'result' not found" on the first iteration).
    if (n_kp >= min_share * n_total) {
      result <- "KP"
    } else {
      result <- "NP"
    }
  } else if (has_cp) {
    result <- "CP"
  } else {
    result <- "NP"
  }

  # Insert the result into delsFB_new$b. Assigning through a logical mask is
  # a no-op when the uID does not occur in delsFB, whereas the
  # `df[mask, ]$b <- value` form stops with an error on a zero-row selection.
  delsFB_new$b[delsFB_new$uID %in% unique_id] <- result
}
Sign up to request clarification or add additional context in comments.

3 Comments

Do you think it would be possible to modify it to fulfill an extra criterion? Ideally the rule would depend somewhat on length, e.g. if there are 34 instances, at least 50% of them have to be KP for b=KP, but if there are only ~5 instances, it must be 80% of them. This isn't essential though. Thanks so much for your help!
Glad it worked! It sounds like you want to add an additional check to make sure that some minimum "KP" proportion threshold is met, based on the current number of instances. Once you figure out your logic precisely, you can do this with nested conditional logic. I've added a second version of the code to get you started. Just look for the comment that starts with "# Nested conditional logic..." and modify the code that follows based on your desired logic.
Thanks again, that's extremely helpful :)

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.