I have a tricky problem which I will do my best to explain.
I have two dataframes, the important columns being uID (a unique identifier) and b (a value indicating the genotype).
In one dataframe, delsFB, each uID appears only once (note: uIDs are not necessarily consecutive). In the other, delsOB, each uID can appear a variable number of times (ranging from 1 to about 100 occurrences) on different rows.
I need to match the uIDs in delsFB to those in delsOB and update the b column with a new value.
The workflow:
- For each row of `delsFB`, look for rows in `delsOB` with the same `uID`.
- If any of the matching rows in `delsOB` have the b value `KP` and another contains `CP`, enter the value `NP` into `delsFB$b`.
- Otherwise, if any rows contain `KP` but none contain `CP`, enter the value `KP` into `delsFB$b`.
- If any rows contain `CP` but none contain `KP`, enter the value `CP` into `delsFB$b`.
- Any other situations should return `NP` as well.
I hope that made sense, and thanks very much for any assistance. Here are two small example dataframes made with dput:
delsFB:
structure(list(chrom = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), pos_c = c(28522L, 29000L, 29211L, 29423L,
29462L, 29552L, 29766L, 29791L, 29885L, 29946L, 29986L, 30157L,
30587L, 30871L), uID = c(24L, 33L, 44L, 56L, 62L, 68L, 76L, 80L,
86L, 91L, 92L, 101L, 141L, 161L), pos_k = c(28523L, 29011L, 29221L,
29427L, 29462L, 29551L, 29766L, 29790L, 29887L, 29946L, 29953L,
30123L, 30557L, 30841L), seq_c = structure(c(3L, 3L, 2L, 3L,
3L, 2L, 2L, 3L, 1L, 1L, 2L, 2L, 1L, 3L), .Label = c("A", "C",
"G"), class = "factor"), seq_sk = structure(c(3L, 3L, 2L, 3L,
3L, 2L, 2L, 3L, 1L, 1L, 2L, 2L, 1L, 3L), .Label = c("A", "C",
"G"), class = "factor"), type_c = c(complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA)), type_k = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "d", class = "factor"),
Var_len = c(2L, 3L, 7L, 6L, 3L, 2L, 2L, 2L, 3L, 34L, 2L,
2L, 4L, 2L), reads_all = c(20L, 25L, 27L, 18L, 17L, 12L,
26L, 26L, 29L, 27L, 16L, 26L, 47L, 42L), deletions = c(0L,
0L, 5L, 1L, 15L, 0L, 0L, 0L, 3L, 1L, 2L, 1L, 0L, 1L), insertions = c(0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), A = c(0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 24L, 25L, 1L, 1L, 47L, 0L), C = c(0L,
0L, 20L, 1L, 0L, 11L, 25L, 0L, 2L, 1L, 11L, 22L, 0L, 1L),
T = c(0L, 1L, 2L, 0L, 2L, 1L, 0L, 0L, 0L, 0L, 2L, 2L, 0L,
0L), G = c(20L, 24L, 0L, 16L, 0L, 0L, 0L, 26L, 0L, 0L, 0L,
0L, 0L, 40L), k = c(0L, 0L, 5L, 1L, 15L, 0L, 0L, 0L, 3L,
1L, 2L, 1L, 0L, 1L), c = c(20L, 24L, 20L, 16L, 0L, 11L, 25L,
26L, 24L, 25L, 11L, 22L, 47L, 40L), SNPreads = c(20L, 24L,
25L, 17L, 15L, 11L, 25L, 26L, 27L, 26L, 13L, 23L, 47L, 41L
), b = structure(c(1L, 1L, 3L, 1L, 2L, 1L, 1L, 1L, NA, 1L,
3L, 1L, 1L, 1L), .Label = c("CP", "KP", "NP"), class = "factor")), .Names = c("chrom",
"pos_c", "uID", "pos_k", "seq_c", "seq_sk", "type_c", "type_k",
"Var_len", "reads_all", "deletions", "insertions", "A", "C",
"T", "G", "k", "c", "SNPreads", "b"), class = "data.frame", row.names = c(NA,
-14L))
delsOB:
structure(list(chrom = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), pos_c = c(28523L, 29001L, 29002L, 29212L, 29213L,
29214L, 29215L, 29216L, 29217L, 29424L, 29425L, 29426L, 29427L,
29428L, 29463L, 29464L, 29553L, 29767L, 29792L, 29886L, 29887L,
29947L, 29948L, 29949L, 29950L, 29951L, 29952L, 29953L, 29954L,
29955L, 29956L, 29957L, 29958L, 29959L, 29960L, 29961L, 29962L,
29963L, 29964L, 29965L, 29966L, 29967L, 29968L, 29969L, 29970L,
29971L, 29972L, 29973L, 29974L, 29975L, 29976L, 29977L, 29978L,
29979L, 29987L, 30158L, 30588L, 30589L, 30590L, 30872L), uID = c(24L,
33L, 33L, 44L, 44L, 44L, 44L, 44L, 44L, 56L, 56L, 56L, 56L, 56L,
62L, 62L, 68L, 76L, 80L, 86L, 86L, 91L, 91L, 91L, 91L, 91L, 91L,
91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L,
91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L, 91L,
91L, 92L, 101L, 141L, 141L, 141L, 161L), pos_k = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "-", class = "factor"),
seq_c = structure(c(1L, 4L, 1L, 1L, 4L, 3L, 3L, 4L, 4L, 1L,
2L, 1L, 2L, 4L, 1L, 1L, 3L, 4L, 4L, 4L, 4L, 2L, 1L, 3L, 3L,
3L, 2L, 1L, 4L, 3L, 1L, 3L, 4L, 4L, 1L, 2L, 3L, 4L, 2L, 1L,
1L, 4L, 2L, 4L, 2L, 4L, 3L, 3L, 4L, 2L, 1L, 1L, 4L, 2L, 1L,
1L, 4L, 2L, 3L, 1L), .Label = c("A", "C", "G", "T"), class = "factor"),
seq_sk = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "-", class = "factor"),
type_c = c(complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA),
complex(real=0, imaginary=NA), complex(real=0, imaginary=NA)
), type_k = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "d", class = "factor"),
Var_len = c(2L, 3L, 3L, 7L, 7L, 7L, 7L, 7L, 7L, 6L, 6L, 6L,
6L, 6L, 3L, 3L, 2L, 2L, 2L, 3L, 3L, 34L, 34L, 34L, 34L, 34L,
34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L,
34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L,
34L, 34L, 34L, 34L, 2L, 2L, 4L, 4L, 4L, 2L), reads_all = c(20L,
24L, 21L, 27L, 27L, 27L, 27L, 28L, 28L, 16L, 16L, 16L, 16L,
16L, 17L, 17L, 12L, 26L, 26L, 29L, 29L, 27L, 27L, 19L, 20L,
17L, 17L, 15L, 15L, 15L, 15L, 14L, 14L, 14L, 14L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 16L, 26L, 47L, 45L, 45L, 42L), deletions = c(18L,
0L, 1L, 25L, 24L, 24L, 21L, 20L, 20L, 13L, 13L, 13L, 13L,
12L, 0L, 0L, 11L, 21L, 25L, 23L, 22L, 7L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 8L, 8L, 9L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 8L, 8L, 8L, 8L, 8L, 9L, 8L, 10L,
20L, 44L, 44L, 44L, 38L), insertions = c(0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), A = c(2L, 0L,
0L, 0L, 1L, 1L, 1L, 2L, 0L, 1L, 0L, 2L, 1L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 2L, 3L, 16L, 1L, 0L, 1L, 0L, 3L, 4L, 4L, 4L,
3L, 1L, 2L, 4L, 0L, 2L, 0L, 0L, 1L, 0L, 3L, 0L, 1L, 0L, 1L,
1L, 1L, 0L, 2L, 4L, 3L, 1L, 0L, 3L, 4L, 0L, 1L, 0L, 3L),
C = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 1L, 2L, 0L, 1L,
0L, 16L, 0L, 0L, 3L, 0L, 4L, 1L, 16L, 0L, 1L, 0L, 1L, 8L,
2L, 0L, 0L, 2L, 0L, 1L, 0L, 1L, 3L, 1L, 2L, 2L, 0L, 0L, 0L,
3L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 2L, 2L, 1L,
0L, 1L, 1L), T = c(0L, 24L, 19L, 1L, 2L, 2L, 4L, 6L, 5L,
1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 2L, 0L, 2L, 3L, 1L, 2L, 6L,
2L, 5L, 0L, 1L, 1L, 2L, 1L, 1L, 3L, 3L, 1L, 2L, 2L, 1L, 2L,
3L, 4L, 2L, 2L, 3L, 3L, 4L, 2L, 1L, 5L, 3L, 2L, 1L, 3L, 1L,
1L, 0L, 2L, 0L, 0L, 0L), G = c(0L, 0L, 1L, 1L, 0L, 0L, 1L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 16L, 0L, 0L, 1L, 0L, 1L,
0L, 0L, 2L, 9L, 1L, 0L, 0L, 1L, 0L, 0L, 2L, 0L, 1L, 0L, 1L,
0L, 2L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 4L, 0L, 0L, 0L,
2L, 0L, 4L, 0L, 0L, 0L, 0L, 0L, 0L), k = c(18L, 0L, 1L, 25L,
24L, 24L, 21L, 20L, 20L, 13L, 13L, 13L, 13L, 12L, 0L, 0L,
11L, 21L, 25L, 23L, 22L, 7L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 8L, 8L, 9L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 8L, 8L, 8L, 8L, 8L, 9L, 8L, 10L, 20L, 44L, 44L,
44L, 38L), c = c(2L, 24L, 0L, 0L, 2L, 0L, 1L, 6L, 5L, 1L,
2L, 2L, 1L, 3L, 0L, 0L, 0L, 2L, 0L, 2L, 3L, 16L, 16L, 2L,
9L, 1L, 8L, 3L, 1L, 0L, 4L, 2L, 3L, 3L, 4L, 3L, 0L, 1L, 2L,
1L, 0L, 2L, 3L, 3L, 1L, 4L, 1L, 4L, 5L, 1L, 4L, 3L, 3L, 1L,
3L, 4L, 2L, 0L, 0L, 3L), SNPreads = c(20L, 24L, 1L, 25L,
26L, 24L, 22L, 26L, 25L, 14L, 15L, 15L, 14L, 15L, 0L, 0L,
11L, 23L, 25L, 25L, 25L, 23L, 25L, 11L, 18L, 10L, 17L, 12L,
10L, 9L, 12L, 10L, 12L, 11L, 12L, 11L, 9L, 10L, 11L, 10L,
9L, 11L, 12L, 12L, 10L, 13L, 10L, 12L, 13L, 9L, 12L, 11L,
12L, 9L, 13L, 24L, 46L, 44L, 44L, 41L), b = structure(c(3L,
1L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L,
3L, 3L, 3L, 3L, 3L, 4L, 2L, 4L, 4L, 3L, 2L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L,
3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("CP",
"HP", "KP", "NP"), class = "factor")), .Names = c("chrom",
"pos_c", "uID", "pos_k", "seq_c", "seq_sk", "type_c", "type_k",
"Var_len", "reads_all", "deletions", "insertions", "A", "C",
"T", "G", "k", "c", "SNPreads", "b"), class = "data.frame", row.names = c(NA,
-60L))
EDIT: Sorry, forgot expected output - for these example datasets, the b column in delsFB should be updated to:
24 KP, 33 CP, 44 KP, 56 KP, 62 NP, 68 KP, 76 KP, 80 KP, 86 KP, 91 KP, 92 KP, 101 KP, 141 KP, 161 KP.
Note that currently, e.g., uID 91 has 34 instances in delsOB, but it still only needs one KP to be considered type KP. Ideally, the rule would depend somewhat on length, e.g. if there are 34 instances, at least 50% of them have to be KP for b=KP, but if there are only ~5 instances, at least 80% of them must be KP. However, this may make the problem too complicated.