0

This is probably a simple issue, but I would appreciate some help, please!

I have some data:

data

# Literal definition of the example data frame: 24 rows with a sample ID
# (Samid), six numeric gene columns (GATA3, IL4, IL4R, IRF4, CD207, IL1B) and
# a Clinical.diagnosis factor. The factor declares three levels, but only
# codes 2 ("negative") and 3 ("positive") occur; "irritated" is unused.
structure(list(Samid = c("AD001", "AD002", "AD004", "AD005", 
"AD008", "AD010", "AD011", "AD012", "AD013", "AD014", "AD015", 
"AD016", "AD017", "AD018", "AD019", "AD020", "AD021", "AD022", 
"AD023", "AD024", "AD025", "AD026", "AD027", "AD028"), GATA3 = c(0.07850703, 
0.07850703, 0.4477987, 0.07850703, 0.2362246, 0.44779867, 0.46578259, 
0, 0.46578259, 0.44779867, 0.24396914, 0.46578259, 0.23622459, 
0.24396914, 0.07850703, 0.07850703, 1.25391517, 0.82224747, 0.07850703, 
0.07850703, 0.07850703, 0.07850703, 0.83507423, 0.07850703), 
    IL4 = c(0, 0, 0, 0, 0, 0, 0, 1.26781758, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0), IL4R = c(1.65301611, 0.14846188, 
    1.6307388, 0.14846188, 0.2073535, 0.14846188, 0.4656834, 
    1.48227697, 0.65075963, 0.17073914, 0.14846188, 0.14846188, 
    0.37809262, 0.17073914, 1.65301611, 0.14846188, 1.55269688, 
    0.14846188, 2.15320576, 0.17073914, 0.44340614, 0.17073914, 
    0, 0.44340614), IRF4 = c(0, 0, 0, 0, 0, 0, 2.83446844, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), CD207 = c(0.80003601, 
    0.33421377, 3.4723849, 2.32021021, 0.5828276, 0.94797393, 
    0.13406957, 0.70861984, 2.25418614, 1.4883206, 2.38978722, 
    3.47193671, 0.32452279, 2.31827895, 0.80003601, 0.80003601, 
    0.50751017, 2.32021021, 3.0989443, 2.0619054, 1.05640955, 
    3.31881563, 3.37422811, 2.32021021), IL1B = c(0.20787567, 
    0, 0, 0.20787567, 0, 0.20787567, 0, 0, 0, 0.20787567, 0.20787567, 
    0, 0, 0, 0, 0, 0, 0.20787567, 0, 0.20787567, 0.20787567, 
    0.61415248, 0, 0), Clinical.diagnosis = structure(c(2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 3L, 3L, 2L, 2L, 3L, 3L, 3L, 
    3L, 3L, 3L, 2L, 3L, 3L, 2L), .Label = c("irritated", "negative", 
    "positive"), class = "factor")), row.names = c(NA, -24L), class = "data.frame")

I want to run a Mann–Whitney U test on each gene (columns 2:7), comparing its values between the groups in the last column, `Clinical.diagnosis`.

I can do this individually:

# Compare GATA3 between the diagnosis groups. The formula interface takes the
# grouping factor on the right-hand side; the two-sample form
# wilcox.test(x, y) needs a numeric y and cannot take the factor.
wilcox.test(GATA3 ~ Clinical.diagnosis, data = data)

However, I want to iterate through each gene. I've tried this, but I'm having an issue passing each gene into the function:

# Non-working attempt, kept as posted: `results=` sits inside the pipe chain,
# so this is not a valid pipeline step.
data %>% results=mutate_at(vars(GATA3:IL1B), ~ wilcox.test(. ~ Clinical.diagnosis))

Sadly, this doesn't work. I want "." to refer to the contents of each gene column. Finally, once I get the results, I'd like to append them (they will be returned as a list) to the original data frame, for example as two columns (W = result, p-value = result).

My priority is getting the test run for every gene though...

Many thanks in advance for your help!

1
  • Can you be more specific about your desired output? I don't understand how you could append the p-value and test statistic as new columns to the existing data, since the tests are constructed for each variable, right? Commented Jun 25, 2021 at 9:14

1 Answer 1

2

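I think to do this you need to put the data into long format (using `pivot_longer` from tidyr). It looks like `summarise` can't store a hypothesis-test object in the data frame, so you can use the broom package to tidy it into a data frame. The two-sample form `wilcox.test(x, y)` also doesn't accept a factor as `y`; converting the factor to numeric makes the call run, but be aware that it then compares each gene's values against the numeric group codes rather than between the diagnosis groups. To compare the groups, use the formula form `wilcox.test(value ~ Clinical.diagnosis)`. The pipeline: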

library(tidyverse)
library(broom)

# Run a Mann-Whitney U (Wilcoxon rank-sum) test for every gene, comparing its
# expression between the diagnosis groups, and return one row per gene with
# the test statistic (w) and p-value (p).
#
# NOTE: the sample output printed after this snippet was produced by an
# earlier two-sample call on the numeric group codes, so its W and p values
# (and the tie warnings) do not correspond to this between-group test; rerun
# to regenerate it.
df %>%
  # Keep the diagnosis as a factor and drop the unused "irritated" level so
  # the grouping variable has exactly the two levels being compared.
  mutate(Clinical.diagnosis = droplevels(Clinical.diagnosis)) %>%
  pivot_longer(cols = GATA3:IL1B) %>%
  group_by(name) %>%
  # Formula interface: value is split by diagnosis group. The tidy() result is
  # left unnamed so its columns (statistic, p.value, ...) become top-level
  # columns that select() can see. exact = FALSE requests the normal
  # approximation directly, since the data contain many ties.
  summarise(tidy(wilcox.test(value ~ Clinical.diagnosis, exact = FALSE))) %>%
  select(gene = name, w = statistic, p = p.value)

#> Warning in wilcox.test.default(value, Clinical.diagnosis): cannot compute exact
#> p-value with ties
...
#> # A tibble: 6 x 3
#>   gene      w        p
#>   <chr> <dbl>    <dbl>
#> 1 CD207   211 1.09e- 1
#> 2 GATA3     0 1.42e- 9
#> 3 IL1B      0 8.45e-10
#> 4 IL4       0 1.57e-10
#> 5 IL4R     13 8.26e- 9
#> 6 IRF4     13 1.00e- 9

Full reproducible example, with data:

library(tidyverse)
library(broom)

# Example data: one row per sample (24 rows) with the sample ID, six numeric
# gene columns, and the clinical diagnosis as a factor. The factor keeps all
# three declared levels even though "irritated" never occurs in these rows.
df <- data.frame(
  Samid = c(
    "AD001", "AD002", "AD004", "AD005", "AD008", "AD010",
    "AD011", "AD012", "AD013", "AD014", "AD015", "AD016",
    "AD017", "AD018", "AD019", "AD020", "AD021", "AD022",
    "AD023", "AD024", "AD025", "AD026", "AD027", "AD028"
  ),
  GATA3 = c(
    0.07850703, 0.07850703, 0.4477987, 0.07850703, 0.2362246, 0.44779867,
    0.46578259, 0, 0.46578259, 0.44779867, 0.24396914, 0.46578259,
    0.23622459, 0.24396914, 0.07850703, 0.07850703, 1.25391517, 0.82224747,
    0.07850703, 0.07850703, 0.07850703, 0.07850703, 0.83507423, 0.07850703
  ),
  IL4 = c(
    0, 0, 0, 0, 0, 0,
    0, 1.26781758, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0
  ),
  IL4R = c(
    1.65301611, 0.14846188, 1.6307388, 0.14846188, 0.2073535, 0.14846188,
    0.4656834, 1.48227697, 0.65075963, 0.17073914, 0.14846188, 0.14846188,
    0.37809262, 0.17073914, 1.65301611, 0.14846188, 1.55269688, 0.14846188,
    2.15320576, 0.17073914, 0.44340614, 0.17073914, 0, 0.44340614
  ),
  IRF4 = c(
    0, 0, 0, 0, 0, 0,
    2.83446844, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0
  ),
  CD207 = c(
    0.80003601, 0.33421377, 3.4723849, 2.32021021, 0.5828276, 0.94797393,
    0.13406957, 0.70861984, 2.25418614, 1.4883206, 2.38978722, 3.47193671,
    0.32452279, 2.31827895, 0.80003601, 0.80003601, 0.50751017, 2.32021021,
    3.0989443, 2.0619054, 1.05640955, 3.31881563, 3.37422811, 2.32021021
  ),
  IL1B = c(
    0.20787567, 0, 0, 0.20787567, 0, 0.20787567,
    0, 0, 0, 0.20787567, 0.20787567, 0,
    0, 0, 0, 0, 0, 0.20787567,
    0, 0.20787567, 0.20787567, 0.61415248, 0, 0
  ),
  Clinical.diagnosis = factor(
    c(
      "negative", "negative", "negative", "negative", "negative", "negative",
      "negative", "positive", "negative", "negative", "positive", "positive",
      "negative", "negative", "positive", "positive", "positive", "positive",
      "positive", "positive", "negative", "positive", "positive", "negative"
    ),
    levels = c("irritated", "negative", "positive")
  ),
  stringsAsFactors = FALSE
)

# Run a Mann-Whitney U (Wilcoxon rank-sum) test for every gene, comparing its
# expression between the diagnosis groups, and return one row per gene with
# the test statistic (w) and p-value (p).
#
# NOTE: the reprex output printed after this snippet was produced by an
# earlier two-sample call on the numeric group codes, so its W and p values
# (and the tie warnings) do not correspond to this between-group test; rerun
# to regenerate it.
df %>%
  # Keep the diagnosis as a factor and drop the unused "irritated" level so
  # the grouping variable has exactly the two levels being compared.
  mutate(Clinical.diagnosis = droplevels(Clinical.diagnosis)) %>%
  pivot_longer(cols = GATA3:IL1B) %>%
  group_by(name) %>%
  # Formula interface: value is split by diagnosis group. exact = FALSE
  # requests the normal approximation directly, since the data contain many
  # ties. The unnamed tidy() result is unpacked into top-level columns.
  summarise(tidy(wilcox.test(value ~ Clinical.diagnosis, exact = FALSE))) %>%
  select(gene = name, w = statistic, p = p.value)
#> Warning in wilcox.test.default(value, Clinical.diagnosis): cannot compute exact
#> p-value with ties

#> Warning in wilcox.test.default(value, Clinical.diagnosis): cannot compute exact
#> p-value with ties

#> Warning in wilcox.test.default(value, Clinical.diagnosis): cannot compute exact
#> p-value with ties

#> Warning in wilcox.test.default(value, Clinical.diagnosis): cannot compute exact
#> p-value with ties

#> Warning in wilcox.test.default(value, Clinical.diagnosis): cannot compute exact
#> p-value with ties

#> Warning in wilcox.test.default(value, Clinical.diagnosis): cannot compute exact
#> p-value with ties
#> # A tibble: 6 x 3
#>   gene      w        p
#>   <chr> <dbl>    <dbl>
#> 1 CD207   211 1.09e- 1
#> 2 GATA3     0 1.42e- 9
#> 3 IL1B      0 8.45e-10
#> 4 IL4       0 1.57e-10
#> 5 IL4R     13 8.26e- 9
#> 6 IRF4     13 1.00e- 9

Created on 2021-06-25 by the reprex package (v1.0.0)

Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.