I want to create a dataframe pathways with two columns:
Genes: All the genes that are in the Genes column of the enriched data frame. Pathways: The rownames of enriched that correspond to each gene; the gene may appear in more than one row.
I'm not sure where to begin.
How I generated the enriched data frame:
# Build `enriched`: a wide table with one row per entry of `enrich.top5` and
# one column per gene position (X1, X2, ...).

# Use the values in column 2 as row names, then drop that column.
# (Judging by the dput() output below these are pathway names -- confirm.)
rownames(enrich.top5) <- enrich.top5[, 2]
enrich.top5[, 2] <- NULL

# Split each ";"-separated gene string into its own character vector.
gene_lists <- strsplit(as.character(enrich.top5$Genes), ";", fixed = TRUE)

# Pad every vector with NA up to the longest one before binding. rbind()
# recycles shorter vectors, which would silently repeat genes in rows that
# have fewer genes than the longest row.
max_genes <- max(lengths(gene_lists), 0L)
gene_lists <- lapply(gene_lists, `length<-`, max_genes)

enriched <- data.frame(do.call(rbind, gene_lists), stringsAsFactors = FALSE)
rownames(enriched) <- rownames(enrich.top5)
enriched
> dput(enriched[1:5,1:20])
structure(list(X1 = c("CALML6", "ATF2", "MYLK2", "ATF2", "PRDM4"
), X2 = c("CALML3", "ARAF", "ITGA2B", "PPP2R2A", "CALML6"), X3 = c("CALML4",
"ELK1", "TNC", "TCL1B", "IRS1"), X4 = c("ACTB", "CRKL", "ELK1",
"TCL1A", "CALML3"), X5 = c("CRKL", "ELK4", "ACTB", "PPP2R1B",
"CALML4"), X6 = c("AKT2", "RPS6KA4", "MYLK3", "PPP2R1A", "CRKL"
), X7 = c("RASSF5", "RPS6KA3", "CRKL", "CREB3L4", "RPS6KA3"),
X8 = c("AKT3", "RPS6KA6", "MYLK", "CREB3L1", "RPS6KA6"),
X9 = c("KDR", "RPS6KA5", "ACTG1", "MYC", "RPS6KA5"), X10 = c("AKT1",
"MYC", "IGF1R", "AKT2", "AKT2"), X11 = c("PLCE1", "AKT2",
"MYLK4", "MYB", "ARHGDIA"), X12 = c("PRKCG", "RPS6KA2", "PPP1CB",
"CREB3L2", "RPS6KA2"), X13 = c("PRKCI", "AKT3", "COMP", "AKT3",
"AKT3"), X14 = c("PRKCB", "STMN1", "PPP1CC", "KDR", "RPS6KA1"
), X15 = c("PRKCA", "RPS6KA1", "CCND3", "AKT1", "ARHGDIB"
), X16 = c("TIAM1", "KDR", "CCND2", "FLT3LG", "AKT1"), X17 = c("ADCY9",
"AKT1", "CCND1", "PRKCA", "MAP3K5"), X18 = c("PRKD3", "PRKACA",
"IBSP", "EREG", "MAP2K1"), X19 = c("PARD3", "PRKACB", "TNN",
"CDC37", "MAP2K2"), X20 = c("PFN4", "PRKCG", "AKT2", "DDIT4",
"PRKCD")), row.names = c("Rap1 signaling pathway", "MAPK signaling pathway",
"Focal adhesion", "PI3K-Akt signaling pathway", "Neurotrophin signaling pathway"
), class = "data.frame")
Desired output (example only):
# Example of the desired long format: one row per gene, labelled with the
# pathway it belongs to. Built from a named list (pathway -> genes) inside
# local() so that no helper variables leak into the workspace.
pathways <- local({
  gene_sets <- list(
    TSG        = c("TP53", "WT1", "PHF6"),
    DNAm       = c("DNMT3A", "DNMT3B", "TET1", "TET2", "IDH1", "IDH2"),
    Signalling = c("FLT3", "KIT", "KRAS", "NRAS"),
    TFs        = c("RUNX1", "CEBPA"),
    ChromMod   = c("ASXL1", "EZH2", "KDM6A")
  )
  data.frame(
    Genes = unlist(gene_sets, use.names = FALSE),
    # Repeat each pathway name once per gene in that pathway.
    Pathway = rep(names(gene_sets), times = lengths(gene_sets)),
    stringsAsFactors = FALSE
  )
})
head(pathways)
#> Genes Pathway
#> 1 TP53 TSG
#> 2 WT1 TSG
#> 3 PHF6 TSG
#> 4 DNMT3A DNAm
#> 5 DNMT3B DNAm
#> 6 TET1 DNAm