Parse txt file to dataframe in R

Question

I have a txt file with target-prediction information that I want to parse into a dataframe in R. The information in the file is already in its simplest form. Each line will become a row in the resulting dataframe, which will have only 4 columns and should look like this:

  MicroRNA                          Transcript                         Type   Energy
miR-981|LQNS02278082.1_33127_3p      TRINITY_GG_20135_c0_g1_i5.mrna1   7_A1   -0.70

However, what I am doing in R is not working.

a <- read_lines("results")
> head(a)
[1] "MicroRNA   = miR-981|LQNS02278082.1_33127_3p\t\tTranscript = TRINITY_GG_20135_c0_g1_i5.mrna1 Dir=antisense TAG=Neuronal acetylcholine receptor subunit alpha-9\t\tType       = 7_A1\t\tEnergy     = -0.70 Kcal/mol"                        
[2] "MicroRNA   = miR-981|LQNS02278082.1_33127_3p\t\tTranscript = TRINITY_GG_20135_c0_g1_i5.mrna1 Dir=antisense TAG=Neuronal acetylcholine receptor subunit alpha-9\t\tType       = 7_A1\t\tEnergy     = -5.77 Kcal/mol"                        
[3] "MicroRNA   = LQNS02278125.1_38470_3p\t\tTranscript = TRINITY_GG_22182_c1_g1_i2.mrna1 Dir=antisense TAG=Uncharacterized protein\t\tType       = 7_A1\t\tEnergy     = -1.77 Kcal/mol"                                                        
[4] "MicroRNA   = LQNS02278125.1_38470_3p\t\tTranscript = TRINITY_GG_22182_c1_g1_i2.mrna1 Dir=antisense TAG=Uncharacterized protein\t\tType       = 7_A1\t\tEnergy     = -5.20 Kcal/mol"                                                        
[5] "MicroRNA   = LQNS02278075.1_32377_3p\t\tTranscript = TRINITY_GG_143691_c0_g1_i3.mrna1 Dir=sense TAG=Acidic phospholipase A2 PA4\t\tType       = 7_A1\t\tEnergy     = -3.30 Kcal/mol"                                                       
[6] "MicroRNA   = miR-317|LQNS02000228.1_2413_3p\t\tTranscript = TRINITY_GG_4592_c2_g1_i10.mrna1 Dir=sense TAG=Serine/threonine-protein phosphatase 2A regulatory subunit B'' subunit gamma\t\tType       = 7_m8\t\tEnergy     = -6.35 Kcal/mol"


dput(head(a,4))
c("MicroRNA   = miR-981|LQNS02278082.1_33127_3p\t\tTranscript = TRINITY_GG_20135_c0_g1_i5.mrna1 Dir=antisense TAG=Neuronal acetylcholine receptor subunit alpha-9\t\tType       = 7_A1\t\tEnergy     = -0.70 Kcal/mol", 
"MicroRNA   = miR-981|LQNS02278082.1_33127_3p\t\tTranscript = TRINITY_GG_20135_c0_g1_i5.mrna1 Dir=antisense TAG=Neuronal acetylcholine receptor subunit alpha-9\t\tType       = 7_A1\t\tEnergy     = -5.77 Kcal/mol", 
"MicroRNA   = LQNS02278125.1_38470_3p\t\tTranscript = TRINITY_GG_22182_c1_g1_i2.mrna1 Dir=antisense TAG=Uncharacterized protein\t\tType       = 7_A1\t\tEnergy     = -1.77 Kcal/mol", 
"MicroRNA   = LQNS02278125.1_38470_3p\t\tTranscript = TRINITY_GG_22182_c1_g1_i2.mrna1 Dir=antisense TAG=Uncharacterized protein\t\tType       = 7_A1\t\tEnergy     = -5.20 Kcal/mol"
)
re <- rex(
    
    capture(name = "MicroRNA", alpha),
    "[",
    spaces,
    capture(name = "Transcript", alpha),
    "[",
    spaces,
    capture(name = "Type", alpha),
    "[",
    spaces,
    capture(name = "Energy", digits),
    "]:")

 re_matches(a, re)

    MicroRNA Transcript Type Energy
1       <NA>       <NA> <NA>   <NA>
2       <NA>       <NA> <NA>   <NA>
3       <NA>       <NA> <NA>   <NA>

Any idea how to do this in R or shell? Thanks!

Irfaan · Accepted Answer · 2020-12-16 19:12:16Z

You could try using regex

    library(stringr)
    
    # Example data: one record per element. Fields are separated by a double
    # tab and each field is written as "Key = value".
    data <- c("MicroRNA   = miR-981|LQNS02278082.1_33127_3p\t\tTranscript = TRINITY_GG_20135_c0_g1_i5.mrna1 Dir=antisense TAG=Neuronal acetylcholine receptor subunit alpha-9\t\tType       = 7_A1\t\tEnergy     = -0.70 Kcal/mol", 
      "MicroRNA   = miR-981|LQNS02278082.1_33127_3p\t\tTranscript = TRINITY_GG_20135_c0_g1_i5.mrna1 Dir=antisense TAG=Neuronal acetylcholine receptor subunit alpha-9\t\tType       = 7_A1\t\tEnergy     = -5.77 Kcal/mol", 
      "MicroRNA   = LQNS02278125.1_38470_3p\t\tTranscript = TRINITY_GG_22182_c1_g1_i2.mrna1 Dir=antisense TAG=Uncharacterized protein\t\tType       = 7_A1\t\tEnergy     = -1.77 Kcal/mol", 
      "MicroRNA   = LQNS02278125.1_38470_3p\t\tTranscript = TRINITY_GG_22182_c1_g1_i2.mrna1 Dir=antisense TAG=Uncharacterized protein\t\tType       = 7_A1\t\tEnergy     = -5.20 Kcal/mol"
    )
    
    # Everything below is base R; none of it needs stringr.
    
    # Split each record into its double-tab-separated fields.
    lines_split <- strsplit(data, split = "\t\t", fixed = TRUE)
    # Fields to keep; `[` pads records with fewer fields with NA.
    cols <- 1:4
    # One row per record, one column per field (character matrix).
    fields <- do.call("rbind", lapply(lines_split, "[", cols))
    
    # Keep what follows the first "=" of a field, without surrounding blanks.
    value_after_equals <- function(x) trimws(sub("^[^=]*=", "", x))
    
    # The Transcript field carries extra "Dir=..." and "TAG=..." annotations;
    # the identifier itself is the first whitespace-delimited token.
    transcript <- sub("[[:space:]].*$", "", value_after_equals(fields[, 2]))
    
    # Drop the "Kcal/mol" unit so the energy can be stored as a number.
    energy <- sub("[[:space:]]*Kcal/mol$", "", value_after_equals(fields[, 4]))
    
    # Name the columns as in the requested output instead of V1..V4.
    df <- data.frame(
      MicroRNA = value_after_equals(fields[, 1]),
      Transcript = transcript,
      Type = value_after_equals(fields[, 3]),
      Energy = as.numeric(energy),
      stringsAsFactors = FALSE
    )

jay.sf · Accepted Answer · 2020-12-16 18:55:37Z

0

Using read.table.

# Parse the lines held in `a` (one record per element, fields separated by a
# double tab, each field written as "Key = value") into a data frame `res`
# with one column per key.
#
# header = FALSE: the input has no header row. With header = TRUE the first
#   record is consumed as column names, which is why the result printed below
#   shows one row fewer than the input.
# quote = "" and comment.char = "": the TAG text is free text. An apostrophe
#   or "#" in it would otherwise start a quoted string or a comment and
#   silently merge or truncate records.
# colClasses = c(NA, "NULL") is recycled over the columns, dropping the empty
#   column that each double tab produces.
r <- read.table(
  text = a, sep = "\t", colClasses = c(NA, "NULL"),
  header = FALSE, quote = "", comment.char = "",
  stringsAsFactors = FALSE
)

# Column names: the key in front of the first "=" (taken from the first record).
nn <- unname(vapply(
  r,
  function(x) trimws(sub("=.*$", "", x[1])),
  character(1)
))

# Values: everything after the first "=", without surrounding blanks.
res <- setNames(
  as.data.frame(
    lapply(r, function(x) trimws(sub("^[^=]*=", "", x))),
    stringsAsFactors = FALSE
  ),
  nn
)

# Second field (Transcript): keep only the identifier, i.e. the first
# whitespace-delimited token, dropping the "Dir=..." / "TAG=..." annotations.
res[[2]] <- sub("[[:space:]].*$", "", res[[2]])

# Fourth field (Energy): drop the "Kcal/mol" unit and store as a number.
res[[4]] <- as.numeric(sub("[[:space:]]*Kcal/mol$", "", res[[4]]))

Result:

res
#                           MicroRNA                           Transcript  Type          Energy
# 1  miR-981|LQNS02278082.1_33127_3p  TRINITY_GG_20135_c0_g1_i5.mrna1 Dir  7_A1  -5.77 Kcal/mol
# 2          LQNS02278125.1_38470_3p  TRINITY_GG_22182_c1_g1_i2.mrna1 Dir  7_A1  -1.77 Kcal/mol
# 3          LQNS02278125.1_38470_3p  TRINITY_GG_22182_c1_g1_i2.mrna1 Dir  7_A1  -5.20 Kcal/mol

answered Dec 16, 2020 at 18:55

jay.sf

76.3k8 gold badges66 silver badges132 bronze badges

1 Comment

Amaranta_Remedios Over a year ago

Thanks! It partially works. Somehow when I tried in my data (bigger than in the example) a chunk of the data has been lost. > length(a) [1] 3577359 > dim(res) [1] 1754993 4

Collectives™ on Stack Overflow

Parse txt file to dataframe in R

2 Answers 2

1 Comment

1 Comment

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

2 Answers 2

1 Comment

1 Comment

Your Answer

Sign up or log in

Post as a guest

Related