2

I have a text file with target-prediction information that I want to parse into a data frame in R. The information in the file is already laid out very simply: each line should become one row of the data frame, which will have only 4 columns and should look like this:

  MicroRNA                          Transcript                         Type   Energy
miR-981|LQNS02278082.1_33127_3p      TRINITY_GG_20135_c0_g1_i5.mrna1   7_A1   -0.70

However, what I am doing in R is not working.

# Read the file "results" into `a`; the head(a) output that follows shows one
# character string per input line. (read_lines is presumably readr's -- confirm.)
a <- read_lines("results")
> head(a)
[1] "MicroRNA   = miR-981|LQNS02278082.1_33127_3p\t\tTranscript = TRINITY_GG_20135_c0_g1_i5.mrna1 Dir=antisense TAG=Neuronal acetylcholine receptor subunit alpha-9\t\tType       = 7_A1\t\tEnergy     = -0.70 Kcal/mol"                        
[2] "MicroRNA   = miR-981|LQNS02278082.1_33127_3p\t\tTranscript = TRINITY_GG_20135_c0_g1_i5.mrna1 Dir=antisense TAG=Neuronal acetylcholine receptor subunit alpha-9\t\tType       = 7_A1\t\tEnergy     = -5.77 Kcal/mol"                        
[3] "MicroRNA   = LQNS02278125.1_38470_3p\t\tTranscript = TRINITY_GG_22182_c1_g1_i2.mrna1 Dir=antisense TAG=Uncharacterized protein\t\tType       = 7_A1\t\tEnergy     = -1.77 Kcal/mol"                                                        
[4] "MicroRNA   = LQNS02278125.1_38470_3p\t\tTranscript = TRINITY_GG_22182_c1_g1_i2.mrna1 Dir=antisense TAG=Uncharacterized protein\t\tType       = 7_A1\t\tEnergy     = -5.20 Kcal/mol"                                                        
[5] "MicroRNA   = LQNS02278075.1_32377_3p\t\tTranscript = TRINITY_GG_143691_c0_g1_i3.mrna1 Dir=sense TAG=Acidic phospholipase A2 PA4\t\tType       = 7_A1\t\tEnergy     = -3.30 Kcal/mol"                                                       
[6] "MicroRNA   = miR-317|LQNS02000228.1_2413_3p\t\tTranscript = TRINITY_GG_4592_c2_g1_i10.mrna1 Dir=sense TAG=Serine/threonine-protein phosphatase 2A regulatory subunit B'' subunit gamma\t\tType       = 7_m8\t\tEnergy     = -6.35 Kcal/mol"


# Emit the first four elements of `a` as an R expression so the sample can be
# copy-pasted to reproduce the problem.
dput(head(a,4))
c("MicroRNA   = miR-981|LQNS02278082.1_33127_3p\t\tTranscript = TRINITY_GG_20135_c0_g1_i5.mrna1 Dir=antisense TAG=Neuronal acetylcholine receptor subunit alpha-9\t\tType       = 7_A1\t\tEnergy     = -0.70 Kcal/mol", 
"MicroRNA   = miR-981|LQNS02278082.1_33127_3p\t\tTranscript = TRINITY_GG_20135_c0_g1_i5.mrna1 Dir=antisense TAG=Neuronal acetylcholine receptor subunit alpha-9\t\tType       = 7_A1\t\tEnergy     = -5.77 Kcal/mol", 
"MicroRNA   = LQNS02278125.1_38470_3p\t\tTranscript = TRINITY_GG_22182_c1_g1_i2.mrna1 Dir=antisense TAG=Uncharacterized protein\t\tType       = 7_A1\t\tEnergy     = -1.77 Kcal/mol", 
"MicroRNA   = LQNS02278125.1_38470_3p\t\tTranscript = TRINITY_GG_22182_c1_g1_i2.mrna1 Dir=antisense TAG=Uncharacterized protein\t\tType       = 7_A1\t\tEnergy     = -5.20 Kcal/mol"
)
# Attempted pattern with four named capture groups (MicroRNA, Transcript, Type,
# Energy), built with rex.
# NOTE(review): the pattern requires literal "[" separators and a closing "]:",
# but the sample lines contain neither -- their fields look like "Key = value"
# separated by tabs -- which is consistent with every row coming back NA.
# NOTE(review): presumably `alpha` only covers alphabetic characters and `digits`
# only digits, while the values also contain "-", "|", "_", "." -- confirm
# against the rex shortcut documentation.
re <- rex(
    
    capture(name = "MicroRNA", alpha),
    "[",
    spaces,
    capture(name = "Transcript", alpha),
    "[",
    spaces,
    capture(name = "Type", alpha),
    "[",
    spaces,
    capture(name = "Energy", digits),
    "]:")

# Apply the pattern to every line; yields one column per named capture group.
 re_matches(a, re)

    MicroRNA Transcript Type Energy
1       <NA>       <NA> <NA>   <NA>
2       <NA>       <NA> <NA>   <NA>
3       <NA>       <NA> <NA>   <NA>

Any idea how to do this in R or shell? Thanks!

2 Answers 2

1

You could try using regex

    library(stringr)
    
    # Example data: four raw prediction lines. Each line holds four
    # "Key = value" fields separated by a double tab.
    data <- c("MicroRNA   = miR-981|LQNS02278082.1_33127_3p\t\tTranscript = TRINITY_GG_20135_c0_g1_i5.mrna1 Dir=antisense TAG=Neuronal acetylcholine receptor subunit alpha-9\t\tType       = 7_A1\t\tEnergy     = -0.70 Kcal/mol", 
      "MicroRNA   = miR-981|LQNS02278082.1_33127_3p\t\tTranscript = TRINITY_GG_20135_c0_g1_i5.mrna1 Dir=antisense TAG=Neuronal acetylcholine receptor subunit alpha-9\t\tType       = 7_A1\t\tEnergy     = -5.77 Kcal/mol", 
      "MicroRNA   = LQNS02278125.1_38470_3p\t\tTranscript = TRINITY_GG_22182_c1_g1_i2.mrna1 Dir=antisense TAG=Uncharacterized protein\t\tType       = 7_A1\t\tEnergy     = -1.77 Kcal/mol", 
      "MicroRNA   = LQNS02278125.1_38470_3p\t\tTranscript = TRINITY_GG_22182_c1_g1_i2.mrna1 Dir=antisense TAG=Uncharacterized protein\t\tType       = 7_A1\t\tEnergy     = -5.20 Kcal/mol"
    )

    # Split every line into its "Key = value" fields (literal double tab).
    lines_split <- strsplit(data, split = "\t\t", fixed = TRUE)
    # Number of fields expected per line; shorter lines are padded with NA.
    cols <- 1:4
    # Stack the fields into one row per input line.
    df <- as.data.frame(do.call("rbind", lapply(lines_split, "[", cols)),
                        stringsAsFactors = FALSE)
    # Use the column names requested in the question instead of V1..V4.
    names(df) <- c("MicroRNA", "Transcript", "Type", "Energy")

    # For every column except Transcript, keep only the text after the "=".
    df[, -2] <- lapply(df[, -2], function(x) trimws(sub(".*=", "", x)))

    # The Transcript field contains several "=" (Dir=..., TAG=...), so take the
    # text between the first "=" and "Dir"; lines without that shape become NA.
    transcript_hits <- regmatches(
      df$Transcript,
      regexec("=\\s*(.*?)\\s*Dir", df$Transcript, perl = TRUE)
    )
    df$Transcript <- vapply(
      transcript_hits,
      function(m) if (length(m) >= 2L) m[2L] else NA_character_,
      character(1)
    )

    # Drop the unit and convert the energy to numeric.
    df$Energy <- as.numeric(trimws(sub("Kcal/mol", "", df$Energy, fixed = TRUE)))
Sign up to request clarification or add additional context in comments.

1 Comment

Thanks, this does the job perfectly!
0

Using read.table.

# Parse each "Key = value" line of `a` into one row of a data frame.
# - header = FALSE: every line is a record; with header = TRUE the first record
#   was consumed as a header and silently dropped.
# - quote = "" and comment.char = "": the TAG text is free text and may contain
#   apostrophes or "#"; with read.table's defaults these open quoted strings or
#   comments that can swallow the rest of a line (or following lines), losing rows.
# - colClasses is recycled, so every second (empty) column created by the
#   double-tab separator is skipped.
r <- read.table(text = a, sep = "\t", header = FALSE, quote = "",
                comment.char = "", colClasses = c("character", "NULL"))
# Column names: the text before the first "=" of each field (from the first row).
nn <- unname(vapply(r, function(x) trimws(sub("=.*$", "", x[1L])), character(1)))
# Values: the text after the first "=" of each field.
res <- setNames(
  as.data.frame(lapply(r, function(x) trimws(sub("^[^=]*=", "", x))),
                stringsAsFactors = FALSE),
  nn
)
# Transcript carries trailing "Dir=... TAG=..." annotations; keep only the ID.
if ("Transcript" %in% names(res)) {
  res$Transcript <- sub("\\s+Dir=.*$", "", res$Transcript)
}
# Energy: drop the unit and convert to numeric.
if ("Energy" %in% names(res)) {
  res$Energy <- as.numeric(sub("\\s*Kcal/mol\\s*$", "", res$Energy))
}

# Result (for the four sample lines):

res
#                          MicroRNA                      Transcript Type Energy
# 1 miR-981|LQNS02278082.1_33127_3p TRINITY_GG_20135_c0_g1_i5.mrna1 7_A1  -0.70
# 2 miR-981|LQNS02278082.1_33127_3p TRINITY_GG_20135_c0_g1_i5.mrna1 7_A1  -5.77
# 3         LQNS02278125.1_38470_3p TRINITY_GG_22182_c1_g1_i2.mrna1 7_A1  -1.77
# 4         LQNS02278125.1_38470_3p TRINITY_GG_22182_c1_g1_i2.mrna1 7_A1  -5.20

1 Comment

Thanks! It partially works. Somehow, when I tried it on my data (bigger than the example), a chunk of the data was lost. > length(a) [1] 3577359 > dim(res) [1] 1754993 4

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.