Skip to main content
edited title
Link

File parser to get extract data from text file

deleted 28 characters in body; edited title
Source Link
Jamal
  • 35.2k
  • 13
  • 134
  • 238

file File parser to get extract data from text file

I am trying to extract the data from input file and store it for plotting. I have tested this code for a few files of same format. I I am not sure if the code works correctly with the little change in input file (like more blank spaces in between). I might have also done some terrible mistakes which iI cannot findoutfind out while testing. Basically iI need this code to be bug free.

Is there any better way of doing this task, improvements and better ways of parsing the file.

Thanks in advance.Is there any better way of doing this task, improvements and better ways of parsing the file?

codeCode:

file parser to get extract data from text file

I am trying to extract the data from input file and store it for plotting. I have tested this code for a few files of same format. I am not sure if the code works correctly with the little change in input file (like more blank spaces in between). I might have also done some terrible mistakes which i cannot findout while testing. Basically i need this code to be bug free.

Is there any better way of doing this task, improvements and better ways of parsing the file.

Thanks in advance.

code:

File parser to get extract data from text file

I am trying to extract the data from input file and store it for plotting. I have tested this code for a few files of same format. I am not sure if the code works correctly with the little change in input file (like more blank spaces in between). I might have also done some terrible mistakes which I cannot find out while testing. Basically I need this code to be bug free.

Is there any better way of doing this task, improvements and better ways of parsing the file?

Code:

Post Reopened by Mast, Simon Forsberg
deleted 186 characters in body
Source Link
    def parseFile(input, output, mode):
        infile  = open(input)
        outfile = open(output, mode)
    
        for line in infile:
            if re.match("random seed", line):
                tokens      = re.search(r'random seed = (.*)', line, re.M|re.I)
                seed        = tokens.group(1)
                #print seed
    
            if re.match("input", line):
                tokens      = re.search(r'(.*)n = (.*), m = (.*), k = (.*) \[(.*) ms\] {peak:(.*)GiB} {curr:(.*)GiB}', line, re.M|re.L)
                n           = tokens.group(2).strip()
                m           = tokens.group(3).strip()
                k           = tokens.group(4).strip()
                t           = tokens.group(5).strip()
                inPeak      = tokens.group(6).strip()
                inCurr      = tokens.group(7).strip()
                #print n, m, k, t, peak, curr
    
            if re.match("root build", line):
                tokens      = re.search(r'root build (.*)\[zero: (.*)ms\] \[pos: (.*)ms\] \[adj: (.*)ms\] \[adjsort: (.*)ms\] \[shade: (.*)ms\] done. \[(.*)ms\] {peak:(.*)GiB} {curr:(.*)GiB}', line, re.M|re.L)
                zero        = tokens.group(2).strip()
                pos         = tokens.group(3).strip()
                adj         = tokens.group(4).strip()
                adjSort     = tokens.group(5).strip()
                shade       = tokens.group(6).strip()
                rTotalTime  = tokens.group(7).strip()
                rPeak       = tokens.group(8).strip()
                rCurr       = tokens.group(9).strip()
                #print zero, pos, adj, adjSort, shade, rTotalTime, rPeak, rCurr
    
            if re.match("oracle:", line):
                tokens  = re.search(r'oracle: (.*) (.*)ms \[ (.*)GiB (.*)GiB/s (.*)GHz (.*)GHz\] (.) -- (.*)', line, re.M|re.L);
                sum         = tokens.group(1).strip()
                oracleTime  = tokens.group(2).strip()
                inSize      = tokens.group(3).strip()
                transRate   = tokens.group(4).strip()
                mulRate     = tokens.group(5).strip()
                instrRate   = tokens.group(6).strip()
                yes         = tokens.group(7).strip()
                kpath       = tokens.group(8).strip()
                #print sum, time, inSize, transRate, mulRate, instrRate, yes, kpath

    for line in infile:
        if re.match("random seed", line):
            tokens      = re.search(r'random seed = (.*)', line, re.M|re.I)
            seed        = tokens.group(1)
            #print seed

        if re.match("input", line):
            tokens      = re.search(r'(.*)n = (.*), m = (.*), k = (.*) \[(.*) ms\] {peak:(.*)GiB} {curr:(.*)GiB}', line, re.M|re.L)
            n           = tokens.group(2).strip()
            m           = tokens.group(3).strip()
            k           = tokens.group(4).strip()
            t           = tokens.group(5).strip()
            inPeak      = tokens.group(6).strip()
            inCurr      = tokens.group(7).strip()
            #print n, m, k, t, peak, curr

        if re.match("root build", line):
            tokens      = re.search(r'root build (.*)\[zero:(.*)ms\] \[pos:(.*)ms\] \[adj:(.*)ms\] \[adjsort:(.*)ms\] \[shade:(.*)ms\] done. \[(.*)ms\] {peak:(.*)GiB} {curr:(.*)GiB}', line, re.M|re.L)
            zero        = tokens.group(2).strip()
            pos         = tokens.group(3).strip()
            adj         = tokens.group(4).strip()
            adjSort     = tokens.group(5).strip()
            shade       = tokens.group(6).strip()
            rTotalTime  = tokens.group(7).strip()
            rPeak       = tokens.group(8).strip()
            rCurr       = tokens.group(9).strip()
            #print zero, pos, adj, adjSort, shade, rTotalTime, rPeak, rCurr

        if re.match("oracle:", line):
            tokens  = re.search(r'oracle: (.*) (.*)ms \[ (.*)GiB (.*)GiB/s (.*)GHz (.*)GHz\] (.) -- (.*)', line, re.M|re.L);
            sum         = tokens.group(1).strip()
            oracleTime  = tokens.group(2).strip()
            inSize      = tokens.group(3).strip()
            transRate   = tokens.group(4).strip()
            mulRate     = tokens.group(5).strip()
            instrRate   = tokens.group(6).strip()
            yes         = tokens.group(7).strip()
            kpath       = tokens.group(8).strip()
            #print sum, time, inSize, transRate, mulRate, instrRate, yes, kpath

        if re.match("command done", line):
            tokens      = re.search(r'command done \[(.*)ms\]', line, re.M|re.L)
            cmdTime     = tokens.group(1).strip()
            #print cmdTime

        if re.match("grand total", line, re.M|re.L):
            tokens      = re.search(r'grand total \[(.*)ms\] {peak: (.*)GiB}(.*)', line, re.M|re.L)
            totalTime   = tokens.group(1).strip()
            gPeak       = tokens.group(2).strip()
            #print totalTime

        if re.match("host", line):
            tokens      = re.search(r'host: (.*)', line, re.M|re.L)
            host        = tokens.group(1).strip()
            #print host
        if re.match("build:", line):
            tokens      = re.search(r'build: (.*) with(.*)', line, re.M|re.L)
            build       = tokens.group(1).strip()
            #print build

    outfile.write("%s %s %s "% (host, yes, kpath))
    outfile.write(" %6d %8d %2d %10d %8.2lf %8.2lf %8.2lf"%
                    (int(n), int(m), int(k), int(seed), float(t), float(inPeak),
                    float(inCurr)))
    outfile.write(" %8.2lf %8.2lf %8.2lf %8.2lf %8.2lf %8.2lf %8.2lf %8.2lf"%
                    (float(zero), float(pos), float(adj), float(adjSort),
                    float(shade), float(rTotalTime), float(rPeak), float(rCurr)))
    outfile.write(" %s %10.2f %6.3lf %7.2lf %7.2lf %7.2lf"%
                    (sum, float(oracleTime), float(inSize), float(transRate),
                    float(mulRate), float(instrRate)))
    outfile.write(" %8.2lf %8.2lf %8.2lf"%
                    (float(cmdTime), float(totalTime), float(gPeak)))
    outfile.write(" %s\n"% (build))

    #close files
    outfile.close()
    infile.close()
#end of parseFile
    def parseFile(input, output, mode):
        infile  = open(input)
        outfile = open(output, mode)
    
        for line in infile:
            if re.match("random seed", line):
                tokens      = re.search(r'random seed = (.*)', line, re.M|re.I)
                seed        = tokens.group(1)
                #print seed
    
            if re.match("input", line):
                tokens      = re.search(r'(.*)n = (.*), m = (.*), k = (.*) \[(.*) ms\] {peak:(.*)GiB} {curr:(.*)GiB}', line, re.M|re.L)
                n           = tokens.group(2).strip()
                m           = tokens.group(3).strip()
                k           = tokens.group(4).strip()
                t           = tokens.group(5).strip()
                inPeak      = tokens.group(6).strip()
                inCurr      = tokens.group(7).strip()
                #print n, m, k, t, peak, curr
    
            if re.match("root build", line):
                tokens      = re.search(r'root build (.*)\[zero: (.*)ms\] \[pos: (.*)ms\] \[adj: (.*)ms\] \[adjsort: (.*)ms\] \[shade: (.*)ms\] done. \[(.*)ms\] {peak:(.*)GiB} {curr:(.*)GiB}', line, re.M|re.L)
                zero        = tokens.group(2).strip()
                pos         = tokens.group(3).strip()
                adj         = tokens.group(4).strip()
                adjSort     = tokens.group(5).strip()
                shade       = tokens.group(6).strip()
                rTotalTime  = tokens.group(7).strip()
                rPeak       = tokens.group(8).strip()
                rCurr       = tokens.group(9).strip()
                #print zero, pos, adj, adjSort, shade, rTotalTime, rPeak, rCurr
    
            if re.match("oracle:", line):
                tokens  = re.search(r'oracle: (.*) (.*)ms \[ (.*)GiB (.*)GiB/s (.*)GHz (.*)GHz\] (.) -- (.*)', line, re.M|re.L);
                sum         = tokens.group(1).strip()
                oracleTime  = tokens.group(2).strip()
                inSize      = tokens.group(3).strip()
                transRate   = tokens.group(4).strip()
                mulRate     = tokens.group(5).strip()
                instrRate   = tokens.group(6).strip()
                yes         = tokens.group(7).strip()
                kpath       = tokens.group(8).strip()
                #print sum, time, inSize, transRate, mulRate, instrRate, yes, kpath

        if re.match("command done", line):
            tokens      = re.search(r'command done \[(.*)ms\]', line, re.M|re.L)
            cmdTime     = tokens.group(1).strip()
            #print cmdTime

        if re.match("grand total", line, re.M|re.L):
            tokens      = re.search(r'grand total \[(.*)ms\] {peak: (.*)GiB}(.*)', line, re.M|re.L)
            totalTime   = tokens.group(1).strip()
            gPeak       = tokens.group(2).strip()
            #print totalTime

        if re.match("host", line):
            tokens      = re.search(r'host: (.*)', line, re.M|re.L)
            host        = tokens.group(1).strip()
            #print host
        if re.match("build:", line):
            tokens      = re.search(r'build: (.*) with(.*)', line, re.M|re.L)
            build       = tokens.group(1).strip()
            #print build

    outfile.write("%s %s %s "% (host, yes, kpath))
    outfile.write(" %6d %8d %2d %10d %8.2lf %8.2lf %8.2lf"%
                    (int(n), int(m), int(k), int(seed), float(t), float(inPeak),
                    float(inCurr)))
    outfile.write(" %8.2lf %8.2lf %8.2lf %8.2lf %8.2lf %8.2lf %8.2lf %8.2lf"%
                    (float(zero), float(pos), float(adj), float(adjSort),
                    float(shade), float(rTotalTime), float(rPeak), float(rCurr)))
    outfile.write(" %s %10.2f %6.3lf %7.2lf %7.2lf %7.2lf"%
                    (sum, float(oracleTime), float(inSize), float(transRate),
                    float(mulRate), float(instrRate)))
    outfile.write(" %8.2lf %8.2lf %8.2lf"%
                    (float(cmdTime), float(totalTime), float(gPeak)))
    outfile.write(" %s\n"% (build))

    #close files
    outfile.close()
    infile.close()
#end of parseFile
def parseFile(input, output, mode):
    infile  = open(input)
    outfile = open(output, mode)

    for line in infile:
        if re.match("random seed", line):
            tokens      = re.search(r'random seed = (.*)', line, re.M|re.I)
            seed        = tokens.group(1)
            #print seed

        if re.match("input", line):
            tokens      = re.search(r'(.*)n = (.*), m = (.*), k = (.*) \[(.*) ms\] {peak:(.*)GiB} {curr:(.*)GiB}', line, re.M|re.L)
            n           = tokens.group(2).strip()
            m           = tokens.group(3).strip()
            k           = tokens.group(4).strip()
            t           = tokens.group(5).strip()
            inPeak      = tokens.group(6).strip()
            inCurr      = tokens.group(7).strip()
            #print n, m, k, t, peak, curr

        if re.match("root build", line):
            tokens      = re.search(r'root build (.*)\[zero:(.*)ms\] \[pos:(.*)ms\] \[adj:(.*)ms\] \[adjsort:(.*)ms\] \[shade:(.*)ms\] done. \[(.*)ms\] {peak:(.*)GiB} {curr:(.*)GiB}', line, re.M|re.L)
            zero        = tokens.group(2).strip()
            pos         = tokens.group(3).strip()
            adj         = tokens.group(4).strip()
            adjSort     = tokens.group(5).strip()
            shade       = tokens.group(6).strip()
            rTotalTime  = tokens.group(7).strip()
            rPeak       = tokens.group(8).strip()
            rCurr       = tokens.group(9).strip()
            #print zero, pos, adj, adjSort, shade, rTotalTime, rPeak, rCurr

        if re.match("oracle:", line):
            tokens  = re.search(r'oracle: (.*) (.*)ms \[ (.*)GiB (.*)GiB/s (.*)GHz (.*)GHz\] (.) -- (.*)', line, re.M|re.L);
            sum         = tokens.group(1).strip()
            oracleTime  = tokens.group(2).strip()
            inSize      = tokens.group(3).strip()
            transRate   = tokens.group(4).strip()
            mulRate     = tokens.group(5).strip()
            instrRate   = tokens.group(6).strip()
            yes         = tokens.group(7).strip()
            kpath       = tokens.group(8).strip()
            #print sum, time, inSize, transRate, mulRate, instrRate, yes, kpath

        if re.match("command done", line):
            tokens      = re.search(r'command done \[(.*)ms\]', line, re.M|re.L)
            cmdTime     = tokens.group(1).strip()
            #print cmdTime

        if re.match("grand total", line, re.M|re.L):
            tokens      = re.search(r'grand total \[(.*)ms\] {peak:(.*)GiB}(.*)', line, re.M|re.L)
            totalTime   = tokens.group(1).strip()
            gPeak       = tokens.group(2).strip()
            #print totalTime

        if re.match("host", line):
            tokens      = re.search(r'host:(.*)', line, re.M|re.L)
            host        = tokens.group(1).strip()
            #print host
        if re.match("build:", line):
            tokens      = re.search(r'build:(.*) with(.*)', line, re.M|re.L)
            build       = tokens.group(1).strip()
            #print build

    outfile.write("%s %s %s "% (host, yes, kpath))
    outfile.write(" %6d %8d %2d %10d %8.2lf %8.2lf %8.2lf"%
                    (int(n), int(m), int(k), int(seed), float(t), float(inPeak),
                    float(inCurr)))
    outfile.write(" %8.2lf %8.2lf %8.2lf %8.2lf %8.2lf %8.2lf %8.2lf %8.2lf"%
                    (float(zero), float(pos), float(adj), float(adjSort),
                    float(shade), float(rTotalTime), float(rPeak), float(rCurr)))
    outfile.write(" %s %10.2f %6.3lf %7.2lf %7.2lf %7.2lf"%
                    (sum, float(oracleTime), float(inSize), float(transRate),
                    float(mulRate), float(instrRate)))
    outfile.write(" %8.2lf %8.2lf %8.2lf"%
                    (float(cmdTime), float(totalTime), float(gPeak)))
    outfile.write(" %s\n"% (build))

    #close files
    outfile.close()
    infile.close()
#end of parseFile
Post Closed as "Not suitable for this site" by Simon Forsberg
edited tags
Link
Loading
Source Link
Loading