Here's a generator function that acts as an iterator over a file, cutting it into lines according to an exotic newline that is the same throughout the file.
It reads the file in chunks of lenchunk characters and yields the lines found in each chunk, chunk after chunk.
Since the newline is 3 characters long in my example (':;:'), a chunk may end in the middle of a newline: this generator function takes care of that possibility and still yields the correct lines.
If the newline were only one character long, the function could be simplified. I wrote the function only for the most delicate case.
Using this function lets you read a file one line at a time, without reading the entire file into memory.
from random import randrange, choice
# Build an example file in which the "newline" is the sequence :;:
# (50 random pieces of 0 to 39 characters each, joined by :;:)
alphabet = 'abcdefghijklmnopqrstuvwxyz '
pieces = []
for i in xrange(50):
    length = randrange(0, 40)
    pieces.append(''.join(choice(alphabet) for nc in xrange(length)))
ch = ':;:'.join(pieces)
with open('fofo.txt', 'wb') as g:
    g.write(ch)
# This generator function is an iterator over the lines of a file whose
# line separator is an arbitrary string (eol).
def liner(filename, eol, lenchunk, nl=0):
    """Yield the lines of a file that are delimited by the separator *eol*.

    The file is opened in binary mode and read ``lenchunk`` characters at a
    time, so it is never loaded into memory as a whole.

    filename -- path of the file to read
    eol      -- the separator; must be non-empty and of the same string type
                as what the file object returns from read()
    lenchunk -- number of characters requested by each read()
    nl       -- acts like the argument of splitlines(): when true, the
                separator is kept at the end of each yielded line

    Whatever follows the last separator is always yielded as the final item,
    without a separator, even when it is empty (as str.split would do).

    Raises ValueError if *eol* is empty.
    """
    if not eol:
        raise ValueError('empty eol')
    L = len(eol)
    NL = L if nl else 0
    with open(filename, 'rb') as f:
        # Empty string of the same type as eol: text that has been read but
        # not yet terminated by a separator.
        tail = eol[:0]
        chunk = f.read(lenchunk)
        while chunk:
            # Prepending the pending text makes a separator that was cut by
            # the end of the previous chunk whole again.
            buf = tail + chunk
            x = 0
            # tail holds no complete separator, so the first possible match
            # starts at most L-1 characters before the new chunk.
            y = buf.find(eol, max(len(tail) - L + 1, 0))
            while y != -1:
                yield buf[x:y + NL]
                x = y + L
                y = buf.find(eol, x)
            # Keep everything after the last separator for the next round;
            # dropping it would lose data whenever a chunk has no separator.
            tail = buf[x:]
            chunk = f.read(lenchunk)
        yield tail
# liner() requires the chunk size as its third argument; read 100
# characters at a time here.
for line in liner('fofo.txt', ':;:', 100):
    print(line)
Here's the same function, with print statements here and there that make it possible to follow the algorithm.
from random import randrange, choice
# Build an example file in which the "newline" is the sequence :;:
# (50 random pieces of 0 to 39 characters each, joined by :;:)
alphabet = 'abcdefghijklmnopqrstuvwxyz '
pieces = []
for i in xrange(50):
    length = randrange(0, 40)
    pieces.append(''.join(choice(alphabet) for nc in xrange(length)))
ch = ':;:'.join(pieces)
with open('fofo.txt', 'wb') as g:
    g.write(ch)
# This generator function is an iterator over the lines of a file whose
# line separator is an arbitrary string (eol). This variant also prints a
# trace of every chunk it processes.
def liner(filename, eol, lenchunk, nl=0):
    """Yield the *eol*-delimited lines of a file, printing a trace on the way.

    filename -- path of the file to read (opened in binary mode)
    eol      -- the separator; must be non-empty and of the same string type
                as what the file object returns from read()
    lenchunk -- number of characters requested by each read()
    nl       -- when true, the separator is kept at the end of each line

    For every chunk the trace shows the raw chunk (flagged when it ends in
    the middle of a separator), the part kept up to its last separator, the
    remainder (newtail), and the text actually scanned (tail + kept).
    Whatever follows the last separator is always yielded as the final item,
    even when it is empty; the last 50 characters of the file are printed
    afterwards.

    Raises ValueError if *eol* is empty.
    """
    if not eol:
        raise ValueError('empty eol')
    L = len(eol)
    NL = L if nl else 0
    rule = '\n---------------------------------------------------'
    bar = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
    with open(filename, 'rb') as f:
        # Only the last 50 characters are shown at the end, so seek to them
        # instead of reading the whole file into memory.
        f.seek(0, 2)
        f.seek(max(f.tell() - 50, 0))
        # %-formatting rather than + so that the trace can also be built
        # when the file yields bytes instead of str.
        the_end = ('\n\n' + bar +
                   '\nend of the file==%s' % (f.read(),) +
                   '\n' + bar + '\n')
        f.seek(0, 0)
        chunk = f.read(lenchunk)
        tail = eol[:0]  # empty string of the same type as eol
        while chunk:
            # The chunk ends with a beginning of eol but not with a whole eol.
            cut = (not chunk.endswith(eol) and
                   any(chunk.endswith(eol[:k]) for k in range(1, L)))
            if cut:
                wr = [' ##########---------- cut newline cut ----------##########' +
                      '\nchunk== %s' % (chunk,) + rule]
            else:
                wr = ['chunk== %s' % (chunk,) + rule]
            last = chunk.rfind(eol)
            if last == -1:
                kept = chunk
                newtail = eol[:0]
            else:
                kept = chunk[:last + L]      # up to and including the last eol
                newtail = chunk[last + L:]   # what follows the last eol
            wr.append('\nkept== %s' % (kept,) + rule +
                      '\nnewtail== %s' % (newtail,))
            chunk = tail + kept
            wr.append(rule + '\ntail + kept== %s' % (chunk,) + rule)
            print(''.join(wr))
            x = 0
            y = chunk.find(eol, x)
            while y != -1:
                yield chunk[x:y + NL]
                x = y + L
                y = chunk.find(eol, x)
            # Text after the last separator found (all of tail + kept when
            # there was none) must be carried over, otherwise it is lost.
            tail = chunk[x:] + newtail
            print('\n\n===================================================')
            chunk = f.read(lenchunk)
        yield tail
        print(the_end)
# The third positional parameter of liner() is lenchunk, not nl: give the
# chunk size explicitly and ask for the newlines to be kept with nl=1.
for line in liner('fofo.txt', ':;:', 100, nl=1):
    print('line== ' + line)
.
EDIT
I compared the execution times of my code and of chmullig's code.
With a 'fofo.txt' file of about 10 MB, created with
# Build a file of about 10 MB: 324000 random pieces of 0 to 59 characters
# each, joined by the separator :;:
alphabet = 'abcdefghijklmnopqrstuvwxyz '
pieces = []
for i in xrange(324000):
    length = randrange(0, 60)
    pieces.append(''.join(choice(alphabet) for nc in xrange(length)))
ch = ':;:'.join(pieces)
with open('fofo.txt', 'wb') as g:
    g.write(ch)
and measuring the times like this:
# NOTE: clock(), liner() and SpecialDelimiters are not defined in this
# snippet; they must already be in scope.

# Time one full pass of liner() over the file (opening included).
te = clock()
for line in liner('fofo.txt', ':;:', 65536):
    pass
print(clock() - te)

# Time one full pass of chmullig's iterator over the same file. The file is
# opened before the clock starts, and the with-block closes it afterwards.
with open('fofo.txt', 'rb') as fh:
    zenBreaker = SpecialDelimiters(fh, ':;:', 65536)
    te = clock()
    for line in zenBreaker:
        pass
    print(clock() - te)
I obtained the following minimum times, observed over several trials:
............my code 0.7067 seconds
chmullig's code 0.8373 seconds
.
EDIT 2
I changed my generator function: liner2() takes a file handle instead of the file's name. That way the opening of the file can be left out of the timing, as it is when timing chmullig's code.
def liner2(fh, eol, lenchunk, nl=0):
    """Yield the lines, delimited by the separator *eol*, read from *fh*.

    fh       -- an already opened file object; only its read() method is
                used, and it is not closed here
    eol      -- the separator; must be non-empty and of the same string type
                as what fh.read() returns
    lenchunk -- number of characters requested by each read()
    nl       -- acts like the argument of splitlines(): when true, the
                separator is kept at the end of each yielded line

    Whatever follows the last separator is always yielded as the final item,
    without a separator, even when it is empty (as str.split would do).

    Raises ValueError if *eol* is empty.
    """
    if not eol:
        raise ValueError('empty eol')
    L = len(eol)
    NL = L if nl else 0
    # Empty string of the same type as eol: text that has been read but not
    # yet terminated by a separator.
    tail = eol[:0]
    chunk = fh.read(lenchunk)
    while chunk:
        # Prepending the pending text makes a separator that was cut by the
        # end of the previous chunk whole again.
        buf = tail + chunk
        x = 0
        # tail holds no complete separator, so the first possible match
        # starts at most L-1 characters before the new chunk.
        y = buf.find(eol, max(len(tail) - L + 1, 0))
        while y != -1:
            yield buf[x:y + NL]
            x = y + L
            y = buf.find(eol, x)
        # Keep everything after the last separator for the next round;
        # dropping it would lose data whenever a chunk has no separator.
        tail = buf[x:]
        chunk = fh.read(lenchunk)
    yield tail
# NOTE: clock() and liner2() are not defined in this snippet; they must
# already be in scope.
# The file is opened before the clock starts, so only the iteration is
# timed; the with-block closes the file afterwards.
with open('fofo.txt', 'rb') as fh:
    te = clock()
    for line in liner2(fh, ':;:', 65536):
        pass
    print(clock() - te)
The results, after numerous trials to find the minimum times, are
.........with liner() 0.7067 seconds
.......with liner2() 0.7064 seconds
chmullig's code 0.8373 seconds
In fact, opening the file accounts for a negligible part of the total time.