from __future__ import division
from itertools import tee, izip
from collections import Counter
# Sample input: one record per line, four whitespace-separated fields.
# The driver code in this file casts the 2nd and 4th fields to int, counts
# 'C'/'G' occurrences in the 3rd field and averages the 4th field.
# NOTE(review): the fields look like chromosome, position, base and a
# per-position count (e.g. depth) -- confirm against the real data source.
text = '''\
chr1 1 A 3
chr1 2 G 3
chr1 3 T 3
chr1 4 C 2
chr1 5 G 1
chr1 6 T 2
chr1 7 G 3
chr1 8 C 3
chr1 9 A 5
chr1 10 A 8
chr2 5 A 1
chr2 6 G 0
chr2 7 G 0
chr2 8 G 0
chr2 9 C 2
chr2 10 T 3
chr2 11 A 3'''
def window(iterable, size):
    """Yield successive overlapping windows of ``size`` items from ``iterable``.

    Each window is a tuple of ``size`` consecutive items and consecutive
    windows are shifted by exactly one item, e.g. ``window('abcd', 2)``
    yields ``('a', 'b')``, ``('b', 'c')``, ``('c', 'd')``.

    Args:
        iterable: Any iterable; it is consumed lazily, one new item per
            window after the first.
        size: Number of items in every window.

    Yields:
        Tuples of length ``size``. Nothing is yielded when ``size`` is 0 or
        when ``iterable`` holds fewer than ``size`` items.

    Raises:
        ValueError: If ``size`` is negative (raised when iteration starts).
    """
    # Function-scope imports: only names that exist on both Python 2 and
    # Python 3 are used, so the helper is interpreter-agnostic.
    from collections import deque
    from itertools import islice

    if size < 0:
        raise ValueError("size must be >= 0")
    if size == 0:
        return

    source = iter(iterable)
    # A bounded deque discards its oldest item automatically on append,
    # which is exactly the "slide by one" step.
    current = deque(islice(source, size), maxlen=size)
    if len(current) < size:
        # Not enough items for even a single full window.
        return
    yield tuple(current)
    for item in source:
        current.append(item)
        yield tuple(current)
def get_avg(lists, column):
    """Return the arithmetic mean of one column over a sequence of rows.

    Args:
        lists: Sized sequence of indexable rows (e.g. a tuple of tuples).
        column: Index of the numeric field to average in every row.

    Returns:
        The sum of ``row[column]`` over all rows divided by the number of
        rows. The file enables true division, so integer columns give a
        float result.

    Raises:
        ZeroDivisionError: If ``lists`` is empty.
        IndexError: If a row has no field at ``column``.
    """
    # Read the requested field straight from each row instead of transposing
    # every column; this also avoids subscripting ``zip``, which is not
    # possible on Python 3.
    return sum(row[column] for row in lists) / len(lists)
def get_GC_percentage(lists, column):
    """Return the share of rows whose ``column`` field is ``'C'`` or ``'G'``.

    Despite the name, the result is a fraction between 0 and 1, not a value
    between 0 and 100; the caller in this file renders it with a ``%``
    format specifier.

    Args:
        lists: Sized sequence of indexable rows (e.g. a tuple of tuples).
        column: Index of the field holding the base letter in every row.

    Returns:
        ``(number of 'C' rows + number of 'G' rows) / len(lists)``.
        Matching is case-sensitive.

    Raises:
        ZeroDivisionError: If ``lists`` is empty.
        IndexError: If a row has no field at ``column``.
    """
    # Count the requested field directly from each row; no need to transpose
    # all columns (and ``zip`` cannot be subscripted on Python 3).
    counts = Counter(row[column] for row in lists)
    return (counts['C'] + counts['G']) / len(lists)
# Lazily parse the sample text: one record per non-blank line, split on
# whitespace. ``splitlines`` plus the blank-line filter keeps a trailing
# newline or empty line from breaking the four-field unpacking below.
line_tuples = (line.split() for line in text.splitlines() if line.strip())

# Cast the 2nd and 4th fields to int so they can be used numerically.
line_tuples_casted = ((a, int(b), c, int(d)) for a, b, c, d in line_tuples)

# Slide a two-record window over the stream.
# NOTE(review): the window ignores the first field, so one window pairs the
# last chr1 record with the first chr2 record -- confirm this is intended.
line_tuples_chunks = window(line_tuples_casted, 2)

for i, chunk in enumerate(line_tuples_chunks):
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('i: {:2} | avg: {} | GC_content: {:5.0%}'.format(
        i, get_avg(chunk, 3), get_GC_percentage(chunk, 2)))
Output:
i:  0 | avg: 3.0 | GC_content:   50%
i:  1 | avg: 3.0 | GC_content:   50%
i:  2 | avg: 2.5 | GC_content:   50%
i:  3 | avg: 1.5 | GC_content:  100%
i:  4 | avg: 1.5 | GC_content:   50%
i:  5 | avg: 2.5 | GC_content:   50%
i:  6 | avg: 3.0 | GC_content:  100%
i:  7 | avg: 4.0 | GC_content:   50%
i:  8 | avg: 6.5 | GC_content:    0%
i:  9 | avg: 4.5 | GC_content:    0%
i: 10 | avg: 0.5 | GC_content:   50%
i: 11 | avg: 0.0 | GC_content:  100%
i: 12 | avg: 0.0 | GC_content:  100%
i: 13 | avg: 1.0 | GC_content:  100%
i: 14 | avg: 2.5 | GC_content:   50%
i: 15 | avg: 3.0 | GC_content:    0%
Note, however, that this is not an optimal solution: it recomputes the average over the whole window on every iteration. We could do better by keeping a running total and updating it incrementally, subtracting the value that leaves the window and adding the value that enters it.