I'm trying to compare all the file-pairs (files with same file name) in two folders 'test1' and 'test2' and print any differences they might have. I have this code below which works partly fine. It gets the files with same file names but compares only the first file-pair and not all the files in the folder. How do I fix it? Sample csv files could be like below
import os
from collections import defaultdict
import csv
def relative_files(path):
for root, dirnames, files in os.walk(path):
relroot = os.path.relpath(root, path)
for filename in files:
yield os.path.join(relroot, filename)
def difference_in_files (root_one, root_two):
files_one = set(relative_files(root_one))
for same in files_one.intersection(relative_files(root_two)):
try:
with open(os.path.join(root_one, same)) as csvfile, open(os.path.join(root_two, same)) as csvfile2:
d = defaultdict(list)
d2 = defaultdict(list)
header = next(csvfile).rstrip().split("\t")
h1 = next((i for i, x in enumerate(header) if x == "h1"),None)
h2 = next((i for i, x in enumerate(header) if x == "h2"),None)
header2 = next(csvfile2).rstrip().split("\t")
h12 = next((i for i, x in enumerate(header2) if x == "h1"),None)
h22 = next((i for i, x in enumerate(header2) if x == "h2"),None)
if h1 is not None and h2 is not None:
r = csv.reader(csvfile,delimiter="\t")
for row in r:
d[row[h1]].append(row[h2])
if h12 is not None and h22 is not None:
r = csv.reader(csvfile2,delimiter="\t")
for row in r:
d2[row[h12]].append(row[h22])
d2 = {k: list(set(v)) for k,v in dict(d2).items()}
d = {k: list(set(v)) for k,v in dict(d).items()}
diff = dict([ (key, d2.get(key, d.get(key))) for key in set(d.keys()+d2.keys()) if (key in d and (not key in d2 or d2[key] != d[key])) or (key in d2 and (not key in d or d[key] != d2[key])) ])
diff2 = dict([ (key, d.get(key, d2.get(key))) for key in set(d2.keys()+d.keys()) if (key in d2 and (not key in d or d[key] != d2[key])) or (key in d and (not key in d2 or d2[key] != d[key])) ])
return diff, diff2
except TypeError:
pass
if __name__ == '__main__':
root_one = 'test1'
root_two = 'test2'
difference_in_files (root_one, root_two)
test1/csv1.csv
h1,h2,h3
aa,90,io
bb,86,0n
test1.csv2.csv
h1,h8,h2
jj,kj,64
df,hj,12
test2/csv1.csv
h1,h2,h3
aa,90,io
bb,66,0n
test2.csv2.csv
h1,h8,h2
jj,kj,64
df,hj,12
mm,h9,09
It compares only csv1 from both the files and not csv2.
return diff, diff2is in the for loop. It will execute at the end of the first iteration of the loop. Thus no other iterations will be executed.None.returning the value, you probably shouldyieldit. Then your function is transformed into a generator, and the caller can use it like this:for diff1, diff2 in difference_in_files(root_one, root_two): print diff1, diff2.