I did the following:
- Annotated the original version with STRANGE (= I am not sure what you are doing), EFFICIENT (= can be made more efficient) and SIMPLIFY (= can be made simpler).
- Created two other versions that may be more efficient but change the behavior slightly (in terms of memory use and of when the output is written). This is elaborated below.
To verify that these suggestions actually help, consider using IPython's %timeit magic. Enter the following at the IPython prompt:
In [0]: %timeit -n<N> %run script.py
where <N> is the number of loops to run; if you omit -n, %timeit picks the loop count automatically, which might take a long time for a whole script.
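If %run measures too much at once (imports, pandas start-up and so on), you can also time just the hot path with the standard-library timeit module. A minimal sketch, assuming you wrap the body of your script in a function (process_zips below is a hypothetical name, not something in your code):

import timeit

def process_zips(zip_files, csvout):
    ...  # the body of your current script goes here

# run it three times and report the average wall-clock time per run
elapsed = timeit.timeit(lambda: process_zips(zip_files, csvout), number=3)
print('average: {:.2f} s per run'.format(elapsed / 3))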
Annotated Original
for files in zip_files:
    with zipfile.ZipFile(files, 'r') as myzip:
        for logfile in myzip.namelist():
            list1 = []
            list2 = []
            f = myzip.open(logfile)
            # contents = f.readlines()
            # for line in contents[:]:
            for line in f:  # EFFICIENT: does the same without making a copy
                try:
                    parsed = json.loads(line[:-2])
                    # if "key1" in parsed.keys():
                    if "key1" in parsed:  # EFFICIENT: no copy
                        # STRANGE: 'val' in dict checks for key existence by
                        # default, are you sure this is what you want?
                        if "val1" in parsed['key1']['key2']:
                            if "val2" in parsed['key3']:
                                list1.append(parsed['key1'])
                                list2.append(parsed['key3'])
                except ValueError as e:
                    pass
                # STRANGE: Why is this here?
                # else:
                #     pass
            df1 = pd.DataFrame(list1)
            df2 = pd.DataFrame(list2)
            df3 = df2.join(df1)
            # EFFICIENT: prefer generator over list comprehension
            # df3['col1'] = df3['col1'].apply(lambda x: ','.join([str(i) for i in x]))
            df3['col1'] = df3['col1'].apply(lambda x: ','.join(str(i) for i in x))
            df3.drop_duplicates(inplace=True)
            # SIMPLIFY:
            # with open(csvout, 'a') as f2:
            #     df.to_csv(f2, header=None, index=False)
            #     f2.close()
            # STRANGE: where does `df` come from? Shouldn't this be df3?
            df.to_csv(csvout, mode='a', header=None, index=False)
# STRANGE: you open f in a loop, but close it outside of the loop?
f.close()
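To expand on the STRANGE note about the membership tests: `in` on a dict only looks at the keys, while `in` on a string does a substring search, so what those two if lines check depends entirely on the types behind key2 and key3. A tiny illustration with made-up data:

parsed = {'key1': {'key2': {'val1': 1}}, 'key3': 'has val2 inside'}

'val1' in parsed['key1']['key2']           # True: tests the dict's keys only
'val1' in parsed['key1']['key2'].values()  # use .values() to test the values instead
'val2' in parsed['key3']                   # True: on a string this is a substring test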
Build in Memory, Write Once
If you have enough memory, the following might be faster: rather than appending to the CSV once per log file, you collect everything from a zip archive in memory first and write it in one go.
This also changes the behavior slightly:
- duplicates are filtered across all log files of a zip archive, not per log file
There are also some stylistic changes:
for files in zip_files:
    with zipfile.ZipFile(files, 'r') as myzip:
        list1, list2 = [], []  # Notice these are outside the logfile loop
        for logfile in myzip.namelist():
            with myzip.open(logfile) as f:
                for line in f:
                    try:
                        parsed = json.loads(line[:-2])
                    except ValueError as e:  # Presumably we only wish to catch json value errors
                        pass
                    else:
                        if ("key1" in parsed
                                and "val1" in parsed['key1']['key2']
                                and "val2" in parsed['key3']):
                            list1.append(parsed['key1'])
                            list2.append(parsed['key3'])
        # Write only once per zip archive; append so later archives do not overwrite earlier output
        df = pd.DataFrame(list2).join(pd.DataFrame(list1))
        df['col1'] = df['col1'].apply(lambda x: ','.join(str(i) for i in x))
        df.drop_duplicates(inplace=True)
        df.to_csv(csvout, mode='a', header=None, index=False)
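In case the bare join looks odd: both frames are built from lists that were appended to in lockstep, so they share the same default RangeIndex, and DataFrame.join pairs rows by that index. A toy example (column names made up):

import pandas as pd

left = pd.DataFrame([{'b': 10}, {'b': 20}])
right = pd.DataFrame([{'a': 1}, {'a': 2}])
print(left.join(right))
#     b  a
# 0  10  1
# 1  20  2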
Build in Memory, Write Once, Filter Duplicates Only Per File
Keeping the duplicate filtering local to each file:
for files in zip_files:
    with zipfile.ZipFile(files, 'r') as myzip:
        dfs = []
        for logfile in myzip.namelist():
            list1, list2 = [], []
            with myzip.open(logfile) as f:
                for line in f:
                    try:
                        parsed = json.loads(line[:-2])
                    except ValueError as e:  # Presumably we only wish to catch json value errors
                        pass
                    else:
                        if ("key1" in parsed
                                and "val1" in parsed['key1']['key2']
                                and "val2" in parsed['key3']):
                            list1.append(parsed['key1'])
                            list2.append(parsed['key3'])
            # Build a temporary dataframe to filter the duplicates:
            tmp = pd.DataFrame(list2).join(pd.DataFrame(list1))
            tmp['col1'] = tmp['col1'].apply(lambda x: ','.join(str(i) for i in x))
            tmp.drop_duplicates(inplace=True)
            dfs.append(tmp)
        # Write only once per zip archive; append so later archives do not overwrite earlier output
        pd.concat(dfs, ignore_index=True).to_csv(csvout, mode='a', header=None, index=False)
You may also want to look at pandas' `read_json` and `concat`, which may be more efficient. At the moment there is too much going on in this question to say for sure, though.
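Purely as a sketch of that idea, and assuming each log file really is one JSON object per line once the trailing characters that your line[:-2] removes are dealt with, reading every member with read_json and concatenating at the end might look like this (the key1/key3 filtering would then have to be done on the resulting frame):

import io
import zipfile
import pandas as pd

frames = []
for files in zip_files:
    with zipfile.ZipFile(files, 'r') as myzip:
        for logfile in myzip.namelist():
            with myzip.open(logfile) as f:
                # lines=True parses one JSON object per line into one row each
                frames.append(pd.read_json(io.TextIOWrapper(f, encoding='utf-8'), lines=True))
df = pd.concat(frames, ignore_index=True)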