I have two functions that give the same result, one vectorized and one with a "for" loop. Suprisingly the for loop is faster than the vectorized version. Any idea why is it so ?
def loop_for(df):
gpd = df.groupby([pd.TimeGrouper(freq="QS-JAN"), 'CD_PDP'])
result = []
for (quarter, unite), data in gpd:
nb_MAT_RH = data["MAT_RH" ].nunique()
nb_MAT_RHPI = data["MAT_RHPI"].nunique()
result.append({"CD_PDP": unite, "MOIS_COMPTABLE": quarter, "nb_mat_rh" : nb_MAT_RH, "nb_MAT_RHPI" : nb_MAT_RHPI})
return pd.DataFrame(result)
def vectorisation(df):
b = df.groupby([pd.TimeGrouper(freq="QS-JAN"), 'CD_PDP']).apply(lambda x: pd.Series( {"nb_mat_rh" : x["MAT_RH" ].nunique(),
"nb_MAT_RHPI" : x["MAT_RHPI"].nunique()}))
return b.reset_index()
when testing :
import timeit
print "loop"
print timeit.timeit(stmt="loop_for(df)",number= 2, setup="from __main__ import loop_for; from __main__ import df")
print "vector"
print timeit.timeit(stmt="vectorisation(df)",number= 2, setup="from __main__ import vectorisation; from __main__ import df")
it gives :
loop
6.83789801598
vector
7.13991713524
.apply(lambda ... )is not really vectorization; it is essentially the same as running aforloop over the data.dfyou're passing to your functions. But more importantly... what @ajcr said.