Probably not the fastest solution for large dataframes but it works. We use prod on all rows of df2 that meet the condition.
# For each df1 row, multiply qty by the product of every df2 factor whose
# date lies strictly after the row's date (empty selection -> prod() == 1).
df1['factored_qty'] = df1.qty * df1.date.map(
    lambda d: df2.loc[df2.date > d, 'factor'].prod())
Result:
date qty factored_qty
0 2016-10-08 1 6
1 2016-11-08 8 48
2 2016-12-08 2 6
3 2017-01-08 4 12
Update
For larger dataframes we can use
merge_asof. We calculate the reverse
cumprod, i.e. from last to first row. Unfortunately it becomes a bit convoluted if the last date in df2 is less than the last date in df1, as we have to add a sentinel to df2 (maximum date of df1 with factor 1) in this case.
This method is significantly faster than Ch3steR's and sammywemmy's solutions.
# Reverse cumulative product: factor[i] becomes the product of df2.factor
# from row i through the last row.
rev = df2.assign(date=pd.to_datetime(df2.date),
                 factor=df2.factor.iloc[::-1].cumprod().iloc[::-1])
# Sentinel: when df1 extends to/past df2's last date, append a factor-1 row
# at df1's maximum date so every df1 row still finds a forward match.
# (DataFrame.append was removed in pandas 2.0 — use pd.concat instead.)
if not df1.date.max() < df2.date.max():
    sentinel = pd.DataFrame({'date': [pd.to_datetime(df1.date.max())],
                             'factor': [1]})
    rev = pd.concat([rev, sentinel], ignore_index=True)
# NOTE(review): merge_asof matches equal dates too (allow_exact_matches
# defaults to True) while the apply solution uses a strict '>' — results
# can differ when a df1 date coincides with a df2 date.
df3 = pd.merge_asof(df1.assign(date=pd.to_datetime(df1.date)), rev,
                    'date', direction='forward')
df3.factor *= df3.qty
df3.rename(columns={'factor': 'factored_qty'}, inplace=True)
TIMING for larger dataframes (df1 200 rows, df2 100 rows)
import pandas as pd
import numpy as np

n = 100
np.random.seed(0)

def _random_dates(span, count):
    # `count` distinct days drawn from the `span` days before 2020-06-01,
    # returned as ascending 'YYYY-MM-DD' strings (offsets sorted descending
    # because larger offsets are earlier dates).
    offsets = np.sort(np.random.choice(span, count, False))[::-1]
    return [(pd.Timestamp('2020-06-01') - pd.Timedelta(o, 'D')).strftime('%Y-%m-%d')
            for o in offsets]

# Fixture frames; the benchmarked methods operate on deep copies (see setup).
df1_ = pd.DataFrame({'date': _random_dates(200 * n, 2 * n),
                     'qty': np.random.randint(1, 20, 2 * n)})
df2_ = pd.DataFrame({'date': _random_dates(100 * n, n),
                     'factor': np.random.randint(1, 10, n)})
def setup():
    """Reset the global working frames to fresh deep copies of the fixtures."""
    global df1, df2
    df1, df2 = df1_.copy(True), df2_.copy(True)
def method_apply():
    """Row-wise apply: qty times the product of all df2 factors dated after the row."""
    def factored(row):
        later = df2.loc[df2.date > row.date, 'factor']
        return later.prod() * row.qty
    df1['factored_qty'] = df1.apply(factored, axis=1)
    return df1
def method_merge():
    """merge_asof approach: precompute reverse cumulative products of
    df2.factor, then forward-match each df1 row to the first df2 date on or
    after its own date.

    Returns a new frame with columns date, qty, factored_qty.
    NOTE(review): merge_asof matches equal dates too (allow_exact_matches
    defaults to True) while method_apply uses a strict '>' — results can
    differ when a df1 date coincides with a df2 date.
    """
    # factor[i] -> product of df2.factor from row i through the last row.
    rev = df2.assign(date=pd.to_datetime(df2.date),
                     factor=df2.factor.iloc[::-1].cumprod().iloc[::-1])
    # Sentinel: when df1 extends to/past df2's last date, append a factor-1
    # row at df1's maximum date so every df1 row finds a forward match.
    # (DataFrame.append was removed in pandas 2.0 — use pd.concat instead.)
    if not df1.date.max() < df2.date.max():
        sentinel = pd.DataFrame({'date': [pd.to_datetime(df1.date.max())],
                                 'factor': [1]})
        rev = pd.concat([rev, sentinel], ignore_index=True)
    df3 = pd.merge_asof(df1.assign(date=pd.to_datetime(df1.date)), rev,
                        'date', direction='forward')
    df3.factor *= df3.qty
    df3.rename(columns={'factor': 'factored_qty'}, inplace=True)
    return df3
from itertools import product
from collections import defaultdict
def method_dict():
    # Cross-join both frames row by row and collect, per df1 row, every
    # factor whose date lies strictly after that row's date.
    d = defaultdict(list)
    df1['date'] = pd.to_datetime(df1['date'])
    df2['date'] = pd.to_datetime(df2['date'])
    # to_numpy() yields [date, qty] / [date, factor] pairs; keys are
    # (timestamp, qty) tuples so duplicate dates with equal qty collapse.
    for (date1, qty), (date2, factor) in product(df1.to_numpy(), df2.to_numpy()):
        if date1 < date2:
            d[(date1, qty)].append(factor)
    # np.prod((s, *v)) folds qty into the factor product in a single call.
    # NOTE(review): df1 rows that collect no factors (date >= every df2
    # date) never enter `d` and are dropped from the result — correct only
    # when df2's last date exceeds df1's, as the answer's closing note says.
    outcome = {k: [s, np.prod((s, *v))] for (k, s), v in d.items()}
    return pd.DataFrame.from_dict(outcome, orient='index',
                                  columns=['qty', 'factored_qty']).reset_index()
def method_numpy():
    """Boolean-mask approach: row i of `mask` marks the df2 rows dated after df1 row i."""
    mask = df1.date.to_numpy()[:, None] < df2.date.to_numpy()
    # One factor product per df1 row, in row order.
    products = [np.prod(df2.loc[row, 'factor']) for row in mask]
    df1['factored_qty'] = df1['qty'] * products
    return df1
Results:
method_apply 220 ms ± 5.99 ms per loop
method_numpy 86.7 ms ± 2.51 ms per loop
method_dict 80.7 ms ± 436 µs per loop
method_merge 8.87 ms ± 68.1 µs per loop
Depending on the random factors in df2, their product may lead to an overflow; this was ignored here. method_dict only works correctly if the last date in df2 is greater than that of df1; this was also ignored for the timings.