Short version: I need help applying someone else's groupby class on multiple pandas columns and with more complicated functions.
Long version: Someone else (Elizabeth Santorella) wrote a python class to significantly speed up groupby and groupby-apply operations and wrote instructions for it (links below). Note: the code featured in the blog post is not the same as the code on the github page.
https://github.com/esantorella/hdfe/blob/master/hdfe/groupby.py
http://esantorella.com/2016/06/16/groupby/
The class works very well for applying simple user-defined functions on groups in one column (see my code using the class below). I have two issues:
I cannot figure out how to use this groupby class groupings on multiple columns.
I want to use this groupby class on more complicated functions that take more arguments than just one column value, but I cannot get a dummy version of such a function to work either.
This code seems great and has some documentation, I'm just not educated enough to use it. I have edited this question to include a simple squaring function (squareit) and a dummy sklearn regression function (dummypredict). The squareit function will work with the Groupby class with one categorical column but not with two. The dummypredict function is not working with the Groupby class at all. I appreciate any help, and this is what I have tried so far.
# import
import numpy as np
import pandas as pd
from sklearn import linear_model
import time
# these are required for Groupby class
from typing import Union, Tuple, Callable, Any
# timing code setup
time.clock = time.time
# choose regressor
regr = linear_model.LinearRegression()
# groubpy class definition written by E. Santorella
class Groupby:
def __init__(self, keys: Union[np.ndarray, pd.Series]):
"""
:param keys: List of group identifiers. Both __init__ and apply will run
much faster if keys is already sorted.
"""
self.keys = keys
try:
already_sorted = np.issubdtype(keys.dtype, np.number) and (
np.all(np.diff(keys) >= 0)
)
except ValueError:
already_sorted = False
if already_sorted:
keys = np.squeeze(keys)
if keys.ndim > 1:
raise ValueError("keys should be 1-dimensional")
self.already_sorted = True
new_idx = np.concatenate(([1], np.diff(keys) != 0))
self.first_occurrences = np.where(new_idx)[0]
self.keys_as_int: np.ndarray = np.cumsum(new_idx) - 1
assert isinstance(self.keys_as_int, np.ndarray)
self.n_keys = self.keys_as_int[-1] + 1
else:
self.already_sorted = False
_, self.first_occurrences, self.keys_as_int = np.unique(
keys, return_index=True, return_inverse=True
)
self.n_keys = max(self.keys_as_int) + 1
self.indices = self._set_indices()
def _set_indices(self):
if self.already_sorted:
indices = [
slice(i, j)
for i, j in zip(self.first_occurrences[:-1], self.first_occurrences[1:])
]
indices.append(slice(self.first_occurrences[-1], len(self.keys_as_int)))
indices = np.array(indices)
else:
indices = [[] for _ in range(self.n_keys)]
for i, k in enumerate(self.keys_as_int):
indices[k].append(i)
indices = np.array([np.array(elt) for elt in indices])
return indices
def apply(
self,
function_: Callable[[np.ndarray], Any],
array: Union[np.ndarray, pd.Series],
broadcast: bool = True,
shape: Tuple = None,
order: str = "c",
as_dataframe: bool = False,
):
"""
Applies a function to each group, where groups are defined by self.keys_as_int
(or, equivalently, as the argument of __init__.)
If broadcast=True, first dimension of output will equal first dimension of
"array", as in Pandas "transform".
If broadcast=False, first dimension of output equals self.n_keys, as in Pandas
"groupby".
:param function_: function to be applied to each group
:param array: np.ndarray or similar. Should have same first dimension as
self.keys_as_int.
:param broadcast: bool
:param shape: Shape of output. Can be up to 3-dimensional.
First dimension must be array.shape[0] (if broadcast=True)
or self.n_keys (if broadcast=False). Default is for output to be
one-dimensional.
:param order: Should output be c-ordered or fortran-ordered?
:param as_dataframe: if False, returns output as ndarray; if True, returns
output
as DataFrame with keys as indices
:return:
"""
if isinstance(array, pd.Series):
names = [array.name]
array = np.asarray(array)
elif isinstance(array, pd.DataFrame):
names = array.columns
array = array.values
else:
names = [None]
assert isinstance(array, np.ndarray)
if broadcast:
result = np.zeros(array.shape[0] if shape is None else shape, order=order)
assert result.shape[0] == array.shape[0]
# np.take doesn't allow slice arguments, so this has to be more verbose
# than when not already sorted
if self.already_sorted:
if array.ndim == 1:
for idx in self.indices:
result[idx] = function_(array[idx])
elif array.ndim == 2:
for idx in self.indices:
result[idx] = function_(array[idx, :])
elif array.ndim == 3:
for idx in self.indices:
result[idx] = function_(array[idx, :, :])
else:
raise NotImplementedError("Can't have more than 3 dims")
else:
for idx in self.indices:
result[idx] = function_(np.take(array, idx, 0))
if as_dataframe:
return pd.DataFrame(index=self.keys, data=result)
return result
result = np.zeros(self.n_keys if shape is None else shape, order=order)
assert result.shape[0] == self.n_keys
if self.already_sorted:
if array.ndim == 1:
for k, idx in enumerate(self.indices):
result[k] = function_(array[idx])
elif array.ndim == 2:
for k, idx in enumerate(self.indices):
result[k] = function_(array[idx, :])
elif array.ndim == 3:
for k, idx in enumerate(self.indices):
result[k] = function_(array[idx, :, :])
else:
raise NotImplementedError("Can't have more than 3 dims")
else:
for first_occurrence, idx in zip(self.first_occurrences, self.indices):
result[self.keys_as_int[first_occurrence]] = function_(
np.take(array, idx, 0)
)
if as_dataframe:
return pd.DataFrame(
index=self.keys[self.first_occurrences], data=result, columns=names
)
return result
# a simple squaring function with one input
def squareit(x):
y = x*x
return y
# dummy regression function with three inputs
def dummypredict(data_in, yvar, xvars):
# create df_train from data_in based on actuals not being null
df_train = data_in[data_in['actual'].notnull()]
# create df_predict from data_in based on actuals being null
df_predict = data_in[data_in['actual'].isnull()]
# fit the coefficients using the regressor
regr.fit(df_train[xvars], df_train[yvar])
# we create the prediction matrix
X_predict = df_predict[xvars]
# we assign predicted values to
df_predict['forecast'] = regr.predict(X_predict)
# concatenate all data
data_full = pd.concat([df_train, df_predict], axis = 0, ignore_index = True)
#return coef
return data_full
# original dataframe, not used for functions:
n_obs = 10**4
n_categories = 10**3
first_cat = np.random.choice(n_categories, n_obs)
np.random.seed(2016)
y = np.random.normal(0, 1, n_obs)
df = pd.DataFrame({'first cat': first_cat,
'actual': y})
# decimal level of time reporting
n_decimals = 5
# dataframe setup code, uses two categories
n_obs = 10**5
np.random.seed(20416)
# list of category labels per category
first_cat_ls = ['Albany', 'Augusta','Ashland','Angoon','Aniak','Anvik','Appleton',
'Arcata', 'Arctic', 'Asheville', 'Abilene', 'Anaktueuk', 'Aspen',
'Athens', 'Atka', 'Atlanta', 'Atlantic', 'Atqasuk','Anchorage',
'Austin']
second_cat_ls = ['Cat', 'Dog', 'Bear', 'Hog', 'Parrot','Mongoose','Tiger', 'Lion',
'Aardvark','Ant','Roach','Pigeon','Spider','Elk','Emu', 'Mouse',
'Elephant','Human','Muskrat','Horse','Raccoon','Hummingbird',
'Sloth', 'Termite',]
# make category arrays
first_category = np.random.choice(first_cat_ls, n_obs)
second_category = np.random.choice(second_cat_ls, n_obs)
# make array of actuals data
actual = abs(np.random.normal(1, 10, n_obs))
# creating two features randomly
feature_1 = abs(np.random.normal(2, 7, n_obs))
feature_2 = abs(np.random.normal(1, 7, n_obs))
# creating the dataframe
df = pd.DataFrame({'first_cat': first_category,
'second_cat': second_category,
'feat1' : feature_1,
'feat2' : feature_2,
'actual': actual})
# creating two features again for variability
feature_1 = abs(np.random.normal(1, 3, n_obs))
feature_2 = abs(np.random.normal(1, 3, n_obs))
# creating another data frame to "predict" with
df2 = df.copy(deep=True)
# nulling out the actuals because df2 is the subset
# that will have a forecast made
df2['actual'] = np.nan
# create new df to feed into regression code made of df1 & 2
# groups are same in both and are fitted/predicted together
df = pd.concat([df, df2], axis = 0)
# sorting
df = df.sort_values(['first_cat','second_cat','actual'])
#%%
# simple squaring function with Groupby class works & creates warning (below):
start = time.clock()
x = Groupby(df['first_cat']).apply(squareit, df['actual'], broadcast=True)
# VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences
# (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes)
# is deprecated. If you meant to do this, you must specify 'dtype=object'
# when creating the ndarray. indices = np.array([np.array(elt) for elt in indices])
print('time to compute group squares with Groupby class: {0}'\
.format(round(time.clock() - start, n_decimals)))
# lamda apply method - slightly slower
start = time.clock()
df.groupby(['first_cat']).apply(lambda x: squareit(df['actual']))
print('time to compute group squares with lambda method on groupby apply: {0}'\
.format(round(time.clock() - start, n_decimals)))
# simple squaring function with native pandas - slightly slower
start = time.clock()
x = df.groupby(['first_cat'])['actual'].transform(squareit)
print('time to compute group squares with Pandas transform: {0}'\
.format(round(time.clock() - start, n_decimals)))
# dummy regression function with standard pandas / sklearn works
start = time.clock()
x = df.groupby(['first_cat']).apply(dummypredict, 'actual', ['feat1','feat2'])
print('time to compute group regressions with standard Pandas / sklearn: {0}'\
.format(round(time.clock() - start, n_decimals)))
# simple squaring function with Groupby class does not work with two categoricals:
x = Groupby(df[['first_cat','second_cat']]).apply(squareit, df['actual'], broadcast=True)
# AttributeError: 'DataFrame' object has no attribute 'dtype'
# using the dummy regression function with Groupby class does not work
x = Groupby(df['first_cat']).apply(dummypredict, 'actual', ['feat1','feat2'], broadcast=True)
# apply() got multiple values for argument 'broadcast'
# this also did not work
x = Groupby(df['first_cat']).apply(dummypredict('actual', ['feat1','feat2']), broadcast=True)
# dummypredict() missing 1 required positional argument: 'xvars'
Groupbycode, we're looking for the "complicated functions" you're using. (The example functions you've shown can be implemented without using.applyat all - so they don't help as much.)