I ran into a similar problem. My solution may be incomplete, as I have not tested every method of the pandas DataFrame class to verify how it behaves with my subclass. I am writing it down here in case it is useful to someone.
- Specify pd.DataFrame as the parent class
- Override the methods from pd.DataFrame so that they return a new instance of my subclass instead of an instance of pd.DataFrame.
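- Override `__getitem__` directly in the class body, wrapping the parent's result in the same manner as the other methods. It does not work when it is overwritten dynamically on the instance like the others, because Python looks up special methods such as `__getitem__` on the type rather than in the instance's `__dict__`.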
import pandas as pd
class MyDF(pd.DataFrame):
    """DataFrame subclass that re-wraps DataFrame results as ``MyDF``.

    Construction is inherited unchanged from ``pd.DataFrame``.  After
    ``overriding()`` has been called on an instance, every callable
    attribute of ``pd.DataFrame`` (except ``__getitem__``) is shadowed on
    that instance by a wrapper that passes any ``pd.DataFrame`` result
    through the module-level ``set_mydf`` helper.  ``__getitem__`` is
    handled by a regular method on the class instead.

    NOTE(review): pandas documents overriding the ``_constructor``
    property as the supported way to make methods return a subclass;
    consider it as a replacement for the per-instance wrapping below.
    """

    def __make_func(self, attrib):
        """Return a wrapper around the parent's attribute named ``attrib``.

        The wrapper forwards its arguments to the attribute as resolved
        on the parent class, and converts a ``pd.DataFrame`` result with
        ``set_mydf``; any other result is returned untouched.
        """

        def func(*args, **kwargs):
            # Resolve through super() at call time so the lookup skips
            # the wrapper stored under the same name in the instance dict.
            result = getattr(super(MyDF, self), attrib)(*args, **kwargs)
            if isinstance(result, pd.DataFrame):
                return set_mydf(result)
            return result

        return func

    def overriding(self) -> None:
        """Install the result-wrapping functions on this instance.

        Every name in ``dir(pd.DataFrame)`` whose class-level value is
        callable gets a wrapper stored in the instance ``__dict__``.
        """
        for attrib in dir(pd.DataFrame):
            # obj[key] looks __getitem__ up on the type, so an instance
            # entry would never be used; the class-level method covers it.
            if attrib == "__getitem__":
                continue
            if callable(getattr(pd.DataFrame, attrib)):
                self.__dict__[attrib] = self.__make_func(attrib)

    def __getitem__(self, key):
        """Index like ``pd.DataFrame`` but return DataFrame results as ``MyDF``.

        Non-DataFrame results (for example a single column as a
        ``pd.Series``) are returned as the parent produced them.
        """
        result = super().__getitem__(key)
        if isinstance(result, pd.DataFrame):
            return set_mydf(result)
        return result

    def operation(self):
        """Multiply columns ``"a"`` and ``"b"`` by 10 and return them as ``MyDF``.

        Prints the type of the frame before and after the computation so
        the intermediate type is visible.
        """
        print("-1-", type(self))
        scaled = self[["a", "b"]].apply(lambda x: x * 10)
        print("-2-", type(scaled))
        return set_mydf(scaled)
def set_mydf(*args, **kwargs) -> "MyDF":
    """Build a ``MyDF`` and install its per-instance method wrappers.

    All positional and keyword arguments are forwarded unchanged to the
    ``MyDF`` constructor; ``overriding()`` is then called on the new
    instance before it is returned.

    The return annotation is a string forward reference: a bare name in
    that position is evaluated when the ``def`` statement runs, and the
    previous annotation (``mydf``) named something that is not defined
    at that point, raising ``NameError``.
    """
    df = MyDF(*args, **kwargs)
    df.overriding()
    return df
My tests:
mydf = MyDF(data={"a":[0,1,2,3], "b":[4,5,6,7], "c":[8,9,10,11], "d":[12,13,14,15]})
print(mydf)
print(type(mydf))
a b c d
0 0 4 8 12
1 1 5 9 13
2 2 6 10 14
3 3 7 11 15
<class '__main__.MyDF'>
mydf.overriding()
# If we apply a method from pd.DataFrame, the returned result is the subclass instance, as wanted.
mydf = mydf.apply(lambda x: x*10)
print(mydf)
print(type(mydf))
a b c d
0 0 40 80 120
1 10 50 90 130
2 20 60 100 140
3 30 70 110 150
<class '__main__.MyDF'>
# Correct result when we change a value in a cell
mydf.loc[0, "a"] = 99
print(mydf)
print(type(mydf))
a b c d
0 99 40 80 120
1 10 50 90 130
2 20 60 100 140
3 30 70 110 150
<class '__main__.MyDF'>
# Correct when we add a column
mydf["e"] = [0]*mydf.shape[0]
print(mydf)
print(type(mydf))
a b c d e
0 99 40 80 120 0
1 10 50 90 130 0
2 20 60 100 140 0
3 30 70 110 150 0
<class '__main__.MyDF'>
# Correct with a custom function inside the class
mydf = mydf.operation()
print(mydf)
print(type(mydf))
-1- <class '__main__.MyDF'>
-2- <class 'pandas.core.frame.DataFrame'>
a b
0 990 400
1 100 500
2 200 600
3 300 700
<class '__main__.MyDF'>
# This method returns a pd.Series, that was predictable
mydf = mydf["a"].apply(lambda x: x/8)
print(mydf)
print(type(mydf))
0 123.75
1 12.50
2 25.00
3 37.50
Name: a, dtype: float64
<class 'pandas.core.series.Series'>
mydf = MyDF(data={"a":[0,1,2,3], "b":[4,5,6,7], "c":[8,9,10,11], "d":[12,13,14,15]})
print(mydf)
print(type(mydf))
# Correct thanks to the custom __getitem__
mydf = mydf[["a"]]
print(mydf)
print(type(mydf))
a
0 0
1 1
2 2
3 3
<class '__main__.MyDF'>