Is there a way for Polars to rename all columns, not just at the top level, but including multiple levels of nested structs?
I need them to all be lowercase via str.lower
Is there a way for Polars to rename all columns, not just at the top level, but including multiple levels of nested structs?
I need them to all be lowercase via str.lower
There was a recent PR which suggests there will be an API for this in the future:
Until then it seems you need to manually walk the schema and unpack/rebuild the nested structures with the new names.
def unpack_dtype(expr, dtype):
    """Rebuild ``expr`` so that its own name and nested field names are lower-cased.

    The dtype is walked recursively:

    * Struct: each field is re-selected via ``pl.field(<lowercased name>)``
      and processed with its own dtype, after the struct's field names have
      been mapped through ``str.lower``.
    * List: the inner dtype is processed against ``pl.element()`` and applied
      with ``list.eval``.
    * Array: the array is exploded, the inner dtype is processed, and the
      result is reshaped back to ``(-1, dtype.size)``.

    Any other dtype is left untouched apart from the final name change.

    Args:
        expr: Expression producing the column or nested value to rewrite.
        dtype: Polars dtype of ``expr`` (as found in a schema).

    Returns:
        The rewritten expression with ``.name.to_lowercase()`` applied.
    """
    if dtype == pl.Struct:
        renamed_children = []
        for child_name, child_dtype in dtype.to_schema().items():
            # Fields are looked up by their already-lowercased name because
            # map_fields(str.lower) is applied to the struct first.
            child_expr = pl.field(child_name.lower())
            renamed_children.append(unpack_dtype(child_expr, child_dtype))
        lowered_struct = expr.name.map_fields(str.lower)
        expr = lowered_struct.struct.with_fields(renamed_children)
    elif dtype == pl.List:
        expr = expr.list.eval(unpack_dtype(pl.element(), dtype.inner))
    elif dtype == pl.Array:
        flattened = expr.arr.explode()
        expr = unpack_dtype(flattened, dtype.inner).reshape((-1, dtype.size))
    return expr.name.to_lowercase()
# Example frame: a deeply nested struct column ("A"), two plain columns
# ("B", "C"), and a column of struct lists ("D") that is cast to a
# fixed-size Array of structs below.
df = pl.DataFrame(
    {
        "A": [
            {"A": {"B": {"C": {"D": [{"E": 1}], "F": [{"G": {"H": [2]}}]}}}},
            {
                "A": {
                    "B": {
                        "C": {
                            "D": [{"E": 3}],
                            "F": [{"G": {"H": [4]}}, {"G": {"H": [5]}}],
                        }
                    }
                }
            },
        ],
        "B": [67, 89],
        "C": ["foo", "bar"],
        "D": [
            [{"DEF": 10, "GHI": {"JKL": [{"MN": 11}]}}],
            [{"DEF": 12, "GHI": {"JKL": [{"MN": 13}]}}],
        ],
    }
).cast(
    {
        "D": pl.Array(
            pl.Struct(
                {
                    "DEF": pl.Int64,
                    "GHI": pl.Struct(
                        {"JKL": pl.List(pl.Struct({"MN": pl.Int64}))}
                    ),
                }
            ),
            1,
        )
    }
)

# Build one rewritten expression per top-level column and inspect the
# resulting schema.
df.select(
    [
        unpack_dtype(pl.col(column), column_dtype)
        for column, column_dtype in df.schema.items()
    ]
).schema
# Schema([('a',
# Struct({'a': Struct({'b': Struct({'c': Struct({'d': List(Struct({'e': Int64})), 'f': List(Struct({'g': Struct({'h': List(Int64)})}))})})})})),
# ('b', Int64),
# ('c', String),
# ('d',
# Array(Struct({'def': Int64, 'ghi': Struct({'jkl': List(Struct({'mn': Int64}))})}), shape=(1,)))])
Let's say you have a dataframe like the following, where the column names are not all lowercase:
import polars as pl
import polars.selectors as cs
# Three integer columns with mixed-case names plus one dict-valued column.
df = pl.DataFrame(
    {
        "Col1": [1, 2, 3],
        "COL2": [1, 2, 3],
        "CoL3": [1, 2, 3],
        "col4": {"a": "1", "B": "2", "c": "3"},
    }
)

# Unnest every struct column (selected by dtype) into top-level columns.
struct_columns = cs.struct()
unnested_df = df.unnest(struct_columns)
You can get the column names with .columns, convert them into a Polars Series, and then use the method .str.to_lowercase(), as you requested.
# Put the current column names in a Series so the string namespace can
# lower-case them in one call.
new_columns = pl.Series(unnested_df.columns).str.to_lowercase()

# Assign the lower-cased names back onto the frame and show the result.
unnested_df.columns = new_columns
print(unnested_df)
shape: (3, 6)
┌──────┬──────┬──────┬─────┬─────┬─────┐
│ col1 ┆ col2 ┆ col3 ┆ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ str ┆ str ┆ str │
╞══════╪══════╪══════╪═════╪═════╪═════╡
│ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 2 ┆ 3 │
│ 2 ┆ 2 ┆ 2 ┆ 1 ┆ 2 ┆ 3 │
│ 3 ┆ 3 ┆ 3 ┆ 1 ┆ 2 ┆ 3 │
└──────┴──────┴──────┴─────┴─────┴─────┘
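Keep in mind that this will not work if your column names are not unique (for example, if a struct field has the same name as a top-level column, or if two names differ only by case).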
Here's a recursive way to deal with nested lists and structs. First make these two functions:
import polars as pl
from typing import cast
def do_struct(strct: pl.Struct, place: pl.Expr) -> pl.Expr:
    """Rebuild a struct expression with every field name lower-cased.

    Walks ``strct.fields`` and, for each field, builds an expression that
    extracts it from ``place`` and aliases it to ``field.name.lower()``.
    Struct fields and list-of-struct fields are rebuilt recursively so that
    their inner field names are lower-cased as well. Every other field
    (including a list whose inner dtype is not a struct) is passed through
    with only its own name changed.

    Args:
        strct: Struct dtype describing the fields available on ``place``.
        place: Expression evaluating to the struct being rebuilt, e.g.
            ``pl.col(name)``, ``pl.element()`` inside ``list.eval``, or a
            ``.struct.field(...)`` taken from a parent struct.

    Returns:
        A ``pl.struct(...)`` expression over the renamed child expressions.
        The result is not aliased here; the caller names it.
    """
    strct_children = []
    for field in strct.fields:
        # Always start from the field itself: ``place`` is the enclosing
        # struct, so namespace calls such as ``.list.eval`` must be applied
        # to the extracted field rather than to ``place``.
        child = place.struct.field(field.name)
        if field.dtype == pl.Struct:
            child = do_struct(cast(pl.Struct, field.dtype), child)
        elif (
            field.dtype == pl.List
            and (new_strct := cast(pl.List, field.dtype).inner) == pl.Struct
        ):
            # Rebuild each list element (a struct) against ``pl.element()``.
            child = child.list.eval(
                do_struct(cast(pl.Struct, new_strct), pl.element())
            )
        strct_children.append(child.alias(field.name.lower()))
    return pl.struct(strct_children)
def make_names_lowercase(df: pl.DataFrame) -> pl.DataFrame:
    """Select every column of ``df`` under its lower-cased name.

    Struct columns, and list columns whose inner dtype is a struct, are
    rebuilt through ``do_struct`` so their nested field names are
    lower-cased too. All other columns are selected as-is and only renamed.

    Args:
        df: Frame whose schema is traversed.

    Returns:
        The result of ``df.select`` over the rebuilt column expressions.
    """
    lowered_columns = []
    for column_name, column_dtype in df.schema.items():
        column = pl.col(column_name)
        if column_dtype == pl.Struct:
            rebuilt = do_struct(cast(pl.Struct, column_dtype), column)
        elif (
            column_dtype == pl.List
            and cast(pl.List, column_dtype).inner == pl.Struct
        ):
            element_dtype = cast(pl.List, column_dtype).inner
            per_element = do_struct(cast(pl.Struct, element_dtype), pl.element())
            rebuilt = column.list.eval(per_element)
        else:
            rebuilt = column
        # Every branch ends with the same rename of the top-level column.
        lowered_columns.append(rebuilt.alias(column_name.lower()))
    return df.select(lowered_columns)
Then when you have a nested df you do
# make_names_lowercase returns the result of df.select(...); as written the
# return value is not bound to a name, so assign it if you need to keep it.
make_names_lowercase(df)
This works by traversing the schema and recreating the expressions that produce the existing schema, except that each expression is aliased to its original name passed through .lower().