0

Is there a way for Polars to rename all columns, not just at the top level, but including multiple levels of nested structs?

I need them to all be lowercase via str.lower

3
  • 1
    There is always a way to anything. How to Ask and minimal reproducible example Commented Sep 15 at 5:04
  • Sounds like you could use loops or a recursive approach... Commented Sep 15 at 5:05
  • Please provide a Minimal Reproducible Example that demonstrates your problem Commented Sep 15 at 5:34

4 Answers 4

2

There was a recent PR which suggests there will be an API for this in the future:

Until then it seems you need to manually walk the schema and unpack/rebuild the nested structures with the new names.

def unpack_dtype(expr, dtype):
    """Build an expression that lowercases ``expr``'s name and nested field names.

    The function recurses through ``dtype`` and rebuilds ``expr`` level by
    level:

    * Struct: each field is re-derived from ``pl.field(<lowercased name>)``
      and recursed into, after the struct's own field names have been mapped
      through ``str.lower``.
    * List: the inner dtype is processed per element via ``list.eval``.
    * Array: the array is exploded, the inner dtype is processed, and the
      result is reshaped back to ``(-1, dtype.size)``.

    Any other dtype is passed through untouched. In every case the output
    name of the returned expression is lowercased.

    Args:
        expr: Polars expression producing values of type ``dtype``.
        dtype: Polars data type describing ``expr``.

    Returns:
        The rebuilt Polars expression.
    """
    if dtype == pl.Struct:
        lowered_fields = []
        for field_name, field_dtype in dtype.to_schema().items():
            # Fields are looked up by their lowercased name because the
            # struct's field names are lowercased (map_fields) before
            # with_fields is applied below.
            field_expr = pl.field(field_name.lower())
            lowered_fields.append(unpack_dtype(field_expr, field_dtype))
        with_lower_keys = expr.name.map_fields(str.lower)
        expr = with_lower_keys.struct.with_fields(lowered_fields)
    elif dtype == pl.List:
        per_element = unpack_dtype(pl.element(), dtype.inner)
        expr = expr.list.eval(per_element)
    elif dtype == pl.Array:
        flattened = unpack_dtype(expr.arr.explode(), dtype.inner)
        # Restore the fixed-width layout that explode() removed.
        expr = flattened.reshape((-1, dtype.size))
    return expr.name.to_lowercase()
# Example frame with upper-case names at every nesting level:
#   "A" - struct nested four levels deep, ending in lists of structs
#   "B" - plain integer column
#   "C" - plain string column
#   "D" - list of structs, cast below to a fixed-size Array of width 1
df = pl.DataFrame({
    "A":[
        {"A":{"B":{"C": {"D": [{"E":1}],"F":[{"G":{"H":[2]}}]}}}},
        {"A":{"B":{"C": {"D": [{"E":3}],"F":[{"G":{"H":[4]}}, {"G": {"H": [5]}}]}}}},
    ],
    "B": [67, 89],
    "C": ["foo", "bar"],
    "D": [
        [{"DEF": 10, "GHI": {"JKL": [{"MN": 11}]}}],
        [{"DEF": 12, "GHI": {"JKL": [{"MN": 13}]}}]
    ]
}).cast({
    "D": pl.Array(pl.Struct({"DEF": pl.Int64, "GHI": pl.Struct({"JKL": pl.List(pl.Struct({"MN": pl.Int64}))})}), 1)
})
# Build one renaming expression per top-level column from the frame's own
# schema, then inspect the schema of the result.
df.select(
    unpack_dtype(pl.col(name), dtype)
    for name, dtype in df.schema.items()
).schema
# Resulting schema (every column and field name is lowercase):
# Schema([('a',
#          Struct({'a': Struct({'b': Struct({'c': Struct({'d': List(Struct({'e': Int64})), 'f': List(Struct({'g': Struct({'h': List(Int64)})}))})})})})),
#         ('b', Int64),
#         ('c', String),
#         ('d',
#          Array(Struct({'def': Int64, 'ghi': Struct({'jkl': List(Struct({'mn': Int64}))})}), shape=(1,)))])
Sign up to request clarification or add additional context in comments.

Comments

1

Let's say you have a dataframe like the following, where the column names are not all lowercase:

import polars as pl
import polars.selectors as cs

# Example frame whose column names (and one struct field, "B") use mixed case.
# NOTE(review): "col4" is given as a single dict rather than a list of three
# dicts; the printed result further down shows its values repeated on every
# row - confirm your Polars version accepts this form.
df = pl.DataFrame({
    "Col1":[1,2,3],
    "COL2":[1,2,3],
    "CoL3":[1,2,3],
    "col4": {"a":"1","B":"2","c":"3"}
})

# Unnest the struct: every struct column selected by cs.struct() is replaced
# by its fields as top-level columns.
unnested_df = df.unnest(cs.struct())

You can get the column names with .columns, convert them into a Polars Series, and then use the method .str.to_lowercase() as you requested.

# Wrap the current column labels in a Series so the string namespace can
# lowercase all of them in one call, then assign the result back as the
# frame's column names.
column_labels = pl.Series(unnested_df.columns)
new_columns = column_labels.str.to_lowercase()
unnested_df.columns = new_columns
print(unnested_df)

shape: (3, 6)
┌──────┬──────┬──────┬─────┬─────┬─────┐
│ col1 ┆ col2 ┆ col3 ┆ a   ┆ b   ┆ c   │
│ ---  ┆ ---  ┆ ---  ┆ --- ┆ --- ┆ --- │
│ i64  ┆ i64  ┆ i64  ┆ str ┆ str ┆ str │
╞══════╪══════╪══════╪═════╪═════╪═════╡
│ 1    ┆ 1    ┆ 1    ┆ 1   ┆ 2   ┆ 3   │
│ 2    ┆ 2    ┆ 2    ┆ 1   ┆ 2   ┆ 3   │
│ 3    ┆ 3    ┆ 3    ┆ 1   ┆ 2   ┆ 3   │
└──────┴──────┴──────┴─────┴─────┴─────┘

Keep in mind that this will not work if your column names are not unique.

Comments

1

Here's a recursive way to deal with nested lists and structs. First make these two functions:

import polars as pl
from typing import cast




def do_struct(strct: pl.Struct, place: pl.Expr) -> pl.Expr:
    """Rebuild a struct expression with all field names lowercased.

    Each field of ``strct`` is read out of ``place`` and re-aliased to its
    lowercased name. Struct fields are rebuilt recursively, and list fields
    whose inner dtype is a struct are rebuilt element by element.

    Args:
        strct: The struct dtype that describes ``place``.
        place: Expression that evaluates to the struct being rebuilt, for
            example ``pl.col(...)``, ``pl.element()`` or a ``struct.field``
            accessor.

    Returns:
        A ``pl.struct(...)`` expression with one child per field of
        ``strct``, each aliased to the lowercased field name.
    """
    strct_children = []

    for field in strct.fields:
        # Every branch starts from the same accessor: the field as it sits
        # inside the parent struct.
        child = place.struct.field(field.name)

        if field.dtype == pl.Struct:
            child = do_struct(cast(pl.Struct, field.dtype), child)
        elif (
            field.dtype == pl.List
            and (new_strct := cast(pl.List, field.dtype).inner) == pl.Struct
        ):
            # list.eval must be applied to the list *field*, not to the
            # parent struct expression: `place` itself is a struct, so
            # calling `.list` on it directly is a type error.
            child = child.list.eval(
                do_struct(cast(pl.Struct, new_strct), pl.element())
            )

        strct_children.append(child.alias(field.name.lower()))
    return pl.struct(strct_children)


def make_names_lowercase(df: pl.DataFrame) -> pl.DataFrame:
    """Return ``df`` with column names and nested struct field names lowercased.

    Struct columns, and list columns whose inner dtype is a struct, are
    rebuilt through ``do_struct``; every other column is only re-aliased.

    Args:
        df: The frame to rename.

    Returns:
        A new frame produced by ``df.select`` with one expression per
        original column.
    """

    def lowered(column: str, dtype) -> pl.Expr:
        # Start from the plain column and wrap it only when it holds structs.
        source = pl.col(column)
        if dtype == pl.Struct:
            source = do_struct(cast(pl.Struct, dtype), source)
        elif dtype == pl.List and (inner := cast(pl.List, dtype).inner) == pl.Struct:
            element_expr = do_struct(cast(pl.Struct, inner), pl.element())
            source = source.list.eval(element_expr)
        return source.alias(column.lower())

    return df.select([lowered(c, d) for c, d in df.schema.items()])

Then when you have a nested df you do

# Returns a new DataFrame (the function ends in df.select); the result is
# not bound to a name on this line.
make_names_lowercase(df)

This works by traversing the schema and rebuilding the expressions that reproduce the existing schema, except that each one is aliased to its lowercased name via .lower().

Comments

0

Multiple levels of structs will require manual recursion, but if you're OK with renaming just the top-level columns and the fields of top-level structs, then:

import polars as pl
import polars.selectors as cs

# NOTE: `df` is not defined in this snippet; it is assumed to exist already.
df.select(
    # Columns that are not structs: lowercase the column name only.
    cs.exclude(cs.struct()).name.to_lowercase(),
    # Struct columns: lowercase the column name and map str.lower over the
    # struct's field names (presumably one level only - deeper nesting needs
    # explicit recursion).
    cs.struct().name.to_lowercase().name.map_fields(str.lower),
)

Comments

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.