Let's start with this data:
import polars as pl
train_pl = pl.DataFrame(
{
"B_14": pl.int_range(101, eager=True),
"B_15": pl.int_range(101, eager=True).reverse(),
"B_16": pl.int_range(101, eager=True),
"other": pl.int_range(101, eager=True).reverse(),
}
)
train_pl
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╪═══════╡
│ 0 ┆ 100 ┆ 0 ┆ 100 │
│ 1 ┆ 99 ┆ 1 ┆ 99 │
│ 2 ┆ 98 ┆ 2 ┆ 98 │
│ 3 ┆ 97 ┆ 3 ┆ 97 │
│ 4 ┆ 96 ┆ 4 ┆ 96 │
│ … ┆ … ┆ … ┆ … │
│ 96 ┆ 4 ┆ 96 ┆ 4 │
│ 97 ┆ 3 ┆ 97 ┆ 3 │
│ 98 ┆ 2 ┆ 98 ┆ 2 │
│ 99 ┆ 1 ┆ 99 ┆ 1 │
│ 100 ┆ 0 ┆ 100 ┆ 0 │
└──────┴──────┴──────┴───────┘
All columns
To replace all values above the 0.99 quantile and below the 0.01 quantile, simultaneously, and for all columns in parallel:
(
train_pl
.with_columns(
pl.when(
(pl.all() > pl.all().quantile(0.99)) |
(pl.all() < pl.all().quantile(0.01))
)
.then(float("nan"))
.otherwise(pl.all())
.name.keep()
)
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 │
╞══════╪══════╪══════╪═══════╡
│ NaN ┆ NaN ┆ NaN ┆ NaN │
│ 1.0 ┆ 99.0 ┆ 1.0 ┆ 99.0 │
│ 2.0 ┆ 98.0 ┆ 2.0 ┆ 98.0 │
│ 3.0 ┆ 97.0 ┆ 3.0 ┆ 97.0 │
│ 4.0 ┆ 96.0 ┆ 4.0 ┆ 96.0 │
│ … ┆ … ┆ … ┆ … │
│ 96.0 ┆ 4.0 ┆ 96.0 ┆ 4.0 │
│ 97.0 ┆ 3.0 ┆ 97.0 ┆ 3.0 │
│ 98.0 ┆ 2.0 ┆ 98.0 ┆ 2.0 │
│ 99.0 ┆ 1.0 ┆ 99.0 ┆ 1.0 │
│ NaN ┆ NaN ┆ NaN ┆ NaN │
└──────┴──────┴──────┴───────┘
Named columns
To restrict the operation to only some columns:
only_these = ['B_14', 'B_15', 'other']
(
train_pl
.with_columns(
pl.when(
(pl.col(only_these) > pl.col(only_these).quantile(0.99)) |
(pl.col(only_these) < pl.col(only_these).quantile(0.01))
)
.then(float("nan"))
.otherwise(pl.col(only_these))
.name.keep()
)
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ i64 ┆ f64 │
╞══════╪══════╪══════╪═══════╡
│ NaN ┆ NaN ┆ 0 ┆ NaN │
│ 1.0 ┆ 99.0 ┆ 1 ┆ 99.0 │
│ 2.0 ┆ 98.0 ┆ 2 ┆ 98.0 │
│ 3.0 ┆ 97.0 ┆ 3 ┆ 97.0 │
│ 4.0 ┆ 96.0 ┆ 4 ┆ 96.0 │
│ … ┆ … ┆ … ┆ … │
│ 96.0 ┆ 4.0 ┆ 96 ┆ 4.0 │
│ 97.0 ┆ 3.0 ┆ 97 ┆ 3.0 │
│ 98.0 ┆ 2.0 ┆ 98 ┆ 2.0 │
│ 99.0 ┆ 1.0 ┆ 99 ┆ 1.0 │
│ NaN ┆ NaN ┆ 100 ┆ NaN │
└──────┴──────┴──────┴───────┘
Selecting by regex expression
I should also mention that you can use regex expressions in polars.col. Just remember that the regex expression must start with ^ and end with $. (These cannot be omitted.)
regex = r"^B_.*$"
(
train_pl
.with_columns(
pl.when(
(pl.col(regex) > pl.col(regex).quantile(0.99)) |
(pl.col(regex) < pl.col(regex).quantile(0.01))
)
.then(float("nan"))
.otherwise(pl.col(regex))
.name.keep()
)
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ i64 │
╞══════╪══════╪══════╪═══════╡
│ NaN ┆ NaN ┆ NaN ┆ 100 │
│ 1.0 ┆ 99.0 ┆ 1.0 ┆ 99 │
│ 2.0 ┆ 98.0 ┆ 2.0 ┆ 98 │
│ 3.0 ┆ 97.0 ┆ 3.0 ┆ 97 │
│ 4.0 ┆ 96.0 ┆ 4.0 ┆ 96 │
│ … ┆ … ┆ … ┆ … │
│ 96.0 ┆ 4.0 ┆ 96.0 ┆ 4 │
│ 97.0 ┆ 3.0 ┆ 97.0 ┆ 3 │
│ 98.0 ┆ 2.0 ┆ 98.0 ┆ 2 │
│ 99.0 ┆ 1.0 ┆ 99.0 ┆ 1 │
│ NaN ┆ NaN ┆ NaN ┆ 0 │
└──────┴──────┴──────┴───────┘
Selecting by DataType
You can also select which columns by datatype.
these_types = [pl.Int64, pl.Float64]
(
train_pl
.with_columns(
pl.when(
(pl.col(these_types) > pl.col(these_types).quantile(0.99)) |
(pl.col(these_types) < pl.col(these_types).quantile(0.01))
)
.then(float("nan"))
.otherwise(pl.col(these_types))
.name.keep()
)
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 │
╞══════╪══════╪══════╪═══════╡
│ NaN ┆ NaN ┆ NaN ┆ NaN │
│ 1.0 ┆ 99.0 ┆ 1.0 ┆ 99.0 │
│ 2.0 ┆ 98.0 ┆ 2.0 ┆ 98.0 │
│ 3.0 ┆ 97.0 ┆ 3.0 ┆ 97.0 │
│ 4.0 ┆ 96.0 ┆ 4.0 ┆ 96.0 │
│ … ┆ … ┆ … ┆ … │
│ 96.0 ┆ 4.0 ┆ 96.0 ┆ 4.0 │
│ 97.0 ┆ 3.0 ┆ 97.0 ┆ 3.0 │
│ 98.0 ┆ 2.0 ┆ 98.0 ┆ 2.0 │
│ 99.0 ┆ 1.0 ┆ 99.0 ┆ 1.0 │
│ NaN ┆ NaN ┆ NaN ┆ NaN │
└──────┴──────┴──────┴───────┘
By Exclusion
Sometimes it's easier to specify the columns that you do not want. The above methods will work with polars.exclude, so that you can exclude columns that fit some criteria. For example, to select all columns except columns of strings (polars.String):
these_types = [pl.String]
(
train_pl
.with_columns(
pl.when(
(pl.exclude(these_types) > pl.exclude(these_types).quantile(0.99)) |
(pl.exclude(these_types) < pl.exclude(these_types).quantile(0.01))
)
.then(float("nan"))
.otherwise(pl.exclude(these_types))
.name.keep()
)
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 │
╞══════╪══════╪══════╪═══════╡
│ NaN ┆ NaN ┆ NaN ┆ NaN │
│ 1.0 ┆ 99.0 ┆ 1.0 ┆ 99.0 │
│ 2.0 ┆ 98.0 ┆ 2.0 ┆ 98.0 │
│ 3.0 ┆ 97.0 ┆ 3.0 ┆ 97.0 │
│ 4.0 ┆ 96.0 ┆ 4.0 ┆ 96.0 │
│ … ┆ … ┆ … ┆ … │
│ 96.0 ┆ 4.0 ┆ 96.0 ┆ 4.0 │
│ 97.0 ┆ 3.0 ┆ 97.0 ┆ 3.0 │
│ 98.0 ┆ 2.0 ┆ 98.0 ┆ 2.0 │
│ 99.0 ┆ 1.0 ┆ 99.0 ┆ 1.0 │
│ NaN ┆ NaN ┆ NaN ┆ NaN │
└──────┴──────┴──────┴───────┘
Or for example, to exclude any column that beings with B_:
regex = r"^B_.*$"
(
train_pl
.with_columns(
pl.when(
(pl.exclude(regex) > pl.exclude(regex).quantile(0.99)) |
(pl.exclude(regex) < pl.exclude(regex).quantile(0.01))
)
.then(float("nan"))
.otherwise(pl.exclude(regex))
.name.keep()
)
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ f64 │
╞══════╪══════╪══════╪═══════╡
│ 0 ┆ 100 ┆ 0 ┆ NaN │
│ 1 ┆ 99 ┆ 1 ┆ 99.0 │
│ 2 ┆ 98 ┆ 2 ┆ 98.0 │
│ 3 ┆ 97 ┆ 3 ┆ 97.0 │
│ 4 ┆ 96 ┆ 4 ┆ 96.0 │
│ … ┆ … ┆ … ┆ … │
│ 96 ┆ 4 ┆ 96 ┆ 4.0 │
│ 97 ┆ 3 ┆ 97 ┆ 3.0 │
│ 98 ┆ 2 ┆ 98 ┆ 2.0 │
│ 99 ┆ 1 ┆ 99 ┆ 1.0 │
│ 100 ┆ 0 ┆ 100 ┆ NaN │
└──────┴──────┴──────┴───────┘