mask n-dimensional numpy array (to save memory)

Question

Suppose one has a high dimensional numpy array:

import numpy as np
# Dense array of zeros with shape (200, 200, 200); every entry is stored explicitly.
x = np.zeros((200, 200, 200))

of which only a contiguous sub-array* holds 'valid' entries; all other entries may be ignored (in this example, every entry equal to 1 is valid and the 0s may be ignored):

# Index expression selecting the 10x10x10 block spanning 100:110 along each axis.
sub_array = np.s_[100:110, 100:110, 100:110]
# Mark that block as the only 'valid' region; everything else stays 0.
x[sub_array] = 1

How can I represent x in python such that it integrates with other numpy arrays (slicing, indexing etc) but does not waste memory on all the invalid entries?

*I'd also be interested in a solution where the valid subset is not necessarily a contiguous sub-array, if possible.

Some ideas which come close but don't do it: (1) SciPy's sparse matrices are only 2-D (docs.scipy.org/doc/scipy/reference/sparse.html); (2) NumPy's masked-array module integrates beautifully, but doesn't actually save any memory (docs.scipy.org/doc/numpy-1.15.0/reference/…) — eretmochelys
– eretmochelys, Commented Feb 7, 2019 at 13:29

javidcf · Accepted Answer · 2019-02-07 15:12:29Z

For several use cases, you may get away with a smartly crafted class implementing a __array__ method. Here is a sketch of one possible implementation like that:

import numpy as np

class PaddedArray:
    """Zero-padded N-dimensional array that stores only its non-padding core.

    The logical array is ``np.pad(arr, padding, 'constant')``, but only
    ``arr`` and the per-axis padding widths are kept in memory.  The full
    array is materialised on demand through ``__array__`` (e.g. by
    ``np.asarray`` or when the object is passed to a NumPy function).

    Basic indexing with integers, unit-step slices, ``np.newaxis`` and a
    single ``Ellipsis`` is supported and returns another ``PaddedArray``.
    Slices with a step other than 1 are not supported.
    """

    def __init__(self, arr, padding):
        """Store the core array and its padding.

        Args:
            arr: array-like holding the non-padding values (copied).
            padding: one ``(before, after)`` pair of pad widths per axis
                of ``arr``.

        Raises:
            ValueError: if the number of pairs differs from ``arr.ndim`` or
                an entry is not a pair.
        """
        self._arr = np.array(arr)
        self._pad = [tuple(map(int, p)) for p in padding]
        # Explicit raises instead of asserts: asserts vanish under ``-O``.
        if self._arr.ndim != len(self._pad):
            raise ValueError(
                'expected %d padding pairs, got %d'
                % (self._arr.ndim, len(self._pad)))
        if any(len(p) != 2 for p in self._pad):
            raise ValueError('each padding entry must be a (before, after) pair')

    def __array__(self, *args, **kwargs):
        """Materialise the full padded array as an ``np.ndarray``."""
        ar = np.asarray(self._arr, *args, **kwargs)
        return np.pad(ar, self._pad, 'constant')

    def _expand_ellipsis(self, idx):
        """Return ``idx`` as a tuple with a single Ellipsis replaced by full slices."""
        n_ellipsis = sum(1 for s in idx if s is Ellipsis)
        if n_ellipsis == 0:
            return tuple(idx)
        if n_ellipsis > 1:
            raise IndexError("an index can only have a single ellipsis ('...')")
        # np.newaxis entries create axes, they do not consume existing ones.
        consumed = sum(
            1 for s in idx if s is not Ellipsis and s is not np.newaxis)
        fill = self._arr.ndim - consumed
        if fill < 0:
            raise IndexError('too many indices for array')
        pos = next(i for i, s in enumerate(idx) if s is Ellipsis)
        return tuple(idx[:pos]) + (slice(None),) * fill + tuple(idx[pos + 1:])

    @staticmethod
    def _slice_axis(arr, axis, lo, hi):
        """Return ``arr[..., lo:hi, ...]`` with the slice applied on ``axis``."""
        return arr[(slice(None),) * axis + (slice(lo, hi),)]

    def __getitem__(self, idx):
        """Index the padded array without materialising it.

        Args:
            idx: an integer, a unit-step slice, ``np.newaxis``, ``Ellipsis``
                or a tuple/list of those.

        Returns:
            A new ``PaddedArray`` describing the selected region.

        Raises:
            IndexError: on out-of-bounds integers, too many indices or more
                than one Ellipsis.
            NotImplementedError: for slices with a step other than 1.
            TypeError: for unsupported index types.
        """
        if not isinstance(idx, (list, tuple)):
            idx = (idx,)
        idx = self._expand_ellipsis(idx)
        new_arr = self._arr
        new_pad = list(self._pad)
        i_dim = 0  # axis of new_arr that the next index applies to
        for s in idx:
            if s is np.newaxis:
                # Handled before looking up the axis size: a trailing
                # newaxis points one past the last existing axis.
                new_pad.insert(i_dim, (0, 0))
                new_arr = np.expand_dims(new_arr, i_dim)
                i_dim += 1
                continue
            if i_dim >= new_arr.ndim:
                raise IndexError('too many indices for array')
            n_arr = new_arr.shape[i_dim]
            p1, p2 = new_pad[i_dim]
            n = n_arr + p1 + p2  # logical length of this axis
            if isinstance(s, (int, np.integer)):
                pos = int(s)
                if pos < 0:
                    pos += n
                if not 0 <= pos < n:
                    raise IndexError(
                        'index %d is out of bounds for axis with size %d'
                        % (int(s), n))
                new_pad.pop(i_dim)
                if pos < p1 or pos >= n - p2:
                    # Inside the padding: the result is all zeros.  Build it
                    # from the shape so an empty core axis is not a problem.
                    rest = new_arr.shape[:i_dim] + new_arr.shape[i_dim + 1:]
                    new_arr = np.zeros(rest, dtype=new_arr.dtype)
                else:
                    # A scalar index to take() drops the axis.
                    new_arr = np.take(new_arr, pos - p1, axis=i_dim)
            elif isinstance(s, slice):
                # TODO - Support arbitrary steps
                if s.step not in (None, 1):
                    raise NotImplementedError(
                        'slices with a step other than 1 are not supported')
                # Compare against None: an explicit 0 bound is falsy too.
                start = 0 if s.start is None else int(s.start)
                stop = n if s.stop is None else int(s.stop)
                if start < 0:
                    start += n
                if stop < 0:
                    stop += n
                start = min(max(start, 0), n)
                stop = min(max(stop, start), n)
                d = stop - start
                if d == 0:
                    new_pad[i_dim] = (0, 0)
                    new_arr = self._slice_axis(new_arr, i_dim, 0, 0)
                elif stop <= p1 or start >= n - p2:
                    # Entirely inside one padding region.
                    new_pad[i_dim] = (d, 0)
                    new_arr = self._slice_axis(new_arr, i_dim, 0, 0)
                else:
                    new_pad[i_dim] = (
                        max(p1 - start, 0), max(stop - p1 - n_arr, 0))
                    new_arr = self._slice_axis(
                        new_arr, i_dim,
                        max(start - p1, 0), min(stop - p1, n_arr))
                i_dim += 1
            else:
                raise TypeError(
                    'unsupported index type: %s' % type(s).__name__)
        return PaddedArray(new_arr, new_pad)

    @property
    def shape(self):
        """Shape of the logical (padded) array."""
        return tuple(s + p1 + p2 for s, (p1, p2) in zip(self._arr.shape, self._pad))

Obviously, the complicated part is the slicing, which here does not support ellipsis (...) or arbitrary slice steps. Also, this will just instantiate a big array whenever you need to operate with it. You can use np.asarray to do that, although operating with another np.ndarray or using NumPy functions should trigger the conversion automatically. Here are some usage examples:

import numpy as np

# Core data: a 4x3 block holding the values 0..11.
core = np.reshape(np.arange(12), (4, 3))
print(core)
# [[ 0  1  2]
#  [ 3  4  5]
#  [ 6  7  8]
#  [ 9 10 11]]

# Pad 1 row before / 3 rows after, and 0 columns before / 2 columns after.
padded = PaddedArray(core, [(1, 3), (0, 2)])
print(padded.shape)
# (8, 5)

# Materialise the whole padded array.
print(np.asarray(padded))
# [[ 0  0  0  0  0]
#  [ 0  1  2  0  0]
#  [ 3  4  5  0  0]
#  [ 6  7  8  0  0]
#  [ 9 10 11  0  0]
#  [ 0  0  0  0  0]
#  [ 0  0  0  0  0]
#  [ 0  0  0  0  0]]

# First row lies entirely in the padding.
first_row = padded[0]
print(np.asarray(first_row))
# [0 0 0 0 0]

# Third column from the end, across all rows.
third_last_col = padded[:, -3]
print(np.asarray(third_last_col))
# [ 0  2  5  8 11  0  0  0]

# Integer index, new axis and open-ended slice combined.
row_tail = padded[3, np.newaxis, 2:]
print(np.asarray(row_tail))
# [[8 0 0]]

# Note it is automatically converted
top_left = padded[:4, :4]
print(top_left @ core)
# [[  0   0   0]
#  [ 15  18  21]
#  [ 42  54  66]
#  [ 69  90 111]]

Collectives™ on Stack Overflow

mask n-dimensional numpy array (to save memory)

1 Answer 1

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

Comments

Your Answer

Sign up or log in

Post as a guest

Related