I am struggling with the problem of optimizing my cython code in order to improve its speed as much as possible. One of the challenges that I could not still figure out how it should be done in cython is mapping an array on a function like what is done in numpy.vectorize function.
The simplify version of my problem is
from __future__ import division
import numpy as np
cimport numpy as np
cimport cython
cdef class Test(object):
cdef public double M, c, z
cdef public double[::1] ks, zs, pos
@cython.boundscheck(False)
@cython.cdivision(True)
@cython.wraparound(False)
@cython.nonecheck(False)
def __cinit__(self, M, c, z, pos, ks, zs=None):
if path is None:
raise ValueError("Could not find a path to the file which contains the table of angular diameter distances")
self.M = M
self.c = c
self.z = z
self.pos = pos
if zs is None:
raise ValueError("You must give an array which contains the steps where the redshift probability distribution are computed!")
self.zs=zs
self.ks=ks
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef np.ndarray[double, ndim=1, mode='c'] __kappa(self, np.ndarray[double, ndim=1, mode='c'] x, double ks):
cdef Py_ssize_t N = x.shape[0]
cdef np.ndarray[np.int64_t, ndim=1, mode='c'] mask
cdef np.ndarray[double, ndim=1, mode='c'] out = np.zeros(N, dtype=np.float64 , order='C')
mask = np.where(x < 0.999)[0]
out[mask] = 2*ks/(x[mask]**2 - 1) * \
(1 - np.log((1 + ((1 - x[mask])/(x[mask] + 1))**0.5)/(1 - ((1 - x[mask])/(x[mask] + 1))**0.5))/(1 - x[mask]**2)**0.5)
mask = np.where(x > 1.001)[0]
out[mask] = 2*ks/(x[mask]**2 - 1) * \
(1 - 2*np.arctan(((x[mask] - 1)/(x[mask] + 1))**0.5)/(x[mask]**2 - 1)**0.5)
mask = np.where((x >= 0.999) & (x <= 1.001))[0]
out[mask] = ks*(22./15. - 0.8*x[mask])
return out
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef np.ndarray[double, ndim=1, mode='c'] __gamma(self, np.ndarray[double, ndim=1, mode='c'] x, double ks):
cdef Py_ssize_t N=len(x)
cdef np.ndarray[np.int64_t, ndim=1, mode='c'] mask
cdef np.ndarray[double, ndim=1, mode='c'] out = np.zeros(N, dtype=np.float64 , order='C')
mask = np.where(x > 0.01)[0]
out[mask] = 4*ks*(np.log(x[mask]/2) + 2* \
x[mask]**(-2) - self.__kappa(x[mask], ks)
mask = np.where(x <= 0.01)[0]
out[mask] = 4*ks*(0.25 + 0.125 * x[mask]**2 * (3.25 + 3.0*np.log(x[mask]/2)))
return out
cpdef tuple getSh(self, np.ndarray[double, ndim=2, mode='c'] gpos, np.ndarray[double, ndim=2, mode='c'] pdf_z):
# Convert to numpy arrays for internal usage:
cdef np.ndarray[double, ndim=1, mode='c'] g, kappa, r, ks, wg
cdef np.ndarray[double, ndim=1, mode='c'] pos_x, pos_y
if not gpos[:,0].flags.c_contiguous:
pos_x = gpos[:,0].copy(order='C')
else:
pos_x = gpos[:,0]
if not gpos[:,1].flags.c_contiguous:
pos_y = gpos[:,1].copy(order='C')
else:
pos_y = gpos[:,1]
cdef Py_ssize_t i, mask, N
r = ((pos_x - self.pos[0])**2 + (pos_y - self.pos[1])**2)**0.5
ks = np.ascontiguousarray(self.ks)
N = len(ks)
mask= np.where(np.ascontiguousarray(self.zs)>(self.z+0.1))[0][0]
wg = np.zeros(len(r), dtype=np.float64 , order='C')
for i from N > i >= 0:
g = self.__gamma(r, ks[i])
kappa = self.__kappa(r, ks[i])
g /= 1 - kappa
wg+=g*pdf_z[:,mask+i]
cdef np.ndarray[double, ndim=1, mode='c'] dx, dy, drsq, cos2phi, sin2phi, g1, g2
dx = pos_x - self.halo_pos[0]
dy = pos_y - self.halo_pos[1]
drsq = dx*dx+dy*dy
drsq[drsq==0.] = 1. # Avoid division by 0
cos2phi = (dx*dx-dy*dy)/drsq
sin2phi = 2*dx*dy/drsq
g1 = -wg*cos2phi
g2 = -wg*sin2phi
return g1, g2
I am wondering whether there is a way that I can vectorize getSh method of Test class over ks array and avoid using the loop by using something that makes my code faster?