Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: release the GIL in cylindrical pixelizer #5018

Merged
Merged
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 66 additions & 59 deletions yt/utilities/lib/pixelization_routines.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ cdef extern from "pixelization_constants.hpp":
int WEDGE_NF
np.uint8_t wedge_face_defs[MAX_NUM_FACES][2][2]

cdef extern from "numpy/npy_math.h":
double NPY_PI

@cython.cdivision(True)
@cython.boundscheck(False)
Expand Down Expand Up @@ -566,6 +568,7 @@ def pixelize_cylinder(np.float64_t[:,:] buff,
cdef np.float64_t r_inc, theta_inc
cdef np.float64_t costheta, sintheta
cdef int i, i1, pi, pj
cdef np.float64_t twoPI = 2 * NPY_PI

cdef int imin, imax
imin = np.asarray(radius).argmin()
Expand All @@ -577,7 +580,6 @@ def pixelize_cylinder(np.float64_t[:,:] buff,
imax = np.asarray(theta).argmax()
tmin = theta[imin] - dtheta[imin]
tmax = theta[imax] + dtheta[imax]

cdef np.ndarray[np.uint8_t, ndim=2] mask_arr = np.zeros_like(buff, dtype="uint8")
cdef np.uint8_t[:, :] mask = mask_arr

Expand Down Expand Up @@ -611,65 +613,70 @@ def pixelize_cylinder(np.float64_t[:,:] buff,
rbounds[0] = 0.0
r_inc = 0.5 * fmin(dx, dy)

for i in range(radius.shape[0]):
r0 = radius[i]
theta0 = theta[i]
dr_i = dradius[i]
dtheta_i = dtheta[i]
# Skip out early if we're offsides, for zoomed in plots
if r0 + dr_i < rbounds[0] or r0 - dr_i > rbounds[1]:
continue
theta_i = theta0 - dtheta_i
theta_inc = r_inc / (r0 + dr_i)

while theta_i < theta0 + dtheta_i:
r_i = r0 - dr_i
costheta = math.cos(theta_i)
sintheta = math.sin(theta_i)
while r_i < r0 + dr_i:
if rmax <= r_i:
with nogil:
neutrinoceros marked this conversation as resolved.
Show resolved Hide resolved
for i in range(radius.shape[0]):
r0 = radius[i]
theta0 = theta[i]
dr_i = dradius[i]
dtheta_i = dtheta[i]
# Skip out early if we're offsides, for zoomed in plots
if r0 + dr_i < rbounds[0] or r0 - dr_i > rbounds[1]:
continue
theta_i = theta0 - dtheta_i
theta_inc = r_inc / (r0 + dr_i)

while theta_i < theta0 + dtheta_i:
r_i = r0 - dr_i
costheta = math.cos(theta_i)
sintheta = math.sin(theta_i)
while r_i < r0 + dr_i:
if rmax <= r_i:
r_i += r_inc
continue
y = r_i * costheta
x = r_i * sintheta
pi = <int>((x - x0)/dx)
pj = <int>((y - y0)/dy)
if pi >= 0 and pi < buff.shape[0] and \
pj >= 0 and pj < buff.shape[1]:
# we got a pixel that intersects the grid cell
# now check that this pixel doesn't go beyond the data domain
xp = x0 + pi*dx
yp = y0 + pj*dy
corners[0] = xp*xp + yp*yp
corners[1] = xp*xp + (yp+dy)**2
corners[2] = (xp+dx)**2 + yp*yp
corners[3] = (xp+dx)**2 + (yp+dy)**2
prbounds[0] = prbounds[1] = corners[3]
for i1 in range(3):
prbounds[0] = fmin(prbounds[0], corners[i1])
prbounds[1] = fmax(prbounds[1], corners[i1])
prbounds[0] = math.sqrt(prbounds[0])
prbounds[1] = math.sqrt(prbounds[1])

corners[0] = math.atan2(xp, yp)
corners[1] = math.atan2(xp, yp+dy)
corners[2] = math.atan2(xp+dx, yp)
corners[3] = math.atan2(xp+dx, yp+dy)
ptbounds[0] = ptbounds[1] = corners[3]
for i1 in range(3):
ptbounds[0] = fmin(ptbounds[0], corners[i1])
ptbounds[1] = fmax(ptbounds[1], corners[i1])

# shift to a [0, PI] interval
ptbounds[0] = ptbounds[0] % twoPI
if ptbounds[0] < 0:
ptbounds[0] += NPY_PI
ptbounds[1] = ptbounds[1] % twoPI
if ptbounds[1] < 0:
ptbounds[1] += NPY_PI
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so I'm not sure why I needed to add the check for negative numbers here, but after switching from ptbounds[1] % (2*np.pi) to ptbounds[1] % twoPI, the result returns numbers in the range (-pi,pi).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need to understand this bit, and possibly come up with a way to perform the computation that doesn't involve branching; hard-wiring a somewhat expensive calculation generally hurts performance less than ruining branch prediction.
I'm writing this comment early in my review but I'll try to come up with more insight.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm taking a bet that the reason why you're seeing a change in behavior is that the function is decorated with @cython.cdivision(True): on main the Python % is used, while with twoPI you're getting a C operator instead.

http://docs.cython.org/en/latest/src/tutorial/caveats.html

Copy link
Member

@neutrinoceros neutrinoceros Oct 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you try this patch ?

diff --git a/yt/utilities/lib/pixelization_routines.pyx b/yt/utilities/lib/pixelization_routines.pyx
index 194ce7af6..6728375d0 100644
--- a/yt/utilities/lib/pixelization_routines.pyx
+++ b/yt/utilities/lib/pixelization_routines.pyx
@@ -664,12 +664,8 @@ def pixelize_cylinder(np.float64_t[:,:] buff,
                             ptbounds[1] = fmax(ptbounds[1], corners[i1])
 
                         # shift to a [0, PI] interval
-                        ptbounds[0] = ptbounds[0] % twoPI
-                        if ptbounds[0] < 0:
-                            ptbounds[0] += NPY_PI
-                        ptbounds[1] = ptbounds[1] % twoPI
-                        if ptbounds[1] < 0:
-                            ptbounds[1] += NPY_PI
+                        ptbounds[0] = (ptbounds[0]+twoPI) % twoPI
+                        ptbounds[1] = (ptbounds[1]+twoPI) % twoPI
 
                         if prbounds[0] >= rmin and prbounds[1] <= rmax and \
                            ptbounds[0] >= tmin and ptbounds[1] <= tmax:

the result should be the same in all cases, but it avoids branching, so I expect it to be faster.
If this works, we'll want to add a comment for why it's done this way.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am wondering if we could use fmod here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

looks like in cython, % is an alias to fmod already when using C functions, which returns a negative number when the first argument is negative (which is why the tests started failing here I think). but I do think it's clearer to use fmod to avoid the confusion in the first place (so fmod(ptbounds[0]+twoPI, twoPI) to handle the negative numbers). In case it's helpful, I wrote up some scratch code to double check, here's a plot:

comparison-cython-modulo


if prbounds[0] >= rmin and prbounds[1] <= rmax and \
ptbounds[0] >= tmin and ptbounds[1] <= tmax:
buff[pi, pj] = field[i]
mask[pi, pj] = 1
r_i += r_inc
continue
y = r_i * costheta
x = r_i * sintheta
pi = <int>((x - x0)/dx)
pj = <int>((y - y0)/dy)
if pi >= 0 and pi < buff.shape[0] and \
pj >= 0 and pj < buff.shape[1]:
# we got a pixel that intersects the grid cell
# now check that this pixel doesn't go beyond the data domain
xp = x0 + pi*dx
yp = y0 + pj*dy
corners[0] = xp*xp + yp*yp
corners[1] = xp*xp + (yp+dy)**2
corners[2] = (xp+dx)**2 + yp*yp
corners[3] = (xp+dx)**2 + (yp+dy)**2
prbounds[0] = prbounds[1] = corners[3]
for i1 in range(3):
prbounds[0] = fmin(prbounds[0], corners[i1])
prbounds[1] = fmax(prbounds[1], corners[i1])
prbounds[0] = math.sqrt(prbounds[0])
prbounds[1] = math.sqrt(prbounds[1])

corners[0] = math.atan2(xp, yp)
corners[1] = math.atan2(xp, yp+dy)
corners[2] = math.atan2(xp+dx, yp)
corners[3] = math.atan2(xp+dx, yp+dy)
ptbounds[0] = ptbounds[1] = corners[3]
for i1 in range(3):
ptbounds[0] = fmin(ptbounds[0], corners[i1])
ptbounds[1] = fmax(ptbounds[1], corners[i1])

# shift to a [0, PI] interval
ptbounds[0] = ptbounds[0] % (2*np.pi)
ptbounds[1] = ptbounds[1] % (2*np.pi)

if prbounds[0] >= rmin and prbounds[1] <= rmax and \
ptbounds[0] >= tmin and ptbounds[1] <= tmax:
buff[pi, pj] = field[i]
mask[pi, pj] = 1
r_i += r_inc
theta_i += theta_inc
theta_i += theta_inc

if return_mask:
return mask_arr.astype("bool")
Expand Down
Loading