Loopy: Transforming a PDE to Code

In [1]:
import pymbolic.primitives as p

u = p.Variable("u")

We'll write code that evaluates $\triangle u$ using finite differences.

To that end, we define a new expression 'thing': An operator for the Laplacian.

In [2]:
class Laplacian(p.Expression):
    def __init__(self, child):
        self.child = child
        
    def __getinitargs__(self):
        return (self.child,)
    
    mapper_method = "map_laplacian"
        
pde = Laplacian(u)+u**2-1
pde
Out[2]:
Sum((Laplacian(Variable('u')), Power(Variable('u'), 2), -1))

Now we'll write code to turn Laplacians into their discrete finite difference forms, using i and j as formal indices, using

$$f''(x) \approx \frac{f(x+h) - 2 f(x) + f(x-h)}{h^{2}}$$

Pay close attention to indexing!

In [3]:
from pymbolic.mapper import IdentityMapper

i = p.Variable("i")
j = p.Variable("j")

ii = i+1
jj = j+1
In [7]:
class FDMapper(IdentityMapper):
    def map_variable(self, expr):
        return expr[ii, jj]

    def map_laplacian(self, expr):
        var = expr.child
        return (-4*var[ii,jj] + var[ii+1,jj] + var[ii-1,jj]
                + var[ii,jj+1] + var[ii,jj-1])
In [8]:
fd_mapper = FDMapper()
print(fd_mapper(pde))
u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]

Now let's generate some code for this, using loopy:

In [9]:
import loopy as lp

result = p.Variable("result")

Observe the two parts of the loopy kernel description:

  • Polyhedral loop domain
  • Instructions
In [12]:
knl = lp.make_kernel(
    "{[i,j]: 0<=i,j<n}",
    [lp.Assignment(result[ii, jj], fd_mapper(pde))],
    )

Kernels can always be inspected--simply use print:

In [13]:
print(knl)
---------------------------------------------------------------------------
KERNEL: loopy_kernel
---------------------------------------------------------------------------
ARGUMENTS:
n: ValueArg, type: <runtime>
result: GlobalArg, type: <runtime>, shape: (1 + n, 1 + n), dim_tags: (N1:stride:1 + n, N0:stride:1)
u: GlobalArg, type: <runtime>, shape: (2 + n, 2 + n), dim_tags: (N1:stride:2 + n, N0:stride:1)
---------------------------------------------------------------------------
DOMAINS:
[n] -> { [i, j] : 0 <= i < n and 0 <= j < n }
---------------------------------------------------------------------------
INAME IMPLEMENTATION TAGS:
i: None
j: None
---------------------------------------------------------------------------
INSTRUCTIONS:
[i,j]                                result[i + 1, j + 1] <- u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]   # insn
---------------------------------------------------------------------------

Let's move towards running this code. To do so, we need pyopencl:

In [15]:
import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

And some data to work with:

In [16]:
n = 1000
u = cl.clrandom.rand(queue, (n+2,n+2), dtype=np.float32)

Now run the code, and tell loopy to print what it generates:

In [17]:
knl = lp.set_options(knl, write_cl=True)

evt, (result,) = knl(queue, u=u, n=n)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(int const n, __global float *restrict result, __global float const *restrict u)
{

  for (int j = 0; j <= -1 + n; ++j)
    for (int i = 0; i <= -1 + n; ++i)
      result[(1 + n) * (i + 1) + j + 1] = u[(2 + n) * (i + 1) + j + 1] * u[(2 + n) * (i + 1) + j + 1] + -1.0f + -4.0f * u[(2 + n) * (i + 1) + j + 1] + u[(2 + n) * (i + 1 + 1) + j + 1] + u[(2 + n) * (i + 1 + -1) + j + 1] + u[(2 + n) * (i + 1) + j + 1 + 1] + u[(2 + n) * (i + 1) + j + 1 + -1];
}
/home/andreas/src/loopy/loopy/compiled.py:860: LoopyWarning: kernel scheduling was ambiguous--more than one schedule found, ignoring
  kernel = get_one_scheduled_kernel(kernel)
/home/andreas/src/loopy/loopy/diagnostic.py:60: LoopyAdvisory: No device parameter was passed to the PyOpenCLTarget. Perhaps you want to pass a device to benefit from additional checking. (add 'no_device_in_pre_codegen_checks' to silenced_warnings kernel argument to disable)
  warn(text, type)

That's obviously not very parallel. Introduce parallelism:

In [18]:
tknl = knl
tknl = lp.tag_inames(tknl, {"i": "g.0", "j": "g.1"})
evt, (result,) = tknl(queue, u=u)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(int const n, __global float *restrict result, __global float const *restrict u)
{

  result[(1 + n) * (gid(0) + 1) + gid(1) + 1] = u[(2 + n) * (gid(0) + 1) + gid(1) + 1] * u[(2 + n) * (gid(0) + 1) + gid(1) + 1] + -1.0f + -4.0f * u[(2 + n) * (gid(0) + 1) + gid(1) + 1] + u[(2 + n) * (gid(0) + 1 + 1) + gid(1) + 1] + u[(2 + n) * (gid(0) + 1 + -1) + gid(1) + 1] + u[(2 + n) * (gid(0) + 1) + gid(1) + 1 + 1] + u[(2 + n) * (gid(0) + 1) + gid(1) + 1 + -1];
}
/home/andreas/src/loopy/loopy/diagnostic.py:60: LoopyAdvisory: No device parameter was passed to the PyOpenCLTarget. Perhaps you want to pass a device to benefit from additional checking. (add 'no_device_in_pre_codegen_checks' to silenced_warnings kernel argument to disable)
  warn(text, type)

But OpenCL/CUDA require blocking to be efficient!

In [19]:
sknl = knl
sknl = lp.split_iname(sknl,
        "i", 16, outer_tag="g.1", inner_tag="l.1")
sknl = lp.split_iname(sknl,
        "j", 16, outer_tag="g.0", inner_tag="l.0")
evt, (result,) = sknl(queue, u=u)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) loopy_kernel(int const n, __global float *restrict result, __global float const *restrict u)
{

  if (
      -1 + -16 * gid(0) + -1 * lid(0) + n >= 0
      && -1 + -16 * gid(1) + -1 * lid(1) + n >= 0
    )
    result[(1 + n) * (1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] = u[(2 + n) * (1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] * u[(2 + n) * (1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] + -1.0f + -4.0f * u[(2 + n) * (1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] + u[(2 + n) * (1 + 1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] + u[(2 + n) * (1 + -1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] + u[(2 + n) * (1 + lid(1) + gid(1) * 16) + 1 + 1 + lid(0) + gid(0) * 16] + u[(2 + n) * (1 + lid(1) + gid(1) * 16) + 1 + -1 + lid(0) + gid(0) * 16];
}
/home/andreas/src/loopy/loopy/diagnostic.py:60: LoopyAdvisory: No device parameter was passed to the PyOpenCLTarget. Perhaps you want to pass a device to benefit from additional checking. (add 'no_device_in_pre_codegen_checks' to silenced_warnings kernel argument to disable)
  warn(text, type)

How about some data reuse?

In [20]:
sknl = knl
sknl = lp.split_iname(sknl,
        "i", 16, outer_tag="g.1", inner_tag="l.1")
sknl = lp.split_iname(sknl,
        "j", 16, outer_tag="g.0", inner_tag="l.0")
sknl = lp.add_prefetch(sknl, "u",
    ["i_inner", "j_inner"],
    fetch_bounding_box=True)
evt, (result,) = sknl(queue, u=u, n=n)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) loopy_kernel(int const n, __global float *restrict result, __global float const *restrict u)
{
  __local float u_fetch[18 * 18];

  for (int u_dim_1_outer = 0; u_dim_1_outer <= 1; ++u_dim_1_outer)
    if (
        17 + -16 * u_dim_1_outer + -1 * lid(0) >= 0
        && 1 + -16 * u_dim_1_outer + -1 * lid(0) + -16 * gid(0) + n >= 0
      )
      for (int u_dim_0_outer = 0; u_dim_0_outer <= 1; ++u_dim_0_outer)
        if (
            17 + -16 * u_dim_0_outer + -1 * lid(1) >= 0
            && 1 + -16 * u_dim_0_outer + -1 * lid(1) + -16 * gid(1) + n >= 0
          )
          u_fetch[18 * (lid(1) + u_dim_0_outer * 16) + lid(0) + u_dim_1_outer * 16] = u[(2 + n) * (16 * gid(1) + lid(1) + u_dim_0_outer * 16) + 16 * gid(0) + lid(0) + u_dim_1_outer * 16];
  barrier(CLK_LOCAL_MEM_FENCE) /* for u_fetch (insn depends on u_fetch_rule) */;
  if (
      -1 + -16 * gid(0) + -1 * lid(0) + n >= 0
      && -1 + -16 * gid(1) + -1 * lid(1) + n >= 0
    )
    result[(1 + n) * (1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] = u_fetch[18 * (1 + lid(1)) + 1 + lid(0)] * u_fetch[18 * (1 + lid(1)) + 1 + lid(0)] + -1.0f + -4.0f * u_fetch[18 * (1 + lid(1)) + 1 + lid(0)] + u_fetch[18 * (2 + lid(1)) + 1 + lid(0)] + u_fetch[18 * lid(1) + 1 + lid(0)] + u_fetch[18 * (1 + lid(1)) + 2 + lid(0)] + u_fetch[18 * (1 + lid(1)) + lid(0)];
}
/home/andreas/src/loopy/loopy/compiled.py:860: LoopyWarning: kernel scheduling was ambiguous--more than one schedule found, ignoring
  kernel = get_one_scheduled_kernel(kernel)
/home/andreas/src/loopy/loopy/diagnostic.py:60: LoopyAdvisory: No device parameter was passed to the PyOpenCLTarget. Perhaps you want to pass a device to benefit from additional checking. (add 'no_device_in_pre_codegen_checks' to silenced_warnings kernel argument to disable)
  warn(text, type)
In [ ]: