Practice: Matrix Products

Do the following:

  • Implement a matrix-matrix product $A\overline{B}^T$ in loopy. Let $A$ be real-valued and $B$ be complex-valued. The overline symbolizes complex conjugation.
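Before writing the loopy kernel, it can help to spell out the operation entry-wise in plain numpy. The sketch below (with a small hypothetical size for illustration; the notebook itself uses n = 1024) shows that $C_{ij} = \sum_k A_{ik}\,\overline{B_{jk}}$ is the same as `A @ B.conj().T`:

```python
import numpy as np

# Hypothetical small size for illustration only.
rng = np.random.default_rng(0)
n = 4
A = rng.random((n, n))                            # real-valued
B = rng.random((n, n)) + 1j * rng.random((n, n))  # complex-valued

# Entry-wise definition: C[i, j] = sum_k A[i, k] * conj(B[j, k])
C = np.empty((n, n), dtype=np.complex128)
for i in range(n):
    for j in range(n):
        C[i, j] = np.sum(A[i, :] * np.conj(B[j, :]))

# Equivalent vectorized form
assert np.allclose(C, A @ B.conj().T)
```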

Setup code

In [17]:
import numpy as np
import numpy.linalg as la
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import loopy as lp
In [18]:
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
In [19]:
n = 1024
A = cl.clrandom.rand(queue, (n, n), dtype=np.float64)
B = (
    cl.clrandom.rand(queue, (n, n), dtype=np.float64)
    +
    1j * cl.clrandom.rand(queue, (n, n), dtype=np.float64))

Implementing the Kernel

Implement the basic kernel here:

In [20]:
knl = lp.make_kernel(
    "{[i,j,k]: 0<=i,j,k<n}",
    "C[i,j] = sum(k, A[i, k]*conj(B[j, k]))"
    )

Here we execute the kernel, making sure we get to see the generated code:

In [21]:
_knl = lp.set_options(knl, write_cl=True, highlight_cl=True)
evt, (C,) = _knl(queue, A=A, B=B)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
#if __OPENCL_C_VERSION__ < 120
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
#define PYOPENCL_DEFINE_CDOUBLE

#include <pyopencl-complex.h>

__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global double const *restrict A, __global cdouble_t const *restrict B, __global cdouble_t *restrict C, int const n)
{
  cdouble_t acc_k;

  for (int j = 0; j <= -1 + n; ++j)
    for (int i = 0; i <= -1 + n; ++i)
    {
      acc_k = cdouble_fromreal(0.0);
      for (int k = 0; k <= -1 + n; ++k)
        acc_k = cdouble_add(acc_k, cdouble_rmul(A[n * i + k], cdouble_conj(B[n * j + k])));
      C[n * i + j] = acc_k;
    }
}

/home/andreas/src/loopy/loopy/compiled.py:733: LoopyWarning: kernel scheduling was ambiguous--more than one schedule found, ignoring
  kernel = get_one_scheduled_kernel(kernel)

Next, we test that we got the right result, using numpy:

In [24]:
C_ref = A.get() @ B.get().T.conj()
la.norm(C.get()-C_ref) / la.norm(C_ref)
Out[24]:
8.1249888675681689e-16

Playing with the loop ordering

Check the loopy documentation to see how to use the set_loop_priority transform to prescribe a loop ordering.

Try a few different variants and time their execution. Make sure to exclude the first run, because the time for that will include code generation and compilation.

You may use the Python function time.time to get the wall clock time in seconds since Jan 1, 1970.

Also make sure to call queue.finish() both before you start and before you stop the clock, so that all queued work has actually completed.

In [57]:
tknl = lp.set_loop_priority(knl, "i,j")


def do_timing(timed_knl):
    timed_knl = lp.set_options(timed_knl, write_cl=True, highlight_cl=True)
    # Run once to 'warm up' the code
    timed_knl(queue, A=A, B=B)

    queue.finish()

    from time import time
    start = time()

    nruns = 2
    for i in range(nruns):
        timed_knl(queue, A=A, B=B)

    queue.finish()

    timing = (time()-start)/nruns
    print(timing,"s per run")
    
do_timing(tknl)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
#if __OPENCL_C_VERSION__ < 120
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
#define PYOPENCL_DEFINE_CDOUBLE

#include <pyopencl-complex.h>

__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global double const *restrict A, __global cdouble_t const *restrict B, __global cdouble_t *restrict C, int const n)
{
  cdouble_t acc_k;

  for (int i = 0; i <= -1 + n; ++i)
    for (int j = 0; j <= -1 + n; ++j)
    {
      acc_k = cdouble_fromreal(0.0);
      for (int k = 0; k <= -1 + n; ++k)
        acc_k = cdouble_add(acc_k, cdouble_rmul(A[n * i + k], cdouble_conj(B[n * j + k])));
      C[n * i + j] = acc_k;
    }
}

1.8014510869979858 s per run

Parallelization: Single-element work groups

Next, parallelize the operation using a 2D grid. Use the tag_inames transformation.

Experiment with the ordering.

In [58]:
tknl = lp.tag_inames(knl, "i:g.0,j:g.1")

do_timing(tknl)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
#if __OPENCL_C_VERSION__ < 120
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
#define PYOPENCL_DEFINE_CDOUBLE

#include <pyopencl-complex.h>

__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global double const *restrict A, __global cdouble_t const *restrict B, __global cdouble_t *restrict C, int const n)
{
  cdouble_t acc_k;

  acc_k = cdouble_fromreal(0.0);
  for (int k = 0; k <= -1 + n; ++k)
    acc_k = cdouble_add(acc_k, cdouble_rmul(A[n * gid(0) + k], cdouble_conj(B[n * gid(1) + k])));
  C[n * gid(0) + gid(1)] = acc_k;
}

0.8107101917266846 s per run

Parallelization: Multi-element work groups

Next, use more than one element per workgroup. Use the split_iname transformation.

Experiment with group sizes and axis order.

In [60]:
tknl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
tknl = lp.split_iname(tknl, "j", 16, outer_tag="g.1", inner_tag="l.1")

do_timing(tknl)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
#if __OPENCL_C_VERSION__ < 120
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
#define PYOPENCL_DEFINE_CDOUBLE

#include <pyopencl-complex.h>

__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) loopy_kernel(__global double const *restrict A, __global cdouble_t const *restrict B, __global cdouble_t *restrict C, int const n)
{
  cdouble_t acc_k;

  if (-1 + -16 * gid(1) + -1 * lid(1) + n >= 0 && -1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
  {
    acc_k = cdouble_fromreal(0.0);
    for (int k = 0; k <= -1 + n; ++k)
      acc_k = cdouble_add(acc_k, cdouble_rmul(A[n * (lid(0) + gid(0) * 16) + k], cdouble_conj(B[n * (lid(1) + gid(1) * 16) + k])));
    C[n * (lid(0) + gid(0) * 16) + lid(1) + gid(1) * 16] = acc_k;
  }
}

0.5177402496337891 s per run

Where to go from here

Things to try:

  • Loop Unrolling (the unr iname tag)
  • Instruction level parallelism (the ilp iname tag)
  • Prefetching (add_prefetch)
  • Run this on an actual GPU
  • Measure GFLOPS and GBytes/s
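For the last item, a rough back-of-the-envelope sketch: with real-valued $A$, each inner-loop step performs one real-times-complex multiply (2 real multiplies) and one complex add (2 real adds), i.e. about $4n^3$ flops total, and each array is touched once in global memory. The helper below is a hypothetical starting point, not part of the notebook above:

```python
def matmul_flops(n):
    # ~4 real flops per inner-loop iteration, n^3 iterations
    return 4 * n**3

def matmul_bytes(n):
    # A: n^2 float64 (8 B); B, C: n^2 complex128 (16 B each),
    # assuming each array crosses the memory bus exactly once
    return (8 + 16 + 16) * n**2

def report(n, seconds):
    print(f"{matmul_flops(n) / seconds / 1e9:.2f} GFLOPS")
    print(f"{matmul_bytes(n) / seconds / 1e9:.2f} GB/s")

# e.g. the fastest run above: n = 1024 at about 0.52 s per run
report(1024, 0.5177)
```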
In [ ]: