{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Hello Loopy: Computing a Rank-One Matrix\n",
        "\n",
        "## Setup Code"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "import pyopencl as cl\n",
        "import pyopencl.array\n",
        "import pyopencl.clrandom\n",
        "import loopy as lp"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "ctx = cl.create_some_context()\n",
        "queue = cl.CommandQueue(ctx)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "n = 1024\n",
        "a = cl.clrandom.rand(queue, n, dtype=np.float32)\n",
        "b = cl.clrandom.rand(queue, n, dtype=np.float32)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## The Initial Kernel"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "knl = lp.make_kernel(\n",
        "    \"{[i,j]: 0<=i,j<n}\",\n",
        "    \"c[i, j] = a[i]*b[j]\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "#define lid(N) ((int) get_local_id(N))\n",
            "#define gid(N) ((int) get_group_id(N))\n",
            "\n",
            "__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)\n",
            "{\n",
            "\n",
            "  for (int j = 0; j <= -1 + n; ++j)\n",
            "    for (int i = 0; i <= -1 + n; ++i)\n",
            "      c[n * i + j] = a[i] * b[j];\n",
            "}\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "/home/andreas/src/loopy/loopy/compiled.py:841: LoopyWarning: kernel scheduling was ambiguous--more than one schedule found, ignoring\n",
            "  kernel = get_one_scheduled_kernel(kernel)\n"
          ]
        }
      ],
      "source": [
        "knl = lp.set_options(knl, write_cl=True)\n",
        "evt, (mat,) = knl(queue, a=a, b=b)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Python glue for ease of use: The kernel \"wrapper\"\n",
        "\n",
        "Invocation details handled by generated Python \"wrapper\":\n",
        "\n",
        "* Argument derivation\n",
        "* Data copying (from/to host, if requested)\n",
        "* Shape/type/data layout checks"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "from __future__ import division\n",
            "\n",
            "import pyopencl as _lpy_cl\n",
            "import pyopencl.array as _lpy_cl_array\n",
            "import pyopencl.tools as _lpy_cl_tools\n",
            "import numpy as _lpy_np\n",
            "from struct import pack as _lpy_pack\n",
            "\n",
            "def invoke_loopy_kernel_loopy_kernel(cl_kernel, queue, allocator=None, wait_for=None, out_host=None, a=None, b=None, c=None, n=None):\n",
            "    if allocator is None:\n",
            "        allocator = _lpy_cl_tools.DeferredAllocator(queue.context)\n",
            "\n",
            "    # {{{ find integer arguments from shapes\n",
            "\n",
            "    if n is None:\n",
            "        if a is not None:\n",
            "            n = a.shape[0]\n",
            "        elif b is not None:\n",
            "            n = b.shape[0]\n",
            "        elif c is not None:\n",
            "            n = c.shape[0]\n",
            "        elif c is not None:\n",
            "            n = c.shape[1]\n",
            "\n",
            "    # }}}\n",
            "\n",
            "    # {{{ find integer arguments from offsets\n",
            "\n",
            "    # }}}\n",
            "\n",
            "    # {{{ find integer arguments from strides\n",
            "\n",
            "    # }}}\n",
            "\n",
            "    # {{{ process n\n",
            "\n",
            "    if n is None:\n",
            "        raise RuntimeError(\"input argument 'n' must \"\n",
            "            \"be supplied\")\n",
            "    cl_kernel.set_arg(3, _lpy_pack('i', n))\n",
            "\n",
            "    # }}}\n",
            "\n",
            "    # {{{ set up array arguments\n",
            "\n",
            "    _lpy_encountered_numpy = False\n",
            "    _lpy_encountered_dev = False\n",
            "\n",
            "    # {{{ process a\n",
            "\n",
            "    if isinstance(a, _lpy_np.ndarray):\n",
            "        # synchronous, nothing to worry about\n",
            "        a = _lpy_cl_array.to_device(queue, a, allocator=allocator)\n",
            "        _lpy_encountered_numpy = True\n",
            "    elif a is not None:\n",
            "        _lpy_encountered_dev = True\n",
            "\n",
            "    if a is None:\n",
            "        raise RuntimeError(\"input argument 'a' must be supplied\")\n",
            "\n",
            "    if True:\n",
            "        if a.dtype != _lpy_np.float32:\n",
            "            raise TypeError(\"dtype mismatch on argument 'a' (got: %s, expected: float32)\" % a.dtype)\n",
            "        if a.shape != (n,):\n",
            "            raise TypeError(\"shape mismatch on argument 'a' (got: %s, expected: %s)\" % (a.shape, (n,)))\n",
            "        if a.strides != (4,):\n",
            "            raise TypeError(\"strides mismatch on argument 'a' (got: %s, expected: %s)\" % (a.strides, (4,)))\n",
            "        if a.offset:\n",
            "            raise ValueError(\"Argument 'a' does not allow arrays with offsets. Try passing default_offset=loopy.auto to make_kernel().\")\n",
            "\n",
            "    cl_kernel.set_arg(0, a.base_data)\n",
            "\n",
            "    # }}}\n",
            "\n",
            "    # {{{ process b\n",
            "\n",
            "    if isinstance(b, _lpy_np.ndarray):\n",
            "        # synchronous, nothing to worry about\n",
            "        b = _lpy_cl_array.to_device(queue, b, allocator=allocator)\n",
            "        _lpy_encountered_numpy = True\n",
            "    elif b is not None:\n",
            "        _lpy_encountered_dev = True\n",
            "\n",
            "    if b is None:\n",
            "        raise RuntimeError(\"input argument 'b' must be supplied\")\n",
            "\n",
            "    if True:\n",
            "        if b.dtype != _lpy_np.float32:\n",
            "            raise TypeError(\"dtype mismatch on argument 'b' (got: %s, expected: float32)\" % b.dtype)\n",
            "        if b.shape != (n,):\n",
            "            raise TypeError(\"shape mismatch on argument 'b' (got: %s, expected: %s)\" % (b.shape, (n,)))\n",
            "        if b.strides != (4,):\n",
            "            raise TypeError(\"strides mismatch on argument 'b' (got: %s, expected: %s)\" % (b.strides, (4,)))\n",
            "        if b.offset:\n",
            "            raise ValueError(\"Argument 'b' does not allow arrays with offsets. Try passing default_offset=loopy.auto to make_kernel().\")\n",
            "\n",
            "    cl_kernel.set_arg(1, b.base_data)\n",
            "\n",
            "    # }}}\n",
            "\n",
            "    # {{{ process c\n",
            "\n",
            "    if isinstance(c, _lpy_np.ndarray):\n",
            "        # synchronous, nothing to worry about\n",
            "        c = _lpy_cl_array.to_device(queue, c, allocator=allocator)\n",
            "        _lpy_encountered_numpy = True\n",
            "    elif c is not None:\n",
            "        _lpy_encountered_dev = True\n",
            "\n",
            "    _lpy_made_by_loopy = False\n",
            "\n",
            "    if c is None:\n",
            "        _lpy_shape_0 = n\n",
            "        _lpy_shape_1 = n\n",
            "        _lpy_strides_0 = 4*n\n",
            "        _lpy_strides_1 = 4\n",
            "        assert _lpy_strides_0 > 0, \"'c' has negative stride in axis 0\"\n",
            "        assert _lpy_strides_1 > 0, \"'c' has negative stride in axis 1\"\n",
            "        _lpy_alloc_size = _lpy_strides_0*(_lpy_shape_0 + -1) + _lpy_strides_1*(_lpy_shape_1 + -1) + 4\n",
            "        c = _lpy_cl_array.Array(queue, (_lpy_shape_0, _lpy_shape_1), _lpy_np.float32, strides=(_lpy_strides_0, _lpy_strides_1), data=allocator(_lpy_alloc_size), allocator=allocator)\n",
            "        del _lpy_shape_0\n",
            "        del _lpy_strides_0\n",
            "        del _lpy_shape_1\n",
            "        del _lpy_strides_1\n",
            "        del _lpy_alloc_size\n",
            "\n",
            "        _lpy_made_by_loopy = True\n",
            "\n",
            "    if not _lpy_made_by_loopy:\n",
            "        if c.dtype != _lpy_np.float32:\n",
            "            raise TypeError(\"dtype mismatch on argument 'c' (got: %s, expected: float32)\" % c.dtype)\n",
            "        if c.shape != (n, n):\n",
            "            raise TypeError(\"shape mismatch on argument 'c' (got: %s, expected: %s)\" % (c.shape, (n, n,)))\n",
            "        if c.strides != (4*n, 4):\n",
            "            raise TypeError(\"strides mismatch on argument 'c' (got: %s, expected: %s)\" % (c.strides, (4*n, 4)))\n",
            "        if c.offset:\n",
            "            raise ValueError(\"Argument 'c' does not allow arrays with offsets. Try passing default_offset=loopy.auto to make_kernel().\")\n",
            "\n",
            "    del _lpy_made_by_loopy\n",
            "\n",
            "    cl_kernel.set_arg(2, c.base_data)\n",
            "\n",
            "    # }}}\n",
            "\n",
            "    # }}}\n",
            "\n",
            "    _lpy_evt = _lpy_cl.enqueue_nd_range_kernel(queue, cl_kernel, (int(1),), (int(1),),  wait_for=wait_for, g_times_l=True)\n",
            "\n",
            "    if out_host is None and (_lpy_encountered_numpy and not _lpy_encountered_dev):\n",
            "        out_host = True\n",
            "    if out_host:\n",
            "        pass\n",
            "        c = c.get(queue=queue)\n",
            "\n",
            "    return _lpy_evt, (c,)\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "/home/andreas/src/loopy/loopy/diagnostic.py:60: LoopyAdvisory: No device parameter was passed to the PyOpenCLTarget. Perhaps you want to pass a device to benefit from additional checking. (add 'no_device_in_pre_codegen_checks' to silenced_warnings kernel argument to disable)\n",
            "  warn(text, type)\n"
          ]
        }
      ],
      "source": [
        "wknl = lp.set_options(knl, write_wrapper=True, write_cl=False)\n",
        "evt, (mat,) = wknl(queue, a=a, b=b)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Transforming kernels: Loop Splitting\n",
        "\n",
        "Next: transform kernel. Example: Split a loop into fixed-length \"chunks\"."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "#define int_floor_div_pos_b(a,b) (                 ( (a) - ( ((a)<0) ? ((b)-1) : 0 )  ) / (b)                 )\n",
            "#define lid(N) ((int) get_local_id(N))\n",
            "#define gid(N) ((int) get_group_id(N))\n",
            "\n",
            "__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)\n",
            "{\n",
            "\n",
            "  for (int i_outer = 0; i_outer <= -1 + int_floor_div_pos_b(3 + n, 4); ++i_outer)\n",
            "    for (int j = 0; j <= -1 + n; ++j)\n",
            "      for (int i_inner = 0; i_inner <= 3; ++i_inner)\n",
            "        if (-1 + -1 * i_inner + -4 * i_outer + n >= 0)\n",
            "          c[n * (i_inner + i_outer * 4) + j] = a[i_inner + i_outer * 4] * b[j];\n",
            "}\n"
          ]
        }
      ],
      "source": [
        "isplit_knl = knl\n",
        "isplit_knl = lp.split_iname(isplit_knl, \"i\", 4)\n",
        "\n",
        "evt, (mat,) = isplit_knl(queue, a=a, b=b)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Want to get rid of the conditional?"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Transforming kernels: Implementation Tags\n",
        "\n",
        "Every loop axis (\"iname\") comes with an *implementation tag*."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 8,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "#define int_floor_div_pos_b(a,b) (                 ( (a) - ( ((a)<0) ? ((b)-1) : 0 )  ) / (b)                 )\n",
            "#define lid(N) ((int) get_local_id(N))\n",
            "#define gid(N) ((int) get_group_id(N))\n",
            "\n",
            "__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)\n",
            "{\n",
            "\n",
            "  for (int i_outer = 0; i_outer <= int_floor_div_pos_b(-4 + n, 4); ++i_outer)\n",
            "    for (int j = 0; j <= -1 + n; ++j)\n",
            "    {\n",
            "      c[n * (0 + i_outer * 4) + j] = a[0 + i_outer * 4] * b[j];\n",
            "      c[n * (1 + i_outer * 4) + j] = a[1 + i_outer * 4] * b[j];\n",
            "      c[n * (2 + i_outer * 4) + j] = a[2 + i_outer * 4] * b[j];\n",
            "      c[n * (3 + i_outer * 4) + j] = a[3 + i_outer * 4] * b[j];\n",
            "    }\n",
            "}\n"
          ]
        }
      ],
      "source": [
        "isplit_knl = knl\n",
        "isplit_knl = lp.assume(isplit_knl, \"n mod 4 = 0\")\n",
        "isplit_knl = lp.split_iname(isplit_knl, \"i\", 4)\n",
        "isplit_knl = lp.tag_inames(isplit_knl, {\"i_inner\": \"unr\"})\n",
        "\n",
        "evt, (mat,) = isplit_knl(queue, a=a, b=b)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "May want to influence loop ordering."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "----\n",
        "\"Map to GPU hw axis\" is an iname tag as well.\n",
        "\n",
        "Use shortcuts for less typing:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "#define lid(N) ((int) get_local_id(N))\n",
            "#define gid(N) ((int) get_group_id(N))\n",
            "\n",
            "__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)\n",
            "{\n",
            "\n",
            "  if (\n",
            "      -1 + -16 * gid(1) + -1 * lid(1) + n >= 0\n",
            "      && -1 + -16 * gid(0) + -1 * lid(0) + n >= 0\n",
            "    )\n",
            "    c[n * (lid(0) + gid(0) * 16) + lid(1) + gid(1) * 16] = a[lid(0) + gid(0) * 16] * b[lid(1) + gid(1) * 16];\n",
            "}\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "/home/andreas/src/loopy/loopy/diagnostic.py:60: LoopyAdvisory: No device parameter was passed to the PyOpenCLTarget. Perhaps you want to pass a device to benefit from additional checking. (add 'no_device_in_pre_codegen_checks' to silenced_warnings kernel argument to disable)\n",
            "  warn(text, type)\n"
          ]
        }
      ],
      "source": [
        "split_knl = knl\n",
        "split_knl = lp.split_iname(split_knl, \"i\", 16,\n",
        "        outer_tag=\"g.0\", inner_tag=\"l.0\")\n",
        "split_knl = lp.split_iname(split_knl, \"j\", 16,\n",
        "        outer_tag=\"g.1\", inner_tag=\"l.1\")\n",
        "\n",
        "evt, (mat,) = split_knl(queue, a=a, b=b)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Transforming kernels: Leveraging data reuse\n",
        "\n",
        "Better! But still not much data reuse."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "#define lid(N) ((int) get_local_id(N))\n",
            "#define gid(N) ((int) get_group_id(N))\n",
            "\n",
            "__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)\n",
            "{\n",
            "  float b_fetch_0;\n",
            "  float a_fetch_0;\n",
            "\n",
            "  for (int i = 0; i <= -1 + n; ++i)\n",
            "  {\n",
            "    a_fetch_0 = a[i];\n",
            "    for (int j = 0; j <= -1 + n; ++j)\n",
            "    {\n",
            "      b_fetch_0 = b[j];\n",
            "      c[n * i + j] = a_fetch_0 * b_fetch_0;\n",
            "    }\n",
            "  }\n",
            "}\n"
          ]
        }
      ],
      "source": [
        "fetch1_knl = knl\n",
        "\n",
        "fetch1_knl = lp.add_prefetch(fetch1_knl, \"a\")\n",
        "fetch1_knl = lp.add_prefetch(fetch1_knl, \"b\")\n",
        "\n",
        "evt, (mat,) = fetch1_knl(queue, a=a, b=b)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "But this is useless for the GPU version. (demo)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "---\n",
        "\n",
        "Would like to fetch entire \"access footprint\" of a loop."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 11,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "#define lid(N) ((int) get_local_id(N))\n",
            "#define gid(N) ((int) get_group_id(N))\n",
            "\n",
            "__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)\n",
            "{\n",
            "  __local float b_fetch_0[16];\n",
            "  __local float a_fetch_0[16];\n",
            "\n",
            "  if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)\n",
            "    a_fetch_0[lid(0)] = a[lid(0) + 16 * gid(0)];\n",
            "  if (-1 + -16 * gid(1) + -1 * lid(0) + n >= 0)\n",
            "    b_fetch_0[lid(0)] = b[lid(0) + 16 * gid(1)];\n",
            "  barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch_0 (insn depends on a_fetch) */;\n",
            "  if (\n",
            "      -1 + -16 * gid(1) + -1 * lid(1) + n >= 0\n",
            "      && -1 + -16 * gid(0) + -1 * lid(0) + n >= 0\n",
            "    )\n",
            "    c[n * (lid(0) + gid(0) * 16) + lid(1) + gid(1) * 16] = a_fetch_0[lid(0)] * b_fetch_0[lid(1)];\n",
            "}\n"
          ]
        }
      ],
      "source": [
        "fetch_knl = split_knl\n",
        "\n",
        "fetch_knl = lp.add_prefetch(fetch_knl, \"a\", [\"i_inner\"])\n",
        "fetch_knl = lp.add_prefetch(fetch_knl, \"b\", [\"j_inner\"])\n",
        "\n",
        "evt, (mat,) = fetch_knl(queue, a=a, b=b)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Transforming kernels: Eliminating Conditionals\n",
        "\n",
        "All those conditionals take time to evaluate!"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 8,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "#define lid(N) ((int) get_local_id(N))\n",
            "#define gid(N) ((int) get_group_id(N))\n",
            "\n",
            "__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)\n",
            "{\n",
            "  __local float a_fetch[16];\n",
            "  __local float b_fetch[16];\n",
            "\n",
            "  /* bulk slab for 'j_outer' */\n",
            "\n",
            "  /* bulk slab for 'i_outer' */\n",
            "\n",
            "  if (\n",
            "      -17 + -16 * gid(1) + n >= 0\n",
            "      && -17 + -16 * gid(0) + n >= 0\n",
            "    )\n",
            "  {\n",
            "    b_fetch[lid(0)] = b[lid(0) + 16 * gid(1)];\n",
            "    a_fetch[lid(0)] = a[lid(0) + 16 * gid(0)];\n",
            "    barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn depends on a_fetch_rule) */;\n",
            "    c[n * (lid(0) + gid(0) * 16) + lid(1) + gid(1) * 16] = a_fetch[lid(0)] * b_fetch[lid(1)];\n",
            "  }\n",
            "  /* final slab for 'i_outer' */\n",
            "\n",
            "  if (\n",
            "      16 + 16 * gid(0) + -1 * n >= 0\n",
            "      && -17 + -16 * gid(1) + n >= 0\n",
            "    )\n",
            "  {\n",
            "    b_fetch[lid(0)] = b[lid(0) + 16 * gid(1)];\n",
            "    if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)\n",
            "      a_fetch[lid(0)] = a[lid(0) + 16 * gid(0)];\n",
            "    barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn depends on a_fetch_rule) */;\n",
            "    if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)\n",
            "      c[n * (lid(0) + gid(0) * 16) + lid(1) + gid(1) * 16] = a_fetch[lid(0)] * b_fetch[lid(1)];\n",
            "  }\n",
            "  /* final slab for 'j_outer' */\n",
            "\n",
            "  /* bulk slab for 'i_outer' */\n",
            "\n",
            "  if (\n",
            "      16 + 16 * gid(1) + -1 * n >= 0\n",
            "      && -17 + -16 * gid(0) + n >= 0\n",
            "    )\n",
            "  {\n",
            "    if (-1 + -16 * gid(1) + -1 * lid(0) + n >= 0)\n",
            "      b_fetch[lid(0)] = b[lid(0) + 16 * gid(1)];\n",
            "    a_fetch[lid(0)] = a[lid(0) + 16 * gid(0)];\n",
            "    barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn depends on a_fetch_rule) */;\n",
            "    if (-1 + -16 * gid(1) + -1 * lid(1) + n >= 0)\n",
            "      c[n * (lid(0) + gid(0) * 16) + lid(1) + gid(1) * 16] = a_fetch[lid(0)] * b_fetch[lid(1)];\n",
            "  }\n",
            "  /* final slab for 'i_outer' */\n",
            "\n",
            "  if (\n",
            "      -1 * gid(0) + gid(1) == 0\n",
            "      && 16 + 16 * gid(0) + -1 * n >= 0\n",
            "    )\n",
            "  {\n",
            "    if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)\n",
            "      b_fetch[lid(0)] = b[lid(0) + 16 * gid(1)];\n",
            "    if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)\n",
            "      a_fetch[lid(0)] = a[lid(0) + 16 * gid(0)];\n",
            "    barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn depends on a_fetch_rule) */;\n",
            "    if (\n",
            "        -1 + -16 * gid(0) + -1 * lid(1) + n >= 0\n",
            "        && -1 + -16 * gid(0) + -1 * lid(0) + n >= 0\n",
            "      )\n",
            "      c[n * (lid(0) + gid(0) * 16) + lid(1) + gid(1) * 16] = a_fetch[lid(0)] * b_fetch[lid(1)];\n",
            "  }\n",
            "}\n"
          ]
        }
      ],
      "source": [
        "sfetch_knl = knl\n",
        "sfetch_knl = lp.split_iname(sfetch_knl, \"i\", 16,\n",
        "        outer_tag=\"g.0\", inner_tag=\"l.0\", slabs=(0,1))\n",
        "sfetch_knl = lp.split_iname(sfetch_knl, \"j\", 16,\n",
        "        outer_tag=\"g.1\", inner_tag=\"l.1\", slabs=(0,1))\n",
        "\n",
        "sfetch_knl = lp.add_prefetch(sfetch_knl, \"a\", [\"i_inner\"])\n",
        "sfetch_knl = lp.add_prefetch(sfetch_knl, \"b\", [\"j_inner\"])\n",
        "\n",
        "evt, (mat,) = sfetch_knl(queue, a=a, b=b)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 8,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {},
  "nbformat": 4,
  "nbformat_minor": 0
}