{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Hello Loopy: Computing a Rank-One Matrix\n", "\n", "## Setup Code" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import numpy as np\n", "import pyopencl as cl\n", "import pyopencl.array\n", "import pyopencl.clrandom\n", "import loopy as lp" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "ctx = cl.create_some_context()\n", "queue = cl.CommandQueue(ctx)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "n = 1024\n", "a = cl.clrandom.rand(queue, n, dtype=np.float32)\n", "b = cl.clrandom.rand(queue, n, dtype=np.float32)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## The Initial Kernel" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "knl = lp.make_kernel(\n", " \"{[i,j]: 0<=i,j 0, \"'c' has negative stride in axis 0\"\n", " assert _lpy_strides_1 > 0, \"'c' has negative stride in axis 1\"\n", " _lpy_alloc_size = _lpy_strides_0*(_lpy_shape_0 + -1) + _lpy_strides_1*(_lpy_shape_1 + -1) + 4\n", " c = _lpy_cl_array.Array(queue, (_lpy_shape_0, _lpy_shape_1), _lpy_np.float32, strides=(_lpy_strides_0, _lpy_strides_1), data=allocator(_lpy_alloc_size), allocator=allocator)\n", " del _lpy_shape_0\n", " del _lpy_strides_0\n", " del _lpy_shape_1\n", " del _lpy_strides_1\n", " del _lpy_alloc_size\n", "\n", " _lpy_made_by_loopy = True\n", "\n", " if not _lpy_made_by_loopy:\n", " if c.dtype != _lpy_np.float32:\n", " raise TypeError(\"dtype mismatch on argument 'c' (got: %s, expected: float32)\" % c.dtype)\n", " if c.shape != (n, n):\n", " raise TypeError(\"shape mismatch on argument 'c' (got: %s, expected: %s)\" % (c.shape, (n, n,)))\n", " if c.strides != (4*n, 4):\n", " raise TypeError(\"strides mismatch on argument 'c' (got: %s, expected: %s)\" % (c.strides, (4*n, 4)))\n", " if c.offset:\n", " raise ValueError(\"Argument 'c' does not allow arrays with offsets. Try passing default_offset=loopy.auto to make_kernel().\")\n", "\n", " del _lpy_made_by_loopy\n", "\n", " cl_kernel.set_arg(2, c.base_data)\n", "\n", " # }}}\n", "\n", " # }}}\n", "\n", " _lpy_evt = _lpy_cl.enqueue_nd_range_kernel(queue, cl_kernel, (int(1),), (int(1),), wait_for=wait_for, g_times_l=True)\n", "\n", " if out_host is None and (_lpy_encountered_numpy and not _lpy_encountered_dev):\n", " out_host = True\n", " if out_host:\n", " pass\n", " c = c.get(queue=queue)\n", "\n", " return _lpy_evt, (c,)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/andreas/src/loopy/loopy/diagnostic.py:60: LoopyAdvisory: No device parameter was passed to the PyOpenCLTarget. Perhaps you want to pass a device to benefit from additional checking. (add 'no_device_in_pre_codegen_checks' to silenced_warnings kernel argument to disable)\n", " warn(text, type)\n" ] } ], "source": [ "wknl = lp.set_options(knl, write_wrapper=True, write_cl=False)\n", "evt, (mat,) = wknl(queue, a=a, b=b)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Transforming kernels: Loop Splitting\n", "\n", "Next: transform kernel. Example: Split a loop into fixed-length \"chunks\"." ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "#define int_floor_div_pos_b(a,b) ( ( (a) - ( ((a)<0) ? ((b)-1) : 0 ) ) / (b) )\n", "#define lid(N) ((int) get_local_id(N))\n", "#define gid(N) ((int) get_group_id(N))\n", "\n", "__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)\n", "{\n", "\n", " for (int i_outer = 0; i_outer <= -1 + int_floor_div_pos_b(3 + n, 4); ++i_outer)\n", " for (int j = 0; j <= -1 + n; ++j)\n", " for (int i_inner = 0; i_inner <= 3; ++i_inner)\n", " if (-1 + -1 * i_inner + -4 * i_outer + n >= 0)\n", " c[n * (i_inner + i_outer * 4) + j] = a[i_inner + i_outer * 4] * b[j];\n", "}\n" ] } ], "source": [ "isplit_knl = knl\n", "isplit_knl = lp.split_iname(isplit_knl, \"i\", 4)\n", "\n", "evt, (mat,) = isplit_knl(queue, a=a, b=b)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Want to get rid of the conditional?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Transforming kernels: Implementation Tags\n", "\n", "Every loop axis (\"iname\") comes with an *implementation tag*." ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "#define int_floor_div_pos_b(a,b) ( ( (a) - ( ((a)<0) ? ((b)-1) : 0 ) ) / (b) )\n", "#define lid(N) ((int) get_local_id(N))\n", "#define gid(N) ((int) get_group_id(N))\n", "\n", "__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)\n", "{\n", "\n", " for (int i_outer = 0; i_outer <= int_floor_div_pos_b(-4 + n, 4); ++i_outer)\n", " for (int j = 0; j <= -1 + n; ++j)\n", " {\n", " c[n * (0 + i_outer * 4) + j] = a[0 + i_outer * 4] * b[j];\n", " c[n * (1 + i_outer * 4) + j] = a[1 + i_outer * 4] * b[j];\n", " c[n * (2 + i_outer * 4) + j] = a[2 + i_outer * 4] * b[j];\n", " c[n * (3 + i_outer * 4) + j] = a[3 + i_outer * 4] * b[j];\n", " }\n", "}\n" ] } ], "source": [ "isplit_knl = knl\n", "isplit_knl = lp.assume(isplit_knl, \"n mod 4 = 0\")\n", "isplit_knl = lp.split_iname(isplit_knl, \"i\", 4)\n", "isplit_knl = lp.tag_inames(isplit_knl, {\"i_inner\": \"unr\"})\n", "\n", "evt, (mat,) = isplit_knl(queue, a=a, b=b)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "May want to influence loop ordering." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----\n", "\"Map to GPU hw axis\" is an iname tag as well.\n", "\n", "Use shortcuts for less typing:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "#define lid(N) ((int) get_local_id(N))\n", "#define gid(N) ((int) get_group_id(N))\n", "\n", "__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)\n", "{\n", "\n", " if (\n", " -1 + -16 * gid(1) + -1 * lid(1) + n >= 0\n", " && -1 + -16 * gid(0) + -1 * lid(0) + n >= 0\n", " )\n", " c[n * (lid(0) + gid(0) * 16) + lid(1) + gid(1) * 16] = a[lid(0) + gid(0) * 16] * b[lid(1) + gid(1) * 16];\n", "}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/andreas/src/loopy/loopy/diagnostic.py:60: LoopyAdvisory: No device parameter was passed to the PyOpenCLTarget. Perhaps you want to pass a device to benefit from additional checking. (add 'no_device_in_pre_codegen_checks' to silenced_warnings kernel argument to disable)\n", " warn(text, type)\n" ] } ], "source": [ "split_knl = knl\n", "split_knl = lp.split_iname(split_knl, \"i\", 16,\n", " outer_tag=\"g.0\", inner_tag=\"l.0\")\n", "split_knl = lp.split_iname(split_knl, \"j\", 16,\n", " outer_tag=\"g.1\", inner_tag=\"l.1\")\n", "\n", "evt, (mat,) = split_knl(queue, a=a, b=b)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Transforming kernels: Leveraging data reuse\n", "\n", "Better! But still not much data reuse." ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "#define lid(N) ((int) get_local_id(N))\n", "#define gid(N) ((int) get_group_id(N))\n", "\n", "__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)\n", "{\n", " float b_fetch_0;\n", " float a_fetch_0;\n", "\n", " for (int i = 0; i <= -1 + n; ++i)\n", " {\n", " a_fetch_0 = a[i];\n", " for (int j = 0; j <= -1 + n; ++j)\n", " {\n", " b_fetch_0 = b[j];\n", " c[n * i + j] = a_fetch_0 * b_fetch_0;\n", " }\n", " }\n", "}\n" ] } ], "source": [ "fetch1_knl = knl\n", "\n", "fetch1_knl = lp.add_prefetch(fetch1_knl, \"a\")\n", "fetch1_knl = lp.add_prefetch(fetch1_knl, \"b\")\n", "\n", "evt, (mat,) = fetch1_knl(queue, a=a, b=b)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "But this is useless for the GPU version. (demo)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "---\n", "\n", "Would like to fetch entire \"access footprint\" of a loop." ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "#define lid(N) ((int) get_local_id(N))\n", "#define gid(N) ((int) get_group_id(N))\n", "\n", "__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)\n", "{\n", " __local float b_fetch_0[16];\n", " __local float a_fetch_0[16];\n", "\n", " if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)\n", " a_fetch_0[lid(0)] = a[lid(0) + 16 * gid(0)];\n", " if (-1 + -16 * gid(1) + -1 * lid(0) + n >= 0)\n", " b_fetch_0[lid(0)] = b[lid(0) + 16 * gid(1)];\n", " barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch_0 (insn depends on a_fetch) */;\n", " if (\n", " -1 + -16 * gid(1) + -1 * lid(1) + n >= 0\n", " && -1 + -16 * gid(0) + -1 * lid(0) + n >= 0\n", " )\n", " c[n * (lid(0) + gid(0) * 16) + lid(1) + gid(1) * 16] = a_fetch_0[lid(0)] * b_fetch_0[lid(1)];\n", "}\n" ] } ], "source": [ "fetch_knl = split_knl\n", "\n", "fetch_knl = lp.add_prefetch(fetch_knl, \"a\", [\"i_inner\"])\n", "fetch_knl = lp.add_prefetch(fetch_knl, \"b\", [\"j_inner\"])\n", "\n", "evt, (mat,) = fetch_knl(queue, a=a, b=b)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Transforming kernels: Eliminating Conditionals\n", "\n", "All those conditionals take time to evaluate!" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "#define lid(N) ((int) get_local_id(N))\n", "#define gid(N) ((int) get_group_id(N))\n", "\n", "__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)\n", "{\n", " __local float a_fetch[16];\n", " __local float b_fetch[16];\n", "\n", " /* bulk slab for 'j_outer' */\n", "\n", " /* bulk slab for 'i_outer' */\n", "\n", " if (\n", " -17 + -16 * gid(1) + n >= 0\n", " && -17 + -16 * gid(0) + n >= 0\n", " )\n", " {\n", " b_fetch[lid(0)] = b[lid(0) + 16 * gid(1)];\n", " a_fetch[lid(0)] = a[lid(0) + 16 * gid(0)];\n", " barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn depends on a_fetch_rule) */;\n", " c[n * (lid(0) + gid(0) * 16) + lid(1) + gid(1) * 16] = a_fetch[lid(0)] * b_fetch[lid(1)];\n", " }\n", " /* final slab for 'i_outer' */\n", "\n", " if (\n", " 16 + 16 * gid(0) + -1 * n >= 0\n", " && -17 + -16 * gid(1) + n >= 0\n", " )\n", " {\n", " b_fetch[lid(0)] = b[lid(0) + 16 * gid(1)];\n", " if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)\n", " a_fetch[lid(0)] = a[lid(0) + 16 * gid(0)];\n", " barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn depends on a_fetch_rule) */;\n", " if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)\n", " c[n * (lid(0) + gid(0) * 16) + lid(1) + gid(1) * 16] = a_fetch[lid(0)] * b_fetch[lid(1)];\n", " }\n", " /* final slab for 'j_outer' */\n", "\n", " /* bulk slab for 'i_outer' */\n", "\n", " if (\n", " 16 + 16 * gid(1) + -1 * n >= 0\n", " && -17 + -16 * gid(0) + n >= 0\n", " )\n", " {\n", " if (-1 + -16 * gid(1) + -1 * lid(0) + n >= 0)\n", " b_fetch[lid(0)] = b[lid(0) + 16 * gid(1)];\n", " a_fetch[lid(0)] = a[lid(0) + 16 * gid(0)];\n", " barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn depends on a_fetch_rule) */;\n", " if (-1 + -16 * gid(1) + -1 * lid(1) + n >= 0)\n", " c[n * (lid(0) + gid(0) * 16) + lid(1) + gid(1) * 16] = a_fetch[lid(0)] * b_fetch[lid(1)];\n", " }\n", " /* final slab for 'i_outer' */\n", "\n", " if (\n", " -1 * gid(0) + gid(1) == 0\n", " && 16 + 16 * gid(0) + -1 * n >= 0\n", " )\n", " {\n", " if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)\n", " b_fetch[lid(0)] = b[lid(0) + 16 * gid(1)];\n", " if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)\n", " a_fetch[lid(0)] = a[lid(0) + 16 * gid(0)];\n", " barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn depends on a_fetch_rule) */;\n", " if (\n", " -1 + -16 * gid(0) + -1 * lid(1) + n >= 0\n", " && -1 + -16 * gid(0) + -1 * lid(0) + n >= 0\n", " )\n", " c[n * (lid(0) + gid(0) * 16) + lid(1) + gid(1) * 16] = a_fetch[lid(0)] * b_fetch[lid(1)];\n", " }\n", "}\n" ] } ], "source": [ "sfetch_knl = knl\n", "sfetch_knl = lp.split_iname(sfetch_knl, \"i\", 16,\n", " outer_tag=\"g.0\", inner_tag=\"l.0\", slabs=(0,1))\n", "sfetch_knl = lp.split_iname(sfetch_knl, \"j\", 16,\n", " outer_tag=\"g.1\", inner_tag=\"l.1\", slabs=(0,1))\n", "\n", "sfetch_knl = lp.add_prefetch(sfetch_knl, \"a\", [\"i_inner\"])\n", "sfetch_knl = lp.add_prefetch(sfetch_knl, \"b\", [\"j_inner\"])\n", "\n", "evt, (mat,) = sfetch_knl(queue, a=a, b=b)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [], "source": [] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 0 }