{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Loopy: Transforming a PDE to Code" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pymbolic.primitives as p\n", "\n", "u = p.Variable(\"u\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We'll write code that evaluates $\\triangle u$ using finite differences.\n", "\n", "To that end, we define a new expression 'thing': An operator for the Laplacian." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Sum((Laplacian(Variable('u')), Power(Variable('u'), 2), -1))" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "class Laplacian(p.Expression):\n", " def __init__(self, child):\n", " self.child = child\n", " \n", " def __getinitargs__(self):\n", " return (self.child,)\n", " \n", " mapper_method = \"map_laplacian\"\n", " \n", "pde = Laplacian(u)+u**2-1\n", "pde" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we'll write code to turn Laplacians into their discrete finite difference forms, using `i` and `j` as formal indices, using\n", "\n", "$$f''(x) \\approx \\frac{f(x+h) - 2 f(x) + f(x-h)}{h^{2}}$$\n", "\n", "Pay close attention to indexing!" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from pymbolic.mapper import IdentityMapper\n", "\n", "i = p.Variable(\"i\")\n", "j = p.Variable(\"j\")\n", "\n", "ii = i+1\n", "jj = j+1" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "class FDMapper(IdentityMapper):\n", " def map_variable(self, expr):\n", " return expr[ii, jj]\n", "\n", " def map_laplacian(self, expr):\n", " var = expr.child\n", " return (-4*var[ii,jj] + var[ii+1,jj] + var[ii-1,jj]\n", " + var[ii,jj+1] + var[ii,jj-1])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]\n" ] } ], "source": [ "fd_mapper = FDMapper()\n", "print(fd_mapper(pde))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----\n", "\n", "Now let's generate some code for this, using `loopy`:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import loopy as lp\n", "\n", "result = p.Variable(\"result\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Observe the two parts of the `loopy` kernel description:\n", "\n", "* Polyhedral loop domain\n", "* Instructions" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [], "source": [ "knl = lp.make_kernel(\n", " \"{[i,j]: 0<=i,j\n", "result: GlobalArg, type: , shape: (1 + n, 1 + n), dim_tags: (N1:stride:1 + n, N0:stride:1)\n", "u: GlobalArg, type: , shape: (2 + n, 2 + n), dim_tags: (N1:stride:2 + n, N0:stride:1)\n", "---------------------------------------------------------------------------\n", "DOMAINS:\n", "[n] -> { [i, j] : 0 <= i < n and 0 <= j < n }\n", "---------------------------------------------------------------------------\n", "INAME IMPLEMENTATION TAGS:\n", "i: None\n", "j: None\n", "---------------------------------------------------------------------------\n", "INSTRUCTIONS:\n", "[i,j] \u001b[34mresult[i + 1, j + 1]\u001b[0m <- \u001b[35mu[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]\u001b[0m # \u001b[32minsn\u001b[0m\n", "---------------------------------------------------------------------------\n" ] } ], "source": [ "print(knl)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----\n", "\n", "Let's move towards running this code. To do so, we need `pyopencl`:" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import numpy as np\n", "import pyopencl as cl\n", "import pyopencl.array\n", "import pyopencl.clrandom\n", "\n", "ctx = cl.create_some_context()\n", "queue = cl.CommandQueue(ctx)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "And some data to work with:" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [], "source": [ "n = 1000\n", "u = cl.clrandom.rand(queue, (n+2,n+2), dtype=np.float32)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now run the code, and tell loopy to print what it generates:" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "#define lid(N) ((int) get_local_id(N))\n", "#define gid(N) ((int) get_group_id(N))\n", "\n", "__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(int const n, __global float *restrict result, __global float const *restrict u)\n", "{\n", "\n", " for (int j = 0; j <= -1 + n; ++j)\n", " for (int i = 0; i <= -1 + n; ++i)\n", " result[(1 + n) * (i + 1) + j + 1] = u[(2 + n) * (i + 1) + j + 1] * u[(2 + n) * (i + 1) + j + 1] + -1.0f + -4.0f * u[(2 + n) * (i + 1) + j + 1] + u[(2 + n) * (i + 1 + 1) + j + 1] + u[(2 + n) * (i + 1 + -1) + j + 1] + u[(2 + n) * (i + 1) + j + 1 + 1] + u[(2 + n) * (i + 1) + j + 1 + -1];\n", "}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/andreas/src/loopy/loopy/compiled.py:860: LoopyWarning: kernel scheduling was ambiguous--more than one schedule found, ignoring\n", " kernel = get_one_scheduled_kernel(kernel)\n", "/home/andreas/src/loopy/loopy/diagnostic.py:60: LoopyAdvisory: No device parameter was passed to the PyOpenCLTarget. Perhaps you want to pass a device to benefit from additional checking. (add 'no_device_in_pre_codegen_checks' to silenced_warnings kernel argument to disable)\n", " warn(text, type)\n" ] } ], "source": [ "knl = lp.set_options(knl, write_cl=True)\n", "\n", "evt, (result,) = knl(queue, u=u, n=n)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "That's obviously not very parallel. Introduce parallelism:" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "#define lid(N) ((int) get_local_id(N))\n", "#define gid(N) ((int) get_group_id(N))\n", "\n", "__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(int const n, __global float *restrict result, __global float const *restrict u)\n", "{\n", "\n", " result[(1 + n) * (gid(0) + 1) + gid(1) + 1] = u[(2 + n) * (gid(0) + 1) + gid(1) + 1] * u[(2 + n) * (gid(0) + 1) + gid(1) + 1] + -1.0f + -4.0f * u[(2 + n) * (gid(0) + 1) + gid(1) + 1] + u[(2 + n) * (gid(0) + 1 + 1) + gid(1) + 1] + u[(2 + n) * (gid(0) + 1 + -1) + gid(1) + 1] + u[(2 + n) * (gid(0) + 1) + gid(1) + 1 + 1] + u[(2 + n) * (gid(0) + 1) + gid(1) + 1 + -1];\n", "}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/andreas/src/loopy/loopy/diagnostic.py:60: LoopyAdvisory: No device parameter was passed to the PyOpenCLTarget. Perhaps you want to pass a device to benefit from additional checking. (add 'no_device_in_pre_codegen_checks' to silenced_warnings kernel argument to disable)\n", " warn(text, type)\n" ] } ], "source": [ "tknl = knl\n", "tknl = lp.tag_inames(tknl, {\"i\": \"g.0\", \"j\": \"g.1\"})\n", "evt, (result,) = tknl(queue, u=u)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "But OpenCL/CUDA require blocking to be efficient!" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "#define lid(N) ((int) get_local_id(N))\n", "#define gid(N) ((int) get_group_id(N))\n", "\n", "__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) loopy_kernel(int const n, __global float *restrict result, __global float const *restrict u)\n", "{\n", "\n", " if (\n", " -1 + -16 * gid(0) + -1 * lid(0) + n >= 0\n", " && -1 + -16 * gid(1) + -1 * lid(1) + n >= 0\n", " )\n", " result[(1 + n) * (1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] = u[(2 + n) * (1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] * u[(2 + n) * (1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] + -1.0f + -4.0f * u[(2 + n) * (1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] + u[(2 + n) * (1 + 1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] + u[(2 + n) * (1 + -1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] + u[(2 + n) * (1 + lid(1) + gid(1) * 16) + 1 + 1 + lid(0) + gid(0) * 16] + u[(2 + n) * (1 + lid(1) + gid(1) * 16) + 1 + -1 + lid(0) + gid(0) * 16];\n", "}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/andreas/src/loopy/loopy/diagnostic.py:60: LoopyAdvisory: No device parameter was passed to the PyOpenCLTarget. Perhaps you want to pass a device to benefit from additional checking. (add 'no_device_in_pre_codegen_checks' to silenced_warnings kernel argument to disable)\n", " warn(text, type)\n" ] } ], "source": [ "sknl = knl\n", "sknl = lp.split_iname(sknl,\n", " \"i\", 16, outer_tag=\"g.1\", inner_tag=\"l.1\")\n", "sknl = lp.split_iname(sknl,\n", " \"j\", 16, outer_tag=\"g.0\", inner_tag=\"l.0\")\n", "evt, (result,) = sknl(queue, u=u)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "How about some data reuse?" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "#define lid(N) ((int) get_local_id(N))\n", "#define gid(N) ((int) get_group_id(N))\n", "\n", "__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) loopy_kernel(int const n, __global float *restrict result, __global float const *restrict u)\n", "{\n", " __local float u_fetch[18 * 18];\n", "\n", " for (int u_dim_1_outer = 0; u_dim_1_outer <= 1; ++u_dim_1_outer)\n", " if (\n", " 17 + -16 * u_dim_1_outer + -1 * lid(0) >= 0\n", " && 1 + -16 * u_dim_1_outer + -1 * lid(0) + -16 * gid(0) + n >= 0\n", " )\n", " for (int u_dim_0_outer = 0; u_dim_0_outer <= 1; ++u_dim_0_outer)\n", " if (\n", " 17 + -16 * u_dim_0_outer + -1 * lid(1) >= 0\n", " && 1 + -16 * u_dim_0_outer + -1 * lid(1) + -16 * gid(1) + n >= 0\n", " )\n", " u_fetch[18 * (lid(1) + u_dim_0_outer * 16) + lid(0) + u_dim_1_outer * 16] = u[(2 + n) * (16 * gid(1) + lid(1) + u_dim_0_outer * 16) + 16 * gid(0) + lid(0) + u_dim_1_outer * 16];\n", " barrier(CLK_LOCAL_MEM_FENCE) /* for u_fetch (insn depends on u_fetch_rule) */;\n", " if (\n", " -1 + -16 * gid(0) + -1 * lid(0) + n >= 0\n", " && -1 + -16 * gid(1) + -1 * lid(1) + n >= 0\n", " )\n", " result[(1 + n) * (1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] = u_fetch[18 * (1 + lid(1)) + 1 + lid(0)] * u_fetch[18 * (1 + lid(1)) + 1 + lid(0)] + -1.0f + -4.0f * u_fetch[18 * (1 + lid(1)) + 1 + lid(0)] + u_fetch[18 * (2 + lid(1)) + 1 + lid(0)] + u_fetch[18 * lid(1) + 1 + lid(0)] + u_fetch[18 * (1 + lid(1)) + 2 + lid(0)] + u_fetch[18 * (1 + lid(1)) + lid(0)];\n", "}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/andreas/src/loopy/loopy/compiled.py:860: LoopyWarning: kernel scheduling was ambiguous--more than one schedule found, ignoring\n", " kernel = get_one_scheduled_kernel(kernel)\n", "/home/andreas/src/loopy/loopy/diagnostic.py:60: LoopyAdvisory: No device parameter was passed to the PyOpenCLTarget. Perhaps you want to pass a device to benefit from additional checking. (add 'no_device_in_pre_codegen_checks' to silenced_warnings kernel argument to disable)\n", " warn(text, type)\n" ] } ], "source": [ "sknl = knl\n", "sknl = lp.split_iname(sknl,\n", " \"i\", 16, outer_tag=\"g.1\", inner_tag=\"l.1\")\n", "sknl = lp.split_iname(sknl,\n", " \"j\", 16, outer_tag=\"g.0\", inner_tag=\"l.0\")\n", "sknl = lp.add_prefetch(sknl, \"u\",\n", " [\"i_inner\", \"j_inner\"],\n", " fetch_bounding_box=True)\n", "evt, (result,) = sknl(queue, u=u, n=n)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.1+" } }, "nbformat": 4, "nbformat_minor": 0 }