{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Loopy: Transforming a PDE to Code"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import pymbolic.primitives as p\n",
        "\n",
        "u = p.Variable(\"u\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "We'll write code that evaluates $\\triangle u$ using finite differences.\n",
        "\n",
        "To that end, we define a new expression 'thing': An operator for the Laplacian."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "Sum((Laplacian(Variable('u')), Power(Variable('u'), 2), -1))"
            ]
          },
          "execution_count": 2,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "class Laplacian(p.Expression):\n",
        "    def __init__(self, child):\n",
        "        self.child = child\n",
        "        \n",
        "    def __getinitargs__(self):\n",
        "        return (self.child,)\n",
        "    \n",
        "    mapper_method = \"map_laplacian\"\n",
        "        \n",
        "pde = Laplacian(u)+u**2-1\n",
        "pde"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Now we'll write code to turn Laplacians into their discrete finite difference forms, using `i` and `j` as formal indices, using\n",
        "\n",
        "$$f''(x) \\approx \\frac{f(x+h) - 2 f(x) + f(x-h)}{h^{2}}$$\n",
        "\n",
        "Pay close attention to indexing!"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "from pymbolic.mapper import IdentityMapper\n",
        "\n",
        "i = p.Variable(\"i\")\n",
        "j = p.Variable(\"j\")\n",
        "\n",
        "ii = i+1\n",
        "jj = j+1"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "class FDMapper(IdentityMapper):\n",
        "    def map_variable(self, expr):\n",
        "        return expr[ii, jj]\n",
        "\n",
        "    def map_laplacian(self, expr):\n",
        "        var = expr.child\n",
        "        return (-4*var[ii,jj] + var[ii+1,jj] + var[ii-1,jj]\n",
        "                + var[ii,jj+1] + var[ii,jj-1])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 8,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]\n"
          ]
        }
      ],
      "source": [
        "fd_mapper = FDMapper()\n",
        "print(fd_mapper(pde))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "----\n",
        "\n",
        "Now let's generate some code for this, using `loopy`:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import loopy as lp\n",
        "\n",
        "result = p.Variable(\"result\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Observe the two parts of the `loopy` kernel description:\n",
        "\n",
        "* Polyhedral loop domain\n",
        "* Instructions"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 12,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "knl = lp.make_kernel(\n",
        "    \"{[i,j]: 0<=i,j<n}\",\n",
        "    [lp.Assignment(result[ii, jj], fd_mapper(pde))],\n",
        "    )"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Kernels can always be inspected--simply use `print`:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 13,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "---------------------------------------------------------------------------\n",
            "KERNEL: loopy_kernel\n",
            "---------------------------------------------------------------------------\n",
            "ARGUMENTS:\n",
            "n: ValueArg, type: <runtime>\n",
            "result: GlobalArg, type: <runtime>, shape: (1 + n, 1 + n), dim_tags: (N1:stride:1 + n, N0:stride:1)\n",
            "u: GlobalArg, type: <runtime>, shape: (2 + n, 2 + n), dim_tags: (N1:stride:2 + n, N0:stride:1)\n",
            "---------------------------------------------------------------------------\n",
            "DOMAINS:\n",
            "[n] -> { [i, j] : 0 <= i < n and 0 <= j < n }\n",
            "---------------------------------------------------------------------------\n",
            "INAME IMPLEMENTATION TAGS:\n",
            "i: None\n",
            "j: None\n",
            "---------------------------------------------------------------------------\n",
            "INSTRUCTIONS:\n",
            "[i,j]                                \u001b[34mresult[i + 1, j + 1]\u001b[0m <- \u001b[35mu[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]\u001b[0m   # \u001b[32minsn\u001b[0m\n",
            "---------------------------------------------------------------------------\n"
          ]
        }
      ],
      "source": [
        "print(knl)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "----\n",
        "\n",
        "Let's move towards running this code. To do so, we need `pyopencl`:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 15,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "import pyopencl as cl\n",
        "import pyopencl.array\n",
        "import pyopencl.clrandom\n",
        "\n",
        "ctx = cl.create_some_context()\n",
        "queue = cl.CommandQueue(ctx)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "And some data to work with:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 16,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "n = 1000\n",
        "u = cl.clrandom.rand(queue, (n+2,n+2), dtype=np.float32)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Now run the code, and tell loopy to print what it generates:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 17,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "#define lid(N) ((int) get_local_id(N))\n",
            "#define gid(N) ((int) get_group_id(N))\n",
            "\n",
            "__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(int const n, __global float *restrict result, __global float const *restrict u)\n",
            "{\n",
            "\n",
            "  for (int j = 0; j <= -1 + n; ++j)\n",
            "    for (int i = 0; i <= -1 + n; ++i)\n",
            "      result[(1 + n) * (i + 1) + j + 1] = u[(2 + n) * (i + 1) + j + 1] * u[(2 + n) * (i + 1) + j + 1] + -1.0f + -4.0f * u[(2 + n) * (i + 1) + j + 1] + u[(2 + n) * (i + 1 + 1) + j + 1] + u[(2 + n) * (i + 1 + -1) + j + 1] + u[(2 + n) * (i + 1) + j + 1 + 1] + u[(2 + n) * (i + 1) + j + 1 + -1];\n",
            "}\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "/home/andreas/src/loopy/loopy/compiled.py:860: LoopyWarning: kernel scheduling was ambiguous--more than one schedule found, ignoring\n",
            "  kernel = get_one_scheduled_kernel(kernel)\n",
            "/home/andreas/src/loopy/loopy/diagnostic.py:60: LoopyAdvisory: No device parameter was passed to the PyOpenCLTarget. Perhaps you want to pass a device to benefit from additional checking. (add 'no_device_in_pre_codegen_checks' to silenced_warnings kernel argument to disable)\n",
            "  warn(text, type)\n"
          ]
        }
      ],
      "source": [
        "knl = lp.set_options(knl, write_cl=True)\n",
        "\n",
        "evt, (result,) = knl(queue, u=u, n=n)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "That's obviously not very parallel. Introduce parallelism:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 18,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "#define lid(N) ((int) get_local_id(N))\n",
            "#define gid(N) ((int) get_group_id(N))\n",
            "\n",
            "__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(int const n, __global float *restrict result, __global float const *restrict u)\n",
            "{\n",
            "\n",
            "  result[(1 + n) * (gid(0) + 1) + gid(1) + 1] = u[(2 + n) * (gid(0) + 1) + gid(1) + 1] * u[(2 + n) * (gid(0) + 1) + gid(1) + 1] + -1.0f + -4.0f * u[(2 + n) * (gid(0) + 1) + gid(1) + 1] + u[(2 + n) * (gid(0) + 1 + 1) + gid(1) + 1] + u[(2 + n) * (gid(0) + 1 + -1) + gid(1) + 1] + u[(2 + n) * (gid(0) + 1) + gid(1) + 1 + 1] + u[(2 + n) * (gid(0) + 1) + gid(1) + 1 + -1];\n",
            "}\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "/home/andreas/src/loopy/loopy/diagnostic.py:60: LoopyAdvisory: No device parameter was passed to the PyOpenCLTarget. Perhaps you want to pass a device to benefit from additional checking. (add 'no_device_in_pre_codegen_checks' to silenced_warnings kernel argument to disable)\n",
            "  warn(text, type)\n"
          ]
        }
      ],
      "source": [
        "tknl = knl\n",
        "tknl = lp.tag_inames(tknl, {\"i\": \"g.0\", \"j\": \"g.1\"})\n",
        "evt, (result,) = tknl(queue, u=u)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "But OpenCL/CUDA require blocking to be efficient!"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 19,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "#define lid(N) ((int) get_local_id(N))\n",
            "#define gid(N) ((int) get_group_id(N))\n",
            "\n",
            "__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) loopy_kernel(int const n, __global float *restrict result, __global float const *restrict u)\n",
            "{\n",
            "\n",
            "  if (\n",
            "      -1 + -16 * gid(0) + -1 * lid(0) + n >= 0\n",
            "      && -1 + -16 * gid(1) + -1 * lid(1) + n >= 0\n",
            "    )\n",
            "    result[(1 + n) * (1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] = u[(2 + n) * (1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] * u[(2 + n) * (1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] + -1.0f + -4.0f * u[(2 + n) * (1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] + u[(2 + n) * (1 + 1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] + u[(2 + n) * (1 + -1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] + u[(2 + n) * (1 + lid(1) + gid(1) * 16) + 1 + 1 + lid(0) + gid(0) * 16] + u[(2 + n) * (1 + lid(1) + gid(1) * 16) + 1 + -1 + lid(0) + gid(0) * 16];\n",
            "}\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "/home/andreas/src/loopy/loopy/diagnostic.py:60: LoopyAdvisory: No device parameter was passed to the PyOpenCLTarget. Perhaps you want to pass a device to benefit from additional checking. (add 'no_device_in_pre_codegen_checks' to silenced_warnings kernel argument to disable)\n",
            "  warn(text, type)\n"
          ]
        }
      ],
      "source": [
        "sknl = knl\n",
        "sknl = lp.split_iname(sknl,\n",
        "        \"i\", 16, outer_tag=\"g.1\", inner_tag=\"l.1\")\n",
        "sknl = lp.split_iname(sknl,\n",
        "        \"j\", 16, outer_tag=\"g.0\", inner_tag=\"l.0\")\n",
        "evt, (result,) = sknl(queue, u=u)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "How about some data reuse?"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 20,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "#define lid(N) ((int) get_local_id(N))\n",
            "#define gid(N) ((int) get_group_id(N))\n",
            "\n",
            "__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) loopy_kernel(int const n, __global float *restrict result, __global float const *restrict u)\n",
            "{\n",
            "  __local float u_fetch[18 * 18];\n",
            "\n",
            "  for (int u_dim_1_outer = 0; u_dim_1_outer <= 1; ++u_dim_1_outer)\n",
            "    if (\n",
            "        17 + -16 * u_dim_1_outer + -1 * lid(0) >= 0\n",
            "        && 1 + -16 * u_dim_1_outer + -1 * lid(0) + -16 * gid(0) + n >= 0\n",
            "      )\n",
            "      for (int u_dim_0_outer = 0; u_dim_0_outer <= 1; ++u_dim_0_outer)\n",
            "        if (\n",
            "            17 + -16 * u_dim_0_outer + -1 * lid(1) >= 0\n",
            "            && 1 + -16 * u_dim_0_outer + -1 * lid(1) + -16 * gid(1) + n >= 0\n",
            "          )\n",
            "          u_fetch[18 * (lid(1) + u_dim_0_outer * 16) + lid(0) + u_dim_1_outer * 16] = u[(2 + n) * (16 * gid(1) + lid(1) + u_dim_0_outer * 16) + 16 * gid(0) + lid(0) + u_dim_1_outer * 16];\n",
            "  barrier(CLK_LOCAL_MEM_FENCE) /* for u_fetch (insn depends on u_fetch_rule) */;\n",
            "  if (\n",
            "      -1 + -16 * gid(0) + -1 * lid(0) + n >= 0\n",
            "      && -1 + -16 * gid(1) + -1 * lid(1) + n >= 0\n",
            "    )\n",
            "    result[(1 + n) * (1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] = u_fetch[18 * (1 + lid(1)) + 1 + lid(0)] * u_fetch[18 * (1 + lid(1)) + 1 + lid(0)] + -1.0f + -4.0f * u_fetch[18 * (1 + lid(1)) + 1 + lid(0)] + u_fetch[18 * (2 + lid(1)) + 1 + lid(0)] + u_fetch[18 * lid(1) + 1 + lid(0)] + u_fetch[18 * (1 + lid(1)) + 2 + lid(0)] + u_fetch[18 * (1 + lid(1)) + lid(0)];\n",
            "}\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "/home/andreas/src/loopy/loopy/compiled.py:860: LoopyWarning: kernel scheduling was ambiguous--more than one schedule found, ignoring\n",
            "  kernel = get_one_scheduled_kernel(kernel)\n",
            "/home/andreas/src/loopy/loopy/diagnostic.py:60: LoopyAdvisory: No device parameter was passed to the PyOpenCLTarget. Perhaps you want to pass a device to benefit from additional checking. (add 'no_device_in_pre_codegen_checks' to silenced_warnings kernel argument to disable)\n",
            "  warn(text, type)\n"
          ]
        }
      ],
      "source": [
        "sknl = knl\n",
        "sknl = lp.split_iname(sknl,\n",
        "        \"i\", 16, outer_tag=\"g.1\", inner_tag=\"l.1\")\n",
        "sknl = lp.split_iname(sknl,\n",
        "        \"j\", 16, outer_tag=\"g.0\", inner_tag=\"l.0\")\n",
        "sknl = lp.add_prefetch(sknl, \"u\",\n",
        "    [\"i_inner\", \"j_inner\"],\n",
        "    fetch_bounding_box=True)\n",
        "evt, (result,) = sknl(queue, u=u, n=n)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.5.1+"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}