{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Loopy: Counting Operations\n",
        "\n",
        "## Setup code"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "import pyopencl as cl\n",
        "import pyopencl.array\n",
        "import pyopencl.clrandom\n",
        "import loopy as lp"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "ctx = cl.create_some_context()\n",
        "queue = cl.CommandQueue(ctx)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 18,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "/usr/lib/python3/dist-packages/IPython/core/completer.py:737: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead\n",
            "  args,_,_1,defaults = inspect.getargspec(call_obj)\n"
          ]
        }
      ],
      "source": [
        "n = 1024\n",
        "a = cl.clrandom.rand(queue, (n, n), dtype=np.float32)\n",
        "b = cl.clrandom.rand(queue, (n, n), dtype=np.float32)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Operation-counting matrix multiplication"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Here is the simple matrix-matrix multiplication kernel again:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "knl = lp.make_kernel(\n",
        "    \"{[i,j,k]: 0<=i,j,k<n}\",\n",
        "    \"c[i, j] = sum(k, a[i, k]*b[k, j])\"\n",
        "    )\n",
        "knl = lp.add_and_infer_dtypes(knl, {\"a\": np.float32, \"b\":np.float32})"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### Counting flops\n",
        "\n",
        "Let us determine the number of arithmetic operations being carried out:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "{(dtype('float32'), 'add'): PwQPolynomial(\"[n] -> { n^3 : n >= 1 }\"),\n",
              " (dtype('float32'), 'mul'): PwQPolynomial(\"[n] -> { n^3 : n >= 1 }\")}"
            ]
          },
          "execution_count": 10,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "lp.get_op_poly(knl)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "The return type is easy to evaluate for a given set of parameters--just use the `.eval_with_dict` method:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 15,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "3375"
            ]
          },
          "execution_count": 15,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "poly = lp.get_op_poly(knl)[np.dtype(np.float32), \"add\"]\n",
        "poly.eval_with_dict({\"n\": 15})"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### Counting memory access"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 17,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "{(dtype('float32'),\n",
              "  'uniform',\n",
              "  'load'): PwQPolynomial(\"[n] -> { 2 * n^3 : n >= 1 }\"),\n",
              " (dtype('float32'),\n",
              "  'uniform',\n",
              "  'store'): PwQPolynomial(\"[n] -> { n^2 : n >= 1 }\")}"
            ]
          },
          "execution_count": 17,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "lp.get_gmem_access_poly(knl)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Operation-counting a transformed kernel"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 31,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "#define int_floor_div_pos_b(a,b) (                 ( (a) - ( ((a)<0) ? ((b)-1) : 0 )  ) / (b)                 )\n",
            "#define lid(N) ((int) get_local_id(N))\n",
            "#define gid(N) ((int) get_group_id(N))\n",
            "\n",
            "__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)\n",
            "{\n",
            "  float acc_k_outer_k_inner;\n",
            "\n",
            "  acc_k_outer_k_inner = 0.0f;\n",
            "  for (int k_outer = 0; k_outer <= int_floor_div_pos_b(-16 + n, 16); ++k_outer)\n",
            "    for (int k_inner = 0; k_inner <= 15; ++k_inner)\n",
            "      acc_k_outer_k_inner = acc_k_outer_k_inner + a[n * (lid(1) + gid(0) * 16) + k_inner + k_outer * 16] * b[n * (k_inner + k_outer * 16) + lid(0) + gid(1) * 16];\n",
            "  c[n * (lid(1) + gid(0) * 16) + lid(0) + gid(1) * 16] = acc_k_outer_k_inner;\n",
            "}\n"
          ]
        }
      ],
      "source": [
        "opt_knl = knl\n",
        "opt_knl = lp.assume(opt_knl, \"n mod 16 = 0\")\n",
        "opt_knl = lp.split_iname(opt_knl, \"i\", 16, outer_tag=\"g.0\", inner_tag=\"l.1\")\n",
        "opt_knl = lp.split_iname(opt_knl, \"j\", 16, outer_tag=\"g.1\", inner_tag=\"l.0\")\n",
        "opt_knl = lp.split_iname(opt_knl, \"k\", 16)\n",
        "#opt_knl = lp.add_prefetch(opt_knl, \"a\", \"i_inner,k_inner\")\n",
        "#opt_knl = lp.add_prefetch(opt_knl, \"b\", \"j_inner,k_inner\")\n",
        "\n",
        "opt_knl = lp.set_options(opt_knl, write_cl=True)\n",
        "_ = opt_knl(queue, a=a, b=b)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Now count the memory accesses in the transformed version:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 32,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "{(dtype('float32'),\n",
              "  'consecutive',\n",
              "  'load'): PwQPolynomial(\"[n] -> { (((512 - 192 * n + 24 * n^2 - n^3) * floor((n)/16)^3 + (1536 - 384 * n + 24 * n^2) * floor((n)/16)^4 + (1536 - 192 * n) * floor((n)/16)^5 + 512 * floor((n)/16)^6) + ((1536 - 192 * n - 24 * n^2 + 3 * n^3) * floor((n)/16)^2 + (3072 - 48 * n^2) * floor((n)/16)^3 + (1536 + 192 * n) * floor((n)/16)^4) * floor((15 + n)/16) + ((1536 + 192 * n - 24 * n^2 - 3 * n^3) * floor((n)/16) + 768 * n * floor((n)/16)^2 + (-3072 + 384 * n) * floor((n)/16)^3 - 1536 * floor((n)/16)^4) * floor((15 + n)/16)^2 + ((512 + 192 * n + 24 * n^2 + n^3) + (-3072 + 48 * n^2) * floor((n)/16) + (-3072 - 384 * n) * floor((n)/16)^2) * floor((15 + n)/16)^3 + ((-1536 - 384 * n - 24 * n^2) + (1536 - 192 * n) * floor((n)/16) + 1536 * floor((n)/16)^2) * floor((15 + n)/16)^4 + (1536 + 192 * n) * floor((15 + n)/16)^5 - 512 * floor((15 + n)/16)^6) : n >= 1 }\"),\n",
              " (dtype('float32'),\n",
              "  'consecutive',\n",
              "  'store'): PwQPolynomial(\"[n] -> { (((8 * n - n^2) * floor((n)/16) + 8 * n * floor((n)/16)^2) + (8 * n + n^2) * floor((15 + n)/16) + -8 * n * floor((15 + n)/16)^2) : n >= 1 }\"),\n",
              " (dtype('float32'),\n",
              "  'nonconsecutive',\n",
              "  'load'): PwQPolynomial(\"[n] -> { (((512 - 192 * n + 24 * n^2 - n^3) * floor((n)/16)^3 + (1536 - 384 * n + 24 * n^2) * floor((n)/16)^4 + (1536 - 192 * n) * floor((n)/16)^5 + 512 * floor((n)/16)^6) + ((1536 - 192 * n - 24 * n^2 + 3 * n^3) * floor((n)/16)^2 + (3072 - 48 * n^2) * floor((n)/16)^3 + (1536 + 192 * n) * floor((n)/16)^4) * floor((15 + n)/16) + ((1536 + 192 * n - 24 * n^2 - 3 * n^3) * floor((n)/16) + 768 * n * floor((n)/16)^2 + (-3072 + 384 * n) * floor((n)/16)^3 - 1536 * floor((n)/16)^4) * floor((15 + n)/16)^2 + ((512 + 192 * n + 24 * n^2 + n^3) + (-3072 + 48 * n^2) * floor((n)/16) + (-3072 - 384 * n) * floor((n)/16)^2) * floor((15 + n)/16)^3 + ((-1536 - 384 * n - 24 * n^2) + (1536 - 192 * n) * floor((n)/16) + 1536 * floor((n)/16)^2) * floor((15 + n)/16)^4 + (1536 + 192 * n) * floor((15 + n)/16)^5 - 512 * floor((15 + n)/16)^6) : n >= 1 }\")}"
            ]
          },
          "execution_count": 32,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "lp.get_gmem_access_poly(opt_knl)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Now enable the prefetch transformation above."
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.5.1+"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}