{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Practice: Matrix Products"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Do the following:\n",
        "    \n",
        "* Implement a matrix-matrix product $A\\overline{B}^T$ in loopy. Let $A$ be real-valued and $B$ be complex-valued. The overline symbolizes complex conjugation."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Setup code"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 17,
      "metadata": {
        "collapsed": true
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "import numpy.linalg as la\n",
        "import pyopencl as cl\n",
        "import pyopencl.array\n",
        "import pyopencl.clrandom\n",
        "import loopy as lp"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 18,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "ctx = cl.create_some_context()\n",
        "queue = cl.CommandQueue(ctx)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 19,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "n = 1024\n",
        "A = cl.clrandom.rand(queue, (n, n), dtype=np.float64)\n",
        "B = (\n",
        "    cl.clrandom.rand(queue, (n, n), dtype=np.float64)\n",
        "    +\n",
        "    1j * cl.clrandom.rand(queue, (n, n), dtype=np.float64))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Implementing the Kernel"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Implement the basic kernel here:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 20,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "knl = lp.make_kernel(\n",
        "    \"{[i,j,k]: 0<=i,j,k<n}\",\n",
        "    \"C[i,j] = sum(k, A[i, k]*conj(B[j, k]))\"\n",
        "    )"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Here we execute the kernel, making sure we get to see the generated code:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 21,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36mif __OPENCL_C_VERSION__ < 120\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36mpragma OPENCL EXTENSION cl_khr_fp64: enable\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36mendif\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine PYOPENCL_DEFINE_CDOUBLE\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36minclude\u001b[39;49;00m \u001b[37m<pyopencl-complex.h>\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\n",
            "__kernel \u001b[36mvoid\u001b[39;49;00m \u001b[32m__attribute__\u001b[39;49;00m ((reqd_work_group_size(\u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(__global \u001b[36mdouble\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *\u001b[34mrestrict\u001b[39;49;00m A, __global cdouble_t \u001b[34mconst\u001b[39;49;00m *\u001b[34mrestrict\u001b[39;49;00m B, __global cdouble_t *\u001b[34mrestrict\u001b[39;49;00m C, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m n)\n",
            "{\n",
            "  cdouble_t acc_k;\n",
            "\n",
            "  \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m j = \u001b[34m0\u001b[39;49;00m; j <= -\u001b[34m1\u001b[39;49;00m + n; ++j)\n",
            "    \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m i = \u001b[34m0\u001b[39;49;00m; i <= -\u001b[34m1\u001b[39;49;00m + n; ++i)\n",
            "    {\n",
            "      acc_k = cdouble_fromreal(\u001b[34m0.0\u001b[39;49;00m);\n",
            "      \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m k = \u001b[34m0\u001b[39;49;00m; k <= -\u001b[34m1\u001b[39;49;00m + n; ++k)\n",
            "        acc_k = cdouble_add(acc_k, cdouble_rmul(A[n * i + k], cdouble_conj(B[n * j + k])));\n",
            "      C[n * i + j] = acc_k;\n",
            "    }\n",
            "}\n",
            "\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "/home/andreas/src/loopy/loopy/compiled.py:733: LoopyWarning: kernel scheduling was ambiguous--more than one schedule found, ignoring\n",
            "  kernel = get_one_scheduled_kernel(kernel)\n"
          ]
        }
      ],
      "source": [
        "_knl = lp.set_options(knl, write_cl=True, highlight_cl=True)\n",
        "evt, (C,) = _knl(queue, A=A, B=B)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Next, we test that we got the right result, using `numpy`:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 24,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "8.1249888675681689e-16"
            ]
          },
          "execution_count": 24,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "C_ref = A.get() @ B.get().T.conj()\n",
        "la.norm(C.get()-C_ref) / la.norm(C_ref)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Playing with the loop ordering\n",
        "\n",
        "Check the [loopy documentation](https://documen.tician.de/loopy) to see how to use the `seet_loop_priority` transform to prescribe a loop ordering.\n",
        "\n",
        "Try a few different variants, time their execution. Make sure to exclude the first run, because the time for that will include code generation and compilation.\n",
        "\n",
        "You may use the Python function `time.time` to get the wall clock time in seconds since the Jan 1, 1970.\n",
        "\n",
        "Also make sure to call `queue.finish()` before you start and stop the clock."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 57,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36mif __OPENCL_C_VERSION__ < 120\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36mpragma OPENCL EXTENSION cl_khr_fp64: enable\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36mendif\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine PYOPENCL_DEFINE_CDOUBLE\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36minclude\u001b[39;49;00m \u001b[37m<pyopencl-complex.h>\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\n",
            "__kernel \u001b[36mvoid\u001b[39;49;00m \u001b[32m__attribute__\u001b[39;49;00m ((reqd_work_group_size(\u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(__global \u001b[36mdouble\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *\u001b[34mrestrict\u001b[39;49;00m A, __global cdouble_t \u001b[34mconst\u001b[39;49;00m *\u001b[34mrestrict\u001b[39;49;00m B, __global cdouble_t *\u001b[34mrestrict\u001b[39;49;00m C, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m n)\n",
            "{\n",
            "  cdouble_t acc_k;\n",
            "\n",
            "  \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m i = \u001b[34m0\u001b[39;49;00m; i <= -\u001b[34m1\u001b[39;49;00m + n; ++i)\n",
            "    \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m j = \u001b[34m0\u001b[39;49;00m; j <= -\u001b[34m1\u001b[39;49;00m + n; ++j)\n",
            "    {\n",
            "      acc_k = cdouble_fromreal(\u001b[34m0.0\u001b[39;49;00m);\n",
            "      \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m k = \u001b[34m0\u001b[39;49;00m; k <= -\u001b[34m1\u001b[39;49;00m + n; ++k)\n",
            "        acc_k = cdouble_add(acc_k, cdouble_rmul(A[n * i + k], cdouble_conj(B[n * j + k])));\n",
            "      C[n * i + j] = acc_k;\n",
            "    }\n",
            "}\n",
            "\n",
            "1.8014510869979858 s per run\n"
          ]
        }
      ],
      "source": [
        "\n",
        "tknl = lp.set_loop_priority(knl, \"i,j\")\n",
        "\n",
        "\n",
        "def do_timing(timed_knl):\n",
        "    timed_knl = lp.set_options(timed_knl, write_cl=True, highlight_cl=True)\n",
        "    # Run once to 'warm up' the code\n",
        "    timed_knl(queue, A=A, B=B)\n",
        "\n",
        "    queue.finish()\n",
        "\n",
        "    from time import time\n",
        "    start = time()\n",
        "\n",
        "    nruns = 2\n",
        "    for i in range(nruns):\n",
        "        timed_knl(queue, A=A, B=B)\n",
        "\n",
        "    queue.finish()\n",
        "\n",
        "    timing = (time()-start)/nruns\n",
        "    print(timing,\"s per run\")\n",
        "    \n",
        "do_timing(tknl)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Parallelization: Single-element work groups\n",
        "\n",
        "Next, parallelize the operation using a 2D grid. Use the `tag_inames` transformation.\n",
        "\n",
        "Experiment with the ordering."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 58,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36mif __OPENCL_C_VERSION__ < 120\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36mpragma OPENCL EXTENSION cl_khr_fp64: enable\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36mendif\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine PYOPENCL_DEFINE_CDOUBLE\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36minclude\u001b[39;49;00m \u001b[37m<pyopencl-complex.h>\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\n",
            "__kernel \u001b[36mvoid\u001b[39;49;00m \u001b[32m__attribute__\u001b[39;49;00m ((reqd_work_group_size(\u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(__global \u001b[36mdouble\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *\u001b[34mrestrict\u001b[39;49;00m A, __global cdouble_t \u001b[34mconst\u001b[39;49;00m *\u001b[34mrestrict\u001b[39;49;00m B, __global cdouble_t *\u001b[34mrestrict\u001b[39;49;00m C, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m n)\n",
            "{\n",
            "  cdouble_t acc_k;\n",
            "\n",
            "  acc_k = cdouble_fromreal(\u001b[34m0.0\u001b[39;49;00m);\n",
            "  \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m k = \u001b[34m0\u001b[39;49;00m; k <= -\u001b[34m1\u001b[39;49;00m + n; ++k)\n",
            "    acc_k = cdouble_add(acc_k, cdouble_rmul(A[n * gid(\u001b[34m0\u001b[39;49;00m) + k], cdouble_conj(B[n * gid(\u001b[34m1\u001b[39;49;00m) + k])));\n",
            "  C[n * gid(\u001b[34m0\u001b[39;49;00m) + gid(\u001b[34m1\u001b[39;49;00m)] = acc_k;\n",
            "}\n",
            "\n",
            "0.8107101917266846 s per run\n"
          ]
        }
      ],
      "source": [
        "tknl = lp.tag_inames(knl, \"i:g.0,j:g.1\")\n",
        "\n",
        "do_timing(tknl)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Parallelization: Multi-element work groups\n",
        "\n",
        "Next, use more than one element per workgroup. Use the `split_iname` transformation.\n",
        "\n",
        "Experiment with group sizes and axis order."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 60,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36mif __OPENCL_C_VERSION__ < 120\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36mpragma OPENCL EXTENSION cl_khr_fp64: enable\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36mendif\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine PYOPENCL_DEFINE_CDOUBLE\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\n",
            "\u001b[36m#\u001b[39;49;00m\u001b[36minclude\u001b[39;49;00m \u001b[37m<pyopencl-complex.h>\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
            "\n",
            "__kernel \u001b[36mvoid\u001b[39;49;00m \u001b[32m__attribute__\u001b[39;49;00m ((reqd_work_group_size(\u001b[34m16\u001b[39;49;00m, \u001b[34m16\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(__global \u001b[36mdouble\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *\u001b[34mrestrict\u001b[39;49;00m A, __global cdouble_t \u001b[34mconst\u001b[39;49;00m *\u001b[34mrestrict\u001b[39;49;00m B, __global cdouble_t *\u001b[34mrestrict\u001b[39;49;00m C, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m n)\n",
            "{\n",
            "  cdouble_t acc_k;\n",
            "\n",
            "  \u001b[34mif\u001b[39;49;00m (-\u001b[34m1\u001b[39;49;00m + -\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m1\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m && -\u001b[34m1\u001b[39;49;00m + -\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m)\n",
            "  {\n",
            "    acc_k = cdouble_fromreal(\u001b[34m0.0\u001b[39;49;00m);\n",
            "    \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m k = \u001b[34m0\u001b[39;49;00m; k <= -\u001b[34m1\u001b[39;49;00m + n; ++k)\n",
            "      acc_k = cdouble_add(acc_k, cdouble_rmul(A[n * (lid(\u001b[34m0\u001b[39;49;00m) + gid(\u001b[34m0\u001b[39;49;00m) * \u001b[34m16\u001b[39;49;00m) + k], cdouble_conj(B[n * (lid(\u001b[34m1\u001b[39;49;00m) + gid(\u001b[34m1\u001b[39;49;00m) * \u001b[34m16\u001b[39;49;00m) + k])));\n",
            "    C[n * (lid(\u001b[34m0\u001b[39;49;00m) + gid(\u001b[34m0\u001b[39;49;00m) * \u001b[34m16\u001b[39;49;00m) + lid(\u001b[34m1\u001b[39;49;00m) + gid(\u001b[34m1\u001b[39;49;00m) * \u001b[34m16\u001b[39;49;00m] = acc_k;\n",
            "  }\n",
            "}\n",
            "\n",
            "0.5177402496337891 s per run\n"
          ]
        }
      ],
      "source": [
        "tknl = lp.split_iname(knl, \"i\", 16, outer_tag=\"g.0\", inner_tag=\"l.0\")\n",
        "tknl = lp.split_iname(tknl, \"j\", 16, outer_tag=\"g.1\", inner_tag=\"l.1\")\n",
        "\n",
        "do_timing(tknl)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Where to go from here\n",
        "Things to try:\n",
        "    \n",
        "* Loop Unrolling (the `unr` iname tag)\n",
        "* Instruction level parallelism (the `ilp` iname tag)\n",
        "* Prefetching (`add_prefetch`)\n",
        "* Run this on an actual GPU\n",
        "* Measure GFLOPS and GBytes/s"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": true
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.5.1+"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}