{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# A mini-`einsum` using loopy\n",
        "\n",
        "In this problem, we will design a function that carries out an `einsum`-type operation using `loopy`. It should be usable as shown in the tests towards the end of the worksheet. Also try to perform a simple parallelization so that your code will run on a GPU."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "import numpy.linalg as la\n",
        "\n",
        "import pyopencl as cl\n",
        "import pyopencl.array\n",
        "import pyopencl.clmath\n",
        "import pyopencl.clrandom\n",
        "\n",
        "import loopy as lp"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Some hints:\n",
        "\n",
        "* `loopy.Reduction(\"sum\", (\"i\", \"j\", \"k\"), expr)` expresses a sum.\n",
        "* Build the loop domain `{[i,j]: 0<=i<Ni and 0<=j<Nj}` as a string and pass it to loopy.\n",
        "* To build strings, use\n",
        "    * `str.join()`: `\",\".join(names)` and \n",
        "    * `str.format`: `\"Hi {name}\".format(name=\"Andreas\")`"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "def loopy_einsum(queue, spec, *args):\n",
        "    arg_spec, out_spec = spec.split(\"->\")\n",
        "    arg_specs = arg_spec.split(\",\")\n",
        "    # ..."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "def loopy_einsum(queue, spec, *args):\n",
        "    arg_spec, out_spec = spec.split(\"->\")\n",
        "    arg_specs = arg_spec.split(\",\")\n",
        "\n",
        "    out_indices = set(out_spec)\n",
        "\n",
        "    all_indices = set(\n",
        "        idx\n",
        "        for argsp in arg_specs\n",
        "        for idx in argsp) | out_indices\n",
        "\n",
        "    sum_indices = all_indices - out_indices\n",
        "\n",
        "    from pymbolic import var\n",
        "    lhs = var(\"out\")[tuple(var(i) for i in out_spec)]\n",
        "\n",
        "    rhs = 1\n",
        "    for i, argsp in enumerate(arg_specs):\n",
        "        rhs = rhs * var(\"arg%d\" % i)[tuple(var(i) for i in argsp)]\n",
        "\n",
        "    if sum_indices:\n",
        "        rhs = lp.Reduction(\"sum\", tuple(var(idx) for idx in sum_indices), rhs)\n",
        "\n",
        "    constraints = \" and \".join(\n",
        "        \"0 <= %s < N%s\" % (idx, idx)\n",
        "        for idx in all_indices\n",
        "        )\n",
        "\n",
        "    domain = \"{[%s]: %s}\" % (\",\".join(all_indices), constraints)\n",
        "    knl = lp.make_kernel(domain, [lp.ExpressionInstruction(lhs, rhs)])\n",
        "\n",
        "    knl = lp.set_options(knl, write_cl=True)\n",
        "\n",
        "    kwargs = {}\n",
        "    for i, arg in enumerate(args):\n",
        "        kwargs[\"arg%d\" % i] = arg\n",
        "\n",
        "    evt, (out,) = knl(queue, **kwargs)\n",
        "    return out"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Now, let us test our implementation, using a simple matrix-matrix multiplication:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "#if __OPENCL_C_VERSION__ < 120\n",
            "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n",
            "#endif\n",
            "#define lid(N) ((int) get_local_id(N))\n",
            "#define gid(N) ((int) get_group_id(N))\n",
            "\n",
            "__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(int const Ni, int const Nj, int const Nk, __global double const *restrict arg0, __global double const *restrict arg1, __global double *restrict out)\n",
            "{\n",
            "  double acc_k;\n",
            "\n",
            "  for (int j = 0; j <= -1 + Nj; ++j)\n",
            "    if (\n",
            "        -1 + Nk >= 0\n",
            "        && -1 + Ni >= 0\n",
            "      )\n",
            "      for (int i = 0; i <= -1 + Ni; ++i)\n",
            "      {\n",
            "        acc_k = 0.0;\n",
            "        for (int k = 0; k <= -1 + Nk; ++k)\n",
            "          acc_k = acc_k + arg0[Nk * i + k] * arg1[Nj * k + j];\n",
            "        out[Nj * i + j] = acc_k;\n",
            "      }\n",
            "}\n",
            "1.361706615e-12\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "/home/andreas/src/loopy/loopy/compiled.py:841: LoopyWarning: kernel scheduling was ambiguous--more than one schedule found, ignoring\n",
            "  kernel = get_one_scheduled_kernel(kernel)\n",
            "/home/andreas/src/loopy/loopy/diagnostic.py:60: LoopyAdvisory: No device parameter was passed to the PyOpenCLTarget. Perhaps you want to pass a device to benefit from additional checking. (add 'no_device_in_pre_codegen_checks' to silenced_warnings kernel argument to disable)\n",
            "  warn(text, type)\n"
          ]
        }
      ],
      "source": [
        "cl_context = cl.create_some_context()\n",
        "queue = cl.CommandQueue(cl_context)\n",
        "\n",
        "a = cl.clrandom.rand(queue, (300, 300), dtype=np.float64)\n",
        "b = cl.clrandom.rand(queue, (300, 300), dtype=np.float64)\n",
        "\n",
        "ab = loopy_einsum(queue, \"ik,kj->ij\", a, b)\n",
        "\n",
        "diff =  a.get().dot(b.get()) - ab.get()\n",
        "\n",
        "print(la.norm(diff, 2))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.5.1+"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}