{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# A mini-`einsum` using loopy\n", "\n", "In this problem, we will design a function that carries out an `einsum`-type operation using `loopy`. It should be usable as shown in the tests towards the end of the worksheet. Also try to perform a simple parallelization so that your code will run on a GPU." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import numpy as np\n", "import numpy.linalg as la\n", "\n", "import pyopencl as cl\n", "import pyopencl.array\n", "import pyopencl.clmath\n", "import pyopencl.clrandom\n", "\n", "import loopy as lp" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Some hints:\n", "\n", "* `loopy.Reduction(\"sum\", (\"i\", \"j\", \"k\"), expr)` expresses a sum.\n", "* Build the loop domain `{[i,j]: 0<=i\")\n", " arg_specs = arg_spec.split(\",\")\n", " # ..." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def loopy_einsum(queue, spec, *args):\n", " arg_spec, out_spec = spec.split(\"->\")\n", " arg_specs = arg_spec.split(\",\")\n", "\n", " out_indices = set(out_spec)\n", "\n", " all_indices = set(\n", " idx\n", " for argsp in arg_specs\n", " for idx in argsp) | out_indices\n", "\n", " sum_indices = all_indices - out_indices\n", "\n", " from pymbolic import var\n", " lhs = var(\"out\")[tuple(var(i) for i in out_spec)]\n", "\n", " rhs = 1\n", " for i, argsp in enumerate(arg_specs):\n", " rhs = rhs * var(\"arg%d\" % i)[tuple(var(i) for i in argsp)]\n", "\n", " if sum_indices:\n", " rhs = lp.Reduction(\"sum\", tuple(var(idx) for idx in sum_indices), rhs)\n", "\n", " constraints = \" and \".join(\n", " \"0 <= %s < N%s\" % (idx, idx)\n", " for idx in all_indices\n", " )\n", "\n", " domain = \"{[%s]: %s}\" % (\",\".join(all_indices), constraints)\n", " knl = lp.make_kernel(domain, [lp.ExpressionInstruction(lhs, rhs)])\n", "\n", " knl = lp.set_options(knl, write_cl=True)\n", "\n", " kwargs = {}\n", " for i, arg in enumerate(args):\n", " kwargs[\"arg%d\" % i] = arg\n", "\n", " evt, (out,) = knl(queue, **kwargs)\n", " return out" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, let us test our implementation, using a simple matrix-matrix multiplication:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "#if __OPENCL_C_VERSION__ < 120\n", "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n", "#endif\n", "#define lid(N) ((int) get_local_id(N))\n", "#define gid(N) ((int) get_group_id(N))\n", "\n", "__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(int const Ni, int const Nj, int const Nk, __global double const *restrict arg0, __global double const *restrict arg1, __global double *restrict out)\n", "{\n", " double acc_k;\n", "\n", " for (int j = 0; j <= -1 + Nj; ++j)\n", " if (\n", " -1 + Nk >= 0\n", " && -1 + Ni >= 0\n", " )\n", " for (int i = 0; i <= -1 + Ni; ++i)\n", " {\n", " acc_k = 0.0;\n", " for (int k = 0; k <= -1 + Nk; ++k)\n", " acc_k = acc_k + arg0[Nk * i + k] * arg1[Nj * k + j];\n", " out[Nj * i + j] = acc_k;\n", " }\n", "}\n", "1.361706615e-12\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/andreas/src/loopy/loopy/compiled.py:841: LoopyWarning: kernel scheduling was ambiguous--more than one schedule found, ignoring\n", " kernel = get_one_scheduled_kernel(kernel)\n", "/home/andreas/src/loopy/loopy/diagnostic.py:60: LoopyAdvisory: No device parameter was passed to the PyOpenCLTarget. Perhaps you want to pass a device to benefit from additional checking. (add 'no_device_in_pre_codegen_checks' to silenced_warnings kernel argument to disable)\n", " warn(text, type)\n" ] } ], "source": [ "cl_context = cl.create_some_context()\n", "queue = cl.CommandQueue(cl_context)\n", "\n", "a = cl.clrandom.rand(queue, (300, 300), dtype=np.float64)\n", "b = cl.clrandom.rand(queue, (300, 300), dtype=np.float64)\n", "\n", "ab = loopy_einsum(queue, \"ik,kj->ij\", a, b)\n", "\n", "diff = a.get().dot(b.get()) - ab.get()\n", "\n", "print(la.norm(diff, 2))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.1+" } }, "nbformat": 4, "nbformat_minor": 0 }