{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Loopy: Counting Operations\n", "\n", "## Setup code" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import numpy as np\n", "import pyopencl as cl\n", "import pyopencl.array\n", "import pyopencl.clrandom\n", "import loopy as lp" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "ctx = cl.create_some_context()\n", "queue = cl.CommandQueue(ctx)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/lib/python3/dist-packages/IPython/core/completer.py:737: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead\n", " args,_,_1,defaults = inspect.getargspec(call_obj)\n" ] } ], "source": [ "n = 1024\n", "a = cl.clrandom.rand(queue, (n, n), dtype=np.float32)\n", "b = cl.clrandom.rand(queue, (n, n), dtype=np.float32)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Operation-counting matrix multiplication" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here is the simple matrix-matrix multiplication kernel again:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "knl = lp.make_kernel(\n", " \"{[i,j,k]: 0<=i,j,k { n^3 : n >= 1 }\"),\n", " (dtype('float32'), 'mul'): PwQPolynomial(\"[n] -> { n^3 : n >= 1 }\")}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lp.get_op_poly(knl)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The return type is easy to evaluate for a given set of parameters--just use the `.eval_with_dict` method:" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "3375" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "poly = lp.get_op_poly(knl)[np.dtype(np.float32), \"add\"]\n", "poly.eval_with_dict({\"n\": 15})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Counting memory access" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{(dtype('float32'),\n", " 'uniform',\n", " 'load'): PwQPolynomial(\"[n] -> { 2 * n^3 : n >= 1 }\"),\n", " (dtype('float32'),\n", " 'uniform',\n", " 'store'): PwQPolynomial(\"[n] -> { n^2 : n >= 1 }\")}" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lp.get_gmem_access_poly(knl)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Operation-counting a transformed kernel" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "#define int_floor_div_pos_b(a,b) ( ( (a) - ( ((a)<0) ? ((b)-1) : 0 ) ) / (b) )\n", "#define lid(N) ((int) get_local_id(N))\n", "#define gid(N) ((int) get_group_id(N))\n", "\n", "__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)\n", "{\n", " float acc_k_outer_k_inner;\n", "\n", " acc_k_outer_k_inner = 0.0f;\n", " for (int k_outer = 0; k_outer <= int_floor_div_pos_b(-16 + n, 16); ++k_outer)\n", " for (int k_inner = 0; k_inner <= 15; ++k_inner)\n", " acc_k_outer_k_inner = acc_k_outer_k_inner + a[n * (lid(1) + gid(0) * 16) + k_inner + k_outer * 16] * b[n * (k_inner + k_outer * 16) + lid(0) + gid(1) * 16];\n", " c[n * (lid(1) + gid(0) * 16) + lid(0) + gid(1) * 16] = acc_k_outer_k_inner;\n", "}\n" ] } ], "source": [ "opt_knl = knl\n", "opt_knl = lp.assume(opt_knl, \"n mod 16 = 0\")\n", "opt_knl = lp.split_iname(opt_knl, \"i\", 16, outer_tag=\"g.0\", inner_tag=\"l.1\")\n", "opt_knl = lp.split_iname(opt_knl, \"j\", 16, outer_tag=\"g.1\", inner_tag=\"l.0\")\n", "opt_knl = lp.split_iname(opt_knl, \"k\", 16)\n", "#opt_knl = lp.add_prefetch(opt_knl, \"a\", \"i_inner,k_inner\")\n", "#opt_knl = lp.add_prefetch(opt_knl, \"b\", \"j_inner,k_inner\")\n", "\n", "opt_knl = lp.set_options(opt_knl, write_cl=True)\n", "_ = opt_knl(queue, a=a, b=b)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now count the memory accesses in the transformed version:" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{(dtype('float32'),\n", " 'consecutive',\n", " 'load'): PwQPolynomial(\"[n] -> { (((512 - 192 * n + 24 * n^2 - n^3) * floor((n)/16)^3 + (1536 - 384 * n + 24 * n^2) * floor((n)/16)^4 + (1536 - 192 * n) * floor((n)/16)^5 + 512 * floor((n)/16)^6) + ((1536 - 192 * n - 24 * n^2 + 3 * n^3) * floor((n)/16)^2 + (3072 - 48 * n^2) * floor((n)/16)^3 + (1536 + 192 * n) * floor((n)/16)^4) * floor((15 + n)/16) + ((1536 + 192 * n - 24 * n^2 - 3 * n^3) * floor((n)/16) + 768 * n * floor((n)/16)^2 + (-3072 + 384 * n) * floor((n)/16)^3 - 1536 * floor((n)/16)^4) * floor((15 + n)/16)^2 + ((512 + 192 * n + 24 * n^2 + n^3) + (-3072 + 48 * n^2) * floor((n)/16) + (-3072 - 384 * n) * floor((n)/16)^2) * floor((15 + n)/16)^3 + ((-1536 - 384 * n - 24 * n^2) + (1536 - 192 * n) * floor((n)/16) + 1536 * floor((n)/16)^2) * floor((15 + n)/16)^4 + (1536 + 192 * n) * floor((15 + n)/16)^5 - 512 * floor((15 + n)/16)^6) : n >= 1 }\"),\n", " (dtype('float32'),\n", " 'consecutive',\n", " 'store'): PwQPolynomial(\"[n] -> { (((8 * n - n^2) * floor((n)/16) + 8 * n * floor((n)/16)^2) + (8 * n + n^2) * floor((15 + n)/16) + -8 * n * floor((15 + n)/16)^2) : n >= 1 }\"),\n", " (dtype('float32'),\n", " 'nonconsecutive',\n", " 'load'): PwQPolynomial(\"[n] -> { (((512 - 192 * n + 24 * n^2 - n^3) * floor((n)/16)^3 + (1536 - 384 * n + 24 * n^2) * floor((n)/16)^4 + (1536 - 192 * n) * floor((n)/16)^5 + 512 * floor((n)/16)^6) + ((1536 - 192 * n - 24 * n^2 + 3 * n^3) * floor((n)/16)^2 + (3072 - 48 * n^2) * floor((n)/16)^3 + (1536 + 192 * n) * floor((n)/16)^4) * floor((15 + n)/16) + ((1536 + 192 * n - 24 * n^2 - 3 * n^3) * floor((n)/16) + 768 * n * floor((n)/16)^2 + (-3072 + 384 * n) * floor((n)/16)^3 - 1536 * floor((n)/16)^4) * floor((15 + n)/16)^2 + ((512 + 192 * n + 24 * n^2 + n^3) + (-3072 + 48 * n^2) * floor((n)/16) + (-3072 - 384 * n) * floor((n)/16)^2) * floor((15 + n)/16)^3 + ((-1536 - 384 * n - 24 * n^2) + (1536 - 192 * n) * floor((n)/16) + 1536 * floor((n)/16)^2) * floor((15 + n)/16)^4 + (1536 + 192 * n) * floor((15 + n)/16)^5 - 512 * floor((15 + n)/16)^6) : n >= 1 }\")}" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lp.get_gmem_access_poly(opt_knl)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now enable the prefetch transformation above." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.1+" } }, "nbformat": 4, "nbformat_minor": 0 }