{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# PyOpenCL Parallel Patterns: Map/Elementwise"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Setup code"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import pyopencl as cl\n",
        "import pyopencl.array\n",
        "import pyopencl.clrandom\n",
        "import numpy as np"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "ctx = cl.create_some_context()\n",
        "queue = cl.CommandQueue(ctx)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "n = 10**7\n",
        "a = cl.clrandom.rand(queue, n, np.float32)\n",
        "b = cl.clrandom.rand(queue, n, np.float32)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## A simple 'target application'"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "We would like to evaluate this linear combination:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "c1 = 5*a + 6*b"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "A problem with this is that every single operator (all three of them--and easily more for complicated expressions) corresponds to a kernel call, which can lead to high overhead. Let's try and avoid that by stuffing the entire operation into one kernel, in turn saving lots of memory traffic:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "from pyopencl.elementwise import ElementwiseKernel"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "lin_comb = ElementwiseKernel(ctx,\n",
        "\n",
        "        \"float a, float *x, float b, float *y, float *c\",\n",
        "\n",
        "        \"c[i] = a*x[i] + b*y[i]\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "<pyopencl.cffi_cl.Event at 0x7f6f3bd72a20>"
            ]
          },
          "execution_count": 7,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "c2 = cl.array.empty_like(a)\n",
        "lin_comb(5, a, 6, b, c2)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 8,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "0.0\n"
          ]
        }
      ],
      "source": [
        "import numpy.linalg as la\n",
        "print(la.norm(c1.get() - c2.get()))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Timing ElementwiseKernel\n",
        "\n",
        "Did this optimization pay off?"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "elapsed: 5.4626686573028564 s\n"
          ]
        }
      ],
      "source": [
        "from time import time\n",
        "queue.finish()\n",
        "start_time = time()\n",
        "\n",
        "for i in range(10):\n",
        "    c1 = 5*a + 6*b\n",
        "    \n",
        "queue.finish()\n",
        "print(\"elapsed: {0} s\".format(time()-start_time))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "elapsed: 2.354213237762451 s\n"
          ]
        }
      ],
      "source": [
        "from time import time\n",
        "queue.finish()\n",
        "start_time = time()\n",
        "\n",
        "for i in range(10):\n",
        "    lin_comb(5, a, 6, b, c2)\n",
        "    \n",
        "queue.finish()\n",
        "print(\"elapsed: {0} s\".format(time()-start_time))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.5.0+"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}