# coding: utf-8

# # PyOpenCL Parallel Patterns: Map/Elementwise

# ## Setup code

# In[1]:

import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import numpy as np

# In[2]:

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# In[3]:

# Two arrays of n random float32 values, generated through pyopencl.clrandom.
n = 10**7
a = cl.clrandom.rand(queue, n, np.float32)
b = cl.clrandom.rand(queue, n, np.float32)

# ## A simple 'target application'

# We would like to evaluate this linear combination:

# In[4]:

c1 = 5*a + 6*b

# A problem with this is that every single operator (all three of them--and
# easily more for complicated expressions) corresponds to a kernel call, which
# can lead to high overhead. Let's try to avoid that by stuffing the entire
# operation into one kernel, in turn saving lots of memory traffic:

# In[5]:

from pyopencl.elementwise import ElementwiseKernel

# In[6]:

# Arguments: two scalar coefficients (a, b), two input vectors (x, y) and the
# output vector c. The operation string is applied once per index i.
lin_comb = ElementwiseKernel(
    ctx,
    "float a, float *x, float b, float *y, float *c",
    "c[i] = a*x[i] + b*y[i]",
)

# In[7]:

c2 = cl.array.empty_like(a)
lin_comb(5, a, 6, b, c2)

# In[8]:

import numpy.linalg as la

# Norm of the difference between the operator-based result and the
# single-kernel result, computed on host copies of both arrays.
print(la.norm(c1.get() - c2.get()))

# ## Timing ElementwiseKernel
#
# Did this optimization pay off?

# In[9]:

from time import perf_counter
from time import time  # noqa: F401  (unused below; kept in case later cells still call it)


def _report_elapsed(work, repetitions=10):
    """Call *work* ``repetitions`` times and print the total elapsed seconds.

    ``queue.finish()`` is called before the clock starts, so that work
    enqueued earlier is not billed to this measurement, and again before the
    clock is read, so that everything *work* enqueued is included.

    ``perf_counter`` is used instead of ``time`` because it is monotonic and
    is the clock the standard library recommends for measuring short
    durations.
    """
    queue.finish()
    start_time = perf_counter()

    for _ in range(repetitions):
        work()

    queue.finish()
    print("elapsed: {0} s".format(perf_counter() - start_time))


# Operator-based version: each arithmetic operator is a separate array
# operation producing its own result array.
_report_elapsed(lambda: 5*a + 6*b)

# In[10]:

# Single-kernel version writing into the preallocated output array c2.
_report_elapsed(lambda: lin_comb(5, a, 6, b, c2))

# In[ ]: