# coding: utf-8

# # PyOpenCL Parallel Patterns: Reduction

# ## Setup Code

# In[1]:

import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import numpy as np


# In[2]:

# Obtain an OpenCL context and a command queue attached to it; the queue
# is handed to every PyOpenCL call below that does work on the device.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)


# In[3]:

# `x` holds `n` random double-precision values generated by PyOpenCL.
n = 10 ** 7
x = cl.clrandom.rand(queue, shape=n, dtype=np.float64)


# ## Setting up the kernel: Computing a sum of squares
# 
# We want to compute the sum of the squares of all entries in `x`.
# 
# First, using `numpy`, as `result1` (watch out: `.get()` is needed to copy the data from the device to the host first)

# In[4]:

# Reference value on the host: fetch `x` as a numpy array, square every
# entry and add them all up.
result1 = np.square(x.get()).sum()


# Then, using PyOpenCL:

# In[5]:

from pyopencl.reduction import ReductionKernel


# Syntax:
# 
# ReductionKernel(context, dtype, neutral, reduce_expr, map_expr, arguments)

# In[6]:

# Sum-of-squares reduction: every entry is mapped to x[i]*x[i], the mapped
# values are combined with "a+b", and "0" serves as the neutral element.
rknl = ReductionKernel(
    ctx,
    np.float64,
    arguments="double *x",
    map_expr="x[i]*x[i]",
    reduce_expr="a+b",
    neutral="0",
)


# ## Testing the result

# In[7]:

# Apply the reduction kernel to the array `x`.
result2 = rknl(x)


# In[8]:

# A bare expression is only echoed by a notebook/REPL; print it so the
# value is also shown when this file is run as a plain script.
print(type(result2))


# In[9]:

# Same as above: print the shape instead of silently discarding it.
print(result2.shape)


# Now check the result:

# In[10]:

# Difference between the PyOpenCL result (fetched with `.get()`) and the
# numpy reference value `result1`.
print(result2.get()-result1)


# * Change this to find maximum.
# * Works on structured types, too.
# * What if you wanted to find maximum *and* location?

# In[ ]: