
# coding: utf-8

# # PyOpenCL Parallel Patterns: Scan/Prefix Sum
# 
# ## Setup Code

# In[1]:

import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import numpy as np
import numpy.linalg as la


# In[2]:

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)


# In[3]:

n = 10**7
x = cl.clrandom.rand(queue, n, np.float64)


# ## Setting up the kernel: Compute the prefix sum of squares
# 
# Want to compute the prefix sum of the squares of all entries in `x`.
# 
# First, using `numpy`, as `result1`:

# In[4]:

result1 = np.cumsum(x.get()**2)


# Then, using PyOpenCL:

# In[5]:

from pyopencl.scan import GenericScanKernel


# Syntax:
#     
# GSK(context, dtype, arguments, input_expr, scan_expr using `a` and `b`, neutral, output_statement with `item`)

# In[7]:

sknl = GenericScanKernel(ctx, np.float64,
    arguments="double *y, double *x",
    input_expr="x[i]*x[i]",
    scan_expr="a+b", neutral="0",
    output_statement="y[i] = item")


# In[8]:

result2 = cl.array.empty_like(x)
sknl(result2, x)


# ## Testing the outcome

# In[9]:

print(la.norm(result2.get() - result1))


# More features:
# 
# * Segmented Scan
# * Output stencils
# * Works on structured types
