
# coding: utf-8

# # Symbolic Image Filtering

# In[1]:

import pymbolic.primitives as p

# Symbolic placeholder for the input image. It is wrapped in the filter
# expression below and later indexed by the index mapper; the kernel is
# invoked with an argument named "u".
u_var = p.Variable("u")


# We want to define an image filter.
# 
# ----
# 
# To that end, define a new kind of formula node: a neighbor average.

# In[2]:

class NeighborAverage(p.Expression):
    """Expression node standing for "the neighbor average of ``child``".

    The node only records *which* expression is to be averaged; what the
    average looks like in terms of array accesses is decided by whichever
    mapper implements ``map_neighbor_average``.
    """

    # Name of the mapper method that handles nodes of this type.
    mapper_method = "map_neighbor_average"

    def __init__(self, child):
        # The expression whose neighbor average is taken.
        self.child = child

    def __getinitargs__(self):
        # Constructor arguments as a one-element tuple.
        return (self.child,)
        
# The filter to work with: the neighbor average of the image variable.
img_filter = NeighborAverage(u_var)
# Alternative filter to experiment with (swap it in for the line above):
#img_filter = u_var + u_var - NeighborAverage(u_var)

# Bare expression: as the last statement of a notebook cell it displays the
# filter expression; in a plain script it has no effect.
img_filter


# Let's define some indexing variables:

# In[3]:

from pymbolic.mapper import IdentityMapper

# Symbolic loop indices; the kernel's loop domain below uses the same names.
i, j = p.Variable("i"), p.Variable("j")

# Indices shifted by one. With 0 <= i, j < n these run over 1..n, so the
# stencil offsets of +/-1 stay within an array of shape (n+2, n+2).
ii, jj = i + 1, j + 1


# In[4]:

class IndexMapper(IdentityMapper):
    """Rewrite a filter expression in terms of indexed array accesses.

    * A plain variable becomes its ``[ii, jj]`` entry.
    * A ``NeighborAverage`` node becomes a weighted five-point average
      around ``[ii, jj]``: weight 2 for the center entry, weight 1 for each
      of the four entries offset by one along a single axis, divided by 6.
    """

    def map_neighbor_average(self, expr):
        # The child is subscripted directly rather than being passed through
        # the mapper first (assumes it supports indexing, e.g. a Variable).
        img = expr.child

        center = img[ii, jj]
        plus_i = img[ii+1, jj]
        minus_i = img[ii-1, jj]
        plus_j = img[ii, jj+1]
        minus_j = img[ii, jj-1]

        return (2*center + plus_i + minus_i + plus_j + minus_j)/6

    def map_variable(self, expr):
        return expr[ii, jj]


# Now apply this to our filter:

# In[5]:

# Instantiate the mapper once; it is reused when the kernel is built below.
idx_mapper = IndexMapper()
# Show the filter after it has been rewritten into indexed accesses.
print(idx_mapper(img_filter))


# ----
# 
# Now let's generate some code for this, using `loopy`:

# In[6]:

import loopy as lp
# Symbolic name of the output array; it is the assignment target of the
# kernel instruction and matches the "result" kernel argument.
result_var = p.Variable("result")


# Observe the two parts of the `loopy` kernel description:
# 
# * Polyhedral loop domain
# * Instructions `[lp.ExpressionInstruction()]`

# In[7]:

knl = lp.make_kernel(
    # Loop domain: all (i, j) with 0 <= i, j < n.
    "{[i,j]: 0<=i,j<n}",
    # One instruction: result[ii, jj] = <indexed filter expression>.
    [
        lp.ExpressionInstruction(
            result_var[ii, jj],
            idx_mapper(img_filter),
        ),
    ],
    # Only "result" is declared explicitly; the trailing Ellipsis leaves the
    # remaining arguments undeclared here.
    [
        lp.GlobalArg("result", shape="n+2, n+2"),
        ...,
    ],
)


# Kernels can always be inspected--simply use `print`:

# In[8]:

# Print the kernel object for inspection.
print(knl)


# ----
# 
# Let's move towards running this code. To do so, we need `pyopencl`:

# In[9]:

import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom

# OpenCL context and command queue used for every kernel launch and array
# transfer below.
# NOTE(review): interactive=True presumably prompts for the device to use --
# confirm this is wanted when running non-interactively.
ctx = cl.create_some_context(interactive=True)
queue = cl.CommandQueue(ctx)


# And some data to work with:

# In[10]:

# Size parameter passed to the kernel; the kernel declares "result" with
# shape (n+2, n+2).
# NOTE(review): presumably cat.jpeg is 202x202 pixels so that it matches
# n = 200 -- confirm against the image file.
n = 200

# NOTE(review): scipy.misc.imread is not available in recent SciPy releases;
# confirm the SciPy version this is meant to run with.
import scipy.misc
# Sum over axis 2 and divide by 3*256, giving a 2D float32 array.
# Assumes axis 2 holds three 8-bit color channels -- TODO confirm.
u = scipy.misc.imread("cat.jpeg").astype(np.float32).sum(axis=2)/(3*256)
# Rebind u to a device-side copy of the image.
u = cl.array.to_device(queue, u)


# In[11]:

import matplotlib.pyplot as pt
# Fetch the image back from the device and display it in grayscale.
pt.imshow(u.get(), cmap="gray")


# Now run the code, and tell loopy to print what it generates:

# In[12]:

# Have loopy print the OpenCL code it generates when the kernel is run.
knl = lp.set_options(knl, write_cl=True)

# Output array like u, filled with zeros.
result = cl.array.zeros_like(u)
# Run the filter; the return value is not needed here.
_ = knl(queue, u=u, result=result, n=n)
# Rebind u to the filtered image, so re-running this cell filters it again.
u = result


# In[13]:

# Display the filtered image with the gray scale pinned to [0, 1].
pt.imshow(u.get(), cmap="gray", vmin=0, vmax=1)


# That's obviously not very parallel. Introduce parallelism:

# In[14]:

# Tag both loops as "g.*" axes (group axes in loopy's tag notation) so the
# iterations are spread across the device instead of run as sequential loops.
tknl = lp.tag_inames(knl, {"i": "g.0", "j": "g.1"})

# Invoking a kernel returns an event and a tuple of outputs.
evt, (result,) = tknl(queue, u=u)


# But OpenCL/CUDA require blocking to be efficient!

# In[15]:

# Split each loop into chunks of 16: the outer part gets a "g.*" tag and the
# inner part an "l.*" tag, i.e. 16x16 blocks.
sknl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
sknl = lp.split_iname(sknl, "j", 16, outer_tag="g.0", inner_tag="l.0")

evt, (result,) = sknl(queue, u=u)


# How about some data reuse?

# In[16]:

# Same 16x16 blocking as in the previous cell ...
sknl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
sknl = lp.split_iname(sknl, "j", 16, outer_tag="g.0", inner_tag="l.0")

# ... plus a prefetch of "u" over the two inner loops, so that entries shared
# between neighboring stencil evaluations can be reused.
sknl = lp.add_prefetch(
    sknl,
    "u",
    ["i_inner", "j_inner"],
    fetch_bounding_box=True,
)

evt, (result,) = sknl(queue, u=u, n=n)


# In[61]:

# Variant named for CPU use: "i" is split by 16 with only its outer part
# tagged ("g.0"); "j" is split by 8 with only its inner part tagged ("l.0").
cpuknl = lp.split_iname(knl, "i", 16, outer_tag="g.0")
cpuknl = lp.split_iname(
    cpuknl, "j", 8,
    inner_tag="l.0",
    slabs=(0, 1),
)
# Order of the two remaining untagged loops.
cpuknl = lp.set_loop_priority(cpuknl, "i_inner, j_outer")
evt, (result,) = cpuknl(queue, u=u, n=n)


# ## Judgement day: Has it helped?

# Let's time the execution of our "blur" operation on an actually big image. (We'll make do with random data for this timing.)

# In[79]:

from time import time

# Number of timed repetitions; the reported times are averages over these.
rounds = 4

# Benchmark problem size. The array has shape (n+2, n+2), matching the shape
# the kernel declares for "result".
n = 5000
u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32)

# Host copy of the same data, used by the NumPy reference timing.
uh = u.get()


# First, `numpy`:

# In[80]:

# Reference timing: the same five-point weighted average as the generated
# kernel, written with NumPy slices on the host copy `uh` and repeated
# `rounds` times.
t_start = time()
# `_` rather than `i`, so that the symbolic index variable `i` defined
# earlier in the file is not overwritten by an integer.
for _ in range(rounds):
    blurred_u = np.zeros_like(uh)
    # Only interior entries are written; the one-entry border stays zero.
    blurred_u[1:-1, 1:-1] = (
        2*uh[1:-1, 1:-1]    # center, weight 2
        + uh[2:, 1:-1]      # row + 1
        + uh[:-2, 1:-1]     # row - 1
        + uh[1:-1, 2:]      # column + 1
        + uh[1:-1, :-2]     # column - 1 (was a second copy of column + 1)
        )/6

# Average wall-clock seconds per round.
print((time()-t_start)/rounds)


# Next, our generated code:

# In[81]:

# Time the generated kernel. The queue is drained before starting the clock
# and again before stopping it, so the measurement covers exactly the
# launches made inside the loop.
queue.finish()
t_start = time()
# `_` rather than `i`, so that the symbolic index variable `i` defined
# earlier in the file is not overwritten by an integer.
for _ in range(rounds):
    evt, (result,) = cpuknl(queue, u=u)

queue.finish()
# Average wall-clock seconds per kernel launch.
codegen_time = (time()-t_start)/rounds
print(codegen_time)


# Now estimate performance:

# In[83]:

# Rough memory-traffic estimate: six array accesses per output entry (the
# five entries of the stencil plus the one written), n**2 output entries.
pixels_accessed = 6 * n**2
# Four bytes per entry (the benchmark data is float32).
bytes_accessed = 4 * pixels_accessed
# Achieved throughput in GB/s, based on the measured time per launch.
gbytes_per_second = bytes_accessed / codegen_time / 1e9
print(gbytes_per_second)


# In[ ]:



