
# coding: utf-8

# # Loopy: Counting Operations
# 
# ## Setup code

# In[1]:

import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import loopy as lp


# In[2]:

# Obtain an OpenCL context (which one is left up to pyopencl) and create the
# command queue that the later cells pass to device-side operations.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)


# In[18]:

# Problem size and the two input operands: n-by-n float32 arrays filled by
# pyopencl's clrandom via `queue`. `a` is generated first, then `b`.
n = 1024
a, b = (
    cl.clrandom.rand(queue, (n, n), dtype=np.float32)
    for _operand in range(2)
)


# ## Operation-counting matrix multiplication

# Here is the simple matrix-matrix multiplication kernel again:

# In[7]:

# Build the matrix-matrix multiplication kernel and declare both inputs as
# float32 in one step; add_and_infer_dtypes receives the freshly made kernel.
knl = lp.add_and_infer_dtypes(
    lp.make_kernel(
        "{[i,j,k]: 0<=i,j,k<n}",
        "c[i, j] = sum(k, a[i, k]*b[k, j])",
    ),
    {arg_name: np.float32 for arg_name in ("a", "b")},
)


# ### Counting flops
# 
# Let us determine the number of arithmetic operations being carried out:

# In[10]:

# Bare expression: the operation-count result is computed but not bound to a
# name, so it is only shown when run interactively (e.g. as a notebook cell).
lp.get_op_poly(knl)


# Each polynomial in the returned map is easy to evaluate for a given set of parameters--just select an entry and use its `.eval_with_dict` method:

# In[15]:

# Pick the entry for float32 "add" operations out of the operation counts,
# then evaluate that polynomial for a concrete value of the parameter n.
poly = lp.get_op_poly(knl)[
    (np.dtype(np.float32), "add")  # key as used here: (dtype, operation name)
]
poly.eval_with_dict({"n": 15})


# ### Counting memory access

# In[17]:

# Bare expression: the memory-access count for the untransformed kernel is
# computed but not stored; it is only shown when run interactively.
lp.get_gmem_access_poly(knl)


# ## Operation-counting a transformed kernel

# In[31]:

# Derive the transformed kernel step by step. Each call's result is rebound
# to opt_knl; the name `knl` itself is never reassigned.
opt_knl = lp.assume(knl, "n mod 16 = 0")

# Split i and j by 16 and attach the "g.N" / "l.N" tags to the outer / inner
# parts (see loopy's iname-tag documentation); k is split by 16 with no tags.
opt_knl = lp.split_iname(
    opt_knl, "i", 16,
    outer_tag="g.0", inner_tag="l.1",
)
opt_knl = lp.split_iname(
    opt_knl, "j", 16,
    outer_tag="g.1", inner_tag="l.0",
)
opt_knl = lp.split_iname(opt_knl, "k", 16)

# The prefetch transformations are deliberately left disabled at this point;
# the text further down asks the reader to enable them.
#opt_knl = lp.add_prefetch(opt_knl, "a", "i_inner,k_inner")
#opt_knl = lp.add_prefetch(opt_knl, "b", "j_inner,k_inner")

# NOTE(review): write_cl=True presumably makes loopy output the generated
# OpenCL code when the kernel is invoked -- confirm against loopy's Options.
opt_knl = lp.set_options(opt_knl, write_cl=True)

# Invoke the kernel once on the inputs; the return value is discarded.
_ = opt_knl(queue, a=a, b=b)


# Now count the memory accesses in the transformed version:

# In[32]:

# Bare expression: the memory-access count for the transformed kernel is
# computed but not stored; it is only shown when run interactively.
lp.get_gmem_access_poly(opt_knl)


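# Now enable the prefetch transformations above by uncommenting the two `lp.add_prefetch` lines, then re-run the transformation and counting cells to see how the memory access counts change.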
