
# coding: utf-8

# # Practice: Matrix Products

# Do the following:
#     
# * Implement a matrix-matrix product $A\overline{B}^T$ in loopy. Let $A$ be real-valued and $B$ be complex-valued. The overline symbolizes complex conjugation.

# ## Setup code

# In[17]:

import numpy as np
import numpy.linalg as la
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import loopy as lp


# In[18]:

# Create an OpenCL context and a command queue on it. The random-number
# generation and every kernel launch below are submitted through `queue`.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)


# In[19]:

# Problem size: A and B are both created with shape (n, n).
n = 1024
# A is the real-valued operand (float64 random numbers).
A = cl.clrandom.rand(queue, (n, n), dtype=np.float64)
# B is the complex-valued operand: two separately generated float64 random
# arrays are combined as real part + 1j * imaginary part.
B = (
    cl.clrandom.rand(queue, (n, n), dtype=np.float64)
    +
    1j * cl.clrandom.rand(queue, (n, n), dtype=np.float64))


# ## Implementing the Kernel

# Implement the basic kernel here:

# In[20]:

# Loop domain: i, j and k each satisfy 0 <= . < n.
# Instruction: C[i,j] = sum over k of A[i,k] * conj(B[j,k]), i.e. A times the
# conjugate transpose of B. Indexing B as [j, k] (rather than [k, j]) is what
# realizes the transpose.
knl = lp.make_kernel(
    "{[i,j,k]: 0<=i,j,k<n}",
    "C[i,j] = sum(k, A[i, k]*conj(B[j, k]))"
    )


# Here we execute the kernel, making sure we get to see the generated code:

# In[21]:

# Request that the generated OpenCL code be written out (with highlighting).
# The result is bound to a separate name, `_knl`; later cells keep applying
# their transformations to `knl`.
_knl = lp.set_options(knl, write_cl=True, highlight_cl=True)
# Launching the kernel returns an event and a tuple of outputs; C is the only
# output here.
evt, (C,) = _knl(queue, A=A, B=B)


# Next, we test that we got the right result, using `numpy`:

# In[24]:

# Reference result computed with numpy from the `.get()` copies of A and B:
# A times the conjugate transpose of B.
C_ref = A.get() @ B.get().T.conj()
# Relative error of the loopy result with respect to the reference. This is a
# bare expression: its value is shown as the cell output in a notebook, but it
# is neither printed nor checked when the file is run as a plain script.
la.norm(C.get()-C_ref) / la.norm(C_ref)


# ## Playing with the loop ordering
# 
# Check the [loopy documentation](https://documen.tician.de/loopy) to see how to use the `set_loop_priority` transform to prescribe a loop ordering.
# 
# Try a few different variants, time their execution. Make sure to exclude the first run, because the time for that will include code generation and compilation.
# 
# You may use the Python function `time.time` to get the wall clock time in seconds since Jan 1, 1970.
# 
# Also make sure to call `queue.finish()` before you start and stop the clock.

# In[57]:


# Prescribe a loop ordering for the inames i and j ("i,j"); k is not mentioned
# in the priority string. Swap the names to try the other ordering.
tknl = lp.set_loop_priority(knl, "i,j")


def do_timing(timed_knl, nruns=2):
    """Print the average time taken by one launch of *timed_knl*.

    The kernel is launched on the module-level ``queue`` with the
    module-level arrays ``A`` and ``B`` as its arguments.

    :arg timed_knl: the loopy kernel to time.
    :arg nruns: number of timed launches to average over; must be at least 1.
    :raises ValueError: if *nruns* is less than 1.
    """
    # perf_counter is monotonic and high-resolution, so the measured interval
    # is not disturbed by adjustments of the system clock.
    from time import perf_counter

    if nruns < 1:
        raise ValueError("nruns must be at least 1")

    timed_knl = lp.set_options(timed_knl, write_cl=True, highlight_cl=True)

    # Warm-up launch: the first run includes code generation and compilation
    # and must not be part of the measurement.
    timed_knl(queue, A=A, B=B)

    # Drain the queue so that no pending work leaks into the timed region.
    queue.finish()
    start = perf_counter()

    for _ in range(nruns):
        timed_knl(queue, A=A, B=B)

    # Wait for every timed launch to complete before stopping the clock.
    queue.finish()
    timing = (perf_counter() - start) / nruns

    print(timing, "s per run")
    
# Time the kernel variant currently bound to `tknl`.
do_timing(tknl)


# ## Parallelization: Single-element work groups
# 
# Next, parallelize the operation using a 2D grid. Use the `tag_inames` transformation.
# 
# Experiment with the ordering.

# In[58]:

# Tag i with group axis 0 ("g.0") and j with group axis 1 ("g.1"), giving the
# 2D grid asked for above. No local ("l.*") tag is applied, and k is left
# untagged. Swap the two axes to experiment with the ordering.
tknl = lp.tag_inames(knl, "i:g.0,j:g.1")

# Time this variant.
do_timing(tknl)


# ## Parallelization: Multi-element work groups
# 
# Next, use more than one element per workgroup. Use the `split_iname` transformation.
# 
# Experiment with group sizes and axis order.

# In[60]:

# Split i and j with a split length of 16 each. The outer parts are tagged
# with the group axes g.0/g.1 and the inner parts with the local axes l.0/l.1,
# so each work group covers more than one (i, j) element. The second call
# builds on the result of the first (`tknl`), not on `knl`.
tknl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
tknl = lp.split_iname(tknl, "j", 16, outer_tag="g.1", inner_tag="l.1")

# Time this variant.
do_timing(tknl)


# ## Where to go from here
# Things to try:
#     
# * Loop Unrolling (the `unr` iname tag)
# * Instruction level parallelism (the `ilp` iname tag)
# * Prefetching (`add_prefetch`)
# * Run this on an actual GPU
# * Measure GFLOPS and GBytes/s

# In[ ]:



