# coding: utf-8

# # A mini-`einsum` using loopy
# 
# In this problem, we will design a function that carries out an `einsum`-type operation using `loopy`. It should be usable as shown in the tests towards the end of the worksheet. Also try to perform a simple parallelization so that your code will run on a GPU.

# In[1]:

import numpy as np
import numpy.linalg as la

import pyopencl as cl
import pyopencl.array
import pyopencl.clmath
import pyopencl.clrandom

import loopy as lp


# Some hints:
# 
# * `loopy.Reduction("sum", ("i", "j", "k"), expr)` expresses a sum.
# * Build the loop domain `{[i,j]: 0<=i<Ni and 0<=j<Nj}` as a string and pass it to loopy.
# * To build strings, use
#     * `str.join()`: `",".join(names)` and 
#     * `str.format`: `"Hi {name}".format(name="Andreas")`

# In[2]:

def loopy_einsum(queue, spec, *args):
    arg_spec, out_spec = spec.split("->")
    arg_specs = arg_spec.split(",")
    # ...


# In[3]:

def loopy_einsum(queue, spec, *args):
    arg_spec, out_spec = spec.split("->")
    arg_specs = arg_spec.split(",")

    out_indices = set(out_spec)

    all_indices = set(
        idx
        for argsp in arg_specs
        for idx in argsp) | out_indices

    sum_indices = all_indices - out_indices

    from pymbolic import var
    lhs = var("out")[tuple(var(i) for i in out_spec)]

    rhs = 1
    for i, argsp in enumerate(arg_specs):
        rhs = rhs * var("arg%d" % i)[tuple(var(i) for i in argsp)]

    if sum_indices:
        rhs = lp.Reduction("sum", tuple(var(idx) for idx in sum_indices), rhs)

    constraints = " and ".join(
        "0 <= %s < N%s" % (idx, idx)
        for idx in all_indices
        )

    domain = "{[%s]: %s}" % (",".join(all_indices), constraints)
    knl = lp.make_kernel(domain, [lp.ExpressionInstruction(lhs, rhs)])

    knl = lp.set_options(knl, write_cl=True)

    kwargs = {}
    for i, arg in enumerate(args):
        kwargs["arg%d" % i] = arg

    evt, (out,) = knl(queue, **kwargs)
    return out


# Now, let us test our implementation, using a simple matrix-matrix multiplication:

# In[4]:

cl_context = cl.create_some_context()
queue = cl.CommandQueue(cl_context)

a = cl.clrandom.rand(queue, (300, 300), dtype=np.float64)
b = cl.clrandom.rand(queue, (300, 300), dtype=np.float64)

ab = loopy_einsum(queue, "ik,kj->ij", a, b)

diff =  a.get().dot(b.get()) - ab.get()

print(la.norm(diff, 2))


# In[ ]: