# coding: utf-8 # # A mini-`einsum` using loopy # # In this problem, we will design a function that carries out an `einsum`-type operation using `loopy`. It should be usable as shown in the tests towards the end of the worksheet. Also try to perform a simple parallelization so that your code will run on a GPU. # In[1]: import numpy as np import numpy.linalg as la import pyopencl as cl import pyopencl.array import pyopencl.clmath import pyopencl.clrandom import loopy as lp # Some hints: # # * `loopy.Reduction("sum", ("i", "j", "k"), expr)` expresses a sum. # * Build the loop domain `{[i,j]: 0<=i") arg_specs = arg_spec.split(",") # ... # In[3]: def loopy_einsum(queue, spec, *args): arg_spec, out_spec = spec.split("->") arg_specs = arg_spec.split(",") out_indices = set(out_spec) all_indices = set( idx for argsp in arg_specs for idx in argsp) | out_indices sum_indices = all_indices - out_indices from pymbolic import var lhs = var("out")[tuple(var(i) for i in out_spec)] rhs = 1 for i, argsp in enumerate(arg_specs): rhs = rhs * var("arg%d" % i)[tuple(var(i) for i in argsp)] if sum_indices: rhs = lp.Reduction("sum", tuple(var(idx) for idx in sum_indices), rhs) constraints = " and ".join( "0 <= %s < N%s" % (idx, idx) for idx in all_indices ) domain = "{[%s]: %s}" % (",".join(all_indices), constraints) knl = lp.make_kernel(domain, [lp.ExpressionInstruction(lhs, rhs)]) knl = lp.set_options(knl, write_cl=True) kwargs = {} for i, arg in enumerate(args): kwargs["arg%d" % i] = arg evt, (out,) = knl(queue, **kwargs) return out # Now, let us test our implementation, using a simple matrix-matrix multiplication: # In[4]: cl_context = cl.create_some_context() queue = cl.CommandQueue(cl_context) a = cl.clrandom.rand(queue, (300, 300), dtype=np.float64) b = cl.clrandom.rand(queue, (300, 300), dtype=np.float64) ab = loopy_einsum(queue, "ik,kj->ij", a, b) diff = a.get().dot(b.get()) - ab.get() print(la.norm(diff, 2)) # In[ ]: