# coding: utf-8 # # Loopy: Dealing with Intermediate Results # ## Setup code # In[1]: import numpy as np import pyopencl as cl import pyopencl.array import pyopencl.clmath import pyopencl.clrandom import loopy as lp # In[2]: ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) # In[3]: grid = np.linspace(0, 2*np.pi, 2048, endpoint=False) h = grid[1] - grid[0] u = cl.clmath.sin(cl.array.to_device(queue, grid)) # ## Two kernels: Finite Differences and a Flux # We will implement computing the ODE right-hand side for Burgers' equation: # $$ # \frac{\partial u}{\partial t} + \frac{\partial }{\partial x} \left( \frac{u^2}2 \right) = 0, # $$ # # Now, it is likely that the code fore the derivative is initially separate from the code for the flux function $f(u):=u^2/2$. # # For simplicity, we will use central finite differences to build a kernel `fin_diff_knl` to take the derivative: # In[4]: fin_diff_knl = lp.make_kernel( "{[i]: 1<=i<=n}", "out[i] = -(f[i+1] - f[i-1])/(2*h)", [lp.GlobalArg("out", shape="2+n"), ...]) print(fin_diff_knl) # ------------ # Next, make the flux kernel as `flux_knl`. # * Use `j` as the loop variable. # * Make sure to declare `f` and `u` to be the right size. # In[5]: flux_knl = lp.make_kernel( "{[j]: 1<=j<=n}", "f[j] = u[j]**2/2", [ lp.GlobalArg("f", shape="2+n"), lp.GlobalArg("u", shape="2+n"), ]) print(flux_knl) # ## Fuse the Kernels # # Next, fuse the two kernels together as `fused_knl`, using `lp.fuse_kernels([knl_a, knl_b])`: # In[6]: fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl]) print(fused_knl) # Let's take a look at the generated code: # In[7]: fused_knl = lp.set_options(fused_knl, write_cl=True) evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1)) # Next, eliminate the separate loop to compute `f`: # In[8]: fused_knl = lp.assignment_to_subst(fused_knl, "f") print(fused_knl) # Again, let's take a look at the generated code: # In[9]: fused_knl = lp.set_options(fused_knl, write_cl=True) evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1)) # ## Transform the kernel for execution # # For easier transformation, renumber the loop to start at 0, using `affine_map_inames(kernel, old_inames, new_inames, equations)`: # In[10]: fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i") print(fused0_knl) # Now, map the kernel to OpenCL axes: # In[11]: gpu_knl = lp.split_iname(fused0_knl, "inew", 128, outer_tag="g.0", inner_tag="l.0") print(gpu_knl) # Finally, precompute the fluxes locally in each workgroup: # In[12]: precomp_knl = lp.precompute(gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True) precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"}) precomp_knl = lp.set_options(precomp_knl, return_dict=True) evt, _ = precomp_knl(queue, u=u, h=h) # ## Run the PDE # In[13]: import matplotlib.pyplot as pt # Forward Euler on a hyperbolic problem: terrible idea. Oh well. dt = 0.2*h u = cl.clmath.sin(cl.array.to_device(queue, grid)) for i in range(1800): _, result_dict = precomp_knl(queue, u=u, h=h) out = result_dict["out"] out[0] = out[-1] = 0 u = u + dt*out if i % 300 == 0: pt.plot(u.get()) # In[ ]: