# coding: utf-8

# # Loopy: Dealing with Intermediate Results

# ## Setup code

# In[1]:

import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clmath
import pyopencl.clrandom
import loopy as lp


# In[2]:

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)


# In[3]:

grid = np.linspace(0, 2*np.pi, 2048, endpoint=False)
h = grid[1] - grid[0]
u = cl.clmath.sin(cl.array.to_device(queue, grid))


# ## Two kernels: Finite Differences and a Flux

# We will implement computing the ODE right-hand side for Burgers' equation:
# $$
# \frac{\partial u}{\partial t} + \frac{\partial }{\partial x} \left( \frac{u^2}2 \right) = 0,
# $$
# 
# Now, it is likely that the code fore the derivative is initially separate from the code for the flux function $f(u):=u^2/2$.
# 
# For simplicity, we will use central finite differences to build a kernel `fin_diff_knl` to take the derivative:

# In[4]:

fin_diff_knl = lp.make_kernel(
    "{[i]: 1<=i<=n}",
    "out[i] = -(f[i+1] - f[i-1])/(2*h)",
    [lp.GlobalArg("out", shape="2+n"), ...])
print(fin_diff_knl)


# ------------
# Next, make the flux kernel as `flux_knl`.
# * Use `j` as the loop variable.
# * Make sure to declare `f` and `u` to be the right size.

# In[5]:

flux_knl = lp.make_kernel(
    "{[j]: 1<=j<=n}",
    "f[j] = u[j]**2/2",
    [
    lp.GlobalArg("f", shape="2+n"),
    lp.GlobalArg("u", shape="2+n"),
    ])
print(flux_knl)


# ## Fuse the Kernels
# 
# Next, fuse the two kernels together as `fused_knl`, using `lp.fuse_kernels([knl_a, knl_b])`:

# In[6]:

fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl])

print(fused_knl)


# Let's take a look at the generated code:

# In[7]:

fused_knl = lp.set_options(fused_knl, write_cl=True)
evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))


# Next, eliminate the separate loop to compute `f`:

# In[8]:

fused_knl = lp.assignment_to_subst(fused_knl, "f")
print(fused_knl)


# Again, let's take a look at the generated code:

# In[9]:

fused_knl = lp.set_options(fused_knl, write_cl=True)
evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))


# ## Transform the kernel for execution
# 
# For easier transformation, renumber the loop to start at 0, using `affine_map_inames(kernel, old_inames, new_inames, equations)`:

# In[10]:

fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i")

print(fused0_knl)


# Now, map the kernel to OpenCL axes:

# In[11]:

gpu_knl = lp.split_iname(fused0_knl, "inew", 128, outer_tag="g.0", inner_tag="l.0")
print(gpu_knl)


# Finally, precompute the fluxes locally in each workgroup:

# In[12]:

precomp_knl = lp.precompute(gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True)
precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"})
precomp_knl = lp.set_options(precomp_knl, return_dict=True)
evt, _ = precomp_knl(queue, u=u, h=h)


# ## Run the PDE

# In[13]:

import matplotlib.pyplot as pt

# Forward Euler on a hyperbolic problem: terrible idea. Oh well.

dt = 0.2*h
u = cl.clmath.sin(cl.array.to_device(queue, grid))
for i in range(1800):
    _, result_dict = precomp_knl(queue, u=u, h=h)
    out = result_dict["out"]
    out[0] = out[-1] = 0
    u = u + dt*out
    
    if i % 300 == 0:
        pt.plot(u.get())


# In[ ]: