As a simple example of a Monte Carlo method, we will approximate the value of $\pi$:
import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import loopy as lp
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
knl = lp.make_kernel(
"{ [i, j]: 0<=i<n and 0<=j < 2}",
"""
<> key2 = make_uint2(i, k1)
<> ctr = make_uint4(0, 1, 2, 3)
<> rng_res, <> dummy = philox4x32_f32(ctr, key2)
samples[i,0] = rng_res.s0 + 1j*rng_res.s1
samples[i,1] = rng_res.s2 + 1j*rng_res.s3
accepted[i,j] = real(samples[i,j] * conj(samples[i,j])) < 1
""")
knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
knl = lp.set_options(knl, return_dict=True)
knl = lp.set_options(knl, write_cl=True)
evt, result = knl(queue, n=100000, k1=np.int32(99123))
samples = result["samples"].reshape(-1)
accepted = result["accepted"].reshape(-1)
import matplotlib.pyplot as pt
samples_host = samples.get()
accepted_host = accepted.get() != 0
pt.gca().set_aspect("equal")
pt.plot(
samples_host[accepted_host].real,
samples_host[accepted_host].imag, "o")
Lastly, we compute the ratio of accepted to the total number of samples to get our approximate value of $\pi$:
4 * cl.array.sum(accepted).get() / len(samples)
We (roughly) expect convergence as $1/\sqrt N$, so this gives an idea of the relative error to expect:
1/np.sqrt(len(samples))
knl = lp.make_kernel(
"{ [l, g, j, isamp]: 0<=l<nl and 0<=g,j<ng and 0<=isamp< 2}",
"""
<> key2 = make_uint2(l + nl*g, 1234)
<> ctr = make_uint4(0, 1, 2, 3)
<> rng_res, <> dummy = philox4x32_f32(ctr, key2)
<> samples[0] = rng_res.s0 + 1j*rng_res.s1 {id=samp1}
samples[1] = rng_res.s2 + 1j*rng_res.s3 {id=samp2}
<> accepted[isamp] = real(samples[isamp] * conj(samples[isamp])) < 1 \
{dep=samp1:samp2}
<> tmp[g] = sum(l, accepted[0] + accepted[1])
result = sum(j, tmp[j])
""")
size = 1000000
ng = 50
knl = lp.fix_parameters(knl, ng=ng)
knl = lp.set_options(knl, write_cl=True, highlight_cl=True)
ref_knl = knl
knl = lp.split_iname(knl, "l", 128, inner_tag="l.0")
knl = lp.split_reduction_outward(knl, "l_inner")
knl = lp.tag_inames(knl, "g:g.0,j:l.0")
evt, (result,) = knl(queue, nl=size)
nsamples = size*2*ng
print(4*result.get()/nsamples, nsamples/1e9, "billion samples")