Monte Carlo Method

As a simple example of a Monte Carlo method, we will approximate the value of $\pi$:

In [1]:
import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import loopy as lp
In [2]:
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

Sample generation

In [3]:
knl = lp.make_kernel(
        "{ [i, j]: 0<=i<n and 0<=j < 2}",
        """
        <> key2 = make_uint2(i, k1)
        <> ctr = make_uint4(0, 1, 2, 3)
        <> rng_res, <> dummy = philox4x32_f32(ctr, key2)
        
        samples[i,0] = rng_res.s0 + 1j*rng_res.s1
        samples[i,1] = rng_res.s2 + 1j*rng_res.s3

        accepted[i,j] = real(samples[i,j] * conj(samples[i,j])) < 1
        """)                                                                     

knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
knl = lp.set_options(knl, return_dict=True)
In [4]:
knl = lp.set_options(knl, write_cl=True)
In [5]:
evt, result = knl(queue, n=100000, k1=np.int32(99123))

samples = result["samples"].reshape(-1)
accepted = result["accepted"].reshape(-1)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
#include <pyopencl-complex.h>
#include <pyopencl-random123/philox.cl>



typedef union {
    uint4 v;
    philox4x32_ctr_t c;
} philox4x32_ctr_vec_union;


uint4 philox4x32_bump(uint4 ctr)
{
    if (++ctr.x == 0)
        if (++ctr.y == 0)
            ++ctr.z;
    return ctr;
}

uint4 philox4x32_gen(
        uint4 ctr,
        uint2 key,
        uint4 *new_ctr)
{
    philox4x32_ctr_vec_union result;
    result.c = philox4x32(
        *(philox4x32_ctr_t *) &ctr,
        *(philox4x32_key_t *) &key);
    *new_ctr = philox4x32_bump(ctr);
    return result.v;
}

float4 philox4x32_f32(
        uint4 ctr,
        uint2 key,
        uint4 *new_ctr)
{
    *new_ctr = ctr;
    return
        convert_float4(philox4x32_gen(*new_ctr, key, new_ctr))
        * 2.3283064365386963e-10f;
}

double4 philox4x32_f64(
        uint4 ctr,
        uint2 key,
        uint4 *new_ctr)
{
    *new_ctr = ctr;
        return
            convert_double4(philox4x32_gen(*new_ctr, key, new_ctr))
            * 2.3283064365386963e-10
            +
            convert_double4(philox4x32_gen(*new_ctr, key, new_ctr))
            * 5.421010862427522e-20;

}

__kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global int *restrict accepted, int const k1, int const n, __global cfloat_t *restrict samples)
{
  uint4 ctr;
  uint4 dummy;
  uint2 key2;
  float4 rng_res;

  ctr = (uint4) (0, 1, 2, 3);
  if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0)
  {
    key2 = (uint2) (lid(0) + gid(0) * 128, k1);
    rng_res = philox4x32_f32(ctr, key2, &(dummy));
    samples[2 * (lid(0) + gid(0) * 128) + 1] = cfloat_radd(rng_res.s2, cfloat_rmul(rng_res.s3, cfloat_new(0.0, 1.0)));
    samples[2 * (lid(0) + gid(0) * 128)] = cfloat_radd(rng_res.s0, cfloat_rmul(rng_res.s1, cfloat_new(0.0, 1.0)));
    for (int j = 0; j <= 1; ++j)
      accepted[2 * (lid(0) + gid(0) * 128) + j] = cfloat_real(cfloat_mul(samples[2 * (lid(0) + gid(0) * 128) + j], cfloat_conj(samples[2 * (lid(0) + gid(0) * 128) + j]))) < 1;
  }
}
In [6]:
import matplotlib.pyplot as pt

samples_host = samples.get()
accepted_host = accepted.get() != 0

pt.gca().set_aspect("equal")
pt.plot(
    samples_host[accepted_host].real,
    samples_host[accepted_host].imag, "o")
Out[6]:
[<matplotlib.lines.Line2D at 0x7fed129c74a8>]

Lastly, we compute the ratio of accepted to the total number of samples to get our approximate value of $\pi$:

In [7]:
4 * cl.array.sum(accepted).get() / len(samples)
Out[7]:
3.1429200000000002

We (roughly) expect convergence as $1/\sqrt N$, so this gives an idea of the relative error to expect:

In [8]:
1/np.sqrt(len(samples))
Out[8]:
0.0022360679774997899

Direct Reduction

In [9]:
knl = lp.make_kernel(
        "{ [l, g, j, isamp]: 0<=l<nl and 0<=g,j<ng and 0<=isamp< 2}",
        """
        <> key2 = make_uint2(l + nl*g, 1234)
        <> ctr = make_uint4(0, 1, 2, 3)
        <> rng_res, <> dummy = philox4x32_f32(ctr, key2)
        
        <> samples[0] = rng_res.s0 + 1j*rng_res.s1     {id=samp1}
           samples[1] = rng_res.s2 + 1j*rng_res.s3     {id=samp2}

        <> accepted[isamp] = real(samples[isamp] * conj(samples[isamp])) < 1 \
                                                        {dep=samp1:samp2} 
        
        <> tmp[g] = sum(l, accepted[0] + accepted[1])           
                                                                                      
        result = sum(j, tmp[j])    
        """)                                                                     

size = 1000000
ng = 50

knl = lp.fix_parameters(knl, ng=ng)                                               

knl = lp.set_options(knl, write_cl=True, highlight_cl=True)                                          

ref_knl = knl                                                                     

knl = lp.split_iname(knl, "l", 128, inner_tag="l.0")
knl = lp.split_reduction_outward(knl, "l_inner")
knl = lp.tag_inames(knl, "g:g.0,j:l.0")

evt, (result,) = knl(queue, nl=size) 

nsamples = size*2*ng
print(4*result.get()/nsamples, nsamples/1e9, "billion samples")
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
#define int_floor_div_pos_b(a,b) (                 ( (a) - ( ((a)<0) ? ((b)-1) : 0 )  ) / (b)                 )
#include <pyopencl-complex.h>
#include <pyopencl-random123/philox.cl>



typedef union {
    uint4 v;
    philox4x32_ctr_t c;
} philox4x32_ctr_vec_union;


uint4 philox4x32_bump(uint4 ctr)
{
    if (++ctr.x == 0)
        if (++ctr.y == 0)
            ++ctr.z;
    return ctr;
}

uint4 philox4x32_gen(
        uint4 ctr,
        uint2 key,
        uint4 *new_ctr)
{
    philox4x32_ctr_vec_union result;
    result.c = philox4x32(
        *(philox4x32_ctr_t *) &ctr,
        *(philox4x32_key_t *) &key);
    *new_ctr = philox4x32_bump(ctr);
    return result.v;
}

float4 philox4x32_f32(
        uint4 ctr,
        uint2 key,
        uint4 *new_ctr)
{
    *new_ctr = ctr;
    return
        convert_float4(philox4x32_gen(*new_ctr, key, new_ctr))
        * 2.3283064365386963e-10f;
}

double4 philox4x32_f64(
        uint4 ctr,
        uint2 key,
        uint4 *new_ctr)
{
    *new_ctr = ctr;
        return
            convert_double4(philox4x32_gen(*new_ctr, key, new_ctr))
            * 2.3283064365386963e-10
            +
            convert_double4(philox4x32_gen(*new_ctr, key, new_ctr))
            * 5.421010862427522e-20;

}

__kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(int const nl, __global int *restrict result, __global int *restrict tmp)
{
  __local int acc_j[50];
  __local int acc_l_inner[128];
  int acc_l_outer;
  int accepted[2];
  uint4 ctr;
  uint4 dummy;
  uint2 key2;
  float4 rng_res;
  cfloat_t samples[2];

  if (-1 + nl >= 0)
  {
    if (49 + -1 * lid(0) >= 0)
      acc_j[lid(0)] = 0;
    if (-1 + -1 * lid(0) + nl >= 0)
      acc_l_outer = 0;
    acc_l_inner[lid(0)] = 0;
    ctr = (uint4) (0, 1, 2, 3);
    if (-1 + -1 * lid(0) + nl >= 0)
    {
      for (int l_outer = 0; l_outer <= -1 + int_floor_div_pos_b(127 + nl, 128); ++l_outer)
        if (-1 + -128 * l_outer + -1 * lid(0) + nl >= 0)
        {
          key2 = (uint2) (nl * gid(0) + lid(0) + l_outer * 128, 1234);
          rng_res = philox4x32_f32(ctr, key2, &(dummy));
          samples[1] = cfloat_radd(rng_res.s2, cfloat_rmul(rng_res.s3, cfloat_new(0.0, 1.0)));
          samples[0] = cfloat_radd(rng_res.s0, cfloat_rmul(rng_res.s1, cfloat_new(0.0, 1.0)));
          for (int isamp = 0; isamp <= 1; ++isamp)
            accepted[isamp] = cfloat_real(cfloat_mul(samples[isamp], cfloat_conj(samples[isamp]))) < 1;
          acc_l_outer = acc_l_outer + accepted[0] + accepted[1];
        }
      acc_l_inner[lid(0)] = acc_l_outer;
    }
    barrier(CLK_LOCAL_MEM_FENCE) /* for acc_l_inner (red_l_inner_stage_0 depends on insn_3_l_inner_init) */;
    if (63 + -1 * lid(0) >= 0)
      acc_l_inner[lid(0)] = acc_l_inner[lid(0)] + acc_l_inner[lid(0) + 64];
    barrier(CLK_LOCAL_MEM_FENCE) /* for acc_l_inner (red_l_inner_stage_1 depends on red_l_inner_stage_0) */;
    if (31 + -1 * lid(0) >= 0)
      acc_l_inner[lid(0)] = acc_l_inner[lid(0)] + acc_l_inner[lid(0) + 32];
    barrier(CLK_LOCAL_MEM_FENCE) /* for acc_l_inner (red_l_inner_stage_2 depends on red_l_inner_stage_1) */;
    if (15 + -1 * lid(0) >= 0)
      acc_l_inner[lid(0)] = acc_l_inner[lid(0)] + acc_l_inner[lid(0) + 16];
    barrier(CLK_LOCAL_MEM_FENCE) /* for acc_l_inner (red_l_inner_stage_3 depends on red_l_inner_stage_2) */;
    if (7 + -1 * lid(0) >= 0)
      acc_l_inner[lid(0)] = acc_l_inner[lid(0)] + acc_l_inner[lid(0) + 8];
    barrier(CLK_LOCAL_MEM_FENCE) /* for acc_l_inner (red_l_inner_stage_4 depends on red_l_inner_stage_3) */;
    if (3 + -1 * lid(0) >= 0)
      acc_l_inner[lid(0)] = acc_l_inner[lid(0)] + acc_l_inner[lid(0) + 4];
    barrier(CLK_LOCAL_MEM_FENCE) /* for acc_l_inner (red_l_inner_stage_5 depends on red_l_inner_stage_4) */;
    if (1 + -1 * lid(0) >= 0)
      acc_l_inner[lid(0)] = acc_l_inner[lid(0)] + acc_l_inner[lid(0) + 2];
    barrier(CLK_LOCAL_MEM_FENCE) /* for acc_l_inner (red_l_inner_stage_6 depends on red_l_inner_stage_5) */;
    if (lid(0) == 0)
    {
      acc_l_inner[lid(0)] = acc_l_inner[lid(0)] + acc_l_inner[lid(0) + 1];
      tmp[gid(0)] = acc_l_inner[0];
    }
  }
}

__kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel_0(int const nl, __global int *restrict result, __global int *restrict tmp)
{
  __local int acc_j[50];
  __local int acc_l_inner[128];
  int acc_l_outer;
  int accepted[2];
  uint4 ctr;
  uint4 dummy;
  uint2 key2;
  float4 rng_res;
  cfloat_t samples[2];

  if (-1 + nl >= 0 && 49 + -1 * lid(0) >= 0)
    acc_j[lid(0)] = tmp[lid(0)];
  barrier(CLK_LOCAL_MEM_FENCE) /* for acc_j (red_j_stage_0 depends on insn_4_j_transfer) */;
  if (17 + -1 * lid(0) >= 0)
    acc_j[lid(0)] = acc_j[lid(0)] + acc_j[lid(0) + 32];
  barrier(CLK_LOCAL_MEM_FENCE) /* for acc_j (red_j_stage_1 depends on red_j_stage_0) */;
  if (15 + -1 * lid(0) >= 0)
    acc_j[lid(0)] = acc_j[lid(0)] + acc_j[lid(0) + 16];
  barrier(CLK_LOCAL_MEM_FENCE) /* for acc_j (red_j_stage_2 depends on red_j_stage_1) */;
  if (7 + -1 * lid(0) >= 0)
    acc_j[lid(0)] = acc_j[lid(0)] + acc_j[lid(0) + 8];
  barrier(CLK_LOCAL_MEM_FENCE) /* for acc_j (red_j_stage_3 depends on red_j_stage_2) */;
  if (3 + -1 * lid(0) >= 0)
    acc_j[lid(0)] = acc_j[lid(0)] + acc_j[lid(0) + 4];
  barrier(CLK_LOCAL_MEM_FENCE) /* for acc_j (red_j_stage_4 depends on red_j_stage_3) */;
  if (1 + -1 * lid(0) >= 0)
    acc_j[lid(0)] = acc_j[lid(0)] + acc_j[lid(0) + 2];
  barrier(CLK_LOCAL_MEM_FENCE) /* for acc_j (red_j_stage_5 depends on red_j_stage_4) */;
  if (lid(0) == 0)
  {
    acc_j[lid(0)] = acc_j[lid(0)] + acc_j[lid(0) + 1];
    *result = acc_j[0];
  }
}

3.14172144 0.1 billion samples
In [ ]: