# coding: utf-8

# # Practice: Orthogonality of Hermite Polynomials
# 
# In this exercise, modify the Monte-Carlo example to demonstrate the orthonormality of the two [Hermite polynomials](https://en.wikipedia.org/wiki/Hermite_polynomials)
# 
# * $1$ and
# * $x^2-1$
# 
# with respect to the weight $e^{-\frac{x^2}2}$, i.e. show (numerically, using a Monte Carlo method) that
# 
# $$
# \int_{-\infty}^\infty 1 \cdot (x^2-1) \cdot e^{-\frac{x^2}2}dx = 0
# $$
# 
# and that
# 
# $$
# \int_{-\infty}^\infty (x^2-1)^2  \cdot e^{-\frac{x^2}2}dx = 2\sqrt{2\pi}.
# $$
# 
# Realize that
# $$
# \int_{-\infty}^\infty \dots  \cdot \frac{e^{-\frac{x^2}2}}{\sqrt{2\pi}}dx
# $$
# can be evaluated by Monte-Carlo summation of $\dots$ where the $x$ are normally distributed.
# 
# Use the [Box-Muller transform](https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform) to obtain normally-distributed random numbers from the uniformly distributed ones returned by PyOpenCL's random number generator.
# 
# 

# ### Initialization

# In[3]:

import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)


# ### Boilerplate for Random Number Generator

# In[4]:

generator_preamble = """
#include <pyopencl-random123/philox.cl>

typedef union {
    uint4 v;
    philox4x32_ctr_t c;
} philox4x32_ctr_vec_union;


uint4 philox4x32_bump(uint4 ctr)
{
    if (++ctr.x == 0)
        if (++ctr.y == 0)
            ++ctr.z;
    return ctr;
}

uint4 philox4x32_gen(
        uint4 ctr,
        uint2 key,
        uint4 *new_ctr)
{
    philox4x32_ctr_vec_union result;
    result.c = philox4x32(
        *(philox4x32_ctr_t *) &ctr,
        *(philox4x32_key_t *) &key);
    *new_ctr = philox4x32_bump(ctr);
    return result.v;
}

float4 philox4x32_f32(
        uint4 ctr,
        uint2 key,
        uint4 *new_ctr)
{
    *new_ctr = ctr;
    return
        convert_float4(philox4x32_gen(*new_ctr, key, new_ctr))
        * 2.3283064365386963e-10f;
}
"""


# ### Monte-Carlo code

# In[36]:


from mako.template import Template

mc_preamble_src = Template("""

#include <pyopencl-complex.h>

float compute_sample(int i, unsigned int k1)
{
    uint4 ctr = { 0, 1, 2, 3 };
    uint2 key2 = { i, k1 };
    float4 rng_res = philox4x32_f32(ctr, key2, &(ctr));
    
    float r0 = sqrt(-2*log(rng_res.s0));
    float v0 = r0*cos((float) (2*M_PI) * rng_res.s1);
    float v1 = r0*sin((float) (2*M_PI) * rng_res.s1);

    float r2 = sqrt(-2*log(rng_res.s2));
    float v2 = r2*cos((float) (2*M_PI) * rng_res.s3);
    float v3 = r2*sin((float) (2*M_PI) * rng_res.s3);
    
    float result = 0;
    
    %for x in ["v0", "v1", "v2", "v3"]:
    {
        float x = ${x};
        float H2 = x*x - 1;
        result += H2;
    }
    %endfor
    
    return result;
}
""", strict_undefined=True).render()


# In[37]:


from pyopencl.reduction import ReductionKernel

rknl = ReductionKernel(ctx, np.float32,
        neutral="0",
        reduce_expr="a+b", map_expr="compute_sample(i, k1)",
        arguments="unsigned int k1", preamble=generator_preamble+mc_preamble_src)


# In[38]:

n = 10000000

nsamples = 4*n
result = rknl(15, range=slice(n), queue=queue).get() / nsamples

print(result)


# In[ ]: