Userspace program:

#!/usr/bin/python

import pyopencl as cl
import numpy as np

# Host-side driver: uploads an array to an OpenCL device, runs the `brm`
# kernel over it (one work-item per element) and prints the array before
# and after.

# create context and a command queue on it
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# create some input data, in this case an array filled with zeros
a = np.zeros(20, dtype=np.uint64)

# create opencl buffer initialised from the host array.
# The kernel reads AND writes this buffer, so it must be READ_WRITE:
# writing to a READ_ONLY buffer from inside a kernel is undefined behaviour.
buf = cl.Buffer(
    ctx,
    cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
    hostbuf=a,
)

# load kernel source (the context manager closes the file even on error)
with open("kernel.c", "r") as f:
    SRC = f.read()

# compile it
prg = cl.Program(ctx, SRC).build()

print("Input:")
print(a)

# launch the kernel: global size is the array shape, local size is left
# to the implementation (None)
event = prg.brm(queue, a.shape, None, buf)
event.wait()

# copy data back from opencl into the host array
cl.enqueue_copy(queue, a, buf)

# print it
print("CL returned:")
print(a)

kernel.c:

// brm: each work-item adds the square of its own global index (dimension 0)
// to the buffer element at that index, in place.
kernel void brm(global ulong *buf) {

    // index of this work-item, which is also the element it owns
    const size_t gid = get_global_id(0);

    // read the element, add index squared, and store it back
    buf[gid] += gid * gid;
}