# Element-wise vector addition on the GPU with a minimal OpenCL kernel,
# verified against pyopencl.array arithmetic on the same device data.
import pyopencl as cl
import pyopencl.array as cl_array
import numpy
import numpy.linalg as la

KERNEL_SRC = """
__kernel void sum(__global const float *a,
__global const float *b, __global float *c)
{
int gid = get_global_id(0);
c[gid] = a[gid] + b[gid];
}
"""

host_a = numpy.random.rand(50000).astype(numpy.float32)
host_b = numpy.random.rand(50000).astype(numpy.float32)

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

dev_a = cl_array.to_device(queue, host_a)
dev_b = cl_array.to_device(queue, host_b)
dev_sum = cl_array.empty_like(dev_a)

program = cl.Program(ctx, KERNEL_SRC).build()
# One work-item per element; let the implementation pick the local size.
program.sum(queue, host_a.shape, None, dev_a.data, dev_b.data, dev_sum.data)

# The residual norm is 0.0 when the kernel and the array expression agree.
print(la.norm((dev_sum - (dev_a + dev_b)).get()))
# -*- coding: utf-8 -*-
# NOTE(review): a coding declaration only takes effect on line 1 or 2 of a
# file; this far down it is inert and kept only as documentation.
# Parallel selection sort: each work-item counts how many elements rank
# before its own value (ties broken by index) and writes its value there.
import pyopencl as cl
import pyopencl.array as cl_array
import numpy
import numpy.linalg as la

a = numpy.random.rand(50000).astype(numpy.float32)

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
a_dev = cl_array.to_device(queue, a)
dest_dev = cl_array.empty_like(a_dev)

prg = cl.Program(ctx, """
__kernel void ParallelSelection(__global const float * in,__global float * out)
{
int i = get_global_id(0);
int n = get_global_size(0);
float a = in[i];
int pos = 0;
for (int j = 0; j < n; j++)
{
float b = in[j];
bool smaller = (b < a) || (b == a && j < i);
pos += (smaller)?1:0;
}
out[pos] = a;
}
""").build()

prg.ParallelSelection(queue, a.shape, None, a_dev.data, dest_dev.data)
# Fixed: was a Python 2 print statement (SyntaxError on Python 3 and
# inconsistent with the print() calls elsewhere in this file).
print(dest_dev.get()[:100])
import pyopencl as cl
import numpy
# Work-group edge length used by Transpose1: each group covers a
# block_size x block_size tile, so both matrix dimensions must be
# divisible by this value (true for all benchmark sizes below).
block_size = 16
class Transpose1:
    """Naive out-of-place matrix transpose.

    One work-item copies one element: reads are contiguous along rows,
    writes stride by ``a_height`` — the slow baseline for the benchmark.
    """

    def __init__(self, ctx):
        # Fixed: the kernel source contains no %(...)s placeholders, so the
        # old `% {"block_size": block_size}` formatting was dead code (and
        # would raise if a literal '%' ever appeared in the source).
        self.kernel = cl.Program(ctx, """
__kernel void transpose(__global float *a_t, __global float *a, unsigned a_width, unsigned a_height)
{
int read_idx = get_global_id(0) + get_global_id(1) * a_width;
int write_idx = get_global_id(1) + get_global_id(0) * a_height;
a_t[write_idx] = a[read_idx];
}
""").build().transpose

    def __call__(self, queue, tgt, src, shape):
        """Enqueue the transpose of `src` (w x h floats) into `tgt`.

        Uses an explicit (block_size, block_size) work-group, so both
        dimensions of `shape` must be multiples of block_size.
        """
        w, h = shape
        return self.kernel(queue, (w, h), (block_size, block_size), tgt, src, numpy.uint32(w), numpy.uint32(h))
class Transpose2(Transpose1):
    """Same naive transpose kernel, but the local work-group size is left
    to the OpenCL implementation (local size ``None``)."""

    def __call__(self, queue, tgt, src, shape):
        width, height = shape
        dims = (numpy.uint32(width), numpy.uint32(height))
        return self.kernel(queue, (width, height), None, tgt, src, *dims)
def benchmark_transpose():
    """Benchmark both transpose variants over a range of square sizes.

    Prints the effective memory bandwidth (one read + one write per
    element) for every (method, size) pair and returns a dict mapping
    each method class to its list of bandwidths in bytes/s.
    """
    ctx = cl.create_some_context()
    # Profiling must be enabled on the queue to read evt.profile timestamps.
    queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    sizes = [4096, 4448, 4864, 5280, 5792, 6304, 6880, 7488]
    mem_bandwidths = {}

    methods = [Transpose2, Transpose1]
    for klasa in methods:
        name = klasa.__name__
        mem_bandwidths[klasa] = meth_mem_bws = []
        for size in sizes:
            source = numpy.random.rand(size, size).astype(numpy.float32)
            mf = cl.mem_flags
            a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = source)
            a_t_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size = source.nbytes)
            method = klasa(ctx)
            events = []
            count = 4
            for i in range(count):  # run 4 times for each size
                events.append(method(queue, a_t_buf, a_buf, source.shape))
            events[-1].wait()
            # Total device time in ns across all runs; each run moves
            # 2 * nbytes (one read and one write per element).
            time = sum(evt.profile.end - evt.profile.start for evt in events)
            mem_bw = 2*source.nbytes * count/(time*1e-9)
            # Fixed: the per-method result list was created but never
            # filled, so mem_bandwidths always held empty lists.
            meth_mem_bws.append(mem_bw)
            print("benchmarking", name, size, mem_bw/1e9, "GB/s")
            a_buf.release()
            a_t_buf.release()
    return mem_bandwidths
# Run the benchmark only when executed as a script, not on import.
if __name__ == "__main__":
    benchmark_transpose()
# Author (Izradio): Krunoslav Đuras