rusticl: implement cl_khr_suggested_local_work_size
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28020>
This commit is contained in:
@@ -962,7 +962,7 @@ Rusticl extensions that are not part of any OpenCL version:
|
||||
cl_khr_subgroup_rotate not started
|
||||
cl_khr_subgroup_shuffle DONE (iris, llvmpipe, radeonsi)
|
||||
cl_khr_subgroup_shuffle_relative DONE (iris, llvmpipe, radeonsi)
|
||||
cl_khr_suggested_local_work_size not started
|
||||
cl_khr_suggested_local_work_size DONE
|
||||
cl_khr_terminate_context not started
|
||||
cl_khr_throttle_hints not started
|
||||
cl_khr_work_group_uniform_arithmetic not started
|
||||
|
@@ -510,6 +510,11 @@ extern "C" fn cl_get_extension_function_address(
|
||||
"clGetGLObjectInfo" => cl_get_gl_object_info as *mut ::std::ffi::c_void,
|
||||
"clGetGLTextureInfo" => cl_get_gl_texture_info as *mut ::std::ffi::c_void,
|
||||
|
||||
// cl_khr_suggested_local_work_size
|
||||
"clGetKernelSuggestedLocalWorkSizeKHR" => {
|
||||
cl_get_kernel_suggested_local_work_size_khr as *mut ::std::ffi::c_void
|
||||
}
|
||||
|
||||
// cl_arm_shared_virtual_memory
|
||||
"clEnqueueSVMFreeARM" => cl_enqueue_svm_free_arm as *mut ::std::ffi::c_void,
|
||||
"clEnqueueSVMMapARM" => cl_enqueue_svm_map_arm as *mut ::std::ffi::c_void,
|
||||
|
@@ -236,6 +236,17 @@ unsafe fn kernel_work_arr_or_default<'a>(arr: *const usize, work_dim: cl_uint) -
|
||||
}
|
||||
}
|
||||
|
||||
/// # Safety
|
||||
///
|
||||
/// This function is only safe when called on an array of `work_dim` length
|
||||
unsafe fn kernel_work_arr_mut<'a>(arr: *mut usize, work_dim: cl_uint) -> Option<&'a mut [usize]> {
|
||||
if !arr.is_null() {
|
||||
unsafe { Some(slice::from_raw_parts_mut(arr, work_dim as usize)) }
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[cl_entrypoint]
|
||||
fn create_kernel(
|
||||
program: cl_program,
|
||||
@@ -649,3 +660,109 @@ fn clone_kernel(source_kernel: cl_kernel) -> CLResult<cl_kernel> {
|
||||
let k = Kernel::ref_from_raw(source_kernel)?;
|
||||
Ok(Arc::new(k.clone()).into_cl())
|
||||
}
|
||||
|
||||
#[cl_entrypoint]
|
||||
fn get_kernel_suggested_local_work_size_khr(
|
||||
command_queue: cl_command_queue,
|
||||
kernel: cl_kernel,
|
||||
work_dim: cl_uint,
|
||||
global_work_offset: *const usize,
|
||||
global_work_size: *const usize,
|
||||
suggested_local_work_size: *mut usize,
|
||||
) -> CLResult<()> {
|
||||
// CL_INVALID_GLOBAL_WORK_SIZE if global_work_size is NULL or if any of the values specified in
|
||||
// global_work_size are 0.
|
||||
if global_work_size.is_null() {
|
||||
return Err(CL_INVALID_GLOBAL_WORK_SIZE);
|
||||
}
|
||||
|
||||
if global_work_offset.is_null() {
|
||||
return Err(CL_INVALID_GLOBAL_OFFSET);
|
||||
}
|
||||
|
||||
// CL_INVALID_VALUE if suggested_local_work_size is NULL.
|
||||
if suggested_local_work_size.is_null() {
|
||||
return Err(CL_INVALID_VALUE);
|
||||
}
|
||||
|
||||
// CL_INVALID_COMMAND_QUEUE if command_queue is not a valid host command-queue.
|
||||
let queue = Queue::ref_from_raw(command_queue)?;
|
||||
|
||||
// CL_INVALID_KERNEL if kernel is not a valid kernel object.
|
||||
let kernel = Kernel::ref_from_raw(kernel)?;
|
||||
|
||||
// CL_INVALID_CONTEXT if the context associated with kernel is not the same as the context
|
||||
// associated with command_queue.
|
||||
if queue.context != kernel.prog.context {
|
||||
return Err(CL_INVALID_CONTEXT);
|
||||
}
|
||||
|
||||
// CL_INVALID_PROGRAM_EXECUTABLE if there is no successfully built program executable available
|
||||
// for kernel for the device associated with command_queue.
|
||||
if kernel.prog.status(queue.device) != CL_BUILD_SUCCESS as cl_build_status {
|
||||
return Err(CL_INVALID_PROGRAM_EXECUTABLE);
|
||||
}
|
||||
|
||||
// CL_INVALID_KERNEL_ARGS if all argument values for kernel have not been set.
|
||||
if kernel.arg_values().iter().any(|v| v.is_none()) {
|
||||
return Err(CL_INVALID_KERNEL_ARGS);
|
||||
}
|
||||
|
||||
// CL_INVALID_WORK_DIMENSION if work_dim is not a valid value (i.e. a value between 1 and
|
||||
// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS).
|
||||
if work_dim == 0 || work_dim > queue.device.max_grid_dimensions() {
|
||||
return Err(CL_INVALID_WORK_DIMENSION);
|
||||
}
|
||||
|
||||
let mut global_work_size =
|
||||
unsafe { kernel_work_arr_or_default(global_work_size, work_dim).to_vec() };
|
||||
|
||||
let suggested_local_work_size = unsafe {
|
||||
kernel_work_arr_mut(suggested_local_work_size, work_dim).ok_or(CL_INVALID_VALUE)?
|
||||
};
|
||||
|
||||
let global_work_offset = unsafe { kernel_work_arr_or_default(global_work_offset, work_dim) };
|
||||
|
||||
let device_bits = queue.device.address_bits();
|
||||
let device_max = u64::MAX >> (u64::BITS - device_bits);
|
||||
for i in 0..work_dim as usize {
|
||||
let gws = global_work_size[i];
|
||||
let gwo = global_work_offset[i];
|
||||
|
||||
// CL_INVALID_GLOBAL_WORK_SIZE if global_work_size is NULL or if any of the values specified
|
||||
// in global_work_size are 0.
|
||||
if gws == 0 {
|
||||
return Err(CL_INVALID_GLOBAL_WORK_SIZE);
|
||||
}
|
||||
// CL_INVALID_GLOBAL_WORK_SIZE if any of the values specified in global_work_size exceed the
|
||||
// maximum value representable by size_t on the device associated with command_queue.
|
||||
if gws as u64 > device_max {
|
||||
return Err(CL_INVALID_GLOBAL_WORK_SIZE);
|
||||
}
|
||||
// CL_INVALID_GLOBAL_OFFSET if the value specified in global_work_size plus the
|
||||
// corresponding value in global_work_offset for dimension exceeds the maximum value
|
||||
// representable by size_t on the device associated with command_queue.
|
||||
if u64::checked_add(gws as u64, gwo as u64)
|
||||
.filter(|&x| x <= device_max)
|
||||
.is_none()
|
||||
{
|
||||
return Err(CL_INVALID_GLOBAL_OFFSET);
|
||||
}
|
||||
}
|
||||
|
||||
kernel.suggest_local_size(
|
||||
queue.device,
|
||||
work_dim as usize,
|
||||
&mut global_work_size,
|
||||
suggested_local_work_size,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
|
||||
// CL_MISALIGNED_SUB_BUFFER_OFFSET if a sub-buffer object is set as an argument to kernel and the offset specified when the sub-buffer object was created is not aligned to CL_DEVICE_MEM_BASE_ADDR_ALIGN for the device associated with command_queue.
|
||||
// CL_INVALID_IMAGE_SIZE if an image object is set as an argument to kernel and the image dimensions are not supported by device associated with command_queue.
|
||||
// CL_IMAGE_FORMAT_NOT_SUPPORTED if an image object is set as an argument to kernel and the image format is not supported by the device associated with command_queue.
|
||||
// CL_INVALID_OPERATION if an SVM pointer is set as an argument to kernel and the device associated with command_queue does not support SVM or the required SVM capabilities for the SVM pointer.
|
||||
// CL_OUT_OF_RESOURCES if there is a failure to allocate resources required by the OpenCL implementation on the device.
|
||||
// CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host.
|
||||
}
|
||||
|
@@ -831,21 +831,19 @@ impl Kernel {
|
||||
})
|
||||
}
|
||||
|
||||
fn optimize_local_size(&self, d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
|
||||
let mut threads = self.max_threads_per_block(d) as u32;
|
||||
pub fn suggest_local_size(
|
||||
&self,
|
||||
d: &Device,
|
||||
work_dim: usize,
|
||||
grid: &mut [usize],
|
||||
block: &mut [usize],
|
||||
) {
|
||||
let mut threads = self.max_threads_per_block(d);
|
||||
let dim_threads = d.max_block_sizes();
|
||||
let subgroups = self.preferred_simd_size(d) as u32;
|
||||
let subgroups = self.preferred_simd_size(d);
|
||||
|
||||
if !block.contains(&0) {
|
||||
for i in 0..3 {
|
||||
// we already made sure everything is fine
|
||||
grid[i] /= block[i];
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
for i in 0..3 {
|
||||
let t = cmp::min(threads, dim_threads[i] as u32);
|
||||
for i in 0..work_dim {
|
||||
let t = cmp::min(threads, dim_threads[i]);
|
||||
let gcd = gcd(t, grid[i]);
|
||||
|
||||
block[i] = gcd;
|
||||
@@ -856,9 +854,9 @@ impl Kernel {
|
||||
}
|
||||
|
||||
// if we didn't fill the subgroup we can do a bit better if we have threads remaining
|
||||
let total_threads = block[0] * block[1] * block[2];
|
||||
let total_threads = block.iter().take(work_dim).product::<usize>();
|
||||
if threads != 1 && total_threads < subgroups {
|
||||
for i in 0..3 {
|
||||
for i in 0..work_dim {
|
||||
if grid[i] * total_threads < threads {
|
||||
block[i] *= grid[i];
|
||||
grid[i] = 1;
|
||||
@@ -869,6 +867,31 @@ impl Kernel {
|
||||
}
|
||||
}
|
||||
|
||||
/// Fixed three-dimensional `u32` front end for `suggest_local_size`.
///
/// When no entry of `block` is zero, `grid` is divided element-wise by `block` and nothing
/// else happens. Otherwise both arrays are widened to `usize`, handed to
/// `suggest_local_size` with a work dimension of 3, and the results are narrowed back
/// into `grid` and `block`.
fn optimize_local_size(&self, d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
    if block.iter().all(|&b| b != 0) {
        // we already made sure everything is fine
        for (g, &b) in grid.iter_mut().zip(block.iter()) {
            *g /= b;
        }
        return;
    }

    // `suggest_local_size` operates on `usize` slices, so convert on the way in ...
    let mut wide_grid = (*grid).map(|v| v as usize);
    let mut wide_block = (*block).map(|v| v as usize);

    self.suggest_local_size(d, 3, &mut wide_grid, &mut wide_block);

    // ... and convert back on the way out.
    for (dst, &src) in grid.iter_mut().zip(wide_grid.iter()) {
        *dst = src as u32;
    }
    for (dst, &src) in block.iter_mut().zip(wide_block.iter()) {
        *dst = src as u32;
    }
}
|
||||
|
||||
// the painful part is, that host threads are allowed to modify the kernel object once it was
|
||||
// enqueued, so return a closure with all req data included.
|
||||
pub fn launch(
|
||||
|
@@ -53,6 +53,7 @@ gen_cl_exts!([
|
||||
(1, 0, 0, "cl_khr_icd"),
|
||||
(1, 0, 0, "cl_khr_il_program"),
|
||||
(1, 0, 0, "cl_khr_spirv_no_integer_wrap_decoration"),
|
||||
(1, 0, 0, "cl_khr_suggested_local_work_size"),
|
||||
]);
|
||||
|
||||
static mut PLATFORM: Platform = Platform {
|
||||
|
Reference in New Issue
Block a user