rusticl: implement cl_khr_suggested_local_work_size

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28020>
This commit is contained in:
David Tobolik
2024-02-23 12:41:38 +01:00
committed by Marge Bot
parent de94d98940
commit 376d1e6667
5 changed files with 162 additions and 16 deletions

View File

@@ -962,7 +962,7 @@ Rusticl extensions that are not part of any OpenCL version:
cl_khr_subgroup_rotate not started
cl_khr_subgroup_shuffle DONE (iris, llvmpipe, radeonsi)
cl_khr_subgroup_shuffle_relative DONE (iris, llvmpipe, radeonsi)
cl_khr_suggested_local_work_size not started
cl_khr_suggested_local_work_size DONE
cl_khr_terminate_context not started
cl_khr_throttle_hints not started
cl_khr_work_group_uniform_arithmetic not started

View File

@@ -510,6 +510,11 @@ extern "C" fn cl_get_extension_function_address(
"clGetGLObjectInfo" => cl_get_gl_object_info as *mut ::std::ffi::c_void,
"clGetGLTextureInfo" => cl_get_gl_texture_info as *mut ::std::ffi::c_void,
// cl_khr_suggested_local_work_size
"clGetKernelSuggestedLocalWorkSizeKHR" => {
cl_get_kernel_suggested_local_work_size_khr as *mut ::std::ffi::c_void
}
// cl_arm_shared_virtual_memory
"clEnqueueSVMFreeARM" => cl_enqueue_svm_free_arm as *mut ::std::ffi::c_void,
"clEnqueueSVMMapARM" => cl_enqueue_svm_map_arm as *mut ::std::ffi::c_void,

View File

@@ -236,6 +236,17 @@ unsafe fn kernel_work_arr_or_default<'a>(arr: *const usize, work_dim: cl_uint) -
}
}
/// Reinterprets a caller-supplied raw pointer as a mutable slice of `work_dim` elements.
///
/// Returns `None` when `arr` is null, otherwise a slice covering `work_dim` values starting
/// at `arr`.
///
/// # Safety
///
/// A non-null `arr` must point to an array of at least `work_dim` `usize` values that stays
/// valid, and is not accessed through any other reference, for the whole of `'a`.
unsafe fn kernel_work_arr_mut<'a>(arr: *mut usize, work_dim: cl_uint) -> Option<&'a mut [usize]> {
    if arr.is_null() {
        return None;
    }

    // SAFETY: `arr` is non-null here; the caller guarantees it addresses `work_dim` elements.
    let work_items = unsafe { slice::from_raw_parts_mut(arr, work_dim as usize) };
    Some(work_items)
}
#[cl_entrypoint]
fn create_kernel(
program: cl_program,
@@ -649,3 +660,109 @@ fn clone_kernel(source_kernel: cl_kernel) -> CLResult<cl_kernel> {
let k = Kernel::ref_from_raw(source_kernel)?;
Ok(Arc::new(k.clone()).into_cl())
}
#[cl_entrypoint]
fn get_kernel_suggested_local_work_size_khr(
command_queue: cl_command_queue,
kernel: cl_kernel,
work_dim: cl_uint,
global_work_offset: *const usize,
global_work_size: *const usize,
suggested_local_work_size: *mut usize,
) -> CLResult<()> {
// CL_INVALID_GLOBAL_WORK_SIZE if global_work_size is NULL or if any of the values specified in
// global_work_size are 0.
if global_work_size.is_null() {
return Err(CL_INVALID_GLOBAL_WORK_SIZE);
}
if global_work_offset.is_null() {
return Err(CL_INVALID_GLOBAL_OFFSET);
}
// CL_INVALID_VALUE if suggested_local_work_size is NULL.
if suggested_local_work_size.is_null() {
return Err(CL_INVALID_VALUE);
}
// CL_INVALID_COMMAND_QUEUE if command_queue is not a valid host command-queue.
let queue = Queue::ref_from_raw(command_queue)?;
// CL_INVALID_KERNEL if kernel is not a valid kernel object.
let kernel = Kernel::ref_from_raw(kernel)?;
// CL_INVALID_CONTEXT if the context associated with kernel is not the same as the context
// associated with command_queue.
if queue.context != kernel.prog.context {
return Err(CL_INVALID_CONTEXT);
}
// CL_INVALID_PROGRAM_EXECUTABLE if there is no successfully built program executable available
// for kernel for the device associated with command_queue.
if kernel.prog.status(queue.device) != CL_BUILD_SUCCESS as cl_build_status {
return Err(CL_INVALID_PROGRAM_EXECUTABLE);
}
// CL_INVALID_KERNEL_ARGS if all argument values for kernel have not been set.
if kernel.arg_values().iter().any(|v| v.is_none()) {
return Err(CL_INVALID_KERNEL_ARGS);
}
// CL_INVALID_WORK_DIMENSION if work_dim is not a valid value (i.e. a value between 1 and
// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS).
if work_dim == 0 || work_dim > queue.device.max_grid_dimensions() {
return Err(CL_INVALID_WORK_DIMENSION);
}
let mut global_work_size =
unsafe { kernel_work_arr_or_default(global_work_size, work_dim).to_vec() };
let suggested_local_work_size = unsafe {
kernel_work_arr_mut(suggested_local_work_size, work_dim).ok_or(CL_INVALID_VALUE)?
};
let global_work_offset = unsafe { kernel_work_arr_or_default(global_work_offset, work_dim) };
let device_bits = queue.device.address_bits();
let device_max = u64::MAX >> (u64::BITS - device_bits);
for i in 0..work_dim as usize {
let gws = global_work_size[i];
let gwo = global_work_offset[i];
// CL_INVALID_GLOBAL_WORK_SIZE if global_work_size is NULL or if any of the values specified
// in global_work_size are 0.
if gws == 0 {
return Err(CL_INVALID_GLOBAL_WORK_SIZE);
}
// CL_INVALID_GLOBAL_WORK_SIZE if any of the values specified in global_work_size exceed the
// maximum value representable by size_t on the device associated with command_queue.
if gws as u64 > device_max {
return Err(CL_INVALID_GLOBAL_WORK_SIZE);
}
// CL_INVALID_GLOBAL_OFFSET if the value specified in global_work_size plus the
// corresponding value in global_work_offset for dimension exceeds the maximum value
// representable by size_t on the device associated with command_queue.
if u64::checked_add(gws as u64, gwo as u64)
.filter(|&x| x <= device_max)
.is_none()
{
return Err(CL_INVALID_GLOBAL_OFFSET);
}
}
kernel.suggest_local_size(
queue.device,
work_dim as usize,
&mut global_work_size,
suggested_local_work_size,
);
Ok(())
// CL_MISALIGNED_SUB_BUFFER_OFFSET if a sub-buffer object is set as an argument to kernel and the offset specified when the sub-buffer object was created is not aligned to CL_DEVICE_MEM_BASE_ADDR_ALIGN for the device associated with command_queue.
// CL_INVALID_IMAGE_SIZE if an image object is set as an argument to kernel and the image dimensions are not supported by device associated with command_queue.
// CL_IMAGE_FORMAT_NOT_SUPPORTED if an image object is set as an argument to kernel and the image format is not supported by the device associated with command_queue.
// CL_INVALID_OPERATION if an SVM pointer is set as an argument to kernel and the device associated with command_queue does not support SVM or the required SVM capabilities for the SVM pointer.
// CL_OUT_OF_RESOURCES if there is a failure to allocate resources required by the OpenCL implementation on the device.
// CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host.
}

View File

@@ -831,21 +831,19 @@ impl Kernel {
})
}
fn optimize_local_size(&self, d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
let mut threads = self.max_threads_per_block(d) as u32;
pub fn suggest_local_size(
&self,
d: &Device,
work_dim: usize,
grid: &mut [usize],
block: &mut [usize],
) {
let mut threads = self.max_threads_per_block(d);
let dim_threads = d.max_block_sizes();
let subgroups = self.preferred_simd_size(d) as u32;
let subgroups = self.preferred_simd_size(d);
if !block.contains(&0) {
for i in 0..3 {
// we already made sure everything is fine
grid[i] /= block[i];
}
return;
}
for i in 0..3 {
let t = cmp::min(threads, dim_threads[i] as u32);
for i in 0..work_dim {
let t = cmp::min(threads, dim_threads[i]);
let gcd = gcd(t, grid[i]);
block[i] = gcd;
@@ -856,9 +854,9 @@ impl Kernel {
}
// if we didn't fill the subgroup we can do a bit better if we have threads remaining
let total_threads = block[0] * block[1] * block[2];
let total_threads = block.iter().take(work_dim).product::<usize>();
if threads != 1 && total_threads < subgroups {
for i in 0..3 {
for i in 0..work_dim {
if grid[i] * total_threads < threads {
block[i] *= grid[i];
grid[i] = 1;
@@ -869,6 +867,31 @@ impl Kernel {
}
}
/// Fills in the block (local) size for a three dimensional launch and rescales `grid`.
///
/// When no entry of `block` is 0, every `grid` entry is divided by the matching `block` entry
/// and nothing else happens. Otherwise both arrays are widened to `usize`, handed to
/// `suggest_local_size` with a work dimension of 3, and the results are narrowed back into
/// `grid` and `block`.
fn optimize_local_size(&self, d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
    let block_is_complete = block.iter().all(|&size| size != 0);
    if block_is_complete {
        // we already made sure everything is fine
        for (groups, &size) in grid.iter_mut().zip(block.iter()) {
            *groups /= size;
        }
        return;
    }

    // suggest_local_size operates on usize slices, so round-trip through wider copies.
    let mut wide_grid = (*grid).map(|v| v as usize);
    let mut wide_block = (*block).map(|v| v as usize);

    self.suggest_local_size(d, 3, &mut wide_grid, &mut wide_block);

    *grid = wide_grid.map(|v| v as u32);
    *block = wide_block.map(|v| v as u32);
}
// the painful part is, that host threads are allowed to modify the kernel object once it was
// enqueued, so return a closure with all req data included.
pub fn launch(

View File

@@ -53,6 +53,7 @@ gen_cl_exts!([
(1, 0, 0, "cl_khr_icd"),
(1, 0, 0, "cl_khr_il_program"),
(1, 0, 0, "cl_khr_spirv_no_integer_wrap_decoration"),
(1, 0, 0, "cl_khr_suggested_local_work_size"),
]);
static mut PLATFORM: Platform = Platform {