rusticl: translate spirv to nir and first steps to kernel arg handling
Signed-off-by: Karol Herbst <kherbst@redhat.com>
Acked-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15439>
@@ -966,6 +966,7 @@ if with_gallium_rusticl
   add_languages('rust', required: true)
 
   with_clc = true
+  with_libclc = true
 endif
 
 dep_clc = null_dep
@@ -1079,18 +1079,27 @@ extern "C" fn cl_enqueue_unmap_mem_object(
 }
 
 extern "C" fn cl_enqueue_ndrange_kernel(
-    _command_queue: cl_command_queue,
-    _kernel: cl_kernel,
-    _work_dim: cl_uint,
-    _global_work_offset: *const usize,
-    _global_work_size: *const usize,
-    _local_work_size: *const usize,
-    _num_events_in_wait_list: cl_uint,
-    _event_wait_list: *const cl_event,
-    _event: *mut cl_event,
+    command_queue: cl_command_queue,
+    kernel: cl_kernel,
+    work_dim: cl_uint,
+    global_work_offset: *const usize,
+    global_work_size: *const usize,
+    local_work_size: *const usize,
+    num_events_in_wait_list: cl_uint,
+    event_wait_list: *const cl_event,
+    event: *mut cl_event,
 ) -> cl_int {
-    println!("cl_enqueue_ndrange_kernel not implemented");
-    CL_OUT_OF_HOST_MEMORY
+    match_err!(enqueue_ndrange_kernel(
+        command_queue,
+        kernel,
+        work_dim,
+        global_work_offset,
+        global_work_size,
+        local_work_size,
+        num_events_in_wait_list,
+        event_wait_list,
+        event
+    ))
 }
 
 extern "C" fn cl_get_extension_function_address(
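The entry point now forwards into the safe Rust implementation and folds its `CLResult` into an OpenCL status code. A minimal, self-contained sketch of what a macro like `match_err!` plausibly expands to (the constants and function names below are stand-ins, not rusticl's):

type CLResult<T> = Result<T, i32>;
const CL_SUCCESS: i32 = 0;
const CL_INVALID_WORK_DIMENSION: i32 = -53;

// A checked implementation returning CLResult<()>, like enqueue_ndrange_kernel.
fn enqueue_checked(work_dim: u32) -> CLResult<()> {
    if work_dim == 0 || work_dim > 3 {
        return Err(CL_INVALID_WORK_DIMENSION);
    }
    Ok(())
}

// Conceptually what the extern "C" wrapper does with the result.
fn cl_entry_point(work_dim: u32) -> i32 {
    match enqueue_checked(work_dim) {
        Ok(_) => CL_SUCCESS,
        Err(e) => e,
    }
}

fn main() {
    assert_eq!(cl_entry_point(2), CL_SUCCESS);
    assert_eq!(cl_entry_point(0), CL_INVALID_WORK_DIMENSION);
}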
@@ -1,14 +1,17 @@
 extern crate mesa_rust_util;
 extern crate rusticl_opencl_gen;
 
+use crate::api::event::create_and_queue;
 use crate::api::icd::*;
 use crate::api::util::*;
+use crate::core::event::*;
 use crate::core::kernel::*;
 
 use self::mesa_rust_util::string::*;
 use self::rusticl_opencl_gen::*;
 
 use std::collections::HashSet;
+use std::slice;
 use std::sync::Arc;
 
 impl CLInfo<cl_kernel_info> for cl_kernel {
@@ -73,6 +76,19 @@ impl CLInfoObj<cl_kernel_work_group_info, cl_device_id> for cl_kernel {
     }
 }
 
+const ZERO_ARR: [usize; 3] = [0; 3];
+
+/// # Safety
+///
+/// This function is only safe when called on an array of `work_dim` length
+unsafe fn kernel_work_arr_or_default<'a>(arr: *const usize, work_dim: cl_uint) -> &'a [usize] {
+    if !arr.is_null() {
+        slice::from_raw_parts(arr, work_dim as usize)
+    } else {
+        &ZERO_ARR
+    }
+}
+
 pub fn create_kernel(
     program: cl_program,
     kernel_name: *const ::std::os::raw::c_char,
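The `kernel_work_arr_or_default` helper added above normalizes the optional work-size arrays that the CL API passes in as raw pointers. A stand-alone illustration of the same pattern (plain Rust with hypothetical values, not rusticl code):

use std::slice;

const ZERO_ARR: [usize; 3] = [0; 3];

// Same contract as the helper above: `arr` must either be null or point
// to at least `work_dim` readable elements.
unsafe fn work_arr_or_default<'a>(arr: *const usize, work_dim: u32) -> &'a [usize] {
    if !arr.is_null() {
        slice::from_raw_parts(arr, work_dim as usize)
    } else {
        &ZERO_ARR
    }
}

fn main() {
    let sizes = [64usize, 8, 1];
    // Non-null pointer: view the caller's array.
    assert_eq!(unsafe { work_arr_or_default(sizes.as_ptr(), 3) }, &[64, 8, 1][..]);
    // Null pointer (CL allows omitting global_work_offset): all zeros.
    assert_eq!(unsafe { work_arr_or_default(std::ptr::null(), 3) }, &[0, 0, 0][..]);
}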
@@ -108,6 +124,7 @@ pub fn create_kernel(
     // CL_INVALID_KERNEL_DEFINITION if the function definition for __kernel function given by
     // kernel_name such as the number of arguments, the argument types are not the same for all
     // devices for which the program executable has been built.
+    let nirs = p.nirs(&name);
     let kernel_args: HashSet<_> = devs.iter().map(|d| p.args(d, &name)).collect();
     if kernel_args.len() != 1 {
         return Err(CL_INVALID_KERNEL_DEFINITION);
@@ -116,6 +133,7 @@ pub fn create_kernel(
     Ok(cl_kernel::from_arc(Kernel::new(
         name,
         p,
+        nirs,
         kernel_args.into_iter().next().unwrap(),
     )))
 }
@@ -123,24 +141,177 @@ pub fn create_kernel(
 pub fn set_kernel_arg(
     kernel: cl_kernel,
     arg_index: cl_uint,
-    _arg_size: usize,
-    _arg_value: *const ::std::os::raw::c_void,
+    arg_size: usize,
+    arg_value: *const ::std::os::raw::c_void,
 ) -> CLResult<()> {
     let k = kernel.get_arc()?;
 
     // CL_INVALID_ARG_INDEX if arg_index is not a valid argument index.
-    if arg_index as usize >= k.args.len() {
-        return Err(CL_INVALID_ARG_INDEX);
+    if let Some(arg) = k.args.get(arg_index as usize) {
+        // CL_INVALID_ARG_SIZE if arg_size does not match the size of the data type for an argument
+        // that is not a memory object or if the argument is a memory object and
+        // arg_size != sizeof(cl_mem) or if arg_size is zero and the argument is declared with the
+        // local qualifier or if the argument is a sampler and arg_size != sizeof(cl_sampler).
+        match arg.kind {
+            KernelArgType::MemLocal => {
+                if arg_size == 0 {
+                    return Err(CL_INVALID_ARG_SIZE);
+                }
+            }
+            _ => {
+                if arg.size != arg_size {
+                    return Err(CL_INVALID_ARG_SIZE);
+                }
+            }
+        }
+
+        // CL_INVALID_ARG_VALUE if arg_value specified is not a valid value.
+        match arg.kind {
+            // If the argument is declared with the local qualifier, the arg_value entry must be
+            // NULL.
+            KernelArgType::MemLocal => {
+                if !arg_value.is_null() {
+                    return Err(CL_INVALID_ARG_VALUE);
+                }
+            }
+            // If the argument is of type sampler_t, the arg_value entry must be a pointer to the
+            // sampler object.
+            KernelArgType::Constant | KernelArgType::Sampler => {
+                if arg_value.is_null() {
+                    return Err(CL_INVALID_ARG_VALUE);
+                }
+            }
+            _ => {}
+        };
+
+        // let's create the arg now
+        let arg = unsafe {
+            if arg.dead {
+                KernelArgValue::None
+            } else {
+                match arg.kind {
+                    KernelArgType::Constant => KernelArgValue::Constant(
+                        slice::from_raw_parts(arg_value.cast(), arg_size).to_vec(),
+                    ),
+                    KernelArgType::MemConstant | KernelArgType::MemGlobal => {
+                        let ptr: *const cl_mem = arg_value.cast();
+                        if ptr.is_null() || (*ptr).is_null() {
+                            KernelArgValue::None
+                        } else {
+                            KernelArgValue::MemObject((*ptr).get_ref()?)
+                        }
+                    }
+                    KernelArgType::MemLocal => KernelArgValue::LocalMem(arg_size),
+                    KernelArgType::Sampler => {
+                        let ptr: *const cl_sampler = arg_value.cast();
+                        KernelArgValue::Sampler((*ptr).get_ref()?)
+                    }
+                }
+            }
+        };
+        k.values.get(arg_index as usize).unwrap().replace(Some(arg));
+        Ok(())
+    } else {
+        Err(CL_INVALID_ARG_INDEX)
     }
 
-    //• CL_INVALID_ARG_VALUE if arg_value specified is not a valid value.
     //• CL_INVALID_MEM_OBJECT for an argument declared to be a memory object when the specified arg_value is not a valid memory object.
     //• CL_INVALID_SAMPLER for an argument declared to be of type sampler_t when the specified arg_value is not a valid sampler object.
     //• CL_INVALID_DEVICE_QUEUE for an argument declared to be of type queue_t when the specified arg_value is not a valid device queue object. This error code is missing before version 2.0.
-    //• CL_INVALID_ARG_SIZE if arg_size does not match the size of the data type for an argument that is not a memory object or if the argument is a memory object and arg_size != sizeof(cl_mem) or if arg_size is zero and the argument is declared with the local qualifier or if the argument is a sampler and arg_size != sizeof(cl_sampler).
     //• CL_MAX_SIZE_RESTRICTION_EXCEEDED if the size in bytes of the memory object (if the argument is a memory object) or arg_size (if the argument is declared with local qualifier) exceeds a language-specified maximum size restriction for this argument, such as the MaxByteOffset SPIR-V decoration. This error code is missing before version 2.2.
     //• CL_INVALID_ARG_VALUE if the argument is an image declared with the read_only qualifier and arg_value refers to an image object created with cl_mem_flags of CL_MEM_WRITE_ONLY or if the image argument is declared with the write_only qualifier and arg_value refers to an image object created with cl_mem_flags of CL_MEM_READ_ONLY.
-
-    println!("set_kernel_arg not implemented");
-    Err(CL_OUT_OF_HOST_MEMORY)
 }
 
+pub fn enqueue_ndrange_kernel(
+    command_queue: cl_command_queue,
+    kernel: cl_kernel,
+    work_dim: cl_uint,
+    global_work_offset: *const usize,
+    global_work_size: *const usize,
+    local_work_size: *const usize,
+    num_events_in_wait_list: cl_uint,
+    event_wait_list: *const cl_event,
+    event: *mut cl_event,
+) -> CLResult<()> {
+    let q = command_queue.get_arc()?;
+    let k = kernel.get_arc()?;
+    let evs = event_list_from_cl(&q, num_events_in_wait_list, event_wait_list)?;
+
+    // CL_INVALID_CONTEXT if context associated with command_queue and kernel are not the same
+    if q.context != k.prog.context {
+        return Err(CL_INVALID_CONTEXT);
+    }
+
+    // CL_INVALID_PROGRAM_EXECUTABLE if there is no successfully built program executable available
+    // for device associated with command_queue.
+    if k.prog.status(&q.device) != CL_BUILD_SUCCESS as cl_build_status {
+        return Err(CL_INVALID_PROGRAM_EXECUTABLE);
+    }
+
+    // CL_INVALID_KERNEL_ARGS if the kernel argument values have not been specified.
+    if k.values.iter().any(|v| v.borrow().is_none()) {
+        return Err(CL_INVALID_KERNEL_ARGS);
+    }
+
+    // CL_INVALID_WORK_DIMENSION if work_dim is not a valid value (i.e. a value between 1 and
+    // CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS).
+    if work_dim == 0 || work_dim > q.device.max_grid_dimensions() {
+        return Err(CL_INVALID_WORK_DIMENSION);
+    }
+
+    // we assume the application gets it right and doesn't pass shorter arrays than actually needed.
+    let global_work_size = unsafe { kernel_work_arr_or_default(global_work_size, work_dim) };
+    let local_work_size = unsafe { kernel_work_arr_or_default(local_work_size, work_dim) };
+    let global_work_offset = unsafe { kernel_work_arr_or_default(global_work_offset, work_dim) };
+
+    if q.device.address_bits() == 32 {
+        for (s, o) in global_work_size.iter().zip(global_work_offset) {
+            // CL_INVALID_GLOBAL_WORK_SIZE if any of the values specified in global_work_size[0], …
+            // global_work_size[work_dim - 1] exceed the maximum value representable by size_t on
+            // the device on which the kernel-instance will be enqueued.
+            if *s > u32::MAX as usize {
+                return Err(CL_INVALID_GLOBAL_WORK_SIZE);
+            }
+            // CL_INVALID_GLOBAL_OFFSET if the value specified in global_work_size + the
+            // corresponding values in global_work_offset for any dimensions is greater than the
+            // maximum value representable by size_t on the device on which the kernel-instance
+            // will be enqueued
+            if s + o > u32::MAX as usize {
+                return Err(CL_INVALID_GLOBAL_OFFSET);
+            }
+        }
+    }
+
+    // CL_INVALID_WORK_ITEM_SIZE if the number of work-items specified in any of
+    // local_work_size[0], … local_work_size[work_dim - 1] is greater than the corresponding values
+    // specified by CL_DEVICE_MAX_WORK_ITEM_SIZES[0], …, CL_DEVICE_MAX_WORK_ITEM_SIZES[work_dim - 1].
+    if local_work_size.iter().gt(q.device.max_block_sizes().iter()) {
+        return Err(CL_INVALID_WORK_ITEM_SIZE);
+    }
+
+    // If global_work_size is NULL, or the value in any passed dimension is 0 then the kernel
+    // command will trivially succeed after its event dependencies are satisfied and subsequently
+    // update its completion event.
+    let cb: EventSig = if global_work_size.contains(&0) {
+        Box::new(|_, _| Ok(()))
+    } else {
+        k.launch(
+            &q,
+            work_dim,
+            local_work_size,
+            global_work_size,
+            global_work_offset,
+        )
+    };
+
+    create_and_queue(q, CL_COMMAND_NDRANGE_KERNEL, evs, event, false, cb)
+
+    //• CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and does not match the required work-group size for kernel in the program source.
+    //• CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and is not consistent with the required number of sub-groups for kernel in the program source.
+    //• CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the total number of work-items in the work-group computed as local_work_size[0] × … local_work_size[work_dim - 1] is greater than the value specified by CL_KERNEL_WORK_GROUP_SIZE in the Kernel Object Device Queries table.
+    //• CL_INVALID_WORK_GROUP_SIZE if the work-group size must be uniform and the local_work_size is not NULL, is not equal to the required work-group size specified in the kernel source, or the global_work_size is not evenly divisible by the local_work_size.
+    //• CL_MISALIGNED_SUB_BUFFER_OFFSET if a sub-buffer object is specified as the value for an argument that is a buffer object and the offset specified when the sub-buffer object is created is not aligned to CL_DEVICE_MEM_BASE_ADDR_ALIGN value for device associated with queue. This error code
+    //• CL_INVALID_IMAGE_SIZE if an image object is specified as an argument value and the image dimensions (image width, height, specified or compute row and/or slice pitch) are not supported by device associated with queue.
+    //• CL_IMAGE_FORMAT_NOT_SUPPORTED if an image object is specified as an argument value and the image format (image channel order and data type) is not supported by device associated with queue.
+    //• CL_OUT_OF_RESOURCES if there is a failure to queue the execution instance of kernel on the command-queue because of insufficient resources needed to execute the kernel. For example, the explicitly specified local_work_size causes a failure to execute the kernel because of insufficient resources such as registers or local memory. Another example would be the number of read-only image args used in kernel exceed the CL_DEVICE_MAX_READ_IMAGE_ARGS value for device or the number of write-only and read-write image args used in kernel exceed the CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS value for device or the number of samplers used in kernel exceed CL_DEVICE_MAX_SAMPLERS for device.
+    //• CL_MEM_OBJECT_ALLOCATION_FAILURE if there is a failure to allocate memory for data store associated with image or buffer objects specified as arguments to kernel.
+    //• CL_INVALID_OPERATION if SVM pointers are passed as arguments to a kernel and the device does not support SVM or if system pointers are passed as arguments to a kernel and/or stored inside SVM allocations passed as kernel arguments and the device does not support fine grain system SVM allocations.
+}
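The 32-bit device checks above rely on the host's 64-bit `usize`, so `s + o` cannot itself overflow while being compared against `u32::MAX`. A self-contained version of that bounds check (assuming a 64-bit host, as the code above does):

fn fits_in_32bit(size: usize, offset: usize) -> bool {
    size <= u32::MAX as usize && size + offset <= u32::MAX as usize
}

fn main() {
    assert!(fits_in_32bit(1 << 20, 0));
    assert!(!fits_in_32bit(u32::MAX as usize, 1)); // offset pushes past 2^32 - 1
}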
@@ -30,7 +30,7 @@ impl Context {
         })
     }
 
-    pub fn create_buffer(&self, size: usize) -> CLResult<HashMap<Arc<Device>, PipeResource>> {
+    pub fn create_buffer(&self, size: usize) -> CLResult<HashMap<Arc<Device>, Arc<PipeResource>>> {
         let adj_size: u32 = size.try_into().map_err(|_| CL_OUT_OF_HOST_MEMORY)?;
         let mut res = HashMap::new();
         for dev in &self.devs {
@@ -38,7 +38,7 @@ impl Context {
                 .screen()
                 .resource_create_buffer(adj_size)
                 .ok_or(CL_OUT_OF_RESOURCES);
-            res.insert(Arc::clone(dev), resource?);
+            res.insert(Arc::clone(dev), Arc::new(resource?));
         }
         Ok(res)
     }
@@ -47,7 +47,7 @@ impl Context {
         &self,
         size: usize,
         user_ptr: *mut c_void,
-    ) -> CLResult<HashMap<Arc<Device>, PipeResource>> {
+    ) -> CLResult<HashMap<Arc<Device>, Arc<PipeResource>>> {
         let adj_size: u32 = size.try_into().map_err(|_| CL_OUT_OF_HOST_MEMORY)?;
         let mut res = HashMap::new();
         for dev in &self.devs {
@@ -55,7 +55,7 @@ impl Context {
                 .screen()
                 .resource_create_buffer_from_user(adj_size, user_ptr)
                 .ok_or(CL_OUT_OF_RESOURCES);
-            res.insert(Arc::clone(dev), resource?);
+            res.insert(Arc::clone(dev), Arc::new(resource?));
         }
         Ok(res)
     }
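The per-device `PipeResource` values are now wrapped in `Arc` because kernel launches capture them in closures that can outlive the map they were fetched from (see `Kernel::launch` further down). The ownership pattern in isolation, with a stand-in resource type:

use std::sync::Arc;

// Stand-in for a driver resource handle; in rusticl this is PipeResource.
struct Resource(u32);

fn main() {
    let res = Arc::new(Resource(42));
    // A queued kernel launch can keep the resource alive inside a
    // closure that outlives the map it came from.
    let launch = {
        let res = Arc::clone(&res);
        move || res.0
    };
    drop(res);
    assert_eq!(launch(), 42);
}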
@@ -10,6 +10,8 @@ use crate::core::util::*;
 use crate::core::version::*;
 use crate::impl_cl_type_trait;
 
+use self::mesa_rust::compiler::clc::*;
+use self::mesa_rust::compiler::nir::*;
 use self::mesa_rust::pipe::context::*;
 use self::mesa_rust::pipe::device::load_screens;
 use self::mesa_rust::pipe::screen::*;
@@ -27,7 +29,7 @@ use std::sync::MutexGuard;
 
 pub struct Device {
     pub base: CLObjectBase<CL_INVALID_DEVICE>,
-    screen: Arc<PipeScreen>,
+    pub screen: Arc<PipeScreen>,
     pub cl_version: CLVersion,
     pub clc_version: CLVersion,
     pub clc_versions: Vec<cl_name_version>,
@@ -36,6 +38,7 @@ pub struct Device {
     pub extension_string: String,
     pub extensions: Vec<cl_name_version>,
     pub formats: HashMap<cl_image_format, HashMap<cl_mem_object_type, cl_mem_flags>>,
+    pub lib_clc: NirShader,
     helper_ctx: Mutex<Arc<PipeContext>>,
 }
 
@@ -43,6 +46,11 @@ impl_cl_type_trait!(cl_device_id, Device, CL_INVALID_DEVICE);
 
 impl Device {
     fn new(screen: Arc<PipeScreen>) -> Option<Arc<Device>> {
+        if !Self::check_valid(&screen) {
+            return None;
+        }
+
+        let lib_clc = spirv::SPIRVBin::get_lib_clc(&screen);
         let mut d = Self {
             base: CLObjectBase::new(),
             helper_ctx: Mutex::new(screen.create_context().unwrap()),
@@ -55,12 +63,9 @@ impl Device {
             extension_string: String::from(""),
             extensions: Vec::new(),
             formats: HashMap::new(),
+            lib_clc: lib_clc?,
         };
 
-        if !d.check_valid() {
-            return None;
-        }
-
         d.fill_format_tables();
 
         // check if we are embedded or full profile first
@@ -112,17 +117,21 @@ impl Device {
         }
     }
 
-    fn check_valid(&self) -> bool {
-        if self.screen.param(pipe_cap::PIPE_CAP_COMPUTE) == 0 ||
+    fn check_valid(screen: &PipeScreen) -> bool {
+        if screen.param(pipe_cap::PIPE_CAP_COMPUTE) == 0 ||
            // even though we use PIPE_SHADER_IR_NIR, PIPE_SHADER_IR_NIR_SERIALIZED marks CL support by the driver
-           self.shader_param(pipe_shader_cap::PIPE_SHADER_CAP_SUPPORTED_IRS) & (1 << (pipe_shader_ir::PIPE_SHADER_IR_NIR_SERIALIZED as i32)) == 0
+           screen.shader_param(pipe_shader_type::PIPE_SHADER_COMPUTE, pipe_shader_cap::PIPE_SHADER_CAP_SUPPORTED_IRS) & (1 << (pipe_shader_ir::PIPE_SHADER_IR_NIR_SERIALIZED as i32)) == 0
         {
             return false;
         }
 
        // CL_DEVICE_MAX_PARAMETER_SIZE
        // For this minimum value, only a maximum of 128 arguments can be passed to a kernel
-        if self.param_max_size() < 128 {
+        if ComputeParam::<u64>::compute_param(
+            screen,
+            pipe_compute_cap::PIPE_COMPUTE_CAP_MAX_INPUT_SIZE,
+        ) < 128
+        {
            return false;
        }
        true
@@ -3,37 +3,430 @@ extern crate mesa_rust_gen;
 extern crate rusticl_opencl_gen;
 
 use crate::api::icd::*;
+use crate::core::device::*;
+use crate::core::event::*;
+use crate::core::memory::*;
 use crate::core::program::*;
+use crate::core::queue::*;
 use crate::impl_cl_type_trait;
 
 use self::mesa_rust::compiler::clc::*;
+use self::mesa_rust::compiler::nir::*;
+use self::mesa_rust_gen::*;
 use self::rusticl_opencl_gen::*;
 
+use std::cell::RefCell;
+use std::collections::HashMap;
+use std::collections::HashSet;
+use std::convert::TryInto;
+use std::ptr;
 use std::sync::Arc;
 
+// ugh, we are not allowed to take refs, so...
+pub enum KernelArgValue {
+    None,
+    Constant(Vec<u8>),
+    MemObject(&'static Mem),
+    Sampler(&'static Sampler),
+    LocalMem(usize),
+}
+
+#[derive(PartialEq, Eq)]
+pub enum KernelArgType {
+    Constant, // for anything passed by value
+    Sampler,
+    MemGlobal,
+    MemConstant,
+    MemLocal,
+}
+
+#[derive(Hash, PartialEq, Eq)]
+pub enum InternalKernelArgType {
+    ConstantBuffer,
+}
+
+pub struct KernelArg {
+    spirv: spirv::SPIRVKernelArg,
+    pub kind: KernelArgType,
+    pub size: usize,
+    pub offset: usize,
+    pub dead: bool,
+}
+
+#[derive(Hash, PartialEq, Eq)]
+pub struct InternalKernelArg {
+    pub kind: InternalKernelArgType,
+    pub size: usize,
+    pub offset: usize,
+}
+
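The `Hash`/`Eq` derives on these argument types are what make the "all devices must agree" checks in `create_kernel` and `Kernel::new` possible: per-device results get collapsed into a `HashSet` that must end up with exactly one element. A toy version of that check, with a hypothetical layout type:

use std::collections::HashSet;

#[derive(Hash, PartialEq, Eq, Debug)]
struct ArgLayout { size: usize, offset: usize }

fn main() {
    let per_device = vec![
        vec![ArgLayout { size: 8, offset: 0 }],
        vec![ArgLayout { size: 8, offset: 0 }],
    ];
    let unique: HashSet<_> = per_device.into_iter().collect();
    assert_eq!(unique.len(), 1); // all devices produced the same layout
}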
+impl KernelArg {
+    fn from_spirv_nir(spirv: Vec<spirv::SPIRVKernelArg>, nir: &mut NirShader) -> Vec<Self> {
+        let nir_arg_map: HashMap<_, _> = nir
+            .variables_with_mode(
+                nir_variable_mode::nir_var_uniform | nir_variable_mode::nir_var_image,
+            )
+            .map(|v| (v.data.location, v))
+            .collect();
+        let mut res = Vec::new();
+
+        for (i, s) in spirv.into_iter().enumerate() {
+            let nir = nir_arg_map.get(&(i as i32)).unwrap();
+            let kind = match s.address_qualifier {
+                clc_kernel_arg_address_qualifier::CLC_KERNEL_ARG_ADDRESS_PRIVATE => {
+                    if unsafe { glsl_type_is_sampler(nir.type_) } {
+                        KernelArgType::Sampler
+                    } else {
+                        KernelArgType::Constant
+                    }
+                }
+                clc_kernel_arg_address_qualifier::CLC_KERNEL_ARG_ADDRESS_CONSTANT => {
+                    KernelArgType::MemConstant
+                }
+                clc_kernel_arg_address_qualifier::CLC_KERNEL_ARG_ADDRESS_LOCAL => {
+                    KernelArgType::MemLocal
+                }
+                clc_kernel_arg_address_qualifier::CLC_KERNEL_ARG_ADDRESS_GLOBAL => {
+                    KernelArgType::MemGlobal
+                }
+            };
+
+            res.push(Self {
+                spirv: s,
+                size: unsafe { glsl_get_cl_size(nir.type_) } as usize,
+                // we'll update it later in the 2nd pass
+                kind: kind,
+                offset: 0,
+                dead: true,
+            });
+        }
+        res
+    }
+
+    fn assign_locations(
+        args: &mut [Self],
+        internal_args: &mut [InternalKernelArg],
+        nir: &mut NirShader,
+    ) {
+        for var in nir.variables_with_mode(
+            nir_variable_mode::nir_var_uniform | nir_variable_mode::nir_var_image,
+        ) {
+            if let Some(arg) = args.get_mut(var.data.location as usize) {
+                arg.offset = var.data.driver_location as usize;
+                arg.dead = false;
+            } else {
+                internal_args
+                    .get_mut(var.data.location as usize - args.len())
+                    .unwrap()
+                    .offset = var.data.driver_location as usize;
+            }
+        }
+    }
+}
+
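`from_spirv_nir` joins SPIR-V argument metadata to NIR variables by their `location`, and `assign_locations` later copies back the driver-assigned offsets in a second pass. A toy version of that join, with hypothetical types standing in for the SPIR-V and NIR structures:

use std::collections::HashMap;

struct SpirvArg { name: &'static str }
struct NirVar { location: i32, driver_location: usize }

fn main() {
    let vars = vec![
        NirVar { location: 1, driver_location: 8 },
        NirVar { location: 0, driver_location: 0 },
    ];
    // Index NIR variables by location, like nir_arg_map above.
    let by_loc: HashMap<i32, &NirVar> = vars.iter().map(|v| (v.location, v)).collect();
    let args = vec![SpirvArg { name: "a" }, SpirvArg { name: "b" }];
    for (i, arg) in args.iter().enumerate() {
        let v = by_loc[&(i as i32)];
        println!("{} -> driver offset {}", arg.name, v.driver_location);
    }
}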
 #[repr(C)]
 pub struct Kernel {
     pub base: CLObjectBase<CL_INVALID_KERNEL>,
     pub prog: Arc<Program>,
     pub name: String,
-    pub args: Vec<spirv::SPIRVKernelArg>,
+    pub args: Vec<KernelArg>,
+    pub values: Vec<RefCell<Option<KernelArgValue>>>,
+    internal_args: Vec<InternalKernelArg>,
+    nirs: HashMap<Arc<Device>, NirShader>,
 }
 
 impl_cl_type_trait!(cl_kernel, Kernel, CL_INVALID_KERNEL);
 
+fn create_kernel_arr(vals: &[usize], val: u32) -> [u32; 3] {
+    let mut res = [val; 3];
+    for (i, v) in vals.iter().enumerate() {
+        res[i] = (*v).try_into().expect("64 bit work groups not supported");
+    }
+    res
+}
+
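`create_kernel_arr` pads the 1- to 3-entry slices from the API out to the fixed `[u32; 3]` that `pipe_grid_info` expects, with a per-use default for unspecified dimensions. A quick demonstration of its behavior (the function is repeated here so the snippet runs on its own):

use std::convert::TryInto;

fn create_kernel_arr(vals: &[usize], val: u32) -> [u32; 3] {
    let mut res = [val; 3];
    for (i, v) in vals.iter().enumerate() {
        res[i] = (*v).try_into().expect("64 bit work groups not supported");
    }
    res
}

fn main() {
    assert_eq!(create_kernel_arr(&[256, 4], 1), [256, 4, 1]); // block default is 1
    assert_eq!(create_kernel_arr(&[10], 0), [10, 0, 0]);      // offset default is 0
}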
+// mostly like clc_spirv_to_dxil
+// does not DCE uniforms or images!
+fn lower_and_optimize_nir_pre_inputs(nir: &mut NirShader, lib_clc: &NirShader) {
+    nir.set_workgroup_size(&[0; 3]);
+    nir.structurize();
+    while {
+        let mut progress = false;
+        progress |= nir.pass0(nir_copy_prop);
+        progress |= nir.pass0(nir_opt_copy_prop_vars);
+        progress |= nir.pass0(nir_opt_deref);
+        progress |= nir.pass0(nir_opt_dce);
+        progress |= nir.pass0(nir_opt_undef);
+        progress |= nir.pass0(nir_opt_constant_folding);
+        progress |= nir.pass0(nir_opt_cse);
+        progress |= nir.pass0(nir_lower_vars_to_ssa);
+        progress |= nir.pass0(nir_opt_algebraic);
+        progress
+    } {}
+    nir.inline(lib_clc);
+    nir.remove_non_entrypoints();
+    while {
+        let mut progress = false;
+        progress |= nir.pass0(nir_copy_prop);
+        progress |= nir.pass0(nir_opt_copy_prop_vars);
+        progress |= nir.pass0(nir_opt_deref);
+        progress |= nir.pass0(nir_opt_dce);
+        progress |= nir.pass0(nir_opt_undef);
+        progress |= nir.pass0(nir_opt_constant_folding);
+        progress |= nir.pass0(nir_opt_cse);
+        progress |= nir.pass0(nir_split_var_copies);
+        progress |= nir.pass0(nir_lower_var_copies);
+        progress |= nir.pass0(nir_lower_vars_to_ssa);
+        progress |= nir.pass0(nir_opt_algebraic);
+        progress |= nir.pass1(
+            nir_opt_if,
+            nir_opt_if_options::nir_opt_if_aggressive_last_continue
+                | nir_opt_if_options::nir_opt_if_optimize_phi_true_false,
+        );
+        progress |= nir.pass0(nir_opt_dead_cf);
+        progress |= nir.pass0(nir_opt_remove_phis);
+        // we don't want to be too aggressive here, but it kills a bit of CFG
+        progress |= nir.pass3(nir_opt_peephole_select, 1, true, true);
+        progress |= nir.pass1(
+            nir_lower_vec3_to_vec4,
+            nir_variable_mode::nir_var_mem_generic | nir_variable_mode::nir_var_uniform,
+        );
+        progress
+    } {}
+    // TODO variable initializers
+    // TODO lower memcpy
+    nir.pass0(nir_dedup_inline_samplers);
+    nir.pass2(
+        nir_lower_vars_to_explicit_types,
+        nir_variable_mode::nir_var_function_temp,
+        Some(glsl_get_cl_type_size_align),
+    );
+    // TODO printf
+    nir.pass0(nir_split_var_copies);
+    nir.pass0(nir_opt_copy_prop_vars);
+    nir.pass0(nir_lower_var_copies);
+    nir.pass0(nir_lower_vars_to_ssa);
+    nir.pass0(nir_lower_alu);
+    nir.pass0(nir_opt_dce);
+    nir.pass0(nir_opt_deref);
+}
+
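The `while { ... progress } {}` construction above is the usual Rust idiom for a do-while loop: run all passes at least once and repeat until none of them reports progress. A stand-alone illustration with a toy "pass":

fn main() {
    let mut v = vec![5, 3, 8, 1];
    let mut iterations = 0;
    while {
        // one "pass": bubble adjacent out-of-order pairs
        let mut progress = false;
        for i in 0..v.len() - 1 {
            if v[i] > v[i + 1] {
                v.swap(i, i + 1);
                progress = true;
            }
        }
        iterations += 1;
        progress // loop again only if something changed
    } {}
    assert_eq!(v, [1, 3, 5, 8]);
    assert!(iterations >= 2); // the final iteration confirms the fixpoint
}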
+fn lower_and_optimize_nir_late(
+    dev: &Device,
+    nir: &mut NirShader,
+    args: usize,
+) -> Vec<InternalKernelArg> {
+    let mut res = Vec::new();
+    let mut lower_state = rusticl_lower_state::default();
+
+    nir.pass2(
+        nir_remove_dead_variables,
+        nir_variable_mode::nir_var_uniform
+            | nir_variable_mode::nir_var_mem_constant
+            | nir_variable_mode::nir_var_function_temp,
+        ptr::null(),
+    );
+    nir.pass1(nir_lower_readonly_images_to_tex, false);
+    nir.pass2(
+        nir_remove_dead_variables,
+        nir_variable_mode::nir_var_mem_shared | nir_variable_mode::nir_var_function_temp,
+        ptr::null(),
+    );
+    nir.reset_scratch_size();
+    nir.pass2(
+        nir_lower_vars_to_explicit_types,
+        nir_variable_mode::nir_var_mem_constant,
+        Some(glsl_get_cl_type_size_align),
+    );
+    nir.extract_constant_initializers();
+    // TODO printf
+    // TODO 32 bit devices
+    if nir.has_constant() {
+        res.push(InternalKernelArg {
+            kind: InternalKernelArgType::ConstantBuffer,
+            offset: 0,
+            size: 8,
+        });
+        lower_state.const_buf = nir.add_var(
+            nir_variable_mode::nir_var_uniform,
+            unsafe { glsl_uint64_t_type() },
+            args + res.len() - 1,
+            "constant_buffer_addr",
+        );
+    }
+
+    nir.pass2(
+        nir_lower_vars_to_explicit_types,
+        nir_variable_mode::nir_var_mem_shared
+            | nir_variable_mode::nir_var_function_temp
+            | nir_variable_mode::nir_var_uniform
+            | nir_variable_mode::nir_var_mem_global,
+        Some(glsl_get_cl_type_size_align),
+    );
+    nir.pass2(
+        nir_lower_explicit_io,
+        nir_variable_mode::nir_var_mem_global | nir_variable_mode::nir_var_mem_constant,
+        nir_address_format::nir_address_format_64bit_global,
+    );
+    nir.pass1(rusticl_lower_intrinsics, &mut lower_state);
+    nir.pass2(
+        nir_lower_explicit_io,
+        nir_variable_mode::nir_var_mem_shared
+            | nir_variable_mode::nir_var_function_temp
+            | nir_variable_mode::nir_var_uniform,
+        nir_address_format::nir_address_format_32bit_offset_as_64bit,
+    );
+    nir.pass0(nir_lower_system_values);
+    let compute_options = nir_lower_compute_system_values_options::default();
+    nir.pass1(nir_lower_compute_system_values, &compute_options);
+    nir.pass0(nir_opt_deref);
+    nir.pass0(nir_lower_vars_to_ssa);
+
+    // TODO whatever clc is doing here
+
+    nir.pass1(nir_lower_convert_alu_types, None);
+    nir.pass0(nir_opt_dce);
+    dev.screen.finalize_nir(nir);
+    res
+}
+
 impl Kernel {
-    pub fn new(name: String, prog: Arc<Program>, args: Vec<spirv::SPIRVKernelArg>) -> Arc<Kernel> {
+    pub fn new(
+        name: String,
+        prog: Arc<Program>,
+        mut nirs: HashMap<Arc<Device>, NirShader>,
+        args: Vec<spirv::SPIRVKernelArg>,
+    ) -> Arc<Kernel> {
+        nirs.iter_mut()
+            .for_each(|(d, n)| lower_and_optimize_nir_pre_inputs(n, &d.lib_clc));
+        let nir = nirs.values_mut().next().unwrap();
+        let mut args = KernelArg::from_spirv_nir(args, nir);
+        // can't use vec!...
+        let values = args.iter().map(|_| RefCell::new(None)).collect();
+        let internal_args: HashSet<_> = nirs
+            .iter_mut()
+            .map(|(d, n)| lower_and_optimize_nir_late(d, n, args.len()))
+            .collect();
+        // we want the same internal args for every compiled kernel, for now
+        assert!(internal_args.len() == 1);
+        let mut internal_args = internal_args.into_iter().next().unwrap();
+
+        nirs.values_mut()
+            .for_each(|n| KernelArg::assign_locations(&mut args, &mut internal_args, n));
+
         Arc::new(Self {
             base: CLObjectBase::new(),
             prog: prog,
             name: name,
             args: args,
+            values: values,
+            internal_args: internal_args,
+            // caller has to verify all kernels have the same sig
+            nirs: nirs,
         })
     }
 
+    // the painful part is that host threads are allowed to modify the kernel object once it was
+    // enqueued, so return a closure with all required data included.
+    pub fn launch(
+        &self,
+        q: &Arc<Queue>,
+        work_dim: u32,
+        block: &[usize],
+        grid: &[usize],
+        offsets: &[usize],
+    ) -> EventSig {
+        let nir = self.nirs.get(&q.device).unwrap();
+        let mut block = create_kernel_arr(block, 1);
+        let mut grid = create_kernel_arr(grid, 1);
+        let offsets = create_kernel_arr(offsets, 0);
+        let mut input: Vec<u8> = Vec::new();
+        let mut resource_info = Vec::new();
+        let mut local_size: u32 = nir.shared_size();
+
+        for i in 0..3 {
+            if block[i] == 0 {
+                block[i] = 1;
+            } else {
+                grid[i] /= block[i];
+            }
+        }
+
+        for (arg, val) in self.args.iter().zip(&self.values) {
+            if arg.dead {
+                continue;
+            }
+            match val.borrow().as_ref().unwrap() {
+                KernelArgValue::Constant(c) => input.extend_from_slice(c),
+                KernelArgValue::MemObject(mem) => {
+                    input.extend_from_slice(&mem.offset.to_ne_bytes());
+                    resource_info.push((Some(mem.get_res_of_dev(&q.device).clone()), arg.offset));
+                }
+                KernelArgValue::LocalMem(size) => {
+                    // TODO 32 bit
+                    input.extend_from_slice(&[0; 8]);
+                    local_size += *size as u32;
+                }
+                KernelArgValue::None => {
+                    assert!(
+                        arg.kind == KernelArgType::MemGlobal
+                            || arg.kind == KernelArgType::MemConstant
+                    );
+                    input.extend_from_slice(&[0; 8]);
+                }
+                _ => panic!("unhandled arg type"),
+            }
+        }
+
+        for arg in &self.internal_args {
+            match arg.kind {
+                InternalKernelArgType::ConstantBuffer => {
+                    input.extend_from_slice(&[0; 8]);
+                    let buf = nir.get_constant_buffer();
+                    let res = Arc::new(
+                        q.device
+                            .screen()
+                            .resource_create_buffer(buf.len() as u32)
+                            .unwrap(),
+                    );
+                    q.device.helper_ctx().buffer_subdata(
+                        &res,
+                        0,
+                        buf.as_ptr().cast(),
+                        buf.len() as u32,
+                    );
+                    resource_info.push((Some(res), arg.offset));
+                }
+            }
+        }
+
+        let cso = q
+            .device
+            .helper_ctx()
+            .create_compute_state(nir, input.len() as u32, local_size);
+
+        Box::new(move |_, ctx| {
+            let mut input = input.clone();
+            let mut resources = Vec::with_capacity(resource_info.len());
+            let mut globals: Vec<*mut u32> = Vec::new();
+
+            for (res, offset) in resource_info.clone() {
+                resources.push(res);
+                globals.push(unsafe { input.as_mut_ptr().add(offset) }.cast());
+            }
+
+            ctx.bind_compute_state(cso);
+            ctx.set_global_binding(resources.as_slice(), &mut globals);
+            ctx.launch_grid(work_dim, block, grid, offsets, &input);
+            ctx.clear_global_binding(globals.len() as u32);
+            ctx.delete_compute_state(cso);
+            ctx.memory_barrier(PIPE_BARRIER_GLOBAL_BUFFER);
+            Ok(())
+        })
+    }
+
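`launch` packs by-value arguments and placeholder pointers into one byte buffer that the compute state reads as its `input`; the 8-byte global-buffer slots are patched with real GPU addresses at bind time through `set_global_binding`. A minimal packing sketch (the values are made up):

fn main() {
    let mut input: Vec<u8> = Vec::new();

    // A by-value argument is copied in directly.
    let scalar: u32 = 7;
    input.extend_from_slice(&scalar.to_ne_bytes());

    // A global memory argument reserves 8 bytes that the driver overwrites
    // with the buffer's address once the binding is set.
    let placeholder: u64 = 0;
    input.extend_from_slice(&placeholder.to_ne_bytes());

    assert_eq!(input.len(), 4 + 8);
}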
     pub fn access_qualifier(&self, idx: cl_uint) -> cl_kernel_arg_access_qualifier {
-        let aq = self.args[idx as usize].access_qualifier;
+        let aq = self.args[idx as usize].spirv.access_qualifier;
 
         if aq
             == clc_kernel_arg_access_qualifier::CLC_KERNEL_ARG_ACCESS_READ
@@ -50,7 +443,7 @@ impl Kernel {
     }
 
     pub fn address_qualifier(&self, idx: cl_uint) -> cl_kernel_arg_address_qualifier {
-        match self.args[idx as usize].address_qualifier {
+        match self.args[idx as usize].spirv.address_qualifier {
             clc_kernel_arg_address_qualifier::CLC_KERNEL_ARG_ADDRESS_PRIVATE => {
                 CL_KERNEL_ARG_ADDRESS_PRIVATE
             }
@@ -67,7 +460,7 @@ impl Kernel {
     }
 
     pub fn type_qualifier(&self, idx: cl_uint) -> cl_kernel_arg_type_qualifier {
-        let tq = self.args[idx as usize].type_qualifier;
+        let tq = self.args[idx as usize].spirv.type_qualifier;
         let zero = clc_kernel_arg_type_qualifier(0);
         let mut res = CL_KERNEL_ARG_TYPE_NONE;
 
@@ -87,10 +480,10 @@ impl Kernel {
     }
 
     pub fn arg_name(&self, idx: cl_uint) -> &String {
-        &self.args[idx as usize].name
+        &self.args[idx as usize].spirv.name
     }
 
     pub fn arg_type_name(&self, idx: cl_uint) -> &String {
-        &self.args[idx as usize].type_name
+        &self.args[idx as usize].spirv.type_name
     }
 }
@@ -36,7 +36,7 @@ pub struct Mem {
     pub image_desc: cl_image_desc,
     pub image_elem_size: u8,
     pub cbs: Mutex<Vec<Box<dyn Fn(cl_mem)>>>,
-    res: Option<HashMap<Arc<Device>, PipeResource>>,
+    res: Option<HashMap<Arc<Device>, Arc<PipeResource>>>,
     maps: Mutex<HashMap<*mut c_void, (u32, PipeTransfer)>>,
 }
 
@@ -187,7 +187,7 @@ impl Mem {
         ptr::eq(a, b)
     }
 
-    fn get_res(&self) -> &HashMap<Arc<Device>, PipeResource> {
+    fn get_res(&self) -> &HashMap<Arc<Device>, Arc<PipeResource>> {
         self.parent
             .as_ref()
             .map_or(self, |p| p.as_ref())
@@ -196,6 +196,10 @@ impl Mem {
             .unwrap()
     }
 
+    pub fn get_res_of_dev(&self, dev: &Arc<Device>) -> &Arc<PipeResource> {
+        self.get_res().get(dev).unwrap()
+    }
+
     fn to_parent<'a>(&'a self, offset: &mut usize) -> &'a Self {
         if let Some(parent) = &self.parent {
             offset.add_assign(self.offset);
@@ -1,4 +1,5 @@
 extern crate mesa_rust;
+extern crate mesa_rust_gen;
 extern crate rusticl_opencl_gen;
 
 use crate::api::icd::*;
@@ -7,6 +8,8 @@ use crate::core::device::*;
 use crate::impl_cl_type_trait;
 
 use self::mesa_rust::compiler::clc::*;
+use self::mesa_rust::compiler::nir::*;
+use self::mesa_rust_gen::*;
 use self::rusticl_opencl_gen::*;
 
 use std::collections::HashMap;
@@ -227,4 +230,28 @@ impl Program {
             }),
         })
     }
+
+    pub fn nirs(&self, kernel: &str) -> HashMap<Arc<Device>, NirShader> {
+        let mut lock = self.build_info();
+        let mut res = HashMap::new();
+        for d in &self.devs {
+            let info = Self::dev_build_info(&mut lock, d);
+            if info.status != CL_BUILD_SUCCESS as cl_build_status {
+                continue;
+            }
+            let nir = info
+                .spirv
+                .as_ref()
+                .unwrap()
+                .to_nir(
+                    kernel,
+                    d.screen
+                        .nir_shader_compiler_options(pipe_shader_type::PIPE_SHADER_COMPUTE),
+                    &d.lib_clc,
+                )
+                .unwrap();
+            res.insert(d.clone(), nir);
+        }
+        res
+    }
 }
@@ -1,6 +1,9 @@
 extern crate mesa_rust_gen;
 extern crate mesa_rust_util;
 
+use crate::compiler::nir::*;
+use crate::pipe::screen::*;
+
 use self::mesa_rust_gen::*;
 use self::mesa_rust_util::string::*;
 
@@ -167,6 +170,70 @@ impl SPIRVBin {
             .collect(),
         }
     }
 
+    fn get_spirv_options(library: bool, clc_shader: *const nir_shader) -> spirv_to_nir_options {
+        spirv_to_nir_options {
+            create_library: library,
+            environment: nir_spirv_execution_environment::NIR_SPIRV_OPENCL,
+            clc_shader: clc_shader,
+            float_controls_execution_mode: float_controls::FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32
+                as u16,
+
+            caps: spirv_supported_capabilities {
+                address: true,
+                float64: true,
+                int8: true,
+                int16: true,
+                int64: true,
+                kernel: true,
+                kernel_image: true,
+                linkage: true,
+                ..Default::default()
+            },
+
+            constant_addr_format: nir_address_format::nir_address_format_64bit_global,
+            global_addr_format: nir_address_format::nir_address_format_64bit_global, // TODO 32 bit devices
+            shared_addr_format: nir_address_format::nir_address_format_32bit_offset_as_64bit,
+            temp_addr_format: nir_address_format::nir_address_format_32bit_offset_as_64bit,
+
+            // default
+            debug: spirv_to_nir_options__bindgen_ty_1::default(),
+            ..Default::default()
+        }
+    }
+
+    pub fn to_nir(
+        &self,
+        entry_point: &str,
+        nir_options: *const nir_shader_compiler_options,
+        libclc: &NirShader,
+    ) -> Option<NirShader> {
+        let c_entry = CString::new(entry_point.as_bytes()).unwrap();
+        let spirv_options = Self::get_spirv_options(false, libclc.get_nir());
+        let nir = unsafe {
+            spirv_to_nir(
+                self.spirv.data.cast(),
+                self.spirv.size / 4,
+                ptr::null_mut(), // spec
+                0,               // spec count
+                gl_shader_stage::MESA_SHADER_KERNEL,
+                c_entry.as_ptr(),
+                &spirv_options,
+                nir_options,
+            )
+        };
+
+        NirShader::new(nir)
+    }
+
+    pub fn get_lib_clc(screen: &PipeScreen) -> Option<NirShader> {
+        let nir_options = screen.nir_shader_compiler_options(pipe_shader_type::PIPE_SHADER_COMPUTE);
+        let spirv_options = Self::get_spirv_options(true, ptr::null());
+        let shader_cache = screen.shader_cache();
+        NirShader::new(unsafe {
+            nir_load_libclc_shader(64, shader_cache, &spirv_options, nir_options)
+        })
+    }
 }
 
 impl Drop for SPIRVBin {
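`to_nir` divides the blob's byte size by 4 because `spirv_to_nir` takes a count of 32-bit SPIR-V words, not bytes. The conversion, spelled out on a hypothetical module blob:

fn main() {
    let spirv_bytes = vec![0u8; 64]; // hypothetical module blob
    assert_eq!(spirv_bytes.len() % 4, 0); // valid modules are word aligned
    let word_count = spirv_bytes.len() / 4;
    assert_eq!(word_count, 16);
}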
@@ -2,12 +2,215 @@ extern crate mesa_rust_gen;
 
 use self::mesa_rust_gen::*;
 
+use std::convert::TryInto;
+use std::ffi::c_void;
+use std::ffi::CString;
+use std::marker::PhantomData;
+use std::ptr;
 use std::ptr::NonNull;
+use std::slice;
 
+// from https://internals.rust-lang.org/t/discussion-on-offset-of/7440/2
+macro_rules! offset_of {
+    ($Struct:path, $field:ident) => {{
+        // Using a separate function to minimize unhygienic hazards
+        // (e.g. unsafety of #[repr(packed)] field borrows).
+        // Uncomment `const` when `const fn`s can juggle pointers.
+        /*const*/
+        fn offset() -> usize {
+            let u = std::mem::MaybeUninit::<$Struct>::uninit();
+            // Use pattern-matching to avoid accidentally going through Deref.
+            let &$Struct { $field: ref f, .. } = unsafe { &*u.as_ptr() };
+            let o = (f as *const _ as usize).wrapping_sub(&u as *const _ as usize);
+            // Triple check that we are within `u` still.
+            assert!((0..=std::mem::size_of_val(&u)).contains(&o));
+            o
+        }
+        offset()
+    }};
+}
+
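A usage example for the `offset_of!` macro above (compile it together with the macro definition; the struct layout here is made up):

struct Node {
    next: usize,
    prev: usize,
}

struct Variable {
    ty: u32,
    node: Node,
    name: *const u8,
}

fn main() {
    // ExecListIter needs exactly this: the byte offset of the embedded list
    // node inside the containing struct, to step from node pointers back to
    // the element.
    let off = offset_of!(Variable, node);
    println!("node lives {} bytes into Variable", off);
}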
+pub struct ExecListIter<'a, T> {
+    n: &'a mut exec_node,
+    offset: usize,
+    _marker: PhantomData<T>,
+}
+
+impl<'a, T> ExecListIter<'a, T> {
+    fn new(l: &'a mut exec_list, offset: usize) -> Self {
+        Self {
+            n: &mut l.head_sentinel,
+            offset: offset,
+            _marker: PhantomData,
+        }
+    }
+}
+
+impl<'a, T: 'a> Iterator for ExecListIter<'a, T> {
+    type Item = &'a mut T;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.n = unsafe { &mut *self.n.next };
+        if self.n.next.is_null() {
+            None
+        } else {
+            let t: *mut c_void = (self.n as *mut exec_node).cast();
+            Some(unsafe { &mut *(t.sub(self.offset).cast()) })
+        }
+    }
+}
+
 pub struct NirShader {
     nir: NonNull<nir_shader>,
 }
 
 impl NirShader {
     pub fn new(nir: *mut nir_shader) -> Option<Self> {
         NonNull::new(nir).map(|nir| Self { nir: nir })
     }
 
+    pub fn print(&self) {
+        unsafe { nir_print_shader(self.nir.as_ptr(), stderr) };
+    }
+
+    pub fn get_nir(&self) -> *mut nir_shader {
+        self.nir.as_ptr()
+    }
+
+    pub fn dup_for_driver(&self) -> *mut nir_shader {
+        unsafe { nir_shader_clone(ptr::null_mut(), self.nir.as_ptr()) }
+    }
+
+    pub fn pass0<R>(&mut self, pass: unsafe extern "C" fn(*mut nir_shader) -> R) -> R {
+        unsafe { pass(self.nir.as_ptr()) }
+    }
+
+    pub fn pass1<R, A>(
+        &mut self,
+        pass: unsafe extern "C" fn(*mut nir_shader, a: A) -> R,
+        a: A,
+    ) -> R {
+        unsafe { pass(self.nir.as_ptr(), a) }
+    }
+
+    pub fn pass2<R, A, B>(
+        &mut self,
+        pass: unsafe extern "C" fn(*mut nir_shader, a: A, b: B) -> R,
+        a: A,
+        b: B,
+    ) -> R {
+        unsafe { pass(self.nir.as_ptr(), a, b) }
+    }
+
+    pub fn pass3<R, A, B, C>(
+        &mut self,
+        pass: unsafe extern "C" fn(*mut nir_shader, a: A, b: B, c: C) -> R,
+        a: A,
+        b: B,
+        c: C,
+    ) -> R {
+        unsafe { pass(self.nir.as_ptr(), a, b, c) }
+    }
+
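The `pass0` through `pass3` wrappers adapt C pass entry points of different arities to one uniform call on the shader. The same shape in miniature, using a plain function pointer instead of an extern "C" NIR pass:

struct Shader(u32);

impl Shader {
    fn pass1<R, A>(&mut self, pass: fn(&mut u32, A) -> R, a: A) -> R {
        pass(&mut self.0, a)
    }
}

fn add(v: &mut u32, n: u32) -> bool {
    *v += n;
    n != 0 // "progress" flag, like NIR passes return
}

fn main() {
    let mut s = Shader(1);
    assert!(s.pass1(add, 41));
    assert_eq!(s.0, 42);
}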
+    pub fn structurize(&mut self) {
+        self.pass0(nir_lower_goto_ifs);
+        self.pass0(nir_opt_dead_cf);
+    }
+
+    pub fn inline(&mut self, libclc: &NirShader) {
+        self.pass1(
+            nir_lower_variable_initializers,
+            nir_variable_mode::nir_var_function_temp,
+        );
+        self.pass0(nir_lower_returns);
+        self.pass1(nir_lower_libclc, libclc.nir.as_ptr());
+        self.pass0(nir_inline_functions);
+    }
+
+    pub fn remove_non_entrypoints(&mut self) {
+        unsafe { nir_remove_non_entrypoints(self.nir.as_ptr()) };
+    }
+
+    pub fn variables(&mut self) -> ExecListIter<nir_variable> {
+        ExecListIter::new(
+            &mut unsafe { self.nir.as_mut() }.variables,
+            offset_of!(nir_variable, node),
+        )
+    }
+
+    pub fn reset_scratch_size(&self) {
+        unsafe {
+            (*self.nir.as_ptr()).scratch_size = 0;
+        }
+    }
+
+    pub fn shared_size(&self) -> u32 {
+        unsafe { (*self.nir.as_ptr()).info.shared_size }
+    }
+
+    pub fn set_workgroup_size(&self, workgroup: &[u16; 3]) {
+        let mut nir = self.nir.as_ptr();
+        unsafe {
+            (*nir).info.set_workgroup_size_variable(workgroup[0] == 0);
+            (*nir).info.workgroup_size[0] = workgroup[0];
+            (*nir).info.workgroup_size[1] = workgroup[1];
+            (*nir).info.workgroup_size[2] = workgroup[2];
+        }
+    }
+
+    pub fn variables_with_mode(
+        &mut self,
+        mode: nir_variable_mode,
+    ) -> impl Iterator<Item = &mut nir_variable> {
+        self.variables()
+            .filter(move |v| v.data.mode() & mode.0 != 0)
+    }
+
+    pub fn extract_constant_initializers(&self) {
+        let nir = self.nir.as_ptr();
+        unsafe {
+            if (*nir).constant_data_size > 0 {
+                assert!((*nir).constant_data.is_null());
+                (*nir).constant_data = rzalloc_size(nir.cast(), (*nir).constant_data_size as usize);
+                nir_gather_explicit_io_initializers(
+                    nir,
+                    (*nir).constant_data,
+                    (*nir).constant_data_size as usize,
+                    nir_variable_mode::nir_var_mem_constant,
+                );
+            }
+        }
+    }
+
+    pub fn has_constant(&self) -> bool {
+        unsafe {
+            !self.nir.as_ref().constant_data.is_null() && self.nir.as_ref().constant_data_size > 0
+        }
+    }
+
+    pub fn get_constant_buffer(&self) -> &[u8] {
+        unsafe {
+            let nir = self.nir.as_ref();
+            slice::from_raw_parts(nir.constant_data.cast(), nir.constant_data_size as usize)
+        }
+    }
+
+    pub fn add_var(
+        &self,
+        mode: nir_variable_mode,
+        glsl_type: *const glsl_type,
+        loc: usize,
+        name: &str,
+    ) -> *mut nir_variable {
+        let name = CString::new(name).unwrap();
+        unsafe {
+            let var = nir_variable_create(self.nir.as_ptr(), mode, glsl_type, name.as_ptr());
+            (*var).data.location = loc.try_into().unwrap();
+            var
+        }
+    }
 }
 
 impl Drop for NirShader {
     fn drop(&mut self) {
         unsafe { ralloc_free(self.nir.as_ptr().cast()) };
@@ -1,5 +1,6 @@
 extern crate mesa_rust_gen;
 
+use crate::compiler::nir::*;
 use crate::pipe::resource::*;
 use crate::pipe::transfer::*;
 
@@ -97,6 +98,84 @@ impl PipeContext {
 
         unsafe { self.pipe.as_ref().blit.unwrap()(self.pipe.as_ptr(), &blit_info) }
     }
 
+    pub fn create_compute_state(
+        &self,
+        nir: &NirShader,
+        input_mem: u32,
+        local_mem: u32,
+    ) -> *mut c_void {
+        let state = pipe_compute_state {
+            ir_type: pipe_shader_ir::PIPE_SHADER_IR_NIR,
+            prog: nir.dup_for_driver().cast(),
+            req_input_mem: input_mem,
+            req_local_mem: local_mem,
+            req_private_mem: 0,
+        };
+        unsafe { self.pipe.as_ref().create_compute_state.unwrap()(self.pipe.as_ptr(), &state) }
+    }
+
+    pub fn bind_compute_state(&self, state: *mut c_void) {
+        unsafe { self.pipe.as_ref().bind_compute_state.unwrap()(self.pipe.as_ptr(), state) }
+    }
+
+    pub fn delete_compute_state(&self, state: *mut c_void) {
+        unsafe { self.pipe.as_ref().delete_compute_state.unwrap()(self.pipe.as_ptr(), state) }
+    }
+
+    pub fn launch_grid(
+        &self,
+        work_dim: u32,
+        block: [u32; 3],
+        grid: [u32; 3],
+        grid_base: [u32; 3],
+        input: &[u8],
+    ) {
+        let info = pipe_grid_info {
+            pc: 0,
+            input: input.as_ptr().cast(),
+            work_dim: work_dim,
+            block: block,
+            last_block: [0; 3],
+            grid: grid,
+            grid_base: grid_base,
+            indirect: ptr::null_mut(),
+            indirect_offset: 0,
+        };
+        unsafe { self.pipe.as_ref().launch_grid.unwrap()(self.pipe.as_ptr(), &info) }
+    }
+
+    pub fn set_global_binding(&self, res: &[Option<Arc<PipeResource>>], out: &mut [*mut u32]) {
+        let mut res: Vec<_> = res
+            .iter()
+            .map(|o| o.as_ref().map_or(ptr::null_mut(), |r| r.pipe()))
+            .collect();
+        unsafe {
+            self.pipe.as_ref().set_global_binding.unwrap()(
+                self.pipe.as_ptr(),
+                0,
+                res.len() as u32,
+                res.as_mut_ptr(),
+                out.as_mut_ptr(),
+            )
+        }
+    }
+
+    pub fn clear_global_binding(&self, count: u32) {
+        unsafe {
+            self.pipe.as_ref().set_global_binding.unwrap()(
+                self.pipe.as_ptr(),
+                0,
+                count,
+                ptr::null_mut(),
+                ptr::null_mut(),
+            )
+        }
+    }
+
+    pub fn memory_barrier(&self, barriers: u32) {
+        unsafe { self.pipe.as_ref().memory_barrier.unwrap()(self.pipe.as_ptr(), barriers) }
+    }
 }
 
 impl Drop for PipeContext {
@@ -109,8 +188,14 @@ impl Drop for PipeContext {
 
 fn has_required_cbs(c: &pipe_context) -> bool {
     c.destroy.is_some()
+        && c.bind_compute_state.is_some()
         && c.blit.is_some()
         && c.buffer_map.is_some()
         && c.buffer_subdata.is_some()
         && c.buffer_unmap.is_some()
+        && c.create_compute_state.is_some()
+        && c.delete_compute_state.is_some()
+        && c.launch_grid.is_some()
+        && c.memory_barrier.is_some()
+        && c.set_global_binding.is_some()
 }
@@ -1,6 +1,7 @@
 extern crate mesa_rust_gen;
 extern crate mesa_rust_util;
 
+use crate::compiler::nir::NirShader;
 use crate::pipe::context::*;
 use crate::pipe::device::*;
 use crate::pipe::resource::*;
@@ -151,6 +152,39 @@ impl PipeScreen {
         let s = &mut unsafe { *self.screen };
         unsafe { s.is_format_supported.unwrap()(self.screen, format, target, 0, 0, bindings) }
     }
 
+    pub fn nir_shader_compiler_options(
+        &self,
+        shader: pipe_shader_type,
+    ) -> *const nir_shader_compiler_options {
+        unsafe {
+            (*self.screen).get_compiler_options.unwrap()(
+                self.screen,
+                pipe_shader_ir::PIPE_SHADER_IR_NIR,
+                shader,
+            )
+            .cast()
+        }
+    }
+
+    pub fn shader_cache(&self) -> *mut disk_cache {
+        let s = &mut unsafe { *self.screen };
+
+        if let Some(func) = s.get_disk_shader_cache {
+            unsafe { func(self.screen) }
+        } else {
+            ptr::null_mut()
+        }
+    }
+
+    pub fn finalize_nir(&self, nir: &NirShader) {
+        let s = &mut unsafe { *self.screen };
+        if let Some(func) = s.finalize_nir {
+            unsafe {
+                func(s, nir.get_nir().cast());
+            }
+        }
+    }
 }
 
 impl Drop for PipeScreen {
@@ -165,6 +199,7 @@ fn has_required_cbs(screen: *mut pipe_screen) -> bool {
     let s = unsafe { *screen };
     s.context_create.is_some()
         && s.destroy.is_some()
+        && s.get_compiler_options.is_some()
         && s.get_compute_param.is_some()
         && s.get_name.is_some()
         && s.get_param.is_some()
@@ -149,13 +149,16 @@ rusticl_mesa_bindings_inline_wrapper = static_library(
   'mesa_bindings_inline_wrapper',
   [
     'rusticl_mesa_inline_bindings_wrapper.c',
-    'rusticl_mesa_inline_bindings_wrapper.h'
+    'rusticl_mesa_inline_bindings_wrapper.h',
+    'rusticl_nir.c',
+    'rusticl_nir.h',
   ],
   gnu_symbol_visibility : 'hidden',
   include_directories : [
     inc_gallium,
     inc_gallium_aux,
     inc_include,
+    inc_nir,
     inc_src,
   ],
   c_args : pre_args,
@@ -179,16 +182,23 @@ rusticl_mesa_bindings_rs = rust.bindgen(
   args : [
     rusticl_bindgen_args,
     '--whitelist-function', 'clc_.*',
+    '--whitelist-function', 'glsl_.*',
     '--whitelist-function', 'nir_.*',
     '--whitelist-function', 'pipe_.*',
-    '--whitelist-function', 'ralloc_.*',
+    '--whitelist-function', 'rusticl_.*',
+    '--whitelist-function', 'rz?alloc_.*',
+    '--whitelist-function', 'spirv_.*',
     '--whitelist-type', 'pipe_endian',
     '--whitelist-type', 'clc_kernel_arg_access_qualifier',
     '--bitfield-enum', 'clc_kernel_arg_access_qualifier',
     '--whitelist-type', 'clc_kernel_arg_type_qualifier',
     '--bitfield-enum', 'clc_kernel_arg_type_qualifier',
+    '--bitfield-enum', 'nir_opt_if_options',
+    '--bitfield-enum', 'nir_variable_mode',
+    '--whitelist-type', 'float_controls',
     '--whitelist-var', 'PIPE_.*',
     '--bitfield-enum', 'pipe_map_flags',
+    '--allowlist-var', 'stderr',
   ],
 )
 
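The `--bitfield-enum` flags make bindgen emit newtype wrappers with bitwise operators, which is what allows expressions like `nir_var_uniform | nir_var_image` in the Rust code above. A hand-written equivalent of the generated shape (the names here are illustrative, not what bindgen emits):

#[derive(Clone, Copy, PartialEq, Eq)]
struct VariableMode(u32);

impl std::ops::BitOr for VariableMode {
    type Output = Self;
    fn bitor(self, rhs: Self) -> Self {
        Self(self.0 | rhs.0)
    }
}

const VAR_UNIFORM: VariableMode = VariableMode(1 << 0);
const VAR_IMAGE: VariableMode = VariableMode(1 << 1);

fn main() {
    let mode = VAR_UNIFORM | VAR_IMAGE;
    assert!(mode.0 & VAR_IMAGE.0 != 0);
}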
@@ -1,11 +1,13 @@
 #include "rusticl_mesa_inline_bindings_wrapper.h"
 
 #include "compiler/clc/clc.h"
 
+#include "nir.h"
 #include "nir_types.h"
 #include "spirv/nir_spirv.h"
 
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_screen.h"
 #include "pipe/p_state.h"
 #include "pipe-loader/pipe_loader.h"
 
+#include "rusticl_nir.h"
src/gallium/frontends/rusticl/rusticl_nir.c (new file, 38 lines)
@@ -0,0 +1,38 @@
#include "nir.h"
|
||||
#include "nir_builder.h"
|
||||
|
||||
#include "rusticl_nir.h"
|
||||
|
||||
static bool
|
||||
rusticl_lower_intrinsics_filter(const nir_instr* instr, const void* state)
|
||||
{
|
||||
return instr->type == nir_instr_type_intrinsic;
|
||||
}
|
||||
|
||||
static nir_ssa_def*
|
||||
rusticl_lower_intrinsics_instr(
|
||||
nir_builder *b,
|
||||
nir_instr *instr,
|
||||
void* _state
|
||||
) {
|
||||
nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
|
||||
struct rusticl_lower_state *state = _state;
|
||||
|
||||
switch (intrinsic->intrinsic) {
|
||||
case nir_intrinsic_load_constant_base_ptr:
|
||||
return nir_load_var(b, state->const_buf);
|
||||
default:
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
rusticl_lower_intrinsics(nir_shader *nir, struct rusticl_lower_state* state)
|
||||
{
|
||||
return nir_shader_lower_instructions(
|
||||
nir,
|
||||
rusticl_lower_intrinsics_filter,
|
||||
rusticl_lower_intrinsics_instr,
|
||||
state
|
||||
);
|
||||
}
|
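`nir_shader_lower_instructions` drives the pass above with a filter predicate and a rewrite callback. The same callback-driven rewrite shape, sketched in Rust over a toy instruction stream (types and values are made up):

#[derive(Debug, PartialEq)]
enum Instr { LoadConstantBasePtr, LoadVar }

fn lower(
    instrs: &mut Vec<Instr>,
    filter: fn(&Instr) -> bool,
    rewrite: fn(&Instr) -> Option<Instr>,
) -> bool {
    let mut progress = false;
    for i in instrs.iter_mut() {
        if filter(i) {
            if let Some(new) = rewrite(i) {
                *i = new; // replace the matched instruction
                progress = true;
            }
        }
    }
    progress
}

fn main() {
    let mut instrs = vec![Instr::LoadConstantBasePtr, Instr::LoadVar];
    let progress = lower(
        &mut instrs,
        |i| matches!(i, Instr::LoadConstantBasePtr), // filter
        |_| Some(Instr::LoadVar),                    // rewrite to a variable load
    );
    assert!(progress);
    assert_eq!(instrs, vec![Instr::LoadVar, Instr::LoadVar]);
}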
src/gallium/frontends/rusticl/rusticl_nir.h (new file, 5 lines)
@@ -0,0 +1,5 @@
+struct rusticl_lower_state {
+   nir_variable *const_buf;
+};
+
+bool rusticl_lower_intrinsics(nir_shader *nir, struct rusticl_lower_state *state);