rusticl: support subgroups

Signed-off-by: Karol Herbst <git@karolherbst.de> Reviewed-by: Nora Allen <blackcatgames@protonmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22893>
2024-11-23 18:24:13 +08:00 · 2023-05-07 17:10:14 +02:00 · 2023-05-07 17:10:14 +02:00 · d681cdf4a6
commit d681cdf4a6
parent 0fd1b21db2
10 changed files with 226 additions and 10 deletions
--- a/docs/features.txt
+++ b/docs/features.txt
@ -876,7 +876,7 @@ Rusticl OpenCL 2.0 -- all DONE:

 Rusticl OpenCL 2.1 -- all DONE:

-  Sub groups                                            in progress
+  Sub groups                                            DONE (iris, llvmpipe, radeonsi)
  - cl_khr_subgroups                                    in progress
  cl_khr_il_program                                     DONE
  Device and host timer synchronization                 DONE (iris, llvmpipe, radeonsi)
--- a/src/gallium/frontends/rusticl/api/device.rs
+++ b/src/gallium/frontends/rusticl/api/device.rs
@ -171,7 +171,11 @@ impl CLInfo<cl_device_info> for cl_device_id {
            CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE => cl_prop::<cl_ulong>(dev.const_max_size()),
            CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE => cl_prop::<usize>(0),
            CL_DEVICE_MAX_MEM_ALLOC_SIZE => cl_prop::<cl_ulong>(dev.max_mem_alloc()),
-            CL_DEVICE_MAX_NUM_SUB_GROUPS => cl_prop::<cl_uint>(0),
+            CL_DEVICE_MAX_NUM_SUB_GROUPS => cl_prop::<cl_uint>(if dev.subgroups_supported() {
+                dev.max_subgroups()
+            } else {
+                0
+            }),
            CL_DEVICE_MAX_ON_DEVICE_EVENTS => cl_prop::<cl_uint>(0),
            CL_DEVICE_MAX_ON_DEVICE_QUEUES => cl_prop::<cl_uint>(0),
            CL_DEVICE_MAX_PARAMETER_SIZE => cl_prop::<usize>(dev.param_max_size()),
@ -274,6 +278,13 @@ impl CLInfo<cl_device_info> for cl_device_id {
                (CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN) as cl_device_fp_config,
            ),
            CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS => cl_prop::<bool>(false),
+            CL_DEVICE_SUB_GROUP_SIZES_INTEL => {
+                cl_prop::<Vec<usize>>(if dev.subgroups_supported() {
+                    dev.subgroup_sizes()
+                } else {
+                    vec![0; 1]
+                })
+            }
            CL_DEVICE_SVM_CAPABILITIES | CL_DEVICE_SVM_CAPABILITIES_ARM => {
                cl_prop::<cl_device_svm_capabilities>(
                    if dev.svm_supported() {
--- a/src/gallium/frontends/rusticl/api/icd.rs
+++ b/src/gallium/frontends/rusticl/api/icd.rs
@ -465,7 +465,7 @@ extern "C" fn cl_get_kernel_sub_group_info(
    param_value_size_ret: *mut usize,
 ) -> cl_int {
    match kernel.get_info_obj(
-        (device, input_value_size, input_value),
+        (device, input_value_size, input_value, param_value_size),
        param_name,
        param_value_size,
        param_value,
--- a/src/gallium/frontends/rusticl/api/kernel.rs
+++ b/src/gallium/frontends/rusticl/api/kernel.rs
@ -10,6 +10,7 @@ use rusticl_opencl_gen::*;
 use rusticl_proc_macros::cl_entrypoint;
 use rusticl_proc_macros::cl_info_entrypoint;

+use std::cmp;
 use std::mem::{self, MaybeUninit};
 use std::os::raw::c_void;
 use std::ptr;
@ -106,16 +107,115 @@ impl CLInfoObj<cl_kernel_work_group_info, cl_device_id> for cl_kernel {
    }
 }

-impl CLInfoObj<cl_kernel_sub_group_info, (cl_device_id, usize, *const c_void)> for cl_kernel {
+impl CLInfoObj<cl_kernel_sub_group_info, (cl_device_id, usize, *const c_void, usize)>
+    for cl_kernel
+{
    fn query(
        &self,
-        (d, _input_value_size, _input_value): (cl_device_id, usize, *const c_void),
-        _q: cl_program_build_info,
+        (dev, input_value_size, input_value, output_value_size): (
+            cl_device_id,
+            usize,
+            *const c_void,
+            usize,
+        ),
+        q: cl_program_build_info,
    ) -> CLResult<Vec<MaybeUninit<u8>>> {
-        let _kernel = self.get_ref()?;
-        let _dev = d.get_arc()?;
+        let kernel = self.get_ref()?;

-        Err(CL_INVALID_OPERATION)
+        // CL_INVALID_DEVICE [..] if device is NULL but there is more than one device associated
+        // with kernel.
+        let dev = if dev.is_null() {
+            if kernel.prog.devs.len() > 1 {
+                return Err(CL_INVALID_DEVICE);
+            } else {
+                kernel.prog.devs[0].clone()
+            }
+        } else {
+            dev.get_arc()?
+        };
+
+        // CL_INVALID_DEVICE if device is not in the list of devices associated with kernel
+        if !kernel.prog.devs.contains(&dev) {
+            return Err(CL_INVALID_DEVICE);
+        }
+
+        // CL_INVALID_OPERATION if device does not support subgroups.
+        if !dev.subgroups_supported() {
+            return Err(CL_INVALID_OPERATION);
+        }
+
+        let usize_byte = mem::size_of::<usize>();
+        // first we have to convert the input to a proper thing
+        let input: &[usize] = match q {
+            CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE | CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE => {
+                // CL_INVALID_VALUE if param_name is CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
+                // CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE or ... and the size in bytes specified by
+                // input_value_size is not valid or if input_value is NULL.
+                if ![usize_byte, 2 * usize_byte, 3 * usize_byte].contains(&input_value_size) {
+                    return Err(CL_INVALID_VALUE);
+                }
+                // SAFETY: we verified the size as best as possible, with the rest we trust the client
+                unsafe { slice::from_raw_parts(input_value.cast(), input_value_size / usize_byte) }
+            }
+            CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT => {
+                // CL_INVALID_VALUE if param_name is ... CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT
+                // and the size in bytes specified by input_value_size is not valid or if
+                // input_value is NULL.
+                if input_value_size != usize_byte || input_value.is_null() {
+                    return Err(CL_INVALID_VALUE);
+                }
+                // SAFETY: we trust the client here
+                unsafe { slice::from_raw_parts(input_value.cast(), 1) }
+            }
+            _ => &[],
+        };
+
+        Ok(match q {
+            CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE => {
+                cl_prop::<usize>(kernel.subgroups_for_block(&dev, input))
+            }
+            CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE => {
+                cl_prop::<usize>(kernel.subgroup_size_for_block(&dev, input))
+            }
+            CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT => {
+                let subgroups = input[0];
+                let mut res = vec![0; 3];
+
+                for subgroup_size in kernel.subgroup_sizes(&dev) {
+                    let threads = subgroups * subgroup_size;
+
+                    if threads > dev.max_threads_per_block() {
+                        continue;
+                    }
+
+                    let block = [threads, 1, 1];
+                    let real_subgroups = kernel.subgroups_for_block(&dev, &block);
+
+                    if real_subgroups == subgroups {
+                        res = block.to_vec();
+                        break;
+                    }
+                }
+
+                res.truncate(output_value_size / usize_byte);
+                cl_prop::<Vec<usize>>(res)
+            }
+            CL_KERNEL_MAX_NUM_SUB_GROUPS => {
+                let threads = kernel.max_threads_per_block(&dev);
+                let max_groups = dev.max_subgroups();
+
+                let mut result = 0;
+                for sgs in kernel.subgroup_sizes(&dev) {
+                    result = cmp::max(result, threads / sgs);
+                    result = cmp::min(result, max_groups as usize);
+                }
+                cl_prop::<usize>(result)
+            }
+            CL_KERNEL_COMPILE_NUM_SUB_GROUPS => cl_prop::<usize>(kernel.num_subgroups),
+            CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL => cl_prop::<usize>(kernel.subgroup_size),
+            // CL_INVALID_VALUE if param_name is not one of the supported values
+            _ => return Err(CL_INVALID_VALUE),
+        })
    }
 }

--- a/src/gallium/frontends/rusticl/core/device.rs
+++ b/src/gallium/frontends/rusticl/core/device.rs
@ -82,6 +82,7 @@ pub trait HelperContextWrapper {
    fn create_compute_state(&self, nir: &NirShader, static_local_mem: u32) -> *mut c_void;
    fn delete_compute_state(&self, cso: *mut c_void);
    fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info;
+    fn compute_state_subgroup_size(&self, state: *mut c_void, block: &[u32; 3]) -> u32;

    fn unmap(&self, tx: PipeTransfer);
 }
@ -170,6 +171,10 @@ impl<'a> HelperContextWrapper for HelperContext<'a> {
        self.lock.compute_state_info(state)
    }

+    fn compute_state_subgroup_size(&self, state: *mut c_void, block: &[u32; 3]) -> u32 {
+        self.lock.compute_state_subgroup_size(state, block)
+    }
+
    fn unmap(&self, tx: PipeTransfer) {
        tx.with_ctx(&self.lock);
    }
@ -572,6 +577,12 @@ impl Device {
            add_ext(1, 0, 0, "cl_khr_device_uuid");
        }

+        if self.subgroups_supported() {
+            // requires CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS
+            //add_ext(1, 0, 0, "cl_khr_subgroups");
+            add_feat(1, 0, 0, "__opencl_c_subgroups");
+        }
+
        if self.svm_supported() {
            add_ext(1, 0, 0, "cl_arm_shared_virtual_memory");
        }
@ -857,6 +868,22 @@ impl Device {
            .collect()
    }

+    pub fn max_subgroups(&self) -> u32 {
+        ComputeParam::<u32>::compute_param(
+            self.screen.as_ref(),
+            pipe_compute_cap::PIPE_COMPUTE_CAP_MAX_SUBGROUPS,
+        )
+    }
+
+    pub fn subgroups_supported(&self) -> bool {
+        let subgroup_sizes = self.subgroup_sizes().len();
+
+        // we need to be able to query a CSO for subgroup sizes if multiple sub group sizes are
+        // supported, doing it without shareable shaders isn't practical
+        self.max_subgroups() > 0
+            && (subgroup_sizes == 1 || (subgroup_sizes > 1 && self.shareable_shaders()))
+    }
+
    pub fn svm_supported(&self) -> bool {
        self.screen.param(pipe_cap::PIPE_CAP_SYSTEM_SVM) == 1
    }
@ -905,7 +932,7 @@ impl Device {
            images_write_3d: self.image_3d_write_supported(),
            integer_dot_product: true,
            intel_subgroups: false,
-            subgroups: false,
+            subgroups: self.subgroups_supported(),
            subgroups_ifp: false,
        }
    }
--- a/src/gallium/frontends/rusticl/core/kernel.rs
+++ b/src/gallium/frontends/rusticl/core/kernel.rs
@ -335,6 +335,8 @@ pub struct Kernel {
    pub values: Vec<RefCell<Option<KernelArgValue>>>,
    pub work_group_size: [usize; 3],
    pub build: Arc<NirKernelBuild>,
+    pub subgroup_size: usize,
+    pub num_subgroups: usize,
    dev_state: Arc<KernelDevState>,
 }

@ -813,6 +815,8 @@ impl Kernel {
            prog: prog,
            name: name,
            work_group_size: work_group_size,
+            subgroup_size: nir.subgroup_size() as usize,
+            num_subgroups: nir.num_subgroups() as usize,
            values: values,
            dev_state: KernelDevState::new(nirs),
            build: nir_kernel_build,
@ -1208,6 +1212,42 @@ impl Kernel {
    pub fn has_svm_devs(&self) -> bool {
        self.prog.devs.iter().any(|dev| dev.svm_supported())
    }
+
+    pub fn subgroup_sizes(&self, dev: &Device) -> Vec<usize> {
+        SetBitIndices::from_msb(self.dev_state.get(dev).info.simd_sizes)
+            .map(|bit| 1 << bit)
+            .collect()
+    }
+
+    pub fn subgroups_for_block(&self, dev: &Device, block: &[usize]) -> usize {
+        let subgroup_size = self.subgroup_size_for_block(dev, block);
+        if subgroup_size == 0 {
+            return 0;
+        }
+
+        let threads = block.iter().product();
+        div_round_up(threads, subgroup_size)
+    }
+
+    pub fn subgroup_size_for_block(&self, dev: &Device, block: &[usize]) -> usize {
+        let subgroup_sizes = self.subgroup_sizes(dev);
+        if subgroup_sizes.is_empty() {
+            return 0;
+        }
+
+        if subgroup_sizes.len() == 1 {
+            return subgroup_sizes[0];
+        }
+
+        let block = [
+            *block.get(0).unwrap_or(&1) as u32,
+            *block.get(1).unwrap_or(&1) as u32,
+            *block.get(2).unwrap_or(&1) as u32,
+        ];
+
+        dev.helper_ctx()
+            .compute_state_subgroup_size(self.dev_state.get(dev).cso, &block) as usize
+    }
 }

 impl Clone for Kernel {
@ -1219,6 +1259,8 @@ impl Clone for Kernel {
            values: self.values.clone(),
            work_group_size: self.work_group_size,
            build: self.build.clone(),
+            subgroup_size: self.subgroup_size,
+            num_subgroups: self.num_subgroups,
            dev_state: self.dev_state.clone(),
        }
    }
--- a/src/gallium/frontends/rusticl/mesa/compiler/clc/spirv.rs
+++ b/src/gallium/frontends/rusticl/mesa/compiler/clc/spirv.rs
@ -332,6 +332,7 @@ impl SPIRVBin {
                float16: true,
                float64: true,
                generic_pointers: true,
+                groups: true,
                int8: true,
                int16: true,
                int64: true,
--- a/src/gallium/frontends/rusticl/mesa/compiler/nir.rs
+++ b/src/gallium/frontends/rusticl/mesa/compiler/nir.rs
@ -179,6 +179,27 @@ impl NirShader {
        unsafe { (*self.nir.as_ptr()).info.workgroup_size }
    }

+    pub fn subgroup_size(&self) -> u8 {
+        let subgroup_size = unsafe { (*self.nir.as_ptr()).info.subgroup_size };
+        let valid_subgroup_sizes = [
+            gl_subgroup_size::SUBGROUP_SIZE_REQUIRE_8,
+            gl_subgroup_size::SUBGROUP_SIZE_REQUIRE_16,
+            gl_subgroup_size::SUBGROUP_SIZE_REQUIRE_32,
+            gl_subgroup_size::SUBGROUP_SIZE_REQUIRE_64,
+            gl_subgroup_size::SUBGROUP_SIZE_REQUIRE_128,
+        ];
+
+        if valid_subgroup_sizes.contains(&subgroup_size) {
+            subgroup_size as u8
+        } else {
+            0
+        }
+    }
+
+    pub fn num_subgroups(&self) -> u8 {
+        unsafe { (*self.nir.as_ptr()).info.num_subgroups }
+    }
+
    pub fn set_workgroup_size_variable_if_zero(&self) {
        let nir = self.nir.as_ptr();
        unsafe {
--- a/src/gallium/frontends/rusticl/mesa/pipe/context.rs
+++ b/src/gallium/frontends/rusticl/mesa/pipe/context.rs
@ -327,6 +327,16 @@ impl PipeContext {
        info
    }

+    pub fn compute_state_subgroup_size(&self, state: *mut c_void, block: &[u32; 3]) -> u32 {
+        unsafe {
+            if let Some(cb) = self.pipe.as_ref().get_compute_state_subgroup_size {
+                cb(self.pipe.as_ptr(), state, block)
+            } else {
+                0
+            }
+        }
+    }
+
    pub fn create_sampler_state(&self, state: &pipe_sampler_state) -> *mut c_void {
        unsafe { self.pipe.as_ref().create_sampler_state.unwrap()(self.pipe.as_ptr(), state) }
    }
--- a/src/gallium/frontends/rusticl/rusticl_llvm_bindings.hpp
+++ b/src/gallium/frontends/rusticl/rusticl_llvm_bindings.hpp
@ -0,0 +1,4 @@
+//#include <clang/Basic/Version.h>
+//#include <llvm/IR/LLVMContext.h>
+#include <LLVMSPIRVLib/LLVMSPIRVLib.h>
+#include <spirv-tools/linker.hpp>