rusticl: support subgroups

Signed-off-by: Karol Herbst <git@karolherbst.de>
Reviewed-by: Nora Allen <blackcatgames@protonmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22893>
This commit is contained in:
Karol Herbst 2023-05-07 17:10:14 +02:00 committed by Marge Bot
parent 0fd1b21db2
commit d681cdf4a6
10 changed files with 226 additions and 10 deletions

View File

@ -876,7 +876,7 @@ Rusticl OpenCL 2.0 -- all DONE:
Rusticl OpenCL 2.1 -- all DONE:
Sub groups in progress
Sub groups DONE (iris, llvmpipe, radeonsi)
- cl_khr_subgroups in progress
cl_khr_il_program DONE
Device and host timer synchronization DONE (iris, llvmpipe, radeonsi)

View File

@ -171,7 +171,11 @@ impl CLInfo<cl_device_info> for cl_device_id {
CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE => cl_prop::<cl_ulong>(dev.const_max_size()),
CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE => cl_prop::<usize>(0),
CL_DEVICE_MAX_MEM_ALLOC_SIZE => cl_prop::<cl_ulong>(dev.max_mem_alloc()),
CL_DEVICE_MAX_NUM_SUB_GROUPS => cl_prop::<cl_uint>(0),
CL_DEVICE_MAX_NUM_SUB_GROUPS => cl_prop::<cl_uint>(if dev.subgroups_supported() {
dev.max_subgroups()
} else {
0
}),
CL_DEVICE_MAX_ON_DEVICE_EVENTS => cl_prop::<cl_uint>(0),
CL_DEVICE_MAX_ON_DEVICE_QUEUES => cl_prop::<cl_uint>(0),
CL_DEVICE_MAX_PARAMETER_SIZE => cl_prop::<usize>(dev.param_max_size()),
@ -274,6 +278,13 @@ impl CLInfo<cl_device_info> for cl_device_id {
(CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN) as cl_device_fp_config,
),
CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS => cl_prop::<bool>(false),
CL_DEVICE_SUB_GROUP_SIZES_INTEL => {
cl_prop::<Vec<usize>>(if dev.subgroups_supported() {
dev.subgroup_sizes()
} else {
vec![0; 1]
})
}
CL_DEVICE_SVM_CAPABILITIES | CL_DEVICE_SVM_CAPABILITIES_ARM => {
cl_prop::<cl_device_svm_capabilities>(
if dev.svm_supported() {

View File

@ -465,7 +465,7 @@ extern "C" fn cl_get_kernel_sub_group_info(
param_value_size_ret: *mut usize,
) -> cl_int {
match kernel.get_info_obj(
(device, input_value_size, input_value),
(device, input_value_size, input_value, param_value_size),
param_name,
param_value_size,
param_value,

View File

@ -10,6 +10,7 @@ use rusticl_opencl_gen::*;
use rusticl_proc_macros::cl_entrypoint;
use rusticl_proc_macros::cl_info_entrypoint;
use std::cmp;
use std::mem::{self, MaybeUninit};
use std::os::raw::c_void;
use std::ptr;
@ -106,16 +107,115 @@ impl CLInfoObj<cl_kernel_work_group_info, cl_device_id> for cl_kernel {
}
}
impl CLInfoObj<cl_kernel_sub_group_info, (cl_device_id, usize, *const c_void)> for cl_kernel {
impl CLInfoObj<cl_kernel_sub_group_info, (cl_device_id, usize, *const c_void, usize)>
for cl_kernel
{
fn query(
&self,
(d, _input_value_size, _input_value): (cl_device_id, usize, *const c_void),
_q: cl_program_build_info,
(dev, input_value_size, input_value, output_value_size): (
cl_device_id,
usize,
*const c_void,
usize,
),
q: cl_program_build_info,
) -> CLResult<Vec<MaybeUninit<u8>>> {
let _kernel = self.get_ref()?;
let _dev = d.get_arc()?;
let kernel = self.get_ref()?;
Err(CL_INVALID_OPERATION)
// CL_INVALID_DEVICE [..] if device is NULL but there is more than one device associated
// with kernel.
let dev = if dev.is_null() {
if kernel.prog.devs.len() > 1 {
return Err(CL_INVALID_DEVICE);
} else {
kernel.prog.devs[0].clone()
}
} else {
dev.get_arc()?
};
// CL_INVALID_DEVICE if device is not in the list of devices associated with kernel
if !kernel.prog.devs.contains(&dev) {
return Err(CL_INVALID_DEVICE);
}
// CL_INVALID_OPERATION if device does not support subgroups.
if !dev.subgroups_supported() {
return Err(CL_INVALID_OPERATION);
}
let usize_byte = mem::size_of::<usize>();
// first we have to convert the input to a proper thing
let input: &[usize] = match q {
CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE | CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE => {
// CL_INVALID_VALUE if param_name is CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
// CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE or ... and the size in bytes specified by
// input_value_size is not valid or if input_value is NULL.
if ![usize_byte, 2 * usize_byte, 3 * usize_byte].contains(&input_value_size) {
return Err(CL_INVALID_VALUE);
}
// SAFETY: we verified the size as best as possible, with the rest we trust the client
unsafe { slice::from_raw_parts(input_value.cast(), input_value_size / usize_byte) }
}
CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT => {
// CL_INVALID_VALUE if param_name is ... CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT
// and the size in bytes specified by input_value_size is not valid or if
// input_value is NULL.
if input_value_size != usize_byte || input_value.is_null() {
return Err(CL_INVALID_VALUE);
}
// SAFETY: we trust the client here
unsafe { slice::from_raw_parts(input_value.cast(), 1) }
}
_ => &[],
};
Ok(match q {
CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE => {
cl_prop::<usize>(kernel.subgroups_for_block(&dev, input))
}
CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE => {
cl_prop::<usize>(kernel.subgroup_size_for_block(&dev, input))
}
CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT => {
let subgroups = input[0];
let mut res = vec![0; 3];
for subgroup_size in kernel.subgroup_sizes(&dev) {
let threads = subgroups * subgroup_size;
if threads > dev.max_threads_per_block() {
continue;
}
let block = [threads, 1, 1];
let real_subgroups = kernel.subgroups_for_block(&dev, &block);
if real_subgroups == subgroups {
res = block.to_vec();
break;
}
}
res.truncate(output_value_size / usize_byte);
cl_prop::<Vec<usize>>(res)
}
CL_KERNEL_MAX_NUM_SUB_GROUPS => {
let threads = kernel.max_threads_per_block(&dev);
let max_groups = dev.max_subgroups();
let mut result = 0;
for sgs in kernel.subgroup_sizes(&dev) {
result = cmp::max(result, threads / sgs);
result = cmp::min(result, max_groups as usize);
}
cl_prop::<usize>(result)
}
CL_KERNEL_COMPILE_NUM_SUB_GROUPS => cl_prop::<usize>(kernel.num_subgroups),
CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL => cl_prop::<usize>(kernel.subgroup_size),
// CL_INVALID_VALUE if param_name is not one of the supported values
_ => return Err(CL_INVALID_VALUE),
})
}
}

View File

@ -82,6 +82,7 @@ pub trait HelperContextWrapper {
fn create_compute_state(&self, nir: &NirShader, static_local_mem: u32) -> *mut c_void;
fn delete_compute_state(&self, cso: *mut c_void);
fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info;
fn compute_state_subgroup_size(&self, state: *mut c_void, block: &[u32; 3]) -> u32;
fn unmap(&self, tx: PipeTransfer);
}
@ -170,6 +171,10 @@ impl<'a> HelperContextWrapper for HelperContext<'a> {
self.lock.compute_state_info(state)
}
fn compute_state_subgroup_size(&self, state: *mut c_void, block: &[u32; 3]) -> u32 {
self.lock.compute_state_subgroup_size(state, block)
}
fn unmap(&self, tx: PipeTransfer) {
tx.with_ctx(&self.lock);
}
@ -572,6 +577,12 @@ impl Device {
add_ext(1, 0, 0, "cl_khr_device_uuid");
}
if self.subgroups_supported() {
// requires CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS
//add_ext(1, 0, 0, "cl_khr_subgroups");
add_feat(1, 0, 0, "__opencl_c_subgroups");
}
if self.svm_supported() {
add_ext(1, 0, 0, "cl_arm_shared_virtual_memory");
}
@ -857,6 +868,22 @@ impl Device {
.collect()
}
pub fn max_subgroups(&self) -> u32 {
ComputeParam::<u32>::compute_param(
self.screen.as_ref(),
pipe_compute_cap::PIPE_COMPUTE_CAP_MAX_SUBGROUPS,
)
}
pub fn subgroups_supported(&self) -> bool {
let subgroup_sizes = self.subgroup_sizes().len();
// we need to be able to query a CSO for subgroup sizes if multiple sub group sizes are
// supported, doing it without shareable shaders isn't practical
self.max_subgroups() > 0
&& (subgroup_sizes == 1 || (subgroup_sizes > 1 && self.shareable_shaders()))
}
pub fn svm_supported(&self) -> bool {
self.screen.param(pipe_cap::PIPE_CAP_SYSTEM_SVM) == 1
}
@ -905,7 +932,7 @@ impl Device {
images_write_3d: self.image_3d_write_supported(),
integer_dot_product: true,
intel_subgroups: false,
subgroups: false,
subgroups: self.subgroups_supported(),
subgroups_ifp: false,
}
}

View File

@ -335,6 +335,8 @@ pub struct Kernel {
pub values: Vec<RefCell<Option<KernelArgValue>>>,
pub work_group_size: [usize; 3],
pub build: Arc<NirKernelBuild>,
pub subgroup_size: usize,
pub num_subgroups: usize,
dev_state: Arc<KernelDevState>,
}
@ -813,6 +815,8 @@ impl Kernel {
prog: prog,
name: name,
work_group_size: work_group_size,
subgroup_size: nir.subgroup_size() as usize,
num_subgroups: nir.num_subgroups() as usize,
values: values,
dev_state: KernelDevState::new(nirs),
build: nir_kernel_build,
@ -1208,6 +1212,42 @@ impl Kernel {
pub fn has_svm_devs(&self) -> bool {
self.prog.devs.iter().any(|dev| dev.svm_supported())
}
pub fn subgroup_sizes(&self, dev: &Device) -> Vec<usize> {
SetBitIndices::from_msb(self.dev_state.get(dev).info.simd_sizes)
.map(|bit| 1 << bit)
.collect()
}
pub fn subgroups_for_block(&self, dev: &Device, block: &[usize]) -> usize {
let subgroup_size = self.subgroup_size_for_block(dev, block);
if subgroup_size == 0 {
return 0;
}
let threads = block.iter().product();
div_round_up(threads, subgroup_size)
}
pub fn subgroup_size_for_block(&self, dev: &Device, block: &[usize]) -> usize {
let subgroup_sizes = self.subgroup_sizes(dev);
if subgroup_sizes.is_empty() {
return 0;
}
if subgroup_sizes.len() == 1 {
return subgroup_sizes[0];
}
let block = [
*block.get(0).unwrap_or(&1) as u32,
*block.get(1).unwrap_or(&1) as u32,
*block.get(2).unwrap_or(&1) as u32,
];
dev.helper_ctx()
.compute_state_subgroup_size(self.dev_state.get(dev).cso, &block) as usize
}
}
impl Clone for Kernel {
@ -1219,6 +1259,8 @@ impl Clone for Kernel {
values: self.values.clone(),
work_group_size: self.work_group_size,
build: self.build.clone(),
subgroup_size: self.subgroup_size,
num_subgroups: self.num_subgroups,
dev_state: self.dev_state.clone(),
}
}

View File

@ -332,6 +332,7 @@ impl SPIRVBin {
float16: true,
float64: true,
generic_pointers: true,
groups: true,
int8: true,
int16: true,
int64: true,

View File

@ -179,6 +179,27 @@ impl NirShader {
unsafe { (*self.nir.as_ptr()).info.workgroup_size }
}
pub fn subgroup_size(&self) -> u8 {
let subgroup_size = unsafe { (*self.nir.as_ptr()).info.subgroup_size };
let valid_subgroup_sizes = [
gl_subgroup_size::SUBGROUP_SIZE_REQUIRE_8,
gl_subgroup_size::SUBGROUP_SIZE_REQUIRE_16,
gl_subgroup_size::SUBGROUP_SIZE_REQUIRE_32,
gl_subgroup_size::SUBGROUP_SIZE_REQUIRE_64,
gl_subgroup_size::SUBGROUP_SIZE_REQUIRE_128,
];
if valid_subgroup_sizes.contains(&subgroup_size) {
subgroup_size as u8
} else {
0
}
}
pub fn num_subgroups(&self) -> u8 {
unsafe { (*self.nir.as_ptr()).info.num_subgroups }
}
pub fn set_workgroup_size_variable_if_zero(&self) {
let nir = self.nir.as_ptr();
unsafe {

View File

@ -327,6 +327,16 @@ impl PipeContext {
info
}
pub fn compute_state_subgroup_size(&self, state: *mut c_void, block: &[u32; 3]) -> u32 {
unsafe {
if let Some(cb) = self.pipe.as_ref().get_compute_state_subgroup_size {
cb(self.pipe.as_ptr(), state, block)
} else {
0
}
}
}
pub fn create_sampler_state(&self, state: &pipe_sampler_state) -> *mut c_void {
unsafe { self.pipe.as_ref().create_sampler_state.unwrap()(self.pipe.as_ptr(), state) }
}

View File

@ -0,0 +1,4 @@
//#include <clang/Basic/Version.h>
//#include <llvm/IR/LLVMContext.h>
#include <LLVMSPIRVLib/LLVMSPIRVLib.h>
#include <spirv-tools/linker.hpp>