mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2024-11-23 18:24:13 +08:00
rusticl: support subgroups
Signed-off-by: Karol Herbst <git@karolherbst.de> Reviewed-by: Nora Allen <blackcatgames@protonmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22893>
This commit is contained in:
parent
0fd1b21db2
commit
d681cdf4a6
@ -876,7 +876,7 @@ Rusticl OpenCL 2.0 -- all DONE:
|
||||
|
||||
Rusticl OpenCL 2.1 -- all DONE:
|
||||
|
||||
Sub groups in progress
|
||||
Sub groups DONE (iris, llvmpipe, radeonsi)
|
||||
- cl_khr_subgroups in progress
|
||||
cl_khr_il_program DONE
|
||||
Device and host timer synchronization DONE (iris, llvmpipe, radeonsi)
|
||||
|
@ -171,7 +171,11 @@ impl CLInfo<cl_device_info> for cl_device_id {
|
||||
CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE => cl_prop::<cl_ulong>(dev.const_max_size()),
|
||||
CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE => cl_prop::<usize>(0),
|
||||
CL_DEVICE_MAX_MEM_ALLOC_SIZE => cl_prop::<cl_ulong>(dev.max_mem_alloc()),
|
||||
CL_DEVICE_MAX_NUM_SUB_GROUPS => cl_prop::<cl_uint>(0),
|
||||
CL_DEVICE_MAX_NUM_SUB_GROUPS => cl_prop::<cl_uint>(if dev.subgroups_supported() {
|
||||
dev.max_subgroups()
|
||||
} else {
|
||||
0
|
||||
}),
|
||||
CL_DEVICE_MAX_ON_DEVICE_EVENTS => cl_prop::<cl_uint>(0),
|
||||
CL_DEVICE_MAX_ON_DEVICE_QUEUES => cl_prop::<cl_uint>(0),
|
||||
CL_DEVICE_MAX_PARAMETER_SIZE => cl_prop::<usize>(dev.param_max_size()),
|
||||
@ -274,6 +278,13 @@ impl CLInfo<cl_device_info> for cl_device_id {
|
||||
(CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN) as cl_device_fp_config,
|
||||
),
|
||||
CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS => cl_prop::<bool>(false),
|
||||
CL_DEVICE_SUB_GROUP_SIZES_INTEL => {
|
||||
cl_prop::<Vec<usize>>(if dev.subgroups_supported() {
|
||||
dev.subgroup_sizes()
|
||||
} else {
|
||||
vec![0; 1]
|
||||
})
|
||||
}
|
||||
CL_DEVICE_SVM_CAPABILITIES | CL_DEVICE_SVM_CAPABILITIES_ARM => {
|
||||
cl_prop::<cl_device_svm_capabilities>(
|
||||
if dev.svm_supported() {
|
||||
|
@ -465,7 +465,7 @@ extern "C" fn cl_get_kernel_sub_group_info(
|
||||
param_value_size_ret: *mut usize,
|
||||
) -> cl_int {
|
||||
match kernel.get_info_obj(
|
||||
(device, input_value_size, input_value),
|
||||
(device, input_value_size, input_value, param_value_size),
|
||||
param_name,
|
||||
param_value_size,
|
||||
param_value,
|
||||
|
@ -10,6 +10,7 @@ use rusticl_opencl_gen::*;
|
||||
use rusticl_proc_macros::cl_entrypoint;
|
||||
use rusticl_proc_macros::cl_info_entrypoint;
|
||||
|
||||
use std::cmp;
|
||||
use std::mem::{self, MaybeUninit};
|
||||
use std::os::raw::c_void;
|
||||
use std::ptr;
|
||||
@ -106,16 +107,115 @@ impl CLInfoObj<cl_kernel_work_group_info, cl_device_id> for cl_kernel {
|
||||
}
|
||||
}
|
||||
|
||||
impl CLInfoObj<cl_kernel_sub_group_info, (cl_device_id, usize, *const c_void)> for cl_kernel {
|
||||
impl CLInfoObj<cl_kernel_sub_group_info, (cl_device_id, usize, *const c_void, usize)>
|
||||
for cl_kernel
|
||||
{
|
||||
fn query(
|
||||
&self,
|
||||
(d, _input_value_size, _input_value): (cl_device_id, usize, *const c_void),
|
||||
_q: cl_program_build_info,
|
||||
(dev, input_value_size, input_value, output_value_size): (
|
||||
cl_device_id,
|
||||
usize,
|
||||
*const c_void,
|
||||
usize,
|
||||
),
|
||||
q: cl_program_build_info,
|
||||
) -> CLResult<Vec<MaybeUninit<u8>>> {
|
||||
let _kernel = self.get_ref()?;
|
||||
let _dev = d.get_arc()?;
|
||||
let kernel = self.get_ref()?;
|
||||
|
||||
Err(CL_INVALID_OPERATION)
|
||||
// CL_INVALID_DEVICE [..] if device is NULL but there is more than one device associated
|
||||
// with kernel.
|
||||
let dev = if dev.is_null() {
|
||||
if kernel.prog.devs.len() > 1 {
|
||||
return Err(CL_INVALID_DEVICE);
|
||||
} else {
|
||||
kernel.prog.devs[0].clone()
|
||||
}
|
||||
} else {
|
||||
dev.get_arc()?
|
||||
};
|
||||
|
||||
// CL_INVALID_DEVICE if device is not in the list of devices associated with kernel
|
||||
if !kernel.prog.devs.contains(&dev) {
|
||||
return Err(CL_INVALID_DEVICE);
|
||||
}
|
||||
|
||||
// CL_INVALID_OPERATION if device does not support subgroups.
|
||||
if !dev.subgroups_supported() {
|
||||
return Err(CL_INVALID_OPERATION);
|
||||
}
|
||||
|
||||
let usize_byte = mem::size_of::<usize>();
|
||||
// first we have to convert the input to a proper thing
|
||||
let input: &[usize] = match q {
|
||||
CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE | CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE => {
|
||||
// CL_INVALID_VALUE if param_name is CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
|
||||
// CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE or ... and the size in bytes specified by
|
||||
// input_value_size is not valid or if input_value is NULL.
|
||||
if ![usize_byte, 2 * usize_byte, 3 * usize_byte].contains(&input_value_size) {
|
||||
return Err(CL_INVALID_VALUE);
|
||||
}
|
||||
// SAFETY: we verified the size as best as possible, with the rest we trust the client
|
||||
unsafe { slice::from_raw_parts(input_value.cast(), input_value_size / usize_byte) }
|
||||
}
|
||||
CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT => {
|
||||
// CL_INVALID_VALUE if param_name is ... CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT
|
||||
// and the size in bytes specified by input_value_size is not valid or if
|
||||
// input_value is NULL.
|
||||
if input_value_size != usize_byte || input_value.is_null() {
|
||||
return Err(CL_INVALID_VALUE);
|
||||
}
|
||||
// SAFETY: we trust the client here
|
||||
unsafe { slice::from_raw_parts(input_value.cast(), 1) }
|
||||
}
|
||||
_ => &[],
|
||||
};
|
||||
|
||||
Ok(match q {
|
||||
CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE => {
|
||||
cl_prop::<usize>(kernel.subgroups_for_block(&dev, input))
|
||||
}
|
||||
CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE => {
|
||||
cl_prop::<usize>(kernel.subgroup_size_for_block(&dev, input))
|
||||
}
|
||||
CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT => {
|
||||
let subgroups = input[0];
|
||||
let mut res = vec![0; 3];
|
||||
|
||||
for subgroup_size in kernel.subgroup_sizes(&dev) {
|
||||
let threads = subgroups * subgroup_size;
|
||||
|
||||
if threads > dev.max_threads_per_block() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let block = [threads, 1, 1];
|
||||
let real_subgroups = kernel.subgroups_for_block(&dev, &block);
|
||||
|
||||
if real_subgroups == subgroups {
|
||||
res = block.to_vec();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
res.truncate(output_value_size / usize_byte);
|
||||
cl_prop::<Vec<usize>>(res)
|
||||
}
|
||||
CL_KERNEL_MAX_NUM_SUB_GROUPS => {
|
||||
let threads = kernel.max_threads_per_block(&dev);
|
||||
let max_groups = dev.max_subgroups();
|
||||
|
||||
let mut result = 0;
|
||||
for sgs in kernel.subgroup_sizes(&dev) {
|
||||
result = cmp::max(result, threads / sgs);
|
||||
result = cmp::min(result, max_groups as usize);
|
||||
}
|
||||
cl_prop::<usize>(result)
|
||||
}
|
||||
CL_KERNEL_COMPILE_NUM_SUB_GROUPS => cl_prop::<usize>(kernel.num_subgroups),
|
||||
CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL => cl_prop::<usize>(kernel.subgroup_size),
|
||||
// CL_INVALID_VALUE if param_name is not one of the supported values
|
||||
_ => return Err(CL_INVALID_VALUE),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -82,6 +82,7 @@ pub trait HelperContextWrapper {
|
||||
fn create_compute_state(&self, nir: &NirShader, static_local_mem: u32) -> *mut c_void;
|
||||
fn delete_compute_state(&self, cso: *mut c_void);
|
||||
fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info;
|
||||
fn compute_state_subgroup_size(&self, state: *mut c_void, block: &[u32; 3]) -> u32;
|
||||
|
||||
fn unmap(&self, tx: PipeTransfer);
|
||||
}
|
||||
@ -170,6 +171,10 @@ impl<'a> HelperContextWrapper for HelperContext<'a> {
|
||||
self.lock.compute_state_info(state)
|
||||
}
|
||||
|
||||
// Ask the driver for the subgroup size the given compute state would use for a
// launch with dimensions `block`; forwards to the wrapped, locked context.
fn compute_state_subgroup_size(&self, state: *mut c_void, block: &[u32; 3]) -> u32 {
    self.lock.compute_state_subgroup_size(state, block)
}
|
||||
|
||||
// Finish a mapping: hand the transfer back to the locked context, which
// consumes it (the transfer is dropped here).
fn unmap(&self, tx: PipeTransfer) {
    tx.with_ctx(&self.lock);
}
|
||||
@ -572,6 +577,12 @@ impl Device {
|
||||
add_ext(1, 0, 0, "cl_khr_device_uuid");
|
||||
}
|
||||
|
||||
if self.subgroups_supported() {
|
||||
// requires CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS
|
||||
//add_ext(1, 0, 0, "cl_khr_subgroups");
|
||||
add_feat(1, 0, 0, "__opencl_c_subgroups");
|
||||
}
|
||||
|
||||
if self.svm_supported() {
|
||||
add_ext(1, 0, 0, "cl_arm_shared_virtual_memory");
|
||||
}
|
||||
@ -857,6 +868,22 @@ impl Device {
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn max_subgroups(&self) -> u32 {
|
||||
ComputeParam::<u32>::compute_param(
|
||||
self.screen.as_ref(),
|
||||
pipe_compute_cap::PIPE_COMPUTE_CAP_MAX_SUBGROUPS,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn subgroups_supported(&self) -> bool {
|
||||
let subgroup_sizes = self.subgroup_sizes().len();
|
||||
|
||||
// we need to be able to query a CSO for subgroup sizes if multiple sub group sizes are
|
||||
// supported, doing it without shareable shaders isn't practical
|
||||
self.max_subgroups() > 0
|
||||
&& (subgroup_sizes == 1 || (subgroup_sizes > 1 && self.shareable_shaders()))
|
||||
}
|
||||
|
||||
pub fn svm_supported(&self) -> bool {
|
||||
self.screen.param(pipe_cap::PIPE_CAP_SYSTEM_SVM) == 1
|
||||
}
|
||||
@ -905,7 +932,7 @@ impl Device {
|
||||
images_write_3d: self.image_3d_write_supported(),
|
||||
integer_dot_product: true,
|
||||
intel_subgroups: false,
|
||||
subgroups: false,
|
||||
subgroups: self.subgroups_supported(),
|
||||
subgroups_ifp: false,
|
||||
}
|
||||
}
|
||||
|
@ -335,6 +335,8 @@ pub struct Kernel {
|
||||
pub values: Vec<RefCell<Option<KernelArgValue>>>,
|
||||
pub work_group_size: [usize; 3],
|
||||
pub build: Arc<NirKernelBuild>,
|
||||
pub subgroup_size: usize,
|
||||
pub num_subgroups: usize,
|
||||
dev_state: Arc<KernelDevState>,
|
||||
}
|
||||
|
||||
@ -813,6 +815,8 @@ impl Kernel {
|
||||
prog: prog,
|
||||
name: name,
|
||||
work_group_size: work_group_size,
|
||||
subgroup_size: nir.subgroup_size() as usize,
|
||||
num_subgroups: nir.num_subgroups() as usize,
|
||||
values: values,
|
||||
dev_state: KernelDevState::new(nirs),
|
||||
build: nir_kernel_build,
|
||||
@ -1208,6 +1212,42 @@ impl Kernel {
|
||||
pub fn has_svm_devs(&self) -> bool {
|
||||
self.prog.devs.iter().any(|dev| dev.svm_supported())
|
||||
}
|
||||
|
||||
pub fn subgroup_sizes(&self, dev: &Device) -> Vec<usize> {
|
||||
SetBitIndices::from_msb(self.dev_state.get(dev).info.simd_sizes)
|
||||
.map(|bit| 1 << bit)
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn subgroups_for_block(&self, dev: &Device, block: &[usize]) -> usize {
|
||||
let subgroup_size = self.subgroup_size_for_block(dev, block);
|
||||
if subgroup_size == 0 {
|
||||
return 0;
|
||||
}
|
||||
|
||||
let threads = block.iter().product();
|
||||
div_round_up(threads, subgroup_size)
|
||||
}
|
||||
|
||||
/// Subgroup size the kernel would use for the given block dimensions on `dev`.
///
/// With no supported sizes the answer is 0; with exactly one it is static;
/// with several we have to ask the driver through the device's helper
/// context. Missing block dimensions default to 1, extra ones are ignored.
pub fn subgroup_size_for_block(&self, dev: &Device, block: &[usize]) -> usize {
    let sizes = self.subgroup_sizes(dev);
    match sizes.as_slice() {
        [] => 0,
        [only] => *only,
        _ => {
            // normalize to exactly three u32 dimensions, padding with 1
            let mut grid = [1_u32; 3];
            for (dst, src) in grid.iter_mut().zip(block.iter()) {
                *dst = *src as u32;
            }

            let cso = self.dev_state.get(dev).cso;
            dev.helper_ctx().compute_state_subgroup_size(cso, &grid) as usize
        }
    }
}
|
||||
}
|
||||
|
||||
impl Clone for Kernel {
|
||||
@ -1219,6 +1259,8 @@ impl Clone for Kernel {
|
||||
values: self.values.clone(),
|
||||
work_group_size: self.work_group_size,
|
||||
build: self.build.clone(),
|
||||
subgroup_size: self.subgroup_size,
|
||||
num_subgroups: self.num_subgroups,
|
||||
dev_state: self.dev_state.clone(),
|
||||
}
|
||||
}
|
||||
|
@ -332,6 +332,7 @@ impl SPIRVBin {
|
||||
float16: true,
|
||||
float64: true,
|
||||
generic_pointers: true,
|
||||
groups: true,
|
||||
int8: true,
|
||||
int16: true,
|
||||
int64: true,
|
||||
|
@ -179,6 +179,27 @@ impl NirShader {
|
||||
unsafe { (*self.nir.as_ptr()).info.workgroup_size }
|
||||
}
|
||||
|
||||
pub fn subgroup_size(&self) -> u8 {
|
||||
let subgroup_size = unsafe { (*self.nir.as_ptr()).info.subgroup_size };
|
||||
let valid_subgroup_sizes = [
|
||||
gl_subgroup_size::SUBGROUP_SIZE_REQUIRE_8,
|
||||
gl_subgroup_size::SUBGROUP_SIZE_REQUIRE_16,
|
||||
gl_subgroup_size::SUBGROUP_SIZE_REQUIRE_32,
|
||||
gl_subgroup_size::SUBGROUP_SIZE_REQUIRE_64,
|
||||
gl_subgroup_size::SUBGROUP_SIZE_REQUIRE_128,
|
||||
];
|
||||
|
||||
if valid_subgroup_sizes.contains(&subgroup_size) {
|
||||
subgroup_size as u8
|
||||
} else {
|
||||
0
|
||||
}
|
||||
}
|
||||
|
||||
// Number of subgroups stored in the shader's NIR info.
pub fn num_subgroups(&self) -> u8 {
    // SAFETY(review): assumes self.nir points at a valid nir_shader — same
    // invariant the surrounding accessors rely on.
    unsafe { (*self.nir.as_ptr()).info.num_subgroups }
}
|
||||
|
||||
pub fn set_workgroup_size_variable_if_zero(&self) {
|
||||
let nir = self.nir.as_ptr();
|
||||
unsafe {
|
||||
|
@ -327,6 +327,16 @@ impl PipeContext {
|
||||
info
|
||||
}
|
||||
|
||||
pub fn compute_state_subgroup_size(&self, state: *mut c_void, block: &[u32; 3]) -> u32 {
|
||||
unsafe {
|
||||
if let Some(cb) = self.pipe.as_ref().get_compute_state_subgroup_size {
|
||||
cb(self.pipe.as_ptr(), state, block)
|
||||
} else {
|
||||
0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create a driver sampler-state CSO from `state` and return its pointer.
pub fn create_sampler_state(&self, state: &pipe_sampler_state) -> *mut c_void {
    // SAFETY(review): self.pipe is assumed valid; the unwrap assumes every
    // driver implements create_sampler_state (panics otherwise) — confirm.
    unsafe { self.pipe.as_ref().create_sampler_state.unwrap()(self.pipe.as_ptr(), state) }
}
|
||||
|
4
src/gallium/frontends/rusticl/rusticl_llvm_bindings.hpp
Normal file
4
src/gallium/frontends/rusticl/rusticl_llvm_bindings.hpp
Normal file
@ -0,0 +1,4 @@
|
||||
//#include <clang/Basic/Version.h>
|
||||
//#include <llvm/IR/LLVMContext.h>
|
||||
#include <LLVMSPIRVLib/LLVMSPIRVLib.h>
|
||||
#include <spirv-tools/linker.hpp>
|
Loading…
Reference in New Issue
Block a user