mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-01-24 22:53:51 +08:00
radv: Add the fuchsia radix sort
Signed-off-by: Konstantin Seurer <konstantin.seurer@gmail.com> Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15648>
This commit is contained in:
parent
9c020b525b
commit
5d9ef0efb5
@ -680,7 +680,7 @@ if with_gallium_d3d12 or with_microsoft_clc or with_microsoft_vk
|
||||
endif
|
||||
endif
|
||||
|
||||
if with_vulkan_overlay_layer or with_aco_tests
|
||||
if with_vulkan_overlay_layer or with_aco_tests or with_amd_vk
|
||||
prog_glslang = find_program('glslangValidator')
|
||||
endif
|
||||
|
||||
|
@ -120,6 +120,9 @@ if with_llvm
|
||||
)
|
||||
endif
|
||||
|
||||
subdir('radix_sort')
|
||||
libradv_files += radix_sort_files
|
||||
|
||||
radv_deps = []
|
||||
radv_flags = cc.get_supported_arguments(['-Wimplicit-fallthrough', '-Wshadow'])
|
||||
|
||||
@ -151,7 +154,7 @@ endif
|
||||
|
||||
libvulkan_radeon = shared_library(
|
||||
'vulkan_radeon',
|
||||
[libradv_files, radv_entrypoints, sha1_h],
|
||||
[libradv_files, radv_entrypoints, sha1_h, radix_sort_spv],
|
||||
vs_module_defs : vulkan_api_def,
|
||||
include_directories : [
|
||||
inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_amd, inc_amd_common, inc_amd_common_llvm, inc_compiler, inc_util,
|
||||
|
24
src/amd/vulkan/radix_sort/LICENSE
Normal file
24
src/amd/vulkan/radix_sort/LICENSE
Normal file
@ -0,0 +1,24 @@
|
||||
Copyright 2019 The Fuchsia Authors.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
112
src/amd/vulkan/radix_sort/common/macros.h
Normal file
112
src/amd/vulkan/radix_sort/common/macros.h
Normal file
@ -0,0 +1,112 @@
|
||||
// Copyright 2019 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef SRC_GRAPHICS_LIB_COMPUTE_COMMON_MACROS_H_
|
||||
#define SRC_GRAPHICS_LIB_COMPUTE_COMMON_MACROS_H_
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include <assert.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
//
|
||||
// clang-format off
|
||||
//
|
||||
|
||||
#define ARRAY_LENGTH_MACRO(x_) (sizeof(x_)/sizeof(x_[0]))
|
||||
#define OFFSETOF_MACRO(t_,m_) offsetof(t_,m_)
|
||||
#define MEMBER_SIZE_MACRO(t_,m_) sizeof(((t_*)0)->m_)
|
||||
|
||||
//
|
||||
// FIXME(allanmac):
|
||||
//
|
||||
// Consider providing typed min/max() functions:
|
||||
//
|
||||
// <type> [min|max]_<type>(a,b) { ; }
|
||||
//
|
||||
// But note we still need preprocessor-time min/max().
|
||||
//
|
||||
|
||||
#define MAX_MACRO(t_,a_,b_) (((a_) > (b_)) ? (a_) : (b_))
|
||||
#define MIN_MACRO(t_,a_,b_) (((a_) < (b_)) ? (a_) : (b_))
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define BITS_TO_MASK_MACRO(n_) (((uint32_t)1<<(n_))-1)
|
||||
#define BITS_TO_MASK_64_MACRO(n_) (((uint64_t)1<<(n_))-1)
|
||||
|
||||
#define BITS_TO_MASK_AT_MACRO(n_,b_) (BITS_TO_MASK_MACRO(n_) <<(b_))
|
||||
#define BITS_TO_MASK_AT_64_MACRO(n_,b_) (BITS_TO_MASK_64_MACRO(n_)<<(b_))
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define STRINGIFY_MACRO_2(a_) #a_
|
||||
#define STRINGIFY_MACRO(a_) STRINGIFY_MACRO_2(a_)
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define CONCAT_MACRO_2(a_,b_) a_ ## b_
|
||||
#define CONCAT_MACRO(a_,b_) CONCAT_MACRO_2(a_,b_)
|
||||
|
||||
//
|
||||
// Round up/down
|
||||
//
|
||||
|
||||
#define ROUND_DOWN_MACRO(v_,q_) (((v_) / (q_)) * (q_))
|
||||
#define ROUND_UP_MACRO(v_,q_) ((((v_) + (q_) - 1) / (q_)) * (q_))
|
||||
|
||||
//
|
||||
// Round up/down when q is a power-of-two.
|
||||
//
|
||||
|
||||
#define ROUND_DOWN_POW2_MACRO(v_,q_) ((v_) & ~((q_) - 1))
|
||||
#define ROUND_UP_POW2_MACRO(v_,q_) ROUND_DOWN_POW2_MACRO((v_) + (q_) - 1, q_)
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#if defined (_MSC_VER) && !defined (__clang__)
|
||||
#define STATIC_ASSERT_MACRO(c_,m_) static_assert(c_,m_)
|
||||
#else
|
||||
#define STATIC_ASSERT_MACRO(c_,m_) _Static_assert(c_,m_)
|
||||
#endif
|
||||
|
||||
#define STATIC_ASSERT_MACRO_1(c_) STATIC_ASSERT_MACRO(c_,#c_)
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#if defined (_MSC_VER) && !defined (__clang__)
|
||||
#define POPCOUNT_MACRO(...) __popcnt(__VA_ARGS__)
|
||||
#else
|
||||
#define POPCOUNT_MACRO(...) __builtin_popcount(__VA_ARGS__)
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#if defined (_MSC_VER) && !defined (__clang__)
|
||||
#define ALIGN_MACRO(bytes_) __declspec(align(bytes_)) // only accepts integer as arg
|
||||
#else
|
||||
#include <stdalign.h>
|
||||
#define ALIGN_MACRO(bytes_) alignas(bytes_)
|
||||
#endif
|
||||
|
||||
//
|
||||
// clang-format on
|
||||
//
|
||||
|
||||
#endif // SRC_GRAPHICS_LIB_COMPUTE_COMMON_MACROS_H_
|
90
src/amd/vulkan/radix_sort/common/util.c
Normal file
90
src/amd/vulkan/radix_sort/common/util.c
Normal file
@ -0,0 +1,90 @@
|
||||
// Copyright 2019 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "util.h"
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
|
||||
#include <intrin.h>
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
bool
|
||||
is_pow2_u32(uint32_t n)
|
||||
{
|
||||
return n && !(n & (n - 1));
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
uint32_t
|
||||
pow2_ru_u32(uint32_t n)
|
||||
{
|
||||
assert(n <= 0x80000000U);
|
||||
|
||||
n--;
|
||||
n |= n >> 1;
|
||||
n |= n >> 2;
|
||||
n |= n >> 4;
|
||||
n |= n >> 8;
|
||||
n |= n >> 16;
|
||||
n++;
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
uint32_t
|
||||
pow2_rd_u32(uint32_t n)
|
||||
{
|
||||
assert(n > 0);
|
||||
|
||||
return 1u << msb_idx_u32(n);
|
||||
}
|
||||
|
||||
//
|
||||
// ASSUMES NON-ZERO
|
||||
//
|
||||
|
||||
uint32_t
|
||||
msb_idx_u32(uint32_t n)
|
||||
{
|
||||
assert(n > 0);
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
|
||||
uint32_t index;
|
||||
|
||||
_BitScanReverse((unsigned long *)&index, n);
|
||||
|
||||
return index;
|
||||
|
||||
#elif defined(__GNUC__)
|
||||
|
||||
return __builtin_clz(n) ^ 31;
|
||||
|
||||
#else
|
||||
|
||||
#error "No msb_index()"
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
59
src/amd/vulkan/radix_sort/common/util.h
Normal file
59
src/amd/vulkan/radix_sort/common/util.h
Normal file
@ -0,0 +1,59 @@
|
||||
// Copyright 2019 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef SRC_GRAPHICS_LIB_COMPUTE_COMMON_UTIL_H_
|
||||
#define SRC_GRAPHICS_LIB_COMPUTE_COMMON_UTIL_H_
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
// Return true iff |n| is a power of 2.
|
||||
bool
|
||||
is_pow2_u32(uint32_t n);
|
||||
|
||||
// Return |n| rounded-up to the nearest power of 2.
|
||||
// If |n| is zero then return 0.
|
||||
// REQUIRES: |n <= 0x80000000|.
|
||||
uint32_t
|
||||
pow2_ru_u32(uint32_t n);
|
||||
|
||||
// Return |n| rounded-down to the nearest power of 2.
|
||||
// REQUIRES: |n > 0|.
|
||||
uint32_t
|
||||
pow2_rd_u32(uint32_t n);
|
||||
|
||||
// Return the most-significant bit position for |n|.
|
||||
// REQUIRES: |n > 0|.
|
||||
uint32_t
|
||||
msb_idx_u32(uint32_t n); // 0-based bit position
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#endif // SRC_GRAPHICS_LIB_COMPUTE_COMMON_UTIL_H_
|
108
src/amd/vulkan/radix_sort/common/vk/assert.c
Normal file
108
src/amd/vulkan/radix_sort/common/vk/assert.c
Normal file
@ -0,0 +1,108 @@
|
||||
// Copyright 2019 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include "assert.h"
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define VK_RESULT_TO_STRING(result) \
|
||||
case result: \
|
||||
return #result
|
||||
|
||||
//
|
||||
// FIXME -- results and errors
|
||||
//
|
||||
|
||||
char const *
|
||||
vk_get_result_string(VkResult const result)
|
||||
{
|
||||
switch (result)
|
||||
{
|
||||
//
|
||||
// Results
|
||||
//
|
||||
VK_RESULT_TO_STRING(VK_SUCCESS);
|
||||
VK_RESULT_TO_STRING(VK_NOT_READY);
|
||||
VK_RESULT_TO_STRING(VK_TIMEOUT);
|
||||
VK_RESULT_TO_STRING(VK_EVENT_SET);
|
||||
VK_RESULT_TO_STRING(VK_EVENT_RESET);
|
||||
VK_RESULT_TO_STRING(VK_INCOMPLETE);
|
||||
//
|
||||
// Errors
|
||||
//
|
||||
VK_RESULT_TO_STRING(VK_ERROR_OUT_OF_HOST_MEMORY);
|
||||
VK_RESULT_TO_STRING(VK_ERROR_OUT_OF_DEVICE_MEMORY);
|
||||
VK_RESULT_TO_STRING(VK_ERROR_INITIALIZATION_FAILED);
|
||||
VK_RESULT_TO_STRING(VK_ERROR_DEVICE_LOST);
|
||||
VK_RESULT_TO_STRING(VK_ERROR_MEMORY_MAP_FAILED);
|
||||
VK_RESULT_TO_STRING(VK_ERROR_LAYER_NOT_PRESENT);
|
||||
VK_RESULT_TO_STRING(VK_ERROR_EXTENSION_NOT_PRESENT);
|
||||
VK_RESULT_TO_STRING(VK_ERROR_FEATURE_NOT_PRESENT);
|
||||
VK_RESULT_TO_STRING(VK_ERROR_INCOMPATIBLE_DRIVER);
|
||||
VK_RESULT_TO_STRING(VK_ERROR_TOO_MANY_OBJECTS);
|
||||
VK_RESULT_TO_STRING(VK_ERROR_FORMAT_NOT_SUPPORTED);
|
||||
VK_RESULT_TO_STRING(VK_ERROR_FRAGMENTED_POOL);
|
||||
VK_RESULT_TO_STRING(VK_ERROR_OUT_OF_POOL_MEMORY);
|
||||
VK_RESULT_TO_STRING(VK_ERROR_INVALID_EXTERNAL_HANDLE);
|
||||
VK_RESULT_TO_STRING(VK_ERROR_SURFACE_LOST_KHR);
|
||||
VK_RESULT_TO_STRING(VK_ERROR_NATIVE_WINDOW_IN_USE_KHR);
|
||||
VK_RESULT_TO_STRING(VK_SUBOPTIMAL_KHR);
|
||||
VK_RESULT_TO_STRING(VK_ERROR_OUT_OF_DATE_KHR);
|
||||
VK_RESULT_TO_STRING(VK_ERROR_INCOMPATIBLE_DISPLAY_KHR);
|
||||
VK_RESULT_TO_STRING(VK_ERROR_VALIDATION_FAILED_EXT);
|
||||
VK_RESULT_TO_STRING(VK_ERROR_INVALID_SHADER_NV);
|
||||
VK_RESULT_TO_STRING(VK_ERROR_FRAGMENTATION_EXT);
|
||||
VK_RESULT_TO_STRING(VK_ERROR_NOT_PERMITTED_EXT);
|
||||
|
||||
//
|
||||
// Extensions: vk_xyz
|
||||
//
|
||||
default:
|
||||
return "UNKNOWN VULKAN RESULT";
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
VkResult
|
||||
vk_assert(VkResult const result, char const * const file, int const line, bool const is_abort)
|
||||
{
|
||||
if (result != VK_SUCCESS)
|
||||
{
|
||||
char const * const vk_result_str = vk_get_result_string(result);
|
||||
|
||||
fprintf(stderr,
|
||||
"\"%s\", line %d: vk_assert( %d ) = \"%s\"\n",
|
||||
file,
|
||||
line,
|
||||
result,
|
||||
vk_result_str);
|
||||
|
||||
if (is_abort)
|
||||
{
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
52
src/amd/vulkan/radix_sort/common/vk/assert.h
Normal file
52
src/amd/vulkan/radix_sort/common/vk/assert.h
Normal file
@ -0,0 +1,52 @@
|
||||
// Copyright 2019 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef SRC_GRAPHICS_LIB_COMPUTE_COMMON_VK_ASSERT_H_
|
||||
#define SRC_GRAPHICS_LIB_COMPUTE_COMMON_VK_ASSERT_H_
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <vulkan/vulkan.h>
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
char const *
|
||||
vk_get_result_string(VkResult const result);
|
||||
|
||||
VkResult
|
||||
vk_assert(VkResult const result, char const * const file, int const line, bool const is_abort);
|
||||
|
||||
//
|
||||
// clang-format off
|
||||
//
|
||||
|
||||
#define vk(...) vk_assert((vk##__VA_ARGS__), __FILE__, __LINE__, true);
|
||||
#define vk_ok(err) vk_assert(err, __FILE__, __LINE__, true);
|
||||
|
||||
//
|
||||
// clang-format on
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#endif // SRC_GRAPHICS_LIB_COMPUTE_COMMON_VK_ASSERT_H_
|
305
src/amd/vulkan/radix_sort/common/vk/barrier.c
Normal file
305
src/amd/vulkan/radix_sort/common/vk/barrier.c
Normal file
@ -0,0 +1,305 @@
|
||||
// Copyright 2019 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include "barrier.h"
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
void
|
||||
vk_barrier_compute_w_to_compute_r(VkCommandBuffer cb)
|
||||
{
|
||||
static VkMemoryBarrier const mb = {
|
||||
|
||||
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
|
||||
.pNext = NULL,
|
||||
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
|
||||
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT
|
||||
};
|
||||
|
||||
vkCmdPipelineBarrier(cb,
|
||||
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||
0,
|
||||
1,
|
||||
&mb,
|
||||
0,
|
||||
NULL,
|
||||
0,
|
||||
NULL);
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
void
|
||||
vk_barrier_compute_w_to_transfer_r(VkCommandBuffer cb)
|
||||
{
|
||||
static VkMemoryBarrier const mb = {
|
||||
|
||||
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
|
||||
.pNext = NULL,
|
||||
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
|
||||
.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT
|
||||
};
|
||||
|
||||
vkCmdPipelineBarrier(cb,
|
||||
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||
VK_PIPELINE_STAGE_TRANSFER_BIT,
|
||||
0,
|
||||
1,
|
||||
&mb,
|
||||
0,
|
||||
NULL,
|
||||
0,
|
||||
NULL);
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
void
|
||||
vk_barrier_transfer_w_to_compute_r(VkCommandBuffer cb)
|
||||
{
|
||||
static VkMemoryBarrier const mb = {
|
||||
|
||||
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
|
||||
.pNext = NULL,
|
||||
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
|
||||
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT
|
||||
};
|
||||
|
||||
vkCmdPipelineBarrier(cb,
|
||||
VK_PIPELINE_STAGE_TRANSFER_BIT,
|
||||
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||
0,
|
||||
1,
|
||||
&mb,
|
||||
0,
|
||||
NULL,
|
||||
0,
|
||||
NULL);
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
void
|
||||
vk_barrier_transfer_w_to_compute_w(VkCommandBuffer cb)
|
||||
{
|
||||
static VkMemoryBarrier const mb = {
|
||||
|
||||
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
|
||||
.pNext = NULL,
|
||||
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
|
||||
.dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT
|
||||
};
|
||||
|
||||
vkCmdPipelineBarrier(cb,
|
||||
VK_PIPELINE_STAGE_TRANSFER_BIT,
|
||||
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||
0,
|
||||
1,
|
||||
&mb,
|
||||
0,
|
||||
NULL,
|
||||
0,
|
||||
NULL);
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
void
|
||||
vk_barrier_compute_w_to_indirect_compute_r(VkCommandBuffer cb)
|
||||
{
|
||||
static VkMemoryBarrier const mb = {
|
||||
|
||||
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
|
||||
.pNext = NULL,
|
||||
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
|
||||
.dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT | //
|
||||
VK_ACCESS_SHADER_READ_BIT
|
||||
};
|
||||
|
||||
vkCmdPipelineBarrier(cb,
|
||||
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||
VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||
0,
|
||||
1,
|
||||
&mb,
|
||||
0,
|
||||
NULL,
|
||||
0,
|
||||
NULL);
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
void
|
||||
vk_barrier_transfer_w_compute_w_to_transfer_r(VkCommandBuffer cb)
|
||||
{
|
||||
static VkMemoryBarrier const mb = {
|
||||
|
||||
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
|
||||
.pNext = NULL,
|
||||
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | //
|
||||
VK_ACCESS_SHADER_WRITE_BIT,
|
||||
.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT
|
||||
};
|
||||
|
||||
vkCmdPipelineBarrier(cb,
|
||||
VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||
VK_PIPELINE_STAGE_TRANSFER_BIT,
|
||||
0,
|
||||
1,
|
||||
&mb,
|
||||
0,
|
||||
NULL,
|
||||
0,
|
||||
NULL);
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
void
|
||||
vk_barrier_compute_w_to_host_r(VkCommandBuffer cb)
|
||||
{
|
||||
static VkMemoryBarrier const mb = {
|
||||
|
||||
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
|
||||
.pNext = NULL,
|
||||
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
|
||||
.dstAccessMask = VK_ACCESS_HOST_READ_BIT
|
||||
};
|
||||
|
||||
vkCmdPipelineBarrier(cb,
|
||||
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||
VK_PIPELINE_STAGE_HOST_BIT,
|
||||
0,
|
||||
1,
|
||||
&mb,
|
||||
0,
|
||||
NULL,
|
||||
0,
|
||||
NULL);
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
void
|
||||
vk_barrier_transfer_w_to_host_r(VkCommandBuffer cb)
|
||||
{
|
||||
static VkMemoryBarrier const mb = {
|
||||
|
||||
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
|
||||
.pNext = NULL,
|
||||
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
|
||||
.dstAccessMask = VK_ACCESS_HOST_READ_BIT
|
||||
};
|
||||
|
||||
vkCmdPipelineBarrier(cb,
|
||||
VK_PIPELINE_STAGE_TRANSFER_BIT,
|
||||
VK_PIPELINE_STAGE_HOST_BIT,
|
||||
0,
|
||||
1,
|
||||
&mb,
|
||||
0,
|
||||
NULL,
|
||||
0,
|
||||
NULL);
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
void
|
||||
vk_memory_barrier(VkCommandBuffer cb,
|
||||
VkPipelineStageFlags src_stage,
|
||||
VkAccessFlags src_mask,
|
||||
VkPipelineStageFlags dst_stage,
|
||||
VkAccessFlags dst_mask)
|
||||
{
|
||||
VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
|
||||
.pNext = NULL,
|
||||
.srcAccessMask = src_mask,
|
||||
.dstAccessMask = dst_mask };
|
||||
|
||||
vkCmdPipelineBarrier(cb, src_stage, dst_stage, 0, 1, &mb, 0, NULL, 0, NULL);
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
void
|
||||
vk_barrier_debug(VkCommandBuffer cb)
|
||||
{
|
||||
static VkMemoryBarrier const mb = {
|
||||
|
||||
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
|
||||
.pNext = NULL,
|
||||
.srcAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT | //
|
||||
VK_ACCESS_INDEX_READ_BIT | //
|
||||
VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | //
|
||||
VK_ACCESS_UNIFORM_READ_BIT | //
|
||||
VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | //
|
||||
VK_ACCESS_SHADER_READ_BIT | //
|
||||
VK_ACCESS_SHADER_WRITE_BIT | //
|
||||
VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | //
|
||||
VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | //
|
||||
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | //
|
||||
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | //
|
||||
VK_ACCESS_TRANSFER_READ_BIT | //
|
||||
VK_ACCESS_TRANSFER_WRITE_BIT | //
|
||||
VK_ACCESS_HOST_READ_BIT | //
|
||||
VK_ACCESS_HOST_WRITE_BIT,
|
||||
.dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT | //
|
||||
VK_ACCESS_INDEX_READ_BIT | //
|
||||
VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | //
|
||||
VK_ACCESS_UNIFORM_READ_BIT | //
|
||||
VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | //
|
||||
VK_ACCESS_SHADER_READ_BIT | //
|
||||
VK_ACCESS_SHADER_WRITE_BIT | //
|
||||
VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | //
|
||||
VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | //
|
||||
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | //
|
||||
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | //
|
||||
VK_ACCESS_TRANSFER_READ_BIT | //
|
||||
VK_ACCESS_TRANSFER_WRITE_BIT | //
|
||||
VK_ACCESS_HOST_READ_BIT | //
|
||||
VK_ACCESS_HOST_WRITE_BIT
|
||||
};
|
||||
|
||||
vkCmdPipelineBarrier(cb,
|
||||
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
|
||||
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
|
||||
0,
|
||||
1,
|
||||
&mb,
|
||||
0,
|
||||
NULL,
|
||||
0,
|
||||
NULL);
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
72
src/amd/vulkan/radix_sort/common/vk/barrier.h
Normal file
72
src/amd/vulkan/radix_sort/common/vk/barrier.h
Normal file
@ -0,0 +1,72 @@
|
||||
// Copyright 2019 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef SRC_GRAPHICS_LIB_COMPUTE_COMMON_VK_BARRIER_H_
|
||||
#define SRC_GRAPHICS_LIB_COMPUTE_COMMON_VK_BARRIER_H_
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include <vulkan/vulkan_core.h>
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
void
|
||||
vk_barrier_compute_w_to_compute_r(VkCommandBuffer cb);
|
||||
|
||||
void
|
||||
vk_barrier_compute_w_to_transfer_r(VkCommandBuffer cb);
|
||||
|
||||
void
|
||||
vk_barrier_transfer_w_to_compute_r(VkCommandBuffer cb);
|
||||
|
||||
void
|
||||
vk_barrier_transfer_w_to_compute_w(VkCommandBuffer cb);
|
||||
|
||||
void
|
||||
vk_barrier_compute_w_to_indirect_compute_r(VkCommandBuffer cb);
|
||||
|
||||
void
|
||||
vk_barrier_transfer_w_compute_w_to_transfer_r(VkCommandBuffer cb);
|
||||
|
||||
void
|
||||
vk_barrier_compute_w_to_host_r(VkCommandBuffer cb);
|
||||
|
||||
void
|
||||
vk_barrier_transfer_w_to_host_r(VkCommandBuffer cb);
|
||||
|
||||
void
|
||||
vk_memory_barrier(VkCommandBuffer cb,
|
||||
VkPipelineStageFlags src_stage,
|
||||
VkAccessFlags src_mask,
|
||||
VkPipelineStageFlags dst_stage,
|
||||
VkAccessFlags dst_mask);
|
||||
|
||||
void
|
||||
vk_barrier_debug(VkCommandBuffer cb);
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#endif // SRC_GRAPHICS_LIB_COMPUTE_COMMON_VK_BARRIER_H_
|
40
src/amd/vulkan/radix_sort/meson.build
Normal file
40
src/amd/vulkan/radix_sort/meson.build
Normal file
@ -0,0 +1,40 @@
|
||||
# Copyright © 2022 Konstantin Seurer
|
||||
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
subdir('shaders')
|
||||
|
||||
radix_sort_files = files(
|
||||
'common/vk/assert.c',
|
||||
'common/vk/assert.h',
|
||||
'common/vk/barrier.c',
|
||||
'common/vk/barrier.h',
|
||||
'common/macros.h',
|
||||
'common/util.c',
|
||||
'common/util.h',
|
||||
'shaders/push.h',
|
||||
'targets/u64/config.h',
|
||||
'radix_sort_vk_devaddr.h',
|
||||
'radix_sort_vk_ext.h',
|
||||
'radix_sort_vk.c',
|
||||
'radix_sort_vk.h',
|
||||
'radv_radix_sort.c',
|
||||
'radv_radix_sort.h',
|
||||
'target.h'
|
||||
)
|
1240
src/amd/vulkan/radix_sort/radix_sort_vk.c
Normal file
1240
src/amd/vulkan/radix_sort/radix_sort_vk.c
Normal file
File diff suppressed because it is too large
Load Diff
384
src/amd/vulkan/radix_sort/radix_sort_vk.h
Normal file
384
src/amd/vulkan/radix_sort/radix_sort_vk.h
Normal file
@ -0,0 +1,384 @@
|
||||
// Copyright 2021 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_INCLUDE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_H_
|
||||
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_INCLUDE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_H_
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include <vulkan/vulkan_core.h>
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include "target.h"
|
||||
|
||||
//
|
||||
// Radix Sort Vk is a high-performance sorting library for Vulkan 1.2.
|
||||
//
|
||||
// The sorting function is both directly and indirectly dispatchable.
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//
|
||||
// Get a Radix Sort target's Vulkan requirements.
|
||||
//
|
||||
// A Radix Sort target is a binary image containing configuration parameters and
|
||||
// a bundle of SPIR-V modules.
|
||||
//
|
||||
// Targets are prebuilt and specific to a particular device vendor, architecture
|
||||
// and key-val configuration.
|
||||
//
|
||||
// A Radix Sort instance can only be created with a VkDevice that is initialized
|
||||
// with all of the target's required extensions and features.
|
||||
//
|
||||
// The `radix_sort_vk_target_get_requirements()` function yields the extensions
|
||||
// and initialized feature flags required by a Radix Sort target.
|
||||
//
|
||||
// These requirements can be merged with other Vulkan library requirements
|
||||
// before VkDevice creation.
|
||||
//
|
||||
// If the `.ext_names` member is NULL, the `.ext_name_count` member will be
|
||||
// initialized.
|
||||
//
|
||||
// Returns `false` if:
|
||||
//
|
||||
// * The .ext_names field is NULL and the number of required extensions is
|
||||
// greater than zero.
|
||||
// * The .ext_name_count is less than the number of required extensions is
|
||||
// greater than zero.
|
||||
// * Any of the .pdf, .pdf11 or .pdf12 members are NULL.
|
||||
//
|
||||
// Otherwise, returns true.
|
||||
//
|
||||
typedef struct radix_sort_vk_target radix_sort_vk_target_t;
|
||||
|
||||
//
|
||||
// NOTE: The library currently supports uint32_t and uint64_t keyvals.
|
||||
//
|
||||
|
||||
#define RS_KV_DWORDS_MAX 2
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
struct rs_pipeline_layout_scatter
|
||||
{
|
||||
VkPipelineLayout even;
|
||||
VkPipelineLayout odd;
|
||||
};
|
||||
|
||||
struct rs_pipeline_scatter
|
||||
{
|
||||
VkPipeline even;
|
||||
VkPipeline odd;
|
||||
};
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
struct rs_pipeline_layouts_named
|
||||
{
|
||||
VkPipelineLayout init;
|
||||
VkPipelineLayout fill;
|
||||
VkPipelineLayout histogram;
|
||||
VkPipelineLayout prefix;
|
||||
struct rs_pipeline_layout_scatter scatter[RS_KV_DWORDS_MAX];
|
||||
};
|
||||
|
||||
struct rs_pipelines_named
|
||||
{
|
||||
VkPipeline init;
|
||||
VkPipeline fill;
|
||||
VkPipeline histogram;
|
||||
VkPipeline prefix;
|
||||
struct rs_pipeline_scatter scatter[RS_KV_DWORDS_MAX];
|
||||
};
|
||||
|
||||
// clang-format off
|
||||
#define RS_PIPELINE_LAYOUTS_HANDLES (sizeof(struct rs_pipeline_layouts_named) / sizeof(VkPipelineLayout))
|
||||
#define RS_PIPELINES_HANDLES (sizeof(struct rs_pipelines_named) / sizeof(VkPipeline))
|
||||
// clang-format on
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
struct radix_sort_vk
|
||||
{
|
||||
struct radix_sort_vk_target_config config;
|
||||
|
||||
union
|
||||
{
|
||||
struct rs_pipeline_layouts_named named;
|
||||
VkPipelineLayout handles[RS_PIPELINE_LAYOUTS_HANDLES];
|
||||
} pipeline_layouts;
|
||||
|
||||
union
|
||||
{
|
||||
struct rs_pipelines_named named;
|
||||
VkPipeline handles[RS_PIPELINES_HANDLES];
|
||||
} pipelines;
|
||||
|
||||
struct
|
||||
{
|
||||
struct
|
||||
{
|
||||
VkDeviceSize offset;
|
||||
VkDeviceSize range;
|
||||
} histograms;
|
||||
|
||||
struct
|
||||
{
|
||||
VkDeviceSize offset;
|
||||
} partitions;
|
||||
|
||||
} internal;
|
||||
};
|
||||
|
||||
//
|
||||
// Create a Radix Sort instance for a target.(VkCommandBuffer cb,
|
||||
//
|
||||
// Keyval size is implicitly determined by the target.
|
||||
//
|
||||
// Returns NULL on failure.
|
||||
//
|
||||
typedef struct radix_sort_vk radix_sort_vk_t;
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
radix_sort_vk_t *
|
||||
radix_sort_vk_create(VkDevice device,
|
||||
VkAllocationCallbacks const * ac,
|
||||
VkPipelineCache pc,
|
||||
const uint32_t* const* spv,
|
||||
const uint32_t* spv_sizes,
|
||||
struct radix_sort_vk_target_config config);
|
||||
|
||||
//
|
||||
// Destroy the Radix Sort instance using the same device and allocator used at
|
||||
// creation.
|
||||
//
|
||||
void
|
||||
radix_sort_vk_destroy(radix_sort_vk_t * rs, //
|
||||
VkDevice d,
|
||||
VkAllocationCallbacks const * ac);
|
||||
|
||||
//
|
||||
// Returns the buffer size and alignment requirements for a maximum number of
|
||||
// keyvals.
|
||||
//
|
||||
// The radix sort implementation is not an in-place sorting algorithm so two
|
||||
// non-overlapping keyval buffers are required that are at least
|
||||
// `.keyvals_size`.
|
||||
//
|
||||
// The radix sort instance also requires an `internal` buffer during sorting.
|
||||
//
|
||||
// If the indirect dispatch sorting function is used, then an `indirect` buffer
|
||||
// is also required.
|
||||
//
|
||||
// The alignment requirements for the keyval, internal, and indirect buffers
|
||||
// must be honored. All alignments are power of 2.
|
||||
//
|
||||
// Input:
|
||||
// count : Maximum number of keyvals
|
||||
//
|
||||
// Outputs:
|
||||
// keyval_size : Size of a single keyval
|
||||
//
|
||||
// keyvals_size : Minimum size of the even and odd keyval buffers
|
||||
// keyvals_alignment : Alignment of each keyval buffer
|
||||
//
|
||||
// internal_size : Minimum size of internal buffer
|
||||
// internal_aligment : Alignment of the internal buffer
|
||||
//
|
||||
// indirect_size : Minimum size of indirect buffer
|
||||
// indirect_aligment : Alignment of the indirect buffer
|
||||
//
|
||||
// .keyvals_even/odd
|
||||
// -----------------
|
||||
// VK_BUFFER_USAGE_STORAGE_BUFFER_BIT
|
||||
// VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
|
||||
//
|
||||
// .internal
|
||||
// ---------
|
||||
// VK_BUFFER_USAGE_STORAGE_BUFFER_BIT
|
||||
// VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
|
||||
// VK_BUFFER_USAGE_TRANSFER_DST_BIT ("direct" mode only)
|
||||
//
|
||||
// .indirect
|
||||
// ---------
|
||||
// VK_BUFFER_USAGE_STORAGE_BUFFER_BIT
|
||||
// VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
|
||||
// VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT
|
||||
//
|
||||
typedef struct radix_sort_vk_memory_requirements
|
||||
{
|
||||
VkDeviceSize keyval_size;
|
||||
|
||||
VkDeviceSize keyvals_size;
|
||||
VkDeviceSize keyvals_alignment;
|
||||
|
||||
VkDeviceSize internal_size;
|
||||
VkDeviceSize internal_alignment;
|
||||
|
||||
VkDeviceSize indirect_size;
|
||||
VkDeviceSize indirect_alignment;
|
||||
} radix_sort_vk_memory_requirements_t;
|
||||
|
||||
void
|
||||
radix_sort_vk_get_memory_requirements(radix_sort_vk_t const * rs,
|
||||
uint32_t count,
|
||||
radix_sort_vk_memory_requirements_t * mr);
|
||||
|
||||
//
|
||||
// Direct dispatch sorting
|
||||
// -----------------------
|
||||
//
|
||||
// Using a key size of `key_bits`, sort `count` keyvals found in the
|
||||
// `.devaddr_keyvals_even` buffer.
|
||||
//
|
||||
// Each internal sorting pass copies the keyvals from one keyvals buffer to the
|
||||
// other.
|
||||
//
|
||||
// The number of internal sorting passes is determined by `.key_bits`.
|
||||
//
|
||||
// If an even number of internal sorting passes is required, the sorted keyvals
|
||||
// will be found in the "even" keyvals buffer. Otherwise, the sorted keyvals
|
||||
// will be found in the "odd" keyvals buffer.
|
||||
//
|
||||
// Which buffer has the sorted keyvals is returned in `keyvals_sorted`.
|
||||
//
|
||||
// A keyval's `key_bits` are the most significant bits of a keyval.
|
||||
//
|
||||
// The maximum number of key bits is determined by the keyval size.
|
||||
//
|
||||
// The keyval count must be less than (1 << 30) as well as be less than or equal
|
||||
// to the count used to obtain the the memory requirements.
|
||||
//
|
||||
// The info struct's `ext` member must be NULL.
|
||||
//
|
||||
// This function appends push constants, dispatch commands, and barriers.
|
||||
//
|
||||
// Pipeline barriers should be applied as necessary, both before and after
|
||||
// invoking this function.
|
||||
//
|
||||
// The sort begins with either a TRANSFER/WRITE or a COMPUTE/READ to the
|
||||
// `internal` and `keyvals_even` buffers.
|
||||
//
|
||||
// The sort ends with a COMPUTE/WRITE to the `internal` and `keyvals_sorted`
|
||||
// buffers.
|
||||
//
|
||||
|
||||
//
|
||||
// Direct dispatch sorting using VkDescriptorBufferInfo structures
|
||||
// ---------------------------------------------------------------
|
||||
//
|
||||
typedef struct radix_sort_vk_sort_info
|
||||
{
|
||||
void * ext;
|
||||
uint32_t key_bits;
|
||||
uint32_t count;
|
||||
VkDescriptorBufferInfo keyvals_even;
|
||||
VkDescriptorBufferInfo keyvals_odd;
|
||||
VkDescriptorBufferInfo internal;
|
||||
} radix_sort_vk_sort_info_t;
|
||||
|
||||
void
|
||||
radix_sort_vk_sort(radix_sort_vk_t const * rs,
|
||||
radix_sort_vk_sort_info_t const * info,
|
||||
VkDevice device,
|
||||
VkCommandBuffer cb,
|
||||
VkDescriptorBufferInfo * keyvals_sorted);
|
||||
|
||||
//
|
||||
// Indirect dispatch sorting
|
||||
// -------------------------
|
||||
//
|
||||
// Using a key size of `key_bits`, at pipeline execution time, load keyvals
|
||||
// count from `devaddr_count` and sorts the keyvals in `.devaddr_keyvals_even`.
|
||||
//
|
||||
// Each internal sorting pass copies the keyvals from one keyvals buffer to the
|
||||
// other.
|
||||
//
|
||||
// The number of internal sorting passes is determined by `.key_bits`.
|
||||
//
|
||||
// If an even number of internal sorting passes is required, the sorted keyvals
|
||||
// will be found in the "even" keyvals buffer. Otherwise, the sorted keyvals
|
||||
// will be found in the "odd" keyvals buffer.
|
||||
//
|
||||
// Which buffer has the sorted keyvals is returned in `keyvals_sorted`.
|
||||
//
|
||||
// A keyval's `key_bits` are the most significant bits of a keyval.
|
||||
//
|
||||
// The keyval count must be less than (1 << 30) as well as be less than or equal
|
||||
// to the count used to obtain the the memory requirements.
|
||||
//
|
||||
// The info struct's `ext` member must be NULL.
|
||||
//
|
||||
// This function appends push constants, dispatch commands, and barriers.
|
||||
//
|
||||
// Pipeline barriers should be applied as necessary, both before and after
|
||||
// invoking this function.
|
||||
//
|
||||
// The indirect radix sort begins with a COMPUTE/READ from the `count` buffer
|
||||
// and ends with a COMPUTE/WRITE to the `internal` and the `keyvals_sorted`
|
||||
// buffers.
|
||||
//
|
||||
// The `indirect` buffer must support USAGE_INDIRECT.
|
||||
//
|
||||
// The `count` buffer must be at least 4 bytes and 4-byte aligned.
|
||||
//
|
||||
|
||||
//
|
||||
// Indirect dispatch sorting using VkDescriptorBufferInfo structures
|
||||
// -----------------------------------------------------------------
|
||||
//
|
||||
typedef struct radix_sort_vk_sort_indirect_info
|
||||
{
|
||||
void * ext;
|
||||
uint32_t key_bits;
|
||||
VkDescriptorBufferInfo count;
|
||||
VkDescriptorBufferInfo keyvals_even;
|
||||
VkDescriptorBufferInfo keyvals_odd;
|
||||
VkDescriptorBufferInfo internal;
|
||||
VkDescriptorBufferInfo indirect;
|
||||
} radix_sort_vk_sort_indirect_info_t;
|
||||
|
||||
void
|
||||
radix_sort_vk_sort_indirect(radix_sort_vk_t const * rs,
|
||||
radix_sort_vk_sort_indirect_info_t const * info,
|
||||
VkDevice device,
|
||||
VkCommandBuffer cb,
|
||||
VkDescriptorBufferInfo * keyvals_sorted);
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_INCLUDE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_H_
|
104
src/amd/vulkan/radix_sort/radix_sort_vk_devaddr.h
Normal file
104
src/amd/vulkan/radix_sort/radix_sort_vk_devaddr.h
Normal file
@ -0,0 +1,104 @@
|
||||
// Copyright 2021 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_INCLUDE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_DEVADDR_H_
|
||||
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_INCLUDE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_DEVADDR_H_
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include "radix_sort_vk.h"
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//
|
||||
// Structure that enables integration with Vulkan drivers.
|
||||
//
|
||||
typedef struct radix_sort_vk_buffer_info
|
||||
{
|
||||
VkBuffer buffer;
|
||||
VkDeviceSize offset;
|
||||
VkDeviceAddress devaddr;
|
||||
} radix_sort_vk_buffer_info_t;
|
||||
|
||||
//
|
||||
// Function prototypes
|
||||
//
|
||||
typedef void (*radix_sort_vk_fill_buffer_pfn)(VkCommandBuffer cb,
|
||||
radix_sort_vk_buffer_info_t const * buffer_info,
|
||||
VkDeviceSize offset,
|
||||
VkDeviceSize size,
|
||||
uint32_t data);
|
||||
|
||||
typedef void (*radix_sort_vk_dispatch_indirect_pfn)(VkCommandBuffer cb,
|
||||
radix_sort_vk_buffer_info_t const * buffer_info,
|
||||
VkDeviceSize offset);
|
||||
|
||||
//
|
||||
// Direct dispatch sorting using buffer device addresses
|
||||
// -----------------------------------------------------
|
||||
//
|
||||
typedef struct radix_sort_vk_sort_devaddr_info
|
||||
{
|
||||
void * ext;
|
||||
uint32_t key_bits;
|
||||
uint32_t count;
|
||||
radix_sort_vk_buffer_info_t keyvals_even;
|
||||
VkDeviceAddress keyvals_odd;
|
||||
radix_sort_vk_buffer_info_t internal;
|
||||
radix_sort_vk_fill_buffer_pfn fill_buffer;
|
||||
} radix_sort_vk_sort_devaddr_info_t;
|
||||
|
||||
void
|
||||
radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs,
|
||||
radix_sort_vk_sort_devaddr_info_t const * info,
|
||||
VkDevice device,
|
||||
VkCommandBuffer cb,
|
||||
VkDeviceAddress * keyvals_sorted);
|
||||
|
||||
//
|
||||
// Indirect dispatch sorting using buffer device addresses
|
||||
// -------------------------------------------------------
|
||||
//
|
||||
// clang-format off
|
||||
//
|
||||
typedef struct radix_sort_vk_sort_indirect_devaddr_info
|
||||
{
|
||||
void * ext;
|
||||
uint32_t key_bits;
|
||||
VkDeviceAddress count;
|
||||
VkDeviceAddress keyvals_even;
|
||||
VkDeviceAddress keyvals_odd;
|
||||
VkDeviceAddress internal;
|
||||
radix_sort_vk_buffer_info_t indirect;
|
||||
radix_sort_vk_dispatch_indirect_pfn dispatch_indirect;
|
||||
} radix_sort_vk_sort_indirect_devaddr_info_t;
|
||||
|
||||
void
|
||||
radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * rs,
|
||||
radix_sort_vk_sort_indirect_devaddr_info_t const * info,
|
||||
VkDevice device,
|
||||
VkCommandBuffer cb,
|
||||
VkDeviceAddress * keyvals_sorted);
|
||||
|
||||
//
|
||||
// clang-format on
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_INCLUDE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_DEVADDR_H_
|
77
src/amd/vulkan/radix_sort/radix_sort_vk_ext.h
Normal file
77
src/amd/vulkan/radix_sort/radix_sort_vk_ext.h
Normal file
@ -0,0 +1,77 @@
|
||||
// Copyright 2021 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_EXT_H_
|
||||
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_EXT_H_
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include <vulkan/vulkan_core.h>
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//
|
||||
// Radix sort extensions
|
||||
// ---------------------
|
||||
//
|
||||
#ifndef RADIX_SORT_VK_DISABLE_EXTENSIONS
|
||||
|
||||
//
|
||||
// Extension types
|
||||
//
|
||||
enum radix_sort_vk_ext_type
|
||||
{
|
||||
RADIX_SORT_VK_EXT_TIMESTAMPS
|
||||
};
|
||||
|
||||
//
|
||||
// Timestamp each logical step of the algorithm
|
||||
//
|
||||
// Number of timestamps is: 5 + (number of subpasses)
|
||||
//
|
||||
// * direct dispatch: 4 + subpass count
|
||||
// * indirect dispatch: 5 + subpass count
|
||||
//
|
||||
// Indirect / 32-bit keyvals: 9
|
||||
// Indirect / 64-bit keyvals: 13
|
||||
//
|
||||
struct radix_sort_vk_ext_timestamps
|
||||
{
|
||||
void * ext;
|
||||
enum radix_sort_vk_ext_type type;
|
||||
uint32_t timestamp_count;
|
||||
VkQueryPool timestamps;
|
||||
uint32_t timestamps_set;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//dsc
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_EXT_H_
|
193
src/amd/vulkan/radix_sort/radv_radix_sort.c
Normal file
193
src/amd/vulkan/radix_sort/radv_radix_sort.c
Normal file
@ -0,0 +1,193 @@
|
||||
/*
|
||||
* Copyright © 2022 Konstantin Seurer
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "radv_radix_sort.h"
|
||||
#include "targets/u64/config.h"
|
||||
#include "radv_private.h"
|
||||
#include "target.h"
|
||||
|
||||
static const uint32_t init_spv[] = {
|
||||
#include "radix_sort/shaders/init.comp.spv.h"
|
||||
};
|
||||
|
||||
static const uint32_t fill_spv[] = {
|
||||
#include "radix_sort/shaders/fill.comp.spv.h"
|
||||
};
|
||||
|
||||
static const uint32_t histogram_spv[] = {
|
||||
#include "radix_sort/shaders/histogram.comp.spv.h"
|
||||
};
|
||||
|
||||
static const uint32_t prefix_spv[] = {
|
||||
#include "radix_sort/shaders/prefix.comp.spv.h"
|
||||
};
|
||||
|
||||
static const uint32_t scatter_0_even_spv[] = {
|
||||
#include "radix_sort/shaders/scatter_0_even.comp.spv.h"
|
||||
};
|
||||
|
||||
static const uint32_t scatter_0_odd_spv[] = {
|
||||
#include "radix_sort/shaders/scatter_0_odd.comp.spv.h"
|
||||
};
|
||||
|
||||
static const uint32_t scatter_1_even_spv[] = {
|
||||
#include "radix_sort/shaders/scatter_1_even.comp.spv.h"
|
||||
};
|
||||
|
||||
static const uint32_t scatter_1_odd_spv[] = {
|
||||
#include "radix_sort/shaders/scatter_1_odd.comp.spv.h"
|
||||
};
|
||||
|
||||
static const struct radix_sort_vk_target_config target_config = {
|
||||
.keyval_dwords = RS_KEYVAL_DWORDS,
|
||||
|
||||
.histogram =
|
||||
{
|
||||
.workgroup_size_log2 = RS_HISTOGRAM_WORKGROUP_SIZE_LOG2,
|
||||
.subgroup_size_log2 = RS_HISTOGRAM_SUBGROUP_SIZE_LOG2,
|
||||
.block_rows = RS_HISTOGRAM_BLOCK_ROWS,
|
||||
},
|
||||
|
||||
.prefix =
|
||||
{
|
||||
.workgroup_size_log2 = RS_PREFIX_WORKGROUP_SIZE_LOG2,
|
||||
.subgroup_size_log2 = RS_PREFIX_SUBGROUP_SIZE_LOG2,
|
||||
},
|
||||
|
||||
.scatter =
|
||||
{
|
||||
.workgroup_size_log2 = RS_SCATTER_WORKGROUP_SIZE_LOG2,
|
||||
.subgroup_size_log2 = RS_SCATTER_SUBGROUP_SIZE_LOG2,
|
||||
.block_rows = RS_SCATTER_BLOCK_ROWS,
|
||||
},
|
||||
};
|
||||
|
||||
radix_sort_vk_t *
|
||||
radv_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac, VkPipelineCache pc)
|
||||
{
|
||||
const uint32_t *spv[8] = {
|
||||
init_spv, fill_spv, histogram_spv, prefix_spv,
|
||||
scatter_0_even_spv, scatter_0_odd_spv, scatter_1_even_spv, scatter_1_odd_spv,
|
||||
};
|
||||
const uint32_t spv_sizes[8] = {
|
||||
sizeof(init_spv), sizeof(fill_spv), sizeof(histogram_spv),
|
||||
sizeof(prefix_spv), sizeof(scatter_0_even_spv), sizeof(scatter_0_odd_spv),
|
||||
sizeof(scatter_1_even_spv), sizeof(scatter_1_odd_spv),
|
||||
};
|
||||
return radix_sort_vk_create(device, ac, pc, spv, spv_sizes, target_config);
|
||||
}
|
||||
|
||||
VKAPI_ATTR VkResult VKAPI_CALL
|
||||
vkCreateShaderModule(VkDevice device, const VkShaderModuleCreateInfo *pCreateInfo,
|
||||
const VkAllocationCallbacks *pAllocator, VkShaderModule *pShaderModule)
|
||||
{
|
||||
RADV_FROM_HANDLE(radv_device, pdevice, device);
|
||||
return pdevice->vk.dispatch_table.CreateShaderModule(device, pCreateInfo, pAllocator,
|
||||
pShaderModule);
|
||||
}
|
||||
|
||||
VKAPI_ATTR void VKAPI_CALL
|
||||
vkDestroyShaderModule(VkDevice device, VkShaderModule shaderModule,
|
||||
const VkAllocationCallbacks *pAllocator)
|
||||
{
|
||||
RADV_FROM_HANDLE(radv_device, pdevice, device);
|
||||
pdevice->vk.dispatch_table.DestroyShaderModule(device, shaderModule, pAllocator);
|
||||
}
|
||||
|
||||
VKAPI_ATTR VkResult VKAPI_CALL
|
||||
vkCreatePipelineLayout(VkDevice device, const VkPipelineLayoutCreateInfo *pCreateInfo,
|
||||
const VkAllocationCallbacks *pAllocator, VkPipelineLayout *pPipelineLayout)
|
||||
{
|
||||
RADV_FROM_HANDLE(radv_device, pdevice, device);
|
||||
return pdevice->vk.dispatch_table.CreatePipelineLayout(device, pCreateInfo, pAllocator,
|
||||
pPipelineLayout);
|
||||
}
|
||||
|
||||
VKAPI_ATTR void VKAPI_CALL
|
||||
vkDestroyPipelineLayout(VkDevice device, VkPipelineLayout pipelineLayout,
|
||||
const VkAllocationCallbacks *pAllocator)
|
||||
{
|
||||
RADV_FROM_HANDLE(radv_device, pdevice, device);
|
||||
pdevice->vk.dispatch_table.DestroyPipelineLayout(device, pipelineLayout, pAllocator);
|
||||
}
|
||||
|
||||
VKAPI_ATTR VkResult VKAPI_CALL
|
||||
vkCreateComputePipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t createInfoCount,
|
||||
const VkComputePipelineCreateInfo *pCreateInfos,
|
||||
const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines)
|
||||
{
|
||||
RADV_FROM_HANDLE(radv_device, pdevice, device);
|
||||
return pdevice->vk.dispatch_table.CreateComputePipelines(device, pipelineCache, createInfoCount,
|
||||
pCreateInfos, pAllocator, pPipelines);
|
||||
}
|
||||
|
||||
VKAPI_ATTR void VKAPI_CALL
|
||||
vkDestroyPipeline(VkDevice device, VkPipeline pipeline, const VkAllocationCallbacks *pAllocator)
|
||||
{
|
||||
RADV_FROM_HANDLE(radv_device, pdevice, device);
|
||||
return pdevice->vk.dispatch_table.DestroyPipeline(device, pipeline, pAllocator);
|
||||
}
|
||||
|
||||
VKAPI_ATTR void VKAPI_CALL
|
||||
vkCmdPipelineBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags srcStageMask,
|
||||
VkPipelineStageFlags dstStageMask, VkDependencyFlags dependencyFlags,
|
||||
uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers,
|
||||
uint32_t bufferMemoryBarrierCount,
|
||||
const VkBufferMemoryBarrier *pBufferMemoryBarriers,
|
||||
uint32_t imageMemoryBarrierCount,
|
||||
const VkImageMemoryBarrier *pImageMemoryBarriers)
|
||||
{
|
||||
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
|
||||
cmd_buffer->device->vk.dispatch_table.CmdPipelineBarrier(
|
||||
commandBuffer, srcStageMask, dstStageMask, dependencyFlags, memoryBarrierCount,
|
||||
pMemoryBarriers, bufferMemoryBarrierCount, pBufferMemoryBarriers, imageMemoryBarrierCount,
|
||||
pImageMemoryBarriers);
|
||||
}
|
||||
|
||||
VKAPI_ATTR void VKAPI_CALL
|
||||
vkCmdPushConstants(VkCommandBuffer commandBuffer, VkPipelineLayout layout,
|
||||
VkShaderStageFlags stageFlags, uint32_t offset, uint32_t size,
|
||||
const void *pValues)
|
||||
{
|
||||
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
|
||||
cmd_buffer->device->vk.dispatch_table.CmdPushConstants(commandBuffer, layout, stageFlags, offset,
|
||||
size, pValues);
|
||||
}
|
||||
|
||||
VKAPI_ATTR void VKAPI_CALL
|
||||
vkCmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
|
||||
VkPipeline pipeline)
|
||||
{
|
||||
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
|
||||
cmd_buffer->device->vk.dispatch_table.CmdBindPipeline(commandBuffer, pipelineBindPoint,
|
||||
pipeline);
|
||||
}
|
||||
|
||||
VKAPI_ATTR void VKAPI_CALL
|
||||
vkCmdDispatch(VkCommandBuffer commandBuffer, uint32_t groupCountX, uint32_t groupCountY,
|
||||
uint32_t groupCountZ)
|
||||
{
|
||||
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
|
||||
cmd_buffer->device->vk.dispatch_table.CmdDispatch(commandBuffer, groupCountX, groupCountY,
|
||||
groupCountZ);
|
||||
}
|
32
src/amd/vulkan/radix_sort/radv_radix_sort.h
Normal file
32
src/amd/vulkan/radix_sort/radv_radix_sort.h
Normal file
@ -0,0 +1,32 @@
|
||||
/*
|
||||
* Copyright © 2022 Konstantin Seurer
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef RADV_RADIX_SORT_H
|
||||
#define RADV_RADIX_SORT_H
|
||||
|
||||
#include "radix_sort_vk_devaddr.h"
|
||||
|
||||
radix_sort_vk_t *radv_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac,
|
||||
VkPipelineCache pc);
|
||||
|
||||
#endif
|
151
src/amd/vulkan/radix_sort/shaders/bufref.h
Normal file
151
src/amd/vulkan/radix_sort/shaders/bufref.h
Normal file
@ -0,0 +1,151 @@
|
||||
// Copyright 2021 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_BUFREF_H_
|
||||
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_BUFREF_H_
|
||||
|
||||
//
|
||||
// GLSL
|
||||
//
|
||||
|
||||
#ifdef VULKAN // defined by GLSL/VK compiler
|
||||
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
//
|
||||
// If the target does not support VkPhysicalDeviceFeatures.shaderInt64
|
||||
// then:
|
||||
//
|
||||
// #define RS_DISABLE_SHADER_INT64
|
||||
//
|
||||
// clang-format off
|
||||
#ifdef RS_DISABLE_SHADER_INT64
|
||||
#extension GL_EXT_buffer_reference_uvec2 : require
|
||||
#else
|
||||
#extension GL_EXT_buffer_reference2 : require
|
||||
#endif
|
||||
// clang-format on
|
||||
|
||||
//
|
||||
// Restrict shouldn't have any noticeable impact on these kernels and
|
||||
// benchmarks appear to prove that true but it's correct to include
|
||||
// the qualifier.
|
||||
//
|
||||
#define RS_RESTRICT restrict
|
||||
|
||||
//
|
||||
// If the device doesn't support .shaderInt64 then the buffer reference address
|
||||
// is a uvec2.
|
||||
//
|
||||
#ifdef RS_DISABLE_SHADER_INT64
|
||||
#define RS_DEVADDR u32vec2
|
||||
#else
|
||||
#define RS_DEVADDR uint64_t
|
||||
#endif
|
||||
|
||||
//
|
||||
// Define a buffer reference.
|
||||
//
|
||||
#define RS_BUFREF_DEFINE(_layout, _name, _devaddr) RS_RESTRICT _layout _name = _layout(_devaddr)
|
||||
|
||||
//
|
||||
// Define a buffer reference at a UINT32 offset.
|
||||
//
|
||||
#ifdef RS_DISABLE_SHADER_INT64
|
||||
#define RS_BUFREF_DEFINE_AT_OFFSET_UINT32(_layout, _name, _devaddr_u32vec2, _offset) \
|
||||
RS_RESTRICT _layout _name; \
|
||||
{ \
|
||||
u32vec2 devaddr; \
|
||||
uint32_t carry; \
|
||||
\
|
||||
devaddr.x = uaddCarry(_devaddr_u32vec2.x, _offset, carry); \
|
||||
devaddr.y = _devaddr_u32vec2.y + carry; \
|
||||
\
|
||||
_name = _layout(devaddr); \
|
||||
}
|
||||
#else
|
||||
#define RS_BUFREF_DEFINE_AT_OFFSET_UINT32(_layout, _name, _devaddr, _offset) \
|
||||
RS_RESTRICT _layout _name = _layout(_devaddr + _offset)
|
||||
#endif
|
||||
|
||||
//
|
||||
// Define a buffer reference at a packed UINT64 offset.
|
||||
//
|
||||
#ifdef RS_DISABLE_SHADER_INT64
|
||||
#define RS_BUFREF_DEFINE_AT_OFFSET_U32VEC2(_layout, _name, _devaddr_u32vec2, _offset_u32vec2) \
|
||||
RS_RESTRICT _layout _name; \
|
||||
{ \
|
||||
u32vec2 devaddr; \
|
||||
uint32_t carry; \
|
||||
\
|
||||
devaddr.x = uaddCarry(_devaddr_u32vec2.x, _offset_u32vec2.x, carry); \
|
||||
devaddr.y = _devaddr_u32vec2.y + _offset_u32vec2.y + carry; \
|
||||
\
|
||||
_name = _layout(devaddr); \
|
||||
}
|
||||
#else
|
||||
#define RS_BUFREF_DEFINE_AT_OFFSET_U32VEC2(_layout, _name, _devaddr, _offset_u32vec2) \
|
||||
RS_RESTRICT _layout _name = _layout(_devaddr + pack64(_offset_u32vec2))
|
||||
#endif
|
||||
|
||||
//
|
||||
// Increment the buffer reference by a UINT32 offset.
|
||||
//
|
||||
#ifdef RS_DISABLE_SHADER_INT64
|
||||
#define RS_BUFREF_INC_UINT32(_layout, _name, _inc) \
|
||||
{ \
|
||||
u32vec2 devaddr = u32vec2(_name); \
|
||||
uint32_t carry; \
|
||||
\
|
||||
devaddr.x = uaddCarry(devaddr.x, _inc, carry); \
|
||||
devaddr.y = devaddr.y + carry; \
|
||||
\
|
||||
_name = _layout(devaddr); \
|
||||
}
|
||||
#else
|
||||
#define RS_BUFREF_INC_UINT32(_layout, _name, _inc) _name = _layout(uint64_t(_name) + _inc)
|
||||
#endif
|
||||
|
||||
//
|
||||
// Increment the buffer reference by a packed UINT64 offset.
|
||||
//
|
||||
#ifdef RS_DISABLE_SHADER_INT64
|
||||
#define RS_BUFREF_INC_U32VEC2(_layout, _name, _inc_u32vec2) \
|
||||
{ \
|
||||
u32vec2 devaddr = u32vec2(_name); \
|
||||
uint32_t carry; \
|
||||
\
|
||||
devaddr.x = uaddCarry(devaddr.x, _inc_u32vec2.x, carry); \
|
||||
devaddr.y = devaddr.y + _inc_u32vec2.y + carry; \
|
||||
\
|
||||
_name = _layout(devaddr); \
|
||||
}
|
||||
#else
|
||||
#define RS_BUFREF_INC_U32VEC2(_layout, _name, _inc_u32vec2) \
|
||||
_name = _layout(uint64_t(_name) + pack64(_inc_u32vec2))
|
||||
#endif
|
||||
|
||||
//
|
||||
// Increment the buffer reference by the product of two UINT32 factors.
|
||||
//
|
||||
#define RS_BUFREF_INC_UINT32_UINT32(_layout, _name, _inc_a, _inc_b) \
|
||||
{ \
|
||||
u32vec2 inc; \
|
||||
\
|
||||
umulExtended(_inc_a, _inc_b, inc.y, inc.x); \
|
||||
\
|
||||
RS_BUFREF_INC_U32VEC2(_layout, _name, inc); \
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_BUFREF_H_
|
143
src/amd/vulkan/radix_sort/shaders/fill.comp
Normal file
143
src/amd/vulkan/radix_sort/shaders/fill.comp
Normal file
@ -0,0 +1,143 @@
|
||||
// Copyright 2021 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#version 460
|
||||
|
||||
//
|
||||
// Each workgroup fills up to RS_BLOCK_KEYVALS
|
||||
//
|
||||
|
||||
// clang-format off
|
||||
#extension GL_GOOGLE_include_directive : require
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
// clang-format on
|
||||
|
||||
//
|
||||
// Load arch/keyval configuration
|
||||
//
|
||||
#include "config.h"
|
||||
|
||||
//
|
||||
// Buffer reference macros and push constants
|
||||
//
|
||||
#include "bufref.h"
|
||||
#include "push.h"
|
||||
|
||||
//
|
||||
// Subgroup uniform support
|
||||
//
|
||||
#if defined(RS_SCATTER_SUBGROUP_UNIFORM_DISABLE) && defined(GL_EXT_subgroupuniform_qualifier)
|
||||
#extension GL_EXT_subgroupuniform_qualifier : required
|
||||
#define RS_SUBGROUP_UNIFORM subgroupuniformEXT
|
||||
#else
|
||||
#define RS_SUBGROUP_UNIFORM
|
||||
#endif
|
||||
|
||||
//
|
||||
// Declare the push constants
|
||||
//
|
||||
RS_STRUCT_PUSH_FILL();
|
||||
|
||||
layout(push_constant) uniform block_push
|
||||
{
|
||||
rs_push_fill push;
|
||||
};
|
||||
|
||||
//
|
||||
// The "init" shader configures the fill info structure.
|
||||
//
|
||||
RS_STRUCT_INDIRECT_INFO_FILL();
|
||||
|
||||
//
|
||||
// Check all switches are defined
|
||||
//
|
||||
#ifndef RS_FILL_WORKGROUP_SIZE_LOG2
|
||||
#error "Undefined: RS_FILL_WORKGROUP_SIZE_LOG2"
|
||||
#endif
|
||||
|
||||
//
|
||||
#ifndef RS_FILL_BLOCK_ROWS
|
||||
#error "Undefined: RS_FILL_BLOCK_ROWS"
|
||||
#endif
|
||||
|
||||
//
|
||||
// Local macros
|
||||
//
|
||||
// clang-format off
|
||||
#define RS_WORKGROUP_SIZE (1 << RS_FILL_WORKGROUP_SIZE_LOG2)
|
||||
#define RS_BLOCK_DWORDS (RS_FILL_BLOCK_ROWS * RS_WORKGROUP_SIZE)
|
||||
#define RS_RADIX_MASK ((1 << RS_RADIX_LOG2) - 1)
|
||||
// clang-format on
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
layout(local_size_x = RS_WORKGROUP_SIZE) in;
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
layout(buffer_reference, std430) buffer buffer_rs_indirect_info_fill
|
||||
{
|
||||
rs_indirect_info_fill info;
|
||||
};
|
||||
|
||||
layout(buffer_reference, std430) buffer buffer_rs_dwords
|
||||
{
|
||||
uint32_t extent[];
|
||||
};
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
void
|
||||
main()
|
||||
{
|
||||
//
|
||||
// Define indirect info bufref for the fill
|
||||
//
|
||||
readonly RS_BUFREF_DEFINE(buffer_rs_indirect_info_fill, rs_info, push.devaddr_info);
|
||||
|
||||
RS_SUBGROUP_UNIFORM const rs_indirect_info_fill info = rs_info.info;
|
||||
|
||||
//
|
||||
// Define dwords bufref
|
||||
//
|
||||
// Assumes less than 2^32-1 keys and then extended multiplies it by
|
||||
// the keyval size.
|
||||
//
|
||||
// Assumes push.devaddr_dwords_base is suitably aligned to
|
||||
// RS_BLOCK_DWORDS -- at a subgroup or transaction size is fine.
|
||||
//
|
||||
const uint32_t dwords_idx =
|
||||
(info.block_offset + gl_WorkGroupID.x) * RS_BLOCK_DWORDS + gl_LocalInvocationID.x;
|
||||
|
||||
u32vec2 dwords_offset;
|
||||
|
||||
umulExtended(dwords_idx, 4, dwords_offset.y, dwords_offset.x);
|
||||
|
||||
writeonly RS_BUFREF_DEFINE_AT_OFFSET_U32VEC2(buffer_rs_dwords,
|
||||
rs_dwords,
|
||||
push.devaddr_dwords,
|
||||
dwords_offset);
|
||||
|
||||
//
|
||||
// Fills are always aligned to RS_BLOCK_KEYVALS
|
||||
//
|
||||
// ((v >= min) && (v < max)) == ((v - min) < (max - min))
|
||||
//
|
||||
const uint32_t row_idx = dwords_idx - info.dword_offset_min;
|
||||
|
||||
[[unroll]] for (uint32_t ii = 0; ii < RS_FILL_BLOCK_ROWS; ii++)
|
||||
{
|
||||
if (row_idx + (ii * RS_WORKGROUP_SIZE) < info.dword_offset_max_minus_min)
|
||||
{
|
||||
rs_dwords.extent[ii * RS_WORKGROUP_SIZE] = push.dword;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
449
src/amd/vulkan/radix_sort/shaders/histogram.comp
Normal file
449
src/amd/vulkan/radix_sort/shaders/histogram.comp
Normal file
@ -0,0 +1,449 @@
|
||||
// Copyright 2021 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#version 460
|
||||
|
||||
//
|
||||
// Produce multiple radix size histograms from the keyvals.
|
||||
//
|
||||
|
||||
// clang-format off
|
||||
#extension GL_GOOGLE_include_directive : require
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
#extension GL_KHR_shader_subgroup_basic : require
|
||||
// clang-format on
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
#include "config.h"
|
||||
|
||||
//
|
||||
// Optional switches:
|
||||
//
|
||||
// #define RS_HISTOGRAM_ENABLE_BITFIELD_EXTRACT
|
||||
// #define RS_HISTOGRAM_DISABLE_SMEM_HISTOGRAM
|
||||
//
|
||||
|
||||
//
|
||||
// Buffer reference macros and push constants
|
||||
//
|
||||
#include "bufref.h"
|
||||
#include "push.h"
|
||||
|
||||
//
|
||||
// Push constants for histogram shader
|
||||
//
|
||||
RS_STRUCT_PUSH_HISTOGRAM();
|
||||
|
||||
layout(push_constant) uniform block_push
|
||||
{
|
||||
rs_push_histogram push;
|
||||
};
|
||||
|
||||
//
|
||||
// Subgroup uniform support
|
||||
//
|
||||
#if defined(RS_HISTOGRAM_SUBGROUP_UNIFORM_DISABLE) && defined(GL_EXT_subgroupuniform_qualifier)
|
||||
#extension GL_EXT_subgroupuniform_qualifier : required
|
||||
#define RS_SUBGROUP_UNIFORM subgroupuniformEXT
|
||||
#else
|
||||
#define RS_SUBGROUP_UNIFORM
|
||||
#endif
|
||||
|
||||
//
|
||||
// Check all switches are defined
|
||||
//
|
||||
|
||||
// What's the size of the keyval?
|
||||
#ifndef RS_KEYVAL_DWORDS
|
||||
#error "Undefined: RS_KEYVAL_DWORDS"
|
||||
#endif
|
||||
|
||||
//
|
||||
#ifndef RS_HISTOGRAM_BLOCK_ROWS
|
||||
#error "Undefined: RS_HISTOGRAM_BLOCK_ROWS"
|
||||
#endif
|
||||
|
||||
//
|
||||
#ifndef RS_HISTOGRAM_WORKGROUP_SIZE_LOG2
|
||||
#error "Undefined: RS_HISTOGRAM_WORKGROUP_SIZE_LOG2"
|
||||
#endif
|
||||
|
||||
//
|
||||
#ifndef RS_HISTOGRAM_SUBGROUP_SIZE_LOG2
|
||||
#error "Undefined: RS_HISTOGRAM_SUBGROUP_SIZE_LOG2"
|
||||
#endif
|
||||
|
||||
//
|
||||
// Local macros
|
||||
//
|
||||
// clang-format off
|
||||
#define RS_WORKGROUP_SIZE (1 << RS_HISTOGRAM_WORKGROUP_SIZE_LOG2)
|
||||
#define RS_SUBGROUP_SIZE (1 << RS_HISTOGRAM_SUBGROUP_SIZE_LOG2)
|
||||
#define RS_WORKGROUP_SUBGROUPS (RS_WORKGROUP_SIZE / RS_SUBGROUP_SIZE)
|
||||
#define RS_BLOCK_KEYVALS (RS_HISTOGRAM_BLOCK_ROWS * RS_WORKGROUP_SIZE)
|
||||
#define RS_KEYVAL_SIZE (RS_KEYVAL_DWORDS * 4)
|
||||
#define RS_RADIX_MASK ((1 << RS_RADIX_LOG2) - 1)
|
||||
// clang-format on
|
||||
|
||||
//
|
||||
// Keyval type
|
||||
//
|
||||
#if (RS_KEYVAL_DWORDS == 1)
|
||||
#define RS_KEYVAL_TYPE uint32_t
|
||||
#elif (RS_KEYVAL_DWORDS == 2)
|
||||
#define RS_KEYVAL_TYPE u32vec2
|
||||
#else
|
||||
#error "Unsupported RS_KEYVAL_DWORDS"
|
||||
#endif
|
||||
|
||||
//
|
||||
// Histogram offset depends on number of workgroups.
|
||||
//
|
||||
#define RS_HISTOGRAM_BASE(pass_) ((RS_RADIX_SIZE * 4) * pass_)
|
||||
|
||||
#if (RS_WORKGROUP_SUBGROUPS == 1)
|
||||
#define RS_HISTOGRAM_OFFSET(pass_) (RS_HISTOGRAM_BASE(pass_) + gl_SubgroupInvocationID * 4)
|
||||
#else
|
||||
#define RS_HISTOGRAM_OFFSET(pass_) (RS_HISTOGRAM_BASE(pass_) + gl_LocalInvocationID.x * 4)
|
||||
#endif
|
||||
|
||||
//
|
||||
// Assumes (RS_RADIX_LOG2 == 8)
|
||||
//
|
||||
// Error if this ever changes
|
||||
//
|
||||
#if (RS_RADIX_LOG2 != 8)
|
||||
#error "(RS_RADIX_LOG2 != 8)"
|
||||
#endif
|
||||
|
||||
//
|
||||
// Is bitfield extract faster?
|
||||
//
|
||||
#ifdef RS_HISTOGRAM_ENABLE_BITFIELD_EXTRACT
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
//
|
||||
// Extract a keyval digit
|
||||
//
|
||||
#if (RS_KEYVAL_DWORDS == 1)
|
||||
#define RS_KV_EXTRACT_DIGIT(kv_, pass_) bitfieldExtract(kv_, pass_ * RS_RADIX_LOG2, RS_RADIX_LOG2)
|
||||
#else
|
||||
#define RS_KV_EXTRACT_DIGIT(kv_, pass_) \
|
||||
bitfieldExtract(kv_[pass_ / 4], (pass_ & 3) * RS_RADIX_LOG2, RS_RADIX_LOG2)
|
||||
#endif
|
||||
//----------------------------------------------------------------------
|
||||
#else
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
//
|
||||
// Extract a keyval digit
|
||||
//
|
||||
#if (RS_KEYVAL_DWORDS == 1)
|
||||
#define RS_KV_EXTRACT_DIGIT(kv_, pass_) ((kv_ >> (pass_ * RS_RADIX_LOG2)) & RS_RADIX_MASK)
|
||||
#else
|
||||
#define RS_KV_EXTRACT_DIGIT(kv_, pass_) \
|
||||
((kv_[pass_ / 4] >> ((pass_ & 3) * RS_RADIX_LOG2)) & RS_RADIX_MASK)
|
||||
#endif
|
||||
//----------------------------------------------------------------------
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
#ifndef RS_HISTOGRAM_DISABLE_SMEM_HISTOGRAM
|
||||
|
||||
struct rs_histogram_smem
|
||||
{
|
||||
uint32_t histogram[RS_RADIX_SIZE];
|
||||
};
|
||||
|
||||
shared rs_histogram_smem smem;
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
layout(local_size_x = RS_WORKGROUP_SIZE) in;
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
layout(buffer_reference, std430) buffer buffer_rs_kv
|
||||
{
|
||||
RS_KEYVAL_TYPE extent[];
|
||||
};
|
||||
|
||||
layout(buffer_reference, std430) buffer buffer_rs_histograms
|
||||
{
|
||||
uint32_t extent[];
|
||||
};
|
||||
|
||||
//
|
||||
// Shared memory functions
|
||||
//
|
||||
#ifndef RS_HISTOGRAM_DISABLE_SMEM_HISTOGRAM
|
||||
|
||||
//
|
||||
// NOTE: Must use same access pattern as rs_histogram_zero()
|
||||
//
|
||||
void
|
||||
rs_histogram_zero()
|
||||
{
|
||||
//
|
||||
// Zero SMEM histogram
|
||||
//
|
||||
#if (RS_WORKGROUP_SUBGROUPS == 1)
|
||||
|
||||
const uint32_t smem_offset = gl_SubgroupInvocationID;
|
||||
|
||||
[[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE)
|
||||
{
|
||||
smem.histogram[smem_offset + ii] = 0;
|
||||
}
|
||||
|
||||
#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE)
|
||||
|
||||
const uint32_t smem_offset = gl_LocalInvocationID.x;
|
||||
|
||||
[[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE)
|
||||
{
|
||||
smem.histogram[smem_offset + ii] = 0;
|
||||
}
|
||||
|
||||
const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE);
|
||||
|
||||
if (smem_idx < RS_RADIX_SIZE)
|
||||
{
|
||||
smem.histogram[smem_idx] = 0;
|
||||
}
|
||||
|
||||
#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE)
|
||||
|
||||
#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE)
|
||||
if (gl_LocalInvocationID.x < RS_RADIX_SIZE)
|
||||
#endif
|
||||
{
|
||||
smem.histogram[gl_LocalInvocationID.x] = 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
//
|
||||
// NOTE: Must use same access pattern as rs_histogram_zero()
|
||||
//
|
||||
void
|
||||
rs_histogram_global_store(restrict buffer_rs_histograms rs_histograms)
|
||||
{
|
||||
//
|
||||
// Store to GMEM
|
||||
//
|
||||
#if (RS_WORKGROUP_SUBGROUPS == 1)
|
||||
|
||||
const uint32_t smem_offset = gl_SubgroupInvocationID;
|
||||
|
||||
[[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE)
|
||||
{
|
||||
const uint32_t count = smem.histogram[smem_offset + ii];
|
||||
|
||||
atomicAdd(rs_histograms.extent[ii], count);
|
||||
}
|
||||
|
||||
#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE)
|
||||
|
||||
const uint32_t smem_offset = gl_LocalInvocationID.x;
|
||||
|
||||
[[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE)
|
||||
{
|
||||
const uint32_t count = smem.histogram[smem_offset + ii];
|
||||
|
||||
atomicAdd(rs_histograms.extent[ii], count);
|
||||
}
|
||||
|
||||
const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE);
|
||||
|
||||
if (smem_idx < RS_RADIX_SIZE)
|
||||
{
|
||||
const uint32_t count = smem.histogram[smem_idx];
|
||||
|
||||
atomicAdd(rs_histograms.extent[((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE)],
|
||||
count);
|
||||
}
|
||||
|
||||
#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE)
|
||||
|
||||
#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE)
|
||||
if (gl_LocalInvocationID.x < RS_RADIX_SIZE)
|
||||
#endif
|
||||
{
|
||||
const uint32_t count = smem.histogram[gl_LocalInvocationID.x];
|
||||
|
||||
atomicAdd(rs_histograms.extent[0], count);
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
#ifndef RS_HISTOGRAM_DISABLE_SMEM_HISTOGRAM
|
||||
|
||||
void
|
||||
rs_histogram_atomic_after_write()
|
||||
{
|
||||
#if (RS_WORKGROUP_SUBGROUPS == 1)
|
||||
subgroupMemoryBarrierShared();
|
||||
#else
|
||||
barrier();
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
rs_histogram_read_after_atomic()
|
||||
{
|
||||
#if (RS_WORKGROUP_SUBGROUPS == 1)
|
||||
subgroupMemoryBarrierShared();
|
||||
#else
|
||||
barrier();
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
void
|
||||
main()
|
||||
{
|
||||
//
|
||||
// Which subgroups have work?
|
||||
//
|
||||
RS_KEYVAL_TYPE kv[RS_HISTOGRAM_BLOCK_ROWS];
|
||||
|
||||
//
|
||||
// Define kv_in bufref
|
||||
//
|
||||
// Assumes less than 2^30-1 keys and then extended multiplies it
|
||||
// by the keyval size.
|
||||
//
|
||||
u32vec2 kv_in_offset;
|
||||
|
||||
umulExtended(gl_WorkGroupID.x * RS_BLOCK_KEYVALS + gl_LocalInvocationID.x,
|
||||
RS_KEYVAL_SIZE,
|
||||
kv_in_offset.y, // msb
|
||||
kv_in_offset.x); // lsb
|
||||
|
||||
readonly RS_BUFREF_DEFINE_AT_OFFSET_U32VEC2(buffer_rs_kv,
|
||||
rs_kv_in,
|
||||
push.devaddr_keyvals,
|
||||
kv_in_offset);
|
||||
|
||||
//
|
||||
// Load keyvals
|
||||
//
|
||||
[[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_HISTOGRAM_BLOCK_ROWS; ii++)
|
||||
{
|
||||
kv[ii] = rs_kv_in.extent[ii * RS_WORKGROUP_SIZE];
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Accumulate and store histograms for passes
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// MACRO EXPANSION VARIANT
|
||||
//
|
||||
// NOTE: THIS ALSO SERVES AS A MALI R24+ WORKAROUND: EXPLICITLY
|
||||
// EXPAND THE FOR/LOOP PASSES
|
||||
//
|
||||
#ifndef RS_HISTOGRAM_DISABLE_SMEM_HISTOGRAM
|
||||
|
||||
#define RS_HISTOGRAM_PASS(pass_) \
|
||||
rs_histogram_zero(); \
|
||||
\
|
||||
rs_histogram_atomic_after_write(); \
|
||||
\
|
||||
[[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t jj = 0; jj < RS_HISTOGRAM_BLOCK_ROWS; jj++) \
|
||||
{ \
|
||||
const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[jj], pass_); \
|
||||
\
|
||||
atomicAdd(smem.histogram[digit], 1); \
|
||||
} \
|
||||
\
|
||||
rs_histogram_read_after_atomic(); \
|
||||
\
|
||||
{ \
|
||||
const uint32_t rs_histogram_offset = RS_HISTOGRAM_OFFSET(pass_); \
|
||||
\
|
||||
RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms, \
|
||||
rs_histograms, \
|
||||
push.devaddr_histograms, \
|
||||
rs_histogram_offset); \
|
||||
\
|
||||
rs_histogram_global_store(rs_histograms); \
|
||||
} \
|
||||
\
|
||||
if (push.passes == (RS_KEYVAL_SIZE - pass_)) \
|
||||
return;
|
||||
|
||||
#else // NO SHARED MEMORY
|
||||
|
||||
#define RS_HISTOGRAM_PASS(pass_) \
|
||||
{ \
|
||||
const uint32_t rs_histogram_base = RS_HISTOGRAM_BASE(pass_); \
|
||||
\
|
||||
RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms, \
|
||||
rs_histograms, \
|
||||
push.devaddr_histograms, \
|
||||
rs_histogram_base); \
|
||||
\
|
||||
[[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t jj = 0; jj < RS_HISTOGRAM_BLOCK_ROWS; jj++) \
|
||||
{ \
|
||||
const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[jj], pass_); \
|
||||
\
|
||||
atomicAdd(rs_histograms.extent[digit], 1); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
if (push.passes == (RS_KEYVAL_SIZE - pass_)) \
|
||||
return;
|
||||
|
||||
#endif
|
||||
|
||||
#if (RS_KEYVAL_DWORDS == 1)
|
||||
|
||||
RS_HISTOGRAM_PASS(3)
|
||||
RS_HISTOGRAM_PASS(2)
|
||||
RS_HISTOGRAM_PASS(1)
|
||||
RS_HISTOGRAM_PASS(0)
|
||||
|
||||
#elif (RS_KEYVAL_DWORDS == 2)
|
||||
|
||||
RS_HISTOGRAM_PASS(7)
|
||||
RS_HISTOGRAM_PASS(6)
|
||||
RS_HISTOGRAM_PASS(5)
|
||||
RS_HISTOGRAM_PASS(4)
|
||||
RS_HISTOGRAM_PASS(3)
|
||||
RS_HISTOGRAM_PASS(2)
|
||||
RS_HISTOGRAM_PASS(1)
|
||||
RS_HISTOGRAM_PASS(0)
|
||||
|
||||
#else
|
||||
#error "Error: (RS_KEYVAL_DWORDS >= 3) not implemented."
|
||||
#endif
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
168
src/amd/vulkan/radix_sort/shaders/init.comp
Normal file
168
src/amd/vulkan/radix_sort/shaders/init.comp
Normal file
@ -0,0 +1,168 @@
|
||||
// Copyright 2021 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#version 460
|
||||
|
||||
//
|
||||
// Initialize the `rs_indirect_info` struct
|
||||
//
|
||||
|
||||
// clang-format off
|
||||
#extension GL_GOOGLE_include_directive : require
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
// clang-format on
|
||||
|
||||
//
|
||||
// Load arch/keyval configuration
|
||||
//
|
||||
#include "config.h"
|
||||
|
||||
//
|
||||
// Buffer reference macros and push constants
|
||||
//
|
||||
#include "bufref.h"
|
||||
#include "push.h"
|
||||
|
||||
//
|
||||
// Subgroup uniform support
|
||||
//
|
||||
#if defined(RS_SCATTER_SUBGROUP_UNIFORM_DISABLE) && defined(GL_EXT_subgroupuniform_qualifier)
|
||||
#extension GL_EXT_subgroupuniform_qualifier : required
|
||||
#define RS_SUBGROUP_UNIFORM subgroupuniformEXT
|
||||
#else
|
||||
#define RS_SUBGROUP_UNIFORM
|
||||
#endif
|
||||
|
||||
//
|
||||
// Declare the push constants
|
||||
//
|
||||
RS_STRUCT_PUSH_INIT();
|
||||
|
||||
layout(push_constant) uniform block_push
|
||||
{
|
||||
rs_push_init push;
|
||||
};
|
||||
|
||||
//
|
||||
// The "init" shader configures the fill info structure.
|
||||
//
|
||||
RS_STRUCT_INDIRECT_INFO();
|
||||
|
||||
//
|
||||
// Local macros
|
||||
//
|
||||
// clang-format off
|
||||
#define RS_FILL_WORKGROUP_SIZE (1 << RS_FILL_WORKGROUP_SIZE_LOG2)
|
||||
#define RS_SCATTER_WORKGROUP_SIZE (1 << RS_SCATTER_WORKGROUP_SIZE_LOG2)
|
||||
#define RS_HISTOGRAM_WORKGROUP_SIZE (1 << RS_HISTOGRAM_WORKGROUP_SIZE_LOG2)
|
||||
|
||||
#define RS_FILL_BLOCK_DWORDS (RS_FILL_BLOCK_ROWS * RS_FILL_WORKGROUP_SIZE)
|
||||
#define RS_SCATTER_BLOCK_KEYVALS (RS_SCATTER_BLOCK_ROWS * RS_SCATTER_WORKGROUP_SIZE)
|
||||
#define RS_HISTOGRAM_BLOCK_KEYVALS (RS_HISTOGRAM_BLOCK_ROWS * RS_HISTOGRAM_WORKGROUP_SIZE)
|
||||
// clang-format on
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
layout(buffer_reference, std430) buffer buffer_rs_count
|
||||
{
|
||||
uint32_t count;
|
||||
};
|
||||
|
||||
layout(buffer_reference, std430) buffer buffer_rs_indirect_info
|
||||
{
|
||||
rs_indirect_info info;
|
||||
};
|
||||
|
||||
//
|
||||
// Helper macros
|
||||
//
|
||||
// RU = Round Up
|
||||
// RD = Round Down
|
||||
//
|
||||
#define RS_COUNT_RU_BLOCKS(count_, block_size_) ((count_ + block_size_ - 1) / (block_size_))
|
||||
#define RS_COUNT_RD_BLOCKS(count_, block_size_) ((count_) / (block_size_))
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
void
|
||||
main()
|
||||
{
|
||||
//
|
||||
// Load the keyval count
|
||||
//
|
||||
readonly RS_BUFREF_DEFINE(buffer_rs_count, rs_count, push.devaddr_count);
|
||||
|
||||
RS_SUBGROUP_UNIFORM const uint32_t count = rs_count.count;
|
||||
|
||||
//
|
||||
// Define the init struct bufref
|
||||
//
|
||||
writeonly RS_BUFREF_DEFINE(buffer_rs_indirect_info, rs_indirect_info, push.devaddr_info);
|
||||
|
||||
//
|
||||
// Size and set scatter dispatch
|
||||
//
|
||||
const uint32_t scatter_ru_blocks = RS_COUNT_RU_BLOCKS(count, RS_SCATTER_BLOCK_KEYVALS);
|
||||
const uint32_t count_ru_scatter = scatter_ru_blocks * RS_SCATTER_BLOCK_KEYVALS;
|
||||
|
||||
rs_indirect_info.info.dispatch.scatter = u32vec4(scatter_ru_blocks, 1, 1, 0);
|
||||
|
||||
//
|
||||
// Size and set histogram dispatch
|
||||
//
|
||||
const uint32_t histo_ru_blocks = RS_COUNT_RU_BLOCKS(count_ru_scatter, RS_HISTOGRAM_BLOCK_KEYVALS);
|
||||
const uint32_t count_ru_histo = histo_ru_blocks * RS_HISTOGRAM_BLOCK_KEYVALS;
|
||||
|
||||
rs_indirect_info.info.dispatch.histogram = u32vec4(histo_ru_blocks, 1, 1, 0);
|
||||
|
||||
//
|
||||
// Size and set pad fill and dispatch
|
||||
//
|
||||
const uint32_t count_dwords = count * RS_KEYVAL_DWORDS;
|
||||
const uint32_t pad_rd_blocks = RS_COUNT_RD_BLOCKS(count_dwords, RS_FILL_BLOCK_DWORDS);
|
||||
const uint32_t count_rd_pad = pad_rd_blocks * RS_FILL_BLOCK_DWORDS;
|
||||
const uint32_t count_ru_histo_dwords = count_ru_histo * RS_KEYVAL_DWORDS;
|
||||
const uint32_t pad_dwords = count_ru_histo_dwords - count_rd_pad;
|
||||
const uint32_t pad_ru_blocks = RS_COUNT_RU_BLOCKS(pad_dwords, RS_FILL_BLOCK_DWORDS);
|
||||
|
||||
rs_indirect_info_fill pad;
|
||||
|
||||
pad.block_offset = pad_rd_blocks;
|
||||
pad.dword_offset_min = count_dwords;
|
||||
pad.dword_offset_max_minus_min = count_ru_histo_dwords - count_dwords;
|
||||
|
||||
rs_indirect_info.info.pad = pad;
|
||||
rs_indirect_info.info.dispatch.pad = u32vec4(pad_ru_blocks, 1, 1, 0);
|
||||
|
||||
//
|
||||
// Size and set zero fill and dispatch
|
||||
//
|
||||
// NOTE(allanmac): We could zero the histogram passes on the host
|
||||
// since the number of passes is known ahead of time but since the
|
||||
// 256-dword partitions directly follow the 256-dword histograms, we
|
||||
// can dispatch just one FILL.
|
||||
//
|
||||
rs_indirect_info_fill zero;
|
||||
|
||||
zero.block_offset = 0;
|
||||
zero.dword_offset_min = 0;
|
||||
zero.dword_offset_max_minus_min = (push.passes + scatter_ru_blocks - 1) * RS_RADIX_SIZE;
|
||||
|
||||
const uint32_t zero_ru_blocks =
|
||||
RS_COUNT_RU_BLOCKS(zero.dword_offset_max_minus_min, RS_FILL_BLOCK_DWORDS);
|
||||
|
||||
rs_indirect_info.info.zero = zero;
|
||||
rs_indirect_info.info.dispatch.zero = u32vec4(zero_ru_blocks, 1, 1, 0);
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
51
src/amd/vulkan/radix_sort/shaders/meson.build
Normal file
51
src/amd/vulkan/radix_sort/shaders/meson.build
Normal file
@ -0,0 +1,51 @@
|
||||
# Copyright © 2022 Konstantin Seurer
|
||||
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
radix_sort_shaders = [
|
||||
'init.comp',
|
||||
'fill.comp',
|
||||
'histogram.comp',
|
||||
'prefix.comp',
|
||||
'scatter_0_even.comp',
|
||||
'scatter_0_odd.comp',
|
||||
'scatter_1_even.comp',
|
||||
'scatter_1_odd.comp'
|
||||
]
|
||||
|
||||
shader_include_dir = meson.source_root() + '/src/amd/vulkan/radix_sort/targets/u64'
|
||||
|
||||
shader_include_files = files(
|
||||
'bufref.h',
|
||||
'prefix_limits.h',
|
||||
'prefix.h',
|
||||
'push.h',
|
||||
'scatter.glsl',
|
||||
meson.source_root() + '/src/amd/vulkan/radix_sort/targets/u64/config.h'
|
||||
)
|
||||
|
||||
radix_sort_spv = []
|
||||
foreach s : radix_sort_shaders
|
||||
radix_sort_spv += custom_target(
|
||||
s + '.spv.h',
|
||||
input : s,
|
||||
output : s + '.spv.h',
|
||||
command : [prog_glslang, '-V', '-I' + shader_include_dir, '--target-env', 'spirv1.3', '-x', '-o', '@OUTPUT@', '@INPUT@'],
|
||||
depend_files: shader_include_files)
|
||||
endforeach
|
194
src/amd/vulkan/radix_sort/shaders/prefix.comp
Normal file
194
src/amd/vulkan/radix_sort/shaders/prefix.comp
Normal file
@ -0,0 +1,194 @@
|
||||
// Copyright 2021 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#version 460
|
||||
|
||||
//
|
||||
// Prefix sum the coarse histograms.
|
||||
//
|
||||
|
||||
// clang-format off
|
||||
#extension GL_GOOGLE_include_directive : require
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
#extension GL_KHR_shader_subgroup_ballot : require
|
||||
// clang-format on
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
#include "config.h"
|
||||
|
||||
//
|
||||
// Buffer reference macros and push constants
|
||||
//
|
||||
#include "bufref.h"
|
||||
#include "push.h"
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
RS_STRUCT_PUSH_PREFIX();
|
||||
|
||||
layout(push_constant) uniform block_push
|
||||
{
|
||||
rs_push_prefix push;
|
||||
};
|
||||
|
||||
//
|
||||
// Subgroup uniform support
|
||||
//
|
||||
#if defined(RS_HISTOGRAM_SUBGROUP_UNIFORM_DISABLE) && defined(GL_EXT_subgroupuniform_qualifier)
|
||||
#extension GL_EXT_subgroupuniform_qualifier : required
|
||||
#define RS_SUBGROUP_UNIFORM subgroupuniformEXT
|
||||
#else
|
||||
#define RS_SUBGROUP_UNIFORM
|
||||
#endif
|
||||
|
||||
//
|
||||
// Check all switches are defined
|
||||
//
|
||||
//
|
||||
#ifndef RS_PREFIX_SUBGROUP_SIZE_LOG2
|
||||
#error "Undefined: RS_PREFIX_SUBGROUP_SIZE_LOG2"
|
||||
#endif
|
||||
|
||||
//
|
||||
#ifndef RS_PREFIX_WORKGROUP_SIZE_LOG2
|
||||
#error "Undefined: RS_PREFIX_WORKGROUP_SIZE_LOG2"
|
||||
#endif
|
||||
|
||||
//
|
||||
// Local macros
|
||||
//
|
||||
// clang-format off
|
||||
#define RS_KEYVAL_SIZE (RS_KEYVAL_DWORDS * 4)
|
||||
#define RS_WORKGROUP_SIZE (1 << RS_PREFIX_WORKGROUP_SIZE_LOG2)
|
||||
#define RS_SUBGROUP_SIZE (1 << RS_PREFIX_SUBGROUP_SIZE_LOG2)
|
||||
#define RS_WORKGROUP_SUBGROUPS (RS_WORKGROUP_SIZE / RS_SUBGROUP_SIZE)
|
||||
// clang-format on
|
||||
|
||||
//
|
||||
// There is no purpose in having a workgroup size larger than the
|
||||
// radix size.
|
||||
//
|
||||
#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE)
|
||||
#error "Error: (RS_WORKGROUP_SIZE > RS_RADIX_SIZE)"
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
layout(local_size_x = RS_WORKGROUP_SIZE) in;
|
||||
|
||||
//
|
||||
// Histogram buffer reference
|
||||
//
|
||||
layout(buffer_reference, std430) buffer buffer_rs_histograms
|
||||
{
|
||||
uint32_t extent[];
|
||||
};
|
||||
|
||||
//
|
||||
// Load prefix limits before loading function
|
||||
//
|
||||
#include "prefix_limits.h"
|
||||
|
||||
//
|
||||
// If multi-subgroup then define shared memory
|
||||
//
|
||||
#if (RS_WORKGROUP_SUBGROUPS > 1)
|
||||
|
||||
//----------------------------------------
|
||||
shared uint32_t smem_sweep0[RS_SWEEP_0_SIZE];
|
||||
|
||||
#define RS_PREFIX_SWEEP0(idx_) smem_sweep0[idx_]
|
||||
//----------------------------------------
|
||||
|
||||
#if (RS_SWEEP_1_SIZE > 0)
|
||||
//----------------------------------------
|
||||
shared uint32_t smem_sweep1[RS_SWEEP_1_SIZE];
|
||||
|
||||
#define RS_PREFIX_SWEEP1(idx_) smem_sweep1[idx_]
|
||||
//----------------------------------------
|
||||
#endif
|
||||
|
||||
#if (RS_SWEEP_2_SIZE > 0)
|
||||
//----------------------------------------
|
||||
shared uint32_t smem_sweep2[RS_SWEEP_2_SIZE];
|
||||
|
||||
#define RS_PREFIX_SWEEP2(idx_) smem_sweep2[idx_]
|
||||
//----------------------------------------
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
// Define function arguments
|
||||
//
|
||||
#define RS_PREFIX_ARGS buffer_rs_histograms rs_histograms
|
||||
|
||||
//
|
||||
// Define load/store functions
|
||||
//
|
||||
// clang-format off
|
||||
#define RS_PREFIX_LOAD(idx_) rs_histograms.extent[idx_]
|
||||
#define RS_PREFIX_STORE(idx_) rs_histograms.extent[idx_]
|
||||
// clang-format on
|
||||
|
||||
//
|
||||
// Load prefix function
|
||||
//
|
||||
#include "prefix.h"
|
||||
|
||||
//
|
||||
// Exclusive prefix of uint32_t[256]
|
||||
//
|
||||
void
|
||||
main()
|
||||
{
|
||||
//
|
||||
// Define buffer reference to read histograms
|
||||
//
|
||||
#if (RS_WORKGROUP_SUBGROUPS == 1)
|
||||
//
|
||||
// Define histograms bufref for single subgroup
|
||||
//
|
||||
// NOTE(allanmac): The histogram buffer reference could be adjusted
|
||||
// on the host to save a couple instructions at the cost of added
|
||||
// complexity.
|
||||
//
|
||||
RS_SUBGROUP_UNIFORM
|
||||
const uint32_t histograms_base = ((RS_KEYVAL_SIZE - 1 - gl_WorkGroupID.x) * RS_RADIX_SIZE);
|
||||
const uint32_t histograms_offset = (histograms_base + gl_SubgroupInvocationID) * 4;
|
||||
|
||||
RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms,
|
||||
rs_histograms,
|
||||
push.devaddr_histograms,
|
||||
histograms_offset);
|
||||
|
||||
#else
|
||||
//
|
||||
// Define histograms bufref for workgroup
|
||||
//
|
||||
RS_SUBGROUP_UNIFORM
|
||||
const uint32_t histograms_base = ((RS_KEYVAL_SIZE - 1 - gl_WorkGroupID.x) * RS_RADIX_SIZE);
|
||||
const uint32_t histograms_offset = (histograms_base + gl_LocalInvocationID.x) * 4;
|
||||
|
||||
RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms,
|
||||
rs_histograms,
|
||||
push.devaddr_histograms,
|
||||
histograms_offset);
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
// Compute exclusive prefix of uint32_t[256]
|
||||
//
|
||||
rs_prefix(rs_histograms);
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
353
src/amd/vulkan/radix_sort/shaders/prefix.h
Normal file
353
src/amd/vulkan/radix_sort/shaders/prefix.h
Normal file
@ -0,0 +1,353 @@
|
||||
// Copyright 2021 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_
|
||||
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_
|
||||
|
||||
//
|
||||
// Requires several defines
|
||||
//
|
||||
#ifndef RS_PREFIX_LIMITS
|
||||
#error "Error: \"prefix_limits.h\" not loaded"
|
||||
#endif
|
||||
|
||||
#ifndef RS_PREFIX_ARGS
|
||||
#error "Error: RS_PREFIX_ARGS undefined"
|
||||
#endif
|
||||
|
||||
#ifndef RS_PREFIX_LOAD
|
||||
#error "Error: RS_PREFIX_LOAD undefined"
|
||||
#endif
|
||||
|
||||
#ifndef RS_PREFIX_STORE
|
||||
#error "Error: RS_PREFIX_STORE undefined"
|
||||
#endif
|
||||
|
||||
#ifndef RS_SUBGROUP_SIZE
|
||||
#error "Error: RS_SUBGROUP_SIZE undefined"
|
||||
#endif
|
||||
|
||||
#ifndef RS_WORKGROUP_SIZE
|
||||
#error "Error: RS_WORKGROUP_SIZE undefined"
|
||||
#endif
|
||||
|
||||
#ifndef RS_WORKGROUP_SUBGROUPS
|
||||
#error "Error: RS_WORKGROUP_SUBGROUPS undefined"
|
||||
#endif
|
||||
|
||||
//
|
||||
// Optional switches:
|
||||
//
|
||||
// * Disable holding original inclusively scanned histogram values in registers.
|
||||
//
|
||||
// #define RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
|
||||
//
|
||||
|
||||
//
|
||||
// Compute exclusive prefix of uint32_t[256]
|
||||
//
|
||||
void
|
||||
rs_prefix(RS_PREFIX_ARGS)
|
||||
{
|
||||
#if (RS_WORKGROUP_SUBGROUPS == 1)
|
||||
//
|
||||
// Workgroup is a single subgroup so no shared memory is required.
|
||||
//
|
||||
|
||||
//
|
||||
// Exclusive scan-add the histogram
|
||||
//
|
||||
const uint32_t h0 = RS_PREFIX_LOAD(0);
|
||||
const uint32_t h0_inc = subgroupInclusiveAdd(h0);
|
||||
RS_SUBGROUP_UNIFORM uint32_t h_last = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
|
||||
|
||||
RS_PREFIX_STORE(0) = h0_inc - h0; // exclusive
|
||||
|
||||
//
|
||||
// Each iteration is dependent on the previous so no unrolling. The
|
||||
// compiler is free to hoist the loads upward though.
|
||||
//
|
||||
for (RS_SUBGROUP_UNIFORM uint32_t ii = RS_SUBGROUP_SIZE; //
|
||||
ii < RS_RADIX_SIZE;
|
||||
ii += RS_SUBGROUP_SIZE)
|
||||
{
|
||||
const uint32_t h = RS_PREFIX_LOAD(ii);
|
||||
const uint32_t h_inc = subgroupInclusiveAdd(h) + h_last;
|
||||
h_last = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1);
|
||||
|
||||
RS_PREFIX_STORE(ii) = h_inc - h; // exclusive
|
||||
}
|
||||
|
||||
#else
|
||||
//
|
||||
// Workgroup is multiple subgroups and uses shared memory to store
|
||||
// the scan's intermediate results.
|
||||
//
|
||||
// Assumes a power-of-two subgroup, workgroup and radix size.
|
||||
//
|
||||
// Downsweep: Repeatedly scan reductions until they fit in a single
|
||||
// subgroup.
|
||||
//
|
||||
// Upsweep: Then uniformly apply reductions to each subgroup.
|
||||
//
|
||||
//
|
||||
// Subgroup Size | 4 | 8 | 16 | 32 | 64 |
|
||||
// --------------+----+----+----+----+----+
|
||||
// Sweep 0 | 64 | 32 | 16 | 8 | 4 | sweep_0[]
|
||||
// Sweep 1 | 16 | 4 | - | - | - | sweep_1[]
|
||||
// Sweep 2 | 4 | - | - | - | - | sweep_2[]
|
||||
// --------------+----+----+----+----+----+
|
||||
// Total dwords | 84 | 36 | 16 | 8 | 4 |
|
||||
// --------------+----+----+----+----+----+
|
||||
//
|
||||
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
|
||||
uint32_t h_exc[RS_H_COMPONENTS];
|
||||
#endif
|
||||
|
||||
//
|
||||
// Downsweep 0
|
||||
//
|
||||
[[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++)
|
||||
{
|
||||
const uint32_t h = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE);
|
||||
|
||||
const uint32_t h_inc = subgroupInclusiveAdd(h);
|
||||
|
||||
const uint32_t smem_idx = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
|
||||
|
||||
RS_PREFIX_SWEEP0(smem_idx) = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1);
|
||||
|
||||
//
|
||||
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
|
||||
h_exc[ii] = h_inc - h;
|
||||
#else
|
||||
RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_inc - h;
|
||||
#endif
|
||||
}
|
||||
|
||||
barrier();
|
||||
|
||||
//
|
||||
// Skip generalizing these sweeps for all possible subgroups -- just
|
||||
// write them directly.
|
||||
//
|
||||
#if ((RS_SUBGROUP_SIZE == 64) || (RS_SUBGROUP_SIZE == 32) || (RS_SUBGROUP_SIZE == 16))
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Scan 0
|
||||
//
|
||||
#if (RS_SWEEP_0_SIZE != RS_SUBGROUP_SIZE)
|
||||
if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // subgroup has inactive invocations
|
||||
#endif
|
||||
{
|
||||
const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x);
|
||||
const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);
|
||||
|
||||
RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red;
|
||||
}
|
||||
|
||||
#elif (RS_SUBGROUP_SIZE == 8)
|
||||
|
||||
#if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Scan 0 and Downsweep 1
|
||||
//
|
||||
if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // 32 invocations
|
||||
{
|
||||
const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x);
|
||||
const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);
|
||||
|
||||
RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red;
|
||||
RS_PREFIX_SWEEP1(gl_SubgroupID) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Scan 0 and Downsweep 1
|
||||
//
|
||||
[[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++) // 32 invocations
|
||||
{
|
||||
const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x;
|
||||
const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
|
||||
|
||||
const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0);
|
||||
const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);
|
||||
|
||||
RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red;
|
||||
RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
barrier();
|
||||
|
||||
//
|
||||
// Scan 1
|
||||
//
|
||||
if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE) // 4 invocations
|
||||
{
|
||||
const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x);
|
||||
const uint32_t h1_inc = subgroupInclusiveAdd(h1_red);
|
||||
|
||||
RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red;
|
||||
}
|
||||
|
||||
#elif (RS_SUBGROUP_SIZE == 4)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Scan 0 and Downsweep 1
|
||||
//
|
||||
#if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE)
|
||||
|
||||
if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // 64 invocations
|
||||
{
|
||||
const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x);
|
||||
const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);
|
||||
|
||||
RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red;
|
||||
RS_PREFIX_SWEEP1(gl_SubgroupID) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
[[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++) // 64 invocations
|
||||
{
|
||||
const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x;
|
||||
const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
|
||||
|
||||
const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0);
|
||||
const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);
|
||||
|
||||
RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red;
|
||||
RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
|
||||
}
|
||||
#endif
|
||||
|
||||
barrier();
|
||||
|
||||
//
|
||||
// Scan 1 and Downsweep 2
|
||||
//
|
||||
#if (RS_SWEEP_1_SIZE < RS_WORKGROUP_SIZE)
|
||||
if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE) // 16 invocations
|
||||
{
|
||||
const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x);
|
||||
const uint32_t h1_inc = subgroupInclusiveAdd(h1_red);
|
||||
|
||||
RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red;
|
||||
RS_PREFIX_SWEEP2(gl_SubgroupID) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
[[unroll]] for (uint32_t ii = 0; ii < RS_S1_PASSES; ii++) // 16 invocations
|
||||
{
|
||||
const uint32_t idx1 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x;
|
||||
const uint32_t idx2 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
|
||||
|
||||
const uint32_t h1_red = RS_PREFIX_SWEEP1(idx1);
|
||||
const uint32_t h1_inc = subgroupInclusiveAdd(h1_red);
|
||||
|
||||
RS_PREFIX_SWEEP1(idx1) = h1_inc - h1_red;
|
||||
RS_PREFIX_SWEEP2(idx2) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
barrier();
|
||||
|
||||
//
|
||||
// Scan 2
|
||||
//
|
||||
// 4 invocations
|
||||
//
|
||||
if (gl_LocalInvocationID.x < RS_SWEEP_2_SIZE)
|
||||
{
|
||||
const uint32_t h2_red = RS_PREFIX_SWEEP2(gl_LocalInvocationID.x);
|
||||
const uint32_t h2_inc = subgroupInclusiveAdd(h2_red);
|
||||
|
||||
RS_PREFIX_SWEEP2(gl_LocalInvocationID.x) = h2_inc - h2_red;
|
||||
}
|
||||
|
||||
#else
|
||||
#error "Error: Unsupported subgroup size"
|
||||
#endif
|
||||
|
||||
barrier();
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Final upsweep 0
|
||||
//
|
||||
#if ((RS_SUBGROUP_SIZE == 64) || (RS_SUBGROUP_SIZE == 32) || (RS_SUBGROUP_SIZE == 16))
|
||||
|
||||
[[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++)
|
||||
{
|
||||
const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
|
||||
|
||||
// clang format issue
|
||||
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
|
||||
RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc[ii] + RS_PREFIX_SWEEP0(idx0);
|
||||
#else
|
||||
const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE);
|
||||
|
||||
RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc + RS_PREFIX_SWEEP0(idx0);
|
||||
#endif
|
||||
}
|
||||
|
||||
#elif (RS_SUBGROUP_SIZE == 8)
|
||||
|
||||
[[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++)
|
||||
{
|
||||
const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
|
||||
const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE;
|
||||
|
||||
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
|
||||
RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) =
|
||||
h_exc[ii] + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1);
|
||||
#else
|
||||
const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE);
|
||||
|
||||
RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) =
|
||||
h_exc + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1);
|
||||
#endif
|
||||
}
|
||||
|
||||
#elif (RS_SUBGROUP_SIZE == 4)
|
||||
|
||||
[[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++)
|
||||
{
|
||||
const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
|
||||
const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE;
|
||||
const uint32_t idx2 = idx1 / RS_SUBGROUP_SIZE;
|
||||
|
||||
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
|
||||
RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) =
|
||||
h_exc[ii] + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2));
|
||||
#else
|
||||
const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE);
|
||||
|
||||
RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) =
|
||||
h_exc + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2));
|
||||
#endif
|
||||
}
|
||||
|
||||
#else
|
||||
#error "Error: Unsupported subgroup size"
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_
|
48
src/amd/vulkan/radix_sort/shaders/prefix_limits.h
Normal file
48
src/amd/vulkan/radix_sort/shaders/prefix_limits.h
Normal file
@ -0,0 +1,48 @@
|
||||
// Copyright 2021 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_LIMITS_H_
|
||||
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_LIMITS_H_
|
||||
|
||||
//
|
||||
// Define various prefix limits
|
||||
//
|
||||
#define RS_PREFIX_LIMITS
|
||||
|
||||
//
|
||||
// Multi-subgroup prefix requires shared memory.
|
||||
//
|
||||
#if (RS_WORKGROUP_SUBGROUPS > 1)
|
||||
|
||||
// clang-format off
|
||||
#define RS_H_COMPONENTS (RS_RADIX_SIZE / RS_WORKGROUP_SIZE)
|
||||
|
||||
#define RS_SWEEP_0_SIZE (RS_RADIX_SIZE / RS_SUBGROUP_SIZE)
|
||||
#define RS_SWEEP_1_SIZE (RS_SWEEP_0_SIZE / RS_SUBGROUP_SIZE)
|
||||
#define RS_SWEEP_2_SIZE (RS_SWEEP_1_SIZE / RS_SUBGROUP_SIZE)
|
||||
|
||||
#define RS_SWEEP_SIZE (RS_SWEEP_0_SIZE + RS_SWEEP_1_SIZE + RS_SWEEP_2_SIZE)
|
||||
|
||||
#define RS_S0_PASSES (RS_SWEEP_0_SIZE / RS_WORKGROUP_SIZE)
|
||||
#define RS_S1_PASSES (RS_SWEEP_1_SIZE / RS_WORKGROUP_SIZE)
|
||||
|
||||
#define RS_SWEEP_0_OFFSET 0
|
||||
#define RS_SWEEP_1_OFFSET (RS_SWEEP_0_OFFSET + RS_SWEEP_0_SIZE)
|
||||
#define RS_SWEEP_2_OFFSET (RS_SWEEP_1_OFFSET + RS_SWEEP_1_SIZE)
|
||||
// clang-format on
|
||||
|
||||
//
|
||||
// Single subgroup prefix doesn't use shared memory.
|
||||
//
|
||||
#else
|
||||
|
||||
#define RS_SWEEP_SIZE 0
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_LIMITS_H_
|
263
src/amd/vulkan/radix_sort/shaders/push.h
Normal file
263
src/amd/vulkan/radix_sort/shaders/push.h
Normal file
@ -0,0 +1,263 @@
|
||||
// Copyright 2021 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PUSH_H_
|
||||
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PUSH_H_
|
||||
|
||||
//
|
||||
// There is a limit to the maximum number of keyvals that can be sorted because
|
||||
// the top 2 bits in the atomic lookback counters are used as tag bits.
|
||||
//
|
||||
#define RS_MAX_KEYVALS ((1 << 30) - 1)
|
||||
|
||||
//
|
||||
// Right now, the entire implementation is very much dependent on an 8-bit radix
|
||||
// size. Most of the shaders attempt to honor this defined size but there are
|
||||
// still a number of places where 256 is assumed.
|
||||
//
|
||||
#define RS_RADIX_LOG2 8
|
||||
#define RS_RADIX_SIZE (1 << RS_RADIX_LOG2)
|
||||
|
||||
//
|
||||
// LOOKBACK STATUS FLAGS
|
||||
//
|
||||
// The decoupled lookback status flags are stored in the two
|
||||
// high bits of the count:
|
||||
//
|
||||
// 0 31
|
||||
// | REDUCTION OR PREFIX COUNT | STATUS |
|
||||
// +---------------------------+--------+
|
||||
// | 30 | 2 |
|
||||
//
|
||||
// This limits the keyval extent size to (2^30-1).
|
||||
//
|
||||
// Valid status flags are:
|
||||
//
|
||||
// EVEN PASS ODD PASS
|
||||
// ----------------------- -----------------------
|
||||
// 0 : invalid 0 : prefix available
|
||||
// 1 : reduction available 1 : ---
|
||||
// 2 : prefix available 2 : invalid
|
||||
// 3 : --- 3 : reduction available
|
||||
//
|
||||
// Atomically adding +1 to a "reduction available" status results in a "prefix
|
||||
// available" status.
|
||||
//
|
||||
// clang-format off
|
||||
#define RS_PARTITION_STATUS_EVEN_INVALID (0u)
|
||||
#define RS_PARTITION_STATUS_EVEN_REDUCTION (1u)
|
||||
#define RS_PARTITION_STATUS_EVEN_PREFIX (2u)
|
||||
|
||||
#define RS_PARTITION_STATUS_ODD_INVALID (2u)
|
||||
#define RS_PARTITION_STATUS_ODD_REDUCTION (3u)
|
||||
#define RS_PARTITION_STATUS_ODD_PREFIX (0u)
|
||||
// clang-format on
|
||||
|
||||
//
|
||||
// Arguments to indirectly launched shaders.
|
||||
//
|
||||
// struct rs_indirect_info_dispatch
|
||||
// {
|
||||
// u32vec4 pad;
|
||||
// u32vec4 zero;
|
||||
// u32vec4 histogram;
|
||||
// u32vec4 scatter;
|
||||
// };
|
||||
//
|
||||
// struct rs_indirect_info_fill
|
||||
// {
|
||||
// uint32_t block_offset;
|
||||
// uint32_t dword_offset_min;
|
||||
// uint32_t dword_offset_max_minus_min;
|
||||
// uint32_t reserved; // padding for 16 bytes
|
||||
// };
|
||||
//
|
||||
// struct rs_indirect_info
|
||||
// {
|
||||
// rs_indirect_info_fill pad;
|
||||
// rs_indirect_info_fill zero;
|
||||
// rs_indirect_info_dispatch dispatch;
|
||||
// };
|
||||
//
|
||||
#define RS_STRUCT_INDIRECT_INFO_DISPATCH() \
|
||||
struct rs_indirect_info_dispatch \
|
||||
{ \
|
||||
RS_STRUCT_MEMBER_STRUCT(u32vec4, pad) \
|
||||
RS_STRUCT_MEMBER_STRUCT(u32vec4, zero) \
|
||||
RS_STRUCT_MEMBER_STRUCT(u32vec4, histogram) \
|
||||
RS_STRUCT_MEMBER_STRUCT(u32vec4, scatter) \
|
||||
}
|
||||
|
||||
#define RS_STRUCT_INDIRECT_INFO_FILL() \
|
||||
struct rs_indirect_info_fill \
|
||||
{ \
|
||||
RS_STRUCT_MEMBER(uint32_t, block_offset) \
|
||||
RS_STRUCT_MEMBER(uint32_t, dword_offset_min) \
|
||||
RS_STRUCT_MEMBER(uint32_t, dword_offset_max_minus_min) \
|
||||
RS_STRUCT_MEMBER(uint32_t, reserved) \
|
||||
}
|
||||
|
||||
#define RS_STRUCT_INDIRECT_INFO() \
|
||||
RS_STRUCT_INDIRECT_INFO_DISPATCH(); \
|
||||
RS_STRUCT_INDIRECT_INFO_FILL(); \
|
||||
struct rs_indirect_info \
|
||||
{ \
|
||||
RS_STRUCT_MEMBER_STRUCT(rs_indirect_info_fill, pad) \
|
||||
RS_STRUCT_MEMBER_STRUCT(rs_indirect_info_fill, zero) \
|
||||
RS_STRUCT_MEMBER_STRUCT(rs_indirect_info_dispatch, dispatch) \
|
||||
}
|
||||
|
||||
//
|
||||
// Define the push constant structures shared by the host and device.
|
||||
//
|
||||
// INIT
|
||||
// ----
|
||||
// struct rs_push_init
|
||||
// {
|
||||
// uint64_t devaddr_count; // address of count buffer
|
||||
// uint64_t devaddr_indirect; // address of indirect info buffer
|
||||
// };
|
||||
//
|
||||
// FILL
|
||||
// ----
|
||||
// struct rs_push_fill
|
||||
// {
|
||||
// uint64_t devaddr_info; // address of indirect info for fill shader
|
||||
// uint64_t devaddr_dwords; // address of dwords extent
|
||||
// uint32_t dword; // dword value used to fill the dwords extent
|
||||
// };
|
||||
//
|
||||
// HISTOGRAM
|
||||
// ---------
|
||||
// struct rs_push_histogram
|
||||
// {
|
||||
// uint64_t devaddr_histograms; // address of histograms extent
|
||||
// uint64_t devaddr_keyvals; // address of keyvals extent
|
||||
// uint32_t passes; // number of passes
|
||||
// };
|
||||
//
|
||||
// PREFIX
|
||||
// ------
|
||||
// struct rs_push_prefix
|
||||
// {
|
||||
// uint64_t devaddr_histograms; // address of histograms extent
|
||||
// };
|
||||
//
|
||||
// SCATTER
|
||||
// -------
|
||||
// struct rs_push_scatter
|
||||
// {
|
||||
// uint64_t devaddr_keyvals_in; // address of input keyvals
|
||||
// uint64_t devaddr_keyvals_out; // address of output keyvals
|
||||
// uint64_t devaddr_partitions // address of partitions
|
||||
// uint64_t devaddr_histogram; // address of pass histogram
|
||||
// uint32_t pass_offset; // keyval pass offset
|
||||
// };
|
||||
//
|
||||
#define RS_STRUCT_PUSH_INIT() \
|
||||
struct rs_push_init \
|
||||
{ \
|
||||
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_info) \
|
||||
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_count) \
|
||||
RS_STRUCT_MEMBER(uint32_t, passes) \
|
||||
}
|
||||
|
||||
#define RS_STRUCT_PUSH_FILL() \
|
||||
struct rs_push_fill \
|
||||
{ \
|
||||
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_info) \
|
||||
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_dwords) \
|
||||
RS_STRUCT_MEMBER(uint32_t, dword) \
|
||||
}
|
||||
|
||||
#define RS_STRUCT_PUSH_HISTOGRAM() \
|
||||
struct rs_push_histogram \
|
||||
{ \
|
||||
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_histograms) \
|
||||
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_keyvals) \
|
||||
RS_STRUCT_MEMBER(uint32_t, passes) \
|
||||
}
|
||||
|
||||
#define RS_STRUCT_PUSH_PREFIX() \
|
||||
struct rs_push_prefix \
|
||||
{ \
|
||||
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_histograms) \
|
||||
}
|
||||
|
||||
#define RS_STRUCT_PUSH_SCATTER() \
|
||||
struct rs_push_scatter \
|
||||
{ \
|
||||
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_keyvals_even) \
|
||||
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_keyvals_odd) \
|
||||
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_partitions) \
|
||||
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_histograms) \
|
||||
RS_STRUCT_MEMBER(uint32_t, pass_offset) \
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// GLSL
|
||||
//
|
||||
#ifdef VULKAN // defined by GLSL/VK compiler
|
||||
|
||||
// clang-format off
|
||||
#define RS_STRUCT_MEMBER(type_, name_) type_ name_;
|
||||
#define RS_STRUCT_MEMBER_FARRAY(type_, len_, name_) type_ name_[len_];
|
||||
#define RS_STRUCT_MEMBER_STRUCT(type_, name_) type_ name_;
|
||||
// clang-format on
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// C/C++
|
||||
//
|
||||
#else
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
struct u32vec4
|
||||
{
|
||||
uint32_t x;
|
||||
uint32_t y;
|
||||
uint32_t z;
|
||||
uint32_t w;
|
||||
};
|
||||
|
||||
// clang-format off
|
||||
#define RS_DEVADDR uint64_t
|
||||
#define RS_STRUCT_MEMBER(type_, name_) type_ name_;
|
||||
#define RS_STRUCT_MEMBER_FARRAY(type_, len_, name_) type_ name_[len_];
|
||||
#define RS_STRUCT_MEMBER_STRUCT(type_, name_) struct type_ name_;
|
||||
// clang-format on
|
||||
|
||||
RS_STRUCT_PUSH_INIT();
|
||||
RS_STRUCT_PUSH_FILL();
|
||||
RS_STRUCT_PUSH_HISTOGRAM();
|
||||
RS_STRUCT_PUSH_PREFIX();
|
||||
RS_STRUCT_PUSH_SCATTER();
|
||||
|
||||
RS_STRUCT_INDIRECT_INFO();
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PUSH_H_
|
1706
src/amd/vulkan/radix_sort/shaders/scatter.glsl
Normal file
1706
src/amd/vulkan/radix_sort/shaders/scatter.glsl
Normal file
File diff suppressed because it is too large
Load Diff
36
src/amd/vulkan/radix_sort/shaders/scatter_0_even.comp
Normal file
36
src/amd/vulkan/radix_sort/shaders/scatter_0_even.comp
Normal file
@ -0,0 +1,36 @@
|
||||
// Copyright 2021 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#version 460
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
// clang-format off
|
||||
#define RS_SCATTER_KEYVAL_DWORD_BASE 0
|
||||
|
||||
#define RS_PARTITION_STATUS_INVALID RS_PARTITION_STATUS_EVEN_INVALID
|
||||
#define RS_PARTITION_STATUS_REDUCTION RS_PARTITION_STATUS_EVEN_REDUCTION
|
||||
#define RS_PARTITION_STATUS_PREFIX RS_PARTITION_STATUS_EVEN_PREFIX
|
||||
|
||||
#define RS_DEVADDR_KEYVALS_IN(push_) push_.devaddr_keyvals_even
|
||||
#define RS_DEVADDR_KEYVALS_OUT(push_) push_.devaddr_keyvals_odd
|
||||
// clang-format on
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#extension GL_GOOGLE_include_directive : require
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include "scatter.glsl"
|
||||
|
||||
//
|
||||
//
|
||||
//
|
36
src/amd/vulkan/radix_sort/shaders/scatter_0_odd.comp
Normal file
36
src/amd/vulkan/radix_sort/shaders/scatter_0_odd.comp
Normal file
@ -0,0 +1,36 @@
|
||||
// Copyright 2021 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#version 460
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
// clang-format off
|
||||
#define RS_SCATTER_KEYVAL_DWORD_BASE 0
|
||||
|
||||
#define RS_PARTITION_STATUS_INVALID RS_PARTITION_STATUS_ODD_INVALID
|
||||
#define RS_PARTITION_STATUS_REDUCTION RS_PARTITION_STATUS_ODD_REDUCTION
|
||||
#define RS_PARTITION_STATUS_PREFIX RS_PARTITION_STATUS_ODD_PREFIX
|
||||
|
||||
#define RS_DEVADDR_KEYVALS_IN(push_) push_.devaddr_keyvals_odd
|
||||
#define RS_DEVADDR_KEYVALS_OUT(push_) push_.devaddr_keyvals_even
|
||||
// clang-format on
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#extension GL_GOOGLE_include_directive : require
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include "scatter.glsl"
|
||||
|
||||
//
|
||||
//
|
||||
//
|
36
src/amd/vulkan/radix_sort/shaders/scatter_1_even.comp
Normal file
36
src/amd/vulkan/radix_sort/shaders/scatter_1_even.comp
Normal file
@ -0,0 +1,36 @@
|
||||
// Copyright 2021 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#version 460
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
// clang-format off
|
||||
#define RS_SCATTER_KEYVAL_DWORD_BASE 1
|
||||
|
||||
#define RS_PARTITION_STATUS_INVALID RS_PARTITION_STATUS_EVEN_INVALID
|
||||
#define RS_PARTITION_STATUS_REDUCTION RS_PARTITION_STATUS_EVEN_REDUCTION
|
||||
#define RS_PARTITION_STATUS_PREFIX RS_PARTITION_STATUS_EVEN_PREFIX
|
||||
|
||||
#define RS_DEVADDR_KEYVALS_IN(push_) push_.devaddr_keyvals_even
|
||||
#define RS_DEVADDR_KEYVALS_OUT(push_) push_.devaddr_keyvals_odd
|
||||
// clang-format on
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#extension GL_GOOGLE_include_directive : require
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include "scatter.glsl"
|
||||
|
||||
//
|
||||
//
|
||||
//
|
36
src/amd/vulkan/radix_sort/shaders/scatter_1_odd.comp
Normal file
36
src/amd/vulkan/radix_sort/shaders/scatter_1_odd.comp
Normal file
@ -0,0 +1,36 @@
|
||||
// Copyright 2021 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#version 460
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
// clang-format off
|
||||
#define RS_SCATTER_KEYVAL_DWORD_BASE 1
|
||||
|
||||
#define RS_PARTITION_STATUS_INVALID RS_PARTITION_STATUS_ODD_INVALID
|
||||
#define RS_PARTITION_STATUS_REDUCTION RS_PARTITION_STATUS_ODD_REDUCTION
|
||||
#define RS_PARTITION_STATUS_PREFIX RS_PARTITION_STATUS_ODD_PREFIX
|
||||
|
||||
#define RS_DEVADDR_KEYVALS_IN(push_) push_.devaddr_keyvals_odd
|
||||
#define RS_DEVADDR_KEYVALS_OUT(push_) push_.devaddr_keyvals_even
|
||||
// clang-format on
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#extension GL_GOOGLE_include_directive : require
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include "scatter.glsl"
|
||||
|
||||
//
|
||||
//
|
||||
//
|
57
src/amd/vulkan/radix_sort/target.h
Normal file
57
src/amd/vulkan/radix_sort/target.h
Normal file
@ -0,0 +1,57 @@
|
||||
// Copyright 2021 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGET_H_
|
||||
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGET_H_
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
//
|
||||
// This structure packages target-specific configuration parameters.
|
||||
//
|
||||
|
||||
struct radix_sort_vk_target_config
|
||||
{
|
||||
uint32_t keyval_dwords;
|
||||
|
||||
struct
|
||||
{
|
||||
uint32_t workgroup_size_log2;
|
||||
} init;
|
||||
|
||||
struct
|
||||
{
|
||||
uint32_t workgroup_size_log2;
|
||||
} fill;
|
||||
|
||||
struct
|
||||
{
|
||||
uint32_t workgroup_size_log2;
|
||||
uint32_t subgroup_size_log2;
|
||||
uint32_t block_rows;
|
||||
} histogram;
|
||||
|
||||
struct
|
||||
{
|
||||
uint32_t workgroup_size_log2;
|
||||
uint32_t subgroup_size_log2;
|
||||
} prefix;
|
||||
|
||||
struct
|
||||
{
|
||||
uint32_t workgroup_size_log2;
|
||||
uint32_t subgroup_size_log2;
|
||||
uint32_t block_rows;
|
||||
} scatter;
|
||||
};
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGET_H_
|
34
src/amd/vulkan/radix_sort/targets/u64/config.h
Normal file
34
src/amd/vulkan/radix_sort/targets/u64/config.h
Normal file
@ -0,0 +1,34 @@
|
||||
// Copyright 2021 The Fuchsia Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGETS_VENDORS_AMD_GCN3_U64_CONFIG_H_
|
||||
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGETS_VENDORS_AMD_GCN3_U64_CONFIG_H_
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
// clang-format off
|
||||
#define RS_KEYVAL_DWORDS 2
|
||||
|
||||
#define RS_FILL_WORKGROUP_SIZE_LOG2 7
|
||||
#define RS_FILL_BLOCK_ROWS 8
|
||||
|
||||
#define RS_HISTOGRAM_WORKGROUP_SIZE_LOG2 8
|
||||
#define RS_HISTOGRAM_SUBGROUP_SIZE_LOG2 6
|
||||
#define RS_HISTOGRAM_BLOCK_ROWS 14
|
||||
|
||||
#define RS_PREFIX_WORKGROUP_SIZE_LOG2 8
|
||||
#define RS_PREFIX_SUBGROUP_SIZE_LOG2 6
|
||||
|
||||
#define RS_SCATTER_WORKGROUP_SIZE_LOG2 8
|
||||
#define RS_SCATTER_SUBGROUP_SIZE_LOG2 6
|
||||
#define RS_SCATTER_BLOCK_ROWS 14
|
||||
// clang-format on
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGETS_VENDORS_AMD_GCN3_U64_CONFIG_H_
|
Loading…
Reference in New Issue
Block a user