aco: Remove VS inputs from visit_load_input.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Acked-by: Konstantin Seurer <konstantin.seurer@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16805>
This commit is contained in:
Timur Kristóf 2023-02-02 17:55:06 +01:00 committed by Marge Bot
parent 27c8131978
commit c602092033

View File

@ -313,58 +313,6 @@ as_vgpr(isel_context* ctx, Temp val)
return as_vgpr(bld, val);
}
/* Emit instructions computing dst = a / b for a compile-time-constant unsigned
 * 32-bit divisor b, using the precomputed fast-udiv sequence
 * (optional pre-shift, optional increment, mul-hi, optional post-shift).
 *
 * Assumes a != 0xffffffff (otherwise the increment step could overflow).
 *
 * NOTE(review): the original code carried a hard-coded `bool multiply = true;`,
 * which made the "copy a to dst" early-return branch unreachable and the
 * `multiply` terms in the temp-selection conditions always true. That dead
 * code is removed here; the emitted instruction sequence is unchanged.
 */
void
emit_v_div_u32(isel_context* ctx, Temp dst, Temp a, uint32_t b)
{
   assert(b != 0);
   Builder bld(ctx->program, ctx->block);

   /* A power-of-two divisor reduces to a single logical shift right. */
   if (util_is_power_of_two_or_zero(b)) {
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(util_logbase2(b)), a);
      return;
   }

   util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
   assert(info.multiplier <= 0xffffffff);

   const bool pre_shift = info.pre_shift != 0;
   const bool increment = info.increment != 0;
   const bool post_shift = info.post_shift != 0;

   /* The mul-hi step is always required for a non-power-of-two divisor, so
    * every intermediate before it must land in a fresh temporary; only the
    * final instruction of the sequence may write dst directly.
    */
   Temp pre_shift_dst = a;
   if (pre_shift) {
      pre_shift_dst = bld.tmp(v1);
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand::c32(info.pre_shift),
               a);
   }

   Temp increment_dst = pre_shift_dst;
   if (increment) {
      increment_dst = bld.tmp(v1);
      bld.vadd32(Definition(increment_dst), Operand::c32(info.increment), pre_shift_dst);
   }

   /* multiply_dst = (increment_dst * multiplier) >> 32 */
   Temp multiply_dst = post_shift ? bld.tmp(v1) : dst;
   bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
            bld.copy(bld.def(v1), Operand::c32(info.multiplier)));

   if (post_shift) {
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(info.post_shift),
               multiply_dst);
   }
}
void
emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
{
@ -5632,216 +5580,7 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr)
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
nir_src offset = *nir_get_io_offset_src(instr);
if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
isel_err(offset.ssa->parent_instr,
"Unimplemented non-zero nir_intrinsic_load_input offset");
Temp vertex_buffers =
convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->vertex_buffers));
unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0;
unsigned bitsize = instr->dest.ssa.bit_size;
unsigned component = nir_intrinsic_component(instr) >> (bitsize == 64 ? 1 : 0);
unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
enum pipe_format attrib_format =
(enum pipe_format)ctx->options->key.vs.vertex_attribute_formats[location];
unsigned binding_align = ctx->options->key.vs.vertex_binding_align[attrib_binding];
const struct ac_vtx_format_info* vtx_info =
ac_get_vtx_format_info(GFX8, CHIP_POLARIS10, attrib_format);
unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels);
unsigned desc_index =
ctx->program->info.vs.use_per_attribute_vb_descs ? location : attrib_binding;
desc_index = util_bitcount(ctx->program->info.vs.vb_desc_usage_mask &
u_bit_consecutive(0, desc_index));
Operand off = bld.copy(bld.def(s1), Operand::c32(desc_index * 16u));
Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, off);
Temp index;
if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
Temp start_instance = get_arg(ctx, ctx->args->start_instance);
if (divisor) {
Temp instance_id = get_arg(ctx, ctx->args->instance_id);
if (divisor != 1) {
Temp divided = bld.tmp(v1);
emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor);
index = bld.vadd32(bld.def(v1), start_instance, divided);
} else {
index = bld.vadd32(bld.def(v1), start_instance, instance_id);
}
} else {
index = bld.copy(bld.def(v1), start_instance);
}
} else {
index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->base_vertex),
get_arg(ctx, ctx->args->vertex_id));
}
Temp* const channels = (Temp*)alloca(num_channels * sizeof(Temp));
unsigned channel_start = 0;
bool direct_fetch = false;
/* skip unused channels at the start */
if (vtx_info->chan_byte_size) {
channel_start = ffs(mask) - 1;
for (unsigned i = 0; i < MIN2(channel_start, num_channels); i++)
channels[i] = Temp(0, s1);
}
/* load channels */
while (channel_start < num_channels) {
unsigned fetch_component = num_channels - channel_start;
unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size;
/* use MUBUF when possible to avoid possible alignment issues */
/* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */
bool use_mubuf = vtx_info->chan_byte_size == 4 && bitsize != 16;
unsigned fetch_fmt = V_008F0C_BUF_DATA_FORMAT_INVALID;
if (!use_mubuf) {
fetch_component = ac_get_safe_fetch_size(ctx->program->gfx_level, vtx_info, fetch_offset,
vtx_info->num_channels - channel_start, binding_align,
fetch_component);
fetch_fmt = vtx_info->hw_format[fetch_component - 1];
} else {
/* GFX6 only supports loading vec3 with MTBUF, split to vec2,scalar. */
if (fetch_component == 3 && ctx->options->gfx_level == GFX6)
fetch_component = 2;
}
unsigned fetch_bytes = fetch_component * bitsize / 8;
Temp fetch_index = index;
if (attrib_stride != 0 && fetch_offset > attrib_stride) {
fetch_index =
bld.vadd32(bld.def(v1), Operand::c32(fetch_offset / attrib_stride), fetch_index);
fetch_offset = fetch_offset % attrib_stride;
}
Operand soffset = Operand::zero();
if (fetch_offset >= 4096) {
soffset = bld.copy(bld.def(s1), Operand::c32(fetch_offset / 4096 * 4096));
fetch_offset %= 4096;
}
aco_opcode opcode;
switch (fetch_bytes) {
case 2:
assert(!use_mubuf && bitsize == 16);
opcode = aco_opcode::tbuffer_load_format_d16_x;
break;
case 4:
if (bitsize == 16) {
assert(!use_mubuf);
opcode = aco_opcode::tbuffer_load_format_d16_xy;
} else {
opcode =
use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;
}
break;
case 6:
assert(!use_mubuf && bitsize == 16);
opcode = aco_opcode::tbuffer_load_format_d16_xyz;
break;
case 8:
if (bitsize == 16) {
assert(!use_mubuf);
opcode = aco_opcode::tbuffer_load_format_d16_xyzw;
} else {
opcode =
use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;
}
break;
case 12:
assert(ctx->options->gfx_level >= GFX7 ||
(!use_mubuf && ctx->options->gfx_level == GFX6));
opcode =
use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz;
break;
case 16:
opcode =
use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw;
break;
default: unreachable("Unimplemented load_input vector size");
}
Temp fetch_dst;
if (channel_start == 0 && fetch_bytes == dst.bytes()) {
direct_fetch = true;
fetch_dst = dst;
} else {
fetch_dst = bld.tmp(RegClass::get(RegType::vgpr, fetch_bytes));
}
if (use_mubuf) {
bld.mubuf(opcode, Definition(fetch_dst), list, fetch_index,
soffset, fetch_offset, false, false, true);
} else {
unsigned dfmt = fetch_fmt & 0xf;
unsigned nfmt = fetch_fmt >> 4;
bld.mtbuf(opcode, Definition(fetch_dst), list, fetch_index,
soffset, dfmt, nfmt, fetch_offset, false, true);
}
emit_split_vector(ctx, fetch_dst, fetch_dst.bytes() * 8 / bitsize);
if (fetch_component == 1) {
channels[channel_start] = fetch_dst;
} else {
for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++)
channels[channel_start + i] = emit_extract_vector(
ctx, fetch_dst, i, RegClass::get(RegType::vgpr, bitsize / 8u));
}
channel_start += fetch_component;
}
if (!direct_fetch) {
bool is_float =
nir_alu_type_get_base_type(nir_intrinsic_dest_type(instr)) == nir_type_float;
unsigned num_components = instr->dest.ssa.num_components;
aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
unsigned num_temp = 0;
for (unsigned i = 0; i < num_components; i++) {
unsigned idx = i + component;
if (idx < num_channels && channels[idx].id()) {
Temp channel = channels[idx];
vec->operands[i] = Operand(channel);
num_temp++;
elems[i] = channel;
} else if (bitsize == 64) {
/* 22.1.1. Attribute Location and Component Assignment of Vulkan 1.3 specification:
* For 64-bit data types, no default attribute values are provided. Input variables
* must not use more components than provided by the attribute.
*/
vec->operands[i] = Operand(v2);
} else if (is_float && idx == 3) {
vec->operands[i] = bitsize == 16 ? Operand::c16(0x3c00u) : Operand::c32(0x3f800000u);
} else if (!is_float && idx == 3) {
vec->operands[i] = Operand::get_const(ctx->options->gfx_level, 1u, bitsize / 8u);
} else {
vec->operands[i] = Operand::zero(bitsize / 8u);
}
}
vec->definitions[0] = Definition(dst);
ctx->block->instructions.emplace_back(std::move(vec));
emit_split_vector(ctx, dst, num_components);
if (num_temp == num_components)
ctx->allocated_vec.emplace(dst.id(), elems);
}
} else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) {
if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) {
if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
isel_err(offset.ssa->parent_instr,
"Unimplemented non-zero nir_intrinsic_load_input offset");