diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index a608274e1c3..9aa6fea8044 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -313,58 +313,6 @@ as_vgpr(isel_context* ctx, Temp val)
    return as_vgpr(bld, val);
 }
 
-// assumes a != 0xffffffff
-void
-emit_v_div_u32(isel_context* ctx, Temp dst, Temp a, uint32_t b)
-{
-   assert(b != 0);
-   Builder bld(ctx->program, ctx->block);
-
-   if (util_is_power_of_two_or_zero(b)) {
-      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(util_logbase2(b)), a);
-      return;
-   }
-
-   util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
-
-   assert(info.multiplier <= 0xffffffff);
-
-   bool pre_shift = info.pre_shift != 0;
-   bool increment = info.increment != 0;
-   bool multiply = true;
-   bool post_shift = info.post_shift != 0;
-
-   if (!pre_shift && !increment && !multiply && !post_shift) {
-      bld.copy(Definition(dst), a);
-      return;
-   }
-
-   Temp pre_shift_dst = a;
-   if (pre_shift) {
-      pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
-      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand::c32(info.pre_shift),
-               a);
-   }
-
-   Temp increment_dst = pre_shift_dst;
-   if (increment) {
-      increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
-      bld.vadd32(Definition(increment_dst), Operand::c32(info.increment), pre_shift_dst);
-   }
-
-   Temp multiply_dst = increment_dst;
-   if (multiply) {
-      multiply_dst = post_shift ? bld.tmp(v1) : dst;
-      bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
-               bld.copy(bld.def(v1), Operand::c32(info.multiplier)));
-   }
-
-   if (post_shift) {
-      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(info.post_shift),
-               multiply_dst);
-   }
-}
-
 void
 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
 {
@@ -5632,216 +5580,7 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr)
    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
    nir_src offset = *nir_get_io_offset_src(instr);
 
-   if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
-      if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
-         isel_err(offset.ssa->parent_instr,
-                  "Unimplemented non-zero nir_intrinsic_load_input offset");
-
-      Temp vertex_buffers =
-         convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->vertex_buffers));
-
-      unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0;
-      unsigned bitsize = instr->dest.ssa.bit_size;
-      unsigned component = nir_intrinsic_component(instr) >> (bitsize == 64 ? 1 : 0);
-      unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
-      uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
-      uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
-      enum pipe_format attrib_format =
-         (enum pipe_format)ctx->options->key.vs.vertex_attribute_formats[location];
-      unsigned binding_align = ctx->options->key.vs.vertex_binding_align[attrib_binding];
-
-      const struct ac_vtx_format_info* vtx_info =
-         ac_get_vtx_format_info(GFX8, CHIP_POLARIS10, attrib_format);
-
-      unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
-      unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels);
-
-      unsigned desc_index =
-         ctx->program->info.vs.use_per_attribute_vb_descs ? location : attrib_binding;
-      desc_index = util_bitcount(ctx->program->info.vs.vb_desc_usage_mask &
-                                 u_bit_consecutive(0, desc_index));
-      Operand off = bld.copy(bld.def(s1), Operand::c32(desc_index * 16u));
-      Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, off);
-
-      Temp index;
-      if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
-         uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
-         Temp start_instance = get_arg(ctx, ctx->args->start_instance);
-         if (divisor) {
-            Temp instance_id = get_arg(ctx, ctx->args->instance_id);
-            if (divisor != 1) {
-               Temp divided = bld.tmp(v1);
-               emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor);
-               index = bld.vadd32(bld.def(v1), start_instance, divided);
-            } else {
-               index = bld.vadd32(bld.def(v1), start_instance, instance_id);
-            }
-         } else {
-            index = bld.copy(bld.def(v1), start_instance);
-         }
-      } else {
-         index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->base_vertex),
-                            get_arg(ctx, ctx->args->vertex_id));
-      }
-
-      Temp* const channels = (Temp*)alloca(num_channels * sizeof(Temp));
-      unsigned channel_start = 0;
-      bool direct_fetch = false;
-
-      /* skip unused channels at the start */
-      if (vtx_info->chan_byte_size) {
-         channel_start = ffs(mask) - 1;
-         for (unsigned i = 0; i < MIN2(channel_start, num_channels); i++)
-            channels[i] = Temp(0, s1);
-      }
-
-      /* load channels */
-      while (channel_start < num_channels) {
-         unsigned fetch_component = num_channels - channel_start;
-         unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size;
-
-         /* use MUBUF when possible to avoid possible alignment issues */
-         /* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */
-         bool use_mubuf = vtx_info->chan_byte_size == 4 && bitsize != 16;
-         unsigned fetch_fmt = V_008F0C_BUF_DATA_FORMAT_INVALID;
-         if (!use_mubuf) {
-            fetch_component =
-               ac_get_safe_fetch_size(ctx->program->gfx_level, vtx_info, fetch_offset,
-                                      vtx_info->num_channels - channel_start, binding_align,
-                                      fetch_component);
-            fetch_fmt = vtx_info->hw_format[fetch_component - 1];
-         } else {
-            /* GFX6 only supports loading vec3 with MTBUF, split to vec2,scalar. */
-            if (fetch_component == 3 && ctx->options->gfx_level == GFX6)
-               fetch_component = 2;
-         }
-
-         unsigned fetch_bytes = fetch_component * bitsize / 8;
-
-         Temp fetch_index = index;
-         if (attrib_stride != 0 && fetch_offset > attrib_stride) {
-            fetch_index =
-               bld.vadd32(bld.def(v1), Operand::c32(fetch_offset / attrib_stride), fetch_index);
-            fetch_offset = fetch_offset % attrib_stride;
-         }
-
-         Operand soffset = Operand::zero();
-         if (fetch_offset >= 4096) {
-            soffset = bld.copy(bld.def(s1), Operand::c32(fetch_offset / 4096 * 4096));
-            fetch_offset %= 4096;
-         }
-
-         aco_opcode opcode;
-         switch (fetch_bytes) {
-         case 2:
-            assert(!use_mubuf && bitsize == 16);
-            opcode = aco_opcode::tbuffer_load_format_d16_x;
-            break;
-         case 4:
-            if (bitsize == 16) {
-               assert(!use_mubuf);
-               opcode = aco_opcode::tbuffer_load_format_d16_xy;
-            } else {
-               opcode =
-                  use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;
-            }
-            break;
-         case 6:
-            assert(!use_mubuf && bitsize == 16);
-            opcode = aco_opcode::tbuffer_load_format_d16_xyz;
-            break;
-         case 8:
-            if (bitsize == 16) {
-               assert(!use_mubuf);
-               opcode = aco_opcode::tbuffer_load_format_d16_xyzw;
-            } else {
-               opcode =
-                  use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;
-            }
-            break;
-         case 12:
-            assert(ctx->options->gfx_level >= GFX7 ||
-                   (!use_mubuf && ctx->options->gfx_level == GFX6));
-            opcode =
-               use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz;
-            break;
-         case 16:
-            opcode =
-               use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw;
-            break;
-         default: unreachable("Unimplemented load_input vector size");
-         }
-
-         Temp fetch_dst;
-         if (channel_start == 0 && fetch_bytes == dst.bytes()) {
-            direct_fetch = true;
-            fetch_dst = dst;
-         } else {
-            fetch_dst = bld.tmp(RegClass::get(RegType::vgpr, fetch_bytes));
-         }
-
-         if (use_mubuf) {
-            bld.mubuf(opcode, Definition(fetch_dst), list, fetch_index,
-                      soffset, fetch_offset, false, false, true);
-         } else {
-            unsigned dfmt = fetch_fmt & 0xf;
-            unsigned nfmt = fetch_fmt >> 4;
-            bld.mtbuf(opcode, Definition(fetch_dst), list, fetch_index,
-                      soffset, dfmt, nfmt, fetch_offset, false, true);
-         }
-
-         emit_split_vector(ctx, fetch_dst, fetch_dst.bytes() * 8 / bitsize);
-
-         if (fetch_component == 1) {
-            channels[channel_start] = fetch_dst;
-         } else {
-            for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++)
-               channels[channel_start + i] = emit_extract_vector(
-                  ctx, fetch_dst, i, RegClass::get(RegType::vgpr, bitsize / 8u));
-         }
-
-         channel_start += fetch_component;
-      }
-
-      if (!direct_fetch) {
-         bool is_float =
-            nir_alu_type_get_base_type(nir_intrinsic_dest_type(instr)) == nir_type_float;
-
-         unsigned num_components = instr->dest.ssa.num_components;
-
-         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
-            aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
-         std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
-         unsigned num_temp = 0;
-         for (unsigned i = 0; i < num_components; i++) {
-            unsigned idx = i + component;
-            if (idx < num_channels && channels[idx].id()) {
-               Temp channel = channels[idx];
-               vec->operands[i] = Operand(channel);
-
-               num_temp++;
-               elems[i] = channel;
-            } else if (bitsize == 64) {
-               /* 22.1.1. Attribute Location and Component Assignment of Vulkan 1.3 specification:
-                * For 64-bit data types, no default attribute values are provided. Input variables
-                * must not use more components than provided by the attribute.
-                */
-               vec->operands[i] = Operand(v2);
-            } else if (is_float && idx == 3) {
-               vec->operands[i] = bitsize == 16 ? Operand::c16(0x3c00u) : Operand::c32(0x3f800000u);
-            } else if (!is_float && idx == 3) {
-               vec->operands[i] = Operand::get_const(ctx->options->gfx_level, 1u, bitsize / 8u);
-            } else {
-               vec->operands[i] = Operand::zero(bitsize / 8u);
-            }
-         }
-         vec->definitions[0] = Definition(dst);
-         ctx->block->instructions.emplace_back(std::move(vec));
-         emit_split_vector(ctx, dst, num_components);
-
-         if (num_temp == num_components)
-            ctx->allocated_vec.emplace(dst.id(), elems);
-      }
-   } else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) {
+   if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) {
      if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
         isel_err(offset.ssa->parent_instr,
                  "Unimplemented non-zero nir_intrinsic_load_input offset");
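
Note: the arithmetic that the removed emit_v_div_u32() lowered to VALU instructions is the standard division-by-invariant-constant scheme from util_compute_fast_udiv_info (pre-shift, optional increment, 32x32->64 multiply-high, post-shift). A minimal host-side C++ sketch for reference; the fast_udiv_info struct below is a stand-in shaped like util_fast_udiv_info, not the Mesa type, and as with the removed code it assumes a != 0xffffffff when increment is used:

   #include <cstdint>

   struct fast_udiv_info {
      uint32_t multiplier;
      unsigned pre_shift;  /* applied to the dividend first */
      unsigned post_shift; /* applied to the multiply-high result */
      unsigned increment;  /* 0 or 1 */
   };

   /* Mirrors the removed VALU sequence: v_lshrrev_b32, v_add, v_mul_hi_u32,
    * v_lshrrev_b32. The add wraps at 32 bits like the hardware add, so the
    * result is valid for all a except 0xffffffff when increment != 0. */
   static uint32_t
   fast_udiv32(uint32_t a, const struct fast_udiv_info &info)
   {
      uint32_t x = (a >> info.pre_shift) + info.increment;
      return (uint32_t)(((uint64_t)x * info.multiplier) >> 32) >> info.post_shift;
   }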