Mirror of https://github.com/qemu/qemu.git
Backend vector enhancements
Dynamic tlb resizing

-----BEGIN PGP SIGNATURE-----
iQEcBAABAgAGBQJcTyZfAAoJEGTfOOivfiFf/XIH/2uG8YTamq97ZMALuzSMUD1O
RApi8FRghk4M1SMrZv9KAnR3IcYl8Y8Qjlj7LDytD4axVG+1PdPsOwIiVThd3a0d
yYB510vCr1nBi0d7an70Ks2n5v0pCm/Q5/WK00F03Swg/eeUGVjiyVhXUQDAdJ8M
wI8Qi2eIF7Y2Pin+kXvEvHQwkrCYoRV5V8c+gW7DfuPM9rfjZ2ieAWisUeRkWJuT
QVwVEjbAts1RH1JLe7M4DZYaaHoHjjhssG4WUWVt5CVtZBnb10raoRZYR69bNT+w
f3LTvpY2Ga0K+rQJa90hWig5dbpgUQ2nOBCU0B6/Ee/SRxo74HQEIzhKM8TjMYw=
=NT5a
-----END PGP SIGNATURE-----

Merge remote-tracking branch 'remotes/rth/tags/pull-tcg-20190128' into staging

Backend vector enhancements
Dynamic tlb resizing

# gpg: Signature made Mon 28 Jan 2019 15:57:19 GMT
# gpg: using RSA key 64DF38E8AF7E215F
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [full]
# Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A 05C0 64DF 38E8 AF7E 215F

* remotes/rth/tags/pull-tcg-20190128: (23 commits)
  cputlb: Remove static tlb sizing
  tcg/tci: enable dynamic TLB sizing
  tcg/mips: enable dynamic TLB sizing
  tcg/mips: Fix tcg_out_qemu_ld_slow_path
  tcg/arm: enable dynamic TLB sizing
  tcg/riscv: enable dynamic TLB sizing
  tcg/s390: enable dynamic TLB sizing
  tcg/sparc: enable dynamic TLB sizing
  tcg/ppc: enable dynamic TLB sizing
  tcg/aarch64: enable dynamic TLB sizing
  tcg/i386: enable dynamic TLB sizing
  tcg: introduce dynamic TLB sizing
  cputlb: do not evict empty entries to the vtlb
  tcg/aarch64: Implement vector minmax arithmetic
  tcg/aarch64: Implement vector saturating arithmetic
  tcg/i386: Implement vector minmax arithmetic
  tcg/i386: Implement vector saturating arithmetic
  tcg/i386: Split subroutines out of tcg_expand_vec_op
  tcg: Add opcodes for vector minmax arithmetic
  tcg: Add opcodes for vector saturated arithmetic
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
commit 3a183e330d
@@ -74,6 +74,166 @@ QEMU_BUILD_BUG_ON(sizeof(target_ulong) > sizeof(run_on_cpu_data));
QEMU_BUILD_BUG_ON(NB_MMU_MODES > 16);
#define ALL_MMUIDX_BITS ((1 << NB_MMU_MODES) - 1)

static inline size_t sizeof_tlb(CPUArchState *env, uintptr_t mmu_idx)
{
    return env->tlb_mask[mmu_idx] + (1 << CPU_TLB_ENTRY_BITS);
}

static void tlb_window_reset(CPUTLBWindow *window, int64_t ns,
                             size_t max_entries)
{
    window->begin_ns = ns;
    window->max_entries = max_entries;
}

static void tlb_dyn_init(CPUArchState *env)
{
    int i;

    for (i = 0; i < NB_MMU_MODES; i++) {
        CPUTLBDesc *desc = &env->tlb_d[i];
        size_t n_entries = 1 << CPU_TLB_DYN_DEFAULT_BITS;

        tlb_window_reset(&desc->window, get_clock_realtime(), 0);
        desc->n_used_entries = 0;
        env->tlb_mask[i] = (n_entries - 1) << CPU_TLB_ENTRY_BITS;
        env->tlb_table[i] = g_new(CPUTLBEntry, n_entries);
        env->iotlb[i] = g_new(CPUIOTLBEntry, n_entries);
    }
}

/**
 * tlb_mmu_resize_locked() - perform TLB resize bookkeeping; resize if necessary
 * @env: CPU that owns the TLB
 * @mmu_idx: MMU index of the TLB
 *
 * Called with tlb_lock_held.
 *
 * We have two main constraints when resizing a TLB: (1) we only resize it
 * on a TLB flush (otherwise we'd have to take a perf hit by either rehashing
 * the array or unnecessarily flushing it), which means we do not control how
 * frequently the resizing can occur; (2) we don't have access to the guest's
 * future scheduling decisions, and therefore have to decide the magnitude of
 * the resize based on past observations.
 *
 * In general, a memory-hungry process can benefit greatly from an appropriately
 * sized TLB, since a guest TLB miss is very expensive. This doesn't mean that
 * we just have to make the TLB as large as possible; while an oversized TLB
 * results in minimal TLB miss rates, it also takes longer to be flushed
 * (flushes can be _very_ frequent), and the reduced locality can also hurt
 * performance.
 *
 * To achieve near-optimal performance for all kinds of workloads, we:
 *
 * 1. Aggressively increase the size of the TLB when the use rate of the
 * TLB being flushed is high, since it is likely that in the near future this
 * memory-hungry process will execute again, and its memory hungriness will
 * probably be similar.
 *
 * 2. Slowly reduce the size of the TLB as the use rate declines over a
 * reasonably large time window. The rationale is that if in such a time window
 * we have not observed a high TLB use rate, it is likely that we won't observe
 * it in the near future. In that case, once a time window expires we downsize
 * the TLB to match the maximum use rate observed in the window.
 *
 * 3. Try to keep the maximum use rate in a time window in the 30-70% range,
 * since in that range performance is likely near-optimal. Recall that the TLB
 * is direct mapped, so we want the use rate to be low (or at least not too
 * high), since otherwise we are likely to have a significant amount of
 * conflict misses.
 */
static void tlb_mmu_resize_locked(CPUArchState *env, int mmu_idx)
{
    CPUTLBDesc *desc = &env->tlb_d[mmu_idx];
    size_t old_size = tlb_n_entries(env, mmu_idx);
    size_t rate;
    size_t new_size = old_size;
    int64_t now = get_clock_realtime();
    int64_t window_len_ms = 100;
    int64_t window_len_ns = window_len_ms * 1000 * 1000;
    bool window_expired = now > desc->window.begin_ns + window_len_ns;

    if (desc->n_used_entries > desc->window.max_entries) {
        desc->window.max_entries = desc->n_used_entries;
    }
    rate = desc->window.max_entries * 100 / old_size;

    if (rate > 70) {
        new_size = MIN(old_size << 1, 1 << CPU_TLB_DYN_MAX_BITS);
    } else if (rate < 30 && window_expired) {
        size_t ceil = pow2ceil(desc->window.max_entries);
        size_t expected_rate = desc->window.max_entries * 100 / ceil;

        /*
         * Avoid undersizing when the max number of entries seen is just below
         * a pow2. For instance, if max_entries == 1025, the expected use rate
         * would be 1025/2048==50%. However, if max_entries == 1023, we'd get
         * 1023/1024==99.9% use rate, so we'd likely end up doubling the size
         * later. Thus, make sure that the expected use rate remains below 70%.
         * (and since we double the size, that means the lowest rate we'd
         * expect to get is 35%, which is still in the 30-70% range where
         * we consider that the size is appropriate.)
         */
        if (expected_rate > 70) {
            ceil *= 2;
        }
        new_size = MAX(ceil, 1 << CPU_TLB_DYN_MIN_BITS);
    }

    if (new_size == old_size) {
        if (window_expired) {
            tlb_window_reset(&desc->window, now, desc->n_used_entries);
        }
        return;
    }

    g_free(env->tlb_table[mmu_idx]);
    g_free(env->iotlb[mmu_idx]);

    tlb_window_reset(&desc->window, now, 0);
    /* desc->n_used_entries is cleared by the caller */
    env->tlb_mask[mmu_idx] = (new_size - 1) << CPU_TLB_ENTRY_BITS;
    env->tlb_table[mmu_idx] = g_try_new(CPUTLBEntry, new_size);
    env->iotlb[mmu_idx] = g_try_new(CPUIOTLBEntry, new_size);
    /*
     * If the allocations fail, try smaller sizes. We just freed some
     * memory, so going back to half of new_size has a good chance of working.
     * Increased memory pressure elsewhere in the system might cause the
     * allocations to fail though, so we progressively reduce the allocation
     * size, aborting if we cannot even allocate the smallest TLB we support.
     */
    while (env->tlb_table[mmu_idx] == NULL || env->iotlb[mmu_idx] == NULL) {
        if (new_size == (1 << CPU_TLB_DYN_MIN_BITS)) {
            error_report("%s: %s", __func__, strerror(errno));
            abort();
        }
        new_size = MAX(new_size >> 1, 1 << CPU_TLB_DYN_MIN_BITS);
        env->tlb_mask[mmu_idx] = (new_size - 1) << CPU_TLB_ENTRY_BITS;

        g_free(env->tlb_table[mmu_idx]);
        g_free(env->iotlb[mmu_idx]);
        env->tlb_table[mmu_idx] = g_try_new(CPUTLBEntry, new_size);
        env->iotlb[mmu_idx] = g_try_new(CPUIOTLBEntry, new_size);
    }
}

static inline void tlb_table_flush_by_mmuidx(CPUArchState *env, int mmu_idx)
{
    tlb_mmu_resize_locked(env, mmu_idx);
    memset(env->tlb_table[mmu_idx], -1, sizeof_tlb(env, mmu_idx));
    env->tlb_d[mmu_idx].n_used_entries = 0;
}

static inline void tlb_n_used_entries_inc(CPUArchState *env, uintptr_t mmu_idx)
{
    env->tlb_d[mmu_idx].n_used_entries++;
}

static inline void tlb_n_used_entries_dec(CPUArchState *env, uintptr_t mmu_idx)
{
    env->tlb_d[mmu_idx].n_used_entries--;
}

void tlb_init(CPUState *cpu)
{
    CPUArchState *env = cpu->env_ptr;
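To make the sizing policy described in the comment above concrete, here is a minimal self-contained sketch (not part of the patch; the helper names, bounds and the standalone main() are illustrative assumptions). It mirrors the stated rules: grow aggressively above 70% use, shrink only after a quiet window below 30%, and never leave the supported size range.

/* Illustrative sketch only, not QEMU code. */
#include <stdint.h>
#include <stdio.h>

#define TLB_MIN_ENTRIES (1u << 6)   /* assumes CPU_TLB_DYN_MIN_BITS == 6 */
#define TLB_MAX_ENTRIES (1u << 22)  /* assumes the 64-bit-host upper bound */

static uint32_t pow2_ceil(uint32_t n)
{
    uint32_t p = 1;
    while (p < n) {
        p <<= 1;
    }
    return p;
}

/* Decide the next TLB size from the current size and the peak use count. */
static uint32_t next_tlb_size(uint32_t old_size, uint32_t max_used,
                              int window_expired)
{
    uint32_t rate = max_used * 100 / old_size;

    if (rate > 70) {
        uint32_t grown = old_size << 1;
        return grown > TLB_MAX_ENTRIES ? TLB_MAX_ENTRIES : grown;
    }
    if (rate < 30 && window_expired) {
        uint32_t ceil = pow2_ceil(max_used);
        /* Avoid undersizing when max_used sits just below a power of two. */
        if (max_used * 100 / ceil > 70) {
            ceil *= 2;
        }
        return ceil < TLB_MIN_ENTRIES ? TLB_MIN_ENTRIES : ceil;
    }
    return old_size;
}

int main(void)
{
    /* 256-entry TLB, 200 entries used -> 78% use rate -> double to 512. */
    printf("%u\n", next_tlb_size(256, 200, 0));
    /* 4096-entry TLB, peak of 300 used over an expired window -> 512. */
    printf("%u\n", next_tlb_size(4096, 300, 1));
    return 0;
}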
@@ -82,6 +242,8 @@ void tlb_init(CPUState *cpu)

    /* Ensure that cpu_reset performs a full flush. */
    env->tlb_c.dirty = ALL_MMUIDX_BITS;

    tlb_dyn_init(env);
}

/* flush_all_helper: run fn across all cpus

@@ -122,7 +284,7 @@ void tlb_flush_counts(size_t *pfull, size_t *ppart, size_t *pelide)

static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
{
    memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[0]));
    tlb_table_flush_by_mmuidx(env, mmu_idx);
    memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
    env->tlb_d[mmu_idx].large_page_addr = -1;
    env->tlb_d[mmu_idx].large_page_mask = -1;

@@ -224,13 +386,24 @@ static inline bool tlb_hit_page_anyprot(CPUTLBEntry *tlb_entry,
           tlb_hit_page(tlb_entry->addr_code, page);
}

/**
 * tlb_entry_is_empty - return true if the entry is not in use
 * @te: pointer to CPUTLBEntry
 */
static inline bool tlb_entry_is_empty(const CPUTLBEntry *te)
{
    return te->addr_read == -1 && te->addr_write == -1 && te->addr_code == -1;
}

/* Called with tlb_c.lock held */
static inline void tlb_flush_entry_locked(CPUTLBEntry *tlb_entry,
static inline bool tlb_flush_entry_locked(CPUTLBEntry *tlb_entry,
                                          target_ulong page)
{
    if (tlb_hit_page_anyprot(tlb_entry, page)) {
        memset(tlb_entry, -1, sizeof(*tlb_entry));
        return true;
    }
    return false;
}

/* Called with tlb_c.lock held */

@@ -241,7 +414,9 @@ static inline void tlb_flush_vtlb_page_locked(CPUArchState *env, int mmu_idx,

    assert_cpu_is_self(ENV_GET_CPU(env));
    for (k = 0; k < CPU_VTLB_SIZE; k++) {
        tlb_flush_entry_locked(&env->tlb_v_table[mmu_idx][k], page);
        if (tlb_flush_entry_locked(&env->tlb_v_table[mmu_idx][k], page)) {
            tlb_n_used_entries_dec(env, mmu_idx);
        }
    }
}

@@ -258,7 +433,9 @@ static void tlb_flush_page_locked(CPUArchState *env, int midx,
                  midx, lp_addr, lp_mask);
        tlb_flush_one_mmuidx_locked(env, midx);
    } else {
        tlb_flush_entry_locked(tlb_entry(env, midx, page), page);
        if (tlb_flush_entry_locked(tlb_entry(env, midx, page), page)) {
            tlb_n_used_entries_dec(env, midx);
        }
        tlb_flush_vtlb_page_locked(env, midx, page);
    }
}

@@ -435,8 +612,9 @@ void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
        qemu_spin_lock(&env->tlb_c.lock);
        for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
            unsigned int i;
            unsigned int n = tlb_n_entries(env, mmu_idx);

            for (i = 0; i < CPU_TLB_SIZE; i++) {
            for (i = 0; i < n; i++) {
                tlb_reset_dirty_range_locked(&env->tlb_table[mmu_idx][i], start1,
                                             length);
            }

@@ -591,13 +769,14 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
     * Only evict the old entry to the victim tlb if it's for a
     * different page; otherwise just overwrite the stale data.
     */
    if (!tlb_hit_page_anyprot(te, vaddr_page)) {
    if (!tlb_hit_page_anyprot(te, vaddr_page) && !tlb_entry_is_empty(te)) {
        unsigned vidx = env->tlb_d[mmu_idx].vindex++ % CPU_VTLB_SIZE;
        CPUTLBEntry *tv = &env->tlb_v_table[mmu_idx][vidx];

        /* Evict the old entry into the victim tlb. */
        copy_tlb_helper_locked(tv, te);
        env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index];
        tlb_n_used_entries_dec(env, mmu_idx);
    }

    /* refill the tlb */

@@ -649,6 +828,7 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
    }

    copy_tlb_helper_locked(te, &tn);
    tlb_n_used_entries_inc(env, mmu_idx);
    qemu_spin_unlock(&env->tlb_c.lock);
}
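The tlb_n_used_entries_inc()/_dec() calls above keep n_used_entries in step with the number of non-empty entries, which is the quantity tlb_mmu_resize_locked() turns into a use rate. A hypothetical debug-only recount (not in the patch) would recompute the same value by scanning the table:

/* Hypothetical debug check, not part of the patch.  With tlb_c.lock held
 * the result should always equal env->tlb_d[mmu_idx].n_used_entries. */
static size_t tlb_recount_used_entries(CPUArchState *env, uintptr_t mmu_idx)
{
    size_t n = tlb_n_entries(env, mmu_idx);
    size_t used = 0;

    for (size_t i = 0; i < n; i++) {
        if (!tlb_entry_is_empty(&env->tlb_table[mmu_idx][i])) {
            used++;
        }
    }
    return used;
}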
@ -512,6 +512,39 @@ void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
|
||||
clear_high(d, oprsz, desc);
|
||||
}
|
||||
|
||||
void HELPER(gvec_nand)(void *d, void *a, void *b, uint32_t desc)
|
||||
{
|
||||
intptr_t oprsz = simd_oprsz(desc);
|
||||
intptr_t i;
|
||||
|
||||
for (i = 0; i < oprsz; i += sizeof(vec64)) {
|
||||
*(vec64 *)(d + i) = ~(*(vec64 *)(a + i) & *(vec64 *)(b + i));
|
||||
}
|
||||
clear_high(d, oprsz, desc);
|
||||
}
|
||||
|
||||
void HELPER(gvec_nor)(void *d, void *a, void *b, uint32_t desc)
|
||||
{
|
||||
intptr_t oprsz = simd_oprsz(desc);
|
||||
intptr_t i;
|
||||
|
||||
for (i = 0; i < oprsz; i += sizeof(vec64)) {
|
||||
*(vec64 *)(d + i) = ~(*(vec64 *)(a + i) | *(vec64 *)(b + i));
|
||||
}
|
||||
clear_high(d, oprsz, desc);
|
||||
}
|
||||
|
||||
void HELPER(gvec_eqv)(void *d, void *a, void *b, uint32_t desc)
|
||||
{
|
||||
intptr_t oprsz = simd_oprsz(desc);
|
||||
intptr_t i;
|
||||
|
||||
for (i = 0; i < oprsz; i += sizeof(vec64)) {
|
||||
*(vec64 *)(d + i) = ~(*(vec64 *)(a + i) ^ *(vec64 *)(b + i));
|
||||
}
|
||||
clear_high(d, oprsz, desc);
|
||||
}
|
||||
|
||||
void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
|
||||
{
|
||||
intptr_t oprsz = simd_oprsz(desc);
|
||||
@ -995,3 +1028,227 @@ void HELPER(gvec_ussub64)(void *d, void *a, void *b, uint32_t desc)
|
||||
}
|
||||
clear_high(d, oprsz, desc);
|
||||
}
|
||||
|
||||
void HELPER(gvec_smin8)(void *d, void *a, void *b, uint32_t desc)
|
||||
{
|
||||
intptr_t oprsz = simd_oprsz(desc);
|
||||
intptr_t i;
|
||||
|
||||
for (i = 0; i < oprsz; i += sizeof(int8_t)) {
|
||||
int8_t aa = *(int8_t *)(a + i);
|
||||
int8_t bb = *(int8_t *)(b + i);
|
||||
int8_t dd = aa < bb ? aa : bb;
|
||||
*(int8_t *)(d + i) = dd;
|
||||
}
|
||||
clear_high(d, oprsz, desc);
|
||||
}
|
||||
|
||||
void HELPER(gvec_smin16)(void *d, void *a, void *b, uint32_t desc)
|
||||
{
|
||||
intptr_t oprsz = simd_oprsz(desc);
|
||||
intptr_t i;
|
||||
|
||||
for (i = 0; i < oprsz; i += sizeof(int16_t)) {
|
||||
int16_t aa = *(int16_t *)(a + i);
|
||||
int16_t bb = *(int16_t *)(b + i);
|
||||
int16_t dd = aa < bb ? aa : bb;
|
||||
*(int16_t *)(d + i) = dd;
|
||||
}
|
||||
clear_high(d, oprsz, desc);
|
||||
}
|
||||
|
||||
void HELPER(gvec_smin32)(void *d, void *a, void *b, uint32_t desc)
|
||||
{
|
||||
intptr_t oprsz = simd_oprsz(desc);
|
||||
intptr_t i;
|
||||
|
||||
for (i = 0; i < oprsz; i += sizeof(int32_t)) {
|
||||
int32_t aa = *(int32_t *)(a + i);
|
||||
int32_t bb = *(int32_t *)(b + i);
|
||||
int32_t dd = aa < bb ? aa : bb;
|
||||
*(int32_t *)(d + i) = dd;
|
||||
}
|
||||
clear_high(d, oprsz, desc);
|
||||
}
|
||||
|
||||
void HELPER(gvec_smin64)(void *d, void *a, void *b, uint32_t desc)
|
||||
{
|
||||
intptr_t oprsz = simd_oprsz(desc);
|
||||
intptr_t i;
|
||||
|
||||
for (i = 0; i < oprsz; i += sizeof(int64_t)) {
|
||||
int64_t aa = *(int64_t *)(a + i);
|
||||
int64_t bb = *(int64_t *)(b + i);
|
||||
int64_t dd = aa < bb ? aa : bb;
|
||||
*(int64_t *)(d + i) = dd;
|
||||
}
|
||||
clear_high(d, oprsz, desc);
|
||||
}
|
||||
|
||||
void HELPER(gvec_smax8)(void *d, void *a, void *b, uint32_t desc)
|
||||
{
|
||||
intptr_t oprsz = simd_oprsz(desc);
|
||||
intptr_t i;
|
||||
|
||||
for (i = 0; i < oprsz; i += sizeof(int8_t)) {
|
||||
int8_t aa = *(int8_t *)(a + i);
|
||||
int8_t bb = *(int8_t *)(b + i);
|
||||
int8_t dd = aa > bb ? aa : bb;
|
||||
*(int8_t *)(d + i) = dd;
|
||||
}
|
||||
clear_high(d, oprsz, desc);
|
||||
}
|
||||
|
||||
void HELPER(gvec_smax16)(void *d, void *a, void *b, uint32_t desc)
|
||||
{
|
||||
intptr_t oprsz = simd_oprsz(desc);
|
||||
intptr_t i;
|
||||
|
||||
for (i = 0; i < oprsz; i += sizeof(int16_t)) {
|
||||
int16_t aa = *(int16_t *)(a + i);
|
||||
int16_t bb = *(int16_t *)(b + i);
|
||||
int16_t dd = aa > bb ? aa : bb;
|
||||
*(int16_t *)(d + i) = dd;
|
||||
}
|
||||
clear_high(d, oprsz, desc);
|
||||
}
|
||||
|
||||
void HELPER(gvec_smax32)(void *d, void *a, void *b, uint32_t desc)
|
||||
{
|
||||
intptr_t oprsz = simd_oprsz(desc);
|
||||
intptr_t i;
|
||||
|
||||
for (i = 0; i < oprsz; i += sizeof(int32_t)) {
|
||||
int32_t aa = *(int32_t *)(a + i);
|
||||
int32_t bb = *(int32_t *)(b + i);
|
||||
int32_t dd = aa > bb ? aa : bb;
|
||||
*(int32_t *)(d + i) = dd;
|
||||
}
|
||||
clear_high(d, oprsz, desc);
|
||||
}
|
||||
|
||||
void HELPER(gvec_smax64)(void *d, void *a, void *b, uint32_t desc)
|
||||
{
|
||||
intptr_t oprsz = simd_oprsz(desc);
|
||||
intptr_t i;
|
||||
|
||||
for (i = 0; i < oprsz; i += sizeof(int64_t)) {
|
||||
int64_t aa = *(int64_t *)(a + i);
|
||||
int64_t bb = *(int64_t *)(b + i);
|
||||
int64_t dd = aa > bb ? aa : bb;
|
||||
*(int64_t *)(d + i) = dd;
|
||||
}
|
||||
clear_high(d, oprsz, desc);
|
||||
}
|
||||
|
||||
void HELPER(gvec_umin8)(void *d, void *a, void *b, uint32_t desc)
|
||||
{
|
||||
intptr_t oprsz = simd_oprsz(desc);
|
||||
intptr_t i;
|
||||
|
||||
for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
|
||||
uint8_t aa = *(uint8_t *)(a + i);
|
||||
uint8_t bb = *(uint8_t *)(b + i);
|
||||
uint8_t dd = aa < bb ? aa : bb;
|
||||
*(uint8_t *)(d + i) = dd;
|
||||
}
|
||||
clear_high(d, oprsz, desc);
|
||||
}
|
||||
|
||||
void HELPER(gvec_umin16)(void *d, void *a, void *b, uint32_t desc)
|
||||
{
|
||||
intptr_t oprsz = simd_oprsz(desc);
|
||||
intptr_t i;
|
||||
|
||||
for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
|
||||
uint16_t aa = *(uint16_t *)(a + i);
|
||||
uint16_t bb = *(uint16_t *)(b + i);
|
||||
uint16_t dd = aa < bb ? aa : bb;
|
||||
*(uint16_t *)(d + i) = dd;
|
||||
}
|
||||
clear_high(d, oprsz, desc);
|
||||
}
|
||||
|
||||
void HELPER(gvec_umin32)(void *d, void *a, void *b, uint32_t desc)
|
||||
{
|
||||
intptr_t oprsz = simd_oprsz(desc);
|
||||
intptr_t i;
|
||||
|
||||
for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
|
||||
uint32_t aa = *(uint32_t *)(a + i);
|
||||
uint32_t bb = *(uint32_t *)(b + i);
|
||||
uint32_t dd = aa < bb ? aa : bb;
|
||||
*(uint32_t *)(d + i) = dd;
|
||||
}
|
||||
clear_high(d, oprsz, desc);
|
||||
}
|
||||
|
||||
void HELPER(gvec_umin64)(void *d, void *a, void *b, uint32_t desc)
|
||||
{
|
||||
intptr_t oprsz = simd_oprsz(desc);
|
||||
intptr_t i;
|
||||
|
||||
for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
|
||||
uint64_t aa = *(uint64_t *)(a + i);
|
||||
uint64_t bb = *(uint64_t *)(b + i);
|
||||
uint64_t dd = aa < bb ? aa : bb;
|
||||
*(uint64_t *)(d + i) = dd;
|
||||
}
|
||||
clear_high(d, oprsz, desc);
|
||||
}
|
||||
|
||||
void HELPER(gvec_umax8)(void *d, void *a, void *b, uint32_t desc)
|
||||
{
|
||||
intptr_t oprsz = simd_oprsz(desc);
|
||||
intptr_t i;
|
||||
|
||||
for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
|
||||
uint8_t aa = *(uint8_t *)(a + i);
|
||||
uint8_t bb = *(uint8_t *)(b + i);
|
||||
uint8_t dd = aa > bb ? aa : bb;
|
||||
*(uint8_t *)(d + i) = dd;
|
||||
}
|
||||
clear_high(d, oprsz, desc);
|
||||
}
|
||||
|
||||
void HELPER(gvec_umax16)(void *d, void *a, void *b, uint32_t desc)
|
||||
{
|
||||
intptr_t oprsz = simd_oprsz(desc);
|
||||
intptr_t i;
|
||||
|
||||
for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
|
||||
uint16_t aa = *(uint16_t *)(a + i);
|
||||
uint16_t bb = *(uint16_t *)(b + i);
|
||||
uint16_t dd = aa > bb ? aa : bb;
|
||||
*(uint16_t *)(d + i) = dd;
|
||||
}
|
||||
clear_high(d, oprsz, desc);
|
||||
}
|
||||
|
||||
void HELPER(gvec_umax32)(void *d, void *a, void *b, uint32_t desc)
|
||||
{
|
||||
intptr_t oprsz = simd_oprsz(desc);
|
||||
intptr_t i;
|
||||
|
||||
for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
|
||||
uint32_t aa = *(uint32_t *)(a + i);
|
||||
uint32_t bb = *(uint32_t *)(b + i);
|
||||
uint32_t dd = aa > bb ? aa : bb;
|
||||
*(uint32_t *)(d + i) = dd;
|
||||
}
|
||||
clear_high(d, oprsz, desc);
|
||||
}
|
||||
|
||||
void HELPER(gvec_umax64)(void *d, void *a, void *b, uint32_t desc)
|
||||
{
|
||||
intptr_t oprsz = simd_oprsz(desc);
|
||||
intptr_t i;
|
||||
|
||||
for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
|
||||
uint64_t aa = *(uint64_t *)(a + i);
|
||||
uint64_t bb = *(uint64_t *)(b + i);
|
||||
uint64_t dd = aa > bb ? aa : bb;
|
||||
*(uint64_t *)(d + i) = dd;
|
||||
}
|
||||
clear_high(d, oprsz, desc);
|
||||
}
|
||||
|
@ -200,6 +200,26 @@ DEF_HELPER_FLAGS_4(gvec_ussub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(gvec_ussub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(gvec_ussub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
|
||||
DEF_HELPER_FLAGS_4(gvec_smin8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(gvec_smin16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(gvec_smin32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(gvec_smin64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
|
||||
DEF_HELPER_FLAGS_4(gvec_smax8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(gvec_smax16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(gvec_smax32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(gvec_smax64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
|
||||
DEF_HELPER_FLAGS_4(gvec_umin8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(gvec_umin16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(gvec_umin32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(gvec_umin64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
|
||||
DEF_HELPER_FLAGS_4(gvec_umax8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(gvec_umax16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(gvec_umax32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(gvec_umax64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
|
||||
DEF_HELPER_FLAGS_3(gvec_neg8, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_3(gvec_neg16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_3(gvec_neg32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
|
||||
@ -211,6 +231,9 @@ DEF_HELPER_FLAGS_4(gvec_or, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(gvec_xor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(gvec_andc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(gvec_orc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(gvec_nand, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(gvec_nor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(gvec_eqv, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
|
||||
DEF_HELPER_FLAGS_4(gvec_ands, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
|
||||
DEF_HELPER_FLAGS_4(gvec_xors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
|
||||
|
@ -67,37 +67,23 @@ typedef uint64_t target_ulong;
|
||||
#define CPU_TLB_ENTRY_BITS 5
|
||||
#endif
|
||||
|
||||
/* TCG_TARGET_TLB_DISPLACEMENT_BITS is used in CPU_TLB_BITS to ensure that
|
||||
* the TLB is not unnecessarily small, but still small enough for the
|
||||
* TLB lookup instruction sequence used by the TCG target.
|
||||
*
|
||||
* TCG will have to generate an operand as large as the distance between
|
||||
* env and the tlb_table[NB_MMU_MODES - 1][0].addend. For simplicity,
|
||||
* the TCG targets just round everything up to the next power of two, and
|
||||
* count bits. This works because: 1) the size of each TLB is a largish
|
||||
* power of two, 2) and because the limit of the displacement is really close
|
||||
* to a power of two, 3) the offset of tlb_table[0][0] inside env is smaller
|
||||
* than the size of a TLB.
|
||||
*
|
||||
* For example, the maximum displacement 0xFFF0 on PPC and MIPS, but TCG
|
||||
* just says "the displacement is 16 bits". TCG_TARGET_TLB_DISPLACEMENT_BITS
|
||||
* then ensures that tlb_table at least 0x8000 bytes large ("not unnecessarily
|
||||
* small": 2^15). The operand then will come up smaller than 0xFFF0 without
|
||||
* any particular care, because the TLB for a single MMU mode is larger than
|
||||
* 0x10000-0xFFF0=16 bytes. In the end, the maximum value of the operand
|
||||
* could be something like 0xC000 (the offset of the last TLB table) plus
|
||||
* 0x18 (the offset of the addend field in each TLB entry) plus the offset
|
||||
* of tlb_table inside env (which is non-trivial but not huge).
|
||||
*/
|
||||
#define CPU_TLB_BITS \
|
||||
MIN(8, \
|
||||
TCG_TARGET_TLB_DISPLACEMENT_BITS - CPU_TLB_ENTRY_BITS - \
|
||||
(NB_MMU_MODES <= 1 ? 0 : \
|
||||
NB_MMU_MODES <= 2 ? 1 : \
|
||||
NB_MMU_MODES <= 4 ? 2 : \
|
||||
NB_MMU_MODES <= 8 ? 3 : 4))
|
||||
#define CPU_TLB_DYN_MIN_BITS 6
|
||||
#define CPU_TLB_DYN_DEFAULT_BITS 8
|
||||
|
||||
#define CPU_TLB_SIZE (1 << CPU_TLB_BITS)
|
||||
# if HOST_LONG_BITS == 32
|
||||
/* Make sure we do not require a double-word shift for the TLB load */
|
||||
# define CPU_TLB_DYN_MAX_BITS (32 - TARGET_PAGE_BITS)
|
||||
# else /* HOST_LONG_BITS == 64 */
|
||||
/*
|
||||
* Assuming TARGET_PAGE_BITS==12, with 2**22 entries we can cover 2**(22+12) ==
|
||||
* 2**34 == 16G of address space. This is roughly what one would expect a
|
||||
* TLB to cover in a modern (as of 2018) x86_64 CPU. For instance, Intel
|
||||
* Skylake's Level-2 STLB has 16 1G entries.
|
||||
* Also, make sure we do not size the TLB past the guest's address space.
|
||||
*/
|
||||
# define CPU_TLB_DYN_MAX_BITS \
|
||||
MIN(22, TARGET_VIRT_ADDR_SPACE_BITS - TARGET_PAGE_BITS)
|
||||
# endif
|
||||
|
||||
typedef struct CPUTLBEntry {
|
||||
/* bit TARGET_LONG_BITS to TARGET_PAGE_BITS : virtual address
|
||||
@ -141,6 +127,18 @@ typedef struct CPUIOTLBEntry {
|
||||
MemTxAttrs attrs;
|
||||
} CPUIOTLBEntry;
|
||||
|
||||
/**
|
||||
* struct CPUTLBWindow
|
||||
* @begin_ns: host time (in ns) at the beginning of the time window
|
||||
* @max_entries: maximum number of entries observed in the window
|
||||
*
|
||||
* See also: tlb_mmu_resize_locked()
|
||||
*/
|
||||
typedef struct CPUTLBWindow {
|
||||
int64_t begin_ns;
|
||||
size_t max_entries;
|
||||
} CPUTLBWindow;
|
||||
|
||||
typedef struct CPUTLBDesc {
|
||||
/*
|
||||
* Describe a region covering all of the large pages allocated
|
||||
@ -152,6 +150,8 @@ typedef struct CPUTLBDesc {
|
||||
target_ulong large_page_mask;
|
||||
/* The next index to use in the tlb victim table. */
|
||||
size_t vindex;
|
||||
CPUTLBWindow window;
|
||||
size_t n_used_entries;
|
||||
} CPUTLBDesc;
|
||||
|
||||
/*
|
||||
@ -176,6 +176,13 @@ typedef struct CPUTLBCommon {
|
||||
size_t elide_flush_count;
|
||||
} CPUTLBCommon;
|
||||
|
||||
# define CPU_TLB \
|
||||
/* tlb_mask[i] contains (n_entries - 1) << CPU_TLB_ENTRY_BITS */ \
|
||||
uintptr_t tlb_mask[NB_MMU_MODES]; \
|
||||
CPUTLBEntry *tlb_table[NB_MMU_MODES];
|
||||
# define CPU_IOTLB \
|
||||
CPUIOTLBEntry *iotlb[NB_MMU_MODES];
|
||||
|
||||
/*
|
||||
* The meaning of each of the MMU modes is defined in the target code.
|
||||
* Note that NB_MMU_MODES is not yet defined; we can only reference it
|
||||
@ -184,9 +191,9 @@ typedef struct CPUTLBCommon {
|
||||
#define CPU_COMMON_TLB \
|
||||
CPUTLBCommon tlb_c; \
|
||||
CPUTLBDesc tlb_d[NB_MMU_MODES]; \
|
||||
CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE]; \
|
||||
CPU_TLB \
|
||||
CPUTLBEntry tlb_v_table[NB_MMU_MODES][CPU_VTLB_SIZE]; \
|
||||
CPUIOTLBEntry iotlb[NB_MMU_MODES][CPU_TLB_SIZE]; \
|
||||
CPU_IOTLB \
|
||||
CPUIOTLBEntry iotlb_v[NB_MMU_MODES][CPU_VTLB_SIZE];
|
||||
|
||||
#else
|
||||
|
@@ -139,7 +139,14 @@ static inline target_ulong tlb_addr_write(const CPUTLBEntry *entry)
static inline uintptr_t tlb_index(CPUArchState *env, uintptr_t mmu_idx,
                                  target_ulong addr)
{
    return (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
    uintptr_t size_mask = env->tlb_mask[mmu_idx] >> CPU_TLB_ENTRY_BITS;

    return (addr >> TARGET_PAGE_BITS) & size_mask;
}

static inline size_t tlb_n_entries(CPUArchState *env, uintptr_t mmu_idx)
{
    return (env->tlb_mask[mmu_idx] >> CPU_TLB_ENTRY_BITS) + 1;
}

/* Find the TLB entry corresponding to the mmu_idx + address pair. */
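The single tlb_mask value serves two purposes: shifted right by CPU_TLB_ENTRY_BITS it is the index mask used by tlb_index() and tlb_n_entries() above, while the TCG fast paths (see the i386 changes below) AND it directly against addr >> (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS) to obtain a ready-made byte offset into tlb_table. A small standalone check of that equivalence, assuming CPU_TLB_ENTRY_BITS == 5 and TARGET_PAGE_BITS == 12 (illustrative only, not QEMU code):

#include <assert.h>
#include <stdint.h>

enum { TLB_ENTRY_BITS = 5, PAGE_BITS = 12 };

int main(void)
{
    uintptr_t n_entries = 1u << 8;                          /* 256-entry TLB */
    uintptr_t tlb_mask = (n_entries - 1) << TLB_ENTRY_BITS; /* 0x1fe0 */
    uint64_t addr = 0x7f1234567000ull;

    /* Index as computed by tlb_index(). */
    uintptr_t index = (addr >> PAGE_BITS) & (tlb_mask >> TLB_ENTRY_BITS);

    /* Byte offset as computed by the generated fast path. */
    uintptr_t offset = (addr >> (PAGE_BITS - TLB_ENTRY_BITS)) & tlb_mask;

    assert(offset == index << TLB_ENTRY_BITS);
    return 0;
}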
tcg/README (19 lines changed)
@@ -554,6 +554,25 @@ E.g. VECL=1 -> 64 << 1 -> v128, and VECE=2 -> 1 << 2 -> i32.

  Similarly, v0 = -v1.

* smin_vec:
* umin_vec:

  Similarly, v0 = MIN(v1, v2), for signed and unsigned element types.

* smax_vec:
* umax_vec:

  Similarly, v0 = MAX(v1, v2), for signed and unsigned element types.

* ssadd_vec:
* sssub_vec:
* usadd_vec:
* ussub_vec:

  Signed and unsigned saturating addition and subtraction.  If the true
  result is not representable within the element type, the element is
  set to the minimum or maximum value for the type.

* and_vec v0, v1, v2
* or_vec v0, v1, v2
* xor_vec v0, v1, v2
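As a scalar reference for the saturating opcodes documented above, signed saturating addition at element size MO_8 behaves like this minimal C model (illustrative only, not QEMU code):

#include <stdint.h>

/* Reference model for ssadd_vec on one 8-bit element. */
static int8_t ssadd8(int8_t a, int8_t b)
{
    int16_t sum = (int16_t)a + (int16_t)b;

    if (sum > INT8_MAX) {
        return INT8_MAX;   /* clamp to the type's maximum */
    }
    if (sum < INT8_MIN) {
        return INT8_MIN;   /* clamp to the type's minimum */
    }
    return (int8_t)sum;
}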
@ -135,6 +135,8 @@ typedef enum {
|
||||
#define TCG_TARGET_HAS_shv_vec 0
|
||||
#define TCG_TARGET_HAS_cmp_vec 1
|
||||
#define TCG_TARGET_HAS_mul_vec 1
|
||||
#define TCG_TARGET_HAS_sat_vec 1
|
||||
#define TCG_TARGET_HAS_minmax_vec 1
|
||||
|
||||
#define TCG_TARGET_DEFAULT_MO (0)
|
||||
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
|
||||
|
@ -498,6 +498,9 @@ typedef enum {
|
||||
I3510_EON = 0x4a200000,
|
||||
I3510_ANDS = 0x6a000000,
|
||||
|
||||
/* Logical shifted register instructions (with a shift). */
|
||||
I3502S_AND_LSR = I3510_AND | (1 << 22),
|
||||
|
||||
/* AdvSIMD copy */
|
||||
I3605_DUP = 0x0e000400,
|
||||
I3605_INS = 0x4e001c00,
|
||||
@ -528,6 +531,14 @@ typedef enum {
|
||||
I3616_CMHI = 0x2e203400,
|
||||
I3616_CMHS = 0x2e203c00,
|
||||
I3616_CMEQ = 0x2e208c00,
|
||||
I3616_SMAX = 0x0e206400,
|
||||
I3616_SMIN = 0x0e206c00,
|
||||
I3616_SQADD = 0x0e200c00,
|
||||
I3616_SQSUB = 0x0e202c00,
|
||||
I3616_UMAX = 0x2e206400,
|
||||
I3616_UMIN = 0x2e206c00,
|
||||
I3616_UQADD = 0x2e200c00,
|
||||
I3616_UQSUB = 0x2e202c00,
|
||||
|
||||
/* AdvSIMD two-reg misc. */
|
||||
I3617_CMGT0 = 0x0e208800,
|
||||
@ -1440,6 +1451,14 @@ static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
|
||||
label->label_ptr[0] = label_ptr;
|
||||
}
|
||||
|
||||
/* We expect tlb_mask to be before tlb_table. */
|
||||
QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table) <
|
||||
offsetof(CPUArchState, tlb_mask));
|
||||
|
||||
/* We expect to use a 24-bit unsigned offset from ENV. */
|
||||
QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table[NB_MMU_MODES - 1])
|
||||
> 0xffffff);
|
||||
|
||||
/* Load and compare a TLB entry, emitting the conditional jump to the
|
||||
slow path for the failure case, which will be patched later when finalizing
|
||||
the slow path. Generated code returns the host addend in X1,
|
||||
@ -1448,15 +1467,55 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, TCGMemOp opc,
|
||||
tcg_insn_unit **label_ptr, int mem_index,
|
||||
bool is_read)
|
||||
{
|
||||
int tlb_offset = is_read ?
|
||||
offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
|
||||
: offsetof(CPUArchState, tlb_table[mem_index][0].addr_write);
|
||||
int mask_ofs = offsetof(CPUArchState, tlb_mask[mem_index]);
|
||||
int table_ofs = offsetof(CPUArchState, tlb_table[mem_index]);
|
||||
unsigned a_bits = get_alignment_bits(opc);
|
||||
unsigned s_bits = opc & MO_SIZE;
|
||||
unsigned a_mask = (1u << a_bits) - 1;
|
||||
unsigned s_mask = (1u << s_bits) - 1;
|
||||
TCGReg base = TCG_AREG0, x3;
|
||||
uint64_t tlb_mask;
|
||||
TCGReg mask_base = TCG_AREG0, table_base = TCG_AREG0, x3;
|
||||
TCGType mask_type;
|
||||
uint64_t compare_mask;
|
||||
|
||||
if (table_ofs > 0xfff) {
|
||||
int table_hi = table_ofs & ~0xfff;
|
||||
int mask_hi = mask_ofs & ~0xfff;
|
||||
|
||||
table_base = TCG_REG_X1;
|
||||
if (mask_hi == table_hi) {
|
||||
mask_base = table_base;
|
||||
} else if (mask_hi) {
|
||||
mask_base = TCG_REG_X0;
|
||||
tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64,
|
||||
mask_base, TCG_AREG0, mask_hi);
|
||||
}
|
||||
tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64,
|
||||
table_base, TCG_AREG0, table_hi);
|
||||
mask_ofs -= mask_hi;
|
||||
table_ofs -= table_hi;
|
||||
}
|
||||
|
||||
mask_type = (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32
|
||||
? TCG_TYPE_I64 : TCG_TYPE_I32);
|
||||
|
||||
/* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx]. */
|
||||
tcg_out_ld(s, mask_type, TCG_REG_X0, mask_base, mask_ofs);
|
||||
tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_X1, table_base, table_ofs);
|
||||
|
||||
/* Extract the TLB index from the address into X0. */
|
||||
tcg_out_insn(s, 3502S, AND_LSR, mask_type == TCG_TYPE_I64,
|
||||
TCG_REG_X0, TCG_REG_X0, addr_reg,
|
||||
TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
|
||||
|
||||
/* Add the tlb_table pointer, creating the CPUTLBEntry address into X1. */
|
||||
tcg_out_insn(s, 3502, ADD, 1, TCG_REG_X1, TCG_REG_X1, TCG_REG_X0);
|
||||
|
||||
/* Load the tlb comparator into X0, and the fast path addend into X1. */
|
||||
tcg_out_ld(s, TCG_TYPE_TL, TCG_REG_X0, TCG_REG_X1, is_read
|
||||
? offsetof(CPUTLBEntry, addr_read)
|
||||
: offsetof(CPUTLBEntry, addr_write));
|
||||
tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_X1, TCG_REG_X1,
|
||||
offsetof(CPUTLBEntry, addend));
|
||||
|
||||
/* For aligned accesses, we check the first byte and include the alignment
|
||||
bits within the address. For unaligned access, we check that we don't
|
||||
@ -1468,47 +1527,14 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, TCGMemOp opc,
|
||||
TCG_REG_X3, addr_reg, s_mask - a_mask);
|
||||
x3 = TCG_REG_X3;
|
||||
}
|
||||
tlb_mask = (uint64_t)TARGET_PAGE_MASK | a_mask;
|
||||
|
||||
/* Extract the TLB index from the address into X0.
|
||||
X0<CPU_TLB_BITS:0> =
|
||||
addr_reg<TARGET_PAGE_BITS+CPU_TLB_BITS:TARGET_PAGE_BITS> */
|
||||
tcg_out_ubfm(s, TARGET_LONG_BITS == 64, TCG_REG_X0, addr_reg,
|
||||
TARGET_PAGE_BITS, TARGET_PAGE_BITS + CPU_TLB_BITS);
|
||||
compare_mask = (uint64_t)TARGET_PAGE_MASK | a_mask;
|
||||
|
||||
/* Store the page mask part of the address into X3. */
|
||||
tcg_out_logicali(s, I3404_ANDI, TARGET_LONG_BITS == 64,
|
||||
TCG_REG_X3, x3, tlb_mask);
|
||||
|
||||
/* Add any "high bits" from the tlb offset to the env address into X2,
|
||||
to take advantage of the LSL12 form of the ADDI instruction.
|
||||
X2 = env + (tlb_offset & 0xfff000) */
|
||||
if (tlb_offset & 0xfff000) {
|
||||
tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_X2, base,
|
||||
tlb_offset & 0xfff000);
|
||||
base = TCG_REG_X2;
|
||||
}
|
||||
|
||||
/* Merge the tlb index contribution into X2.
|
||||
X2 = X2 + (X0 << CPU_TLB_ENTRY_BITS) */
|
||||
tcg_out_insn(s, 3502S, ADD_LSL, TCG_TYPE_I64, TCG_REG_X2, base,
|
||||
TCG_REG_X0, CPU_TLB_ENTRY_BITS);
|
||||
|
||||
/* Merge "low bits" from tlb offset, load the tlb comparator into X0.
|
||||
X0 = load [X2 + (tlb_offset & 0x000fff)] */
|
||||
tcg_out_ldst(s, TARGET_LONG_BITS == 32 ? I3312_LDRW : I3312_LDRX,
|
||||
TCG_REG_X0, TCG_REG_X2, tlb_offset & 0xfff,
|
||||
TARGET_LONG_BITS == 32 ? 2 : 3);
|
||||
|
||||
/* Load the tlb addend. Do that early to avoid stalling.
|
||||
X1 = load [X2 + (tlb_offset & 0xfff) + offsetof(addend)] */
|
||||
tcg_out_ldst(s, I3312_LDRX, TCG_REG_X1, TCG_REG_X2,
|
||||
(tlb_offset & 0xfff) + (offsetof(CPUTLBEntry, addend)) -
|
||||
(is_read ? offsetof(CPUTLBEntry, addr_read)
|
||||
: offsetof(CPUTLBEntry, addr_write)), 3);
|
||||
TCG_REG_X3, x3, compare_mask);
|
||||
|
||||
/* Perform the address comparison. */
|
||||
tcg_out_cmp(s, (TARGET_LONG_BITS == 64), TCG_REG_X0, TCG_REG_X3, 0);
|
||||
tcg_out_cmp(s, TARGET_LONG_BITS == 64, TCG_REG_X0, TCG_REG_X3, 0);
|
||||
|
||||
/* If not equal, we jump to the slow path. */
|
||||
*label_ptr = s->code_ptr;
|
||||
@ -2137,6 +2163,30 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
|
||||
case INDEX_op_orc_vec:
|
||||
tcg_out_insn(s, 3616, ORN, is_q, 0, a0, a1, a2);
|
||||
break;
|
||||
case INDEX_op_ssadd_vec:
|
||||
tcg_out_insn(s, 3616, SQADD, is_q, vece, a0, a1, a2);
|
||||
break;
|
||||
case INDEX_op_sssub_vec:
|
||||
tcg_out_insn(s, 3616, SQSUB, is_q, vece, a0, a1, a2);
|
||||
break;
|
||||
case INDEX_op_usadd_vec:
|
||||
tcg_out_insn(s, 3616, UQADD, is_q, vece, a0, a1, a2);
|
||||
break;
|
||||
case INDEX_op_ussub_vec:
|
||||
tcg_out_insn(s, 3616, UQSUB, is_q, vece, a0, a1, a2);
|
||||
break;
|
||||
case INDEX_op_smax_vec:
|
||||
tcg_out_insn(s, 3616, SMAX, is_q, vece, a0, a1, a2);
|
||||
break;
|
||||
case INDEX_op_smin_vec:
|
||||
tcg_out_insn(s, 3616, SMIN, is_q, vece, a0, a1, a2);
|
||||
break;
|
||||
case INDEX_op_umax_vec:
|
||||
tcg_out_insn(s, 3616, UMAX, is_q, vece, a0, a1, a2);
|
||||
break;
|
||||
case INDEX_op_umin_vec:
|
||||
tcg_out_insn(s, 3616, UMIN, is_q, vece, a0, a1, a2);
|
||||
break;
|
||||
case INDEX_op_not_vec:
|
||||
tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a1);
|
||||
break;
|
||||
@ -2207,6 +2257,14 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
|
||||
case INDEX_op_shli_vec:
|
||||
case INDEX_op_shri_vec:
|
||||
case INDEX_op_sari_vec:
|
||||
case INDEX_op_ssadd_vec:
|
||||
case INDEX_op_sssub_vec:
|
||||
case INDEX_op_usadd_vec:
|
||||
case INDEX_op_ussub_vec:
|
||||
case INDEX_op_smax_vec:
|
||||
case INDEX_op_smin_vec:
|
||||
case INDEX_op_umax_vec:
|
||||
case INDEX_op_umin_vec:
|
||||
return 1;
|
||||
case INDEX_op_mul_vec:
|
||||
return vece < MO_64;
|
||||
@ -2386,6 +2444,14 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
|
||||
case INDEX_op_xor_vec:
|
||||
case INDEX_op_andc_vec:
|
||||
case INDEX_op_orc_vec:
|
||||
case INDEX_op_ssadd_vec:
|
||||
case INDEX_op_sssub_vec:
|
||||
case INDEX_op_usadd_vec:
|
||||
case INDEX_op_ussub_vec:
|
||||
case INDEX_op_smax_vec:
|
||||
case INDEX_op_smin_vec:
|
||||
case INDEX_op_umax_vec:
|
||||
case INDEX_op_umin_vec:
|
||||
return &w_w_w;
|
||||
case INDEX_op_not_vec:
|
||||
case INDEX_op_neg_vec:
|
||||
|
@ -500,6 +500,12 @@ static inline void tcg_out_ldrd_r(TCGContext *s, int cond, TCGReg rt,
|
||||
tcg_out_memop_r(s, cond, INSN_LDRD_REG, rt, rn, rm, 1, 1, 0);
|
||||
}
|
||||
|
||||
static inline void tcg_out_ldrd_rwb(TCGContext *s, int cond, TCGReg rt,
|
||||
TCGReg rn, TCGReg rm)
|
||||
{
|
||||
tcg_out_memop_r(s, cond, INSN_LDRD_REG, rt, rn, rm, 1, 1, 1);
|
||||
}
|
||||
|
||||
static inline void tcg_out_strd_8(TCGContext *s, int cond, TCGReg rt,
|
||||
TCGReg rn, int imm8)
|
||||
{
|
||||
@ -1229,8 +1235,13 @@ static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg argreg,
|
||||
|
||||
#define TLB_SHIFT (CPU_TLB_ENTRY_BITS + CPU_TLB_BITS)
|
||||
|
||||
/* We're expecting to use an 8-bit immediate and to mask. */
|
||||
QEMU_BUILD_BUG_ON(CPU_TLB_BITS > 8);
|
||||
/* We expect tlb_mask to be before tlb_table. */
|
||||
QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table) <
|
||||
offsetof(CPUArchState, tlb_mask));
|
||||
|
||||
/* We expect to use a 20-bit unsigned offset from ENV. */
|
||||
QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table[NB_MMU_MODES - 1])
|
||||
> 0xfffff);
|
||||
|
||||
/* Load and compare a TLB entry, leaving the flags set. Returns the register
|
||||
containing the addend of the tlb entry. Clobbers R0, R1, R2, TMP. */
|
||||
@ -1238,84 +1249,72 @@ QEMU_BUILD_BUG_ON(CPU_TLB_BITS > 8);
|
||||
static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
|
||||
TCGMemOp opc, int mem_index, bool is_load)
|
||||
{
|
||||
TCGReg base = TCG_AREG0;
|
||||
int cmp_off =
|
||||
(is_load
|
||||
? offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
|
||||
: offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
|
||||
int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
|
||||
int mask_off;
|
||||
int cmp_off = (is_load ? offsetof(CPUTLBEntry, addr_read)
|
||||
: offsetof(CPUTLBEntry, addr_write));
|
||||
int mask_off = offsetof(CPUArchState, tlb_mask[mem_index]);
|
||||
int table_off = offsetof(CPUArchState, tlb_table[mem_index]);
|
||||
TCGReg mask_base = TCG_AREG0, table_base = TCG_AREG0;
|
||||
unsigned s_bits = opc & MO_SIZE;
|
||||
unsigned a_bits = get_alignment_bits(opc);
|
||||
|
||||
/* V7 generates the following:
|
||||
* ubfx r0, addrlo, #TARGET_PAGE_BITS, #CPU_TLB_BITS
|
||||
* add r2, env, #high
|
||||
* add r2, r2, r0, lsl #CPU_TLB_ENTRY_BITS
|
||||
* ldr r0, [r2, #cmp]
|
||||
* ldr r2, [r2, #add]
|
||||
* movw tmp, #page_align_mask
|
||||
* bic tmp, addrlo, tmp
|
||||
* cmp r0, tmp
|
||||
*
|
||||
* Otherwise we generate:
|
||||
* shr tmp, addrlo, #TARGET_PAGE_BITS
|
||||
* add r2, env, #high
|
||||
* and r0, tmp, #(CPU_TLB_SIZE - 1)
|
||||
* add r2, r2, r0, lsl #CPU_TLB_ENTRY_BITS
|
||||
* ldr r0, [r2, #cmp]
|
||||
* ldr r2, [r2, #add]
|
||||
* tst addrlo, #s_mask
|
||||
* cmpeq r0, tmp, lsl #TARGET_PAGE_BITS
|
||||
*/
|
||||
if (use_armv7_instructions) {
|
||||
tcg_out_extract(s, COND_AL, TCG_REG_R0, addrlo,
|
||||
TARGET_PAGE_BITS, CPU_TLB_BITS);
|
||||
} else {
|
||||
tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP,
|
||||
0, addrlo, SHIFT_IMM_LSR(TARGET_PAGE_BITS));
|
||||
}
|
||||
if (table_off > 0xfff) {
|
||||
int mask_hi = mask_off & ~0xfff;
|
||||
int table_hi = table_off & ~0xfff;
|
||||
int rot;
|
||||
|
||||
/* Add portions of the offset until the memory access is in range.
|
||||
* If we plan on using ldrd, reduce to an 8-bit offset; otherwise
|
||||
* we can use a 12-bit offset. */
|
||||
if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
|
||||
mask_off = 0xff;
|
||||
} else {
|
||||
mask_off = 0xfff;
|
||||
}
|
||||
while (cmp_off > mask_off) {
|
||||
int shift = ctz32(cmp_off & ~mask_off) & ~1;
|
||||
int rot = ((32 - shift) << 7) & 0xf00;
|
||||
int addend = cmp_off & (0xff << shift);
|
||||
tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R2, base,
|
||||
rot | ((cmp_off >> shift) & 0xff));
|
||||
base = TCG_REG_R2;
|
||||
add_off -= addend;
|
||||
cmp_off -= addend;
|
||||
}
|
||||
|
||||
if (!use_armv7_instructions) {
|
||||
tcg_out_dat_imm(s, COND_AL, ARITH_AND,
|
||||
TCG_REG_R0, TCG_REG_TMP, CPU_TLB_SIZE - 1);
|
||||
}
|
||||
tcg_out_dat_reg(s, COND_AL, ARITH_ADD, TCG_REG_R2, base,
|
||||
TCG_REG_R0, SHIFT_IMM_LSL(CPU_TLB_ENTRY_BITS));
|
||||
|
||||
/* Load the tlb comparator. Use ldrd if needed and available,
|
||||
but due to how the pointer needs setting up, ldm isn't useful.
|
||||
Base arm5 doesn't have ldrd, but armv5te does. */
|
||||
if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
|
||||
tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off);
|
||||
} else {
|
||||
tcg_out_ld32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off);
|
||||
if (TARGET_LONG_BITS == 64) {
|
||||
tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2, cmp_off + 4);
|
||||
table_base = TCG_REG_R2;
|
||||
if (mask_hi == table_hi) {
|
||||
mask_base = table_base;
|
||||
} else if (mask_hi) {
|
||||
mask_base = TCG_REG_TMP;
|
||||
rot = encode_imm(mask_hi);
|
||||
assert(rot >= 0);
|
||||
tcg_out_dat_imm(s, COND_AL, ARITH_ADD, mask_base, TCG_AREG0,
|
||||
rotl(mask_hi, rot) | (rot << 7));
|
||||
}
|
||||
rot = encode_imm(table_hi);
|
||||
assert(rot >= 0);
|
||||
tcg_out_dat_imm(s, COND_AL, ARITH_ADD, table_base, TCG_AREG0,
|
||||
rotl(table_hi, rot) | (rot << 7));
|
||||
|
||||
mask_off -= mask_hi;
|
||||
table_off -= table_hi;
|
||||
}
|
||||
|
||||
/* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx]. */
|
||||
tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP, mask_base, mask_off);
|
||||
tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R2, table_base, table_off);
|
||||
|
||||
/* Extract the tlb index from the address into TMP. */
|
||||
tcg_out_dat_reg(s, COND_AL, ARITH_AND, TCG_REG_TMP, TCG_REG_TMP, addrlo,
|
||||
SHIFT_IMM_LSR(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS));
|
||||
|
||||
/*
|
||||
* Add the tlb_table pointer, creating the CPUTLBEntry address in R2.
|
||||
* Load the tlb comparator into R0/R1 and the fast path addend into R2.
|
||||
*/
|
||||
if (cmp_off == 0) {
|
||||
if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
|
||||
tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R2, TCG_REG_TMP);
|
||||
} else {
|
||||
tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R2, TCG_REG_TMP);
|
||||
}
|
||||
} else {
|
||||
tcg_out_dat_reg(s, COND_AL, ARITH_ADD,
|
||||
TCG_REG_R2, TCG_REG_R2, TCG_REG_TMP, 0);
|
||||
if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
|
||||
tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off);
|
||||
} else {
|
||||
tcg_out_ld32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off);
|
||||
}
|
||||
}
|
||||
if (!use_armv6_instructions && TARGET_LONG_BITS == 64) {
|
||||
tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2, cmp_off + 4);
|
||||
}
|
||||
|
||||
/* Load the tlb addend. */
|
||||
tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R2, add_off);
|
||||
tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R2,
|
||||
offsetof(CPUTLBEntry, addend));
|
||||
|
||||
/* Check alignment. We don't support inline unaligned acceses,
|
||||
but we can easily support overalignment checks. */
|
||||
|
@ -185,6 +185,8 @@ extern bool have_avx2;
|
||||
#define TCG_TARGET_HAS_shv_vec 0
|
||||
#define TCG_TARGET_HAS_cmp_vec 1
|
||||
#define TCG_TARGET_HAS_mul_vec 1
|
||||
#define TCG_TARGET_HAS_sat_vec 1
|
||||
#define TCG_TARGET_HAS_minmax_vec 1
|
||||
|
||||
#define TCG_TARGET_deposit_i32_valid(ofs, len) \
|
||||
(((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
|
||||
|
@ -329,6 +329,7 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
|
||||
#define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */
|
||||
#define OPC_ANDN (0xf2 | P_EXT38)
|
||||
#define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3))
|
||||
#define OPC_AND_GvEv (OPC_ARITH_GvEv | (ARITH_AND << 3))
|
||||
#define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16)
|
||||
#define OPC_BSF (0xbc | P_EXT)
|
||||
#define OPC_BSR (0xbd | P_EXT)
|
||||
@ -377,6 +378,10 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
|
||||
#define OPC_PADDW (0xfd | P_EXT | P_DATA16)
|
||||
#define OPC_PADDD (0xfe | P_EXT | P_DATA16)
|
||||
#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16)
|
||||
#define OPC_PADDSB (0xec | P_EXT | P_DATA16)
|
||||
#define OPC_PADDSW (0xed | P_EXT | P_DATA16)
|
||||
#define OPC_PADDUB (0xdc | P_EXT | P_DATA16)
|
||||
#define OPC_PADDUW (0xdd | P_EXT | P_DATA16)
|
||||
#define OPC_PAND (0xdb | P_EXT | P_DATA16)
|
||||
#define OPC_PANDN (0xdf | P_EXT | P_DATA16)
|
||||
#define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16)
|
||||
@ -388,6 +393,18 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
|
||||
#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16)
|
||||
#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16)
|
||||
#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16)
|
||||
#define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16)
|
||||
#define OPC_PMAXSW (0xee | P_EXT | P_DATA16)
|
||||
#define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16)
|
||||
#define OPC_PMAXUB (0xde | P_EXT | P_DATA16)
|
||||
#define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16)
|
||||
#define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16)
|
||||
#define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16)
|
||||
#define OPC_PMINSW (0xea | P_EXT | P_DATA16)
|
||||
#define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16)
|
||||
#define OPC_PMINUB (0xda | P_EXT | P_DATA16)
|
||||
#define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16)
|
||||
#define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16)
|
||||
#define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16)
|
||||
#define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16)
|
||||
#define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16)
|
||||
@ -408,6 +425,10 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
|
||||
#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16)
|
||||
#define OPC_PSUBD (0xfa | P_EXT | P_DATA16)
|
||||
#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16)
|
||||
#define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16)
|
||||
#define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16)
|
||||
#define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16)
|
||||
#define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16)
|
||||
#define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16)
|
||||
#define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16)
|
||||
#define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16)
|
||||
@ -1621,7 +1642,7 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
|
||||
}
|
||||
if (TCG_TYPE_PTR == TCG_TYPE_I64) {
|
||||
hrexw = P_REXW;
|
||||
if (TARGET_PAGE_BITS + CPU_TLB_BITS > 32) {
|
||||
if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
|
||||
tlbtype = TCG_TYPE_I64;
|
||||
tlbrexw = P_REXW;
|
||||
}
|
||||
@ -1629,6 +1650,15 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
|
||||
}
|
||||
|
||||
tcg_out_mov(s, tlbtype, r0, addrlo);
|
||||
tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
|
||||
TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
|
||||
|
||||
tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
|
||||
offsetof(CPUArchState, tlb_mask[mem_index]));
|
||||
|
||||
tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
|
||||
offsetof(CPUArchState, tlb_table[mem_index]));
|
||||
|
||||
/* If the required alignment is at least as large as the access, simply
|
||||
copy the address and mask. For lesser alignments, check that we don't
|
||||
cross pages for the complete access. */
|
||||
@ -1638,20 +1668,10 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
|
||||
tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
|
||||
}
|
||||
tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
|
||||
|
||||
tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
|
||||
TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
|
||||
|
||||
tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
|
||||
tgen_arithi(s, ARITH_AND + tlbrexw, r0,
|
||||
(CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
|
||||
|
||||
tcg_out_modrm_sib_offset(s, OPC_LEA + hrexw, r0, TCG_AREG0, r0, 0,
|
||||
offsetof(CPUArchState, tlb_table[mem_index][0])
|
||||
+ which);
|
||||
|
||||
/* cmp 0(r0), r1 */
|
||||
tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, 0);
|
||||
tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);
|
||||
|
||||
/* Prepare for both the fast path add of the tlb addend, and the slow
|
||||
path function argument setup. */
|
||||
@ -1664,7 +1684,7 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
|
||||
|
||||
if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
|
||||
/* cmp 4(r0), addrhi */
|
||||
tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, 4);
|
||||
tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);
|
||||
|
||||
/* jne slow_path */
|
||||
tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
|
||||
@ -1676,7 +1696,7 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
|
||||
|
||||
/* add addend(r0), r1 */
|
||||
tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
|
||||
offsetof(CPUTLBEntry, addend) - which);
|
||||
offsetof(CPUTLBEntry, addend));
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2591,9 +2611,21 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
|
||||
static int const add_insn[4] = {
|
||||
OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
|
||||
};
|
||||
static int const ssadd_insn[4] = {
|
||||
OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
|
||||
};
|
||||
static int const usadd_insn[4] = {
|
||||
OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
|
||||
};
|
||||
static int const sub_insn[4] = {
|
||||
OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
|
||||
};
|
||||
static int const sssub_insn[4] = {
|
||||
OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
|
||||
};
|
||||
static int const ussub_insn[4] = {
|
||||
OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
|
||||
};
|
||||
static int const mul_insn[4] = {
|
||||
OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
|
||||
};
|
||||
@ -2618,6 +2650,18 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
|
||||
static int const packus_insn[4] = {
|
||||
OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
|
||||
};
|
||||
static int const smin_insn[4] = {
|
||||
OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
|
||||
};
|
||||
static int const smax_insn[4] = {
|
||||
OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
|
||||
};
|
||||
static int const umin_insn[4] = {
|
||||
OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
|
||||
};
|
||||
static int const umax_insn[4] = {
|
||||
OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
|
||||
};
|
||||
|
||||
TCGType type = vecl + TCG_TYPE_V64;
|
||||
int insn, sub;
|
||||
@ -2631,9 +2675,21 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
|
||||
case INDEX_op_add_vec:
|
||||
insn = add_insn[vece];
|
||||
goto gen_simd;
|
||||
case INDEX_op_ssadd_vec:
|
||||
insn = ssadd_insn[vece];
|
||||
goto gen_simd;
|
||||
case INDEX_op_usadd_vec:
|
||||
insn = usadd_insn[vece];
|
||||
goto gen_simd;
|
||||
case INDEX_op_sub_vec:
|
||||
insn = sub_insn[vece];
|
||||
goto gen_simd;
|
||||
case INDEX_op_sssub_vec:
|
||||
insn = sssub_insn[vece];
|
||||
goto gen_simd;
|
||||
case INDEX_op_ussub_vec:
|
||||
insn = ussub_insn[vece];
|
||||
goto gen_simd;
|
||||
case INDEX_op_mul_vec:
|
||||
insn = mul_insn[vece];
|
||||
goto gen_simd;
|
||||
@ -2646,6 +2702,18 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
|
||||
case INDEX_op_xor_vec:
|
||||
insn = OPC_PXOR;
|
||||
goto gen_simd;
|
||||
case INDEX_op_smin_vec:
|
||||
insn = smin_insn[vece];
|
||||
goto gen_simd;
|
||||
case INDEX_op_umin_vec:
|
||||
insn = umin_insn[vece];
|
||||
goto gen_simd;
|
||||
case INDEX_op_smax_vec:
|
||||
insn = smax_insn[vece];
|
||||
goto gen_simd;
|
||||
case INDEX_op_umax_vec:
|
||||
insn = umax_insn[vece];
|
||||
goto gen_simd;
|
||||
case INDEX_op_x86_punpckl_vec:
|
||||
insn = punpckl_insn[vece];
|
||||
goto gen_simd;
|
||||
@ -3007,6 +3075,14 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
|
||||
case INDEX_op_or_vec:
|
||||
case INDEX_op_xor_vec:
|
||||
case INDEX_op_andc_vec:
|
||||
case INDEX_op_ssadd_vec:
|
||||
case INDEX_op_usadd_vec:
|
||||
case INDEX_op_sssub_vec:
|
||||
case INDEX_op_ussub_vec:
|
||||
case INDEX_op_smin_vec:
|
||||
case INDEX_op_umin_vec:
|
||||
case INDEX_op_smax_vec:
|
||||
case INDEX_op_umax_vec:
|
||||
case INDEX_op_cmp_vec:
|
||||
case INDEX_op_x86_shufps_vec:
|
||||
case INDEX_op_x86_blend_vec:
|
||||
@ -3074,258 +3150,310 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
|
||||
}
|
||||
return 1;
|
||||
|
||||
case INDEX_op_ssadd_vec:
|
||||
case INDEX_op_usadd_vec:
|
||||
case INDEX_op_sssub_vec:
|
||||
case INDEX_op_ussub_vec:
|
||||
return vece <= MO_16;
|
||||
case INDEX_op_smin_vec:
|
||||
case INDEX_op_smax_vec:
|
||||
case INDEX_op_umin_vec:
|
||||
case INDEX_op_umax_vec:
|
||||
return vece <= MO_32 ? 1 : -1;
|
||||
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static void expand_vec_shi(TCGType type, unsigned vece, bool shr,
TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
TCGv_vec t1, t2;

tcg_debug_assert(vece == MO_8);

t1 = tcg_temp_new_vec(type);
t2 = tcg_temp_new_vec(type);

/* Unpack to W, shift, and repack. Tricky bits:
(1) Use punpck*bw x,x to produce DDCCBBAA,
i.e. duplicate in other half of the 16-bit lane.
(2) For right-shift, add 8 so that the high half of
the lane becomes zero. For left-shift, we must
shift up and down again.
(3) Step 2 leaves high half zero such that PACKUSWB
(pack with unsigned saturation) does not modify
the quantity. */
vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));

if (shr) {
tcg_gen_shri_vec(MO_16, t1, t1, imm + 8);
tcg_gen_shri_vec(MO_16, t2, t2, imm + 8);
} else {
tcg_gen_shli_vec(MO_16, t1, t1, imm + 8);
tcg_gen_shli_vec(MO_16, t2, t2, imm + 8);
tcg_gen_shri_vec(MO_16, t1, t1, 8);
tcg_gen_shri_vec(MO_16, t2, t2, 8);
}

vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
tcg_temp_free_vec(t1);
tcg_temp_free_vec(t2);
}

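/*
 * Illustrative sketch, not part of this patch: a scalar model of the
 * MO_8 shift expansion above.  Each byte is widened to a 16-bit lane
 * with itself duplicated in the high half (punpck*bw x,x), shifted at
 * 16-bit granularity, and narrowed back with an unsigned saturating
 * pack (packuswb); the saturation never triggers because the value
 * already fits in the low byte.  The helper name is made up.
 */
static uint8_t model_shi8(uint8_t b, unsigned imm, bool shr)
{
    uint16_t lane = (uint16_t)b << 8 | b;       /* DDCCBBAA duplication */

    if (shr) {
        lane >>= imm + 8;                       /* high half becomes zero */
    } else {
        lane = (uint16_t)(lane << (imm + 8));   /* shift up ... */
        lane >>= 8;                             /* ... and down again */
    }
    return lane;                                /* pack keeps the low byte */
}
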
static void expand_vec_sari(TCGType type, unsigned vece,
|
||||
TCGv_vec v0, TCGv_vec v1, TCGArg imm)
|
||||
{
|
||||
TCGv_vec t1, t2;
|
||||
|
||||
switch (vece) {
|
||||
case MO_8:
|
||||
/* Unpack to W, shift, and repack, as in expand_vec_shi. */
|
||||
t1 = tcg_temp_new_vec(type);
|
||||
t2 = tcg_temp_new_vec(type);
|
||||
vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
|
||||
tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
|
||||
vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
|
||||
tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
|
||||
tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
|
||||
tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
|
||||
vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
|
||||
tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
|
||||
tcg_temp_free_vec(t1);
|
||||
tcg_temp_free_vec(t2);
|
||||
break;
|
||||
|
||||
case MO_64:
|
||||
if (imm <= 32) {
|
||||
/* We can emulate a small sign extend by performing an arithmetic
|
||||
* 32-bit shift and overwriting the high half of a 64-bit logical
|
||||
* shift (note that the ISA says shift of 32 is valid).
|
||||
*/
|
||||
t1 = tcg_temp_new_vec(type);
|
||||
tcg_gen_sari_vec(MO_32, t1, v1, imm);
|
||||
tcg_gen_shri_vec(MO_64, v0, v1, imm);
|
||||
vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
|
||||
tcgv_vec_arg(v0), tcgv_vec_arg(v0),
|
||||
tcgv_vec_arg(t1), 0xaa);
|
||||
tcg_temp_free_vec(t1);
|
||||
} else {
|
||||
/* Otherwise we will need to use a compare vs 0 to produce
|
||||
* the sign-extend, shift and merge.
|
||||
*/
|
||||
t1 = tcg_const_zeros_vec(type);
|
||||
tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
|
||||
tcg_gen_shri_vec(MO_64, v0, v1, imm);
|
||||
tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
|
||||
tcg_gen_or_vec(MO_64, v0, v0, t1);
|
||||
tcg_temp_free_vec(t1);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
g_assert_not_reached();
|
||||
}
|
||||
}
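/*
 * Illustrative sketch, not part of this patch: a scalar model of the
 * imm > 32 branch of expand_vec_sari above.  The sign is materialized
 * with a compare against zero (0 > x is all-ones for negative x),
 * shifted into the vacated high bits, and merged with the logical
 * right shift.  The helper name is made up; it returns the bit pattern
 * of the arithmetic shift.
 */
static uint64_t model_sar64_large(uint64_t x, unsigned imm) /* 32 < imm < 64 */
{
    uint64_t lo = x >> imm;                          /* shri MO_64 */
    uint64_t sign = -(uint64_t)((int64_t)x < 0);     /* cmp GT 0, v1 */

    return lo | (sign << (64 - imm));                /* shli by 64-imm, or */
}
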
|
||||
|
||||
static void expand_vec_mul(TCGType type, unsigned vece,
|
||||
TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
|
||||
{
|
||||
TCGv_vec t1, t2, t3, t4;
|
||||
|
||||
tcg_debug_assert(vece == MO_8);
|
||||
|
||||
/*
|
||||
* Unpack v1 bytes to words, 0 | x.
|
||||
* Unpack v2 bytes to words, y | 0.
|
||||
* This leaves the 8-bit result, x * y, with 8 bits of right padding.
|
||||
* Shift logical right by 8 bits to clear the high 8 bytes before
|
||||
* using an unsigned saturated pack.
|
||||
*
|
||||
* The difference between the V64, V128 and V256 cases is merely how
|
||||
* we distribute the expansion between temporaries.
|
||||
*/
|
||||
switch (type) {
|
||||
case TCG_TYPE_V64:
|
||||
t1 = tcg_temp_new_vec(TCG_TYPE_V128);
|
||||
t2 = tcg_temp_new_vec(TCG_TYPE_V128);
|
||||
tcg_gen_dup16i_vec(t2, 0);
|
||||
vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
|
||||
tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t2));
|
||||
vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
|
||||
tcgv_vec_arg(t2), tcgv_vec_arg(t2), tcgv_vec_arg(v2));
|
||||
tcg_gen_mul_vec(MO_16, t1, t1, t2);
|
||||
tcg_gen_shri_vec(MO_16, t1, t1, 8);
|
||||
vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
|
||||
tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
|
||||
tcg_temp_free_vec(t1);
|
||||
tcg_temp_free_vec(t2);
|
||||
break;
|
||||
|
||||
case TCG_TYPE_V128:
|
||||
case TCG_TYPE_V256:
|
||||
t1 = tcg_temp_new_vec(type);
|
||||
t2 = tcg_temp_new_vec(type);
|
||||
t3 = tcg_temp_new_vec(type);
|
||||
t4 = tcg_temp_new_vec(type);
|
||||
tcg_gen_dup16i_vec(t4, 0);
|
||||
vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
|
||||
tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
|
||||
vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
|
||||
tcgv_vec_arg(t2), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
|
||||
vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
|
||||
tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
|
||||
vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
|
||||
tcgv_vec_arg(t4), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
|
||||
tcg_gen_mul_vec(MO_16, t1, t1, t2);
|
||||
tcg_gen_mul_vec(MO_16, t3, t3, t4);
|
||||
tcg_gen_shri_vec(MO_16, t1, t1, 8);
|
||||
tcg_gen_shri_vec(MO_16, t3, t3, 8);
|
||||
vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
|
||||
tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
|
||||
tcg_temp_free_vec(t1);
|
||||
tcg_temp_free_vec(t2);
|
||||
tcg_temp_free_vec(t3);
|
||||
tcg_temp_free_vec(t4);
|
||||
break;
|
||||
|
||||
default:
|
||||
g_assert_not_reached();
|
||||
}
|
||||
}
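/*
 * Illustrative sketch, not part of this patch: a scalar model of the
 * widening byte multiply above.  v1 is unpacked as 0|x and v2 as y|0,
 * so the low 8 bits of x*y land in the high byte of each 16-bit lane;
 * the logical right shift by 8 moves them back down before the
 * unsigned saturating pack.  The helper name is made up.
 */
static uint8_t model_mul8(uint8_t x, uint8_t y)
{
    uint16_t t = (uint16_t)(x * (y << 8));      /* (0|x) * (y|0), mod 2^16 */

    return t >> 8;                              /* drop the right padding */
}
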
|
||||
|
||||
static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
TCGv_vec v1, TCGv_vec v2, TCGCond cond)
{
enum {
NEED_SWAP = 1,
NEED_INV = 2,
NEED_BIAS = 4
};
static const uint8_t fixups[16] = {
[0 ... 15] = -1,
[TCG_COND_EQ] = 0,
[TCG_COND_NE] = NEED_INV,
[TCG_COND_GT] = 0,
[TCG_COND_LT] = NEED_SWAP,
[TCG_COND_LE] = NEED_INV,
[TCG_COND_GE] = NEED_SWAP | NEED_INV,
[TCG_COND_GTU] = NEED_BIAS,
[TCG_COND_LTU] = NEED_BIAS | NEED_SWAP,
[TCG_COND_LEU] = NEED_BIAS | NEED_INV,
[TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV,
};
TCGv_vec t1, t2;
uint8_t fixup;

fixup = fixups[cond & 15];
tcg_debug_assert(fixup != 0xff);

if (fixup & NEED_INV) {
cond = tcg_invert_cond(cond);
}
if (fixup & NEED_SWAP) {
t1 = v1, v1 = v2, v2 = t1;
cond = tcg_swap_cond(cond);
}

t1 = t2 = NULL;
if (fixup & NEED_BIAS) {
t1 = tcg_temp_new_vec(type);
t2 = tcg_temp_new_vec(type);
tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
tcg_gen_sub_vec(vece, t1, v1, t2);
tcg_gen_sub_vec(vece, t2, v2, t2);
v1 = t1;
v2 = t2;
cond = tcg_signed_cond(cond);
}

tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
/* Expand directly; do not recurse. */
vec_gen_4(INDEX_op_cmp_vec, type, vece,
tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);

if (t1) {
tcg_temp_free_vec(t1);
if (t2) {
tcg_temp_free_vec(t2);
}
}
if (fixup & NEED_INV) {
tcg_gen_not_vec(vece, v0, v0);
}
}

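/*
 * Illustrative sketch, not part of this patch: the NEED_BIAS fixup above
 * relies on the identity that subtracting the sign bit (modulo 2^n) from
 * both operands turns an unsigned comparison into a signed one.  A scalar
 * self-check over all 8-bit values, assuming the usual two's-complement
 * conversion; the helper name is made up.
 */
static bool check_bias_trick(void)
{
    for (unsigned a = 0; a < 256; a++) {
        for (unsigned b = 0; b < 256; b++) {
            int sa = (int8_t)(uint8_t)(a - 0x80);   /* biased operands */
            int sb = (int8_t)(uint8_t)(b - 0x80);

            if ((a > b) != (sa > sb)) {             /* GTU must match GT */
                return false;
            }
        }
    }
    return true;
}
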
static void expand_vec_minmax(TCGType type, unsigned vece,
TCGCond cond, bool min,
TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
{
TCGv_vec t1 = tcg_temp_new_vec(type);

tcg_debug_assert(vece == MO_64);

tcg_gen_cmp_vec(cond, vece, t1, v1, v2);
if (min) {
TCGv_vec t2;
t2 = v1, v1 = v2, v2 = t2;
}
vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
tcgv_vec_arg(v0), tcgv_vec_arg(v1),
tcgv_vec_arg(v2), tcgv_vec_arg(t1));
tcg_temp_free_vec(t1);
}

void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
|
||||
TCGArg a0, ...)
|
||||
{
|
||||
va_list va;
|
||||
TCGArg a1, a2;
|
||||
TCGv_vec v0, t1, t2, t3, t4;
|
||||
TCGArg a2;
|
||||
TCGv_vec v0, v1, v2;
|
||||
|
||||
va_start(va, a0);
|
||||
v0 = temp_tcgv_vec(arg_temp(a0));
|
||||
v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
|
||||
a2 = va_arg(va, TCGArg);
|
||||
|
||||
switch (opc) {
|
||||
case INDEX_op_shli_vec:
|
||||
case INDEX_op_shri_vec:
|
||||
tcg_debug_assert(vece == MO_8);
|
||||
a1 = va_arg(va, TCGArg);
|
||||
a2 = va_arg(va, TCGArg);
|
||||
/* Unpack to W, shift, and repack. Tricky bits:
|
||||
(1) Use punpck*bw x,x to produce DDCCBBAA,
|
||||
i.e. duplicate in other half of the 16-bit lane.
|
||||
(2) For right-shift, add 8 so that the high half of
|
||||
the lane becomes zero. For left-shift, we must
|
||||
shift up and down again.
|
||||
(3) Step 2 leaves high half zero such that PACKUSWB
|
||||
(pack with unsigned saturation) does not modify
|
||||
the quantity. */
|
||||
t1 = tcg_temp_new_vec(type);
|
||||
t2 = tcg_temp_new_vec(type);
|
||||
vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
|
||||
tcgv_vec_arg(t1), a1, a1);
|
||||
vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
|
||||
tcgv_vec_arg(t2), a1, a1);
|
||||
if (opc == INDEX_op_shri_vec) {
|
||||
vec_gen_3(INDEX_op_shri_vec, type, MO_16,
|
||||
tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
|
||||
vec_gen_3(INDEX_op_shri_vec, type, MO_16,
|
||||
tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
|
||||
} else {
|
||||
vec_gen_3(INDEX_op_shli_vec, type, MO_16,
|
||||
tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
|
||||
vec_gen_3(INDEX_op_shli_vec, type, MO_16,
|
||||
tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
|
||||
vec_gen_3(INDEX_op_shri_vec, type, MO_16,
|
||||
tcgv_vec_arg(t1), tcgv_vec_arg(t1), 8);
|
||||
vec_gen_3(INDEX_op_shri_vec, type, MO_16,
|
||||
tcgv_vec_arg(t2), tcgv_vec_arg(t2), 8);
|
||||
}
|
||||
vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
|
||||
a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
|
||||
tcg_temp_free_vec(t1);
|
||||
tcg_temp_free_vec(t2);
|
||||
expand_vec_shi(type, vece, opc == INDEX_op_shri_vec, v0, v1, a2);
|
||||
break;
|
||||
|
||||
case INDEX_op_sari_vec:
|
||||
a1 = va_arg(va, TCGArg);
|
||||
a2 = va_arg(va, TCGArg);
|
||||
if (vece == MO_8) {
|
||||
/* Unpack to W, shift, and repack, as above. */
|
||||
t1 = tcg_temp_new_vec(type);
|
||||
t2 = tcg_temp_new_vec(type);
|
||||
vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
|
||||
tcgv_vec_arg(t1), a1, a1);
|
||||
vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
|
||||
tcgv_vec_arg(t2), a1, a1);
|
||||
vec_gen_3(INDEX_op_sari_vec, type, MO_16,
|
||||
tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
|
||||
vec_gen_3(INDEX_op_sari_vec, type, MO_16,
|
||||
tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
|
||||
vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
|
||||
a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
|
||||
tcg_temp_free_vec(t1);
|
||||
tcg_temp_free_vec(t2);
|
||||
break;
|
||||
}
|
||||
tcg_debug_assert(vece == MO_64);
|
||||
/* MO_64: If the shift is <= 32, we can emulate the sign extend by
|
||||
performing an arithmetic 32-bit shift and overwriting the high
|
||||
half of the result (note that the ISA says shift of 32 is valid). */
|
||||
if (a2 <= 32) {
|
||||
t1 = tcg_temp_new_vec(type);
|
||||
vec_gen_3(INDEX_op_sari_vec, type, MO_32, tcgv_vec_arg(t1), a1, a2);
|
||||
vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
|
||||
vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
|
||||
a0, a0, tcgv_vec_arg(t1), 0xaa);
|
||||
tcg_temp_free_vec(t1);
|
||||
break;
|
||||
}
|
||||
/* Otherwise we will need to use a compare vs 0 to produce the
|
||||
sign-extend, shift and merge. */
|
||||
t1 = tcg_temp_new_vec(type);
|
||||
t2 = tcg_const_zeros_vec(type);
|
||||
vec_gen_4(INDEX_op_cmp_vec, type, MO_64,
|
||||
tcgv_vec_arg(t1), tcgv_vec_arg(t2), a1, TCG_COND_GT);
|
||||
tcg_temp_free_vec(t2);
|
||||
vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
|
||||
vec_gen_3(INDEX_op_shli_vec, type, MO_64,
|
||||
tcgv_vec_arg(t1), tcgv_vec_arg(t1), 64 - a2);
|
||||
vec_gen_3(INDEX_op_or_vec, type, MO_64, a0, a0, tcgv_vec_arg(t1));
|
||||
tcg_temp_free_vec(t1);
|
||||
expand_vec_sari(type, vece, v0, v1, a2);
|
||||
break;
|
||||
|
||||
case INDEX_op_mul_vec:
|
||||
tcg_debug_assert(vece == MO_8);
|
||||
a1 = va_arg(va, TCGArg);
|
||||
a2 = va_arg(va, TCGArg);
|
||||
switch (type) {
|
||||
case TCG_TYPE_V64:
|
||||
t1 = tcg_temp_new_vec(TCG_TYPE_V128);
|
||||
t2 = tcg_temp_new_vec(TCG_TYPE_V128);
|
||||
tcg_gen_dup16i_vec(t2, 0);
|
||||
vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
|
||||
tcgv_vec_arg(t1), a1, tcgv_vec_arg(t2));
|
||||
vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
|
||||
tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2);
|
||||
tcg_gen_mul_vec(MO_16, t1, t1, t2);
|
||||
tcg_gen_shri_vec(MO_16, t1, t1, 8);
|
||||
vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
|
||||
a0, tcgv_vec_arg(t1), tcgv_vec_arg(t1));
|
||||
tcg_temp_free_vec(t1);
|
||||
tcg_temp_free_vec(t2);
|
||||
break;
|
||||
|
||||
case TCG_TYPE_V128:
|
||||
t1 = tcg_temp_new_vec(TCG_TYPE_V128);
|
||||
t2 = tcg_temp_new_vec(TCG_TYPE_V128);
|
||||
t3 = tcg_temp_new_vec(TCG_TYPE_V128);
|
||||
t4 = tcg_temp_new_vec(TCG_TYPE_V128);
|
||||
tcg_gen_dup16i_vec(t4, 0);
|
||||
vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
|
||||
tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
|
||||
vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
|
||||
tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
|
||||
vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
|
||||
tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
|
||||
vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
|
||||
tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
|
||||
tcg_gen_mul_vec(MO_16, t1, t1, t2);
|
||||
tcg_gen_mul_vec(MO_16, t3, t3, t4);
|
||||
tcg_gen_shri_vec(MO_16, t1, t1, 8);
|
||||
tcg_gen_shri_vec(MO_16, t3, t3, 8);
|
||||
vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
|
||||
a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
|
||||
tcg_temp_free_vec(t1);
|
||||
tcg_temp_free_vec(t2);
|
||||
tcg_temp_free_vec(t3);
|
||||
tcg_temp_free_vec(t4);
|
||||
break;
|
||||
|
||||
case TCG_TYPE_V256:
|
||||
t1 = tcg_temp_new_vec(TCG_TYPE_V256);
|
||||
t2 = tcg_temp_new_vec(TCG_TYPE_V256);
|
||||
t3 = tcg_temp_new_vec(TCG_TYPE_V256);
|
||||
t4 = tcg_temp_new_vec(TCG_TYPE_V256);
|
||||
tcg_gen_dup16i_vec(t4, 0);
|
||||
/* a1: A[0-7] ... D[0-7]; a2: W[0-7] ... Z[0-7]
|
||||
t1: extends of B[0-7], D[0-7]
|
||||
t2: extends of X[0-7], Z[0-7]
|
||||
t3: extends of A[0-7], C[0-7]
|
||||
t4: extends of W[0-7], Y[0-7]. */
|
||||
vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
|
||||
tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
|
||||
vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
|
||||
tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
|
||||
vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
|
||||
tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
|
||||
vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
|
||||
tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
|
||||
/* t1: BX DZ; t2: AW CY. */
|
||||
tcg_gen_mul_vec(MO_16, t1, t1, t2);
|
||||
tcg_gen_mul_vec(MO_16, t3, t3, t4);
|
||||
tcg_gen_shri_vec(MO_16, t1, t1, 8);
|
||||
tcg_gen_shri_vec(MO_16, t3, t3, 8);
|
||||
/* a0: AW BX CY DZ. */
|
||||
vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V256, MO_8,
|
||||
a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
|
||||
tcg_temp_free_vec(t1);
|
||||
tcg_temp_free_vec(t2);
|
||||
tcg_temp_free_vec(t3);
|
||||
tcg_temp_free_vec(t4);
|
||||
break;
|
||||
|
||||
default:
|
||||
g_assert_not_reached();
|
||||
}
|
||||
v2 = temp_tcgv_vec(arg_temp(a2));
|
||||
expand_vec_mul(type, vece, v0, v1, v2);
|
||||
break;
|
||||
|
||||
case INDEX_op_cmp_vec:
|
||||
{
|
||||
enum {
|
||||
NEED_SWAP = 1,
|
||||
NEED_INV = 2,
|
||||
NEED_BIAS = 4
|
||||
};
|
||||
static const uint8_t fixups[16] = {
|
||||
[0 ... 15] = -1,
|
||||
[TCG_COND_EQ] = 0,
|
||||
[TCG_COND_NE] = NEED_INV,
|
||||
[TCG_COND_GT] = 0,
|
||||
[TCG_COND_LT] = NEED_SWAP,
|
||||
[TCG_COND_LE] = NEED_INV,
|
||||
[TCG_COND_GE] = NEED_SWAP | NEED_INV,
|
||||
[TCG_COND_GTU] = NEED_BIAS,
|
||||
[TCG_COND_LTU] = NEED_BIAS | NEED_SWAP,
|
||||
[TCG_COND_LEU] = NEED_BIAS | NEED_INV,
|
||||
[TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV,
|
||||
};
|
||||
v2 = temp_tcgv_vec(arg_temp(a2));
|
||||
expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
|
||||
break;
|
||||
|
||||
TCGCond cond;
|
||||
uint8_t fixup;
|
||||
|
||||
a1 = va_arg(va, TCGArg);
|
||||
a2 = va_arg(va, TCGArg);
|
||||
cond = va_arg(va, TCGArg);
|
||||
fixup = fixups[cond & 15];
|
||||
tcg_debug_assert(fixup != 0xff);
|
||||
|
||||
if (fixup & NEED_INV) {
|
||||
cond = tcg_invert_cond(cond);
|
||||
}
|
||||
if (fixup & NEED_SWAP) {
|
||||
TCGArg t;
|
||||
t = a1, a1 = a2, a2 = t;
|
||||
cond = tcg_swap_cond(cond);
|
||||
}
|
||||
|
||||
t1 = t2 = NULL;
|
||||
if (fixup & NEED_BIAS) {
|
||||
t1 = tcg_temp_new_vec(type);
|
||||
t2 = tcg_temp_new_vec(type);
|
||||
tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
|
||||
tcg_gen_sub_vec(vece, t1, temp_tcgv_vec(arg_temp(a1)), t2);
|
||||
tcg_gen_sub_vec(vece, t2, temp_tcgv_vec(arg_temp(a2)), t2);
|
||||
a1 = tcgv_vec_arg(t1);
|
||||
a2 = tcgv_vec_arg(t2);
|
||||
cond = tcg_signed_cond(cond);
|
||||
}
|
||||
|
||||
tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
|
||||
vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);
|
||||
|
||||
if (fixup & NEED_BIAS) {
|
||||
tcg_temp_free_vec(t1);
|
||||
tcg_temp_free_vec(t2);
|
||||
}
|
||||
if (fixup & NEED_INV) {
|
||||
tcg_gen_not_vec(vece, v0, v0);
|
||||
}
|
||||
}
|
||||
case INDEX_op_smin_vec:
|
||||
v2 = temp_tcgv_vec(arg_temp(a2));
|
||||
expand_vec_minmax(type, vece, TCG_COND_GT, true, v0, v1, v2);
|
||||
break;
|
||||
case INDEX_op_smax_vec:
|
||||
v2 = temp_tcgv_vec(arg_temp(a2));
|
||||
expand_vec_minmax(type, vece, TCG_COND_GT, false, v0, v1, v2);
|
||||
break;
|
||||
case INDEX_op_umin_vec:
|
||||
v2 = temp_tcgv_vec(arg_temp(a2));
|
||||
expand_vec_minmax(type, vece, TCG_COND_GTU, true, v0, v1, v2);
|
||||
break;
|
||||
case INDEX_op_umax_vec:
|
||||
v2 = temp_tcgv_vec(arg_temp(a2));
|
||||
expand_vec_minmax(type, vece, TCG_COND_GTU, false, v0, v1, v2);
|
||||
break;
|
||||
|
||||
default:
|
||||
|
@ -1201,8 +1201,19 @@ static int tcg_out_call_iarg_reg2(TCGContext *s, int i, TCGReg al, TCGReg ah)
|
||||
return i;
|
||||
}
|
||||
|
||||
/* Perform the tlb comparison operation. The complete host address is
placed in BASE. Clobbers TMP0, TMP1, TMP2, A0. */
/* We expect tlb_mask to be before tlb_table. */
QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table) <
offsetof(CPUArchState, tlb_mask));

/* We expect tlb_mask to be "near" tlb_table. */
QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table) -
offsetof(CPUArchState, tlb_mask) >= 0x8000);

/*
* Perform the tlb comparison operation.
* The complete host address is placed in BASE.
* Clobbers TMP0, TMP1, TMP2, TMP3.
*/
static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
|
||||
TCGReg addrh, TCGMemOpIdx oi,
|
||||
tcg_insn_unit *label_ptr[2], bool is_load)
|
||||
@ -1210,52 +1221,73 @@ static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
|
||||
TCGMemOp opc = get_memop(oi);
|
||||
unsigned s_bits = opc & MO_SIZE;
|
||||
unsigned a_bits = get_alignment_bits(opc);
|
||||
target_ulong mask;
|
||||
int mem_index = get_mmuidx(oi);
|
||||
int cmp_off
|
||||
= (is_load
|
||||
? offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
|
||||
: offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
|
||||
int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
|
||||
int mask_off = offsetof(CPUArchState, tlb_mask[mem_index]);
|
||||
int table_off = offsetof(CPUArchState, tlb_table[mem_index]);
|
||||
int add_off = offsetof(CPUTLBEntry, addend);
|
||||
int cmp_off = (is_load ? offsetof(CPUTLBEntry, addr_read)
|
||||
: offsetof(CPUTLBEntry, addr_write));
|
||||
TCGReg mask_base = TCG_AREG0, table_base = TCG_AREG0;
|
||||
target_ulong mask;
|
||||
|
||||
tcg_out_opc_sa(s, ALIAS_TSRL, TCG_REG_A0, addrl,
|
||||
TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
|
||||
tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_A0, TCG_REG_A0,
|
||||
(CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
|
||||
tcg_out_opc_reg(s, ALIAS_PADD, TCG_REG_A0, TCG_REG_A0, TCG_AREG0);
|
||||
if (table_off > 0x7fff) {
|
||||
int mask_hi = mask_off - (int16_t)mask_off;
|
||||
int table_hi = table_off - (int16_t)table_off;
|
||||
|
||||
/* Compensate for very large offsets. */
|
||||
while (add_off >= 0x8000) {
|
||||
/* Most target env are smaller than 32k, but a few are larger than 64k,
|
||||
* so handle an arbitrarily large offset.
|
||||
*/
|
||||
tcg_out_opc_imm(s, ALIAS_PADDI, TCG_REG_A0, TCG_REG_A0, 0x7ff0);
|
||||
cmp_off -= 0x7ff0;
|
||||
add_off -= 0x7ff0;
|
||||
table_base = TCG_TMP1;
|
||||
if (likely(mask_hi == table_hi)) {
|
||||
mask_base = table_base;
|
||||
tcg_out_opc_imm(s, OPC_LUI, mask_base, TCG_REG_ZERO, mask_hi >> 16);
|
||||
tcg_out_opc_reg(s, ALIAS_PADD, mask_base, mask_base, TCG_AREG0);
|
||||
mask_off -= mask_hi;
|
||||
table_off -= mask_hi;
|
||||
} else {
|
||||
if (mask_hi != 0) {
|
||||
mask_base = TCG_TMP0;
|
||||
tcg_out_opc_imm(s, OPC_LUI,
|
||||
mask_base, TCG_REG_ZERO, mask_hi >> 16);
|
||||
tcg_out_opc_reg(s, ALIAS_PADD,
|
||||
mask_base, mask_base, TCG_AREG0);
|
||||
}
|
||||
table_off -= mask_off;
|
||||
mask_off -= mask_hi;
|
||||
tcg_out_opc_imm(s, ALIAS_PADDI, table_base, mask_base, mask_off);
|
||||
}
|
||||
}
|
||||
|
||||
/* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx]. */
|
||||
tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, mask_base, mask_off);
|
||||
tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP1, table_base, table_off);
|
||||
|
||||
/* Extract the TLB index from the address into TMP3. */
|
||||
tcg_out_opc_sa(s, ALIAS_TSRL, TCG_TMP3, addrl,
|
||||
TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
|
||||
tcg_out_opc_reg(s, OPC_AND, TCG_TMP3, TCG_TMP3, TCG_TMP0);
|
||||
|
||||
/* Add the tlb_table pointer, creating the CPUTLBEntry address in TMP3. */
|
||||
tcg_out_opc_reg(s, ALIAS_PADD, TCG_TMP3, TCG_TMP3, TCG_TMP1);
|
||||
|
||||
/* We don't currently support unaligned accesses.
|
||||
We could do so with mips32r6. */
|
||||
if (a_bits < s_bits) {
|
||||
a_bits = s_bits;
|
||||
}
|
||||
|
||||
/* Mask the page bits, keeping the alignment bits to compare against. */
|
||||
mask = (target_ulong)TARGET_PAGE_MASK | ((1 << a_bits) - 1);
|
||||
|
||||
/* Load the (low half) tlb comparator. Mask the page bits, keeping the
|
||||
alignment bits to compare against. */
|
||||
/* Load the (low-half) tlb comparator. */
|
||||
if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
|
||||
tcg_out_ld(s, TCG_TYPE_I32, TCG_TMP0, TCG_REG_A0, cmp_off + LO_OFF);
|
||||
tcg_out_ld(s, TCG_TYPE_I32, TCG_TMP0, TCG_TMP3, cmp_off + LO_OFF);
|
||||
tcg_out_movi(s, TCG_TYPE_I32, TCG_TMP1, mask);
|
||||
} else {
|
||||
tcg_out_ldst(s,
|
||||
(TARGET_LONG_BITS == 64 ? OPC_LD
|
||||
: TCG_TARGET_REG_BITS == 64 ? OPC_LWU : OPC_LW),
|
||||
TCG_TMP0, TCG_REG_A0, cmp_off);
|
||||
tcg_out_ldst(s, (TARGET_LONG_BITS == 64 ? OPC_LD
|
||||
: TCG_TARGET_REG_BITS == 64 ? OPC_LWU : OPC_LW),
|
||||
TCG_TMP0, TCG_TMP3, cmp_off);
|
||||
tcg_out_movi(s, TCG_TYPE_TL, TCG_TMP1, mask);
|
||||
/* No second compare is required here;
|
||||
load the tlb addend for the fast path. */
|
||||
tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP2, TCG_REG_A0, add_off);
|
||||
tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP2, TCG_TMP3, add_off);
|
||||
}
|
||||
tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, addrl);
|
||||
|
||||
@ -1271,10 +1303,10 @@ static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
|
||||
/* Load and test the high half tlb comparator. */
|
||||
if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
|
||||
/* delay slot */
|
||||
tcg_out_ld(s, TCG_TYPE_I32, TCG_TMP0, TCG_REG_A0, cmp_off + HI_OFF);
|
||||
tcg_out_ld(s, TCG_TYPE_I32, TCG_TMP0, TCG_TMP3, cmp_off + HI_OFF);
|
||||
|
||||
/* Load the tlb addend for the fast path. */
|
||||
tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP2, TCG_REG_A0, add_off);
|
||||
tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP2, TCG_TMP3, add_off);
|
||||
|
||||
label_ptr[1] = s->code_ptr;
|
||||
tcg_out_opc_br(s, OPC_BNE, addrh, TCG_TMP0);
|
||||
@ -1343,8 +1375,9 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
|
||||
}
|
||||
}
|
||||
|
||||
reloc_pc16(s->code_ptr, l->raddr);
|
||||
tcg_out_opc_br(s, OPC_BEQ, TCG_REG_ZERO, TCG_REG_ZERO);
|
||||
reloc_pc16(s->code_ptr - 1, l->raddr);
|
||||
|
||||
/* delay slot */
|
||||
if (TCG_TARGET_REG_BITS == 64 && l->type == TCG_TYPE_I32) {
|
||||
/* we always sign-extend 32-bit loads */
|
||||
|
@ -327,6 +327,7 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
|
||||
#define LHZ OPCD( 40)
|
||||
#define LHA OPCD( 42)
|
||||
#define LWZ OPCD( 32)
|
||||
#define LWZUX XO31( 55)
|
||||
#define STB OPCD( 38)
|
||||
#define STH OPCD( 44)
|
||||
#define STW OPCD( 36)
|
||||
@ -338,6 +339,7 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
|
||||
#define LD XO58( 0)
|
||||
#define LDX XO31( 21)
|
||||
#define LDU XO58( 1)
|
||||
#define LDUX XO31( 53)
|
||||
#define LWA XO58( 2)
|
||||
#define LWAX XO31(341)
|
||||
|
||||
@ -1503,6 +1505,10 @@ static void * const qemu_st_helpers[16] = {
|
||||
[MO_BEQ] = helper_be_stq_mmu,
|
||||
};
|
||||
|
||||
/* We expect tlb_mask to be before tlb_table. */
|
||||
QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table) <
|
||||
offsetof(CPUArchState, tlb_mask));
|
||||
|
||||
/* Perform the TLB load and compare. Places the result of the comparison
|
||||
in CR7, loads the addend of the TLB into R3, and returns the register
|
||||
containing the guest address (zero-extended into R4). Clobbers R0 and R2. */
|
||||
@ -1513,61 +1519,63 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGMemOp opc,
|
||||
{
|
||||
int cmp_off
|
||||
= (is_read
|
||||
? offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
|
||||
: offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
|
||||
int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
|
||||
TCGReg base = TCG_AREG0;
|
||||
? offsetof(CPUTLBEntry, addr_read)
|
||||
: offsetof(CPUTLBEntry, addr_write));
|
||||
int mask_off = offsetof(CPUArchState, tlb_mask[mem_index]);
|
||||
int table_off = offsetof(CPUArchState, tlb_table[mem_index]);
|
||||
TCGReg mask_base = TCG_AREG0, table_base = TCG_AREG0;
|
||||
unsigned s_bits = opc & MO_SIZE;
|
||||
unsigned a_bits = get_alignment_bits(opc);
|
||||
|
||||
/* Extract the page index, shifted into place for tlb index. */
|
||||
if (TCG_TARGET_REG_BITS == 64) {
|
||||
if (TARGET_LONG_BITS == 32) {
|
||||
/* Zero-extend the address into a place helpful for further use. */
|
||||
tcg_out_ext32u(s, TCG_REG_R4, addrlo);
|
||||
addrlo = TCG_REG_R4;
|
||||
} else {
|
||||
tcg_out_rld(s, RLDICL, TCG_REG_R3, addrlo,
|
||||
64 - TARGET_PAGE_BITS, 64 - CPU_TLB_BITS);
|
||||
if (table_off > 0x7fff) {
|
||||
int mask_hi = mask_off - (int16_t)mask_off;
|
||||
int table_hi = table_off - (int16_t)table_off;
|
||||
|
||||
table_base = TCG_REG_R4;
|
||||
if (mask_hi == table_hi) {
|
||||
mask_base = table_base;
|
||||
} else if (mask_hi) {
|
||||
mask_base = TCG_REG_R3;
|
||||
tcg_out32(s, ADDIS | TAI(mask_base, TCG_AREG0, mask_hi >> 16));
|
||||
}
|
||||
tcg_out32(s, ADDIS | TAI(table_base, TCG_AREG0, table_hi >> 16));
|
||||
mask_off -= mask_hi;
|
||||
table_off -= table_hi;
|
||||
}
|
||||
|
||||
/* Compensate for very large offsets. */
|
||||
if (add_off >= 0x8000) {
|
||||
int low = (int16_t)cmp_off;
|
||||
int high = cmp_off - low;
|
||||
assert((high & 0xffff) == 0);
|
||||
assert(cmp_off - high == (int16_t)(cmp_off - high));
|
||||
assert(add_off - high == (int16_t)(add_off - high));
|
||||
tcg_out32(s, ADDIS | TAI(TCG_REG_TMP1, base, high >> 16));
|
||||
base = TCG_REG_TMP1;
|
||||
cmp_off -= high;
|
||||
add_off -= high;
|
||||
}
|
||||
/* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx]. */
|
||||
tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R3, mask_base, mask_off);
|
||||
tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R4, table_base, table_off);
|
||||
|
||||
/* Extraction and shifting, part 2. */
|
||||
if (TCG_TARGET_REG_BITS == 32 || TARGET_LONG_BITS == 32) {
|
||||
tcg_out_rlw(s, RLWINM, TCG_REG_R3, addrlo,
|
||||
32 - (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS),
|
||||
32 - (CPU_TLB_BITS + CPU_TLB_ENTRY_BITS),
|
||||
31 - CPU_TLB_ENTRY_BITS);
|
||||
/* Extract the page index, shifted into place for tlb index. */
|
||||
if (TCG_TARGET_REG_BITS == 32) {
|
||||
tcg_out_shri32(s, TCG_REG_TMP1, addrlo,
|
||||
TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
|
||||
} else {
|
||||
tcg_out_shli64(s, TCG_REG_R3, TCG_REG_R3, CPU_TLB_ENTRY_BITS);
|
||||
tcg_out_shri64(s, TCG_REG_TMP1, addrlo,
|
||||
TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
|
||||
}
|
||||
tcg_out32(s, AND | SAB(TCG_REG_R3, TCG_REG_R3, TCG_REG_TMP1));
|
||||
|
||||
tcg_out32(s, ADD | TAB(TCG_REG_R3, TCG_REG_R3, base));
|
||||
|
||||
/* Load the tlb comparator. */
|
||||
if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
|
||||
tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R4, TCG_REG_R3, cmp_off);
|
||||
tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP1, TCG_REG_R3, cmp_off + 4);
|
||||
/* Load the TLB comparator. */
|
||||
if (cmp_off == 0 && TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) {
|
||||
uint32_t lxu = (TCG_TARGET_REG_BITS == 32 || TARGET_LONG_BITS == 32
|
||||
? LWZUX : LDUX);
|
||||
tcg_out32(s, lxu | TAB(TCG_REG_TMP1, TCG_REG_R3, TCG_REG_R4));
|
||||
} else {
|
||||
tcg_out_ld(s, TCG_TYPE_TL, TCG_REG_TMP1, TCG_REG_R3, cmp_off);
|
||||
tcg_out32(s, ADD | TAB(TCG_REG_R3, TCG_REG_R3, TCG_REG_R4));
|
||||
if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
|
||||
tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP1, TCG_REG_R3, cmp_off + 4);
|
||||
tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R4, TCG_REG_R3, cmp_off);
|
||||
} else {
|
||||
tcg_out_ld(s, TCG_TYPE_TL, TCG_REG_TMP1, TCG_REG_R3, cmp_off);
|
||||
}
|
||||
}
|
||||
|
||||
/* Load the TLB addend for use on the fast path. Do this asap
|
||||
to minimize any load use delay. */
|
||||
tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R3, TCG_REG_R3, add_off);
|
||||
tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R3, TCG_REG_R3,
|
||||
offsetof(CPUTLBEntry, addend));
|
||||
|
||||
/* Clear the non-page, non-alignment bits from the address */
|
||||
if (TCG_TARGET_REG_BITS == 32) {
|
||||
@ -1600,6 +1608,9 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGMemOp opc,
|
||||
if (TARGET_LONG_BITS == 32) {
|
||||
tcg_out_rlw(s, RLWINM, TCG_REG_R0, t, 0,
|
||||
(32 - a_bits) & 31, 31 - TARGET_PAGE_BITS);
|
||||
/* Zero-extend the address for use in the final address. */
|
||||
tcg_out_ext32u(s, TCG_REG_R4, addrlo);
|
||||
addrlo = TCG_REG_R4;
|
||||
} else if (a_bits == 0) {
|
||||
tcg_out_rld(s, RLDICR, TCG_REG_R0, t, 0, 63 - TARGET_PAGE_BITS);
|
||||
} else {
|
||||
|
@ -958,6 +958,17 @@ static void * const qemu_st_helpers[16] = {
|
||||
[MO_BEQ] = helper_be_stq_mmu,
|
||||
};
|
||||
|
||||
/* We don't support oversize guests */
QEMU_BUILD_BUG_ON(TCG_TARGET_REG_BITS < TARGET_LONG_BITS);

/* We expect tlb_mask to be before tlb_table. */
QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table) <
offsetof(CPUArchState, tlb_mask));

/* We expect tlb_mask to be "near" tlb_table. */
QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table) -
offsetof(CPUArchState, tlb_mask) >= 0x800);

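/*
 * Illustrative sketch, not part of this patch: what the fast path emitted
 * below computes, written as plain C.  With dynamic TLB sizing,
 * tlb_mask[mmu_idx] is (n_entries - 1) << CPU_TLB_ENTRY_BITS, so masking
 * the shifted address yields a byte offset directly into
 * tlb_table[mmu_idx].  The helper name is made up.
 */
static inline CPUTLBEntry *model_tlb_entry(CPUArchState *env, int mmu_idx,
                                           target_ulong addr)
{
    uintptr_t ofs = (addr >> (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS))
                    & env->tlb_mask[mmu_idx];

    return (CPUTLBEntry *)((uintptr_t)env->tlb_table[mmu_idx] + ofs);
}
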
static void tcg_out_tlb_load(TCGContext *s, TCGReg addrl,
|
||||
TCGReg addrh, TCGMemOpIdx oi,
|
||||
tcg_insn_unit **label_ptr, bool is_load)
|
||||
@ -965,94 +976,67 @@ static void tcg_out_tlb_load(TCGContext *s, TCGReg addrl,
|
||||
TCGMemOp opc = get_memop(oi);
|
||||
unsigned s_bits = opc & MO_SIZE;
|
||||
unsigned a_bits = get_alignment_bits(opc);
|
||||
target_ulong mask;
|
||||
tcg_target_long compare_mask;
|
||||
int mem_index = get_mmuidx(oi);
|
||||
int cmp_off
|
||||
= (is_load
|
||||
? offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
|
||||
: offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
|
||||
int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
|
||||
RISCVInsn load_cmp_op = (TARGET_LONG_BITS == 64 ? OPC_LD :
|
||||
TCG_TARGET_REG_BITS == 64 ? OPC_LWU : OPC_LW);
|
||||
RISCVInsn load_add_op = TCG_TARGET_REG_BITS == 64 ? OPC_LD : OPC_LW;
|
||||
TCGReg base = TCG_AREG0;
|
||||
int mask_off, table_off;
|
||||
TCGReg mask_base = TCG_AREG0, table_base = TCG_AREG0;
|
||||
|
||||
/* We don't support oversize guests */
|
||||
if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
|
||||
g_assert_not_reached();
|
||||
mask_off = offsetof(CPUArchState, tlb_mask[mem_index]);
|
||||
table_off = offsetof(CPUArchState, tlb_table[mem_index]);
|
||||
if (table_off > 0x7ff) {
|
||||
int mask_hi = mask_off - sextreg(mask_off, 0, 12);
|
||||
int table_hi = table_off - sextreg(table_off, 0, 12);
|
||||
|
||||
if (likely(mask_hi == table_hi)) {
|
||||
mask_base = table_base = TCG_REG_TMP1;
|
||||
tcg_out_opc_upper(s, OPC_LUI, mask_base, mask_hi);
|
||||
tcg_out_opc_reg(s, OPC_ADD, mask_base, mask_base, TCG_AREG0);
|
||||
mask_off -= mask_hi;
|
||||
table_off -= mask_hi;
|
||||
} else {
|
||||
mask_base = TCG_REG_TMP0;
|
||||
table_base = TCG_REG_TMP1;
|
||||
tcg_out_opc_upper(s, OPC_LUI, mask_base, mask_hi);
|
||||
tcg_out_opc_reg(s, OPC_ADD, mask_base, mask_base, TCG_AREG0);
|
||||
table_off -= mask_off;
|
||||
mask_off -= mask_hi;
|
||||
tcg_out_opc_imm(s, OPC_ADDI, table_base, mask_base, mask_off);
|
||||
}
|
||||
}
|
||||
|
||||
tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, mask_base, mask_off);
|
||||
tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, table_base, table_off);
|
||||
|
||||
tcg_out_opc_imm(s, OPC_SRLI, TCG_REG_TMP2, addrl,
|
||||
TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
|
||||
tcg_out_opc_reg(s, OPC_AND, TCG_REG_TMP2, TCG_REG_TMP2, TCG_REG_TMP0);
|
||||
tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP2, TCG_REG_TMP2, TCG_REG_TMP1);
|
||||
|
||||
/* Load the tlb comparator and the addend. */
|
||||
tcg_out_ld(s, TCG_TYPE_TL, TCG_REG_TMP0, TCG_REG_TMP2,
|
||||
is_load ? offsetof(CPUTLBEntry, addr_read)
|
||||
: offsetof(CPUTLBEntry, addr_write));
|
||||
tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP2, TCG_REG_TMP2,
|
||||
offsetof(CPUTLBEntry, addend));
|
||||
|
||||
/* We don't support unaligned accesses. */
|
||||
if (a_bits < s_bits) {
|
||||
a_bits = s_bits;
|
||||
}
|
||||
mask = (target_ulong)TARGET_PAGE_MASK | ((1 << a_bits) - 1);
|
||||
|
||||
|
||||
/* Compensate for very large offsets. */
|
||||
if (add_off >= 0x1000) {
|
||||
int adj;
|
||||
base = TCG_REG_TMP2;
|
||||
if (cmp_off <= 2 * 0xfff) {
|
||||
adj = 0xfff;
|
||||
tcg_out_opc_imm(s, OPC_ADDI, base, TCG_AREG0, adj);
|
||||
} else {
|
||||
adj = cmp_off - sextreg(cmp_off, 0, 12);
|
||||
tcg_debug_assert(add_off - adj >= -0x1000
|
||||
&& add_off - adj < 0x1000);
|
||||
|
||||
tcg_out_opc_upper(s, OPC_LUI, base, adj);
|
||||
tcg_out_opc_reg(s, OPC_ADD, base, base, TCG_AREG0);
|
||||
}
|
||||
add_off -= adj;
|
||||
cmp_off -= adj;
|
||||
}
|
||||
|
||||
/* Extract the page index. */
|
||||
if (CPU_TLB_BITS + CPU_TLB_ENTRY_BITS < 12) {
|
||||
tcg_out_opc_imm(s, OPC_SRLI, TCG_REG_TMP0, addrl,
|
||||
TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
|
||||
tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_TMP0, TCG_REG_TMP0,
|
||||
MAKE_64BIT_MASK(CPU_TLB_ENTRY_BITS, CPU_TLB_BITS));
|
||||
} else if (TARGET_PAGE_BITS >= 12) {
|
||||
tcg_out_opc_upper(s, OPC_LUI, TCG_REG_TMP0,
|
||||
MAKE_64BIT_MASK(TARGET_PAGE_BITS, CPU_TLB_BITS));
|
||||
tcg_out_opc_reg(s, OPC_AND, TCG_REG_TMP0, TCG_REG_TMP0, addrl);
|
||||
tcg_out_opc_imm(s, OPC_SRLI, TCG_REG_TMP0, TCG_REG_TMP0,
|
||||
CPU_TLB_BITS - CPU_TLB_ENTRY_BITS);
|
||||
} else {
|
||||
tcg_out_opc_imm(s, OPC_SRLI, TCG_REG_TMP0, addrl, TARGET_PAGE_BITS);
|
||||
tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_TMP0, TCG_REG_TMP0,
|
||||
MAKE_64BIT_MASK(0, CPU_TLB_BITS));
|
||||
tcg_out_opc_imm(s, OPC_SLLI, TCG_REG_TMP0, TCG_REG_TMP0,
|
||||
CPU_TLB_ENTRY_BITS);
|
||||
}
|
||||
|
||||
/* Add that to the base address to index the tlb. */
|
||||
tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP2, base, TCG_REG_TMP0);
|
||||
base = TCG_REG_TMP2;
|
||||
|
||||
/* Load the tlb comparator and the addend. */
|
||||
tcg_out_ldst(s, load_cmp_op, TCG_REG_TMP0, base, cmp_off);
|
||||
tcg_out_ldst(s, load_add_op, TCG_REG_TMP2, base, add_off);
|
||||
|
||||
/* Clear the non-page, non-alignment bits from the address. */
|
||||
if (mask == sextreg(mask, 0, 12)) {
|
||||
tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_TMP1, addrl, mask);
|
||||
compare_mask = (tcg_target_long)TARGET_PAGE_MASK | ((1 << a_bits) - 1);
|
||||
if (compare_mask == sextreg(compare_mask, 0, 12)) {
|
||||
tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_TMP1, addrl, compare_mask);
|
||||
} else {
|
||||
tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_TMP1, mask);
|
||||
tcg_out_movi(s, TCG_TYPE_TL, TCG_REG_TMP1, compare_mask);
|
||||
tcg_out_opc_reg(s, OPC_AND, TCG_REG_TMP1, TCG_REG_TMP1, addrl);
|
||||
}
|
||||
}
|
||||
|
||||
/* Compare masked address with the TLB entry. */
|
||||
label_ptr[0] = s->code_ptr;
|
||||
tcg_out_opc_branch(s, OPC_BNE, TCG_REG_TMP0, TCG_REG_TMP1, 0);
|
||||
/* NOP to allow patching later */
|
||||
tcg_out_opc_imm(s, OPC_ADDI, TCG_REG_ZERO, TCG_REG_ZERO, 0);
|
||||
/* TODO: Move this out of line
|
||||
* see:
|
||||
* https://lists.nongnu.org/archive/html/qemu-devel/2018-11/msg02234.html
|
||||
*/
|
||||
|
||||
/* TLB Hit - translate address using addend. */
|
||||
if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
|
||||
|
@ -1537,10 +1537,10 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGMemOp opc, TCGReg data,
|
||||
#if defined(CONFIG_SOFTMMU)
|
||||
#include "tcg-ldst.inc.c"
|
||||
|
||||
/* We're expecting to use a 20-bit signed offset on the tlb memory ops.
|
||||
Using the offset of the second entry in the last tlb table ensures
|
||||
that we can index all of the elements of the first entry. */
|
||||
QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table[NB_MMU_MODES - 1][1])
|
||||
/* We're expecting to use a 20-bit signed offset on the tlb memory ops. */
|
||||
QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_mask[NB_MMU_MODES - 1])
|
||||
> 0x7ffff);
|
||||
QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table[NB_MMU_MODES - 1])
|
||||
> 0x7ffff);
|
||||
|
||||
/* Load and compare a TLB entry, leaving the flags set. Loads the TLB
|
||||
@ -1552,48 +1552,41 @@ static TCGReg tcg_out_tlb_read(TCGContext* s, TCGReg addr_reg, TCGMemOp opc,
|
||||
unsigned a_bits = get_alignment_bits(opc);
|
||||
unsigned s_mask = (1 << s_bits) - 1;
|
||||
unsigned a_mask = (1 << a_bits) - 1;
|
||||
int mask_off = offsetof(CPUArchState, tlb_mask[mem_index]);
|
||||
int table_off = offsetof(CPUArchState, tlb_table[mem_index]);
|
||||
int ofs, a_off;
|
||||
uint64_t tlb_mask;
|
||||
|
||||
tcg_out_sh64(s, RSY_SRLG, TCG_REG_R2, addr_reg, TCG_REG_NONE,
|
||||
TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
|
||||
tcg_out_insn(s, RXY, NG, TCG_REG_R2, TCG_AREG0, TCG_REG_NONE, mask_off);
|
||||
tcg_out_insn(s, RXY, AG, TCG_REG_R2, TCG_AREG0, TCG_REG_NONE, table_off);
|
||||
|
||||
/* For aligned accesses, we check the first byte and include the alignment
|
||||
bits within the address. For unaligned access, we check that we don't
|
||||
cross pages using the address of the last byte of the access. */
|
||||
a_off = (a_bits >= s_bits ? 0 : s_mask - a_mask);
|
||||
tlb_mask = (uint64_t)TARGET_PAGE_MASK | a_mask;
|
||||
|
||||
if (s390_facilities & FACILITY_GEN_INST_EXT) {
|
||||
tcg_out_risbg(s, TCG_REG_R2, addr_reg,
|
||||
64 - CPU_TLB_BITS - CPU_TLB_ENTRY_BITS,
|
||||
63 - CPU_TLB_ENTRY_BITS,
|
||||
64 + CPU_TLB_ENTRY_BITS - TARGET_PAGE_BITS, 1);
|
||||
if (a_off) {
|
||||
tcg_out_insn(s, RX, LA, TCG_REG_R3, addr_reg, TCG_REG_NONE, a_off);
|
||||
tgen_andi(s, TCG_TYPE_TL, TCG_REG_R3, tlb_mask);
|
||||
} else {
|
||||
tgen_andi_risbg(s, TCG_REG_R3, addr_reg, tlb_mask);
|
||||
}
|
||||
if ((s390_facilities & FACILITY_GEN_INST_EXT) && a_off == 0) {
|
||||
tgen_andi_risbg(s, TCG_REG_R3, addr_reg, tlb_mask);
|
||||
} else {
|
||||
tcg_out_sh64(s, RSY_SRLG, TCG_REG_R2, addr_reg, TCG_REG_NONE,
|
||||
TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
|
||||
tcg_out_insn(s, RX, LA, TCG_REG_R3, addr_reg, TCG_REG_NONE, a_off);
|
||||
tgen_andi(s, TCG_TYPE_I64, TCG_REG_R2,
|
||||
(CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
|
||||
tgen_andi(s, TCG_TYPE_TL, TCG_REG_R3, tlb_mask);
|
||||
}
|
||||
|
||||
if (is_ld) {
|
||||
ofs = offsetof(CPUArchState, tlb_table[mem_index][0].addr_read);
|
||||
ofs = offsetof(CPUTLBEntry, addr_read);
|
||||
} else {
|
||||
ofs = offsetof(CPUArchState, tlb_table[mem_index][0].addr_write);
|
||||
ofs = offsetof(CPUTLBEntry, addr_write);
|
||||
}
|
||||
if (TARGET_LONG_BITS == 32) {
|
||||
tcg_out_mem(s, RX_C, RXY_CY, TCG_REG_R3, TCG_REG_R2, TCG_AREG0, ofs);
|
||||
tcg_out_insn(s, RX, C, TCG_REG_R3, TCG_REG_R2, TCG_REG_NONE, ofs);
|
||||
} else {
|
||||
tcg_out_mem(s, 0, RXY_CG, TCG_REG_R3, TCG_REG_R2, TCG_AREG0, ofs);
|
||||
tcg_out_insn(s, RXY, CG, TCG_REG_R3, TCG_REG_R2, TCG_REG_NONE, ofs);
|
||||
}
|
||||
|
||||
ofs = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
|
||||
tcg_out_mem(s, 0, RXY_LG, TCG_REG_R2, TCG_REG_R2, TCG_AREG0, ofs);
|
||||
tcg_out_insn(s, RXY, LG, TCG_REG_R2, TCG_REG_R2, TCG_REG_NONE,
|
||||
offsetof(CPUTLBEntry, addend));
|
||||
|
||||
if (TARGET_LONG_BITS == 32) {
|
||||
tgen_ext32u(s, TCG_REG_R3, addr_reg);
|
||||
|
@ -1074,54 +1074,72 @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
|
||||
The result of the TLB comparison is in %[ix]cc. The sanitized address
|
||||
is in the returned register, maybe %o0. The TLB addend is in %o1. */
|
||||
|
||||
/* We expect tlb_mask to be before tlb_table. */
|
||||
QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table) <
|
||||
offsetof(CPUArchState, tlb_mask));
|
||||
|
||||
/* We expect tlb_mask to be "near" tlb_table. */
|
||||
QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table) -
|
||||
offsetof(CPUArchState, tlb_mask) >= (1 << 13));
|
||||
|
||||
static TCGReg tcg_out_tlb_load(TCGContext *s, TCGReg addr, int mem_index,
|
||||
TCGMemOp opc, int which)
|
||||
{
|
||||
int mask_off = offsetof(CPUArchState, tlb_mask[mem_index]);
|
||||
int table_off = offsetof(CPUArchState, tlb_table[mem_index]);
|
||||
TCGReg base = TCG_AREG0;
|
||||
const TCGReg r0 = TCG_REG_O0;
|
||||
const TCGReg r1 = TCG_REG_O1;
|
||||
const TCGReg r2 = TCG_REG_O2;
|
||||
unsigned s_bits = opc & MO_SIZE;
|
||||
unsigned a_bits = get_alignment_bits(opc);
|
||||
int tlb_ofs;
|
||||
tcg_target_long compare_mask;
|
||||
|
||||
/* Shift the page number down. */
|
||||
tcg_out_arithi(s, r1, addr, TARGET_PAGE_BITS, SHIFT_SRL);
|
||||
if (!check_fit_i32(table_off, 13)) {
|
||||
int table_hi;
|
||||
|
||||
base = r1;
|
||||
if (table_off <= 2 * 0xfff) {
|
||||
table_hi = 0xfff;
|
||||
tcg_out_arithi(s, base, TCG_AREG0, table_hi, ARITH_ADD);
|
||||
} else {
|
||||
table_hi = table_off & ~0x3ff;
|
||||
tcg_out_sethi(s, base, table_hi);
|
||||
tcg_out_arith(s, base, TCG_AREG0, base, ARITH_ADD);
|
||||
}
|
||||
mask_off -= table_hi;
|
||||
table_off -= table_hi;
|
||||
tcg_debug_assert(check_fit_i32(mask_off, 13));
|
||||
}
|
||||
|
||||
/* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx]. */
|
||||
tcg_out_ld(s, TCG_TYPE_PTR, r0, base, mask_off);
|
||||
tcg_out_ld(s, TCG_TYPE_PTR, r1, base, table_off);
|
||||
|
||||
/* Extract the page index, shifted into place for tlb index. */
|
||||
tcg_out_arithi(s, r2, addr, TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS,
|
||||
SHIFT_SRL);
|
||||
tcg_out_arith(s, r2, r2, r0, ARITH_AND);
|
||||
|
||||
/* Add the tlb_table pointer, creating the CPUTLBEntry address into R2. */
|
||||
tcg_out_arith(s, r2, r2, r1, ARITH_ADD);
|
||||
|
||||
/* Load the tlb comparator and the addend. */
|
||||
tcg_out_ld(s, TCG_TYPE_TL, r0, r2, which);
|
||||
tcg_out_ld(s, TCG_TYPE_PTR, r1, r2, offsetof(CPUTLBEntry, addend));
|
||||
|
||||
/* Mask out the page offset, except for the required alignment.
|
||||
We don't support unaligned accesses. */
|
||||
if (a_bits < s_bits) {
|
||||
a_bits = s_bits;
|
||||
}
|
||||
tcg_out_movi(s, TCG_TYPE_TL, TCG_REG_T1,
|
||||
TARGET_PAGE_MASK | ((1 << a_bits) - 1));
|
||||
|
||||
/* Mask the tlb index. */
|
||||
tcg_out_arithi(s, r1, r1, CPU_TLB_SIZE - 1, ARITH_AND);
|
||||
|
||||
/* Mask page, part 2. */
|
||||
tcg_out_arith(s, r0, addr, TCG_REG_T1, ARITH_AND);
|
||||
|
||||
/* Shift the tlb index into place. */
|
||||
tcg_out_arithi(s, r1, r1, CPU_TLB_ENTRY_BITS, SHIFT_SLL);
|
||||
|
||||
/* Relative to the current ENV. */
|
||||
tcg_out_arith(s, r1, TCG_AREG0, r1, ARITH_ADD);
|
||||
|
||||
/* Find a base address that can load both tlb comparator and addend. */
|
||||
tlb_ofs = offsetof(CPUArchState, tlb_table[mem_index][0]);
|
||||
if (!check_fit_ptr(tlb_ofs + sizeof(CPUTLBEntry), 13)) {
|
||||
if (tlb_ofs & ~0x3ff) {
|
||||
tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, tlb_ofs & ~0x3ff);
|
||||
tcg_out_arith(s, r1, r1, TCG_REG_T1, ARITH_ADD);
|
||||
}
|
||||
tlb_ofs &= 0x3ff;
|
||||
compare_mask = (tcg_target_ulong)TARGET_PAGE_MASK | ((1 << a_bits) - 1);
|
||||
if (check_fit_tl(compare_mask, 13)) {
|
||||
tcg_out_arithi(s, r2, addr, compare_mask, ARITH_AND);
|
||||
} else {
|
||||
tcg_out_movi(s, TCG_TYPE_TL, r2, compare_mask);
|
||||
tcg_out_arith(s, r2, addr, r2, ARITH_AND);
|
||||
}
|
||||
|
||||
/* Load the tlb comparator and the addend. */
|
||||
tcg_out_ld(s, TCG_TYPE_TL, r2, r1, tlb_ofs + which);
|
||||
tcg_out_ld(s, TCG_TYPE_PTR, r1, r1, tlb_ofs+offsetof(CPUTLBEntry, addend));
|
||||
|
||||
/* subcc arg0, arg2, %g0 */
|
||||
tcg_out_cmp(s, r0, r2, 0);
|
||||
|
||||
/* If the guest address must be zero-extended, do so now. */
|
||||
|
@ -665,7 +665,7 @@ static void expand_3_i32(uint32_t dofs, uint32_t aofs,
|
||||
|
||||
/* Expand OPSZ bytes worth of three-operand operations using i32 elements. */
|
||||
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
|
||||
uint32_t cofs, uint32_t oprsz,
|
||||
uint32_t cofs, uint32_t oprsz, bool write_aofs,
|
||||
void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
|
||||
{
|
||||
TCGv_i32 t0 = tcg_temp_new_i32();
|
||||
@ -680,6 +680,9 @@ static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
|
||||
tcg_gen_ld_i32(t3, cpu_env, cofs + i);
|
||||
fni(t0, t1, t2, t3);
|
||||
tcg_gen_st_i32(t0, cpu_env, dofs + i);
|
||||
if (write_aofs) {
|
||||
tcg_gen_st_i32(t1, cpu_env, aofs + i);
|
||||
}
|
||||
}
|
||||
tcg_temp_free_i32(t3);
|
||||
tcg_temp_free_i32(t2);
|
||||
@ -769,7 +772,7 @@ static void expand_3_i64(uint32_t dofs, uint32_t aofs,
|
||||
|
||||
/* Expand OPSZ bytes worth of three-operand operations using i64 elements. */
|
||||
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
|
||||
uint32_t cofs, uint32_t oprsz,
|
||||
uint32_t cofs, uint32_t oprsz, bool write_aofs,
|
||||
void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
|
||||
{
|
||||
TCGv_i64 t0 = tcg_temp_new_i64();
|
||||
@ -784,6 +787,9 @@ static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
|
||||
tcg_gen_ld_i64(t3, cpu_env, cofs + i);
|
||||
fni(t0, t1, t2, t3);
|
||||
tcg_gen_st_i64(t0, cpu_env, dofs + i);
|
||||
if (write_aofs) {
|
||||
tcg_gen_st_i64(t1, cpu_env, aofs + i);
|
||||
}
|
||||
}
|
||||
tcg_temp_free_i64(t3);
|
||||
tcg_temp_free_i64(t2);
|
||||
@ -880,7 +886,7 @@ static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
|
||||
/* Expand OPSZ bytes worth of four-operand operations using host vectors. */
|
||||
static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
|
||||
uint32_t bofs, uint32_t cofs, uint32_t oprsz,
|
||||
uint32_t tysz, TCGType type,
|
||||
uint32_t tysz, TCGType type, bool write_aofs,
|
||||
void (*fni)(unsigned, TCGv_vec, TCGv_vec,
|
||||
TCGv_vec, TCGv_vec))
|
||||
{
|
||||
@ -896,6 +902,9 @@ static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
|
||||
tcg_gen_ld_vec(t3, cpu_env, cofs + i);
|
||||
fni(vece, t0, t1, t2, t3);
|
||||
tcg_gen_st_vec(t0, cpu_env, dofs + i);
|
||||
if (write_aofs) {
|
||||
tcg_gen_st_vec(t1, cpu_env, aofs + i);
|
||||
}
|
||||
}
|
||||
tcg_temp_free_vec(t3);
|
||||
tcg_temp_free_vec(t2);
|
||||
@ -1187,7 +1196,7 @@ void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
|
||||
*/
|
||||
some = QEMU_ALIGN_DOWN(oprsz, 32);
|
||||
expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
|
||||
32, TCG_TYPE_V256, g->fniv);
|
||||
32, TCG_TYPE_V256, g->write_aofs, g->fniv);
|
||||
if (some == oprsz) {
|
||||
break;
|
||||
}
|
||||
@ -1200,18 +1209,20 @@ void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
|
||||
/* fallthru */
|
||||
case TCG_TYPE_V128:
|
||||
expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
|
||||
16, TCG_TYPE_V128, g->fniv);
|
||||
16, TCG_TYPE_V128, g->write_aofs, g->fniv);
|
||||
break;
|
||||
case TCG_TYPE_V64:
|
||||
expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
|
||||
8, TCG_TYPE_V64, g->fniv);
|
||||
8, TCG_TYPE_V64, g->write_aofs, g->fniv);
|
||||
break;
|
||||
|
||||
case 0:
|
||||
if (g->fni8 && check_size_impl(oprsz, 8)) {
|
||||
expand_4_i64(dofs, aofs, bofs, cofs, oprsz, g->fni8);
|
||||
expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
|
||||
g->write_aofs, g->fni8);
|
||||
} else if (g->fni4 && check_size_impl(oprsz, 4)) {
|
||||
expand_4_i32(dofs, aofs, bofs, cofs, oprsz, g->fni4);
|
||||
expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
|
||||
g->write_aofs, g->fni4);
|
||||
} else {
|
||||
assert(g->fno != NULL);
|
||||
tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
|
||||
@ -1667,10 +1678,22 @@ void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
|
||||
uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
|
||||
{
|
||||
static const GVecGen3 g[4] = {
|
||||
{ .fno = gen_helper_gvec_ssadd8, .vece = MO_8 },
|
||||
{ .fno = gen_helper_gvec_ssadd16, .vece = MO_16 },
|
||||
{ .fno = gen_helper_gvec_ssadd32, .vece = MO_32 },
|
||||
{ .fno = gen_helper_gvec_ssadd64, .vece = MO_64 }
|
||||
{ .fniv = tcg_gen_ssadd_vec,
|
||||
.fno = gen_helper_gvec_ssadd8,
|
||||
.opc = INDEX_op_ssadd_vec,
|
||||
.vece = MO_8 },
|
||||
{ .fniv = tcg_gen_ssadd_vec,
|
||||
.fno = gen_helper_gvec_ssadd16,
|
||||
.opc = INDEX_op_ssadd_vec,
|
||||
.vece = MO_16 },
|
||||
{ .fniv = tcg_gen_ssadd_vec,
|
||||
.fno = gen_helper_gvec_ssadd32,
|
||||
.opc = INDEX_op_ssadd_vec,
|
||||
.vece = MO_32 },
|
||||
{ .fniv = tcg_gen_ssadd_vec,
|
||||
.fno = gen_helper_gvec_ssadd64,
|
||||
.opc = INDEX_op_ssadd_vec,
|
||||
.vece = MO_64 },
|
||||
};
|
||||
tcg_debug_assert(vece <= MO_64);
|
||||
tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
|
||||
@ -1680,16 +1703,28 @@ void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
|
||||
uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
|
||||
{
|
||||
static const GVecGen3 g[4] = {
|
||||
{ .fno = gen_helper_gvec_sssub8, .vece = MO_8 },
|
||||
{ .fno = gen_helper_gvec_sssub16, .vece = MO_16 },
|
||||
{ .fno = gen_helper_gvec_sssub32, .vece = MO_32 },
|
||||
{ .fno = gen_helper_gvec_sssub64, .vece = MO_64 }
|
||||
{ .fniv = tcg_gen_sssub_vec,
|
||||
.fno = gen_helper_gvec_sssub8,
|
||||
.opc = INDEX_op_sssub_vec,
|
||||
.vece = MO_8 },
|
||||
{ .fniv = tcg_gen_sssub_vec,
|
||||
.fno = gen_helper_gvec_sssub16,
|
||||
.opc = INDEX_op_sssub_vec,
|
||||
.vece = MO_16 },
|
||||
{ .fniv = tcg_gen_sssub_vec,
|
||||
.fno = gen_helper_gvec_sssub32,
|
||||
.opc = INDEX_op_sssub_vec,
|
||||
.vece = MO_32 },
|
||||
{ .fniv = tcg_gen_sssub_vec,
|
||||
.fno = gen_helper_gvec_sssub64,
|
||||
.opc = INDEX_op_sssub_vec,
|
||||
.vece = MO_64 },
|
||||
};
|
||||
tcg_debug_assert(vece <= MO_64);
|
||||
tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
|
||||
}
|
||||
|
||||
static void tcg_gen_vec_usadd32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
|
||||
static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
|
||||
{
|
||||
TCGv_i32 max = tcg_const_i32(-1);
|
||||
tcg_gen_add_i32(d, a, b);
|
||||
@ -1697,7 +1732,7 @@ static void tcg_gen_vec_usadd32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
|
||||
tcg_temp_free_i32(max);
|
||||
}
|
||||
|
||||
static void tcg_gen_vec_usadd32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
|
||||
static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
|
||||
{
|
||||
TCGv_i64 max = tcg_const_i64(-1);
|
||||
tcg_gen_add_i64(d, a, b);
|
||||
@ -1709,20 +1744,30 @@ void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
|
||||
uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
|
||||
{
|
||||
static const GVecGen3 g[4] = {
|
||||
{ .fno = gen_helper_gvec_usadd8, .vece = MO_8 },
|
||||
{ .fno = gen_helper_gvec_usadd16, .vece = MO_16 },
|
||||
{ .fni4 = tcg_gen_vec_usadd32_i32,
|
||||
{ .fniv = tcg_gen_usadd_vec,
|
||||
.fno = gen_helper_gvec_usadd8,
|
||||
.opc = INDEX_op_usadd_vec,
|
||||
.vece = MO_8 },
|
||||
{ .fniv = tcg_gen_usadd_vec,
|
||||
.fno = gen_helper_gvec_usadd16,
|
||||
.opc = INDEX_op_usadd_vec,
|
||||
.vece = MO_16 },
|
||||
{ .fni4 = tcg_gen_usadd_i32,
|
||||
.fniv = tcg_gen_usadd_vec,
|
||||
.fno = gen_helper_gvec_usadd32,
|
||||
.opc = INDEX_op_usadd_vec,
|
||||
.vece = MO_32 },
|
||||
{ .fni8 = tcg_gen_vec_usadd32_i64,
|
||||
{ .fni8 = tcg_gen_usadd_i64,
|
||||
.fniv = tcg_gen_usadd_vec,
|
||||
.fno = gen_helper_gvec_usadd64,
|
||||
.opc = INDEX_op_usadd_vec,
|
||||
.vece = MO_64 }
|
||||
};
|
||||
tcg_debug_assert(vece <= MO_64);
|
||||
tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
|
||||
}
|
||||
|
||||
static void tcg_gen_vec_ussub32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
|
||||
static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
|
||||
{
|
||||
TCGv_i32 min = tcg_const_i32(0);
|
||||
tcg_gen_sub_i32(d, a, b);
|
||||
@ -1730,7 +1775,7 @@ static void tcg_gen_vec_ussub32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
|
||||
tcg_temp_free_i32(min);
|
||||
}
|
||||
|
||||
static void tcg_gen_vec_ussub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
|
||||
static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
|
||||
{
|
||||
TCGv_i64 min = tcg_const_i64(0);
|
||||
tcg_gen_sub_i64(d, a, b);
|
||||
@ -1742,13 +1787,131 @@ void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
|
||||
uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
|
||||
{
|
||||
static const GVecGen3 g[4] = {
|
||||
{ .fno = gen_helper_gvec_ussub8, .vece = MO_8 },
|
||||
{ .fno = gen_helper_gvec_ussub16, .vece = MO_16 },
|
||||
{ .fni4 = tcg_gen_vec_ussub32_i32,
|
||||
{ .fniv = tcg_gen_ussub_vec,
|
||||
.fno = gen_helper_gvec_ussub8,
|
||||
.opc = INDEX_op_ussub_vec,
|
||||
.vece = MO_8 },
|
||||
{ .fniv = tcg_gen_ussub_vec,
|
||||
.fno = gen_helper_gvec_ussub16,
|
||||
.opc = INDEX_op_ussub_vec,
|
||||
.vece = MO_16 },
|
||||
{ .fni4 = tcg_gen_ussub_i32,
|
||||
.fniv = tcg_gen_ussub_vec,
|
||||
.fno = gen_helper_gvec_ussub32,
|
||||
.opc = INDEX_op_ussub_vec,
|
||||
.vece = MO_32 },
|
||||
{ .fni8 = tcg_gen_vec_ussub32_i64,
|
||||
{ .fni8 = tcg_gen_ussub_i64,
|
||||
.fniv = tcg_gen_ussub_vec,
|
||||
.fno = gen_helper_gvec_ussub64,
|
||||
.opc = INDEX_op_ussub_vec,
|
||||
.vece = MO_64 }
|
||||
};
|
||||
tcg_debug_assert(vece <= MO_64);
|
||||
tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
|
||||
}
|
||||
|
||||
void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
|
||||
uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
|
||||
{
|
||||
static const GVecGen3 g[4] = {
|
||||
{ .fniv = tcg_gen_smin_vec,
|
||||
.fno = gen_helper_gvec_smin8,
|
||||
.opc = INDEX_op_smin_vec,
|
||||
.vece = MO_8 },
|
||||
{ .fniv = tcg_gen_smin_vec,
|
||||
.fno = gen_helper_gvec_smin16,
|
||||
.opc = INDEX_op_smin_vec,
|
||||
.vece = MO_16 },
|
||||
{ .fni4 = tcg_gen_smin_i32,
|
||||
.fniv = tcg_gen_smin_vec,
|
||||
.fno = gen_helper_gvec_smin32,
|
||||
.opc = INDEX_op_smin_vec,
|
||||
.vece = MO_32 },
|
||||
{ .fni8 = tcg_gen_smin_i64,
|
||||
.fniv = tcg_gen_smin_vec,
|
||||
.fno = gen_helper_gvec_smin64,
|
||||
.opc = INDEX_op_smin_vec,
|
||||
.vece = MO_64 }
|
||||
};
|
||||
tcg_debug_assert(vece <= MO_64);
|
||||
tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
|
||||
}
|
||||
|
||||
void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
|
||||
uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
|
||||
{
|
||||
static const GVecGen3 g[4] = {
|
||||
{ .fniv = tcg_gen_umin_vec,
|
||||
.fno = gen_helper_gvec_umin8,
|
||||
.opc = INDEX_op_umin_vec,
|
||||
.vece = MO_8 },
|
||||
{ .fniv = tcg_gen_umin_vec,
|
||||
.fno = gen_helper_gvec_umin16,
|
||||
.opc = INDEX_op_umin_vec,
|
||||
.vece = MO_16 },
|
||||
{ .fni4 = tcg_gen_umin_i32,
|
||||
.fniv = tcg_gen_umin_vec,
|
||||
.fno = gen_helper_gvec_umin32,
|
||||
.opc = INDEX_op_umin_vec,
|
||||
.vece = MO_32 },
|
||||
{ .fni8 = tcg_gen_umin_i64,
|
||||
.fniv = tcg_gen_umin_vec,
|
||||
.fno = gen_helper_gvec_umin64,
|
||||
.opc = INDEX_op_umin_vec,
|
||||
.vece = MO_64 }
|
||||
};
|
||||
tcg_debug_assert(vece <= MO_64);
|
||||
tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
|
||||
}
+
+void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{
+    static const GVecGen3 g[4] = {
+        { .fniv = tcg_gen_smax_vec,
+          .fno = gen_helper_gvec_smax8,
+          .opc = INDEX_op_smax_vec,
+          .vece = MO_8 },
+        { .fniv = tcg_gen_smax_vec,
+          .fno = gen_helper_gvec_smax16,
+          .opc = INDEX_op_smax_vec,
+          .vece = MO_16 },
+        { .fni4 = tcg_gen_smax_i32,
+          .fniv = tcg_gen_smax_vec,
+          .fno = gen_helper_gvec_smax32,
+          .opc = INDEX_op_smax_vec,
+          .vece = MO_32 },
+        { .fni8 = tcg_gen_smax_i64,
+          .fniv = tcg_gen_smax_vec,
+          .fno = gen_helper_gvec_smax64,
+          .opc = INDEX_op_smax_vec,
+          .vece = MO_64 }
+    };
+    tcg_debug_assert(vece <= MO_64);
+    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
+}
+
+void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{
+    static const GVecGen3 g[4] = {
+        { .fniv = tcg_gen_umax_vec,
+          .fno = gen_helper_gvec_umax8,
+          .opc = INDEX_op_umax_vec,
+          .vece = MO_8 },
+        { .fniv = tcg_gen_umax_vec,
+          .fno = gen_helper_gvec_umax16,
+          .opc = INDEX_op_umax_vec,
+          .vece = MO_16 },
+        { .fni4 = tcg_gen_umax_i32,
+          .fniv = tcg_gen_umax_vec,
+          .fno = gen_helper_gvec_umax32,
+          .opc = INDEX_op_umax_vec,
+          .vece = MO_32 },
+        { .fni8 = tcg_gen_umax_i64,
+          .fniv = tcg_gen_umax_vec,
+          .fno = gen_helper_gvec_umax64,
+          .opc = INDEX_op_umax_vec,
+          .vece = MO_64 }
+    };
+    tcg_debug_assert(vece <= MO_64);
+    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
+}
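
For reference, and not part of the patch itself: the expanders above assume specific per-element semantics for the new saturating and min/max operations. The standalone sketch below (plain ISO C, no QEMU headers; the scalar loop only mirrors what helpers such as gen_helper_gvec_ussub8 compute element by element) shows unsigned saturating subtract, signed minimum and unsigned maximum on 8-bit lanes, including why 0x80 behaves differently under signed and unsigned comparison.

#include <stdint.h>
#include <stdio.h>

/* Unsigned saturating subtract: clamps at 0 instead of wrapping. */
static uint8_t ussub8(uint8_t a, uint8_t b)
{
    return a > b ? a - b : 0;
}

/* Signed minimum on the same 8-bit lane. */
static int8_t smin8(int8_t a, int8_t b)
{
    return a < b ? a : b;
}

/* Unsigned maximum on the same 8-bit lane. */
static uint8_t umax8(uint8_t a, uint8_t b)
{
    return a > b ? a : b;
}

int main(void)
{
    const uint8_t a[4] = { 0x10, 0x80, 0xff, 0x01 };
    const uint8_t b[4] = { 0x20, 0x7f, 0x01, 0x01 };

    for (int i = 0; i < 4; i++) {
        /* 0x80 is 128 unsigned but -128 signed, so smin picks it
         * while an unsigned comparison would not. */
        printf("a=%02x b=%02x  ussub=%02x  smin=%02x  umax=%02x\n",
               a[i], b[i],
               ussub8(a[i], b[i]),
               (uint8_t)smin8((int8_t)a[i], (int8_t)b[i]),
               umax8(a[i], b[i]));
    }
    return 0;
}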
@@ -1840,7 +2003,12 @@ void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
         .opc = INDEX_op_and_vec,
         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
     };
-    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+
+    if (aofs == bofs) {
+        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
+    } else {
+        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+    }
 }
 
 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
@@ -1853,7 +2021,12 @@ void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
         .opc = INDEX_op_or_vec,
         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
     };
-    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+
+    if (aofs == bofs) {
+        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
+    } else {
+        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+    }
 }
 
 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
@@ -1866,7 +2039,12 @@ void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
         .opc = INDEX_op_xor_vec,
         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
     };
-    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+
+    if (aofs == bofs) {
+        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
+    } else {
+        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+    }
 }
 
 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
@@ -1879,7 +2057,12 @@ void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
         .opc = INDEX_op_andc_vec,
         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
     };
-    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+
+    if (aofs == bofs) {
+        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
+    } else {
+        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+    }
 }
 
 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
@@ -1892,7 +2075,63 @@ void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
         .opc = INDEX_op_orc_vec,
         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
     };
-    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+
+    if (aofs == bofs) {
+        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
+    } else {
+        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+    }
+}
+
+void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{
+    static const GVecGen3 g = {
+        .fni8 = tcg_gen_nand_i64,
+        .fniv = tcg_gen_nand_vec,
+        .fno = gen_helper_gvec_nand,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+    };
+
+    if (aofs == bofs) {
+        tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
+    } else {
+        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+    }
+}
+
+void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
+                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{
+    static const GVecGen3 g = {
+        .fni8 = tcg_gen_nor_i64,
+        .fniv = tcg_gen_nor_vec,
+        .fno = gen_helper_gvec_nor,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+    };
+
+    if (aofs == bofs) {
+        tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
+    } else {
+        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+    }
+}
+
+void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
+                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{
+    static const GVecGen3 g = {
+        .fni8 = tcg_gen_eqv_i64,
+        .fniv = tcg_gen_eqv_vec,
+        .fno = gen_helper_gvec_eqv,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+    };
+
+    if (aofs == bofs) {
+        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
+    } else {
+        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+    }
+}
 
 static const GVecGen2s gop_ands = {
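
As an aside (not part of the patch): the aofs == bofs shortcuts above rewrite an operation on two identical inputs into a mov, a constant dup, or a not. A standalone C check of the underlying bitwise identities, assuming nothing beyond the standard library:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t x = 0x0123456789abcdefULL;

    assert((x & x) == x);            /* and  -> mov    */
    assert((x | x) == x);            /* or   -> mov    */
    assert((x ^ x) == 0);            /* xor  -> dup 0  */
    assert((x & ~x) == 0);           /* andc -> dup 0  */
    assert((x | ~x) == UINT64_MAX);  /* orc  -> dup -1 */
    assert(~(x ^ x) == UINT64_MAX);  /* eqv  -> dup -1 */
    assert(~(x & x) == ~x);          /* nand -> not    */
    assert(~(x | x) == ~x);          /* nor  -> not    */

    puts("all aofs == bofs identities hold");
    return 0;
}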

tcg/tcg-op-gvec.h
@@ -181,6 +181,8 @@ typedef struct {
     uint8_t vece;
     /* Prefer i64 to v64. */
     bool prefer_i64;
+    /* Write aofs as a 2nd dest operand. */
+    bool write_aofs;
 } GVecGen4;
 
 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
@@ -232,6 +234,16 @@ void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 
+/* Min/max. */
+void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+
 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
@@ -242,6 +254,12 @@ void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
+                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
+                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 
 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
                        int64_t c, uint32_t oprsz, uint32_t maxsz);

tcg/tcg-op-vec.c
@@ -275,6 +275,27 @@ void tcg_gen_orc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
     }
 }
 
+void tcg_gen_nand_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    /* TODO: Add TCG_TARGET_HAS_nand_vec when adding a backend that supports it. */
+    tcg_gen_and_vec(0, r, a, b);
+    tcg_gen_not_vec(0, r, r);
+}
+
+void tcg_gen_nor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    /* TODO: Add TCG_TARGET_HAS_nor_vec when adding a backend that supports it. */
+    tcg_gen_or_vec(0, r, a, b);
+    tcg_gen_not_vec(0, r, r);
+}
+
+void tcg_gen_eqv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    /* TODO: Add TCG_TARGET_HAS_eqv_vec when adding a backend that supports it. */
+    tcg_gen_xor_vec(0, r, a, b);
+    tcg_gen_not_vec(0, r, r);
+}
+
 void tcg_gen_not_vec(unsigned vece, TCGv_vec r, TCGv_vec a)
 {
     if (TCG_TARGET_HAS_not_vec) {
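
For reference, not from the patch: the generic fallbacks above build nand, nor and eqv as the base bitwise op followed by a not. A minimal standalone C sketch of the resulting semantics, checked against the usual Boolean identities:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t a = 0xff00f0f0ccccaaaaULL;
    uint64_t b = 0x0123456789abcdefULL;

    uint64_t nand = ~(a & b);   /* and_vec then not_vec */
    uint64_t nor  = ~(a | b);   /* or_vec  then not_vec */
    uint64_t eqv  = ~(a ^ b);   /* xor_vec then not_vec */

    assert(nand == (~a | ~b));             /* De Morgan        */
    assert(nor == (~a & ~b));              /* De Morgan        */
    assert(eqv == ((a & b) | (~a & ~b)));  /* XNOR truth table */

    printf("nand=%016llx nor=%016llx eqv=%016llx\n",
           (unsigned long long)nand, (unsigned long long)nor,
           (unsigned long long)eqv);
    return 0;
}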
@@ -365,7 +386,8 @@ void tcg_gen_cmp_vec(TCGCond cond, unsigned vece,
     }
 }
 
-void tcg_gen_mul_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+static void do_op3(unsigned vece, TCGv_vec r, TCGv_vec a,
+                   TCGv_vec b, TCGOpcode opc)
 {
     TCGTemp *rt = tcgv_vec_temp(r);
     TCGTemp *at = tcgv_vec_temp(a);
@@ -378,11 +400,56 @@ void tcg_gen_mul_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
 
     tcg_debug_assert(at->base_type >= type);
     tcg_debug_assert(bt->base_type >= type);
-    can = tcg_can_emit_vec_op(INDEX_op_mul_vec, type, vece);
+    can = tcg_can_emit_vec_op(opc, type, vece);
     if (can > 0) {
-        vec_gen_3(INDEX_op_mul_vec, type, vece, ri, ai, bi);
+        vec_gen_3(opc, type, vece, ri, ai, bi);
     } else {
         tcg_debug_assert(can < 0);
-        tcg_expand_vec_op(INDEX_op_mul_vec, type, vece, ri, ai, bi);
+        tcg_expand_vec_op(opc, type, vece, ri, ai, bi);
     }
 }
+
+void tcg_gen_mul_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    do_op3(vece, r, a, b, INDEX_op_mul_vec);
+}
+
+void tcg_gen_ssadd_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    do_op3(vece, r, a, b, INDEX_op_ssadd_vec);
+}
+
+void tcg_gen_usadd_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    do_op3(vece, r, a, b, INDEX_op_usadd_vec);
+}
+
+void tcg_gen_sssub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    do_op3(vece, r, a, b, INDEX_op_sssub_vec);
+}
+
+void tcg_gen_ussub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    do_op3(vece, r, a, b, INDEX_op_ussub_vec);
+}
+
+void tcg_gen_smin_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    do_op3(vece, r, a, b, INDEX_op_smin_vec);
+}
+
+void tcg_gen_umin_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    do_op3(vece, r, a, b, INDEX_op_umin_vec);
+}
+
+void tcg_gen_smax_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    do_op3(vece, r, a, b, INDEX_op_smax_vec);
+}
+
+void tcg_gen_umax_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    do_op3(vece, r, a, b, INDEX_op_umax_vec);
+}
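
As a further aside (again not part of the patch): the new tcg_gen_ssadd_vec and tcg_gen_sssub_vec wrappers request signed saturating arithmetic per element. A standalone C sketch of that clamping behaviour for an 8-bit lane, assuming only the standard library:

#include <stdint.h>
#include <stdio.h>

/* Clamp a wider intermediate result into the signed 8-bit lane range. */
static int8_t sat8(int v)
{
    if (v > INT8_MAX) {
        return INT8_MAX;
    }
    if (v < INT8_MIN) {
        return INT8_MIN;
    }
    return v;
}

static int8_t ssadd8(int8_t a, int8_t b) { return sat8(a + b); }
static int8_t sssub8(int8_t a, int8_t b) { return sat8(a - b); }

int main(void)
{
    printf("ssadd8(100, 100)   = %d\n", ssadd8(100, 100));    /* 127, not -56  */
    printf("ssadd8(-100, -100) = %d\n", ssadd8(-100, -100));  /* -128, not 56  */
    printf("sssub8(-100, 100)  = %d\n", sssub8(-100, 100));   /* -128, not 56  */
    printf("sssub8(50, 20)     = %d\n", sssub8(50, 20));      /* 30, unchanged */
    return 0;
}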

tcg/tcg-op.h
@@ -962,8 +962,19 @@ void tcg_gen_or_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
 void tcg_gen_xor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
 void tcg_gen_andc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
 void tcg_gen_orc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_nand_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_nor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_eqv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
 void tcg_gen_not_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
 void tcg_gen_neg_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
+void tcg_gen_ssadd_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_usadd_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_sssub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_ussub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_smin_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_umin_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_smax_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_umax_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
 
 void tcg_gen_shli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
 void tcg_gen_shri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);

tcg/tcg-opc.h
@@ -222,6 +222,14 @@ DEF(add_vec, 1, 2, 0, IMPLVEC)
 DEF(sub_vec, 1, 2, 0, IMPLVEC)
 DEF(mul_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_mul_vec))
 DEF(neg_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
+DEF(ssadd_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_sat_vec))
+DEF(usadd_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_sat_vec))
+DEF(sssub_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_sat_vec))
+DEF(ussub_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_sat_vec))
+DEF(smin_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_minmax_vec))
+DEF(umin_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_minmax_vec))
+DEF(smax_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_minmax_vec))
+DEF(umax_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_minmax_vec))
 
 DEF(and_vec, 1, 2, 0, IMPLVEC)
 DEF(or_vec, 1, 2, 0, IMPLVEC)

tcg/tcg.c
@@ -1607,6 +1607,16 @@ bool tcg_op_supported(TCGOpcode op)
     case INDEX_op_shrv_vec:
     case INDEX_op_sarv_vec:
         return have_vec && TCG_TARGET_HAS_shv_vec;
+    case INDEX_op_ssadd_vec:
+    case INDEX_op_usadd_vec:
+    case INDEX_op_sssub_vec:
+    case INDEX_op_ussub_vec:
+        return have_vec && TCG_TARGET_HAS_sat_vec;
+    case INDEX_op_smin_vec:
+    case INDEX_op_umin_vec:
+    case INDEX_op_smax_vec:
+    case INDEX_op_umax_vec:
+        return have_vec && TCG_TARGET_HAS_minmax_vec;
 
     default:
         tcg_debug_assert(op > INDEX_op_last_generic && op < NB_OPS);