linux/arch/riscv/kvm/vcpu_pmu.c
Anup Patel 47d40d9329 RISC-V: KVM: Don't zero-out PMU snapshot area before freeing data
With the latest Linux-6.11-rc3, the below NULL pointer crash is observed
when SBI PMU snapshot is enabled for the guest and the guest is forcefully
powered-off.

  Unable to handle kernel NULL pointer dereference at virtual address 0000000000000508
  Oops [#1]
  Modules linked in: kvm
  CPU: 0 UID: 0 PID: 61 Comm: term-poll Not tainted 6.11.0-rc3-00018-g44d7178dd77a #3
  Hardware name: riscv-virtio,qemu (DT)
  epc : __kvm_write_guest_page+0x94/0xa6 [kvm]
   ra : __kvm_write_guest_page+0x54/0xa6 [kvm]
  epc : ffffffff01590e98 ra : ffffffff01590e58 sp : ffff8f80001f39b0
   gp : ffffffff81512a60 tp : ffffaf80024872c0 t0 : ffffaf800247e000
   t1 : 00000000000007e0 t2 : 0000000000000000 s0 : ffff8f80001f39f0
   s1 : 00007fff89ac4000 a0 : ffffffff015dd7e8 a1 : 0000000000000086
   a2 : 0000000000000000 a3 : ffffaf8000000000 a4 : ffffaf80024882c0
   a5 : 0000000000000000 a6 : ffffaf800328d780 a7 : 00000000000001cc
   s2 : ffffaf800197bd00 s3 : 00000000000828c4 s4 : ffffaf800248c000
   s5 : ffffaf800247d000 s6 : 0000000000001000 s7 : 0000000000001000
   s8 : 0000000000000000 s9 : 00007fff861fd500 s10: 0000000000000001
   s11: 0000000000800000 t3 : 00000000000004d3 t4 : 00000000000004d3
   t5 : ffffffff814126e0 t6 : ffffffff81412700
  status: 0000000200000120 badaddr: 0000000000000508 cause: 000000000000000d
  [<ffffffff01590e98>] __kvm_write_guest_page+0x94/0xa6 [kvm]
  [<ffffffff015943a6>] kvm_vcpu_write_guest+0x56/0x90 [kvm]
  [<ffffffff015a175c>] kvm_pmu_clear_snapshot_area+0x42/0x7e [kvm]
  [<ffffffff015a1972>] kvm_riscv_vcpu_pmu_deinit.part.0+0xe0/0x14e [kvm]
  [<ffffffff015a2ad0>] kvm_riscv_vcpu_pmu_deinit+0x1a/0x24 [kvm]
  [<ffffffff0159b344>] kvm_arch_vcpu_destroy+0x28/0x4c [kvm]
  [<ffffffff0158e420>] kvm_destroy_vcpus+0x5a/0xda [kvm]
  [<ffffffff0159930c>] kvm_arch_destroy_vm+0x14/0x28 [kvm]
  [<ffffffff01593260>] kvm_destroy_vm+0x168/0x2a0 [kvm]
  [<ffffffff015933d4>] kvm_put_kvm+0x3c/0x58 [kvm]
  [<ffffffff01593412>] kvm_vm_release+0x22/0x2e [kvm]

Clearly, the kvm_vcpu_write_guest() function is crashing because it is
being called from kvm_pmu_clear_snapshot_area() upon guest tear down.

To address the above issue, simplify kvm_pmu_clear_snapshot_area() so that
it no longer zeroes out the PMU snapshot area, because the guest is being
torn down anyway.

kvm_pmu_clear_snapshot_area() is also called when the guest changes the
PMU snapshot area of a VCPU, but even in this case the previous PMU
snapshot area must not be zeroed out because the guest might have
reclaimed the previous PMU snapshot area for some other purpose.

Fixes: c2f41ddbcd ("RISC-V: KVM: Implement SBI PMU Snapshot feature")
Signed-off-by: Anup Patel <apatel@ventanamicro.com>
Link: https://lore.kernel.org/r/20240815170907.2792229-1-apatel@ventanamicro.com
Signed-off-by: Anup Patel <anup@brainfault.org>
2024-08-19 08:58:17 +05:30

850 lines
23 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2023 Rivos Inc
*
* Authors:
* Atish Patra <atishp@rivosinc.com>
*/
#define pr_fmt(fmt) "riscv-kvm-pmu: " fmt
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/kvm_host.h>
#include <linux/perf/riscv_pmu.h>
#include <asm/csr.h>
#include <asm/kvm_vcpu_sbi.h>
#include <asm/kvm_vcpu_pmu.h>
#include <asm/sbi.h>
#include <linux/bitops.h>
/* Total number of counters tracked by @pmu: hardware counters plus firmware counters. */
#define kvm_pmu_num_counters(pmu) ((pmu)->num_hw_ctrs + (pmu)->num_fw_ctrs)
/* Extract the event type field of an SBI event index (type mask, shifted down by 16 bits). */
#define get_event_type(x) (((x) & SBI_PMU_EVENT_IDX_TYPE_MASK) >> 16)
/* Extract the event code field of an SBI event index. */
#define get_event_code(x) ((x) & SBI_PMU_EVENT_IDX_CODE_MASK)
/*
 * Translation table from SBI general hardware event codes to the matching
 * generic perf hardware event ids. It is indexed by the SBI event code and
 * is only ever read, so it is declared const.
 */
static const enum perf_hw_id hw_event_perf_map[SBI_PMU_HW_GENERAL_MAX] = {
	[SBI_PMU_HW_CPU_CYCLES] = PERF_COUNT_HW_CPU_CYCLES,
	[SBI_PMU_HW_INSTRUCTIONS] = PERF_COUNT_HW_INSTRUCTIONS,
	[SBI_PMU_HW_CACHE_REFERENCES] = PERF_COUNT_HW_CACHE_REFERENCES,
	[SBI_PMU_HW_CACHE_MISSES] = PERF_COUNT_HW_CACHE_MISSES,
	[SBI_PMU_HW_BRANCH_INSTRUCTIONS] = PERF_COUNT_HW_BRANCH_INSTRUCTIONS,
	[SBI_PMU_HW_BRANCH_MISSES] = PERF_COUNT_HW_BRANCH_MISSES,
	[SBI_PMU_HW_BUS_CYCLES] = PERF_COUNT_HW_BUS_CYCLES,
	[SBI_PMU_HW_STALLED_CYCLES_FRONTEND] = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND,
	[SBI_PMU_HW_STALLED_CYCLES_BACKEND] = PERF_COUNT_HW_STALLED_CYCLES_BACKEND,
	[SBI_PMU_HW_REF_CPU_CYCLES] = PERF_COUNT_HW_REF_CPU_CYCLES,
};
/*
 * Derive the perf sample period for @pmc from its current counter value.
 *
 * The value mask is built from bit 0 up to and including bit cinfo.width.
 * A zero counter value yields the whole mask; any other value yields its
 * arithmetic negation truncated to that mask.
 */
static u64 kvm_pmu_get_sample_period(struct kvm_pmc *pmc)
{
	u64 mask = GENMASK(pmc->cinfo.width, 0);

	if (pmc->counter_val)
		return (-pmc->counter_val) & mask;

	return mask;
}
static u32 kvm_pmu_get_perf_event_type(unsigned long eidx)
{
enum sbi_pmu_event_type etype = get_event_type(eidx);
u32 type = PERF_TYPE_MAX;
switch (etype) {
case SBI_PMU_EVENT_TYPE_HW:
type = PERF_TYPE_HARDWARE;
break;
case SBI_PMU_EVENT_TYPE_CACHE:
type = PERF_TYPE_HW_CACHE;
break;
case SBI_PMU_EVENT_TYPE_RAW:
case SBI_PMU_EVENT_TYPE_FW:
type = PERF_TYPE_RAW;
break;
default:
break;
}
return type;
}
static bool kvm_pmu_is_fw_event(unsigned long eidx)
{
return get_event_type(eidx) == SBI_PMU_EVENT_TYPE_FW;
}
/*
 * Disable and release the perf event attached to @pmc, if there is one,
 * and clear the pointer so the counter no longer refers to it.
 */
static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc)
{
	struct perf_event *event = pmc->perf_event;

	if (!event)
		return;

	perf_event_disable(event);
	perf_event_release_kernel(event);
	pmc->perf_event = NULL;
}
/*
 * Look up the perf hardware event id for an SBI general hardware event code.
 *
 * No range check is done here; the caller must pass a code that is a valid
 * index into hw_event_perf_map.
 */
static u64 kvm_pmu_get_perf_event_hw_config(u32 sbi_event_code)
{
	u64 perf_config = hw_event_perf_map[sbi_event_code];

	return perf_config;
}
/*
 * Build the perf cache event config from an SBI cache event code.
 *
 * The code is split into cache id, operation id and result id. The perf
 * config packs them as id | (op << 8) | (result << 16). If any field is
 * outside the range perf supports, U64_MAX is returned instead.
 */
static u64 kvm_pmu_get_perf_event_cache_config(u32 sbi_event_code)
{
	unsigned int type_id, op_id, result_id;

	/* Every cache event mask fits in 0xFF, so no extra masking is required. */
	type_id = (sbi_event_code & SBI_PMU_EVENT_CACHE_ID_CODE_MASK) >>
		  SBI_PMU_EVENT_CACHE_ID_SHIFT;
	op_id = (sbi_event_code & SBI_PMU_EVENT_CACHE_OP_ID_CODE_MASK) >>
		SBI_PMU_EVENT_CACHE_OP_SHIFT;
	result_id = sbi_event_code & SBI_PMU_EVENT_CACHE_RESULT_ID_CODE_MASK;

	if (type_id < PERF_COUNT_HW_CACHE_MAX &&
	    op_id < PERF_COUNT_HW_CACHE_OP_MAX &&
	    result_id < PERF_COUNT_HW_CACHE_RESULT_MAX)
		return type_id | (op_id << 8) | (result_id << 16);

	return U64_MAX;
}
/*
 * Translate SBI event index @eidx (and @evt_data for raw events) into a
 * perf event config value.
 *
 * Returns U64_MAX when the event type is unknown or when a hardware or
 * firmware event code is out of range.
 */
static u64 kvm_pmu_get_perf_event_config(unsigned long eidx, uint64_t evt_data)
{
	enum sbi_pmu_event_type etype = get_event_type(eidx);
	u32 ecode = get_event_code(eidx);

	switch (etype) {
	case SBI_PMU_EVENT_TYPE_HW:
		if (ecode >= SBI_PMU_HW_GENERAL_MAX)
			break;
		return kvm_pmu_get_perf_event_hw_config(ecode);
	case SBI_PMU_EVENT_TYPE_CACHE:
		return kvm_pmu_get_perf_event_cache_config(ecode);
	case SBI_PMU_EVENT_TYPE_RAW:
		return evt_data & RISCV_PMU_RAW_EVENT_MASK;
	case SBI_PMU_EVENT_TYPE_FW:
		if (ecode >= SBI_PMU_FW_MAX)
			break;
		/* Firmware events carry bit 63 in addition to the event code. */
		return (1ULL << 63) | ecode;
	default:
		break;
	}

	return U64_MAX;
}
/*
 * Return the fixed counter index for @eidx, or -EINVAL if the event does
 * not use a fixed counter.
 *
 * Only two general hardware events have a fixed counter: CPU cycles use
 * counter 0 and instructions use counter 2.
 */
static int kvm_pmu_get_fixed_pmc_index(unsigned long eidx)
{
	/*
	 * Compare the SBI event type against the SBI constant. The previous
	 * code compared the translated perf type (PERF_TYPE_*) against
	 * SBI_PMU_EVENT_TYPE_HW, which is only right if the two enumerations
	 * happen to share a value.
	 */
	u32 etype = get_event_type(eidx);
	u32 ecode = get_event_code(eidx);

	if (etype != SBI_PMU_EVENT_TYPE_HW)
		return -EINVAL;

	if (ecode == SBI_PMU_HW_CPU_CYCLES)
		return 0;
	else if (ecode == SBI_PMU_HW_INSTRUCTIONS)
		return 2;
	else
		return -EINVAL;
}
static int kvm_pmu_get_programmable_pmc_index(struct kvm_pmu *kvpmu, unsigned long eidx,
unsigned long cbase, unsigned long cmask)
{
int ctr_idx = -1;
int i, pmc_idx;
int min, max;
if (kvm_pmu_is_fw_event(eidx)) {
/* Firmware counters are mapped 1:1 starting from num_hw_ctrs for simplicity */
min = kvpmu->num_hw_ctrs;
max = min + kvpmu->num_fw_ctrs;
} else {
/* First 3 counters are reserved for fixed counters */
min = 3;
max = kvpmu->num_hw_ctrs;
}
for_each_set_bit(i, &cmask, BITS_PER_LONG) {
pmc_idx = i + cbase;
if ((pmc_idx >= min && pmc_idx < max) &&
!test_bit(pmc_idx, kvpmu->pmc_in_use)) {
ctr_idx = pmc_idx;
break;
}
}
return ctr_idx;
}
/*
 * Choose a counter index for @eidx.
 *
 * Events that have a fixed counter always get that counter, because fixed
 * counters have a different width. All other events fall back to the
 * programmable counter search.
 */
static int pmu_get_pmc_index(struct kvm_pmu *pmu, unsigned long eidx,
			     unsigned long cbase, unsigned long cmask)
{
	int fixed_idx = kvm_pmu_get_fixed_pmc_index(eidx);

	if (fixed_idx < 0)
		return kvm_pmu_get_programmable_pmc_index(pmu, eidx, cbase, cmask);

	return fixed_idx;
}
static int pmu_fw_ctr_read_hi(struct kvm_vcpu *vcpu, unsigned long cidx,
unsigned long *out_val)
{
struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
int fevent_code;
if (!IS_ENABLED(CONFIG_32BIT)) {
pr_warn("%s: should be invoked for only RV32\n", __func__);
return -EINVAL;
}
if (cidx >= kvm_pmu_num_counters(kvpmu) || cidx == 1) {
pr_warn("Invalid counter id [%ld]during read\n", cidx);
return -EINVAL;
}
pmc = &kvpmu->pmc[cidx];
if (pmc->cinfo.type != SBI_PMU_CTR_TYPE_FW)
return -EINVAL;
fevent_code = get_event_code(pmc->event_idx);
pmc->counter_val = kvpmu->fw_event[fevent_code].value;
*out_val = pmc->counter_val >> 32;
return 0;
}
static int pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
unsigned long *out_val)
{
struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
u64 enabled, running;
int fevent_code;
if (cidx >= kvm_pmu_num_counters(kvpmu) || cidx == 1) {
pr_warn("Invalid counter id [%ld] during read\n", cidx);
return -EINVAL;
}
pmc = &kvpmu->pmc[cidx];
if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) {
fevent_code = get_event_code(pmc->event_idx);
pmc->counter_val = kvpmu->fw_event[fevent_code].value;
} else if (pmc->perf_event) {
pmc->counter_val += perf_event_read_value(pmc->perf_event, &enabled, &running);
} else {
return -EINVAL;
}
*out_val = pmc->counter_val;
return 0;
}
/*
 * Check a counter base/mask pair supplied by the caller.
 *
 * The mask must be non-empty, and the highest counter it selects
 * (@ctr_base plus the index of the top set bit) must be below the total
 * number of counters. Returns 0 if valid, -EINVAL otherwise.
 */
static int kvm_pmu_validate_counter_mask(struct kvm_pmu *kvpmu, unsigned long ctr_base,
					 unsigned long ctr_mask)
{
	if (!ctr_mask)
		return -EINVAL;

	if (ctr_base + __fls(ctr_mask) >= kvm_pmu_num_counters(kvpmu))
		return -EINVAL;

	return 0;
}
/*
 * Overflow callback passed to perf_event_create_kernel_counter() for guest
 * counters.
 *
 * It stops the event, reprograms its sample period from the current count,
 * marks the counter as overflown in kvpmu->pmc_overflown, raises IRQ_PMU_OVF
 * on the vCPU that owns the counter, and then restarts the event.
 * @data and @regs are not used.
 */
static void kvm_riscv_pmu_overflow(struct perf_event *perf_event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
/* The owning counter was registered as the overflow handler context. */
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
struct kvm_vcpu *vcpu = pmc->vcpu;
struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
struct riscv_pmu *rpmu = to_riscv_pmu(perf_event->pmu);
u64 period;
/*
 * Stop the event counting by directly accessing the perf_event.
 * Otherwise, this needs to be deferred via a workqueue.
 * That will introduce skew in the counter value because the actual
 * physical counter would start after returning from this function.
 * It will be stopped again once the workqueue is scheduled.
 */
rpmu->pmu.stop(perf_event, PERF_EF_UPDATE);
/*
 * The hw counter would start automatically when this function returns.
 * Thus, the host may continue to interrupt and inject it to the guest
 * even without the guest configuring the next event. Depending on the hardware
 * the host may have some sluggishness only if privilege mode filtering is not
 * available. In an ideal world, where qemu is not the only capable hardware,
 * this can be removed.
 * FYI: ARM64 does this way while x86 doesn't do anything as such.
 * TODO: Should we keep it for RISC-V ?
 */
/* The new period is the negation of the event's current count. */
period = -(local64_read(&perf_event->count));
local64_set(&perf_event->hw.period_left, 0);
perf_event->attr.sample_period = period;
perf_event->hw.sample_period = period;
/* Record the overflow and notify the guest before counting resumes. */
set_bit(pmc->idx, kvpmu->pmc_overflown);
kvm_riscv_vcpu_set_interrupt(vcpu, IRQ_PMU_OVF);
rpmu->pmu.start(perf_event, PERF_EF_RELOAD);
}
/*
 * (Re)create the perf event that backs @pmc.
 *
 * Any event already attached to the counter is released first. @attr gets
 * its config from @eidx/@evtdata and a default sample period; the value
 * requested by the guest is applied later, when the counter is started.
 * SBI_PMU_CFG_FLAG_CLEAR_VALUE resets the cached counter value, and
 * SBI_PMU_CFG_FLAG_AUTO_START enables the new event immediately.
 *
 * Returns 0 on success, or the error encoded in the pointer returned by
 * perf_event_create_kernel_counter().
 */
static long kvm_pmu_create_perf_event(struct kvm_pmc *pmc, struct perf_event_attr *attr,
				      unsigned long flags, unsigned long eidx,
				      unsigned long evtdata)
{
	struct perf_event *new_event;

	kvm_pmu_release_perf_event(pmc);

	attr->config = kvm_pmu_get_perf_event_config(eidx, evtdata);

	/* TODO: Do we really want to clear the value in hardware counter */
	if (flags & SBI_PMU_CFG_FLAG_CLEAR_VALUE)
		pmc->counter_val = 0;

	/* Default period only; the start call installs the guest's value. */
	attr->sample_period = kvm_pmu_get_sample_period(pmc);

	new_event = perf_event_create_kernel_counter(attr, -1, current,
						     kvm_riscv_pmu_overflow, pmc);
	if (!IS_ERR(new_event)) {
		pmc->perf_event = new_event;
		if (flags & SBI_PMU_CFG_FLAG_AUTO_START)
			perf_event_enable(new_event);
		return 0;
	}

	pr_debug("kvm pmu event creation failed for eidx %lx: %ld\n", eidx, PTR_ERR(new_event));

	return PTR_ERR(new_event);
}
/*
 * Increment firmware event @fid for @vcpu.
 *
 * The count only advances while the event is marked as started.
 * Returns 0 on success, or -EINVAL if there is no PMU context or @fid is
 * not a valid firmware event id.
 */
int kvm_riscv_vcpu_pmu_incr_fw(struct kvm_vcpu *vcpu, unsigned long fid)
{
	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);

	if (!kvpmu || fid >= SBI_PMU_FW_MAX)
		return -EINVAL;

	if (kvpmu->fw_event[fid].started)
		kvpmu->fw_event[fid].value++;

	return 0;
}
/*
 * Emulate a guest access to counter CSR @csr_num.
 *
 * Returns KVM_INSN_CONTINUE_NEXT_SEPC with *@val filled in on success, or
 * KVM_INSN_ILLEGAL_TRAP when the access is not allowed. Counter CSRs are
 * read-only, so a non-zero @wr_mask is rejected. @new_val is not used.
 */
int kvm_riscv_vcpu_pmu_read_hpm(struct kvm_vcpu *vcpu, unsigned int csr_num,
				unsigned long *val, unsigned long new_val,
				unsigned long wr_mask)
{
	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
	int cidx;

	if (!kvpmu || !kvpmu->init_done) {
		/*
		 * Without sscofpmf in the platform the guest OS may use the
		 * legacy PMU driver to read cycle/instret. Report 0 for those
		 * two so the guest does not take an illegal trap. Every other
		 * hpmcounter must be accessed through SBI PMU, so it traps.
		 */
		if (csr_num != CSR_CYCLE && csr_num != CSR_INSTRET)
			return KVM_INSN_ILLEGAL_TRAP;

		*val = 0;
		return KVM_INSN_CONTINUE_NEXT_SEPC;
	}

	/* Counter CSRs are read-only: any write attempt is an illegal trap. */
	if (wr_mask)
		return KVM_INSN_ILLEGAL_TRAP;

	cidx = csr_num - CSR_CYCLE;
	if (pmu_ctr_read(vcpu, cidx, val) < 0)
		return KVM_INSN_ILLEGAL_TRAP;

	return KVM_INSN_CONTINUE_NEXT_SEPC;
}
/*
 * Drop the PMU snapshot state of @vcpu.
 *
 * The host-side copy is freed and the snapshot address is marked invalid.
 * Nothing is written to guest memory.
 */
static void kvm_pmu_clear_snapshot_area(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	pmu->snapshot_addr = INVALID_GPA;

	kfree(pmu->sdata);
	pmu->sdata = NULL;
}
/*
 * Register, replace or disable the PMU snapshot shared memory of @vcpu.
 *
 * @saddr_low/@saddr_high: guest physical address of the snapshot area.
 *	Passing SBI_SHMEM_DISABLE for both disables the snapshot.
 * @flags: must be zero.
 * @retdata: receives the SBI status in err_val.
 *
 * Returns 0 when the request was handled (the SBI status is in
 * @retdata->err_val), or -ENOMEM if the host-side copy cannot be allocated.
 *
 * Fixes over the previous version:
 *  - On 32-bit hosts a non-zero @saddr_high is merged into the address and
 *    the request proceeds. Before, it jumped straight to "out" and reported
 *    success without installing anything.
 *  - The new buffer is only published in kvpmu->sdata once the guest write
 *    has succeeded. Before, a failed write left a dangling pointer that was
 *    freed again later.
 *  - A previously registered buffer is freed when it is replaced, so
 *    re-registration no longer leaks it.
 */
int kvm_riscv_vcpu_pmu_snapshot_set_shmem(struct kvm_vcpu *vcpu, unsigned long saddr_low,
					  unsigned long saddr_high, unsigned long flags,
					  struct kvm_vcpu_sbi_return *retdata)
{
	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
	int snapshot_area_size = sizeof(struct riscv_pmu_snapshot_data);
	struct riscv_pmu_snapshot_data *sdata;
	int sbiret = 0;
	gpa_t saddr;
	unsigned long hva;
	bool writable;

	if (!kvpmu || flags) {
		sbiret = SBI_ERR_INVALID_PARAM;
		goto out;
	}

	if (saddr_low == SBI_SHMEM_DISABLE && saddr_high == SBI_SHMEM_DISABLE) {
		kvm_pmu_clear_snapshot_area(vcpu);
		return 0;
	}

	saddr = saddr_low;

	if (saddr_high != 0) {
		/* A high half is only meaningful with 32-bit registers. */
		if (!IS_ENABLED(CONFIG_32BIT)) {
			sbiret = SBI_ERR_INVALID_ADDRESS;
			goto out;
		}
		saddr |= ((gpa_t)saddr_high << 32);
	}

	/* The snapshot area must be backed by writable guest memory. */
	hva = kvm_vcpu_gfn_to_hva_prot(vcpu, saddr >> PAGE_SHIFT, &writable);
	if (kvm_is_error_hva(hva) || !writable) {
		sbiret = SBI_ERR_INVALID_ADDRESS;
		goto out;
	}

	sdata = kzalloc(snapshot_area_size, GFP_ATOMIC);
	if (!sdata)
		return -ENOMEM;

	/* Zero the guest's area; keep any existing registration on failure. */
	if (kvm_vcpu_write_guest(vcpu, saddr, sdata, snapshot_area_size)) {
		kfree(sdata);
		sbiret = SBI_ERR_FAILURE;
		goto out;
	}

	/* Replace the previous host-side copy, if any (kfree(NULL) is a no-op). */
	kfree(kvpmu->sdata);
	kvpmu->sdata = sdata;
	kvpmu->snapshot_addr = saddr;

out:
	retdata->err_val = sbiret;

	return 0;
}
int kvm_riscv_vcpu_pmu_num_ctrs(struct kvm_vcpu *vcpu,
struct kvm_vcpu_sbi_return *retdata)
{
struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
retdata->out_val = kvm_pmu_num_counters(kvpmu);
return 0;
}
/*
 * Return the counter info word of counter @cidx in @retdata->out_val.
 *
 * Index 1 and any index at or beyond RISCV_KVM_MAX_COUNTERS are rejected
 * with SBI_ERR_INVALID_PARAM in @retdata->err_val. Always returns 0.
 */
int kvm_riscv_vcpu_pmu_ctr_info(struct kvm_vcpu *vcpu, unsigned long cidx,
				struct kvm_vcpu_sbi_return *retdata)
{
	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);

	/*
	 * Valid pmc[] indices stop at RISCV_KVM_MAX_COUNTERS - 1, so the bound
	 * must use ">=". The previous ">" allowed cidx == RISCV_KVM_MAX_COUNTERS
	 * to index the array.
	 */
	if (cidx >= RISCV_KVM_MAX_COUNTERS || cidx == 1) {
		retdata->err_val = SBI_ERR_INVALID_PARAM;
		return 0;
	}

	retdata->out_val = kvpmu->pmc[cidx].cinfo.value;

	return 0;
}
/*
 * Start the counters selected by @ctr_base/@ctr_mask for @vcpu.
 *
 * @flags: SBI_PMU_START_FLAG_SET_INIT_VALUE loads @ival into each counter;
 *	SBI_PMU_START_FLAG_INIT_SNAPSHOT loads the values from the snapshot
 *	shared memory instead. The init-value flag wins if both are set.
 * @retdata: receives the SBI status in err_val.
 *
 * Counters in the mask that are not marked in-use are skipped. Always
 * returns 0; errors are reported through @retdata->err_val only.
 */
int kvm_riscv_vcpu_pmu_ctr_start(struct kvm_vcpu *vcpu, unsigned long ctr_base,
unsigned long ctr_mask, unsigned long flags, u64 ival,
struct kvm_vcpu_sbi_return *retdata)
{
struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
int i, pmc_index, sbiret = 0;
struct kvm_pmc *pmc;
int fevent_code;
bool snap_flag_set = flags & SBI_PMU_START_FLAG_INIT_SNAPSHOT;
if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) {
sbiret = SBI_ERR_INVALID_PARAM;
goto out;
}
if (snap_flag_set) {
/* A snapshot start needs a registered shared memory area. */
if (kvpmu->snapshot_addr == INVALID_GPA) {
sbiret = SBI_ERR_NO_SHMEM;
goto out;
}
/* Refresh the host-side copy from the guest before using its values. */
if (kvm_vcpu_read_guest(vcpu, kvpmu->snapshot_addr, kvpmu->sdata,
sizeof(struct riscv_pmu_snapshot_data))) {
pr_warn("Unable to read snapshot shared memory while starting counters\n");
sbiret = SBI_ERR_FAILURE;
goto out;
}
}
/* Start the counters that have been configured and requested by the guest */
/*
 * NOTE(review): this scan is bounded by RISCV_MAX_COUNTERS while ctr_mask is
 * a single unsigned long; kvm_pmu_get_programmable_pmc_index() bounds the
 * same kind of scan with BITS_PER_LONG - confirm the bound is safe on 32-bit.
 */
for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
pmc_index = i + ctr_base;
if (!test_bit(pmc_index, kvpmu->pmc_in_use))
continue;
/* The guest started the counter again. Reset the overflow status */
clear_bit(pmc_index, kvpmu->pmc_overflown);
pmc = &kvpmu->pmc[pmc_index];
if (flags & SBI_PMU_START_FLAG_SET_INIT_VALUE) {
pmc->counter_val = ival;
} else if (snap_flag_set) {
/* The counter index in the snapshot are relative to the counter base */
pmc->counter_val = kvpmu->sdata->ctr_values[i];
}
if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) {
fevent_code = get_event_code(pmc->event_idx);
/* An invalid firmware event code aborts the whole request. */
if (fevent_code >= SBI_PMU_FW_MAX) {
sbiret = SBI_ERR_INVALID_PARAM;
goto out;
}
/* Check if the counter was already started for some reason */
if (kvpmu->fw_event[fevent_code].started) {
sbiret = SBI_ERR_ALREADY_STARTED;
continue;
}
kvpmu->fw_event[fevent_code].started = true;
kvpmu->fw_event[fevent_code].value = pmc->counter_val;
} else if (pmc->perf_event) {
if (unlikely(pmc->started)) {
sbiret = SBI_ERR_ALREADY_STARTED;
continue;
}
/* Apply the period derived from the counter value, then enable. */
perf_event_period(pmc->perf_event, kvm_pmu_get_sample_period(pmc));
perf_event_enable(pmc->perf_event);
pmc->started = true;
} else {
/* Hardware counter without a perf event: nothing to start. */
sbiret = SBI_ERR_INVALID_PARAM;
}
}
out:
retdata->err_val = sbiret;
return 0;
}
/*
 * Stop the counters selected by @ctr_base/@ctr_mask for @vcpu.
 *
 * @flags: SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT stores each stopped counter's
 *	value and overflow state in the snapshot shared memory;
 *	SBI_PMU_STOP_FLAG_RESET also releases the counter and marks it unused.
 * @retdata: receives the SBI status in err_val.
 *
 * Counters in the mask that are not marked in-use are skipped. Always
 * returns 0; errors are reported through @retdata->err_val only.
 */
int kvm_riscv_vcpu_pmu_ctr_stop(struct kvm_vcpu *vcpu, unsigned long ctr_base,
unsigned long ctr_mask, unsigned long flags,
struct kvm_vcpu_sbi_return *retdata)
{
struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
int i, pmc_index, sbiret = 0;
u64 enabled, running;
struct kvm_pmc *pmc;
int fevent_code;
bool snap_flag_set = flags & SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT;
/* Set once the host-side snapshot copy changes and must be written back. */
bool shmem_needs_update = false;
if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) {
sbiret = SBI_ERR_INVALID_PARAM;
goto out;
}
/* A snapshot stop needs a registered shared memory area. */
if (snap_flag_set && kvpmu->snapshot_addr == INVALID_GPA) {
sbiret = SBI_ERR_NO_SHMEM;
goto out;
}
/* Stop the counters that have been configured and requested by the guest */
for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
pmc_index = i + ctr_base;
if (!test_bit(pmc_index, kvpmu->pmc_in_use))
continue;
pmc = &kvpmu->pmc[pmc_index];
if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) {
fevent_code = get_event_code(pmc->event_idx);
/* An invalid firmware event code aborts the whole request. */
if (fevent_code >= SBI_PMU_FW_MAX) {
sbiret = SBI_ERR_INVALID_PARAM;
goto out;
}
if (!kvpmu->fw_event[fevent_code].started)
sbiret = SBI_ERR_ALREADY_STOPPED;
kvpmu->fw_event[fevent_code].started = false;
} else if (pmc->perf_event) {
if (pmc->started) {
/* Stop counting the counter */
perf_event_disable(pmc->perf_event);
pmc->started = false;
} else {
sbiret = SBI_ERR_ALREADY_STOPPED;
}
if (flags & SBI_PMU_STOP_FLAG_RESET)
/* Release the counter if this is a reset request */
kvm_pmu_release_perf_event(pmc);
} else {
/* Hardware counter without a perf event: nothing to stop. */
sbiret = SBI_ERR_INVALID_PARAM;
}
/* Only snapshot while no error has been recorded so far. */
if (snap_flag_set && !sbiret) {
if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW)
pmc->counter_val = kvpmu->fw_event[fevent_code].value;
else if (pmc->perf_event)
pmc->counter_val += perf_event_read_value(pmc->perf_event,
&enabled, &running);
/*
 * The counter and overflow indices in the snapshot region are w.r.t.
 * cbase. Modify the set bit in the counter mask instead of the pmc_index
 * which indicates the absolute counter index.
 */
if (test_bit(pmc_index, kvpmu->pmc_overflown))
kvpmu->sdata->ctr_overflow_mask |= BIT(i);
kvpmu->sdata->ctr_values[i] = pmc->counter_val;
shmem_needs_update = true;
}
if (flags & SBI_PMU_STOP_FLAG_RESET) {
/* Return the counter to the unused, unconfigured state. */
pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID;
clear_bit(pmc_index, kvpmu->pmc_in_use);
clear_bit(pmc_index, kvpmu->pmc_overflown);
if (snap_flag_set) {
/*
 * Only clear the given counter as the caller is responsible to
 * validate both the overflow mask and configured counters.
 */
kvpmu->sdata->ctr_overflow_mask &= ~BIT(i);
shmem_needs_update = true;
}
}
}
/* Publish the updated snapshot to the guest; the write result is not checked. */
if (shmem_needs_update)
kvm_vcpu_write_guest(vcpu, kvpmu->snapshot_addr, kvpmu->sdata,
sizeof(struct riscv_pmu_snapshot_data));
out:
retdata->err_val = sbiret;
return 0;
}
/*
 * Find (or confirm) a counter for event @eidx and configure it.
 *
 * @ctr_base/@ctr_mask: candidate counters.
 * @flags: SBI_PMU_CFG_FLAG_* bits; SKIP_MATCH, AUTO_START, SET_UINH and
 *	SET_SINH are interpreted here, the rest are passed on when the perf
 *	event is created.
 * @evtdata: event data, used for the perf config of non-firmware events.
 * @retdata: receives the chosen counter index in out_val on success and the
 *	SBI status in err_val.
 *
 * Always returns 0; errors are reported through @retdata->err_val only.
 */
int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_base,
unsigned long ctr_mask, unsigned long flags,
unsigned long eidx, u64 evtdata,
struct kvm_vcpu_sbi_return *retdata)
{
int ctr_idx, sbiret = 0;
long ret;
bool is_fevent;
unsigned long event_code;
u32 etype = kvm_pmu_get_perf_event_type(eidx);
struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc = NULL;
struct perf_event_attr attr = {
.type = etype,
.size = sizeof(struct perf_event_attr),
.pinned = true,
/*
 * It should never reach here if the platform doesn't support the sscofpmf
 * extension as mode filtering won't work without it.
 */
.exclude_host = true,
.exclude_hv = true,
.exclude_user = !!(flags & SBI_PMU_CFG_FLAG_SET_UINH),
.exclude_kernel = !!(flags & SBI_PMU_CFG_FLAG_SET_SINH),
.config1 = RISCV_PMU_CONFIG1_GUEST_EVENTS,
};
if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) {
sbiret = SBI_ERR_INVALID_PARAM;
goto out;
}
event_code = get_event_code(eidx);
is_fevent = kvm_pmu_is_fw_event(eidx);
/* Firmware event codes index fw_event[], so they must be in range. */
if (is_fevent && event_code >= SBI_PMU_FW_MAX) {
sbiret = SBI_ERR_NOT_SUPPORTED;
goto out;
}
/*
 * SKIP_MATCH flag indicates the caller is aware of the assigned counter
 * for this event. Just do a sanity check if it already marked used.
 */
if (flags & SBI_PMU_CFG_FLAG_SKIP_MATCH) {
/* The lowest counter selected by the mask is the one being reused. */
if (!test_bit(ctr_base + __ffs(ctr_mask), kvpmu->pmc_in_use)) {
sbiret = SBI_ERR_FAILURE;
goto out;
}
ctr_idx = ctr_base + __ffs(ctr_mask);
} else {
ctr_idx = pmu_get_pmc_index(kvpmu, eidx, ctr_base, ctr_mask);
if (ctr_idx < 0) {
sbiret = SBI_ERR_NOT_SUPPORTED;
goto out;
}
}
pmc = &kvpmu->pmc[ctr_idx];
pmc->idx = ctr_idx;
if (is_fevent) {
/* Firmware events have no perf event; only the started flag is tracked. */
if (flags & SBI_PMU_CFG_FLAG_AUTO_START)
kvpmu->fw_event[event_code].started = true;
} else {
ret = kvm_pmu_create_perf_event(pmc, &attr, flags, eidx, evtdata);
if (ret) {
sbiret = SBI_ERR_NOT_SUPPORTED;
goto out;
}
}
/* Mark the counter used only after it has been configured successfully. */
set_bit(ctr_idx, kvpmu->pmc_in_use);
pmc->event_idx = eidx;
retdata->out_val = ctr_idx;
out:
retdata->err_val = sbiret;
return 0;
}
/*
 * SBI entry point for reading the upper half of firmware counter @cidx.
 *
 * The value is placed in @retdata->out_val. An -EINVAL result from the
 * read is reported as SBI_ERR_INVALID_PARAM. Always returns 0.
 */
int kvm_riscv_vcpu_pmu_fw_ctr_read_hi(struct kvm_vcpu *vcpu, unsigned long cidx,
				      struct kvm_vcpu_sbi_return *retdata)
{
	if (pmu_fw_ctr_read_hi(vcpu, cidx, &retdata->out_val) == -EINVAL)
		retdata->err_val = SBI_ERR_INVALID_PARAM;

	return 0;
}
/*
 * SBI entry point for reading counter @cidx.
 *
 * The value is placed in @retdata->out_val. An -EINVAL result from the
 * read is reported as SBI_ERR_INVALID_PARAM. Always returns 0.
 */
int kvm_riscv_vcpu_pmu_fw_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
				   struct kvm_vcpu_sbi_return *retdata)
{
	if (pmu_ctr_read(vcpu, cidx, &retdata->out_val) == -EINVAL)
		retdata->err_val = SBI_ERR_INVALID_PARAM;

	return 0;
}
/*
 * Set up the virtual PMU of @vcpu.
 *
 * Nothing is initialised (and init_done stays unset) when the host lacks
 * SSCOFPMF or when the host PMU reports no usable counter width or count.
 * Otherwise the counter counts are recorded, firmware event state is
 * cleared, the snapshot address is invalidated and every counter except
 * index 1 is given its type, width and, for hardware counters, CSR number.
 */
void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
	int hpm_width = 0, num_hw_ctrs = 0;
	struct kvm_pmc *pmc;
	int idx;

	/*
	 * Guests only get PMU functionality when the host can filter by
	 * privilege mode. Without it the guest would also count events
	 * while execution is in hypervisor mode.
	 */
	if (!riscv_isa_extension_available(NULL, SSCOFPMF))
		return;

	if (riscv_pmu_get_hpm_info(&hpm_width, &num_hw_ctrs) < 0)
		return;
	if (!hpm_width || !num_hw_ctrs)
		return;

	/* One extra hardware counter slot offsets the time counter. */
	kvpmu->num_hw_ctrs = num_hw_ctrs + 1;
	kvpmu->num_fw_ctrs = SBI_PMU_FW_MAX;
	memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event));
	kvpmu->snapshot_addr = INVALID_GPA;

	if (kvpmu->num_hw_ctrs > RISCV_KVM_MAX_HW_CTRS) {
		pr_warn_once("Limiting the hardware counters to 32 as specified by the ISA");
		kvpmu->num_hw_ctrs = RISCV_KVM_MAX_HW_CTRS;
	}

	/*
	 * Logical hardware counters and virtual counters are unrelated.
	 * A hpmcounter CSR is still encoded in the counter info so that KVM
	 * can trap and emulate reads. This also suits migration, because KVM
	 * does not care whether that hpmcounter exists in hardware.
	 */
	for (idx = 0; idx < kvm_pmu_num_counters(kvpmu); idx++) {
		/* TIME CSR shouldn't be read from perf interface */
		if (idx == 1)
			continue;

		pmc = &kvpmu->pmc[idx];
		pmc->idx = idx;
		pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID;
		pmc->vcpu = vcpu;

		if (idx >= kvpmu->num_hw_ctrs) {
			pmc->cinfo.type = SBI_PMU_CTR_TYPE_FW;
			pmc->cinfo.width = 63;
			continue;
		}

		pmc->cinfo.type = SBI_PMU_CTR_TYPE_HW;
		/* CY and IR counters use width 63; the rest use the host's width. */
		pmc->cinfo.width = (idx < 3) ? 63 : hpm_width;
		/*
		 * CSR numbers are assigned sequentially so that no map between
		 * virtual counters and CSR numbers has to be maintained.
		 */
		pmc->cinfo.csr = CSR_CYCLE + idx;
	}

	kvpmu->init_done = true;
}
/*
 * Tear down the virtual PMU state of @vcpu.
 *
 * Every in-use counter has its value cleared, its perf event released and
 * its event index invalidated. The in-use and overflow bitmaps and the
 * firmware event state are then zeroed, and the snapshot area is dropped.
 */
void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
	int bit;

	if (!kvpmu)
		return;

	for_each_set_bit(bit, kvpmu->pmc_in_use, RISCV_KVM_MAX_COUNTERS) {
		struct kvm_pmc *pmc = &kvpmu->pmc[bit];

		pmc->counter_val = 0;
		kvm_pmu_release_perf_event(pmc);
		pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID;
	}

	bitmap_zero(kvpmu->pmc_in_use, RISCV_KVM_MAX_COUNTERS);
	bitmap_zero(kvpmu->pmc_overflown, RISCV_KVM_MAX_COUNTERS);
	memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event));

	kvm_pmu_clear_snapshot_area(vcpu);
}
/* Reset the virtual PMU of @vcpu; this performs the same teardown as deinit. */
void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu)
{
	kvm_riscv_vcpu_pmu_deinit(vcpu);

	return;
}