venus: check and configure new ringMonitoring feature

At ring creation, if supported by the renderer, we can request
ringMonitoring. During driver ring waits, the ring's new ALIVE status
bit is checked periodically at the configured rate. If the bit is not
set, the renderer must have crashed, so the driver aborts as well to
signal the problem to the app/user.
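
Below is a self-contained toy model of the handshake described above
(illustrative only; the names and timings are made up and the real
implementation lives in the vn_ring/vn_relax code touched here):

  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <threads.h>
  #include <time.h>

  static atomic_bool alive_bit;            /* the ring's ALIVE status bit */
  static const long report_period_ns = 10 * 1000 * 1000; /* "renderer": 10 ms */

  /* "Renderer": sets ALIVE periodically, then stops (a simulated crash). */
  static int renderer(void *arg)
  {
     (void)arg;
     for (int i = 0; i < 20; i++) {
        atomic_store(&alive_bit, true);
        thrd_sleep(&(struct timespec){ .tv_nsec = report_period_ns }, NULL);
     }
     return 0;
  }

  int main(void)
  {
     thrd_t t;
     thrd_create(&t, renderer, NULL);

     /* "Driver" wait loop: test-and-clear ALIVE at a rate slower than the
      * renderer's reporting period. A bit that stays cleared between two
      * checks means the renderer died, so abort instead of waiting forever.
      */
     for (;;) {
        thrd_sleep(&(struct timespec){ .tv_nsec = 4 * report_period_ns }, NULL);
        if (!atomic_exchange(&alive_bit, false)) {
           fprintf(stderr, "renderer stopped reporting; aborting\n");
           abort();
        }
     }
  }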

Signed-off-by: Ryan Neph <ryanneph@google.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22036>
Ryan Neph 2023-03-22 12:47:42 -07:00 committed by Marge Bot
parent c4c09464ce
commit 4a4b05869a
5 changed files with 95 additions and 2 deletions


@@ -123,9 +123,52 @@ vn_extension_get_spec_version(const char *name)
   return index >= 0 ? vn_info_extension_get(index)->spec_version : 0;
}

static bool
vn_ring_monitor_acquire(struct vn_ring *ring)
{
   pid_t tid = gettid();
   if (!ring->monitor.threadid && tid != ring->monitor.threadid &&
       mtx_trylock(&ring->monitor.mutex) == thrd_success) {
      /* register as the only waiting thread that monitors the ring. */
      ring->monitor.threadid = tid;
   }
   return tid == ring->monitor.threadid;
}

void
vn_ring_monitor_release(struct vn_ring *ring)
{
   if (gettid() != ring->monitor.threadid)
      return;
   ring->monitor.threadid = 0;
   mtx_unlock(&ring->monitor.mutex);
}

struct vn_relax_state
vn_relax_init(struct vn_ring *ring, const char *reason)
{
   if (ring->monitor.report_period_us) {
#ifndef NDEBUG
      /* ensure minimum check period is greater than maximum renderer
       * reporting period (with margin of safety to ensure no false
       * positives).
       *
       * first_warn_time is pre-calculated based on parameters in vn_relax
       * and must update together.
       */
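      /* illustrative back-of-envelope, assuming vn_relax()'s defaults
       * (base sleep of 160 us, first warning at iteration 1 << 12):
       * 256 yields, then 256*160 + 512*320 + 1024*640 + 2048*1280 =
       * 3481600 us of sleep before the first warning fires.
       */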
      const uint32_t first_warn_time = 3481600;
      const uint32_t safety_margin = 250000;
      assert(first_warn_time - safety_margin >=
             ring->monitor.report_period_us);
#endif
      if (vn_ring_monitor_acquire(ring)) {
         ring->monitor.alive = true;
         vn_ring_unset_status_bits(ring, VK_RING_STATUS_ALIVE_BIT_MESA);
      }
   }

   return (struct vn_relax_state){
      .ring = ring,
      .iter = 0,
@@ -143,6 +186,7 @@ vn_relax(struct vn_relax_state *state)
   /* Yield for the first 2^busy_wait_order times and then sleep for
    * base_sleep_us microseconds for the same number of times. After that,
    * keep doubling both sleep length and count.
    * Must also update pre-calculated "first_warn_time" in vn_relax_init().
    */
   const uint32_t busy_wait_order = 8;
   const uint32_t base_sleep_us = vn_env.relax_base_sleep_us;
@@ -167,6 +211,19 @@ vn_relax(struct vn_relax_state *state)
         abort();
      }

      if (ring->monitor.report_period_us) {
         if (vn_ring_monitor_acquire(ring)) {
            ring->monitor.alive = status & VK_RING_STATUS_ALIVE_BIT_MESA;
            vn_ring_unset_status_bits(ring, VK_RING_STATUS_ALIVE_BIT_MESA);
         }

         if (!ring->monitor.alive) {
            vn_log(NULL, "aborting on expired ring alive status at iter %d",
                   *iter);
            abort();
         }
      }

      if (*iter >= (1 << abort_order) && !VN_DEBUG(NO_ABORT)) {
         vn_log(NULL, "aborting");
         abort();


@@ -229,6 +229,9 @@ vn_refcount_dec(struct vn_refcount *ref)
uint32_t
vn_extension_get_spec_version(const char *name);

void
vn_ring_monitor_release(struct vn_ring *ring);

struct vn_relax_state
vn_relax_init(struct vn_ring *ring, const char *reason);
@@ -238,6 +241,7 @@ vn_relax(struct vn_relax_state *state);
static inline void
vn_relax_fini(struct vn_relax_state *state)
{
   vn_ring_monitor_release(state->ring);
}

static_assert(sizeof(vn_object_id) >= sizeof(uintptr_t), "");
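
A typical wait loop is expected to use these helpers roughly as in the
sketch below (illustrative only; "ring_seqno_reached" is a made-up
stand-in for the real wait condition, not an actual Venus function):

  struct vn_relax_state relax_state = vn_relax_init(ring, "ring seqno");
  while (!ring_seqno_reached(ring, seqno))
     vn_relax(&relax_state);   /* backs off and, with monitoring enabled,
                                * checks the ring's ALIVE status */
  vn_relax_fini(&relax_state); /* releases the monitor role, if held */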


@@ -133,8 +133,19 @@ vn_instance_init_ring(struct vn_instance *instance)
   instance->ring.id = (uintptr_t)ring;

   struct VkRingMonitorInfoMESA monitor_info;
   if (instance->experimental.ringMonitoring) {
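      /* 3 s must not exceed the bound asserted in vn_relax_init():
       * first_warn_time - safety_margin = 3481600 - 250000 = 3231600 us,
       * so the driver checks ALIVE no sooner than a healthy renderer is
       * guaranteed to have reported.
       */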
      ring->monitor.report_period_us = 3000000;
      mtx_init(&ring->monitor.mutex, mtx_plain);

      monitor_info = (struct VkRingMonitorInfoMESA){
         .sType = VK_STRUCTURE_TYPE_RING_MONITOR_INFO_MESA,
         .maxReportingPeriodMicroseconds = ring->monitor.report_period_us,
      };
   }

   const struct VkRingCreateInfoMESA info = {
      .sType = VK_STRUCTURE_TYPE_RING_CREATE_INFO_MESA,
      .pNext = instance->experimental.ringMonitoring ? &monitor_info : NULL,
      .resourceId = instance->ring.shmem->res_id,
      .size = layout.shmem_size,
      .idleTimeout = 50ull * 1000 * 1000,
@@ -231,12 +242,14 @@ vn_instance_init_experimental_features(struct vn_instance *instance)
          "\n\tglobalFencing = %u"
          "\n\tlargeRing = %u"
          "\n\tsyncFdFencing = %u"
          "\n\tasyncRoundtrip = %u",
          "\n\tasyncRoundtrip = %u"
          "\n\tringMonitoring = %u",
          instance->experimental.memoryResourceAllocationSize,
          instance->experimental.globalFencing,
          instance->experimental.largeRing,
          instance->experimental.syncFdFencing,
          instance->experimental.asyncRoundtrip);
          instance->experimental.asyncRoundtrip,
          instance->experimental.ringMonitoring);
   }

   return VK_SUCCESS;


@@ -208,6 +208,9 @@ vn_ring_fini(struct vn_ring *ring)
   list_for_each_entry_safe(struct vn_ring_submit, submit,
                            &ring->free_submits, head)
      free(submit);

   if (ring->monitor.report_period_us)
      mtx_destroy(&ring->monitor.mutex);
}

struct vn_ring_submit *


@@ -70,6 +70,22 @@ struct vn_ring {
   struct list_head submits;
   struct list_head free_submits;

   /* Only one "waiting" thread may fulfill the "monitor" role at a time.
    * Every "report_period_us" or longer, the waiting "monitor" thread tests
    * the ring's ALIVE status, updates the "alive" atomic, and resets the
    * ALIVE status for the next cycle. Waiting non-"monitor" threads just
    * check the "alive" atomic. The "monitor" role may be released and
    * acquired by another waiting thread dynamically.
    */
   struct {
      mtx_t mutex;
      atomic_int threadid;
      atomic_bool alive;
      /* constant and non-zero after ring init, if monitoring is enabled */
      uint32_t report_period_us;
   } monitor;
};

void
void