venus: check and configure new ringMonitoring feature

At ring creation, if supported by the renderer, we can request
ringMonitoring. During driver ring waits, the ring's new ALIVE status
bit is checked periodically at the configured rate. If the bit is not
set, the renderer must have crashed, so the driver aborts as well to
signal the problem to the app/user.
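
Below is a self-contained toy model of the handshake described above
(illustrative only; the names and timings are made up and the real
implementation lives in the vn_ring/vn_relax code touched here):

  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <threads.h>
  #include <time.h>

  static atomic_bool alive_bit;            /* the ring's ALIVE status bit */
  static const long report_period_ns = 10 * 1000 * 1000; /* "renderer": 10 ms */

  /* "Renderer": sets ALIVE periodically, then stops (a simulated crash). */
  static int renderer(void *arg)
  {
     (void)arg;
     for (int i = 0; i < 20; i++) {
        atomic_store(&alive_bit, true);
        thrd_sleep(&(struct timespec){ .tv_nsec = report_period_ns }, NULL);
     }
     return 0;
  }

  int main(void)
  {
     thrd_t t;
     thrd_create(&t, renderer, NULL);

     /* "Driver" wait loop: test-and-clear ALIVE at a rate slower than the
      * renderer's reporting period. A bit that stays cleared between two
      * checks means the renderer died, so abort instead of waiting forever.
      */
     for (;;) {
        thrd_sleep(&(struct timespec){ .tv_nsec = 4 * report_period_ns }, NULL);
        if (!atomic_exchange(&alive_bit, false)) {
           fprintf(stderr, "renderer stopped reporting; aborting\n");
           abort();
        }
     }
  }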

Signed-off-by: Ryan Neph <ryanneph@google.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22036>
Ryan Neph 2023-03-22 12:47:42 -07:00 committed by Marge Bot
parent c4c09464ce
commit 4a4b05869a
5 changed files with 95 additions and 2 deletions


@@ -123,9 +123,52 @@ vn_extension_get_spec_version(const char *name)
   return index >= 0 ? vn_info_extension_get(index)->spec_version : 0;
}

static bool
vn_ring_monitor_acquire(struct vn_ring *ring)
{
   pid_t tid = gettid();
   if (!ring->monitor.threadid && tid != ring->monitor.threadid &&
       mtx_trylock(&ring->monitor.mutex) == thrd_success) {
      /* register as the only waiting thread that monitors the ring. */
      ring->monitor.threadid = tid;
   }
   return tid == ring->monitor.threadid;
}

void
vn_ring_monitor_release(struct vn_ring *ring)
{
   if (gettid() != ring->monitor.threadid)
      return;
   ring->monitor.threadid = 0;
   mtx_unlock(&ring->monitor.mutex);
}

struct vn_relax_state
vn_relax_init(struct vn_ring *ring, const char *reason)
{
   if (ring->monitor.report_period_us) {
#ifndef NDEBUG
      /* ensure minimum check period is greater than maximum renderer
       * reporting period (with margin of safety to ensure no false
       * positives).
       *
       * first_warn_time is pre-calculated based on parameters in vn_relax
       * and must update together.
       */
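      /* illustrative back-of-envelope, assuming vn_relax()'s defaults
       * (base sleep of 160 us, first warning at iteration 1 << 12):
       * 256 yields, then 256*160 + 512*320 + 1024*640 + 2048*1280 =
       * 3481600 us of sleep before the first warning fires.
       */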
      const uint32_t first_warn_time = 3481600;
      const uint32_t safety_margin = 250000;
      assert(first_warn_time - safety_margin >=
             ring->monitor.report_period_us);
#endif
      if (vn_ring_monitor_acquire(ring)) {
         ring->monitor.alive = true;
         vn_ring_unset_status_bits(ring, VK_RING_STATUS_ALIVE_BIT_MESA);
      }
   }

   return (struct vn_relax_state){
      .ring = ring,
      .iter = 0,
@@ -143,6 +186,7 @@ vn_relax(struct vn_relax_state *state)
   /* Yield for the first 2^busy_wait_order times and then sleep for
    * base_sleep_us microseconds for the same number of times. After that,
    * keep doubling both sleep length and count.
    * Must also update pre-calculated "first_warn_time" in vn_relax_init().
    */
   const uint32_t busy_wait_order = 8;
   const uint32_t base_sleep_us = vn_env.relax_base_sleep_us;
@@ -167,6 +211,19 @@ vn_relax(struct vn_relax_state *state)
         abort();
      }

      if (ring->monitor.report_period_us) {
         if (vn_ring_monitor_acquire(ring)) {
            ring->monitor.alive = status & VK_RING_STATUS_ALIVE_BIT_MESA;
            vn_ring_unset_status_bits(ring, VK_RING_STATUS_ALIVE_BIT_MESA);
         }

         if (!ring->monitor.alive) {
            vn_log(NULL, "aborting on expired ring alive status at iter %d",
                   *iter);
            abort();
         }
      }

      if (*iter >= (1 << abort_order) && !VN_DEBUG(NO_ABORT)) {
         vn_log(NULL, "aborting");
         abort();


@@ -229,6 +229,9 @@ vn_refcount_dec(struct vn_refcount *ref)
uint32_t
vn_extension_get_spec_version(const char *name);

void
vn_ring_monitor_release(struct vn_ring *ring);

struct vn_relax_state
vn_relax_init(struct vn_ring *ring, const char *reason);
@@ -238,6 +241,7 @@ vn_relax(struct vn_relax_state *state);
static inline void
vn_relax_fini(struct vn_relax_state *state)
{
   vn_ring_monitor_release(state->ring);
}

static_assert(sizeof(vn_object_id) >= sizeof(uintptr_t), "");
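
A typical wait loop is expected to use these helpers roughly as in the
sketch below (illustrative only; "ring_seqno_reached" is a made-up
stand-in for the real wait condition, not an actual Venus function):

  struct vn_relax_state relax_state = vn_relax_init(ring, "ring seqno");
  while (!ring_seqno_reached(ring, seqno))
     vn_relax(&relax_state);   /* backs off and, with monitoring enabled,
                                * checks the ring's ALIVE status */
  vn_relax_fini(&relax_state); /* releases the monitor role, if held */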


@@ -133,8 +133,19 @@ vn_instance_init_ring(struct vn_instance *instance)
   instance->ring.id = (uintptr_t)ring;

   struct VkRingMonitorInfoMESA monitor_info;
   if (instance->experimental.ringMonitoring) {
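      /* 3 s must not exceed the bound asserted in vn_relax_init():
       * first_warn_time - safety_margin = 3481600 - 250000 = 3231600 us,
       * so the driver checks ALIVE no sooner than a healthy renderer is
       * guaranteed to have reported.
       */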
      ring->monitor.report_period_us = 3000000;
      mtx_init(&ring->monitor.mutex, mtx_plain);

      monitor_info = (struct VkRingMonitorInfoMESA){
         .sType = VK_STRUCTURE_TYPE_RING_MONITOR_INFO_MESA,
         .maxReportingPeriodMicroseconds = ring->monitor.report_period_us,
      };
   }

   const struct VkRingCreateInfoMESA info = {
      .sType = VK_STRUCTURE_TYPE_RING_CREATE_INFO_MESA,
      .pNext = instance->experimental.ringMonitoring ? &monitor_info : NULL,
      .resourceId = instance->ring.shmem->res_id,
      .size = layout.shmem_size,
      .idleTimeout = 50ull * 1000 * 1000,
@@ -231,12 +242,14 @@ vn_instance_init_experimental_features(struct vn_instance *instance)
          "\n\tglobalFencing = %u"
          "\n\tlargeRing = %u"
          "\n\tsyncFdFencing = %u"
          "\n\tasyncRoundtrip = %u",
          "\n\tasyncRoundtrip = %u"
          "\n\tringMonitoring = %u",
          instance->experimental.memoryResourceAllocationSize,
          instance->experimental.globalFencing,
          instance->experimental.largeRing,
          instance->experimental.syncFdFencing,
          instance->experimental.asyncRoundtrip);
          instance->experimental.asyncRoundtrip,
          instance->experimental.ringMonitoring);
   }

   return VK_SUCCESS;


@@ -208,6 +208,9 @@ vn_ring_fini(struct vn_ring *ring)
   list_for_each_entry_safe(struct vn_ring_submit, submit,
                            &ring->free_submits, head)
      free(submit);

   if (ring->monitor.report_period_us)
      mtx_destroy(&ring->monitor.mutex);
}

struct vn_ring_submit *


@@ -70,6 +70,22 @@ struct vn_ring {
   struct list_head submits;
   struct list_head free_submits;

   /* Only one "waiting" thread may fulfill the "monitor" role at a time.
    * Every "report_period_us" or longer, the waiting "monitor" thread tests
    * the ring's ALIVE status, updates the "alive" atomic, and resets the
    * ALIVE status for the next cycle. Waiting non-"monitor" threads just
    * check the "alive" atomic. The "monitor" role may be released and
    * acquired by another waiting thread dynamically.
    */
   struct {
      mtx_t mutex;
      atomic_int threadid;
      atomic_bool alive;
      /* constant and non-zero after ring init, if monitoring is enabled */
      uint32_t report_period_us;
   } monitor;
};

void
void