Mirror of https://gitlab.freedesktop.org/mesa/mesa.git (synced 2024-11-23 18:24:13 +08:00)
rusticl/memory: complete rework on how mapping is implemented
Previously we tried to map GPU resources directly wherever we could.
However, that kept causing random issues and was overall not very
robust. Now we just allocate a staging buffer on the host to copy
into, with a shortcut for host_ptr allocations (a rough sketch of the
scheme follows after the commit message).
Fixes the following tests across various drivers:
1Dbuffer tests (radeonsi, zink)
buffers map_read_* (zink)
multiple_device_context context_* (zink)
thread_dimensions quick_* (zink)
math_brute_force (zink)
Cc: mesa-stable
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30082>
(cherry picked from commit 7b22bc617b)
[Eric: also drop `can_map_directly` and `has_user_shadow_buffer` as
they're now dead code, which was breaking the build]
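
To make the diff easier to follow, here is a rough, self-contained sketch of
the staging-buffer scheme described above. This is not the rusticl code: the
Buffer, Mapping and device_data names are illustrative, a plain Vec<u8> stands
in for the GPU resource, and the real implementation additionally maps
CL_MEM_USE_HOST_PTR allocations directly (skipping the staging copy),
refcounts repeated mappings of the same pointer, and handles images, pitches
and per-device resources.

use std::collections::HashMap;

// One live mapping handed out to the application.
struct Mapping {
    staging: Vec<u8>, // host staging memory backing the mapped pointer
    offset: usize,    // start of the mapped range inside the buffer
    writes: bool,     // whether the application may write through the mapping
}

// Toy buffer: device_data stands in for the real GPU resource.
struct Buffer {
    device_data: Vec<u8>,
    maps: HashMap<usize, Mapping>, // keyed by the pointer handed to the app
}

impl Buffer {
    // map() only allocates host memory; no GPU access happens here.
    fn map(&mut self, offset: usize, size: usize, writes: bool) -> usize {
        let staging = vec![0u8; size];
        let ptr = staging.as_ptr() as usize;
        self.maps.insert(
            ptr,
            Mapping {
                staging,
                offset,
                writes,
            },
        );
        ptr
    }

    // Runs when the map command executes: copy device -> staging.
    // (Skipped for CL_MAP_WRITE_INVALIDATE_REGION in the real code.)
    fn sync_map(&mut self, ptr: usize) {
        if let Some(m) = self.maps.get_mut(&ptr) {
            let end = m.offset + m.staging.len();
            m.staging.copy_from_slice(&self.device_data[m.offset..end]);
        }
    }

    // unmap() writes the staging data back only if the mapping allowed writes.
    fn unmap(&mut self, ptr: usize) {
        let Some(m) = self.maps.remove(&ptr) else {
            return;
        };
        if m.writes {
            let end = m.offset + m.staging.len();
            self.device_data[m.offset..end].copy_from_slice(&m.staging);
        }
    }
}

fn main() {
    let mut buf = Buffer {
        device_data: vec![7u8; 16],
        maps: HashMap::new(),
    };
    let ptr = buf.map(4, 8, true);
    buf.sync_map(ptr); // device -> host copy at map time
    // ... the application reads and writes the staging memory here ...
    buf.unmap(ptr); // host -> device copy because writes == true
}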
This commit is contained in:
parent e51ac614c5
commit 01366d8729
@@ -1094,7 +1094,7 @@
         "description": "rusticl/memory: complete rework on how mapping is implemented",
         "nominated": true,
         "nomination_type": 0,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null,
         "notes": null
@@ -860,7 +860,7 @@ Rusticl OpenCL 1.2 -- all DONE:
   Separate compilation and linking of programs DONE
   Extend cl_mem_flags DONE
   clEnqueueFillBuffer, clEnqueueFillImage DONE
-  Add CL_MAP_WRITE_INVALIDATE_REGION to cl_map_flags in progress (flag is ignored)
+  Add CL_MAP_WRITE_INVALIDATE_REGION to cl_map_flags DONE
   New image types DONE
   clCreateImage DONE
   clEnqueueMigrateMemObjects DONE
@@ -1685,14 +1685,20 @@ fn enqueue_map_buffer(
         return Err(CL_INVALID_CONTEXT);
     }

-    let ptr = b.map(q.device, offset)?;
+    let ptr = b.map(size, offset, map_flags != CL_MAP_READ.into())?;
     create_and_queue(
         q,
         CL_COMMAND_MAP_BUFFER,
         evs,
         event,
         block,
-        Box::new(move |q, ctx| b.sync_shadow(q, ctx, ptr)),
+        Box::new(move |q, ctx| {
+            if map_flags != CL_MAP_WRITE_INVALIDATE_REGION.into() {
+                b.sync_map(q, ctx, ptr)
+            } else {
+                Ok(())
+            }
+        }),
     )?;

     Ok(ptr.as_ptr())
@@ -2141,24 +2147,29 @@ fn enqueue_map_image(
     };

     let ptr = i.map(
-        q.device,
-        &origin,
+        origin,
+        region,
         unsafe { image_row_pitch.as_mut().unwrap() },
         image_slice_pitch,
+        map_flags != CL_MAP_READ.into(),
     )?;

-    // SAFETY: it's required that applications do not cause data races
-    let sync_ptr = unsafe { MutMemoryPtr::from_ptr(ptr) };
     create_and_queue(
         q,
         CL_COMMAND_MAP_IMAGE,
         evs,
         event,
         block,
-        Box::new(move |q, ctx| i.sync_shadow(q, ctx, sync_ptr)),
+        Box::new(move |q, ctx| {
+            if map_flags != CL_MAP_WRITE_INVALIDATE_REGION.into() {
+                i.sync_map(q, ctx, ptr)
+            } else {
+                Ok(())
+            }
+        }),
     )?;

-    Ok(ptr)
+    Ok(ptr.as_ptr())

     //• CL_INVALID_IMAGE_SIZE if image dimensions (image width, height, specified or compute row and/or slice pitch) for image are not supported by device associated with queue.
     //• CL_IMAGE_FORMAT_NOT_SUPPORTED if image format (image channel order and data type) for image are not supported by device associated with queue.
@@ -17,10 +17,14 @@ use mesa_rust::pipe::transfer::*;
 use mesa_rust_gen::*;
 use mesa_rust_util::math::*;
 use mesa_rust_util::properties::Properties;
+use mesa_rust_util::ptr::AllocSize;
+use mesa_rust_util::ptr::TrackedPointers;
 use rusticl_opencl_gen::*;

+use std::alloc;
+use std::alloc::Layout;
 use std::cmp;
-use std::collections::hash_map::Entry;
+use std::collections::btree_map::Entry;
 use std::collections::HashMap;
 use std::convert::TryInto;
 use std::mem;
@@ -31,84 +35,45 @@ use std::ptr;
 use std::sync::Arc;
 use std::sync::Mutex;

-struct MappingTransfer {
-    tx: PipeTransfer,
-    shadow: Option<PipeResource>,
-    pending: u32,
+struct Mapping<T> {
+    layout: Layout,
+    writes: bool,
+    ptr: Option<MutMemoryPtr>,
+    count: u32,
+    inner: T,
 }

-impl MappingTransfer {
-    fn new(tx: PipeTransfer, shadow: Option<PipeResource>) -> Self {
-        MappingTransfer {
-            tx: tx,
-            shadow: shadow,
-            pending: 1,
+impl<T> Drop for Mapping<T> {
+    fn drop(&mut self) {
+        if let Some(ptr) = &self.ptr {
+            unsafe {
+                alloc::dealloc(ptr.as_ptr().cast(), self.layout);
+            }
         }
     }
 }

-struct Mappings {
-    tx: HashMap<&'static Device, MappingTransfer>,
-    maps: HashMap<usize, u32>,
+impl<T> AllocSize<usize> for Mapping<T> {
+    fn size(&self) -> usize {
+        self.layout.size()
+    }
 }

-impl Mappings {
-    fn new() -> Mutex<Self> {
-        Mutex::new(Mappings {
-            tx: HashMap::new(),
-            maps: HashMap::new(),
-        })
+impl<T> Deref for Mapping<T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        &self.inner
     }
+}

-    fn contains_ptr(&self, ptr: *mut c_void) -> bool {
-        let ptr = ptr as usize;
-        self.maps.contains_key(&ptr)
-    }
+struct BufferMapping {
+    offset: usize,
+}

-    fn mark_pending(&mut self, dev: &Device) {
-        self.tx.get_mut(dev).unwrap().pending += 1;
-    }
-
-    fn unmark_pending(&mut self, dev: &Device) {
-        if let Some(tx) = self.tx.get_mut(dev) {
-            tx.pending -= 1;
-        }
-    }
-
-    fn increase_ref(&mut self, dev: &Device, ptr: *mut c_void) -> bool {
-        let ptr = ptr as usize;
-        let res = self.maps.is_empty();
-        *self.maps.entry(ptr).or_default() += 1;
-        self.unmark_pending(dev);
-        res
-    }
-
-    fn decrease_ref(&mut self, ptr: *mut c_void, dev: &Device) -> (bool, Option<&PipeResource>) {
-        let ptr = ptr as usize;
-        if let Some(r) = self.maps.get_mut(&ptr) {
-            *r -= 1;
-
-            if *r == 0 {
-                self.maps.remove(&ptr);
-            }
-
-            if self.maps.is_empty() {
-                let shadow = self.tx.get(dev).and_then(|tx| tx.shadow.as_ref());
-                return (true, shadow);
-            }
-        }
-        (false, None)
-    }
-
-    fn clean_up_tx(&mut self, dev: &Device, ctx: &PipeContext) {
-        if self.maps.is_empty() {
-            if let Some(tx) = self.tx.get(&dev) {
-                if tx.pending == 0 {
-                    self.tx.remove(dev).unwrap().tx.with_ctx(ctx);
-                }
-            }
-        }
-    }
+struct ImageMapping {
+    origin: CLVec<usize>,
+    region: CLVec<usize>,
 }

 #[repr(transparent)]
@@ -133,6 +98,14 @@ impl ConstMemoryPtr {
     }
 }

+impl From<MutMemoryPtr> for ConstMemoryPtr {
+    fn from(value: MutMemoryPtr) -> Self {
+        Self {
+            ptr: value.ptr.cast(),
+        }
+    }
+}
+
 #[repr(transparent)]
 #[derive(Clone, Copy)]
 pub struct MutMemoryPtr {
@@ -172,6 +145,13 @@ impl Deref for Mem {
 }

 impl Mem {
+    pub fn is_mapped_ptr(&self, ptr: *mut c_void) -> bool {
+        match self {
+            Self::Buffer(b) => b.is_mapped_ptr(ptr),
+            Self::Image(i) => i.is_mapped_ptr(ptr),
+        }
+    }
+
     pub fn unmap(&self, q: &Queue, ctx: &PipeContext, ptr: MutMemoryPtr) -> CLResult<()> {
         match self {
             Self::Buffer(b) => b.unmap(q, ctx, ptr),
@@ -219,12 +199,12 @@ pub struct MemBase {
     pub cbs: Mutex<Vec<MemCB>>,
     pub gl_obj: Option<GLObject>,
     res: Option<HashMap<&'static Device, Arc<PipeResource>>>,
-    maps: Mutex<Mappings>,
 }

 pub struct Buffer {
     base: MemBase,
     pub offset: usize,
+    maps: Mutex<TrackedPointers<usize, Mapping<BufferMapping>>>,
 }

 pub struct Image {
@@ -233,6 +213,7 @@ pub struct Image {
     pub pipe_format: pipe_format,
     pub image_desc: cl_image_desc,
     pub image_elem_size: u8,
+    maps: Mutex<TrackedPointers<usize, Mapping<ImageMapping>>>,
 }

 impl Deref for Buffer {
@@ -385,17 +366,6 @@ fn sw_copy(
     }
 }

-/// helper function to determine if we can just map the resource in question or if we have to go
-/// through a shdow buffer to let the CPU access the resources memory
-fn can_map_directly(dev: &Device, res: &PipeResource) -> bool {
-    // there are two aprts to this check:
-    // 1. is the resource located in system RAM
-    // 2. has the resource a linear memory layout
-    // we do not want to map memory over the PCIe bus as this generally leads to bad performance.
-    (dev.unified_memory() || res.is_staging() || res.is_user)
-        && (res.is_buffer() || res.is_linear())
-}
-
 impl MemBase {
     pub fn new_buffer(
         context: Arc<Context>,
@@ -436,9 +406,9 @@
                 gl_obj: None,
                 cbs: Mutex::new(Vec::new()),
                 res: Some(buffer),
-                maps: Mappings::new(),
             },
             offset: 0,
+            maps: Mutex::new(TrackedPointers::new()),
         }))
     }

@@ -467,9 +437,9 @@ impl MemBase {
                 gl_obj: None,
                 cbs: Mutex::new(Vec::new()),
                 res: None,
-                maps: Mappings::new(),
             },
             offset: offset,
+            maps: Mutex::new(TrackedPointers::new()),
         })
     }

@@ -550,12 +520,12 @@ impl MemBase {
                 gl_obj: None,
                 cbs: Mutex::new(Vec::new()),
                 res: texture,
-                maps: Mappings::new(),
             },
             image_format: *image_format,
             pipe_format: pipe_format,
             image_desc: api_image_desc,
             image_elem_size: image_elem_size,
+            maps: Mutex::new(TrackedPointers::new()),
         }))
     }

@@ -656,13 +626,13 @@ impl MemBase {
             }),
             cbs: Mutex::new(Vec::new()),
             res: Some(texture),
-            maps: Mappings::new(),
         };

         Ok(if rusticl_type == RusticlTypes::Buffer {
             Arc::new(Buffer {
                 base: base,
                 offset: gl_mem_props.offset as usize,
+                maps: Mutex::new(TrackedPointers::new()),
             })
             .into_cl()
         } else {
@@ -683,6 +653,7 @@ impl MemBase {
                     ..Default::default()
                 },
                 image_elem_size: gl_mem_props.pixel_size,
+                maps: Mutex::new(TrackedPointers::new()),
             })
             .into_cl()
         })
@@ -720,17 +691,51 @@ impl MemBase {
         }
     }

-    fn has_user_shadow_buffer(&self, d: &Device) -> CLResult<bool> {
-        let r = self.get_res_of_dev(d)?;
-        Ok(!r.is_user && bit_check(self.flags, CL_MEM_USE_HOST_PTR))
-    }
-
     pub fn host_ptr(&self) -> *mut c_void {
         self.host_ptr as *mut c_void
     }

-    pub fn is_mapped_ptr(&self, ptr: *mut c_void) -> bool {
-        self.maps.lock().unwrap().contains_ptr(ptr)
+    fn is_pure_user_memory(&self, d: &Device) -> CLResult<bool> {
+        let r = self.get_res_of_dev(d)?;
+        Ok(r.is_user)
     }
+
+    fn map<T>(
+        &self,
+        offset: usize,
+        layout: Layout,
+        writes: bool,
+        maps: &Mutex<TrackedPointers<usize, Mapping<T>>>,
+        inner: T,
+    ) -> CLResult<MutMemoryPtr> {
+        let host_ptr = self.host_ptr();
+        let ptr = unsafe {
+            let ptr = if !host_ptr.is_null() {
+                host_ptr.add(offset)
+            } else {
+                alloc::alloc(layout).cast()
+            };
+
+            MutMemoryPtr::from_ptr(ptr)
+        };
+
+        match maps.lock().unwrap().entry(ptr.as_ptr() as usize) {
+            Entry::Occupied(mut e) => {
+                debug_assert!(!host_ptr.is_null());
+                e.get_mut().count += 1;
+            }
+            Entry::Vacant(e) => {
+                e.insert(Mapping {
+                    layout: layout,
+                    writes: writes,
+                    ptr: host_ptr.is_null().then_some(ptr),
+                    count: 1,
+                    inner: inner,
+                });
+            }
+        }
+
+        Ok(ptr)
+    }
 }

@@ -740,10 +745,6 @@ impl Drop for MemBase {
         for cb in cbs.into_iter().rev() {
             cb.call(self);
         }
-
-        for (d, tx) in self.maps.get_mut().unwrap().tx.drain() {
-            d.helper_ctx().unmap(tx.tx);
-        }
     }
 }

@@ -895,25 +896,20 @@ impl Buffer {
         Ok(())
     }

-    pub fn map(&self, dev: &'static Device, offset: usize) -> CLResult<MutMemoryPtr> {
-        let ptr = if self.has_user_shadow_buffer(dev)? {
-            self.host_ptr()
-        } else {
-            let mut lock = self.maps.lock().unwrap();
+    fn is_mapped_ptr(&self, ptr: *mut c_void) -> bool {
+        self.maps.lock().unwrap().contains_key(ptr as usize)
+    }

-            if let Entry::Vacant(e) = lock.tx.entry(dev) {
-                let (tx, res) = self.tx_raw_async(dev, RWFlags::RW)?;
-                e.insert(MappingTransfer::new(tx, res));
-            } else {
-                lock.mark_pending(dev);
-            }
-
-            lock.tx.get(dev).unwrap().tx.ptr()
-        };
-
-        let ptr = unsafe { ptr.add(offset) };
-        // SAFETY: it's required that applications do not cause data races
-        Ok(unsafe { MutMemoryPtr::from_ptr(ptr) })
+    pub fn map(&self, size: usize, offset: usize, writes: bool) -> CLResult<MutMemoryPtr> {
+        let layout =
+            unsafe { Layout::from_size_align_unchecked(size, size_of::<[cl_ulong; 16]>()) };
+        self.base.map(
+            offset,
+            layout,
+            writes,
+            &self.maps,
+            BufferMapping { offset: offset },
+        )
     }

     pub fn read(
@@ -968,35 +964,18 @@ impl Buffer {
         Ok(())
     }

-    // TODO: only sync on map when the memory is not mapped with discard
-    pub fn sync_shadow(&self, q: &Queue, ctx: &PipeContext, ptr: MutMemoryPtr) -> CLResult<()> {
-        let ptr = ptr.as_ptr();
-        let mut lock = self.maps.lock().unwrap();
-        if !lock.increase_ref(q.device, ptr) {
+    pub fn sync_map(&self, q: &Queue, ctx: &PipeContext, ptr: MutMemoryPtr) -> CLResult<()> {
+        // no need to update
+        if self.is_pure_user_memory(q.device)? {
             return Ok(());
         }

-        if self.has_user_shadow_buffer(q.device)? {
-            self.read(
-                q,
-                ctx,
-                0,
-                // SAFETY: it's required that applications do not cause data races
-                unsafe { MutMemoryPtr::from_ptr(self.host_ptr()) },
-                self.size,
-            )
-        } else {
-            if let Some(shadow) = lock.tx.get(&q.device).and_then(|tx| tx.shadow.as_ref()) {
-                let res = self.get_res_of_dev(q.device)?;
-                let bx = create_pipe_box(
-                    [self.offset, 0, 0].into(),
-                    [self.size, 1, 1].into(),
-                    CL_MEM_OBJECT_BUFFER,
-                )?;
-                ctx.resource_copy_region(res, shadow, &[0; 3], &bx);
-            }
-            Ok(())
-        }
+        let maps = self.maps.lock().unwrap();
+        let Some(mapping) = maps.find_alloc_precise(ptr.as_ptr() as usize) else {
+            return Err(CL_INVALID_VALUE);
+        };
+
+        self.read(q, ctx, mapping.offset, ptr, mapping.size())
     }

     fn tx<'a>(
@@ -1022,69 +1001,21 @@ impl Buffer {
             .with_ctx(ctx))
     }

-    fn tx_raw_async(
-        &self,
-        dev: &Device,
-        rw: RWFlags,
-    ) -> CLResult<(PipeTransfer, Option<PipeResource>)> {
-        let r = self.get_res_of_dev(dev)?;
-        let offset = self.offset.try_into().map_err(|_| CL_OUT_OF_HOST_MEMORY)?;
-        let size = self.size.try_into().map_err(|_| CL_OUT_OF_HOST_MEMORY)?;
-        let ctx = dev.helper_ctx();
-
-        let tx = if can_map_directly(dev, r) {
-            ctx.buffer_map_directly(r, offset, size, rw)
-        } else {
-            None
-        };
-
-        if let Some(tx) = tx {
-            Ok((tx, None))
-        } else {
-            let shadow = dev
-                .screen()
-                .resource_create_buffer(size as u32, ResourceType::Staging, 0)
-                .ok_or(CL_OUT_OF_RESOURCES)?;
-            let tx = ctx
-                .buffer_map_coherent(&shadow, 0, size, rw)
-                .ok_or(CL_OUT_OF_RESOURCES)?;
-            Ok((tx, Some(shadow)))
-        }
-    }
-
     // TODO: only sync on unmap when the memory is not mapped for writing
     pub fn unmap(&self, q: &Queue, ctx: &PipeContext, ptr: MutMemoryPtr) -> CLResult<()> {
-        let ptr = ptr.as_ptr();
-        let mut lock = self.maps.lock().unwrap();
-        if !lock.contains_ptr(ptr) {
-            return Ok(());
-        }
-
-        let (needs_sync, shadow) = lock.decrease_ref(ptr, q.device);
-        if needs_sync {
-            if let Some(shadow) = shadow {
-                let res = self.get_res_of_dev(q.device)?;
-                let offset = self.offset.try_into().map_err(|_| CL_OUT_OF_HOST_MEMORY)?;
-                let bx = create_pipe_box(
-                    CLVec::default(),
-                    [self.size, 1, 1].into(),
-                    CL_MEM_OBJECT_BUFFER,
-                )?;
-
-                ctx.resource_copy_region(shadow, res, &[offset, 0, 0], &bx);
-            } else if self.has_user_shadow_buffer(q.device)? {
-                self.write(
-                    q,
-                    ctx,
-                    0,
-                    // SAFETY: it's required that applications do not cause data races
-                    unsafe { ConstMemoryPtr::from_ptr(self.host_ptr()) },
-                    self.size,
-                )?;
+        let mapping = match self.maps.lock().unwrap().entry(ptr.as_ptr() as usize) {
+            Entry::Vacant(_) => return Err(CL_INVALID_VALUE),
+            Entry::Occupied(mut entry) => {
+                entry.get_mut().count -= 1;
+                (entry.get().count == 0).then(|| entry.remove())
             }
-        }
+        };

-        lock.clean_up_tx(q.device, ctx);
+        if let Some(mapping) = mapping {
+            if mapping.writes && !self.is_pure_user_memory(q.device)? {
+                self.write(q, ctx, mapping.offset, ptr.into(), mapping.size())?;
+            }
+        };

         Ok(())
     }
@@ -1345,61 +1276,45 @@ impl Image {
         Ok(())
     }

+    fn is_mapped_ptr(&self, ptr: *mut c_void) -> bool {
+        self.maps.lock().unwrap().contains_key(ptr as usize)
+    }
+
     pub fn is_parent_buffer(&self) -> bool {
         matches!(self.parent, Some(Mem::Buffer(_)))
     }

     pub fn map(
         &self,
-        dev: &'static Device,
-        origin: &CLVec<usize>,
+        origin: CLVec<usize>,
+        region: CLVec<usize>,
         row_pitch: &mut usize,
         slice_pitch: &mut usize,
-    ) -> CLResult<*mut c_void> {
-        // we might have a host_ptr shadow buffer or image created from buffer
-        let ptr = if self.has_user_shadow_buffer(dev)? {
-            *row_pitch = self.image_desc.image_row_pitch;
-            *slice_pitch = self.image_desc.image_slice_pitch;
-            self.host_ptr()
-        } else if let Some(Mem::Buffer(buffer)) = &self.parent {
-            *row_pitch = self.image_desc.image_row_pitch;
-            *slice_pitch = self.image_desc.image_slice_pitch;
-            buffer.map(dev, 0)?.as_ptr()
-        } else {
-            let mut lock = self.maps.lock().unwrap();
+        writes: bool,
+    ) -> CLResult<MutMemoryPtr> {
+        let pixel_size = self.image_format.pixel_size().unwrap() as usize;

-            if let Entry::Vacant(e) = lock.tx.entry(dev) {
-                let bx = self.image_desc.bx()?;
-                let (tx, res) = self.tx_raw_async(dev, &bx, RWFlags::RW)?;
-                e.insert(MappingTransfer::new(tx, res));
-            } else {
-                lock.mark_pending(dev);
-            }
+        *row_pitch = self.image_desc.row_pitch()? as usize;
+        *slice_pitch = self.image_desc.slice_pitch();

-            let tx = &lock.tx.get(dev).unwrap().tx;
+        let (offset, size) =
+            CLVec::calc_offset_size(origin, region, [pixel_size, *row_pitch, *slice_pitch]);

-            if self.image_desc.dims() > 1 {
-                *row_pitch = tx.row_pitch() as usize;
-            }
-            if self.image_desc.dims() > 2 || self.image_desc.is_array() {
-                *slice_pitch = tx.slice_pitch();
-            }
+        let layout;
+        unsafe {
+            layout = Layout::from_size_align_unchecked(size, size_of::<[u32; 4]>());
+        }

-            tx.ptr()
-        };
-
-        let ptr = unsafe {
-            ptr.add(
-                *origin
-                    * [
-                        self.image_format.pixel_size().unwrap().into(),
-                        *row_pitch,
-                        *slice_pitch,
-                    ],
-            )
-        };
-
-        Ok(ptr)
+        self.base.map(
+            offset,
+            layout,
+            writes,
+            &self.maps,
+            ImageMapping {
+                origin: origin,
+                region: region,
+            },
+        )
     }

     pub fn pipe_image_host_access(&self) -> u16 {
@@ -1466,32 +1381,29 @@ impl Image {
     }

     // TODO: only sync on map when the memory is not mapped with discard
-    pub fn sync_shadow(&self, q: &Queue, ctx: &PipeContext, ptr: MutMemoryPtr) -> CLResult<()> {
-        let ptr = ptr.as_ptr();
-        let mut lock = self.maps.lock().unwrap();
-        if !lock.increase_ref(q.device, ptr) {
+    pub fn sync_map(&self, q: &Queue, ctx: &PipeContext, ptr: MutMemoryPtr) -> CLResult<()> {
+        // no need to update
+        if self.is_pure_user_memory(q.device)? {
            return Ok(());
        }

-        if self.has_user_shadow_buffer(q.device)? {
-            self.read(
-                // SAFETY: it's required that applications do not cause data races
-                unsafe { MutMemoryPtr::from_ptr(self.host_ptr()) },
-                q,
-                ctx,
-                &self.image_desc.size(),
-                &CLVec::default(),
-                self.image_desc.image_row_pitch,
-                self.image_desc.image_slice_pitch,
-            )
-        } else {
-            if let Some(shadow) = lock.tx.get(q.device).and_then(|tx| tx.shadow.as_ref()) {
-                let res = self.get_res_of_dev(q.device)?;
-                let bx = self.image_desc.bx()?;
-                ctx.resource_copy_region(res, shadow, &[0, 0, 0], &bx);
-            }
-            Ok(())
-        }
+        let maps = self.maps.lock().unwrap();
+        let Some(mapping) = maps.find_alloc_precise(ptr.as_ptr() as usize) else {
+            return Err(CL_INVALID_VALUE);
+        };
+
+        let row_pitch = self.image_desc.row_pitch()? as usize;
+        let slice_pitch = self.image_desc.slice_pitch();
+
+        self.read(
+            ptr,
+            q,
+            ctx,
+            &mapping.region,
+            &mapping.origin,
+            row_pitch,
+            slice_pitch,
+        )
     }

     fn tx_image<'a>(
@@ -1508,74 +1420,33 @@ impl Image {
             .with_ctx(ctx))
     }

-    fn tx_raw_async(
-        &self,
-        dev: &Device,
-        bx: &pipe_box,
-        rw: RWFlags,
-    ) -> CLResult<(PipeTransfer, Option<PipeResource>)> {
-        let r = self.get_res_of_dev(dev)?;
-        let ctx = dev.helper_ctx();
-
-        let tx = if can_map_directly(dev, r) {
-            ctx.texture_map_directly(r, bx, rw)
-        } else {
-            None
-        };
-
-        if let Some(tx) = tx {
-            Ok((tx, None))
-        } else {
-            let shadow = dev
-                .screen()
-                .resource_create_texture(
-                    r.width(),
-                    r.height(),
-                    r.depth(),
-                    r.array_size(),
-                    cl_mem_type_to_texture_target(self.image_desc.image_type),
-                    self.pipe_format,
-                    ResourceType::Staging,
-                    false,
-                )
-                .ok_or(CL_OUT_OF_RESOURCES)?;
-            let tx = ctx
-                .texture_map_coherent(&shadow, bx, rw)
-                .ok_or(CL_OUT_OF_RESOURCES)?;
-            Ok((tx, Some(shadow)))
-        }
-    }
-
     // TODO: only sync on unmap when the memory is not mapped for writing
     pub fn unmap(&self, q: &Queue, ctx: &PipeContext, ptr: MutMemoryPtr) -> CLResult<()> {
-        let ptr = ptr.as_ptr();
-        let mut lock = self.maps.lock().unwrap();
-        if !lock.contains_ptr(ptr) {
-            return Ok(());
-        }
+        let mapping = match self.maps.lock().unwrap().entry(ptr.as_ptr() as usize) {
+            Entry::Vacant(_) => return Err(CL_INVALID_VALUE),
+            Entry::Occupied(mut entry) => {
+                entry.get_mut().count -= 1;
+                (entry.get().count == 0).then(|| entry.remove())
+            }
+        };

-        let (needs_sync, shadow) = lock.decrease_ref(ptr, q.device);
-        if needs_sync {
-            if let Some(shadow) = shadow {
-                let res = self.get_res_of_dev(q.device)?;
-                let bx = self.image_desc.bx()?;
-                ctx.resource_copy_region(shadow, res, &[0, 0, 0], &bx);
-            } else if self.has_user_shadow_buffer(q.device)? {
+        let row_pitch = self.image_desc.row_pitch()? as usize;
+        let slice_pitch = self.image_desc.slice_pitch();
+
+        if let Some(mapping) = mapping {
+            if mapping.writes && !self.is_pure_user_memory(q.device)? {
                 self.write(
-                    // SAFETY: it's required that applications do not cause data races
-                    unsafe { ConstMemoryPtr::from_ptr(self.host_ptr()) },
+                    ptr.into(),
                     q,
                     ctx,
-                    &self.image_desc.size(),
-                    self.image_desc.image_row_pitch,
-                    self.image_desc.image_slice_pitch,
-                    &CLVec::default(),
+                    &mapping.region,
+                    row_pitch,
+                    slice_pitch,
+                    &mapping.origin,
                 )?;
             }
         }

-        lock.clean_up_tx(q.device, ctx);
-
         Ok(())
     }
