diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index cdb9b9bdea1f..8f714406528d 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -235,7 +235,6 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, if (bhs[i] == NULL) { bhs[i] = sb_getblk(sb, block++); if (bhs[i] == NULL) { - ocfs2_metadata_cache_io_unlock(ci); status = -ENOMEM; mlog_errno(status); /* Don't forget to put previous bh! */ @@ -389,7 +388,8 @@ read_failure: /* Always set the buffer in the cache, even if it was * a forced read, or read-ahead which hasn't yet * completed. */ - ocfs2_set_buffer_uptodate(ci, bh); + if (bh) + ocfs2_set_buffer_uptodate(ci, bh); } ocfs2_metadata_cache_io_unlock(ci); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 530fba34f6d3..1bf188b6866a 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1055,7 +1055,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) if (!igrab(inode)) BUG(); - num_running_trans = atomic_read(&(osb->journal->j_num_trans)); + num_running_trans = atomic_read(&(journal->j_num_trans)); trace_ocfs2_journal_shutdown(num_running_trans); /* Do a commit_cache here. It will flush our journal, *and* @@ -1074,9 +1074,10 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) osb->commit_task = NULL; } - BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); + BUG_ON(atomic_read(&(journal->j_num_trans)) != 0); - if (ocfs2_mount_local(osb)) { + if (ocfs2_mount_local(osb) && + (journal->j_journal->j_flags & JBD2_LOADED)) { jbd2_journal_lock_updates(journal->j_journal); status = jbd2_journal_flush(journal->j_journal, 0); jbd2_journal_unlock_updates(journal->j_journal); diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 8ce462c64c51..73d3367c533b 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -692,7 +692,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) int status; struct buffer_head *bh = NULL; struct ocfs2_quota_recovery *rec; - int locked = 0; + int locked = 0, global_read = 0; info->dqi_max_spc_limit = 0x7fffffffffffffffLL; info->dqi_max_ino_limit = 0x7fffffffffffffffLL; @@ -700,6 +700,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) if (!oinfo) { mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota" " info."); + status = -ENOMEM; goto out_err; } info->dqi_priv = oinfo; @@ -712,6 +713,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) status = ocfs2_global_read_info(sb, type); if (status < 0) goto out_err; + global_read = 1; status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1); if (status < 0) { @@ -782,10 +784,12 @@ out_err: if (locked) ocfs2_inode_unlock(lqinode, 1); ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); + if (global_read) + cancel_delayed_work_sync(&oinfo->dqi_sync_work); kfree(oinfo); } brelse(bh); - return -1; + return status; } /* Write local info to quota file */ diff --git a/kernel/resource.c b/kernel/resource.c index a83040fde236..1681ab5012e1 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -540,20 +540,62 @@ static int __region_intersects(struct resource *parent, resource_size_t start, size_t size, unsigned long flags, unsigned long desc) { - struct resource res; + resource_size_t ostart, oend; int type = 0; int other = 0; - struct resource *p; + struct resource *p, *dp; + bool is_type, covered; + struct resource res; res.start = start; res.end = start + size - 1; for (p = parent->child; p ; p = p->sibling) { - bool is_type = (((p->flags & flags) == flags) && - ((desc == IORES_DESC_NONE) || - (desc == p->desc))); - - if (resource_overlaps(p, &res)) - is_type ? type++ : other++; + if (!resource_overlaps(p, &res)) + continue; + is_type = (p->flags & flags) == flags && + (desc == IORES_DESC_NONE || desc == p->desc); + if (is_type) { + type++; + continue; + } + /* + * Continue to search in descendant resources as if the + * matched descendant resources cover some ranges of 'p'. + * + * |------------- "CXL Window 0" ------------| + * |-- "System RAM" --| + * + * will behave similar as the following fake resource + * tree when searching "System RAM". + * + * |-- "System RAM" --||-- "CXL Window 0a" --| + */ + covered = false; + ostart = max(res.start, p->start); + oend = min(res.end, p->end); + for_each_resource(p, dp, false) { + if (!resource_overlaps(dp, &res)) + continue; + is_type = (dp->flags & flags) == flags && + (desc == IORES_DESC_NONE || desc == dp->desc); + if (is_type) { + type++; + /* + * Range from 'ostart' to 'dp->start' + * isn't covered by matched resource. + */ + if (dp->start > ostart) + break; + if (dp->end >= oend) { + covered = true; + break; + } + /* Remove covered range */ + ostart = max(ostart, dp->end + 1); + } + } + if (!covered) + other++; } if (type == 0) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 58829baf8b5d..a0036dc78a3b 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -126,6 +126,7 @@ static int __damon_va_three_regions(struct mm_struct *mm, * If this is too slow, it can be optimised to examine the maple * tree gaps. */ + rcu_read_lock(); for_each_vma(vmi, vma) { unsigned long gap; @@ -146,6 +147,7 @@ static int __damon_va_three_regions(struct mm_struct *mm, next: prev = vma; } + rcu_read_unlock(); if (!sz_range(&second_gap) || !sz_range(&first_gap)) return -EINVAL; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 67c86a5d64a6..99b146d16a18 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -220,6 +220,8 @@ retry: count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); return false; } + /* Ensure zero folio won't have large_rmappable flag set. */ + folio_clear_large_rmappable(zero_folio); preempt_disable(); if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) { preempt_enable(); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index aaf508be0a2b..9a3a6e2dee97 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6048,7 +6048,7 @@ retry_avoidcopy: * When the original hugepage is shared one, it does not have * anon_vma prepared. */ - ret = vmf_anon_prepare(vmf); + ret = __vmf_anon_prepare(vmf); if (unlikely(ret)) goto out_release_all; @@ -6247,7 +6247,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, } if (!(vma->vm_flags & VM_MAYSHARE)) { - ret = vmf_anon_prepare(vmf); + ret = __vmf_anon_prepare(vmf); if (unlikely(ret)) goto out; } @@ -6378,6 +6378,14 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, folio_unlock(folio); out: hugetlb_vma_unlock_read(vma); + + /* + * We must check to release the per-VMA lock. __vmf_anon_prepare() is + * the only way ret can be set to VM_FAULT_RETRY. + */ + if (unlikely(ret & VM_FAULT_RETRY)) + vma_end_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); return ret; @@ -6599,6 +6607,14 @@ out_ptl: } out_mutex: hugetlb_vma_unlock_read(vma); + + /* + * We must check to release the per-VMA lock. __vmf_anon_prepare() in + * hugetlb_wp() is the only way ret can be set to VM_FAULT_RETRY. + */ + if (unlikely(ret & VM_FAULT_RETRY)) + vma_end_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* * Generally it's safe to hold refcount during waiting page lock. But diff --git a/mm/internal.h b/mm/internal.h index b4d86436565b..a963f67d3452 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -310,7 +310,16 @@ static inline void wake_throttle_isolated(pg_data_t *pgdat) wake_up(wqh); } -vm_fault_t vmf_anon_prepare(struct vm_fault *vmf); +vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf); +static inline vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) +{ + vm_fault_t ret = __vmf_anon_prepare(vmf); + + if (unlikely(ret & VM_FAULT_RETRY)) + vma_end_read(vmf->vma); + return ret; +} + vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); diff --git a/mm/madvise.c b/mm/madvise.c index 89089d84f8df..6e3a137b8e50 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1527,7 +1527,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, * Require CAP_SYS_NICE for influencing process performance. Note that * only non-destructive hints are currently supported. */ - if (!capable(CAP_SYS_NICE)) { + if (mm != current->mm && !capable(CAP_SYS_NICE)) { ret = -EPERM; goto release_mm; } diff --git a/mm/memory.c b/mm/memory.c index ebfc9768f801..cda2c12c500b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3276,7 +3276,7 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) } /** - * vmf_anon_prepare - Prepare to handle an anonymous fault. + * __vmf_anon_prepare - Prepare to handle an anonymous fault. * @vmf: The vm_fault descriptor passed from the fault handler. * * When preparing to insert an anonymous page into a VMA from a @@ -3290,7 +3290,7 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) * Return: 0 if fault handling can proceed. Any other value should be * returned to the caller. */ -vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) +vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; @@ -3298,10 +3298,8 @@ vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) if (likely(vma->anon_vma)) return 0; if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - if (!mmap_read_trylock(vma->vm_mm)) { - vma_end_read(vma); + if (!mmap_read_trylock(vma->vm_mm)) return VM_FAULT_RETRY; - } } if (__anon_vma_prepare(vma)) ret = VM_FAULT_OOM; diff --git a/mm/vmscan.c b/mm/vmscan.c index bd489c1af228..a8d61a8b6894 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4300,7 +4300,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* ineligible */ - if (zone > sc->reclaim_idx) { + if (!folio_test_lru(folio) || zone > sc->reclaim_idx) { gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); return true; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 2d3163e4da96..b572aa84823c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -293,17 +294,27 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} static int create_cache(struct zs_pool *pool) { - pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, - 0, 0, NULL); - if (!pool->handle_cachep) - return 1; + char *name; - pool->zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage), - 0, 0, NULL); + name = kasprintf(GFP_KERNEL, "zs_handle-%s", pool->name); + if (!name) + return -ENOMEM; + pool->handle_cachep = kmem_cache_create(name, ZS_HANDLE_SIZE, + 0, 0, NULL); + kfree(name); + if (!pool->handle_cachep) + return -EINVAL; + + name = kasprintf(GFP_KERNEL, "zspage-%s", pool->name); + if (!name) + return -ENOMEM; + pool->zspage_cachep = kmem_cache_create(name, sizeof(struct zspage), + 0, 0, NULL); + kfree(name); if (!pool->zspage_cachep) { kmem_cache_destroy(pool->handle_cachep); pool->handle_cachep = NULL; - return 1; + return -EINVAL; } return 0;