linux/mm/page_table_check.c
Linus Torvalds 6e17c6de3d - Yosry Ahmed brought back some cgroup v1 stats in OOM logs.
- Yosry has also eliminated cgroup's atomic rstat flushing.
 
 - Nhat Pham adds the new cachestat() syscall.  It provides userspace
   with the ability to query pagecache status - a similar concept to
   mincore() but more powerful and with improved usability.
 
 - Mel Gorman provides more optimizations for compaction, reducing the
   prevalence of page rescanning.
 
 - Lorenzo Stoakes has done some maintenance work on the get_user_pages()
   interface.
 
 - Liam Howlett continues with cleanups and maintenance work to the maple
   tree code.  Peng Zhang also does some work on maple tree.
 
 - Johannes Weiner has done some cleanup work on the compaction code.
 
 - David Hildenbrand has contributed additional selftests for
   get_user_pages().
 
 - Thomas Gleixner has contributed some maintenance and optimization work
   for the vmalloc code.
 
 - Baolin Wang has provided some compaction cleanups.
 
 - SeongJae Park continues maintenance work on the DAMON code.
 
 - Huang Ying has done some maintenance on the swap code's usage of
   device refcounting.
 
 - Christoph Hellwig has some cleanups for the filemap/directio code.
 
 - Ryan Roberts provides two patch series which yield some
   rationalization of the kernel's access to pte entries - use the provided
   APIs rather than open-coding accesses.
 
 - Lorenzo Stoakes has some fixes to the interaction between pagecache
   and directio access to file mappings.
 
 - John Hubbard has a series of fixes to the MM selftesting code.
 
 - ZhangPeng continues the folio conversion campaign.
 
 - Hugh Dickins has been working on the pagetable handling code, mainly
   with a view to reducing the load on the mmap_lock.
 
 - Catalin Marinas has reduced the arm64 kmalloc() minimum alignment from
   128 to 8.
 
 - Domenico Cerasuolo has improved the zswap reclaim mechanism by
   reorganizing the LRU management.
 
 - Matthew Wilcox provides some fixups to make gfs2 work better with the
   buffer_head code.
 
 - Vishal Moola also has done some folio conversion work.
 
 - Matthew Wilcox has removed the remnants of the pagevec code - their
   functionality is migrated over to struct folio_batch.
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYIAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCZJejewAKCRDdBJ7gKXxA
 joggAPwKMfT9lvDBEUnJagY7dbDPky1cSYZdJKxxM2cApGa42gEA6Cl8HRAWqSOh
 J0qXCzqaaN8+BuEyLGDVPaXur9KirwY=
 =B7yQ
 -----END PGP SIGNATURE-----

Merge tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull mm updates from Andrew Morton:

 - Yosry Ahmed brought back some cgroup v1 stats in OOM logs

 - Yosry has also eliminated cgroup's atomic rstat flushing

 - Nhat Pham adds the new cachestat() syscall. It provides userspace
   with the ability to query pagecache status - a similar concept to
   mincore() but more powerful and with improved usability

 - Mel Gorman provides more optimizations for compaction, reducing the
   prevalence of page rescanning

 - Lorenzo Stoakes has done some maintenance work on the
   get_user_pages() interface

 - Liam Howlett continues with cleanups and maintenance work to the
   maple tree code. Peng Zhang also does some work on maple tree

 - Johannes Weiner has done some cleanup work on the compaction code

 - David Hildenbrand has contributed additional selftests for
   get_user_pages()

 - Thomas Gleixner has contributed some maintenance and optimization
   work for the vmalloc code

 - Baolin Wang has provided some compaction cleanups

 - SeongJae Park continues maintenance work on the DAMON code

 - Huang Ying has done some maintenance on the swap code's usage of
   device refcounting

 - Christoph Hellwig has some cleanups for the filemap/directio code

 - Ryan Roberts provides two patch series which yield some
   rationalization of the kernel's access to pte entries - use the
   provided APIs rather than open-coding accesses

 - Lorenzo Stoakes has some fixes to the interaction between pagecache
   and directio access to file mappings

 - John Hubbard has a series of fixes to the MM selftesting code

 - ZhangPeng continues the folio conversion campaign

 - Hugh Dickins has been working on the pagetable handling code, mainly
   with a view to reducing the load on the mmap_lock

 - Catalin Marinas has reduced the arm64 kmalloc() minimum alignment
   from 128 to 8

 - Domenico Cerasuolo has improved the zswap reclaim mechanism by
   reorganizing the LRU management

 - Matthew Wilcox provides some fixups to make gfs2 work better with the
   buffer_head code

 - Vishal Moola also has done some folio conversion work

 - Matthew Wilcox has removed the remnants of the pagevec code - their
   functionality is migrated over to struct folio_batch

* tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (380 commits)
  mm/hugetlb: remove hugetlb_set_page_subpool()
  mm: nommu: correct the range of mmap_sem_read_lock in task_mem()
  hugetlb: revert use of page_cache_next_miss()
  Revert "page cache: fix page_cache_next/prev_miss off by one"
  mm/vmscan: fix root proactive reclaim unthrottling unbalanced node
  mm: memcg: rename and document global_reclaim()
  mm: kill [add|del]_page_to_lru_list()
  mm: compaction: convert to use a folio in isolate_migratepages_block()
  mm: zswap: fix double invalidate with exclusive loads
  mm: remove unnecessary pagevec includes
  mm: remove references to pagevec
  mm: rename invalidate_mapping_pagevec to mapping_try_invalidate
  mm: remove struct pagevec
  net: convert sunrpc from pagevec to folio_batch
  i915: convert i915_gpu_error to use a folio_batch
  pagevec: rename fbatch_count()
  mm: remove check_move_unevictable_pages()
  drm: convert drm_gem_put_pages() to use a folio_batch
  i915: convert shmem_sg_free_table() to use a folio_batch
  scatterlist: add sg_set_folio()
  ...
2023-06-28 10:28:11 -07:00

259 lines
6.1 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2021, Google LLC.
* Pasha Tatashin <pasha.tatashin@soleen.com>
*/
#include <linux/kstrtox.h>
#include <linux/mm.h>
#include <linux/page_table_check.h>
/*
 * Redefine pr_fmt() so that it prepends "page_table_check: " to the format
 * string passed to it.
 */
#undef pr_fmt
#define pr_fmt(fmt) "page_table_check: " fmt
/*
 * Pair of mapping counters stored alongside a struct page_ext and located via
 * get_page_table_check(). page_table_check_set() increments and
 * page_table_check_clear() decrements the counter matching the page type;
 * both BUG if the counter of the other type is non-zero.
 */
struct page_table_check {
/* used for pages where PageAnon() is true */
atomic_t anon_map_count;
/* used for all other pages */
atomic_t file_map_count;
};
/*
 * Boot-time switch: initialized from CONFIG_PAGE_TABLE_CHECK_ENFORCED and
 * overwritten by the "page_table_check" early parameter handler.
 */
static bool __page_table_check_enabled __initdata =
IS_ENABLED(CONFIG_PAGE_TABLE_CHECK_ENFORCED);
/*
 * Static key defined as initially true; init_page_table_check() disables it
 * when __page_table_check_enabled is set.
 */
DEFINE_STATIC_KEY_TRUE(page_table_check_disabled);
EXPORT_SYMBOL(page_table_check_disabled);
/*
 * Handler for the "page_table_check" early parameter: parse @buf as a boolean
 * into __page_table_check_enabled and hand back kstrtobool()'s return value.
 */
static int __init early_page_table_check_param(char *buf)
{
	int ret = kstrtobool(buf, &__page_table_check_enabled);

	return ret;
}
early_param("page_table_check", early_page_table_check_param);
/*
 * .need callback of page_table_check_ops: reports the current value of
 * __page_table_check_enabled.
 */
static bool __init need_page_table_check(void)
{
	const bool wanted = __page_table_check_enabled;

	return wanted;
}
/*
 * .init callback of page_table_check_ops: when checking was requested, turn
 * off the page_table_check_disabled static key; otherwise leave it untouched.
 */
static void __init init_page_table_check(void)
{
	if (__page_table_check_enabled)
		static_branch_disable(&page_table_check_disabled);
}
/*
 * page_ext client descriptor for page table check. The .offset member, which
 * is not initialized here, is read by get_page_table_check() to locate this
 * client's data relative to a struct page_ext pointer.
 */
struct page_ext_operations page_table_check_ops = {
/* size of the data this client keeps: one struct page_table_check */
.size = sizeof(struct page_table_check),
.need = need_page_table_check,
.init = init_page_table_check,
.need_shared_flags = false,
};
/*
 * Translate a page_ext pointer into the struct page_table_check that lives
 * page_table_check_ops.offset bytes past it. A NULL @page_ext is a BUG.
 */
static struct page_table_check *get_page_table_check(struct page_ext *page_ext)
{
	void *base = page_ext;

	BUG_ON(!page_ext);

	return base + page_table_check_ops.offset;
}
/*
 * An entry is removed from the page table: decrement the counters for that
 * page, verify that it is of the correct type and that the counters do not
 * become negative.
 *
 * @mm, @addr: not used by this function.
 * @pfn:   first page frame covered by the removed entry; nothing is done when
 *         pfn_valid() rejects it.
 * @pgcnt: number of consecutive page_ext entries to update.
 */
static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
				   unsigned long pfn, unsigned long pgcnt)
{
	struct page_ext *page_ext;
	struct page *page;
	unsigned long i;
	bool anon;

	if (!pfn_valid(pfn))
		return;

	page = pfn_to_page(pfn);
	page_ext = page_ext_get(page);

	/*
	 * A page without page_ext has no counters to maintain. Bail out here
	 * rather than tripping BUG_ON(!page_ext) in get_page_table_check().
	 */
	if (!page_ext)
		return;

	BUG_ON(PageSlab(page));
	anon = PageAnon(page);

	for (i = 0; i < pgcnt; i++) {
		struct page_table_check *ptc = get_page_table_check(page_ext);

		if (anon) {
			/* an anon page must never carry file mappings */
			BUG_ON(atomic_read(&ptc->file_map_count));
			BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0);
		} else {
			/* a non-anon page must never carry anon mappings */
			BUG_ON(atomic_read(&ptc->anon_map_count));
			BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0);
		}
		page_ext = page_ext_next(page_ext);
	}
	page_ext_put(page_ext);
}
/*
 * A new entry is added to the page table: increment the counters for that
 * page, verify that it is of the correct type and is not being mapped with a
 * different type to a different process.
 *
 * @mm, @addr: not used by this function.
 * @pfn:   first page frame covered by the new entry; nothing is done when
 *         pfn_valid() rejects it.
 * @pgcnt: number of consecutive page_ext entries to update.
 * @rw:    true when the new entry is writable; a writable anon mapping must
 *         be the only mapping of the page.
 */
static void page_table_check_set(struct mm_struct *mm, unsigned long addr,
				 unsigned long pfn, unsigned long pgcnt,
				 bool rw)
{
	struct page_ext *page_ext;
	struct page *page;
	unsigned long i;
	bool anon;

	if (!pfn_valid(pfn))
		return;

	page = pfn_to_page(pfn);
	page_ext = page_ext_get(page);

	/*
	 * A page without page_ext has no counters to maintain. Bail out here
	 * rather than tripping BUG_ON(!page_ext) in get_page_table_check().
	 */
	if (!page_ext)
		return;

	BUG_ON(PageSlab(page));
	anon = PageAnon(page);

	for (i = 0; i < pgcnt; i++) {
		struct page_table_check *ptc = get_page_table_check(page_ext);

		if (anon) {
			BUG_ON(atomic_read(&ptc->file_map_count));
			/* more than one anon mapping is only allowed read-only */
			BUG_ON(atomic_inc_return(&ptc->anon_map_count) > 1 && rw);
		} else {
			BUG_ON(atomic_read(&ptc->anon_map_count));
			/* a negative result means the counter wrapped */
			BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0);
		}
		page_ext = page_ext_next(page_ext);
	}
	page_ext_put(page_ext);
}
/*
 * The page is on the free list, or is being allocated: verify that the
 * counters of all 2^@order page_ext entries starting at @page are zero and
 * crash if they are not.
 *
 * Pages for which page_ext_get() returns NULL have no counters and are
 * skipped.
 */
void __page_table_check_zero(struct page *page, unsigned int order)
{
	struct page_ext *page_ext;
	unsigned long i;

	BUG_ON(PageSlab(page));

	page_ext = page_ext_get(page);
	/* no page_ext means nothing was ever counted for this page */
	if (!page_ext)
		return;

	for (i = 0; i < (1ul << order); i++) {
		struct page_table_check *ptc = get_page_table_check(page_ext);

		BUG_ON(atomic_read(&ptc->anon_map_count));
		BUG_ON(atomic_read(&ptc->file_map_count));
		page_ext = page_ext_next(page_ext);
	}
	page_ext_put(page_ext);
}
/*
 * Account for the removal of @pte: if it is a user accessible page, drop the
 * counters of the single page it maps. Entries of init_mm are ignored.
 */
void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr,
				  pte_t pte)
{
	if (mm == &init_mm || !pte_user_accessible_page(pte))
		return;

	page_table_check_clear(mm, addr, pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__page_table_check_pte_clear);
/*
 * Account for the removal of @pmd: if it is a user accessible page, drop the
 * counters of the PMD_SIZE >> PAGE_SHIFT pages it maps. Entries of init_mm
 * are ignored.
 */
void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr,
				  pmd_t pmd)
{
	if (mm == &init_mm || !pmd_user_accessible_page(pmd))
		return;

	page_table_check_clear(mm, addr, pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__page_table_check_pmd_clear);
/*
 * Account for the removal of @pud: if it is a user accessible page, drop the
 * counters of the PUD_SIZE >> PAGE_SHIFT pages it maps. Entries of init_mm
 * are ignored.
 */
void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr,
				  pud_t pud)
{
	if (mm == &init_mm || !pud_user_accessible_page(pud))
		return;

	page_table_check_clear(mm, addr, pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__page_table_check_pud_clear);
/*
 * Account for @pte being written to @ptep: first un-account the entry that is
 * currently read from @ptep, then, if @pte is a user accessible page, count
 * the single page it maps. Entries of init_mm are ignored.
 */
void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr,
				pte_t *ptep, pte_t pte)
{
	if (mm == &init_mm)
		return;

	/* the entry being overwritten goes away */
	__page_table_check_pte_clear(mm, addr, ptep_get(ptep));

	if (!pte_user_accessible_page(pte))
		return;

	page_table_check_set(mm, addr, pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT,
			     pte_write(pte));
}
EXPORT_SYMBOL(__page_table_check_pte_set);
/*
 * Account for @pmd being written to @pmdp: first un-account the entry that is
 * currently stored at @pmdp, then, if @pmd is a user accessible page, count
 * the PMD_SIZE >> PAGE_SHIFT pages it maps. Entries of init_mm are ignored.
 */
void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr,
				pmd_t *pmdp, pmd_t pmd)
{
	if (mm == &init_mm)
		return;

	/* the entry being overwritten goes away */
	__page_table_check_pmd_clear(mm, addr, *pmdp);

	if (!pmd_user_accessible_page(pmd))
		return;

	page_table_check_set(mm, addr, pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT,
			     pmd_write(pmd));
}
EXPORT_SYMBOL(__page_table_check_pmd_set);
/*
 * Account for @pud being written to @pudp: first un-account the entry that is
 * currently stored at @pudp, then, if @pud is a user accessible page, count
 * the PUD_SIZE >> PAGE_SHIFT pages it maps. Entries of init_mm are ignored.
 */
void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr,
				pud_t *pudp, pud_t pud)
{
	if (mm == &init_mm)
		return;

	/* the entry being overwritten goes away */
	__page_table_check_pud_clear(mm, addr, *pudp);

	if (!pud_user_accessible_page(pud))
		return;

	page_table_check_set(mm, addr, pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT,
			     pud_write(pud));
}
EXPORT_SYMBOL(__page_table_check_pud_set);
/*
 * Un-account PTRS_PER_PTE consecutive pte entries reached through @pmd,
 * starting with the entry for @addr. Nothing is done for init_mm, or when
 * @pmd is bad or a leaf. A failed pte_offset_map() triggers a warning and
 * the range is skipped.
 */
void __page_table_check_pte_clear_range(struct mm_struct *mm,
					unsigned long addr,
					pmd_t pmd)
{
	pte_t *start, *ptep;
	unsigned long i;

	if (mm == &init_mm)
		return;
	if (pmd_bad(pmd) || pmd_leaf(pmd))
		return;

	start = pte_offset_map(&pmd, addr);
	if (WARN_ON(!start))
		return;

	for (i = 0, ptep = start; i < PTRS_PER_PTE;
	     i++, addr += PAGE_SIZE, ptep++)
		__page_table_check_pte_clear(mm, addr, ptep_get(ptep));

	/* unmap using the pointer originally returned by pte_offset_map() */
	pte_unmap(start);
}