mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2024-12-03 17:14:14 +08:00
f1a7941243
Currently mm_struct maintains rss_stats, which are updated on the page fault and the unmapping codepaths. For the page fault codepath the updates are cached per thread with a batch size of TASK_RSS_EVENTS_THRESH, which is 64. The reason for caching is performance for multithreaded applications; otherwise the rss_stats updates may become a hotspot for such applications. However, this optimization comes at the cost of an error margin in the rss stats. The rss_stats for applications with a large number of threads can be very skewed. At worst the error margin is (nr_threads * 64), and we have a lot of applications with 100s of threads, so the error margin can be very high. Internally we had to reduce TASK_RSS_EVENTS_THRESH to 32. Recently we started seeing unbounded errors in rss_stats for specific applications which use TCP rx0cp (receive zero-copy). It seems like the vm_insert_pages() codepath does not sync rss_stats at all. This patch converts the rss_stats into percpu_counter to convert the error margin from (nr_threads * 64) to approximately (nr_cpus ^ 2). However, this conversion enables us to get accurate stats in situations where accuracy is more important than the cpu cost. This patch does not make such tradeoffs - we can just use percpu_counter_add_local() for the updates and percpu_counter_sum() (or percpu_counter_sync() + percpu_counter_read()) for the readers. At the moment the readers are the procfs interface, the oom_killer and memory reclaim, which I think are not performance critical and should be ok with a slow read. However, I think we can make that change in a separate patch. Link: https://lkml.kernel.org/r/20221024052841.3291983-1-shakeelb@google.com Signed-off-by: Shakeel Butt <shakeelb@google.com> Cc: Marek Szyprowski <m.szyprowski@samsung.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
378 lines
8.6 KiB
C
378 lines
8.6 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#undef TRACE_SYSTEM
|
|
#define TRACE_SYSTEM kmem
|
|
|
|
#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ)
|
|
#define _TRACE_KMEM_H
|
|
|
|
#include <linux/types.h>
|
|
#include <linux/tracepoint.h>
|
|
#include <trace/events/mmflags.h>
|
|
|
|
TRACE_EVENT(kmem_cache_alloc,
|
|
|
|
TP_PROTO(unsigned long call_site,
|
|
const void *ptr,
|
|
struct kmem_cache *s,
|
|
gfp_t gfp_flags,
|
|
int node),
|
|
|
|
TP_ARGS(call_site, ptr, s, gfp_flags, node),
|
|
|
|
TP_STRUCT__entry(
|
|
__field( unsigned long, call_site )
|
|
__field( const void *, ptr )
|
|
__field( size_t, bytes_req )
|
|
__field( size_t, bytes_alloc )
|
|
__field( unsigned long, gfp_flags )
|
|
__field( int, node )
|
|
__field( bool, accounted )
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->call_site = call_site;
|
|
__entry->ptr = ptr;
|
|
__entry->bytes_req = s->object_size;
|
|
__entry->bytes_alloc = s->size;
|
|
__entry->gfp_flags = (__force unsigned long)gfp_flags;
|
|
__entry->node = node;
|
|
__entry->accounted = IS_ENABLED(CONFIG_MEMCG_KMEM) ?
|
|
((gfp_flags & __GFP_ACCOUNT) ||
|
|
(s->flags & SLAB_ACCOUNT)) : false;
|
|
),
|
|
|
|
TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s",
|
|
(void *)__entry->call_site,
|
|
__entry->ptr,
|
|
__entry->bytes_req,
|
|
__entry->bytes_alloc,
|
|
show_gfp_flags(__entry->gfp_flags),
|
|
__entry->node,
|
|
__entry->accounted ? "true" : "false")
|
|
);
|
|
|
|
TRACE_EVENT(kmalloc,
|
|
|
|
TP_PROTO(unsigned long call_site,
|
|
const void *ptr,
|
|
size_t bytes_req,
|
|
size_t bytes_alloc,
|
|
gfp_t gfp_flags,
|
|
int node),
|
|
|
|
TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
|
|
|
|
TP_STRUCT__entry(
|
|
__field( unsigned long, call_site )
|
|
__field( const void *, ptr )
|
|
__field( size_t, bytes_req )
|
|
__field( size_t, bytes_alloc )
|
|
__field( unsigned long, gfp_flags )
|
|
__field( int, node )
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->call_site = call_site;
|
|
__entry->ptr = ptr;
|
|
__entry->bytes_req = bytes_req;
|
|
__entry->bytes_alloc = bytes_alloc;
|
|
__entry->gfp_flags = (__force unsigned long)gfp_flags;
|
|
__entry->node = node;
|
|
),
|
|
|
|
TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s",
|
|
(void *)__entry->call_site,
|
|
__entry->ptr,
|
|
__entry->bytes_req,
|
|
__entry->bytes_alloc,
|
|
show_gfp_flags(__entry->gfp_flags),
|
|
__entry->node,
|
|
(IS_ENABLED(CONFIG_MEMCG_KMEM) &&
|
|
(__entry->gfp_flags & (__force unsigned long)__GFP_ACCOUNT)) ? "true" : "false")
|
|
);
|
|
|
|
TRACE_EVENT(kfree,
|
|
|
|
TP_PROTO(unsigned long call_site, const void *ptr),
|
|
|
|
TP_ARGS(call_site, ptr),
|
|
|
|
TP_STRUCT__entry(
|
|
__field( unsigned long, call_site )
|
|
__field( const void *, ptr )
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->call_site = call_site;
|
|
__entry->ptr = ptr;
|
|
),
|
|
|
|
TP_printk("call_site=%pS ptr=%p",
|
|
(void *)__entry->call_site, __entry->ptr)
|
|
);
|
|
|
|
TRACE_EVENT(kmem_cache_free,
|
|
|
|
TP_PROTO(unsigned long call_site, const void *ptr, const struct kmem_cache *s),
|
|
|
|
TP_ARGS(call_site, ptr, s),
|
|
|
|
TP_STRUCT__entry(
|
|
__field( unsigned long, call_site )
|
|
__field( const void *, ptr )
|
|
__string( name, s->name )
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->call_site = call_site;
|
|
__entry->ptr = ptr;
|
|
__assign_str(name, s->name);
|
|
),
|
|
|
|
TP_printk("call_site=%pS ptr=%p name=%s",
|
|
(void *)__entry->call_site, __entry->ptr, __get_str(name))
|
|
);
|
|
|
|
TRACE_EVENT(mm_page_free,
|
|
|
|
TP_PROTO(struct page *page, unsigned int order),
|
|
|
|
TP_ARGS(page, order),
|
|
|
|
TP_STRUCT__entry(
|
|
__field( unsigned long, pfn )
|
|
__field( unsigned int, order )
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->pfn = page_to_pfn(page);
|
|
__entry->order = order;
|
|
),
|
|
|
|
TP_printk("page=%p pfn=0x%lx order=%d",
|
|
pfn_to_page(__entry->pfn),
|
|
__entry->pfn,
|
|
__entry->order)
|
|
);
|
|
|
|
TRACE_EVENT(mm_page_free_batched,
|
|
|
|
TP_PROTO(struct page *page),
|
|
|
|
TP_ARGS(page),
|
|
|
|
TP_STRUCT__entry(
|
|
__field( unsigned long, pfn )
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->pfn = page_to_pfn(page);
|
|
),
|
|
|
|
TP_printk("page=%p pfn=0x%lx order=0",
|
|
pfn_to_page(__entry->pfn),
|
|
__entry->pfn)
|
|
);
|
|
|
|
TRACE_EVENT(mm_page_alloc,
|
|
|
|
TP_PROTO(struct page *page, unsigned int order,
|
|
gfp_t gfp_flags, int migratetype),
|
|
|
|
TP_ARGS(page, order, gfp_flags, migratetype),
|
|
|
|
TP_STRUCT__entry(
|
|
__field( unsigned long, pfn )
|
|
__field( unsigned int, order )
|
|
__field( unsigned long, gfp_flags )
|
|
__field( int, migratetype )
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->pfn = page ? page_to_pfn(page) : -1UL;
|
|
__entry->order = order;
|
|
__entry->gfp_flags = (__force unsigned long)gfp_flags;
|
|
__entry->migratetype = migratetype;
|
|
),
|
|
|
|
TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d gfp_flags=%s",
|
|
__entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
|
|
__entry->pfn != -1UL ? __entry->pfn : 0,
|
|
__entry->order,
|
|
__entry->migratetype,
|
|
show_gfp_flags(__entry->gfp_flags))
|
|
);
|
|
|
|
DECLARE_EVENT_CLASS(mm_page,
|
|
|
|
TP_PROTO(struct page *page, unsigned int order, int migratetype,
|
|
int percpu_refill),
|
|
|
|
TP_ARGS(page, order, migratetype, percpu_refill),
|
|
|
|
TP_STRUCT__entry(
|
|
__field( unsigned long, pfn )
|
|
__field( unsigned int, order )
|
|
__field( int, migratetype )
|
|
__field( int, percpu_refill )
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->pfn = page ? page_to_pfn(page) : -1UL;
|
|
__entry->order = order;
|
|
__entry->migratetype = migratetype;
|
|
__entry->percpu_refill = percpu_refill;
|
|
),
|
|
|
|
TP_printk("page=%p pfn=0x%lx order=%u migratetype=%d percpu_refill=%d",
|
|
__entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
|
|
__entry->pfn != -1UL ? __entry->pfn : 0,
|
|
__entry->order,
|
|
__entry->migratetype,
|
|
__entry->percpu_refill)
|
|
);
|
|
|
|
DEFINE_EVENT(mm_page, mm_page_alloc_zone_locked,
|
|
|
|
TP_PROTO(struct page *page, unsigned int order, int migratetype,
|
|
int percpu_refill),
|
|
|
|
TP_ARGS(page, order, migratetype, percpu_refill)
|
|
);
|
|
|
|
TRACE_EVENT(mm_page_pcpu_drain,
|
|
|
|
TP_PROTO(struct page *page, unsigned int order, int migratetype),
|
|
|
|
TP_ARGS(page, order, migratetype),
|
|
|
|
TP_STRUCT__entry(
|
|
__field( unsigned long, pfn )
|
|
__field( unsigned int, order )
|
|
__field( int, migratetype )
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->pfn = page ? page_to_pfn(page) : -1UL;
|
|
__entry->order = order;
|
|
__entry->migratetype = migratetype;
|
|
),
|
|
|
|
TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d",
|
|
pfn_to_page(__entry->pfn), __entry->pfn,
|
|
__entry->order, __entry->migratetype)
|
|
);
|
|
|
|
TRACE_EVENT(mm_page_alloc_extfrag,
|
|
|
|
TP_PROTO(struct page *page,
|
|
int alloc_order, int fallback_order,
|
|
int alloc_migratetype, int fallback_migratetype),
|
|
|
|
TP_ARGS(page,
|
|
alloc_order, fallback_order,
|
|
alloc_migratetype, fallback_migratetype),
|
|
|
|
TP_STRUCT__entry(
|
|
__field( unsigned long, pfn )
|
|
__field( int, alloc_order )
|
|
__field( int, fallback_order )
|
|
__field( int, alloc_migratetype )
|
|
__field( int, fallback_migratetype )
|
|
__field( int, change_ownership )
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->pfn = page_to_pfn(page);
|
|
__entry->alloc_order = alloc_order;
|
|
__entry->fallback_order = fallback_order;
|
|
__entry->alloc_migratetype = alloc_migratetype;
|
|
__entry->fallback_migratetype = fallback_migratetype;
|
|
__entry->change_ownership = (alloc_migratetype ==
|
|
get_pageblock_migratetype(page));
|
|
),
|
|
|
|
TP_printk("page=%p pfn=0x%lx alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d",
|
|
pfn_to_page(__entry->pfn),
|
|
__entry->pfn,
|
|
__entry->alloc_order,
|
|
__entry->fallback_order,
|
|
pageblock_order,
|
|
__entry->alloc_migratetype,
|
|
__entry->fallback_migratetype,
|
|
__entry->fallback_order < pageblock_order,
|
|
__entry->change_ownership)
|
|
);
|
|
|
|
/*
|
|
* Required for uniquely and securely identifying mm in rss_stat tracepoint.
|
|
*/
|
|
#ifndef __PTR_TO_HASHVAL
|
|
static unsigned int __maybe_unused mm_ptr_to_hash(const void *ptr)
|
|
{
|
|
int ret;
|
|
unsigned long hashval;
|
|
|
|
ret = ptr_to_hashval(ptr, &hashval);
|
|
if (ret)
|
|
return 0;
|
|
|
|
/* The hashed value is only 32-bit */
|
|
return (unsigned int)hashval;
|
|
}
|
|
#define __PTR_TO_HASHVAL
|
|
#endif
|
|
|
|
/*
 * List of the rss counter indices reported by the rss_stat tracepoint.
 * The list is expanded twice with different definitions of EM()/EMe();
 * EMe() marks the last entry.
 */
#define TRACE_MM_PAGES		\
	EM(MM_FILEPAGES)	\
	EM(MM_ANONPAGES)	\
	EM(MM_SWAPENTS)		\
	EMe(MM_SHMEMPAGES)

/* First expansion: emit one TRACE_DEFINE_ENUM() per entry. */
#undef EM
#undef EMe

#define EM(a)	TRACE_DEFINE_ENUM(a);
#define EMe(a)	TRACE_DEFINE_ENUM(a);

TRACE_MM_PAGES

/*
 * Second definition: each entry becomes a { value, "name" } pair, with no
 * trailing comma after the last one. These definitions are left in place
 * so that TRACE_MM_PAGES can be passed to __print_symbolic() in rss_stat.
 */
#undef EM
#undef EMe

#define EM(a)	{ a, #a },
#define EMe(a)	{ a, #a }
|
|
|
|
TRACE_EVENT(rss_stat,
|
|
|
|
TP_PROTO(struct mm_struct *mm,
|
|
int member),
|
|
|
|
TP_ARGS(mm, member),
|
|
|
|
TP_STRUCT__entry(
|
|
__field(unsigned int, mm_id)
|
|
__field(unsigned int, curr)
|
|
__field(int, member)
|
|
__field(long, size)
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->mm_id = mm_ptr_to_hash(mm);
|
|
__entry->curr = !!(current->mm == mm);
|
|
__entry->member = member;
|
|
__entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member])
|
|
<< PAGE_SHIFT);
|
|
),
|
|
|
|
TP_printk("mm_id=%u curr=%d type=%s size=%ldB",
|
|
__entry->mm_id,
|
|
__entry->curr,
|
|
__print_symbolic(__entry->member, TRACE_MM_PAGES),
|
|
__entry->size)
|
|
);
|
|
#endif /* _TRACE_KMEM_H */
|
|
|
|
/* This part must be outside protection */
|
|
#include <trace/define_trace.h>
|