mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2024-11-17 17:24:17 +08:00
b5c56e0cdd
CONFIG_RETPOLINE has severely degraded indirect function call performance, so it's worth putting some effort into reducing the number of times cmp() is called. This patch avoids badly unbalanced merges on unlucky input sizes. It slightly increases the code size, but saves an average of 0.2*n calls to cmp(). x86-64 code size 739 -> 803 bytes (+64) Unfortunately, there's not a lot of low-hanging fruit in a merge sort; it already performs only n*log2(n) - K*n + O(1) compares. The leading coefficient is already at the theoretical limit (log2(n!) corresponds to K=1.4427), so we're fighting over the linear term, and the best mergesort can do is K=1.2645, achieved when n is a power of 2. The differences between mergesort variants appear when n is *not* a power of 2; K is a function of the fractional part of log2(n). Top-down mergesort does best of all, achieving a minimum K=1.2408, and an average (over all sizes) K=1.248. However, that requires knowing the number of entries to be sorted ahead of time, and making a full pass over the input to count it conflicts with a second performance goal, which is cache blocking. Obviously, we have to read the entire list into L1 cache at some point, and performance is best if it fits. But if it doesn't fit, each full pass over the input causes a cache miss per element, which is undesirable. While textbooks explain bottom-up mergesort as a succession of merging passes, practical implementations do merging in depth-first order: as soon as two lists of the same size are available, they are merged. This allows as many merge passes as possible to fit into L1; only the final few merges force cache misses. This cache-friendly depth-first merge order depends on us merging the beginning of the input as much as possible before we've even seen the end of the input (and thus know its size). The simple eager merge pattern causes bad performance when n is just over a power of 2. 
If n=1028, the final merge is between 1024- and 4-element lists, which is wasteful of comparisons. (This is actually worse on average than n=1025, because a 1024:1 merge will, on average, end after 512 compares, while 1024:4 will walk 4/5 of the list.) Because of this, bottom-up mergesort achieves K < 0.5 for such sizes, and has an average (over all sizes) K of around 1. (My experiments show K=1.01, while theory predicts K=0.965.) There are "worst-case optimal" variants of bottom-up mergesort which avoid this bad performance, but the algorithms given in the literature, such as queue-mergesort and boustrophedonic mergesort, depend on the breadth-first multi-pass structure that we are trying to avoid. This implementation is as eager as possible while ensuring that all merge passes are at worst 1:2 unbalanced. This achieves the same average K=1.207 as queue-mergesort, which is 0.2*n better than bottom-up, and only 0.04*n behind top-down mergesort. Specifically, it defers merging two lists of size 2^k until it is known that there are 2^k additional inputs following. This ensures that the final uneven merges triggered by reaching the end of the input will be at worst 2:1. This will avoid cache misses as long as 3*2^k elements fit into the cache. (I confess to being more than a little bit proud of how clean this code turned out. It took a lot of thinking, but the resultant inner loop is very simple and efficient.) Refs: Bottom-up Mergesort: A Detailed Analysis Wolfgang Panny, Helmut Prodinger Algorithmica 14(4):340--354, October 1995 https://doi.org/10.1007/BF01294131 https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.6.5260 The cost distribution of queue-mergesort, optimal mergesorts, and power-of-two rules Wei-Mei Chen, Hsien-Kuei Hwang, Gen-Huey Chen Journal of Algorithms 30(2); Pages 423--448, February 1999 https://doi.org/10.1006/jagm.1998.0986 https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.4.5380 Queue-Mergesort Mordecai J. 
Golin, Robert Sedgewick Information Processing Letters, 48(5):253--259, 10 December 1993 https://doi.org/10.1016/0020-0190(93)90088-q https://sci-hub.tw/10.1016/0020-0190(93)90088-Q Feedback from Rasmus Villemoes <linux@rasmusvillemoes.dk>. Link: http://lkml.kernel.org/r/fd560853cc4dca0d0f02184ffa888b4c1be89abc.1552704200.git.lkml@sdf.org Signed-off-by: George Spelvin <lkml@sdf.org> Acked-by: Andrey Abramov <st5pub@yandex.ru> Acked-by: Rasmus Villemoes <linux@rasmusvillemoes.dk> Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com> Cc: Daniel Wagner <daniel.wagner@siemens.com> Cc: Dave Chinner <dchinner@redhat.com> Cc: Don Mullis <don.mullis@gmail.com> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
256 lines
8.4 KiB
C
256 lines
8.4 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
#include <linux/kernel.h>
|
|
#include <linux/bug.h>
|
|
#include <linux/compiler.h>
|
|
#include <linux/export.h>
|
|
#include <linux/string.h>
|
|
#include <linux/list_sort.h>
|
|
#include <linux/list.h>
|
|
|
|
/*
 * Internal comparator type used by the merge helpers in this file.
 * It takes an opaque context pointer followed by the two list nodes to
 * compare; both node pointers are const-qualified and declared non-null.
 */
typedef int __attribute__((nonnull(2,3))) (*cmp_func)(void *,
		const struct list_head *, const struct list_head *);
|
|
|
|
/*
|
|
* Returns a list organized in an intermediate format suited
|
|
* to chaining of merge() calls: null-terminated, no reserved or
|
|
* sentinel head node, "prev" links not maintained.
|
|
*/
|
|
__attribute__((nonnull(2,3,4)))
|
|
static struct list_head *merge(void *priv, cmp_func cmp,
|
|
struct list_head *a, struct list_head *b)
|
|
{
|
|
struct list_head *head, **tail = &head;
|
|
|
|
for (;;) {
|
|
/* if equal, take 'a' -- important for sort stability */
|
|
if (cmp(priv, a, b) <= 0) {
|
|
*tail = a;
|
|
tail = &a->next;
|
|
a = a->next;
|
|
if (!a) {
|
|
*tail = b;
|
|
break;
|
|
}
|
|
} else {
|
|
*tail = b;
|
|
tail = &b->next;
|
|
b = b->next;
|
|
if (!b) {
|
|
*tail = a;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
return head;
|
|
}
|
|
|
|
/*
|
|
* Combine final list merge with restoration of standard doubly-linked
|
|
* list structure. This approach duplicates code from merge(), but
|
|
* runs faster than the tidier alternatives of either a separate final
|
|
* prev-link restoration pass, or maintaining the prev links
|
|
* throughout.
|
|
*/
|
|
__attribute__((nonnull(2,3,4,5)))
|
|
static void merge_final(void *priv, cmp_func cmp, struct list_head *head,
|
|
struct list_head *a, struct list_head *b)
|
|
{
|
|
struct list_head *tail = head;
|
|
u8 count = 0;
|
|
|
|
for (;;) {
|
|
/* if equal, take 'a' -- important for sort stability */
|
|
if (cmp(priv, a, b) <= 0) {
|
|
tail->next = a;
|
|
a->prev = tail;
|
|
tail = a;
|
|
a = a->next;
|
|
if (!a)
|
|
break;
|
|
} else {
|
|
tail->next = b;
|
|
b->prev = tail;
|
|
tail = b;
|
|
b = b->next;
|
|
if (!b) {
|
|
b = a;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Finish linking remainder of list b on to tail */
|
|
tail->next = b;
|
|
do {
|
|
/*
|
|
* If the merge is highly unbalanced (e.g. the input is
|
|
* already sorted), this loop may run many iterations.
|
|
* Continue callbacks to the client even though no
|
|
* element comparison is needed, so the client's cmp()
|
|
* routine can invoke cond_resched() periodically.
|
|
*/
|
|
if (unlikely(!++count))
|
|
cmp(priv, b, b);
|
|
b->prev = tail;
|
|
tail = b;
|
|
b = b->next;
|
|
} while (b);
|
|
|
|
/* And the final links to make a circular doubly-linked list */
|
|
tail->next = head;
|
|
head->prev = tail;
|
|
}
|
|
|
|
/**
|
|
* list_sort - sort a list
|
|
* @priv: private data, opaque to list_sort(), passed to @cmp
|
|
* @head: the list to sort
|
|
* @cmp: the elements comparison function
|
|
*
|
|
* The comparison function @cmp must return > 0 if @a should sort after
|
|
* @b ("@a > @b" if you want an ascending sort), and <= 0 if @a should
|
|
* sort before @b *or* their original order should be preserved. It is
|
|
* always called with the element that came first in the input in @a,
|
|
* and list_sort is a stable sort, so it is not necessary to distinguish
|
|
* the @a < @b and @a == @b cases.
|
|
*
|
|
* This is compatible with two styles of @cmp function:
|
|
* - The traditional style which returns <0 / =0 / >0, or
|
|
* - Returning a boolean 0/1.
|
|
* The latter offers a chance to save a few cycles in the comparison
|
|
* (which is used by e.g. plug_ctx_cmp() in block/blk-mq.c).
|
|
*
|
|
* A good way to write a multi-word comparison is
|
|
* if (a->high != b->high)
|
|
* return a->high > b->high;
|
|
* if (a->middle != b->middle)
|
|
* return a->middle > b->middle;
|
|
* return a->low > b->low;
|
|
*
|
|
*
|
|
* This mergesort is as eager as possible while always performing at least
|
|
* 2:1 balanced merges. Given two pending sublists of size 2^k, they are
|
|
* merged to a size-2^(k+1) list as soon as we have 2^k following elements.
|
|
*
|
|
* Thus, it will avoid cache thrashing as long as 3*2^k elements can
|
|
* fit into the cache. Not quite as good as a fully-eager bottom-up
|
|
* mergesort, but it does use 0.2*n fewer comparisons, so is faster in
|
|
* the common case that everything fits into L1.
|
|
*
|
|
*
|
|
* The merging is controlled by "count", the number of elements in the
|
|
* pending lists. This is beautifully simple code, but rather subtle.
|
|
*
|
|
* Each time we increment "count", we set one bit (bit k) and clear
|
|
* bits k-1 .. 0. Each time this happens (except the very first time
|
|
* for each bit, when count increments to 2^k), we merge two lists of
|
|
* size 2^k into one list of size 2^(k+1).
|
|
*
|
|
* This merge happens exactly when the count reaches an odd multiple of
|
|
* 2^k, which is when we have 2^k elements pending in smaller lists,
|
|
* so it's safe to merge away two lists of size 2^k.
|
|
*
|
|
* After this happens twice, we have created two lists of size 2^(k+1),
|
|
* which will be merged into a list of size 2^(k+2) before we create
|
|
* a third list of size 2^(k+1), so there are never more than two pending.
|
|
*
|
|
* The number of pending lists of size 2^k is determined by the
|
|
* state of bit k of "count" plus two extra pieces of information:
|
|
* - The state of bit k-1 (when k == 0, consider bit -1 always set), and
|
|
* - Whether the higher-order bits are zero or non-zero (i.e.
|
|
* is count >= 2^(k+1)).
|
|
* There are six states we distinguish. "x" represents some arbitrary
|
|
* bits, and "y" represents some arbitrary non-zero bits:
|
|
* 0: 00x: 0 pending of size 2^k; x pending of sizes < 2^k
|
|
* 1: 01x: 0 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
|
|
* 2: x10x: 0 pending of size 2^k; 2^k + x pending of sizes < 2^k
|
|
* 3: x11x: 1 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
|
|
* 4: y00x: 1 pending of size 2^k; 2^k + x pending of sizes < 2^k
|
|
* 5: y01x: 2 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
|
|
* (merge and loop back to state 2)
|
|
*
|
|
* We gain lists of size 2^k in the 2->3 and 4->5 transitions (because
|
|
* bit k-1 is set while the more significant bits are non-zero) and
|
|
* merge them away in the 5->2 transition. Note in particular that just
|
|
* before the 5->2 transition, all lower-order bits are 11 (state 3),
|
|
* so there is one list of each smaller size.
|
|
*
|
|
* When we reach the end of the input, we merge all the pending
|
|
* lists, from smallest to largest. If you work through cases 2 to
|
|
* 5 above, you can see that the number of elements we merge with a list
|
|
* of size 2^k varies from 2^(k-1) (cases 3 and 5 when x == 0) to
|
|
* 2^(k+1) - 1 (second merge of case 5 when x == 2^(k-1) - 1).
|
|
*/
|
|
__attribute__((nonnull(2,3)))
|
|
void list_sort(void *priv, struct list_head *head,
|
|
int (*cmp)(void *priv, struct list_head *a,
|
|
struct list_head *b))
|
|
{
|
|
struct list_head *list = head->next, *pending = NULL;
|
|
size_t count = 0; /* Count of pending */
|
|
|
|
if (list == head->prev) /* Zero or one elements */
|
|
return;
|
|
|
|
/* Convert to a null-terminated singly-linked list. */
|
|
head->prev->next = NULL;
|
|
|
|
/*
|
|
* Data structure invariants:
|
|
* - All lists are singly linked and null-terminated; prev
|
|
* pointers are not maintained.
|
|
* - pending is a prev-linked "list of lists" of sorted
|
|
* sublists awaiting further merging.
|
|
* - Each of the sorted sublists is power-of-two in size.
|
|
* - Sublists are sorted by size and age, smallest & newest at front.
|
|
* - There are zero to two sublists of each size.
|
|
* - A pair of pending sublists are merged as soon as the number
|
|
* of following pending elements equals their size (i.e.
|
|
* each time count reaches an odd multiple of that size).
|
|
* That ensures each later final merge will be at worst 2:1.
|
|
* - Each round consists of:
|
|
* - Merging the two sublists selected by the highest bit
|
|
* which flips when count is incremented, and
|
|
* - Adding an element from the input as a size-1 sublist.
|
|
*/
|
|
do {
|
|
size_t bits;
|
|
struct list_head **tail = &pending;
|
|
|
|
/* Find the least-significant clear bit in count */
|
|
for (bits = count; bits & 1; bits >>= 1)
|
|
tail = &(*tail)->prev;
|
|
/* Do the indicated merge */
|
|
if (likely(bits)) {
|
|
struct list_head *a = *tail, *b = a->prev;
|
|
|
|
a = merge(priv, (cmp_func)cmp, b, a);
|
|
/* Install the merged result in place of the inputs */
|
|
a->prev = b->prev;
|
|
*tail = a;
|
|
}
|
|
|
|
/* Move one element from input list to pending */
|
|
list->prev = pending;
|
|
pending = list;
|
|
list = list->next;
|
|
pending->next = NULL;
|
|
count++;
|
|
} while (list);
|
|
|
|
/* End of input; merge together all the pending lists. */
|
|
list = pending;
|
|
pending = pending->prev;
|
|
for (;;) {
|
|
struct list_head *next = pending->prev;
|
|
|
|
if (!next)
|
|
break;
|
|
list = merge(priv, (cmp_func)cmp, pending, list);
|
|
pending = next;
|
|
}
|
|
/* The final merge, rebuilding prev links */
|
|
merge_final(priv, (cmp_func)cmp, head, pending, list);
|
|
}
|
|
EXPORT_SYMBOL(list_sort);
|