linux/lib/zstd/common/compiler.h
Nick Terrell 1974990cca lib: zstd: Don't inline functions in zstd_opt.c
`zstd_opt.c` contains the match finder for the highest compression
levels. These levels are already very slow, and are unlikely to be used
in the kernel. If they are used, they shouldn't be used in latency
sensitive workloads, so slowing them down shouldn't be a big deal.

This saves 188 KB of the 288 KB regression reported by Geert Uytterhoeven [0].
I've also opened an issue upstream [1] so that we can properly tackle
the code size issue in `zstd_opt.c` for all users, and can hopefully
remove this hack in the next zstd version we import.

Bloat-o-meter output on x86-64:

```
> ../scripts/bloat-o-meter vmlinux.old vmlinux
add/remove: 6/5 grow/shrink: 1/9 up/down: 16673/-209939 (-193266)
Function                                     old     new   delta
ZSTD_compressBlock_opt_generic.constprop       -    7559   +7559
ZSTD_insertBtAndGetAllMatches                  -    6304   +6304
ZSTD_insertBt1                                 -    1731   +1731
ZSTD_storeSeq                                  -     693    +693
ZSTD_BtGetAllMatches                           -     255    +255
ZSTD_updateRep                                 -     128    +128
ZSTD_updateTree                               96      99      +3
ZSTD_insertAndFindFirstIndexHash3             81       -     -81
ZSTD_setBasePrices.constprop                  98       -     -98
ZSTD_litLengthPrice.constprop                138       -    -138
ZSTD_count                                   362     181    -181
ZSTD_count_2segments                        1407     938    -469
ZSTD_insertBt1.constprop                    2689       -   -2689
ZSTD_compressBlock_btultra2                19990     423  -19567
ZSTD_compressBlock_btultra                 19633      15  -19618
ZSTD_initStats_ultra                       19825       -  -19825
ZSTD_compressBlock_btopt                   20374      12  -20362
ZSTD_compressBlock_btopt_extDict           29984      12  -29972
ZSTD_compressBlock_btultra_extDict         30718      15  -30703
ZSTD_compressBlock_btopt_dictMatchState    32689      12  -32677
ZSTD_compressBlock_btultra_dictMatchState   33574      15  -33559
Total: Before=6611828, After=6418562, chg -2.92%
```

[0] https://lkml.org/lkml/2021/11/14/189
[1] https://github.com/facebook/zstd/issues/2862

Link: https://lore.kernel.org/r/20211117014949.1169186-3-nickrterrell@gmail.com/
Link: https://lore.kernel.org/r/20211117201459.1194876-3-nickrterrell@gmail.com/

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Tested-by: Geert Uytterhoeven <geert@linux-m68k.org>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Nick Terrell <terrelln@fb.com>
2021-11-18 13:15:33 -08:00

178 lines
5.7 KiB
C

/*
* Copyright (c) Yann Collet, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/
#ifndef ZSTD_COMPILER_H
#define ZSTD_COMPILER_H
/*-*******************************************************
* Compiler specifics
*********************************************************/
/* force inlining */
#if !defined(ZSTD_NO_INLINE)
#if (defined(__GNUC__) && !defined(__STRICT_ANSI__)) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */
# define INLINE_KEYWORD inline
#else
# define INLINE_KEYWORD
#endif
#define FORCE_INLINE_ATTR __attribute__((always_inline))
#else
#define INLINE_KEYWORD
#define FORCE_INLINE_ATTR
#endif
/*
On MSVC qsort requires that functions passed into it use the __cdecl calling conversion(CC).
This explictly marks such functions as __cdecl so that the code will still compile
if a CC other than __cdecl has been made the default.
*/
#define WIN_CDECL
/*
* FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant
* parameters. They must be inlined for the compiler to eliminate the constant
* branches.
*/
#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR
/*
* HINT_INLINE is used to help the compiler generate better code. It is *not*
* used for "templates", so it can be tweaked based on the compilers
* performance.
*
* gcc-4.8 and gcc-4.9 have been shown to benefit from leaving off the
* always_inline attribute.
*
* clang up to 5.0.0 (trunk) benefit tremendously from the always_inline
* attribute.
*/
#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5
# define HINT_INLINE static INLINE_KEYWORD
#else
# define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR
#endif
/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */
#define UNUSED_ATTR __attribute__((unused))
/* force no inlining */
#define FORCE_NOINLINE static __attribute__((__noinline__))
/* target attribute */
#ifndef __has_attribute
#define __has_attribute(x) 0 /* Compatibility with non-clang compilers. */
#endif
#define TARGET_ATTRIBUTE(target) __attribute__((__target__(target)))
/* Enable runtime BMI2 dispatch based on the CPU.
* Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default.
*/
#ifndef DYNAMIC_BMI2
#if ((defined(__clang__) && __has_attribute(__target__)) \
|| (defined(__GNUC__) \
&& (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \
&& (defined(__x86_64__) || defined(_M_X86)) \
&& !defined(__BMI2__)
# define DYNAMIC_BMI2 1
#else
# define DYNAMIC_BMI2 0
#endif
#endif
/* prefetch
* can be disabled, by declaring NO_PREFETCH build macro */
#if ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
# define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
# define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
#elif defined(__aarch64__)
# define PREFETCH_L1(ptr) __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr)))
# define PREFETCH_L2(ptr) __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr)))
#else
# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */
# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */
#endif /* NO_PREFETCH */
#define CACHELINE_SIZE 64
#define PREFETCH_AREA(p, s) { \
const char* const _ptr = (const char*)(p); \
size_t const _size = (size_t)(s); \
size_t _pos; \
for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \
PREFETCH_L2(_ptr + _pos); \
} \
}
/* vectorization
* older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax */
#if !defined(__INTEL_COMPILER) && !defined(__clang__) && defined(__GNUC__)
# if (__GNUC__ == 4 && __GNUC_MINOR__ > 3) || (__GNUC__ >= 5)
# define DONT_VECTORIZE __attribute__((optimize("no-tree-vectorize")))
# else
# define DONT_VECTORIZE _Pragma("GCC optimize(\"no-tree-vectorize\")")
# endif
#else
# define DONT_VECTORIZE
#endif
/* Tell the compiler that a branch is likely or unlikely.
* Only use these macros if it causes the compiler to generate better code.
* If you can remove a LIKELY/UNLIKELY annotation without speed changes in gcc
* and clang, please do.
*/
#define LIKELY(x) (__builtin_expect((x), 1))
#define UNLIKELY(x) (__builtin_expect((x), 0))
/* disable warnings */
/*Like DYNAMIC_BMI2 but for compile time determination of BMI2 support*/
/* compat. with non-clang compilers */
#ifndef __has_builtin
# define __has_builtin(x) 0
#endif
/* compat. with non-clang compilers */
#ifndef __has_feature
# define __has_feature(x) 0
#endif
/* C-language Attributes are added in C23. */
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute)
# define ZSTD_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
#else
# define ZSTD_HAS_C_ATTRIBUTE(x) 0
#endif
/* Only use C++ attributes in C++. Some compilers report support for C++
* attributes when compiling with C.
*/
#define ZSTD_HAS_CPP_ATTRIBUTE(x) 0
/* Define ZSTD_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute.
* - C23: https://en.cppreference.com/w/c/language/attributes/fallthrough
* - CPP17: https://en.cppreference.com/w/cpp/language/attributes/fallthrough
* - Else: __attribute__((__fallthrough__))
*/
#define ZSTD_FALLTHROUGH fallthrough
/* detects whether we are being compiled under msan */
/* detects whether we are being compiled under asan */
#endif /* ZSTD_COMPILER_H */