mirror of
https://github.com/coreutils/coreutils.git
synced 2024-12-12 03:15:15 +08:00
wc: use avx2 optimization when counting only lines
Use cpuid to detect CPU support for avx2 instructions. Performance was seen to improve by 5x for a file with only newlines, while the performance for a file with no such characters is unchanged. * configure.ac [USE_AVX2_WC_LINECOUNT]: A new conditional, set when __get_cpuid_count() and avx2 compiler intrinsics are supported. * src/wc.c (avx2_supported): A new function using __get_cpuid_count() to determine if avx2 instructions are supported. (wc_lines): A new function refactored from wc(), which implements the standard line counting logic, and provides the fallback implementation for when avx2 is not supported. * src/wc_avx2.c: A new module to implement using avx2 intrinsics. * src/local.mk: Reference the new module. Note we build as a separate lib so that it can be portably built with separate -mavx2 etc. flags.
This commit is contained in:
parent
d435cfc0bc
commit
4064c57380
49
configure.ac
49
configure.ac
@ -575,6 +575,55 @@ AM_CONDITIONAL([USE_PCLMUL_CRC32],
|
|||||||
test "x$pclmul_intrinsic_exists" = "xyes"])
|
test "x$pclmul_intrinsic_exists" = "xyes"])
|
||||||
CFLAGS=$ac_save_CFLAGS
|
CFLAGS=$ac_save_CFLAGS
|
||||||
|
|
||||||
|
AC_MSG_CHECKING([if __get_cpuid_count exists])
|
||||||
|
AC_COMPILE_IFELSE(
|
||||||
|
[AC_LANG_SOURCE([[
|
||||||
|
#include <cpuid.h>
|
||||||
|
|
||||||
|
int main(void)
|
||||||
|
{
|
||||||
|
unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
|
||||||
|
__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
]])
|
||||||
|
],[
|
||||||
|
AC_MSG_RESULT([yes])
|
||||||
|
get_cpuid_count_exists=yes
|
||||||
|
],[
|
||||||
|
AC_MSG_RESULT([no])
|
||||||
|
])
|
||||||
|
|
||||||
|
CFLAGS="-mavx2 $CFLAGS"
|
||||||
|
AC_MSG_CHECKING([if avx2 intrinstics exists])
|
||||||
|
AC_COMPILE_IFELSE(
|
||||||
|
[AC_LANG_SOURCE([[
|
||||||
|
#include <x86intrin.h>
|
||||||
|
|
||||||
|
int main(void)
|
||||||
|
{
|
||||||
|
__m256i a, b;
|
||||||
|
a = _mm256_sad_epu8(a, b);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
]])
|
||||||
|
],[
|
||||||
|
AC_MSG_RESULT([yes])
|
||||||
|
AC_DEFINE([HAVE_AVX2_INTRINSIC], [1], [avx2 intrinsics exists])
|
||||||
|
avx2_intrinsic_exists=yes
|
||||||
|
],[
|
||||||
|
AC_MSG_RESULT([no])
|
||||||
|
])
|
||||||
|
if test "x$get_cpuid_count_exists" = "xyes" &&
|
||||||
|
test "x$avx2_intrinsic_exists" = "xyes"; then
|
||||||
|
AC_DEFINE([USE_AVX2_WC_LINECOUNT], [1], [Counting lines with AVX2 enabled])
|
||||||
|
fi
|
||||||
|
AM_CONDITIONAL([USE_AVX2_WC_LINECOUNT],
|
||||||
|
[test "x$get_cpuid_count_exists" = "xyes" &&
|
||||||
|
test "x$avx2_intrinsic_exists" = "xyes"])
|
||||||
|
|
||||||
|
CFLAGS=$ac_save_CFLAGS
|
||||||
|
|
||||||
############################################################################
|
############################################################################
|
||||||
|
|
||||||
dnl Autogenerated by the 'gen-lists-of-programs.sh' auxiliary script.
|
dnl Autogenerated by the 'gen-lists-of-programs.sh' auxiliary script.
|
||||||
|
@ -427,6 +427,15 @@ src_basenc_CPPFLAGS = -DBASE_TYPE=42 $(AM_CPPFLAGS)
|
|||||||
src_expand_SOURCES = src/expand.c src/expand-common.c
|
src_expand_SOURCES = src/expand.c src/expand-common.c
|
||||||
src_unexpand_SOURCES = src/unexpand.c src/expand-common.c
|
src_unexpand_SOURCES = src/unexpand.c src/expand-common.c
|
||||||
|
|
||||||
|
src_wc_SOURCES = src/wc.c
|
||||||
|
if USE_AVX2_WC_LINECOUNT
|
||||||
|
noinst_LIBRARIES += src/libwc_avx2.a
|
||||||
|
src_libwc_avx2_a_SOURCES = src/wc_avx2.c
|
||||||
|
wc_avx2_ldadd = src/libwc_avx2.a
|
||||||
|
src_wc_LDADD += $(wc_avx2_ldadd)
|
||||||
|
src_libwc_avx2_a_CFLAGS = -mavx2 $(AM_CFLAGS)
|
||||||
|
endif
|
||||||
|
|
||||||
# Ensure we don't link against libcoreutils.a as that lib is
|
# Ensure we don't link against libcoreutils.a as that lib is
|
||||||
# not compiled with -fPIC which causes issues on 64 bit at least
|
# not compiled with -fPIC which causes issues on 64 bit at least
|
||||||
src_libstdbuf_so_LDADD = $(LIBINTL)
|
src_libstdbuf_so_LDADD = $(LIBINTL)
|
||||||
|
157
src/wc.c
157
src/wc.c
@ -37,6 +37,9 @@
|
|||||||
#include "safe-read.h"
|
#include "safe-read.h"
|
||||||
#include "stat-size.h"
|
#include "stat-size.h"
|
||||||
#include "xbinary-io.h"
|
#include "xbinary-io.h"
|
||||||
|
#ifdef USE_AVX2_WC_LINECOUNT
|
||||||
|
# include <cpuid.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#if !defined iswspace && !HAVE_ISWSPACE
|
#if !defined iswspace && !HAVE_ISWSPACE
|
||||||
# define iswspace(wc) \
|
# define iswspace(wc) \
|
||||||
@ -53,6 +56,20 @@
|
|||||||
/* Size of atomic reads. */
|
/* Size of atomic reads. */
|
||||||
#define BUFFER_SIZE (16 * 1024)
|
#define BUFFER_SIZE (16 * 1024)
|
||||||
|
|
||||||
|
static bool
|
||||||
|
wc_lines (char const *file, int fd, uintmax_t *lines_out,
|
||||||
|
uintmax_t *bytes_out);
|
||||||
|
#ifdef USE_AVX2_WC_LINECOUNT
|
||||||
|
/* From wc_avx2.c */
|
||||||
|
extern bool
|
||||||
|
wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
|
||||||
|
uintmax_t *bytes_out);
|
||||||
|
#endif
|
||||||
|
static bool
|
||||||
|
(*wc_lines_p) (char const *file, int fd, uintmax_t *lines_out,
|
||||||
|
uintmax_t *bytes_out) = wc_lines;
|
||||||
|
|
||||||
|
|
||||||
/* Cumulative number of lines, words, chars and bytes in all files so far.
|
/* Cumulative number of lines, words, chars and bytes in all files so far.
|
||||||
max_line_length is the maximum over all files processed so far. */
|
max_line_length is the maximum over all files processed so far. */
|
||||||
static uintmax_t total_lines;
|
static uintmax_t total_lines;
|
||||||
@ -108,6 +125,33 @@ static struct option const longopts[] =
|
|||||||
{NULL, 0, NULL, 0}
|
{NULL, 0, NULL, 0}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#ifdef USE_AVX2_WC_LINECOUNT
|
||||||
|
static bool
|
||||||
|
avx2_supported (void)
|
||||||
|
{
|
||||||
|
unsigned int eax = 0;
|
||||||
|
unsigned int ebx = 0;
|
||||||
|
unsigned int ecx = 0;
|
||||||
|
unsigned int edx = 0;
|
||||||
|
|
||||||
|
if (! __get_cpuid (1, &eax, &ebx, &ecx, &edx))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (! (ecx & bit_OSXSAVE))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
eax = ebx = ecx = edx = 0;
|
||||||
|
|
||||||
|
if (! __get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (! (ebx & bit_AVX2))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
void
|
void
|
||||||
usage (int status)
|
usage (int status)
|
||||||
{
|
{
|
||||||
@ -208,6 +252,70 @@ write_counts (uintmax_t lines,
|
|||||||
putchar ('\n');
|
putchar ('\n');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
wc_lines (char const *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out)
|
||||||
|
{
|
||||||
|
size_t bytes_read;
|
||||||
|
uintmax_t lines, bytes;
|
||||||
|
char buf[BUFFER_SIZE + 1];
|
||||||
|
bool long_lines = false;
|
||||||
|
|
||||||
|
if (!lines_out || !bytes_out)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
lines = bytes = 0;
|
||||||
|
|
||||||
|
while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (bytes_read == SAFE_READ_ERROR)
|
||||||
|
{
|
||||||
|
error (0, errno, "%s", quotef (file));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bytes += bytes_read;
|
||||||
|
|
||||||
|
char *p = buf;
|
||||||
|
char *end = buf + bytes_read;
|
||||||
|
uintmax_t plines = lines;
|
||||||
|
|
||||||
|
if (! long_lines)
|
||||||
|
{
|
||||||
|
/* Avoid function call overhead for shorter lines. */
|
||||||
|
while (p != end)
|
||||||
|
lines += *p++ == '\n';
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* memchr is more efficient with longer lines. */
|
||||||
|
while ((p = memchr (p, '\n', end - p)))
|
||||||
|
{
|
||||||
|
++p;
|
||||||
|
++lines;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If the average line length in the block is >= 15, then use
|
||||||
|
memchr for the next block, where system specific optimizations
|
||||||
|
may outweigh function call overhead.
|
||||||
|
FIXME: This line length was determined in 2015, on both
|
||||||
|
x86_64 and ppc64, but it's worth re-evaluating in future with
|
||||||
|
newer compilers, CPUs, or memchr() implementations etc. */
|
||||||
|
if (lines - plines <= bytes_read / 15)
|
||||||
|
long_lines = true;
|
||||||
|
else
|
||||||
|
long_lines = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
*bytes_out = bytes;
|
||||||
|
*lines_out = lines;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
/* Count words. FILE_X is the name of the file (or NULL for standard
|
/* Count words. FILE_X is the name of the file (or NULL for standard
|
||||||
input) that is open on descriptor FD. *FSTATUS is its status.
|
input) that is open on descriptor FD. *FSTATUS is its status.
|
||||||
CURRENT_POS is the current file offset if known, negative if unknown.
|
CURRENT_POS is the current file offset if known, negative if unknown.
|
||||||
@ -312,49 +420,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
|
|||||||
{
|
{
|
||||||
/* Use a separate loop when counting only lines or lines and bytes --
|
/* Use a separate loop when counting only lines or lines and bytes --
|
||||||
but not chars or words. */
|
but not chars or words. */
|
||||||
bool long_lines = false;
|
ok = wc_lines_p (file, fd, &lines, &bytes);
|
||||||
while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
|
|
||||||
{
|
|
||||||
if (bytes_read == SAFE_READ_ERROR)
|
|
||||||
{
|
|
||||||
error (0, errno, "%s", quotef (file));
|
|
||||||
ok = false;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
bytes += bytes_read;
|
|
||||||
|
|
||||||
char *p = buf;
|
|
||||||
char *end = p + bytes_read;
|
|
||||||
uintmax_t plines = lines;
|
|
||||||
|
|
||||||
if (! long_lines)
|
|
||||||
{
|
|
||||||
/* Avoid function call overhead for shorter lines. */
|
|
||||||
while (p != end)
|
|
||||||
lines += *p++ == '\n';
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/* memchr is more efficient with longer lines. */
|
|
||||||
while ((p = memchr (p, '\n', end - p)))
|
|
||||||
{
|
|
||||||
++p;
|
|
||||||
++lines;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* If the average line length in the block is >= 15, then use
|
|
||||||
memchr for the next block, where system specific optimizations
|
|
||||||
may outweigh function call overhead.
|
|
||||||
FIXME: This line length was determined in 2015, on both
|
|
||||||
x86_64 and ppc64, but it's worth re-evaluating in future with
|
|
||||||
newer compilers, CPUs, or memchr() implementations etc. */
|
|
||||||
if (lines - plines <= bytes_read / 15)
|
|
||||||
long_lines = true;
|
|
||||||
else
|
|
||||||
long_lines = false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#if MB_LEN_MAX > 1
|
#if MB_LEN_MAX > 1
|
||||||
# define SUPPORT_OLD_MBRTOWC 1
|
# define SUPPORT_OLD_MBRTOWC 1
|
||||||
@ -706,6 +772,11 @@ main (int argc, char **argv)
|
|||||||
print_linelength = false;
|
print_linelength = false;
|
||||||
total_lines = total_words = total_chars = total_bytes = max_line_length = 0;
|
total_lines = total_words = total_chars = total_bytes = max_line_length = 0;
|
||||||
|
|
||||||
|
#ifdef USE_AVX2_WC_LINECOUNT
|
||||||
|
if (avx2_supported ())
|
||||||
|
wc_lines_p = wc_lines_avx2;
|
||||||
|
#endif
|
||||||
|
|
||||||
while ((optc = getopt_long (argc, argv, "clLmw", longopts, NULL)) != -1)
|
while ((optc = getopt_long (argc, argv, "clLmw", longopts, NULL)) != -1)
|
||||||
switch (optc)
|
switch (optc)
|
||||||
{
|
{
|
||||||
|
122
src/wc_avx2.c
Normal file
122
src/wc_avx2.c
Normal file
@ -0,0 +1,122 @@
|
|||||||
|
/* wc_avx - Count the number of newlines with avx2 instructions.
|
||||||
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
|
||||||
|
This program is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program. If not, see <https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#include <config.h>
|
||||||
|
|
||||||
|
#include "system.h"
|
||||||
|
#include "error.h"
|
||||||
|
#include "safe-read.h"
|
||||||
|
|
||||||
|
#include <x86intrin.h>
|
||||||
|
|
||||||
|
/* This must be below 16 KB (16384) or else the accumulators can
|
||||||
|
theoretically overflow, producing wrong result. This is 2*32 bytes below,
|
||||||
|
so there is no single bytes in the optimal case. */
|
||||||
|
#define BUFSIZE (16320)
|
||||||
|
|
||||||
|
extern bool
|
||||||
|
wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
|
||||||
|
uintmax_t *bytes_out);
|
||||||
|
|
||||||
|
extern bool
|
||||||
|
wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
|
||||||
|
uintmax_t *bytes_out)
|
||||||
|
{
|
||||||
|
__m256i accumulator;
|
||||||
|
__m256i accumulator2;
|
||||||
|
__m256i zeroes;
|
||||||
|
__m256i endlines;
|
||||||
|
__m256i avx_buf[BUFSIZE / sizeof (__m256i)];
|
||||||
|
__m256i *datap;
|
||||||
|
uintmax_t lines = 0;
|
||||||
|
uintmax_t bytes = 0;
|
||||||
|
size_t bytes_read = 0;
|
||||||
|
|
||||||
|
|
||||||
|
if (!lines_out || !bytes_out)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
/* Using two parallel accumulators gave a good performance increase.
|
||||||
|
Adding a third gave no additional benefit, at least on an
|
||||||
|
Intel Xeon E3-1231v3. Maybe on a newer CPU with additional vector
|
||||||
|
execution engines it would be a win. */
|
||||||
|
accumulator = _mm256_setzero_si256 ();
|
||||||
|
accumulator2 = _mm256_setzero_si256 ();
|
||||||
|
zeroes = _mm256_setzero_si256 ();
|
||||||
|
endlines = _mm256_set1_epi8 ('\n');
|
||||||
|
|
||||||
|
while ((bytes_read = safe_read (fd, avx_buf, sizeof (avx_buf))) > 0)
|
||||||
|
{
|
||||||
|
__m256i to_match;
|
||||||
|
__m256i to_match2;
|
||||||
|
__m256i matches;
|
||||||
|
__m256i matches2;
|
||||||
|
|
||||||
|
if (bytes_read == SAFE_READ_ERROR)
|
||||||
|
{
|
||||||
|
error (0, errno, "%s", quotef (file));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bytes += bytes_read;
|
||||||
|
|
||||||
|
datap = avx_buf;
|
||||||
|
char *end = ((char *)avx_buf) + bytes_read;
|
||||||
|
|
||||||
|
while (bytes_read >= 64)
|
||||||
|
{
|
||||||
|
to_match = _mm256_load_si256 (datap);
|
||||||
|
to_match2 = _mm256_load_si256 (datap + 1);
|
||||||
|
|
||||||
|
matches = _mm256_cmpeq_epi8 (to_match, endlines);
|
||||||
|
matches2 = _mm256_cmpeq_epi8 (to_match2, endlines);
|
||||||
|
/* Compare will set each 8 bit integer in the register to 0xFF
|
||||||
|
on match. When we subtract it the 8 bit accumulators
|
||||||
|
will underflow, so this is equal to adding 1. */
|
||||||
|
accumulator = _mm256_sub_epi8 (accumulator, matches);
|
||||||
|
accumulator2 = _mm256_sub_epi8 (accumulator2, matches2);
|
||||||
|
|
||||||
|
datap += 2;
|
||||||
|
bytes_read -= 64;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Horizontally add all 8 bit integers in the register,
|
||||||
|
and then reset it */
|
||||||
|
accumulator = _mm256_sad_epu8 (accumulator, zeroes);
|
||||||
|
lines += _mm256_extract_epi16 (accumulator, 0)
|
||||||
|
+ _mm256_extract_epi16 (accumulator, 4)
|
||||||
|
+ _mm256_extract_epi16 (accumulator, 8)
|
||||||
|
+ _mm256_extract_epi16 (accumulator, 12);
|
||||||
|
accumulator = _mm256_setzero_si256 ();
|
||||||
|
|
||||||
|
accumulator2 = _mm256_sad_epu8 (accumulator2, zeroes);
|
||||||
|
lines += _mm256_extract_epi16 (accumulator2, 0)
|
||||||
|
+ _mm256_extract_epi16 (accumulator2, 4)
|
||||||
|
+ _mm256_extract_epi16 (accumulator2, 8)
|
||||||
|
+ _mm256_extract_epi16 (accumulator2, 12);
|
||||||
|
accumulator2 = _mm256_setzero_si256 ();
|
||||||
|
|
||||||
|
/* Finish up any left over bytes */
|
||||||
|
char *p = (char *)datap;
|
||||||
|
while (p != end)
|
||||||
|
lines += *p++ == '\n';
|
||||||
|
}
|
||||||
|
|
||||||
|
*lines_out = lines;
|
||||||
|
*bytes_out = bytes;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user