Remove MMX code path in lexer

Host systems with only MMX and no SSE2 should be very rare by now.
Let's remove the MMX code path to keep the number of custom
implementations down.

The SSE2 code path is also somewhat dubious now (nearly everything
should have SSE 4.2, which is more than 15 years old), but it serves
as the fallback for the other implementations, and apparently Solaris
still relies on it due to toolchain deficiencies.
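
For reference, the surviving SSE2 path follows the usual
compare-and-movemask pattern.  The sketch below is a standalone
illustration using <emmintrin.h> intrinsics, not the libcpp code
itself: the name scan_line_sse2 and the small driver are made up for
the example, and like the real scanner it assumes the buffer ends
with one of the matched characters (libcpp guarantees a trailing
newline).  Build with -msse2 on 32-bit x86; on x86-64 SSE2 is always
available.

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Standalone sketch of an SSE2-style scan: compare 16 bytes at a time
   against '\n', '\r', '\\' and '?', collapse the comparisons with
   PMOVMSKB, and return a pointer to the first match.  */
static const unsigned char *
scan_line_sse2 (const unsigned char *s)
{
  const __m128i repl_nl = _mm_set1_epi8 ('\n');
  const __m128i repl_cr = _mm_set1_epi8 ('\r');
  const __m128i repl_bs = _mm_set1_epi8 ('\\');
  const __m128i repl_qm = _mm_set1_epi8 ('?');

  /* Round down to a 16-byte boundary; aligned loads never cross a page
     boundary, and MASK hides the bytes that precede S in the first block.  */
  unsigned int misalign = (uintptr_t) s & 15;
  const __m128i *p = (const __m128i *) ((uintptr_t) s & ~(uintptr_t) 15);
  unsigned int mask = -1u << misalign;

  for (;;)
    {
      __m128i data = _mm_load_si128 (p);
      __m128i t = _mm_cmpeq_epi8 (data, repl_nl);
      t = _mm_or_si128 (t, _mm_cmpeq_epi8 (data, repl_cr));
      t = _mm_or_si128 (t, _mm_cmpeq_epi8 (data, repl_bs));
      t = _mm_or_si128 (t, _mm_cmpeq_epi8 (data, repl_qm));
      unsigned int found = _mm_movemask_epi8 (t) & mask;
      if (found)
        return (const unsigned char *) p + __builtin_ctz (found);
      ++p;
      mask = -1u;
    }
}

int
main (void)
{
  const char *buf = "no special characters until the end\n";
  const unsigned char *hit = scan_line_sse2 ((const unsigned char *) buf);
  printf ("first special byte 0x%02x at offset %td\n",
          *hit, hit - (const unsigned char *) buf);
  return 0;
}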

libcpp/ChangeLog:

	* lex.cc (search_line_mmx): Remove function.
	(init_vectorized_lexer): Remove search_line_mmx.
Andi Kleen 2024-07-29 18:48:08 -07:00
parent dc01f249db
commit eac63be1d4

@@ -290,71 +290,6 @@ static const char repl_chars[4][16] __attribute__((aligned(16))) = {
    '?', '?', '?', '?', '?', '?', '?', '?' },
};

/* A version of the fast scanner using MMX vectorized byte compare insns.

   This uses the PMOVMSKB instruction which was introduced with "MMX2",
   which was packaged into SSE1; it is also present in the AMD MMX
   extension.  Mark the function as using "sse" so that we emit a real
   "emms" instruction, rather than the 3dNOW "femms" instruction.  */

static const uchar *
#ifndef __SSE__
__attribute__((__target__("sse")))
#endif
search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v8qi __attribute__ ((__vector_size__ (8)));
  typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));

  const v8qi repl_nl = *(const v8qi *)repl_chars[0];
  const v8qi repl_cr = *(const v8qi *)repl_chars[1];
  const v8qi repl_bs = *(const v8qi *)repl_chars[2];
  const v8qi repl_qm = *(const v8qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v8qi *p;
  v8qi data, t, c;

  /* Align the source pointer.  While MMX doesn't generate unaligned data
     faults, this allows us to safely scan to the end of the buffer without
     reading beyond the end of the last page.  */
  misalign = (uintptr_t)s & 7;
  p = (const v8qi *)((uintptr_t)s & -8);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 8 bytes at a time.  */
  goto start;
  do
    {
      data = *++p;
      mask = -1;

    start:
      t = __builtin_ia32_pcmpeqb(data, repl_nl);
      c = __builtin_ia32_pcmpeqb(data, repl_cr);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_bs);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_qm);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      found = __builtin_ia32_pmovmskb (t);
      found &= mask;
    }
  while (!found);

  __builtin_ia32_emms ();

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}

/* A version of the fast scanner using SSE2 vectorized byte compare insns. */
@@ -509,8 +444,6 @@ init_vectorized_lexer (void)
  minimum = 3;
#elif defined(__SSE2__)
  minimum = 2;
#elif defined(__SSE__)
  minimum = 1;
#endif

  if (minimum == 3)
@@ -521,14 +454,6 @@ init_vectorized_lexer (void)
        impl = search_line_sse42;
      else if (minimum == 2 || (edx & bit_SSE2))
        impl = search_line_sse2;
      else if (minimum == 1 || (edx & bit_SSE))
        impl = search_line_mmx;
    }
  else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
    {
      if (minimum == 1
          || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
        impl = search_line_mmx;
    }

  search_line_fast = impl;
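
The selection logic that survives above is plain CPUID-based dispatch.
A self-contained sketch of that pattern (assumed names; the stub
functions stand in for the real search_line_* scanners, and the
portable stub mirrors the generic fallback) might look like this:

#include <cpuid.h>
#include <stdio.h>

typedef const unsigned char *(*scanner_fn) (const unsigned char *,
                                             const unsigned char *);

/* Stubs standing in for the real scanners; only the dispatch matters here.  */
static const unsigned char *
scan_portable (const unsigned char *s, const unsigned char *end)
{ (void) end; return s; }

static const unsigned char *
scan_sse2 (const unsigned char *s, const unsigned char *end)
{ (void) end; return s; }

static const unsigned char *
scan_sse42 (const unsigned char *s, const unsigned char *end)
{ (void) end; return s; }

static scanner_fn
pick_scanner (void)
{
  unsigned int eax, ebx, ecx, edx;
  scanner_fn impl = scan_portable;

  /* CPUID leaf 1: SSE4.2 is reported in ECX, SSE2 in EDX.  */
  if (__get_cpuid (1, &eax, &ebx, &ecx, &edx))
    {
      if (ecx & bit_SSE4_2)
        impl = scan_sse42;
      else if (edx & bit_SSE2)
        impl = scan_sse2;
    }
  return impl;
}

int
main (void)
{
  scanner_fn scan = pick_scanner ();
  printf ("selected %s\n", scan == scan_sse42 ? "SSE4.2"
          : scan == scan_sse2 ? "SSE2" : "portable");
  return 0;
}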