gcc/libcody/buffer.cc

// CODYlib		-*- mode:c++ -*-
// Copyright (C) 2020 Nathan Sidwell, nathan@acm.org
// License: Apache v2.0

// Cody
#include "internal.hh"
// C++
#include <algorithm>
// C
#include <cstring>
// OS
#include <unistd.h>
#include <cerrno>

// MessageBuffer code

// Lines consist of words and end with a NEWLINE (0xa) char
// Whitespace characters are TAB (0x9) and SPACE (0x20)
// Words consist of non-whitespace chars separated by whitespace.
// Multiple lines in one transaction are indicated by ending non-final
// lines with a SEMICOLON (0x3b) word, immediately before the NEWLINE
// i.e. a continued line ends with SPACE SEMICOLON NEWLINE
// Words matching regexp [-+_/%.a-zA-Z0-9]+ need no quoting.
// Quoting with '...'
// Anything outside of [-+_/%.a-zA-Z0-9] needs quoting
// Inside quotes, control chars (< SPACE), DEL, \' and \\ need escaping.
// Escapes are \\, \', \n, \t, \_, everything else as \<hex><hex>?
// Spaces separate words, UTF8 encoding for non-ascii chars

namespace Cody {
namespace Detail {

// The continuation marker: BeginLine emits it (after a SPACE) just
// before the NEWLINE of a non-final line, and Read and Lex look for it
// immediately before a NEWLINE.
static const char CONTINUE = S2C(u8";");

// Start a new line of the message being built.  If the buffer already
// holds text, that text is first terminated with " ;\n" so it reads as
// a continued line.  lastBol is left at the start of the new line.
void MessageBuffer::BeginLine ()
{
  if (size_t used = buffer.size ())
    {
      // SPACE, continuation marker, NEWLINE
      char const tail[] = {S2C(u8" "), CONTINUE, S2C(u8"\n")};

      buffer.reserve (used + sizeof (tail));
      buffer.insert (buffer.end (), tail, tail + sizeof (tail));
    }

  lastBol = buffer.size ();
}

// Append LEN bytes of STR to the buffer; a LEN of ~size_t (0) means
// STR is NUL-terminated and its length is measured here.
//
// QUOTE means 'maybe quote': a non-empty string is only quoted if it
// contains a character outside of [-+_A-Za-z0-9/%.], while an empty
// string with QUOTE set is emitted as ''.  Inside quotes, control
// characters, DEL, ' and \ are written as backslash escapes.
void MessageBuffer::Append (char const *str, bool quote, size_t len)
{
  if (len == ~size_t (0))
    len = strlen (str);

  if (!quote && !len)
    // Nothing to add, and no empty word was asked for
    return;

  if (quote && len)
    // Downgrade to unquoted unless some character is outside the
    // safe set -- anything that could remotely be shell-active.
    // Bytes of UTF8-encoded non-ascii characters are outside it.
    quote = std::any_of (str, str + len, [] (char ch) -> bool
      {
	unsigned char c = (unsigned char)ch;
	bool plain = ((c >= S2C(u8"a") && c <= S2C(u8"z"))
		      || (c >= S2C(u8"A") && c <= S2C(u8"Z"))
		      || (c >= S2C(u8"0") && c <= S2C(u8"9"))
		      || c == S2C(u8"-") || c == S2C(u8"+")
		      || c == S2C(u8"_") || c == S2C(u8"/")
		      || c == S2C(u8"%") || c == S2C(u8"."));
	return !plain;
      });

  // Worst case every character becomes a 3-char escape, plus the two
  // surrounding quotes.
  buffer.reserve (buffer.size () + len * (quote ? 3 : 1) + 2);

  if (quote)
    buffer.push_back (S2C(u8"'"));

  char const *const end = str + len;
  while (str != end)
    {
      // Find the run of characters that can be copied verbatim.
      // Unquoted text is always copied whole; quoted text stops at
      // the next escape-needing character, a more relaxed test than
      // the needs-quoting one above.
      char const *stop = end;
      if (quote)
	stop = std::find_if (str, end, [] (char ch) -> bool
	  {
	    unsigned char c = (unsigned char)ch;
	    return (c < S2C(u8" ") || c == 0x7f
		    || c == S2C(u8"\\") || c == S2C(u8"'"));
	  });

      buffer.insert (buffer.end (), str, stop);
      str = stop;
      if (str == end)
	break;

      // Escape the character that ended the run
      buffer.push_back (S2C(u8"\\"));
      unsigned char c = (unsigned char)*str++;
      if (c == S2C(u8"\t"))
	buffer.push_back (S2C(u8"t"));
      else if (c == S2C(u8"\n"))
	buffer.push_back (S2C(u8"n"));
      else if (c == S2C(u8"'") || c == S2C(u8"\\"))
	buffer.push_back (c);
      else
	// Full-on escape.  Two lower-case hex chars, high nibble first
	for (int shift = 4; shift >= 0; shift -= 4)
	  {
	    unsigned digit = (c >> shift) & 0xf;
	    char hex = char (digit < 10 ? S2C(u8"0") + digit
			     : S2C(u8"a") + (digit - 10));
	    buffer.push_back (hex);
	  }
    }

  if (quote)
    buffer.push_back (S2C(u8"'"));
}

// Append the single character C to the buffer exactly as given; no
// quoting or escaping is applied.
void MessageBuffer::Append (char c)
{
  buffer.insert (buffer.end (), c);
}

// Append U, formatted as an unsigned decimal number, via AppendWord.
void MessageBuffer::AppendInteger (unsigned u)
{
  // Sigh, even though std::to_string is C++11, we support building on
  // gcc 4.8, which is a C++11 compiler lacking std::to_string.  So
  // format by hand.  Use a plain char array rather than writing
  // through std::string::data (), which is only permitted from C++17.
  // Three characters per byte of U is ample for its decimal digits,
  // plus one for the terminating NUL.
  char digits[3 * sizeof (unsigned) + 1];
  int len = snprintf (digits, sizeof (digits), "%u", u);

  // snprintf reports failure with a negative value; never turn that
  // into a huge length.
  std::string v (digits, len > 0 ? size_t (len) : 0);

  AppendWord (v);
}

// Write the unsent part of the buffer (everything from lastBol on) to
// FD with a single write call.  Returns 0 when all of it was written,
// EAGAIN when only part was, otherwise the errno from write.  Unless
// the result is EAGAIN or EINTR the buffer is emptied ready for the
// next message; in those two cases it is kept so the call can be
// repeated.
int MessageBuffer::Write (int fd) noexcept
{
  size_t const pending = buffer.size () - lastBol;
  ssize_t const written = write (fd, buffer.data () + lastBol, pending);

  int err;
  if (written >= 0)
    {
      // Remember how far we got, a short write needs another go
      lastBol += written;
      err = size_t (written) == pending ? 0 : EAGAIN;
    }
  else
    err = errno;

  bool const retry = err == EAGAIN || err == EINTR;
  if (!retry)
    {
      // Reset for next message
      buffer.clear ();
      lastBol = 0;
    }

  return err;
}

// Read more data from FD onto the end of the buffer with a single read
// call, then scan the newly read data for line ends.  Returns:
//   the errno from read, if it failed (buffer contents unchanged)
//   -1      at end of file
//   EINVAL  if characters follow a NEWLINE that is not preceded by the
//	     continuation marker; the buffer is truncated just after
//	     that NEWLINE
//   EAGAIN  if the message is incomplete: no NEWLINE was read, or the
//	     final line read ends in a continuation
//   0	     if the data ends with a non-continued line
int MessageBuffer::Read (int fd) noexcept
{
  constexpr size_t blockSize = 200;

  // Read into the spare capacity, first making sure there is at least
  // half a block of it.
  size_t lwm = buffer.size ();
  size_t hwm = buffer.capacity ();
  if (hwm - lwm < blockSize / 2)
    hwm += blockSize;
  buffer.resize (hwm);

  ssize_t count = read (fd, &buffer[lwm], hwm - lwm);
  // Capture errno at once, before any other library call has a chance
  // to disturb it.
  int err = count < 0 ? errno : 0;

  // Drop whatever part of the buffer read did not fill
  buffer.resize (lwm + (count >= 0 ? count : 0));

  if (count < 0)
    return err;

  if (!count)
    // End of file
    return -1;

  // Only the new data needs scanning
  auto iter = buffer.begin () + lwm;
  bool more = true;
  for (;;)
    {
      auto newline = std::find (iter, buffer.end (), S2C(u8"\n"));
      if (newline == buffer.end ())
	break;
      // The character before the NEWLINE may be from an earlier read
      more = newline != buffer.begin () && newline[-1] == CONTINUE;
      iter = newline + 1;

      if (iter == buffer.end ())
	break;

      if (!more)
	{
	  // There is no continuation, but there are chars after the
	  // newline.  Truncate the buffer and return an error
	  buffer.resize (iter - buffer.begin ());
	  return EINVAL;
	}
    }

  return more ? EAGAIN : 0;
}

// Lex the line starting at lastBol into RESULT, one string per word
// with quoting and escapes removed, and advance lastBol to just after
// that line's NEWLINE.  RESULT is always cleared first.  Returns:
//   0	     on success
//   ENOENT  if IsAtEnd () is true, or the line contained no words
//   EINVAL  if the line is malformed; RESULT then holds one string,
//	     the raw text of the line without its NEWLINE and without a
//	     trailing " ;" continuation
int MessageBuffer::Lex (std::vector<std::string> &result)
{
  result.clear ();

  if (IsAtEnd ())
    return ENOENT;

  // The scanning below has no other end check, it relies on meeting a
  // NEWLINE.
  Assert (buffer.back () == S2C(u8"\n"));

  auto iter = buffer.begin () + lastBol;

  // WORD is the word being accumulated, or null when between words
  for (std::string *word = nullptr;;)
    {
      char c = *iter;

      ++iter;
      if (c == S2C(u8" ") || c == S2C(u8"\t"))
	{
	  // Whitespace ends any current word
	  word = nullptr;
	  continue;
	}

      if (c == S2C(u8"\n"))
	// End of line, ITER is already past it
	break;

      if (c == CONTINUE)
	{
	  // Line continuation
	  // It must be a word of its own, immediately before the NEWLINE
	  if (word || *iter != S2C(u8"\n"))
	    goto malformed;
	  ++iter;
	  break;
	}

      // Reject control characters, DEL and, whichever signedness char
      // has, bytes with the top bit set.
      if (c <= S2C(u8" ") || c >= 0x7f)
	goto malformed;

      if (!word)
	{
	  // First character of a new word
	  result.emplace_back ();
	  word = &result.back ();
	}

      if (c == S2C(u8"'"))
	{
	  // Quoted word
	  // Consume up to the closing quote, decoding escapes.  The
	  // quotes themselves are not added to the word.
	  for (;;)
	    {
	      c = *iter;

	      if (c == S2C(u8"\n"))
		{
		  // Unterminated quote.  This is also the common error
		  // exit, entered by goto from elsewhere in this function.
		malformed:;
		  result.clear ();
		  // Skip the remainder of the bad line
		  iter = std::find (iter, buffer.end (), S2C(u8"\n"));
		  auto back = iter;
		  if (back[-1] == CONTINUE  && back[-2] == S2C(u8" "))
		    // Smells like a line continuation
		    back -= 2;
		  // Hand back the raw text of the line
		  result.emplace_back (&buffer[lastBol],
				       back - buffer.begin () - lastBol);
		  ++iter;
		  lastBol = iter - buffer.begin ();
		  return EINVAL;
		}

	      // Control characters and DEL must have been escaped
	      if (c < S2C(u8" ") || c >= 0x7f)
		goto malformed;

	      ++iter;
	      if (c == S2C(u8"'"))
		// Closing quote
		break;

	      if (c == S2C(u8"\\"))
		// escape
		switch (c = *iter)
		  {
		    // \\ and \' stand for the character itself
		    case S2C(u8"\\"):
		    case S2C(u8"'"):
		      ++iter;
		      break;

		    case S2C(u8"n"):
		      c = S2C(u8"\n");
		      ++iter;
		      break;

		    case S2C(u8"_"):
		      // We used to escape SPACE as \_, so accept that
		      c = S2C(u8" ");
		      ++iter;
		      break;

		    case S2C(u8"t"):
		      c = S2C(u8"\t");
		      ++iter;
		      break;

		    default:
		      {
			// One or two lower-case hex digits.  A non-hex
			// character in first position is malformed, in
			// second position it merely ends the escape and
			// is left for the enclosing loop.
			unsigned v = 0;
			for (unsigned nibble = 0; nibble != 2; nibble++)
			  {
			    c = *iter;
			    if (c < S2C(u8"0"))
			      {
				if (!nibble)
				  goto malformed;
				break;
			      }
			    else if (c <= S2C(u8"9"))
			      c -= S2C(u8"0");
			    else if (c < S2C(u8"a"))
			      {
				if (!nibble)
				  goto malformed;
				break;
			      }
			    else if (c <= S2C(u8"f"))
			      c -= S2C(u8"a") - 10;
			    else
			      {
				if (!nibble)
				  goto malformed;
				break;
			      }
			    ++iter;
			    v = (v << 4) | c;
			  }
			c = v;
		      }
		  }
	      // C is now either an ordinary or a decoded character
	      word->push_back (c);
	    }
	}
      else
	// Unquoted character
	word->push_back (c);
    }
  // ITER is just past the line's NEWLINE
  lastBol = iter - buffer.begin ();
  if (result.empty ())
    return ENOENT;

  return 0;
}

// Append to STR the text of the line that ends just before lastBol,
// i.e. the line most recently consumed by Lex, without its NEWLINE
// and without a trailing " ;" continuation.  Does nothing when
// lastBol is zero.
void MessageBuffer::LexedLine (std::string &str)
{
  if (!lastBol)
    return;

  // Lex leaves lastBol one past the NEWLINE ending the line
  size_t end = lastBol - 1;

  // Walk back to the start of the line: either the start of the
  // buffer or just after the preceding NEWLINE.
  size_t pos = end;
  while (pos && buffer[pos - 1] != S2C(u8"\n"))
    pos--;

  // Strip line continuation.  Only look when the line has room for
  // one, otherwise a short line at the very start of the buffer would
  // be indexed before its beginning.
  if (end - pos >= 2
      && buffer[end - 1] == CONTINUE && buffer[end - 2] == S2C(u8" "))
    end -= 2;

  str.append (&buffer[pos], end - pos);
}
} // Detail
} // Cody
Add libcody In order to separate compiler from build system, C++ Modules, as implemented in GCC introduces a communication channel between those two entities. This is implemented by libcody. It is anticipated that other implementations will also implement this protocol, or use libcody to provide it. * Makefile.def: Add libcody. * configure.ac: Add libcody. * Makefile.in: Regenerated. * configure: Regenerated. gcc/ * Makefile.in (CODYINC, CODYLIB, CODYLIB_H): New. Use them. libcody/ * configure.ac: New. * CMakeLists.txt: New. * CODING.md: New. * CONTRIB.md: New. * LICENSE: New. * LICENSE.gcc: New. * Makefile.in: New. * Makesub.in: New. * README.md: New. * buffer.cc: New. * build-aux/config.guess: New. * build-aux/config.sub: New. * build-aux/install-sh: New. * client.cc: New. * cmake/libcody-config-ix.cmake * cody.hh: New. * config.h.in: New. * config.m4: New. * configure: New. * configure.ac: New. * dox.cfg.in: New. * fatal.cc: New. * gdbinit.in: New. * internal.hh: New. * netclient.cc: New. * netserver.cc: New. * packet.cc: New. * resolver.cc: New. * server.cc: New. * tests/01-serialize/connect.cc: New. * tests/01-serialize/decoder.cc: New. * tests/01-serialize/encoder.cc: New. * tests/02-comms/client-1.cc: New. * tests/02-comms/pivot-1.cc: New. * tests/02-comms/server-1.cc: New. * tests/Makesub.in: New. * tests/jouster: New. 2020-12-15 00:10:27 +08:00			`// CODYlib -- mode:c++ --`
			`// Copyright (C) 2020 Nathan Sidwell, nathan@acm.org`
			`// License: Apache v2.0`

			`// Cody`
			`#include "internal.hh"`
			`// C++`
			`#include <algorithm>`
			`// C`
			`#include <cstring>`
			`// OS`
			`#include <unistd.h>`
			`#include <cerrno>`

			`// MessageBuffer code`

			`// Lines consist of words and end with a NEWLINE (0xa) char`
			`// Whitespace characters are TAB (0x9) and SPACE (0x20)`
			`// Words consist of non-whitespace chars separated by whitespace.`
			`// Multiple lines in one transaction are indicated by ending non-final`
			`// lines with a SEMICOLON (0x3b) word, immediately before the NEWLINE`
			`// Continuations with ; preceding it`
			`// Words matching regexp [-+_/%.a-zA-Z0-9]+ need no quoting.`
			`// Quoting with '...'`
			`// Anything outside of [-+_/%.a-zA-Z0-9] needs quoting`
			`// Anything outside of <= <space> or DEL or \' or \\ needs escaping.`
			`// Escapes are \\, \', \n, \t, \_, everything else as \<hex><hex>?`
			`// Spaces separate words, UTF8 encoding for non-ascii chars`

			`namespace Cody {`
			`namespace Detail {`

			`static const char CONTINUE = S2C(u8";");`

			`void MessageBuffer::BeginLine ()`
			`{`
			`if (!buffer.empty ())`
			`{`
			`// Terminate the previous line with a continuation`
			`buffer.reserve (buffer.size () + 3);`
			`buffer.push_back (S2C(u8" "));`
			`buffer.push_back (CONTINUE);`
			`buffer.push_back (S2C(u8"\n"));`
			`}`
			`lastBol = buffer.size ();`
			`}`

			`// QUOTE means 'maybe quote', we search it for quote-needing chars`

			`void MessageBuffer::Append (char const *str, bool quote, size_t len)`
			`{`
			`if (len == ~size_t (0))`
			`len = strlen (str);`

			`if (!len && !quote)`
			`return;`

			`// We want to quote characters outside of [-+_A-Za-z0-9/%.], anything`
			`// that could remotely be shell-active. UTF8 encoding for non-ascii.`
			`if (quote && len)`
			`{`
			`quote = false;`
			`// Scan looking for quote-needing characters. We could just`
			`// append until we find one, but that's probably confusing`
			`for (size_t ix = len; ix--;)`
			`{`
			`unsigned char c = (unsigned char)str[ix];`
			`if (!((c >= S2C(u8"a") && c <= S2C(u8"z"))`
			`\|\| (c >= S2C(u8"A") && c <= S2C(u8"Z"))`
			`\|\| (c >= S2C(u8"0") && c <= S2C(u8"9"))`
			`\|\| c == S2C(u8"-") \|\| c == S2C(u8"+") \|\| c == S2C(u8"_")`
			`\|\| c == S2C(u8"/") \|\| c == S2C(u8"%") \|\| c == S2C(u8".")))`
			`{`
			`quote = true;`
			`break;`
			`}`
			`}`
			`}`

			`// Maximal length of appended string`
			`buffer.reserve (buffer.size () + len * (quote ? 3 : 1) + 2);`

			`if (quote)`
			`buffer.push_back (S2C(u8"'"));`

			`for (auto *end = str + len; str != end;)`
			`{`
			`auto *e = end;`

			`if (quote)`
			`// Look for next escape-needing char. More relaxed than`
			`// the earlier needs-quoting check.`
			`for (e = str; e != end; ++e)`
			`{`
			`unsigned char c = (unsigned char)*e;`
			`if (c < S2C(u8" ") \|\| c == 0x7f`
			`\|\| c == S2C(u8"\\") \|\| c == S2C(u8"'"))`
			`break;`
			`}`
			`buffer.insert (buffer.end (), str, e);`
			`str = e;`

			`if (str == end)`
			`break;`

			`buffer.push_back (S2C(u8"\\"));`
			`switch (unsigned char c = (unsigned char)*str++)`
			`{`
			`case S2C(u8"\t"):`
			`c = S2C(u8"t");`
			`goto append;`

			`case S2C(u8"\n"):`
			`c = S2C(u8"n");`
			`goto append;`

			`case S2C(u8"'"):`
			`case S2C(u8"\\"):`
			`append:`
			`buffer.push_back (c);`
			`break;`

			`default:`
			`// Full-on escape. Use 2 lower-case hex chars`
			`for (unsigned shift = 8; shift;)`
			`{`
			`shift -= 4;`

			`char nibble = (c >> shift) & 0xf;`
			`nibble += S2C(u8"0");`
			`if (nibble > S2C(u8"9"))`
			`nibble += S2C(u8"a") - (S2C(u8"9") + 1);`
			`buffer.push_back (nibble);`
			`}`
			`}`
			`}`

			`if (quote)`
			`buffer.push_back (S2C(u8"'"));`
			`}`

			`void MessageBuffer::Append (char c)`
			`{`
			`buffer.push_back (c);`
			`}`

			`void MessageBuffer::AppendInteger (unsigned u)`
			`{`
libcody: to_string is not always available [PR 98412] to_string is not always available, so don't use it. libcody/ * buffer.cc (MessageBuffer::AppendInteger): Workaround to_string's non-ubiquity. 2020-12-21 21:38:34 +08:00			`// Sigh, even though std::to_string is C++11, we support building on`
			`// gcc 4.8, which is a C++11 compiler lacking std::to_string. so`
			`// have something horrible.`
			`std::string v (20, 0);`
			`size_t len = snprintf (const_cast<char *> (v.data ()), v.size (), "%u", u);`
			`v.erase (len);`

Add libcody In order to separate compiler from build system, C++ Modules, as implemented in GCC introduces a communication channel between those two entities. This is implemented by libcody. It is anticipated that other implementations will also implement this protocol, or use libcody to provide it. * Makefile.def: Add libcody. * configure.ac: Add libcody. * Makefile.in: Regenerated. * configure: Regenerated. gcc/ * Makefile.in (CODYINC, CODYLIB, CODYLIB_H): New. Use them. libcody/ * configure.ac: New. * CMakeLists.txt: New. * CODING.md: New. * CONTRIB.md: New. * LICENSE: New. * LICENSE.gcc: New. * Makefile.in: New. * Makesub.in: New. * README.md: New. * buffer.cc: New. * build-aux/config.guess: New. * build-aux/config.sub: New. * build-aux/install-sh: New. * client.cc: New. * cmake/libcody-config-ix.cmake * cody.hh: New. * config.h.in: New. * config.m4: New. * configure: New. * configure.ac: New. * dox.cfg.in: New. * fatal.cc: New. * gdbinit.in: New. * internal.hh: New. * netclient.cc: New. * netserver.cc: New. * packet.cc: New. * resolver.cc: New. * server.cc: New. * tests/01-serialize/connect.cc: New. * tests/01-serialize/decoder.cc: New. * tests/01-serialize/encoder.cc: New. * tests/02-comms/client-1.cc: New. * tests/02-comms/pivot-1.cc: New. * tests/02-comms/server-1.cc: New. * tests/Makesub.in: New. * tests/jouster: New. 2020-12-15 00:10:27 +08:00			`AppendWord (v);`
			`}`

			`int MessageBuffer::Write (int fd) noexcept`
			`{`
			`size_t limit = buffer.size () - lastBol;`
			`ssize_t count = write (fd, &buffer.data ()[lastBol], limit);`

			`int err = 0;`
			`if (count < 0)`
			`err = errno;`
			`else`
			`{`
			`lastBol += count;`
			`if (size_t (count) != limit)`
			`err = EAGAIN;`
			`}`

			`if (err != EAGAIN && err != EINTR)`
			`{`
			`// Reset for next message`
			`buffer.clear ();`
			`lastBol = 0;`
			`}`

			`return err;`
			`}`

			`int MessageBuffer::Read (int fd) noexcept`
			`{`
			`constexpr size_t blockSize = 200;`

			`size_t lwm = buffer.size ();`
			`size_t hwm = buffer.capacity ();`
			`if (hwm - lwm < blockSize / 2)`
			`hwm += blockSize;`
			`buffer.resize (hwm);`

			`auto iter = buffer.begin () + lwm;`
			`ssize_t count = read (fd, &*iter, hwm - lwm);`
			`buffer.resize (lwm + (count >= 0 ? count : 0));`

			`if (count < 0)`
			`return errno;`

			`if (!count)`
			`// End of file`
			`return -1;`

			`bool more = true;`
			`for (;;)`
			`{`
			`auto newline = std::find (iter, buffer.end (), S2C(u8"\n"));`
			`if (newline == buffer.end ())`
			`break;`
			`more = newline != buffer.begin () && newline[-1] == CONTINUE;`
			`iter = newline + 1;`

			`if (iter == buffer.end ())`
			`break;`

			`if (!more)`
			`{`
			`// There is no continuation, but there are chars after the`
			`// newline. Truncate the buffer and return an error`
			`buffer.resize (iter - buffer.begin ());`
			`return EINVAL;`
			`}`
			`}`

			`return more ? EAGAIN : 0;`
			`}`

			`int MessageBuffer::Lex (std::vector<std::string> &result)`
			`{`
			`result.clear ();`

			`if (IsAtEnd ())`
			`return ENOENT;`

			`Assert (buffer.back () == S2C(u8"\n"));`

			`auto iter = buffer.begin () + lastBol;`

			`for (std::string *word = nullptr;;)`
			`{`
			`char c = *iter;`

			`++iter;`
			`if (c == S2C(u8" ") \|\| c == S2C(u8"\t"))`
			`{`
			`word = nullptr;`
			`continue;`
			`}`

			`if (c == S2C(u8"\n"))`
			`break;`

			`if (c == CONTINUE)`
			`{`
			`// Line continuation`
			`if (word \|\| *iter != S2C(u8"\n"))`
			`goto malformed;`
			`++iter;`
			`break;`
			`}`

			`if (c <= S2C(u8" ") \|\| c >= 0x7f)`
			`goto malformed;`

			`if (!word)`
			`{`
			`result.emplace_back ();`
			`word = &result.back ();`
			`}`

			`if (c == S2C(u8"'"))`
			`{`
			`// Quoted word`
			`for (;;)`
			`{`
			`c = *iter;`

			`if (c == S2C(u8"\n"))`
			`{`
			`malformed:;`
			`result.clear ();`
			`iter = std::find (iter, buffer.end (), S2C(u8"\n"));`
			`auto back = iter;`
			`if (back[-1] == CONTINUE && back[-2] == S2C(u8" "))`
			`// Smells like a line continuation`
			`back -= 2;`
			`result.emplace_back (&buffer[lastBol],`
			`back - buffer.begin () - lastBol);`
			`++iter;`
			`lastBol = iter - buffer.begin ();`
			`return EINVAL;`
			`}`

			`if (c < S2C(u8" ") \|\| c >= 0x7f)`
			`goto malformed;`

			`++iter;`
			`if (c == S2C(u8"'"))`
			`break;`

			`if (c == S2C(u8"\\"))`
			`// escape`
			`switch (c = *iter)`
			`{`
			`case S2C(u8"\\"):`
			`case S2C(u8"'"):`
			`++iter;`
			`break;`

			`case S2C(u8"n"):`
			`c = S2C(u8"\n");`
			`++iter;`
			`break;`

			`case S2C(u8"_"):`
			`// We used to escape SPACE as \_, so accept that`
			`c = S2C(u8" ");`
			`++iter;`
			`break;`

			`case S2C(u8"t"):`
			`c = S2C(u8"\t");`
			`++iter;`
			`break;`

			`default:`
			`{`
			`unsigned v = 0;`
			`for (unsigned nibble = 0; nibble != 2; nibble++)`
			`{`
			`c = *iter;`
			`if (c < S2C(u8"0"))`
			`{`
			`if (!nibble)`
			`goto malformed;`
			`break;`
			`}`
			`else if (c <= S2C(u8"9"))`
			`c -= S2C(u8"0");`
			`else if (c < S2C(u8"a"))`
			`{`
			`if (!nibble)`
			`goto malformed;`
			`break;`
			`}`
			`else if (c <= S2C(u8"f"))`
			`c -= S2C(u8"a") - 10;`
			`else`
			`{`
			`if (!nibble)`
			`goto malformed;`
			`break;`
			`}`
			`++iter;`
			`v = (v << 4) \| c;`
			`}`
			`c = v;`
			`}`
			`}`
			`word->push_back (c);`
			`}`
			`}`
			`else`
			`// Unquoted character`
			`word->push_back (c);`
			`}`
			`lastBol = iter - buffer.begin ();`
			`if (result.empty ())`
			`return ENOENT;`

			`return 0;`
			`}`

			`void MessageBuffer::LexedLine (std::string &str)`
			`{`
			`if (lastBol)`
			`{`
			`size_t pos = lastBol - 1;`
			`for (; pos; pos--)`
			`if (buffer[pos-1] == S2C(u8"\n"))`
			`break;`

			`size_t end = lastBol - 1;`
			`if (buffer[end-1] == CONTINUE && buffer[end-2] == S2C(u8" "))`
			`// Strip line continuation`
			`end -= 2;`
			`str.append (&buffer[pos], end - pos);`
			`}`
			`}`
			`} // Detail`
			`} // Cody`