diff --git a/gcc/diagnostic-format-sarif.cc b/gcc/diagnostic-format-sarif.cc index 2c48cbd46e2..fd29ac2ca3b 100644 --- a/gcc/diagnostic-format-sarif.cc +++ b/gcc/diagnostic-format-sarif.cc @@ -1390,76 +1390,25 @@ sarif_builder::make_artifact_object (const char *filename) return artifact_obj; } -/* Read all data from F_IN until EOF. - Return a NULL-terminated buffer containing the data, which must be - freed by the caller. - Return NULL on errors. */ - -static char * -read_until_eof (FILE *f_in) -{ - /* Read content, allocating a buffer for it. */ - char *result = NULL; - size_t total_sz = 0; - size_t alloc_sz = 0; - char buf[4096]; - size_t iter_sz_in; - - while ( (iter_sz_in = fread (buf, 1, sizeof (buf), f_in)) ) - { - gcc_assert (alloc_sz >= total_sz); - size_t old_total_sz = total_sz; - total_sz += iter_sz_in; - /* Allow 1 extra byte for 0-termination. */ - if (alloc_sz < (total_sz + 1)) - { - size_t new_alloc_sz = alloc_sz ? alloc_sz * 2: total_sz + 1; - result = (char *)xrealloc (result, new_alloc_sz); - alloc_sz = new_alloc_sz; - } - memcpy (result + old_total_sz, buf, iter_sz_in); - } - - if (!feof (f_in)) - return NULL; - - /* 0-terminate the buffer. */ - gcc_assert (total_sz < alloc_sz); - result[total_sz] = '\0'; - - return result; -} - -/* Read all data from FILENAME until EOF. - Return a NULL-terminated buffer containing the data, which must be - freed by the caller. - Return NULL on errors. */ - -static char * -maybe_read_file (const char *filename) -{ - FILE *f_in = fopen (filename, "r"); - if (!f_in) - return NULL; - char *result = read_until_eof (f_in); - fclose (f_in); - return result; -} - /* Make an artifactContent object (SARIF v2.1.0 section 3.3) for the full contents of FILENAME. */ json::object * sarif_builder::maybe_make_artifact_content_object (const char *filename) const { - char *text_utf8 = maybe_read_file (filename); - if (!text_utf8) + /* Let input.cc handle any charset conversion. */ + char_span utf8_content = get_source_file_content (filename); + if (!utf8_content) + return NULL; + + /* Don't add it if it's not valid UTF-8. */ + if (!cpp_valid_utf8_p(utf8_content.get_buffer (), utf8_content.length ())) return NULL; json::object *artifact_content_obj = new json::object (); - artifact_content_obj->set ("text", new json::string (text_utf8)); - free (text_utf8); - + artifact_content_obj->set ("text", + new json::string (utf8_content.get_buffer (), + utf8_content.length ())); return artifact_content_obj; } @@ -1501,6 +1450,13 @@ sarif_builder::maybe_make_artifact_content_object (const char *filename, if (!text_utf8) return NULL; + /* Don't add it if it's not valid UTF-8. */ + if (!cpp_valid_utf8_p(text_utf8, strlen(text_utf8))) + { + free (text_utf8); + return NULL; + } + json::object *artifact_content_obj = new json::object (); artifact_content_obj->set ("text", new json::string (text_utf8)); free (text_utf8); diff --git a/gcc/input.cc b/gcc/input.cc index 9702dbd39f0..eaf301ec7c1 100644 --- a/gcc/input.cc +++ b/gcc/input.cc @@ -67,6 +67,7 @@ public: { return m_missing_trailing_newline; } + char_span get_full_file_content (); void inc_use_count () { m_use_count++; } @@ -459,6 +460,20 @@ file_cache::add_file (const char *file_path) return r; } +/* Get a borrowed char_span to the full content of this file + as decoded according to the input charset, encoded as UTF-8. */ + +char_span +file_cache_slot::get_full_file_content () +{ + char *line; + ssize_t line_len; + while (get_next_line (&line, &line_len)) + { + } + return char_span (m_data, m_nb_read); +} + /* Populate this slot for use on FILE_PATH and FP, dropping any existing cached content within it. */ @@ -1047,6 +1062,18 @@ get_source_text_between (location_t start, location_t end) return xstrdup (buf); } +/* Get a borrowed char_span to the full content of FILE_PATH + as decoded according to the input charset, encoded as UTF-8. */ + +char_span +get_source_file_content (const char *file_path) +{ + diagnostic_file_cache_init (); + + file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path); + return c->get_full_file_content (); +} + /* Determine if FILE_PATH missing a trailing newline on its final line. Only valid to call once all of the file has been loaded, by requesting a line number beyond the end of the file. */ @@ -4045,7 +4072,104 @@ void test_cpp_utf8 () ASSERT_EQ (byte_col2, byte_col); } } +} +static bool +check_cpp_valid_utf8_p (const char *str) +{ + return cpp_valid_utf8_p (str, strlen (str)); +} + +/* Check that cpp_valid_utf8_p works as expected. */ + +static void +test_cpp_valid_utf8_p () +{ + ASSERT_TRUE (check_cpp_valid_utf8_p ("hello world")); + + /* 2-byte char (pi). */ + ASSERT_TRUE (check_cpp_valid_utf8_p("\xcf\x80")); + + /* 3-byte chars (the Japanese word "mojibake"). */ + ASSERT_TRUE (check_cpp_valid_utf8_p + ( + /* U+6587 CJK UNIFIED IDEOGRAPH-6587 + UTF-8: 0xE6 0x96 0x87 + C octal escaped UTF-8: \346\226\207. */ + "\346\226\207" + /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57 + UTF-8: 0xE5 0xAD 0x97 + C octal escaped UTF-8: \345\255\227. */ + "\345\255\227" + /* U+5316 CJK UNIFIED IDEOGRAPH-5316 + UTF-8: 0xE5 0x8C 0x96 + C octal escaped UTF-8: \345\214\226. */ + "\345\214\226" + /* U+3051 HIRAGANA LETTER KE + UTF-8: 0xE3 0x81 0x91 + C octal escaped UTF-8: \343\201\221. */ + "\343\201\221")); + + /* 4-byte char: an emoji. */ + ASSERT_TRUE (check_cpp_valid_utf8_p ("\xf0\x9f\x98\x82")); + + /* Control codes, including the NUL byte. */ + ASSERT_TRUE (cpp_valid_utf8_p ("\r\n\v\0\1", 5)); + + ASSERT_FALSE (check_cpp_valid_utf8_p ("\xf0!\x9f!\x98!\x82!")); + + /* Unexpected continuation bytes. */ + for (unsigned char continuation_byte = 0x80; + continuation_byte <= 0xbf; + continuation_byte++) + ASSERT_FALSE (cpp_valid_utf8_p ((const char *)&continuation_byte, 1)); + + /* "Lonely start characters" for 2-byte sequences. */ + { + unsigned char buf[2]; + buf[1] = ' '; + for (buf[0] = 0xc0; + buf[0] <= 0xdf; + buf[0]++) + ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2)); + } + + /* "Lonely start characters" for 3-byte sequences. */ + { + unsigned char buf[2]; + buf[1] = ' '; + for (buf[0] = 0xe0; + buf[0] <= 0xef; + buf[0]++) + ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2)); + } + + /* "Lonely start characters" for 4-byte sequences. */ + { + unsigned char buf[2]; + buf[1] = ' '; + for (buf[0] = 0xf0; + buf[0] <= 0xf4; + buf[0]++) + ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2)); + } + + /* Invalid start characters (formerly valid for 5-byte and 6-byte + sequences). */ + { + unsigned char buf[2]; + buf[1] = ' '; + for (buf[0] = 0xf5; + buf[0] <= 0xfd; + buf[0]++) + ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2)); + } + + /* Impossible bytes. */ + ASSERT_FALSE (check_cpp_valid_utf8_p ("\xc0")); + ASSERT_FALSE (check_cpp_valid_utf8_p ("\xc1")); + ASSERT_FALSE (check_cpp_valid_utf8_p ("\xfe")); + ASSERT_FALSE (check_cpp_valid_utf8_p ("\xff")); } /* Run all of the selftests within this file. */ @@ -4091,6 +4215,7 @@ input_cc_tests () test_line_offset_overflow (); test_cpp_utf8 (); + test_cpp_valid_utf8_p (); } } // namespace selftest diff --git a/gcc/input.h b/gcc/input.h index 9d68648bb3c..d1087b7a9e8 100644 --- a/gcc/input.h +++ b/gcc/input.h @@ -115,6 +115,7 @@ class char_span extern char_span location_get_source_line (const char *file_path, int line); extern char *get_source_text_between (location_t, location_t); +extern char_span get_source_file_content (const char *file_path); extern bool location_missing_trailing_newline (const char *file_path); diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-1.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-1.c index cad53095478..f0dcaa705ca 100644 --- a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-1.c +++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-1.c @@ -4,6 +4,7 @@ #warning message /* Verify that some JSON was written to a file with the expected name. */ +/* { dg-final { verify-sarif-file } } */ /* We expect various properties. The indentation here reflects the expected hierarchy, though these tests diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-2.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-2.c index 8f5814d894e..02ee33f5265 100644 --- a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-2.c +++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-2.c @@ -10,6 +10,8 @@ int test (void) } /* + { dg-final { verify-sarif-file } } + { dg-final { scan-sarif-file "\"level\": \"warning\"" } } { dg-final { scan-sarif-file "\"ruleId\": \"-Wmisleading-indentation\"" } } { dg-final { scan-sarif-file "\"text\": \" if " } } diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-3.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-3.c index 3856782b5ea..80954711db2 100644 --- a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-3.c +++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-3.c @@ -10,6 +10,8 @@ int test (struct s *ptr) } /* + { dg-final { verify-sarif-file } } + { dg-final { scan-sarif-file "\"level\": \"error\"" } } We expect a logical location for the error (within fn "test"): diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-4.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-4.c index 2d22f54037c..bd13da71425 100644 --- a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-4.c +++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-4.c @@ -8,6 +8,8 @@ int test (void) } /* + { dg-final { verify-sarif-file } } + { dg-final { scan-sarif-file "\"level\": \"error\"" } } We expect the region expressed in display columns: diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-Wbidi-chars.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-Wbidi-chars.c new file mode 100644 index 00000000000..283df75670d --- /dev/null +++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-Wbidi-chars.c @@ -0,0 +1,23 @@ +/* Adapted from Wbidi-chars-1.c */ + +/* PR preprocessor/103026 */ +/* { dg-do compile } */ +/* { dg-options "-fdiagnostics-format=sarif-file" } */ + +int main() { + int isAdmin = 0; + /*‮ } ⁦if (isAdmin)⁩ ⁦ begin admins only */ + __builtin_printf("You are an admin.\n"); + /* end admins only ‮ { ⁦*/ + return 0; +} + +/* Verify that we generate a valid UTF-8 .sarif file. + + { dg-final { verify-sarif-file } } + + Verify that we captured the expected warnings. + + { dg-final { scan-sarif-file {"text": "unpaired UTF-8 bidirectional control characters detected"} } } + { dg-final { scan-sarif-file {"text": "unpaired UTF-8 bidirectional control characters detected"} } } +*/ diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-1.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-1.c new file mode 100644 index 00000000000..47f89232b11 --- /dev/null +++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-1.c @@ -0,0 +1,23 @@ +/* Try to process this explicitly as UTF-8. + + { dg-do preprocess } + { dg-options "-finput-charset=UTF-8 -Winvalid-utf8 -fdiagnostics-format=sarif-file" } */ + +/* This comment intentionally contains non-UTF-8 bytes: + * may be used uninitialized + */ + +/* + { dg-final { verify-sarif-file } } + + Verify that we captured the expected warnings. + + { dg-final { scan-sarif-file "\"results\": \\\[" } } + { dg-final { scan-sarif-file "\"level\": \"warning\"" } } + { dg-final { scan-sarif-file "\"ruleId\": \"-Winvalid-utf8\"" } } + { dg-final { scan-sarif-file "\"message\": " } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <98>"} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <99>"} } } +*/ diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-2.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-2.c new file mode 100644 index 00000000000..8395f1d8610 --- /dev/null +++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-2.c @@ -0,0 +1,16 @@ +/* Try to process this explicitly as UTF-8. */ + +/* { dg-do compile } */ +/* { dg-options "-finput-charset=utf-8 -fdiagnostics-format=sarif-file" } */ +/* { dg-excess-errors "The error is sent to the SARIF file, rather than stderr" } */ + +const char *section = "" + +/* The above in quotes is byte 0xFE which is not valid in UTF-8. + Verify that we can generate a valid UTF-8 .sarif file complaining + about the missing semicolon above. */ + +/* { dg-final { verify-sarif-file } } + + { dg-final { scan-sarif-file {"text": "expected ',' or ';' at end of input"} } } +*/ diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-3.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-3.c new file mode 100644 index 00000000000..ead03a52fd6 --- /dev/null +++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-3.c @@ -0,0 +1,95 @@ +/* Adapted from cpp/Winvalid-utf8-1.c + + P2295R6 - Support for UTF-8 as a portable source file encoding + This test intentionally contains various byte sequences which are not valid UTF-8 + { dg-do preprocess } + { dg-options "-finput-charset=UTF-8 -Winvalid-utf8 -fdiagnostics-format=sarif-file" } */ + +// a€߿ࠀ퟿𐀀􏿿a +// aa +// aa +// aa +// aa +// aa +// aa +// aa +// aa +// aa +// aa +// aa +// aa +// aa +// aa +// aa +// aa +// a +/* a€߿ࠀ퟿𐀀􏿿a */ +/* aa */ +/* aa */ +/* aa */ +/* aa */ +/* aa */ +/* aa */ +/* aa */ +/* aa */ +/* aa */ +/* aa */ +/* aa */ +/* aa */ +/* aa */ +/* aa */ +/* aa */ +/* aa */ +/* aa */ + + + +/* Verify that we generate a valid UTF-8 .sarif file. + + { dg-final { verify-sarif-file } } + + Verify that we captured the expected warnings. + + { dg-final { scan-sarif-file "\"results\": \\\[" } } + { dg-final { scan-sarif-file "\"level\": \"warning\"" } } + { dg-final { scan-sarif-file "\"ruleId\": \"-Winvalid-utf8\"" } } + { dg-final { scan-sarif-file "\"message\": " } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <9f><80>"} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80><80><80>"} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <8f>"} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <90><80><80>"} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <9f><80>"} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80><80><80>"} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <8f>"} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <90><80><80>"} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } + { dg-final { scan-sarif-file {"text": "invalid UTF-8 character "} } } +*/ diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-valid-CP850.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-valid-CP850.c new file mode 100644 index 00000000000..a189274692e --- /dev/null +++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-valid-CP850.c @@ -0,0 +1,22 @@ +/* Adapted from gcc.dg/diagnostic-input-charset-1.c */ +/* { dg-do compile } */ +/* { dg-require-iconv "CP850" } */ +/* { dg-options "-finput-charset=CP850 -fdiagnostics-format=sarif-file" } */ +/* { dg-excess-errors "The error is sent to the SARIF file, rather than stderr" } */ + +/* Test that diagnostics are converted to UTF-8; this file is encoded in + CP850. + + The non-ASCII byte here is 0xf5, which when decoded as CP850 + is U+00A7 SECTION SIGN */ +const char *section = "" + +/* + { dg-final { verify-sarif-file } } + + Verify that we captured the expected warning, and converted the snippet to + UTF-8 on output. + + { dg-final { scan-sarif-file {"text": "expected ',' or ';' at end of input"} } } + { dg-final { scan-sarif-file {"text": "const char .section = \\"\u00a7\\"} } } +*/ diff --git a/gcc/testsuite/gcc.dg/plugin/crash-test-ice-sarif.c b/gcc/testsuite/gcc.dg/plugin/crash-test-ice-sarif.c index 9186a3262ca..3b773a9a84c 100644 --- a/gcc/testsuite/gcc.dg/plugin/crash-test-ice-sarif.c +++ b/gcc/testsuite/gcc.dg/plugin/crash-test-ice-sarif.c @@ -10,6 +10,7 @@ void test_inject_ice (void) } /* Verify that some JSON was written to a file with the expected name. */ +/* { dg-final { verify-sarif-file } } */ /* We expect various properties. The indentation here reflects the expected hierarchy, though these tests diff --git a/gcc/testsuite/gcc.dg/plugin/crash-test-write-though-null-sarif.c b/gcc/testsuite/gcc.dg/plugin/crash-test-write-though-null-sarif.c index 99de3f888d4..57caa20155f 100644 --- a/gcc/testsuite/gcc.dg/plugin/crash-test-write-though-null-sarif.c +++ b/gcc/testsuite/gcc.dg/plugin/crash-test-write-though-null-sarif.c @@ -10,6 +10,7 @@ void test_inject_write_through_null (void) } /* Verify that some JSON was written to a file with the expected name. */ +/* { dg-final { verify-sarif-file } } */ /* We expect various properties. The indentation here reflects the expected hierarchy, though these tests diff --git a/gcc/testsuite/gcc.dg/plugin/diagnostic-test-paths-5.c b/gcc/testsuite/gcc.dg/plugin/diagnostic-test-paths-5.c index bd09391a8b2..27851d1b9f7 100644 --- a/gcc/testsuite/gcc.dg/plugin/diagnostic-test-paths-5.c +++ b/gcc/testsuite/gcc.dg/plugin/diagnostic-test-paths-5.c @@ -34,6 +34,8 @@ make_a_list_of_random_ints_badly(PyObject *self, } /* + { dg-final { verify-sarif-file } } + { dg-final { scan-sarif-file "\"tool\": " } } We expect info about the plugin: diff --git a/gcc/testsuite/lib/scansarif.exp b/gcc/testsuite/lib/scansarif.exp index 05524aa6e7a..83e76c31de7 100644 --- a/gcc/testsuite/lib/scansarif.exp +++ b/gcc/testsuite/lib/scansarif.exp @@ -51,3 +51,32 @@ proc scan-sarif-file-not { args } { dg-scan "scan-sarif-file-not" 0 $testcase $output_file $args } + +# Perform validity checks on the .sarif file produced by the compiler. +# +# Assuming python3 is available, use verify-sarif-file.py to check +# that the .sarif file is UTF-8 encoded and is parseable as JSON. + +proc verify-sarif-file { args } { + global srcdir subdir + + set testcase [testname-for-summary] + set filename [lindex $testcase 0] + set output_file "[file tail $filename].sarif" + + if { ![check_effective_target_recent_python3] } { + unsupported "$testcase verify-sarif-file: python3 is missing" + return + } + + # Verify that the file is correctly encoded and is parseable as JSON. + set script_name $srcdir/lib/verify-sarif-file.py + set what "$testcase (test .sarif output for UTF-8-encoded parseable JSON)" + if [catch {exec python3 $script_name $output_file} res ] { + verbose "verify-sarif-file: res: $res" 2 + fail "$what" + return + } else { + pass "$what" + } +} diff --git a/gcc/testsuite/lib/verify-sarif-file.py b/gcc/testsuite/lib/verify-sarif-file.py new file mode 100644 index 00000000000..f1833f3016e --- /dev/null +++ b/gcc/testsuite/lib/verify-sarif-file.py @@ -0,0 +1,11 @@ +# Verify that ARGV[1] is UTF-8 encoded and parseable as JSON +# For use by the verify-sarif-file directive + +import json +import sys + +sys.tracebacklimit = 0 + +fname = sys.argv[1] +with open(fname, encoding="utf-8") as f: + json.load(f) diff --git a/libcpp/charset.cc b/libcpp/charset.cc index 3c47d4f868b..d7f323b2cd5 100644 --- a/libcpp/charset.cc +++ b/libcpp/charset.cc @@ -1864,6 +1864,33 @@ _cpp_valid_utf8 (cpp_reader *pfile, return true; } +/* Return true iff BUFFER of size NUM_BYTES is validly-encoded UTF-8. */ + +extern bool +cpp_valid_utf8_p (const char *buffer, size_t num_bytes) +{ + const uchar *iter = (const uchar *)buffer; + size_t bytesleft = num_bytes; + while (bytesleft > 0) + { + /* one_utf8_to_cppchar implements 5-byte and 6 byte sequences as per + RFC 2279, but this has been superceded by RFC 3629, which + restricts UTF-8 to 1-byte through 4-byte sequences, and + states "the octet values C0, C1, F5 to FF never appear". + + Reject such values. */ + if (*iter >= 0xf4) + return false; + + cppchar_t cp; + int err = one_utf8_to_cppchar (&iter, &bytesleft, &cp); + if (err) + return false; + } + /* No problems encountered. */ + return true; +} + /* Subroutine of convert_hex and convert_oct. N is the representation in the execution character set of a numeric escape; write it into the string buffer TBUF and update the end-of-string pointer therein. WIDE diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h index 8df071e1587..a6f0abd894c 100644 --- a/libcpp/include/cpplib.h +++ b/libcpp/include/cpplib.h @@ -1600,5 +1600,6 @@ int cpp_wcwidth (cppchar_t c); bool cpp_input_conversion_is_trivial (const char *input_charset); int cpp_check_utf8_bom (const char *data, size_t data_length); +bool cpp_valid_utf8_p (const char *data, size_t num_bytes); #endif /* ! LIBCPP_CPPLIB_H */