diff --git a/gcc/diagnostic-format-sarif.cc b/gcc/diagnostic-format-sarif.cc
index 2c48cbd46e2..fd29ac2ca3b 100644
--- a/gcc/diagnostic-format-sarif.cc
+++ b/gcc/diagnostic-format-sarif.cc
@@ -1390,76 +1390,25 @@ sarif_builder::make_artifact_object (const char *filename)
   return artifact_obj;
 }
 
-/* Read all data from F_IN until EOF.
-   Return a NULL-terminated buffer containing the data, which must be
-   freed by the caller.
-   Return NULL on errors.  */
-
-static char *
-read_until_eof (FILE *f_in)
-{
-  /* Read content, allocating a buffer for it.  */
-  char *result = NULL;
-  size_t total_sz = 0;
-  size_t alloc_sz = 0;
-  char buf[4096];
-  size_t iter_sz_in;
-
-  while ( (iter_sz_in = fread (buf, 1, sizeof (buf), f_in)) )
-    {
-      gcc_assert (alloc_sz >= total_sz);
-      size_t old_total_sz = total_sz;
-      total_sz += iter_sz_in;
-      /* Allow 1 extra byte for 0-termination.  */
-      if (alloc_sz < (total_sz + 1))
-	{
-	  size_t new_alloc_sz = alloc_sz ? alloc_sz * 2: total_sz + 1;
-	  result = (char *)xrealloc (result, new_alloc_sz);
-	  alloc_sz = new_alloc_sz;
-	}
-      memcpy (result + old_total_sz, buf, iter_sz_in);
-    }
-
-  if (!feof (f_in))
-    return NULL;
-
-  /* 0-terminate the buffer.  */
-  gcc_assert (total_sz < alloc_sz);
-  result[total_sz] = '\0';
-
-  return result;
-}
-
-/* Read all data from FILENAME until EOF.
-   Return a NULL-terminated buffer containing the data, which must be
-   freed by the caller.
-   Return NULL on errors.  */
-
-static char *
-maybe_read_file (const char *filename)
-{
-  FILE *f_in = fopen (filename, "r");
-  if (!f_in)
-    return NULL;
-  char *result = read_until_eof (f_in);
-  fclose (f_in);
-  return result;
-}
-
 /* Make an artifactContent object (SARIF v2.1.0 section 3.3) for the
    full contents of FILENAME.  */
 
 json::object *
 sarif_builder::maybe_make_artifact_content_object (const char *filename) const
 {
-  char *text_utf8 = maybe_read_file (filename);
-  if (!text_utf8)
+  /* Let input.cc handle any charset conversion.  */
+  char_span utf8_content = get_source_file_content (filename);
+  if (!utf8_content)
+    return NULL;
+
+  /* Don't add it if it's not valid UTF-8.  */
+  if (!cpp_valid_utf8_p(utf8_content.get_buffer (), utf8_content.length ()))
     return NULL;
 
   json::object *artifact_content_obj = new json::object ();
-  artifact_content_obj->set ("text", new json::string (text_utf8));
-  free (text_utf8);
-
+  artifact_content_obj->set ("text",
+			     new json::string (utf8_content.get_buffer (),
+					       utf8_content.length ()));
   return artifact_content_obj;
 }
 
@@ -1501,6 +1450,13 @@ sarif_builder::maybe_make_artifact_content_object (const char *filename,
   if (!text_utf8)
     return NULL;
 
+  /* Don't add it if it's not valid UTF-8.  */
+  if (!cpp_valid_utf8_p(text_utf8, strlen(text_utf8)))
+    {
+      free (text_utf8);
+      return NULL;
+    }
+
   json::object *artifact_content_obj = new json::object ();
   artifact_content_obj->set ("text", new json::string (text_utf8));
   free (text_utf8);
diff --git a/gcc/input.cc b/gcc/input.cc
index 9702dbd39f0..eaf301ec7c1 100644
--- a/gcc/input.cc
+++ b/gcc/input.cc
@@ -67,6 +67,7 @@ public:
   {
     return m_missing_trailing_newline;
   }
+  char_span get_full_file_content ();
 
   void inc_use_count () { m_use_count++; }
 
@@ -459,6 +460,20 @@ file_cache::add_file (const char *file_path)
   return r;
 }
 
+/* Get a borrowed char_span to the full content of this file
+   as decoded according to the input charset, encoded as UTF-8.  */
+
+char_span
+file_cache_slot::get_full_file_content ()
+{
+  char *line;
+  ssize_t line_len;
+  while (get_next_line (&line, &line_len))
+    {
+    }
+  return char_span (m_data, m_nb_read);
+}
+
 /* Populate this slot for use on FILE_PATH and FP, dropping any
    existing cached content within it.  */
 
@@ -1047,6 +1062,18 @@ get_source_text_between (location_t start, location_t end)
   return xstrdup (buf);
 }
 
+/* Get a borrowed char_span to the full content of FILE_PATH
+   as decoded according to the input charset, encoded as UTF-8.  */
+
+char_span
+get_source_file_content (const char *file_path)
+{
+  diagnostic_file_cache_init ();
+
+  file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path);
+  return c->get_full_file_content ();
+}
+
 /* Determine if FILE_PATH missing a trailing newline on its final line.
    Only valid to call once all of the file has been loaded, by
    requesting a line number beyond the end of the file.  */
@@ -4045,7 +4072,104 @@ void test_cpp_utf8 ()
 	  ASSERT_EQ (byte_col2, byte_col);
       }
   }
+}
 
+static bool
+check_cpp_valid_utf8_p (const char *str)
+{
+  return cpp_valid_utf8_p (str, strlen (str));
+}
+
+/* Check that cpp_valid_utf8_p works as expected.  */
+
+static void
+test_cpp_valid_utf8_p ()
+{
+  ASSERT_TRUE (check_cpp_valid_utf8_p ("hello world"));
+
+  /* 2-byte char (pi).  */
+  ASSERT_TRUE (check_cpp_valid_utf8_p("\xcf\x80"));
+
+  /* 3-byte chars (the Japanese word "mojibake").  */
+  ASSERT_TRUE (check_cpp_valid_utf8_p
+	       (
+		/* U+6587 CJK UNIFIED IDEOGRAPH-6587
+		   UTF-8: 0xE6 0x96 0x87
+		   C octal escaped UTF-8: \346\226\207.  */
+		"\346\226\207"
+		/* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
+		   UTF-8: 0xE5 0xAD 0x97
+		   C octal escaped UTF-8: \345\255\227.  */
+		"\345\255\227"
+		/* U+5316 CJK UNIFIED IDEOGRAPH-5316
+		   UTF-8: 0xE5 0x8C 0x96
+		   C octal escaped UTF-8: \345\214\226.  */
+		"\345\214\226"
+		/* U+3051 HIRAGANA LETTER KE
+		   UTF-8: 0xE3 0x81 0x91
+		   C octal escaped UTF-8: \343\201\221.  */
+		"\343\201\221"));
+
+  /* 4-byte char: an emoji.  */
+  ASSERT_TRUE (check_cpp_valid_utf8_p ("\xf0\x9f\x98\x82"));
+
+  /* Control codes, including the NUL byte.  */
+  ASSERT_TRUE (cpp_valid_utf8_p ("\r\n\v\0\1", 5));
+
+  ASSERT_FALSE (check_cpp_valid_utf8_p ("\xf0!\x9f!\x98!\x82!"));
+
+  /* Unexpected continuation bytes.  */
+  for (unsigned char continuation_byte = 0x80;
+       continuation_byte <= 0xbf;
+       continuation_byte++)
+    ASSERT_FALSE (cpp_valid_utf8_p ((const char *)&continuation_byte, 1));
+
+  /* "Lonely start characters" for 2-byte sequences.  */
+  {
+    unsigned char buf[2];
+    buf[1] = ' ';
+    for (buf[0] = 0xc0;
+	 buf[0] <= 0xdf;
+	 buf[0]++)
+      ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
+  }
+
+  /* "Lonely start characters" for 3-byte sequences.  */
+  {
+    unsigned char buf[2];
+    buf[1] = ' ';
+    for (buf[0] = 0xe0;
+	 buf[0] <= 0xef;
+	 buf[0]++)
+      ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
+  }
+
+  /* "Lonely start characters" for 4-byte sequences.  */
+  {
+    unsigned char buf[2];
+    buf[1] = ' ';
+    for (buf[0] = 0xf0;
+	 buf[0] <= 0xf4;
+	 buf[0]++)
+      ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
+  }
+
+  /* Invalid start characters (formerly valid for 5-byte and 6-byte
+     sequences).  */
+  {
+    unsigned char buf[2];
+    buf[1] = ' ';
+    for (buf[0] = 0xf5;
+	 buf[0] <= 0xfd;
+	 buf[0]++)
+      ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
+  }
+
+  /* Impossible bytes.  */
+  ASSERT_FALSE (check_cpp_valid_utf8_p ("\xc0"));
+  ASSERT_FALSE (check_cpp_valid_utf8_p ("\xc1"));
+  ASSERT_FALSE (check_cpp_valid_utf8_p ("\xfe"));
+  ASSERT_FALSE (check_cpp_valid_utf8_p ("\xff"));
 }
 
 /* Run all of the selftests within this file.  */
@@ -4091,6 +4215,7 @@ input_cc_tests ()
   test_line_offset_overflow ();
 
   test_cpp_utf8 ();
+  test_cpp_valid_utf8_p ();
 }
 
 } // namespace selftest
diff --git a/gcc/input.h b/gcc/input.h
index 9d68648bb3c..d1087b7a9e8 100644
--- a/gcc/input.h
+++ b/gcc/input.h
@@ -115,6 +115,7 @@ class char_span
 
 extern char_span location_get_source_line (const char *file_path, int line);
 extern char *get_source_text_between (location_t, location_t);
+extern char_span get_source_file_content (const char *file_path);
 
 extern bool location_missing_trailing_newline (const char *file_path);
 
diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-1.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-1.c
index cad53095478..f0dcaa705ca 100644
--- a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-1.c
+++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-1.c
@@ -4,6 +4,7 @@
 #warning message
 
 /* Verify that some JSON was written to a file with the expected name.  */
+/* { dg-final { verify-sarif-file } } */
 
 /* We expect various properties.
    The indentation here reflects the expected hierarchy, though these tests
diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-2.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-2.c
index 8f5814d894e..02ee33f5265 100644
--- a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-2.c
+++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-2.c
@@ -10,6 +10,8 @@ int test (void)
 }
 
 /* 
+   { dg-final { verify-sarif-file } }
+
        { dg-final { scan-sarif-file "\"level\": \"warning\"" } }
        { dg-final { scan-sarif-file "\"ruleId\": \"-Wmisleading-indentation\"" } }
          { dg-final { scan-sarif-file "\"text\": \"  if " } }
diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-3.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-3.c
index 3856782b5ea..80954711db2 100644
--- a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-3.c
+++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-3.c
@@ -10,6 +10,8 @@ int test (struct s *ptr)
 }
 
 /* 
+   { dg-final { verify-sarif-file } }
+
        { dg-final { scan-sarif-file "\"level\": \"error\"" } }
 
        We expect a logical location for the error (within fn "test"):
diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-4.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-4.c
index 2d22f54037c..bd13da71425 100644
--- a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-4.c
+++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-4.c
@@ -8,6 +8,8 @@ int test (void)
 }
 
 /* 
+   { dg-final { verify-sarif-file } }
+
        { dg-final { scan-sarif-file "\"level\": \"error\"" } }
 
        We expect the region expressed in display columns:
diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-Wbidi-chars.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-Wbidi-chars.c
new file mode 100644
index 00000000000..283df75670d
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-Wbidi-chars.c
@@ -0,0 +1,23 @@
+/* Adapted from Wbidi-chars-1.c */
+
+/* PR preprocessor/103026 */
+/* { dg-do compile } */
+/* { dg-options "-fdiagnostics-format=sarif-file" } */
+
+int main() {
+    int isAdmin = 0;
+    /*â€® } â¦if (isAdmin)â© â¦ begin admins only */
+        __builtin_printf("You are an admin.\n");
+    /* end admins only â€® { â¦*/
+    return 0;
+}
+
+/* Verify that we generate a valid UTF-8 .sarif file.
+
+     { dg-final { verify-sarif-file } }
+
+   Verify that we captured the expected warnings.
+
+     { dg-final { scan-sarif-file {"text": "unpaired UTF-8 bidirectional control characters detected"} } }
+     { dg-final { scan-sarif-file {"text": "unpaired UTF-8 bidirectional control characters detected"} } }
+*/
diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-1.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-1.c
new file mode 100644
index 00000000000..47f89232b11
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-1.c
@@ -0,0 +1,23 @@
+/* Try to process this explicitly as UTF-8.
+
+   { dg-do preprocess }
+   { dg-options "-finput-charset=UTF-8 -Winvalid-utf8 -fdiagnostics-format=sarif-file" } */
+
+/* This comment intentionally contains non-UTF-8 bytes:
+ *   €˜<unknown>€™ may be used uninitialized
+ */
+
+/* 
+   { dg-final { verify-sarif-file } }
+
+   Verify that we captured the expected warnings.
+
+     { dg-final { scan-sarif-file "\"results\": \\\[" } }
+       { dg-final { scan-sarif-file "\"level\": \"warning\"" } }
+       { dg-final { scan-sarif-file "\"ruleId\": \"-Winvalid-utf8\"" } }
+       { dg-final { scan-sarif-file "\"message\": " } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <98>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <99>"} } }
+*/
diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-2.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-2.c
new file mode 100644
index 00000000000..8395f1d8610
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-2.c
@@ -0,0 +1,16 @@
+/* Try to process this explicitly as UTF-8.  */
+
+/* { dg-do compile } */
+/* { dg-options "-finput-charset=utf-8 -fdiagnostics-format=sarif-file" } */
+/* { dg-excess-errors "The error is sent to the SARIF file, rather than stderr" } */
+
+const char *section = "þ"
+
+/* The above in quotes is byte 0xFE which is not valid in UTF-8.
+   Verify that we can generate a valid UTF-8 .sarif file complaining
+   about the missing semicolon above.  */
+
+/* { dg-final { verify-sarif-file } }
+
+     { dg-final { scan-sarif-file {"text": "expected ',' or ';' at end of input"} } }
+*/
diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-3.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-3.c
new file mode 100644
index 00000000000..ead03a52fd6
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-3.c
@@ -0,0 +1,95 @@
+/* Adapted from cpp/Winvalid-utf8-1.c
+
+   P2295R6 - Support for UTF-8 as a portable source file encoding
+   This test intentionally contains various byte sequences which are not valid UTF-8
+   { dg-do preprocess }
+   { dg-options "-finput-charset=UTF-8 -Winvalid-utf8 -fdiagnostics-format=sarif-file" } */
+
+// aÂ€ß¿à €íŸ¿î€€ð€€ô¿¿a
+// a€a
+// a¿a
+// aÀa
+// aÁa
+// aõa
+// aÿa
+// aÂa
+// aàa
+// aà€¿a
+// aàŸ€a
+// aà¿a
+// aì€a
+// aí €a
+// að€€€a
+// að¿¿a
+// aô€€a
+// aý¿¿¿¿¿
+/* aÂ€ß¿à €íŸ¿î€€ð€€ô¿¿a */
+/* a€a */
+/* a¿a */
+/* aÀa */
+/* aÁa */
+/* aõa */
+/* aÿa */
+/* aÂa */
+/* aàa */
+/* aà€¿a */
+/* aàŸ€a */
+/* aà¿a */
+/* aì€a */
+/* aí €a */
+/* að€€€a */
+/* að¿¿a */
+/* aô€€a */
+/* aý¿¿¿¿¿a */
+
+
+
+/* Verify that we generate a valid UTF-8 .sarif file.
+
+     { dg-final { verify-sarif-file } }
+
+   Verify that we captured the expected warnings.
+
+     { dg-final { scan-sarif-file "\"results\": \\\[" } }
+       { dg-final { scan-sarif-file "\"level\": \"warning\"" } }
+       { dg-final { scan-sarif-file "\"ruleId\": \"-Winvalid-utf8\"" } }
+       { dg-final { scan-sarif-file "\"message\": " } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <c0>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <c1>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f5>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <ff>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <c2>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0><80><bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0><9f><80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0><bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <ec><80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <ed><a0><80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f0><80><80><80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f0><8f><bf><bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f4><90><80><80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <fd><bf><bf><bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <c0>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <c1>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f5>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <ff>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <c2>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0><80><bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0><9f><80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0><bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <ec><80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <ed><a0><80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f0><80><80><80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f0><8f><bf><bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f4><90><80><80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <fd><bf><bf><bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <bf>"} } }
+*/
diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-valid-CP850.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-valid-CP850.c
new file mode 100644
index 00000000000..a189274692e
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-valid-CP850.c
@@ -0,0 +1,22 @@
+/* Adapted from gcc.dg/diagnostic-input-charset-1.c  */
+/* { dg-do compile } */
+/* { dg-require-iconv "CP850" } */
+/* { dg-options "-finput-charset=CP850 -fdiagnostics-format=sarif-file" } */
+/* { dg-excess-errors "The error is sent to the SARIF file, rather than stderr" } */
+
+/* Test that diagnostics are converted to UTF-8; this file is encoded in
+   CP850.
+
+   The non-ASCII byte here is 0xf5, which when decoded as CP850
+   is U+00A7 SECTION SIGN  */
+const char *section = "õ"
+
+/* 
+   { dg-final { verify-sarif-file } }
+
+   Verify that we captured the expected warning, and converted the snippet to
+   UTF-8 on output.
+
+   { dg-final { scan-sarif-file {"text": "expected ',' or ';' at end of input"} } }
+   { dg-final { scan-sarif-file {"text": "const char .section = \\"\u00a7\\"} } }
+*/
diff --git a/gcc/testsuite/gcc.dg/plugin/crash-test-ice-sarif.c b/gcc/testsuite/gcc.dg/plugin/crash-test-ice-sarif.c
index 9186a3262ca..3b773a9a84c 100644
--- a/gcc/testsuite/gcc.dg/plugin/crash-test-ice-sarif.c
+++ b/gcc/testsuite/gcc.dg/plugin/crash-test-ice-sarif.c
@@ -10,6 +10,7 @@ void test_inject_ice (void)
 }
 
 /* Verify that some JSON was written to a file with the expected name.  */
+/* { dg-final { verify-sarif-file } } */
 
 /* We expect various properties.
    The indentation here reflects the expected hierarchy, though these tests
diff --git a/gcc/testsuite/gcc.dg/plugin/crash-test-write-though-null-sarif.c b/gcc/testsuite/gcc.dg/plugin/crash-test-write-though-null-sarif.c
index 99de3f888d4..57caa20155f 100644
--- a/gcc/testsuite/gcc.dg/plugin/crash-test-write-though-null-sarif.c
+++ b/gcc/testsuite/gcc.dg/plugin/crash-test-write-though-null-sarif.c
@@ -10,6 +10,7 @@ void test_inject_write_through_null (void)
 }
 
 /* Verify that some JSON was written to a file with the expected name.  */
+/* { dg-final { verify-sarif-file } } */
 
 /* We expect various properties.
    The indentation here reflects the expected hierarchy, though these tests
diff --git a/gcc/testsuite/gcc.dg/plugin/diagnostic-test-paths-5.c b/gcc/testsuite/gcc.dg/plugin/diagnostic-test-paths-5.c
index bd09391a8b2..27851d1b9f7 100644
--- a/gcc/testsuite/gcc.dg/plugin/diagnostic-test-paths-5.c
+++ b/gcc/testsuite/gcc.dg/plugin/diagnostic-test-paths-5.c
@@ -34,6 +34,8 @@ make_a_list_of_random_ints_badly(PyObject *self,
 }
 
 /* 
+   { dg-final { verify-sarif-file } }
+
    { dg-final { scan-sarif-file "\"tool\": " } }
 
      We expect info about the plugin:
diff --git a/gcc/testsuite/lib/scansarif.exp b/gcc/testsuite/lib/scansarif.exp
index 05524aa6e7a..83e76c31de7 100644
--- a/gcc/testsuite/lib/scansarif.exp
+++ b/gcc/testsuite/lib/scansarif.exp
@@ -51,3 +51,32 @@ proc scan-sarif-file-not { args } {
 
     dg-scan "scan-sarif-file-not" 0 $testcase $output_file $args
 }
+
+# Perform validity checks on the .sarif file produced by the compiler.
+#
+# Assuming python3 is available, use verify-sarif-file.py to check
+# that the .sarif file is UTF-8 encoded and is parseable as JSON.
+
+proc verify-sarif-file { args } {
+    global srcdir subdir
+
+    set testcase [testname-for-summary]
+    set filename [lindex $testcase 0]
+    set output_file "[file tail $filename].sarif"
+
+    if { ![check_effective_target_recent_python3] } {
+	unsupported "$testcase verify-sarif-file: python3 is missing"
+	return
+    }
+
+    # Verify that the file is correctly encoded and is parseable as JSON.
+    set script_name $srcdir/lib/verify-sarif-file.py
+    set what "$testcase (test .sarif output for UTF-8-encoded parseable JSON)"
+    if [catch {exec python3 $script_name $output_file} res ] {
+	verbose "verify-sarif-file: res: $res" 2
+	fail "$what"
+	return
+    } else {
+	pass "$what"
+    }
+}
diff --git a/gcc/testsuite/lib/verify-sarif-file.py b/gcc/testsuite/lib/verify-sarif-file.py
new file mode 100644
index 00000000000..f1833f3016e
--- /dev/null
+++ b/gcc/testsuite/lib/verify-sarif-file.py
@@ -0,0 +1,11 @@
+# Verify that ARGV[1] is UTF-8 encoded and parseable as JSON
+# For use by the verify-sarif-file directive
+
+import json
+import sys
+
+sys.tracebacklimit = 0
+
+fname = sys.argv[1]
+with open(fname, encoding="utf-8") as f:
+    json.load(f)
diff --git a/libcpp/charset.cc b/libcpp/charset.cc
index 3c47d4f868b..d7f323b2cd5 100644
--- a/libcpp/charset.cc
+++ b/libcpp/charset.cc
@@ -1864,6 +1864,33 @@ _cpp_valid_utf8 (cpp_reader *pfile,
   return true;
 }
 
+/* Return true iff BUFFER of size NUM_BYTES is validly-encoded UTF-8.  */
+
+extern bool
+cpp_valid_utf8_p (const char *buffer, size_t num_bytes)
+{
+  const uchar *iter = (const uchar *)buffer;
+  size_t bytesleft = num_bytes;
+  while (bytesleft > 0)
+    {
+      /* one_utf8_to_cppchar implements 5-byte and 6 byte sequences as per
+	 RFC 2279, but this has been superceded by RFC 3629, which
+	 restricts UTF-8 to 1-byte through 4-byte sequences, and
+	 states "the octet values C0, C1, F5 to FF never appear".
+
+	 Reject such values.  */
+      if (*iter >= 0xf4)
+	return false;
+
+      cppchar_t cp;
+      int err = one_utf8_to_cppchar (&iter, &bytesleft, &cp);
+      if (err)
+	return false;
+    }
+  /* No problems encountered.  */
+  return true;
+}
+
 /* Subroutine of convert_hex and convert_oct.  N is the representation
    in the execution character set of a numeric escape; write it into the
    string buffer TBUF and update the end-of-string pointer therein.  WIDE
diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h
index 8df071e1587..a6f0abd894c 100644
--- a/libcpp/include/cpplib.h
+++ b/libcpp/include/cpplib.h
@@ -1600,5 +1600,6 @@ int cpp_wcwidth (cppchar_t c);
 
 bool cpp_input_conversion_is_trivial (const char *input_charset);
 int cpp_check_utf8_bom (const char *data, size_t data_length);
+bool cpp_valid_utf8_p (const char *data, size_t num_bytes);
 
 #endif /* ! LIBCPP_CPPLIB_H */