mirror of
https://gcc.gnu.org/git/gcc.git
synced 2024-11-24 03:14:08 +08:00
diagnostics: Support for -finput-charset [PR93067]
Adds the logic to handle -finput-charset in layout_get_source_line(), so that source lines are converted from their input encodings prior to being output by diagnostics machinery. Also adds the ability to strip a UTF-8 BOM similarly. gcc/c-family/ChangeLog: PR other/93067 * c-opts.c (c_common_input_charset_cb): New function. (c_common_post_options): Call new function diagnostic_initialize_input_context(). gcc/d/ChangeLog: PR other/93067 * d-lang.cc (d_input_charset_callback): New function. (d_init): Call new function diagnostic_initialize_input_context(). gcc/fortran/ChangeLog: PR other/93067 * cpp.c (gfc_cpp_post_options): Call new function diagnostic_initialize_input_context(). gcc/ChangeLog: PR other/93067 * coretypes.h (typedef diagnostic_input_charset_callback): Declare. * diagnostic.c (diagnostic_initialize_input_context): New function. * diagnostic.h (diagnostic_initialize_input_context): Declare. * input.c (default_charset_callback): New function. (file_cache::initialize_input_context): New function. (file_cache_slot::create): Added ability to convert the input according to the input context. (file_cache::file_cache): Initialize the new input context. (class file_cache_slot): Added new m_alloc_offset member. (file_cache_slot::file_cache_slot): Initialize the new member. (file_cache_slot::~file_cache_slot): Handle potentially offset buffer. (file_cache_slot::maybe_grow): Likewise. (file_cache_slot::needs_read_p): Handle NULL fp, which is now possible. (file_cache_slot::get_next_line): Likewise. * input.h (class file_cache): Added input context member. libcpp/ChangeLog: PR other/93067 * charset.c (init_iconv_desc): Adapt to permit PFILE argument to be NULL. (_cpp_convert_input): Likewise. Also move UTF-8 BOM logic to... (cpp_check_utf8_bom): ...here. New function. (cpp_input_conversion_is_trivial): New function. * files.c (read_file_guts): Allow PFILE argument to be NULL. Add INPUT_CHARSET argument as an alternate source of this information. 
(read_file): Pass the new argument to read_file_guts. (cpp_get_converted_source): New function. * include/cpplib.h (struct cpp_converted_source): Declare. (cpp_get_converted_source): Declare. (cpp_input_conversion_is_trivial): Declare. (cpp_check_utf8_bom): Declare. gcc/testsuite/ChangeLog: PR other/93067 * gcc.dg/diagnostic-input-charset-1.c: New test. * gcc.dg/diagnostic-input-utf8-bom.c: New test.
This commit is contained in:
parent
43a5d46fea
commit
3ac6b5cff1
@ -188,6 +188,14 @@ c_common_diagnostics_set_defaults (diagnostic_context *context)
|
||||
context->opt_permissive = OPT_fpermissive;
|
||||
}
|
||||
|
||||
/* Input charset configuration for diagnostics. */
|
||||
static const char *
|
||||
c_common_input_charset_cb (const char * /*filename*/)
|
||||
{
|
||||
const char *cs = cpp_opts->input_charset;
|
||||
return cpp_input_conversion_is_trivial (cs) ? nullptr : cs;
|
||||
}
|
||||
|
||||
/* Whether options from all C-family languages should be accepted
|
||||
quietly. */
|
||||
static bool accept_all_c_family_options = false;
|
||||
@ -1136,6 +1144,11 @@ c_common_post_options (const char **pfilename)
|
||||
cpp_post_options (parse_in);
|
||||
init_global_opts_from_cpp (&global_options, cpp_get_options (parse_in));
|
||||
|
||||
/* Let diagnostics infrastructure know how to convert input files the same
|
||||
way libcpp will do it, namely using the configured input charset and
|
||||
skipping a UTF-8 BOM if present. */
|
||||
diagnostic_initialize_input_context (global_dc,
|
||||
c_common_input_charset_cb, true);
|
||||
input_location = UNKNOWN_LOCATION;
|
||||
|
||||
*pfilename = this_input_filename
|
||||
|
@ -154,6 +154,7 @@ struct cl_option_handlers;
|
||||
struct diagnostic_context;
|
||||
class pretty_printer;
|
||||
class diagnostic_event_id_t;
|
||||
typedef const char * (*diagnostic_input_charset_callback)(const char *);
|
||||
|
||||
template<typename T> struct array_traits;
|
||||
|
||||
|
@ -50,6 +50,7 @@ along with GCC; see the file COPYING3. If not see
|
||||
#include "output.h"
|
||||
#include "print-tree.h"
|
||||
#include "debug.h"
|
||||
#include "input.h"
|
||||
|
||||
#include "d-tree.h"
|
||||
#include "id.h"
|
||||
@ -362,6 +363,19 @@ d_option_lang_mask (void)
|
||||
return CL_D;
|
||||
}
|
||||
|
||||
/* Charset callback registered with the diagnostics machinery for D.  The
   file name argument is ignored and the answer is always "no conversion".  */

static const char *
d_input_charset_callback (const char * /*filename*/)
{
  /* TODO: dmd/dmodule.c works out the input charset automatically from the
     contents of the file.  Were that detection logic factored out so that it
     could be reused here, this callback could report UTF-16 or UTF-32 as
     needed.  For now the answer is always NULL, which means no conversion is
     necessary, i.e. the input is assumed to be UTF-8 when diagnostics read
     the file.  */
  const char *const no_conversion = nullptr;
  return no_conversion;
}
|
||||
|
||||
/* Implements the lang_hooks.init routine for language D. */
|
||||
|
||||
static bool
|
||||
@ -373,6 +387,11 @@ d_init (void)
|
||||
Expression::_init ();
|
||||
Objc::_init ();
|
||||
|
||||
/* Diagnostics input init, to enable BOM skipping and
|
||||
input charset conversion. */
|
||||
diagnostic_initialize_input_context (global_dc,
|
||||
d_input_charset_callback, true);
|
||||
|
||||
/* Back-end init. */
|
||||
global_binding_level = ggc_cleared_alloc <binding_level> ();
|
||||
current_binding_level = global_binding_level;
|
||||
|
@ -293,6 +293,17 @@ diagnostic_urls_init (diagnostic_context *context, int value /*= -1 */)
|
||||
= determine_url_format ((diagnostic_url_rule_t) value);
|
||||
}
|
||||
|
||||
/* Create the file_cache, if not already created, and tell it how to
|
||||
translate files on input. */
|
||||
void diagnostic_initialize_input_context (diagnostic_context *context,
|
||||
diagnostic_input_charset_callback ccb,
|
||||
bool should_skip_bom)
|
||||
{
|
||||
if (!context->m_file_cache)
|
||||
context->m_file_cache = new file_cache;
|
||||
context->m_file_cache->initialize_input_context (ccb, should_skip_bom);
|
||||
}
|
||||
|
||||
/* Do any cleaning up required after the last diagnostic is emitted. */
|
||||
|
||||
void
|
||||
|
@ -446,6 +446,25 @@ extern void diagnostic_show_locus (diagnostic_context *,
|
||||
diagnostic_t diagnostic_kind);
|
||||
extern void diagnostic_show_any_path (diagnostic_context *, diagnostic_info *);
|
||||
|
||||
/* Because we read source files a second time after the frontend did it the
|
||||
first time, we need to know how the frontend handled things like character
|
||||
set conversion and UTF-8 BOM stripping, in order to make everything
|
||||
consistent. This function needs to be called by each frontend that requires
|
||||
non-default behavior, to inform the diagnostics infrastructure how input is
|
||||
to be processed. The default behavior is to do no conversion and not to
|
||||
strip a UTF-8 BOM.
|
||||
|
||||
The callback should return the input charset to be used to convert the given
|
||||
file's contents to UTF-8, or it should return NULL if no conversion is needed
|
||||
for this file. SHOULD_SKIP_BOM only applies in case no conversion was
|
||||
performed, and if true, it will cause a UTF-8 BOM to be skipped at the
|
||||
beginning of the file. (In case a conversion was performed, the BOM is
|
||||
rather skipped as part of the conversion process.) */
|
||||
|
||||
void diagnostic_initialize_input_context (diagnostic_context *context,
|
||||
diagnostic_input_charset_callback ccb,
|
||||
bool should_skip_bom);
|
||||
|
||||
/* Force diagnostics controlled by OPTIDX to be kind KIND. */
|
||||
extern diagnostic_t diagnostic_classify_diagnostic (diagnostic_context *,
|
||||
int /* optidx */,
|
||||
|
@ -493,6 +493,12 @@ gfc_cpp_post_options (void)
|
||||
|
||||
cpp_post_options (cpp_in);
|
||||
|
||||
|
||||
/* Let diagnostics infrastructure know how to convert input files the same
|
||||
way libcpp will do it, namely, with no charset conversion but with
|
||||
skipping of a UTF-8 BOM if present. */
|
||||
diagnostic_initialize_input_context (global_dc, nullptr, true);
|
||||
|
||||
gfc_cpp_register_include_paths ();
|
||||
}
|
||||
|
||||
|
100
gcc/input.c
100
gcc/input.c
@ -22,7 +22,6 @@ along with GCC; see the file COPYING3. If not see
|
||||
#include "coretypes.h"
|
||||
#include "intl.h"
|
||||
#include "diagnostic.h"
|
||||
#include "diagnostic-core.h"
|
||||
#include "selftest.h"
|
||||
#include "cpplib.h"
|
||||
|
||||
@ -30,6 +29,20 @@ along with GCC; see the file COPYING3. If not see
|
||||
#define HAVE_ICONV 0
|
||||
#endif
|
||||
|
||||
/* Input charset configuration.  This is the fallback charset callback used
   by file_cache::initialize_input_context when it is given no callback;
   it returns nullptr for every file.  */

static const char *
default_charset_callback (const char *)
{
  return nullptr;
}
|
||||
|
||||
void
|
||||
file_cache::initialize_input_context (diagnostic_input_charset_callback ccb,
|
||||
bool should_skip_bom)
|
||||
{
|
||||
in_context.ccb = (ccb ? ccb : default_charset_callback);
|
||||
in_context.should_skip_bom = should_skip_bom;
|
||||
}
|
||||
|
||||
/* This is a cache used by get_next_line to store the content of a
|
||||
file to be searched for file lines. */
|
||||
class file_cache_slot
|
||||
@ -51,7 +64,8 @@ public:
|
||||
|
||||
void inc_use_count () { m_use_count++; }
|
||||
|
||||
void create (const char *file_path, FILE *fp, unsigned highest_use_count);
|
||||
bool create (const file_cache::input_context &in_context,
|
||||
const char *file_path, FILE *fp, unsigned highest_use_count);
|
||||
void evict ();
|
||||
|
||||
private:
|
||||
@ -110,6 +124,10 @@ public:
|
||||
far. */
|
||||
char *m_data;
|
||||
|
||||
/* The allocated buffer to be freed may start a little earlier than DATA,
|
||||
e.g. if a UTF8 BOM was skipped at the beginning. */
|
||||
int m_alloc_offset;
|
||||
|
||||
/* The size of the DATA array above.*/
|
||||
size_t m_size;
|
||||
|
||||
@ -147,6 +165,17 @@ public:
|
||||
doesn't explode. We thus scale total_lines down to
|
||||
line_record_size. */
|
||||
vec<line_info, va_heap> m_line_record;
|
||||
|
||||
void offset_buffer (int offset)
|
||||
{
|
||||
gcc_assert (offset < 0 ? m_alloc_offset + offset >= 0
|
||||
: (size_t) offset <= m_size);
|
||||
gcc_assert (m_data);
|
||||
m_alloc_offset += offset;
|
||||
m_data += offset;
|
||||
m_size -= offset;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
/* Current position in real source file. */
|
||||
@ -419,21 +448,25 @@ file_cache::add_file (const char *file_path)
|
||||
|
||||
unsigned highest_use_count = 0;
|
||||
file_cache_slot *r = evicted_cache_tab_entry (&highest_use_count);
|
||||
r->create (file_path, fp, highest_use_count);
|
||||
if (!r->create (in_context, file_path, fp, highest_use_count))
|
||||
return NULL;
|
||||
return r;
|
||||
}
|
||||
|
||||
/* Populate this slot for use on FILE_PATH and FP, dropping any
|
||||
existing cached content within it. */
|
||||
|
||||
void
|
||||
file_cache_slot::create (const char *file_path, FILE *fp,
|
||||
bool
|
||||
file_cache_slot::create (const file_cache::input_context &in_context,
|
||||
const char *file_path, FILE *fp,
|
||||
unsigned highest_use_count)
|
||||
{
|
||||
m_file_path = file_path;
|
||||
if (m_fp)
|
||||
fclose (m_fp);
|
||||
m_fp = fp;
|
||||
if (m_alloc_offset)
|
||||
offset_buffer (-m_alloc_offset);
|
||||
m_nb_read = 0;
|
||||
m_line_start_idx = 0;
|
||||
m_line_num = 0;
|
||||
@ -443,6 +476,36 @@ file_cache_slot::create (const char *file_path, FILE *fp,
|
||||
m_use_count = ++highest_use_count;
|
||||
m_total_lines = total_lines_num (file_path);
|
||||
m_missing_trailing_newline = true;
|
||||
|
||||
|
||||
/* Check the input configuration to determine if we need to do any
|
||||
transformations, such as charset conversion or BOM skipping. */
|
||||
if (const char *input_charset = in_context.ccb (file_path))
|
||||
{
|
||||
/* Need a full-blown conversion of the input charset. */
|
||||
fclose (m_fp);
|
||||
m_fp = NULL;
|
||||
const cpp_converted_source cs
|
||||
= cpp_get_converted_source (file_path, input_charset);
|
||||
if (!cs.data)
|
||||
return false;
|
||||
if (m_data)
|
||||
XDELETEVEC (m_data);
|
||||
m_data = cs.data;
|
||||
m_nb_read = m_size = cs.len;
|
||||
m_alloc_offset = cs.data - cs.to_free;
|
||||
}
|
||||
else if (in_context.should_skip_bom)
|
||||
{
|
||||
if (read_data ())
|
||||
{
|
||||
const int offset = cpp_check_utf8_bom (m_data, m_nb_read);
|
||||
offset_buffer (offset);
|
||||
m_nb_read -= offset;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* file_cache's ctor. */
|
||||
@ -450,6 +513,7 @@ file_cache_slot::create (const char *file_path, FILE *fp,
|
||||
file_cache::file_cache ()
|
||||
: m_file_slots (new file_cache_slot[num_file_slots])
|
||||
{
|
||||
/* Start from the default input context: no callback supplied (so
default_charset_callback is installed) and no BOM skipping.  */
initialize_input_context (nullptr, false);
|
||||
}
|
||||
|
||||
/* file_cache's dtor. */
|
||||
@ -478,8 +542,8 @@ file_cache::lookup_or_add_file (const char *file_path)
|
||||
|
||||
file_cache_slot::file_cache_slot ()
|
||||
: m_use_count (0), m_file_path (NULL), m_fp (NULL), m_data (0),
|
||||
m_size (0), m_nb_read (0), m_line_start_idx (0), m_line_num (0),
|
||||
m_total_lines (0), m_missing_trailing_newline (true)
|
||||
m_alloc_offset (0), m_size (0), m_nb_read (0), m_line_start_idx (0),
|
||||
m_line_num (0), m_total_lines (0), m_missing_trailing_newline (true)
|
||||
{
|
||||
m_line_record.create (0);
|
||||
}
|
||||
@ -495,6 +559,7 @@ file_cache_slot::~file_cache_slot ()
|
||||
}
|
||||
if (m_data)
|
||||
{
|
||||
offset_buffer (-m_alloc_offset);
|
||||
XDELETEVEC (m_data);
|
||||
m_data = 0;
|
||||
}
|
||||
@ -509,7 +574,7 @@ file_cache_slot::~file_cache_slot ()
|
||||
bool
|
||||
file_cache_slot::needs_read_p () const
|
||||
{
|
||||
return (m_nb_read == 0
|
||||
return m_fp && (m_nb_read == 0
|
||||
|| m_nb_read == m_size
|
||||
|| (m_line_start_idx >= m_nb_read - 1));
|
||||
}
|
||||
@ -531,9 +596,20 @@ file_cache_slot::maybe_grow ()
|
||||
if (!needs_grow_p ())
|
||||
return;
|
||||
|
||||
size_t size = m_size == 0 ? buffer_size : m_size * 2;
|
||||
m_data = XRESIZEVEC (char, m_data, size);
|
||||
m_size = size;
|
||||
if (!m_data)
|
||||
{
|
||||
gcc_assert (m_size == 0 && m_alloc_offset == 0);
|
||||
m_size = buffer_size;
|
||||
m_data = XNEWVEC (char, m_size);
|
||||
}
|
||||
else
|
||||
{
|
||||
const int offset = m_alloc_offset;
|
||||
offset_buffer (-offset);
|
||||
m_size *= 2;
|
||||
m_data = XRESIZEVEC (char, m_data, m_size);
|
||||
offset_buffer (offset);
|
||||
}
|
||||
}
|
||||
|
||||
/* Read more data into the cache. Extends the cache if need be.
|
||||
@ -632,7 +708,7 @@ file_cache_slot::get_next_line (char **line, ssize_t *line_len)
|
||||
m_missing_trailing_newline = false;
|
||||
}
|
||||
|
||||
if (ferror (m_fp))
|
||||
if (m_fp && ferror (m_fp))
|
||||
return false;
|
||||
|
||||
/* At this point, we've found the end of the line. It either
|
||||
|
10
gcc/input.h
10
gcc/input.h
@ -111,6 +111,15 @@ class file_cache
|
||||
file_cache_slot *lookup_or_add_file (const char *file_path);
|
||||
void forcibly_evict_file (const char *file_path);
|
||||
|
||||
/* See comments in diagnostic.h about the input conversion context. */
|
||||
/* Settings describing how file contents are transformed when read.  */
struct input_context
|
||||
{
|
||||
/* Per-file callback; takes a file name and returns the charset to convert
from, or NULL when no conversion is needed.  */
diagnostic_input_charset_callback ccb;
|
||||
/* The SHOULD_SKIP_BOM value given to initialize_input_context.  */
bool should_skip_bom;
|
||||
};
|
||||
void initialize_input_context (diagnostic_input_charset_callback ccb,
|
||||
bool should_skip_bom);
|
||||
|
||||
private:
|
||||
file_cache_slot *evicted_cache_tab_entry (unsigned *highest_use_count);
|
||||
file_cache_slot *add_file (const char *file_path);
|
||||
@ -119,6 +128,7 @@ class file_cache
|
||||
private:
|
||||
static const size_t num_file_slots = 16;
|
||||
file_cache_slot *m_file_slots;
|
||||
input_context in_context;
|
||||
};
|
||||
|
||||
extern expanded_location
|
||||
|
17
gcc/testsuite/gcc.dg/diagnostic-input-charset-1.c
Normal file
17
gcc/testsuite/gcc.dg/diagnostic-input-charset-1.c
Normal file
@ -0,0 +1,17 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-require-iconv "CP850" } */
|
||||
/* { dg-options "-finput-charset=CP850 -fdiagnostics-show-caret" } */
|
||||
|
||||
/* Test that diagnostics are converted to UTF-8; this file is encoded in
|
||||
CP850. Why CP850? -finput-charset only supports encodings that are a
|
||||
superset of ASCII. But encodings that look like latin-1 are automatically
|
||||
converted by expect to UTF-8, and hence by the time dg sees them, it can't
|
||||
verify they were actually output in UTF-8. So codepage 850 was chosen as one
|
||||
that is hopefully available and meets the requirements of matching ASCII and
|
||||
not matching latin-1. */
|
||||
const char *section = "õ"
|
||||
/* { dg-error "expected .* at end of input" "" { target *-*-*} .-1 } */
|
||||
/* { dg-begin-multiline-output "" }
|
||||
const char *section = "§"
|
||||
^~~~~
|
||||
{ dg-end-multiline-output "" } */
|
14
gcc/testsuite/gcc.dg/diagnostic-input-utf8-bom.c
Normal file
14
gcc/testsuite/gcc.dg/diagnostic-input-utf8-bom.c
Normal file
@ -0,0 +1,14 @@
|
||||
int 1;
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-fdiagnostics-show-caret" } */
|
||||
|
||||
/* This file begins with a UTF-8 byte order mark. Verify that diagnostics
|
||||
still point to the right place, since the stripping of the BOM happens twice,
|
||||
once when libcpp reads the file, and once when diagnostics infrastructure
|
||||
reads it. */
|
||||
|
||||
/* { dg-error "expected .* before numeric constant" "" { target *-*-*} 1 } */
|
||||
/* { dg-begin-multiline-output "" }
|
||||
int 1;
|
||||
^
|
||||
{ dg-end-multiline-output "" } */
|
109
libcpp/charset.c
109
libcpp/charset.c
@ -630,7 +630,11 @@ static const struct cpp_conversion conversion_tab[] = {
|
||||
cset_converter structure for conversion from FROM to TO. If
|
||||
iconv_open() fails, issue an error and return an identity
|
||||
converter. Silently return an identity converter if FROM and TO
|
||||
are identical. */
|
||||
are identical.
|
||||
|
||||
PFILE is only used for generating diagnostics; setting it to NULL
|
||||
suppresses diagnostics. */
|
||||
|
||||
static struct cset_converter
|
||||
init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
|
||||
{
|
||||
@ -672,25 +676,31 @@ init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
|
||||
|
||||
if (ret.cd == (iconv_t) -1)
|
||||
{
|
||||
if (errno == EINVAL)
|
||||
cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
|
||||
"conversion from %s to %s not supported by iconv",
|
||||
from, to);
|
||||
else
|
||||
cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
|
||||
|
||||
if (pfile)
|
||||
{
|
||||
if (errno == EINVAL)
|
||||
cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
|
||||
"conversion from %s to %s not supported by iconv",
|
||||
from, to);
|
||||
else
|
||||
cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
|
||||
}
|
||||
ret.func = convert_no_conversion;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
|
||||
"no iconv implementation, cannot convert from %s to %s",
|
||||
from, to);
|
||||
if (pfile)
|
||||
{
|
||||
cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
|
||||
"no iconv implementation, cannot convert from %s to %s",
|
||||
from, to);
|
||||
}
|
||||
ret.func = convert_no_conversion;
|
||||
ret.cd = (iconv_t) -1;
|
||||
ret.width = -1;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -2122,6 +2132,25 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
|
||||
buf, bufp - buf, HT_ALLOC));
|
||||
}
|
||||
|
||||
|
||||
/* Detect a UTF-8 byte order mark at the start of DATA, which holds
   DATA_LENGTH bytes.  Nothing is modified; the return value is the number
   of leading bytes the caller should skip, currently 3 when the buffer
   opens with EF BB BF and 0 otherwise.  The check is only made when
   HOST_CHARSET is HOST_CHARSET_ASCII.  */

int
cpp_check_utf8_bom (const char *data, size_t data_length)
{
  int skip = 0;

#if HOST_CHARSET == HOST_CHARSET_ASCII
  if (data_length >= 3)
    {
      const unsigned char *const bytes = (const unsigned char *) data;
      if (bytes[0] == 0xef && bytes[1] == 0xbb && bytes[2] == 0xbf)
        skip = 3;
    }
#endif

  return skip;
}
|
||||
|
||||
|
||||
/* Convert an input buffer (containing the complete contents of one
|
||||
source file) from INPUT_CHARSET to the source character set. INPUT
|
||||
points to the input buffer, SIZE is its allocated size, and LEN is
|
||||
@ -2135,7 +2164,11 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
|
||||
INPUT is expected to have been allocated with xmalloc. This
|
||||
function will either set *BUFFER_START to INPUT, or free it and set
|
||||
*BUFFER_START to a pointer to another xmalloc-allocated block of
|
||||
memory. */
|
||||
memory.
|
||||
|
||||
PFILE is only used to generate diagnostics; setting it to NULL suppresses
|
||||
diagnostics, and causes a return of NULL if there was any error instead. */
|
||||
|
||||
uchar *
|
||||
_cpp_convert_input (cpp_reader *pfile, const char *input_charset,
|
||||
uchar *input, size_t size, size_t len,
|
||||
@ -2158,17 +2191,27 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
|
||||
to.text = XNEWVEC (uchar, to.asize);
|
||||
to.len = 0;
|
||||
|
||||
if (!APPLY_CONVERSION (input_cset, input, len, &to))
|
||||
cpp_error (pfile, CPP_DL_ERROR,
|
||||
"failure to convert %s to %s",
|
||||
CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
|
||||
|
||||
const bool ok = APPLY_CONVERSION (input_cset, input, len, &to);
|
||||
free (input);
|
||||
}
|
||||
|
||||
/* Clean up the mess. */
|
||||
if (input_cset.func == convert_using_iconv)
|
||||
iconv_close (input_cset.cd);
|
||||
/* Clean up the mess. */
|
||||
if (input_cset.func == convert_using_iconv)
|
||||
iconv_close (input_cset.cd);
|
||||
|
||||
/* Handle conversion failure. */
|
||||
if (!ok)
|
||||
{
|
||||
if (!pfile)
|
||||
{
|
||||
XDELETEVEC (to.text);
|
||||
*buffer_start = NULL;
|
||||
*st_size = 0;
|
||||
return NULL;
|
||||
}
|
||||
cpp_error (pfile, CPP_DL_ERROR, "failure to convert %s to %s",
|
||||
input_charset, SOURCE_CHARSET);
|
||||
}
|
||||
}
|
||||
|
||||
/* Resize buffer if we allocated substantially too much, or if we
|
||||
haven't enough space for the \n-terminator or following
|
||||
@ -2192,19 +2235,14 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
|
||||
|
||||
buffer = to.text;
|
||||
*st_size = to.len;
|
||||
#if HOST_CHARSET == HOST_CHARSET_ASCII
|
||||
/* The HOST_CHARSET test just above ensures that the source charset
|
||||
is UTF-8. So, ignore a UTF-8 BOM if we see one. Note that
|
||||
glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
|
||||
|
||||
/* Ignore a UTF-8 BOM if we see one and the source charset is UTF-8. Note
|
||||
that glibc's UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
|
||||
BOM -- however, even if it did, we would still need this code due
|
||||
to the 'convert_no_conversion' case. */
|
||||
if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb
|
||||
&& to.text[2] == 0xbf)
|
||||
{
|
||||
*st_size -= 3;
|
||||
buffer += 3;
|
||||
}
|
||||
#endif
|
||||
const int bom_len = cpp_check_utf8_bom ((const char *) to.text, to.len);
|
||||
*st_size -= bom_len;
|
||||
buffer += bom_len;
|
||||
|
||||
*buffer_start = to.text;
|
||||
return buffer;
|
||||
@ -2244,6 +2282,13 @@ _cpp_default_encoding (void)
|
||||
return current_encoding;
|
||||
}
|
||||
|
||||
/* Check if the configured input charset requires no conversion, other than
|
||||
possibly stripping a UTF-8 BOM. */
|
||||
bool cpp_input_conversion_is_trivial (const char *input_charset)
|
||||
{
|
||||
return !strcasecmp (input_charset, SOURCE_CHARSET);
|
||||
}
|
||||
|
||||
/* Implementation of class cpp_string_location_reader. */
|
||||
|
||||
/* Constructor for cpp_string_location_reader. */
|
||||
|
@ -173,7 +173,7 @@ static bool pch_open_file (cpp_reader *pfile, _cpp_file *file,
|
||||
static bool find_file_in_dir (cpp_reader *pfile, _cpp_file *file,
|
||||
bool *invalid_pch, location_t loc);
|
||||
static bool read_file_guts (cpp_reader *pfile, _cpp_file *file,
|
||||
location_t loc);
|
||||
location_t loc, const char *input_charset);
|
||||
static bool read_file (cpp_reader *pfile, _cpp_file *file,
|
||||
location_t loc);
|
||||
static struct cpp_dir *search_path_head (cpp_reader *, const char *fname,
|
||||
@ -671,9 +671,12 @@ _cpp_find_file (cpp_reader *pfile, const char *fname, cpp_dir *start_dir,
|
||||
|
||||
Use LOC for any diagnostics.
|
||||
|
||||
PFILE may be NULL. In this case, no diagnostics are issued.
|
||||
|
||||
FIXME: Flush file cache and try again if we run out of memory. */
|
||||
static bool
|
||||
read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc)
|
||||
read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc,
|
||||
const char *input_charset)
|
||||
{
|
||||
ssize_t size, total, count;
|
||||
uchar *buf;
|
||||
@ -681,8 +684,9 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc)
|
||||
|
||||
if (S_ISBLK (file->st.st_mode))
|
||||
{
|
||||
cpp_error_at (pfile, CPP_DL_ERROR, loc,
|
||||
"%s is a block device", file->path);
|
||||
if (pfile)
|
||||
cpp_error_at (pfile, CPP_DL_ERROR, loc,
|
||||
"%s is a block device", file->path);
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -699,8 +703,9 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc)
|
||||
does not bite us. */
|
||||
if (file->st.st_size > INTTYPE_MAXIMUM (ssize_t))
|
||||
{
|
||||
cpp_error_at (pfile, CPP_DL_ERROR, loc,
|
||||
"%s is too large", file->path);
|
||||
if (pfile)
|
||||
cpp_error_at (pfile, CPP_DL_ERROR, loc,
|
||||
"%s is too large", file->path);
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -733,29 +738,29 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc)
|
||||
|
||||
if (count < 0)
|
||||
{
|
||||
cpp_errno_filename (pfile, CPP_DL_ERROR, file->path, loc);
|
||||
if (pfile)
|
||||
cpp_errno_filename (pfile, CPP_DL_ERROR, file->path, loc);
|
||||
free (buf);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (regular && total != size && STAT_SIZE_RELIABLE (file->st))
|
||||
if (pfile && regular && total != size && STAT_SIZE_RELIABLE (file->st))
|
||||
cpp_error_at (pfile, CPP_DL_WARNING, loc,
|
||||
"%s is shorter than expected", file->path);
|
||||
|
||||
file->buffer = _cpp_convert_input (pfile,
|
||||
CPP_OPTION (pfile, input_charset),
|
||||
input_charset,
|
||||
buf, size + 16, total,
|
||||
&file->buffer_start,
|
||||
&file->st.st_size);
|
||||
file->buffer_valid = true;
|
||||
|
||||
return true;
|
||||
file->buffer_valid = file->buffer;
|
||||
return file->buffer_valid;
|
||||
}
|
||||
|
||||
/* Convenience wrapper around read_file_guts that opens the file if
|
||||
necessary and closes the file descriptor after reading. FILE must
|
||||
have been passed through find_file() at some stage. Use LOC for
|
||||
any diagnostics. */
|
||||
any diagnostics. Unlike read_file_guts(), PFILE must not be NULL. */
|
||||
static bool
|
||||
read_file (cpp_reader *pfile, _cpp_file *file, location_t loc)
|
||||
{
|
||||
@ -773,7 +778,8 @@ read_file (cpp_reader *pfile, _cpp_file *file, location_t loc)
|
||||
return false;
|
||||
}
|
||||
|
||||
file->dont_read = !read_file_guts (pfile, file, loc);
|
||||
file->dont_read = !read_file_guts (pfile, file, loc,
|
||||
CPP_OPTION (pfile, input_charset));
|
||||
close (file->fd);
|
||||
file->fd = -1;
|
||||
|
||||
@ -2145,3 +2151,25 @@ _cpp_has_header (cpp_reader *pfile, const char *fname, int angle_brackets,
|
||||
return file->err_no != ENOENT;
|
||||
}
|
||||
|
||||
/* Read a file and convert to input charset, the same as if it were being read
|
||||
by a cpp_reader. */
|
||||
|
||||
cpp_converted_source
|
||||
cpp_get_converted_source (const char *fname, const char *input_charset)
|
||||
{
|
||||
cpp_converted_source res = {};
|
||||
_cpp_file file = {};
|
||||
file.fd = -1;
|
||||
file.name = lbasename (fname);
|
||||
file.path = fname;
|
||||
if (!open_file (&file))
|
||||
return res;
|
||||
const bool ok = read_file_guts (NULL, &file, 0, input_charset);
|
||||
close (file.fd);
|
||||
if (!ok)
|
||||
return res;
|
||||
res.to_free = (char *) file.buffer_start;
|
||||
res.data = (char *) file.buffer;
|
||||
res.len = file.st.st_size;
|
||||
return res;
|
||||
}
|
||||
|
@ -1379,6 +1379,20 @@ extern struct _cpp_file *cpp_get_file (cpp_buffer *);
|
||||
extern cpp_buffer *cpp_get_prev (cpp_buffer *);
|
||||
extern void cpp_clear_file_cache (cpp_reader *);
|
||||
|
||||
/* cpp_get_converted_source returns the contents of the given file, as it exists
|
||||
after cpplib has read it and converted it from the input charset to the
|
||||
source charset. Return struct will be zero-filled if the data could not be
|
||||
read for any reason. The data starts at the DATA pointer, but the TO_FREE
|
||||
pointer is what should be passed to free(), as there may be an offset. */
|
||||
/* Result of cpp_get_converted_source; all members are zero on failure.  */
struct cpp_converted_source
|
||||
{
|
||||
/* Pointer to hand to free (); may lie before DATA.  */
char *to_free;
|
||||
/* Start of the converted file contents.  */
char *data;
|
||||
/* Length of the contents starting at DATA.  */
size_t len;
|
||||
};
|
||||
cpp_converted_source cpp_get_converted_source (const char *fname,
|
||||
const char *input_charset);
|
||||
|
||||
/* In pch.c */
|
||||
struct save_macro_data;
|
||||
extern int cpp_save_state (cpp_reader *, FILE *);
|
||||
@ -1449,6 +1463,7 @@ class cpp_display_width_computation {
|
||||
/* Convenience functions that are simple use cases for class
|
||||
cpp_display_width_computation. Tab characters will be expanded to spaces
|
||||
as determined by TABSTOP. */
|
||||
|
||||
int cpp_byte_column_to_display_column (const char *data, int data_length,
|
||||
int column, int tabstop);
|
||||
inline int cpp_display_width (const char *data, int data_length,
|
||||
@ -1461,4 +1476,7 @@ int cpp_display_column_to_byte_column (const char *data, int data_length,
|
||||
int display_col, int tabstop);
|
||||
int cpp_wcwidth (cppchar_t c);
|
||||
|
||||
bool cpp_input_conversion_is_trivial (const char *input_charset);
|
||||
int cpp_check_utf8_bom (const char *data, size_t data_length);
|
||||
|
||||
#endif /* ! LIBCPP_CPPLIB_H */
|
||||
|
Loading…
Reference in New Issue
Block a user