Support URL-escape non-ASCII characters properly as far as possible

(ie. all characters from the Basic Multilingual Plane from Unicode)
This commit is contained in:
Rémi Denis-Courmont 2006-03-27 13:28:24 +00:00
parent 798b0d97c4
commit 168549fa94

View File

@ -1,7 +1,7 @@
/*****************************************************************************
* vlc_url.h: URL related macros
*****************************************************************************
* Copyright (C) 2002-2005 the VideoLAN team
* Copyright (C) 2002-2006 the VideoLAN team
* $Id$
*
* Authors: Christophe Massiot <massiot@via.ecp.fr>
@ -174,6 +174,7 @@ static inline int isurlsafe( int c )
return ( (unsigned char)( c - 'a' ) < 26 )
|| ( (unsigned char)( c - 'A' ) < 26 )
|| ( (unsigned char)( c - '0' ) < 10 )
|| ( (unsigned char)( c ) > 127 )
/* Hmm, we should not encode character that are allowed in URLs
* (even if they are not URL-safe), nor URL-safe characters.
* We still encode some of them because of Microsoft's crap browser.
@ -181,6 +182,11 @@ static inline int isurlsafe( int c )
|| ( strchr( "-_.", c ) != NULL );
}
static inline char url_hexchar( int c )
{
return ( c < 10 ) ? c + '0' : c + 'A' - 10;
}
/*****************************************************************************
* vlc_UrlEncode:
*****************************************************************************
@ -189,14 +195,9 @@ static inline int isurlsafe( int c )
*****************************************************************************/
static inline char *vlc_UrlEncode( const char *psz_url )
{
char *psz_enc, *out;
char psz_enc[3 * strlen( psz_url ) + 1], *out = psz_enc;
const unsigned char *in;
psz_enc = (char *)malloc( 3 * strlen( psz_url ) + 1 );
if( psz_enc == NULL )
return NULL;
out = psz_enc;
for( in = (const unsigned char *)psz_url; *in; in++ )
{
unsigned char c = *in;
@ -205,16 +206,48 @@ static inline char *vlc_UrlEncode( const char *psz_url )
*out++ = (char)c;
else
{
uint16_t cp;
*out++ = '%';
*out++ = ( ( c >> 4 ) >= 0xA ) ? 'A' + ( c >> 4 ) - 0xA
: '0' + ( c >> 4 );
*out++ = ( ( c & 0xf ) >= 0xA ) ? 'A' + ( c & 0xf ) - 0xA
: '0' + ( c & 0xf );
/* UTF-8 to UCS-2 conversion */
if( ( c & 0x7f ) == 0 )
cp = c;
else
if( ( c & 0xe0 ) == 0xc0 )
{
cp = ((c & 0x1f) << 6) | (in[1] & 0x3f);
in++;
}
else
if( ( c & 0xf0 ) == 0xe0 )
{
cp = ((c & 0xf) << 12) | ((in[1] & 0x3f) << 6) | (in[2] & 0x3f);
in += 2;
}
else
/* cannot URL-encode code points outside the BMP */
return NULL;
if( cp < 0xff )
{
/* Encode ISO-8859-1 characters */
*out++ = url_hexchar( cp >> 4 );
*out++ = url_hexchar( cp & 0xf );
}
else
{
/* Encode non-Latin-1 characters */
*out++ = 'u';
*out++ = url_hexchar( cp >> 12 );
*out++ = url_hexchar((cp >> 8) & 0xf );
*out++ = url_hexchar((cp >> 4) & 0xf );
*out++ = url_hexchar( cp & 0xf );
}
}
}
*out++ = '\0';
return (char *)realloc( psz_enc, out - psz_enc );
return strdup( psz_enc );
}
/*****************************************************************************