summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Rich Felker <dalias@aerifal.cx> 2017-12-18 22:08:54 -0500
committer: Rich Felker <dalias@aerifal.cx> 2017-12-18 22:31:18 -0500
commit: 95c6044e2ae85846330814c4ac5ebf4102dbe02c (patch)
tree: af2c6f65ebcb37cca79713f72a678faf54117e9f
parent: 9d4d0ee41b06acf68dac40332f53be7bfbde7404 (diff)
download: musl-95c6044e2ae85846330814c4ac5ebf4102dbe02c.tar.gz
musl-95c6044e2ae85846330814c4ac5ebf4102dbe02c.tar.bz2
musl-95c6044e2ae85846330814c4ac5ebf4102dbe02c.tar.xz
musl-95c6044e2ae85846330814c4ac5ebf4102dbe02c.zip
add support for BOM-determined-endian UCS2, UTF-16, and UTF-32 to iconv
Previously, the charset names without an endianness specified were always interpreted as big endian. Unicode specifies that UTF-16 and UTF-32 have BOM-determined endianness if a BOM is present, and are otherwise big endian. Since commit 5b546faa67544af395d6407553762b37e9711157 added support for stateful encodings, it is now possible to implement BOM support via the conversion descriptor state. For conversions to these charsets, the output is always big endian and does not have a BOM.
-rw-r--r-- src/locale/iconv.c 43
1 file changed, 40 insertions, 3 deletions
diff --git a/src/locale/iconv.c b/src/locale/iconv.c
index 1784dc9d..c5dd122f 100644
--- a/src/locale/iconv.c
+++ b/src/locale/iconv.c
@@ -16,6 +16,9 @@
#define WCHAR_T 0306
#define US_ASCII 0307
#define UTF_8 0310
+#define UTF_16 0312
+#define UTF_32 0313
+#define UCS2 0314
#define EUC_JP 0320
#define SHIFT_JIS 0321
#define ISO2022_JP 0322
@@ -35,13 +38,16 @@
static const unsigned char charmaps[] =
"utf8\0char\0\0\310"
"wchart\0\0\306"
-"ucs2\0ucs2be\0\0\304"
+"ucs2be\0\0\304"
"ucs2le\0\0\305"
-"utf16\0utf16be\0\0\302"
+"utf16be\0\0\302"
"utf16le\0\0\301"
-"ucs4\0ucs4be\0utf32\0utf32be\0\0\300"
+"ucs4be\0utf32be\0\0\300"
"ucs4le\0utf32le\0\0\303"
"ascii\0usascii\0iso646\0iso646us\0\0\307"
+"utf16\0\0\312"
+"ucs4\0utf32\0\0\313"
+"ucs2\0\0\314"
"eucjp\0\0\320"
"shiftjis\0sjis\0\0\321"
"iso2022jp\0\0\322"
@@ -145,6 +151,9 @@ iconv_t iconv_open(const char *to, const char *from)
iconv_t cd = combine_to_from(t, f);
switch (charmaps[f]) {
+ case UTF_16:
+ case UTF_32:
+ case UCS2:
case ISO2022_JP:
scd = malloc(sizeof *scd);
if (!scd) return (iconv_t)-1;
@@ -285,6 +294,31 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
c = ((c-0xd7c0)<<10) + (d-0xdc00);
}
break;
+ case UCS2:
+ case UTF_16:
+ l = 0;
+ if (!scd->state) {
+ if (*inb < 2) goto starved;
+ c = get_16((void *)*in, 0);
+ scd->state = type==UCS2
+ ? c==0xfffe ? UCS2LE : UCS2BE
+ : c==0xfffe ? UTF_16LE : UTF_16BE;
+ if (c == 0xfffe || c == 0xfeff)
+ l = 2;
+ }
+ type = scd->state;
+ continue;
+ case UTF_32:
+ l = 0;
+ if (!scd->state) {
+ if (*inb < 4) goto starved;
+ c = get_32((void *)*in, 0);
+ scd->state = c==0xfffe0000 ? UTF_32LE : UTF_32BE;
+ if (c == 0xfffe0000 || c == 0xfeff)
+ l = 4;
+ }
+ type = scd->state;
+ continue;
case SHIFT_JIS:
if (c < 128) break;
if (c-0xa1 <= 0xdf-0xa1) {
@@ -589,8 +623,11 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
*(*out)++ = 'B';
*outb -= 8;
break;
+ case UCS2:
+ totype = UCS2BE;
case UCS2BE:
case UCS2LE:
+ case UTF_16:
case UTF_16BE:
case UTF_16LE:
if (c < 0x10000 || type-UCS2BE < 2U) {