From 74675a58507e769beee7d949dbed788af3c4139d Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 30 Apr 2009 10:08:18 -0400 Subject: NLS: update handling of Unicode This patch (as1239) updates the kernel's treatment of Unicode. The character-set conversion routines are well behind the current state of the Unicode specification: They don't recognize the existence of code points beyond plane 0 or of surrogate pairs in the UTF-16 encoding. The old wchar_t 16-bit type is retained because it's still used in lots of places. This shouldn't cause any new problems; if a conversion now results in an invalid 16-bit code then before it must have yielded an undefined code. Difficult-to-read names like "utf8_mbstowcs" are replaced with more transparent names like "utf8s_to_utf16s" and the ordering of the parameters is rationalized (buffer lengths come immediately after the pointers they refer to, and the inputs precede the outputs). Fortunately the low-level conversion routines are used in only a few places; the interfaces to the higher-level uni2char and char2uni methods have been left unchanged. 
Signed-off-by: Alan Stern Acked-by: Clemens Ladisch Signed-off-by: Greg Kroah-Hartman --- fs/nls/nls_base.c | 164 +++++++++++++++++++++++++++++++++++------------------- fs/nls/nls_utf8.c | 13 ++++- 2 files changed, 116 insertions(+), 61 deletions(-) (limited to 'fs/nls') diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 750abf211e2..477d37d83b3 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -15,6 +15,7 @@ #include #include #include +#include static struct nls_table default_table; static struct nls_table *tables = &default_table; @@ -43,10 +44,17 @@ static const struct utf8_table utf8_table[] = {0, /* end of table */} }; -int -utf8_mbtowc(wchar_t *p, const __u8 *s, int n) +#define UNICODE_MAX 0x0010ffff +#define PLANE_SIZE 0x00010000 + +#define SURROGATE_MASK 0xfffff800 +#define SURROGATE_PAIR 0x0000d800 +#define SURROGATE_LOW 0x00000400 +#define SURROGATE_BITS 0x000003ff + +int utf8_to_utf32(const u8 *s, int len, unicode_t *pu) { - long l; + unsigned long l; int c0, c, nc; const struct utf8_table *t; @@ -57,12 +65,13 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n) nc++; if ((c0 & t->cmask) == t->cval) { l &= t->lmask; - if (l < t->lval) + if (l < t->lval || l > UNICODE_MAX || + (l & SURROGATE_MASK) == SURROGATE_PAIR) return -1; - *p = l; + *pu = (unicode_t) l; return nc; } - if (n <= nc) + if (len <= nc) return -1; s++; c = (*s ^ 0x80) & 0xFF; @@ -72,76 +81,119 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n) } return -1; } +EXPORT_SYMBOL(utf8_to_utf32); -int -utf8_mbstowcs(wchar_t *pwcs, const __u8 *s, int n) +int utf32_to_utf8(unicode_t u, u8 *s, int maxlen) { - __u16 *op; - const __u8 *ip; - int size; - - op = pwcs; - ip = s; - while (*ip && n > 0) { - if (*ip & 0x80) { - size = utf8_mbtowc(op, ip, n); - if (size == -1) { - /* Ignore character and move on */ - ip++; - n--; - } else { - op++; - ip += size; - n -= size; - } - } else { - *op++ = *ip++; - n--; - } - } - return (op - pwcs); -} - -int -utf8_wctomb(__u8 *s, wchar_t wc, int maxlen) -{ 
- long l; + unsigned long l; int c, nc; const struct utf8_table *t; - + if (!s) return 0; - - l = wc; + + l = u; + if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR) + return -1; + nc = 0; for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) { nc++; if (l <= t->lmask) { c = t->shift; - *s = t->cval | (l >> c); + *s = (u8) (t->cval | (l >> c)); while (c > 0) { c -= 6; s++; - *s = 0x80 | ((l >> c) & 0x3F); + *s = (u8) (0x80 | ((l >> c) & 0x3F)); } return nc; } } return -1; } +EXPORT_SYMBOL(utf32_to_utf8); -int -utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen) +int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs) { - const __u16 *ip; - __u8 *op; + u16 *op; int size; + unicode_t u; + + op = pwcs; + while (*s && len > 0) { + if (*s & 0x80) { + size = utf8_to_utf32(s, len, &u); + if (size < 0) { + /* Ignore character and move on */ + size = 1; + } else if (u >= PLANE_SIZE) { + u -= PLANE_SIZE; + *op++ = (wchar_t) (SURROGATE_PAIR | + ((u >> 10) & SURROGATE_BITS)); + *op++ = (wchar_t) (SURROGATE_PAIR | + SURROGATE_LOW | + (u & SURROGATE_BITS)); + } else { + *op++ = (wchar_t) u; + } + s += size; + len -= size; + } else { + *op++ = *s++; + len--; + } + } + return op - pwcs; +} +EXPORT_SYMBOL(utf8s_to_utf16s); + +static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian) +{ + switch (endian) { + default: + return c; + case UTF16_LITTLE_ENDIAN: + return __le16_to_cpu(c); + case UTF16_BIG_ENDIAN: + return __be16_to_cpu(c); + } +} + +int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian, + u8 *s, int maxlen) +{ + u8 *op; + int size; + unsigned long u, v; op = s; - ip = pwcs; - while (*ip && maxlen > 0) { - if (*ip > 0x7f) { - size = utf8_wctomb(op, *ip, maxlen); + while (len > 0 && maxlen > 0) { + u = get_utf16(*pwcs, endian); + if (!u) + break; + pwcs++; + len--; + if (u > 0x7f) { + if ((u & SURROGATE_MASK) == SURROGATE_PAIR) { + if (u & SURROGATE_LOW) { + /* Ignore character and move on */ + continue; + } + if 
(len <= 0) + break; + v = get_utf16(*pwcs, endian); + if ((v & SURROGATE_MASK) != SURROGATE_PAIR || + !(v & SURROGATE_LOW)) { + /* Ignore character and move on */ + continue; + } + u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10) + + (v & SURROGATE_BITS); + pwcs++; + len--; + } + size = utf32_to_utf8(u, op, maxlen); if (size == -1) { /* Ignore character and move on */ } else { @@ -149,13 +201,13 @@ utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen) maxlen -= size; } } else { - *op++ = (__u8) *ip; + *op++ = (u8) u; maxlen--; } - ip++; } - return (op - s); + return op - s; } +EXPORT_SYMBOL(utf16s_to_utf8s); int register_nls(struct nls_table * nls) { @@ -467,9 +519,5 @@ EXPORT_SYMBOL(unregister_nls); EXPORT_SYMBOL(unload_nls); EXPORT_SYMBOL(load_nls); EXPORT_SYMBOL(load_nls_default); -EXPORT_SYMBOL(utf8_mbtowc); -EXPORT_SYMBOL(utf8_mbstowcs); -EXPORT_SYMBOL(utf8_wctomb); -EXPORT_SYMBOL(utf8_wcstombs); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c index aa2c42fdd97..0d60a44acac 100644 --- a/fs/nls/nls_utf8.c +++ b/fs/nls/nls_utf8.c @@ -15,7 +15,11 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { int n; - if ( (n = utf8_wctomb(out, uni, boundlen)) == -1) { + if (boundlen <= 0) + return -ENAMETOOLONG; + + n = utf32_to_utf8(uni, out, boundlen); + if (n < 0) { *out = '?'; return -EINVAL; } @@ -25,11 +29,14 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen) static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { int n; + unicode_t u; - if ( (n = utf8_mbtowc(uni, rawstring, boundlen)) == -1) { + n = utf8_to_utf32(rawstring, boundlen, &u); + if (n < 0 || u > MAX_WCHAR_T) { *uni = 0x003f; /* ? */ - n = -EINVAL; + return -EINVAL; } + *uni = (wchar_t) u; return n; } -- cgit v1.2.3-70-g09d2