[linux-cifs-client] [PATCH] cifs: add helper to simplify unicode
to NLS conversion and use it (try #2)
Peter Hudec
PeterHudec at web.de
Mon Apr 13 11:06:21 GMT 2009
Here are my suggestions. Please note that it is not tested, not even
compiled!
Assuming that const wchar_t *str is a fixed-width UCS-2 string, we can
just divide the number of input bytes by 2 to get the number of chars.
Additionally, I replaced maxlen with maxbytes, as I think we want to
limit the number of bytes and not the number of chars.
I also removed stack variables that are no longer needed, such as uni.
The byte count returned by the function includes the native
0-termination, which should be fail-safe.
Example: "AAA\0"

In UCS-2-LE:
41 00 41 00 00 00   (6 bytes)

Converted to UTF-8, ASCII or ISO 8859-1:
41 41 41 00   (4 bytes)

'Converted' to UTF-16:
41 00 41 00 00 00   (6 bytes)
/*
 * UniStrnlenBytes: Return the length of a NLS string in bytes, including
 * the 0-termination.
 * Also populates 'nchars' with the number of UCS-2 chars consumed.
 */
static inline ssize_t
UniStrnlenBytes(const wchar_t *str, const unsigned int maxbytes,
		unsigned int *nchars,
		const struct nls_table *codepage)
{
	int nc;
	size_t nbytes = 0;
	unsigned int chars = 0;
	char buf[NLS_MAX_CHARSET_SIZE]; /* enough for one char at a time */

	if (maxbytes == 0)
		return -EINVAL;

	while (1) {
		nc = codepage->uni2char(*str, buf, NLS_MAX_CHARSET_SIZE);
		if (nc > 0)
			nbytes += nc;
		else
			nbytes += 1; /* for '?' */
		chars++; /* fixed-width UCS-2: one wchar_t per char */
		if (nbytes >= maxbytes)
			break; /* byte limit reached */
		if (!*str)
			break; /* 0-termination processed */
		str++; /* next char */
	}
	*nchars = chars;
	return nbytes;
}
/*
 * Calculates the required size, allocates memory and converts src (UCS-2)
 * to a NLS string.
 * Note: the caller is responsible for freeing *dst if the function returned 0.
 * returns:
 *	on success - 0
 *	on failure - negative errno
 */
int
cifs_ucs_to_nls(char **dst, const char *src, const unsigned int maxbytes,
		int *plen, const struct nls_table *nls_codepage)
{
	ssize_t nbytes;
	int outlen;

	nbytes = UniStrnlenBytes((const wchar_t *)src, maxbytes,
				 (unsigned int *)plen, nls_codepage);
	if (nbytes <= 0)
		return -EINVAL;

	*dst = kzalloc(nbytes, GFP_KERNEL); /* nbytes includes 0-termination */
	if (!*dst) {
		cERROR(1, ("Failed to allocate buffer for string\n"));
		return -ENOMEM;
	}

	outlen = cifs_strfromUCS_le(*dst, (__le16 *)src, *plen, nls_codepage);

	/* Add 0-termination */
	/* nbytes - outlen should be sufficient, but you never know */
	nls_codepage->uni2char(0, *dst + outlen, nbytes - outlen);

	return 0;
}
I found an alternative wctomb function in sh-utils.
To me it looks more efficient. The question is whether it should be cut
down to handle only sequences of up to 4 bytes.
Original from sh-utils-2.0.14/lib/unicodeio.c:
/* Stores the UTF-8 representation of the Unicode character wc in r[0..5].
   Returns the number of bytes stored, or -1 if wc is out of range.  */
static int
utf8_wctomb (unsigned char *r, unsigned int wc)
{
  int count;

  if (wc < 0x80)
    count = 1;
  else if (wc < 0x800)
    count = 2;
  else if (wc < 0x10000)
    count = 3;
  else if (wc < 0x200000)
    count = 4;
  else if (wc < 0x4000000)
    count = 5;
  else if (wc <= 0x7fffffff)
    count = 6;
  else
    return -1;

  switch (count)
    {
      /* Note: code falls through cases! */
      case 6: r[5] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x4000000;
      case 5: r[4] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x200000;
      case 4: r[3] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x10000;
      case 3: r[2] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x800;
      case 2: r[1] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0xc0;
      case 1: r[0] = wc;
    }
  return count;
}
Modified:
/* Stores the UTF-8 representation of the Unicode character wc in r[0..3].
   Returns the number of bytes stored, or -1 if wc is out of range.  */
static int
utf8_wctomb (unsigned char *r, unsigned int wc)
{
  int count;

  if (wc < 0x80)
    count = 1;
  else if (wc < 0x800)
    count = 2;
  else if (wc < 0x10000)
    count = 3;
  else if (wc <= 0x10ffff) /* maximum codepoint U+10FFFF */
    count = 4;
  else
    return -1;

  switch (count)
    {
      /* Note: code falls through cases! */
      case 4: r[3] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x10000;
      case 3: r[2] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x800;
      case 2: r[1] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0xc0;
      case 1: r[0] = wc;
    }
  return count;
}
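A quick user-space sanity check of the modified routine (illustrative only):
U+1F600 should come out as the 4-byte sequence F0 9F 98 80.

#include <stdio.h>

int main(void)
{
	unsigned char r[4];
	int i, n;

	n = utf8_wctomb(r, 0x1F600);	/* function above, 4-byte case */
	for (i = 0; i < n; i++)
		printf("%02X ", r[i]);
	printf("\n");			/* prints: F0 9F 98 80 */
	return 0;
}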
As Microsoft seems to use surrogate pairs (i.e. UTF-16) in newer versions,
these routines would produce so-called CESU-8 instead of valid UTF-8 for
characters above U+FFFF.
Perhaps we have to fix this.
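To avoid CESU-8, the two surrogate halves would have to be combined into one
codepoint before calling utf8_wctomb. A minimal sketch (utf16_to_codepoint is
a made-up helper name, not existing code):

/* hi must be a high surrogate (0xD800-0xDBFF), lo a low one (0xDC00-0xDFFF) */
static unsigned int
utf16_to_codepoint(unsigned short hi, unsigned short lo)
{
	return 0x10000 + (((unsigned int)(hi - 0xD800) << 10) | (lo - 0xDC00));
}

For example, the pair D83D DE00 combines to U+1F600, which then encodes to the
valid UTF-8 sequence F0 9F 98 80 instead of two 3-byte CESU-8 sequences.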