[linux-cifs-client] [PATCH] cifs: add helper to simplify unicode
to NLS conversion and use it (try #2)
Peter Hudec
PeterHudec at web.de
Mon Apr 13 11:06:21 GMT 2009
Here are my suggestions. Please note that it is not tested, not even
compiled!
Assuming that const wchar_t *str is a fixed-width UCS-2 string, we can
just divide the number of input bytes by 2 to get the number of chars.
Additionally, I replaced maxlen with maxbytes, as I think we want to
limit the number of bytes and not the number of chars.
I also removed stack variables that are no longer needed, such as uni.
The byte count returned by the function includes the native
0-termination, which should be fail-safe.
Example: "AAA\0"

In UCS-2-LE:
41 00 41 00 00 00   (6 bytes)

Converted to UTF-8, ASCII or ISO 8859-1:
41 41 41 00   (4 bytes)

'Converted' to UTF-16:
41 00 41 00 00 00   (6 bytes)
/*
 * UniStrnlenBytes: Return the length of a NLS string in bytes, including
 * the 0-termination.
 * Also populates 'nchars' with the number of UCS-2 chars consumed.
 */
static inline ssize_t
UniStrnlenBytes(const wchar_t *str, const unsigned int maxbytes,
		unsigned int *nchars,
		const struct nls_table *codepage)
{
	int nc;
	size_t nbytes = 0;
	unsigned int chars = 0;
	char buf[NLS_MAX_CHARSET_SIZE]; /* enough for one char at a time */

	if (maxbytes == 0)
		return -EINVAL;

	while (1) {
		nc = codepage->uni2char(*str, buf, NLS_MAX_CHARSET_SIZE);
		if (nc > 0)
			nbytes += nc;
		else
			nbytes += 1; /* for '?' */
		chars++; /* fixed-width UCS-2: one wchar_t per char */
		if (nbytes >= maxbytes)
			break; /* byte limit reached */
		if (!*str)
			break; /* 0-termination processed */
		str++; /* next char */
	}
	*nchars = chars;
	return nbytes;
}
/*
 * Calculates the required size, allocates memory and converts src (UCS-2)
 * to a NLS string.
 * Note: the caller is responsible for freeing *dst if the function returned 0.
 * returns:
 *	on success - 0
 *	on failure - negative errno
 */
int
cifs_ucs_to_nls(char **dst, const char *src, const unsigned int maxbytes,
		int *plen, const struct nls_table *nls_codepage)
{
	ssize_t nbytes;
	int outlen;

	nbytes = UniStrnlenBytes((const wchar_t *)src, maxbytes,
				 (unsigned int *)plen, nls_codepage);
	if (nbytes <= 0)
		return -EINVAL;

	*dst = kzalloc(nbytes, GFP_KERNEL); /* nbytes includes 0-termination */
	if (!*dst) {
		cERROR(1, ("Failed to allocate buffer for string\n"));
		return -ENOMEM;
	}

	outlen = cifs_strfromUCS_le(*dst, (__le16 *)src, *plen, nls_codepage);

	/* Add 0-termination */
	/* nbytes - outlen should be sufficient, but you never know */
	nls_codepage->uni2char(0, *dst + outlen, nbytes - outlen);

	return 0;
}
I found an alternative wctomb function in sh-utils.
To me it looks more efficient. The question is whether it should be cut
down to handle only sequences of up to 4 bytes.
Original from sh-utils-2.0.14/lib/unicodeio.c:
/* Stores the UTF-8 representation of the Unicode character wc in r[0..5].
   Returns the number of bytes stored, or -1 if wc is out of range.  */
static int
utf8_wctomb (unsigned char *r, unsigned int wc)
{
  int count;

  if (wc < 0x80)
    count = 1;
  else if (wc < 0x800)
    count = 2;
  else if (wc < 0x10000)
    count = 3;
  else if (wc < 0x200000)
    count = 4;
  else if (wc < 0x4000000)
    count = 5;
  else if (wc <= 0x7fffffff)
    count = 6;
  else
    return -1;

  switch (count)
    {
      /* Note: code falls through cases! */
      case 6: r[5] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x4000000;
      case 5: r[4] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x200000;
      case 4: r[3] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x10000;
      case 3: r[2] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x800;
      case 2: r[1] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0xc0;
      case 1: r[0] = wc;
    }
  return count;
}
Modified:
/* Stores the UTF-8 representation of the Unicode character wc in r[0..3].
   Returns the number of bytes stored, or -1 if wc is out of range.  */
static int
utf8_wctomb (unsigned char *r, unsigned int wc)
{
  int count;

  if (wc < 0x80)
    count = 1;
  else if (wc < 0x800)
    count = 2;
  else if (wc < 0x10000)
    count = 3;
  else if (wc <= 0x10ffff) /* maximum codepoint U+10FFFF */
    count = 4;
  else
    return -1;

  switch (count)
    {
      /* Note: code falls through cases! */
      case 4: r[3] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x10000;
      case 3: r[2] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x800;
      case 2: r[1] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0xc0;
      case 1: r[0] = wc;
    }
  return count;
}
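A quick user-space sanity check of the modified routine (illustrative only):
U+1F600 should come out as the 4-byte sequence F0 9F 98 80.

#include <stdio.h>

int main(void)
{
	unsigned char r[4];
	int i, n;

	n = utf8_wctomb(r, 0x1F600);	/* function above, 4-byte case */
	for (i = 0; i < n; i++)
		printf("%02X ", r[i]);
	printf("\n");			/* prints: F0 9F 98 80 */
	return 0;
}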
As Microsoft seems to use surrogate pairs (i.e. UTF-16) in newer versions,
these routines would produce so-called CESU-8 instead of valid UTF-8 for
characters above U+FFFF.
Perhaps we have to fix this.
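To avoid CESU-8, the two surrogate halves would have to be combined into one
codepoint before calling utf8_wctomb. A minimal sketch (utf16_to_codepoint is
a made-up helper name, not existing code):

/* hi must be a high surrogate (0xD800-0xDBFF), lo a low one (0xDC00-0xDFFF) */
static unsigned int
utf16_to_codepoint(unsigned short hi, unsigned short lo)
{
	return 0x10000 + (((unsigned int)(hi - 0xD800) << 10) | (lo - 0xDC00));
}

For example, the pair D83D DE00 combines to U+1F600, which then encodes to the
valid UTF-8 sequence F0 9F 98 80 instead of two 3-byte CESU-8 sequences.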