LDAP and UTF-8

Sun Feb 17 07:52:02 GMT 2002

Hi,

I am using the NET ADS USER command in Samba head to list the users stored
in a Win2K ADS. Some user names contain non-English characters encoded in UTF-8
format which don't get displayed correctly. Winbindd has the same problems.

I added some UTF-8 helper functions to lib/charcnv.c to be able to convert
UTF-8 to unix. 

Would someone comment on 
a) is there another (better) way to convert from UTF-8 to Unix than
using convert_string(...) and adding the pull_ and push_ wrapper functions ?
b) what is the difference between CH_UNIX and CH_DISPLAY, when should I
convert to CH_DISPLAY instead of CH_UNIX ?
c) are there platforms without iconv, so this will possibly break things ?

...Juergen

--- include/charset.orig	Sun Feb 17 16:21:43 2002
+++ include/charset.h	Sun Feb 17 16:21:46 2002
@@ -19,6 +19,6 @@
 */

 /* this defines the charset types used in samba */
-typedef enum {CH_UCS2=0, CH_UNIX=1, CH_DISPLAY=2, CH_DOS=3} charset_t;
+typedef enum {CH_UCS2=0, CH_UNIX=1, CH_DISPLAY=2, CH_DOS=3, CH_UTF8} charset_t;

-#define NUM_CHARSETS 4
+#define NUM_CHARSETS 5

--- libads/ldap.orig	Sun Feb 17 16:17:44 2002
+++ libads/ldap.c	Sun Feb 17 16:20:38 2002
@@ -14,7 +14,7 @@
 static void dump_string(const char *field, struct berval **values)
 {
 	int i;
+	fstring c1;
 	for (i=0; values[i]; i++) {
-		printf("%s: %s\n", field, values[i]->bv_val);
+		pull_utf8_fstring(c1,values[i]->bv_val);
+		printf("%s: %s\n", field, c1);
 	}
 }

--- lib/charcnv.orig	Sun Feb 17 16:07:34 2002
+++ lib/charcnv.c	Sun Feb 17 16:16:24 2002
@@ -37,6 +37,7 @@
 	else if (ch == CH_UNIX) ret = lp_unix_charset();
 	else if (ch == CH_DOS) ret = lp_dos_charset();
 	else if (ch == CH_DISPLAY) ret = lp_display_charset();
+	else if (ch == CH_UTF8) ret = "UTF-8";
 
 	if (!ret || !*ret) ret = "ASCII";
 	return ret;
@@ -72,6 +73,13 @@
 		conv_handles[CH_UCS2][CH_UNIX] = smb_iconv_open("ASCII", "UCS-2LE");
 	}
 	
+	if (!conv_handles[CH_UNIX][CH_UTF8]) {
+		conv_handles[CH_UNIX][CH_UTF8] = smb_iconv_open("UTF-8", "ASCII");
+	}
+	if (!conv_handles[CH_UTF8][CH_UNIX]) {
+		conv_handles[CH_UTF8][CH_UNIX] = smb_iconv_open("ASCII", "UTF-8");
+	}
+
 
 	for (c1=0;c1<NUM_CHARSETS;c1++) {
 		for (c2=0;c2<NUM_CHARSETS;c2++) {
@@ -151,7 +159,7 @@


 		               break;
-		  case EILSEQ: reason="Illegal myltibyte sequence"; break;
+		  case EILSEQ: reason="Illegal multibyte sequence"; break;
 		}
 		/* smb_panic(reason); */
 	}
@@ -390,6 +398,37 @@
 	return len;
 }
 
+/****************************************************************************
+copy a string from a unix char* src to a utf-8 destination
+return the number of bytes occupied by the string in the destination
+flags can have:
+  STR_TERMINATE means include the null termination
+  STR_UPPER     means uppercase in the destination
+dest_len is the maximum length allowed in the destination. If dest_len
+is -1 then no maximum is used
+****************************************************************************/
+int push_utf8(const void *base_ptr, void *dest, const char *src, int dest_len, int flags)
+{
+	int src_len = strlen(src);
+	pstring tmpbuf;
+
+	/* treat a pstring as "unlimited" length */
+	if (dest_len == -1) {
+		dest_len = sizeof(pstring);
+	}
+
+	if (flags & STR_UPPER) {
+		pstrcpy(tmpbuf, src);
+		strupper(tmpbuf);
+		src = tmpbuf;
+	}
+
+	if (flags & STR_TERMINATE) {
+		src_len++;
+	}
+
+	return convert_string(CH_UNIX, CH_UTF8, src, src_len, dest, dest_len);
+}
 
 /****************************************************************************
 copy a string from a ucs2 source to a unix char* destination
@@ -435,6 +474,40 @@
 	return pull_ucs2(NULL, dest, src, sizeof(fstring), -1, STR_TERMINATE);
 }
 
+/****************************************************************************
+copy a string from a utf-8 source to a unix char* destination
+flags can have:
+  STR_TERMINATE means the string in src is null terminated
+if STR_TERMINATE is set then src_len is ignored
+src_len is the length of the source area in bytes
+return the number of bytes occupied by the string in src
+the resulting string in "dest" is always null terminated
+****************************************************************************/
+int pull_utf8(const void *base_ptr, char *dest, const void *src, int dest_len, int src_len, int flags)
+{
+	int ret;
+
+	if (dest_len == -1) {
+		dest_len = sizeof(pstring);
+	}
+
+	if (flags & STR_TERMINATE) src_len = strlen(src)+1;
+
+	ret = convert_string(CH_UTF8, CH_UNIX, src, src_len, dest, dest_len);
+	if (dest_len) dest[MIN(ret, dest_len-1)] = 0;
+
+	return src_len;
+}
+
+int pull_utf8_pstring(char *dest, const void *src)
+{
+	return pull_utf8(NULL, dest, src, sizeof(pstring), -1, STR_TERMINATE);
+}
+
+int pull_utf8_fstring(char *dest, const void *src)
+{
+	return pull_utf8(NULL, dest, src, sizeof(fstring), -1, STR_TERMINATE);
+}

 /****************************************************************************
 copy a string from a char* src to a unicode or ascii