[FYI] Adding support for Unicode normalisation conversion to lib/util/charset/

Ralph Böhme slow at samba.org
Tue Apr 9 16:50:28 UTC 2019


Hi!

Fwiw, maybe this is of interest to anyone: I have a small patchset that adds 
support for Unicode normalisation (NFC/NFD) conversion to lib/util/charset/ 
using libicu.

I need this to convert strings coming in over mdssvc RPC (aka Spotlight) from 
Mac clients.

There's a more complete patchset running in this pipeline:

https://gitlab.com/samba-team/devel/samba/pipelines/55974836

This is on top of the gitlab WIP patches for container image rebuild 
automation. I'm adding libicu-devel packages to the images here to ensure the 
torture test of the patchset is always run.

I will submit a full patchset once the gitlab stuff is upstream. Hopefully soon! 
:)

-slow

-- 
Ralph Boehme, Samba Team                https://samba.org/
Samba Developer, SerNet GmbH   https://sernet.de/en/samba/
GPG-Fingerprint   FAE2C6088A24252051C559E4AA1E9B7126399E46
-------------- next part --------------
From a6eb7fe60993fcf8a56a0c22e65a61a1e01d05b8 Mon Sep 17 00:00:00 2001
From: Ralph Boehme <slow at samba.org>
Date: Tue, 9 Apr 2019 11:21:57 +0200
Subject: [PATCH 1/2] charset: add support for Unicode normalisation with
 libicu

---
 lib/util/charset/iconv.c           | 109 +++++++++++++++++++++++++++++
 lib/util/charset/wscript_build     |   2 +-
 lib/util/charset/wscript_configure |  11 +++
 3 files changed, 121 insertions(+), 1 deletion(-)

diff --git a/lib/util/charset/iconv.c b/lib/util/charset/iconv.c
index 4fae09fda52..23551023e87 100644
--- a/lib/util/charset/iconv.c
+++ b/lib/util/charset/iconv.c
@@ -28,6 +28,11 @@
 #include "libcli/util/ntstatus.h"
 #include "lib/util/util_str_hex.h"
 
+#ifdef HAVE_ICU_I18N
+#include <unicode/ustring.h>
+#include <unicode/utrans.h>
+#endif
+
 #ifdef strcasecmp
 #undef strcasecmp
 #endif
@@ -165,6 +170,64 @@ static size_t sys_iconv(void *cd,
 }
 #endif
 
+#ifdef HAVE_ICU_I18N
+/*
+ * Normalise all of *inbuf (UTF-8) into *outbuf with the ICU transliterator
+ * passed as cd. ICU buffer capacities are counted in UChars, not bytes.
+ * Returns the bytes written, or (size_t)-1 with errno set on failure.
+ */
+static size_t sys_uconv(void *cd,
+			const char **inbuf,
+			size_t *inbytesleft,
+			char **outbuf,
+			size_t *outbytesleft)
+{
+	UTransliterator *t = (UTransliterator *)cd;
+	size_t srclen = *inbytesleft;
+	size_t dstcap = *outbytesleft < INT32_MAX ? *outbytesleft : INT32_MAX;
+	int32_t ucap;
+	int32_t ulen = 0;
+	int32_t limit = 0;
+	int32_t outlen = 0;
+	UErrorCode ue = U_ZERO_ERROR;
+	UChar *ustr = NULL;
+
+	if (srclen > (INT32_MAX - 1) / 2) {
+		errno = EINVAL;
+		return -1;
+	}
+	/* UTF-8 -> UTF-16 never yields more units than input bytes; twice
+	 * that leaves room for decomposition, +1 avoids an empty buffer. */
+	ucap = (int32_t)srclen * 2 + 1;
+	ustr = malloc((size_t)ucap * sizeof(UChar));
+	if (ustr == NULL) {
+		errno = ENOMEM;
+		return -1;
+	}
+
+	u_strFromUTF8(ustr, ucap, &ulen, *inbuf, (int32_t)srclen, &ue);
+	if (U_SUCCESS(ue)) {
+		limit = ulen;
+		/* Transliterates *in place*, updating ulen and limit. */
+		utrans_transUChars(t, ustr, &ulen, ucap, 0, &limit, &ue);
+	}
+	if (U_SUCCESS(ue)) {
+		u_strToUTF8(*outbuf, (int32_t)dstcap, &outlen, ustr, ulen, &ue);
+	}
+	free(ustr);
+	if (U_FAILURE(ue)) {
+		errno = (ue == U_BUFFER_OVERFLOW_ERROR) ? E2BIG : EILSEQ;
+		return -1;
+	}
+
+	*inbuf += srclen;
+	*inbytesleft = 0;
+	*outbuf += outlen;
+	*outbytesleft -= outlen;
+	return outlen;
+}
+#endif
+
 /**
  * This is a simple portable iconv() implementaion.
  *
@@ -302,6 +365,52 @@ _PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode,
 	}
 #endif
 
+#ifdef HAVE_ICU_I18N
+	if (strcasecmp(fromcode, "UTF8-MAC") == 0 &&
+	    strcasecmp(tocode, "UTF8") == 0)
+	{
+		U_STRING_DECL(t, "any-nfc", strlen("any-nfc"));
+		UErrorCode ue = 0;
+
+		U_STRING_INIT(t, "any-nfc", strlen("any-nfc"));
+
+		ret->cd_direct = utrans_openU(t,
+					      strlen("any-nfc"),
+					      UTRANS_FORWARD,
+					      NULL,
+					      0,
+					      NULL,
+					      &ue);
+		if (U_FAILURE(ue)) {
+			return (smb_iconv_t)-1;
+		}
+		ret->direct = sys_uconv;
+		return ret;
+	}
+
+	if (strcasecmp(fromcode, "UTF8") == 0 &&
+	    strcasecmp(tocode, "UTF8-MAC") == 0)
+	{
+		U_STRING_DECL(tname, "any-nfd", 7);
+		UErrorCode ue = 0;
+
+		U_STRING_INIT(tname, "any-nfd", 7);
+
+		ret->cd_direct = utrans_openU(tname,
+					      7,
+					      UTRANS_FORWARD,
+					      NULL,
+					      0,
+					      NULL,
+					      &ue);
+		if (U_FAILURE(ue)) {
+			return (smb_iconv_t)-1;
+		}
+		ret->direct = sys_uconv;
+		return ret;
+	}
+#endif
+
 	if (ret->pull == NULL && from == NULL) {
 		goto failed;
 	}
diff --git a/lib/util/charset/wscript_build b/lib/util/charset/wscript_build
index a3728f6a4bd..8fed718e7dc 100644
--- a/lib/util/charset/wscript_build
+++ b/lib/util/charset/wscript_build
@@ -2,7 +2,7 @@
 
 bld.SAMBA_SUBSYSTEM('ICONV_WRAPPER',
                     source='iconv.c',
-                    public_deps='iconv replace talloc')
+                    public_deps='iconv replace talloc ' +  bld.env['icu-libs'])
 
 bld.SAMBA_SUBSYSTEM('charset',
                     public_headers='charset.h',
diff --git a/lib/util/charset/wscript_configure b/lib/util/charset/wscript_configure
index d5ac5d0100f..ca49ec6d3dc 100644
--- a/lib/util/charset/wscript_configure
+++ b/lib/util/charset/wscript_configure
@@ -36,3 +36,14 @@ conf.CHECK_CODE('''
                 msg='Checking errno of iconv for illegal multibyte sequence',
                 lib='iconv',
                 headers='errno.h iconv.h')
+# Optional libicu probe: nothing below runs when pkg-config finds no icu-i18n.
+if conf.CHECK_CFG(package='icu-i18n',
+               args='--cflags --libs',
+               msg='Checking for icu-i18n',
+               uselib_store='ICU_I18N'):
+    for lib in conf.env['LIB_ICU_I18N']:
+        conf.CHECK_LIB(lib, shlib=True, mandatory=True)
+    conf.env['icu-libs'] = ' '.join(conf.env['LIB_ICU_I18N'])  # space-separated; appended to ICONV_WRAPPER deps in wscript_build
+    if not conf.CHECK_HEADERS('unicode/ustring.h'):
+        conf.fatal('Found libicu, but unicode/ustring.h is missing')
+    conf.DEFINE('HAVE_UTF8_MAC', 1)  # NOTE(review): iconv.c guards on HAVE_ICU_I18N, not on this define - confirm who reads it
-- 
2.20.1


From 8f8fd5ecfc76955ac713608615455a303d985d46 Mon Sep 17 00:00:00 2001
From: Ralph Boehme <slow at samba.org>
Date: Tue, 9 Apr 2019 13:34:39 +0200
Subject: [PATCH 2/2] charset: add tests for Unicode NFC <-> NFD conversion

---
 lib/util/charset/tests/convert_string.c | 108 ++++++++++++++++++++++++
 1 file changed, 108 insertions(+)

diff --git a/lib/util/charset/tests/convert_string.c b/lib/util/charset/tests/convert_string.c
index e63fca650b0..cb5e16b5ca1 100644
--- a/lib/util/charset/tests/convert_string.c
+++ b/lib/util/charset/tests/convert_string.c
@@ -96,6 +96,10 @@ static const char *gd_cp850_upper_base64 = "R5pOVEhFUiBERVNDSE5FUg==";
 static const char *gd_cp850_lower_base64 = "Z4FudGhlciBkZXNjaG5lcg==";
 static const char *gd_iso8859_1_base64 = "R/xudGhlciBEZXNjaG5lcg==";
 static const char *gd_utf16le_base64 = "RwD8AG4AdABoAGUAcgAgAEQAZQBzAGMAaABuAGUAcgA=";
+/* "täst" with precomposed ä (NFC): 74 c3 a4 73 74, plus a trailing NUL byte */
+static const char *utf8_nfc_base64 = "dMOkc3QA";
+/* "täst" with ä = a + combining diaeresis (NFD): 74 61 cc 88 73 74, plus NUL */
+static const char *utf8_nfd_base64 = "dGHMiHN0AA==";
 
 static bool test_gd_iso8859_cp850_handle(struct torture_context *tctx)
 {
@@ -1188,6 +1192,108 @@ static bool test_plato_latin_cp850_utf8_handle(struct torture_context *tctx)
 	return true;
 }
 
+static bool test_utf8_mac_nfc_to_nfd(struct torture_context *tctx)
+{
+	smb_iconv_t ic;
+	DATA_BLOB utf8_nfc_blob;
+	DATA_BLOB utf8_nfd_blob;
+	DATA_BLOB blob;
+	size_t nconv;
+	const char *src = NULL;
+	char *dst = NULL;
+	size_t dst_left;
+	size_t srclen;
+	bool ret = true;
+	/* Test: converting the NFC fixture must produce the NFD fixture. */
+	ic = smb_iconv_open("UTF8-MAC", "UTF8");
+	torture_assert_goto(tctx, ic != (smb_iconv_t)-1, ret, done,
+			    "creating iconv handle\n");
+	/* Decode both fixtures: NFC is the input, NFD the expected output. */
+	utf8_nfc_blob = base64_decode_data_blob_talloc(tctx, utf8_nfc_base64);
+	torture_assert_not_null_goto(tctx, utf8_nfc_blob.data, ret, done,
+				     "OOM\n");
+
+	utf8_nfd_blob = base64_decode_data_blob_talloc(tctx, utf8_nfd_base64);
+	torture_assert_not_null_goto(tctx, utf8_nfd_blob.data, ret, done,
+				     "OOM\n");
+	/* Output buffer; presumably zero-filled, which supplies the NUL below. */
+	blob = data_blob_talloc_zero(tctx, 255);
+	torture_assert_not_null_goto(tctx, blob.data, ret, done, "OOM\n");
+
+	dst = (char *)blob.data;
+	dst_left = blob.length;
+	src = (const char *)utf8_nfc_blob.data;
+	srclen = strlen(src);
+	/* srclen excludes the terminating NUL, so only the text is converted. */
+	nconv = smb_iconv(ic,
+			  &src,
+			  &srclen,
+			  &dst,
+			  &dst_left);
+	torture_assert_goto(tctx, nconv != (size_t)-1, ret, done,
+			    "smb_iconv failed\n");
+	/* Trim the blob to the converted bytes so it can be compared whole. */
+	blob.length = nconv + 1; /* +1 for the trailing zero */
+	torture_assert_data_blob_equal(tctx,
+				       blob,
+				       utf8_nfd_blob,
+				       "Conversion failed\n");
+	/* NOTE(review): ic is never released in this function - confirm. */
+done:
+	return ret;
+}
+
+static bool test_utf8_mac_nfd_to_nfc(struct torture_context *tctx)
+{
+	smb_iconv_t ic;
+	DATA_BLOB utf8_nfc_blob;
+	DATA_BLOB utf8_nfd_blob;
+	DATA_BLOB blob;
+	size_t nconv;
+	const char *src = NULL;
+	char *dst = NULL;
+	size_t dst_left;
+	size_t srclen;
+	bool ret = true;
+	/* Test: converting the NFD fixture must produce the NFC fixture. */
+	ic = smb_iconv_open("UTF8", "UTF8-MAC");
+	torture_assert_goto(tctx, ic != (smb_iconv_t)-1, ret, done,
+			    "creating iconv handle\n");
+	/* Decode both fixtures: NFD is the input, NFC the expected output. */
+	utf8_nfc_blob = base64_decode_data_blob_talloc(tctx, utf8_nfc_base64);
+	torture_assert_not_null_goto(tctx, utf8_nfc_blob.data, ret, done,
+				     "OOM\n");
+
+	utf8_nfd_blob = base64_decode_data_blob_talloc(tctx, utf8_nfd_base64);
+	torture_assert_not_null_goto(tctx, utf8_nfd_blob.data, ret, done,
+				     "OOM\n");
+	/* Output buffer; presumably zero-filled, which supplies the NUL below. */
+	blob = data_blob_talloc_zero(tctx, 255);
+	torture_assert_not_null_goto(tctx, blob.data, ret, done, "OOM\n");
+
+	dst = (char *)blob.data;
+	dst_left = blob.length;
+	src = (const char *)utf8_nfd_blob.data;
+	srclen = strlen(src);
+	/* srclen excludes the terminating NUL, so only the text is converted. */
+	nconv = smb_iconv(ic,
+			  &src,
+			  &srclen,
+			  &dst,
+			  &dst_left);
+	torture_assert_goto(tctx, nconv != (size_t)-1, ret, done,
+			    "smb_iconv failed\n");
+	/* Trim the blob to the converted bytes so it can be compared whole. */
+	blob.length = nconv + 1; /* +1 for the trailing zero */
+	torture_assert_data_blob_equal(tctx,
+				       blob,
+				       utf8_nfc_blob,
+				       "Conversion failed\n");
+	/* NOTE(review): ic is never released in this function - confirm. */
+done:
+	return ret;
+}
+
 static bool test_gd_case_utf8_handle(struct torture_context *tctx)
 {
 	struct smb_iconv_handle *iconv_handle;
@@ -1713,6 +1819,8 @@ struct torture_suite *torture_local_convert_string_handle(TALLOC_CTX *mem_ctx)
 	torture_suite_add_simple_test(suite, "plato_cp850_utf8", test_plato_cp850_utf8_handle);
 	torture_suite_add_simple_test(suite, "plato_minus_1", test_plato_minus_1_handle);
 	torture_suite_add_simple_test(suite, "plato_latin_cp850_utf8", test_plato_latin_cp850_utf8_handle);
+	torture_suite_add_simple_test(suite, "utf8-mac-nfc-to-nfd", test_utf8_mac_nfc_to_nfd);
+	torture_suite_add_simple_test(suite, "utf8-mac-nfd-to-nfc", test_utf8_mac_nfd_to_nfc);
 	return suite;
 }
 
-- 
2.20.1



More information about the samba-technical mailing list