[FYI] Adding support for Unicode normalisation conversion to lib/util/charset/
Ralph Böhme
slow at samba.org
Tue Apr 9 16:50:28 UTC 2019
Hi!
FWIW, maybe this is of interest to someone: I have a small patchset that adds
support for Unicode normalisation NFC/NFD conversion to lib/util/charset/ using
libicu.
I need this to convert strings coming in over mdssvc RPC (aka Spotlight) from
Mac clients.
There's a more complete patchset running in this pipeline:
https://gitlab.com/samba-team/devel/samba/pipelines/55974836
This is on top of the GitLab WIP patches for container image rebuild
automation. I'm adding libicu-devel packages to the images here to ensure the
torture test of the patchset is always run.
I will submit a full patchset once the gitlab stuff is upstream. Hopefully soon!
:)
-slow
--
Ralph Boehme, Samba Team https://samba.org/
Samba Developer, SerNet GmbH https://sernet.de/en/samba/
GPG-Fingerprint FAE2C6088A24252051C559E4AA1E9B7126399E46
-------------- next part --------------
From a6eb7fe60993fcf8a56a0c22e65a61a1e01d05b8 Mon Sep 17 00:00:00 2001
From: Ralph Boehme <slow at samba.org>
Date: Tue, 9 Apr 2019 11:21:57 +0200
Subject: [PATCH 1/2] charset: add support for Unicode normalisation with
libicu
---
lib/util/charset/iconv.c | 109 +++++++++++++++++++++++++++++
lib/util/charset/wscript_build | 2 +-
lib/util/charset/wscript_configure | 11 +++
3 files changed, 121 insertions(+), 1 deletion(-)
diff --git a/lib/util/charset/iconv.c b/lib/util/charset/iconv.c
index 4fae09fda52..23551023e87 100644
--- a/lib/util/charset/iconv.c
+++ b/lib/util/charset/iconv.c
@@ -28,6 +28,11 @@
#include "libcli/util/ntstatus.h"
#include "lib/util/util_str_hex.h"
+#ifdef HAVE_ICU_I18N
+#include <unicode/ustring.h>
+#include <unicode/utrans.h>
+#endif
+
#ifdef strcasecmp
#undef strcasecmp
#endif
@@ -165,6 +170,64 @@ static size_t sys_iconv(void *cd,
}
#endif
+#ifdef HAVE_ICU_I18N
+/*
+ * smb_iconv "direct" handler doing Unicode normalisation with libicu.
+ *
+ * cd is the UTransliterator opened by smb_iconv_open_ex(). The complete
+ * input is consumed in a single call; on success the in/out pointers and
+ * their remaining-byte counters are advanced accordingly.
+ *
+ * Returns the number of bytes written to *outbuf, or (size_t)-1 if ICU
+ * reports an error (e.g. a buffer is too small). errno is not set here.
+ */
+static size_t sys_uconv(void *cd,
+			const char **inbuf,
+			size_t *inbytesleft,
+			char **outbuf,
+			size_t *outbytesleft)
+{
+	UTransliterator *t = (UTransliterator *)cd;
+	/* Capacity in UChars, not bytes; never 0 (empty VLA is undefined) */
+	size_t ucap = (*inbytesleft != 0) ? *inbytesleft * 2 : 1;
+	UChar ustr[ucap];
+	/* ICU capacities are int32_t: clamp instead of wrapping negative */
+	int32_t outcap = (*outbytesleft > INT32_MAX) ?
+		INT32_MAX : (int32_t)*outbytesleft;
+	int32_t ustrlen = 0;
+	int32_t limit;
+	int32_t converted_len = 0;
+	UErrorCode ue = U_ZERO_ERROR;
+
+	/* Convert from UTF-8 to UTF-16 */
+	(void)u_strFromUTF8(ustr, ucap, &ustrlen, *inbuf, *inbytesleft, &ue);
+	if (U_FAILURE(ue)) {
+		return -1;
+	}
+
+	limit = ustrlen;
+
+	/* Runs transliteration *in place* (!), so pass the real capacity */
+	utrans_transUChars(t, ustr, &ustrlen, ucap, 0, &limit, &ue);
+	if (U_FAILURE(ue)) {
+		return -1;
+	}
+
+	/* Convert the normalised UTF-16 text back to UTF-8 */
+	(void)u_strToUTF8(*outbuf, outcap, &converted_len, ustr, ustrlen, &ue);
+	if (U_FAILURE(ue)) {
+		return -1;
+	}
+
+	*inbuf += *inbytesleft;
+	*inbytesleft = 0;
+	*outbuf += converted_len;
+	*outbytesleft -= converted_len;
+
+	return converted_len;
+}
+#endif
+
/**
* This is a simple portable iconv() implementaion.
*
@@ -302,6 +365,52 @@ _PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode,
}
#endif
+#ifdef HAVE_ICU_I18N
+	/*
+	 * Unicode normalisation via libicu: "UTF8-MAC" is handled as the
+	 * decomposed (NFD) form and plain "UTF8" as the precomposed (NFC)
+	 * form. Both directions are served directly by sys_uconv().
+	 */
+	if (strcasecmp(fromcode, "UTF8-MAC") == 0 &&
+	    strcasecmp(tocode, "UTF8") == 0)
+	{
+		/* U_STRING_DECL needs a constant length: 7 chars in the id */
+		U_STRING_DECL(tname, "any-nfc", 7);
+		UErrorCode ue = U_ZERO_ERROR;
+
+		U_STRING_INIT(tname, "any-nfc", 7);
+
+		ret->cd_direct = utrans_openU(tname, 7, UTRANS_FORWARD,
+					      NULL, 0, NULL, &ue);
+		if (U_FAILURE(ue)) {
+			/* Shared error exit (assumed to release ret) */
+			goto failed;
+		}
+		ret->direct = sys_uconv;
+		return ret;
+	}
+
+	/* Reverse direction: precomposed input, decomposed output */
+	if (strcasecmp(fromcode, "UTF8") == 0 &&
+	    strcasecmp(tocode, "UTF8-MAC") == 0)
+	{
+		/* U_STRING_DECL needs a constant length: 7 chars in the id */
+		U_STRING_DECL(tname, "any-nfd", 7);
+		UErrorCode ue = U_ZERO_ERROR;
+
+		U_STRING_INIT(tname, "any-nfd", 7);
+
+		ret->cd_direct = utrans_openU(tname, 7, UTRANS_FORWARD,
+					      NULL, 0, NULL, &ue);
+		if (U_FAILURE(ue)) {
+			/* Shared error exit (assumed to release ret) */
+			goto failed;
+		}
+		ret->direct = sys_uconv;
+		return ret;
+	}
+#endif
+
if (ret->pull == NULL && from == NULL) {
goto failed;
}
diff --git a/lib/util/charset/wscript_build b/lib/util/charset/wscript_build
index a3728f6a4bd..8fed718e7dc 100644
--- a/lib/util/charset/wscript_build
+++ b/lib/util/charset/wscript_build
@@ -2,7 +2,7 @@
bld.SAMBA_SUBSYSTEM('ICONV_WRAPPER',
source='iconv.c',
- public_deps='iconv replace talloc')
+ public_deps='iconv replace talloc ' + bld.env['icu-libs'])
bld.SAMBA_SUBSYSTEM('charset',
public_headers='charset.h',
diff --git a/lib/util/charset/wscript_configure b/lib/util/charset/wscript_configure
index d5ac5d0100f..ca49ec6d3dc 100644
--- a/lib/util/charset/wscript_configure
+++ b/lib/util/charset/wscript_configure
@@ -36,3 +36,14 @@ conf.CHECK_CODE('''
msg='Checking errno of iconv for illegal multibyte sequence',
lib='iconv',
headers='errno.h iconv.h')
+
+# Default to '' so wscript_build can always append conf.env['icu-libs']
+conf.env['icu-libs'] = ''
+if conf.CHECK_CFG(package='icu-i18n', args='--cflags --libs',
+                  msg='Checking for icu-i18n', uselib_store='ICU_I18N'):
+    for lib in conf.env['LIB_ICU_I18N']:
+        conf.CHECK_LIB(lib, shlib=True, mandatory=True)
+    conf.env['icu-libs'] = ' '.join(conf.env['LIB_ICU_I18N'])
+    if not conf.CHECK_HEADERS('unicode/ustring.h'):
+        conf.fatal('Found libicu, but unicode/ustring.h is missing')
+    conf.DEFINE('HAVE_UTF8_MAC', 1)
--
2.20.1
From 8f8fd5ecfc76955ac713608615455a303d985d46 Mon Sep 17 00:00:00 2001
From: Ralph Boehme <slow at samba.org>
Date: Tue, 9 Apr 2019 13:34:39 +0200
Subject: [PATCH 2/2] charset: add tests for Unicode NFC <-> NFD conversion
---
lib/util/charset/tests/convert_string.c | 108 ++++++++++++++++++++++++
1 file changed, 108 insertions(+)
diff --git a/lib/util/charset/tests/convert_string.c b/lib/util/charset/tests/convert_string.c
index e63fca650b0..cb5e16b5ca1 100644
--- a/lib/util/charset/tests/convert_string.c
+++ b/lib/util/charset/tests/convert_string.c
@@ -96,6 +96,10 @@ static const char *gd_cp850_upper_base64 = "R5pOVEhFUiBERVNDSE5FUg==";
static const char *gd_cp850_lower_base64 = "Z4FudGhlciBkZXNjaG5lcg==";
static const char *gd_iso8859_1_base64 = "R/xudGhlciBEZXNjaG5lcg==";
static const char *gd_utf16le_base64 = "RwD8AG4AdABoAGUAcgAgAEQAZQBzAGMAaABuAGUAcgA=";
+/* täst */
+static const char *utf8_nfc_base64 = "dMOkc3QA";
+/* täst, where ä = a + combining diaeresis */
+static const char *utf8_nfd_base64 = "dGHMiHN0AA==";
static bool test_gd_iso8859_cp850_handle(struct torture_context *tctx)
{
@@ -1188,6 +1192,108 @@ static bool test_plato_latin_cp850_utf8_handle(struct torture_context *tctx)
return true;
}
+/*
+ * Convert precomposed (NFC) UTF-8 to decomposed (NFD) "UTF8-MAC" through
+ * smb_iconv() and compare the result with the expected NFD byte sequence.
+ */
+static bool test_utf8_mac_nfc_to_nfd(struct torture_context *tctx)
+{
+	smb_iconv_t ic;
+	DATA_BLOB utf8_nfc_blob;
+	DATA_BLOB utf8_nfd_blob;
+	DATA_BLOB blob;
+	size_t nconv;
+	const char *src = NULL;
+	char *dst = NULL;
+	size_t dst_left;
+	size_t srclen;
+	bool ret = true;
+
+	/* Handle converting "UTF8" input into "UTF8-MAC" output */
+	ic = smb_iconv_open("UTF8-MAC", "UTF8");
+	torture_assert_goto(tctx, ic != (smb_iconv_t)-1, ret, done,
+			    "creating iconv handle\n");
+
+	utf8_nfc_blob = base64_decode_data_blob_talloc(tctx, utf8_nfc_base64);
+	torture_assert_not_null_goto(tctx, utf8_nfc_blob.data, ret, done,
+				     "OOM\n");
+
+	utf8_nfd_blob = base64_decode_data_blob_talloc(tctx, utf8_nfd_base64);
+	torture_assert_not_null_goto(tctx, utf8_nfd_blob.data, ret, done,
+				     "OOM\n");
+
+	blob = data_blob_talloc_zero(tctx, 255);
+	torture_assert_not_null_goto(tctx, blob.data, ret, done, "OOM\n");
+
+	dst = (char *)blob.data;
+	dst_left = blob.length;
+	src = (const char *)utf8_nfc_blob.data;
+	srclen = strlen(src);
+
+	nconv = smb_iconv(ic, &src, &srclen, &dst, &dst_left);
+	torture_assert_goto(tctx, nconv != (size_t)-1, ret, done,
+			    "smb_iconv failed\n");
+
+	/* Length from the output space used, +1 for the trailing zero */
+	blob.length = (blob.length - dst_left) + 1;
+	torture_assert_data_blob_equal(tctx, blob, utf8_nfd_blob,
+				       "Conversion failed\n");
+
+done:
+	return ret;
+}
+
+/*
+ * Convert decomposed (NFD) "UTF8-MAC" to precomposed (NFC) UTF-8 through
+ * smb_iconv() and compare the result with the expected NFC byte sequence.
+ */
+static bool test_utf8_mac_nfd_to_nfc(struct torture_context *tctx)
+{
+	smb_iconv_t ic;
+	DATA_BLOB utf8_nfc_blob;
+	DATA_BLOB utf8_nfd_blob;
+	DATA_BLOB blob;
+	size_t nconv;
+	const char *src = NULL;
+	char *dst = NULL;
+	size_t dst_left;
+	size_t srclen;
+	bool ret = true;
+
+	/* Handle converting "UTF8-MAC" input into "UTF8" output */
+	ic = smb_iconv_open("UTF8", "UTF8-MAC");
+	torture_assert_goto(tctx, ic != (smb_iconv_t)-1, ret, done,
+			    "creating iconv handle\n");
+
+	utf8_nfc_blob = base64_decode_data_blob_talloc(tctx, utf8_nfc_base64);
+	torture_assert_not_null_goto(tctx, utf8_nfc_blob.data, ret, done,
+				     "OOM\n");
+
+	utf8_nfd_blob = base64_decode_data_blob_talloc(tctx, utf8_nfd_base64);
+	torture_assert_not_null_goto(tctx, utf8_nfd_blob.data, ret, done,
+				     "OOM\n");
+
+	blob = data_blob_talloc_zero(tctx, 255);
+	torture_assert_not_null_goto(tctx, blob.data, ret, done, "OOM\n");
+
+	dst = (char *)blob.data;
+	dst_left = blob.length;
+	src = (const char *)utf8_nfd_blob.data;
+	srclen = strlen(src);
+
+	nconv = smb_iconv(ic, &src, &srclen, &dst, &dst_left);
+	torture_assert_goto(tctx, nconv != (size_t)-1, ret, done,
+			    "smb_iconv failed\n");
+
+	/* Length from the output space used, +1 for the trailing zero */
+	blob.length = (blob.length - dst_left) + 1;
+	torture_assert_data_blob_equal(tctx, blob, utf8_nfc_blob,
+				       "Conversion failed\n");
+
+done:
+	return ret;
+}
+
static bool test_gd_case_utf8_handle(struct torture_context *tctx)
{
struct smb_iconv_handle *iconv_handle;
@@ -1713,6 +1819,8 @@ struct torture_suite *torture_local_convert_string_handle(TALLOC_CTX *mem_ctx)
torture_suite_add_simple_test(suite, "plato_cp850_utf8", test_plato_cp850_utf8_handle);
torture_suite_add_simple_test(suite, "plato_minus_1", test_plato_minus_1_handle);
torture_suite_add_simple_test(suite, "plato_latin_cp850_utf8", test_plato_latin_cp850_utf8_handle);
+ torture_suite_add_simple_test(suite, "utf8-mac-nfc-to-nfd", test_utf8_mac_nfc_to_nfd);
+ torture_suite_add_simple_test(suite, "utf8-mac-nfd-to-nfc", test_utf8_mac_nfd_to_nfc);
return suite;
}
--
2.20.1
More information about the samba-technical
mailing list