svn commit: samba r18787 - in branches/SAMBA_3_0/source: include lib script smbd

jra at samba.org jra at samba.org
Thu Sep 21 17:00:08 GMT 2006


Author: jra
Date: 2006-09-21 17:00:07 +0000 (Thu, 21 Sep 2006)
New Revision: 18787

WebSVN: http://websvn.samba.org/cgi-bin/viewcvs.cgi?view=rev&root=samba&rev=18787

Log:
Fix the strlen_m and strlen_m_term code by merging
in (and using elsewhere) next_codepoint from Samba4.
Jerry please test.
Jeremy.

Modified:
   branches/SAMBA_3_0/source/include/charset.h
   branches/SAMBA_3_0/source/include/smb.h
   branches/SAMBA_3_0/source/lib/charcnv.c
   branches/SAMBA_3_0/source/lib/util_str.c
   branches/SAMBA_3_0/source/script/mkproto.awk
   branches/SAMBA_3_0/source/smbd/reply.c
   branches/SAMBA_3_0/source/smbd/service.c


Changeset:
Modified: branches/SAMBA_3_0/source/include/charset.h
===================================================================
--- branches/SAMBA_3_0/source/include/charset.h	2006-09-21 16:42:56 UTC (rev 18786)
+++ branches/SAMBA_3_0/source/include/charset.h	2006-09-21 17:00:07 UTC (rev 18787)
@@ -22,6 +22,7 @@
 /* this defines the charset types used in samba */
 typedef enum {CH_UCS2=0, CH_UTF16=0, CH_UNIX=1, CH_DISPLAY=2, CH_DOS=3, CH_UTF8=4} charset_t;
 
+#if 0
 /* FIXME!!!  Hack job for now to get the lsa ndr code compiling */
 #ifndef strlen_m
 #define strlen_m strlen
@@ -29,8 +30,8 @@
 #ifndef strlen_m_term
 #define strlen_m_term strlen
 #endif
+#endif
 
-
 #define NUM_CHARSETS 5
 
 /* 

Modified: branches/SAMBA_3_0/source/include/smb.h
===================================================================
--- branches/SAMBA_3_0/source/include/smb.h	2006-09-21 16:42:56 UTC (rev 18786)
+++ branches/SAMBA_3_0/source/include/smb.h	2006-09-21 17:00:07 UTC (rev 18787)
@@ -170,6 +170,10 @@
 #define COPY_UCS2_CHAR(dest,src) (((unsigned char *)(dest))[0] = ((unsigned char *)(src))[0],\
 				((unsigned char *)(dest))[1] = ((unsigned char *)(src))[1], (dest))
 
+/* Large data type for manipulating uint32 unicode codepoints */
+typedef uint32 codepoint_t;
+#define INVALID_CODEPOINT ((codepoint_t)-1)
+
 /* pipe string names */
 #define PIPE_LANMAN   "\\PIPE\\LANMAN"
 #define PIPE_SRVSVC   "\\PIPE\\srvsvc"

Modified: branches/SAMBA_3_0/source/lib/charcnv.c
===================================================================
--- branches/SAMBA_3_0/source/lib/charcnv.c	2006-09-21 16:42:56 UTC (rev 18786)
+++ branches/SAMBA_3_0/source/lib/charcnv.c	2006-09-21 17:00:07 UTC (rev 18787)
@@ -1374,33 +1374,86 @@
 	return 0;
 }
 
-/****************************************************************
- Calculate the size (in bytes) of the next multibyte character in
- our internal character set. Note that p must be pointing to a
- valid mb char, not within one.
-****************************************************************/
+/*
+  Return the unicode codepoint for the next multi-byte CH_UNIX character
+  in the string. The unicode codepoint (codepoint_t) is an unsigned 32 bit value.
 
-size_t next_mb_char_size(const char *s)
+  Also return the number of bytes consumed (which tells the caller
+  how many bytes to skip to get to the next CH_UNIX character).
+
+  Return INVALID_CODEPOINT if the next character cannot be converted.
+*/
+
+codepoint_t next_codepoint(const char *str, size_t *size)
 {
-	size_t i;
+	/* It cannot occupy more than 4 bytes in UTF16 format */
+	uint8_t buf[4];
+	smb_iconv_t descriptor;
+	size_t ilen_orig;
+	size_t ilen;
+	size_t olen;
+	char *outbuf;
 
-	if (!(*s & 0x80))
-		return 1; /* ascii. */
+	if ((str[0] & 0x80) == 0) {
+		*size = 1;
+		return (codepoint_t)str[0];
+	}
 
-	conv_silent = True;
-	for ( i = 1; i <=4; i++ ) {
-		smb_ucs2_t uc;
-		if (convert_string(CH_UNIX, CH_UCS2, s, i, &uc, 2, False) == 2) {
-#if 0 /* JRATEST */
-			DEBUG(10,("next_mb_char_size: size %u at string %s\n",
-				(unsigned int)i, s));
-#endif
-			conv_silent = False;
-			return i;
+	/* We assume that no multi-byte character can take
+	   more than 5 bytes. This is OK as we only
+	   support codepoints up to 1M */
+
+	ilen_orig = strnlen(str, 5);
+	ilen = ilen_orig;
+
+        lazy_initialize_conv();
+
+	/* CH_UCS2 == UTF16-LE. */
+        descriptor = conv_handles[CH_UNIX][CH_UCS2];
+	if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
+		*size = 1;
+		return INVALID_CODEPOINT;
+	}
+
+	/* This looks a little strange, but it is needed to cope
+	   with codepoints above 64k which are encoded as per RFC2781. */
+	olen = 2;
+	outbuf = (char *)buf;
+	smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
+	if (olen == 2) {
+		/* We failed to convert to a 2 byte character.
+		   See if we can convert to a 4 UTF16-LE byte char encoding.
+		*/
+		olen = 4;
+		outbuf = (char *)buf;
+		smb_iconv(descriptor,  &str, &ilen, &outbuf, &olen);
+		if (olen == 4) {
+			/* We didn't convert any bytes */
+			*size = 1;
+			return INVALID_CODEPOINT;
 		}
+		olen = 4 - olen;
+	} else {
+		olen = 2 - olen;
 	}
-	/* We're hosed - we don't know how big this is... */
-	DEBUG(10,("next_mb_char_size: unknown size at string %s\n", s));
-	conv_silent = False;
-	return 1;
+
+	*size = ilen_orig - ilen;
+
+	if (olen == 2) {
+		/* 2 byte, UTF16-LE encoded value. */
+		return (codepoint_t)SVAL(buf, 0);
+	}
+	if (olen == 4) {
+		/* Decode a 4 byte UTF16-LE character manually.
+		   See RFC2781 for the encoding mechanism.
+		*/
+		codepoint_t w1 = SVAL(buf,0) & ~0xD800;
+		codepoint_t w2 = SVAL(buf,2) & ~0xDC00;
+
+		return (codepoint_t)0x10000 +
+				(w1 << 10) + w2;
+	}
+
+	/* no other length is valid */
+	return INVALID_CODEPOINT;
 }

Modified: branches/SAMBA_3_0/source/lib/util_str.c
===================================================================
--- branches/SAMBA_3_0/source/lib/util_str.c	2006-09-21 16:42:56 UTC (rev 18786)
+++ branches/SAMBA_3_0/source/lib/util_str.c	2006-09-21 17:00:07 UTC (rev 18787)
@@ -1593,6 +1593,58 @@
 }
 
 /**
+ Count the number of UCS2 characters in a string. Normally this will
+ be the same as the number of bytes in a string for single byte strings,
+ but will be different for multibyte.
+**/
+
+size_t strlen_m(const char *s)
+{
+	size_t count = 0;
+
+	if (!s) {
+		return 0;
+	}
+
+	while (*s && !(((uint8_t)*s) & 0x80)) {
+		s++;
+		count++;
+	}
+
+	if (!*s) {
+		return count;
+	}
+
+	while (*s) {
+		size_t c_size;
+		codepoint_t c = next_codepoint(s, &c_size);
+		if (c < 0x10000) {
+			/* Unicode char fits into 16 bits. */
+			count += 1;
+		} else {
+			/* Double-width unicode char - 32 bits. */
+			count += 2;
+		}
+		s += c_size;
+	}
+
+	return count;
+}
+
+/**
+ Count the number of UCS2 characters in a string including the null
+ terminator.
+**/
+
+size_t strlen_m_term(const char *s)
+{
+	if (!s) {
+		return 0;
+	}
+	return strlen_m(s) + 1;
+}
+
+/**
  Return a RFC2254 binary string representation of a buffer.
  Used in LDAP filters.
  Caller must free.

Modified: branches/SAMBA_3_0/source/script/mkproto.awk
===================================================================
--- branches/SAMBA_3_0/source/script/mkproto.awk	2006-09-21 16:42:56 UTC (rev 18786)
+++ branches/SAMBA_3_0/source/script/mkproto.awk	2006-09-21 17:00:07 UTC (rev 18787)
@@ -146,7 +146,7 @@
     gotstart = 1;
   }
 
-  if( $0 ~ /^NODE_STATUS_STRUCT|SMB_STRUCT_DIR|ELOG_TDB/ ) {
+  if( $0 ~ /^NODE_STATUS_STRUCT|SMB_STRUCT_DIR|ELOG_TDB|codepoint_t/ ) {
     gotstart = 1;
   }
 

Modified: branches/SAMBA_3_0/source/smbd/reply.c
===================================================================
--- branches/SAMBA_3_0/source/smbd/reply.c	2006-09-21 16:42:56 UTC (rev 18786)
+++ branches/SAMBA_3_0/source/smbd/reply.c	2006-09-21 17:00:07 UTC (rev 18787)
@@ -132,13 +132,22 @@
 					break;
 			}
 		} else {
-			switch(next_mb_char_size(s)) {
+			size_t siz;
+			/* Get the size of the next MB character. */
+			next_codepoint(s,&siz);
+			switch(siz) {
+				case 5:
+					*d++ = *s++;
+					/*fall through*/
 				case 4:
 					*d++ = *s++;
+					/*fall through*/
 				case 3:
 					*d++ = *s++;
+					/*fall through*/
 				case 2:
 					*d++ = *s++;
+					/*fall through*/
 				case 1:
 					*d++ = *s++;
 					break;
@@ -266,7 +275,13 @@
 			}
 			*d++ = *s++;
 		} else {
-			switch(next_mb_char_size(s)) {
+			size_t siz;
+			/* Get the size of the next MB character. */
+			next_codepoint(s,&siz);
+			switch(siz) {
+				case 5:
+					*d++ = *s++;
+					/*fall through*/
 				case 4:
 					*d++ = *s++;
 					/*fall through*/
@@ -374,7 +389,13 @@
 		if (!(*s & 0x80)) {
 			*d++ = *s++;
 		} else {
-			switch(next_mb_char_size(s)) {
+			size_t siz;
+			/* Get the size of the next MB character. */
+			next_codepoint(s,&siz);
+			switch(siz) {
+				case 5:
+					*d++ = *s++;
+					/*fall through*/
 				case 4:
 					*d++ = *s++;
 					/*fall through*/

Modified: branches/SAMBA_3_0/source/smbd/service.c
===================================================================
--- branches/SAMBA_3_0/source/smbd/service.c	2006-09-21 16:42:56 UTC (rev 18786)
+++ branches/SAMBA_3_0/source/smbd/service.c	2006-09-21 17:00:07 UTC (rev 18787)
@@ -95,13 +95,22 @@
 		if (!(*s & 0x80)) {
 			*d++ = *s++;
 		} else {
-			switch(next_mb_char_size(s)) {
+			size_t siz;
+			/* Get the size of the next MB character. */
+			next_codepoint(s,&siz);
+			switch(siz) {
+				case 5:
+					*d++ = *s++;
+					/*fall through*/
 				case 4:
 					*d++ = *s++;
+					/*fall through*/
 				case 3:
 					*d++ = *s++;
+					/*fall through*/
 				case 2:
 					*d++ = *s++;
+					/*fall through*/
 				case 1:
 					*d++ = *s++;
 					break;



More information about the samba-cvs mailing list