List:Commits« Previous MessageNext Message »
From:Alexander Barkov Date:February 15 2011 11:33am
Subject:bzr commit into mysql-trunk branch (alexander.barkov:3648) WL#5331
View as plain text  
#At file:///home/bar/mysql-bzr/mysql-trunk/ based on revid:luis.soares@stripped

 3648 Alexander Barkov	2011-02-15
      A pre-requisite patch for
      WL#5331 Support Unicode for Windows command line client
      
      Moving character set conversion routine implementation from
      sql/sql_string.cc to strings/ctype.c, as conversion functionality
      is occasionally needed in client tools.

    modified:
      client/sql_string.cc
      client/sql_string.h
      include/m_ctype.h
      sql/sql_string.cc
      sql/sql_string.h
      strings/ctype.c
=== modified file 'client/sql_string.cc'
--- a/client/sql_string.cc	2010-09-13 09:58:11 +0000
+++ b/client/sql_string.cc	2011-02-15 11:30:56 +0000
@@ -707,80 +707,6 @@ String *copy_if_not_alloced(String *to,S
   Help functions
 ****************************************************************************/
 
-/*
-  copy a string from one character set to another
-  
-  SYNOPSIS
-    copy_and_convert()
-    to			Store result here
-    to_cs		Character set of result string
-    from		Copy from here
-    from_length		Length of from string
-    from_cs		From character set
-
-  NOTES
-    'to' must be big enough as form_length * to_cs->mbmaxlen
-
-  RETURN
-    length of bytes copied to 'to'
-*/
-
-
-uint32
-copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, 
-                 const char *from, uint32 from_length, CHARSET_INFO *from_cs,
-                 uint *errors)
-{
-  int         cnvres;
-  my_wc_t     wc;
-  const uchar *from_end= (const uchar*) from+from_length;
-  char *to_start= to;
-  uchar *to_end= (uchar*) to+to_length;
-  my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
-  my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
-  uint error_count= 0;
-
-  while (1)
-  {
-    if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from,
-				      from_end)) > 0)
-      from+= cnvres;
-    else if (cnvres == MY_CS_ILSEQ)
-    {
-      error_count++;
-      from++;
-      wc= '?';
-    }
-    else if (cnvres > MY_CS_TOOSMALL)
-    {
-      /*
-        A correct multibyte sequence detected
-        But it doesn't have Unicode mapping.
-      */
-      error_count++;
-      from+= (-cnvres);
-      wc= '?';
-    }
-    else
-      break;  // Not enough characters
-
-outp:
-    if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
-      to+= cnvres;
-    else if (cnvres == MY_CS_ILUNI && wc != '?')
-    {
-      error_count++;
-      wc= '?';
-      goto outp;
-    }
-    else
-      break;
-  }
-  *errors= error_count;
-  return (uint32) (to - to_start);
-}
-
-
 void String::print(String *str)
 {
   char *st= (char*)Ptr, *end= st+str_length;

=== modified file 'client/sql_string.h'
--- a/client/sql_string.h	2010-10-19 22:51:34 +0000
+++ b/client/sql_string.h	2011-02-15 11:30:56 +0000
@@ -25,9 +25,12 @@
 class String;
 int sortcmp(const String *a,const String *b, CHARSET_INFO *cs);
 String *copy_if_not_alloced(String *a,String *b,uint32 arg_length);
-uint32 copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
-			const char *from, uint32 from_length,
-			CHARSET_INFO *from_cs, uint *errors);
+inline uint32 copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
+                               const char *from, uint32 from_length,
+                               CHARSET_INFO *from_cs, uint *errors)
+{
+  return my_convert(to, to_length, to_cs, from, from_length, from_cs, errors);
+}
 
 class String
 {

=== modified file 'include/m_ctype.h'
--- a/include/m_ctype.h	2010-12-20 10:28:06 +0000
+++ b/include/m_ctype.h	2011-02-15 11:30:56 +0000
@@ -686,6 +686,10 @@ my_bool my_charset_is_ascii_compatible(C
 extern size_t my_vsnprintf_ex(CHARSET_INFO *cs, char *to, size_t n,
                               const char* fmt, va_list ap);
 
+uint32 my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, 
+                  const char *from, uint32 from_length, CHARSET_INFO *from_cs,
+                  uint *errors);
+
 #define	_MY_U	01	/* Upper case */
 #define	_MY_L	02	/* Lower case */
 #define	_MY_NMR	04	/* Numeral (digit) */

=== modified file 'sql/sql_string.cc'
--- a/sql/sql_string.cc	2011-01-13 08:19:52 +0000
+++ b/sql/sql_string.cc	2011-02-15 11:30:56 +0000
@@ -750,140 +750,6 @@ String *copy_if_not_alloced(String *to,S
   Help functions
 ****************************************************************************/
 
-/*
-  copy a string from one character set to another
-  
-  SYNOPSIS
-    copy_and_convert()
-    to			Store result here
-    to_cs		Character set of result string
-    from		Copy from here
-    from_length		Length of from string
-    from_cs		From character set
-
-  NOTES
-    'to' must be big enough as form_length * to_cs->mbmaxlen
-
-  RETURN
-    length of bytes copied to 'to'
-*/
-
-
-static uint32
-copy_and_convert_extended(char *to, uint32 to_length, CHARSET_INFO *to_cs, 
-                          const char *from, uint32 from_length,
-                          CHARSET_INFO *from_cs,
-                          uint *errors)
-{
-  int         cnvres;
-  my_wc_t     wc;
-  const uchar *from_end= (const uchar*) from+from_length;
-  char *to_start= to;
-  uchar *to_end= (uchar*) to+to_length;
-  my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
-  my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
-  uint error_count= 0;
-
-  while (1)
-  {
-    if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from,
-				      from_end)) > 0)
-      from+= cnvres;
-    else if (cnvres == MY_CS_ILSEQ)
-    {
-      error_count++;
-      from++;
-      wc= '?';
-    }
-    else if (cnvres > MY_CS_TOOSMALL)
-    {
-      /*
-        A correct multibyte sequence detected
-        But it doesn't have Unicode mapping.
-      */
-      error_count++;
-      from+= (-cnvres);
-      wc= '?';
-    }
-    else
-      break;  // Not enough characters
-
-outp:
-    if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
-      to+= cnvres;
-    else if (cnvres == MY_CS_ILUNI && wc != '?')
-    {
-      error_count++;
-      wc= '?';
-      goto outp;
-    }
-    else
-      break;
-  }
-  *errors= error_count;
-  return (uint32) (to - to_start);
-}
-
-
-/*
-  Optimized for quick copying of ASCII characters in the range 0x00..0x7F.
-*/
-uint32
-copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, 
-                 const char *from, uint32 from_length, CHARSET_INFO *from_cs,
-                 uint *errors)
-{
-  /*
-    If any of the character sets is not ASCII compatible,
-    immediately switch to slow mb_wc->wc_mb method.
-  */
-  if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
-    return copy_and_convert_extended(to, to_length, to_cs,
-                                     from, from_length, from_cs, errors);
-
-  uint32 length= min(to_length, from_length), length2= length;
-
-#if defined(__i386__)
-  /*
-    Special loop for i386, it allows to refer to a
-    non-aligned memory block as UINT32, which makes
-    it possible to copy four bytes at once. This
-    gives about 10% performance improvement comparing
-    to byte-by-byte loop.
-  */
-  for ( ; length >= 4; length-= 4, from+= 4, to+= 4)
-  {
-    if ((*(uint32*)from) & 0x80808080)
-      break;
-    *((uint32*) to)= *((const uint32*) from);
-  }
-#endif
-
-  for (; ; *to++= *from++, length--)
-  {
-    if (!length)
-    {
-      *errors= 0;
-      return length2;
-    }
-    if (*((unsigned char*) from) > 0x7F) /* A non-ASCII character */
-    {
-      uint32 copied_length= length2 - length;
-      to_length-= copied_length;
-      from_length-= copied_length;
-      return copied_length + copy_and_convert_extended(to, to_length,
-                                                       to_cs,
-                                                       from, from_length,
-                                                       from_cs,
-                                                       errors);
-    }
-  }
-
-  DBUG_ASSERT(FALSE); // Should never get to here
-  return 0;           // Make compiler happy
-}
-
-
 /**
   Copy string with HEX-encoding of "bad" characters.
 

=== modified file 'sql/sql_string.h'
--- a/sql/sql_string.h	2011-01-13 08:19:52 +0000
+++ b/sql/sql_string.h	2011-02-15 11:30:56 +0000
@@ -33,9 +33,12 @@ typedef struct st_mem_root MEM_ROOT;
 
 int sortcmp(const String *a,const String *b, CHARSET_INFO *cs);
 String *copy_if_not_alloced(String *a,String *b,uint32 arg_length);
-uint32 copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
-			const char *from, uint32 from_length,
-			CHARSET_INFO *from_cs, uint *errors);
+inline uint32 copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
+                               const char *from, uint32 from_length,
+                               CHARSET_INFO *from_cs, uint *errors)
+{
+  return my_convert(to, to_length, to_cs, from, from_length, from_cs, errors);
+}
 uint32 well_formed_copy_nchars(CHARSET_INFO *to_cs,
                                char *to, uint to_length,
                                CHARSET_INFO *from_cs,

=== modified file 'strings/ctype.c'
--- a/strings/ctype.c	2011-01-19 13:35:54 +0000
+++ b/strings/ctype.c	2011-02-15 11:30:56 +0000
@@ -927,3 +927,143 @@ my_charset_is_ascii_compatible(CHARSET_I
   }
   return 1;
 }
+
+
+/*
+  Convert a string between two character sets, one character at a time.
+  'to' must be large enough to store (from_length * to_cs->mbmaxlen) bytes.
+
+  @param  to[OUT]       Store result here
+  @param  to_length     Size of "to" buffer
+  @param  to_cs         Character set of result string
+  @param  from          Copy from here
+  @param  from_length   Length of the "from" string
+  @param  from_cs       Character set of the "from" string
+  @param  errors[OUT]   Number of conversion errors
+
+  @return Number of bytes copied to 'to' string
+*/
+
+static uint32
+my_convert_internal(char *to, uint32 to_length, CHARSET_INFO *to_cs, 
+                    const char *from, uint32 from_length,CHARSET_INFO *from_cs,
+                    uint *errors)
+{
+  int         cnvres;
+  my_wc_t     wc;
+  const uchar *from_end= (const uchar*) from + from_length;
+  char *to_start= to;
+  uchar *to_end= (uchar*) to + to_length;
+  my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
+  my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
+  uint error_count= 0;
+
+  while (1)
+  {
+    if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0)
+      from+= cnvres;                     /* Advance past the character */
+    else if (cnvres == MY_CS_ILSEQ)
+    {
+      error_count++;
+      from++;                            /* Skip one byte and go on */
+      wc= '?';                           /* Store '?' in its place */
+    }
+    else if (cnvres > MY_CS_TOOSMALL)
+    {
+      /*
+        A correct multibyte sequence detected
+        But it doesn't have Unicode mapping.
+      */
+      error_count++;
+      from+= (-cnvres);                  /* -cnvres is the sequence length */
+      wc= '?';
+    }
+    else
+      break;  /* Not enough characters */
+
+outp:
+    if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
+      to+= cnvres;                       /* Advance past the stored bytes */
+    else if (cnvres == MY_CS_ILUNI && wc != '?')
+    {
+      error_count++;
+      wc= '?';                           /* Retry once with '?' instead */
+      goto outp;
+    }
+    else
+      break;                             /* Cannot store the character: stop */
+  }
+  *errors= error_count;
+  return (uint32) (to - to_start);
+}
+
+
+/*
+  Convert a string between two character sets.
+  Optimized for quick copying of ASCII characters in the range 0x00..0x7F.
+  'to' must be large enough to store (from_length * to_cs->mbmaxlen) bytes.
+
+  @param  to[OUT]       Store result here
+  @param  to_length     Size of "to" buffer
+  @param  to_cs         Character set of result string
+  @param  from          Copy from here
+  @param  from_length   Length of the "from" string
+  @param  from_cs       Character set of the "from" string
+  @param  errors[OUT]   Number of conversion errors
+
+  @return Number of bytes copied to 'to' string
+*/
+
+uint32
+my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, 
+           const char *from, uint32 from_length, CHARSET_INFO *from_cs,
+           uint *errors)
+{
+  uint32 length, length2;
+  /*
+    If any of the character sets is not ASCII compatible,
+    immediately switch to slow mb_wc->wc_mb method.
+  */
+  if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
+    return my_convert_internal(to, to_length, to_cs,
+                               from, from_length, from_cs, errors);
+
+  length= length2= min(to_length, from_length); /* Fast path byte budget */
+
+#if defined(__i386__)
+  /*
+    Special loop for i386, it allows to refer to a
+    non-aligned memory block as UINT32, which makes
+    it possible to copy four bytes at once. This
+    gives about 10% performance improvement comparing
+    to byte-by-byte loop.
+  */
+  for ( ; length >= 4; length-= 4, from+= 4, to+= 4)
+  {
+    if ((*(uint32*)from) & 0x80808080)   /* Some byte has its high bit set */
+      break;
+    *((uint32*) to)= *((const uint32*) from);
+  }
+#endif /* __i386__ */
+
+  for (; ; *to++= *from++, length--)
+  {
+    if (!length)                         /* Whole budget copied byte by byte */
+    {
+      *errors= 0;
+      return length2;
+    }
+    if (*((unsigned char*) from) > 0x7F) /* A non-ASCII character */
+    {
+      uint32 copied_length= length2 - length; /* Bytes copied so far */
+      to_length-= copied_length;
+      from_length-= copied_length;
+      return copied_length + my_convert_internal(to, to_length, to_cs,
+                                                 from, from_length, from_cs,
+                                                 errors);
+    }
+  }
+
+  DBUG_ASSERT(FALSE); /* Should never get to here */
+  return 0;           /* Make compiler happy */
+}


Attachment: [text/bzr-bundle] bzr/alexander.barkov@oracle.com-20110215113056-5qzxii3z08ibsvy7.bundle
Thread
bzr commit into mysql-trunk branch (alexander.barkov:3648) WL#5331Alexander Barkov15 Feb