List:Commits« Previous MessageNext Message »
From:kgeorge Date:February 6 2008 5:45pm
Subject:bk commit into 6.0 tree (gkodinov:1.2799) BUG#14637
View as plain text  
Below is the list of changes that have just been committed into a local
6.0 repository of gkodinov.  When gkodinov does a push these changes
will be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet@stripped, 2008-02-06 18:45:46+02:00, gkodinov@stripped +5 -0
  Bug #14637: trim trailing spaces processes data only byte wise
   Use and int * where possible to scan for trailing space in a
   string instead of always iterating char-by-char.

  include/m_string.h@stripped, 2008-02-06 18:45:44+02:00, gkodinov@stripped +59 -0
    Bug #14637: scan for space through ints

  strings/ctype-bin.c@stripped, 2008-02-06 18:45:44+02:00, gkodinov@stripped +1 -4
    Bug #14637: scan for space through ints

  strings/ctype-latin1.c@stripped, 2008-02-06 18:45:44+02:00, gkodinov@stripped +2 -3
    Bug #14637: scan for space through ints

  strings/ctype-mb.c@stripped, 2008-02-06 18:45:44+02:00, gkodinov@stripped +1 -4
    Bug #14637: scan for space through ints

  strings/ctype-simple.c@stripped, 2008-02-06 18:45:44+02:00, gkodinov@stripped +4 -6
    Bug #14637: scan for space through ints

diff -Nrup a/include/m_string.h b/include/m_string.h
--- a/include/m_string.h	2007-11-30 15:52:49 +02:00
+++ b/include/m_string.h	2008-02-06 18:45:44 +02:00
@@ -263,4 +263,63 @@ typedef struct st_mysql_lex_string LEX_S
 #define USTRING_WITH_LEN(X) ((uchar*) X), ((size_t) (sizeof(X) - 1))
 #define C_STRING_WITH_LEN(X) ((char *) (X)), ((size_t) (sizeof(X) - 1))
 
+/* 
+  Skip trailing space. Made as a marcro so it can inline.  The idea is simple :
+  On most systems reading memory in larger chunks (ideally equal to the size of
+  the chinks that the machine physically reads from memory) causes fewer memory
+  access loops and hence increased performance.
+  This is why the 'int' type is used : it's closest to that (according to how
+  it's defined in C).
+  So when we determine the amount of whitespace at the end of a string we do
+  the following :
+    1. We divide the string into 3 zones :
+      a) from the start of the string (__start) to the first multiple
+        of sizeof(int)  (__start_words)
+      b) from the end of the string (__end) to the last multiple of sizeof(int)
+        (__end_words)
+      c) a zone that is aligned to sizeof(int) and can be safely accessed
+        through an int *
+    2. We start comparing backwards from (c) char-by-char. If all we find is
+       space then we continue
+    3. If there are elements in zone (b) we compare them as unsigned ints to a
+       int mask (SPACE_INT) consisting of all spaces
+    4. Finally we compare the remaining part (a) of the string char by char.
+       This covers for the last non-space unsigned int from 3. (if any)
+   
+   This algorithm works well for relatively larger strings, but it will slow
+   the things down for smaller strings (because of the additional calculations
+   and checks compared to the naive method). Thus the barrier of length 20
+   is added.
+*/
+#if SIZEOF_INT == 2
+#define SPACE_INT 0x2020
+#elif SIZEOF_INT == 4
+#define SPACE_INT 0x20202020
+#elif SIZEOF_INT == 8
+#define SPACE_INT 0x2020202020202020
+#else
+#error define the appropriate constant for a word full of spaces
+#endif
+
+#define SKIP_TRAILING_SPACE(type,ptr,len,end_ret) \
+{ \
+  type *__start= ptr; \
+  register type *__end= ptr + len; \
+  if (len > 20) \
+  { \
+    type *__end_words= (type *)(((intptr)__end) / SIZEOF_INT * SIZEOF_INT); \
+    type *__start_words= (type *) \
+       ((((intptr)__start) + SIZEOF_INT - 1) / SIZEOF_INT * SIZEOF_INT); \
+    DBUG_ASSERT(((intptr)__start) >= SIZEOF_INT); \
+    if (__end_words > __start) \
+      while (__end > __end_words && __end[-1] == 0x20) \
+        __end--; \
+    if (__end[-1] == 0x20 && __start_words < __end_words) \
+      while (__end > __start_words && ((unsigned *)__end)[-1] == SPACE_INT) \
+        __end -= SIZEOF_INT; \
+  } \
+  while (__end > __start && __end[-1] == 0x20) \
+    __end--; \
+  end_ret= __end; \
+}
 #endif
diff -Nrup a/strings/ctype-bin.c b/strings/ctype-bin.c
--- a/strings/ctype-bin.c	2007-06-21 12:59:24 +03:00
+++ b/strings/ctype-bin.c	2008-02-06 18:45:44 +02:00
@@ -278,14 +278,11 @@ void my_hash_sort_8bit_bin(CHARSET_INFO 
 {
   const uchar *pos = key;
   
-  key+= len;
-  
   /*
      Remove trailing spaces. We have to do this to be able to compare
     'A ' and 'A' as identical
   */
-  while (key > pos && key[-1] == ' ')
-    key--;
+  SKIP_TRAILING_SPACE(const uchar, key, len, key);
 
   for (; pos < (uchar*) key ; pos++)
   {
diff -Nrup a/strings/ctype-latin1.c b/strings/ctype-latin1.c
--- a/strings/ctype-latin1.c	2007-06-21 12:59:24 +03:00
+++ b/strings/ctype-latin1.c	2008-02-06 18:45:44 +02:00
@@ -683,13 +683,12 @@ void my_hash_sort_latin1_de(CHARSET_INFO
 			    const uchar *key, size_t len,
 			    ulong *nr1, ulong *nr2)
 {
-  const uchar *end= key+len;
+  const uchar *end;
   /*
     Remove end space. We have to do this to be able to compare
     'AE' and 'Ä' as identical
   */
-  while (end > key && end[-1] == ' ')
-    end--;
+  SKIP_TRAILING_SPACE(const uchar, key, len, end);
 
   for (; key < end ; key++)
   {
diff -Nrup a/strings/ctype-mb.c b/strings/ctype-mb.c
--- a/strings/ctype-mb.c	2007-10-22 14:43:29 +03:00
+++ b/strings/ctype-mb.c	2008-02-06 18:45:44 +02:00
@@ -565,14 +565,11 @@ void my_hash_sort_mb_bin(CHARSET_INFO *c
 {
   const uchar *pos = key;
   
-  key+= len;
-  
   /*
      Remove trailing spaces. We have to do this to be able to compare
     'A ' and 'A' as identical
   */
-  while (key > pos && key[-1] == ' ')
-    key--;
+  SKIP_TRAILING_SPACE(const uchar, key, len, key);
   
   for (; pos < (uchar*) key ; pos++)
   {
diff -Nrup a/strings/ctype-simple.c b/strings/ctype-simple.c
--- a/strings/ctype-simple.c	2007-12-07 12:43:57 +02:00
+++ b/strings/ctype-simple.c	2008-02-06 18:45:44 +02:00
@@ -305,14 +305,13 @@ void my_hash_sort_simple(CHARSET_INFO *c
 			 ulong *nr1, ulong *nr2)
 {
   register uchar *sort_order=cs->sort_order;
-  const uchar *end= key + len;
+  const uchar *end;
   
   /*
     Remove end space. We have to do this to be able to compare
     'A ' and 'A' as identical
   */
-  while (end > key && end[-1] == ' ')
-    end--;
+  SKIP_TRAILING_SPACE(const uchar, key, len, end);
   
   for (; key < (uchar*) end ; key++)
   {
@@ -1166,9 +1165,8 @@ size_t my_well_formed_len_8bit(CHARSET_I
 size_t my_lengthsp_8bit(CHARSET_INFO *cs __attribute__((unused)),
                         const char *ptr, size_t length)
 {
-  const char *end= ptr+length;
-  while (end > ptr && end[-1] == ' ')
-    end--;
+  const char *end;
+  SKIP_TRAILING_SPACE(const char, ptr, length, end);
   return (size_t) (end-ptr);
 }
 
Thread
bk commit into 6.0 tree (gkodinov:1.2799) BUG#14637kgeorge6 Feb
  • Re: bk commit into 6.0 tree (gkodinov:1.2799) BUG#14637Sergei Golubchik12 Feb