List:Commits« Previous MessageNext Message »
From:Sergey Vojtovich Date:November 13 2010 11:55am
Subject:Re: bzr commit into mysql-next-mr branch (bar:3201) WL#5510
View as plain text  
Hi Alexender,

rather frequently requested feature. :)
See comments inline.

On Thu, Nov 11, 2010 at 09:53:09AM +0000, Alexander Barkov wrote:
> #At file:///home/bar/mysql-bzr/mysql-next-mr.w5510/ based on
> revid:alexander.nozdrin@stripped
> 
>  3201 Alexander Barkov	2010-11-11
>       WL#5510 Functions to_base64 and from_base64
...skip...

> === modified file 'mysys/base64.c'
> --- a/mysys/base64.c	2009-03-20 14:27:53 +0000
> +++ b/mysys/base64.c	2010-11-11 09:49:55 +0000
> @@ -100,24 +100,176 @@ base64_encode(const void *src, size_t sr
>  }
>  
>  
> +/*
> +  Helper table for base64_decode
> +  -2 means "space character"
> +  -1 means "bad character"
> +  Non-negative values mean valid base64 encoding character.
> +*/
> +static int8
> +from_base64_table[]=
> +{
> +/*00*/  -1,-1,-1,-1,-1,-1,-1,-1,-1,-2,-2,-2,-2,-2,-1,-1, 
> +/*10*/  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 
> +/*20*/  -2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,62,-1,-1,-1,63, /*  !"#$%&'()*+,-./ */
> +/*30*/  52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-1,-1,-1, /* 0123456789:;<=>?
> */
> +/*40*/  -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, /* @ABCDEFGHIJKLMNO */
> +/*50*/  15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1, /* PQRSTUVWXYZ[\]^_ */
> +/*60*/  -1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40, /* `abcdefghijklmno */
> +/*70*/  41,42,43,44,45,46,47,48,49,50,51,-1,-1,-1,-1,-1, /* pqrstuvwxyz{|}~  */
> +/*80*/  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 
> +/*90*/  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 
> +/*A0*/  -2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 
> +/*B0*/  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 
> +/*C0*/  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 
> +/*D0*/  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 
> +/*E0*/  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 
> +/*F0*/  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
> +};
> +
> +
> +/*
> +  Convert base64 encoding character to bits
> +
> +  SYNOPSIS
> +    my_base64_pos
> +    
> +    c     - input character
> +    error - pointer to error
> +
> +  DESCRIPTION
> +    Convert input character to a number in the range [0..63].
> +    If input character is not valid base64 encoding character,
> +    then error is set to 1.
> +
> +  RETURN VALUE
> +    -2  - c is a space character
> +    -1  - c is an unknow character
> +    non-negative number - c is a valid base64 encoding character
> +*/
>  static inline uint
> -pos(unsigned char c)
> +my_base64_pos(unsigned char c, int *error)
>  {
> -  return (uint) (strchr(base64_table, c) - base64_table);
> +  int res= from_base64_table[c];
> +  if (res < 0)
> +  {
> +    *error= 1;
> +    return 0;
> +  }
> +  return (uint) res;
>  }
>  
>  
> -#define SKIP_SPACE(src, i, size)                                \
> -{                                                               \
> -  while (i < size && my_isspace(&my_charset_latin1, * src))     \
> -  {                                                             \
> -    i++;                                                        \
> -    src++;                                                      \
> -  }                                                             \
> -  if (i == size)                                                \
> -  {                                                             \
> -    break;                                                      \
> -  }                                                             \
> +typedef struct my_base64_decoder_t
> +{
> +  const char *src; /* Pointer to the current input position        */
> +  const char *end; /* Pointer to the end of input buffer           */
> +  unsigned int c;  /* Collect bits into this number                */
> +  int error;       /* Error code                                   */
> +  uchar state;     /* Character number in the current group of 4   */
> +  uchar mark;      /* Number of padding marks in the current group */
> +} MY_BASE64_DECODER;
> +
> +
> +/*
> +  Skip leading spaces in a base64 encoded string
> +  
> +  SYNOPSIS
> +    my_base64_decoder_skip_spaces
> +    
> +    decoder  Pointer to MY_BASE64_DECODER
> +    
> +  DESCRIPTION
> +    Skip leading spaces and stop on the first non-space character.
> +    decoder->p will point to the first non-space character, or
> +    to the end of the input string.
> +    
> +  RETURN VALUE
> +    0 - end-of-input found
> +    1 - if there are some more non-space characters
> +*/
> +static inline int
> +my_base64_decoder_skip_spaces(MY_BASE64_DECODER *decoder)
> +{
> +  for ( ; decoder->src < decoder->end; decoder->src++)
> +  {
> +    if (from_base64_table[(uchar) *decoder->src] != -2)
> +      return 1;
> +  }
> +  return 0;
> +}
> +
> +
> +/*
> +  Get the next character from a base64 encoded string
> +
> +  SYNOPSIS
> +    my_base64_decoder_getch
> +    
> +    decoder  Pointer to MY_BASE64_DECODER
> +    
> +  DESCRIPTION
> +    Skip spaces, then scan the next base64 character or a pad character
> +    and collect bits into decoder->c.
> +    
> +  RETURN VALUE
> +    0 - error (unexpected character or unexpected end-of-input found)
> +    1 - success (valid base64 encoding character found)
> +*/
> +static inline int
> +my_base64_decoder_getch(MY_BASE64_DECODER *decoder)
> +{
> +  if (!my_base64_decoder_skip_spaces(decoder))
> +  {
> +    if (decoder->state > 0)
> +      decoder->error= 1; /* Unexpected end-of-input found */
> +    return 0;
> +  }
> +  
> +  decoder->c <<= 6;
> +  decoder->c+= my_base64_pos(*decoder->src++, &decoder->error);
> +  
> +  if (!decoder->error) /* Valid base64 character found */
> +  {
> +    if (decoder->mark)
> +    {
> +      /* If we have scanned '=' already, then only '=' is valid */
> +      DBUG_ASSERT(decoder->state == 3);
> +      decoder->error= 1;
> +      decoder->src--;
> +      return 0; /* '=' expected */
> +    }
> +    decoder->state++;
> +    return 1;
> +  }
> +  
> +  switch (decoder->state) {
> +  case 0:
> +  case 1:
> +    decoder->src--;
> +    return 0; /* base64 character expected */
> +    break;
> +  
> +  case 2:
> +  case 3:
> +    if (decoder->src[-1] == '=')
> +    {
> +      decoder->error= 0;
> +      decoder->mark++;
> +    }
> +    else
> +    {
> +      decoder->src--;
> +      return 0; /* base64 character or '=' expected */
> +    }
> +    break;
> +  
> +  default:
> +    DBUG_ASSERT(0);
> +  }
> +  
> +  decoder->state++;
> +  return 1;
>  }
>  
>  
> @@ -155,75 +307,40 @@ int
>  base64_decode(const char *src_base, size_t len,
>                void *dst, const char **end_ptr)
>  {
> -  char b[3];
> -  size_t i= 0;
> -  char *dst_base= (char *)dst;
> -  char const *src= src_base;
> -  char *d= dst_base;
> -  size_t j;
> -
> -  while (i < len)
> +  char *d= (char*) dst;
> +  MY_BASE64_DECODER decoder;
> +  
> +  decoder.src= src_base;
> +  decoder.end= src_base + len;
> +  decoder.error= 0;
> +  decoder.mark= 0;
> +  
> +  for ( ; ; )
>    {
> -    unsigned c= 0;
> -    size_t mark= 0;
> -
> -    SKIP_SPACE(src, i, len);
> -
> -    c += pos(*src++);
> -    c <<= 6;
> -    i++;
> -
> -    SKIP_SPACE(src, i, len);
> -
> -    c += pos(*src++);
> -    c <<= 6;
> -    i++;
> -
> -    SKIP_SPACE(src, i, len);
> -
> -    if (*src != '=')
> -      c += pos(*src++);
> -    else
> +    decoder.c= 0;
> +    decoder.state= 0;
> +    
> +    if (!my_base64_decoder_getch(&decoder) ||
> +        !my_base64_decoder_getch(&decoder) ||
> +        !my_base64_decoder_getch(&decoder) ||
> +        !my_base64_decoder_getch(&decoder))
> +      break;
> +    
> +    *d++= (decoder.c >> 16) & 0xff;
> +    *d++= (decoder.c >>  8) & 0xff;
> +    *d++= (decoder.c >>  0) & 0xff;
> +    
> +    if (decoder.mark)
>      {
> -      src += 2;                /* There should be two bytes padding */
> -      i= len;
> -      mark= 2;
> -      c <<= 6;
> -      goto end;
> +      d-= decoder.mark;
> +      break;
>      }
> -    c <<= 6;
> -    i++;
> -
> -    SKIP_SPACE(src, i, len);
> -
> -    if (*src != '=')
> -      c += pos(*src++);
> -    else
> -    {
> -      src += 1;                 /* There should be one byte padding */
> -      i= len;
> -      mark= 1;
> -      goto end;
> -    }
> -    i++;
> -
> -  end:
> -    b[0]= (c >> 16) & 0xff;
> -    b[1]= (c >>  8) & 0xff;
> -    b[2]= (c >>  0) & 0xff;
> -
> -    for (j=0; j<3-mark; j++)
> -      *d++= b[j];
>    }
> -
> +  
>    if (end_ptr != NULL)
> -    *end_ptr= src;
> -
> -  /*
> -    The variable 'i' is set to 'len' when padding has been read, so it
> -    does not actually reflect the number of bytes read from 'src'.
> -   */
> -  return i != len ? -1 : (int) (d - dst_base);
> +    *end_ptr= decoder.src;
> +  
> +  return decoder.error ? -1 : (int) (d - (char*) dst);
>  }
>  
>  
> 
I didn't review this file carefully yet. I believe it is
performance critical code. Do you think it is ok to waste
CPU time for accessing structure members and function calls?
Even though some compilers may optimize it, but afair not all
support inlining in C?

I'm not strongly against it, but I'm concerned. If second
reviewer likes your approach in general, I'll do in-depth
review.

> === modified file 'sql/item_create.cc'
> --- a/sql/item_create.cc	2010-07-16 21:00:50 +0000
> +++ b/sql/item_create.cc	2010-11-11 09:49:55 +0000
> @@ -284,7 +284,29 @@ protected:
>    virtual ~Create_func_aes_decrypt() {}
>  };
>  
> +class Create_func_to_base64 : public Create_func_arg1
> +{
> +public:
> +  virtual Item *create(THD *thd, Item *arg1);
> +
> +  static Create_func_to_base64 s_singleton;
> +
> +protected:
> +  Create_func_to_base64() {}
> +  virtual ~Create_func_to_base64() {}
> +};
> +
> +class Create_func_from_base64 : public Create_func_arg1
> +{
> +public:
> +  virtual Item *create(THD *thd, Item *arg1);
> +
> +  static Create_func_from_base64 s_singleton;
>  
> +protected:
> +  Create_func_from_base64() {}
> +  virtual ~Create_func_from_base64() {}
> +};
>  #ifdef HAVE_SPATIAL
>  class Create_func_area : public Create_func_arg1
>  {
For some reason Create_func* class declarations were
ordered by class name.

> @@ -2703,6 +2725,22 @@ Create_func_aes_decrypt::create(THD *thd
>  }
>  
>  
> +Create_func_to_base64 Create_func_to_base64::s_singleton;
> +
> +Item*
> +Create_func_to_base64::create(THD *thd, Item *arg1)
> +{
> +  return new (thd->mem_root) Item_func_to_base64(arg1);
> +}
> +
> +Create_func_from_base64 Create_func_from_base64::s_singleton;
> +
> +Item*
> +Create_func_from_base64::create(THD *thd, Item *arg1)
> +{
> +  return new (thd->mem_root) Item_func_from_base64(arg1);
> +}
> +
>  #ifdef HAVE_SPATIAL
>  Create_func_area Create_func_area::s_singleton;
> 
Same here. They were ordered.
 
> @@ -4892,6 +4930,7 @@ static Native_func_registry func_array[]
>    { { C_STRING_WITH_LEN("FORMAT") }, BUILDER(Create_func_format)},
>    { { C_STRING_WITH_LEN("FOUND_ROWS") }, BUILDER(Create_func_found_rows)},
>    { { C_STRING_WITH_LEN("FROM_DAYS") }, BUILDER(Create_func_from_days)},
> +  { { C_STRING_WITH_LEN("FROM_BASE64") }, BUILDER(Create_func_from_base64)},
>    { { C_STRING_WITH_LEN("FROM_UNIXTIME") }, BUILDER(Create_func_from_unixtime)},
>    { { C_STRING_WITH_LEN("GEOMCOLLFROMTEXT") },
> GEOM_BUILDER(Create_func_geometry_from_text)},
>    { { C_STRING_WITH_LEN("GEOMCOLLFROMWKB") },
> GEOM_BUILDER(Create_func_geometry_from_wkb)},
> @@ -5014,6 +5053,7 @@ static Native_func_registry func_array[]
>    { { C_STRING_WITH_LEN("TIME_TO_SEC") }, BUILDER(Create_func_time_to_sec)},
>    { { C_STRING_WITH_LEN("TOUCHES") }, GEOM_BUILDER(Create_func_touches)},
>    { { C_STRING_WITH_LEN("TO_DAYS") }, BUILDER(Create_func_to_days)},
> +  { { C_STRING_WITH_LEN("TO_BASE64") }, BUILDER(Create_func_to_base64)},
>    { { C_STRING_WITH_LEN("TO_SECONDS") }, BUILDER(Create_func_to_seconds)},
>    { { C_STRING_WITH_LEN("UCASE") }, BUILDER(Create_func_ucase)},
>    { { C_STRING_WITH_LEN("UNCOMPRESS") }, BUILDER(Create_func_uncompress)},
> 
Same here. They were ordered.

> === modified file 'sql/item_strfunc.cc'
> --- a/sql/item_strfunc.cc	2010-08-20 11:52:40 +0000
> +++ b/sql/item_strfunc.cc	2010-11-11 09:49:55 +0000
> @@ -48,6 +48,7 @@
>  #include "password.h"           // my_make_scrambled_password,
>                                  // my_make_scrambled_password_323
>  #include <m_ctype.h>
> +#include <base64.h>
>  #include "my_md5.h"
>  #include "sha1.h"
>  #include "my_aes.h"
> @@ -464,6 +465,91 @@ void Item_func_aes_decrypt::fix_length_a
>  }
>  
>  
> +void Item_func_to_base64::fix_length_and_dec()
> +{
> +  int length;
> +  maybe_null= args[0]->maybe_null;
When argument is NOT NULL and we e.g. fail to allocate memory at
::val_str_ascii(), we return NULL for NOT NULL item.

> +  collation.set(default_charset(), DERIVATION_COERCIBLE, MY_REPERTOIRE_ASCII);
> +  if (args[0]->max_length > INT_MAX32 ||
> +      (length= base64_needed_encoded_length((int) args[0]->max_length)) < 0)
Though the code itself is fine, but still...
base64_needed_encoded_length() is prone to integer overruns, at least
for the following ranges it will incorrectly return positive value:
INT_MAX32 - 1, INT_MAX32, INT_MAX - 536870910...INT_MAX - 515953864.

Either fix base64_needed_encoded_length(), or add DBUG_ASSERT() there,
so we know where the real problem is hidden.

> +  {
> +    /*
> +      base64 family functions use "int" data type for length arguments
> +      and return values, therefore they support length range [0..INT_MAX32].
> +      When args[0]->max_length is longer than base64 functions can handle,
> +      the code in base64_needed_encoded_length() will overflow and return
> +      negative value. Let's limit maximim length to INT_MAX32 in such cases.
> +    */
> +    length= INT_MAX32;
> +    maybe_null= 1;
> +  }
> +  DBUG_ASSERT(length > 0);
What puprose serves this assert? One line above is obvious code branch
that makes length bigger than 0.

> +  fix_char_length((uint) length - 1);
> +}
> +
> +
> +String *Item_func_to_base64::val_str_ascii(String *str)
> +{
> +  String *res= args[0]->val_str(str);
> +  int length;
> +  if (!res ||
> +      res->length() > INT_MAX32 ||
> +      (length= base64_needed_encoded_length((int) res->length())) < 0 ||
> +      (uint) length > current_thd->variables.max_allowed_packet ||
> +      tmp_value.alloc((uint) length))
> +  {
> +    null_value= 1; // NULL input, too long input, or EOM.
I assume you mean OOM.
http://en.wikipedia.org/wiki/EOM
http://en.wikipedia.org/wiki/OOM

> +    if (length > 0 &&
> +        (uint) length > current_thd->variables.max_allowed_packet)
length may be uninitialized here.

> +    {
> +      push_warning_printf(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
> +                          ER_WARN_ALLOWED_PACKET_OVERFLOWED,
> +                          ER(ER_WARN_ALLOWED_PACKET_OVERFLOWED), func_name(),
> +                          current_thd->variables.max_allowed_packet);
And the warning is:
Result of to_base64() was larger than max_allowed_packet (1048576) - truncated

Personally from the above text I would guess that result value is
truncated up to max_allowed_packet length. But it is truncated to
NULL. OTOH warning text doesn't promise anything. Well, ok.

> +    }
> +    return 0;
> +  }
> +  base64_encode(res->ptr(), (int) res->length(), (char*) tmp_value.ptr());
> +  DBUG_ASSERT(length > 0);
What puprose serves this assert? Two lines above is obvious code branch
that handles length <= 0.

> +  tmp_value.length((uint) length - 1); // Without trailing '\0'
> +  null_value= 0;
> +  return &tmp_value;
> +}
> +
> +
> +void Item_func_from_base64::fix_length_and_dec()
> +{
> +  int length;
> +  if (args[0]->max_length > INT_MAX32 ||
> +      (length= base64_needed_decoded_length((int) args[0]->max_length)) < 0)
This way of coding makes me think that we really need to check
base64_needed_decoded_length() whereas we don't - it is 3/4
of max_length. And we check max_length one line above.

OTOH base64_needed_decoded_length() is prone to integer overrun. :(
It is a good idea to fix it.

> +    length= INT_MAX32; // Overflow, limit to INT_MAX32
> +  max_length= ((uint) length);
Why not ::fix_char_length()?

> +  maybe_null= 1; // Can be NULL, e.g. in case of badly formed input string
> +}
> +
> +
> +String *Item_func_from_base64::val_str(String *str)
> +{
> +  String *res= args[0]->val_str_ascii(str);
> +  int length;
> +  const char *end_ptr;
> +  
> +  if (!res ||
> +      res->length() > INT_MAX32 ||
> +      (length= base64_needed_decoded_length((int) res->length())) < 0 ||
We don't need to check decoded_length() here. Instead:
tmp_value.alloc((uint) base64_needed_decoded_length((int) res->length()))

> +      tmp_value.alloc((uint) length) ||
> +      (length= base64_decode(res->ptr(), (int) res->length(),
> +                             (char *) tmp_value.ptr(), &end_ptr)) < 0 ||
> +      end_ptr < res->ptr() + res->length())
> +  {
> +    null_value= 1; // NULL input, too long input, EOM, or badly formed input
s/EOM/OOM/ ?

> +    return 0;
> +  }
> +  tmp_value.length((uint) length);
> +  return &tmp_value;
> +}
> +
> +
>  /**
>    Concatenate args with the following premises:
>    If only one arg (which is ok), return value of arg;
> 
> === modified file 'sql/item_strfunc.h'
> --- a/sql/item_strfunc.h	2010-08-20 11:52:40 +0000
> +++ b/sql/item_strfunc.h	2010-11-11 09:49:55 +0000
> @@ -91,6 +91,27 @@ public:
>    const char *func_name() const { return "sha2"; }
>  };
>  
> +class Item_func_to_base64 :public Item_str_ascii_func
> +{
> +  String tmp_value;
> +public:
> +  Item_func_to_base64(Item *a) :Item_str_ascii_func(a) {}
> +  String *val_str_ascii(String *);
> +  void fix_length_and_dec();
> +  const char *func_name() const { return "to_base64"; }
> +};
> +
> +class Item_func_from_base64 :public Item_str_func
> +{
> +  String tmp_value;
> +public:
> +  Item_func_from_base64(Item *a) :Item_str_func(a) {}
> +  String *val_str(String *);
> +  void fix_length_and_dec();
> +  const char *func_name() const { return "from_base64"; }
> +};
> +
> +
>  class Item_func_aes_encrypt :public Item_str_func
>  {
>  public:
> 

Regards,
Sergey
-- 
Sergey Vojtovich <svoj@stripped>
MySQL AB, Software Engineer
Izhevsk, Russia, www.mysql.com
Thread
bzr commit into mysql-next-mr branch (bar:3201) WL#5510Alexander Barkov11 Nov
  • Re: bzr commit into mysql-next-mr branch (bar:3201) WL#5510Sergey Vojtovich13 Nov
    • Re: bzr commit into mysql-next-mr branch (bar:3201) WL#5510Alexander Barkov18 Nov