Hi Stephan,
saf wrote:
> I'm a developer at the NOOFS project (a filesystem which saves its data in a SQL
> database).
> I needed more speed for the function escape_string_for_mysql(...), so I changed the
> source code.
Mark Matthews and Konstantin Osipov are looking at it.
You'll hear back from them.
Regards,
Arjen.
> My local benchmark test results showed that this function is now two times faster
> (without multi-byte
> characters) than before. I couldn't test it with multi-byte characters; if somebody
> at MySQL
> could do it, that would be nice. "Normally" it should work with both.
>
> I know that this new code is "ugly", but there was no other way to get more
> speed. It's possible to
> write more beautiful code, but the function would lose speed.
>
> The patch is for MySQL 5.0.16.
>
> Here is my patch; if it works with multi-byte characters, please apply it:
>
> diff -Naur ../old/mysql-5.0.16/mysys/charset.c mysql-5.0.16/mysys/charset.c
> --- ../old/mysql-5.0.16/mysys/charset.c 2005-11-15 01:12:37.000000000 +0100
> +++ mysql-5.0.16/mysys/charset.c 2005-12-01 00:11:37.000000000 +0100
> @@ -596,95 +596,182 @@
> >=0 The length of the escaped string
> */
>
> -ulong escape_string_for_mysql(CHARSET_INFO *charset_info,
> - char *to, ulong to_length,
> - const char *from, ulong length)
> -{
> - const char *to_start= to;
> - const char *end, *to_end=to_start + (to_length ? to_length-1 : 2*length);
> - my_bool overflow= FALSE;
> #ifdef USE_MB
> - my_bool use_mb_flag= use_mb(charset_info);
> -#endif
> - for (end= from + length; from < end; from++)
> - {
> - char escape= 0;
> -#ifdef USE_MB
> - int tmp_length;
> - if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end)))
> +
> +/* High speed function if no multi-byte character flag */
> +static ulong escape_string_for_mysql_no_mb(char *to, const char *from, ulong
> length)
> +{
> + ulong i, u;
> +
> + for (i = 0, u = 0; i < length; i++)
> {
> - if (to + tmp_length > to_end)
> - {
> - overflow= TRUE;
> - break;
> - }
> - while (tmp_length--)
> - *to++= *from++;
> - from--;
> - continue;
> + if (!from[i])
> + {
> + to[u++]= '\\';
> + to[u++]= '0';
> + continue;
> + }
> + if (from[i] == '\n')
> + {
> + to[u++]= '\\';
> + to[u++]= 'n';
> + continue;
> + }
> + if (from[i] == '\r')
> + {
> + to[u++]= '\\';
> + to[u++]= 'r';
> + continue;
> + }
> + if (from[i] == '\\')
> + {
> + to[u++]= '\\';
> + to[u++]= '\\';
> + continue;
> + }
> + if (from[i] == '\'')
> + {
> + to[u++]= '\\';
> + to[u++]= '\'';
> + continue;
> + }
> + if (from[i] == '"')
> + {
> + to[u++]= '\\';
> + to[u++]= '"';
> + continue;
> + }
> + if (from[i] == '\032')
> + {
> + to[u++]= '\\';
> + to[u++]= 'Z';
> + continue;
> + }
> + to[u++]= from[i];
> }
> - /*
> - If the next character appears to begin a multi-byte character, we
> - escape that first byte of that apparent multi-byte character. (The
> - character just looks like a multi-byte character -- if it were actually
> - a multi-byte character, it would have been passed through in the test
> - above.)
> -
> - Without this check, we can create a problem by converting an invalid
> - multi-byte character into a valid one. For example, 0xbf27 is not
> - a valid GBK character, but 0xbf5c is. (0x27 = ', 0x5c = \)
> - */
> - if (use_mb_flag && (tmp_length= my_mbcharlen(charset_info, *from)) >
> 1)
> - escape= *from;
> - else
> -#endif
> - switch (*from) {
> - case 0: /* Must be escaped for 'mysql' */
> - escape= '0';
> - break;
> - case '\n': /* Must be escaped for logs */
> - escape= 'n';
> - break;
> - case '\r':
> - escape= 'r';
> - break;
> - case '\\':
> - escape= '\\';
> - break;
> - case '\'':
> - escape= '\'';
> - break;
> - case '"': /* Better safe than sorry */
> - escape= '"';
> - break;
> - case '\032': /* This gives problems on Win32 */
> - escape= 'Z';
> - break;
> - }
> - if (escape)
> + to[u]= 0;
> + return (u);
> +}
> +
> +#define CHECK_AND_ESCAPE(src, replace_token) \
> + if (from[i] == src) \
> + { \
> + to[u++]= '\\'; \
> + to[u++]= replace_token; \
> + i++; \
> + continue; \
> + }
> +
> +/* We can't detect overflow on the to string, it depends how much
> + memory the client has allocated. */
> +static escape_string_for_mysql_mb(CHARSET_INFO *charset_info,
> + char *to, const char *from, ulong length)
> +{
> + ulong i, u;
> + const char *end;
> +
> + end = from + length;
> + for (i = 0, u = 0; i < length;)
> {
> - if (to + 2 > to_end)
> - {
> - overflow= TRUE;
> - break;
> - }
> - *to++= '\\';
> - *to++= escape;
> + int tmp_length;
> +
> + if ((tmp_length= my_ismbchar(charset_info, from, end)))
> + {
> + if (tmp_length)
> + {
> + if (i + tmp_length > length)
> + return (ulong)~0;
> + while (tmp_length--)
> + to[u++]= from[i++];
> + }
> + continue;
> + }
> + CHECK_AND_ESCAPE(0, '0');
> + CHECK_AND_ESCAPE('\n', 'n');
> + CHECK_AND_ESCAPE('\r', 'r');
> + CHECK_AND_ESCAPE('\\', '\\');
> + CHECK_AND_ESCAPE('\'', '\'');
> + CHECK_AND_ESCAPE('"', '"');
> + CHECK_AND_ESCAPE('\032', 'Z');
> +
> + /* Normal behaviour, only copy */
> + to[u++]= from[i];
> + i++;
> }
> - else
> + to[u]= 0;
> + return (u);
> +}
> +
> +ulong escape_string_for_mysql(CHARSET_INFO *charset_info,
> + char *to, ulong to_length,
> + const char *from, ulong length)
> +{
> + my_bool use_mb_flag= use_mb(charset_info);
> +
> + if (!use_mb_flag)
> + return escape_string_for_mysql_no_mb(to, from, length);
> + return escape_string_for_mysql_mb(charset_info, to, from, length);
> +}
> +
> +#else
> +
> +/* high optimization without MB */
> +ulong escape_string_for_mysql(CHARSET_INFO *charset_info,
> + char *to, ulong to_length,
> + const char *from, ulong length)
> +{
> + ulong i, u;
> +
> + for (i = 0, u = 0; i < length; i++)
> {
> - if (to + 1 > to_end)
> - {
> - overflow= TRUE;
> - break;
> - }
> - *to++= *from;
> + if (!from[i])
> + {
> + to[u++]= '\\';
> + to[u++]= '0';
> + continue;
> + }
> + if (from[i] == '\n')
> + {
> + to[u++]= '\\';
> + to[u++]= 'n';
> + continue;
> + }
> + if (from[i] == '\r')
> + {
> + to[u++]= '\\';
> + to[u++]= 'r';
> + continue;
> + }
> + if (from[i] == '\\')
> + {
> + to[u++]= '\\';
> + to[u++]= '\\';
> + continue;
> + }
> + if (from[i] == '\'')
> + {
> + to[u++]= '\\';
> + to[u++]= '\'';
> + continue;
> + }
> + if (from[i] == '"')
> + {
> + to[u++]= '\\';
> + to[u++]= '"';
> + continue;
> + }
> + if (from[i] == '\032')
> + {
> + to[u++]= '\\';
> + to[u++]= 'Z';
> + continue;
> + }
> + to[u++]= from[i];
> }
> - }
> - *to= 0;
> - return overflow ? (ulong)~0 : (ulong) (to - to_start);
> + to[u]= 0;
> + return (u);
> }
> -
> +#endif
>
> #ifdef BACKSLASH_MBTAIL
> static CHARSET_INFO *fs_cset_cache= NULL;
>
--
Arjen Lentz, Community Relations Manager, MySQL AB
Program Chair, MySQL Users Conference
MySQL Users Conference 2006 (Santa Clara CA, 24-27 April)
http://www.mysqluc.com/