From: Alexander Barkov Date: February 16 2011 9:00am Subject: WL#5331 Support Unicode for Windows command line client List-Archive: http://lists.mysql.com/commits/131381 Message-Id: <4D5B9242.6040700@oracle.com> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="------------050709050109000604010904" --------------050709050109000604010904 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Based on Vlad's patch: http://lists.mysql.com/commits/105379 --------------050709050109000604010904 Content-Type: text/plain; name="w5331.diff" Content-Transfer-Encoding: 7bit Content-Disposition: inline; filename="w5331.diff" === modified file 'client/mysql.cc' --- client/mysql.cc 2011-02-09 06:56:59 +0000 +++ client/mysql.cc 2011-02-16 08:57:08 +0000 @@ -195,6 +195,18 @@ const char *default_dbug_option="d:t:o,/tmp/mysql.trace"; + +#ifdef __WIN__ +static my_bool use_unicode_api= 1; +#endif /* __WIN__ */ + +/* Various printing flags */ +#define MY_PRINT_0 1 /* Replace 0x00 bytes to "\0" */ +#define MY_PRINT_XML 2 /* Encode XML entities */ +#define MY_PRINT_MB 4 /* Recognize multi-byte characters */ +#define MY_PRINT_CTRL 8 /* Replace TAB, NL, CR to "\t", "\n", "\r" */ + +void tee_write(FILE *file, const char *s, size_t slen, int flags); void tee_fprintf(FILE *file, const char *fmt, ...); void tee_fputs(const char *s, FILE *file); void tee_puts(const char *s, FILE *file); @@ -1113,6 +1125,17 @@ close(stdout_fileno_copy); /* Clean up dup(). */ } +#ifdef __WIN__ + /* + Convert command line parameters from UTF16LE to UTF8MB4 + when "mysql.exe --unicode" is specified. + Otherwise, the usual argc/argv will be used, + which bring arguments using ANSI code page. 
+ */ + if (use_unicode_api) + my_win_translate_command_line_args(&my_charset_utf8mb4_bin, &argc, &argv); +#endif + if (load_defaults("my",load_default_groups,&argc,&argv)) { my_end(0); @@ -1409,6 +1432,10 @@ {"default-character-set", OPT_DEFAULT_CHARSET, "Set the default character set.", &default_charset, &default_charset, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, +#if __WIN__ + {"unicode", 0, "Use Windows Unicode API for console I/O and command line arguments.", + &use_unicode_api, &use_unicode_api, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, +#endif __WIN__ {"delimiter", OPT_DELIMITER, "Delimiter to be used.", &delimiter_str, &delimiter_str, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, {"execute", 'e', "Execute command and quit. (Disables --force and history file.)", 0, @@ -1885,6 +1912,15 @@ tmpbuf.alloc(65535); tmpbuf.length(0); buffer.length(0); + + if (use_unicode_api) + { + line= my_win_console_readline(charset_info, + (char*) tmpbuf.ptr(), + tmpbuf.alloced_length()); + goto got_a_line; + } + size_t clen; do { @@ -1901,6 +1937,8 @@ */ if (line) line= buffer.c_ptr(); + +got_a_line: #else if (opt_outfile) fputs(prompt, OUTFILE); @@ -3455,19 +3493,12 @@ grid. (The \0 is also the reason we can't use fprintf() .) */ unsigned int i; - const char *p; if (right_justified) for (i= data_length; i < total_bytes_to_send; i++) tee_putc((int)' ', PAGER); - for (i= 0, p= data; i < data_length; i+= 1, p+= 1) - { - if (*p == '\0') - tee_putc((int)' ', PAGER); - else - tee_putc((int)*p, PAGER); - } + tee_write(PAGER, data, data_length, MY_PRINT_0|MY_PRINT_MB); if (! 
right_justified) for (i= data_length; i < total_bytes_to_send; i++) @@ -3587,16 +3618,7 @@ tee_fprintf(PAGER, "%*s: ",(int) max_length,field->name); if (cur[off]) { - unsigned int i; - const char *p; - - for (i= 0, p= cur[off]; i < lengths[off]; i+= 1, p+= 1) - { - if (*p == '\0') - tee_putc((int)' ', PAGER); - else - tee_putc((int)*p, PAGER); - } + tee_write(PAGER, cur[off], lengths[off], MY_PRINT_0|MY_PRINT_MB); tee_putc('\n', PAGER); } else @@ -3666,16 +3688,7 @@ if (!src) tee_fputs("NULL", PAGER); else - { - for (const char *p = src; length; p++, length--) - { - const char *t; - if ((t = array_value(xmlmeta, *p))) - tee_fputs(t, PAGER); - else - tee_putc(*p, PAGER); - } - } + tee_write(PAGER, src, length, MY_PRINT_XML|MY_PRINT_MB); } @@ -3686,37 +3699,9 @@ tee_fputs("NULL", PAGER); else { - if (opt_raw_data) - { - unsigned long i; - /* Can't use tee_fputs(), it stops with NUL characters. */ - for (i= 0; i < length; i++, pos++) - tee_putc(*pos, PAGER); - } - else for (const char *end=pos+length ; pos != end ; pos++) - { -#ifdef USE_MB - int l; - if (use_mb(charset_info) && - (l = my_ismbchar(charset_info, pos, end))) - { - while (l--) - tee_putc(*pos++, PAGER); - pos--; - continue; - } -#endif - if (!*pos) - tee_fputs("\\0", PAGER); // This makes everything hard - else if (*pos == '\t') - tee_fputs("\\t", PAGER); // This would destroy tab format - else if (*pos == '\n') - tee_fputs("\\n", PAGER); // This too - else if (*pos == '\\') - tee_fputs("\\\\", PAGER); - else - tee_putc(*pos, PAGER); - } + int flags= MY_PRINT_MB | (opt_raw_data ? 0 : (MY_PRINT_0|MY_PRINT_CTRL)); + /* Can't use tee_fputs(), it stops with NUL characters. 
*/ + tee_write(PAGER, pos, length, flags); } } @@ -4317,7 +4302,30 @@ mysql_options(&mysql, MYSQL_INIT_COMMAND, init_command); } - mysql_options(&mysql, MYSQL_SET_CHARSET_NAME, default_charset); + mysql_set_character_set(&mysql, default_charset); +#if __WIN__ + uint cnv_errors; + String converted_database, converted_user; + if (use_unicode_api && + !my_charset_same(&my_charset_utf8mb4_bin, mysql.charset)) + { + /* Convert user and database from UTF8MB4 to connection character set */ + if (user) + { + converted_user.copy(user, strlen(user) + 1, + &my_charset_utf8mb4_bin, mysql.charset, + &cnv_errors); + user= (char*) converted_user.ptr(); + } + if (database) + { + converted_database.copy(database, strlen(database) + 1, + &my_charset_utf8mb4_bin, mysql.charset, + &cnv_errors); + database= (char*) converted_database.ptr(); + } + } +#endif if (opt_plugin_dir && *opt_plugin_dir) mysql_options(&mysql, MYSQL_PLUGIN_DIR, opt_plugin_dir); @@ -4339,7 +4347,29 @@ } return -1; // Retryable } - + +#if __WIN__ + /* Convert --execute buffer from UTF8MB4 to connection character set */ + if (status.line_buff && use_unicode_api && + !my_charset_same(&my_charset_utf8mb4_bin, mysql.charset)) + { + String tmp; + LINE_BUFFER *b= status.line_buff; + size_t len= b->end - b->buffer + 1; + uint dummy_errors; + /* Notice +1, to convert including trailing '\0' character */ + if (tmp.copy(status.line_buff->buffer, + status.line_buff->end - status.line_buff->buffer + 1, + &my_charset_utf8mb4_bin, mysql.charset, &dummy_errors)) + return 1; + /* Free the old line buffer */ + batch_readline_end(status.line_buff); + /* Re-initialize line buffer from the converted string */ + if (!(status.line_buff= batch_readline_command(NULL, (char*) tmp.ptr()))) + return 1; + } +#endif /* __WIN__ */ + charset_info= mysql.charset; connected=1; @@ -4645,11 +4675,73 @@ } +/** + Write data to a stream. + Various modes, corresponding to --tab, --xml, --raw parameters, + are supported. 
+ + @param file Stream to write to + @param s String to write + @param slen String length + @param flags Flags for --tab, --xml, --raw. +*/ +void tee_write(FILE *file, const char *s, size_t slen, int flags) +{ + const char *se; + + for (se= s + slen; s < se; s++) + { + const char *t; + + if (flags & MY_PRINT_MB) + { + int mblen, i; + if (use_mb(charset_info) && + (mblen= my_ismbchar(charset_info, s, se))) + { +#if __WIN__ + if (use_unicode_api && my_win_is_console(file)) + my_win_console_write(charset_info, s, mblen); + else + fwrite(file, 1, mblen, file); + if (opt_outfile) + fwrite(OUTFILE, 1, mblen, file); + s+= mblen - 1; + continue; +#endif + for (i= 0; i < mblen; i++) + tee_putc(s[i], file); + s+= mblen - 1; + continue; + } + } + + if ((flags & MY_PRINT_XML) && (t= array_value(xmlmeta, *s))) + tee_fputs(t, file); + else if ((flags & MY_PRINT_0) && *s == '\0') + tee_putc((int) ' ', file); // This makes everything hard + else if ((flags & MY_PRINT_CTRL) && *s == '\t') + tee_fputs("\\t", file); // This would destroy tab format + else if ((flags & MY_PRINT_CTRL) && *s == '\n') + tee_fputs("\\n", file); // This too + else if ((flags & MY_PRINT_CTRL) && *s == '\\') + tee_fputs("\\\\", file); + else + tee_putc((int) *s, file); + } +} + + void tee_fprintf(FILE *file, const char *fmt, ...) 
{ va_list args; va_start(args, fmt); +#if __WIN__ + if (use_unicode_api && my_win_is_console(file)) + my_win_console_vfprintf(charset_info, fmt, args); + else +#endif (void) vfprintf(file, fmt, args); va_end(args); @@ -4664,6 +4756,11 @@ void tee_fputs(const char *s, FILE *file) { +#ifdef __WIN__ + if (use_unicode_api && my_win_is_console(file)) + my_win_console_fputs(charset_info, s); + else +#endif fputs(s, file); if (opt_outfile) fputs(s, OUTFILE); @@ -4672,17 +4769,17 @@ void tee_puts(const char *s, FILE *file) { - fputs(s, file); - fputc('\n', file); - if (opt_outfile) - { - fputs(s, OUTFILE); - fputc('\n', OUTFILE); - } + tee_fputs(s, file); + tee_putc('\n', file); } void tee_putc(int c, FILE *file) { +#if __WIN__ + if (use_unicode_api && my_win_is_console(file)) + my_win_console_putc(charset_info, c); + else +#endif putc(c, file); if (opt_outfile) putc(c, OUTFILE); === modified file 'include/my_sys.h' --- include/my_sys.h 2011-02-08 15:54:12 +0000 +++ include/my_sys.h 2011-02-15 13:02:57 +0000 @@ -947,8 +947,14 @@ /* implemented in my_conio.c */ char* my_cgets(char *string, size_t clen, size_t* plen); - -#endif +my_bool my_win_is_console(FILE *file); +char *my_win_console_readline(CHARSET_INFO *cs, char *mbbuf, size_t mbbufsize); +void my_win_console_write(CHARSET_INFO *cs, const char *data, size_t datalen); +void my_win_console_fputs(CHARSET_INFO *cs, const char *data); +void my_win_console_putc(CHARSET_INFO *cs, int c); +void my_win_console_vfprintf(CHARSET_INFO *cs, const char *fmt, va_list args); +int my_win_translate_command_line_args(CHARSET_INFO *cs, int *ac, char ***av); +#endif /* __WIN__ */ #include === modified file 'mysys/my_conio.c' --- mysys/my_conio.c 2009-02-13 16:41:47 +0000 +++ mysys/my_conio.c 2011-02-16 08:20:35 +0000 @@ -219,4 +219,261 @@ return result; } + +/* Windows console handling */ + +/* Maximum line length on Windows console */ +#define MAX_CONSOLE_LINE_SIZE 65535 + +/** + Determine if a file is a windows console + + @param file 
Input stream + + @return + @retval 0 if file is not Windows console + @retval 1 if file is Windows console +*/ +my_bool +my_win_is_console(FILE *file) +{ + DWORD mode; + if (GetConsoleMode((HANDLE) _get_osfhandle(_fileno(file)), &mode)) + return 1; + return 0; +} + + +/** + Read line from Windows console using Unicode API + and translate input to session character set. + Note, as Windows API breaks supplementary characters + into two wchar_t values, we cannot read and convert individual + wchar_t values separately. So let's use a buffer for + Unicode console input, and then convert it to "cs" in a single shot. + String is terminated with '\0' character. + + @param cs Character set to convert to. + @param mbbuf Write input data here. + @param mbbufsize Number of bytes available in mbbuf. + + @retval Pointer to mbbuf, or NULL on I/O error. +*/ +char * +my_win_console_readline(CHARSET_INFO *cs, char *mbbuf, size_t mbbufsize) +{ + uint dummy_errors; + static wchar_t u16buf[MAX_CONSOLE_LINE_SIZE + 1]; + size_t pos, mblen; + DWORD console_mode; + HANDLE console= GetStdHandle(STD_INPUT_HANDLE); + + DBUG_ASSERT(mbbufsize > 0); /* Need space for at least trailing '\0' */ + GetConsoleMode(console, &console_mode); + SetConsoleMode(console, ENABLE_LINE_INPUT | + ENABLE_PROCESSED_INPUT | ENABLE_ECHO_INPUT); + for(pos= 0; ; ) + { + DWORD nchars; + BOOL ok= ReadConsoleW(console, &u16buf[pos], 1, &nchars, NULL); + if (!ok || nchars == 0) + { + SetConsoleMode(console, console_mode); + return NULL; + } + if (u16buf[pos] == L'\r') + continue; + if (pos == MAX_CONSOLE_LINE_SIZE || u16buf[pos] == L'\n') + break; + pos++; + } + SetConsoleMode(console, console_mode); + /* Convert Unicode to session character set */ + mblen= my_convert(mbbuf, mbbufsize - 1, cs, + (const char *) u16buf, pos * sizeof(wchar_t), + &my_charset_utf16le_bin, &dummy_errors); + DBUG_ASSERT(mblen < mbbufsize); /* Safety */ + mbbuf[mblen]= 0; + return mbbuf; +} + + +/** + Translate client charset to Windows wchars 
for console I/O. + Unlike copy_and_convert(), in case of a wrong multi-byte sequence + we don't print '?' character, we fall back to ISO-8859-1 instead. + This gives a better idea of what binary data (e.g. BLOB) looks like. + + @param cs Character set of the input string + @param from Input string + @param from_length Length of the input string + @param to[OUT] Write Unicode data here + @param to_chars Number of characters available in "to" +*/ +static size_t +my_mbstou16s(CHARSET_INFO *cs, const uchar * from, size_t from_length, + wchar_t *to, size_t to_chars) +{ + CHARSET_INFO *to_cs= &my_charset_utf16le_bin; + const uchar *from_end= from + from_length; + wchar_t *to_orig= to, *to_end= to + to_chars; + my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; + my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb; + while (from < from_end) + { + int cnvres; + my_wc_t wc; + if ((cnvres= (*mb_wc)(cs, &wc, from, from_end)) > 0) + { + if (!wc) + break; + from+= cnvres; + } + else if (cnvres == MY_CS_ILSEQ) + { + wc= (my_wc_t) (uchar) *from; /* Fallback to ISO-8859-1 */ + from+= 1; + } + else if (cnvres > MY_CS_TOOSMALL) + { + /* + A correct multibyte sequence was detected, + but it doesn't have a Unicode mapping. + */ + from+= (-cnvres); + wc= '?'; + } + else /* Incomplete character */ + { + wc= (my_wc_t) (uchar) *from; /* Fallback to ISO-8859-1 */ + from+= 1; + } +outp: + if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, (uchar*) to_end)) > 0) + { + /* We can never convert only a part of wchar_t */ + DBUG_ASSERT((cnvres % sizeof(wchar_t)) == 0); + /* cnvres returns number of bytes, convert to number of wchar_t's */ + to+= cnvres / sizeof(wchar_t); + } + else if (cnvres == MY_CS_ILUNI && wc != '?') + { + wc= '?'; + goto outp; + } + else + break; /* Not enough space */ + } + return to - to_orig; +} + + +/** + Write a string in the given character set to Windows console. 
+ As Windows breaks supplementary characters into two parts, + we cannot use a simple loop sending the result of + cs->cset->mb_wc() to console. + So we convert the string from client charset to an array of wchar_t, + then write the array to console in a single shot. + + @param cs Character set of the string + @param data String to print + @param datalen Length of input string in bytes +*/ +void +my_win_console_write(CHARSET_INFO *cs, const char *data, size_t datalen) +{ + static wchar_t u16buf[MAX_CONSOLE_LINE_SIZE + 1]; + size_t nchars= my_mbstou16s(cs, (const uchar*) data, datalen, + u16buf, sizeof(u16buf)); + DWORD nwritten; + WriteConsoleW(GetStdHandle(STD_OUTPUT_HANDLE), + u16buf, (DWORD)nchars, &nwritten, NULL); +} + + +/** + Write a single-byte character to console. + Note: one should not send parts of a single multi-byte character + in separate consecutive my_win_console_putc() calls. + For multi-byte characters use my_win_console_write() instead. + + @param cs Character set of the input character + @param c Character (single byte) +*/ +void +my_win_console_putc(CHARSET_INFO *cs, int c) +{ + char ch= (char) c; + my_win_console_write(cs, &ch, 1); +} + + +/** + Write a 0-terminated string to Windows console. + + @param cs Character set of the string to print + @param data String to print +*/ +void +my_win_console_fputs(CHARSET_INFO *cs, const char *data) +{ + my_win_console_write(cs, data, strlen(data)); +} + + +/* + Handle formatted output on the Windows console. +*/ +void +my_win_console_vfprintf(CHARSET_INFO *cs, const char *fmt, va_list args) +{ + static char buff[MAX_CONSOLE_LINE_SIZE + 1]; + size_t len= vsnprintf(buff, sizeof(buff) - 1, fmt, args); + my_win_console_write(cs, buff, len); +} + + +#include + +/** + Translate Unicode command line parameters to the given character set + (Typically to utf8mb4). + Translated parameters are allocated using my_once_alloc(). + + @param cs Character set to convert parameters to. 
+ @param[OUT] argc Write number of parameters here + @param[OUT] argv Write pointer to allocated parameters here. +*/ +int +my_win_translate_command_line_args(CHARSET_INFO *cs, int *argc, char ***argv) +{ + int i, ac; + char **av; + wchar_t *command_line= GetCommandLineW(); + wchar_t **wargs= CommandLineToArgvW(command_line, &ac); + size_t nbytes= (ac + 1) * sizeof(char*); + + /* Allocate new command line parameter */ + av= (char**) my_once_alloc(nbytes, MYF(MY_ZEROFILL)); + + for(i= 0; i < *argc; i++) + { + uint dummy_errors; + size_t arg_len= wcslen(wargs[i]); + size_t len, alloced_len= arg_len * cs->mbmaxlen + 1; + av[i]= (char *) my_once_alloc(alloced_len, MYF(0)); + len= my_convert(av[i], alloced_len, cs, + (const char *) wargs[i], arg_len * sizeof(wchar_t), + &my_charset_utf16le_bin, &dummy_errors); + DBUG_ASSERT(len < alloced_len); + av[i][len]= '\0'; + } + *argv= av; + *argc= ac; + /* Cleanup on exit */ + LocalFree((HLOCAL) wargs); + return 0; +} + #endif /* __WIN__ */ === modified file 'sql-common/client.c' --- sql-common/client.c 2011-01-31 15:55:58 +0000 +++ sql-common/client.c 2011-02-16 08:42:39 +0000 @@ -4246,11 +4246,25 @@ if (mysql->options.charset_dir) charsets_dir= mysql->options.charset_dir; + if (!mysql->net.vio) + { + /* Initialize with automatic OS character set detection. */ + mysql_options(mysql, MYSQL_SET_CHARSET_NAME, cs_name); + mysql_init_character_set(mysql); + cs_name= mysql->options.charset_name; + } + if (strlen(cs_name) < MY_CS_NAME_SIZE && (cs= get_charset_by_csname(cs_name, MY_CS_PRIMARY, MYF(0)))) { char buff[MY_CS_NAME_SIZE + 10]; charsets_dir= save_csdir; + if (!mysql->net.vio) + { + /* If there is no connection yet we don't send "SET NAMES" query */ + mysql->charset= cs; + return 0; + } /* Skip execution of "SET NAMES" for pre-4.1 servers */ if (mysql_get_server_version(mysql) < 40100) return 0; --------------050709050109000604010904--