List:Commits« Previous MessageNext Message »
From:Alexander Barkov Date:February 16 2011 9:00am
Subject:WL#5331 Support Unicode for Windows command line client
View as plain text  
Based on Vlad's patch: http://lists.mysql.com/commits/105379

=== modified file 'client/mysql.cc'
--- client/mysql.cc	2011-02-09 06:56:59 +0000
+++ client/mysql.cc	2011-02-16 08:57:08 +0000
@@ -195,6 +195,18 @@
 
 const char *default_dbug_option="d:t:o,/tmp/mysql.trace";
 
+
+#ifdef __WIN__
+static my_bool use_unicode_api= 1;
+#endif /* __WIN__ */
+
+/* Various printing flags */
+#define MY_PRINT_0    1  /* Replace 0x00 bytes to "\0"              */
+#define MY_PRINT_XML  2  /* Encode XML entities                     */
+#define MY_PRINT_MB   4  /* Recognize multi-byte characters         */
+#define MY_PRINT_CTRL 8  /* Replace TAB, NL, CR to "\t", "\n", "\r" */
+
+void tee_write(FILE *file, const char *s, size_t slen, int flags);
 void tee_fprintf(FILE *file, const char *fmt, ...);
 void tee_fputs(const char *s, FILE *file);
 void tee_puts(const char *s, FILE *file);
@@ -1113,6 +1125,17 @@
       close(stdout_fileno_copy);             /* Clean up dup(). */
   }
 
+#ifdef __WIN__
+  /*
+    Convert command line parameters from UTF16LE to UTF8MB4
+    when "mysql.exe --unicode" is specified.
+    Otherwise, the usual argc/argv will be used,
+    which bring arguments using ANSI code page.
+  */
+  if (use_unicode_api)
+    my_win_translate_command_line_args(&my_charset_utf8mb4_bin, &argc, &argv);
+#endif
+
   if (load_defaults("my",load_default_groups,&argc,&argv))
   {
     my_end(0);
@@ -1409,6 +1432,10 @@
   {"default-character-set", OPT_DEFAULT_CHARSET,
    "Set the default character set.", &default_charset,
    &default_charset, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+#if __WIN__
+  {"unicode", 0, "Use Windows Unicode API for console I/O and command line arguments.",
+   &use_unicode_api, &use_unicode_api, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+#endif __WIN__
   {"delimiter", OPT_DELIMITER, "Delimiter to be used.", &delimiter_str,
    &delimiter_str, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
   {"execute", 'e', "Execute command and quit. (Disables --force and history file.)", 0,
@@ -1885,6 +1912,15 @@
         tmpbuf.alloc(65535);
       tmpbuf.length(0);
       buffer.length(0);
+
+      if (use_unicode_api)
+      {
+        line= my_win_console_readline(charset_info,
+                                      (char*) tmpbuf.ptr(),
+                                      tmpbuf.alloced_length());
+        goto got_a_line;
+      }
+
       size_t clen;
       do
       {
@@ -1901,6 +1937,8 @@
       */
       if (line)
         line= buffer.c_ptr();
+
+got_a_line:
 #else
       if (opt_outfile)
 	fputs(prompt, OUTFILE);
@@ -3455,19 +3493,12 @@
     grid.  (The \0 is also the reason we can't use fprintf() .) 
   */
   unsigned int i;
-  const char *p;
 
   if (right_justified) 
     for (i= data_length; i < total_bytes_to_send; i++)
       tee_putc((int)' ', PAGER);
 
-  for (i= 0, p= data; i < data_length; i+= 1, p+= 1)
-  {
-    if (*p == '\0')
-      tee_putc((int)' ', PAGER);
-    else
-      tee_putc((int)*p, PAGER);
-  }
+  tee_write(PAGER, data, data_length, MY_PRINT_0|MY_PRINT_MB);
 
   if (! right_justified) 
     for (i= data_length; i < total_bytes_to_send; i++)
@@ -3587,16 +3618,7 @@
         tee_fprintf(PAGER, "%*s: ",(int) max_length,field->name);
       if (cur[off])
       {
-        unsigned int i;
-        const char *p;
-
-        for (i= 0, p= cur[off]; i < lengths[off]; i+= 1, p+= 1)
-        {
-          if (*p == '\0')
-            tee_putc((int)' ', PAGER);
-          else
-            tee_putc((int)*p, PAGER);
-        }
+        tee_write(PAGER, cur[off], lengths[off], MY_PRINT_0|MY_PRINT_MB);
         tee_putc('\n', PAGER);
       }
       else
@@ -3666,16 +3688,7 @@
   if (!src)
     tee_fputs("NULL", PAGER);
   else
-  {
-    for (const char *p = src; length; p++, length--)
-    {
-      const char *t;
-      if ((t = array_value(xmlmeta, *p)))
-	tee_fputs(t, PAGER);
-      else
-	tee_putc(*p, PAGER);
-    }
-  }
+    tee_write(PAGER, src, length, MY_PRINT_XML|MY_PRINT_MB);
 }
 
 
@@ -3686,37 +3699,9 @@
     tee_fputs("NULL", PAGER);
   else
   {
-    if (opt_raw_data)
-    {
-      unsigned long i;
-      /* Can't use tee_fputs(), it stops with NUL characters. */
-      for (i= 0; i < length; i++, pos++)
-        tee_putc(*pos, PAGER);
-    }
-    else for (const char *end=pos+length ; pos != end ; pos++)
-    {
-#ifdef USE_MB
-      int l;
-      if (use_mb(charset_info) &&
-          (l = my_ismbchar(charset_info, pos, end)))
-      {
-	  while (l--)
-	    tee_putc(*pos++, PAGER);
-	  pos--;
-	  continue;
-      }
-#endif
-      if (!*pos)
-	tee_fputs("\\0", PAGER); // This makes everything hard
-      else if (*pos == '\t')
-	tee_fputs("\\t", PAGER); // This would destroy tab format
-      else if (*pos == '\n')
-	tee_fputs("\\n", PAGER); // This too
-      else if (*pos == '\\')
-	tee_fputs("\\\\", PAGER);
-	else
-	tee_putc(*pos, PAGER);
-    }
+    int flags= MY_PRINT_MB | (opt_raw_data ? 0 : (MY_PRINT_0|MY_PRINT_CTRL));
+    /* Can't use tee_fputs(), it stops with NUL characters. */
+    tee_write(PAGER, pos, length, flags);
   }
 }
 
@@ -4317,7 +4302,30 @@
     mysql_options(&mysql, MYSQL_INIT_COMMAND, init_command);
   }
 
-  mysql_options(&mysql, MYSQL_SET_CHARSET_NAME, default_charset);
+  mysql_set_character_set(&mysql, default_charset);
+#if __WIN__
+  uint cnv_errors;
+  String converted_database, converted_user;
+  if (use_unicode_api &&
+      !my_charset_same(&my_charset_utf8mb4_bin, mysql.charset))
+  {
+    /* Convert user and database from UTF8MB4 to connection character set */
+    if (user)
+    {
+      converted_user.copy(user, strlen(user) + 1,
+                          &my_charset_utf8mb4_bin, mysql.charset,
+                          &cnv_errors);
+      user= (char*) converted_user.ptr();
+    }
+    if (database)
+    {
+      converted_database.copy(database, strlen(database) + 1,
+                              &my_charset_utf8mb4_bin, mysql.charset,
+                              &cnv_errors);
+      database= (char*) converted_database.ptr();
+    }
+  }
+#endif
   
   if (opt_plugin_dir && *opt_plugin_dir)
     mysql_options(&mysql, MYSQL_PLUGIN_DIR, opt_plugin_dir);
@@ -4339,7 +4347,29 @@
     }
     return -1;					// Retryable
   }
-  
+
+#if __WIN__
+  /* Convert --execute buffer from UTF8MB4 to connection character set */
+  if (status.line_buff && use_unicode_api &&
+      !my_charset_same(&my_charset_utf8mb4_bin, mysql.charset))
+  {
+    String tmp;
+    LINE_BUFFER *b= status.line_buff;
+    size_t len= b->end - b->buffer + 1;
+    uint dummy_errors;
+    /* Notice +1, to convert including trailing '\0' character */
+    if (tmp.copy(status.line_buff->buffer,
+                 status.line_buff->end - status.line_buff->buffer + 1,
+                 &my_charset_utf8mb4_bin, mysql.charset, &dummy_errors))
+      return 1;
+    /* Free the old line buffer */
+    batch_readline_end(status.line_buff);
+    /* Re-initialize line buffer from the converted string */
+    if (!(status.line_buff= batch_readline_command(NULL, (char*) tmp.ptr())))
+      return 1;
+  }
+#endif /* __WIN__ */
+
   charset_info= mysql.charset;
   
   connected=1;
@@ -4645,11 +4675,73 @@
 }
 
 
+/**
+  Write data to a stream (and to the tee outfile, if any), escaping it
+  as requested by flags. A 0x00 byte selected by MY_PRINT_0 is written
+  as "\0" when MY_PRINT_CTRL is also set, otherwise as a space.
+
+  @param file   Stream to write to
+  @param s      String to write, may contain 0x00 bytes
+  @param slen   String length in bytes
+  @param flags  Bit mask of MY_PRINT_xxx flags
+*/
+void tee_write(FILE *file, const char *s, size_t slen, int flags)
+{
+  const char *se;
+
+  for (se= s + slen; s < se; s++)
+  {
+    const char *t;
+
+    if (flags & MY_PRINT_MB)
+    {
+      int mblen;
+      if (use_mb(charset_info) &&
+          (mblen= my_ismbchar(charset_info, s, se)))
+      {
+        /* Copy a multi-byte character verbatim, with no escaping */
+#if __WIN__
+        if (use_unicode_api && my_win_is_console(file))
+          my_win_console_write(charset_info, s, mblen);
+        else
+          fwrite(s, 1, mblen, file);
+        if (opt_outfile)
+          fwrite(s, 1, mblen, OUTFILE);
+#else
+        for (int i= 0; i < mblen; i++)
+          tee_putc(s[i], file);
+#endif
+        s+= mblen - 1;   /* The loop increment skips the last byte */
+        continue;
+      }
+    }
+
+    if ((flags & MY_PRINT_XML) && (t= array_value(xmlmeta, *s)))
+      tee_fputs(t, file);
+    else if ((flags & MY_PRINT_0) && *s == '\0')   // This makes everything hard
+      tee_fputs((flags & MY_PRINT_CTRL) ? "\\0" : " ", file);
+    else if ((flags & MY_PRINT_CTRL) && *s == '\t')
+      tee_fputs("\\t", file);      // This would destroy tab format
+    else if ((flags & MY_PRINT_CTRL) && *s == '\n')
+      tee_fputs("\\n", file);      // This too
+    else if ((flags & MY_PRINT_CTRL) && *s == '\\')
+      tee_fputs("\\\\", file);
+    else
+      tee_putc((int) *s, file);
+  }
+}
+
+
 void tee_fprintf(FILE *file, const char *fmt, ...)
 {
   va_list args;
 
   va_start(args, fmt);
+#if __WIN__
+  if (use_unicode_api && my_win_is_console(file))
+    my_win_console_vfprintf(charset_info, fmt, args);
+  else
+#endif
   (void) vfprintf(file, fmt, args);
   va_end(args);
 
@@ -4664,6 +4756,11 @@
 
 void tee_fputs(const char *s, FILE *file)
 {
+#ifdef __WIN__
+  if (use_unicode_api && my_win_is_console(file))
+    my_win_console_fputs(charset_info, s);
+  else
+#endif
   fputs(s, file);
   if (opt_outfile)
     fputs(s, OUTFILE);
@@ -4672,17 +4769,17 @@
 
 void tee_puts(const char *s, FILE *file)
 {
-  fputs(s, file);
-  fputc('\n', file);
-  if (opt_outfile)
-  {
-    fputs(s, OUTFILE);
-    fputc('\n', OUTFILE);
-  }
+  tee_fputs(s, file);
+  tee_putc('\n', file);
 }
 
 void tee_putc(int c, FILE *file)
 {
+#if __WIN__
+  if (use_unicode_api && my_win_is_console(file))
+    my_win_console_putc(charset_info, c);
+  else
+#endif
   putc(c, file);
   if (opt_outfile)
     putc(c, OUTFILE);

=== modified file 'include/my_sys.h'
--- include/my_sys.h	2011-02-08 15:54:12 +0000
+++ include/my_sys.h	2011-02-15 13:02:57 +0000
@@ -947,8 +947,14 @@
 
 /* implemented in my_conio.c */
 char* my_cgets(char *string, size_t clen, size_t* plen);
-
-#endif
+my_bool my_win_is_console(FILE *file);
+char *my_win_console_readline(CHARSET_INFO *cs, char *mbbuf, size_t mbbufsize);
+void my_win_console_write(CHARSET_INFO *cs, const char *data, size_t datalen);
+void my_win_console_fputs(CHARSET_INFO *cs, const char *data);
+void my_win_console_putc(CHARSET_INFO *cs, int c);
+void my_win_console_vfprintf(CHARSET_INFO *cs, const char *fmt, va_list args);
+int my_win_translate_command_line_args(CHARSET_INFO *cs, int *ac, char ***av);
+#endif /* __WIN__ */
 
 #include <mysql/psi/psi.h>
 

=== modified file 'mysys/my_conio.c'
--- mysys/my_conio.c	2009-02-13 16:41:47 +0000
+++ mysys/my_conio.c	2011-02-16 08:20:35 +0000
@@ -219,4 +219,261 @@
   return result;
 }
 
+
+/* Windows console handling */
+
+/* Maximum line length on Windows console */
+#define MAX_CONSOLE_LINE_SIZE 65535
+
+/**
+  Determine if a stream is attached to a Windows console,
+  i.e. if GetConsoleMode() succeeds on the stream's OS handle.
+  @param file Stream to check
+
+  @return
+  @retval  0 if file is not a Windows console
+  @retval  1 if file is a Windows console
+*/
+my_bool
+my_win_is_console(FILE *file)
+{
+  DWORD mode;
+  if (GetConsoleMode((HANDLE) _get_osfhandle(_fileno(file)), &mode))
+    return 1;
+  return 0;
+}
+
+
+/**
+  Read line from Windows console using Unicode API
+  and translate input to session character set.
+  Note, as Windows API breaks supplementary characters
+  into two wchar_t values, we cannot read and convert individual
+  wchar_t values separately. So let's use a buffer for
+  Unicode console input, and then convert it to "cs" in a single shot.
+  String is terminated with '\0' character; CR and LF are not stored.
+
+  @param cs         Character set to convert to.
+  @param mbbuf      Write input data here.
+  @param mbbufsize  Number of bytes available in mbbuf.
+
+  @retval           Pointer to mbbuf, or NULL on I/O error.
+*/
+char *
+my_win_console_readline(CHARSET_INFO *cs, char *mbbuf, size_t mbbufsize)
+{
+  uint dummy_errors;
+  static wchar_t u16buf[MAX_CONSOLE_LINE_SIZE + 1];
+  size_t pos, mblen;
+  DWORD console_mode;
+  HANDLE console= GetStdHandle(STD_INPUT_HANDLE);
+
+  DBUG_ASSERT(mbbufsize > 0); /* Need space for at least trailing '\0' */
+  GetConsoleMode(console, &console_mode);
+  SetConsoleMode(console, ENABLE_LINE_INPUT |
+	                        ENABLE_PROCESSED_INPUT | ENABLE_ECHO_INPUT);
+  for(pos= 0; ; )
+  {
+    DWORD nchars;
+    BOOL ok= ReadConsoleW(console, &u16buf[pos], 1, &nchars, NULL);
+    if (!ok || nchars == 0)
+    {
+      SetConsoleMode(console, console_mode);
+      return NULL;
+    }
+    if (u16buf[pos] == L'\r')
+      continue;
+    if (pos == MAX_CONSOLE_LINE_SIZE || u16buf[pos] == L'\n')
+      break;
+    pos++;
+  }
+  SetConsoleMode(console, console_mode);
+  /* Convert Unicode to session character set, keeping room for '\0' */
+  mblen= my_convert(mbbuf, mbbufsize - 1, cs,
+                    (const char *) u16buf, pos * sizeof(wchar_t),
+	                   &my_charset_utf16le_bin, &dummy_errors);
+  DBUG_ASSERT(mblen < mbbufsize); /* Safety */
+  mbbuf[mblen]= 0;
+  return mbbuf;
+}
+
+
+/**
+  Translate client charset to Windows wchars for console I/O.
+  Unlike copy_and_convert(), a wrong multi-byte sequence is not shown
+  as '?': its bytes fall back to ISO-8859-1, which shows binary data
+  (e.g. BLOB) better.
+  @param cs           Character set of the input string
+  @param from         Input string
+  @param from_length  Length of the input string, in bytes
+  @param to[OUT]      Write Unicode data here
+  @param to_chars     Number of wchar_t elements available in "to"
+  @return  Number of wchar_t written; stops at 0x00 or when "to" is full
+*/
+static size_t
+my_mbstou16s(CHARSET_INFO *cs, const uchar * from, size_t from_length,
+             wchar_t *to, size_t to_chars)
+{
+  CHARSET_INFO *to_cs= &my_charset_utf16le_bin;
+  const uchar *from_end= from + from_length;
+  wchar_t *to_orig= to, *to_end= to + to_chars;
+  my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
+  my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
+  while (from < from_end)
+  {
+    int cnvres;
+    my_wc_t wc;
+    if ((cnvres= (*mb_wc)(cs, &wc, from, from_end)) > 0)
+    {
+      if (!wc)
+        break;
+      from+= cnvres;
+    }
+    else if (cnvres == MY_CS_ILSEQ)
+    {
+      wc= (my_wc_t) (uchar) *from; /* Fallback to ISO-8859-1 */
+      from+= 1;
+    }
+    else if (cnvres > MY_CS_TOOSMALL)
+    {
+      /*
+        A correct multibyte sequence detected
+        But it doesn't have Unicode mapping.
+      */
+      from+= (-cnvres);
+      wc= '?';
+    }
+    else /* Incomplete character */
+    {
+      wc= (my_wc_t) (uchar) *from; /* Fallback to ISO-8859-1 */
+      from+= 1;
+    }
+outp:
+    if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, (uchar*) to_end)) > 0)
+    {
+      /* We can never convert only a part of wchar_t */
+      DBUG_ASSERT((cnvres % sizeof(wchar_t)) == 0);
+      /* cnvres returns number of bytes, convert to number of wchar_t's */
+      to+= cnvres / sizeof(wchar_t);
+    }
+    else if (cnvres == MY_CS_ILUNI && wc != '?')
+    {
+      wc= '?';
+      goto outp;
+    }
+    else
+      break; /* Not enough space */
+  }
+  return to - to_orig;
+}
+
+
+/**
+  Write a string in the given character set to Windows console.
+  As Windows breaks supplementary characters into two wchar_t parts,
+  we cannot simply send each result of cs->cset->mb_wc() to console.
+  So we convert the string from client charset to an array of wchar_t,
+  then write the array to console in a single shot.
+
+  @param cs       Character set of the string
+  @param data     String to print
+  @param datalen  Length of input string in bytes
+*/
+void
+my_win_console_write(CHARSET_INFO *cs, const char *data, size_t datalen)
+{
+  static wchar_t u16buf[MAX_CONSOLE_LINE_SIZE + 1];
+  /* The capacity is passed in wchar_t units, not in bytes */
+  size_t nchars= my_mbstou16s(cs, (const uchar*) data, datalen,
+                              u16buf, sizeof(u16buf) / sizeof(u16buf[0]));
+  DWORD nwritten;
+  WriteConsoleW(GetStdHandle(STD_OUTPUT_HANDLE),
+                u16buf, (DWORD)nchars, &nwritten, NULL);
+}
+
+
+/**
+  Write a single-byte character to console.
+  Note: one should not send parts of a single multi-byte character
+  in separate consecutive my_win_console_putc() calls.
+  For multi-byte characters use my_win_console_write() instead.
+
+  @param cs  Character set of the input character
+  @param c   Character (single byte)
+*/
+void
+my_win_console_putc(CHARSET_INFO *cs, int c)
+{
+  char ch= (char) c;
+  my_win_console_write(cs, &ch, 1);
+}
+
+
+/**
+  Write a 0-terminated string to Windows console.
+
+  @param cs    Character set of the string to print
+  @param data  String to print, its length is taken with strlen()
+*/
+void
+my_win_console_fputs(CHARSET_INFO *cs, const char *data)
+{
+  my_win_console_write(cs, data, strlen(data));
+}
+
+
+/* Formatted output to the Windows console; overlong output is truncated. */
+void
+my_win_console_vfprintf(CHARSET_INFO *cs, const char *fmt, va_list args)
+{
+  static char buff[MAX_CONSOLE_LINE_SIZE + 1]; /* Last byte always stays 0 */
+  int len= vsnprintf(buff, sizeof(buff) - 1, fmt, args);
+  if (len < 0 || len > MAX_CONSOLE_LINE_SIZE) /* Error or truncated output */
+    len= MAX_CONSOLE_LINE_SIZE; /* Print what fits; the writer stops at 0 */
+  my_win_console_write(cs, buff, (size_t) len);
+}
+
+
+#include <shellapi.h>
+
+/**
+  Translate Unicode command line parameters to the given character set
+  (typically utf8mb4), allocating the result with my_once_alloc().
+  @param      cs    Character set to convert parameters to
+  @param[OUT] argc  Number of parameters; unchanged on failure
+  @param[OUT] argv  Allocated parameters; unchanged on failure
+  @return           0 on success, 1 on failure
+*/
+int
+my_win_translate_command_line_args(CHARSET_INFO *cs, int *argc, char ***argv)
+{
+  int i, ac, rc= 1;
+  char **av;
+  wchar_t **wargs= CommandLineToArgvW(GetCommandLineW(), &ac);
+
+  if (!wargs) /* Parsing failed, "ac" is not set */
+    return 1;
+  if (!(av= (char**) my_once_alloc((ac + 1) * sizeof(char*),
+                                   MYF(MY_ZEROFILL)))) /* NULL-terminated */
+    goto end;
+  for (i= 0; i < ac; i++) /* Not *argc: only "ac" matches wargs */
+  {
+    uint dummy_errors;
+    size_t arg_len= wcslen(wargs[i]);
+    size_t len, alloced_len= arg_len * cs->mbmaxlen + 1;
+    if (!(av[i]= (char *) my_once_alloc(alloced_len, MYF(0))))
+      goto end;
+    len= my_convert(av[i], alloced_len - 1, cs, /* Keep room for '\0' */
+                    (const char *) wargs[i], arg_len * sizeof(wchar_t),
+                    &my_charset_utf16le_bin, &dummy_errors);
+    av[i][len]= '\0';
+  }
+  *argv= av;
+  *argc= ac;
+  rc= 0;
+end:
+  LocalFree((HLOCAL) wargs); /* Free the CommandLineToArgvW() result */
+  return rc;
+}
+
 #endif /* __WIN__ */

=== modified file 'sql-common/client.c'
--- sql-common/client.c	2011-01-31 15:55:58 +0000
+++ sql-common/client.c	2011-02-16 08:42:39 +0000
@@ -4246,11 +4246,25 @@
   if (mysql->options.charset_dir)
     charsets_dir= mysql->options.charset_dir;
 
+  if (!mysql->net.vio)
+  {
+    /* Initialize with automatic OS character set detection. */
+    mysql_options(mysql, MYSQL_SET_CHARSET_NAME, cs_name);
+    mysql_init_character_set(mysql);
+    cs_name= mysql->options.charset_name;
+  }
+
   if (strlen(cs_name) < MY_CS_NAME_SIZE &&
      (cs= get_charset_by_csname(cs_name, MY_CS_PRIMARY, MYF(0))))
   {
     char buff[MY_CS_NAME_SIZE + 10];
     charsets_dir= save_csdir;
+    if (!mysql->net.vio)
+    {
+      /* If there is no connection yet we don't send "SET NAMES" query */
+      mysql->charset= cs;
+      return 0;
+    }
     /* Skip execution of "SET NAMES" for pre-4.1 servers */
     if (mysql_get_server_version(mysql) < 40100)
       return 0;


Thread
WL#5331 Support Unicode for Windows command line clientAlexander Barkov16 Feb