List: Commits « Previous Message | Next Message »
From: Alexander Barkov  Date: March 23 2011 12:16pm
Subject:bzr commit into mysql-trunk branch (alexander.barkov:3315) WL#2048 WL#3770
WL#5833
View as plain text  
#At file:///home/bar/mysql-bzr/mysql-trunk/ based on revid:vinay.fisrekar@stripped

 3315 Alexander Barkov	2011-03-23
      Adding functionality to dump Unicode canonical equivalence,
      as a preparation to:
      WL#5833 Process canonical equivalence in collation customization 
      WL#2048 Add function for Unicode normalization
      WL#3770 Unicode-compliant comparison and sorting of combining

    modified:
      strings/uctypedump.c
=== modified file 'strings/uctypedump.c'
--- a/strings/uctypedump.c	2011-01-19 13:17:52 +0000
+++ b/strings/uctypedump.c	2011-03-23 12:14:17 +0000
@@ -21,7 +21,6 @@
 #include <my_global.h>
 #include <m_string.h>
 #include <m_ctype.h>
-#include "m_ctype.h"
 
 
 typedef struct my_ctype_name_st
@@ -86,156 +85,466 @@ ctypestr2num(const char *tok)
 }
 
 
-int main(int ac, char ** av)
+#define MAX_CHAR 0x10FFFF
+#define MAX_DECOMPOSITION_LENGTH 2
+
+
+typedef struct 
 {
-  char str[1024];
-  unsigned char ctypea[64*1024];
-  size_t i;
-  size_t plane;
-  MY_UNI_CTYPE uctype[256];
-  FILE *f= stdin;
+  uint code;
+  char *name;
+  char general_category[3];
+  int combining_class;
+  int bidirectional_category;
+  uint decomposition_mapping[MAX_DECOMPOSITION_LENGTH];
+  uint decimal_digit_value; /* 0-9 */
+  uint digit_value;         /* 0-9 */
+  char *numeric_value;      /* Examples: 0, 1, 10, 100, 1000, 1/2, 5/2 */
+  my_bool mirrored;         /* Y or N */
+  char *unicode_1_0_name;
+  char *iso10646_comment_field;
+  uint uppercase_mapping;
+  uint lowercase_mapping;
+  uint titlecase_mapping;
+
+  int mysql_ctype; /* ctype in MySQL format */
+
+} MY_UNIDATA_CHAR;
+
+
+typedef struct
+{
+  int maxchar;
+  int debug;
+  int ctype;
+  int decomp;
+  const char *fname;
+  const char *varname;
+} MY_UNIDATA_PARAM;
+
 
-  if (ac > 1 && av[1] && !(f= fopen(av[1],"r")))
+
+static void
+unidata_param_init(MY_UNIDATA_PARAM *p)
+{
+  p->maxchar= MAX_CHAR;
+  p->debug= 0;
+  p->ctype= 1;
+  p->decomp= 1;
+  p->fname= NULL;
+  p->varname= "";
+}
+
+
+static void
+load_unidata(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *chr)
+{
+  char str[1024];
+  FILE *f= prm->fname ? fopen(prm->fname, "r") : stdin;
+  if (!f)
   {
-    fprintf(stderr, "Can't open file %s\n", av[1]);
+    fprintf(stderr, "Can't open file %s\n", prm->fname);
     exit(1);
   }
-  bzero(&ctypea,sizeof(ctypea));
-  bzero(&uctype, sizeof(uctype));
-  
-  printf("/*\n");
-  printf("  Unicode ctype data\n");
-  printf("  Generated from %s\n", av[1] ? av[1] : "stdin");
-  printf("*/\n");
-  
-  while(fgets(str, sizeof(str), f))
+
+  while (fgets(str, sizeof(str), f))
   {
-    size_t n= 0, code= 0;
-    char *s,*e;
-    int ctype= 0;
-    
-    for(s= str; s; )
+    size_t n;
+    char *s, *e;
+    MY_UNIDATA_CHAR ch;
+    bzero(&ch, sizeof(ch));
+
+    for(n= 0, s= str; s; n++)
     {
-      char *end;
-      char tok[1024]="";
-      e=strchr(s,';');
-      if(e)
+      char *end, tok[1024]= "";
+
+      if((e= strchr(s, ';')))
       {
-        strncpy(tok,s,(unsigned int)(e-s));
-        tok[e-s]=0;
+        strncpy(tok, s, (unsigned int) (e - s));
+        tok[e - s]= 0;
       }
       else
       {
-        strcpy(tok,s);
+        strcpy(tok, s);
       }
-      
-      end=tok+strlen(tok);
-      
+
+      end= tok + strlen(tok);
+
       switch(n)
       {
-        case 0: code= strtol(tok,&end,16);break;
-        case 2: ctype= ctypestr2num(tok);break;
+        case 0: ch.code= strtol(tok, &end, 16); break;
+        case 1: break; /* Character name */
+        case 2: /* General category */
+          ch.general_category[0]= tok[0];
+          ch.general_category[1]= tok[1];
+          ch.general_category[2]= '\0';
+          ch.mysql_ctype= ctypestr2num(tok);
+          break;
+          
+        case 3: /* Canonical Combining Class */
+          ch.combining_class= atoi(tok);
+          /*
+          if (ch.combining_class)
+            printf("YYY[%04X]=%d\n", ch.code, ch.combining_class);
+          */
+          break;
+        case 4: break;  /* Bidirectional Category */
+        case 5: /* Character Decomposition Mapping */
+          if (*tok != '<')
+          {
+            size_t i;
+            char *dec, *endptr;
+            for (dec= strtok_r(tok, " \t", &endptr), i= 0;
+                 dec;
+                 dec= strtok_r(NULL, " \t", &endptr), i++)
+            {
+              if (i >= MAX_DECOMPOSITION_LENGTH)
+              {
+                fprintf(stderr, "Decomposition length is too long for character %04X\n", ch.code);
+                exit(1);
+              }
+              ch.decomposition_mapping[i]= strtol(dec, NULL, 16);
+            }
+          }
+          break;
+
+        case 6: /* Decimal digit value */
+          ch.decimal_digit_value= atoi(tok);
+          break;
+
+        case 7:  /* Digit value */
+          ch.digit_value= atoi(tok);
+          break;
+
+        case 8:  /* Numeric value */
+          break;
+
+        case 9: break;  /* Mirrored */
+        case 10: break; /* Unicode 1.0 Name */
+        case 11: break; /* 10646 comment field */
+        case 12: break; /* Uppercase */
+        case 13: break; /* Lowercase */
+        case 14: break; /* Titlecase */
       }
-      
-      n++;
-      if(e)  s=e+1;
-      else  s=e;
-    }
-    if(code<=0xFFFF)
-    {
-      ctypea[code]= ctype;
+      s= e ? e + 1 : e;
     }
+    if(ch.code <= prm->maxchar)
+      chr[ch.code]= ch;
   }
-  
+}
+
+
+static void
+unidata_char_set_cjk(MY_UNIDATA_CHAR *unidata, int max_char, int cur_char)
+{
+  if (cur_char < max_char)
+  {
+    MY_UNIDATA_CHAR *ch= &unidata[cur_char];
+    ch->mysql_ctype= _MY_L | _MY_U;
+    strcpy(ch->general_category, "Lo");
+  }
+}
+
+
+static void
+fill_implicit_ctype(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *unidata)
+{
+  int i;
   /* Fill digits */
   for (i= '0'; i <= '9'; i++)
-    ctypea[i]= _MY_NMR;
-    
+    unidata[i].mysql_ctype= _MY_NMR;
+  /* Fill hex digits (NOTE(review): _MY_X is set for all of a-z and A-Z, not only a-f and A-F) */
   for (i= 'a'; i <= 'z'; i++)
-    ctypea[i]|= _MY_X;
+    unidata[i].mysql_ctype|= _MY_X;
   for (i= 'A'; i <= 'Z'; i++)
-    ctypea[i]|= _MY_X;
-  
-  
+    unidata[i].mysql_ctype|= _MY_X;
+
   /* Fill ideographs  */
-  
   /* CJK Ideographs Extension A (U+3400 - U+4DB5) */
-  for(i=0x3400;i<=0x4DB5;i++)
-  {
-    ctypea[i]= _MY_L | _MY_U;
-  }
-  
+  for(i= 0x3400; i <= 0x4DB5; i++)
+    unidata_char_set_cjk(unidata, prm->maxchar, i);
+
   /* CJK Ideographs (U+4E00 - U+9FA5) */
-  for(i=0x4E00;i<=0x9FA5;i++){
-    ctypea[i]= _MY_L | _MY_U;
-  }
-  
+  for(i= 0x4E00; i <= 0x9FA5; i++)  /* 9FCB in 5.2.0 */
+    unidata_char_set_cjk(unidata, prm->maxchar, i);
+
   /* Hangul Syllables (U+AC00 - U+D7A3)  */
-  for(i=0xAC00;i<=0xD7A3;i++)
+  for(i= 0xAC00; i <= 0xD7A3; i++)
+    unidata_char_set_cjk(unidata, prm->maxchar, i);
+
+  /*
+  20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
+  2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
+  */
+  for (i= 0x20000; i <= 0x2A6D6; i++)
+    unidata_char_set_cjk(unidata, prm->maxchar, i);
+
+  /*
+  2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
+  2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
+  */
+  for (i= 0x2A700; i <= 0x2B734; i++)
+    unidata_char_set_cjk(unidata, prm->maxchar, i);
+
+   
+ /* 
+  TODO:
+  D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
+  DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
+  DB80;<Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
+  DBFF;<Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
+  DC00;<Low Surrogate, First>;Cs;0;L;;;;;N;;;;;
+  DFFF;<Low Surrogate, Last>;Cs;0;L;;;;;N;;;;;
+
+  E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
+  F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
+  F0000;<Plane 15 Private Use, First>;Co;0;L;;;;;N;;;;;
+  FFFFD;<Plane 15 Private Use, Last>;Co;0;L;;;;;N;;;;;
+  100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
+  10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;0
+  */
+}
+
+
+/*
+  Check whether all "nchars" characters of the page starting at "data"
+  have the same mysql_ctype.
+  Return that common ctype if they do, or -1 if the page is mixed.
+*/
+static int
+page_ctype(MY_UNIDATA_CHAR *data, size_t nchars)
+{
+  size_t i;
+  for (i= 1; i < nchars; i++)
   {
-    ctypea[i]= _MY_L | _MY_U;
+    if (data[i].mysql_ctype != data->mysql_ctype)
+      return -1;
   }
-  
-  
-  /* Calc plane parameters */
-  for(plane=0;plane<256;plane++)
+  return data->mysql_ctype;
+}
+
+
+static void
+dump_ctype(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *unidata)
+{
+  int page, max_page= (prm->maxchar + 255) / 256;
+
+  printf("/*\n");
+  printf("  Unicode ctype data\n");
+  printf("  Generated from %s\n", prm->fname ? prm->fname : "stdin");
+  printf("*/\n");
+
+  /* Dump planes with mixed ctype */
+  for(page= 0; page < max_page; page++)
   {
-    size_t character;
-    uctype[plane].ctype= ctypea+plane*256;
-    
-    uctype[plane].pctype= uctype[plane].ctype[0];
-    for(character=1;character<256;character++)
+    if (page_ctype(unidata + page * 256, 256) < 0)
     {
-      if (uctype[plane].ctype[character] != uctype[plane].pctype)
+      size_t charnum, num;
+      printf("static unsigned char uctype%s_page%02X[256]=\n{\n",
+             prm->varname, page);
+      for(num= 0, charnum=0; charnum < 256; charnum++)
       {
-        uctype[plane].pctype= 0; /* Mixed plane */
-        break;
+        printf(" %2d%s", unidata[page * 256 + charnum].mysql_ctype,
+               charnum < 255 ? "," : "");
+        if(++num == 16)
+        {
+          printf("\n");
+          num= 0;
+        }
       }
+      printf("};\n\n");
     }
-    if (character==256)	/* All the same, no needs to dump whole plane */
-      uctype[plane].ctype= NULL; 
   }
-  
-  /* Dump mixed planes */
-  for(plane=0;plane<256;plane++)
+
+  /* Dump ctype page index */
+  printf("MY_UNI_CTYPE my_uni_ctype%s[%d]={\n", prm->varname, max_page);
+  for(page= 0; page < max_page; page++)
   {
-    if(uctype[plane].ctype)
+    char page_name[128]="NULL";
+    int ctype;
+    if ((ctype= page_ctype(unidata + page * 256, 256)) < 0)
     {
-      int charnum=0;
-      int num=0;
-      
-      printf("static unsigned char uctype_page%02X[256]=\n{\n",plane);
-      
-      for(charnum=0;charnum<256;charnum++)
+      sprintf(page_name,"uctype%s_page%02X", prm->varname, page);
+      ctype= 0;
+    }
+    printf("\t{%d,%s}%s\n", ctype, page_name, page < max_page - 1 ? "," : "");
+  }
+  printf("};\n\n\n");
+}
+
+
+/*
+static int
+decomposition_length(MY_UNIDATA_CHAR *ch)
+{
+  if (ch->decomposition_mapping[1])
+    return 2;
+  if (ch->decomposition_mapping[0])
+    return 1;
+  return 0;
+}
+*/
+
+static void
+dump_decomposition_page(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *unidata,
+                        uint pageno, uint nchars)
+{
+  uint i, ofs= pageno * 256;
+  printf("static MY_UNI_DECOMPOSITION decomp%s_p%02X[256]= {\n",
+         prm->varname, pageno);
+  for (i= 0; i < nchars; i++)
+  {
+    MY_UNIDATA_CHAR *ch= &unidata[ofs + i];
+    
+    printf("/* %04X */ {0x%04X,0x%04X},",
+           ofs + i, ch->decomposition_mapping[0], ch->decomposition_mapping[1]);
+    
+    if (ch->decomposition_mapping[0])
+      printf(" %s/* [%s-%s][%d-%d] */",
+             ch->decomposition_mapping[0] < 0x10000 ? " " : "",
+             unidata[ch->decomposition_mapping[0]].general_category,
+             unidata[ch->decomposition_mapping[1]].general_category,
+             unidata[ch->decomposition_mapping[0]].combining_class,
+             unidata[ch->decomposition_mapping[1]].combining_class);
+    printf("\n");
+  }  
+  printf("};\n\n\n");
+}
+
+
+static size_t
+calc_decompositions(MY_UNIDATA_CHAR *unidata, size_t nchars)
+{
+  size_t i, n;
+  for (n= i= 0; i < nchars; i++)
+  {
+    if (unidata[i].decomposition_mapping[0])
+      n++;
+  }
+  return n;
+}
+
+
+static void
+dump_decomposition(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *unidata)
+{
+  int i, npages= (prm->maxchar + 255) / 256;
+
+  printf("/*\n");
+  printf("  Unicode canonical decomposition data\n");
+  printf("  Generated from %s\n", prm->fname ? prm->fname : "stdin");
+  printf("*/\n");
+
+  /* Dump pages */
+  for (i= 0; i < npages; i++)
+  {
+    MY_UNIDATA_CHAR *page= unidata + i * 256;
+    if (calc_decompositions(page, 256))
+      dump_decomposition_page(prm, unidata, i, 256);
+  }
+
+  /* Dump decomposition page index. NOTE(review): "decom%s_p%02X" below does not match the "decomp%s_p%02X" page arrays. */
+  printf("static MY_UNI_DECOMPOSITION *my_uni_decomp%s[%d]=\n{\n",
+         prm->varname, npages);
+  for (i= 0; i < npages; i++)
+  {
+    MY_UNIDATA_CHAR *page= unidata + i * 256;
+    if (calc_decompositions(page, 256))
+      printf("decom%s_p%02X,", prm->varname, i);
+    else
+      printf("NULL,");
+    if ((i % 8) == 7)
+      printf("\n");
+  }
+  printf("};\n");
+}
+
+
+static void
+usage(FILE *f, int rc)
+{
+  exit(rc);
+}
+
+
+static int
+get_int_option(const char *str, const char *name, int *num)
+{
+  size_t namelen= strlen(name);
+  if (!strncmp(str, name, namelen))
+  {
+    const char *val= str + namelen;
+    if (val[0] == '0' && val[1] == 'x')
+    {
+      *num= strtol(val, NULL, 16);
+    }
+    else
+    {
+      *num= atoi(val);
+      if (*num == 0 && *val !='0')
       {
-        int cod;
-        
-        cod=(plane<<8)+charnum;
-        printf(" %2d%s",uctype[plane].ctype[charnum],charnum<255?",":"");
-      
-        num++;
-        if(num==16)
-        {
-          printf("\n");
-          num=0;
-        }
+        fprintf(stderr, "\nBad numeric option value: %s\n\n", str);
+        usage(stderr, 1);
       }
-      printf("};\n\n");
     }
+    return 1;
   }
-  
-  
-  /* Dump plane index */
-  printf("MY_UNI_CTYPE my_uni_ctype[256]={\n");
-  for(plane=0;plane<256;plane++)
-  {
-    char plane_name[128]="NULL";
-    if(uctype[plane].ctype){
-      sprintf(plane_name,"uctype_page%02X",plane);
+  return 0;
+}
+
+
+static int
+get_const_str_option(const char *str, const char *name, const char **val)
+{
+  size_t namelen= strlen(name);
+  if (!strncmp(str, name, namelen))
+  {
+    *val= str + namelen;
+    return 1;
+  }
+  return 0;
+}
+
+
+static void
+process_options(MY_UNIDATA_PARAM *prm, int ac, char **av)
+{
+  int i;
+  unidata_param_init(prm);
+  for (i= 1; i < ac ; i++)
+  {
+    /* printf("[%d]=%s\n", i, av[i]); */
+    if (av[i][0] != '-' || av[i][1] != '-')
+      break;
+    if (!get_const_str_option(av[i], "--name=", &prm->varname) &&
+        !get_int_option(av[i], "--maxchar=", &prm->maxchar) &&
+        !get_int_option(av[i], "--ctype=",   &prm->ctype) &&
+        !get_int_option(av[i], "--decomp=",  &prm->decomp) &&
+        !get_int_option(av[i], "--debug=",   &prm->debug))
+    {
+      fprintf(stderr, "\nUnknown option: %s\n\n", av[i]);
+      usage(stderr, 1);
     }
-    printf("\t{%d,%s}%s\n",uctype[plane].pctype,plane_name,plane<255?",":"");
   }
-  printf("};\n");
-  
+  prm->fname= av[i];
+}
+
+
+int main(int ac, char ** av)
+{
+  MY_UNIDATA_PARAM prm;
+  static MY_UNIDATA_CHAR unidata[MAX_CHAR + 1];
+
+  process_options(&prm, ac, av);
+  bzero(unidata, sizeof(unidata));
+  fill_implicit_ctype(&prm, unidata);
+  load_unidata(&prm, unidata);
+
+  if (prm.ctype)
+    dump_ctype(&prm, unidata);
+
+  if (prm.decomp)
+    dump_decomposition(&prm, unidata);
+
   return 0;
 }


Attachment: [text/bzr-bundle] bzr/alexander.barkov@oracle.com-20110323121417-ug1i6tlqd2yyatpu.bundle
Thread
bzr commit into mysql-trunk branch (alexander.barkov:3315) WL#2048 WL#3770 WL#5833 - Alexander Barkov - 23 Mar