From: Alexander Barkov Date: March 23 2011 12:16pm Subject: bzr push into mysql-trunk branch (alexander.barkov:3314 to 3315) WL#2048 WL#3770 WL#5833 List-Archive: http://lists.mysql.com/commits/133625 Message-Id: <201103231216.p2NCGSAn029519@bar.myoffice.izhnet.ru> MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit 3315 Alexander Barkov 2011-03-23 Adding functionality to dump Unicode canonical equivalence, as a preparation to: WL#5833 Process canonical equivalence in collation customization WL#2048 Add function for Unicode normalization WL#3770 Unicode-compliant comparison and sorting of combining modified: strings/uctypedump.c 3314 Vinay Fisrekar 2011-03-23 BUG#11867052 - ADD FUNCS_1 TO THE DEFAULT SUITE LIST IN MYSQL-TEST-RUN.PL Changes: mysql-test-run.pl - Added funcs_1 suite in default suite list default.daily , default.push - Removed funcs_1 suite commands as funcs_1 will run with default is_engines_innodb - skip run for embedded mode due to bug modified: mysql-test/collections/default.daily mysql-test/collections/default.push mysql-test/mysql-test-run.pl mysql-test/suite/funcs_1/t/is_engines_innodb.test === modified file 'strings/uctypedump.c' --- a/strings/uctypedump.c 2011-01-19 13:17:52 +0000 +++ b/strings/uctypedump.c 2011-03-23 12:14:17 +0000 @@ -21,7 +21,6 @@ #include #include #include -#include "m_ctype.h" typedef struct my_ctype_name_st @@ -86,156 +85,466 @@ ctypestr2num(const char *tok) } -int main(int ac, char ** av) +#define MAX_CHAR 0x10FFFF +#define MAX_DECOMPOSITION_LENGTH 2 + + +typedef struct { - char str[1024]; - unsigned char ctypea[64*1024]; - size_t i; - size_t plane; - MY_UNI_CTYPE uctype[256]; - FILE *f= stdin; + uint code; + char *name; + char general_category[3]; + int combining_class; + int bidirectional_category; + uint decomposition_mapping[MAX_DECOMPOSITION_LENGTH]; + uint decimal_digit_value; /* 0-9 */ + uint digit_value; /* 0-9 */ + char *numeric_value; /* Examples: 0, 1, 10, 100, 1000, 
1/2, 5/2 */ + my_bool mirrored; /* Y or N */ + char *unicode_1_0_name; + char *iso10646_comment_field; + uint uppercase_mapping; + uint lowercase_mapping; + uint titlecase_mapping; + + int mysql_ctype; /* ctype in MySQL format */ + +} MY_UNIDATA_CHAR; + + +typedef struct +{ + int maxchar; + int debug; + int ctype; + int decomp; + const char *fname; + const char *varname; +} MY_UNIDATA_PARAM; + - if (ac > 1 && av[1] && !(f= fopen(av[1],"r"))) + +static void +unidata_param_init(MY_UNIDATA_PARAM *p) +{ + p->maxchar= MAX_CHAR; + p->debug= 0; + p->ctype= 1; + p->decomp= 1; + p->fname= NULL; + p->varname= ""; +} + + +static void +load_unidata(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *chr) +{ + char str[1024]; + FILE *f= prm->fname ? fopen(prm->fname, "r") : stdin; + if (!f) { - fprintf(stderr, "Can't open file %s\n", av[1]); + fprintf(stderr, "Can't open file %s\n", prm->fname); exit(1); } - bzero(&ctypea,sizeof(ctypea)); - bzero(&uctype, sizeof(uctype)); - - printf("/*\n"); - printf(" Unicode ctype data\n"); - printf(" Generated from %s\n", av[1] ? 
av[1] : "stdin"); - printf("*/\n"); - - while(fgets(str, sizeof(str), f)) + + while (fgets(str, sizeof(str), f)) { - size_t n= 0, code= 0; - char *s,*e; - int ctype= 0; - - for(s= str; s; ) + size_t n; + char *s, *e; + MY_UNIDATA_CHAR ch; + bzero(&ch, sizeof(ch)); + + for(n= 0, s= str; s; n++) { - char *end; - char tok[1024]=""; - e=strchr(s,';'); - if(e) + char *end, tok[1024]= ""; + + if((e= strchr(s, ';'))) { - strncpy(tok,s,(unsigned int)(e-s)); - tok[e-s]=0; + strncpy(tok, s, (unsigned int) (e - s)); + tok[e - s]= 0; } else { - strcpy(tok,s); + strcpy(tok, s); } - - end=tok+strlen(tok); - + + end= tok + strlen(tok); + switch(n) { - case 0: code= strtol(tok,&end,16);break; - case 2: ctype= ctypestr2num(tok);break; + case 0: ch.code= strtol(tok, &end, 16); break; + case 1: break; /* Character name */ + case 2: /* General category */ + ch.general_category[0]= tok[0]; + ch.general_category[1]= tok[1]; + ch.general_category[2]= '\0'; + ch.mysql_ctype= ctypestr2num(tok); + break; + + case 3: /* Canonical Combining Class */ + ch.combining_class= atoi(tok); + /* + if (ch.combining_class) + printf("YYY[%04X]=%d\n", ch.code, ch.combining_class); + */ + break; + case 4: break; /* Bidirectional Category */ + case 5: /* Character Decomposition Mapping */ + if (*tok != '<') + { + size_t i; + char *dec, *endptr; + for (dec= strtok_r(tok, " \t", &endptr), i= 0; + dec; + dec= strtok_r(NULL, " \t", &endptr), i++) + { + if (i >= MAX_DECOMPOSITION_LENGTH) + { + fprintf(stderr, "Decomposition length is too long for character %04X\n", ch.code); + exit(1); + } + ch.decomposition_mapping[i]= strtol(dec, NULL, 16); + } + } + break; + + case 6: /* Decimal digit value */ + ch.decimal_digit_value= atoi(tok); + break; + + case 7: /* Digit value */ + ch.digit_value= atoi(tok); + break; + + case 8: /* Numeric value */ + break; + + case 9: break; /* Mirrored */ + case 10: break; /* Unicode 1.0 Name */ + case 11: break; /* 10646 comment field */ + case 12: break; /* Uppercase */ + case 13: 
break; /* Lowercase */ + case 14: break; /* Titlecase */ } - - n++; - if(e) s=e+1; - else s=e; - } - if(code<=0xFFFF) - { - ctypea[code]= ctype; + s= e ? e + 1 : e; } + if(ch.code <= prm->maxchar) + chr[ch.code]= ch; } - +} + + +static void +unidata_char_set_cjk(MY_UNIDATA_CHAR *unidata, int max_char, int cur_char) +{ + if (cur_char < max_char) + { + MY_UNIDATA_CHAR *ch= &unidata[cur_char]; + ch->mysql_ctype= _MY_L | _MY_U; + strcpy(ch->general_category, "Lo"); + } +} + + +static void +fill_implicit_ctype(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *unidata) +{ + int i; /* Fill digits */ for (i= '0'; i <= '9'; i++) - ctypea[i]= _MY_NMR; - + unidata[i].mysql_ctype= _MY_NMR; + /* Fill hex digits */ for (i= 'a'; i <= 'z'; i++) - ctypea[i]|= _MY_X; + unidata[i].mysql_ctype|= _MY_X; for (i= 'A'; i <= 'Z'; i++) - ctypea[i]|= _MY_X; - - + unidata[i].mysql_ctype|= _MY_X; + /* Fill ideographs */ - /* CJK Ideographs Extension A (U+3400 - U+4DB5) */ - for(i=0x3400;i<=0x4DB5;i++) - { - ctypea[i]= _MY_L | _MY_U; - } - + for(i= 0x3400; i <= 0x4DB5; i++) + unidata_char_set_cjk(unidata, prm->maxchar, i); + /* CJK Ideographs (U+4E00 - U+9FA5) */ - for(i=0x4E00;i<=0x9FA5;i++){ - ctypea[i]= _MY_L | _MY_U; - } - + for(i= 0x4E00; i <= 0x9FA5; i++) /* 9FCB in 5.2.0 */ + unidata_char_set_cjk(unidata, prm->maxchar, i); + /* Hangul Syllables (U+AC00 - U+D7A3) */ - for(i=0xAC00;i<=0xD7A3;i++) + for(i= 0xAC00; i <= 0xD7A3; i++) + unidata_char_set_cjk(unidata, prm->maxchar, i); + + /* + 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;; + 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;; + */ + for (i= 0x20000; i <= 0x2A6D6; i++) + unidata_char_set_cjk(unidata, prm->maxchar, i); + + /* + 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;; + 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;; + */ + for (i= 0x2A700; i <= 0x2B734; i++) + unidata_char_set_cjk(unidata, prm->maxchar, i); + + + /* + TODO: + D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;; + DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;; + DB80;<Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;; + DBFF;<Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;; + DC00;<Low Surrogate, First>;Cs;0;L;;;;;N;;;;; + DFFF;<Low Surrogate, Last>;Cs;0;L;;;;;N;;;;; + + E000;<Private Use, First>;Co;0;L;;;;;N;;;;; + 
F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;; + F0000;<Plane 15 Private Use, First>;Co;0;L;;;;;N;;;;; + FFFFD;<Plane 15 Private Use, Last>;Co;0;L;;;;;N;;;;; + 100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;; + 10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;0 + */ +} + + +/* + Check if ctype for the entire page consisting of "nchars" + characters is the same. + Return -1 otherwise. +*/ +static int +page_ctype(MY_UNIDATA_CHAR *data, size_t nchars) +{ + size_t i; + for (i= 1; i < nchars; i++) { - ctypea[i]= _MY_L | _MY_U; + if (data[i].mysql_ctype != data->mysql_ctype) + return -1; } - - - /* Calc plane parameters */ - for(plane=0;plane<256;plane++) + return data->mysql_ctype; +} + + +static void +dump_ctype(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *unidata) +{ + int page, max_page= (prm->maxchar + 255) / 256; + + printf("/*\n"); + printf(" Unicode ctype data\n"); + printf(" Generated from %s\n", prm->fname ? prm->fname : "stdin"); + printf("*/\n"); + + /* Dump planes with mixed ctype */ + for(page= 0; page < max_page; page++) { - size_t character; - uctype[plane].ctype= ctypea+plane*256; - - uctype[plane].pctype= uctype[plane].ctype[0]; - for(character=1;character<256;character++) + if (page_ctype(unidata + page * 256, 256) < 0) { - if (uctype[plane].ctype[character] != uctype[plane].pctype) + size_t charnum, num; + printf("static unsigned char uctype%s_page%02X[256]=\n{\n", + prm->varname, page); + for(num= 0, charnum=0; charnum < 256; charnum++) { - uctype[plane].pctype= 0; /* Mixed plane */ - break; + printf(" %2d%s", unidata[page * 256 + charnum].mysql_ctype, + charnum < 255 ? 
"," : ""); + if(++num == 16) + { + printf("\n"); + num= 0; + } } + printf("};\n\n"); } - if (character==256) /* All the same, no needs to dump whole plane */ - uctype[plane].ctype= NULL; } - - /* Dump mixed planes */ - for(plane=0;plane<256;plane++) + + /* Dump ctype page index */ + printf("MY_UNI_CTYPE my_uni_ctype%s[%d]={\n", prm->varname, max_page); + for(page= 0; page < max_page; page++) { - if(uctype[plane].ctype) + char page_name[128]="NULL"; + int ctype; + if ((ctype= page_ctype(unidata + page * 256, 256)) < 0) { - int charnum=0; - int num=0; - - printf("static unsigned char uctype_page%02X[256]=\n{\n",plane); - - for(charnum=0;charnum<256;charnum++) + sprintf(page_name,"uctype%s_page%02X", prm->varname, page); + ctype= 0; + } + printf("\t{%d,%s}%s\n", ctype, page_name, page < max_page - 1 ? "," : ""); + } + printf("};\n\n\n"); +} + + +/* +static int +decomposition_length(MY_UNIDATA_CHAR *ch) +{ + if (ch->decomposition_mapping[1]) + return 2; + if (ch->decomposition_mapping[0]) + return 1; + return 0; +} +*/ + +static void +dump_decomposition_page(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *unidata, + uint pageno, uint nchars) +{ + uint i, ofs= pageno * 256; + printf("static MY_UNI_DECOMPOSITION decomp%s_p%02X[256]= {\n", + prm->varname, pageno); + for (i= 0; i < nchars; i++) + { + MY_UNIDATA_CHAR *ch= &unidata[ofs + i]; + + printf("/* %04X */ {0x%04X,0x%04X},", + ofs + i, ch->decomposition_mapping[0], ch->decomposition_mapping[1]); + + if (ch->decomposition_mapping[0]) + printf(" %s/* [%s-%s][%d-%d] */", + ch->decomposition_mapping[0] < 0x10000 ? 
" " : "", + unidata[ch->decomposition_mapping[0]].general_category, + unidata[ch->decomposition_mapping[1]].general_category, + unidata[ch->decomposition_mapping[0]].combining_class, + unidata[ch->decomposition_mapping[1]].combining_class); + printf("\n"); + } + printf("};\n\n\n"); +} + + +static size_t +calc_decompositions(MY_UNIDATA_CHAR *unidata, size_t nchars) +{ + size_t i, n; + for (n= i= 0; i < nchars; i++) + { + if (unidata[i].decomposition_mapping[0]) + n++; + } + return n; +} + + +static void +dump_decomposition(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *unidata) +{ + int i, npages= (prm->maxchar + 255) / 256; + + printf("/*\n"); + printf(" Unicode canonical decomposition data\n"); + printf(" Generated from %s\n", prm->fname ? prm->fname : "stdin"); + printf("*/\n"); + + /* Dump pages */ + for (i= 0; i < npages; i++) + { + MY_UNIDATA_CHAR *page= unidata + i * 256; + if (calc_decompositions(page, 256)) + dump_decomposition_page(prm, unidata, i, 256); + } + + /* Dump decompositions */ + printf("static MY_UNI_DECOMPOSITION *my_uni_decomp%s[%d]=\n{\n", + prm->varname, npages); + for (i= 0; i < npages; i++) + { + MY_UNIDATA_CHAR *page= unidata + i * 256; + if (calc_decompositions(page, 256)) + printf("decom%s_p%02X,", prm->varname, i); + else + printf("NULL,"); + if ((i % 8) == 7) + printf("\n"); + } + printf("};\n"); +} + + +static void +usage(FILE *f, int rc) +{ + exit(rc); +} + + +static int +get_int_option(const char *str, const char *name, int *num) +{ + size_t namelen= strlen(name); + if (!strncmp(str, name, namelen)) + { + const char *val= str + namelen; + if (val[0] == '0' && val[1] == 'x') + { + *num= strtol(val, NULL, 16); + } + else + { + *num= atoi(val); + if (*num == 0 && *val !='0') { - int cod; - - cod=(plane<<8)+charnum; - printf(" %2d%s",uctype[plane].ctype[charnum],charnum<255?",":""); - - num++; - if(num==16) - { - printf("\n"); - num=0; - } + fprintf(stderr, "\nBad numeric option value: %s\n\n", str); + usage(stderr, 1); } - printf("};\n\n"); 
} + return 1; } - - - /* Dump plane index */ - printf("MY_UNI_CTYPE my_uni_ctype[256]={\n"); - for(plane=0;plane<256;plane++) - { - char plane_name[128]="NULL"; - if(uctype[plane].ctype){ - sprintf(plane_name,"uctype_page%02X",plane); + return 0; +} + + +static int +get_const_str_option(const char *str, const char *name, const char **val) +{ + size_t namelen= strlen(name); + if (!strncmp(str, name, namelen)) + { + *val= str + namelen; + return 1; + } + return 0; +} + + +static void +process_options(MY_UNIDATA_PARAM *prm, int ac, char **av) +{ + int i; + unidata_param_init(prm); + for (i= 1; i < ac ; i++) + { + /* printf("[%d]=%s\n", i, av[i]); */ + if (av[i][0] != '-' || av[i][1] != '-') + break; + if (!get_const_str_option(av[i], "--name=", &prm->varname) && + !get_int_option(av[i], "--maxchar=", &prm->maxchar) && + !get_int_option(av[i], "--ctype=", &prm->ctype) && + !get_int_option(av[i], "--decomp=", &prm->decomp) && + !get_int_option(av[i], "--debug=", &prm->debug)) + { + fprintf(stderr, "\nUnknown option: %s\n\n", av[i]); + usage(stderr, 1); } - printf("\t{%d,%s}%s\n",uctype[plane].pctype,plane_name,plane<255?",":""); } - printf("};\n"); - + prm->fname= av[i]; +} + + +int main(int ac, char ** av) +{ + MY_UNIDATA_PARAM prm; + static MY_UNIDATA_CHAR unidata[MAX_CHAR + 1]; + + process_options(&prm, ac, av); + bzero(unidata, sizeof(unidata)); + fill_implicit_ctype(&prm, unidata); + load_unidata(&prm, unidata); + + if (prm.ctype) + dump_ctype(&prm, unidata); + + if (prm.decomp) + dump_decomposition(&prm, unidata); + return 0; } No bundle (reason: useless for push emails).