#At file:///home/bar/mysql-bzr/mysql-trunk.w5624/ based on revid:mattias.jonsson@strippedvfisxc3ic
3699 Alexander Barkov 2011-02-22
WL#5624 Collation customization improvements
Extension for the original patch to handle "bulk" shift.
@ mysql-test/r/ctype_ldml.result
@ mysql-test/std_data/Index.xml
@ mysql-test/t/ctype_ldml.test
Adding tests
@ strings/ctype-uca.c
@ strings/ctype.c
shift-after-method="expand" collation option.
modified:
mysql-test/r/ctype_ldml.result
mysql-test/std_data/Index.xml
mysql-test/t/ctype_ldml.test
strings/ctype-uca.c
strings/ctype.c
=== modified file 'mysql-test/r/ctype_ldml.result'
--- a/mysql-test/r/ctype_ldml.result 2010-12-20 10:28:06 +0000
+++ b/mysql-test/r/ctype_ldml.result 2011-02-22 13:25:05 +0000
@@ -468,6 +468,7 @@ utf8_5624_4 utf8 357 8
ucs2_test_ci ucs2 358 8
ucs2_vn_ci ucs2 359 8
ucs2_5624_1 ucs2 360 8
+utf8_5624_5 utf8 368 8
utf32_test_ci utf32 391 8
utf8_maxuserid_ci utf8 2047 8
show collation like '%test%';
@@ -1053,5 +1054,82 @@ wa GROUP_CONCAT(HEX(CONVERT(a USING ucs2
15D4 09B909CD
DROP TABLE t1;
#
+# WL#5624, shift after, using expansion
+#
+SET NAMES utf8 COLLATE utf8_5624_5;
+CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS a LIMIT 0;
+INSERT INTO t1 VALUES ('0'),('1'),('0z'),(_ucs2 0x0030FF9D);
+INSERT INTO t1 VALUES ('a'),('b'),('c'),('d'),('e'),('f'),('g'),('h'),('i');
+INSERT INTO t1 VALUES ('j'),('k'),('l'),('m'),('n'),('o'),('p'),('q'),('r');
+INSERT INTO t1 VALUES ('s'),('t'),('u'),('v'),('w'),('x'),('y'),('z');
+INSERT INTO t1 VALUES ('aa'),('aaa');
+INSERT INTO t1 VALUES ('A'),('B'),('C'),('D'),('E'),('F'),('G'),('H'),('I');
+INSERT INTO t1 VALUES ('J'),('K'),('L'),('M'),('N'),('O'),('P'),('Q'),('R');
+INSERT INTO t1 VALUES ('S'),('T'),('U'),('V'),('W'),('X'),('Y'),('Z');
+INSERT INTO t1 VALUES ('AA'),('AAA');
+SELECT a, HEX(WEIGHT_STRING(a)) FROM t1 ORDER BY a, LENGTH(a), BINARY(a);
+a HEX(WEIGHT_STRING(a))
+0 0E29
+0z 0E290E292357
+0ン 0E291E81
+a 0E29233E
+b 0E29233F
+c 0E292340
+d 0E292341
+e 0E292342
+f 0E292343
+g 0E292344
+h 0E292345
+i 0E292346
+j 0E292347
+k 0E292348
+l 0E292349
+m 0E29234A
+n 0E29234B
+o 0E29234C
+p 0E29234D
+q 0E29234E
+r 0E29234F
+s 0E292350
+t 0E292351
+u 0E292352
+v 0E292353
+w 0E292354
+x 0E292355
+y 0E292356
+z 0E292357
+aa 0E292358
+aaa 0E292359
+A 0E29333E
+B 0E29333F
+C 0E293340
+D 0E293341
+E 0E293342
+F 0E293343
+G 0E293344
+H 0E293345
+I 0E293346
+J 0E293347
+K 0E293348
+L 0E293349
+M 0E29334A
+N 0E29334B
+O 0E29334C
+P 0E29334D
+Q 0E29334E
+R 0E29334F
+S 0E293350
+T 0E293351
+U 0E293352
+V 0E293353
+W 0E293354
+X 0E293355
+Y 0E293356
+Z 0E293357
+AA 0E293358
+AAA 0E293359
+1 0E2A
+DROP TABLE t1;
+#
# End of WL#5624
#
=== modified file 'mysql-test/std_data/Index.xml'
--- a/mysql-test/std_data/Index.xml 2010-12-20 10:28:06 +0000
+++ b/mysql-test/std_data/Index.xml 2011-02-22 13:25:05 +0000
@@ -105,6 +105,22 @@
</rules>
</collation>
+ <!-- shift after using expansion -->
+ <collation name="utf8_5624_5" id="368" shift-after-method="expand">
+ <rules>
+ <!--
+ Put small basic Latin letters between 0 and 1.
+ Simple shift method would not work, because there is no
+ weight space between 0 and 1 in DUCET.
+ Also, to test it works with contractions, put some after 'z'.
+ -->
+ <reset>0</reset>
+ <pc>abcdefghijklmnopqrstuvwxyz</pc><p>aa</p><p>aaa</p>
+ <reset before="primary">1</reset>
+ <pc>ABCDEFGHIJKLMNOPQRSTUVWXYZ</pc><p>AA</p><p>AAA</p>
+ </rules>
+ </collation>
+
<collation name="utf8_hugeid_ci" id="2047000000">
<rules>
<reset>a</reset>
=== modified file 'mysql-test/t/ctype_ldml.test'
--- a/mysql-test/t/ctype_ldml.test 2010-12-20 10:28:06 +0000
+++ b/mysql-test/t/ctype_ldml.test 2011-02-22 13:25:05 +0000
@@ -347,5 +347,24 @@ FROM t1 GROUP BY a ORDER BY a;
DROP TABLE t1;
--echo #
+--echo # WL#5624, shift after, using expansion
+--echo #
+SET NAMES utf8 COLLATE utf8_5624_5;
+CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS a LIMIT 0;
+INSERT INTO t1 VALUES ('0'),('1'),('0z'),(_ucs2 0x0030FF9D);
+INSERT INTO t1 VALUES ('a'),('b'),('c'),('d'),('e'),('f'),('g'),('h'),('i');
+INSERT INTO t1 VALUES ('j'),('k'),('l'),('m'),('n'),('o'),('p'),('q'),('r');
+INSERT INTO t1 VALUES ('s'),('t'),('u'),('v'),('w'),('x'),('y'),('z');
+INSERT INTO t1 VALUES ('aa'),('aaa');
+INSERT INTO t1 VALUES ('A'),('B'),('C'),('D'),('E'),('F'),('G'),('H'),('I');
+INSERT INTO t1 VALUES ('J'),('K'),('L'),('M'),('N'),('O'),('P'),('Q'),('R');
+INSERT INTO t1 VALUES ('S'),('T'),('U'),('V'),('W'),('X'),('Y'),('Z');
+INSERT INTO t1 VALUES ('AA'),('AAA');
+
+SELECT a, HEX(WEIGHT_STRING(a)) FROM t1 ORDER BY a, LENGTH(a), BINARY(a);
+DROP TABLE t1;
+
+
+--echo #
--echo # End of WL#5624
--echo #
=== modified file 'strings/ctype-uca.c'
--- a/strings/ctype-uca.c 2011-01-19 13:35:54 +0000
+++ b/strings/ctype-uca.c 2011-02-22 13:25:05 +0000
@@ -20929,6 +20929,18 @@ my_coll_rule_reset(MY_COLL_RULE *r)
}
+/*
+ Shift methods:
+ Simple: "&B < C" : weight('C') = weight('B') + 1
+ Expand: weight('C') = { weight('B'), weight(last_non_ignorable) + 1 }
+*/
+typedef enum
+{
+ my_shift_method_simple= 0,
+ my_shift_method_expand
+} my_coll_shift_method;
+
+
typedef struct my_coll_rules_st
{
uint version; /* Unicode version, e.g. 400 or 520 */
@@ -20937,6 +20949,7 @@ typedef struct my_coll_rules_st
size_t mrules; /* Number of allocated rules */
MY_COLL_RULE *rule; /* Rule array */
MY_CHARSET_LOADER *loader;
+ my_coll_shift_method shift_after_method;
} MY_COLL_RULES;
@@ -21204,6 +21217,14 @@ my_coll_parser_scan_setting(MY_COLL_RULE
rules->version= 520;
rules->uca= &my_uca_v520;
}
+ else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[shift-after-method expand]")))
+ {
+ rules->shift_after_method= my_shift_method_expand;
+ }
+ else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[shift-after-method simple]")))
+ {
+ rules->shift_after_method= my_shift_method_simple;
+ }
else
{
return 0;
@@ -21415,7 +21436,8 @@ my_coll_parser_scan_reset_sequence(MY_CO
return 0;
}
- if (p->rule.before_level == 1) /* Apply "before primary" option */
+ if (p->rules->shift_after_method == my_shift_method_expand ||
+ p->rule.before_level == 1) /* Apply "before primary" option */
{
/*
Suppose we have this rule: &B[before primary] < C
@@ -21435,6 +21457,10 @@ my_coll_parser_scan_reset_sequence(MY_CO
We'll compose weight for C as: [BBBB-1][MMMM+1]
where [MMMM] is weight for "last_non_ignorable".
+
+ We also do the same trick for "reset after" if the collation
+ option says so. E.g. for the rules "&B < C", weight for
+ C will be calculated as: [BBBB][MMMM+1]
At this point we only need to store codepoints
'B' and 'last_non_ignorable'. Actual weights for 'C'
@@ -21924,7 +21950,27 @@ create_tailoring(CHARSET_INFO *cs, MY_CH
if (r->before_level == 1) /* Apply "&[before primary]" */
{
if (nweights >= 2)
+ {
to[nweights - 2]--; /* Reset before */
+ if (rules.shift_after_method == my_shift_method_expand)
+ {
+ /*
+ Special case. Don't let characters shifted after X
+ and before next(X) intermix with each other.
+
+ For example:
+ "[shift-after-method expand] &0 < a &[before primary]1 < A".
+ I.e. we reorder 'a' after '0', and then 'A' before '1'.
+ 'a' must be sorted before 'A'.
+
+ Note, there are no real collations in CLDR which shift both
+ after and before two neighbouring characters. We need this
+ just in case. Reserving 4096 (0x1000) weights for such
+ cases is more than enough.
+ */
+ to[nweights - 1]+= 0x1000;
+ }
+ }
else
{
my_snprintf(loader->error, sizeof(loader->error),
=== modified file 'strings/ctype.c'
--- a/strings/ctype.c 2011-02-15 11:30:56 +0000
+++ b/strings/ctype.c 2011-02-22 13:25:05 +0000
@@ -88,6 +88,7 @@ struct my_cs_file_section_st
#define _CS_UCA_VERSION 100
#define _CS_CL_SUPPRESS_CONTRACTIONS 101
#define _CS_CL_OPTIMIZE 102
+#define _CS_CL_SHIFT_AFTER_METHOD 103
/* Collation Settings */
@@ -187,6 +188,7 @@ static struct my_cs_file_section_st sec[
{_CS_UCA_VERSION, "charsets/charset/collation/version"},
{_CS_CL_SUPPRESS_CONTRACTIONS, "charsets/charset/collation/suppress_contractions"},
{_CS_CL_OPTIMIZE, "charsets/charset/collation/optimize"},
+ {_CS_CL_SHIFT_AFTER_METHOD, "charsets/charset/collation/shift-after-method"},
/* Collation Settings */
{_CS_ST_SETTINGS, "charsets/charset/collation/settings"},
@@ -646,6 +648,10 @@ static int cs_value(MY_XML_PARSER *st,co
rc= tailoring_append(st, "[optimize %.*s]", len, attr);
break;
+ case _CS_CL_SHIFT_AFTER_METHOD:
+ rc= tailoring_append(st, "[shift-after-method %.*s]", len, attr);
+ break;
+
/* Collation Settings */
case _CS_ST_STRENGTH:
/* 1, 2, 3, 4, 5, or primary, secondary, tertiary, quaternary, identical */
Attachment: [text/bzr-bundle] bzr/alexander.barkov@oracle.com-20110222132505-vb6dturtjbv5qvjw.bundle
| Thread |
|---|
| • bzr commit into mysql-trunk branch (alexander.barkov:3699) WL#5624 | Alexander Barkov | 22 Feb |