MySQL Lists are EOL. Please join:

List:Commits« Previous MessageNext Message »
From:Hakan Kuecuekyilmaz Date:December 5 2008 1:29pm
Subject:bzr commit into mysql-6.0-falcon branch (hky:2927)
View as plain text  
#At file:///home/hakan/work/mysql/mysql-6.0-falcon/ based on revid:hky@stripped

 2927 Hakan Kuecuekyilmaz	2008-12-05
      Extracted Unicode to UTF8 converter into function transform()
      for better readability and reuse.
         * Now we convert uppercase and lowercase relations
         * Run time is higher now for loading the initial data.
modified:
  mysql-test/suite/falcon/t/falcon_unicode.test

=== modified file 'mysql-test/suite/falcon/t/falcon_unicode.test'
--- a/mysql-test/suite/falcon/t/falcon_unicode.test	2008-12-04 15:45:11 +0000
+++ b/mysql-test/suite/falcon/t/falcon_unicode.test	2008-12-05 13:29:40 +0000
@@ -48,6 +48,7 @@ let $other_engine = 'MyISAM';
 eval SET @@storage_engine = $engine;
 
 --disable_warnings
+DROP FUNCTION IF EXISTS transform;
 DROP TABLE IF EXISTS t0;
 DROP TABLE IF EXISTS t1;
 DROP TABLE IF EXISTS t2;
@@ -63,7 +64,9 @@ CREATE TABLE t0 (
   id int auto_increment not null primary key,
   a varchar(1) character set utf8,
   category  varchar(100),
-  comment   varchar(80)
+  comment   varchar(80),
+  uppercase varchar(1) character set utf8,
+  lowercase varchar(1) character set utf8
 ) Engine $other_engine;
 
 # Falcon test table without any keys.
@@ -104,55 +107,90 @@ CREATE TABLE t_err (
 ) Engine $other_engine;
 
 #
-# Transform Unicode code points to UTF-8 and insert
-# the characters into reference table. The transformation
-# is done in place, so that the Unicode code point file
-# can be updated in an easy fashion.
-#
 # Unicode to UTF-8 transformation algorithm is taken from
 # http://scripts.sil.org/cms/scripts/page.php?site_id=nrsi&item_id=IWS-AppendixA
 #
-SET @one_byte =      CONV('007F', 16, 10);
-SET @two_byte =      CONV('07FF', 16, 10);
-SET @two_byte_low =  CONV('E000', 16, 10);
-SET @two_byte_high = CONV('FFFF', 16, 10);
-SET @three_byte =    CONV('D7FF', 16, 10);
+DELIMITER //;
+
+DELIMITER //
+CREATE FUNCTION transform(a varchar(8))
+  RETURNS varchar(8)
+  DETERMINISTIC
+
+  BEGIN
+    # Surrogate pair boundaries, which are going to be ignored.
+    DECLARE surrogate1 varchar(8) DEFAULT CONV('D800', 16, 10);
+    DECLARE surrogate2 varchar(8) DEFAULT CONV('DB7F', 16, 10);
+    DECLARE surrogate3 varchar(8) DEFAULT CONV('DB80', 16, 10);
+    DECLARE surrogate4 varchar(8) DEFAULT CONV('DBFF', 16, 10);
+    DECLARE surrogate5 varchar(8) DEFAULT CONV('DC00', 16, 10);
+    DECLARE surrogate6 varchar(8) DEFAULT CONV('DFFF', 16, 10);
+
+    DECLARE one_byte int DEFAULT        CONV('007F', 16, 10);
+    DECLARE two_byte int DEFAULT        CONV('07FF', 16, 10);
+    DECLARE three_byte int DEFAULT      CONV('D7FF', 16, 10);
+    DECLARE three_byte_low int DEFAULT  CONV('E000', 16, 10);
+    DECLARE three_byte_high int DEFAULT CONV('FFFF', 16, 10);
+
+    DECLARE d int;
+    SET d = CONV(a, 16, 10);
+
+    # In case of missing uppercase or lowercase code points
+    # we return an empty string.
+    IF length(a) = 0 THEN
+      return '';
+    END IF;
+
+    # Also skip the six surrogate boundaries.
+    IF d = surrogate1 || d = surrogate2 || d = surrogate3
+       || d = surrogate4 || d = surrogate5 || d = surrogate6 THEN
+      return '';
+    END IF;
+
+    IF d <= one_byte THEN
+      return CONV(a, 16, 16);
+    ELSEIF d <= two_byte THEN
+      return
+        CONCAT(CONV((d DIV 64) + 192, 10, 16),
+               CONV((d MOD 64) + 128, 10, 16));
+    ELSEIF d <= three_byte
+           || (three_byte_low <= d && d <= three_byte_high) THEN
+      return
+        CONCAT(CONV((d DIV 4096) + 224, 10, 16),
+               CONV(((d MOD 4096) div 64) + 128, 10, 16),
+               CONV((d MOD 64) + 128, 10, 16));
+    ELSE
+      return
+        CONCAT(CONV((d DIV 262144) + 240, 10, 16),
+               CONV(((d MOD 262144) DIV 4096) + 128, 10, 16),
+               CONV(((d MOD 4096) DIV 64) + 128, 10, 16),
+               CONV((d MOD 64) + 128, 10, 16));
+    END IF;
+
+  END
+//
+DELIMITER ;
+
+DELIMITER ;//
 
 #
-# @todo How to substitute CONV(@var1, 16, 10) with a
-# variable for easier reading?
+# Transform Unicode code points to UTF-8 and insert
+# the characters into reference table. The transformation
+# is done in place, so that the Unicode code point file
+# can be updated in an easy fashion.
 #
 LOAD DATA LOCAL INFILE 'include/UnicodeData.txt' INTO TABLE t0
   FIELDS TERMINATED BY ';'
   OPTIONALLY ENCLOSED BY ''
   LINES TERMINATED BY '\n'
   (@var1, @category, @dummy3, @dummy4, @dummy5, @dummy6, @dummy7,
-   @dummy8, @dummy9, @dummy10, @comment, @dummy12, @dummy13, @dummy14, @dummy15)
-SET a = (UNHEX((
-  SELECT IF (CONV(@var1, 16, 10) + 0 <= @one_byte + 0,
-             # One byte UTF-8, leave it as it is. The hex2hex conv() is done
-             # to get rid of leading 00, which are distracting MySQL a bit.
-             CONV(@var1, 16, 16),
-             IF (CONV(@var1, 16, 10) + 0 <= @two_byte + 0,
-                 # Two byte UTF-8.
-                 CONCAT(CONV((CONV(@var1, 16, 10) DIV 64) + 192, 10, 16),
-                        CONV((CONV(@var1, 16, 10) MOD 64) + 128, 10, 16)),
-                 IF (CONV(@var1, 16, 10) + 0 <= @three_byte + 0
-                     || (@two_byte_low + 0 <= CONV(@var1, 16, 10) + 0 && CONV(@var1, 16, 10) + 0 <= @two_byte_high + 0),
-                     # Three byte UTF-8.
-                     CONCAT(CONV((CONV(@var1, 16, 10) DIV 4096) + 224, 10, 16),
-                            CONV(((CONV(@var1, 16, 10) MOD 4096) div 64) + 128, 10, 16),
-                            CONV((CONV(@var1, 16, 10) MOD 64) + 128, 10, 16)),
-                     # Four byte UTF-8.
-                     CONCAT(CONV((CONV(@var1, 16, 10) DIV 262144) + 240, 10, 16),
-                            CONV(((CONV(@var1, 16, 10) MOD 262144) DIV 4096) + 128, 10, 16),
-                            CONV(((CONV(@var1, 16, 10) MOD 4096) DIV 64) + 128, 10, 16),
-                            CONV((CONV(@var1, 16, 10) MOD 64) + 128, 10, 16))
-                 )
-             )
-         )
-  ))
-), category = @category, comment = @comment;
+   @dummy8, @dummy9, @dummy10, @comment, @dummy12, @uppercase, @lowercase, @dummy15)
+# @todo Skip empty strings (surrogates).
+SET a = (UNHEX((SELECT transform(@var1)))),
+    category = @category,
+    comment = @comment,
+    uppercase = (UNHEX((SELECT transform(@uppercase)))),
+    lowercase = (UNHEX((SELECT transform(@lowercase))));
 
 # ----------------------------------------------------- #
 # --- Test                                          --- #
@@ -177,6 +215,7 @@ SELECT count(*) FROM t_err;
 # ----------------------------------------------------- #
 # --- Final cleanup                                 --- #
 # ----------------------------------------------------- #
+DROP FUNCTION transform;
 DROP TABLE t0;
 DROP TABLE t1;
 DROP TABLE t2;

Thread
bzr commit into mysql-6.0-falcon branch (hky:2927) Hakan Kuecuekyilmaz5 Dec