List: Internals « Previous Message | Next Message »
From: mikael  Date: August 24 2005 3:03pm
Subject: bk commit into 5.1 tree (mronstrom:1.1896)
View as plain text  
Below is the list of changes that have just been committed into a local
5.1 repository of mikron. When mikron does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet
  1.1896 05/08/24 17:03:00 mronstrom@stripped +1 -0
  WL 2602, 2603, 2604: Partition Management
  More reorganisation of the code. The high-level code parts are now more
  or less finalised. Still left to do: a few methods to design, most of the
  handler code, and the partition state handling in the syntax part.

  sql/sql_table.cc
    1.267 05/08/24 17:02:47 mronstrom@stripped +169 -118
    More reorganisation of the code. The high-level code parts are now more
    or less finalised. Still left to do: a few methods to design, most of the
    handler code, and the partition state handling in the syntax part.

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	mronstrom
# Host:	mikael-ronstr-ms-dator.local
# Root:	/Users/mikron/wl2602

--- 1.266/sql/sql_table.cc	2005-08-23 20:21:42 +02:00
+++ 1.267/sql/sql_table.cc	2005-08-24 17:02:47 +02:00
@@ -84,22 +84,36 @@
   HA_CREATE_INFO *create_info;
   List<create_field> *create_list;
   List<Key> *key_list;
-  uint db_options;
   TABLE *table;
   KEY *key_info_buffer;
-  uint key_count;
   char *db;
   char *table_name;
+  enum thr_lock_type old_lock_type;
+  uint key_count;
+  uint db_options;
 };
 
-static bool mysql_add_partitions(TABLE *table, HA_CREATE_INFO *create_info,
-                                 const char *db, const char *table_name)
+/*
+  SYNOPSIS
+    mysql_add_partitions()
+    create_info                Create information used to create partitions
+    db                         Database name
+    table_name                 Table name
+    All parameters passed through the write_frm_type object
+  RETURN VALUES
+    TRUE                          Failure
+    FALSE                         Success
+  DESCRIPTION
+    Request handler to add partitions as set in states of the partition
+*/
+
+static bool mysql_add_partitions(write_frm_type *wft)
 {
   char path[FN_REFLEN+1];
   DBUG_ENTER("mysql_add_partitions");
 
-  build_table_path(path, sizeof(path), db, table_name, "");
-  DBUG_RETURN(table->file->add_partitions(create_info, path));
+  build_table_path(path, sizeof(path), wft->db, wft->table_name, "");
+  DBUG_RETURN(wft->table->file->add_partitions(wft->create_info, path));
 }
 
 static bool mysql_write_frm(write_frm_type *wft,
@@ -176,6 +190,7 @@
     db                            Database name
     table_name                    Table name
     old_lock_level                Old lock level
+    All parameters passed through the write_frm_type object
   RETURN VALUES
     TRUE                          Failure
     FALSE                         Success
@@ -186,21 +201,19 @@
     is ongoing.
 */
 
-static bool abort_and_upgrade_lock(THD *thd, TABLE *table, const char *db,
-                                   const char *table_name,
-                                   thr_lock_type *old_lock_level)
+static bool abort_and_upgrade_lock(write_frm_type *wft)
 {
   uint flags= RTFC_WAIT_OTHER_THREAD_FLAG | RTFC_CHECK_KILLED_FLAG;
   int error= FALSE;
   DBUG_ENTER("abort_and_upgrade_locks");
 
-  *old_lock_level= table->reginfo.lock_type;
+  wft->old_lock_type= wft->table->reginfo.lock_type;
   VOID(pthread_mutex_lock(&LOCK_open));
-  mysql_lock_abort(thd, table);
-  VOID(remove_table_from_cache(thd, db, table_name, flags));
-  if (thd->killed)
+  mysql_lock_abort(wft->thd, wft->table);
+  VOID(remove_table_from_cache(wft->thd, wft->db, wft->table_name, flags));
+  if (wft->thd->killed)
   {
-    thd->no_warnings_for_error= 0;
+    wft->thd->no_warnings_for_error= 0;
     error= TRUE;
   }
   VOID(pthread_mutex_unlock(&LOCK_open));
@@ -210,10 +223,10 @@
 /*
   SYNOPSIS
     mysql_drop_partitions()
-    thd                         Thread object
     table                       Table object
     db                          Database name
     table_name                  Table name
+    All parameters passed through the write_frm_type object
   RETURN VALUES
     TRUE                          Failure
     FALSE                         Success
@@ -222,17 +235,16 @@
     those partitions from the list.
 */
 
-static bool mysql_drop_partitions(TABLE *table, const char *db,
-                                  const char *table_name)
+static bool mysql_drop_partitions(write_frm_type *wft)
 {
   char path[FN_REFLEN+1];
-  partition_info *part_info= table->s->part_info;
+  partition_info *part_info= wft->table->s->part_info;
   List_iterator<partition_element> part_it(part_info->partitions);
   uint i= 0, remove_count= 0;
   DBUG_ENTER("mysql_drop_partitions");
 
-  build_table_path(path, sizeof(path), db, table_name, "");
-  if (table->file->drop_partitions(path))
+  build_table_path(path, sizeof(path), wft->db, wft->table_name, "");
+  if (wft->table->file->drop_partitions(path))
   {
     DBUG_RETURN(TRUE);
   }
@@ -258,6 +270,7 @@
     db                               Database name
     table_name                       Table name
     old_lock_type                    Lock type to downgrade to
+    All parameters passed through the write_frm_type object
   RESULT VALUES
     NONE
   DESCRIPTION
@@ -269,15 +282,13 @@
     We also downgrade locks after the upgrade to WRITE_ONLY
 */
 
-void close_open_tables_and_downgrade(THD *thd, TABLE *table, const char *db,
-                                     const char *table_name,
-                                     enum thr_lock_type old_lock_type)
+static void close_open_tables_and_downgrade(write_frm_type *wft)
 {
   VOID(pthread_mutex_lock(&LOCK_open));
-  remove_table_from_cache(thd, db, table_name,
+  remove_table_from_cache(wft->thd, wft->db, wft->table_name,
                           RTFC_WAIT_OTHER_THREAD_FLAG);
   VOID(pthread_mutex_unlock(&LOCK_open));
-  mysql_lock_downgrade_write(thd, table, old_lock_type);
+  mysql_lock_downgrade_write(wft->thd, wft->table, wft->old_lock_type);
 }
 
 
@@ -288,16 +299,17 @@
     table                         Table object
     db                            Database name
     table_name                    Table name
+    All parameters passed through the write_frm_type object
   RETURN VALUES
-    NONE
+    TRUE                          Failure
+    FALSE                         Success
   DESCRIPTION
     We have changed the frm file and now we want to wait for all users of
     the old frm to complete before proceeding to ensure that no one
     remains that uses the old frm definition.
 */
 
-bool mysql_wait_completed_table(THD *thd, TABLE *table, const char *db,
-                                const char *table_name)
+static bool mysql_wait_completed_table(write_frm_type *wft)
 {
   /* TODO RONM: Write this routine */
   DBUG_ENTER("mysql_wait_completed_table");
@@ -4374,56 +4386,78 @@
   {
     /* Set-up struct used to write frm files */
     partition_info *part_info= table->s->part_info;
-    thr_lock_type old_lock_type;
-    write_frm_type wft;
-    wft.thd= thd;
-    wft.create_info= create_info;
-    wft.create_list= &create_list;
-    wft.key_list= &key_list;
-    wft.db_options= 0;
-    wft.table= table;
-    wft.key_info_buffer= wft.key_info_buffer;
-    wft.key_count= wft.key_count;
-    wft.db= db;
-    wft.table_name= table_name;
+    write_frm_type wft_obj;
+    write_frm_type *wft= &wft_obj;
+    wft->thd= thd;
+    wft->create_info= create_info;
+    wft->create_list= &create_list;
+    wft->key_list= &key_list;
+    wft->db_options= 0;
+    wft->table= table;
+    wft->key_info_buffer= 0;
+    wft->key_count= 0;
+    wft->db= db;
+    wft->table_name= table_name;
     thd->lex->part_info= part_info;
     if (online_drop_partition)
     {
-        /*
+      /*
         Now after all checks and setting state on dropped partitions we can
         start the actual dropping of the partitions.
-        1) Lock the partitions to be dropped in TL_WRITE_ONLY to ensure all
-           other accesses on the partitions are completed and no new ones
-           are started until we have changed the frm file. Other partitions
-           we can downgrade the locks to TL_WRITE_ALLOW_WRITE since they
-           are not changed in any manner.
-        2) Write the new frm file where state of dropped partitions is
-           changed to PART_IS_DROPPED
-        3) Perform the actual drop of the partitions using the handler of the
-           table.
-        4) Write a new frm file of the table where the partitions are dropped
-           from the table.
-        5) Wait for all users of the table to be completed such that we are
-           certain that all new users of the table use the new frm file.
-        6) Write binlog and return from statement (including releasing all
+
+        Drop partition is actually two things happening. The first is that
+        a lot of records are deleted. The second is that the behaviour of
+        subsequent updates and writes and deletes will change. The delete
+        part can be handled without any particular high lock level by
+        transactional engines whereas non-transactional engines need to
+        ensure that this change is done with an exclusive lock on the table.
+        The second part, the change of partitioning does however require
+        an exclusive lock to install the new partitioning as one atomic
+        operation. If this is not the case, it is possible for two
+        transactions to see the change in a different order than their
+        serialisation order. Thus we need an exclusive lock for both
+        transactional and non-transactional engines.
+
+        1) Lock the table in TL_WRITE_ONLY to ensure all other accesses to
+           the table have completed
+        2) Write the new frm file where the partitions have changed but are
+           still remaining with the state PART_TO_BE_DROPPED
+        3) Ensure that any users that have opened the table but not yet
+           reached the abort lock do so before downgrading the lock.
+        4) Drop the partitions
+        5) Write the frm file where the partition has been dropped
+        6) Wait until all accesses using the old frm file have completed
+        7) Write binlog and return from statement (including releasing all
            remaining locks).
       */
 
-      if ((abort_and_upgrade_lock(thd, table, db,
-                                  table_name, &old_lock_type)) ||
-          (mysql_write_frm(&wft, TRUE, FALSE)) ||
-          (mysql_drop_partitions(table, db, table_name)) ||
-          (mysql_write_frm(&wft, FALSE, TRUE)))
+      if ((abort_and_upgrade_lock(wft)) ||
+          (mysql_write_frm(wft, TRUE, FALSE)) ||
+          (close_open_tables_and_downgrade(wft), FALSE) || 
+          (mysql_drop_partitions(wft)) ||
+          (mysql_write_frm(wft, FALSE, TRUE)) ||
+          (mysql_wait_completed_table(wft)))
       {
         DBUG_RETURN(TRUE);
       }
-      close_open_tables_and_downgrade(thd, table,
-                                      db, table_name, old_lock_type);
     }
     else if (online_add_empty_partition)
     {
       /*
         ADD RANGE/LIST PARTITIONS
+        In this case there are no tuples removed and no tuples are added.
+        Thus the operation is merely adding a new partition. Thus it is
+        necessary to perform the change as an atomic operation. Otherwise
+        someone reading without seeing the new partition could potentially
+        miss updates made by a transaction serialised before it that are
+        inserted into the new partition.
+
+        For LIST partitions it could be possible to avoid the exclusive lock
+        (and for RANGE partitions if they didn't rearrange range definitions
+        after a DROP PARTITION) if one ensured that failed accesses to the
+        dropped partitions were aborted for sure (thus only possible for
+        transactional engines).
+        
         1) Downgrade all locks to TL_WRITE_ALLOW_WRITE unless the user have
            used lock table before starting ALTER TABLE.
         2) Write the new frm file where state of added partitions is
@@ -4434,85 +4468,102 @@
            ongoing users have completed before progressing.
         5) Write a new frm file of the table where the partitions are added
            to the table.
-        6) Write binlog and return from statement (including releasing all
+        6) Wait until all accesses using the old frm file have completed
+        7) Write binlog and return from statement (including releasing all
            remaining locks).
       */
       if (table->reginfo.lock_type == TL_WRITE_ALLOW_READ)
-        mysql_lock_downgrade_write(thd, table, TL_WRITE_ALLOW_WRITE);
-      if ((mysql_write_frm(&wft, TRUE, FALSE)) ||
-          (mysql_add_partitions(table, create_info, db, table_name)) ||
-          (abort_and_upgrade_lock(thd, table, db,
-                                  table_name, &old_lock_type)) ||
-          (mysql_write_frm(&wft, FALSE, TRUE)))
+        mysql_lock_downgrade_write(wft->thd, wft->table,
+                                   TL_WRITE_ALLOW_WRITE);
+      if ((mysql_write_frm(wft, TRUE, FALSE)) ||
+          (mysql_add_partitions(wft)) ||
+          (abort_and_upgrade_lock(wft)) ||
+          (mysql_write_frm(wft, FALSE, TRUE)) ||
+          (close_open_tables_and_downgrade(wft), FALSE))
       {
         DBUG_RETURN(TRUE);
       }
-      close_open_tables_and_downgrade(thd, table,
-                                      db, table_name, old_lock_type);
     }
-    else if (online_add_hash_partition || online_reorg_partition)
+    else
     {
       /*
         ADD HASH PARTITION/
         REORGANISE PARTITION (non-NDB)
+        In this case there is no new data and no data is lost, thus by
+        ensuring that all updates go to both the old and the new partitioning
+        scheme we can actually perform this operation lock-free. This is
+        only possible if the handler can ensure double-write for a period.
+        The double write will ensure that it doesn't matter where the data
+        is read from since both places are updated for writes. If such
+        double writing is not performed then it is necessary to perform the
+        change with the usual exclusive lock. With double writes it is even
+        possible to perform writes in parallel with the reorganisation of
+        partitions.
+
+        Thus with double write possibility we get the following procedure.
         1) Write the new frm file where state of added partitions is
-           changed to PART_TO_BE_ADDED
+           changed to PART_TO_BE_ADDED and the reorganised partitions
+           are set in state PART_TO_BE_REORGED.
         2) Add the new partitions
         3) Copy from the reorganised partitions to the new partitions
-        4) Lock all partitions in TL_WRITE_ONLY to ensure that no users
-           are still using the old partitioning scheme. Wait until all
-           ongoing users have completed before progressing.
-        5) Drop the reorganised partitions
-        6) Write a new frm file of the table where the partitions are
+        4) Write a new frm file of the table where the partitions are
            reorganised.
-        7) Write binlog and return from statement (including releasing all
+        5) Wait until all accesses using the old frm file have completed
+        6) Drop the reorganised partitions
+        7) Write a new frm file of the table where the partitions are
+           reorganised.
+        8) Wait until all accesses using the old frm file have completed
+        9) Write binlog and return from statement (including releasing all
            remaining locks).
-      */
-      if ((mysql_write_frm(&wft, TRUE, FALSE)) ||
-          (mysql_add_partitions(table, create_info, db, table_name)) ||
-          (table->file->copy_partitions()) ||
-          (abort_and_upgrade_lock(thd, table, db,
-                                  table_name, &old_lock_type)) ||
-          (mysql_drop_partitions(table, db, table_name)) ||
-          (mysql_write_frm(&wft, FALSE, TRUE)))
-      {
-        DBUG_RETURN(TRUE);
-      }
-      close_open_tables_and_downgrade(thd, table,
-                                      db, table_name, old_lock_type);
 
-    }
-    else
-    {
-      /*
-        ADD HASH PARTITION/
-        REORGANISE PARTITION (NDB)
-        1) Downgrade all locks to TL_WRITE_ALLOW_WRITE
-        2) Write the new frm file where state of added partitions is
-           changed to PART_TO_BE_ADDED
-        3) Add the new partitions
-        4) Copy from the reorganised partitions to the new partitions
+        Without double write procedure we get the following procedure.
+        1) Write the new frm file where state of added partitions is
+           changed to PART_TO_BE_ADDED and the reorganised partitions
+           are set in state PART_TO_BE_REORGED.
+        2) Add the new partitions
+        3) Copy from the reorganised partitions to the new partitions
+        4) Lock all partitions in TL_WRITE_ONLY to ensure that no users
+           are still using the old partitioning scheme. Wait until all
+           ongoing users have completed before progressing.
         5) Write a new frm file of the table where the partitions are
            reorganised.
-        6) Wait for all users of the table to be completed such that we are
-           certain that all new users of the table use the new frm file.
+        6) Wait until all accesses using the old frm file have completed
         7) Drop the reorganised partitions
-        8) Wait for all users of the table to be completed such that we are
-           certain that all new users of the table use the new frm file.
-        9) Write binlog and return from statement (including releasing all
+        8) Write binlog and return from statement (including releasing all
            remaining locks).
       */
-      if (table->reginfo.lock_type == TL_WRITE_ALLOW_READ)
-        mysql_lock_downgrade_write(thd, table, TL_WRITE_ALLOW_WRITE);
-      if ((mysql_write_frm(&wft, TRUE, FALSE)) ||
-          (mysql_add_partitions(table, create_info, db, table_name)) ||
-          (table->file->copy_partitions()) ||
-          (mysql_write_frm(&wft, FALSE, TRUE)) ||
-          (mysql_wait_completed_table(thd, table, db, table_name)) ||
-          (mysql_drop_partitions(table, db, table_name)) ||
-          (mysql_wait_completed_table(thd, table, db, table_name)))
+
+      if (table->file->alter_table_flags())
+          //& ONLINE_DOUBLE_WRITE)
       {
-        DBUG_RETURN(TRUE);
+        if (table->reginfo.lock_type == TL_WRITE_ALLOW_READ)
+          mysql_lock_downgrade_write(wft->thd, wft->table,
+                                     TL_WRITE_ALLOW_WRITE);
+        if ((mysql_write_frm(wft, TRUE, FALSE)) ||
+            (mysql_add_partitions(wft)) ||
+            (table->file->copy_partitions()) ||
+            (mysql_write_frm(wft, FALSE, TRUE)) ||
+            (mysql_wait_completed_table(wft)) ||
+            (mysql_drop_partitions(wft)) ||
+            (mysql_write_frm(wft, FALSE, FALSE)) ||
+            (mysql_wait_completed_table(wft)))
+        {
+          DBUG_RETURN(TRUE);
+        }
+      }
+      else
+      {
+        if ((mysql_write_frm(wft, TRUE, FALSE)) ||
+            (mysql_add_partitions(wft)) ||
+            (table->file->copy_partitions()) ||
+            (abort_and_upgrade_lock(wft)) ||
+            (mysql_write_frm(wft, FALSE, TRUE)) ||
+            (close_open_tables_and_downgrade(wft), FALSE) ||
+            (mysql_drop_partitions(wft)) ||
+            (mysql_write_frm(wft, FALSE, FALSE)))
+        {
+          DBUG_RETURN(TRUE);
+        }
       }
     }
     /*
Thread
bk commit into 5.1 tree (mronstrom:1.1896) - mikael, 24 Aug