List:Commits« Previous MessageNext Message »
From:jonas Date:February 5 2007 7:47pm
Subject:bk commit into 5.1 tree (jonas:1.2420) BUG#25801
View as plain text  
Below is the list of changes that have just been committed into a local
5.1 repository of jonas. When jonas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet@stripped, 2007-02-05 19:47:19+01:00, jonas@eel.(none) +5 -0
  ndb - bug#25801
    - improve error message if starting without enough REDO
    - decrease likelihood of trying to start too early

  storage/ndb/include/mgmapi/ndbd_exit_codes.h@stripped, 2007-02-05 19:47:15+01:00,
jonas@eel.(none) +1 -0
    Add new error code (that maybe should have been there a long time ago)

  storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp@stripped, 2007-02-05 19:47:16+01:00,
jonas@eel.(none) +53 -6
    Add new check (during SR) that sufficient REDO is present
      before continuing SR

  storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp@stripped, 2007-02-05 19:47:16+01:00,
jonas@eel.(none) +1 -0
    Add list of GCIs of nodes so that we can check for sufficient REDO during an SR

  storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp@stripped, 2007-02-05 19:47:16+01:00,
jonas@eel.(none) +83 -22
    Add check for REDO during SR
      so that
    1) the cluster does not try to start too soon
    2) a better error message (than internal error) is provided if not enough REDO is present

  storage/ndb/src/kernel/error/ndbd_exit_codes.c@stripped, 2007-02-05 19:47:16+01:00,
jonas@eel.(none) +1 -0
    Add new error code (that maybe should have been there a long time ago)

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	jonas
# Host:	eel.(none)
# Root:	/home/jonas/src/51-work

--- 1.101/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2007-02-05 19:47:28 +01:00
+++ 1.102/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2007-02-05 19:47:28 +01:00
@@ -1194,11 +1194,58 @@
 void Dbdih::execDIH_RESTARTREQ(Signal* signal) 
 {
   jamEntry();
-  cntrlblockref = signal->theData[0];
-  if(m_ctx.m_config.getInitialStart()){
-    sendSignal(cntrlblockref, GSN_DIH_RESTARTREF, signal, 1, JBB);
-  } else {
-    readGciFileLab(signal);
+  if (signal->theData[0])
+  {
+    jam();
+    cntrlblockref = signal->theData[0];
+    if(m_ctx.m_config.getInitialStart()){
+      sendSignal(cntrlblockref, GSN_DIH_RESTARTREF, signal, 1, JBB);
+    } else {
+      readGciFileLab(signal);
+    }
+  }
+  else
+  {
+    /**
+     * Precondition, (not checked)
+     *   atleast 1 node in each node group
+     */
+    Uint32 i;
+    NdbNodeBitmask mask;
+    mask.assign(NdbNodeBitmask::Size, signal->theData + 1);
+    Uint32 *node_gcis = signal->theData+1+NdbNodeBitmask::Size;
+    Uint32 node_group_gcis[MAX_NDB_NODES+1];
+    bzero(node_group_gcis, sizeof(node_group_gcis));
+    for (i = 0; i<MAX_NDB_NODES; i++)
+    {
+      if (mask.get(i))
+      {
+	jam();
+	Uint32 ng = Sysfile::getNodeGroup(i, SYSFILE->nodeGroups);
+	ndbrequire(ng < MAX_NDB_NODES);
+	Uint32 gci = node_gcis[i];
+	if (gci > node_group_gcis[ng])
+	{
+	  jam();
+	  node_group_gcis[ng] = gci;
+	}
+      }
+    }
+    for (i = 0; i<MAX_NDB_NODES && node_group_gcis[i] == 0; i++);
+    
+    Uint32 gci = node_group_gcis[i];
+    for (i++ ; i<MAX_NDB_NODES; i++)
+    {
+      jam();
+      if (node_group_gcis[i] && node_group_gcis[i] != gci)
+      {
+	jam();
+	signal->theData[0] = i;
+	return;
+      }
+    }
+    signal->theData[0] = MAX_NDB_NODES;
+    return;
   }
   return;
 }//Dbdih::execDIH_RESTARTREQ()
@@ -12391,7 +12438,7 @@
 	(buf, sizeof(buf), 
 	 "Illegal initial start, no alive node in nodegroup %u", i);
       progError(__LINE__, 
-		NDBD_EXIT_SR_RESTARTCONFLICT,
+		NDBD_EXIT_INSUFFICENT_NODES,
 		buf);
       
     }

--- 1.18/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp	2007-02-05 19:47:28 +01:00
+++ 1.19/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp	2007-02-05 19:47:28 +01:00
@@ -128,6 +128,7 @@
     Uint32 m_president_candidate_gci;
     Uint16 m_regReqReqSent;
     Uint16 m_regReqReqRecv;
+    Uint32 m_node_gci[MAX_NDB_NODES];
   } c_start;
   
   NdbNodeBitmask c_definedNodes; // DB nodes in config

--- 1.49/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp	2007-02-05 19:47:28 +01:00
+++ 1.50/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp	2007-02-05 19:47:28 +01:00
@@ -1093,7 +1093,8 @@
     jam();
     c_start.m_starting_nodes_w_log.set(TaddNodeno);
   }
-  
+  c_start.m_node_gci[TaddNodeno] = node_gci;
+
   skip_nodes.bitAND(c_definedNodes);
   c_start.m_skip_nodes.bitOR(skip_nodes);
   
@@ -1242,6 +1243,7 @@
   wait.bitANDC(tmp);
 
   Uint32 retVal = 0;
+  Uint32 incompleteng = MAX_NDB_NODES; // Illegal value
   NdbNodeBitmask report_mask;
 
   if ((c_start.m_latest_gci == 0) || 
@@ -1327,7 +1329,7 @@
         report_mask.assign(c_definedNodes);
         report_mask.bitANDC(c_start.m_starting_nodes);
         retVal = 1;
-        goto start_report;
+        goto check_log;
       case CheckNodeGroups::Partitioning:
         ndbrequire(result != CheckNodeGroups::Lose);
         signal->theData[1] = 
@@ -1335,7 +1337,7 @@
         report_mask.assign(c_definedNodes);
         report_mask.bitANDC(c_start.m_starting_nodes);
         retVal = 1;
-        goto start_report;
+        goto check_log;
       }
     }
 
@@ -1359,12 +1361,7 @@
     case CheckNodeGroups::Partitioning:
       if (now < partitioned_timeout && result != CheckNodeGroups::Win)
       {
-        signal->theData[1] = c_restartPartionedTimeout == (Uint32) ~0 ? 4 : 5;
-        signal->theData[2] = Uint32((partitioned_timeout - now + 500) / 1000);
-        report_mask.assign(c_definedNodes);
-        report_mask.bitANDC(c_start.m_starting_nodes);
-        retVal = 0;
-        goto start_report;
+        goto missinglog;
       }
       // Fall through...
     case CheckNodeGroups::Win:
@@ -1372,12 +1369,61 @@
         all ? 0x8001 : (result == CheckNodeGroups::Win ? 0x8002 : 0x8003);
       report_mask.assign(c_definedNodes);
       report_mask.bitANDC(c_start.m_starting_nodes);
-      retVal = 1;
-      goto start_report;
+      retVal = 2;
+      goto check_log;
     }
   }
   ndbrequire(false);
 
+check_log:
+  jam();
+  {
+    Uint32 save[4+4*NdbNodeBitmask::Size];
+    memcpy(save, signal->theData, sizeof(save));
+    
+    signal->theData[0] = 0;
+    c_start.m_starting_nodes.copyto(NdbNodeBitmask::Size, signal->theData+1);
+    memcpy(signal->theData+1+NdbNodeBitmask::Size, c_start.m_node_gci,
+	   4*MAX_NDB_NODES);
+    EXECUTE_DIRECT(DBDIH, GSN_DIH_RESTARTREQ, signal, 
+		   1+NdbNodeBitmask::Size+MAX_NDB_NODES);
+    
+    incompleteng = signal->theData[0];
+    memcpy(signal->theData, save, sizeof(save));
+
+    if (incompleteng != MAX_NDB_NODES)
+    {
+      jam();
+      if (retVal == 1)
+      {
+	jam();
+	goto incomplete_log;
+      }
+      else if (retVal == 2)
+      {
+	if (now <= partitioned_timeout)
+	{
+	  jam();
+	  goto missinglog;
+	}
+	else
+	{
+	  goto incomplete_log;
+	}
+      }
+      ndbrequire(false);
+    }
+  }
+  goto start_report;
+
+missinglog:
+  signal->theData[1] = c_restartPartionedTimeout == (Uint32) ~0 ? 4 : 5;
+  signal->theData[2] = Uint32((partitioned_timeout - now + 500) / 1000);
+  report_mask.assign(c_definedNodes);
+  report_mask.bitANDC(c_start.m_starting_nodes);
+  retVal = 0;
+  goto start_report;
+  
 start_report:
   jam();
   {
@@ -1396,17 +1442,32 @@
   
 missing_nodegroup:
   jam();
-  char buf[100], mask1[100], mask2[100];
-  c_start.m_starting_nodes.getText(mask1);
-  tmp.assign(c_start.m_starting_nodes);
-  tmp.bitANDC(c_start.m_starting_nodes_w_log);
-  tmp.getText(mask2);
-  BaseString::snprintf(buf, sizeof(buf),
-		       "Unable to start missing node group! "
-		       " starting: %s (missing fs for: %s)",
-		       mask1, mask2);
-  progError(__LINE__, NDBD_EXIT_SR_RESTARTCONFLICT, buf);
-  return 0;                                     // Deadcode
+  {
+    char buf[100], mask1[100], mask2[100];
+    c_start.m_starting_nodes.getText(mask1);
+    tmp.assign(c_start.m_starting_nodes);
+    tmp.bitANDC(c_start.m_starting_nodes_w_log);
+    tmp.getText(mask2);
+    BaseString::snprintf(buf, sizeof(buf),
+			 "Unable to start missing node group! "
+			 " starting: %s (missing fs for: %s)",
+			 mask1, mask2);
+    progError(__LINE__, NDBD_EXIT_INSUFFICENT_NODES, buf);
+    return 0;                                     // Deadcode
+  }
+
+incomplete_log:
+  jam();
+  {
+    char buf[100], mask1[100];
+    c_start.m_starting_nodes.getText(mask1);
+    BaseString::snprintf(buf, sizeof(buf),
+			 "Incomplete log for node group: %d! "
+			 " starting nodes: %s",
+			 incompleteng, mask1);
+    progError(__LINE__, NDBD_EXIT_INSUFFICENT_NODES, buf);
+    return 0;                                     // Deadcode
+  }
 }
 
 void

--- 1.14/storage/ndb/include/mgmapi/ndbd_exit_codes.h	2007-02-05 19:47:28 +01:00
+++ 1.15/storage/ndb/include/mgmapi/ndbd_exit_codes.h	2007-02-05 19:47:28 +01:00
@@ -146,6 +146,7 @@
 #define NDBD_EXIT_AFS_READ_UNDERFLOW        2816
 
 #define NDBD_EXIT_INVALID_LCP_FILE          2352
+#define NDBD_EXIT_INSUFFICENT_NODES         2353
 
 const char *
 ndbd_exit_message(int faultId, ndbd_exit_classification *cl);

--- 1.16/storage/ndb/src/kernel/error/ndbd_exit_codes.c	2007-02-05 19:47:28 +01:00
+++ 1.17/storage/ndb/src/kernel/error/ndbd_exit_codes.c	2007-02-05 19:47:28 +01:00
@@ -160,6 +160,7 @@
    {NDBD_EXIT_AFS_READ_UNDERFLOW        , XFI, "Read underflow"},
    
    {NDBD_EXIT_INVALID_LCP_FILE, XFI, "Invalid LCP" },
+   {NDBD_EXIT_INSUFFICENT_NODES, XRE, "Insufficent nodes for system restart" },
    
    /* Sentinel */
    {0, XUE,
Thread
bk commit into 5.1 tree (jonas:1.2420) BUG#25801jonas5 Feb