List: Commits
From: tomas
Date: April 26 2006 1:55pm
Subject:bk commit into 5.0 tree (tomas:1.2106) BUG#19039
Below is the list of changes that have just been committed into a local
5.0 repository of tomas. When tomas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet
  1.2106 06/04/26 15:55:30 tomas@stripped +7 -0
  Bug #18550 ndbd getting "node failure handling not complete..." after graceful restart
  - added more retries to wait for node failure handling to complete
  Bug #19039 multi node failure causes node failure handling not to complete
  - patch to avoid this scenario when the management server is used to perform the stop
  - wait for NF_COMPLETE_REP in the management server before returning
  ndb: allocate nodeid
  - only retry on retryable errors

  ndb/src/mgmsrv/Services.cpp
    1.63 06/04/26 15:55:24 tomas@stripped +15 -2
    Bug #18550 ndbd getting "node failure handling not complete..." after graceful restart
    - added more retries to wait for node failure handling to complete
    Bug #19039 multi node failure causes node failure handling not to complete
    - patch to avoid this scenario when the management server is used to perform the stop
    - wait for NF_COMPLETE_REP in the management server before returning
    ndb: allocate nodeid
    - only retry on retryable errors

  ndb/src/mgmsrv/MgmtSrvr.hpp
    1.43 06/04/26 15:55:24 tomas@stripped +4 -2
    Bug #18550 ndbd getting "node failure handling not complete..." after graceful restart
    - added more retries to wait for node failure handling to complete
    Bug #19039 multi node failure causes node failure handling not to complete
    - patch to avoid this scenario when the management server is used to perform the stop
    - wait for NF_COMPLETE_REP in the management server before returning
    ndb: allocate nodeid
    - only retry on retryable errors

  ndb/src/mgmsrv/MgmtSrvr.cpp
    1.98 06/04/26 15:55:24 tomas@stripped +107 -39
    Bug #18550 ndbd getting "node failure handling not complete..." after graceful restart
    - added more retries to wait for node failure handling to complete
    Bug #19039 multi node failure causes node failure handling not to complete
    - patch to avoid this scenario when the management server is used to perform the stop
    - wait for NF_COMPLETE_REP in the management server before returning
    ndb: allocate nodeid
    - only retry on retryable errors

  ndb/src/mgmapi/mgmapi.cpp
    1.59 06/04/26 15:55:24 tomas@stripped +9 -4
    Bug #18550 ndbd getting "node failure handling not complete..." after graceful restart
    - added more retries to wait for node failure handling to complete
    Bug #19039 multi node failure causes node failure handling not to complete
    - patch to avoid this scenario when the management server is used to perform the stop
    - wait for NF_COMPLETE_REP in the management server before returning
    ndb: allocate nodeid
    - only retry on retryable errors

  ndb/src/kernel/vm/Configuration.cpp
    1.46 06/04/26 15:55:24 tomas@stripped +2 -1
    Bug #18550 ndbd getting "node failure handling not complete..." after graceful restart
    - added more retries to wait for node failure handling to complete
    Bug #19039 multi node failure causes node failure handling not to complete
    - patch to avoid this scenario when the management server is used to perform the stop
    - wait for NF_COMPLETE_REP in the management server before returning
    ndb: allocate nodeid
    - only retry on retryable errors

  ndb/src/common/mgmcommon/ConfigRetriever.cpp
    1.34 06/04/26 15:55:24 tomas@stripped +4 -2
    Bug #18550 ndbd getting "node failure handling not complete..." after graceful restart
    - added more retries to wait for node failure handling to complete
    Bug #19039 multi node failure causes node failure handling not to complete
    - patch to avoid this scenario when the management server is used to perform the stop
    - wait for NF_COMPLETE_REP in the management server before returning
    ndb: allocate nodeid
    - only retry on retryable errors

  ndb/include/mgmapi/mgmapi.h
    1.48 06/04/26 15:55:24 tomas@stripped +7 -1
    Bug #18550 ndbd getting "node failure handling not complete..." after graceful restart
    - added more retries to wait for node failure handling to complete
    Bug #19039 multi node failure causes node failure handling not to complete
    - patch to avoid this scenario when the management server is used to perform the stop
    - wait for NF_COMPLETE_REP in the management server before returning
    ndb: allocate nodeid
    - only retry on retryable errors

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	tomas
# Host:	poseidon.ndb.mysql.com
# Root:	/home/tomas/mysql-5.0

--- 1.47/ndb/include/mgmapi/mgmapi.h	2006-02-13 12:58:03 +01:00
+++ 1.48/ndb/include/mgmapi/mgmapi.h	2006-04-26 15:55:24 +02:00
@@ -232,6 +232,12 @@
     /** Could not connect to socker */
     NDB_MGM_COULD_NOT_CONNECT_TO_SOCKET = 1011,
 
+    /* Alloc node id failures */
+    /** Generic error, retry may succeed */
+    NDB_MGM_ALLOCID_ERROR = 1101,
+    /** Non retriable error */
+    NDB_MGM_ALLOCID_CONFIG_MISMATCH = 1102,
+
     /* Service errors - Start/Stop Node or System */
     /** Start failed */
     NDB_MGM_START_FAILED = 2001,
@@ -999,7 +1005,7 @@
   void ndb_mgm_destroy_configuration(struct ndb_mgm_configuration *);
 
   int ndb_mgm_alloc_nodeid(NdbMgmHandle handle,
-			   unsigned version, int nodetype);
+			   unsigned version, int nodetype, int log_event);
 
   /**
    * End Session

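The two new error codes let an API client tell a transient allocation failure
(NDB_MGM_ALLOCID_ERROR) from a configuration mismatch that no amount of
retrying will fix. A minimal client-side sketch of the intended use, assuming
the patched mgmapi.h above; the helper name, retry budget and delay are
illustrative, not part of this patch:

  #include <mgmapi.h>
  #include <unistd.h>

  /* hypothetical helper: retry only while the failure is retryable */
  static int alloc_nodeid_with_retry(NdbMgmHandle h, unsigned version,
                                     int nodetype, int retries)
  {
    for (;;)
    {
      /* log only on the last attempt, as ConfigRetriever does below */
      int nodeid = ndb_mgm_alloc_nodeid(h, version, nodetype,
                                        retries == 0 /* log_event */);
      if (nodeid >= 0)
        return nodeid;                            /* allocated */
      if (retries == 0 ||
          ndb_mgm_get_latest_error(h) == NDB_MGM_ALLOCID_CONFIG_MISMATCH)
        return -1;              /* out of retries or non-retryable */
      retries--;
      sleep(3);                 /* illustrative 3 s retry delay */
    }
  }
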
--- 1.33/ndb/src/common/mgmcommon/ConfigRetriever.cpp	2006-01-16 14:13:13 +01:00
+++ 1.34/ndb/src/common/mgmcommon/ConfigRetriever.cpp	2006-04-26 15:55:24 +02:00
@@ -349,12 +349,14 @@
 	if(!ndb_mgm_connect(m_handle, 0, 0, 0))
 	  goto next;
 
-      res= ndb_mgm_alloc_nodeid(m_handle, m_version, m_node_type);
+      res= ndb_mgm_alloc_nodeid(m_handle, m_version, m_node_type,
+                                no_retries == 0 /* only log last retry */);
       if(res >= 0)
 	return _ownNodeId= (Uint32)res;
 
   next:
-      if (no_retries == 0)
+      int error = ndb_mgm_get_latest_error(m_handle);
+      if (no_retries == 0 || error == NDB_MGM_ALLOCID_CONFIG_MISMATCH)
 	break;
       no_retries--;
       NdbSleep_SecSleep(retry_delay_in_seconds);

--- 1.45/ndb/src/kernel/vm/Configuration.cpp	2006-04-25 19:04:42 +02:00
+++ 1.46/ndb/src/kernel/vm/Configuration.cpp	2006-04-26 15:55:24 +02:00
@@ -286,7 +286,8 @@
   if (globalData.ownId)
     cr.setNodeId(globalData.ownId);
 
-  globalData.ownId = cr.allocNodeId(2 /*retry*/,3 /*delay*/);
+  globalData.ownId = cr.allocNodeId(globalData.ownId ? 10 : 2 /*retry*/,
+                                    3 /*delay*/);
   
   if(globalData.ownId == 0){
     ERROR_SET(fatal, NDBD_EXIT_INVALID_CONFIG, 

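The changed retry budget matters because a restarting ndbd already holds a
node id: the management server may still be completing node failure handling
for that very id (Bug #18550), so the restart case waits longer while a first
start still fails fast. The budget choice in isolation, numbers as in the
hunk above:

  /* sketch: retry budget as chosen in Configuration.cpp */
  static unsigned allocRetryBudget(unsigned ownId)
  {
    /* ownId != 0 means a restart; give node failure handling
     * on the management server time to complete */
    return ownId != 0 ? 10 : 2;
  }
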
--- 1.58/ndb/src/mgmapi/mgmapi.cpp	2006-02-13 12:58:03 +01:00
+++ 1.59/ndb/src/mgmapi/mgmapi.cpp	2006-04-26 15:55:24 +02:00
@@ -1868,7 +1868,8 @@
 
 extern "C"
 int
-ndb_mgm_alloc_nodeid(NdbMgmHandle handle, unsigned int version, int nodetype)
+ndb_mgm_alloc_nodeid(NdbMgmHandle handle, unsigned int version, int nodetype,
+                     int log_event)
 {
   CHECK_HANDLE(handle, 0);
   CHECK_CONNECTED(handle, 0);
@@ -1888,9 +1889,11 @@
   args.put("endian", (endian_check.c[sizeof(long)-1])?"big":"little");
   if (handle->m_name)
     args.put("name", handle->m_name);
+  args.put("log_event", log_event);
 
   const ParserRow<ParserDummy> reply[]= {
     MGM_CMD("get nodeid reply", NULL, ""),
+      MGM_ARG("error_code", Int, Optional, "Error code"),
       MGM_ARG("nodeid", Int, Optional, "Error message"),
       MGM_ARG("result", String, Mandatory, "Error message"),
     MGM_END()
@@ -1903,14 +1906,16 @@
   nodeid= -1;
   do {
     const char * buf;
-    if(!prop->get("result", &buf) || strcmp(buf, "Ok") != 0){
+    if (!prop->get("result", &buf) || strcmp(buf, "Ok") != 0)
+    {
       const char *hostname= ndb_mgm_get_connected_host(handle);
       unsigned port=  ndb_mgm_get_connected_port(handle);
       BaseString err;
+      Uint32 error_code= NDB_MGM_ALLOCID_ERROR;
       err.assfmt("Could not alloc node id at %s port %d: %s",
 		 hostname, port, buf);
-      setError(handle, NDB_MGM_COULD_NOT_CONNECT_TO_SOCKET, __LINE__,
-	       err.c_str());
+      prop->get("error_code", &error_code);
+      setError(handle, error_code, __LINE__, err.c_str());
       break;
     }
     Uint32 _nodeid;

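Note the compatibility pattern in this hunk: error_code is Optional in the
reply grammar, so the client seeds it with the generic NDB_MGM_ALLOCID_ERROR
and only overwrites it when a newer server actually sends the field; against
an old server the behaviour degrades to a retryable error. The same idea as a
standalone sketch (std::map stands in for the tree's Properties parser):

  #include <map>
  #include <string>

  static int replyErrorCode(const std::map<std::string, int>& reply)
  {
    int error_code = 1101;  /* NDB_MGM_ALLOCID_ERROR: safe default */
    std::map<std::string, int>::const_iterator it =
        reply.find("error_code");
    if (it != reply.end())
      error_code = it->second;  /* newer server sent a real code */
    return error_code;
  }
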
--- 1.97/ndb/src/mgmsrv/MgmtSrvr.cpp	2006-04-26 13:20:58 +02:00
+++ 1.98/ndb/src/mgmsrv/MgmtSrvr.cpp	2006-04-26 15:55:24 +02:00
@@ -507,9 +507,10 @@
   if (_ownNodeId == 0) // we did not get node id from other server
   {
     NodeId tmp= m_config_retriever->get_configuration_nodeid();
+    int error_code;
 
     if (!alloc_node_id(&tmp, NDB_MGM_NODE_TYPE_MGM,
-		       0, 0, error_string)){
+		       0, 0, error_code, error_string)){
       ndbout << "Unable to obtain requested nodeid: "
 	     << error_string.c_str() << endl;
       require(false);
@@ -1118,31 +1119,16 @@
       const NFCompleteRep * const rep =
 	CAST_CONSTPTR(NFCompleteRep, signal->getDataPtr());
 #ifdef VM_TRACE
-      ndbout_c("Node %d fail completed", rep->failedNodeId);
+      ndbout_c("sendSTOP_REQ Node %d fail completed", rep->failedNodeId);
 #endif
+      nodes.clear(rep->failedNodeId); // clear the failed node
+      if (singleUserNodeId == 0)
+        stoppedNodes.set(rep->failedNodeId);
       break;
     }
     case GSN_NODE_FAILREP:{
       const NodeFailRep * const rep =
 	CAST_CONSTPTR(NodeFailRep, signal->getDataPtr());
-      NodeBitmask failedNodes;
-      failedNodes.assign(NodeBitmask::Size, rep->theNodes);
-#ifdef VM_TRACE
-      {
-	ndbout << "Failed nodes:";
-	for (unsigned i = 0; i < 32*NodeBitmask::Size; i++)
-	  if(failedNodes.get(i))
-	    ndbout << " " << i;
-	ndbout << endl;
-      }
-#endif
-      failedNodes.bitAND(nodes);
-      if (!failedNodes.isclear())
-      {
-	nodes.bitANDC(failedNodes); // clear the failed nodes
-	if (singleUserNodeId == 0)
-	  stoppedNodes.bitOR(failedNodes);
-      }
       break;
     }
     default:
@@ -1263,11 +1249,47 @@
                         abort,
                         false,
                         true,
-                        nostart,
+                        true,
                         initialStart);
+
+  if (ret)
+    return ret;
+
   if (stopCount)
     *stopCount = nodes.count();
-  return ret;
+  
+  // start up the nodes again
+  int waitTime = 12000;
+  NDB_TICKS maxTime = NdbTick_CurrentMillisecond() + waitTime;
+  for (unsigned i = 0; i < node_ids.size(); i++)
+  {
+    NodeId nodeId= node_ids[i];
+    enum ndb_mgm_node_status s;
+    s = NDB_MGM_NODE_STATUS_NO_CONTACT;
+#ifdef VM_TRACE
+    ndbout_c("Waiting for %d not started", nodeId);
+#endif
+    while (s != NDB_MGM_NODE_STATUS_NOT_STARTED && waitTime > 0)
+    {
+      Uint32 startPhase = 0, version = 0, dynamicId = 0, nodeGroup = 0;
+      Uint32 connectCount = 0;
+      bool system;
+      const char *address;
+      status(nodeId, &s, &version, &startPhase, 
+             &system, &dynamicId, &nodeGroup, &connectCount, &address);
+      NdbSleep_MilliSleep(100);  
+      waitTime = (maxTime - NdbTick_CurrentMillisecond());
+    }
+  }
+
+  if (nostart)
+    return 0;
+
+  for (unsigned i = 0; i < node_ids.size(); i++)
+  {
+    int result = start(node_ids[i]);
+  }
+  return 0;
 }
 
 /*
@@ -1918,7 +1940,8 @@
 			enum ndb_mgm_node_type type,
 			struct sockaddr *client_addr, 
 			SOCKET_SIZE_TYPE *client_addr_len,
-			BaseString &error_string)
+			int &error_code, BaseString &error_string,
+                        int log_event)
 {
   DBUG_ENTER("MgmtSrvr::alloc_node_id");
   DBUG_PRINT("enter", ("nodeid=%d, type=%d, client_addr=%d",
@@ -1927,6 +1950,7 @@
     if (*nodeId == 0) {
       error_string.appfmt("no-nodeid-checks set in management server.\n"
 			  "node id must be set explicitly in connectstring");
+      error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
       DBUG_RETURN(false);
     }
     DBUG_RETURN(true);
@@ -1951,8 +1975,10 @@
 
   if(NdbMutex_Lock(m_configMutex))
   {
+    // should not happen
     error_string.appfmt("unable to lock configuration mutex");
-    return false;
+    error_code = NDB_MGM_ALLOCID_ERROR;
+    DBUG_RETURN(false);
   }
   ndb_mgm_configuration_iterator
     iter(* _config->m_configValues, CFG_SECTION_NODE);
@@ -2023,6 +2049,7 @@
 			  "or specifying unique host names in config file.",
 			  id_found, tmp);
       NdbMutex_Unlock(m_configMutex);
+      error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
       DBUG_RETURN(false);
     }
     if (config_hostname == 0) {
@@ -2031,6 +2058,7 @@
 			  "or specifying unique host names in config file,\n"
 			  "or specifying just one mgmt server in config file.",
 			  tmp);
+      error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
       DBUG_RETURN(false);
     }
     id_found= tmp; // mgmt server matched, check for more matches
@@ -2072,8 +2100,9 @@
     
     char tmp_str[128];
     m_reserved_nodes.getText(tmp_str);
-    g_eventLogger.info("Mgmt server state: nodeid %d reserved for ip %s, m_reserved_nodes %s.",
-		       id_found, get_connect_address(id_found), tmp_str);
+    g_eventLogger.info("Mgmt server state: nodeid %d reserved for ip %s, "
+                       "m_reserved_nodes %s.",
+                       id_found, get_connect_address(id_found), tmp_str);
     DBUG_RETURN(true);
   }
 
@@ -2093,26 +2122,48 @@
     type_c_string.assfmt("%s(%s)", alias, str);
   }
 
-  if (*nodeId == 0) {
+  if (*nodeId == 0)
+  {
     if (found_matching_id)
+    {
       if (found_matching_type)
+      {
 	if (found_free_node)
+        {
 	  error_string.appfmt("Connection done from wrong host ip %s.",
 			      (client_addr)?
-			        inet_ntoa(((struct sockaddr_in *)
+                              inet_ntoa(((struct sockaddr_in *)
 					 (client_addr))->sin_addr):"");
+          error_code = NDB_MGM_ALLOCID_ERROR;
+        }
 	else
+        {
 	  error_string.appfmt("No free node id found for %s.",
 			      type_string.c_str());
+          error_code = NDB_MGM_ALLOCID_ERROR;
+        }
+      }
       else
+      {
 	error_string.appfmt("No %s node defined in config file.",
 			    type_string.c_str());
+        error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
+      }
+    }
     else
+    {
       error_string.append("No nodes defined in config file.");
-  } else {
+      error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
+    }
+  }
+  else
+  {
     if (found_matching_id)
+    {
       if (found_matching_type)
-	if (found_free_node) {
+      {
+	if (found_free_node)
+        {
 	  // have to split these into two since inet_ntoa overwrites itself
 	  error_string.appfmt("Connection with id %d done from wrong host ip %s,",
 			      *nodeId, inet_ntoa(((struct sockaddr_in *)
@@ -2120,27 +2171,44 @@
 	  error_string.appfmt(" expected %s(%s).", config_hostname,
 			      r_config_addr ?
 			      "lookup failed" : inet_ntoa(config_addr));
-	} else
+          error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
+	}
+        else
+        {
 	  error_string.appfmt("Id %d already allocated by another node.",
 			      *nodeId);
+          error_code = NDB_MGM_ALLOCID_ERROR;
+        }
+      }
       else
+      {
 	error_string.appfmt("Id %d configured as %s, connect attempted as %s.",
 			    *nodeId, type_c_string.c_str(),
 			    type_string.c_str());
+        error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
+      }
+    }
     else
+    {
       error_string.appfmt("No node defined with id=%d in config file.",
 			  *nodeId);
+      error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
+    }
   }
 
-  g_eventLogger.warning("Allocate nodeid (%d) failed. Connection from ip %s. "
-			"Returned error string \"%s\"",
-			*nodeId,
-			client_addr != 0 ? inet_ntoa(((struct sockaddr_in *)(client_addr))->sin_addr) : "<none>",
-			error_string.c_str());
-
-  NodeBitmask connected_nodes2;
-  get_connected_nodes(connected_nodes2);
+  if (log_event || error_code == NDB_MGM_ALLOCID_CONFIG_MISMATCH)
   {
+    g_eventLogger.warning("Allocate nodeid (%d) failed. Connection from ip %s."
+                          " Returned error string \"%s\"",
+                          *nodeId,
+                          client_addr != 0
+                          ? inet_ntoa(((struct sockaddr_in *)
+                                       (client_addr))->sin_addr)
+                          : "<none>",
+                          error_string.c_str());
+
+    NodeBitmask connected_nodes2;
+    get_connected_nodes(connected_nodes2);
     BaseString tmp_connected, tmp_not_connected;
     for(Uint32 i = 0; i < MAX_NODES; i++)
     {

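The restart path above now always stops with nostart set, then polls each
stopped node until it reports NOT_STARTED (bounded by a 12 s deadline) before
issuing the starts, so a start can no longer race ahead of node failure
handling. A condensed standalone sketch of that sequencing; getStatus and
startNode are hypothetical stand-ins for MgmtSrvr::status() and
MgmtSrvr::start():

  #include <chrono>
  #include <thread>
  #include <vector>

  enum NodeStatus { NO_CONTACT, NOT_STARTED, STARTED };

  /* stand-ins for MgmtSrvr::status() / MgmtSrvr::start() */
  static NodeStatus getStatus(int /*nodeId*/) { return NOT_STARTED; }
  static int startNode(int /*nodeId*/)        { return 0; }

  static int restartStoppedNodes(const std::vector<int>& nodeIds,
                                 bool nostart)
  {
    using namespace std::chrono;
    const steady_clock::time_point deadline =
        steady_clock::now() + milliseconds(12000);  /* as in the patch */

    /* wait until every node has come back in the NOT_STARTED state */
    for (size_t i = 0; i < nodeIds.size(); i++)
      while (getStatus(nodeIds[i]) != NOT_STARTED &&
             steady_clock::now() < deadline)
        std::this_thread::sleep_for(milliseconds(100));

    if (nostart)
      return 0;             /* caller wanted the nodes left stopped */

    for (size_t i = 0; i < nodeIds.size(); i++)
      startNode(nodeIds[i]);    /* bring each node back up */
    return 0;
  }
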
--- 1.42/ndb/src/mgmsrv/MgmtSrvr.hpp	2006-04-26 13:20:59 +02:00
+++ 1.43/ndb/src/mgmsrv/MgmtSrvr.hpp	2006-04-26 15:55:24 +02:00
@@ -434,8 +434,10 @@
    */
   bool getNextNodeId(NodeId * _nodeId, enum ndb_mgm_node_type type) const ;
   bool alloc_node_id(NodeId * _nodeId, enum ndb_mgm_node_type type,
-		     struct sockaddr *client_addr, SOCKET_SIZE_TYPE *client_addr_len,
-		     BaseString &error_string);
+		     struct sockaddr *client_addr,
+                     SOCKET_SIZE_TYPE *client_addr_len,
+		     int &error_code, BaseString &error_string,
+                     int log_event = 1);
   
   /**
    *

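Defaulting log_event to 1 keeps the change source-compatible: pre-patch call
sites of alloc_node_id compile unchanged and keep the old always-log
behaviour, while Services.cpp below opts in to suppression. A toy model of
that evolution pattern:

  #include <cassert>

  /* the new trailing parameter is defaulted, so the old
   * one-argument call form still compiles unchanged */
  static int allocId(int wanted, int log_event = 1)
  {
    (void)log_event;          /* placeholder body for illustration */
    return wanted;
  }

  int main()
  {
    assert(allocId(3) == 3);     /* old-style caller */
    assert(allocId(3, 0) == 3);  /* new caller suppresses logging */
    return 0;
  }
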
--- 1.62/ndb/src/mgmsrv/Services.cpp	2006-04-26 13:20:59 +02:00
+++ 1.63/ndb/src/mgmsrv/Services.cpp	2006-04-26 15:55:24 +02:00
@@ -138,6 +138,7 @@
     MGM_ARG("endian", String, Optional, "Endianness"),
     MGM_ARG("name", String, Optional, "Name of connection"),
     MGM_ARG("timeout", Int, Optional, "Timeout in seconds"),
+    MGM_ARG("log_event", Int, Optional, "Log failure in cluster log"),
 
   MGM_CMD("get version", &MgmApiSession::getVersion, ""),
   
@@ -425,6 +426,8 @@
   const char * public_key;
   const char * endian= NULL;
   const char * name= NULL;
+  Uint32 log_event= 1;
+  bool log_event_version;
   union { long l; char c[sizeof(long)]; } endian_check;
 
   args.get("version", &version);
@@ -437,6 +440,8 @@
   args.get("endian", &endian);
   args.get("name", &name);
   args.get("timeout", &timeout);
+  /* for backwards compatability keep track if client uses new protocol */
+  log_event_version= args.get("log_event", &log_event);
 
   endian_check.l = 1;
   if(endian 
@@ -476,11 +481,15 @@
   NodeId tmp= nodeid;
   if(tmp == 0 || !m_allocated_resources->is_reserved(tmp)){
     BaseString error_string;
+    int error_code;
     NDB_TICKS tick= 0;
+    /* only report error on second attempt as not to clog the cluster log */
     while (!m_mgmsrv.alloc_node_id(&tmp, (enum ndb_mgm_node_type)nodetype, 
-                                   &addr, &addrlen, error_string))
+                                   &addr, &addrlen, error_code, error_string,
+                                   tick == 0 ? 0 : log_event))
     {
-      if (tick == 0)
+      /* NDB_MGM_ALLOCID_CONFIG_MISMATCH is a non retriable error */
+      if (tick == 0 && error_code != NDB_MGM_ALLOCID_CONFIG_MISMATCH)
       {
         // attempt to free any timed out reservations
         tick= NdbTick_CurrentMillisecond();
@@ -492,6 +501,7 @@
         ps.tick= tick;
         m_mgmsrv.get_socket_server()->
           foreachSession(stop_session_if_timed_out,&ps);
+        error_string = "";
         continue;
       }
       const char *alias;
@@ -500,6 +510,9 @@
 						nodetype, &str);
       m_output->println(cmd);
       m_output->println("result: %s", error_string.c_str());
+      /* only use error_code protocol if client knows about it */
+      if (log_event_version)
+        m_output->println("error_code: %d", error_code);
       m_output->println("");
       return;
     }
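
The server side closes the compatibility loop: it records whether the client
sent the new log_event argument (log_event_version) and emits the error_code
reply line only in that case, so a pre-patch client never sees a field its
reply parser does not know. That rule in isolation, with std::printf standing
in for the session's m_output->println:

  #include <cstdio>

  static void replyAllocFailure(bool log_event_version, int error_code)
  {
    /* only use the error_code protocol if the client knows about it */
    if (log_event_version)
      std::printf("error_code: %d\n", error_code);
  }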