MySQL Lists are EOL. Please join:

List: Commits | « Previous Message | Next Message »
From: tomas | Date: June 5 2007 3:06pm
Subject:bk commit into 5.1 tree (tomas:1.2489) BUG#28899
View as plain text  
Below is the list of changes that have just been committed into a local
5.1 repository of tomas. When tomas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet
  1.2489 07/06/05 17:06:33 tomas@stripped +10 -0
  Bug #28899  	not possible to set separate watchdog timeout at startup

  storage/ndb/src/mgmsrv/ConfigInfo.cpp
    1.98 07/06/05 17:06:24 tomas@stripped +12 -0
    add new configuration parameter TimeBetweenWatchDogCheckInitial

  storage/ndb/src/kernel/vm/WatchDog.cpp
    1.9 07/06/05 17:06:24 tomas@stripped +100 -56
    rewrite watchdog to check every 100ms for being stuck, but keep shutdown after 3 * interval
    for "action" == 9 (malloc)  keep old behavior and only output every interval

  storage/ndb/src/kernel/vm/SimulatedBlock.hpp
    1.30 07/06/05 17:06:24 tomas@stripped +2 -1
    introduce new state for "action" malloc of memory

  storage/ndb/src/kernel/vm/SimulatedBlock.cpp
    1.40 07/06/05 17:06:24 tomas@stripped +13 -5
    introduce new state for "action" malloc of memory

  storage/ndb/src/kernel/vm/Configuration.hpp
    1.23 07/06/05 17:06:24 tomas@stripped +1 -0
    read initial watchdog timeout and set it in the beginning

  storage/ndb/src/kernel/vm/Configuration.cpp
    1.57 07/06/05 17:06:24 tomas@stripped +10 -2
    read initial watchdog timeout and set it in the beginning

  storage/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp
    1.46 07/06/05 17:06:24 tomas@stripped +8 -0
    read watchdog timeout to set it after malloc

  storage/ndb/src/common/portlib/NdbTick.c
    1.8 07/06/05 17:06:24 tomas@stripped +1 -3
    enable timing code

  storage/ndb/include/portlib/NdbTick.h
    1.6 07/06/05 17:06:24 tomas@stripped +0 -4
    enable timing code

  storage/ndb/include/mgmapi/mgmapi_config_parameters.h
    1.32 07/06/05 17:06:23 tomas@stripped +2 -0
    add new configuration parameter TimeBetweenWatchDogCheckInitial

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	tomas
# Host:	poseidon.mysql.com
# Root:	/home/tomas/mysql-5.1-telco-gca

--- 1.31/storage/ndb/include/mgmapi/mgmapi_config_parameters.h	2006-12-31 01:06:42 +01:00
+++ 1.32/storage/ndb/include/mgmapi/mgmapi_config_parameters.h	2007-06-05 17:06:23 +02:00
@@ -81,6 +81,8 @@
 #define CFG_DB_BACKUP_WRITE_SIZE          136
 #define CFG_DB_BACKUP_MAX_WRITE_SIZE      139
 
+#define CFG_DB_WATCHDOG_INTERVAL_INITIAL  141
+
 #define CFG_LOG_DESTINATION           147
 
 #define CFG_DB_DISCLESS               148

--- 1.5/storage/ndb/include/portlib/NdbTick.h	2006-12-23 20:20:08 +01:00
+++ 1.6/storage/ndb/include/portlib/NdbTick.h	2007-06-05 17:06:24 +02:00
@@ -37,9 +37,6 @@ NDB_TICKS NdbTick_CurrentMillisecond(voi
  */
 int NdbTick_CurrentMicrosecond(NDB_TICKS * secs, Uint32 * micros);
 
-  /*#define TIME_MEASUREMENT*/
-#ifdef TIME_MEASUREMENT
-
 struct MicroSecondTimer {
   NDB_TICKS seconds;
   NDB_TICKS micro_seconds;
@@ -54,7 +51,6 @@ struct MicroSecondTimer {
 NDB_TICKS NdbTick_getMicrosPassed(struct MicroSecondTimer start,
                             struct MicroSecondTimer stop);
 int NdbTick_getMicroTimer(struct MicroSecondTimer* time_now);
-#endif
 
 #ifdef	__cplusplus
 }

--- 1.97/storage/ndb/src/mgmsrv/ConfigInfo.cpp	2007-03-20 16:29:26 +01:00
+++ 1.98/storage/ndb/src/mgmsrv/ConfigInfo.cpp	2007-06-05 17:06:24 +02:00
@@ -572,6 +572,18 @@ const ConfigInfo::ParamInfo ConfigInfo::
     STR_VALUE(MAX_INT_RNIL) },
 
   {
+    CFG_DB_WATCHDOG_INTERVAL_INITIAL,
+    "TimeBetweenWatchDogCheckInitial",
+    DB_TOKEN,
+    "Time between execution checks inside a database node in the early start phases when memory is allocated",
+    ConfigInfo::CI_USED,
+    true,
+    ConfigInfo::CI_INT,
+    "6000",
+    "70",
+    STR_VALUE(MAX_INT_RNIL) },
+
+  {
     CFG_DB_STOP_ON_ERROR,
     "StopOnError",
     DB_TOKEN,

--- 1.7/storage/ndb/src/common/portlib/NdbTick.c	2006-12-23 20:20:12 +01:00
+++ 1.8/storage/ndb/src/common/portlib/NdbTick.c	2007-06-05 17:06:24 +02:00
@@ -15,7 +15,7 @@
 
 
 #include <ndb_global.h>
-#include "NdbTick.h"
+#include <NdbTick.h>
 
 #define NANOSEC_PER_SEC  1000000000
 #define MICROSEC_PER_SEC 1000000
@@ -71,7 +71,6 @@ NdbTick_CurrentMicrosecond(NDB_TICKS * s
 }
 
 #endif
-#ifdef TIME_MEASUREMENT
 int
 NdbTick_getMicroTimer(struct MicroSecondTimer* input_timer)
 {
@@ -102,4 +101,3 @@ NdbTick_getMicrosPassed(struct MicroSeco
   }
   return ret_value;
 }
-#endif

--- 1.45/storage/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp	2007-05-28 16:00:32 +02:00
+++ 1.46/storage/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp	2007-06-05 17:06:24 +02:00
@@ -277,6 +277,14 @@ void Ndbcntr::execSTTOR(Signal* signal) 
     break;
   case ZSTART_PHASE_1:
     jam();
+    {
+      Uint32 db_watchdog_interval = 0;
+      const ndb_mgm_configuration_iterator * p = 
+        m_ctx.m_config.getOwnConfigIterator();
+      ndb_mgm_get_int_parameter(p, CFG_DB_WATCHDOG_INTERVAL, &db_watchdog_interval);
+      ndbrequire(db_watchdog_interval);
+      update_watch_dog_timer(db_watchdog_interval);
+    }
     startPhase1Lab(signal);
     break;
   case ZSTART_PHASE_2:

--- 1.56/storage/ndb/src/kernel/vm/Configuration.cpp	2007-01-23 01:18:03 +01:00
+++ 1.57/storage/ndb/src/kernel/vm/Configuration.cpp	2007-06-05 17:06:24 +02:00
@@ -443,6 +443,11 @@ Configuration::setupConfiguration(){
 	      "TimeBetweenWatchDogCheck missing");
   }
 
+  if(iter.get(CFG_DB_WATCHDOG_INTERVAL_INITIAL, &_timeBetweenWatchDogCheckInitial)){
+    ERROR_SET(fatal, NDBD_EXIT_INVALID_CONFIG, "Invalid configuration fetched", 
+	      "TimeBetweenWatchDogCheckInitial missing");
+  }
+
   /**
    * Get paths
    */  
@@ -462,9 +467,12 @@ Configuration::setupConfiguration(){
    * Create the watch dog thread
    */
   { 
-    Uint32 t = _timeBetweenWatchDogCheck;
+    if (_timeBetweenWatchDogCheckInitial < _timeBetweenWatchDogCheck)
+      _timeBetweenWatchDogCheckInitial = _timeBetweenWatchDogCheck;
+
+    Uint32 t = _timeBetweenWatchDogCheckInitial;
     t = globalEmulatorData.theWatchDog ->setCheckInterval(t);
-    _timeBetweenWatchDogCheck = t;
+    _timeBetweenWatchDogCheckInitial = t;
   }
   
   ConfigValues* cf = ConfigValuesFactory::extractCurrentSection(iter.m_config);

--- 1.22/storage/ndb/src/kernel/vm/Configuration.hpp	2007-01-23 01:18:03 +01:00
+++ 1.23/storage/ndb/src/kernel/vm/Configuration.hpp	2007-06-05 17:06:24 +02:00
@@ -84,6 +84,7 @@ private:
   Uint32 _maxErrorLogs;
   Uint32 _lockPagesInMainMemory;
   Uint32 _timeBetweenWatchDogCheck;
+  Uint32 _timeBetweenWatchDogCheckInitial;
 
   ndb_mgm_configuration * m_ownConfig;
   ndb_mgm_configuration * m_clusterConfig;

--- 1.39/storage/ndb/src/kernel/vm/SimulatedBlock.cpp	2006-12-27 10:58:04 +01:00
+++ 1.40/storage/ndb/src/kernel/vm/SimulatedBlock.cpp	2007-06-05 17:06:24 +02:00
@@ -19,6 +19,7 @@
 #include <NdbOut.hpp>
 #include <GlobalData.hpp>
 #include <Emulator.hpp>
+#include <WatchDog.hpp>
 #include <ErrorHandlingMacros.hpp>
 #include <TimeQueue.hpp>
 #include <TransporterRegistry.hpp>
@@ -662,7 +663,7 @@ SimulatedBlock::allocRecord(const char *
   void * p = NULL;
   size_t size = n*s;
   Uint64 real_size = (Uint64)((Uint64)n)*((Uint64)s);
-  refresh_watch_dog(); 
+  refresh_watch_dog(9);
   if (real_size > 0){
 #ifdef VM_TRACE_MEM
     ndbout_c("%s::allocRecord(%s, %u, %u) = %llu bytes", 
@@ -696,12 +697,12 @@ SimulatedBlock::allocRecord(const char *
       char * ptr = (char*)p;
       const Uint32 chunk = 128 * 1024;
       while(size > chunk){
-	refresh_watch_dog(); 
+	refresh_watch_dog(9);
 	memset(ptr, 0, chunk);
 	ptr += chunk;
 	size -= chunk;
       }
-      refresh_watch_dog(); 
+      refresh_watch_dog(9);
       memset(ptr, 0, size);
     }
   }
@@ -720,9 +721,16 @@ SimulatedBlock::deallocRecord(void ** pt
 }
 
 void
-SimulatedBlock::refresh_watch_dog()
+SimulatedBlock::refresh_watch_dog(Uint32 place)
 {
-  globalData.incrementWatchDogCounter(1);
+  globalData.incrementWatchDogCounter(place);
+}
+
+void
+SimulatedBlock::update_watch_dog_timer(Uint32 interval)
+{
+  extern EmulatorData globalEmulatorData;
+  globalEmulatorData.theWatchDog->setCheckInterval(interval);
 }
 
 void

--- 1.29/storage/ndb/src/kernel/vm/SimulatedBlock.hpp	2007-01-11 21:13:13 +01:00
+++ 1.30/storage/ndb/src/kernel/vm/SimulatedBlock.hpp	2007-06-05 17:06:24 +02:00
@@ -334,7 +334,8 @@ protected:
    * Refresh Watch Dog in initialising code
    *
    */
-  void refresh_watch_dog();
+  void refresh_watch_dog(Uint32 place = 1);
+  void update_watch_dog_timer(Uint32 interval);
 
   /**
    * Prog error

--- 1.8/storage/ndb/src/kernel/vm/WatchDog.cpp	2007-05-28 16:32:16 +02:00
+++ 1.9/storage/ndb/src/kernel/vm/WatchDog.cpp	2007-06-05 17:06:24 +02:00
@@ -25,6 +25,8 @@
 #include <ErrorHandlingMacros.hpp>
 #include <EventLogger.hpp>
 
+#include <NdbTick.h>
+
 extern EventLogger g_eventLogger;
 
 extern "C" 
@@ -72,73 +74,115 @@ WatchDog::doStop(){
   }
 }
 
+const char *get_action(Uint32 IPValue)
+{
+  const char *action;
+  switch (IPValue) {
+  case 1:
+    action = "Job Handling";
+    break;
+  case 2:
+    action = "Scanning Timers";
+    break;
+  case 3:
+    action = "External I/O";
+    break;
+  case 4:
+    action = "Print Job Buffers at crash";
+    break;
+  case 5:
+    action = "Checking connections";
+    break;
+  case 6:
+    action = "Performing Send";
+    break;
+  case 7:
+    action = "Polling for Receive";
+    break;
+  case 8:
+    action = "Performing Receive";
+    break;
+  case 9:
+    action = "Allocating memory";
+    break;
+  default:
+    action = "Unknown place";
+    break;
+  }//switch
+  return action;
+}
+
 void 
-WatchDog::run(){
-  unsigned int anIPValue;
-  unsigned int alerts = 0;
+WatchDog::run()
+{
+  unsigned int anIPValue, sleep_time;
   unsigned int oldIPValue = 0;
-  
+  unsigned int theIntervalCheck = theInterval;
+  struct MicroSecondTimer start_time, last_time, now;
+  NdbTick_getMicroTimer(&start_time);
+  last_time = start_time;
+
   // WatchDog for the single threaded NDB
-  while(!theStop){
-    Uint32 tmp  = theInterval / 500;
-    tmp= (tmp ? tmp : 1);
-    
-    while(!theStop && tmp > 0){
-      NdbSleep_MilliSleep(500);
-      tmp--;
-    }
-    
+  while (!theStop)
+  {
+    sleep_time= 100;
+
+    NdbSleep_MilliSleep(sleep_time);
     if(theStop)
       break;
 
+    NdbTick_getMicroTimer(&now);
+    if (NdbTick_getMicrosPassed(last_time, now)/1000 > sleep_time*2)
+    {
+      struct tms my_tms;
+      times(&my_tms);
+      g_eventLogger.info("Watchdog: User time: %llu  System time: %llu",
+                         (Uint64)my_tms.tms_utime,
+                         (Uint64)my_tms.tms_stime);
+      g_eventLogger.warning("Watchdog: Warning overslept %u ms, expected %u ms.",
+                            NdbTick_getMicrosPassed(last_time, now)/1000,
+                            sleep_time);
+    }
+    last_time = now;
+
     // Verify that the IP thread is not stuck in a loop
     anIPValue = *theIPValue;
-    if(anIPValue != 0) {
+    if (anIPValue != 0)
+    {
       oldIPValue = anIPValue;
       globalData.incrementWatchDogCounter(0);
-      alerts = 0;
-    } else {
-      const char *last_stuck_action;
-      alerts++;
-      switch (oldIPValue) {
-      case 1:
-        last_stuck_action = "Job Handling";
-        break;
-      case 2:
-        last_stuck_action = "Scanning Timers";
-        break;
-      case 3:
-        last_stuck_action = "External I/O";
-        break;
-      case 4:
-        last_stuck_action = "Print Job Buffers at crash";
-        break;
-      case 5:
-        last_stuck_action = "Checking connections";
-        break;
-      case 6:
-        last_stuck_action = "Performing Send";
-        break;
-      case 7:
-        last_stuck_action = "Polling for Receive";
-        break;
-      case 8:
-        last_stuck_action = "Performing Receive";
-        break;
-      default:
-        last_stuck_action = "Unknown place";
-        break;
-      }//switch
-      g_eventLogger.warning("Ndb kernel is stuck in: %s", last_stuck_action);
+      NdbTick_getMicroTimer(&start_time);
+      theIntervalCheck = theInterval;
+    }
+    else
+    {
+      int warn = 1;
+      Uint32 elapsed = NdbTick_getMicrosPassed(start_time, now)/1000;
+      /*
+        oldIPValue == 9 indicates malloc going on, this can take some time
+        so only warn if we pass the watchdog interval
+      */
+      if (oldIPValue == 9)
+        if (elapsed < theIntervalCheck)
+          warn = 0;
+        else
+          theIntervalCheck += theInterval;
+
+      if (warn)
       {
-        struct tms my_tms;
-        times(&my_tms);
-        g_eventLogger.info("User time: %llu  System time: %llu",
-                           (Uint64)my_tms.tms_utime,
-                           (Uint64)my_tms.tms_stime);
-      }
-      if(alerts == 3){
-	shutdownSystem(last_stuck_action);
+        const char *last_stuck_action = get_action(oldIPValue);
+        g_eventLogger.warning("Ndb kernel is stuck in: %s", last_stuck_action);
+        {
+          struct tms my_tms;
+          times(&my_tms);
+          g_eventLogger.info("Watchdog: User time: %llu  System time: %llu",
+                             (Uint64)my_tms.tms_utime,
+                             (Uint64)my_tms.tms_stime);
+        }
+        if (elapsed > 3 * theInterval)
+        {
+          shutdownSystem(last_stuck_action);
+        }
       }
     }
   }
Thread
bk commit into 5.1 tree (tomas:1.2489) BUG#28899 | tomas | 5 Jun