Below is the list of changes that have just been committed into a local
5.1 repository of tomas. When tomas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html
ChangeSet
1.2489 07/06/05 17:06:33 tomas@stripped +10 -0
Bug #28899 not possible to set separate watchdog timeout at startup
storage/ndb/src/mgmsrv/ConfigInfo.cpp
1.98 07/06/05 17:06:24 tomas@stripped +12 -0
add new configuration parameter TimeBetweenWatchDogCheckInitial
storage/ndb/src/kernel/vm/WatchDog.cpp
1.9 07/06/05 17:06:24 tomas@stripped +100 -56
rewrite watchdog to check every 100ms for being stuck, but keep shutdown after 3 *
interval
for "action" == 9 (malloc) keep old behavior and only output every interval
storage/ndb/src/kernel/vm/SimulatedBlock.hpp
1.30 07/06/05 17:06:24 tomas@stripped +2 -1
introduce new state for "action" malloc of memory
storage/ndb/src/kernel/vm/SimulatedBlock.cpp
1.40 07/06/05 17:06:24 tomas@stripped +13 -5
introduce new state for "action" malloc of memory
storage/ndb/src/kernel/vm/Configuration.hpp
1.23 07/06/05 17:06:24 tomas@stripped +1 -0
read initial watchdog timeout and set it in the beginning
storage/ndb/src/kernel/vm/Configuration.cpp
1.57 07/06/05 17:06:24 tomas@stripped +10 -2
read initial watchdog timeout and set it in the beginning
storage/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp
1.46 07/06/05 17:06:24 tomas@stripped +8 -0
read watchdog timeout to set it after malloc
storage/ndb/src/common/portlib/NdbTick.c
1.8 07/06/05 17:06:24 tomas@stripped +1 -3
enable timing code
storage/ndb/include/portlib/NdbTick.h
1.6 07/06/05 17:06:24 tomas@stripped +0 -4
enable timing code
storage/ndb/include/mgmapi/mgmapi_config_parameters.h
1.32 07/06/05 17:06:23 tomas@stripped +2 -0
add new configuration parameter TimeBetweenWatchDogCheckInitial
# This is a BitKeeper patch. What follows are the unified diffs for the
# set of deltas contained in the patch. The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User: tomas
# Host: poseidon.mysql.com
# Root: /home/tomas/mysql-5.1-telco-gca
--- 1.31/storage/ndb/include/mgmapi/mgmapi_config_parameters.h 2006-12-31 01:06:42 +01:00
+++ 1.32/storage/ndb/include/mgmapi/mgmapi_config_parameters.h 2007-06-05 17:06:23 +02:00
@@ -81,6 +81,8 @@
#define CFG_DB_BACKUP_WRITE_SIZE 136
#define CFG_DB_BACKUP_MAX_WRITE_SIZE 139
+#define CFG_DB_WATCHDOG_INTERVAL_INITIAL 141
+
#define CFG_LOG_DESTINATION 147
#define CFG_DB_DISCLESS 148
--- 1.5/storage/ndb/include/portlib/NdbTick.h 2006-12-23 20:20:08 +01:00
+++ 1.6/storage/ndb/include/portlib/NdbTick.h 2007-06-05 17:06:24 +02:00
@@ -37,9 +37,6 @@ NDB_TICKS NdbTick_CurrentMillisecond(voi
*/
int NdbTick_CurrentMicrosecond(NDB_TICKS * secs, Uint32 * micros);
- /*#define TIME_MEASUREMENT*/
-#ifdef TIME_MEASUREMENT
-
struct MicroSecondTimer {
NDB_TICKS seconds;
NDB_TICKS micro_seconds;
@@ -54,7 +51,6 @@ struct MicroSecondTimer {
NDB_TICKS NdbTick_getMicrosPassed(struct MicroSecondTimer start,
struct MicroSecondTimer stop);
int NdbTick_getMicroTimer(struct MicroSecondTimer* time_now);
-#endif
#ifdef __cplusplus
}
--- 1.97/storage/ndb/src/mgmsrv/ConfigInfo.cpp 2007-03-20 16:29:26 +01:00
+++ 1.98/storage/ndb/src/mgmsrv/ConfigInfo.cpp 2007-06-05 17:06:24 +02:00
@@ -572,6 +572,18 @@ const ConfigInfo::ParamInfo ConfigInfo::
STR_VALUE(MAX_INT_RNIL) },
{
+ CFG_DB_WATCHDOG_INTERVAL_INITIAL,
+ "TimeBetweenWatchDogCheckInitial",
+ DB_TOKEN,
+ "Time between execution checks inside a database node in the early start phases when memory is allocated",
+ ConfigInfo::CI_USED,
+ true,
+ ConfigInfo::CI_INT,
+ "6000",
+ "70",
+ STR_VALUE(MAX_INT_RNIL) },
+
+ {
CFG_DB_STOP_ON_ERROR,
"StopOnError",
DB_TOKEN,
--- 1.7/storage/ndb/src/common/portlib/NdbTick.c 2006-12-23 20:20:12 +01:00
+++ 1.8/storage/ndb/src/common/portlib/NdbTick.c 2007-06-05 17:06:24 +02:00
@@ -15,7 +15,7 @@
#include <ndb_global.h>
-#include "NdbTick.h"
+#include <NdbTick.h>
#define NANOSEC_PER_SEC 1000000000
#define MICROSEC_PER_SEC 1000000
@@ -71,7 +71,6 @@ NdbTick_CurrentMicrosecond(NDB_TICKS * s
}
#endif
-#ifdef TIME_MEASUREMENT
int
NdbTick_getMicroTimer(struct MicroSecondTimer* input_timer)
{
@@ -102,4 +101,3 @@ NdbTick_getMicrosPassed(struct MicroSeco
}
return ret_value;
}
-#endif
--- 1.45/storage/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp 2007-05-28 16:00:32 +02:00
+++ 1.46/storage/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp 2007-06-05 17:06:24 +02:00
@@ -277,6 +277,14 @@ void Ndbcntr::execSTTOR(Signal* signal)
break;
case ZSTART_PHASE_1:
jam();
+ {
+ Uint32 db_watchdog_interval = 0;
+ const ndb_mgm_configuration_iterator * p =
+ m_ctx.m_config.getOwnConfigIterator();
+ ndb_mgm_get_int_parameter(p, CFG_DB_WATCHDOG_INTERVAL, &db_watchdog_interval);
+ ndbrequire(db_watchdog_interval);
+ update_watch_dog_timer(db_watchdog_interval);
+ }
startPhase1Lab(signal);
break;
case ZSTART_PHASE_2:
--- 1.56/storage/ndb/src/kernel/vm/Configuration.cpp 2007-01-23 01:18:03 +01:00
+++ 1.57/storage/ndb/src/kernel/vm/Configuration.cpp 2007-06-05 17:06:24 +02:00
@@ -443,6 +443,11 @@ Configuration::setupConfiguration(){
"TimeBetweenWatchDogCheck missing");
}
+ if(iter.get(CFG_DB_WATCHDOG_INTERVAL_INITIAL, &_timeBetweenWatchDogCheckInitial)){
+ ERROR_SET(fatal, NDBD_EXIT_INVALID_CONFIG, "Invalid configuration fetched",
+ "TimeBetweenWatchDogCheckInitial missing");
+ }
+
/**
* Get paths
*/
@@ -462,9 +467,12 @@ Configuration::setupConfiguration(){
* Create the watch dog thread
*/
{
- Uint32 t = _timeBetweenWatchDogCheck;
+ if (_timeBetweenWatchDogCheckInitial < _timeBetweenWatchDogCheck)
+ _timeBetweenWatchDogCheckInitial = _timeBetweenWatchDogCheck;
+
+ Uint32 t = _timeBetweenWatchDogCheckInitial;
t = globalEmulatorData.theWatchDog ->setCheckInterval(t);
- _timeBetweenWatchDogCheck = t;
+ _timeBetweenWatchDogCheckInitial = t;
}
ConfigValues* cf = ConfigValuesFactory::extractCurrentSection(iter.m_config);
--- 1.22/storage/ndb/src/kernel/vm/Configuration.hpp 2007-01-23 01:18:03 +01:00
+++ 1.23/storage/ndb/src/kernel/vm/Configuration.hpp 2007-06-05 17:06:24 +02:00
@@ -84,6 +84,7 @@ private:
Uint32 _maxErrorLogs;
Uint32 _lockPagesInMainMemory;
Uint32 _timeBetweenWatchDogCheck;
+ Uint32 _timeBetweenWatchDogCheckInitial;
ndb_mgm_configuration * m_ownConfig;
ndb_mgm_configuration * m_clusterConfig;
--- 1.39/storage/ndb/src/kernel/vm/SimulatedBlock.cpp 2006-12-27 10:58:04 +01:00
+++ 1.40/storage/ndb/src/kernel/vm/SimulatedBlock.cpp 2007-06-05 17:06:24 +02:00
@@ -19,6 +19,7 @@
#include <NdbOut.hpp>
#include <GlobalData.hpp>
#include <Emulator.hpp>
+#include <WatchDog.hpp>
#include <ErrorHandlingMacros.hpp>
#include <TimeQueue.hpp>
#include <TransporterRegistry.hpp>
@@ -662,7 +663,7 @@ SimulatedBlock::allocRecord(const char *
void * p = NULL;
size_t size = n*s;
Uint64 real_size = (Uint64)((Uint64)n)*((Uint64)s);
- refresh_watch_dog();
+ refresh_watch_dog(9);
if (real_size > 0){
#ifdef VM_TRACE_MEM
ndbout_c("%s::allocRecord(%s, %u, %u) = %llu bytes",
@@ -696,12 +697,12 @@ SimulatedBlock::allocRecord(const char *
char * ptr = (char*)p;
const Uint32 chunk = 128 * 1024;
while(size > chunk){
- refresh_watch_dog();
+ refresh_watch_dog(9);
memset(ptr, 0, chunk);
ptr += chunk;
size -= chunk;
}
- refresh_watch_dog();
+ refresh_watch_dog(9);
memset(ptr, 0, size);
}
}
@@ -720,9 +721,16 @@ SimulatedBlock::deallocRecord(void ** pt
}
void
-SimulatedBlock::refresh_watch_dog()
+SimulatedBlock::refresh_watch_dog(Uint32 place)
{
- globalData.incrementWatchDogCounter(1);
+ globalData.incrementWatchDogCounter(place);
+}
+
+void
+SimulatedBlock::update_watch_dog_timer(Uint32 interval)
+{
+ extern EmulatorData globalEmulatorData;
+ globalEmulatorData.theWatchDog->setCheckInterval(interval);
}
void
--- 1.29/storage/ndb/src/kernel/vm/SimulatedBlock.hpp 2007-01-11 21:13:13 +01:00
+++ 1.30/storage/ndb/src/kernel/vm/SimulatedBlock.hpp 2007-06-05 17:06:24 +02:00
@@ -334,7 +334,8 @@ protected:
* Refresh Watch Dog in initialising code
*
*/
- void refresh_watch_dog();
+ void refresh_watch_dog(Uint32 place = 1);
+ void update_watch_dog_timer(Uint32 interval);
/**
* Prog error
--- 1.8/storage/ndb/src/kernel/vm/WatchDog.cpp 2007-05-28 16:32:16 +02:00
+++ 1.9/storage/ndb/src/kernel/vm/WatchDog.cpp 2007-06-05 17:06:24 +02:00
@@ -25,6 +25,8 @@
#include <ErrorHandlingMacros.hpp>
#include <EventLogger.hpp>
+#include <NdbTick.h>
+
extern EventLogger g_eventLogger;
extern "C"
@@ -72,73 +74,115 @@ WatchDog::doStop(){
}
}
+const char *get_action(Uint32 IPValue)
+{
+ const char *action;
+ switch (IPValue) {
+ case 1:
+ action = "Job Handling";
+ break;
+ case 2:
+ action = "Scanning Timers";
+ break;
+ case 3:
+ action = "External I/O";
+ break;
+ case 4:
+ action = "Print Job Buffers at crash";
+ break;
+ case 5:
+ action = "Checking connections";
+ break;
+ case 6:
+ action = "Performing Send";
+ break;
+ case 7:
+ action = "Polling for Receive";
+ break;
+ case 8:
+ action = "Performing Receive";
+ break;
+ case 9:
+ action = "Allocating memory";
+ break;
+ default:
+ action = "Unknown place";
+ break;
+ }//switch
+ return action;
+}
+
void
-WatchDog::run(){
- unsigned int anIPValue;
- unsigned int alerts = 0;
+WatchDog::run()
+{
+ unsigned int anIPValue, sleep_time;
unsigned int oldIPValue = 0;
-
+ unsigned int theIntervalCheck = theInterval;
+ struct MicroSecondTimer start_time, last_time, now;
+ NdbTick_getMicroTimer(&start_time);
+ last_time = start_time;
+
// WatchDog for the single threaded NDB
- while(!theStop){
- Uint32 tmp = theInterval / 500;
- tmp= (tmp ? tmp : 1);
-
- while(!theStop && tmp > 0){
- NdbSleep_MilliSleep(500);
- tmp--;
- }
-
+ while (!theStop)
+ {
+ sleep_time= 100;
+
+ NdbSleep_MilliSleep(sleep_time);
if(theStop)
break;
+ NdbTick_getMicroTimer(&now);
+ if (NdbTick_getMicrosPassed(last_time, now)/1000 > sleep_time*2)
+ {
+ struct tms my_tms;
+ times(&my_tms);
+ g_eventLogger.info("Watchdog: User time: %llu System time: %llu",
+ (Uint64)my_tms.tms_utime,
+ (Uint64)my_tms.tms_stime);
+ g_eventLogger.warning("Watchdog: Warning overslept %u ms, expected %u ms.",
+ NdbTick_getMicrosPassed(last_time, now)/1000,
+ sleep_time);
+ }
+ last_time = now;
+
// Verify that the IP thread is not stuck in a loop
anIPValue = *theIPValue;
- if(anIPValue != 0) {
+ if (anIPValue != 0)
+ {
oldIPValue = anIPValue;
globalData.incrementWatchDogCounter(0);
- alerts = 0;
- } else {
- const char *last_stuck_action;
- alerts++;
- switch (oldIPValue) {
- case 1:
- last_stuck_action = "Job Handling";
- break;
- case 2:
- last_stuck_action = "Scanning Timers";
- break;
- case 3:
- last_stuck_action = "External I/O";
- break;
- case 4:
- last_stuck_action = "Print Job Buffers at crash";
- break;
- case 5:
- last_stuck_action = "Checking connections";
- break;
- case 6:
- last_stuck_action = "Performing Send";
- break;
- case 7:
- last_stuck_action = "Polling for Receive";
- break;
- case 8:
- last_stuck_action = "Performing Receive";
- break;
- default:
- last_stuck_action = "Unknown place";
- break;
- }//switch
- g_eventLogger.warning("Ndb kernel is stuck in: %s", last_stuck_action);
+ NdbTick_getMicroTimer(&start_time);
+ theIntervalCheck = theInterval;
+ }
+ else
+ {
+ int warn = 1;
+ Uint32 elapsed = NdbTick_getMicrosPassed(start_time, now)/1000;
+ /*
+ oldIPValue == 9 indicates malloc going on, this can take some time
+ so only warn if we pass the watchdog interval
+ */
+ if (oldIPValue == 9)
+ if (elapsed < theIntervalCheck)
+ warn = 0;
+ else
+ theIntervalCheck += theInterval;
+
+ if (warn)
{
- struct tms my_tms;
- times(&my_tms);
- g_eventLogger.info("User time: %llu System time: %llu",
- (Uint64)my_tms.tms_utime,
- (Uint64)my_tms.tms_stime);
- }
- if(alerts == 3){
- shutdownSystem(last_stuck_action);
+ const char *last_stuck_action = get_action(oldIPValue);
+ g_eventLogger.warning("Ndb kernel is stuck in: %s", last_stuck_action);
+ {
+ struct tms my_tms;
+ times(&my_tms);
+ g_eventLogger.info("Watchdog: User time: %llu System time: %llu",
+ (Uint64)my_tms.tms_utime,
+ (Uint64)my_tms.tms_stime);
+ }
+ if (elapsed > 3 * theInterval)
+ {
+ shutdownSystem(last_stuck_action);
+ }
}
}
}
| Thread |
|---|
| • bk commit into 5.1 tree (tomas:1.2489) BUG#28899 | tomas | 5 Jun |