#At file:///home/msvensson/mysql/7.0-testMgmd/ based on revid:magnus.blaudd@stripped49-jhy0okyj3zfs6u0v
3009 Magnus Blåudd 2009-09-25
Bug#45495 multiple management servers fail to start
- Make it possible to start a second ndb_mgmd with --initial with exactly
the same configuration as an already started and confirmed ndb_mgmd.
Implementation details:
- When refusing a CONFIG_CHECK, send the confirmed
config back so that requestor can do "node recovery"
from it.
- If the config returned in CONFIG_CHECK_REF is exactly equal to
own config.ini - use it to do "node recovery"
- Add test program
modified:
storage/ndb/include/kernel/signaldata/ConfigChange.hpp
storage/ndb/src/mgmsrv/Config.cpp
storage/ndb/src/mgmsrv/Config.hpp
storage/ndb/src/mgmsrv/ConfigManager.cpp
storage/ndb/test/include/NdbMgmd.hpp
storage/ndb/test/include/NdbProcess.hpp
storage/ndb/test/ndbapi/ConfigFactory.hpp
storage/ndb/test/ndbapi/testMgmd.cpp
=== modified file 'storage/ndb/include/kernel/signaldata/ConfigChange.hpp'
--- a/storage/ndb/include/kernel/signaldata/ConfigChange.hpp 2009-02-23 09:36:25 +0000
+++ b/storage/ndb/include/kernel/signaldata/ConfigChange.hpp 2009-09-25 07:57:29 +0000
@@ -263,12 +263,14 @@ class ConfigCheckRef {
public:
STATIC_CONST( SignalLength = 5 );
+ STATIC_CONST( SignalLengthWithConfig = 6 );
private:
Uint32 error;
Uint32 generation;
Uint32 expected_generation;
Uint32 state;
Uint32 expected_state;
+ Uint32 length; // Length of the config data in long signal
};
=== modified file 'storage/ndb/src/mgmsrv/Config.cpp'
--- a/storage/ndb/src/mgmsrv/Config.cpp 2009-05-27 15:21:45 +0000
+++ b/storage/ndb/src/mgmsrv/Config.cpp 2009-09-25 07:57:29 +0000
@@ -713,9 +713,9 @@ void Config::print_diff(const Config* ot
const char*
-Config::diff2str(const Config* other, BaseString& str) const {
+Config::diff2str(const Config* other, BaseString& str, const unsigned * exclude) const {
Properties diff_list;
- diff(other, diff_list);
+ diff(other, diff_list, exclude);
return diff2str(diff_list, str);
}
=== modified file 'storage/ndb/src/mgmsrv/Config.hpp'
--- a/storage/ndb/src/mgmsrv/Config.hpp 2009-05-27 15:21:45 +0000
+++ b/storage/ndb/src/mgmsrv/Config.hpp 2009-09-25 07:57:29 +0000
@@ -106,7 +106,8 @@ public:
/*
Print the difference to string buffer
*/
- const char* diff2str(const Config* other, BaseString& str) const;
+ const char* diff2str(const Config* other, BaseString& str,
+ const unsigned* exclude = NULL) const;
/*
Determine if changing to the other config is illegal
=== modified file 'storage/ndb/src/mgmsrv/ConfigManager.cpp'
--- a/storage/ndb/src/mgmsrv/ConfigManager.cpp 2009-09-08 20:08:24 +0000
+++ b/storage/ndb/src/mgmsrv/ConfigManager.cpp 2009-09-25 07:57:29 +0000
@@ -1349,6 +1349,7 @@ ConfigManager::execCONFIG_CHECK_REQ(Sign
break;
case CS_CONFIRMED:
+
if (other_state != CS_CONFIRMED)
{
g_eventLogger->warning("Refusing other node, it's in different " \
@@ -1411,6 +1412,13 @@ ConfigManager::sendConfigCheckReq(Signal
ConfigCheckReq::SignalLength);
}
+static bool
+send_config_in_check_ref(Uint32 x)
+{
+ // x is the version of the receiving node; only nodes running
+ // 7.0.8 or newer get the config attached to CONFIG_CHECK_REF
+ return (x >= NDB_MAKE_VERSION(7,0,8));
+}
void
ConfigManager::sendConfigCheckRef(SignalSender& ss, BlockReference to,
@@ -1420,6 +1428,7 @@ ConfigManager::sendConfigCheckRef(Signal
ConfigState state,
ConfigState other_state) const
{
+ int result;
NodeId nodeId = refToNode(to);
SimpleSignal ssig;
ConfigCheckRef* const ref =
@@ -1433,10 +1442,35 @@ ConfigManager::sendConfigCheckRef(Signal
g_eventLogger->debug("Send CONFIG_CHECK_REF with error: %d to node: %d",
error, nodeId);
- ss.sendSignal(nodeId, ssig, MGM_CONFIG_MAN,
- GSN_CONFIG_CHECK_REF, ConfigCheckRef::SignalLength);
-}
+ if (!send_config_in_check_ref(ss.getNodeInfo(nodeId).m_info.m_version))
+ {
+ result = ss.sendSignal(nodeId, ssig, MGM_CONFIG_MAN,
+ GSN_CONFIG_CHECK_REF, ConfigCheckRef::SignalLength);
+ }
+ else
+ {
+ UtilBuffer buf;
+ m_config->pack(buf);
+ ssig.ptr[0].p = (Uint32*)buf.get_data();
+ ssig.ptr[0].sz = (buf.length() + 3) / 4;
+ ssig.header.m_noOfSections = 1;
+ ref->length = buf.length();
+
+ g_eventLogger->debug("Sending CONFIG_CHECK_REF with config");
+
+ result = ss.sendFragmentedSignal(nodeId, ssig, MGM_CONFIG_MAN,
+ GSN_CONFIG_CHECK_REF,
+ ConfigCheckRef::SignalLengthWithConfig);
+ }
+
+ if (result != 0)
+ {
+ g_eventLogger->warning("Failed to send CONFIG_CHECK_REF "
+ "to node: %d, result: %d",
+ nodeId, result);
+ }
+}
void
ConfigManager::sendConfigCheckConf(SignalSender& ss, BlockReference to) const
@@ -1479,29 +1513,95 @@ ConfigManager::execCONFIG_CHECK_REF(Sign
const ConfigCheckRef* const ref =
CAST_CONSTPTR(ConfigCheckRef, sig->getDataPtr());
- g_eventLogger->info("Got CONFIG_CHECK_REF from node %d, "
- "error: %d, message: '%s'\n"
- "generation: %d, expected generation: %d\n"
+ if (!m_defragger.defragment(sig))
+ return; // More fragments to come
+
+ g_eventLogger->debug("Got CONFIG_CHECK_REF from node %d, "
+ "error: %d, message: '%s', "
+ "generation: %d, expected generation: %d, "
"state: %d, expected state: %d own-state: %u",
nodeId, ref->error,
ConfigCheckRef::errorMessage(ref->error),
ref->generation, ref->expected_generation,
ref->state, ref->expected_state,
m_config_state);
-
- if (m_config_state != CS_INITIAL &&
- ref->expected_state == CS_INITIAL)
- {
- g_eventLogger->info("Waiting for peer");
- return;
- }
- if (m_config_state == CS_INITIAL)
+ assert(ref->generation != ref->expected_generation ||
+ ref->state != ref->expected_state);
+ assert(ref->state == m_config_state);
+
+ // Dispatch on own config state for every REF. A REF without config
+ // attached must also reach the state handling below, so the switch
+ // is not guarded by any check of the signal length.
+ switch(m_config_state)
{
- g_eventLogger->info("Waiting");
- return;
+ default:
+ case CS_UNINITIALIZED:
+ g_eventLogger->error("execCONFIG_CHECK_REF: unhandled state");
+ abort();
+ break;
+
+ case CS_INITIAL:
+ if (ref->expected_state == CS_CONFIRMED)
+ {
+ if (sig->header.theLength != ConfigCheckRef::SignalLengthWithConfig)
+ break; // No config in the REF -> no action
+
+ // The other node has sent its config in the signal, use it if equal
+ assert(sig->header.m_noOfSections == 1);
+
+ ConfigValuesFactory cf;
+ require(cf.unpack(sig->ptr[0].p, ref->length));
+
+ Config other_config(cf.getConfigValues());
+ assert(other_config.getGeneration() > 0);
+
+ unsigned exclude[]= {CFG_SECTION_SYSTEM, 0};
+ if (!other_config.equal(m_config, exclude))
+ {
+ BaseString buf;
+ g_eventLogger->error("This node was started --initial with "
+ "a config which is _not_ equal to the one "
+ "node %d is using. Refusing to start with "
+ "different configurations, diff: \n%s",
+ nodeId,
+ other_config.diff2str(m_config, buf, exclude));
+ exit(1);
+ }
+
+ g_eventLogger->info("This node was started --inital with "
+ "a config equal to the one node %d is using. "
+ "Will use the config with generation %d "
+ "from node %d!",
+ nodeId, other_config.getGeneration(), nodeId);
+
+ if (! prepareConfigChange(&other_config))
+ {
+ abortConfigChange();
+ g_eventLogger->error("Failed to write the fetched config to disk");
+ exit(1);
+ }
+ commitConfigChange();
+ m_config_state = CS_CONFIRMED;
+ g_eventLogger->info("The fetched configuration has been saved!");
+ m_waiting_for.clear(nodeId);
+ m_checked.set(nodeId);
+ delete m_new_config;
+ m_new_config = NULL;
+ return;
+ }
+ break;
+
+ case CS_CONFIRMED:
+ if (ref->expected_state == CS_INITIAL)
+ {
+ // TODO(MASV): what should be done for the peer here? Presumably some kind of upgrade fix...
+ g_eventLogger->info("Waiting for peer");
+ return;
+ }
+ break;
}
-
+
g_eventLogger->error("Terminating");
exit(1);
}
=== modified file 'storage/ndb/test/include/NdbMgmd.hpp'
--- a/storage/ndb/test/include/NdbMgmd.hpp 2009-09-16 12:42:53 +0000
+++ b/storage/ndb/test/include/NdbMgmd.hpp 2009-09-25 07:57:29 +0000
@@ -63,8 +63,17 @@ public:
~NdbMgmd()
{
+ close();
+ }
+
+ void close(void)
+ {
if (m_handle)
+ {
+ ndb_mgm_disconnect_quiet(m_handle);
ndb_mgm_destroy_handle(&m_handle);
+ m_handle = NULL;
+ }
}
NdbMgmHandle handle(void) const {
@@ -137,6 +146,18 @@ public:
return false;
}
+ // Handshake with the server to make sure it's really there
+ int major, minor, build;
+ char buf[16];
+ if (ndb_mgm_get_version(m_handle, &major, &minor, &build,
+ sizeof(buf), buf) != 1)
+ {
+ error("connect: ndb_get_version failed");
+ return false;
+ }
+ printf("connected to ndb_mgmd version %d.%d.%d\n",
+ major, minor, build);
+
if ((m_nodeid = ndb_mgm_get_mgmd_nodeid(m_handle)) == 0){
error("connect: could not get nodeid of connected mgmd");
return false;
=== modified file 'storage/ndb/test/include/NdbProcess.hpp'
--- a/storage/ndb/test/include/NdbProcess.hpp 2009-09-16 12:53:49 +0000
+++ b/storage/ndb/test/include/NdbProcess.hpp 2009-09-25 07:57:29 +0000
@@ -21,6 +21,8 @@
#ifndef NDB_PROCESS_HPP
#define NDB_PROCESS_HPP
+#include <portlib/NdbSleep.h>
+
class NdbProcess
{
#ifdef _WIN32
@@ -49,6 +51,13 @@ public:
m_args.push_back(str);
}
+ void add(const char* str, const char* str2)
+ {
+ BaseString tmp;
+ tmp.assfmt("%s%s", str, str2);
+ m_args.push_back(tmp);
+ }
+
void add(const char* str, int val)
{
BaseString tmp;
@@ -108,6 +117,57 @@ public:
printf("Stopped process %d\n", m_pid);
return true;
}
+
+ /*
+ Poll with waitpid(WNOHANG) until the process has terminated.
+ ret - out: exit status, terminating signal number, or 37 when
+ the wait status is neither of those
+ timeout - 0 polls once; otherwise up to timeout*10 retries are
+ made with a 10 ms sleep between them
+ Returns true when the process was reaped, otherwise false
+ */
+ bool wait(int& ret, int timeout = 0)
+ {
+ int retries = 0;
+ int status = 0; // waitpid leaves it untouched on error, but it is printed below
+ while (true)
+ {
+ pid_t ret_pid = waitpid(m_pid, &status, WNOHANG);
+ if (ret_pid == -1)
+ {
+ fprintf(stderr,
+ "Error occured when waiting for process %d, ret: %d, errno: %d\n",
+ m_pid, status, errno);
+ return false;
+ }
+
+ if (ret_pid == m_pid)
+ {
+ if (WIFEXITED(status))
+ ret = WEXITSTATUS(status);
+ else if (WIFSIGNALED(status))
+ ret = WTERMSIG(status);
+ else
+ ret = 37; // Unknown exit status
+
+ printf("Got process %d, status: %d, ret: %d\n", m_pid, status, ret);
+ return true;
+ }
+
+ if (timeout == 0)
+ return false;
+
+ if (retries++ > timeout*10)
+ {
+ fprintf(stderr,
+ "Timeout when waiting for process %d\n", m_pid);
+ return false;
+ }
+ NdbSleep_MilliSleep(10);
+ }
+ assert(false); // Never reached
+ }
+
private:
NdbProcess(BaseString name) :
=== modified file 'storage/ndb/test/ndbapi/ConfigFactory.hpp'
--- a/storage/ndb/test/ndbapi/ConfigFactory.hpp 2009-09-16 12:53:49 +0000
+++ b/storage/ndb/test/ndbapi/ConfigFactory.hpp 2009-09-25 07:57:29 +0000
@@ -61,6 +61,20 @@ struct ConfigFactory
}
static bool
+ put(Properties& config, const char* section, Uint32 section_no,
+ const char* key, Uint32 value)
+ {
+ Properties* p;
+ if (!config.getCopy(section, section_no, &p))
+ return false;
+ if (!p->put(key, value))
+ return false;
+ if (!config.put(section, section_no, p, true))
+ return false;
+ return true;
+ }
+
+ static bool
write_config_ini(Properties& config, const char* path)
{
FILE* config_file = fopen(path, "w");
=== modified file 'storage/ndb/test/ndbapi/testMgmd.cpp'
--- a/storage/ndb/test/ndbapi/testMgmd.cpp 2009-09-16 12:53:49 +0000
+++ b/storage/ndb/test/ndbapi/testMgmd.cpp 2009-09-25 07:57:29 +0000
@@ -111,13 +111,55 @@ public:
return (m_proc != NULL);
}
- bool start_from_config_ini(const char* working_dir)
+ bool start_from_config_ini(const char* working_dir,
+ const char* first_extra_arg = NULL, ...)
{
NdbProcess::Args args;
args.add("--configdir=.");
args.add("-f config.ini");
args.add("--ndb-nodeid=", m_nodeid);
args.add("--nodaemon");
+ args.add("--log-name=", name());
+ args.add("--verbose");
+
+ if (first_extra_arg)
+ {
+ // Append any extra args
+ va_list extra_args;
+ const char* str = first_extra_arg;
+ va_start(extra_args, first_extra_arg);
+ do
+ {
+ args.add(str);
+ } while ((str = va_arg(extra_args, const char*)) != NULL);
+ va_end(extra_args);
+ }
+
+ return start(working_dir, args);
+ }
+
+ bool start(const char* working_dir,
+ const char* first_extra_arg = NULL, ...)
+ {
+ NdbProcess::Args args;
+ args.add("--configdir=.");
+ args.add("--ndb-nodeid=", m_nodeid);
+ args.add("--nodaemon");
+ args.add("--log-name=", name());
+ args.add("--verbose");
+
+ if (first_extra_arg)
+ {
+ // Append any extra args
+ va_list extra_args;
+ const char* str = first_extra_arg;
+ va_start(extra_args, first_extra_arg);
+ do
+ {
+ args.add(str);
+ } while ((str = va_arg(extra_args, const char*)) != NULL);
+ va_end(extra_args);
+ }
return start(working_dir, args);
}
@@ -126,9 +168,8 @@ public:
{
g_info << "Stopping " << name() << endl;
- // Diconnect our "builtin" client
- // ??MASV if (m_mgmd_client.is_connected())
- m_mgmd_client.disconnect();
+ // Disconnect and close our "builtin" client
+ m_mgmd_client.close();
assert(m_proc);
if (!m_proc->stop())
@@ -142,6 +183,23 @@ public:
}
+ bool wait(int& ret, int timeout = 30)
+ {
+ g_info << "Waiting for " << name() << endl;
+
+ assert(m_proc);
+ if (!m_proc->wait(ret, timeout))
+ {
+ fprintf(stderr, "Failed to wait for process %s\n", name());
+ return false;
+ }
+ delete m_proc;
+ m_proc = 0;
+
+ return true;
+
+ }
+
bool connect(const Properties& config,
int num_retries = 30, int retry_delay_in_seconds = 1)
{
@@ -316,6 +374,189 @@ int runTestBasic2Mgm(NDBT_Context* ctx,
}
+int runTestBug45495(NDBT_Context* ctx, NDBT_Step* step)
+{
+ NDBT_Workingdir wd("test_mgmd"); // temporary working directory
+
+ g_err << "** Create config.ini" << endl;
+ Properties config = ConfigFactory::create(2);
+ CHECK(ConfigFactory::write_config_ini(config,
+ path(wd.path(),
+ "config.ini",
+ NULL).c_str()));
+ // Start ndb_mgmd(s)
+ MgmdProcessList mgmds;
+ for (int i = 1; i <= 2; i++)
+ {
+ Mgmd* mgmd = new Mgmd(i);
+ CHECK(mgmd->start_from_config_ini(wd.path()));
+ mgmds.push_back(mgmd);
+ }
+
+ // Connect the ndb_mgmd(s)
+ for (unsigned i = 0; i < mgmds.size(); i++)
+ CHECK(mgmds[i]->connect(config));
+
+ // wait for confirmed config
+ for (unsigned i = 0; i < mgmds.size(); i++)
+ CHECK(mgmds[i]->wait_confirmed_config());
+
+ // Check binary config files created
+ CHECK(file_exists(path(wd.path(),
+ "ndb_1_config.bin.1",
+ NULL).c_str()));
+ CHECK(file_exists(path(wd.path(),
+ "ndb_2_config.bin.1",
+ NULL).c_str()));
+
+ g_err << "** Restart one ndb_mgmd at a time --reload + --initial" << endl;
+ for (unsigned i = 0; i < mgmds.size(); i++)
+ {
+ CHECK(mgmds[i]->stop());
+ CHECK(mgmds[i]->start_from_config_ini(wd.path(),
+ "--reload", "--initial", NULL));
+ CHECK(mgmds[i]->connect(config));
+ CHECK(mgmds[i]->wait_confirmed_config());
+
+ // check ndb_X_config.bin.1 still exists but not ndb_X_config.bin.2
+ CHECK(file_exists(path(wd.path(),
+ "ndb_1_config.bin.1",
+ NULL).c_str()));
+ CHECK(file_exists(path(wd.path(),
+ "ndb_2_config.bin.1",
+ NULL).c_str()));
+
+ CHECK(!file_exists(path(wd.path(),
+ "ndb_1_config.bin.2",
+ NULL).c_str()));
+ CHECK(!file_exists(path(wd.path(),
+ "ndb_2_config.bin.2",
+ NULL).c_str()));
+ }
+
+ g_err << "** Restart one ndb_mgmd at a time --initial" << endl;
+ for (unsigned i = 0; i < mgmds.size(); i++)
+ {
+ CHECK(mgmds[i]->stop());
+ CHECK(mgmds[i]->start_from_config_ini(wd.path(),
+ "--initial", NULL));
+ CHECK(mgmds[i]->connect(config));
+ CHECK(mgmds[i]->wait_confirmed_config());
+
+ // check ndb_X_config.bin.1 still exists but not ndb_X_config.bin.2
+ CHECK(file_exists(path(wd.path(),
+ "ndb_1_config.bin.1",
+ NULL).c_str()));
+ CHECK(file_exists(path(wd.path(),
+ "ndb_2_config.bin.1",
+ NULL).c_str()));
+
+ CHECK(!file_exists(path(wd.path(),
+ "ndb_1_config.bin.2",
+ NULL).c_str()));
+ CHECK(!file_exists(path(wd.path(),
+ "ndb_2_config.bin.2",
+ NULL).c_str()));
+ }
+
+ g_err << "** Create config2.ini" << endl;
+ CHECK(ConfigFactory::put(config, "ndb_mgmd", 1, "ArbitrationDelay", 100));
+ CHECK(ConfigFactory::write_config_ini(config,
+ path(wd.path(),
+ "config2.ini",
+ NULL).c_str()));
+
+ g_err << "** Restart one ndb_mgmd at a time --initial should not work" << endl;
+ for (unsigned i = 0; i < mgmds.size(); i++)
+ {
+ CHECK(mgmds[i]->stop());
+ // Start from config2.ini
+ CHECK(mgmds[i]->start_from_config_ini(wd.path(),
+ "-f config2.ini",
+ "--initial", NULL));
+
+ // Wait for mgmd to exit and check return status
+ int ret;
+ CHECK(mgmds[i]->wait(ret));
+ CHECK(ret == 1);
+
+ // check config files exist only for the still running mgmd(s)
+ for (unsigned j = 0; j < mgmds.size(); j++)
+ {
+ BaseString tmp;
+ tmp.assfmt("ndb_%d_config.bin.1", j+1);
+ CHECK(file_exists(path(wd.path(),
+ tmp.c_str(),
+ NULL).c_str()) == (j != i));
+ }
+
+ // Start from config.ini again
+ CHECK(mgmds[i]->start_from_config_ini(wd.path(),
+ "--initial",
+ "--reload",
+ NULL));
+ CHECK(mgmds[i]->connect(config));
+ CHECK(mgmds[i]->wait_confirmed_config());
+ }
+
+ g_err << "** Reload from config2.ini" << endl;
+ for (unsigned i = 0; i < mgmds.size(); i++)
+ {
+ CHECK(mgmds[i]->stop());
+ // Start from config2.ini
+ CHECK(mgmds[i]->start_from_config_ini(wd.path(),
+ "-f config2.ini",
+ "--reload", NULL));
+ CHECK(mgmds[i]->connect(config));
+ CHECK(mgmds[i]->wait_confirmed_config());
+
+ CHECK(file_exists(path(wd.path(),
+ "ndb_1_config.bin.1",
+ NULL).c_str()));
+ CHECK(file_exists(path(wd.path(),
+ "ndb_2_config.bin.1",
+ NULL).c_str()));
+
+ CHECK(file_exists(path(wd.path(),
+ "ndb_1_config.bin.2",
+ NULL).c_str()));
+ CHECK(file_exists(path(wd.path(),
+ "ndb_2_config.bin.2",
+ NULL).c_str()));
+ }
+
+ g_err << "** Reload mgmd initial(from generation=2)" << endl;
+ for (unsigned i = 0; i < mgmds.size(); i++)
+ {
+ CHECK(mgmds[i]->stop());
+ CHECK(mgmds[i]->start_from_config_ini(wd.path(),
+ "-f config2.ini",
+ "--reload", "--initial", NULL));
+ CHECK(mgmds[i]->connect(config));
+ CHECK(mgmds[i]->wait_confirmed_config());
+
+ // check config files exist
+ for (unsigned j = 0; j < mgmds.size(); j++)
+ {
+ BaseString tmp;
+ tmp.assfmt("ndb_%d_config.bin.1", j+1);
+ CHECK(file_exists(path(wd.path(),
+ tmp.c_str(),
+ NULL).c_str()) == (i < j));
+
+ tmp.assfmt("ndb_%d_config.bin.2", j+1);
+ CHECK(file_exists(path(wd.path(),
+ tmp.c_str(),
+ NULL).c_str()));
+ }
+ }
+
+ return NDBT_OK;
+
+}
+
+
+
NDBT_TESTSUITE(testMgmd);
DRIVER(DummyDriver); /* turn off use of NdbApi */
@@ -324,6 +565,13 @@ TESTCASE("Basic2Mgm",
{
INITIALIZER(runTestBasic2Mgm);
}
+
+TESTCASE("Bug45495",
+ "Test that mgmd can be restarted in any order")
+{
+ INITIALIZER(runTestBug45495);
+}
+
NDBT_TESTSUITE_END(testMgmd);
int main(int argc, const char** argv)
Attachment: [text/bzr-bundle] bzr/magnus.blaudd@sun.com-20090925075729-989220mv4tjqopu0.bundle
| Thread |
|---|
| • bzr commit into mysql-5.1-telco-7.0 branch (magnus.blaudd:3009)Bug#45495 | Magnus Blåudd | 25 Sep |