Below is the list of changes that have just been committed into a local
5.1 repository of rafal. When rafal does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html
ChangeSet@stripped, 2006-12-08 21:28:27+01:00, rafal@quant.(none) +4 -0
temporrary commit, will be documented later. Please ignore.
sql/log_event.cc@stripped, 2006-12-08 21:28:22+01:00, rafal@quant.(none) +47 -1
temporrary commit, will be documented later. Please ignore.
sql/log_event.h@stripped, 2006-12-08 21:28:23+01:00, rafal@quant.(none) +7 -0
temporrary commit, will be documented later. Please ignore.
sql/rpl_rli.h@stripped, 2006-12-08 21:28:23+01:00, rafal@quant.(none) +13 -1
temporrary commit, will be documented later. Please ignore.
sql/slave.cc@stripped, 2006-12-08 21:28:23+01:00, rafal@quant.(none) +388 -247
temporrary commit, will be documented later. Please ignore.
# This is a BitKeeper patch. What follows are the unified diffs for the
# set of deltas contained in the patch. The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User: rafal
# Host: quant.(none)
# Root: /ext/mysql/bk/mysql-5.1-bug21448
--- 1.246/sql/log_event.cc 2006-12-08 21:28:36 +01:00
+++ 1.247/sql/log_event.cc 2006-12-08 21:28:36 +01:00
@@ -831,7 +831,11 @@
LOG_EVENT_MINIMAL_HEADER_LEN);
LOCK_MUTEX;
- DBUG_PRINT("info", ("my_b_tell=%lu", my_b_tell(file)));
+
+ my_off_t ev_start= my_b_tell(file);
+
+ DBUG_PRINT("info", ("my_b_tell=%lu", ev_start));
+
if (my_b_read(file, (byte *) head, header_size))
{
DBUG_PRINT("info", ("Log_event::read_log_event(IO_CACHE*,Format_desc*) \
@@ -879,7 +883,10 @@
goto err;
}
if ((res= read_log_event(buf, data_len, &error, description_event)))
+ {
res->register_temp_buf(buf);
+ res->at= ev_start;
+ }
err:
UNLOCK_MUTEX;
@@ -5315,6 +5322,45 @@
m_cols.bitmap= 0; // to not free it
}
#endif
+
+/**
+ @short Construct row event by decoding data sent in a packet
+
+ @par Content of the event packet:
+
+ - table_id : Id of the table into which the row should be written (as established by
+ Table_map event).
+
+ - flags : ???
+
+ - width : width of the following bitmap.
+
+ - column bitmap : for which of the colums there is data in the event.
+
+ - row data : values for columns set in the bitmap in the format used by handler
interface.
+
+ @par Format of the packet:
+
+ @verbatim
+
+ | common header |
+ +=================+
+ | table_id | (4 or 6 bytes) }
+ +-----------------+ } Rows_log_event header
+ | flags | (2 bytes) }
+ +=================+
+ | width | (coded integer) }
+ +-----------------+ } variable length
+ | column bitmap | (at least 1 byte) } part
+ +-----------------+ }
+ | | }
+ | row data | }
+ | | }
+ +=================+
+
+ @endverbatim
+*/
+
Rows_log_event::Rows_log_event(const char *buf, uint event_len,
Log_event_type event_type,
--- 1.136/sql/log_event.h 2006-12-08 21:28:36 +01:00
+++ 1.137/sql/log_event.h 2006-12-08 21:28:36 +01:00
@@ -568,6 +568,13 @@
occur), except the COMMIT query which has its real offset.
*/
my_off_t log_pos;
+
+ /*
+ Position of the event itself inside the log from which it was
+ read (either binlog or relay log).
+ */
+ my_off_t at;
+
/*
A temp buffer for read_log_event; it is later analysed according to the
event's type, and its content is distributed in the event-specific fields.
--- 1.288/sql/slave.cc 2006-12-08 21:28:36 +01:00
+++ 1.289/sql/slave.cc 2006-12-08 21:28:36 +01:00
@@ -475,6 +475,9 @@
notify_group_relay_log_name_update();
+ // reset trans_retries counter for the next transaction
+ trans_retries= 0;
+
/*
If the slave does not support transactions and replicates a transaction,
users should not trust group_master_log_pos (which they can display with
@@ -2985,14 +2988,19 @@
DBUG_RETURN(0);
}
-/*
- Check if the current error is of temporary nature of not.
+/**
+ Check if the current error is of temporary nature or not.
Some errors are temporary in nature, such as
ER_LOCK_DEADLOCK and ER_LOCK_WAIT_TIMEOUT. Ndb also signals
that the error is temporary by pushing a warning with the error code
ER_GET_TEMPORARY_ERRMSG, if the originating error is temporary.
+
+ @args
+
+ - thd : THD structure used for executing event
+ - error : error code returned by exec_event()
*/
-static int has_temporary_error(THD *thd)
+static int has_temporary_error(THD *thd, int error)
{
DBUG_ENTER("has_temporary_error");
@@ -3029,215 +3037,81 @@
DBUG_RETURN(0);
}
-static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli)
-{
- DBUG_ENTER("exec_relay_log_event");
- /*
- We acquire this mutex since we need it for all operations except
- event execution. But we will release it in places where we will
- wait for something for example inside of next_event().
- */
- pthread_mutex_lock(&rli->data_lock);
- /*
- This tests if the position of the end of the last previous executed event
- hits the UNTIL barrier.
- We would prefer to test if the position of the start (or possibly) end of
- the to-be-read event hits the UNTIL barrier, this is different if there
- was an event ignored by the I/O thread just before (BUG#13861 to be
- fixed).
- */
- if (rli->until_condition!=RELAY_LOG_INFO::UNTIL_NONE &&
- rli->is_until_satisfied())
- {
- char buf[22];
- sql_print_information("Slave SQL thread stopped because it reached its"
- " UNTIL position %s", llstr(rli->until_pos(), buf));
- /*
- Setting abort_slave flag because we do not want additional message about
- error in query execution to be printed.
- */
- rli->abort_slave= 1;
- pthread_mutex_unlock(&rli->data_lock);
- DBUG_RETURN(1);
- }
+static inline
+bool skip_event(Log_event *ev, bool explicit_skip )
+{
+ uint type_code= ev->get_type_code(); // TODO: get_type_code() should be const.
+
+ return ( ( ev->server_id == (uint32) ::server_id &&
+ !replicate_same_server_id &&
+ type_code != FORMAT_DESCRIPTION_EVENT ) ||
+ ( explicit_skip &&
+ type_code != ROTATE_EVENT &&
+ type_code != STOP_EVENT &&
+ type_code != START_EVENT_V3 &&
+ type_code!= FORMAT_DESCRIPTION_EVENT) );
+}
+
+static inline
+bool internal_event(Log_event *ev)
+{
+ uint type_code= ev->get_type_code();
+
+ return ( type_code == ROTATE_EVENT || type_code == STOP_EVENT ||
+ type_code == START_EVENT_V3 || type_code == FORMAT_DESCRIPTION_EVENT );
+}
+
+/**
+ Execute a replication event.
+
+ @returns
+
+ - OK : Event was executed without any problems.
+
+ - RETRY : A transient error was detected during execution and the current
+ event group should be re-executed.
- Log_event * ev = next_event(rli);
+ - EXIT : Replication should be terminated.
- DBUG_ASSERT(rli->sql_thd==thd);
+*/
- if (sql_slave_killed(thd,rli))
- {
- pthread_mutex_unlock(&rli->data_lock);
- delete ev;
- DBUG_RETURN(1);
- }
- if (ev)
- {
- int type_code = ev->get_type_code();
- int exec_res;
+enum exec_ev_result { OK=0, RETRY, EXIT };
- /*
- Queries originating from this server must be skipped.
- Low-level events (Format_desc, Rotate, Stop) from this server
- must also be skipped. But for those we don't want to modify
- group_master_log_pos, because these events did not exist on the master.
- Format_desc is not completely skipped.
- Skip queries specified by the user in slave_skip_counter.
- We can't however skip events that has something to do with the
- log files themselves.
- Filtering on own server id is extremely important, to ignore execution of
- events created by the creation/rotation of the relay log (remember that
- now the relay log starts with its Format_desc, has a Rotate etc).
- */
-
- DBUG_PRINT("info",("type_code=%d, server_id=%d",type_code,ev->server_id));
-
- if ((ev->server_id == (uint32) ::server_id &&
- !replicate_same_server_id &&
- type_code != FORMAT_DESCRIPTION_EVENT) ||
- (rli->slave_skip_counter &&
- type_code != ROTATE_EVENT && type_code != STOP_EVENT &&
- type_code != START_EVENT_V3 && type_code!= FORMAT_DESCRIPTION_EVENT))
- {
- DBUG_PRINT("info", ("event skipped"));
- /*
- We only skip the event here and do not increase the group log
- position. In the event that we have to restart, this means
- that we might have to skip the event again, but that is a
- minor issue.
-
- If we were to increase the group log position when skipping an
- event, it might be that we are restarting at the wrong
- position and have events before that we should have executed,
- so not increasing the group log position is a sure bet in this
- case.
+static
+enum exec_ev_result
+exec_relay_log_event(THD* thd, Log_event *ev, RELAY_LOG_INFO* rli, int *error)
+{
- In this way, we just step the group log position when we
- *know* that we are at the end of a group.
- */
- rli->inc_event_relay_log_pos();
+ if( !ev ) return OK; // nothing to do
+
+ DBUG_ENTER("exec_relay_log_event");
- /*
- Protect against common user error of setting the counter to 1
- instead of 2 while recovering from an insert which used auto_increment,
- rand or user var.
- */
- if (rli->slave_skip_counter &&
- !((type_code == INTVAR_EVENT ||
- type_code == RAND_EVENT ||
- type_code == USER_VAR_EVENT) &&
- rli->slave_skip_counter == 1) &&
- /*
- The events from ourselves which have something to do with the relay
- log itself must be skipped, true, but they mustn't decrement
- rli->slave_skip_counter, because the user is supposed to not see
- these events (they are not in the master's binlog) and if we
- decremented, START SLAVE would for example decrement when it sees
- the Rotate, so the event which the user probably wanted to skip
- would not be skipped.
- */
- !(ev->server_id == (uint32) ::server_id &&
- (type_code == ROTATE_EVENT || type_code == STOP_EVENT ||
- type_code == START_EVENT_V3 || type_code == FORMAT_DESCRIPTION_EVENT)))
- --rli->slave_skip_counter;
- pthread_mutex_unlock(&rli->data_lock);
- delete ev;
- DBUG_RETURN(0); // avoid infinite update loops
- }
- pthread_mutex_unlock(&rli->data_lock);
- thd->server_id = ev->server_id; // use the original server id for logging
- thd->set_time(); // time the query
- thd->lex->current_select= 0;
- if (!ev->when)
+ thd->server_id = ev->server_id; // use the original server id for logging
+ thd->set_time(); // time the query
+ thd->lex->current_select= 0;
+
+ if (!ev->when)
ev->when = time(NULL);
- ev->thd = thd; // because up to this point, ev->thd == 0
- exec_res = ev->exec_event(rli);
- DBUG_PRINT("info", ("exec_event result = %d", exec_res));
- DBUG_ASSERT(rli->sql_thd==thd);
- /*
- Format_description_log_event should not be deleted because it will be
- used to read info about the relay log's format; it will be deleted when
- the SQL thread does not need it, i.e. when this thread terminates.
- */
- if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
- {
- DBUG_PRINT("info", ("Deleting the event after it has been executed"));
- delete ev;
- }
- if (slave_trans_retries)
- {
- if (exec_res && has_temporary_error(thd))
- {
- const char *errmsg;
- /*
- We were in a transaction which has been rolled back because of a
- temporary error;
- let's seek back to BEGIN log event and retry it all again.
- We have to not only seek but also
- a) init_master_info(), to seek back to hot relay log's start for later
- (for when we will come back to this hot log after re-processing the
- possibly existing old logs where BEGIN is: check_binlog_magic() will
- then need the cache to be at position 0 (see comments at beginning of
- init_master_info()).
- b) init_relay_log_pos(), because the BEGIN may be an older relay log.
- */
- if (rli->trans_retries < slave_trans_retries)
- {
- if (init_master_info(rli->mi, 0, 0, 0, SLAVE_SQL))
- sql_print_error("Failed to initialize the master info structure");
- else if (init_relay_log_pos(rli,
- rli->group_relay_log_name,
- rli->group_relay_log_pos,
- 1, &errmsg, 1))
- sql_print_error("Error initializing relay log position: %s",
- errmsg);
- else
- {
- exec_res= 0;
- end_trans(thd, ROLLBACK);
- /* chance for concurrent connection to get more locks */
- safe_sleep(thd, min(rli->trans_retries, MAX_SLAVE_RETRY_PAUSE),
- (CHECK_KILLED_FUNC)sql_slave_killed, (void*)rli);
- pthread_mutex_lock(&rli->data_lock); // because of SHOW STATUS
- rli->trans_retries++;
- rli->retried_trans++;
- pthread_mutex_unlock(&rli->data_lock);
- DBUG_PRINT("info", ("Slave retries transaction "
- "rli->trans_retries: %lu", rli->trans_retries));
- }
- }
- else
- sql_print_error("Slave SQL thread retried transaction %lu time(s) "
- "in vain, giving up. Consider raising the value of "
- "the slave_transaction_retries variable.",
- slave_trans_retries);
- }
- else if (!((thd->options & OPTION_BEGIN) && opt_using_transactions))
- {
- /*
- Only reset the retry counter if the event succeeded or
- failed with a non-transient error. On a successful event,
- the execution will proceed as usual; in the case of a
- non-transient error, the slave will stop with an error.
- */
- rli->trans_retries= 0; // restart from fresh
- }
- }
- DBUG_RETURN(exec_res);
- }
- pthread_mutex_unlock(&rli->data_lock);
- slave_print_msg(ERROR_LEVEL, rli, 0, "\
-Could not parse relay log event entry. The possible reasons are: the master's \
-binary log is corrupted (you can check this by running 'mysqlbinlog' on the \
-binary log), the slave's relay log is corrupted (you can check this by running \
-'mysqlbinlog' on the relay log), a network problem, or a bug in the master's \
-or slave's MySQL code. If you want to check the master's binary log or slave's \
-relay log, you will be able to know their names by issuing 'SHOW SLAVE STATUS' \
-on this slave.\
-");
- DBUG_RETURN(1);
+
+ ev->thd = thd; // because up to this point, ev->thd == 0
+
+ int exec_result= ev->exec_event(rli);
+
+ if( error )
+ *error= exec_result;
+
+ DBUG_PRINT("info", ("exec_event result = %d", exec_result));
+ DBUG_ASSERT(rli->sql_thd==thd);
+
+ if( !exec_result )
+ DBUG_RETURN(OK);
+
+ // We got error while executing event, leave trace in error log.
+
+ DBUG_RETURN( has_temporary_error(thd,exec_result) ? RETRY : EXIT );
+
}
@@ -3562,12 +3436,17 @@
}
+/*
+ TODO: Consistently use slave_print_msg() or sql_print_* functions
+ to report errors.
+*/
+
/* Slave SQL Thread entry point */
pthread_handler_t handle_slave_sql(void *arg)
{
THD *thd; /* needs to be first for thread_stack */
- char llbuff[22],llbuff1[22];
+ char llbuff[22],llbuff1[22], llbuff2[22];
RELAY_LOG_INFO* rli = &((MASTER_INFO*)arg)->rli;
const char *errmsg;
@@ -3591,6 +3470,7 @@
rli->slave_run_id++;
pthread_detach_this_thread();
+
if (init_slave_thread(thd, SLAVE_THD_SQL))
{
/*
@@ -3602,6 +3482,7 @@
sql_print_error("Failed during slave thread initialization");
goto err;
}
+
thd->init_for_queries();
rli->sql_thd= thd;
thd->temporary_tables = rli->save_temporary_tables; // restore temp tables
@@ -3645,11 +3526,13 @@
1 /*need data lock*/, &errmsg,
1 /*look for a description_event*/))
{
- sql_print_error("Error initializing relay log position: %s",
+ sql_print_error("Slave SQL thread: Error initializing relay log position: %s",
errmsg);
goto err;
}
+
THD_CHECK_SENTRY(thd);
+
#ifndef DBUG_OFF
{
char llbuf1[22], llbuf2[22];
@@ -3674,16 +3557,18 @@
#endif
}
#endif
+
DBUG_ASSERT(rli->sql_thd == thd);
DBUG_PRINT("master_info",("log_file_name: %s position: %s",
rli->group_master_log_name,
llstr(rli->group_master_log_pos,llbuff)));
+
if (global_system_variables.log_warnings)
- sql_print_information("Slave SQL thread initialized, starting replication in \
-log '%s' at position %s, relay log '%s' position: %s", RPL_LOG_NAME,
-
llstr(rli->group_master_log_pos,llbuff),rli->group_relay_log_name,
- llstr(rli->group_relay_log_pos,llbuff1));
+ sql_print_information("Slave SQL thread: Initialized, starting replication in "
+ "log '%s' at position %s, relay log '%s' position: %s",
+ RPL_LOG_NAME, llstr(rli->group_master_log_pos,llbuff),
+ rli->group_relay_log_name,
llstr(rli->group_relay_log_pos,llbuff1));
/* execute init_slave variable */
if (sys_init_slave.value_length)
@@ -3691,62 +3576,318 @@
execute_init_command(thd, &sys_init_slave, &LOCK_sys_init_slave);
if (thd->query_error)
{
- sql_print_error("\
-Slave SQL thread aborted. Can't execute init_slave query");
+ sql_print_error("Slave SQL thread aborted. Can't execute init_slave query");
goto err;
}
}
- /* Read queries from the IO/THREAD until this thread is killed */
+ // End of SQL thread initialization
- while (!sql_slave_killed(thd,rli))
- {
- thd->proc_info = "Reading event from the relay log";
- DBUG_ASSERT(rli->sql_thd == thd);
- THD_CHECK_SENTRY(thd);
- if (exec_relay_log_event(thd,rli))
+ DBUG_ASSERT(rli->sql_thd == thd);
+ THD_CHECK_SENTRY(thd);
+
+ /* Read queries from the IO/THREAD until this thread is killed */
+
+ { // group because of gotos crossing variable initialization
+
+ Log_event *ev=NULL; // pointer to current event
+ int error; // error code returned when executing event
+ enum exec_ev_result res; // for storing result of exec_relay_log_event()
below
+ bool have_rli_lock= FALSE; // true if rli->data_lock mutex has been
acquired
+ bool quit= FALSE; // true if the thread should terminate
+
+ while ( !quit )
{
- // do not scare the user if SQL thread was simply killed or stopped
- if (!sql_slave_killed(thd,rli))
+ thd->proc_info = "Reading event from the relay log";
+
+ /*
+ We acquire this mutex since we need it for all operations except
+ event execution. But we will release it in places where we will
+ wait for something for example inside of next_event().
+ */
+ pthread_mutex_lock(&rli->data_lock);
+ have_rli_lock= TRUE;
+
+ /*
+ This tests if the position of the end of the last previous executed event
+ hits the UNTIL barrier.
+ We would prefer to test if the position of the start (or possibly) end of
+ the to-be-read event hits the UNTIL barrier, this is different if there
+ was an event ignored by the I/O thread just before (BUG#13861 to be
+ fixed).
+ */
+ if (rli->until_condition!=RELAY_LOG_INFO::UNTIL_NONE &&
+ rli->is_until_satisfied())
{
+ sql_print_information("Slave SQL thread: Stopped because it reached its "
+ "UNTIL position %s", llstr(rli->until_pos(), buf1));
/*
- retrieve as much info as possible from the thd and, error codes and warnings
- and print this to the error log as to allow the user to locate the error
+ Setting abort_slave flag because we do not want additional message about
+ error in query execution to be printed.
*/
- if (thd->net.last_errno != 0)
+ rli->abort_slave= 1;
+ goto break_loop;
+ }
+
+ // Get next event from the relay log
+
+ ev= next_event(rli);
+
+ DBUG_ASSERT(rli->sql_thd==thd);
+
+ // Thread could be requested to terminate while waiting for next event.
+ if (sql_slave_killed(thd,rli))
+ {
+ // TODO: Give an informative message
+ goto break_loop;
+ }
+
+ if( !ev )
+ {
+ slave_print_msg(ERROR_LEVEL, rli, 0,
+ "Could not parse relay log event entry. The possible reasons are: the master's
"
+ "binary log is corrupted (you can check this by running 'mysqlbinlog' on the "
+ "binary log), the slave's relay log is corrupted (you can check this by running
"
+ "'mysqlbinlog' on the relay log), a network problem, or a bug in the master's "
+ "or slave's MySQL code. If you want to check the master's binary log or slave's
"
+ "relay log, you will be able to know their names by issuing 'SHOW SLAVE STATUS'
"
+ "on this slave.");
+ goto break_loop;
+ }
+
+ // We got next event, show it in the debug trace.
+
+ DBUG_PRINT("rpl event",("at relay: " RLI_POS_FORMAT
+ ", relay group: " RLI_GROUP_FORMAT
+ ", master group: " RLI_MASTER_POS_FORMAT
+ ", type=%d (%s), server_id=%d.",
+ RLI_POS(rli,llbuff), RLI_GROUP_POS(rli,llbuff1),
RLI_MASTER_POS(rli,llbuff2),
+ ev->get_type_code(), ev->get_type_str(), ev->server_id
));
+
+ // Event filtering
+
+ if( skip_event(ev, rli->slave_skip_counter > 0 ) )
+ {
+ int type_code= ev->get_type_code();
+
+ // Decrease slave_skip_counter if active
+
+ if ( rli->slave_skip_counter > 0
+ &&
+ /*
+ The events from ourselves which have something to do with the relay
+ log itself must be skipped, true, but they mustn't decrement
+ rli->slave_skip_counter, because the user is supposed to not see
+ these events (they are not in the master's binlog) and if we
+ decremented, START SLAVE would for example decrement when it sees
+ the Rotate, so the event which the user probably wanted to skip
+ would not be skipped.
+ */
+ !( ev->server_id == (uint32) ::server_id && internal_event(ev) )
+ &&
+ /*
+ Protect against common user error of setting the counter to 1
+ instead of 2 while recovering from an insert which used auto_increment,
+ rand or user var.
+ */
+ !( ( type_code == INTVAR_EVENT ||
+ type_code == RAND_EVENT ||
+ type_code == USER_VAR_EVENT) && rli->slave_skip_counter ==
1 )
+ )
+ --rli->slave_skip_counter;
+
+ // Advance relay log position (as we don't execute this event)
+ /*
+ We only skip the event here and do not increase the group log
+ position. In the event that we have to restart, this means
+ that we might have to skip the event again, but that is a
+ minor issue.
+
+ If we were to increase the group log position when skipping an
+ event, it might be that we are restarting at the wrong
+ position and have events before that we should have executed,
+ so not increasing the group log position is a sure bet in this
+ case.
+
+ In this way, we just step the group log position when we
+ *know* that we are at the end of a group.
+ */
+ rli->inc_event_relay_log_pos();
+
+ goto continue_loop;
+ }
+
+ pthread_mutex_unlock(&rli->data_lock);
+ have_rli_lock= FALSE;
+
+ // Now we execute the event
+ thd->proc_info = "Executing event from the relay log";
+
+ res= exec_relay_log_event(thd,ev,rli,&error);
+
+ DBUG_PRINT("info",("exec_relay_log_event() returns %d",res));
+
+ if( error )
+ {
+ struct tm ts;
+
+ localtime_r(&(ev->when), &ts);
+
+ sql_print_warning("Slave SQL thread: Error %d while executing '%s' event. "
+ "Event position is " RLI_POS_FORMAT
+ ", group pos " RLI_GROUP_FORMAT
+ ", master group " RLI_MASTER_FORMAT
+ ", type_code=%d, timestamp=%02d:%02d:%02d"
+ error,
+ ev->get_type_str(),
+ RLI_POS(rli,llbuff),
+ RLI_GROUP(rli,llbuff),
+ RLI_MASTER_GROUP(rli,llbuff),
+ ev->get_type_code(),
+ ts.tm_hour, ts.tm_min, ts.tm_sec );
+ }
+
+ switch( res )
+ {
+ case OK: // successfull execution
{
- if (rli->last_slave_errno == 0)
+ goto continue_loop;
+ }
+
+ case RETRY: // current transaction should be retried
+
+ if( slave_trans_retries > 0 ) // if slave_trans_retries not set, treat it as
error.
+ {
+ if (rli->trans_retries >= slave_trans_retries)
{
- slave_print_msg(ERROR_LEVEL, rli, thd->net.last_errno,
- thd->net.last_error ?
- thd->net.last_error : "<no message>");
+ sql_print_error("Slave SQL thread: Retried transaction %lu time(s) "
+ "in vain, giving up. Consider raising the value of "
+ "the slave_transaction_retries variable.",
+ slave_trans_retries);
+ goto break_loop;
}
- else if (rli->last_slave_errno != (int) thd->net.last_errno)
+
+ /*
+ We want to retry current transaction because of a temporary error.
+ We will rollback it and rewind the relay log to the position where
+ transaction started. We have also to do the following:
+
+ a) init_master_info(), to seek back to hot relay log's start for later
+ (for when we will come back to this hot log after re-processing the
+ possibly existing old logs where BEGIN is: check_binlog_magic() will
+ then need the cache to be at position 0 (see comments at beginning of
+ init_master_info()).
+
+ b) init_relay_log_pos(), because the BEGIN may be an older relay log.
+ */
+
+ end_trans(thd, ROLLBACK); // moved here from after relay log rewind
+
+ if (init_master_info(rli->mi, 0, 0, 0, SLAVE_SQL))
+ {
+ sql_print_error("Slave SQL thread: transaction retry: "
+ "Failed to initialize the master info structure");
+ goto break_loop;
+ }
+
+ if (init_relay_log_pos(rli,
+ rli->group_relay_log_name,
+ rli->group_relay_log_pos,
+ 1, &errmsg, 1))
{
- sql_print_error("Slave (additional info): %s Error_code: %d",
- thd->net.last_error ?
- thd->net.last_error : "<no message>",
- thd->net.last_errno);
+ sql_print_error("Slave SQL thread: transaction retry: "
+ "Error initializing relay log position: %s", errmsg);
+ goto break_loop;
}
+
+ safe_sleep(thd, min(rli->trans_retries, MAX_SLAVE_RETRY_PAUSE),
+ (CHECK_KILLED_FUNC)sql_slave_killed, (void*)rli);
+
+ pthread_mutex_lock(&rli->data_lock); // because of SHOW STATUS
+ // Note: trans_retries is reset to 0 inside rli->inc_group_relay_pos
+ rli->trans_retries++;
+ rli->retried_trans++;
+ pthread_mutex_unlock(&rli->data_lock);
+
+ DBUG_PRINT("info", ("Slave SQL thread: Retries transaction, "
+ "rli->trans_retries: %lu", rli->trans_retries));
+
+ goto continue_loop;
}
-
- /* Print any warnings issued */
- List_iterator_fast<MYSQL_ERROR> it(thd->warn_list);
- MYSQL_ERROR *err;
- while ((err= it++))
- sql_print_warning("Slave: %s Error_code: %d",err->msg, err->code);
-
- sql_print_error("\
-Error running query, slave SQL thread aborted. Fix the problem, and restart \
-the slave SQL thread with \"SLAVE START\". We stopped at log \
-'%s' position %s", RPL_LOG_NAME, llstr(rli->group_master_log_pos, llbuff));
+
+ case EXIT: // SQL thread should terminate
+ default:
+ {
+ /*
+ retrieve as much info as possible from the thd and, error codes and warnings
+ and print this to the error log as to allow the user to locate the error
+ */
+ if (thd->net.last_errno != 0)
+ {
+ if (rli->last_slave_errno == 0)
+ {
+ slave_print_msg(ERROR_LEVEL, rli, thd->net.last_errno,
+ thd->net.last_error ?
+ thd->net.last_error : "<no message>");
+ }
+ else if (rli->last_slave_errno != (int) thd->net.last_errno)
+ {
+ sql_print_error("Slave SQL thread: Network error: %s Error_code: %d",
+ thd->net.last_error ?
+ thd->net.last_error : "<no message>",
+ thd->net.last_errno);
+ }
+ }
+
+ /* Print any warnings issued */
+ List_iterator_fast<MYSQL_ERROR> it(thd->warn_list);
+ MYSQL_ERROR *err;
+ while ((err= it++))
+ sql_print_warning("Slave SQL thread: %s Error_code: %d",
+ err->msg, err->code);
+
+ sql_print_error("Slave SQL thread: Abort due to error during replication event
execution. "
+ "Fix the problem, and restart the slave SQL thread with \"SLAVE
START\".");
+ goto break_loop;
+ }
+
+ } // switch( execute_event(...) )
+
+ goto continue_loop;
+
+ break_loop:
+
+ quit= TRUE;
+
+ continue_loop:
+
+ // release acquired resources
+
+ if( have_rli_lock )
+ pthread_mutex_unlock(&rli->data_lock);
+
+ /*
+ Format_description_log_event should not be deleted because it will be
+ used to read info about the relay log's format; it will be deleted when
+ the SQL thread does not need it, i.e. when this thread terminates.
+ (Can't see where it happens /Rafal)
+ */
+ if (ev && ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+ {
+ DBUG_PRINT("info", ("Deleting the event after it has been executed"));
+ delete ev;
}
- goto err;
- }
- }
+
+ ev= NULL;
+
+ } // while( !quit )
+
+ } // group enclosing event execution loop
+
+ thd->proc_info = "Exiting";
/* Thread stopped. Print the current replication position to the log */
- sql_print_information("Slave SQL thread exiting, replication stopped in log "
+ sql_print_information("Slave SQL thread: Exiting, replication stopped in log "
"'%s' at position %s",
RPL_LOG_NAME, llstr(rli->group_master_log_pos,llbuff));
--- 1.4/sql/rpl_rli.h 2006-12-08 21:28:36 +01:00
+++ 1.5/sql/rpl_rli.h 2006-12-08 21:28:36 +01:00
@@ -128,7 +128,7 @@
char group_relay_log_name[FN_REFLEN];
ulonglong group_relay_log_pos;
char event_relay_log_name[FN_REFLEN];
- ulonglong event_relay_log_pos;
+ ulonglong event_relay_log_pos; // so, is it before or after the current event?
ulonglong future_event_relay_log_pos;
/*
@@ -321,5 +321,17 @@
time_t unsafe_to_stop_at;
} RELAY_LOG_INFO;
+
+#define RLI_MASTER_POS(rli,buf) ((rli)->group_master_log_name[0] ? \
+ (rli)->group_master_log_name : "FIRST"), \
+ llstr((rli)->group_master_log_pos,(buf))
+
+#define RLI_GROUP_POS(rli,buf) (rli)->group_relay_log_name,
llstr((rli)->group_relay_log_pos,(buf))
+#define RLI_POS(rli,buf) (rli)->event_relay_log_name,
llstr((rli)->event_relay_log_pos,(buf))
+
+#define RLI_POS_FORMAT "%s:%s"
+#define RLI_GROUP_POS_FORMAT RLI_POS_FORMAT
+#define RLI_MASTER_POS_FORMAT RLI_POS_FORMAT
+
#endif /* RPL_RLI_H */
| Thread |
|---|
| • bk commit into 5.1 tree (rafal:1.2310) | rsomla | 8 Dec |