// rpl_mts_submode.cc -- MTS (multi-threaded slave) submode implementations.
/* Copyright (c) 2013, 2019, Oracle and/or its affiliates. All rights reserved.
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; version 2 of the License.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
|
|
|
|
#include "debug_sync.h"
|
|
#include "rpl_mts_submode.h"
|
|
|
|
#include "hash.h" // HASH
|
|
#include "log.h" // sql_print_information
|
|
#include "log_event.h" // Query_log_event
|
|
#include "rpl_rli.h" // Relay_log_info
|
|
#include "rpl_rli_pdb.h" // db_worker_hash_entry
|
|
#include "rpl_slave_commit_order_manager.h" // Commit_order_manager
|
|
#include "sql_class.h" // THD
|
|
|
|
|
|
/**
|
|
Does necessary arrangement before scheduling next event.
|
|
@param: Relay_log_info rli
|
|
@return: 1 if error
|
|
0 no error
|
|
*/
|
|
int
|
|
Mts_submode_database::schedule_next_event(Relay_log_info *rli, Log_event *ev)
|
|
{
|
|
/*nothing to do here*/
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
Logic to attach temporary tables.
|
|
@param: THD thd
|
|
Relay_log_info rli
|
|
Query_log_event ev
|
|
@return: void
|
|
*/
|
|
void
|
|
Mts_submode_database::attach_temp_tables(THD *thd, const Relay_log_info* rli,
|
|
Query_log_event* ev)
|
|
{
|
|
int i, parts;
|
|
DBUG_ENTER("Mts_submode_database::attach_temp_tables");
|
|
if (!is_mts_worker(thd) || (ev->ends_group() || ev->starts_group()))
|
|
DBUG_VOID_RETURN;
|
|
DBUG_ASSERT(!thd->temporary_tables);
|
|
// in over max-db:s case just one special partition is locked
|
|
parts= ((ev->mts_accessed_dbs == OVER_MAX_DBS_IN_EVENT_MTS) ? 1 :
|
|
ev->mts_accessed_dbs);
|
|
for (i= 0; i < parts; i++)
|
|
{
|
|
mts_move_temp_tables_to_thd(thd,
|
|
ev->mts_assigned_partitions[i]->temporary_tables);
|
|
ev->mts_assigned_partitions[i]->temporary_tables= NULL;
|
|
}
|
|
DBUG_VOID_RETURN;
|
|
}
|
|
|
|
/**
|
|
Function is called by Coordinator when it identified an event
|
|
requiring sequential execution.
|
|
Creating sequential context for the event includes waiting
|
|
for the assigned to Workers tasks to be completed and their
|
|
resources such as temporary tables be returned to Coordinator's
|
|
repository.
|
|
In case all workers are waited Coordinator changes its group status.
|
|
|
|
@param rli Relay_log_info instance of Coordinator
|
|
@param ignore Optional Worker instance pointer if the sequential context
|
|
is established due for the ignore Worker. Its resources
|
|
are to be retained.
|
|
|
|
@note Resources that are not occupied by Workers such as
|
|
a list of temporary tables held in unused (zero-usage) records
|
|
of APH are relocated to the Coordinator placeholder.
|
|
|
|
@return non-negative number of released by Workers partitions
|
|
(one partition by one Worker can count multiple times)
|
|
|
|
or -1 to indicate there has been a failure on a not-ignored Worker
|
|
as indicated by its running_status so synchronization can't succeed.
|
|
*/
|
|
|
|
int
|
|
Mts_submode_database::wait_for_workers_to_finish(Relay_log_info *rli,
|
|
Slave_worker *ignore)
|
|
{
|
|
uint ret= 0;
|
|
HASH *hash= &rli->mapping_db_to_worker;
|
|
THD *thd= rli->info_thd;
|
|
bool cant_sync= FALSE;
|
|
char llbuf[22];
|
|
|
|
DBUG_ENTER("Mts_submode_database::wait_for_workers_to_finish");
|
|
|
|
llstr(rli->get_event_relay_log_pos(), llbuf);
|
|
DBUG_PRINT("info", ("Coordinator and workers enter synchronization "
|
|
"procedure when scheduling event relay-log: %s "
|
|
"pos: %s", rli->get_event_relay_log_name(), llbuf));
|
|
|
|
for (uint i= 0, ret= 0; i < hash->records; i++)
|
|
{
|
|
db_worker_hash_entry *entry;
|
|
|
|
mysql_mutex_lock(&rli->slave_worker_hash_lock);
|
|
|
|
entry= (db_worker_hash_entry*) my_hash_element(hash, i);
|
|
|
|
DBUG_ASSERT(entry);
|
|
|
|
// the ignore Worker retains its active resources
|
|
if (ignore && entry->worker == ignore && entry->usage > 0)
|
|
{
|
|
mysql_mutex_unlock(&rli->slave_worker_hash_lock);
|
|
continue;
|
|
}
|
|
|
|
if (entry->usage > 0 && !thd->killed)
|
|
{
|
|
PSI_stage_info old_stage;
|
|
Slave_worker *w_entry= entry->worker;
|
|
|
|
entry->worker= NULL; // mark Worker to signal when usage drops to 0
|
|
thd->ENTER_COND(&rli->slave_worker_hash_cond,
|
|
&rli->slave_worker_hash_lock,
|
|
&stage_slave_waiting_worker_to_release_partition,
|
|
&old_stage);
|
|
do
|
|
{
|
|
mysql_cond_wait(&rli->slave_worker_hash_cond, &rli->slave_worker_hash_lock);
|
|
DBUG_PRINT("info",
|
|
("Either got awakened of notified: "
|
|
"entry %p, usage %lu, worker %lu",
|
|
entry, entry->usage, w_entry->id));
|
|
} while (entry->usage != 0 && !thd->killed);
|
|
entry->worker= w_entry; // restoring last association, needed only for assert
|
|
mysql_mutex_unlock(&rli->slave_worker_hash_lock);
|
|
thd->EXIT_COND(&old_stage);
|
|
ret++;
|
|
}
|
|
else
|
|
{
|
|
mysql_mutex_unlock(&rli->slave_worker_hash_lock);
|
|
}
|
|
// resources relocation
|
|
mts_move_temp_tables_to_thd(thd, entry->temporary_tables);
|
|
entry->temporary_tables= NULL;
|
|
if (entry->worker->running_status != Slave_worker::RUNNING)
|
|
cant_sync= TRUE;
|
|
}
|
|
|
|
if (!ignore)
|
|
{
|
|
DBUG_PRINT("info", ("Coordinator synchronized with workers, "
|
|
"waited entries: %d, cant_sync: %d",
|
|
ret, cant_sync));
|
|
|
|
rli->mts_group_status= Relay_log_info::MTS_NOT_IN_GROUP;
|
|
}
|
|
|
|
DBUG_RETURN(!cant_sync ? ret : -1);
|
|
}
|
|
|
|
/**
  Detaches the temporary tables from the worker thread upon
  event execution: each table in thd->temporary_tables is returned to
  the db partition entry whose database it belongs to, so a later
  event touching that database can re-attach it.

  @param thd  THD of the worker applying the event
  @param rli  Relay_log_info instance
  @param ev   Query_log_event that is being applied
*/
void
Mts_submode_database::detach_temp_tables(THD *thd, const Relay_log_info* rli,
                                         Query_log_event *ev)
{
  int i, parts;
  DBUG_ENTER("Mts_submode_database::detach_temp_tables");
  if (!is_mts_worker(thd))
    DBUG_VOID_RETURN;
  // Over-max-dbs events use the single special partition (index 0).
  parts= ((ev->mts_accessed_dbs == OVER_MAX_DBS_IN_EVENT_MTS) ?
          1 : ev->mts_accessed_dbs);
  /*
    todo: optimize for a case of

    a. one db
       Only detaching temporary_tables from thd to entry would require
       instead of the double-loop below.

    b. unchanged thd->temporary_tables.
       In such case the involved entries would continue to hold the
       unmodified lists provided that the attach_ method does not
       destroy references to them.
  */
  // Reset per-partition lists; tables are redistributed one by one below.
  for (i= 0; i < parts; i++)
  {
    ev->mts_assigned_partitions[i]->temporary_tables= NULL;
  }
  // Walk the worker's temp-table list; advance happens inside the body.
  for (TABLE *table= thd->temporary_tables; table;)
  {
    int i;
    char *db_name= NULL;

    // find which entry to go
    for (i= 0; i < parts; i++)
    {
      db_name= ev->mts_accessed_db_names[i];
      // An empty name is a catch-all (e.g. the over-max-dbs partition).
      if (!strlen(db_name))
        break;
      // Only default database is rewritten.
      if (!rpl_filter->is_rewrite_empty() && !strcmp(ev->get_db(), db_name))
      {
        size_t dummy_len;
        const char *db_filtered= rpl_filter->get_rewrite_db(db_name, &dummy_len);
        // db_name != db_filtered means that db_name is rewritten.
        if (strcmp(db_name, db_filtered))
          db_name= (char*)db_filtered;
      }
      // Names are ordered, so a lexicographically smaller candidate
      // means "keep scanning"; see the rewrite caveat below.
      if (strcmp(table->s->db.str, db_name) < 0)
        continue;
      else
      {
        // When rewrite db rules are used we can not rely on
        // mts_accessed_db_names elements order.
        if (!rpl_filter->is_rewrite_empty() &&
            strcmp(table->s->db.str, db_name))
          continue;
        else
          break;
      }
    }
    // Either the matching db was found, or we fell onto a catch-all entry.
    DBUG_ASSERT(db_name && (
                !strcmp(table->s->db.str, db_name) ||
                !strlen(db_name))
                );
    DBUG_ASSERT(i < ev->mts_accessed_dbs);
    // table pointer is shifted inside the function
    table= mts_move_temp_table_to_entry(table, thd, ev->mts_assigned_partitions[i]);
  }

  DBUG_ASSERT(!thd->temporary_tables);
#ifndef DBUG_OFF
  // Each partition list, when non-empty, must start at its head node.
  for (int i= 0; i < parts; i++)
  {
    DBUG_ASSERT(!ev->mts_assigned_partitions[i]->temporary_tables ||
                !ev->mts_assigned_partitions[i]->temporary_tables->prev);
  }
#endif
  DBUG_VOID_RETURN;
}
|
|
|
|
/**
|
|
Logic to get least occupied worker when the sql mts_submode= database
|
|
@param
|
|
rli relay log info of coordinator
|
|
ws arrayy of worker threads
|
|
ev event for which we are searching for a worker.
|
|
@return slave worker thread
|
|
*/
|
|
Slave_worker *
|
|
Mts_submode_database::get_least_occupied_worker(Relay_log_info *rli,
|
|
Slave_worker_array *ws,
|
|
Log_event *ev)
|
|
{
|
|
long usage= LONG_MAX;
|
|
Slave_worker **ptr_current_worker= NULL, *worker= NULL;
|
|
|
|
DBUG_ENTER("Mts_submode_database::get_least_occupied_worker");
|
|
|
|
#ifndef DBUG_OFF
|
|
|
|
if (DBUG_EVALUATE_IF("mts_distribute_round_robin", 1, 0))
|
|
{
|
|
worker= ws->at(w_rr % ws->size());
|
|
sql_print_information("Chosing worker id %lu, the following is"
|
|
" going to be %lu", worker->id,
|
|
static_cast<ulong>(w_rr % ws->size()));
|
|
DBUG_ASSERT(worker != NULL);
|
|
DBUG_RETURN(worker);
|
|
}
|
|
#endif
|
|
|
|
for (Slave_worker **it= ws->begin(); it != ws->end(); ++it)
|
|
{
|
|
ptr_current_worker= it;
|
|
if ((*ptr_current_worker)->usage_partition <= usage)
|
|
{
|
|
worker= *ptr_current_worker;
|
|
usage= (*ptr_current_worker)->usage_partition;
|
|
}
|
|
}
|
|
DBUG_ASSERT(worker != NULL);
|
|
DBUG_RETURN(worker);
|
|
}
|
|
|
|
/* MTS submode master Default constructor */
|
|
Mts_submode_logical_clock::Mts_submode_logical_clock()
|
|
{
|
|
type= MTS_PARALLEL_TYPE_LOGICAL_CLOCK;
|
|
first_event= true;
|
|
force_new_group= false;
|
|
is_new_group= true;
|
|
delegated_jobs = 0;
|
|
jobs_done= 0;
|
|
last_lwm_timestamp= SEQ_UNINIT;
|
|
last_lwm_index= INDEX_UNDEF;
|
|
is_error= false;
|
|
min_waited_timestamp= SEQ_UNINIT;
|
|
last_committed= SEQ_UNINIT;
|
|
sequence_number= SEQ_UNINIT;
|
|
}
|
|
|
|
/**
|
|
The method finds the minimum logical timestamp (low-water-mark) of
|
|
committed transactions.
|
|
The successful search results in a pair of a logical timestamp value and a GAQ
|
|
index that contains it. last_lwm_timestamp may still be raised though
|
|
the search does not find any satisfying running index.
|
|
Search is implemented as headway scanning of GAQ from a point of a
|
|
previous search's stop position (last_lwm_index).
|
|
Whether the cached (memorized) index value is considered to be stale
|
|
when its timestamp gets less than the current "stable" LWM:
|
|
|
|
last_lwm_timestamp <= GAQ.lwm.sequence_number (*)
|
|
|
|
Staleness is caused by GAQ garbage collection that increments the rhs of (*),
|
|
see ::move_queue_head(). When that's diagnozed, the search in GAQ needs
|
|
restarting from the queue tail.
|
|
|
|
Formally, the undefined cached value of last_lwm_timestamp is also stale.
|
|
|
|
the last time index containg lwm
|
|
+------+
|
|
| LWM |
|
|
| | |
|
|
V V V
|
|
GAQ: xoooooxxxxxXXXXX...X
|
|
^ ^
|
|
| | LWM+1
|
|
|
|
|
+- tne new current_lwm
|
|
|
|
<---- logical (commit) time ----
|
|
|
|
here `x' stands for committed, `X' for committed and discarded from
|
|
the running range of the queue, `o' for not committed.
|
|
|
|
@param rli Relay_log_info pointer
|
|
@param need_look Either the caller or the function must hold a mutex
|
|
to avoid race with concurrent GAQ update.
|
|
|
|
@return possibly updated current_lwm
|
|
*/
|
|
longlong Mts_submode_logical_clock::get_lwm_timestamp(Relay_log_info *rli,
|
|
bool need_lock)
|
|
{
|
|
longlong lwm_estim;
|
|
Slave_job_group* ptr_g;
|
|
bool is_stale= false;
|
|
|
|
if (!need_lock)
|
|
mysql_mutex_lock(&rli->mts_gaq_LOCK);
|
|
|
|
/*
|
|
Make the "stable" LWM-based estimate which will be compared
|
|
against the cached "instant" value.
|
|
*/
|
|
lwm_estim= rli->gaq->lwm.sequence_number;
|
|
/*
|
|
timestamp continuity invariant: if the queue has any item
|
|
its timestamp is greater on one than the estimate.
|
|
*/
|
|
DBUG_ASSERT(lwm_estim == SEQ_UNINIT || rli->gaq->empty() ||
|
|
lwm_estim + 1 ==
|
|
rli->gaq->get_job_group(rli->gaq->entry)->sequence_number);
|
|
|
|
last_lwm_index=
|
|
rli->gaq->find_lwm(&ptr_g,
|
|
/*
|
|
The underfined "stable" forces the scan's restart
|
|
as the stale value does.
|
|
*/
|
|
lwm_estim == SEQ_UNINIT ||
|
|
(is_stale= clock_leq(last_lwm_timestamp, lwm_estim)) ?
|
|
rli->gaq->entry :
|
|
last_lwm_index);
|
|
/*
|
|
if the returned index is sane update the timestamp.
|
|
*/
|
|
if (last_lwm_index != rli->gaq->size)
|
|
{
|
|
// non-decreasing lwm invariant
|
|
DBUG_ASSERT(clock_leq(last_lwm_timestamp, ptr_g->sequence_number));
|
|
|
|
last_lwm_timestamp= ptr_g->sequence_number;
|
|
}
|
|
else if (is_stale)
|
|
{
|
|
my_atomic_store64(&last_lwm_timestamp, lwm_estim);
|
|
}
|
|
|
|
if (!need_lock)
|
|
mysql_mutex_unlock(&rli->mts_gaq_LOCK);
|
|
|
|
return last_lwm_timestamp;
|
|
};
|
|
|
|
/**
  The method implements logical timestamp conflict detection
  and resolution through waiting by the calling thread.
  The conflict or waiting condition is like the following

          lwm < last_committed,

  where lwm is a minimum logical timestamp of committed transactions.
  Since the lwm's exact value is not always available its pessimistic
  estimate (an old version) is improved (get_lwm_timestamp()) as the
  first step before to the actual waiting commitment.

  Special cases include:

  When @c last_committed_arg is uninitialized the calling thread must
  proceed without waiting for anyone. Any possible dependency with unknown
  commit parent transaction shall be sorted out by the parent;

  When the gaq index is subsequent to the last lwm index
  there's no dependency of the current transaction with any regardless of
  lwm timestamp should it be SEQ_UNINIT.
  Consequently when GAQ consists of just one item there's none to wait.
  Such latter case is left to the caller to handle.

  @note The caller must make sure the current transaction won't be waiting
        for itself. That is the method should not be called by a Worker
        whose group assignment is in the GAQ front item.

  @param rli                Relay_log_info of the Coordinator
  @param last_committed_arg logical timestamp of a parent transaction
  @param lwm_estimate_arg   caller's stale lwm estimate (unused here;
                            the estimate is recomputed under the lock)
  @return false as success,
          true when the error flag is raised or
          the caller thread is found killed.
*/
bool Mts_submode_logical_clock::
wait_for_last_committed_trx(Relay_log_info* rli,
                            longlong last_committed_arg,
                            longlong lwm_estimate_arg)
{
  THD* thd= rli->info_thd;

  DBUG_ENTER("Mts_submode_logical_clock::wait_for_last_committed_trx");

  // Undefined parent: nothing to wait for.
  if (last_committed_arg == SEQ_UNINIT)
    DBUG_RETURN(false);

  mysql_mutex_lock(&rli->mts_gaq_LOCK);

  // Only one waiter at a time is supported by this protocol.
  DBUG_ASSERT(min_waited_timestamp == SEQ_UNINIT);

  // atomic store: read concurrently by committers deciding whether to signal
  my_atomic_store64(&min_waited_timestamp, last_committed_arg);
  /*
    This transaction is a candidate for insertion into the waiting list.
    That fact is described by incrementing waited_timestamp_cnt.
    When the candidate won't make it the counter is decremented at once
    while the mutex is hold.
  */
  // Re-check the conflict with a freshly improved lwm (need_lock=true:
  // we already hold mts_gaq_LOCK here).
  if ((!rli->info_thd->killed && !is_error) &&
      !clock_leq(last_committed_arg, get_lwm_timestamp(rli, true)))
  {
    PSI_stage_info old_stage;
    struct timespec ts[2];
    set_timespec_nsec(&ts[0], 0);

    DBUG_ASSERT(rli->gaq->len >= 2); // there's someone to wait

    thd->ENTER_COND(&rli->logical_clock_cond, &rli->mts_gaq_LOCK,
                    &stage_worker_waiting_for_commit_parent, &old_stage);
    // Classic cond-wait loop: re-test the predicate after each wakeup.
    do
    {
      mysql_cond_wait(&rli->logical_clock_cond, &rli->mts_gaq_LOCK);
    }
    while ((!rli->info_thd->killed && !is_error) &&
           !clock_leq(last_committed_arg, estimate_lwm_timestamp()));
    my_atomic_store64(&min_waited_timestamp, SEQ_UNINIT); // reset waiting flag
    mysql_mutex_unlock(&rli->mts_gaq_LOCK);
    thd->EXIT_COND(&old_stage);
    set_timespec_nsec(&ts[1], 0);
    // Account the wall time spent blocked on the commit parent.
    my_atomic_add64(&rli->mts_total_wait_overlap, diff_timespec(&ts[1], &ts[0]));
  }
  else
  {
    // No conflict (or shutting down): withdraw the candidacy immediately.
    my_atomic_store64(&min_waited_timestamp, SEQ_UNINIT);
    mysql_mutex_unlock(&rli->mts_gaq_LOCK);
  }

  DBUG_RETURN(rli->info_thd->killed || is_error);
}
|
|
|
|
/**
  Does necessary arrangement before scheduling next event.
  The method computes the meta-group status of the being scheduled
  transaction represented by the event argument. When the status
  is found OUT (of the current meta-group) as encoded as is_new_group == true
  the global Scheduler (Coordinator thread) requests full synchronization
  with all Workers.
  The current being assigned group descriptor gets associated with
  the group's logical timestamp aka sequence_number.

  @param rli  Relay_log_info of the Coordinator
  @param ev   event being scheduled

  @return ER_MTS_CANT_PARALLEL, ER_MTS_INCONSISTENT_DATA
          0 if no error or slave has been killed gracefully
*/
int
Mts_submode_logical_clock::schedule_next_event(Relay_log_info* rli,
                                               Log_event *ev)
{
  // Remember the previous group's timestamp before this event overwrites it.
  longlong last_sequence_number= sequence_number;
  bool gap_successor= false;

  DBUG_ENTER("Mts_submode_logical_clock::schedule_next_event");
  // We should check if the SQL thread was already killed before we schedule
  // the next transaction
  if (sql_slave_killed(rli->info_thd, rli))
    DBUG_RETURN(0);

  Slave_job_group *ptr_group=
    rli->gaq->get_job_group(rli->gaq->assigned_group_index);
  /*
    A group id updater must satisfy the following:
    - A query log event ("BEGIN" ) or a GTID EVENT
    - A DDL or an implicit DML commit.
  */
  switch (ev->get_type_code())
  {
  case binary_log::GTID_LOG_EVENT:
  case binary_log::ANONYMOUS_GTID_LOG_EVENT:
    // TODO: control continuity
    // Copy the master-provided commit-parent pair into both the group
    // descriptor and this scheduler's state.
    ptr_group->sequence_number= sequence_number=
      static_cast<Gtid_log_event*>(ev)->sequence_number;
    ptr_group->last_committed= last_committed=
      static_cast<Gtid_log_event*>(ev)->last_committed;
    break;

  default:

    // Non-GTID leader event: no timestamps; forces sequential handling below.
    sequence_number= last_committed= SEQ_UNINIT;

    break;
  }

  DBUG_PRINT("info", ("sequence_number %lld, last_committed %lld",
                      sequence_number, last_committed));

  if (first_event)
  {
    first_event= false;
  }
  else
  {
    // Sanity: sequence_number must be strictly greater than last_committed.
    if (unlikely(clock_leq(sequence_number, last_committed) &&
                 last_committed != SEQ_UNINIT))
    {
      /* inconsistent (buggy) timestamps */
      sql_print_error("Transaction is tagged with inconsistent logical "
                      "timestamps: "
                      "sequence_number (%lld) <= last_committed (%lld)",
                      sequence_number, last_committed);
      DBUG_RETURN(ER_MTS_CANT_PARALLEL);
    }
    // Sanity: timestamps must be monotonically increasing across groups.
    if (unlikely(clock_leq(sequence_number, last_sequence_number) &&
                 sequence_number != SEQ_UNINIT))
    {
      /* inconsistent (buggy) timestamps */
      sql_print_error("Transaction's sequence number is inconsistent with that "
                      "of a preceding one: "
                      "sequence_number (%lld) <= previous sequence_number (%lld)",
                      sequence_number, last_sequence_number);
      DBUG_RETURN(ER_MTS_CANT_PARALLEL);
    }
    /*
      Being scheduled transaction sequence may have gaps, even in
      relay log. In such case a transaction that succeeds a gap will
      wait for all earlier that were scheduled to finish. It's marked
      as gap successor now.
    */
    compile_time_assert(SEQ_UNINIT == 0);
    if (unlikely(sequence_number > last_sequence_number + 1))
    {
      DBUG_PRINT("info", ("sequence_number gap found, "
                          "last_sequence_number %lld, sequence_number %lld",
                          last_sequence_number, sequence_number));
      DBUG_ASSERT(rli->replicate_same_server_id || true /* TODO: account autopositioning */);
      gap_successor= true;
    }
  }

  /*
    The new group flag is practically the same as the force flag
    when up to indicate synchronization with Workers.
  */
  is_new_group=
    (/* First event after a submode switch; */
     first_event ||
     /* Require a fresh group to be started; */
     // todo: turn `force_new_group' into sequence_number == SEQ_UNINIT condition
     force_new_group ||
     /* Rewritten event without commit point timestamp (todo: find use case) */
     sequence_number == SEQ_UNINIT ||
     /*
       undefined parent (e.g the very first trans from the master),
       or old master.
     */
     last_committed == SEQ_UNINIT ||
     /*
       When gap successor depends on a gap before it the scheduler has
       to serialize this transaction execution with previously
       scheduled ones. Below for simplicity it's assumed that such
       gap-dependency is always the case.
     */
     gap_successor ||
     /*
       previous group did not have sequence number assigned.
       It's execution must be finished until the current group
       can be assigned.
       Dependency of the current group on the previous
       can't be tracked. So let's wait till the former is over.
     */
     last_sequence_number == SEQ_UNINIT);
  /*
    The coordinator waits till all transactions on which the current one
    depends on are applied.
  */
  if (!is_new_group)
  {
    longlong lwm_estimate= estimate_lwm_timestamp();

    // Wait only if the parent is not committed yet AND this is not the
    // sole (front) item of GAQ (nothing to wait for in that case).
    if (!clock_leq(last_committed, lwm_estimate) &&
        rli->gaq->assigned_group_index != rli->gaq->entry)
    {
      /*
        "Unlikely" branch.

        The following block improves possibly stale lwm and when the
        waiting condition stays, recompute min_waited_timestamp and go
        waiting.
        At awakening set min_waited_timestamp to commit_parent in the
        subsequent GAQ index (could be NIL).
      */
      if (wait_for_last_committed_trx(rli, last_committed, lwm_estimate))
      {
        /*
          MTS was waiting for a dependent transaction to finish but either it
          has failed or the applier was requested to stop. In any case, this
          transaction wasn't started yet and should not warn about the
          coordinator stopping in a middle of a transaction to avoid polluting
          the server error log.
        */
        rli->reported_unsafe_warning= true;
        DBUG_RETURN(-1);
      }
      /*
        Making the slave's max last committed (lwm) to satisfy this
        transaction's scheduling condition.
      */
      if (gap_successor)
        last_lwm_timestamp= sequence_number - 1;
      DBUG_ASSERT(!clock_leq(sequence_number, estimate_lwm_timestamp()));
    }

    delegated_jobs++;

    DBUG_ASSERT(!force_new_group);
  }
  else
  {
    DBUG_ASSERT(delegated_jobs >= jobs_done);
    DBUG_ASSERT(is_error || (rli->gaq->len + jobs_done == 1 + delegated_jobs));
    DBUG_ASSERT(rli->mts_group_status == Relay_log_info::MTS_IN_GROUP);

    /*
      Under the new group fall the following use cases:
      - events from an OLD (sequence_number unaware) master;
      - malformed (missed BEGIN or GTID_NEXT) group incl. its
        particular form of CREATE..SELECT..from..@user_var (or rand- and
        int- var in place of @user- var).
        The malformed group is handled exceptionally each event is executed
        as a solitary group yet by the same (zero id) worker.
    */
    if (-1 == wait_for_workers_to_finish(rli))
      DBUG_RETURN (ER_MTS_INCONSISTENT_DATA);

    rli->mts_group_status= Relay_log_info::MTS_IN_GROUP; //wait set it to NOT
    DBUG_ASSERT(min_waited_timestamp == SEQ_UNINIT);
    /*
      the instant last lwm timestamp must reset when force flag is up.
    */
    rli->gaq->lwm.sequence_number= last_lwm_timestamp= SEQ_UNINIT;
    delegated_jobs= 1;
    jobs_done= 0;
    force_new_group= false;
    /*
      Not sequenced event can be followed with a logically relating
      e.g User var to be followed by CREATE table.
      It's supported to be executed in one-by-one fashion.
      Todo: remove with the event group parser worklog.
    */
    if (sequence_number == SEQ_UNINIT && last_committed == SEQ_UNINIT)
      rli->last_assigned_worker= *rli->workers.begin();
  }

#ifndef DBUG_OFF
  // Invariant check: every GAQ slot is either delegated or done.
  mysql_mutex_lock(&rli->mts_gaq_LOCK);
  DBUG_ASSERT(is_error || (rli->gaq->len + jobs_done == delegated_jobs));
  mysql_mutex_unlock(&rli->mts_gaq_LOCK);
#endif
  DBUG_RETURN(0);
}
|
|
|
|
/**
  Logic to attach the temporary tables from the Coordinator to the worker
  thread upon event execution. Tables whose embedded (server_id,
  pseudo_thread_id) pair matches this worker's session are unlinked from
  the Coordinator's list and moved into thd->temporary_tables; the rest
  stay with the Coordinator.

  @param thd  THD of the worker applying the event
  @param rli  Relay_log_info instance (a Slave_worker in disguise)
  @param ev   Query_log_event that is being applied
*/
void
Mts_submode_logical_clock::attach_temp_tables(THD *thd, const Relay_log_info* rli,
                                              Query_log_event * ev)
{
  bool shifted= false;
  TABLE *table, *cur_table;
  DBUG_ENTER("Mts_submode_logical_clock::attach_temp_tables");
  if (!is_mts_worker(thd) || (ev->ends_group() || ev->starts_group()))
    DBUG_VOID_RETURN;
  /* fetch coordinator's rli */
  Relay_log_info *c_rli= static_cast<const Slave_worker *>(rli)->c_rli;
  DBUG_ASSERT(!thd->temporary_tables);
  // The Coordinator's temp-table list is shared; serialize all surgery on it.
  mysql_mutex_lock(&c_rli->mts_temp_table_LOCK);
  if (!(table= c_rli->info_thd->temporary_tables))
  {
    mysql_mutex_unlock(&c_rli->mts_temp_table_LOCK);
    DBUG_VOID_RETURN;
  }
  // Detach the whole list from the Coordinator; non-matching tables are
  // re-linked back below via the `shifted' head fix-up.
  c_rli->info_thd->temporary_tables= 0;
  do
  {
    /* store the current table */
    cur_table= table;
    /* move the table pointer to next in list, so that we can isolate the
       current table */
    table= table->next;
    std::pair<uint, my_thread_id> st_id_pair= get_server_and_thread_id(cur_table);
    if (thd->server_id == st_id_pair.first &&
        thd->variables.pseudo_thread_id == st_id_pair.second)
    {
      /* short the list singling out the current table */
      if (cur_table->prev) //not the first node
        cur_table->prev->next= cur_table->next;
      if (cur_table->next) //not the last node
        cur_table->next->prev= cur_table->prev;
      /* isolate the table */
      cur_table->prev= NULL;
      cur_table->next= NULL;
      mts_move_temp_tables_to_thd(thd, cur_table);
    }
    else
      /* We must shift the C->temp_table pointer to the first table unused in
         this iteration. If all the tables have been used C->temp_tables will
         point to NULL */
      if (!shifted)
      {
        c_rli->info_thd->temporary_tables= cur_table;
        shifted= true;
      }
  } while(table);
  mysql_mutex_unlock(&c_rli->mts_temp_table_LOCK);
  DBUG_VOID_RETURN;
}
|
|
|
|
/**
|
|
Logic to detach the temporary tables from the worker threads upon
|
|
event execution
|
|
@param: thd THD instance
|
|
rli Relay_log_info instance
|
|
ev Query_log_event that is being applied
|
|
@return: void
|
|
*/
|
|
void
|
|
Mts_submode_logical_clock::detach_temp_tables( THD *thd, const Relay_log_info* rli,
|
|
Query_log_event * ev)
|
|
{
|
|
DBUG_ENTER("Mts_submode_logical_clock::detach_temp_tables");
|
|
if (!is_mts_worker(thd))
|
|
DBUG_VOID_RETURN;
|
|
/*
|
|
Here in detach section we will move the tables from the worker to the
|
|
coordinaor thread. Since coordinator is shared we need to make sure that
|
|
there are no race conditions which may lead to assert failures and
|
|
non-deterministic results.
|
|
*/
|
|
Relay_log_info *c_rli= static_cast<const Slave_worker *>(rli)->c_rli;
|
|
mysql_mutex_lock(&c_rli->mts_temp_table_LOCK);
|
|
mts_move_temp_tables_to_thd(c_rli->info_thd, thd->temporary_tables);
|
|
mysql_mutex_unlock(&c_rli->mts_temp_table_LOCK);
|
|
thd->temporary_tables= 0;
|
|
DBUG_VOID_RETURN;
|
|
}
|
|
|
|
/**
  Logic to get the least occupied worker when mts_submode= LOGICAL_CLOCK.

  @param rli  relay log info of coordinator
  @param ws   array of worker threads
  @param ev   event for which we are searching for a worker

  @return slave worker thread or NULL when coordinator is killed by any worker.
*/

Slave_worker *
Mts_submode_logical_clock::get_least_occupied_worker(Relay_log_info *rli,
                                                     Slave_worker_array *ws,
                                                     Log_event * ev)
{
  Slave_worker *worker= NULL;
  // NOTE(review): old_stage stays NULL yet is dereferenced by
  // THD_STAGE_INFO below on the worker==NULL path — confirm enter_stage/
  // THD_STAGE_INFO semantics with a NULL old-stage pointer.
  PSI_stage_info *old_stage= 0;
  THD* thd= rli->info_thd;
  DBUG_ENTER("Mts_submode_logical_clock::get_least_occupied_worker");
#ifndef DBUG_OFF

  /* Debug-only round-robin distribution, driven by a dbug keyword. */
  if (DBUG_EVALUATE_IF("mts_distribute_round_robin", 1, 0))
  {
    worker= ws->at(w_rr % ws->size());
    sql_print_information("Chosing worker id %lu, the following is"
                          " going to be %lu", worker->id,
                          static_cast<ulong>(w_rr % ws->size()));
    DBUG_ASSERT(worker != NULL);
    DBUG_RETURN(worker);
  }
  // ptr_group exists only in debug builds; used solely by the assert below.
  Slave_committed_queue *gaq= rli->gaq;
  Slave_job_group* ptr_group;
  ptr_group= gaq->get_job_group(rli->gaq->assigned_group_index);
#endif
  /*
    The scheduling works as follows, in this sequence
      -If this is an internal event of a transaction use the last assigned
        worker
      -If the i-th transaction is being scheduled in this group where "i" <=
       number of available workers then schedule the events to the consecutive
       workers
      -If the i-th transaction is being scheduled in this group where "i" >
       number of available workers then schedule this to the first worker that
       becomes free.
  */
  if (rli->last_assigned_worker)
  {
    worker= rli->last_assigned_worker;
    DBUG_ASSERT(ev->get_type_code() != binary_log::USER_VAR_EVENT || worker->id == 0 ||
                rli->curr_group_seen_begin || rli->curr_group_seen_gtid);
  }
  else
  {
    worker= get_free_worker(rli);

    DBUG_ASSERT(ev->get_type_code() != binary_log::USER_VAR_EVENT ||
                rli->curr_group_seen_begin || rli->curr_group_seen_gtid);

    if (worker == NULL)
    {
      struct timespec ts[2];

      set_timespec_nsec(&ts[0], 0);
      // Update thd info as waiting for workers to finish.
      thd->enter_stage(&stage_slave_waiting_for_workers_to_process_queue,
                       old_stage,
                       __func__, __FILE__, __LINE__);
      while (!worker && !thd->killed)
      {
        /*
          Busy wait with yielding thread control before to next attempt
          to find a free worker. As of current, a worker
          can't have more than one assigned group of events in its
          queue.

          todo: replace this At-Most-One assignment policy with
          First Available Worker as
          this method clearly can't be considered as optimal.
        */
#if !defined(_WIN32)
        sched_yield();
#else
        my_sleep(rli->mts_coordinator_basic_nap);
#endif
        worker= get_free_worker(rli);
      }
      THD_STAGE_INFO(thd, *old_stage);
      set_timespec_nsec(&ts[1], 0);
      // Account the time the Coordinator stalled for worker availability.
      rli->mts_total_wait_worker_avail += diff_timespec(&ts[1], &ts[0]);
      rli->mts_wq_no_underrun_cnt++;
      /*
        Even OPTION_BEGIN is set, the 'BEGIN' event is not dispatched to
        any worker thread. So the flag is removed and Coordinator thread
        will not try to finish the group before abort.
      */
      if (worker == NULL)
        rli->info_thd->variables.option_bits&= ~OPTION_BEGIN;
    }
    if (rli->get_commit_order_manager() != NULL && worker != NULL)
      rli->get_commit_order_manager()->register_trx(worker);
  }

  DBUG_ASSERT(ptr_group);
  // assert that we have a worker thread for this event or the slave has
  // stopped.
  DBUG_ASSERT(worker != NULL || thd->killed);
  /* The master may have sent db partition info. Make sure we never use it. */
  if (ev->get_type_code() == binary_log::QUERY_EVENT)
    static_cast<Query_log_event*>(ev)->mts_accessed_dbs= 0;

  DBUG_RETURN(worker);
}
|
|
|
|
/**
|
|
Protected method to fetch a worker having no events assigned.
|
|
The method is supposed to be called by Coordinator, therefore
|
|
comparison like w_i->jobs.len == 0 must (eventually) succeed.
|
|
|
|
todo: consider to optimize scan that is getting more expensive with
|
|
more # of Workers.
|
|
|
|
@return a pointer to Worker or NULL if none is free.
|
|
*/
|
|
Slave_worker*
|
|
Mts_submode_logical_clock::get_free_worker(Relay_log_info *rli)
|
|
{
|
|
for (Slave_worker **it= rli->workers.begin(); it != rli->workers.end(); ++it)
|
|
{
|
|
Slave_worker *w_i= *it;
|
|
if (w_i->jobs.len == 0)
|
|
return w_i;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/**
  Waits for slave workers to finish off the pending tasks before returning.
  Used in this submode to make sure that all assigned jobs have been done.

  @param rli     coordinator rli.
  @param ignore  Slave worker to ignore (unused in this submode).
  @return -1 for error.
           0 no error.
*/
int
Mts_submode_logical_clock::
wait_for_workers_to_finish(Relay_log_info *rli,
                           MY_ATTRIBUTE((unused)) Slave_worker * ignore)
{
  // NOTE(review): old_stage stays NULL yet is dereferenced by
  // THD_STAGE_INFO below — confirm enter_stage/THD_STAGE_INFO semantics
  // with a NULL old-stage pointer.
  PSI_stage_info *old_stage= 0;
  THD *thd= rli->info_thd;
  DBUG_ENTER("Mts_submode_logical_clock::wait_for_workers_to_finish");
  DBUG_PRINT("info",("delegated %d, jobs_done %d", delegated_jobs,
                     jobs_done));
  // Update thd info as waiting for workers to finish.
  thd->enter_stage(&stage_slave_waiting_for_workers_to_process_queue,
                   old_stage,
                   __func__, __FILE__, __LINE__);
  // Poll until every delegated job is reported done (or kill/error).
  while (delegated_jobs > jobs_done && !thd->killed && !is_error)
  {
    // Todo: consider to replace with a. GAQ::get_lwm_timestamp() or
    // b. (better) pthread wait+signal similarly to DB type.
    if (mts_checkpoint_routine(rli, 0, true, true /*need_data_lock=true*/))
      DBUG_RETURN(-1);
  }

  // Check if there is a failure on a not-ignored Worker
  for (Slave_worker **it= rli->workers.begin(); it != rli->workers.end();
       ++it)
  {
    Slave_worker *w_i= *it;
    if (w_i->running_status != Slave_worker::RUNNING)
      DBUG_RETURN(-1);

  }

  DBUG_EXECUTE_IF("wait_for_workers_to_finish_after_wait",
                  {
                    const char act[]= "now WAIT_FOR coordinator_continue";
                    DBUG_ASSERT(!debug_sync_set_action(rli->info_thd,
                                                       STRING_WITH_LEN(act)));
                  });

  // The current commit point sequence may end here (e.g Rotate to new log)
  rli->gaq->lwm.sequence_number= SEQ_UNINIT;
  // Restore previous info.
  THD_STAGE_INFO(thd, *old_stage);
  DBUG_PRINT("info",("delegated %d, jobs_done %d, Workers have finished their"
                     " jobs", delegated_jobs, jobs_done));
  rli->mts_group_status= Relay_log_info::MTS_NOT_IN_GROUP;
  DBUG_RETURN(0);
}
|
|
|
|
/**
|
|
Protected method to fetch the server_id and pseudo_thread_id from a
|
|
temporary table
|
|
@param : instance pointer of TABLE structure.
|
|
@return : std:pair<uint, my_thread_id>
|
|
@Note : It is the caller's responsibility to make sure we call this
|
|
function only for temp tables.
|
|
*/
|
|
std::pair<uint, my_thread_id>
|
|
Mts_submode_logical_clock::get_server_and_thread_id(TABLE* table)
|
|
{
|
|
DBUG_ENTER("get_server_and_thread_id");
|
|
char* extra_string= table->s->table_cache_key.str;
|
|
size_t extra_string_len= table->s->table_cache_key.length;
|
|
// assert will fail when called with non temporary tables.
|
|
DBUG_ASSERT(table->s->table_cache_key.length > 0);
|
|
std::pair<uint, my_thread_id>ret_pair= std::make_pair
|
|
(
|
|
/* last 8 bytes contains the server_id + pseudo_thread_id */
|
|
// fetch first 4 bytes to get the server id.
|
|
uint4korr(extra_string + extra_string_len - 8),
|
|
/* next 4 bytes contains the pseudo_thread_id */
|
|
uint4korr(extra_string + extra_string_len - 4)
|
|
);
|
|
DBUG_RETURN(ret_pair);
|
|
}
|
|
|