mysql5/mysql-5.7.27/storage/ndb/include/transporter/TransporterRegistry.hpp

868 lines
24 KiB
C++

/*
Copyright (c) 2003, 2014, Oracle and/or its affiliates. All rights reserved.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
//****************************************************************************
//
// NAME
// TransporterRegistry
//
// DESCRIPTION
// TransporterRegistry (singleton) is the interface to the
// transporter layer. It handles transporter states and
// holds the transporter arrays.
//
//***************************************************************************/
#ifndef TransporterRegistry_H
#define TransporterRegistry_H
#if defined(HAVE_EPOLL_CREATE)
#include <sys/epoll.h>
#endif
#include "TransporterDefinitions.hpp"
#include <SocketServer.hpp>
#include <SocketClient.hpp>
#include <NdbTCP.h>
#include <mgmapi/mgmapi.h>
#include <NodeBitmask.hpp>
// A transporter is always in an IOState.
// NoHalt is used initially and as long as there are no restrictions on
// sending or receiving.
// The values combine as bits: HaltIO (3) == HaltInput (1) | HaltOutput (2).
enum IOState {
NoHalt = 0,
HaltInput = 1,
HaltOutput = 2,
HaltIO = 3
};
/**
 * Human readable description of each TransporterRegistry::PerformState,
 * indexed by the enumerator value (CONNECTED = 0 ... DISCONNECTING = 3).
 * The entries must be kept in the same order as that enum.
 *
 * Both the characters and the pointers are const: the table is read-only,
 * and a const table does not trigger "defined but not used" warnings in
 * translation units that include this header without reading it.
 */
static const char * const performStateString[] =
{ "is connected",
  "is trying to connect",
  "does nothing",
  "is trying to disconnect" };
class Transporter;
class TCP_Transporter;
class SCI_Transporter;
class SHM_Transporter;
class TransporterRegistry;
class SocketAuthenticator;
/**
 * SocketServer service that carries an optional SocketAuthenticator and a
 * back-pointer to the TransporterRegistry.
 *
 * The registry pointer is null after construction and is supplied later
 * through setTransporterRegistry(). newSession() is defined out of line.
 */
class TransporterService : public SocketServer::Service {
  SocketAuthenticator * m_auth;
  TransporterRegistry * m_transporter_registry;
public:
  /** @param auth authenticator to keep, may be null (the default). */
  TransporterService(SocketAuthenticator *auth= 0)
    : m_auth(auth),
      m_transporter_registry(0)
  {
  }

  /** Store the registry pointer; replaces any previously stored value. */
  void setTransporterRegistry(TransporterRegistry *t)
  {
    this->m_transporter_registry= t;
  }

  SocketServer::Session * newSession(NDB_SOCKET_TYPE socket);
};
/**
* TransporterReceiveData
*
* State for pollReceive/performReceive.
* Moved into its own class to enable multiple receive threads.
*/
struct TransporterReceiveData
{
TransporterReceiveData();
~TransporterReceiveData();
bool init (unsigned maxTransporters);
/**
* Add a transporter to the epoll set.
* Does nothing if epoll is not active.
*/
bool epoll_add(TCP_Transporter*);
/**
* Bitmask of transporters currently handled by this instance
*/
NodeBitmask m_transporters;
/**
* Bitmask of transporters that have data waiting to be received
* from the transporter.
*/
NodeBitmask m_recv_transporters;
/**
* Bitmask of transporters that have already received data buffered
* inside the transporter. Possibly "carried over" from the last
* performReceive
*/
NodeBitmask m_has_data_transporters;
/**
* Subset of m_has_data_transporters which we completed handling
* of in the previous ::performReceive before we were interrupted due
* to lack of job buffers. These are skipped when we later retry
* ::performReceive in order to avoid starvation of non-handled
* transporters.
*/
NodeBitmask m_handled_transporters;
/**
* Bitmask of transporters that have received a corrupted or unsupported
* message. No more unpacking and delivery of messages is allowed.
*/
NodeBitmask m_bad_data_transporters;
#if defined(HAVE_EPOLL_CREATE)
/* epoll state; only present when built with HAVE_EPOLL_CREATE. */
int m_epoll_fd;
struct epoll_event *m_epoll_events;
bool change_epoll(TCP_Transporter *t, bool add);
#endif
/**
* Used in polling if any TCP_Transporter exists
*/
ndb_socket_poller m_socket_poller;
};
#include "TransporterCallback.hpp"
/**
* @class TransporterRegistry
* @brief Interface to the transporter layer. It handles transporter states
* and holds the transporter arrays.
*/
class TransporterRegistry : private TransporterSendBufferHandle {
friend class SHM_Transporter;
friend class SHM_Writer;
friend class Transporter;
friend class TransporterService;
public:
/**
* Constructor
*/
TransporterRegistry(TransporterCallback *callback,
TransporterReceiveHandle * receiveHandle,
bool use_default_send_buffer = true,
unsigned maxTransporters = MAX_NTRANSPORTERS);
/**
* this handle will be used in the client connect thread
* to fetch information on dynamic ports. The old handle
* (if set) is destroyed, and this is destroyed by the destructor
*/
void set_mgm_handle(NdbMgmHandle h);
NdbMgmHandle get_mgm_handle(void) { return m_mgm_handle; };
bool init(NodeId localNodeId);
/**
* If (and only if) non-default TransporterReceiveHandle's are used,
* they need to be initialized with this method.
*/
bool init(TransporterReceiveHandle&);
/**
Perform handshaking of a client connection to accept it
as transporter.
@note Connection should be closed by caller if function
returns false
@param sockfd the socket to handshake
@param msg error message describing why handshake failed,
to be filled in when function returns
@param close_with_reset allows the function to indicate to the caller
how the socket should be closed when function
returns false
@returns false on failure and true on success
*/
bool connect_server(NDB_SOCKET_TYPE sockfd,
BaseString& msg,
bool& close_with_reset) const;
bool connect_client(NdbMgmHandle *h);
/**
* Given a SocketClient, creates a NdbMgmHandle, turns it into a transporter
* and returns the socket.
*/
NDB_SOCKET_TYPE connect_ndb_mgmd(const char* server_name,
unsigned short server_port);
/**
* Given a connected NdbMgmHandle, turns it into a transporter
* and returns the socket.
*/
NDB_SOCKET_TYPE connect_ndb_mgmd(NdbMgmHandle *h);
private:
/**
* Report the dynamically allocated ports to ndb_mgmd so that clients
* which want to connect to ndbd can ask ndb_mgmd which port to use.
*/
bool report_dynamic_ports(NdbMgmHandle h) const;
/**
* Remove all transporters
*/
void removeAll();
/**
* Disconnect all transporters
*/
void disconnectAll();
public:
/**
* Stops the server, disconnects all the transporters,
* deletes them and removes them from the transporter arrays
*/
virtual ~TransporterRegistry();
bool start_service(SocketServer& server);
struct NdbThread* start_clients();
bool stop_clients();
void start_clients_thread();
/**
* Start/Stop receiving
*/
void startReceiving();
void stopReceiving();
/**
* Start/Stop sending
*/
void startSending();
void stopSending();
// A transporter is always in a PerformState.
// The enumerator values are used as indexes into performStateString
// (see getPerformStateString), so the two must be kept in the same order.
enum PerformState {
CONNECTED = 0,
CONNECTING = 1,
DISCONNECTED = 2,
DISCONNECTING = 3
};
// Both accessors below index performStates[] directly with nodeId;
// no range check is made here.
const char *getPerformStateString(NodeId nodeId) const
{ return performStateString[(unsigned)performStates[nodeId]]; };
PerformState getPerformState(NodeId nodeId) const { return performStates[nodeId]; }
/**
* Get and set methods for PerformState
*/
void do_connect(NodeId node_id);
void do_disconnect(NodeId node_id, int errnum = 0);
// True only when the node's PerformState is CONNECTED.
bool is_connected(NodeId node_id) const {
return performStates[node_id] == CONNECTED;
};
private:
void report_connect(TransporterReceiveHandle&, NodeId node_id);
void report_disconnect(TransporterReceiveHandle&, NodeId node_id, int errnum);
void report_error(NodeId nodeId, TransporterError errorCode,
const char *errorInfo = 0);
void dump_and_report_bad_message(const char file[], unsigned line,
TransporterReceiveHandle & recvHandle,
Uint32 * readPtr,
size_t sizeOfData,
NodeId remoteNodeId,
IOState state,
TransporterError errorCode);
public:
/**
* Get and set methods for IOState
*/
IOState ioState(NodeId nodeId) const;
void setIOState(NodeId nodeId, IOState state);
/**
* Methods to handle backoff of connection attempts when an attempt fails
*/
public:
void indicate_node_up(NodeId nodeId);
void set_connect_backoff_max_time_in_ms(Uint32 max_time_in_ms);
private:
Uint32 get_connect_backoff_max_time_in_laps() const;
bool get_and_clear_node_up_indicator(NodeId nodeId);
void backoff_reset_connecting_time(NodeId nodeId);
bool backoff_update_and_check_time_for_connect(NodeId nodeId);
private:
bool createTCPTransporter(TransporterConfiguration * config);
bool createSCITransporter(TransporterConfiguration * config);
bool createSHMTransporter(TransporterConfiguration * config);
public:
/**
* configureTransporter
*
* Configure a transporter, i.e. create a new one if it
* does not exist, otherwise try to reconfigure it
*
*/
bool configureTransporter(TransporterConfiguration * config);
/**
* Allocate send buffer for default send buffer handling.
*
* Upper layers that implement their own TransporterSendBufferHandle do not
* use this, instead they manage their own send buffers.
*
* The first argument is the value of config parameter TotalSendBufferMemory.
* If 0, a default will be used of sum(max send buffer) over all transporters.
* The second is the config parameter ExtraSendBufferMemory
*/
void allocate_send_buffers(Uint64 total_send_buffer,
Uint64 extra_send_buffer);
/**
* Get sum of max send buffer over all transporters, to be used as a default
* for allocate_send_buffers, for example.
*
* Must be called after creating all transporters for returned value to be
* correct.
*/
Uint64 get_total_max_send_buffer() { return m_total_max_send_buffer; }
bool get_using_default_send_buffer() const{ return m_use_default_send_buffer;}
/**
* Get transporter's connect count
*/
Uint32 get_connect_count(Uint32 nodeId);
/**
* Set or clear overloaded bit.
* Query if any overloaded bit is set.
*/
void set_status_overloaded(Uint32 nodeId, bool val);
const NodeBitmask& get_status_overloaded() const;
/**
* Get transporter's overload count since connect
*/
Uint32 get_overload_count(Uint32 nodeId);
/**
* Set or clear slowdown bit.
* Query if any slowdown bit is set.
*/
void set_status_slowdown(Uint32 nodeId, bool val);
const NodeBitmask& get_status_slowdown() const;
/**
* Get transporter's slowdown count since connect
*/
Uint32 get_slowdown_count(Uint32 nodeId);
/**
* prepareSend
*
* When IOState is HaltOutput or HaltIO do not send or insert any
* signals in the SendBuffer, unless it is intended for the remote
* CMVMI block (blockno 252)
* Perform prepareSend on the transporter.
*
* NOTE signalHeader->xxxBlockRef should contain block numbers and
* not references
*/
SendStatus prepareSend(TransporterSendBufferHandle *sendHandle,
const SignalHeader * const signalHeader, Uint8 prio,
const Uint32 * const signalData,
NodeId nodeId,
const LinearSectionPtr ptr[3]);
SendStatus prepareSend(TransporterSendBufferHandle *sendHandle,
const SignalHeader * const signalHeader, Uint8 prio,
const Uint32 * const signalData,
NodeId nodeId,
class SectionSegmentPool & pool,
const SegmentedSectionPtr ptr[3]);
SendStatus prepareSend(TransporterSendBufferHandle *sendHandle,
const SignalHeader * const signalHeader, Uint8 prio,
const Uint32 * const signalData,
NodeId nodeId,
const GenericSectionPtr ptr[3]);
/**
* Backwards compatible methods with default send buffer handling.
* Each one forwards to the overload above, passing this registry as
* the TransporterSendBufferHandle.
*/
SendStatus prepareSend(const SignalHeader * const signalHeader, Uint8 prio,
const Uint32 * const signalData,
NodeId nodeId,
const LinearSectionPtr ptr[3])
{
return prepareSend(this, signalHeader, prio, signalData, nodeId, ptr);
}
SendStatus prepareSend(const SignalHeader * const signalHeader, Uint8 prio,
const Uint32 * const signalData,
NodeId nodeId,
class SectionSegmentPool & pool,
const SegmentedSectionPtr ptr[3])
{
return prepareSend(this, signalHeader, prio, signalData, nodeId, pool, ptr);
}
SendStatus prepareSend(const SignalHeader * const signalHeader, Uint8 prio,
const Uint32 * const signalData,
NodeId nodeId,
const GenericSectionPtr ptr[3])
{
return prepareSend(this, signalHeader, prio, signalData, nodeId, ptr);
}
/**
* external_IO
*
* Equal to: poll(...); perform_IO()
*
*/
void external_IO(Uint32 timeOutMillis);
bool performSend(NodeId nodeId);
void performSend();
/**
* Force sending if more than or equal to sendLimit
* number have asked for send. Returns 0 if not sending
* and 1 if sending.
*/
int forceSendCheck(int sendLimit);
#ifdef DEBUG_TRANSPORTER
void printState();
#endif
class Transporter_interface {
public:
NodeId m_remote_nodeId;
int m_s_service_port; // signed port number
const char *m_interface;
};
Vector<Transporter_interface> m_transporter_interface;
void add_transporter_interface(NodeId remoteNodeId, const char *interf,
int s_port); // signed port. <0 is dynamic
Transporter* get_transporter(NodeId nodeId);
struct in_addr get_connect_address(NodeId node_id) const;
Uint64 get_bytes_sent(NodeId nodeId) const;
Uint64 get_bytes_received(NodeId nodeId) const;
protected:
private:
TransporterCallback *callbackObj;
// Default receive handle; the argument-less receive wrappers
// (setup_wakeup_socket(), pollReceive(), performReceive(),
// update_connections()) assert that it is non-null and forward to it.
TransporterReceiveHandle * receiveHandle;
NdbMgmHandle m_mgm_handle;
struct NdbThread *m_start_clients_thread;
bool m_run_start_clients_thread;
int sendCounter;
NodeId localNodeId;
unsigned maxTransporters;
int nTransporters;
int nTCPTransporters;
int nSCITransporters;
int nSHMTransporters;
#ifdef ERROR_INSERT
Bitmask<MAX_NTRANSPORTERS/32> m_blocked;
Bitmask<MAX_NTRANSPORTERS/32> m_blocked_disconnected;
int m_disconnect_errors[MAX_NTRANSPORTERS];
Uint32 m_mixology_level;
#endif
/**
* Arrays holding all transporters in the order they are created
*/
TCP_Transporter** theTCPTransporters;
SCI_Transporter** theSCITransporters;
SHM_Transporter** theSHMTransporters;
/**
* Array, indexed by nodeId, holding all transporters
*/
TransporterType* theTransporterTypes;
Transporter** theTransporters;
/**
* State arrays, indexed by host id
*/
PerformState* performStates;
int* m_disconnect_errnum;
IOState* ioStates;
struct ErrorState {
TransporterError m_code;
const char *m_info;
};
struct ErrorState *m_error_states;
/**
* peerUpIndicators[nodeId] is set by the receiver thread
* to indicate that the node is probably up.
* It is read and cleared by the start clients thread.
*/
volatile bool* peerUpIndicators;
/**
* Count of how long we have been attempting to
* connect to node nodeId, in units of 100ms.
*/
Uint32* connectingTime;
/**
* The current maximal time between connection attempts to a
* node in units of 100ms.
* Updated by receive thread, read by start clients thread
*/
volatile Uint32 connectBackoffMaxTime;
/**
* Overloaded bits, for fast check.
* Similarly slowdown bits for fast check.
*/
NodeBitmask m_status_overloaded;
NodeBitmask m_status_slowdown;
/**
* Unpack signal data.
*
* Defined in Packer.cpp.
*/
Uint32 unpack(TransporterReceiveHandle&,
Uint32 * readPtr,
Uint32 bufferSize,
NodeId remoteNodeId,
IOState state,
bool & stopReceiving);
Uint32 * unpack(TransporterReceiveHandle&,
Uint32 * readPtr,
Uint32 * eodPtr,
NodeId remoteNodeId,
IOState state,
bool & stopReceiving);
static Uint32 unpack_length_words(const Uint32 *readPtr, Uint32 maxWords);
/**
* Disconnect the transporter and remove it from
* theTransporters array. Do not allow any holes
* in theTransporters. Delete the transporter
* and remove it from theIndexedTransporters array
*/
void removeTransporter(NodeId nodeId);
Uint32 poll_TCP(Uint32 timeOutMillis, TransporterReceiveHandle&);
Uint32 poll_SCI(Uint32 timeOutMillis, TransporterReceiveHandle&);
Uint32 poll_SHM(Uint32 timeOutMillis, TransporterReceiveHandle&);
int m_shm_own_pid;
int m_transp_count;
public:
bool setup_wakeup_socket(TransporterReceiveHandle&);
void wakeup();
// Convenience overload using the default receiveHandle (must be non-null).
inline bool setup_wakeup_socket() {
assert(receiveHandle != 0);
return setup_wakeup_socket(* receiveHandle);
}
private:
bool m_has_extra_wakeup_socket;
NDB_SOCKET_TYPE m_extra_wakeup_sockets[2];
void consume_extra_sockets();
Uint32 *getWritePtr(TransporterSendBufferHandle *handle,
NodeId node, Uint32 lenBytes, Uint32 prio);
void updateWritePtr(TransporterSendBufferHandle *handle,
NodeId node, Uint32 lenBytes, Uint32 prio);
public:
/**
* TransporterSendBufferHandle implementation.
*
* Used for default send buffer handling, when the upper layer does not
* want to do special buffer handling itself.
*/
virtual Uint32 *getWritePtr(NodeId node, Uint32 lenBytes, Uint32 prio,
Uint32 max_use);
virtual Uint32 updateWritePtr(NodeId node, Uint32 lenBytes, Uint32 prio);
virtual void getSendBufferLevel(NodeId node, SB_LevelType &level);
virtual bool forceSend(NodeId node);
/* Various internal */
void inc_overload_count(Uint32 nodeId);
void inc_slowdown_count(Uint32 nodeId);
private:
/* Send buffer pages. */
struct SendBufferPage {
/* Size of one send buffer page; max_data_bytes() treats it as a byte count. */
static const Uint32 PGSIZE = 32768;
/* Bytes left for m_data once the page header fields are subtracted. */
static Uint32 max_data_bytes()
{
return PGSIZE - offsetof(SendBufferPage, m_data);
}
/* Send buffer for one transporter is kept in a single-linked list. */
struct SendBufferPage *m_next;
/* Bytes of send data available in this page. */
Uint16 m_bytes;
/* Start of unsent data */
Uint16 m_start;
/* Data; real size is to the end of one page. */
char m_data[2];
};
/* Send buffer for one transporter. */
struct SendBuffer {
/* Total number of bytes of data held in this buffer. */
Uint32 m_used_bytes;
/* Linked list of active buffer pages with first and last pointer. */
SendBufferPage *m_first_page;
SendBufferPage *m_last_page;
};
SendBufferPage *alloc_page();
void release_page(SendBufferPage *page);
private:
/* True if we are using the default send buffer implementation. */
bool m_use_default_send_buffer;
/* Send buffers. */
SendBuffer *m_send_buffers;
/**
* Make sure m_send_buffers array (read-only) is not using same
* cacheline as the data below which is often updated.
*/
char unused[NDB_CL];
/* Linked list of free pages. */
SendBufferPage *m_page_freelist;
/* Original block of memory for pages (so we can free it at exit). */
unsigned char *m_send_buffer_memory;
Uint64 m_tot_send_buffer_memory;
Uint64 m_tot_used_buffer_memory;
/**
* Sum of max transporter memory for each transporter.
* Used to compute default send buffer size.
*/
Uint64 m_total_max_send_buffer;
public:
Uint32 get_bytes_to_send_iovec(NodeId node, struct iovec *dst, Uint32 max);
Uint32 bytes_sent(NodeId node, Uint32 bytes);
bool has_data_to_send(NodeId node);
void reset_send_buffer(NodeId node, bool should_be_empty);
void print_transporters(const char* where, NdbOut& out = ndbout);
/**
* Receiving
*
* The overloads without a TransporterReceiveHandle argument forward to
* the default receiveHandle, which must be non-null.
*/
Uint32 pollReceive(Uint32 timeOutMillis, TransporterReceiveHandle& mask);
Uint32 performReceive(TransporterReceiveHandle&);
void update_connections(TransporterReceiveHandle&);
inline Uint32 pollReceive(Uint32 timeOutMillis) {
assert(receiveHandle != 0);
return pollReceive(timeOutMillis, * receiveHandle);
}
inline Uint32 performReceive() {
assert(receiveHandle != 0);
return performReceive(* receiveHandle);
}
inline void update_connections() {
assert(receiveHandle != 0);
update_connections(* receiveHandle);
}
#ifdef ERROR_INSERT
/* Utils for testing latency issues */
bool isBlocked(NodeId nodeId);
void blockReceive(TransporterReceiveHandle&, NodeId nodeId);
void unblockReceive(TransporterReceiveHandle&, NodeId nodeId);
/* Testing interleaving of signal processing */
Uint32 getMixologyLevel() const;
void setMixologyLevel(Uint32 l);
#endif
};
/**
 * Set or clear the overloaded bit for nodeId.
 *
 * The overload counter is bumped only on a clear -> set transition.
 * Setting the overloaded bit always sets the slowdown bit as well;
 * clearing it leaves the slowdown bit untouched.
 */
inline void
TransporterRegistry::set_status_overloaded(Uint32 nodeId, bool val)
{
  assert(nodeId < MAX_NODES);

  const bool changed = (m_status_overloaded.get(nodeId) != val);
  if (changed)
    m_status_overloaded.set(nodeId, val);

  if (!val)
    return;

  if (changed)
    inc_overload_count(nodeId);
  // Overloaded implies slowdown.
  set_status_slowdown(nodeId, true);
}
/** Read-only access to the bitmask of overloaded bits. */
inline const NodeBitmask& TransporterRegistry::get_status_overloaded() const
{
  return this->m_status_overloaded;
}
/**
 * Set or clear the slowdown bit for nodeId.
 * Nothing happens if the bit already has the requested value; the
 * slowdown counter is bumped only on a clear -> set transition.
 */
inline void
TransporterRegistry::set_status_slowdown(Uint32 nodeId, bool val)
{
  assert(nodeId < MAX_NODES);

  if (m_status_slowdown.get(nodeId) == val)
    return;                       // already in the requested state

  m_status_slowdown.set(nodeId, val);
  if (val)
    inc_slowdown_count(nodeId);
}
/** Read-only access to the bitmask of slowdown bits. */
inline const NodeBitmask& TransporterRegistry::get_status_slowdown() const
{
  return this->m_status_slowdown;
}
/**
 * Raise the "peer is up" indicator for nodeId.
 * Called from receive thread. The flag is read first and written only
 * when it is not already set.
 */
inline void
TransporterRegistry::indicate_node_up(NodeId nodeId)
{
  assert(nodeId < MAX_NODES);

  if (peerUpIndicators[nodeId])
    return;                       // already raised, no write needed

  peerUpIndicators[nodeId] = true;
}
/**
 * Return the "peer is up" indicator for nodeId and clear it if it was set.
 * Called from start client thread. The flag is read once; it is written
 * only when that read saw it set.
 */
inline bool
TransporterRegistry::get_and_clear_node_up_indicator(NodeId nodeId)
{
  assert(nodeId < MAX_NODES);

  const bool wasUp = peerUpIndicators[nodeId];
  if (!wasUp)
    return false;

  peerUpIndicators[nodeId] = false;
  return true;
}
/**
 * Current maximal backoff time, expressed in laps (one lap is 100 ms).
 * A value of 0 is treated as "backoff disabled" by
 * backoff_update_and_check_time_for_connect().
 */
inline Uint32 TransporterRegistry::get_connect_backoff_max_time_in_laps() const
{
  return this->connectBackoffMaxTime;
}
/**
 * Set the maximal time between connection attempts.
 *
 * @param backoff_max_time_in_ms  maximal backoff in milliseconds;
 *                                0 yields 0 laps.
 *
 * The value is stored in laps of 100 ms, rounded up to the next whole lap,
 * since 100 ms is the lap time in start_clients_thread, which uses it.
 */
inline void
TransporterRegistry::set_connect_backoff_max_time_in_ms(Uint32 backoff_max_time_in_ms)
{
  /**
   * Round up using division and remainder instead of "(ms + 99) / 100":
   * the addition wraps around for values above UINT32_MAX - 99 and would
   * turn a very large backoff into a near-zero one.
   */
  const Uint32 whole_laps = backoff_max_time_in_ms / 100;
  const Uint32 partial_lap = (backoff_max_time_in_ms % 100 != 0) ? 1 : 0;
  connectBackoffMaxTime = whole_laps + partial_lap;
}
/** Restart the connect-attempt lap counter for nodeId from zero. */
inline void
TransporterRegistry::backoff_reset_connecting_time(NodeId nodeId)
{
  assert(nodeId < MAX_NODES);

  Uint32 &laps = connectingTime[nodeId];
  laps = 0;
}
/**
 * Advance the connect-attempt lap counter for nodeId and decide whether a
 * connection attempt should be made in this lap.
 *
 * - Backoff disabled (max time of 0 laps): always true, counter untouched.
 * - Counter at or beyond the max: true on every multiple of the max.
 * - Otherwise: true exactly at the attempt moments 1, 2, 4, ... 1024 laps,
 *   and after the last moment on every multiple of it.
 *
 * @return true if a connect attempt should be made now.
 */
inline bool
TransporterRegistry::backoff_update_and_check_time_for_connect(NodeId nodeId)
{
  assert(nodeId < MAX_NODES);

  const Uint32 maxLaps = get_connect_backoff_max_time_in_laps();
  if (maxLaps == 0)
    return true;                  // Backoff disabled

  const Uint32 laps = ++connectingTime[nodeId];
  if (laps >= maxLaps)
    return (laps % maxLaps) == 0;

  /**
   * Attempt moments from start of connecting.
   * This function is called from start_clients_thread
   * roughly every 100ms for each node it is connecting
   * to.
   */
  static const Uint16 attempt_moments[] = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024};
  static const int attempt_moments_count = sizeof(attempt_moments) / sizeof(attempt_moments[0]);

  // Find the first moment that is not earlier than the current lap.
  int idx = 0;
  while (idx < attempt_moments_count && attempt_moments[idx] < laps)
    idx++;

  if (idx < attempt_moments_count)
    return attempt_moments[idx] == laps;

  // Past the last moment: attempt on every multiple of it.
  return (laps % attempt_moments[attempt_moments_count - 1]) == 0;
}
/**
* Calculate a send buffer level from the size of the node send buffer,
* the total send buffer size for all nodes and the total send buffer
* used for all nodes.
*
* num_threads specifies the number of threads used (this is 0 except for
* ndbmtd).
* level is passed by non-const reference; presumably it receives the
* calculated level (the definition is not in this header - confirm there).
*/
void calculate_send_buffer_level(Uint64 node_send_buffer_size,
Uint64 total_send_buffer_size,
Uint64 total_used_send_buffer_size,
Uint32 num_threads,
SB_LevelType &level);
#endif // Define of TransporterRegistry_H