340 lines
11 KiB
C++
340 lines
11 KiB
C++
/*
|
|
* File: PartitionDetector.cc
|
|
* Author: Grit Schneider
|
|
*
|
|
* Created on August 2, 2012, 3:31 PM
|
|
*/
|
|
#include "PartitionDetector.h"
|
|
|
|
#include "Dispatcher.h"
|
|
#include "Moversight.h"
|
|
#include "ms/MembershipService.h"
|
|
#include "ms/MemberRegister.h"
|
|
#include "fd/partition/NeighborhoodDetector.h"
|
|
#include "fd/partition/timer/PartitionTimer.h"
|
|
#include "fd/partition/msg/NDMessage.h"
|
|
#include "fd/partition/msg/NDMessageConfirm.h"
|
|
|
|
#include "fd/partition/events/PartitionDetectedEvent.h"
|
|
#include "fd/partition/events/NeighborReachableAgainEvent.h"
|
|
|
|
namespace ubeeme {
|
|
namespace moversight {
|
|
|
|
// Local debug macro: prints "PD@<local id> <msg>" when NFD debug output is
// enabled on the module. Wrapped in do { } while (0) so that the expansion is
// a single statement and can be used safely in un-braced if/else constructs.
#undef DEBUG
#define DEBUG(msg) do { if (module.isPrintDebugNFD()) { MOV_DEBUG << "PD@" << getLocalID() << " " << msg << endl; } } while (0)
|
|
|
|
/**
|
|
* @brief Constructor
|
|
* @param d A reference to the dispatcher.
|
|
*/
|
|
PartitionDetector::PartitionDetector(Dispatcher & d) : MoversightService(d, "PartitionDetector"), partitionTimer(NULL), nd(NULL) {
|
|
}
|
|
|
|
/**
|
|
* @brief Destructor
|
|
*/
|
|
PartitionDetector::~PartitionDetector() {
|
|
}
|
|
|
|
/**
|
|
* @brief Initialise the PartitionDetector
|
|
*/
|
|
void
|
|
PartitionDetector::initialise() {
|
|
|
|
if(nd != NULL){
|
|
nd->finalise();
|
|
delete nd;
|
|
}//End if
|
|
nd = new NeighborhoodDetector(dispatcher);
|
|
nd->initialise();
|
|
|
|
if(partitionTimer != NULL){
|
|
stopAndDeletePartitionTimer();
|
|
}//End if
|
|
partitionTimer = new PartitionTimer(*this);
|
|
|
|
}
|
|
|
|
/**
|
|
* @brief Runs operations to finalise the PartitionDetector
|
|
*/
|
|
void
|
|
PartitionDetector::finalise() {
|
|
|
|
dispatcher.unsubscribeAll(this);
|
|
|
|
DEBUG("finalise - stop PartitionDetector");
|
|
stopAndDeletePartitionTimer();
|
|
|
|
//stop the nd
|
|
if (nd != NULL) {
|
|
nd->finalise();
|
|
delete nd;
|
|
nd = NULL;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* @brief Starts the Partition Detection Service.Adds the peers to the lists,
|
|
* checks whether its a signaled connection-loss by the network-services,
|
|
* and if not start sending the ND-Messages to the slaves in the current
|
|
* cluster and to all other masters.
|
|
*/
|
|
void
|
|
PartitionDetector::detectPartition() {
|
|
|
|
if (checkIfConnectionLost()) {
|
|
|
|
partitionTimer->stop();
|
|
dispatcher.signal( new PartitionDetectedEvent( partitionTimer->getNonReachablePeerIDList()));
|
|
return;
|
|
}
|
|
|
|
if (!partitionTimer->isRunning()) {
|
|
|
|
partitionTimer->setNonReachablePeersFromMR();
|
|
partitionTimer->setIsMandatory(true); // first round asking for your neighbors is mandatory!!
|
|
|
|
partitionTimer->start();
|
|
|
|
nd->sendNDMessage(partitionTimer->getNonReachablePeerIDList());
|
|
|
|
}
|
|
}
|
|
|
|
/**
|
|
* @brief Assignment operator
|
|
* @param other The instance to assign
|
|
* @return A reference to the local instance
|
|
*/
|
|
PartitionDetector &
|
|
PartitionDetector::operator =(PartitionDetector & other) {
|
|
|
|
if (this == &other) {
|
|
return *this;
|
|
}
|
|
this->partitionTimer = other.partitionTimer;
|
|
return *this;
|
|
|
|
}//End
|
|
|
|
/**
|
|
* @brief Method to get the MembershipService from the dispatcher.
|
|
* @return membershipservice
|
|
*/
|
|
MembershipService &
|
|
PartitionDetector::getMembershipService() {
|
|
return dispatcher.getMembershipService();
|
|
}
|
|
|
|
/**
|
|
* @brief Handle a partition timer event.
|
|
* @param timer The timer to handle.
|
|
*/
|
|
void
|
|
PartitionDetector::handlePartitionTimer(PartitionTimer* timer) {
|
|
if (timer->isMandatory()) {
|
|
determineAndSendToMissingMastersSlaves();
|
|
} else {
|
|
partitionTimer->stop();
|
|
dispatcher.signal( new PartitionDetectedEvent( partitionTimer->getNonReachablePeerIDList()));
|
|
}
|
|
}
|
|
|
|
/**
|
|
* @brief Handle each incoming nd message.
|
|
* @param ndm A received nd message.
|
|
*/
|
|
void
|
|
PartitionDetector::handleNDMessage(const NDMessage * ndm) {
|
|
|
|
DEBUG("handleNDMessage");
|
|
|
|
if (getLocalSubState() == REJOIN_IN_PROGRESS) {
|
|
//drop
|
|
return;
|
|
}//end if
|
|
|
|
PeerID senderID = ndm->getSourceID();
|
|
|
|
// equal whether its in the detection or in the rejoin part
|
|
nd->sendNDMessageConfirm(senderID);
|
|
|
|
// set the peer as reReachable to make sure we're contacting the right one afterwards
|
|
partitionTimer->setReReachablePeerID(senderID);
|
|
}
|
|
|
|
/**
|
|
* @brief Handle incoming NDC message.
|
|
* @param ndmc A received ndc message.
|
|
*/
|
|
void
|
|
PartitionDetector::handleNDCMessage(const NDMessageConfirm * ndmc) {
|
|
|
|
DEBUG("handleNDCMessage");
|
|
|
|
if (getLocalSubState() == REJOIN_IN_PROGRESS) {
|
|
//drop
|
|
return;
|
|
}//end if
|
|
|
|
PeerID senderID = ndmc->getSourceID();
|
|
if (getLocalSubState() != WAITING_FOR_REJOIN) {
|
|
// updating lists only during neighborhood detection not during the refinding of the group!
|
|
partitionTimer->markAsReachable(senderID);
|
|
} else if (getLocalSubState() == WAITING_FOR_REJOIN) {
|
|
|
|
// success: primary group could be reached again!
|
|
if (isPeerMemberOfOldGroup(senderID)) {
|
|
if ((partitionTimer->getReReachablePeerID() == senderID && getLocalID() < senderID) || partitionTimer->getReReachablePeerID() == 0) {
|
|
DEBUG("yippie - another group reachable again! ");
|
|
setLocalSubState(REJOIN_IN_PROGRESS);
|
|
dispatcher.signal( new NeighborReachableAgainEvent( senderID, partitionTimer->getDisconnectedPeers()));
|
|
}
|
|
} else {
|
|
throw PeerNotFoundException("The Peer sending us a ND wasn't member of the old group!");
|
|
}
|
|
} else if (partitionTimer->isNonReachableQueueEmpty() && !partitionTimer->isMandatory()) {
|
|
// partition case: success if all the peers answered with NDC
|
|
partitionTimer->stop();
|
|
resetAllStates();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* @brief Method to clean up the whole PeerState to (JOINED,NO_SUBSTATE,NO_STATE_OPERATION).
|
|
*/
|
|
void
|
|
PartitionDetector::resetAllStates() {
|
|
MemberRegister & mr = dispatcher.getMembershipService().getCurrentMemberRegister();
|
|
PeerState state(JOINED);
|
|
mr.setPeerStateOfAllPeers(state);
|
|
|
|
}
|
|
|
|
/**
|
|
* @brief Method checks if signal came directly from the dispatcher, that the connection
|
|
* was lost for sure - no searching for the partition needed, we know which
|
|
* peer has lost its connection.
|
|
* @return true - if it was a signal from the dispatcher, and one of the peers
|
|
* has the state "DISCONNECTED", false otherwise
|
|
*/
|
|
bool
|
|
PartitionDetector::checkIfConnectionLost() {
|
|
PeerIDList allPeers = getMembershipService().getClusterAndMasterPeerIDList(getLocalPeer());
|
|
for (size_t i = 0; i < allPeers.size(); i++) {
|
|
PeerID pId = allPeers.get(i);
|
|
Peer p = getMembershipService().getPeer(pId);
|
|
|
|
if (p.getPeerState() == DISCONNECTED) {
|
|
|
|
partitionTimer->updateDueToConnectionLoss(pId);
|
|
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* @brief Method that determines whether a master was missing or not. And
|
|
* for all missing masters, their slaves are added to the list, to which a
|
|
* ND-Message is sent.
|
|
*/
|
|
void
|
|
PartitionDetector::determineAndSendToMissingMastersSlaves() {
|
|
partitionTimer->stop();
|
|
|
|
if (partitionTimer->getNonReachableMasters().size() != 0) {
|
|
addingMissingMastersSlaves();
|
|
}
|
|
|
|
addingPendingSlavesFromOtherClusters();
|
|
|
|
nd->sendNDMessage(partitionTimer->getNonReachablePeerIDList());
|
|
partitionTimer->start();
|
|
partitionTimer->setIsMandatory(false);
|
|
|
|
}
|
|
|
|
/**
|
|
* @brief Adding the slaves from a master we can't reach to the unreachable list.
|
|
*/
|
|
void
|
|
PartitionDetector::addingMissingMastersSlaves() {
|
|
|
|
PeerIDList nonReachableMasters = partitionTimer->getNonReachableMasters();
|
|
PeerIDList otherClustersSlaves;
|
|
|
|
for (size_t i = 0; i < nonReachableMasters.size(); i++) {
|
|
|
|
PeerID masterID = nonReachableMasters.get(i);
|
|
ClusterID cID = getMembershipService().findPeer(masterID);
|
|
|
|
if (getLocalMasterID() != masterID) {
|
|
otherClustersSlaves.add(getMembershipService().getClusterPeerIDListSlavesOnly(cID));
|
|
}
|
|
}
|
|
partitionTimer->addingPeerSetToNonReachableList(otherClustersSlaves);
|
|
}
|
|
|
|
/**
|
|
* @brief Also slaves from other clusters where the master is joined need to be checked whether they are pending.
|
|
* And if they are Pending we need to put them on the unreachable list of the partitionTimer!
|
|
*/
|
|
void
|
|
PartitionDetector::addingPendingSlavesFromOtherClusters() {
|
|
ClusterList & cList = dispatcher.getMembershipService().getCurrentMemberRegister().getClusters();
|
|
PeerIDList otherClustersSlaves = partitionTimer->getNonReachablePeerIDList();
|
|
PeerIDList missedPeers;
|
|
|
|
for (size_t i = 0; i < cList.size(); i++) {
|
|
const PeerList & pList = cList.get(i).getPeerList();
|
|
for (size_t j = 0; j < pList.size(); j++) {
|
|
|
|
if (pList.get(j).getState() == PENDING && !otherClustersSlaves.contains(pList.get(j).getPeerID())) {
|
|
missedPeers.add(pList.get(j).getPeerID());
|
|
}
|
|
}
|
|
}
|
|
|
|
if (missedPeers.size() != 0) {
|
|
partitionTimer->addingPeerSetToNonReachableList(missedPeers);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* @brief Checks whether the peer was a member of the original group (the one before the partition).
|
|
* @param searchedPeer - the peer ID searched for
|
|
* @return true if it was a member, false if not
|
|
*/
|
|
bool
|
|
PartitionDetector::isPeerMemberOfOldGroup(PeerID searchedPeer) {
|
|
MemberRegister oldMR = dispatcher.getMembershipService().getLastMemberRegister();
|
|
if (oldMR.contains(searchedPeer)) {
|
|
return true;
|
|
}
|
|
return false;
|
|
|
|
}
|
|
|
|
/**
|
|
* @brief Method tries to stop and delete the current running partitionTImer.
|
|
*/
|
|
void
|
|
PartitionDetector::stopAndDeletePartitionTimer() {
|
|
|
|
DEBUG("stopTimer- try to stop the timer PartitionTimer");
|
|
|
|
if (partitionTimer != NULL) {
|
|
|
|
partitionTimer->stop();
|
|
delete partitionTimer;
|
|
partitionTimer = NULL;
|
|
}
|
|
}
|
|
}
|
|
}
|