Leader Election

Sentinels use the is-master-down-by-addr command to request votes from the other Sentinels while a leader election is in progress. Every Sentinel has a chance to become the leader that drives the failover process.

    if ((master->flags & SRI_S_DOWN) == 0) continue;  
    if (ri->link->disconnected) continue;  
    if (!(flags & SENTINEL_ASK_FORCED) &&  
        mstime() - ri->last_master_down_reply_time < SENTINEL_ASK_PERIOD)  
        continue;  
  
    /* Ask */  
    ll2string(port,sizeof(port),master->addr->port);  
    retval = redisAsyncCommand(ri->link->cc,  
                sentinelReceiveIsMasterDownReply, ri,  
                "%s is-master-down-by-addr %s %s %llu %s",  
                sentinelInstanceMapCommand(ri,"SENTINEL"),  
                master->addr->ip, port,  
                sentinel.current_epoch,  
                (master->failover_state > SENTINEL_FAILOVER_STATE_NONE) ?  
                sentinel.myid : "*");  
    if (retval == C_OK) ri->link->pending_commands++;  
}  

The is-master-down-by-addr call: the runid argument is the Sentinel's own ID when it is seeking votes, and "*" otherwise.

When a Sentinel receives such a request from another Sentinel, it will vote if the following conditions are true:

The Sentinel's epoch is smaller than or equal to the epoch of the Sentinel requesting the vote.
This is the first vote request it has received in the current epoch.

/* Vote for the master (or fetch the previous vote) if the request 
        * includes a runid, otherwise the sender is not seeking for a vote. */  
       if (ri && ri->flags & SRI_MASTER && strcasecmp(c->argv[5]->ptr,"*")) {  
           leader = sentinelVoteLeader(ri,(uint64_t)req_epoch,  
                                           c->argv[5]->ptr,  
                                           &leader_epoch);  
       }  

When the Sentinel receives the reply from another Sentinel, it parses the fields of the is-master-down-by-addr reply and remembers the vote.

              /* Ignore every error or unexpected reply. 
    * Note that if the command returns an error for any reason we'll 
    * end clearing the SRI_MASTER_DOWN flag for timeout anyway. */  
   if (r->type == REDIS_REPLY_ARRAY && r->elements == 3 &&  
       r->element[0]->type == REDIS_REPLY_INTEGER &&  
       r->element[1]->type == REDIS_REPLY_STRING &&  
       r->element[2]->type == REDIS_REPLY_INTEGER)  
   {  
       ri->last_master_down_reply_time = mstime();  
       if (r->element[0]->integer == 1) {  
           ri->flags |= SRI_MASTER_DOWN;  
       } else {  
           ri->flags &= ~SRI_MASTER_DOWN;  
       }  
       if (strcmp(r->element[1]->str,"*")) {  
           /* If the runid in the reply is not "*" the Sentinel actually 
            * replied with a vote. */  
           sdsfree(ri->leader);  
           if ((long long)ri->leader_epoch != r->element[2]->integer)  
               serverLog(LL_WARNING,  
                   "%s voted for %s %llu", ri->name,  
                   r->element[1]->str,  
                   (unsigned long long) r->element[2]->integer);  
           ri->leader = sdsnew(r->element[1]->str);  
           ri->leader_epoch = r->element[2]->integer;  
       }  
   } 

The Sentinel then checks the vote result; if the following conditions are true, it is elected as the leader and performs the failover.

Majority of sentinel votes.
The number of votes reaches the quorum. (If the quorum is smaller than 50%+1, then 50%+1 is used as the limit.) If this doesn't happen, the vote will start again.

/* Tally the leader votes known for 'master' and return the run id of the
 * Sentinel that won the election for 'epoch', or NULL if nobody won.
 *
 * The master must be flagged SRI_O_DOWN or SRI_FAILOVER_IN_PROGRESS.
 * A candidate wins only when its vote count reaches both the absolute
 * majority of the voters (voters/2+1) and master->quorum.
 *
 * The returned string is a fresh sds copy (created with sdsnew) that the
 * caller has to release. */
char *sentinelGetLeader(sentinelRedisInstance *master, uint64_t epoch) {
    dict *tally;
    dictIterator *it;
    dictEntry *entry;
    unsigned int voters, needed;
    char *own_vote;
    char *best = NULL;
    uint64_t own_vote_epoch;
    uint64_t best_count = 0;

    serverAssert(master->flags & (SRI_O_DOWN|SRI_FAILOVER_IN_PROGRESS));
    tally = dictCreate(&leaderVotesDictType,NULL);

    /* The electorate is every other known Sentinel plus this one. */
    voters = dictSize(master->sentinels)+1;

    /* Collect the votes reported by the other Sentinels, keeping only the
     * ones that belong to the current epoch. */
    it = dictGetIterator(master->sentinels);
    while ((entry = dictNext(it)) != NULL) {
        sentinelRedisInstance *peer = dictGetVal(entry);

        if (peer->leader == NULL) continue;
        if (peer->leader_epoch != sentinel.current_epoch) continue;
        sentinelLeaderIncr(tally,peer->leader);
    }
    dictReleaseIterator(it);

    /* Find the candidate with the highest number of votes so far. */
    it = dictGetIterator(tally);
    while ((entry = dictNext(it)) != NULL) {
        uint64_t count = dictGetUnsignedIntegerVal(entry);

        if (count <= best_count) continue;
        best_count = count;
        best = dictGetKey(entry);
    }
    dictReleaseIterator(it);

    /* Cast this Sentinel's own vote: follow the current front-runner when
     * there is one, otherwise vote for ourselves. */
    own_vote = sentinelVoteLeader(master,epoch,
                                  best ? best : sentinel.myid,
                                  &own_vote_epoch);

    /* Our vote only counts when it was recorded for the requested epoch. */
    if (own_vote != NULL && own_vote_epoch == epoch) {
        uint64_t count = sentinelLeaderIncr(tally,own_vote);

        if (count > best_count) {
            best_count = count;
            best = own_vote;
        }
    }

    /* The front-runner must reach both the majority and the quorum. */
    needed = voters/2+1;
    if (best != NULL && (best_count < needed || best_count < master->quorum))
        best = NULL;

    /* Copy the winner before releasing anything: 'best' may point to
     * 'own_vote' or to a key owned by the tally dictionary. */
    if (best != NULL) best = sdsnew(best);
    sdsfree(own_vote);
    dictRelease(tally);
    return best;
}

Failover Process

The master failover process is defined in the function sentinelFailoverStateMachine.

/* Run one step of the failover state machine for the master 'ri'.
 *
 * 'ri' must be a master instance (asserted). When no failover is in
 * progress the function returns immediately; otherwise it calls the
 * handler associated with the current failover state. States without a
 * handler listed here are left untouched. */
void sentinelFailoverStateMachine(sentinelRedisInstance *ri) {
    serverAssert(ri->flags & SRI_MASTER);

    /* Nothing to drive unless a failover is actually under way. */
    if ((ri->flags & SRI_FAILOVER_IN_PROGRESS) == 0) return;

    switch (ri->failover_state) {
    case SENTINEL_FAILOVER_STATE_WAIT_START:
        sentinelFailoverWaitStart(ri);
        break;
    case SENTINEL_FAILOVER_STATE_SELECT_SLAVE:
        sentinelFailoverSelectSlave(ri);
        break;
    case SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE:
        sentinelFailoverSendSlaveOfNoOne(ri);
        break;
    case SENTINEL_FAILOVER_STATE_WAIT_PROMOTION:
        sentinelFailoverWaitPromotion(ri);
        break;
    case SENTINEL_FAILOVER_STATE_RECONF_SLAVES:
        sentinelFailoverReconfNextSlave(ri);
        break;
    default:
        /* No handler for any other state in this function. */
        break;
    }
}

The sequence of the failover process is described as follows:

In the SENTINEL_FAILOVER_STATE_WAIT_START state, the Sentinel counts the votes for the leader election. If it was elected as the leader (or the failover was forced), it proceeds and changes the state to SENTINEL_FAILOVER_STATE_SELECT_SLAVE; otherwise, it aborts the failover once the election timeout has elapsed.

/* Handler for the WAIT_START failover state of master 'ri'.
 *
 * Asks sentinelGetLeader() who leads the failover epoch. If this Sentinel
 * is the leader, or the failover was forced (SRI_FORCE_FAILOVER, i.e. via
 * SENTINEL FAILOVER), the state moves to SELECT_SLAVE. Otherwise the
 * function keeps waiting, and aborts the failover once the election
 * timeout has expired. */
void sentinelFailoverWaitStart(sentinelRedisInstance *ri) {
    char *elected;
    int elected_is_me;

    /* Find out whether we lead the failover epoch; the returned string is
     * only needed for the comparison, so release it right away. */
    elected = sentinelGetLeader(ri, ri->failover_epoch);
    elected_is_me = (elected != NULL) &&
                    (strcasecmp(elected,sentinel.myid) == 0);
    sdsfree(elected);

    if (elected_is_me || (ri->flags & SRI_FORCE_FAILOVER)) {
        /* We are allowed to go on: advance to the slave selection. */
        sentinelEvent(LL_WARNING,"+elected-leader",ri,"%@");
        if (sentinel.simfailure_flags & SENTINEL_SIMFAILURE_CRASH_AFTER_ELECTION)
            sentinelSimFailureCrash();
        ri->failover_state = SENTINEL_FAILOVER_STATE_SELECT_SLAVE;
        ri->failover_state_change_time = mstime();
        sentinelEvent(LL_WARNING,"+failover-state-select-slave",ri,"%@");
    } else {
        /* Not the leader and not forced: wait at most for the smaller of
         * SENTINEL_ELECTION_TIMEOUT and the configured failover timeout. */
        int wait_limit = SENTINEL_ELECTION_TIMEOUT;

        if (ri->failover_timeout < wait_limit)
            wait_limit = ri->failover_timeout;

        /* Give up when we were not elected within the limit. */
        if (mstime() - ri->failover_start_time > wait_limit) {
            sentinelEvent(LL_WARNING,"-failover-abort-not-elected",ri,"%@");
            sentinelAbortFailover(ri);
        }
    }
}

In the SENTINEL_FAILOVER_STATE_SELECT_SLAVE state, the leader Sentinel selects the best slave to promote; the detailed algorithm is shown in the following function, sentinelSelectSlave.

/* Pick the slave of 'master' to promote, or return NULL when no slave
 * passes the checks below.
 *
 * A slave is discarded when it is flagged S_DOWN or O_DOWN, its link is
 * disconnected, its link was last available more than 5 ping periods ago,
 * its priority is 0, its INFO data is too old, or its master link has been
 * down for too long. The surviving slaves are ordered with
 * compareSlavesForPromotion() and the first one is returned. */
sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master) {
    sentinelRedisInstance **candidates =
        zmalloc(sizeof(candidates[0])*dictSize(master->slaves));
    sentinelRedisInstance *best = NULL;
    int count = 0;
    dictIterator *it;
    dictEntry *entry;
    mstime_t max_link_down = master->down_after_period * 10;
    mstime_t info_max_age;

    /* Tolerated master link down time: ten down-after periods, plus the
     * time the master has already spent in S_DOWN, if any. */
    if (master->flags & SRI_S_DOWN)
        max_link_down += mstime() - master->s_down_since_time;

    /* While the master is in SDOWN state INFO is requested from slaves
     * every second, otherwise with the usual period, so a larger delay
     * has to be accepted in the latter case. */
    info_max_age = (master->flags & SRI_S_DOWN) ? SENTINEL_PING_PERIOD*5
                                                : SENTINEL_INFO_PERIOD*3;

    it = dictGetIterator(master->slaves);
    while ((entry = dictNext(it)) != NULL) {
        sentinelRedisInstance *cand = dictGetVal(entry);

        /* Reachability and eligibility checks. */
        if ((cand->flags & (SRI_S_DOWN|SRI_O_DOWN)) ||
            cand->link->disconnected ||
            mstime() - cand->link->last_avail_time > SENTINEL_PING_PERIOD*5 ||
            cand->slave_priority == 0)
        {
            continue;
        }

        /* Freshness checks on the information we have about the slave. */
        if (mstime() - cand->info_refresh > info_max_age ||
            cand->master_link_down_time > max_link_down)
        {
            continue;
        }

        candidates[count++] = cand;
    }
    dictReleaseIterator(it);

    if (count > 0) {
        qsort(candidates,count,sizeof(candidates[0]),
            compareSlavesForPromotion);
        best = candidates[0];
    }
    zfree(candidates);
    return best;
}

The sequence for selecting a good slave is:

It filters out all slaves that are in the S_DOWN or O_DOWN state.
It filters out all slaves with a disconnected link.
It filters out all slaves that have not replied to the Sentinel's ping within 5 ping periods (5 secs by default).
It filters out all slaves with priority 0.
It filters out slaves whose info_refresh time is older than 5 ping periods (5 secs by default) when the master is in S_DOWN state, or older than 3 INFO periods otherwise.
It filters out slaves whose master link down time is greater than the master's down time plus 10*down_after_period.

Then the remaining slaves are sorted by the following criteria, in order:
Slave Priority,
Slave Replication offset.
Slave Id.

The first slave in the sorted list will be promoted to master, and the failover proceeds to the SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE state; if no slave is available, the failover is aborted.

In the SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE state, the leader Sentinel sends the SLAVEOF NO ONE command to the selected slave, and the failover state changes to SENTINEL_FAILOVER_STATE_WAIT_PROMOTION.

/* Handler for the SEND_SLAVEOF_NOONE failover state of master 'ri'.
 *
 * Sends SLAVEOF NO ONE to the selected slave and, when the command was
 * queued successfully, moves the failover to WAIT_PROMOTION. While the
 * selected slave is disconnected the state is retried at every call, and
 * the failover is aborted once the failover timeout has elapsed. */
void sentinelFailoverSendSlaveOfNoOne(sentinelRedisInstance *ri) {
    sentinelRedisInstance *target = ri->promoted_slave;

    /* The command can't be delivered over a disconnected link: stay in
     * this state until the link is back or the timeout is reached. */
    if (target->link->disconnected) {
        mstime_t waited = mstime() - ri->failover_state_change_time;

        if (waited > ri->failover_timeout) {
            sentinelEvent(LL_WARNING,"-failover-abort-slave-timeout",ri,"%@");
            sentinelAbortFailover(ri);
        }
        return;
    }

    /* Turn the slave into a master. The reply itself is not inspected:
     * success is detected indirectly, when INFO reports the new role
     * (master instead of slave). */
    if (sentinelSendSlaveOf(target,NULL,0) != C_OK) return;

    sentinelEvent(LL_NOTICE, "+failover-state-wait-promotion",
        target,"%@");
    ri->failover_state = SENTINEL_FAILOVER_STATE_WAIT_PROMOTION;
    ri->failover_state_change_time = mstime();
}

In the SENTINEL_FAILOVER_STATE_WAIT_PROMOTION state, the Sentinel keeps waiting for the promoted slave to report the role change through its INFO command reply; if this doesn't happen within failover-timeout, the failover process is aborted.

/* Handler for the WAIT_PROMOTION failover state of master 'ri'.
 *
 * The promotion is detected indirectly, by the code that parses the INFO
 * output of the promoted slave, which is also where the switch to the
 * next state happens. Here only the timeout is handled: the failover is
 * aborted when the state has lasted longer than the failover timeout. */
void sentinelFailoverWaitPromotion(sentinelRedisInstance *ri) {
    mstime_t waited = mstime() - ri->failover_state_change_time;

    /* Still within the allowed time: keep waiting. */
    if (waited <= ri->failover_timeout) return;

    sentinelEvent(LL_WARNING,"-failover-abort-slave-timeout",ri,"%@");
    sentinelAbortFailover(ri);
}

However, if during the wait it receives an INFO reply showing that the slave's role has changed to master, it changes the master's failover state to SENTINEL_FAILOVER_STATE_RECONF_SLAVES.

/* Handle slave -> master role switch. */  
   if ((ri->flags & SRI_SLAVE) && role == SRI_MASTER) {  
       /* If this is a promoted slave we can change state to the 
        * failover state machine. */  
       if ((ri->flags & SRI_PROMOTED) &&  
           (ri->master->flags & SRI_FAILOVER_IN_PROGRESS) &&  
           (ri->master->failover_state ==  
               SENTINEL_FAILOVER_STATE_WAIT_PROMOTION))  
       {  
           /* Now that we are sure the slave was reconfigured as a master 
            * set the master configuration epoch to the epoch we won the 
            * election to perform this failover. This will force the other 
            * Sentinels to update their config (assuming there is not 
            * a newer one already available). */  
           ri->master->config_epoch = ri->master->failover_epoch;  
           ri->master->failover_state = SENTINEL_FAILOVER_STATE_RECONF_SLAVES;  
           ri->master->failover_state_change_time = mstime();  
           sentinelFlushConfig();  
           sentinelEvent(LL_WARNING,"+promoted-slave",ri,"%@");  
           if (sentinel.simfailure_flags &  
               SENTINEL_SIMFAILURE_CRASH_AFTER_PROMOTION)  
               sentinelSimFailureCrash();  
           sentinelEvent(LL_WARNING,"+failover-state-reconf-slaves",  
               ri->master,"%@");  
           sentinelCallClientReconfScript(ri->master,SENTINEL_LEADER,  
               "start",ri->master->addr,ri->addr);  
           sentinelForceHelloUpdateForMaster(ri->master);  
       } else {  

In the SENTINEL_FAILOVER_STATE_RECONF_SLAVES state, the leader Sentinel sends SLAVEOF to the other slaves:

/* Handler for the RECONF_SLAVES failover state of 'master'.
 *
 * Sends SLAVEOF <promoted slave address> to the slaves that still have to
 * be reconfigured, keeping the number of slaves being reconfigured at the
 * same time below master->parallel_syncs. Before returning it calls
 * sentinelFailoverDetectEnd() to check whether the failover is over. */
void sentinelFailoverReconfNextSlave(sentinelRedisInstance *master) {
    dictIterator *it;
    dictEntry *entry;
    int busy = 0;

    /* First pass: count the slaves whose reconfiguration is under way. */
    it = dictGetIterator(master->slaves);
    while ((entry = dictNext(it)) != NULL) {
        sentinelRedisInstance *replica = dictGetVal(entry);

        if (replica->flags & (SRI_RECONF_SENT|SRI_RECONF_INPROG)) busy++;
    }
    dictReleaseIterator(it);

    /* Second pass: start new reconfigurations while there is room. */
    it = dictGetIterator(master->slaves);
    while (busy < master->parallel_syncs &&
           (entry = dictNext(it)) != NULL)
    {
        sentinelRedisInstance *replica = dictGetVal(entry);

        /* The promoted slave and the slaves already done are skipped. */
        if (replica->flags & (SRI_PROMOTED|SRI_RECONF_DONE)) continue;

        /* A slave that received the command but did not move forward for
         * too long is flagged as done anyway: Sentinels will later detect
         * it as misconfigured and fix its configuration. */
        if ((replica->flags & SRI_RECONF_SENT) &&
            (mstime() - replica->slave_reconf_sent_time) >
            SENTINEL_SLAVE_RECONF_TIMEOUT)
        {
            sentinelEvent(LL_NOTICE,"-slave-reconf-sent-timeout",replica,"%@");
            replica->flags &= ~SRI_RECONF_SENT;
            replica->flags |= SRI_RECONF_DONE;
        }

        /* Leave alone the slaves with a reconfiguration already under way
         * and the ones we can't talk to. */
        if ((replica->flags & (SRI_RECONF_SENT|SRI_RECONF_INPROG)) ||
            replica->link->disconnected)
        {
            continue;
        }

        /* Ask the slave to replicate from the new master. */
        if (sentinelSendSlaveOf(replica,
                master->promoted_slave->addr->ip,
                master->promoted_slave->addr->port) != C_OK)
        {
            continue;
        }
        replica->flags |= SRI_RECONF_SENT;
        replica->slave_reconf_sent_time = mstime();
        sentinelEvent(LL_NOTICE,"+slave-reconf-sent",replica,"%@");
        busy++;
    }
    dictReleaseIterator(it);

    /* See if every slave is reconfigured, and handle the timeout. */
    sentinelFailoverDetectEnd(master);
}

The failover ends only when all the other reachable slaves have been reconfigured to replicate from the newly promoted slave, or when the failover timeout elapses; the master's failover state then becomes SENTINEL_FAILOVER_STATE_UPDATE_CONFIG.

/* Check whether the failover of 'master' can be considered finished.
 *
 * Nothing is done while there is no promoted slave or the promoted slave
 * is flagged S_DOWN. Otherwise the failover ends, and the state becomes
 * UPDATE_CONFIG, when no slave is left to reconfigure (slaves flagged
 * S_DOWN are not waited for) or when the failover timeout has elapsed.
 * On timeout a best effort SLAVEOF is also sent to the connected slaves
 * that were not reconfigured yet. */
void sentinelFailoverDetectEnd(sentinelRedisInstance *master) {
    int pending = 0, timed_out = 0;
    dictIterator *it;
    dictEntry *entry;
    mstime_t elapsed = mstime() - master->failover_state_change_time;

    /* The failover can't be finished if the promoted slave is missing or
     * not reachable. */
    if (master->promoted_slave == NULL ||
        master->promoted_slave->flags & SRI_S_DOWN) return;

    /* Count the slaves we are still waiting for: the promoted one, the
     * ones already reconfigured and the unreachable ones don't count. */
    it = dictGetIterator(master->slaves);
    while ((entry = dictNext(it)) != NULL) {
        sentinelRedisInstance *replica = dictGetVal(entry);

        if (replica->flags & (SRI_PROMOTED|SRI_RECONF_DONE|SRI_S_DOWN))
            continue;
        pending++;
    }
    dictReleaseIterator(it);

    /* Past the failover timeout the end is forced regardless of the
     * slaves still pending. */
    if (elapsed > master->failover_timeout) {
        pending = 0;
        timed_out = 1;
        sentinelEvent(LL_WARNING,"+failover-end-for-timeout",master,"%@");
    }

    if (pending == 0) {
        sentinelEvent(LL_WARNING,"+failover-end",master,"%@");
        master->failover_state = SENTINEL_FAILOVER_STATE_UPDATE_CONFIG;
        master->failover_state_change_time = mstime();
    }

    if (!timed_out) return;

    /* The end was forced by the timeout: make a best effort attempt to
     * point the slaves not reconfigured yet to the new master. */
    it = dictGetIterator(master->slaves);
    while ((entry = dictNext(it)) != NULL) {
        sentinelRedisInstance *replica = dictGetVal(entry);

        if (replica->flags & (SRI_PROMOTED|SRI_RECONF_DONE|SRI_RECONF_SENT))
            continue;
        if (replica->link->disconnected) continue;

        if (sentinelSendSlaveOf(replica,
                master->promoted_slave->addr->ip,
                master->promoted_slave->addr->port) != C_OK)
        {
            continue;
        }
        sentinelEvent(LL_NOTICE,"+slave-reconf-sent-be",replica,"%@");
        replica->flags |= SRI_RECONF_SENT;
    }
    dictReleaseIterator(it);
}

Finally, when the monitored master is in the SENTINEL_FAILOVER_STATE_UPDATE_CONFIG state, the Sentinel reconfigures itself to use the promoted slave as the master and makes the failed master a slave of the new master. Then the failover process ends.

   if (ri->flags & SRI_MASTER) {  
        sentinelHandleDictOfRedisInstances(ri->slaves);  
        sentinelHandleDictOfRedisInstances(ri->sentinels);  
        if (ri->failover_state == SENTINEL_FAILOVER_STATE_UPDATE_CONFIG) {  
            switch_to_promoted = ri;  
        }  
    }  
}  
if (switch_to_promoted)  
    sentinelFailoverSwitchToPromotedSlave(switch_to_promoted);  
dictReleaseIterator(di);  

The following graph describes the whole failover process.