简述 Redis 集群配置以及基础原理

# 集群

集群是redis的分布式数据库方案。通过分片实现数据共享。提供复制和故障转移功能。

// 保存连接节点所需的有关信息
typedef struct clusterLink {

    // 连接的创建时间
    mstime_t ctime;             /* Link creation time */

    // TCP 套接字描述符
    int fd;                     /* TCP socket file descriptor */

    // 输出缓冲区，保存着等待发送给其他节点的消息（message）。
    sds sndbuf;                 /* Packet send buffer */

    // 输入缓冲区，保存着从其他节点接收到的消息。
    sds rcvbuf;                 /* Packet reception buffer */

    // 与这个连接相关联的节点，如果没有的话就为 NULL
    struct clusterNode *node;   /* Node related to this link if any, or NULL */

} clusterLink;

// 节点状态
struct clusterNode {

    // 创建节点的时间
    mstime_t ctime; /* Node object creation time. */

    // 节点的名字，由 40 个十六进制字符组成
    // 例如 68eef66df23420a5862208ef5b1a7005b806f2ff
    char name[REDIS_CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */

    // 节点标识
    // 使用各种不同的标识值记录节点的角色（比如主节点或者从节点），
    // 以及节点目前所处的状态（比如在线或者下线）。
    int flags;      /* REDIS_NODE_... */

    // 节点当前的配置纪元，用于实现故障转移
    uint64_t configEpoch; /* Last configEpoch observed for this node */

    // 由这个节点负责处理的槽
    // 一共有 REDIS_CLUSTER_SLOTS / 8 个字节长
    // 每个字节的每个位记录了一个槽的保存状态
    // 位的值为 1 表示槽正由本节点处理，值为 0 则表示槽并非本节点处理
    // 比如 slots[0] 的第一个位保存了槽 0 的保存情况
    // slots[0] 的第二个位保存了槽 1 的保存情况，以此类推
    unsigned char slots[REDIS_CLUSTER_SLOTS/8]; /* slots handled by this node */

    // 该节点负责处理的槽数量
    int numslots;   /* Number of slots handled by this node */

    // 如果本节点是主节点，那么用这个属性记录从节点的数量
    int numslaves;  /* Number of slave nodes, if this is a master */

    // 指针数组，指向各个从节点
    struct clusterNode **slaves; /* pointers to slave nodes */

    // 如果这是一个从节点，那么指向主节点
    struct clusterNode *slaveof; /* pointer to the master node */

    // 最后一次发送 PING 命令的时间
    mstime_t ping_sent;      /* Unix time we sent latest ping */

    // 最后一次接收 PONG 回复的时间戳
    mstime_t pong_received;  /* Unix time we received the pong */

    // 最后一次被设置为 FAIL 状态的时间
    mstime_t fail_time;      /* Unix time when FAIL flag was set */

    // 最后一次给某个从节点投票的时间
    mstime_t voted_time;     /* Last time we voted for a slave of this master */

    // 最后一次从这个节点接收到复制偏移量的时间
    mstime_t repl_offset_time;  /* Unix time we received offset for this node */

    // 这个节点的复制偏移量
    long long repl_offset;      /* Last known repl offset for this node. */

    // 节点的 IP 地址
    char ip[REDIS_IP_STR_LEN];  /* Latest known IP address of this node */

    // 节点的端口号
    int port;                   /* Latest known port of this node */

    // 保存连接节点所需的有关信息
    clusterLink *link;          /* TCP/IP link with this node */

    // 一个链表，记录了所有其他节点对该节点的下线报告
    list *fail_reports;         /* List of nodes signaling this as failing */

};
typedef struct clusterNode clusterNode;


// 集群状态，每个节点都保存着一个这样的状态，记录了它们眼中的集群的样子。
// 另外，虽然这个结构主要用于记录集群的属性，但是为了节约资源，
// 有些与节点有关的属性，比如 slots_to_keys 、 failover_auth_count 
// 也被放到了这个结构里面。
typedef struct clusterState {

    // 指向当前节点的指针
    clusterNode *myself;  /* This node */

    // 集群当前的配置纪元，用于实现故障转移
    uint64_t currentEpoch;

    // 集群当前的状态：是在线还是下线
    int state;            /* REDIS_CLUSTER_OK, REDIS_CLUSTER_FAIL, ... */

    // 集群中至少处理着一个槽的节点的数量。
    int size;             /* Num of master nodes with at least one slot */

    // 集群节点名单（包括 myself 节点）
    // 字典的键为节点的名字，字典的值为 clusterNode 结构
    dict *nodes;          /* Hash table of name -> clusterNode structures */

    // 节点黑名单，用于 CLUSTER FORGET 命令
    // 防止被 FORGET 的命令重新被添加到集群里面
    // （不过现在似乎没有在使用的样子，已废弃？还是尚未实现？）
    dict *nodes_black_list; /* Nodes we don't re-add for a few seconds. */

    // 记录要从当前节点迁移到目标节点的槽，以及迁移的目标节点
    // migrating_slots_to[i] = NULL 表示槽 i 未被迁移
    // migrating_slots_to[i] = clusterNode_A 表示槽 i 要从本节点迁移至节点 A
    clusterNode *migrating_slots_to[REDIS_CLUSTER_SLOTS];

    // 记录要从源节点迁移到本节点的槽，以及进行迁移的源节点
    // importing_slots_from[i] = NULL 表示槽 i 未进行导入
    // importing_slots_from[i] = clusterNode_A 表示正从节点 A 中导入槽 i
    clusterNode *importing_slots_from[REDIS_CLUSTER_SLOTS];

    // 负责处理各个槽的节点
    // 例如 slots[i] = clusterNode_A 表示槽 i 由节点 A 处理
    clusterNode *slots[REDIS_CLUSTER_SLOTS];

    // 跳跃表，表中以槽作为分值，键作为成员，对槽进行有序排序
    // 当需要对某些槽进行区间（range）操作时，这个跳跃表可以提供方便
    // 具体操作定义在 db.c 里面
    zskiplist *slots_to_keys;

    /* The following fields are used to take the slave state on elections. */
    // 以下这些域被用于进行故障转移选举

    // 上次执行选举或者下次执行选举的时间
    mstime_t failover_auth_time; /* Time of previous or next election. */

    // 节点获得的投票数量
    int failover_auth_count;    /* Number of votes received so far. */

    // 如果值为 1 ，表示本节点已经向其他节点发送了投票请求
    int failover_auth_sent;     /* True if we already asked for votes. */

    int failover_auth_rank;     /* This slave rank for current auth request. */

    uint64_t failover_auth_epoch; /* Epoch of the current election. */

    /* Manual failover state in common. */
    /* 共用的手动故障转移状态 */

    // 手动故障转移执行的时间限制
    mstime_t mf_end;            /* Manual failover time limit (ms unixtime).
                                   It is zero if there is no MF in progress. */
    /* Manual failover state of master. */
    /* 主服务器的手动故障转移状态 */
    clusterNode *mf_slave;      /* Slave performing the manual failover. */
    /* Manual failover state of slave. */
    /* 从服务器的手动故障转移状态 */
    long long mf_master_offset; /* Master offset the slave needs to start MF
                                   or zero if stil not received. */
    // 指示手动故障转移是否可以开始的标志值
    // 值为非 0 时表示各个主服务器可以开始投票
    int mf_can_start;           /* If non-zero signal that the manual failover
                                   can start requesting masters vote. */

    /* The followign fields are uesd by masters to take state on elections. */
    /* 以下这些域由主服务器使用，用于记录选举时的状态 */

    // 集群最后一次进行投票的纪元
    uint64_t lastVoteEpoch;     /* Epoch of the last vote granted. */

    // 在进入下个事件循环之前要做的事情，以各个 flag 来记录
    int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */

    // 通过 cluster 连接发送的消息数量
    long long stats_bus_messages_sent;  /* Num of msg sent via cluster bus. */

    // 通过 cluster 接收到的消息数量
    long long stats_bus_messages_received; /* Num of msg rcvd via cluster bus.*/

} clusterState;

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188

[外链图片转存中...(img-pyfo5Uy7-1653145947299)]

cluster meet命令

客户端向节点A发送meet命令，指定节点B的ip和端口。将B加入到A的集群。

向A发命令cluster meet B.
A为B创建一个clusterNode结构并保存在dict中
A向B发一个meet消息
B为A创建一个clusterNode结构并保存在dict中
B返回A一个PONG消息
A收到PONG返回一个PING
B收到PING，握手结束 [外链图片转存中...(img-MJB1V7if-1653145947300)]

槽指派

redis通过分片的方式保存键值对，集群的整个数据库被分成16384个槽。每个节点处理一定数量的槽，每个key一定属于其中一个槽。

向节点指派槽的命令：cluster addslots 1 2 3 4
clusterNode中Slots数组保存当前节点所有槽信息。Slots是二进制数组。
传播节点的槽指派信息：集群节点会相互发送自己被指派的槽，节点收到其他几点的槽信息会更新相应的dict中的clusterNode结构。
clusterState中Slots数组保存所有槽信息。 [外链图片转存中...(img-p5szUZKB-1653145947300)]
cluster addslots 1 2 3 4的实现：首先遍历clusterState的Slots判断命令中的槽还没有被指派。然后遍历命令中的槽，设置clusterState的Slots和clusterNode的slots。

集群中执行命令

接收命令的节点计算命令要处理的数据库属于哪个槽，如果是指派给自己的就直接执行；如果不是就向客户端返回一个moved错误指引客户端至正确节点执行。
计算给定key的槽：任何一个key通过这个算法就可以得出一个介于0-16383之间的槽。crc16(key)&16383
集群中的节点只能使用0号数据库。

重新分片集群管理软件redis-trib负责执行。

redis-trib向目标节点发送导入命令，并将slot指向目标节点
redis-trib向源节点发送导出命令
redis-trib从源节点获取导出slot的键，导入到目标节点
键迁移结束。向集群中任意节点发送消息，slot已指向目标节点。 [外链图片转存中...(img-dezJf9DD-1653145947300)]

ask错误

ask错误：正在被转移的key被查询时，会发生查询转移
Cluster setslot importing命令：clusterState的Importing_lots_from记录着导入的槽
Cluster setslot migrating命令:clusterState的migrating_slots_to记录着正在导出的槽。ask错误的实现

publish的实现

不是直接向所有节点广播，而是通过一个节点向所有节点广播。因为redis中有一个潜规则：各节点通过消息通信。

typedef struct {

    // 频道名长度
    uint32_t channel_len;

    // 消息长度
    uint32_t message_len;

    // 消息内容，格式为 频道名+消息
    // bulk_data[0:channel_len-1] 为频道名
    // bulk_data[channel_len:channel_len+message_len-1] 为消息
    unsigned char bulk_data[8]; /* defined as 8 just for alignment concerns. */

} clusterMsgDataPublish;

1
2
3
4
5
6
7
8
9
10
11
12
13
14

# 复制

复制的步骤

redis的复制分为两个步骤，同步（sync）和命令传播(commond propogate)。
同步：从服务器向主服务器发送sync命令，主服务器生成rdb文件，并将生成rdb期间的数据库修改命令保存在缓冲区，然后都传给从服务器。同步浪费资源体现在：a.生成rdb耗费cpu，内存，i/o;b.传输rdb耗费网络资源；c.载入rdb会阻塞服务器。
命令传播：主服务器收到的修改命令也发给从服务器执行。
redis2.8以前的复制缺陷：缺陷场景：主从断线之后的重新复制。实现方式：断线之后，主从重新走一遍同步和命令传播。缺陷原因：同步是很浪费资源的操作，应该尽量避免。
redis2.8以后用psync代替sync;psync支持完全重同步和部分重同步。
部分重同步的实现：偏移量（同步的字节量），复制积压缓冲区(固定长度的队列)，服务器运行id（判断是不是向同一个服务器发送的同步命令）。 [外链图片转存中...(img-PHS6g12W-1653145947297)]

复制的实现

设置主服务器地址和端口。Slaveof 127.0.0.1 6379。从服务器保存主服务器地址和端口。
建立套接字。从服务器根据保存的主服务器地址和端口号，建立套接字。
从服务器发送ping命令。超时或者错误则重新建立套接字重试；返回“PONG”说明网络连接正常。
身份验证。主从都没有设置身份验证或者设置一致才可以通过身份验证。
从服务器发送监听端口，主服务器将其保存在redisClient的slave_listening_port;
同步。从服务器向主服务发送psync命令。 [外链图片转存中...(img-JbQvVK4o-1653145947297)]

心跳检测

从服务器每秒发送命令给主服务器，replconf ack offset,有三个作用

检测链接状况
辅助实现min-slaves选项。min-slaves-to-write 3，min-slaves-max-lag 10。表示从服务器少于3个，或者从服务延迟都大于等于10，主服务器将终止写命令。
检测命令丢失

编辑

上次更新: 2022/05/22, 00:01:01

← Redis 如何实现分布式锁？简述 Redis 中跳表的应用以及优缺点→