1. 問題表現(version 3.5.9)
Dec 29 2016 07:46:58 GMT: INFO (paxos): (paxos.c::2367) Cluster Integrity Check: Detected succession list discrepancy between node bb900007f14eb4b and self bb9ffe723270008 Dec 29 2016 07:46:58 GMT: INFO (paxos): (paxos.c::2412) CLUSTER INTEGRITY FAULT. [Phase 1 of 2] To fix, issue this command across all nodes: dun:nodes=bb900007f14eb4b Dec 29 2016 07:46:58 GMT: INFO (paxos): (paxos.c::2516) as_paxos_retransmit_check: principal bb9ffe723270008 retransmitting sync messages to nodes that have not responded yet ... Dec 29 2016 07:46:58 GMT: INFO (paxos): (paxos.c::1439) sending sync message to bb900007f14eb4b Dec 29 2016 07:46:58 GMT: INFO (paxos): (paxos.c::1448) SUCCESSION [9.0]@bb9ffe723270008: bb9ffe723270008 bb900007f14eb4b Dec 29 2016 07:47:03 GMT: INFO (paxos): (paxos.c::2367) Cluster Integrity Check: Detected succession list discrepancy between node bb900007f14eb4b and self bb9ffe723270008 Dec 29 2016 07:47:03 GMT: INFO (paxos): (paxos.c::2412) CLUSTER INTEGRITY FAULT. [Phase 1 of 2] To fix, issue this command across all nodes: dun:nodes=bb900007f14eb4b Dec 29 2016 07:47:03 GMT: INFO (paxos): (paxos.c::2516) as_paxos_retransmit_check: principal bb9ffe723270008 retransmitting sync messages to nodes that have not responded yet ... Dec 29 2016 07:47:03 GMT: INFO (paxos): (paxos.c::1439) sending sync message to bb900007f14eb4b Dec 29 2016 07:47:03 GMT: INFO (paxos): (paxos.c::1448) SUCCESSION [9.0]@bb9ffe723270008: bb9ffe723270008 bb900007f14eb4b
2. Cluster Integrity Check
// for each node in the succession list // compare the node's succession list with this server's succession list bool cluster_integrity_fault = false; bool are_nodes_not_dunned = false; for (int i = 0; i < g_config.paxos_max_cluster_size; i++) { cf_debug(AS_PAXOS, "Cluster Integrity Check: %d, %"PRIx64"", i, succ_list_index[i]); if (succ_list_index[i] == (cf_node) 0) { break; // we are done }
3. CLUSTER INTEGRITY FAULT
switch (g_config.paxos_recovery_policy) { case AS_PAXOS_RECOVERY_POLICY_MANUAL: { if (are_nodes_not_dunned) { snprintf(sbuf, 97, "CLUSTER INTEGRITY FAULT. [Phase 1 of 2] To fix, issue this command across all nodes: dun:nodes="); } else { snprintf(sbuf, 99, "CLUSTER INTEGRITY FAULT. [Phase 2 of 2] To fix, issue this command across all nodes: undun:nodes="); } bool nodes_missing = false; for (int i = 0; i < g_config.paxos_max_cluster_size; i++) { if ((cf_node)0 == missing_nodes[i]) { break; } snprintf(sbuf + strlen(sbuf), 18, "%"PRIx64",", missing_nodes[i]); nodes_missing = true; }
4. 原因分析
只要兩個節點之間無法互相通過 3002 端口同步狀態,就會出現上述問題。
導致該問題的原因有很多種:
① 防火牆
# 驗證方法: telnet ip port
② 進程 fd 耗盡,導致無法建立 socket
# as 默認 fd 數量(aerospike.conf): proto-fd-max 15000
# 驗證方法:
ll /proc/pid/fd | grep socket | wc -l
lsof -p asd-pid | grep "can't identify protocol" | wc -l
節點 100:BB9FFE723270008 192.168.56.100;節點 101:BB900007F80090B 192.168.56.101。101 能連接 100、而 100 無法連接 101 時,就會出現上面的 as 集群各節點狀態確認問題:101 節點可以連接 100 的 3002 端口,但 100 節點無法連接 101 的 3002 端口。
101 連接 100 的 3002 端口
[root@c101 ~]# telnet 192.168.56.100 3002 Trying 192.168.56.100... Connected to 192.168.56.100. Escape character is '^]'. Mhc
100 連接 101 的 3002 端口
[root@c100 ~]# telnet 192.168.56.101 3002 Trying 192.168.56.101... telnet: connect to address 192.168.56.101: No route to host [root@c100 ~]#
100節點網絡狀態
[root@c100 ~]# netstat -nat Active Internet connections (servers and established) Proto Recv-Q Send-Q Local Address Foreign Address State tcp 0 0 0.0.0.0:3001 0.0.0.0:* LISTEN tcp 0 0 192.168.56.100:3002 0.0.0.0:* LISTEN tcp 0 0 0.0.0.0:3003 0.0.0.0:* LISTEN tcp 0 0 0.0.0.0:22 0.0.0.0:* LISTEN tcp 0 0 0.0.0.0:3000 0.0.0.0:* LISTEN tcp 0 0 192.168.56.100:3002 192.168.56.101:58930 ESTABLISHED tcp 0 52 192.168.56.100:22 192.168.56.1:52622 ESTABLISHED tcp 0 0 192.168.56.100:22 192.168.56.1:52188 ESTABLISHED tcp 0 0 192.168.56.100:22 192.168.56.1:52031 ESTABLISHED tcp6 0 0 :::3306 :::* LISTEN tcp6 0 0 :::22 :::* LISTEN [root@c100 ~]#
101節點網絡狀態
[root@c101 ~]# netstat -nat Active Internet connections (servers and established) Proto Recv-Q Send-Q Local Address Foreign Address State tcp 0 0 0.0.0.0:3001 0.0.0.0:* LISTEN tcp 0 0 127.0.0.1:25 0.0.0.0:* LISTEN tcp 0 0 192.168.56.101:3002 0.0.0.0:* LISTEN tcp 0 0 0.0.0.0:3003 0.0.0.0:* LISTEN tcp 0 0 0.0.0.0:22 0.0.0.0:* LISTEN tcp 0 0 0.0.0.0:3000 0.0.0.0:* LISTEN tcp 0 0 192.168.56.101:22 192.168.56.1:55739 ESTABLISHED tcp 0 0 192.168.56.101:58930 192.168.56.100:3002 ESTABLISHED tcp 0 52 192.168.56.101:22 192.168.56.1:55723 ESTABLISHED tcp 0 0 192.168.56.101:22 192.168.56.1:55135 ESTABLISHED tcp6 0 0 ::1:25 :::* LISTEN tcp6 0 0 :::22 :::* LISTEN [root@c101 ~]#
5.問題還原重現方法
節點 A 可以連接節點 B,而節點 B 無法連接節點 A:將節點 A 的防火牆打開即可。