[pve-devel] [PATCH corosync-pve 1/2] cherry-pick fixes

Tue Nov 9 13:07:39 CET 2021

patch #3 should improve network load in recovery situations
patch #4 fixes a cpg corruption issue discovered while investigating the
knet sequence number bug

Signed-off-by: Fabian Grünbichler <f.gruenbichler at proxmox.com>
---
 ...cel_hold_on_retransmit-config-option.patch | 132 ++++++++++++++++++
 ...ch-totempg-buffers-at-the-right-time.patch | 113 +++++++++++++++
 debian/patches/series                         |   2 +
 3 files changed, 247 insertions(+)
 create mode 100644 debian/patches/0003-totem-Add-cancel_hold_on_retransmit-config-option.patch
 create mode 100644 debian/patches/0004-totemsrp-Switch-totempg-buffers-at-the-right-time.patch

diff --git a/debian/patches/0003-totem-Add-cancel_hold_on_retransmit-config-option.patch b/debian/patches/0003-totem-Add-cancel_hold_on_retransmit-config-option.patch
new file mode 100644
index 0000000..7fd66cf
--- /dev/null
+++ b/debian/patches/0003-totem-Add-cancel_hold_on_retransmit-config-option.patch
@@ -0,0 +1,132 @@
+From cdf72925db5a81e546ca8e8d7d8291ee1fc77be4 Mon Sep 17 00:00:00 2001
+From: Jan Friesse <jfriesse at redhat.com>
+Date: Wed, 11 Aug 2021 17:34:05 +0200
+Subject: [PATCH 3/4] totem: Add cancel_hold_on_retransmit config option
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Previously, existence of retransmit messages canceled holding
+of token (and never allowed representative to enter token hold
+state).
+
+This makes token rotating maximum speed and keeps processor
+resending messages over and over again - overloading network
+and reducing chance to successfully deliver the messages.
+
+Also there were reports of various Antivirus / IPS / IDS which slows
+down delivery of packets with certain sizes (packets bigger than token)
+what make Corosync retransmit messages over and over again.
+
+Proposed solution is to allow representative to enter token hold
+state when there are only retransmit messages. This allows network to
+handle overload and/or gives Antivirus/IPS/IDS enough time scan and
+deliver packets without corosync entering "FAILED TO RECEIVE" state and
+adding more load to network.
+
+Signed-off-by: Jan Friesse <jfriesse at redhat.com>
+Reviewed-by: Christine Caulfield <ccaulfie at redhat.com>
+Signed-off-by: Fabian Grünbichler <f.gruenbichler at proxmox.com>
+---
+ include/corosync/totem/totem.h |  2 ++
+ exec/totemconfig.c             |  6 ++++++
+ exec/totemsrp.c                |  5 +++--
+ man/corosync.conf.5            | 15 ++++++++++++++-
+ 4 files changed, 25 insertions(+), 3 deletions(-)
+
+diff --git a/include/corosync/totem/totem.h b/include/corosync/totem/totem.h
+index 8b166566..bdb6a15f 100644
+--- a/include/corosync/totem/totem.h
++++ b/include/corosync/totem/totem.h
+@@ -244,6 +244,8 @@ struct totem_config {
+ 
+ 	unsigned int block_unlisted_ips;
+ 
++	unsigned int cancel_token_hold_on_retransmit;
++
+ 	void (*totem_memb_ring_id_create_or_load) (
+ 	    struct memb_ring_id *memb_ring_id,
+ 	    unsigned int nodeid);
+diff --git a/exec/totemconfig.c b/exec/totemconfig.c
+index 57a1587a..46e09952 100644
+--- a/exec/totemconfig.c
++++ b/exec/totemconfig.c
+@@ -81,6 +81,7 @@
+ #define MAX_MESSAGES				17
+ #define MISS_COUNT_CONST			5
+ #define BLOCK_UNLISTED_IPS			1
++#define CANCEL_TOKEN_HOLD_ON_RETRANSMIT		0
+ /* This constant is not used for knet */
+ #define UDP_NETMTU                              1500
+ 
+@@ -144,6 +145,8 @@ static void *totem_get_param_by_name(struct totem_config *totem_config, const ch
+ 		return totem_config->knet_compression_model;
+ 	if (strcmp(param_name, "totem.block_unlisted_ips") == 0)
+ 		return &totem_config->block_unlisted_ips;
++	if (strcmp(param_name, "totem.cancel_token_hold_on_retransmit") == 0)
++		return &totem_config->cancel_token_hold_on_retransmit;
+ 
+ 	return NULL;
+ }
+@@ -365,6 +368,9 @@ void totem_volatile_config_read (struct totem_config *totem_config, icmap_map_t
+ 
+ 	totem_volatile_config_set_boolean_value(totem_config, temp_map, "totem.block_unlisted_ips", deleted_key,
+ 	    BLOCK_UNLISTED_IPS);
++
++	totem_volatile_config_set_boolean_value(totem_config, temp_map, "totem.cancel_token_hold_on_retransmit",
++	    deleted_key, CANCEL_TOKEN_HOLD_ON_RETRANSMIT);
+ }
+ 
+ int totem_volatile_config_validate (
+diff --git a/exec/totemsrp.c b/exec/totemsrp.c
+index 949d367b..d24b11fa 100644
+--- a/exec/totemsrp.c
++++ b/exec/totemsrp.c
+@@ -3981,8 +3981,9 @@ static int message_handler_orf_token (
+ 		transmits_allowed = fcc_calculate (instance, token);
+ 		mcasted_retransmit = orf_token_rtr (instance, token, &transmits_allowed);
+ 
+-		if (instance->my_token_held == 1 &&
+-			(token->rtr_list_entries > 0 || mcasted_retransmit > 0)) {
++		if (instance->totem_config->cancel_token_hold_on_retransmit &&
++		    instance->my_token_held == 1 &&
++		    (token->rtr_list_entries > 0 || mcasted_retransmit > 0)) {
+ 			instance->my_token_held = 0;
+ 			forward_token = 1;
+ 		}
+diff --git a/man/corosync.conf.5 b/man/corosync.conf.5
+index 0588ad1e..a3771ea7 100644
+--- a/man/corosync.conf.5
++++ b/man/corosync.conf.5
+@@ -32,7 +32,7 @@
+ .\" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ .\" * THE POSSIBILITY OF SUCH DAMAGE.
+ .\" */
+-.TH COROSYNC_CONF 5 2021-07-23 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual"
++.TH COROSYNC_CONF 5 2021-08-11 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual"
+ .SH NAME
+ corosync.conf - corosync executive configuration file
+ 
+@@ -584,6 +584,19 @@ with an old configuration.
+ 
+ The default value is yes.
+ 
++.TP
++cancel_token_hold_on_retransmit
++Allows Corosync to hold token by representative when there is too much
++retransmit messages. This allows network to process increased load without
++overloading it. Used mechanism is same as described for
++.B hold
++directive.
++
++Some deployments may prefer to never hold token when there is
++retransmit messages. If so, option should be set to yes.
++
++The default value is no.
++
+ .PP
+ Within the
+ .B logging
+-- 
+2.30.2
+
diff --git a/debian/patches/0004-totemsrp-Switch-totempg-buffers-at-the-right-time.patch b/debian/patches/0004-totemsrp-Switch-totempg-buffers-at-the-right-time.patch
new file mode 100644
index 0000000..2ef9215
--- /dev/null
+++ b/debian/patches/0004-totemsrp-Switch-totempg-buffers-at-the-right-time.patch
@@ -0,0 +1,113 @@
+From 7dce8bc0066c7c76eeb26cc8f6fe4de3221d6798 Mon Sep 17 00:00:00 2001
+From: Jan Friesse <jfriesse at redhat.com>
+Date: Tue, 26 Oct 2021 18:17:59 +0200
+Subject: [PATCH 4/4] totemsrp: Switch totempg buffers at the right time
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Commit 92e0f9c7bb9b4b6a0da8d64bdf3b2e47ae55b1cc added switching of
+totempg buffers in sync phase. But because buffers got switch too early
+there was a problem when delivering recovered messages (messages got
+corrupted and/or lost). Solution is to switch buffers after recovered
+messages got delivered.
+
+I think it is worth to describe complete history with reproducers so it
+doesn't get lost.
+
+It all started with 402638929e5045ef520a7339696c687fbed0b31b (more info
+about original problem is described in
+https://bugzilla.redhat.com/show_bug.cgi?id=820821). This patch
+solves problem which is way to be reproduced with following reproducer:
+- 2 nodes
+- Both nodes running corosync and testcpg
+- Pause node 1 (SIGSTOP of corosync)
+- On node 1, send some messages by testcpg
+  (it's not answering but this doesn't matter). Simply hit ENTER key
+  few times is enough)
+- Wait till node 2 detects that node 1 left
+- Unpause node 1 (SIGCONT of corosync)
+
+and on node 1 newly mcasted cpg messages got sent before sync barrier,
+so node 2 logs "Unknown node -> we will not deliver message".
+
+Solution was to add switch of totemsrp new messages buffer.
+
+This patch was not enough so new one
+(92e0f9c7bb9b4b6a0da8d64bdf3b2e47ae55b1cc) was created. Reproducer of
+problem was similar, just cpgverify was used instead of testcpg.
+Occasionally when node 1 was unpaused it hang in sync phase because
+there was a partial message in totempg buffers. New sync message had
+different frag cont so it was thrown away and never delivered.
+
+After many years problem was found which is solved by this patch
+(original issue describe in
+https://github.com/corosync/corosync/issues/660).
+Reproducer is more complex:
+- 2 nodes
+- Node 1 is rate-limited (used script on the hypervisor side):
+  ```
+  iface=tapXXXX
+  # ~0.1MB/s in bit/s
+  rate=838856
+  # 1mb/s
+  burst=1048576
+  tc qdisc add dev $iface root handle 1: htb default 1
+  tc class add dev $iface parent 1: classid 1:1 htb rate ${rate}bps \
+    burst ${burst}b
+  tc qdisc add dev $iface handle ffff: ingress
+  tc filter add dev $iface parent ffff: prio 50 basic police rate \
+    ${rate}bps burst ${burst}b mtu 64kb "drop"
+  ```
+- Node 2 is running corosync and cpgverify
+- Node 1 keeps restarting of corosync and running cpgverify in cycle
+  - Console 1: while true; do corosync; sleep 20; \
+      kill $(pidof corosync); sleep 20; done
+  - Console 2: while true; do ./cpgverify;done
+
+And from time to time (reproduced usually in less than 5 minutes)
+cpgverify reports corrupted message.
+
+Signed-off-by: Jan Friesse <jfriesse at redhat.com>
+Reviewed-by: Fabio M. Di Nitto <fdinitto at redhat.com>
+Signed-off-by: Fabian Grünbichler <f.gruenbichler at proxmox.com>
+---
+ exec/totemsrp.c | 16 +++++++++++++++-
+ 1 file changed, 15 insertions(+), 1 deletion(-)
+
+diff --git a/exec/totemsrp.c b/exec/totemsrp.c
+index d24b11fa..fd71771b 100644
+--- a/exec/totemsrp.c
++++ b/exec/totemsrp.c
+@@ -1989,13 +1989,27 @@ static void memb_state_operational_enter (struct totemsrp_instance *instance)
+ 		trans_memb_list_totemip, instance->my_trans_memb_entries,
+ 		left_list, instance->my_left_memb_entries,
+ 		0, 0, &instance->my_ring_id);
++	/*
++	 * Switch new totemsrp messages queue. Messages sent from now on are stored
++	 * in different queue so synchronization messages are delivered first. Totempg
++	 * buffers will be switched later.
++	 */
+ 	instance->waiting_trans_ack = 1;
+-	instance->totemsrp_waiting_trans_ack_cb_fn (1);
+ 
+ // TODO we need to filter to ensure we only deliver those
+ // messages which are part of instance->my_deliver_memb
+ 	messages_deliver_to_app (instance, 1, instance->old_ring_state_high_seq_received);
+ 
++	/*
++	 * Switch totempg buffers. This used to be right after
++	 *   instance->waiting_trans_ack = 1;
++	 * line. This was causing problem, because there may be not yet
++	 * processed parts of messages in totempg buffers.
++	 * So when buffers were switched and recovered messages
++	 * got delivered it was not possible to assemble them.
++	 */
++	instance->totemsrp_waiting_trans_ack_cb_fn (1);
++
+ 	instance->my_aru = aru_save;
+ 
+ 	/*
+-- 
+2.30.2
+
diff --git a/debian/patches/series b/debian/patches/series
index fd3a2f0..74c8c39 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1,2 +1,4 @@
 0001-Enable-PrivateTmp-in-the-systemd-service-files.patch
 0002-only-start-corosync.service-if-conf-exists.patch
+0003-totem-Add-cancel_hold_on_retransmit-config-option.patch
+0004-totemsrp-Switch-totempg-buffers-at-the-right-time.patch
-- 
2.30.2