[pve-devel] applied: [PATCH kronosnet] cherry-pick netload issue fixes

Thomas Lamprecht t.lamprecht at proxmox.com
Tue Oct 15 15:04:09 CEST 2019


cherry-pick upstream kronosnet pull request:
https://github.com/kronosnet/kronosnet/pull/264

Includes fixes for MTU detection and defragmentation, normally
happening on high network load

Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
 ...e-SCTP-test-if-SCTP-is-not-supporte.patch} |  12 +-
 ...ables-to-make-it-easier-to-read-the-.patch |  67 +++++++++++
 ...ost-fix-defrag-buffers-reclaim-logic.patch |  96 ++++++++++++++++
 ...o-the-defrag-buffer-only-if-we-know-.patch |  30 +++++
 ...-to-knet_bench-to-specify-a-fixed-pa.patch | 104 ++++++++++++++++++
 ...-MTU-for-a-link-if-the-value-is-lowe.patch |  38 +++++++
 debian/patches/series                         |   7 +-
 7 files changed, 349 insertions(+), 5 deletions(-)
 rename debian/patches/{send-test-skip-the-SCTP-test-if-SCTP-is-not-supported-by-.patch => 0001-send-test-skip-the-SCTP-test-if-SCTP-is-not-supporte.patch} (68%)
 create mode 100644 debian/patches/0002-host-rename-variables-to-make-it-easier-to-read-the-.patch
 create mode 100644 debian/patches/0003-host-fix-defrag-buffers-reclaim-logic.patch
 create mode 100644 debian/patches/0004-rx-copy-data-into-the-defrag-buffer-only-if-we-know-.patch
 create mode 100644 debian/patches/0005-test-add-ability-to-knet_bench-to-specify-a-fixed-pa.patch
 create mode 100644 debian/patches/0006-PMTUd-invalidate-MTU-for-a-link-if-the-value-is-lowe.patch

diff --git a/debian/patches/send-test-skip-the-SCTP-test-if-SCTP-is-not-supported-by-.patch b/debian/patches/0001-send-test-skip-the-SCTP-test-if-SCTP-is-not-supporte.patch
similarity index 68%
rename from debian/patches/send-test-skip-the-SCTP-test-if-SCTP-is-not-supported-by-.patch
rename to debian/patches/0001-send-test-skip-the-SCTP-test-if-SCTP-is-not-supporte.patch
index fb13ba3..003550f 100644
--- a/debian/patches/send-test-skip-the-SCTP-test-if-SCTP-is-not-supported-by-.patch
+++ b/debian/patches/0001-send-test-skip-the-SCTP-test-if-SCTP-is-not-supporte.patch
@@ -1,17 +1,21 @@
-From: =?utf-8?q?Ferenc_W=C3=A1gner?= <wferi at debian.org>
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Ferenc=20W=C3=A1gner?= <wferi at debian.org>
 Date: Wed, 3 Apr 2019 10:26:11 +0200
-Subject: send test: skip the SCTP test if SCTP is not supported by the kernel
+Subject: [PATCH] send test: skip the SCTP test if SCTP is not supported by the
+ kernel
 
 For example, module loading is disabled on Debian build daemons.
+
+Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
 ---
  libknet/tests/api_knet_send.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)
 
 diff --git a/libknet/tests/api_knet_send.c b/libknet/tests/api_knet_send.c
-index f241201..1c55db1 100644
+index 50469ee..c2166ef 100644
 --- a/libknet/tests/api_knet_send.c
 +++ b/libknet/tests/api_knet_send.c
-@@ -173,12 +173,13 @@ static void test(uint8_t transport)
+@@ -181,12 +181,13 @@ static void test(uint8_t transport)
  	}
  
  	if (knet_link_set_config(knet_h, 1, 0, transport, &lo, &lo, 0) < 0) {
diff --git a/debian/patches/0002-host-rename-variables-to-make-it-easier-to-read-the-.patch b/debian/patches/0002-host-rename-variables-to-make-it-easier-to-read-the-.patch
new file mode 100644
index 0000000..8e7060b
--- /dev/null
+++ b/debian/patches/0002-host-rename-variables-to-make-it-easier-to-read-the-.patch
@@ -0,0 +1,67 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: "Fabio M. Di Nitto" <fdinitto at redhat.com>
+Date: Tue, 15 Oct 2019 06:46:36 +0200
+Subject: [PATCH] [host] rename variables to make it easier to read the code
+
+Signed-off-by: Fabio M. Di Nitto <fdinitto at redhat.com>
+(cherry picked from commit a081ff7fab6bb98f9ff2a88b1593776b0c27b6f1)
+Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
+---
+ libknet/host.c | 24 ++++++++++++------------
+ 1 file changed, 12 insertions(+), 12 deletions(-)
+
+diff --git a/libknet/host.c b/libknet/host.c
+index abb1f89..ac26b89 100644
+--- a/libknet/host.c
++++ b/libknet/host.c
+@@ -569,7 +569,7 @@ static void _clear_cbuffers(struct knet_host *host, seq_num_t rx_seq_num)
+ 
+ int _seq_num_lookup(struct knet_host *host, seq_num_t seq_num, int defrag_buf, int clear_buf)
+ {
+-	size_t i, j; /* circular buffer indexes */
++	size_t head, tail; /* circular buffer indexes */
+ 	seq_num_t seq_dist;
+ 	char *dst_cbuf = host->circular_buffer;
+ 	char *dst_cbuf_defrag = host->circular_buffer_defrag;
+@@ -585,13 +585,13 @@ int _seq_num_lookup(struct knet_host *host, seq_num_t seq_num, int defrag_buf, i
+ 		seq_dist = *dst_seq_num - seq_num;
+ 	}
+ 
+-	j = seq_num % KNET_CBUFFER_SIZE;
++	head = seq_num % KNET_CBUFFER_SIZE;
+ 
+ 	if (seq_dist < KNET_CBUFFER_SIZE) { /* seq num is in ring buffer */
+ 		if (!defrag_buf) {
+-			return (dst_cbuf[j] == 0) ? 1 : 0;
++			return (dst_cbuf[head] == 0) ? 1 : 0;
+ 		} else {
+-			return (dst_cbuf_defrag[j] == 0) ? 1 : 0;
++			return (dst_cbuf_defrag[head] == 0) ? 1 : 0;
+ 		}
+ 	} else if (seq_dist <= SEQ_MAX - KNET_CBUFFER_SIZE) {
+ 		memset(dst_cbuf, 0, KNET_CBUFFER_SIZE);
+@@ -600,16 +600,16 @@ int _seq_num_lookup(struct knet_host *host, seq_num_t seq_num, int defrag_buf, i
+ 	}
+ 
+ 	/* cleaning up circular buffer */
+-	i = (*dst_seq_num + 1) % KNET_CBUFFER_SIZE;
++	tail = (*dst_seq_num + 1) % KNET_CBUFFER_SIZE;
+ 
+-	if (i > j) {
+-		memset(dst_cbuf + i, 0, KNET_CBUFFER_SIZE - i);
+-		memset(dst_cbuf, 0, j + 1);
+-		memset(dst_cbuf_defrag + i, 0, KNET_CBUFFER_SIZE - i);
+-		memset(dst_cbuf_defrag, 0, j + 1);
++	if (tail > head) {
++		memset(dst_cbuf + tail, 0, KNET_CBUFFER_SIZE - tail);
++		memset(dst_cbuf, 0, head + 1);
++		memset(dst_cbuf_defrag + tail, 0, KNET_CBUFFER_SIZE - tail);
++		memset(dst_cbuf_defrag, 0, head + 1);
+ 	} else {
+-		memset(dst_cbuf + i, 0, j - i + 1);
+-		memset(dst_cbuf_defrag + i, 0, j - i + 1);
++		memset(dst_cbuf + tail, 0, head - tail + 1);
++		memset(dst_cbuf_defrag + tail, 0, head - tail + 1);
+ 	}
+ 
+ 	*dst_seq_num = seq_num;
diff --git a/debian/patches/0003-host-fix-defrag-buffers-reclaim-logic.patch b/debian/patches/0003-host-fix-defrag-buffers-reclaim-logic.patch
new file mode 100644
index 0000000..ca82c48
--- /dev/null
+++ b/debian/patches/0003-host-fix-defrag-buffers-reclaim-logic.patch
@@ -0,0 +1,96 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: "Fabio M. Di Nitto" <fdinitto at redhat.com>
+Date: Tue, 15 Oct 2019 06:53:24 +0200
+Subject: [PATCH] [host] fix defrag buffers reclaim logic
+
+The problem:
+
+- let's assume a 2 nodes (A and B) cluster setup
+- node A sends fragmented packets to node B and there is
+  packet loss on the network.
+- node B receives all those fragments and attempts to
+  reassemble them.
+- node A sends packet seq_num X in Y fragments.
+- node B receives only part of the fragments and stores
+  them in a defrag buf.
+- packet loss stops.
+- node A continues to send packets and a seq_num
+  roll-over takes place.
+- node A sends a new packet seq_num X in Y fragments.
+- node B gets confused here because the parts of the old
+  packet seq_num X are still stored and the buffer
+  has not been reclaimed.
+- node B continues to rebuild packet seq_num X with
+  old stale data and new data from after the roll-over.
+- node B completes reassembling the packet and delivers
+  junk to the application.
+
+The solution:
+
+Add a much stronger buffer reclaim logic that will apply
+on each received packet and not only when defrag buffers
+are needed, as there might be a mix of fragmented and not
+fragmented packets in-flight.
+
+The new logic creates a window of N packets that can be
+handled at the same time (based on the number of buffers)
+and clear everything else.
+
+Fixes https://github.com/kronosnet/kronosnet/issues/261
+
+Signed-off-by: Fabio M. Di Nitto <fdinitto at redhat.com>
+(cherry picked from commit 3fb0166ebd14e37a2fb9fe7aeae53d09e0e66b74)
+Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
+---
+ libknet/host.c | 31 +++++++++++++++++++++++++++++++
+ 1 file changed, 31 insertions(+)
+
+diff --git a/libknet/host.c b/libknet/host.c
+index ac26b89..85d4626 100644
+--- a/libknet/host.c
++++ b/libknet/host.c
+@@ -562,6 +562,35 @@ static void _clear_cbuffers(struct knet_host *host, seq_num_t rx_seq_num)
+ 	}
+ }
+ 
++static void _reclaim_old_defrag_bufs(struct knet_host *host, seq_num_t seq_num)
++{
++	seq_num_t head, tail; /* seq_num boundaries */
++	int i;
++
++	head = seq_num + 1;
++	tail = seq_num - (KNET_MAX_LINK + 1);
++
++	/*
++	 * expire old defrag buffers
++	 */
++	for (i = 0; i < KNET_MAX_LINK; i++) {
++		if (host->defrag_buf[i].in_use) {
++			/*
++			 * head has done a rollover to 0+
++			 */
++			if (tail > head) {
++				if ((host->defrag_buf[i].pckt_seq >= head) && (host->defrag_buf[i].pckt_seq <= tail)) {
++					host->defrag_buf[i].in_use = 0;
++				}
++			} else {
++				if ((host->defrag_buf[i].pckt_seq >= head) || (host->defrag_buf[i].pckt_seq <= tail)){
++					host->defrag_buf[i].in_use = 0;
++				}
++			}
++		}
++	}
++}
++
+ /*
+  * check if a given packet seq num is in the circular buffers
+  * defrag_buf = 0 -> use normal cbuf 1 -> use the defrag buffer lookup
+@@ -579,6 +608,8 @@ int _seq_num_lookup(struct knet_host *host, seq_num_t seq_num, int defrag_buf, i
+ 		_clear_cbuffers(host, seq_num);
+ 	}
+ 
++	_reclaim_old_defrag_bufs(host, seq_num);
++
+ 	if (seq_num < *dst_seq_num) {
+ 		seq_dist =  (SEQ_MAX - seq_num) + *dst_seq_num;
+ 	} else {
diff --git a/debian/patches/0004-rx-copy-data-into-the-defrag-buffer-only-if-we-know-.patch b/debian/patches/0004-rx-copy-data-into-the-defrag-buffer-only-if-we-know-.patch
new file mode 100644
index 0000000..46d06a8
--- /dev/null
+++ b/debian/patches/0004-rx-copy-data-into-the-defrag-buffer-only-if-we-know-.patch
@@ -0,0 +1,30 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: "Fabio M. Di Nitto" <fdinitto at redhat.com>
+Date: Tue, 15 Oct 2019 07:02:05 +0200
+Subject: [PATCH] [rx] copy data into the defrag buffer only if we know the
+ size of the frame
+
+Signed-off-by: Fabio M. Di Nitto <fdinitto at redhat.com>
+(cherry picked from commit 8b2863b392d275ea50fa19e8e8bebe40a6134707)
+Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
+---
+ libknet/threads_rx.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/libknet/threads_rx.c b/libknet/threads_rx.c
+index e8fe264..ef0c2cc 100644
+--- a/libknet/threads_rx.c
++++ b/libknet/threads_rx.c
+@@ -186,8 +186,10 @@ static int pckt_defrag(knet_handle_t knet_h, struct knet_header *inbuf, ssize_t
+ 		defrag_buf->frag_size = *len;
+ 	}
+ 
+-	memmove(defrag_buf->buf + ((inbuf->khp_data_frag_seq - 1) * defrag_buf->frag_size),
+-	       inbuf->khp_data_userdata, *len);
++	if (defrag_buf->frag_size) {
++		memmove(defrag_buf->buf + ((inbuf->khp_data_frag_seq - 1) * defrag_buf->frag_size),
++		       inbuf->khp_data_userdata, *len);
++	}
+ 
+ 	defrag_buf->frag_recv++;
+ 	defrag_buf->frag_map[inbuf->khp_data_frag_seq] = 1;
diff --git a/debian/patches/0005-test-add-ability-to-knet_bench-to-specify-a-fixed-pa.patch b/debian/patches/0005-test-add-ability-to-knet_bench-to-specify-a-fixed-pa.patch
new file mode 100644
index 0000000..ed11a81
--- /dev/null
+++ b/debian/patches/0005-test-add-ability-to-knet_bench-to-specify-a-fixed-pa.patch
@@ -0,0 +1,104 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: "Fabio M. Di Nitto" <fdinitto at redhat.com>
+Date: Tue, 15 Oct 2019 07:16:22 +0200
+Subject: [PATCH] [test] add ability to knet_bench to specify a fixed packet
+ size for perf test
+
+Signed-off-by: Fabio M. Di Nitto <fdinitto at redhat.com>
+(cherry picked from commit d39c189900ef1a5647c7264e799f793ab9fd93e2)
+Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
+---
+ libknet/tests/knet_bench.c | 24 ++++++++++++++++++++----
+ 1 file changed, 20 insertions(+), 4 deletions(-)
+
+diff --git a/libknet/tests/knet_bench.c b/libknet/tests/knet_bench.c
+index dc04239..54b5303 100644
+--- a/libknet/tests/knet_bench.c
++++ b/libknet/tests/knet_bench.c
+@@ -67,6 +67,8 @@ static int test_type = TEST_PING;
+ static uint64_t perf_by_size_size = 1 * ONE_GIGABYTE;
+ static uint64_t perf_by_time_secs = 10;
+ 
++static uint32_t force_packet_size = 0;
++
+ struct node {
+ 	int nodeid;
+ 	int links;
+@@ -109,6 +111,7 @@ static void print_help(void)
+ 	printf(" -s                                        nodeid that will generate traffic for benchmarks\n");
+ 	printf(" -S [size|seconds]                         when used in combination with -T perf-by-size it indicates how many GB of traffic to generate for the test. (default: 1GB)\n");
+ 	printf("                                           when used in combination with -T perf-by-time it indicates how many Seconds of traffic to generate for the test. (default: 10 seconds)\n");
++	printf(" -x                                        force packet size for perf-by-time or perf-by-size\n");
+ 	printf(" -C                                        repeat the test continously (default: off)\n");
+ 	printf(" -X[XX]                                    show stats at the end of the run (default: 1)\n");
+ 	printf("                                           1: show handle stats, 2: show summary link stats\n");
+@@ -250,7 +253,7 @@ static void setup_knet(int argc, char *argv[])
+ 
+ 	memset(nodes, 0, sizeof(nodes));
+ 
+-	while ((rv = getopt(argc, argv, "aCT:S:s:ldfom:wb:t:n:c:p:X::P:z:h")) != EOF) {
++	while ((rv = getopt(argc, argv, "aCT:S:s:ldfom:wb:t:n:c:p:x:X::P:z:h")) != EOF) {
+ 		switch(rv) {
+ 			case 'h':
+ 				print_help();
+@@ -406,6 +409,13 @@ static void setup_knet(int argc, char *argv[])
+ 				perf_by_size_size = (uint64_t)atoi(optarg) * ONE_GIGABYTE;
+ 				perf_by_time_secs = (uint64_t)atoi(optarg);
+ 				break;
++			case 'x':
++				force_packet_size = (uint32_t)atoi(optarg);
++				if ((force_packet_size < 1) || (force_packet_size > 65536)) {
++					printf("Unsupported packet size %u (accepted 1 - 65536)\n", force_packet_size);
++					exit(FAIL);
++				}
++				break;
+ 			case 'C':
+ 				continous = 1;
+ 				break;
+@@ -874,7 +884,7 @@ static int setup_send_buffers_common(struct knet_mmsghdr *msg, struct iovec *iov
+ 			printf("TXT: Unable to malloc!\n");
+ 			return -1;
+ 		}
+-		memset(tx_buf[i], 0, KNET_MAX_PACKET_SIZE);
++		memset(tx_buf[i], i, KNET_MAX_PACKET_SIZE);
+ 		iov_out[i].iov_base = (void *)tx_buf[i];
+ 		memset(&msg[i].msg_hdr, 0, sizeof(struct msghdr));
+ 		msg[i].msg_hdr.msg_iov = &iov_out[i];
+@@ -898,6 +908,9 @@ static void send_perf_data_by_size(void)
+ 	setup_send_buffers_common(msg, iov_out, tx_buf);
+ 
+ 	while (packetsize <= KNET_MAX_PACKET_SIZE) {
++		if (force_packet_size) {
++			packetsize = force_packet_size;
++		}
+ 		for (i = 0; i < PCKT_FRAG_MAX; i++) {
+ 			iov_out[i].iov_len = packetsize;
+ 		}
+@@ -926,7 +939,7 @@ static void send_perf_data_by_size(void)
+ 
+ 		knet_send(knet_h, ctrl_message, TEST_STOP, channel);
+ 
+-		if (packetsize == KNET_MAX_PACKET_SIZE) {
++		if ((packetsize == KNET_MAX_PACKET_SIZE) || (force_packet_size)) {
+ 			break;
+ 		}
+ 
+@@ -1175,6 +1188,9 @@ static void send_perf_data_by_time(void)
+ 	memset(&clock_end, 0, sizeof(clock_start));
+ 
+ 	while (packetsize <= KNET_MAX_PACKET_SIZE) {
++		if (force_packet_size) {
++			packetsize = force_packet_size;
++		}
+ 		for (i = 0; i < PCKT_FRAG_MAX; i++) {
+ 			iov_out[i].iov_len = packetsize;
+ 		}
+@@ -1205,7 +1221,7 @@ static void send_perf_data_by_time(void)
+ 
+ 		knet_send(knet_h, ctrl_message, TEST_STOP, channel);
+ 
+-		if (packetsize == KNET_MAX_PACKET_SIZE) {
++		if ((packetsize == KNET_MAX_PACKET_SIZE) || (force_packet_size)) {
+ 			break;
+ 		}
+ 
diff --git a/debian/patches/0006-PMTUd-invalidate-MTU-for-a-link-if-the-value-is-lowe.patch b/debian/patches/0006-PMTUd-invalidate-MTU-for-a-link-if-the-value-is-lowe.patch
new file mode 100644
index 0000000..20fda14
--- /dev/null
+++ b/debian/patches/0006-PMTUd-invalidate-MTU-for-a-link-if-the-value-is-lowe.patch
@@ -0,0 +1,38 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: "Fabio M. Di Nitto" <fdinitto at redhat.com>
+Date: Tue, 15 Oct 2019 11:53:56 +0200
+Subject: [PATCH] [PMTUd] invalidate MTU for a link if the value is lower than
+ minimum
+
+Under heavy network load and packet loss, calculated MTU can be
+too small. In that case we need to invalidate the link mtu,
+that would remove the link from the rotation (and traffic) and
+would give PMTUd time to get the right MTU in the next round.
+
+Signed-off-by: Fabio M. Di Nitto <fdinitto at redhat.com>
+(cherry picked from commit 34c08ae7e7903d79f06569b4f506a00c15af0238)
+Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
+---
+ libknet/threads_pmtud.c | 9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+diff --git a/libknet/threads_pmtud.c b/libknet/threads_pmtud.c
+index 75d5196..f4d15ae 100644
+--- a/libknet/threads_pmtud.c
++++ b/libknet/threads_pmtud.c
+@@ -482,7 +482,14 @@ static int _handle_check_pmtud(knet_handle_t knet_h, struct knet_host *dst_host,
+ 		}
+ 		dst_link->has_valid_mtu = 0;
+ 	} else {
+-		dst_link->has_valid_mtu = 1;
++		if (dst_link->status.mtu < calc_min_mtu(knet_h)) {
++			log_info(knet_h, KNET_SUB_PMTUD,
++				 "Invalid MTU detected for host: %u link: %u mtu: %u",
++				 dst_host->host_id, dst_link->link_id, dst_link->status.mtu);
++			dst_link->has_valid_mtu = 0;
++		} else {
++			dst_link->has_valid_mtu = 1;
++		}
+ 		if (dst_link->has_valid_mtu) {
+ 			if ((saved_pmtud) && (saved_pmtud != dst_link->status.mtu)) {
+ 				log_info(knet_h, KNET_SUB_PMTUD, "PMTUD link change for host: %u link: %u from %u to %u",
diff --git a/debian/patches/series b/debian/patches/series
index 7fbd139..d593aca 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1 +1,6 @@
-send-test-skip-the-SCTP-test-if-SCTP-is-not-supported-by-.patch
+0001-send-test-skip-the-SCTP-test-if-SCTP-is-not-supporte.patch
+0002-host-rename-variables-to-make-it-easier-to-read-the-.patch
+0003-host-fix-defrag-buffers-reclaim-logic.patch
+0004-rx-copy-data-into-the-defrag-buffer-only-if-we-know-.patch
+0005-test-add-ability-to-knet_bench-to-specify-a-fixed-pa.patch
+0006-PMTUd-invalidate-MTU-for-a-link-if-the-value-is-lowe.patch
-- 
2.20.1





More information about the pve-devel mailing list