[pve-devel] applied: [PATCH kronosnet] cherry-pick netload issue fixes
Thomas Lamprecht
t.lamprecht at proxmox.com
Tue Oct 15 15:04:09 CEST 2019
cherry-pick upstream kronosnet pull request:
https://github.com/kronosnet/kronosnet/pull/264
Includes fixes for MTU detection and defragmentation, normally
happening on high network load
Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
...e-SCTP-test-if-SCTP-is-not-supporte.patch} | 12 +-
...ables-to-make-it-easier-to-read-the-.patch | 67 +++++++++++
...ost-fix-defrag-buffers-reclaim-logic.patch | 96 ++++++++++++++++
...o-the-defrag-buffer-only-if-we-know-.patch | 30 +++++
...-to-knet_bench-to-specify-a-fixed-pa.patch | 104 ++++++++++++++++++
...-MTU-for-a-link-if-the-value-is-lowe.patch | 38 +++++++
debian/patches/series | 7 +-
7 files changed, 349 insertions(+), 5 deletions(-)
rename debian/patches/{send-test-skip-the-SCTP-test-if-SCTP-is-not-supported-by-.patch => 0001-send-test-skip-the-SCTP-test-if-SCTP-is-not-supporte.patch} (68%)
create mode 100644 debian/patches/0002-host-rename-variables-to-make-it-easier-to-read-the-.patch
create mode 100644 debian/patches/0003-host-fix-defrag-buffers-reclaim-logic.patch
create mode 100644 debian/patches/0004-rx-copy-data-into-the-defrag-buffer-only-if-we-know-.patch
create mode 100644 debian/patches/0005-test-add-ability-to-knet_bench-to-specify-a-fixed-pa.patch
create mode 100644 debian/patches/0006-PMTUd-invalidate-MTU-for-a-link-if-the-value-is-lowe.patch
diff --git a/debian/patches/send-test-skip-the-SCTP-test-if-SCTP-is-not-supported-by-.patch b/debian/patches/0001-send-test-skip-the-SCTP-test-if-SCTP-is-not-supporte.patch
similarity index 68%
rename from debian/patches/send-test-skip-the-SCTP-test-if-SCTP-is-not-supported-by-.patch
rename to debian/patches/0001-send-test-skip-the-SCTP-test-if-SCTP-is-not-supporte.patch
index fb13ba3..003550f 100644
--- a/debian/patches/send-test-skip-the-SCTP-test-if-SCTP-is-not-supported-by-.patch
+++ b/debian/patches/0001-send-test-skip-the-SCTP-test-if-SCTP-is-not-supporte.patch
@@ -1,17 +1,21 @@
-From: =?utf-8?q?Ferenc_W=C3=A1gner?= <wferi at debian.org>
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Ferenc=20W=C3=A1gner?= <wferi at debian.org>
Date: Wed, 3 Apr 2019 10:26:11 +0200
-Subject: send test: skip the SCTP test if SCTP is not supported by the kernel
+Subject: [PATCH] send test: skip the SCTP test if SCTP is not supported by the
+ kernel
For example, module loading is disabled on Debian build daemons.
+
+Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
libknet/tests/api_knet_send.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/libknet/tests/api_knet_send.c b/libknet/tests/api_knet_send.c
-index f241201..1c55db1 100644
+index 50469ee..c2166ef 100644
--- a/libknet/tests/api_knet_send.c
+++ b/libknet/tests/api_knet_send.c
-@@ -173,12 +173,13 @@ static void test(uint8_t transport)
+@@ -181,12 +181,13 @@ static void test(uint8_t transport)
}
if (knet_link_set_config(knet_h, 1, 0, transport, &lo, &lo, 0) < 0) {
diff --git a/debian/patches/0002-host-rename-variables-to-make-it-easier-to-read-the-.patch b/debian/patches/0002-host-rename-variables-to-make-it-easier-to-read-the-.patch
new file mode 100644
index 0000000..8e7060b
--- /dev/null
+++ b/debian/patches/0002-host-rename-variables-to-make-it-easier-to-read-the-.patch
@@ -0,0 +1,67 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: "Fabio M. Di Nitto" <fdinitto at redhat.com>
+Date: Tue, 15 Oct 2019 06:46:36 +0200
+Subject: [PATCH] [host] rename variables to make it easier to read the code
+
+Signed-off-by: Fabio M. Di Nitto <fdinitto at redhat.com>
+(cherry picked from commit a081ff7fab6bb98f9ff2a88b1593776b0c27b6f1)
+Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
+---
+ libknet/host.c | 24 ++++++++++++------------
+ 1 file changed, 12 insertions(+), 12 deletions(-)
+
+diff --git a/libknet/host.c b/libknet/host.c
+index abb1f89..ac26b89 100644
+--- a/libknet/host.c
++++ b/libknet/host.c
+@@ -569,7 +569,7 @@ static void _clear_cbuffers(struct knet_host *host, seq_num_t rx_seq_num)
+
+ int _seq_num_lookup(struct knet_host *host, seq_num_t seq_num, int defrag_buf, int clear_buf)
+ {
+- size_t i, j; /* circular buffer indexes */
++ size_t head, tail; /* circular buffer indexes */
+ seq_num_t seq_dist;
+ char *dst_cbuf = host->circular_buffer;
+ char *dst_cbuf_defrag = host->circular_buffer_defrag;
+@@ -585,13 +585,13 @@ int _seq_num_lookup(struct knet_host *host, seq_num_t seq_num, int defrag_buf, i
+ seq_dist = *dst_seq_num - seq_num;
+ }
+
+- j = seq_num % KNET_CBUFFER_SIZE;
++ head = seq_num % KNET_CBUFFER_SIZE;
+
+ if (seq_dist < KNET_CBUFFER_SIZE) { /* seq num is in ring buffer */
+ if (!defrag_buf) {
+- return (dst_cbuf[j] == 0) ? 1 : 0;
++ return (dst_cbuf[head] == 0) ? 1 : 0;
+ } else {
+- return (dst_cbuf_defrag[j] == 0) ? 1 : 0;
++ return (dst_cbuf_defrag[head] == 0) ? 1 : 0;
+ }
+ } else if (seq_dist <= SEQ_MAX - KNET_CBUFFER_SIZE) {
+ memset(dst_cbuf, 0, KNET_CBUFFER_SIZE);
+@@ -600,16 +600,16 @@ int _seq_num_lookup(struct knet_host *host, seq_num_t seq_num, int defrag_buf, i
+ }
+
+ /* cleaning up circular buffer */
+- i = (*dst_seq_num + 1) % KNET_CBUFFER_SIZE;
++ tail = (*dst_seq_num + 1) % KNET_CBUFFER_SIZE;
+
+- if (i > j) {
+- memset(dst_cbuf + i, 0, KNET_CBUFFER_SIZE - i);
+- memset(dst_cbuf, 0, j + 1);
+- memset(dst_cbuf_defrag + i, 0, KNET_CBUFFER_SIZE - i);
+- memset(dst_cbuf_defrag, 0, j + 1);
++ if (tail > head) {
++ memset(dst_cbuf + tail, 0, KNET_CBUFFER_SIZE - tail);
++ memset(dst_cbuf, 0, head + 1);
++ memset(dst_cbuf_defrag + tail, 0, KNET_CBUFFER_SIZE - tail);
++ memset(dst_cbuf_defrag, 0, head + 1);
+ } else {
+- memset(dst_cbuf + i, 0, j - i + 1);
+- memset(dst_cbuf_defrag + i, 0, j - i + 1);
++ memset(dst_cbuf + tail, 0, head - tail + 1);
++ memset(dst_cbuf_defrag + tail, 0, head - tail + 1);
+ }
+
+ *dst_seq_num = seq_num;
diff --git a/debian/patches/0003-host-fix-defrag-buffers-reclaim-logic.patch b/debian/patches/0003-host-fix-defrag-buffers-reclaim-logic.patch
new file mode 100644
index 0000000..ca82c48
--- /dev/null
+++ b/debian/patches/0003-host-fix-defrag-buffers-reclaim-logic.patch
@@ -0,0 +1,96 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: "Fabio M. Di Nitto" <fdinitto at redhat.com>
+Date: Tue, 15 Oct 2019 06:53:24 +0200
+Subject: [PATCH] [host] fix defrag buffers reclaim logic
+
+The problem:
+
+- let's assume a 2 nodes (A and B) cluster setup
+- node A sends fragmented packets to node B and there is
+ packet loss on the network.
+- node B receives all those fragments and attempts to
+ reassemble them.
+- node A sends packet seq_num X in Y fragments.
+- node B receives only part of the fragments and stores
+ them in a defrag buf.
+- packet loss stops.
+- node A continues to send packets and a seq_num
+ roll-over takes place.
+- node A sends a new packet seq_num X in Y fragments.
+- node B gets confused here because the parts of the old
+ packet seq_num X are still stored and the buffer
+ has not been reclaimed.
+- node B continues to rebuild packet seq_num X with
+ old stale data and new data from after the roll-over.
+- node B completes reassembling the packet and delivers
+ junk to the application.
+
+The solution:
+
+Add a much stronger buffer reclaim logic that will apply
+on each received packet and not only when defrag buffers
+are needed, as there might be a mix of fragmented and not
+fragmented packets in-flight.
+
+The new logic creates a window of N packets that can be
+handled at the same time (based on the number of buffers)
+and clear everything else.
+
+Fixes https://github.com/kronosnet/kronosnet/issues/261
+
+Signed-off-by: Fabio M. Di Nitto <fdinitto at redhat.com>
+(cherry picked from commit 3fb0166ebd14e37a2fb9fe7aeae53d09e0e66b74)
+Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
+---
+ libknet/host.c | 31 +++++++++++++++++++++++++++++++
+ 1 file changed, 31 insertions(+)
+
+diff --git a/libknet/host.c b/libknet/host.c
+index ac26b89..85d4626 100644
+--- a/libknet/host.c
++++ b/libknet/host.c
+@@ -562,6 +562,35 @@ static void _clear_cbuffers(struct knet_host *host, seq_num_t rx_seq_num)
+ }
+ }
+
++static void _reclaim_old_defrag_bufs(struct knet_host *host, seq_num_t seq_num)
++{
++ seq_num_t head, tail; /* seq_num boundaries */
++ int i;
++
++ head = seq_num + 1;
++ tail = seq_num - (KNET_MAX_LINK + 1);
++
++ /*
++ * expire old defrag buffers
++ */
++ for (i = 0; i < KNET_MAX_LINK; i++) {
++ if (host->defrag_buf[i].in_use) {
++ /*
++ * head has done a rollover to 0+
++ */
++ if (tail > head) {
++ if ((host->defrag_buf[i].pckt_seq >= head) && (host->defrag_buf[i].pckt_seq <= tail)) {
++ host->defrag_buf[i].in_use = 0;
++ }
++ } else {
++ if ((host->defrag_buf[i].pckt_seq >= head) || (host->defrag_buf[i].pckt_seq <= tail)){
++ host->defrag_buf[i].in_use = 0;
++ }
++ }
++ }
++ }
++}
++
+ /*
+ * check if a given packet seq num is in the circular buffers
+ * defrag_buf = 0 -> use normal cbuf 1 -> use the defrag buffer lookup
+@@ -579,6 +608,8 @@ int _seq_num_lookup(struct knet_host *host, seq_num_t seq_num, int defrag_buf, i
+ _clear_cbuffers(host, seq_num);
+ }
+
++ _reclaim_old_defrag_bufs(host, seq_num);
++
+ if (seq_num < *dst_seq_num) {
+ seq_dist = (SEQ_MAX - seq_num) + *dst_seq_num;
+ } else {
diff --git a/debian/patches/0004-rx-copy-data-into-the-defrag-buffer-only-if-we-know-.patch b/debian/patches/0004-rx-copy-data-into-the-defrag-buffer-only-if-we-know-.patch
new file mode 100644
index 0000000..46d06a8
--- /dev/null
+++ b/debian/patches/0004-rx-copy-data-into-the-defrag-buffer-only-if-we-know-.patch
@@ -0,0 +1,30 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: "Fabio M. Di Nitto" <fdinitto at redhat.com>
+Date: Tue, 15 Oct 2019 07:02:05 +0200
+Subject: [PATCH] [rx] copy data into the defrag buffer only if we know the
+ size of the frame
+
+Signed-off-by: Fabio M. Di Nitto <fdinitto at redhat.com>
+(cherry picked from commit 8b2863b392d275ea50fa19e8e8bebe40a6134707)
+Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
+---
+ libknet/threads_rx.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/libknet/threads_rx.c b/libknet/threads_rx.c
+index e8fe264..ef0c2cc 100644
+--- a/libknet/threads_rx.c
++++ b/libknet/threads_rx.c
+@@ -186,8 +186,10 @@ static int pckt_defrag(knet_handle_t knet_h, struct knet_header *inbuf, ssize_t
+ defrag_buf->frag_size = *len;
+ }
+
+- memmove(defrag_buf->buf + ((inbuf->khp_data_frag_seq - 1) * defrag_buf->frag_size),
+- inbuf->khp_data_userdata, *len);
++ if (defrag_buf->frag_size) {
++ memmove(defrag_buf->buf + ((inbuf->khp_data_frag_seq - 1) * defrag_buf->frag_size),
++ inbuf->khp_data_userdata, *len);
++ }
+
+ defrag_buf->frag_recv++;
+ defrag_buf->frag_map[inbuf->khp_data_frag_seq] = 1;
diff --git a/debian/patches/0005-test-add-ability-to-knet_bench-to-specify-a-fixed-pa.patch b/debian/patches/0005-test-add-ability-to-knet_bench-to-specify-a-fixed-pa.patch
new file mode 100644
index 0000000..ed11a81
--- /dev/null
+++ b/debian/patches/0005-test-add-ability-to-knet_bench-to-specify-a-fixed-pa.patch
@@ -0,0 +1,104 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: "Fabio M. Di Nitto" <fdinitto at redhat.com>
+Date: Tue, 15 Oct 2019 07:16:22 +0200
+Subject: [PATCH] [test] add ability to knet_bench to specify a fixed packet
+ size for perf test
+
+Signed-off-by: Fabio M. Di Nitto <fdinitto at redhat.com>
+(cherry picked from commit d39c189900ef1a5647c7264e799f793ab9fd93e2)
+Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
+---
+ libknet/tests/knet_bench.c | 24 ++++++++++++++++++++----
+ 1 file changed, 20 insertions(+), 4 deletions(-)
+
+diff --git a/libknet/tests/knet_bench.c b/libknet/tests/knet_bench.c
+index dc04239..54b5303 100644
+--- a/libknet/tests/knet_bench.c
++++ b/libknet/tests/knet_bench.c
+@@ -67,6 +67,8 @@ static int test_type = TEST_PING;
+ static uint64_t perf_by_size_size = 1 * ONE_GIGABYTE;
+ static uint64_t perf_by_time_secs = 10;
+
++static uint32_t force_packet_size = 0;
++
+ struct node {
+ int nodeid;
+ int links;
+@@ -109,6 +111,7 @@ static void print_help(void)
+ printf(" -s nodeid that will generate traffic for benchmarks\n");
+ printf(" -S [size|seconds] when used in combination with -T perf-by-size it indicates how many GB of traffic to generate for the test. (default: 1GB)\n");
+ printf(" when used in combination with -T perf-by-time it indicates how many Seconds of traffic to generate for the test. (default: 10 seconds)\n");
++ printf(" -x force packet size for perf-by-time or perf-by-size\n");
+ printf(" -C repeat the test continously (default: off)\n");
+ printf(" -X[XX] show stats at the end of the run (default: 1)\n");
+ printf(" 1: show handle stats, 2: show summary link stats\n");
+@@ -250,7 +253,7 @@ static void setup_knet(int argc, char *argv[])
+
+ memset(nodes, 0, sizeof(nodes));
+
+- while ((rv = getopt(argc, argv, "aCT:S:s:ldfom:wb:t:n:c:p:X::P:z:h")) != EOF) {
++ while ((rv = getopt(argc, argv, "aCT:S:s:ldfom:wb:t:n:c:p:x:X::P:z:h")) != EOF) {
+ switch(rv) {
+ case 'h':
+ print_help();
+@@ -406,6 +409,13 @@ static void setup_knet(int argc, char *argv[])
+ perf_by_size_size = (uint64_t)atoi(optarg) * ONE_GIGABYTE;
+ perf_by_time_secs = (uint64_t)atoi(optarg);
+ break;
++ case 'x':
++ force_packet_size = (uint32_t)atoi(optarg);
++ if ((force_packet_size < 1) || (force_packet_size > 65536)) {
++ printf("Unsupported packet size %u (accepted 1 - 65536)\n", force_packet_size);
++ exit(FAIL);
++ }
++ break;
+ case 'C':
+ continous = 1;
+ break;
+@@ -874,7 +884,7 @@ static int setup_send_buffers_common(struct knet_mmsghdr *msg, struct iovec *iov
+ printf("TXT: Unable to malloc!\n");
+ return -1;
+ }
+- memset(tx_buf[i], 0, KNET_MAX_PACKET_SIZE);
++ memset(tx_buf[i], i, KNET_MAX_PACKET_SIZE);
+ iov_out[i].iov_base = (void *)tx_buf[i];
+ memset(&msg[i].msg_hdr, 0, sizeof(struct msghdr));
+ msg[i].msg_hdr.msg_iov = &iov_out[i];
+@@ -898,6 +908,9 @@ static void send_perf_data_by_size(void)
+ setup_send_buffers_common(msg, iov_out, tx_buf);
+
+ while (packetsize <= KNET_MAX_PACKET_SIZE) {
++ if (force_packet_size) {
++ packetsize = force_packet_size;
++ }
+ for (i = 0; i < PCKT_FRAG_MAX; i++) {
+ iov_out[i].iov_len = packetsize;
+ }
+@@ -926,7 +939,7 @@ static void send_perf_data_by_size(void)
+
+ knet_send(knet_h, ctrl_message, TEST_STOP, channel);
+
+- if (packetsize == KNET_MAX_PACKET_SIZE) {
++ if ((packetsize == KNET_MAX_PACKET_SIZE) || (force_packet_size)) {
+ break;
+ }
+
+@@ -1175,6 +1188,9 @@ static void send_perf_data_by_time(void)
+ memset(&clock_end, 0, sizeof(clock_start));
+
+ while (packetsize <= KNET_MAX_PACKET_SIZE) {
++ if (force_packet_size) {
++ packetsize = force_packet_size;
++ }
+ for (i = 0; i < PCKT_FRAG_MAX; i++) {
+ iov_out[i].iov_len = packetsize;
+ }
+@@ -1205,7 +1221,7 @@ static void send_perf_data_by_time(void)
+
+ knet_send(knet_h, ctrl_message, TEST_STOP, channel);
+
+- if (packetsize == KNET_MAX_PACKET_SIZE) {
++ if ((packetsize == KNET_MAX_PACKET_SIZE) || (force_packet_size)) {
+ break;
+ }
+
diff --git a/debian/patches/0006-PMTUd-invalidate-MTU-for-a-link-if-the-value-is-lowe.patch b/debian/patches/0006-PMTUd-invalidate-MTU-for-a-link-if-the-value-is-lowe.patch
new file mode 100644
index 0000000..20fda14
--- /dev/null
+++ b/debian/patches/0006-PMTUd-invalidate-MTU-for-a-link-if-the-value-is-lowe.patch
@@ -0,0 +1,38 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: "Fabio M. Di Nitto" <fdinitto at redhat.com>
+Date: Tue, 15 Oct 2019 11:53:56 +0200
+Subject: [PATCH] [PMTUd] invalidate MTU for a link if the value is lower than
+ minimum
+
+Under heavy network load and packet loss, calculated MTU can be
+too small. In that case we need to invalidate the link mtu,
+that would remove the link from the rotation (and traffic) and
+would give PMTUd time to get the right MTU in the next round.
+
+Signed-off-by: Fabio M. Di Nitto <fdinitto at redhat.com>
+(cherry picked from commit 34c08ae7e7903d79f06569b4f506a00c15af0238)
+Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
+---
+ libknet/threads_pmtud.c | 9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+diff --git a/libknet/threads_pmtud.c b/libknet/threads_pmtud.c
+index 75d5196..f4d15ae 100644
+--- a/libknet/threads_pmtud.c
++++ b/libknet/threads_pmtud.c
+@@ -482,7 +482,14 @@ static int _handle_check_pmtud(knet_handle_t knet_h, struct knet_host *dst_host,
+ }
+ dst_link->has_valid_mtu = 0;
+ } else {
+- dst_link->has_valid_mtu = 1;
++ if (dst_link->status.mtu < calc_min_mtu(knet_h)) {
++ log_info(knet_h, KNET_SUB_PMTUD,
++ "Invalid MTU detected for host: %u link: %u mtu: %u",
++ dst_host->host_id, dst_link->link_id, dst_link->status.mtu);
++ dst_link->has_valid_mtu = 0;
++ } else {
++ dst_link->has_valid_mtu = 1;
++ }
+ if (dst_link->has_valid_mtu) {
+ if ((saved_pmtud) && (saved_pmtud != dst_link->status.mtu)) {
+ log_info(knet_h, KNET_SUB_PMTUD, "PMTUD link change for host: %u link: %u from %u to %u",
diff --git a/debian/patches/series b/debian/patches/series
index 7fbd139..d593aca 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1 +1,6 @@
-send-test-skip-the-SCTP-test-if-SCTP-is-not-supported-by-.patch
+0001-send-test-skip-the-SCTP-test-if-SCTP-is-not-supporte.patch
+0002-host-rename-variables-to-make-it-easier-to-read-the-.patch
+0003-host-fix-defrag-buffers-reclaim-logic.patch
+0004-rx-copy-data-into-the-defrag-buffer-only-if-we-know-.patch
+0005-test-add-ability-to-knet_bench-to-specify-a-fixed-pa.patch
+0006-PMTUd-invalidate-MTU-for-a-link-if-the-value-is-lowe.patch
--
2.20.1
More information about the pve-devel
mailing list