[pve-devel] [PATCH kernel] add scheduler fix for ceph on numa hosts

Thu Jul 21 11:56:27 CEST 2016

applied

On Thu, Jul 14, 2016 at 04:00:19PM +0200, Fabian Grünbichler wrote:
> see https://forum.proxmox.com/threads/ceph-kernel-4-4-8-bug.28196/
> and https://lkml.org/lkml/2016/3/17/570 for background
> ---
>  Makefile                 |   1 +
>  ceph-scheduler-fix.patch | 137 +++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 138 insertions(+)
>  create mode 100644 ceph-scheduler-fix.patch
> 
> diff --git a/Makefile b/Makefile
> index c2f9ae8..a29e062 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -258,6 +258,7 @@ ${KERNEL_SRC}/README ${KERNEL_CFG_ORG}: ${KERNELSRCTAR}
>  	cd ${KERNEL_SRC}; patch -p1 < ../CVE-2016-4794-1-percpu-fix-synchronization-between-chunk-map_extend_.patch
>  	cd ${KERNEL_SRC}; patch -p1 < ../CVE-2016-4794-2-percpu-fix-synchronization-between-synchronous-map-e.patch
>  	cd ${KERNEL_SRC}; patch -p1 < ../CVE-2016-4470-KEYS-potential-uninitialized-variable.patch
> +	cd ${KERNEL_SRC}; patch -p1 < ../ceph-scheduler-fix.patch
>  	sed -i ${KERNEL_SRC}/Makefile -e 's/^EXTRAVERSION.*$$/EXTRAVERSION=${EXTRAVERSION}/'
>  	touch $@
>  
> diff --git a/ceph-scheduler-fix.patch b/ceph-scheduler-fix.patch
> new file mode 100644
> index 0000000..2466f82
> --- /dev/null
> +++ b/ceph-scheduler-fix.patch
> @@ -0,0 +1,137 @@
> +commit 8974189222159154c55f24ddad33e3613960521a
> +Author: Peter Zijlstra <peterz at infradead.org>
> +Date:   Thu Jun 16 10:50:40 2016 +0200
> +
> +    sched/fair: Fix cfs_rq avg tracking underflow
> +    
> +    As per commit:
> +    
> +      b7fa30c9cc48 ("sched/fair: Fix post_init_entity_util_avg() serialization")
> +    
> +    > the code generated from update_cfs_rq_load_avg():
> +    >
> +    > 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
> +    > 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> +    > 		sa->load_avg = max_t(long, sa->load_avg - r, 0);
> +    > 		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
> +    > 		removed_load = 1;
> +    > 	}
> +    >
> +    > turns into:
> +    >
> +    > ffffffff81087064:       49 8b 85 98 00 00 00    mov    0x98(%r13),%rax
> +    > ffffffff8108706b:       48 85 c0                test   %rax,%rax
> +    > ffffffff8108706e:       74 40                   je     ffffffff810870b0 <update_blocked_averages+0xc0>
> +    > ffffffff81087070:       4c 89 f8                mov    %r15,%rax
> +    > ffffffff81087073:       49 87 85 98 00 00 00    xchg   %rax,0x98(%r13)
> +    > ffffffff8108707a:       49 29 45 70             sub    %rax,0x70(%r13)
> +    > ffffffff8108707e:       4c 89 f9                mov    %r15,%rcx
> +    > ffffffff81087081:       bb 01 00 00 00          mov    $0x1,%ebx
> +    > ffffffff81087086:       49 83 7d 70 00          cmpq   $0x0,0x70(%r13)
> +    > ffffffff8108708b:       49 0f 49 4d 70          cmovns 0x70(%r13),%rcx
> +    >
> +    > Which you'll note ends up with sa->load_avg -= r in memory at
> +    > ffffffff8108707a.
> +    
> +    So I _should_ have looked at other unserialized users of ->load_avg,
> +    but alas. Luckily nikbor reported a similar /0 from task_h_load() which
> +    instantly triggered recollection of this here problem.
> +    
> +    Aside from the intermediate value hitting memory and causing problems,
> +    there's another problem: the underflow detection relies on the signed
> +    bit. This reduces the effective width of the variables, IOW it's
> +    effectively the same as having these variables be of signed type.
> +    
> +    This patch changes to a different means of unsigned underflow
> +    detection to not rely on the signed bit. This allows the variables to
> +    use the 'full' unsigned range. And it does so with explicit LOAD -
> +    STORE to ensure any intermediate value will never be visible in
> +    memory, allowing these unserialized loads.
> +    
> +    Note: GCC generates crap code for this, might warrant a look later.
> +    
> +    Note2: I say 'full' above, if we end up at U*_MAX we'll still explode;
> +           maybe we should do clamping on add too.
> +    
> +    Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org>
> +    Cc: Andrey Ryabinin <aryabinin at virtuozzo.com>
> +    Cc: Chris Wilson <chris at chris-wilson.co.uk>
> +    Cc: Linus Torvalds <torvalds at linux-foundation.org>
> +    Cc: Mike Galbraith <efault at gmx.de>
> +    Cc: Peter Zijlstra <peterz at infradead.org>
> +    Cc: Thomas Gleixner <tglx at linutronix.de>
> +    Cc: Yuyang Du <yuyang.du at intel.com>
> +    Cc: bsegall at google.com
> +    Cc: kernel at kyup.com
> +    Cc: morten.rasmussen at arm.com
> +    Cc: pjt at google.com
> +    Cc: steve.muckle at linaro.org
> +    Fixes: 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking")
> +    Link: http://lkml.kernel.org/r/20160617091948.GJ30927@twins.programming.kicks-ass.net
> +    Signed-off-by: Ingo Molnar <mingo at kernel.org>
> +
> +---
> + kernel/sched/fair.c |   33 +++++++++++++++++++++++++--------
> + 1 file changed, 25 insertions(+), 8 deletions(-)
> +
> +--- a/kernel/sched/fair.c
> ++++ b/kernel/sched/fair.c
> +@@ -2682,6 +2682,23 @@ static inline void update_tg_load_avg(st
> + 
> + static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
> + 
> ++/*
> ++ * Unsigned subtract and clamp on underflow.
> ++ *
> ++ * Explicitly do a load-store to ensure the intermediate value never hits
> ++ * memory. This allows lockless observations without ever seeing the negative
> ++ * values.
> ++ */
> ++#define sub_positive(_ptr, _val) do {				\
> ++	typeof(_ptr) ptr = (_ptr);				\
> ++	typeof(*ptr) val = (_val);				\
> ++	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
> ++	res = var - val;					\
> ++	if (res > var)						\
> ++		res = 0;					\
> ++	WRITE_ONCE(*ptr, res);					\
> ++} while (0)
> ++
> + /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
> + static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> + {
> +@@ -2690,15 +2707,15 @@ static inline int update_cfs_rq_load_avg
> + 
> + 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
> + 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> +-		sa->load_avg = max_t(long, sa->load_avg - r, 0);
> +-		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
> ++		sub_positive(&sa->load_avg, r);
> ++		sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
> + 		removed = 1;
> + 	}
> + 
> + 	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
> + 		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
> +-		sa->util_avg = max_t(long, sa->util_avg - r, 0);
> +-		sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
> ++		sub_positive(&sa->util_avg, r);
> ++		sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
> + 	}
> + 
> + 	decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
> +@@ -2764,10 +2781,10 @@ static void detach_entity_load_avg(struc
> + 			  &se->avg, se->on_rq * scale_load_down(se->load.weight),
> + 			  cfs_rq->curr == se, NULL);
> + 
> +-	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
> +-	cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0);
> +-	cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
> +-	cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0);
> ++	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
> ++	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
> ++	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
> ++	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
> + }
> + 
> + /* Add the load generated by se into cfs_rq's load average */
> -- 
> 2.1.4
> 
> 
> _______________________________________________
> pve-devel mailing list
> pve-devel at pve.proxmox.com
> http://pve.proxmox.com/cgi-bin/mailman/listinfo/pve-devel
>