[pve-devel] applied: [PATCH] backport: block: fix silent corruption in Linux kernel 4.15

Thomas Lamprecht t.lamprecht at proxmox.com
Wed Oct 10 14:46:24 CEST 2018


reproducer: https://www.spinics.net/lists/linux-block/msg28507.html
ubuntu bugreport: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1796542

Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
 ...-add-a-lower-level-bio_add_page-interface.patch | 178 +++++++++++++++++++++
 ...iov_iter_get_pages-fix-size-of-last-iovec.patch |  77 +++++++++
 ...kdev_direct_IO_simple-fix-leak-in-error-c.patch |  50 ++++++
 ...ov_iter_get_pages-pin-more-pages-for-mult.patch |  98 ++++++++++++
 4 files changed, 403 insertions(+)
 create mode 100644 patches/kernel/0009-block-add-a-lower-level-bio_add_page-interface.patch
 create mode 100644 patches/kernel/0010-block-bio_iov_iter_get_pages-fix-size-of-last-iovec.patch
 create mode 100644 patches/kernel/0011-blkdev-__blkdev_direct_IO_simple-fix-leak-in-error-c.patch
 create mode 100644 patches/kernel/0012-block-bio_iov_iter_get_pages-pin-more-pages-for-mult.patch

diff --git a/patches/kernel/0009-block-add-a-lower-level-bio_add_page-interface.patch b/patches/kernel/0009-block-add-a-lower-level-bio_add_page-interface.patch
new file mode 100644
index 0000000..0183a57
--- /dev/null
+++ b/patches/kernel/0009-block-add-a-lower-level-bio_add_page-interface.patch
@@ -0,0 +1,178 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch at lst.de>
+Date: Tue, 9 Oct 2018 17:04:39 +0100
+Subject: [PATCH] block: add a lower-level bio_add_page interface
+
+Buglink: https://bugs.launchpad.net/bugs/1796542
+
+For the upcoming removal of buffer heads in XFS we need to keep track of
+the number of outstanding writeback requests per page.  For this we need
+to know if bio_add_page merged a region with the previous bvec or not.
+Instead of adding additional arguments this refactors bio_add_page to
+be implemented using three lower level helpers which users like XFS can
+use directly if they care about the merge decisions.
+
+Signed-off-by: Christoph Hellwig <hch at lst.de>
+Reviewed-by: Jens Axboe <axboe at kernel.dk>
+Reviewed-by: Ming Lei <ming.lei at redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong at oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong at oracle.com>
+(cherry picked from commit 0aa69fd32a5f766e997ca8ab4723c5a1146efa8b)
+Signed-off-by: Colin Ian King <colin.king at canonical.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
+---
+ block/bio.c         | 98 ++++++++++++++++++++++++++++++++++-------------------
+ include/linux/bio.h |  9 +++++
+ 2 files changed, 73 insertions(+), 34 deletions(-)
+
+diff --git a/block/bio.c b/block/bio.c
+index 4b48f8eefc4c..2636d15af979 100644
+--- a/block/bio.c
++++ b/block/bio.c
+@@ -773,7 +773,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
+ 			return 0;
+ 	}
+ 
+-	if (bio->bi_vcnt >= bio->bi_max_vecs)
++	if (bio_full(bio))
+ 		return 0;
+ 
+ 	/*
+@@ -821,6 +821,65 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
+ EXPORT_SYMBOL(bio_add_pc_page);
+ 
+ /**
++ * __bio_try_merge_page - try appending data to an existing bvec.
++ * @bio: destination bio
++ * @page: page to add
++ * @len: length of the data to add
++ * @off: offset of the data in @page
++ *
++ * Try to add the data at @page + @off to the last bvec of @bio.  This is a
++ * useful optimisation for file systems with a block size smaller than the
++ * page size.
++ *
++ * Return %true on success or %false on failure.
++ */
++bool __bio_try_merge_page(struct bio *bio, struct page *page,
++		unsigned int len, unsigned int off)
++{
++	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
++		return false;
++
++	if (bio->bi_vcnt > 0) {
++		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
++
++		if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) {
++			bv->bv_len += len;
++			bio->bi_iter.bi_size += len;
++			return true;
++		}
++	}
++	return false;
++}
++EXPORT_SYMBOL_GPL(__bio_try_merge_page);
++
++/**
++ * __bio_add_page - add page to a bio in a new segment
++ * @bio: destination bio
++ * @page: page to add
++ * @len: length of the data to add
++ * @off: offset of the data in @page
++ *
++ * Add the data at @page + @off to @bio as a new bvec.  The caller must ensure
++ * that @bio has space for another bvec.
++ */
++void __bio_add_page(struct bio *bio, struct page *page,
++		unsigned int len, unsigned int off)
++{
++	struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
++
++	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
++	WARN_ON_ONCE(bio_full(bio));
++
++	bv->bv_page = page;
++	bv->bv_offset = off;
++	bv->bv_len = len;
++
++	bio->bi_iter.bi_size += len;
++	bio->bi_vcnt++;
++}
++EXPORT_SYMBOL_GPL(__bio_add_page);
++
++/**
+  *	bio_add_page	-	attempt to add page to bio
+  *	@bio: destination bio
+  *	@page: page to add
+@@ -833,40 +892,11 @@ EXPORT_SYMBOL(bio_add_pc_page);
+ int bio_add_page(struct bio *bio, struct page *page,
+ 		 unsigned int len, unsigned int offset)
+ {
+-	struct bio_vec *bv;
+-
+-	/*
+-	 * cloned bio must not modify vec list
+-	 */
+-	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+-		return 0;
+-
+-	/*
+-	 * For filesystems with a blocksize smaller than the pagesize
+-	 * we will often be called with the same page as last time and
+-	 * a consecutive offset.  Optimize this special case.
+-	 */
+-	if (bio->bi_vcnt > 0) {
+-		bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+-
+-		if (page == bv->bv_page &&
+-		    offset == bv->bv_offset + bv->bv_len) {
+-			bv->bv_len += len;
+-			goto done;
+-		}
++	if (!__bio_try_merge_page(bio, page, len, offset)) {
++		if (bio_full(bio))
++			return 0;
++		__bio_add_page(bio, page, len, offset);
+ 	}
+-
+-	if (bio->bi_vcnt >= bio->bi_max_vecs)
+-		return 0;
+-
+-	bv		= &bio->bi_io_vec[bio->bi_vcnt];
+-	bv->bv_page	= page;
+-	bv->bv_len	= len;
+-	bv->bv_offset	= offset;
+-
+-	bio->bi_vcnt++;
+-done:
+-	bio->bi_iter.bi_size += len;
+ 	return len;
+ }
+ EXPORT_SYMBOL(bio_add_page);
+diff --git a/include/linux/bio.h b/include/linux/bio.h
+index a98c6ac575cf..3440870712d4 100644
+--- a/include/linux/bio.h
++++ b/include/linux/bio.h
+@@ -123,6 +123,11 @@ static inline void *bio_data(struct bio *bio)
+ 	return NULL;
+ }
+ 
++static inline bool bio_full(struct bio *bio)
++{
++	return bio->bi_vcnt >= bio->bi_max_vecs;
++}
++
+ /*
+  * will die
+  */
+@@ -447,6 +452,10 @@ void bio_chain(struct bio *, struct bio *);
+ extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
+ extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
+ 			   unsigned int, unsigned int);
++bool __bio_try_merge_page(struct bio *bio, struct page *page,
++		unsigned int len, unsigned int off);
++void __bio_add_page(struct bio *bio, struct page *page,
++		unsigned int len, unsigned int off);
+ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
+ struct rq_map_data;
+ extern struct bio *bio_map_user_iov(struct request_queue *,
diff --git a/patches/kernel/0010-block-bio_iov_iter_get_pages-fix-size-of-last-iovec.patch b/patches/kernel/0010-block-bio_iov_iter_get_pages-fix-size-of-last-iovec.patch
new file mode 100644
index 0000000..b17e7a8
--- /dev/null
+++ b/patches/kernel/0010-block-bio_iov_iter_get_pages-fix-size-of-last-iovec.patch
@@ -0,0 +1,77 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Martin Wilck <mwilck at suse.com>
+Date: Tue, 9 Oct 2018 17:04:40 +0100
+Subject: [PATCH] block: bio_iov_iter_get_pages: fix size of last iovec
+
+Buglink: https://bugs.launchpad.net/bugs/1796542
+
+If the last page of the bio is not "full", the length of the last
+vector slot needs to be corrected. This slot has the index
+(bio->bi_vcnt - 1), but only in bio->bi_io_vec. In the "bv" helper
+array, which is shifted by the value of bio->bi_vcnt at function
+invocation, the correct index is (nr_pages - 1).
+
+v2: improved readability following suggestions from Ming Lei.
+v3: followed a formatting suggestion from Christoph Hellwig.
+
+Fixes: 2cefe4dbaadf ("block: add bio_iov_iter_get_pages()")
+Reviewed-by: Hannes Reinecke <hare at suse.com>
+Reviewed-by: Ming Lei <ming.lei at redhat.com>
+Reviewed-by: Jan Kara <jack at suse.cz>
+Reviewed-by: Christoph Hellwig <hch at lst.de>
+Signed-off-by: Martin Wilck <mwilck at suse.com>
+Signed-off-by: Jens Axboe <axboe at kernel.dk>
+(cherry picked from commit b403ea2404889e1227812fa9657667a1deb9c694)
+Signed-off-by: Colin Ian King <colin.king at canonical.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
+---
+ block/bio.c | 18 ++++++++----------
+ 1 file changed, 8 insertions(+), 10 deletions(-)
+
+diff --git a/block/bio.c b/block/bio.c
+index 2636d15af979..d76372a6a5fe 100644
+--- a/block/bio.c
++++ b/block/bio.c
+@@ -911,16 +911,16 @@ EXPORT_SYMBOL(bio_add_page);
+  */
+ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
+ {
+-	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
++	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt, idx;
+ 	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
+ 	struct page **pages = (struct page **)bv;
+-	size_t offset, diff;
++	size_t offset;
+ 	ssize_t size;
+ 
+ 	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+ 	if (unlikely(size <= 0))
+ 		return size ? size : -EFAULT;
+-	nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE;
++	idx = nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE;
+ 
+ 	/*
+ 	 * Deep magic below:  We need to walk the pinned pages backwards
+@@ -933,17 +933,15 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
+ 	bio->bi_iter.bi_size += size;
+ 	bio->bi_vcnt += nr_pages;
+ 
+-	diff = (nr_pages * PAGE_SIZE - offset) - size;
+-	while (nr_pages--) {
+-		bv[nr_pages].bv_page = pages[nr_pages];
+-		bv[nr_pages].bv_len = PAGE_SIZE;
+-		bv[nr_pages].bv_offset = 0;
++	while (idx--) {
++		bv[idx].bv_page = pages[idx];
++		bv[idx].bv_len = PAGE_SIZE;
++		bv[idx].bv_offset = 0;
+ 	}
+ 
+ 	bv[0].bv_offset += offset;
+ 	bv[0].bv_len -= offset;
+-	if (diff)
+-		bv[bio->bi_vcnt - 1].bv_len -= diff;
++	bv[nr_pages - 1].bv_len -= nr_pages * PAGE_SIZE - offset - size;
+ 
+ 	iov_iter_advance(iter, size);
+ 	return 0;
diff --git a/patches/kernel/0011-blkdev-__blkdev_direct_IO_simple-fix-leak-in-error-c.patch b/patches/kernel/0011-blkdev-__blkdev_direct_IO_simple-fix-leak-in-error-c.patch
new file mode 100644
index 0000000..797b6ca
--- /dev/null
+++ b/patches/kernel/0011-blkdev-__blkdev_direct_IO_simple-fix-leak-in-error-c.patch
@@ -0,0 +1,50 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Martin Wilck <mwilck at suse.com>
+Date: Tue, 9 Oct 2018 17:04:41 +0100
+Subject: [PATCH] blkdev: __blkdev_direct_IO_simple: fix leak in error case
+
+Buglink: https://bugs.launchpad.net/bugs/1796542
+
+Fixes: 72ecad22d9f1 ("block: support a full bio worth of IO for simplified bdev direct-io")
+Reviewed-by: Ming Lei <ming.lei at redhat.com>
+Reviewed-by: Hannes Reinecke <hare at suse.com>
+Reviewed-by: Christoph Hellwig <hch at lst.de>
+Signed-off-by: Martin Wilck <mwilck at suse.com>
+Signed-off-by: Jens Axboe <axboe at kernel.dk>
+(cherry picked from commit 9362dd1109f87a9d0a798fbc890cb339c171ed35)
+Signed-off-by: Colin Ian King <colin.king at canonical.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
+---
+ fs/block_dev.c | 9 +++++----
+ 1 file changed, 5 insertions(+), 4 deletions(-)
+
+diff --git a/fs/block_dev.c b/fs/block_dev.c
+index 82c823ef06a6..74b4ae9b7ba0 100644
+--- a/fs/block_dev.c
++++ b/fs/block_dev.c
+@@ -219,7 +219,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
+ 
+ 	ret = bio_iov_iter_get_pages(&bio, iter);
+ 	if (unlikely(ret))
+-		return ret;
++		goto out;
+ 	ret = bio.bi_iter.bi_size;
+ 
+ 	if (iov_iter_rw(iter) == READ) {
+@@ -248,12 +248,13 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
+ 		put_page(bvec->bv_page);
+ 	}
+ 
+-	if (vecs != inline_vecs)
+-		kfree(vecs);
+-
+ 	if (unlikely(bio.bi_status))
+ 		ret = blk_status_to_errno(bio.bi_status);
+ 
++out:
++	if (vecs != inline_vecs)
++		kfree(vecs);
++
+ 	bio_uninit(&bio);
+ 
+ 	return ret;
diff --git a/patches/kernel/0012-block-bio_iov_iter_get_pages-pin-more-pages-for-mult.patch b/patches/kernel/0012-block-bio_iov_iter_get_pages-pin-more-pages-for-mult.patch
new file mode 100644
index 0000000..644b564
--- /dev/null
+++ b/patches/kernel/0012-block-bio_iov_iter_get_pages-pin-more-pages-for-mult.patch
@@ -0,0 +1,98 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Martin Wilck <mwilck at suse.com>
+Date: Tue, 9 Oct 2018 17:04:42 +0100
+Subject: [PATCH] block: bio_iov_iter_get_pages: pin more pages for
+ multi-segment IOs
+
+Buglink: https://bugs.launchpad.net/bugs/1796542
+
+bio_iov_iter_get_pages() currently only adds pages for the next non-zero
+segment from the iov_iter to the bio. That's suboptimal for callers,
+which typically try to pin as many pages as fit into the bio. This patch
+converts the current bio_iov_iter_get_pages() into a static helper, and
+introduces a new helper that allocates as many pages as
+
+ 1) fit into the bio,
+ 2) are present in the iov_iter,
+ 3) and can be pinned by MM.
+
+Error is returned only if zero pages could be pinned. Because of 3), a
+zero return value doesn't necessarily mean all pages have been pinned.
+Callers that have to pin every page in the iov_iter must still call this
+function in a loop (this is currently the case).
+
+This change matters most for __blkdev_direct_IO_simple(), which calls
+bio_iov_iter_get_pages() only once. If it obtains fewer pages than
+requested, it returns a "short write" or "short read", and
+__generic_file_write_iter() falls back to buffered writes, which may
+lead to data corruption.
+
+Fixes: 72ecad22d9f1 ("block: support a full bio worth of IO for simplified bdev direct-io")
+Reviewed-by: Christoph Hellwig <hch at lst.de>
+Signed-off-by: Martin Wilck <mwilck at suse.com>
+Signed-off-by: Jens Axboe <axboe at kernel.dk>
+(cherry picked from commit 17d51b10d7773e4618bcac64648f30f12d4078fb)
+Signed-off-by: Colin Ian King <colin.king at canonical.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
+---
+ block/bio.c | 35 ++++++++++++++++++++++++++++++++---
+ 1 file changed, 32 insertions(+), 3 deletions(-)
+
+diff --git a/block/bio.c b/block/bio.c
+index d76372a6a5fe..415c65b9c590 100644
+--- a/block/bio.c
++++ b/block/bio.c
+@@ -902,14 +902,16 @@ int bio_add_page(struct bio *bio, struct page *page,
+ EXPORT_SYMBOL(bio_add_page);
+ 
+ /**
+- * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
++ * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
+  * @bio: bio to add pages to
+  * @iter: iov iterator describing the region to be mapped
+  *
+- * Pins as many pages from *iter and appends them to @bio's bvec array. The
++ * Pins pages from *iter and appends them to @bio's bvec array. The
+  * pages will have to be released using put_page() when done.
++ * For multi-segment *iter, this function only adds pages from the
++ * next non-empty segment of the iov iterator.
+  */
+-int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
++static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
+ {
+ 	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt, idx;
+ 	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
+@@ -946,6 +948,33 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
+ 	iov_iter_advance(iter, size);
+ 	return 0;
+ }
++
++/**
++ * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
++ * @bio: bio to add pages to
++ * @iter: iov iterator describing the region to be mapped
++ *
++ * Pins pages from *iter and appends them to @bio's bvec array. The
++ * pages will have to be released using put_page() when done.
++ * The function tries, but does not guarantee, to pin as many pages as
++ * fit into the bio, or are requested in *iter, whichever is smaller.
++ * If MM encounters an error pinning the requested pages, it stops.
++ * Error is returned only if 0 pages could be pinned.
++ */
++int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
++{
++	unsigned short orig_vcnt = bio->bi_vcnt;
++
++	do {
++		int ret = __bio_iov_iter_get_pages(bio, iter);
++
++		if (unlikely(ret))
++			return bio->bi_vcnt > orig_vcnt ? 0 : ret;
++
++	} while (iov_iter_count(iter) && !bio_full(bio));
++
++	return 0;
++}
+ EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
+ 
+ static void submit_bio_wait_endio(struct bio *bio)
-- 
2.11.0





More information about the pve-devel mailing list