author    Kent Overstreet <kent.overstreet@linux.dev>    2022-10-29 15:54:17 -0400
committer Kent Overstreet <kent.overstreet@linux.dev>    2022-10-31 21:36:07 -0400
commit    77c27f28aa58e9d9037eb68c87d3283f68c371f7 (patch)
tree      692daf92aebe70364f6711a6f3fc7971651d7a57
parent    1913e923fbf5fecb16dca4bd2ebe960b88f34a9c (diff)
bcachefs: BCH_WRITE_SYNC
This adds a new flag for the write path, BCH_WRITE_SYNC, and switches the
O_DIRECT write path to use it when we're not running asynchronously.

It runs the btree update after the write in the original thread's context
instead of a kworker, cutting context switches in half.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r--	fs/bcachefs/fs-io.c	  2
-rw-r--r--	fs/bcachefs/io.c	134
-rw-r--r--	fs/bcachefs/io.h	  4
3 files changed, 59 insertions, 81 deletions
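
The core of the change is in bch2_write_done(): the choice between finishing in the
submitting thread and bouncing through index_update_wq() is now made in one place,
keyed off BCH_WRITE_SYNC. Below is a minimal sketch of that decision, paraphrased from
the bch2_write_done() hunk further down; the _sketch name is hypothetical, everything
else is taken from the patch, and this is not a drop-in replacement.

	static void bch2_write_done_sketch(struct bch_write_op *op)
	{
		if (!(op->flags & BCH_WRITE_FLUSH) || op->error) {
			/* No journal flush needed (or the write already failed):
			 * complete immediately in whatever context we're in. */
			__bch2_write_done(&op->cl);
		} else if (!(op->flags & BCH_WRITE_SYNC)) {
			/* Async write: kick off the journal flush and complete the
			 * op from the index update workqueue once it finishes. */
			bch2_journal_flush_seq_async(&op->c->journal,
						     op->journal_seq, &op->cl);
			continue_at(&op->cl, __bch2_write_done, index_update_wq(op));
		} else {
			/* BCH_WRITE_SYNC: flush the journal and complete right here,
			 * in the submitting thread, skipping the kworker round trip. */
			bch2_journal_flush_seq(&op->c->journal, op->journal_seq);
			__bch2_write_done(&op->cl);
		}
	}
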
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index bc91b584e891..0925f11752d9 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -2118,6 +2118,8 @@ static long bch2_dio_write_loop(struct dio_write *dio)
dio->op.subvol = inode->ei_subvol;
dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
+ if (sync)
+ dio->op.flags |= BCH_WRITE_SYNC;
if ((req->ki_flags & IOCB_DSYNC) &&
!c->opts.journal_flush_disabled)
dio->op.flags |= BCH_WRITE_FLUSH;
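
For reference, a condensed caller-side sketch of the flag selection done in
bch2_dio_write_loop() above: a blocking O_DIRECT write requests synchronous
completion, and O_DSYNC additionally requests a journal flush. The helper name is
hypothetical; the conditions are the ones in the hunk.

	static void dio_write_flags_sketch(struct bch_write_op *op, struct kiocb *req,
					   struct bch_fs *c, bool sync)
	{
		if (sync)
			op->flags |= BCH_WRITE_SYNC;	/* btree update runs in this thread */

		if ((req->ki_flags & IOCB_DSYNC) && !c->opts.journal_flush_disabled)
			op->flags |= BCH_WRITE_FLUSH;	/* flush the journal before completing */
	}
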
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index f74f6e4de233..74bad67f6000 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -541,7 +541,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
static void __bch2_write(struct closure *);
-static void bch2_write_done(struct closure *cl)
+static void __bch2_write_done(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
@@ -557,7 +557,23 @@ static void bch2_write_done(struct closure *cl)
EBUG_ON(cl->parent);
closure_debug_destroy(cl);
- op->end_io(op);
+ if (op->end_io)
+ op->end_io(op);
+}
+
+static __always_inline void bch2_write_done(struct bch_write_op *op)
+{
+ if (likely(!(op->flags & BCH_WRITE_FLUSH) || op->error)) {
+ __bch2_write_done(&op->cl);
+ } else if (!(op->flags & BCH_WRITE_SYNC)) {
+ bch2_journal_flush_seq_async(&op->c->journal,
+ op->journal_seq,
+ &op->cl);
+ continue_at(&op->cl, __bch2_write_done, index_update_wq(op));
+ } else {
+ bch2_journal_flush_seq(&op->c->journal, op->journal_seq);
+ __bch2_write_done(&op->cl);
+ }
}
static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
@@ -644,26 +660,20 @@ out:
err:
keys->top = keys->keys;
op->error = ret;
+ op->flags |= BCH_WRITE_DONE;
goto out;
}
static void bch2_write_index(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bch_fs *c = op->c;
__bch2_write_index(op);
- if (!(op->flags & BCH_WRITE_DONE)) {
+ if (!(op->flags & BCH_WRITE_DONE))
continue_at(cl, __bch2_write, index_update_wq(op));
- } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
- bch2_journal_flush_seq_async(&c->journal,
- op->journal_seq,
- cl);
- continue_at(cl, bch2_write_done, index_update_wq(op));
- } else {
- continue_at_nobarrier(cl, bch2_write_done, NULL);
- }
+ else
+ bch2_write_done(op);
}
static void bch2_write_endio(struct bio *bio)
@@ -695,12 +705,12 @@ static void bch2_write_endio(struct bio *bio)
if (wbio->put_bio)
bio_put(bio);
- if (parent)
+ if (parent) {
bio_endio(&parent->bio);
- else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT))
- closure_put(cl);
- else
- continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op));
+ return;
+ }
+
+ closure_put(cl);
}
static void init_append_extent(struct bch_write_op *op,
@@ -1113,7 +1123,6 @@ static void __bch2_write(struct closure *cl)
struct bch_fs *c = op->c;
struct write_point *wp;
struct bio *bio = NULL;
- bool skip_put = true;
unsigned nofs_flags;
int ret;
@@ -1129,13 +1138,13 @@ again:
/* +1 for possible cache device: */
if (op->open_buckets.nr + op->nr_replicas + 1 >
ARRAY_SIZE(op->open_buckets.v))
- goto flush_io;
+ break;
if (bch2_keylist_realloc(&op->insert_keys,
op->inline_keys,
ARRAY_SIZE(op->inline_keys),
BKEY_EXTENT_U64s_MAX))
- goto flush_io;
+ break;
/*
* The copygc thread is now global, which means it's no longer
@@ -1157,48 +1166,31 @@ again:
if (IS_ERR(wp)) {
if (unlikely(wp != ERR_PTR(-EAGAIN))) {
- ret = PTR_ERR(wp);
- goto err;
+ op->error = PTR_ERR(wp);
+ op->flags |= BCH_WRITE_DONE;
}
- goto flush_io;
+ break;
}
- /*
- * It's possible for the allocator to fail, put us on the
- * freelist waitlist, and then succeed in one of various retry
- * paths: if that happens, we need to disable the skip_put
- * optimization because otherwise there won't necessarily be a
- * barrier before we free the bch_write_op:
- */
- if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
- skip_put = false;
-
bch2_open_bucket_get(c, wp, &op->open_buckets);
ret = bch2_write_extent(op, wp, &bio);
bch2_alloc_sectors_done(c, wp);
- if (ret < 0)
- goto err;
-
- if (ret) {
- skip_put = false;
- } else {
- /*
- * for the skip_put optimization this has to be set
- * before we submit the bio:
- */
+ if (ret < 0) {
+ op->error = ret;
op->flags |= BCH_WRITE_DONE;
+ break;
}
+ if (!ret)
+ op->flags |= BCH_WRITE_DONE;
+
bio->bi_end_io = bch2_write_endio;
bio->bi_private = &op->cl;
bio->bi_opf |= REQ_OP_WRITE;
- if (!skip_put)
- closure_get(bio->bi_private);
- else
- op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT;
+ closure_get(bio->bi_private);
key_to_write = (void *) (op->insert_keys.keys_p +
key_to_write_offset);
@@ -1207,48 +1199,33 @@ again:
key_to_write);
} while (ret);
- if (!skip_put)
- continue_at(cl, bch2_write_index, index_update_wq(op));
-out:
- memalloc_nofs_restore(nofs_flags);
- return;
-err:
- op->error = ret;
- op->flags |= BCH_WRITE_DONE;
-
- continue_at(cl, bch2_write_index, index_update_wq(op));
- goto out;
-flush_io:
/*
- * If the write can't all be submitted at once, we generally want to
- * block synchronously as that signals backpressure to the caller.
+ * Sync or no?
+ *
+ * If we're running asynchronously, we may still want to block
+ * synchronously here if we weren't able to submit all of the IO at
+ * once, as that signals backpressure to the caller.
*
* However, if we're running out of a workqueue, we can't block here
* because we'll be blocking other work items from completing:
*/
- if (current->flags & PF_WQ_WORKER) {
- continue_at(cl, bch2_write_index, index_update_wq(op));
- goto out;
- }
-
- closure_sync(cl);
-
- if (!bch2_keylist_empty(&op->insert_keys)) {
+ if ((op->flags & BCH_WRITE_SYNC) ||
+ (!(op->flags & BCH_WRITE_DONE) && !(current->flags & PF_WQ_WORKER))) {
+ closure_sync(cl);
__bch2_write_index(op);
- if (op->error) {
- op->flags |= BCH_WRITE_DONE;
- continue_at_nobarrier(cl, bch2_write_done, NULL);
- goto out;
- }
+ if (!(op->flags & BCH_WRITE_DONE))
+ goto again;
+ bch2_write_done(op);
+ } else {
+ continue_at(cl, bch2_write_index, index_update_wq(op));
}
- goto again;
+ memalloc_nofs_restore(nofs_flags);
}
static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
{
- struct closure *cl = &op->cl;
struct bio *bio = &op->wbio.bio;
struct bvec_iter iter;
struct bkey_i_inline_data *id;
@@ -1285,10 +1262,9 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
op->flags |= BCH_WRITE_DONE;
- continue_at_nobarrier(cl, bch2_write_index, NULL);
- return;
+ __bch2_write_index(op);
err:
- bch2_write_done(&op->cl);
+ bch2_write_done(op);
}
/**
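
To summarize the new tail of __bch2_write() above, with the reasoning from the
"Sync or no?" comment spelled out: this is a condensed restatement of the branch in
the patch, using the same locals as the function, not new code.

	if ((op->flags & BCH_WRITE_SYNC) ||
	    (!(op->flags & BCH_WRITE_DONE) && !(current->flags & PF_WQ_WORKER))) {
		/* Either the caller asked for synchronous completion, or we
		 * couldn't submit everything at once and we're allowed to block
		 * (not running on a workqueue): wait here, which also applies
		 * backpressure to the caller. */
		closure_sync(cl);		/* wait for the bios already submitted */
		__bch2_write_index(op);		/* btree update in this thread */

		if (!(op->flags & BCH_WRITE_DONE))
			goto again;		/* submit the rest of the write */
		bch2_write_done(op);
	} else {
		/* On a workqueue (or a fully submitted async write): don't block,
		 * punt the index update to index_update_wq(). */
		continue_at(cl, bch2_write_index, index_update_wq(op));
	}
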
diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h
index 9879a76e0168..cf69a5066e3a 100644
--- a/fs/bcachefs/io.h
+++ b/fs/bcachefs/io.h
@@ -35,10 +35,10 @@ enum bch_write_flags {
BCH_WRITE_WROTE_DATA_INLINE = (1 << 7),
BCH_WRITE_FROM_INTERNAL = (1 << 8),
BCH_WRITE_CHECK_ENOSPC = (1 << 9),
- BCH_WRITE_MOVE = (1 << 10),
+ BCH_WRITE_SYNC = (1 << 10),
+ BCH_WRITE_MOVE = (1 << 11),
/* Internal: */
- BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11),
BCH_WRITE_DONE = (1 << 12),
BCH_WRITE_IO_ERROR = (1 << 13),
};