bcachefs: Device removal work

author: Kent Overstreet <kent.overstreet@gmail.com> 2017-05-20 20:51:40 -0800
committer: Kent Overstreet <kent.overstreet@gmail.com> 2018-05-22 00:44:18 -0400
commit: e4b4227e9969849d181881463da29b9f3cc373fd (patch)
tree: b9892d4f9d51cb95d5347bed4a7e400de0570f4b
parent: 6e4c78da70c84cceb94532cc9886577507ae565f (diff)
21 files changed, 485 insertions, 237 deletions
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index ff6273737916..2294cc3adeca 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -159,7 +159,8 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
 		if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
 		    (!c->opts.nofsck &&
 		     fsck_err_on(!bch2_sb_has_replicas(c, e, data_type), c,
-				 "superblock not marked as containing replicas"))) {
+				 "superblock not marked as containing replicas (type %u)",
+				 data_type))) {
 			ret = bch2_check_mark_super(c, e, data_type);
 			if (ret)
 				return ret;
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 34cf17680d6a..96484ea206ce 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1437,35 +1437,57 @@ static void bch2_btree_node_write_error(struct bch_fs *c,
 	struct closure *cl	= wbio->cl;
 	__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
 	struct bkey_i_extent *new_key;
+	struct bkey_s_extent e;
+	struct bch_extent_ptr *ptr;
+	struct btree_iter iter;
+	int ret;
 
-	six_lock_read(&b->lock);
-	bkey_copy(&tmp.k, &b->key);
-	six_unlock_read(&b->lock);
+	__bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
+			       BTREE_MAX_DEPTH,
+			       b->level, 0);
+retry:
+	ret = bch2_btree_iter_traverse(&iter);
+	if (ret)
+		goto err;
 
-	if (!bkey_extent_is_data(&tmp.k.k) || !PTR_HASH(&tmp.k)) {
-		/* Node has been freed: */
+	/* has node been freed? */
+	if (iter.nodes[b->level] != b) {
+		/* node has been freed: */
+		if (!btree_node_dying(b))
+			panic("foo4\n");
 		goto out;
 	}
 
-	new_key = bkey_i_to_extent(&tmp.k);
+	if (!btree_node_hashed(b))
+		panic("foo5\n");
 
-	while (wbio->replicas_failed) {
-		unsigned idx = __fls(wbio->replicas_failed);
+	bkey_copy(&tmp.k, &b->key);
 
-		bch2_extent_drop_ptr_idx(extent_i_to_s(new_key), idx);
-		wbio->replicas_failed ^= 1 << idx;
-	}
+	new_key = bkey_i_to_extent(&tmp.k);
+	e = extent_i_to_s(new_key);
+	extent_for_each_ptr_backwards(e, ptr)
+		if (bch2_dev_list_has_dev(wbio->failed, ptr->dev))
+			bch2_extent_drop_ptr(e, ptr);
 
-	if (!bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)) ||
-	    bch2_btree_node_update_key(c, b, new_key)) {
-		set_btree_node_noevict(b);
-		bch2_fs_fatal_error(c, "fatal error writing btree node");
-	}
+	if (!bch2_extent_nr_ptrs(e.c))
+		goto err;
+
+	ret = bch2_btree_node_update_key(c, &iter, b, new_key);
+	if (ret == -EINTR)
+		goto retry;
+	if (ret)
+		goto err;
 out:
+	bch2_btree_iter_unlock(&iter);
 	bio_put(&wbio->bio);
 	btree_node_write_done(c, b);
 	if (cl)
 		closure_put(cl);
+	return;
+err:
+	set_btree_node_noevict(b);
+	bch2_fs_fatal_error(c, "fatal error writing btree node");
+	goto out;
 }
 
 void bch2_btree_write_error_work(struct work_struct *work)
@@ -1495,12 +1517,17 @@ static void btree_node_write_endio(struct bio *bio)
 	struct closure *cl		= !wbio->split ? wbio->cl : NULL;
 	struct bch_fs *c		= wbio->c;
 	struct bch_dev *ca		= wbio->ca;
+	unsigned long flags;
 
 	bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
 
-	if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
-	    bch2_meta_write_fault("btree"))
-		set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed);
+	if (bio->bi_status == BLK_STS_REMOVED ||
+	    bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
+	    bch2_meta_write_fault("btree")) {
+		spin_lock_irqsave(&c->btree_write_error_lock, flags);
+		bch2_dev_list_add_dev(&orig->failed, ca->dev_idx);
+		spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+	}
 
 	if (wbio->have_io_ref)
 		percpu_ref_put(&ca->io_ref);
@@ -1516,12 +1543,11 @@ static void btree_node_write_endio(struct bio *bio)
 		wbio->used_mempool,
 		wbio->data);
 
-	if (wbio->replicas_failed) {
-		unsigned long flags;
-
+	if (wbio->failed.nr) {
 		spin_lock_irqsave(&c->btree_write_error_lock, flags);
 		bio_list_add(&c->btree_write_error_list, &wbio->bio);
 		spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+
 		queue_work(c->wq, &c->btree_write_error_work);
 		return;
 	}
@@ -1732,6 +1758,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 
 	wbio = wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write));
 	wbio->cl		= parent;
+	wbio->failed.nr		= 0;
 	wbio->order		= order;
 	wbio->used_mempool	= used_mempool;
 	wbio->data		= data;
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 819b8efc5fd8..0b505a738e86 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -108,6 +108,17 @@ success:
 	return true;
 }
 
+bool bch2_btree_iter_relock(struct btree_iter *iter)
+{
+	unsigned l;
+
+	for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
+		if (!bch2_btree_node_relock(iter, l))
+			return false;
+
+	return true;
+}
+
 /* Slowpath: */
 bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
 			   unsigned level,
@@ -214,7 +225,6 @@ bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
 				     unsigned new_locks_want)
 {
 	struct btree_iter *linked;
-	unsigned l;
 
 	/* Drop locks we don't want anymore: */
 	if (new_locks_want < iter->locks_want)
@@ -228,12 +238,9 @@ bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
 	iter->locks_want = new_locks_want;
 	btree_iter_drop_extra_locks(iter);
 
-	for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
-		if (!bch2_btree_node_relock(iter, l))
-			goto fail;
+	if (bch2_btree_iter_relock(iter))
+		return true;
 
-	return true;
-fail:
 	/*
 	 * Just an optimization: ancestor nodes must be locked before child
 	 * nodes, so set locks_want on iterators that might lock ancestors
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index a000306228fa..acfe5b59df56 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -111,6 +111,7 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos,
 }
 
 bool bch2_btree_node_relock(struct btree_iter *, unsigned);
+bool bch2_btree_iter_relock(struct btree_iter *);
 
 void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
 void bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index f1e06a378c9a..f0e6896a8a5e 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -196,6 +196,7 @@ enum btree_flags {
 	BTREE_NODE_accessed,
 	BTREE_NODE_write_in_flight,
 	BTREE_NODE_just_written,
+	BTREE_NODE_dying,
 };
 
 BTREE_FLAG(read_in_flight);
@@ -207,6 +208,7 @@ BTREE_FLAG(write_idx);
 BTREE_FLAG(accessed);
 BTREE_FLAG(write_in_flight);
 BTREE_FLAG(just_written);
+BTREE_FLAG(dying);
 
 static inline struct btree_write *btree_current_write(struct btree *b)
 {
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index e11fcec963ba..c7c2930650d3 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -130,7 +130,7 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
 
 int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
 			    __le64, unsigned);
-int bch2_btree_node_update_key(struct bch_fs *, struct btree *,
-			       struct bkey_i_extent *);
+int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
+			       struct btree *, struct bkey_i_extent *);
 
 #endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 6351e9c2490f..04854532b8b4 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -914,6 +914,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
 	struct btree_write *w;
 	struct bset_tree *t;
 
+	set_btree_node_dying(b);
 	btree_interior_update_add_node_reference(as, b);
 
 	/*
@@ -1028,6 +1029,10 @@ static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
 	mutex_unlock(&c->btree_cache.lock);
 
 	mutex_lock(&c->btree_root_lock);
+	BUG_ON(btree_node_root(c, b) &&
+	       (b->level < btree_node_root(c, b)->level ||
+		!btree_node_dying(btree_node_root(c, b))));
+
 	btree_node_root(c, b) = b;
 	mutex_unlock(&c->btree_root_lock);
 
@@ -1790,64 +1795,16 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
 	return ret;
 }
 
-int bch2_btree_node_update_key(struct bch_fs *c, struct btree *b,
-			       struct bkey_i_extent *new_key)
+static void __bch2_btree_node_update_key(struct bch_fs *c,
+					 struct btree_update *as,
+					 struct btree_iter *iter,
+					 struct btree *b, struct btree *new_hash,
+					 struct bkey_i_extent *new_key)
 {
-	struct btree_update *as = NULL;
-	struct btree *parent, *new_hash = NULL;
-	struct btree_iter iter;
-	struct closure cl;
+	struct btree *parent;
 	bool must_rewrite_parent = false;
 	int ret;
 
-	__bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
-			       BTREE_MAX_DEPTH,
-			       b->level, 0);
-	closure_init_stack(&cl);
-
-	ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
-	if (ret)
-		return ret;
-
-retry:
-	down_read(&c->gc_lock);
-	ret = bch2_btree_iter_traverse(&iter);
-	if (ret)
-		goto err;
-
-	/* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
-	if (!new_hash &&
-	    PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
-		/* bch2_btree_reserve_get will unlock */
-		do {
-			ret = bch2_btree_cache_cannibalize_lock(c, &cl);
-			closure_sync(&cl);
-		} while (ret == -EAGAIN);
-
-		BUG_ON(ret);
-
-		new_hash = bch2_btree_node_mem_alloc(c);
-	}
-
-	as = bch2_btree_update_start(c, iter.btree_id,
-				     btree_update_reserve_required(c, b),
-				     BTREE_INSERT_NOFAIL|
-				     BTREE_INSERT_USE_RESERVE|
-				     BTREE_INSERT_USE_ALLOC_RESERVE,
-				     &cl);
-	if (IS_ERR(as)) {
-		ret = PTR_ERR(as);
-		if (ret == -EAGAIN || ret == -EINTR) {
-			bch2_btree_iter_unlock(&iter);
-			up_read(&c->gc_lock);
-			closure_sync(&cl);
-			goto retry;
-		}
-		goto err;
-	}
-
-	mutex_lock(&c->btree_interior_update_lock);
-
 	/*
 	 * Two corner cases that need to be thought about here:
 	 *
@@ -1872,22 +1829,12 @@ retry:
 	if (b->will_make_reachable)
 		must_rewrite_parent = true;
 
-	/* other case: btree node being freed */
-	if (iter.nodes[b->level] != b) {
-		/* node has been freed: */
-		BUG_ON(btree_node_hashed(b));
-		mutex_unlock(&c->btree_interior_update_lock);
-		goto err;
-	}
-
-	mutex_unlock(&c->btree_interior_update_lock);
-
 	if (must_rewrite_parent)
 		as->flags |= BTREE_INTERIOR_UPDATE_MUST_REWRITE;
 
 	btree_interior_update_add_node_reference(as, b);
 
-	parent = iter.nodes[b->level + 1];
+	parent = iter->nodes[b->level + 1];
 	if (parent) {
 		if (new_hash) {
 			bkey_copy(&new_hash->key, &new_key->k_i);
@@ -1896,8 +1843,8 @@ retry:
 			BUG_ON(ret);
 		}
 
-		bch2_btree_insert_node(as, parent, &iter,
-				       &keylist_single(&new_key->k_i));
+		bch2_keylist_add(&as->parent_keys, &new_key->k_i);
+		bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
 
 		if (new_hash) {
 			mutex_lock(&c->btree_cache.lock);
@@ -1917,7 +1864,7 @@ retry:
 
 		BUG_ON(btree_node_root(c, b) != b);
 
-		bch2_btree_node_lock_write(b, &iter);
+		bch2_btree_node_lock_write(b, iter);
 
 		bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i),
 			      c->opts.btree_node_size, true,
@@ -1928,14 +1875,94 @@ retry:
 					   &stats);
 		bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
 				    gc_pos_btree_root(b->btree_id));
-		bkey_copy(&b->key, &new_key->k_i);
+
+		if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+			mutex_lock(&c->btree_cache.lock);
+			bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+			bkey_copy(&b->key, &new_key->k_i);
+			ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+			BUG_ON(ret);
+			mutex_unlock(&c->btree_cache.lock);
+		} else {
+			bkey_copy(&b->key, &new_key->k_i);
+		}
 
 		btree_update_updated_root(as);
-		bch2_btree_node_unlock_write(b, &iter);
+		bch2_btree_node_unlock_write(b, iter);
 	}
 
 	bch2_btree_update_done(as);
-out:
+}
+
+int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
+			       struct btree *b, struct bkey_i_extent *new_key)
+{
+	struct btree_update *as = NULL;
+	struct btree *new_hash = NULL;
+	struct closure cl;
+	int ret;
+
+	closure_init_stack(&cl);
+
+	if (!down_read_trylock(&c->gc_lock)) {
+		bch2_btree_iter_unlock(iter);
+		down_read(&c->gc_lock);
+
+		if (!bch2_btree_iter_relock(iter)) {
+			ret = -EINTR;
+			goto err;
+		}
+	}
+
+	/* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
+	if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+		/* bch2_btree_reserve_get will unlock */
+		ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+		if (ret) {
+			ret = -EINTR;
+
+			bch2_btree_iter_unlock(iter);
+			up_read(&c->gc_lock);
+			closure_sync(&cl);
+			down_read(&c->gc_lock);
+
+			if (!bch2_btree_iter_relock(iter))
+				goto err;
+		}
+
+		new_hash = bch2_btree_node_mem_alloc(c);
+	}
+
+	as = bch2_btree_update_start(c, iter->btree_id,
+				     btree_update_reserve_required(c, b),
+				     BTREE_INSERT_NOFAIL|
+				     BTREE_INSERT_USE_RESERVE|
+				     BTREE_INSERT_USE_ALLOC_RESERVE,
+				     &cl);
+	if (IS_ERR(as)) {
+		ret = PTR_ERR(as);
+		if (ret == -EAGAIN)
+			ret = -EINTR;
+
+		if (ret != -EINTR)
+			goto err;
+
+		bch2_btree_iter_unlock(iter);
+		up_read(&c->gc_lock);
+		closure_sync(&cl);
+		down_read(&c->gc_lock);
+
+		if (!bch2_btree_iter_relock(iter))
+			goto err;
+	}
+
+	ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
+	if (ret)
+		goto err_free_update;
+
+	__bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
+err:
 	if (new_hash) {
 		mutex_lock(&c->btree_cache.lock);
 		list_move(&new_hash->list, &c->btree_cache.freeable);
@@ -1944,14 +1971,12 @@ out:
 		six_unlock_write(&new_hash->lock);
 		six_unlock_intent(&new_hash->lock);
 	}
-	bch2_btree_iter_unlock(&iter);
 	up_read(&c->gc_lock);
 	closure_sync(&cl);
 	return ret;
-err:
-	if (as)
-		bch2_btree_update_free(as);
-	goto out;
+err_free_update:
+	bch2_btree_update_free(as);
+	goto err;
 }
 
 /* Init code: */
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 985f980c95d0..176978ca2231 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -157,6 +157,19 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
 	return nr_ptrs;
 }
 
+unsigned bch2_extent_nr_good_ptrs(struct bch_fs *c, struct bkey_s_c_extent e)
+{
+	const struct bch_extent_ptr *ptr;
+	unsigned nr_ptrs = 0;
+
+	extent_for_each_ptr(e, ptr)
+		nr_ptrs += (!ptr->cached &&
+			    bch_dev_bkey_exists(c, ptr->dev)->mi.state !=
+			    BCH_MEMBER_STATE_FAILED);
+
+	return nr_ptrs;
+}
+
 unsigned bch2_extent_is_compressed(struct bkey_s_c k)
 {
 	struct bkey_s_c_extent e;
@@ -435,7 +448,8 @@ static const char *extent_ptr_invalid(const struct bch_fs *c,
 	const struct bch_extent_ptr *ptr2;
 	struct bch_dev *ca;
 
-	if (ptr->dev >= c->sb.nr_devices)
+	if (ptr->dev >= c->sb.nr_devices ||
+	    !c->devs[ptr->dev])
 		return "pointer to invalid device";
 
 	ca = bch_dev_bkey_exists(c, ptr->dev);
@@ -490,7 +504,9 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
 			break;
 		case BCH_EXTENT_ENTRY_ptr:
 			ptr = entry_to_ptr(entry);
-			ca = bch_dev_bkey_exists(c, ptr->dev);
+			ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+				? bch_dev_bkey_exists(c, ptr->dev)
+				: NULL;
 
 			p("ptr: %u:%llu gen %u%s", ptr->dev,
 			  (u64) ptr->offset, ptr->gen,
@@ -1974,15 +1990,10 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
 				      struct bkey_s_extent e)
 {
 	struct bch_extent_ptr *ptr;
-	unsigned tier = 0, nr_cached = 0, nr_good = 0;
+	unsigned tier = 0, nr_cached = 0;
+	unsigned nr_good = bch2_extent_nr_good_ptrs(c, e.c);
 	bool have_higher_tier;
 
-	extent_for_each_ptr(e, ptr)
-		if (!ptr->cached &&
-		    bch_dev_bkey_exists(c, ptr->dev)->mi.state !=
-		    BCH_MEMBER_STATE_FAILED)
-			nr_good++;
-
 	if (nr_good <= c->opts.data_replicas)
 		return;
 
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index ff4ce2af16e0..ab7993abbddf 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -45,6 +45,7 @@ bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
 
 unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
 unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
+unsigned bch2_extent_nr_good_ptrs(struct bch_fs *, struct bkey_s_c_extent);
 unsigned bch2_extent_is_compressed(struct bkey_s_c);
 
 bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index e465533563f9..744bff0f42bb 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -140,7 +140,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 	const struct bch_extent_ptr *ptr;
 	struct bch_write_bio *n;
 	struct bch_dev *ca;
-	unsigned ptr_idx = 0;
 
 	BUG_ON(c->opts.nochanges);
 
@@ -169,7 +168,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 
 		n->c			= c;
 		n->ca			= ca;
-		n->ptr_idx		= ptr_idx++;
 		n->submit_time_us	= local_clock_us();
 		n->bio.bi_iter.bi_sector = ptr->offset;
 
@@ -185,7 +183,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 			submit_bio(&n->bio);
 		} else {
 			n->have_io_ref		= false;
-			bcache_io_error(c, &n->bio, "device has been removed");
+			n->bio.bi_status	= BLK_STS_REMOVED;
 			bio_endio(&n->bio);
 		}
 	}
diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h
index b3a23e821097..0c145eb67317 100644
--- a/fs/bcachefs/io.h
+++ b/fs/bcachefs/io.h
@@ -21,6 +21,8 @@ void bch2_latency_acct(struct bch_dev *, unsigned, int);
 void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
 			       enum bch_data_type, const struct bkey_i *);
 
+#define BLK_STS_REMOVED		((__force blk_status_t)128)
+
 enum bch_write_flags {
 	BCH_WRITE_ALLOC_NOWAIT		= (1 << 0),
 	BCH_WRITE_CACHED		= (1 << 1),
diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h
index 5b0d7aae97c5..ff18fdc90eb7 100644
--- a/fs/bcachefs/io_types.h
+++ b/fs/bcachefs/io_types.h
@@ -72,8 +72,7 @@ struct bch_write_bio {
 	struct closure		*cl;
 	};
 
-	u8			ptr_idx;
-	u8			replicas_failed;
+	struct bch_devs_list	failed;
 	u8			order;
 
 	unsigned		split:1,
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 55e5d21bd3ce..30e80409962f 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -338,8 +338,8 @@ struct journal_list {
  * Given a journal entry we just read, add it to the list of journal entries to
  * be replayed:
  */
-static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
-		    struct jset *j)
+static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
+			     struct journal_list *jlist, struct jset *j)
 {
 	struct journal_replay *i, *pos;
 	struct list_head *where;
@@ -347,8 +347,6 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
 	__le64 last_seq;
 	int ret;
 
-	mutex_lock(&jlist->lock);
-
 	last_seq = !list_empty(jlist->head)
 		? list_last_entry(jlist->head, struct journal_replay,
 				  list)->j.last_seq
@@ -376,9 +374,7 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
 				    memcmp(j, &i->j, bytes), c,
 				    "found duplicate but non identical journal entries (seq %llu)",
 				    le64_to_cpu(j->seq));
-
-			ret = JOURNAL_ENTRY_ADD_OK;
-			goto out;
+			goto found;
 		}
 
 		if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
@@ -395,12 +391,16 @@ add:
 		goto out;
 	}
 
-	memcpy(&i->j, j, bytes);
 	list_add(&i->list, where);
+	i->devs.nr = 0;
+	memcpy(&i->j, j, bytes);
+found:
+	if (!fsck_err_on(bch2_dev_list_has_dev(i->devs, ca->dev_idx),
+			 c, "duplicate journal entries on same device"))
+		bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
 	ret = JOURNAL_ENTRY_ADD_OK;
 out:
 fsck_err:
-	mutex_unlock(&jlist->lock);
 	return ret;
 }
 
@@ -722,7 +722,10 @@ reread:			sectors_read = min_t(unsigned,
 
 		ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
 
-		ret = journal_entry_add(c, jlist, j);
+		mutex_lock(&jlist->lock);
+		ret = journal_entry_add(c, ca, jlist, j);
+		mutex_unlock(&jlist->lock);
+
 		switch (ret) {
 		case JOURNAL_ENTRY_ADD_OK:
 			*entries_found = true;
@@ -1011,6 +1014,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 		INIT_LIST_HEAD(&p->list);
 		INIT_LIST_HEAD(&p->flushed);
 		atomic_set(&p->count, 0);
+		p->devs.nr = 0;
 	}
 
 	mutex_lock(&j->blacklist_lock);
@@ -1019,6 +1023,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 		p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
 
 		atomic_set(&p->count, 1);
+		p->devs = i->devs;
 
 		if (journal_seq_blacklist_read(j, i, p)) {
 			mutex_unlock(&j->blacklist_lock);
@@ -1131,6 +1136,7 @@ static void __journal_entry_new(struct journal *j, int count)
 	INIT_LIST_HEAD(&p->list);
 	INIT_LIST_HEAD(&p->flushed);
 	atomic_set(&p->count, count);
+	p->devs.nr = 0;
 }
 
 static void __bch2_journal_next_entry(struct journal *j)
@@ -2303,6 +2309,9 @@ static void journal_write(struct closure *cl)
 				  BCH_DATA_JOURNAL))
 		goto err;
 
+	journal_seq_pin(j, le64_to_cpu(jset->seq))->devs =
+			bch2_extent_devs(bkey_i_to_s_c_extent(&j->key));
+
 	/*
 	 * XXX: we really should just disable the entire journal in nochanges
 	 * mode
@@ -2720,6 +2729,46 @@ int bch2_journal_flush(struct journal *j)
 	return bch2_journal_flush_seq(j, seq);
 }
 
+int bch2_journal_flush_device(struct journal *j, unsigned dev_idx)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct journal_entry_pin_list *p;
+	struct bch_devs_list devs;
+	u64 seq = 0;
+	unsigned iter;
+	int ret = 0;
+
+	spin_lock(&j->lock);
+	fifo_for_each_entry_ptr(p, &j->pin, iter)
+		if (bch2_dev_list_has_dev(p->devs, dev_idx))
+			seq = journal_pin_seq(j, p);
+	spin_unlock(&j->lock);
+
+	bch2_journal_flush_pins(j, seq);
+
+	mutex_lock(&c->replicas_gc_lock);
+	bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
+
+	seq = 0;
+
+	spin_lock(&j->lock);
+	while (!ret && seq < atomic64_read(&j->seq)) {
+		seq = max(seq, last_seq(j));
+		devs = journal_seq_pin(j, seq)->devs;
+		seq++;
+
+		spin_unlock(&j->lock);
+		ret = bch2_check_mark_super_devlist(c, &devs, BCH_DATA_JOURNAL);
+		spin_lock(&j->lock);
+	}
+	spin_unlock(&j->lock);
+
+	bch2_replicas_gc_end(c, ret);
+	mutex_unlock(&c->replicas_gc_lock);
+
+	return ret;
+}
+
 ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index e6532f2f6100..5f3ece089937 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -118,6 +118,8 @@
  */
 struct journal_replay {
 	struct list_head	list;
+	struct bch_devs_list	devs;
+	/* must be last: */
 	struct jset		j;
 };
 
@@ -357,6 +359,7 @@ void bch2_journal_meta_async(struct journal *, struct closure *);
 int bch2_journal_flush_seq(struct journal *, u64);
 int bch2_journal_flush(struct journal *);
 int bch2_journal_meta(struct journal *);
+int bch2_journal_flush_device(struct journal *, unsigned);
 
 void bch2_journal_halt(struct journal *);
 
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 55b41c56a3f2..87f378a6ac4f 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -34,6 +34,7 @@ struct journal_entry_pin_list {
 	struct list_head		list;
 	struct list_head		flushed;
 	atomic_t			count;
+	struct bch_devs_list		devs;
 };
 
 struct journal;
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 8d1c0ee07c24..e11ee9532483 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -27,23 +27,9 @@ static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
 
 #define MAX_DATA_OFF_ITER	10
 
-/*
- * This moves only the data off, leaving the meta-data (if any) in place.
- * It walks the key space, and for any key with a valid pointer to the
- * relevant device, it copies it elsewhere, updating the key to point to
- * the copy.
- * The meta-data is moved off by bch_move_meta_data_off_device.
- *
- * Note: If the number of data replicas desired is > 1, ideally, any
- * new copies would not be made in the same device that already have a
- * copy (if there are enough devices).
- * This is _not_ currently implemented.  The multiple replicas can
- * land in the same device even if there are others available.
- */
-
-int bch2_move_data_off_device(struct bch_dev *ca)
+static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca,
+				    int flags)
 {
-	struct bch_fs *c = ca->fs;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	u64 keys_moved, sectors_moved;
@@ -113,10 +99,6 @@ int bch2_move_data_off_device(struct bch_dev *ca)
 	return ret;
 }
 
-/*
- * This walks the btree, and for any node on the relevant device it moves the
- * node elsewhere.
- */
 static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
 			       enum btree_id id)
 {
@@ -200,9 +182,9 @@ static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
  *   is written.
  */
 
-int bch2_move_metadata_off_device(struct bch_dev *ca)
+static int bch2_dev_metadata_migrate(struct bch_fs *c, struct bch_dev *ca,
+				     int flags)
 {
-	struct bch_fs *c = ca->fs;
 	unsigned i;
 	int ret = 0;
 
@@ -240,37 +222,31 @@ err:
 	return ret;
 }
 
-/*
- * Flagging data bad when forcibly removing a device after failing to
- * migrate the data off the device.
- */
+int bch2_dev_data_migrate(struct bch_fs *c, struct bch_dev *ca, int flags)
+{
+	return bch2_dev_usrdata_migrate(c, ca, flags) ?:
+		bch2_dev_metadata_migrate(c, ca, flags);
+}
 
-static int bch2_flag_key_bad(struct btree_iter *iter,
-			    struct bch_dev *ca,
-			    struct bkey_s_c_extent orig)
+static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
+			 unsigned dev_idx, int flags, bool metadata)
 {
-	BKEY_PADDED(key) tmp;
-	struct bkey_s_extent e;
 	struct bch_extent_ptr *ptr;
-	struct bch_fs *c = ca->fs;
-
-	bkey_reassemble(&tmp.key, orig.s_c);
-	e = bkey_i_to_s_extent(&tmp.key);
+	unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
+	unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
+	unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
+	unsigned nr_good;
 
 	extent_for_each_ptr_backwards(e, ptr)
-		if (ptr->dev == ca->dev_idx)
+		if (ptr->dev == dev_idx)
 			bch2_extent_drop_ptr(e, ptr);
 
-	/*
-	 * If the new extent no longer has any pointers, bch2_extent_normalize()
-	 * will do the appropriate thing with it (turning it into a
-	 * KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
-	 */
-	bch2_extent_normalize(c, e.s);
+	nr_good = bch2_extent_nr_good_ptrs(c, e.c);
+	if ((!nr_good && !(flags & lost)) ||
+	    (nr_good < replicas && !(flags & degraded)))
+		return -EINVAL;
 
-	return bch2_btree_insert_at(c, NULL, NULL, NULL,
-				   BTREE_INSERT_ATOMIC,
-				   BTREE_INSERT_ENTRY(iter, &tmp.key));
+	return 0;
 }
 
 /*
@@ -284,11 +260,11 @@ static int bch2_flag_key_bad(struct btree_iter *iter,
  * that we've already tried to move the data MAX_DATA_OFF_ITER times and
  * are not likely to succeed if we try again.
  */
-int bch2_flag_data_bad(struct bch_dev *ca)
+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 {
-	struct bch_fs *c = ca->fs;
 	struct bkey_s_c k;
-	struct bkey_s_c_extent e;
+	struct bkey_s_extent e;
+	BKEY_PADDED(key) tmp;
 	struct btree_iter iter;
 	int ret = 0;
 
@@ -303,11 +279,33 @@ int bch2_flag_data_bad(struct bch_dev *ca)
 		if (!bkey_extent_is_data(k.k))
 			goto advance;
 
-		e = bkey_s_c_to_extent(k);
-		if (!bch2_extent_has_device(e, ca->dev_idx))
+		if (!bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx))
 			goto advance;
 
-		ret = bch2_flag_key_bad(&iter, ca, e);
+		bkey_reassemble(&tmp.key, k);
+		e = bkey_i_to_s_extent(&tmp.key);
+
+		ret = drop_dev_ptrs(c, e, dev_idx, flags, false);
+		if (ret)
+			break;
+
+		/*
+		 * If the new extent no longer has any pointers, bch2_extent_normalize()
+		 * will do the appropriate thing with it (turning it into a
+		 * KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
+		 */
+		bch2_extent_normalize(c, e.s);
+
+		if (bkey_extent_is_data(e.k) &&
+		    (ret = bch2_check_mark_super(c, e.c, BCH_DATA_USER)))
+			break;
+
+		iter.pos = bkey_start_pos(&tmp.key.k);
+
+		ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
+					   BTREE_INSERT_ATOMIC|
+					   BTREE_INSERT_NOFAIL,
+					   BTREE_INSERT_ENTRY(&iter, &tmp.key));
 
 		/*
 		 * don't want to leave ret == -EINTR, since if we raced and
@@ -319,26 +317,6 @@ int bch2_flag_data_bad(struct bch_dev *ca)
 		if (ret)
 			break;
 
-		/*
-		 * If the replica we're dropping was dirty and there is an
-		 * additional cached replica, the cached replica will now be
-		 * considered dirty - upon inserting the new version of the key,
-		 * the bucket accounting will be updated to reflect the fact
-		 * that the cached data is now dirty and everything works out as
-		 * if by magic without us having to do anything.
-		 *
-		 * The one thing we need to be concerned with here is there's a
-		 * race between when we drop any stale pointers from the key
-		 * we're about to insert, and when the key actually gets
-		 * inserted and the cached data is marked as dirty - we could
-		 * end up trying to insert a key with a pointer that should be
-		 * dirty, but points to stale data.
-		 *
-		 * If that happens the insert code just bails out and doesn't do
-		 * the insert - however, it doesn't return an error. Hence we
-		 * need to always recheck the current key before advancing to
-		 * the next:
-		 */
 		continue;
 advance:
 		if (bkey_extent_is_data(k.k)) {
@@ -357,3 +335,80 @@ advance:
 
 	return ret;
 }
+
+static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+	struct btree_iter iter;
+	struct closure cl;
+	struct btree *b;
+	unsigned id;
+	int ret;
+
+	/* don't handle this yet: */
+	if (flags & BCH_FORCE_IF_METADATA_LOST)
+		return -EINVAL;
+
+	closure_init_stack(&cl);
+
+	mutex_lock(&c->replicas_gc_lock);
+	bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
+
+	for (id = 0; id < BTREE_ID_NR; id++) {
+		for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
+			__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+			struct bkey_i_extent *new_key;
+retry:
+			if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key),
+						    dev_idx)) {
+				bch2_btree_iter_set_locks_want(&iter, 0);
+
+				ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key),
+							    BCH_DATA_BTREE);
+				if (ret)
+					goto err;
+			} else {
+				bkey_copy(&tmp.k, &b->key);
+				new_key = bkey_i_to_extent(&tmp.k);
+
+				ret = drop_dev_ptrs(c, extent_i_to_s(new_key),
+						    dev_idx, flags, true);
+				if (ret)
+					goto err;
+
+				if (!bch2_btree_iter_set_locks_want(&iter, U8_MAX)) {
+					b = bch2_btree_iter_peek_node(&iter);
+					goto retry;
+				}
+
+				ret = bch2_btree_node_update_key(c, &iter, b, new_key);
+				if (ret == -EINTR) {
+					b = bch2_btree_iter_peek_node(&iter);
+					goto retry;
+				}
+				if (ret)
+					goto err;
+			}
+		}
+		bch2_btree_iter_unlock(&iter);
+
+		/* btree root */
+		mutex_lock(&c->btree_root_lock);
+		mutex_unlock(&c->btree_root_lock);
+	}
+
+	ret = 0;
+out:
+	bch2_replicas_gc_end(c, ret);
+	mutex_unlock(&c->replicas_gc_lock);
+
+	return ret;
+err:
+	bch2_btree_iter_unlock(&iter);
+	goto out;
+}
+
+int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+	return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
+		bch2_dev_metadata_drop(c, dev_idx, flags);
+}
diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h
index 9bdaa79290a1..6db7b9111bf2 100644
--- a/fs/bcachefs/migrate.h
+++ b/fs/bcachefs/migrate.h
@@ -1,8 +1,7 @@
 #ifndef _BCACHEFS_MIGRATE_H
 #define _BCACHEFS_MIGRATE_H
 
-int bch2_move_data_off_device(struct bch_dev *);
-int bch2_move_metadata_off_device(struct bch_dev *);
-int bch2_flag_data_bad(struct bch_dev *);
+int bch2_dev_data_migrate(struct bch_fs *, struct bch_dev *, int);
+int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
 
 #endif /* _BCACHEFS_MIGRATE_H */
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index b2b510c55793..42ce031d1799 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -877,12 +877,13 @@ static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
 		offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
 }
 
-static void bkey_to_replicas(struct bkey_s_c_extent e,
+static unsigned bkey_to_replicas(struct bkey_s_c_extent e,
 			     enum bch_data_type data_type,
 			     struct bch_replicas_cpu_entry *r,
 			     unsigned *max_dev)
 {
 	const struct bch_extent_ptr *ptr;
+	unsigned nr = 0;
 
 	BUG_ON(!data_type ||
 	       data_type == BCH_DATA_SB ||
@@ -897,7 +898,9 @@ static void bkey_to_replicas(struct bkey_s_c_extent e,
 		if (!ptr->cached) {
 			*max_dev = max_t(unsigned, *max_dev, ptr->dev);
 			replicas_set_dev(r, ptr->dev);
+			nr++;
 		}
+	return nr;
 }
 
 static struct bch_replicas_cpu *
@@ -992,16 +995,13 @@ err:
 	return ret;
 }
 
-int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
-			  enum bch_data_type data_type)
+static inline int __bch2_check_mark_super(struct bch_fs *c,
+				struct bch_replicas_cpu_entry search,
+				unsigned max_dev)
 {
 	struct bch_replicas_cpu *r, *gc_r;
-	struct bch_replicas_cpu_entry search;
-	unsigned max_dev;
 	bool marked;
 
-	bkey_to_replicas(e, data_type, &search, &max_dev);
-
 	rcu_read_lock();
 	r = rcu_dereference(c->replicas);
 	gc_r = rcu_dereference(c->replicas_gc);
@@ -1009,10 +1009,38 @@ int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
 		(!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
 	rcu_read_unlock();
 
-	if (likely(marked))
+	return likely(marked) ? 0
+		: bch2_check_mark_super_slowpath(c, search, max_dev);
+}
+
+/*
+ * Build the replicas entry for extent @e (as @data_type) with
+ * bkey_to_replicas() and hand it to __bch2_check_mark_super().
+ *
+ * When bkey_to_replicas() returns 0 -- it counts the non-cached pointers it
+ * records -- there is no entry to check and 0 is returned straight away.
+ *
+ * Returns 0 in that case, otherwise the result of __bch2_check_mark_super().
+ */
+int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
+			  enum bch_data_type data_type)
+{
+	struct bch_replicas_cpu_entry search;
+	unsigned max_dev;
+
+	/* search and max_dev are filled in by bkey_to_replicas() */
+	if (!bkey_to_replicas(e, data_type, &search, &max_dev))
 		return 0;
 
-	return bch2_check_mark_super_slowpath(c, search, max_dev);
+	return __bch2_check_mark_super(c, search, max_dev);
+}
+
+/*
+ * Variant of the replicas check that takes its device set from @devs: builds
+ * an entry of type @data_type with every listed device set and passes it,
+ * together with the highest device index seen, to __bch2_check_mark_super().
+ *
+ * Returns 0 for an empty list, otherwise the result of
+ * __bch2_check_mark_super().
+ */
+int bch2_check_mark_super_devlist(struct bch_fs *c,
+				  struct bch_devs_list *devs,
+				  enum bch_data_type data_type)
+{
+	struct bch_replicas_cpu_entry entry = { .data_type = data_type };
+	unsigned highest_dev = 0;
+	unsigned idx;
+
+	/* an empty list has no entry to check */
+	if (devs->nr == 0)
+		return 0;
+
+	for (idx = 0; idx < devs->nr; idx++) {
+		replicas_set_dev(&entry, devs->devs[idx]);
+		highest_dev = max_t(unsigned, highest_dev, devs->devs[idx]);
+	}
+
+	return __bch2_check_mark_super(c, entry, highest_dev);
 }
 
 int bch2_replicas_gc_end(struct bch_fs *c, int err)
@@ -1292,7 +1320,8 @@ bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
 	unsigned max_dev;
 	bool ret;
 
-	bkey_to_replicas(e, data_type, &search, &max_dev);
+	if (!bkey_to_replicas(e, data_type, &search, &max_dev))
+		return true;
 
 	rcu_read_lock();
 	ret = replicas_has_entry(rcu_dereference(c->replicas),
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index 8233763257e4..725d2f1487ec 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -129,6 +129,8 @@ bool bch2_sb_has_replicas(struct bch_fs *, struct bkey_s_c_extent,
 			  enum bch_data_type);
 int bch2_check_mark_super(struct bch_fs *, struct bkey_s_c_extent,
 			  enum bch_data_type);
+int bch2_check_mark_super_devlist(struct bch_fs *, struct bch_devs_list *,
+				  enum bch_data_type);
 
 struct replicas_status {
 	struct {
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index c343d9f29ed9..59245b24cbcc 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -992,6 +992,9 @@ static void __bch2_dev_offline(struct bch_dev *ca)
 
 	lockdep_assert_held(&c->state_lock);
 
+	if (percpu_ref_is_zero(&ca->io_ref))
+		return;
+
 	__bch2_dev_read_only(c, ca);
 
 	reinit_completion(&ca->io_ref_completion);
@@ -1169,6 +1172,8 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
 		return -EINVAL;
 	}
 
+	BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
+
 	ret = bch2_dev_journal_init(ca, sb->sb);
 	if (ret)
 		return ret;
@@ -1195,7 +1200,7 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
 	if (bch2_dev_sysfs_online(ca))
 		pr_warn("error creating sysfs objects");
 
-	bch2_mark_dev_superblock(c, ca, 0);
+	bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
 
 	if (ca->mi.state == BCH_MEMBER_STATE_RW)
 		bch2_dev_allocator_add(c, ca);
@@ -1398,19 +1403,49 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 	 *
 	 * flag_data_bad() does not check btree pointers
 	 */
-	ret = bch2_flag_data_bad(ca);
+	ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
+	if (ret) {
+		bch_err(ca, "Remove failed: error %i dropping data", ret);
+		goto err;
+	}
+
+	ret = bch2_journal_flush_device(&c->journal, ca->dev_idx);
 	if (ret) {
-		bch_err(ca, "Remove failed");
+		bch_err(ca, "Remove failed: error %i flushing journal", ret);
 		goto err;
 	}
 
 	data = bch2_dev_has_data(c, ca);
 	if (data) {
-		bch_err(ca, "Remove failed, still has data (%x)", data);
+		char data_has_str[100];
+		bch2_scnprint_flag_list(data_has_str,
+					sizeof(data_has_str),
+					bch2_data_types,
+					data);
+		bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
+		ret = -EBUSY;
 		goto err;
 	}
 
-	bch2_journal_meta(&c->journal);
+	ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC,
+				      POS(ca->dev_idx, 0),
+				      POS(ca->dev_idx + 1, 0),
+				      ZERO_VERSION,
+				      NULL, NULL, NULL);
+	if (ret) {
+		bch_err(ca, "Remove failed, error deleting alloc info");
+		goto err;
+	}
+
+	/*
+	 * must flush all existing journal entries, they might have
+	 * (overwritten) keys that point to the device we're removing:
+	 */
+	ret = bch2_journal_flush_all_pins(&c->journal);
+	if (ret) {
+		bch_err(ca, "Remove failed, journal error");
+		goto err;
+	}
 
 	__bch2_dev_offline(ca);
 
@@ -1605,7 +1640,6 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
 		return -EINVAL;
 	}
 
-	__bch2_dev_read_only(c, ca);
 	__bch2_dev_offline(ca);
 
 	mutex_unlock(&c->state_lock);
@@ -1615,37 +1649,31 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
+/*
+ * Migrate data off device @ca with bch2_dev_data_migrate().
+ *
+ * After this patch c->state_lock is held from entry until the single exit at
+ * the err label, instead of being dropped before the migration starts.
+ *
+ * Returns 0 on success, -EINVAL if the member state is BCH_MEMBER_STATE_RW or
+ * if bch2_dev_has_data() still reports data afterwards, otherwise the error
+ * from bch2_dev_data_migrate().
+ */
 int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
 {
 	unsigned data;
-	int ret;
+	int ret = 0;
 
 	mutex_lock(&c->state_lock);
 
 	if (ca->mi.state == BCH_MEMBER_STATE_RW) {
 		bch_err(ca, "Cannot migrate data off RW device");
-		mutex_unlock(&c->state_lock);
-		return -EINVAL;
+		ret = -EINVAL;
+		goto err;
 	}
 
-	mutex_unlock(&c->state_lock);
-
-	ret = bch2_move_data_off_device(ca);
+	ret = bch2_dev_data_migrate(c, ca, 0);
 	if (ret) {
 		bch_err(ca, "Error migrating data: %i", ret);
-		return ret;
-	}
-
-	ret = bch2_move_metadata_off_device(ca);
-	if (ret) {
-		bch_err(ca, "Error migrating metadata: %i", ret);
-		return ret;
+		goto err;
 	}
 
 	data = bch2_dev_has_data(c, ca);
 	if (data) {
+		/* the non-zero value from bch2_dev_has_data() is logged as raw hex */
 		bch_err(ca, "Migrate error: data still present (%x)", data);
-		return -EINVAL;
+		ret = -EINVAL;
+		goto err;
 	}
-
-	return 0;
+	/* every path, success included, releases state_lock here */
+err:
+	mutex_unlock(&c->state_lock);
+	return ret;
 }
 
 /* Filesystem open: */
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index b3c0ef50a4ff..7ebe5981bf45 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -59,6 +59,14 @@ static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
 		}
 }
 
+/*
+ * Append @dev to @devs.
+ *
+ * BUGs if @dev is already in the list, or if the list already holds
+ * BCH_REPLICAS_MAX entries.
+ */
+static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
+					 unsigned dev)
+{
+	BUG_ON(bch2_dev_list_has_dev(*devs, dev));
+	BUG_ON(devs->nr >= BCH_REPLICAS_MAX);
+
+	/* store into the first free slot, then account for it */
+	devs->devs[devs->nr] = dev;
+	devs->nr++;
+}
+
 static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
 					      struct bch_devs_mask *mask)
 {
author	Kent Overstreet <kent.overstreet@gmail.com>	2017-05-20 20:51:40 -0800
committer	Kent Overstreet <kent.overstreet@gmail.com>	2018-05-22 00:44:18 -0400
commit	e4b4227e9969849d181881463da29b9f3cc373fd (patch)
tree	b9892d4f9d51cb95d5347bed4a7e400de0570f4b
parent	6e4c78da70c84cceb94532cc9886577507ae565f (diff)