summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Kent Overstreet <kent.overstreet@gmail.com> 2017-05-20 20:51:40 -0800
committer Kent Overstreet <kent.overstreet@gmail.com> 2018-05-22 00:44:18 -0400
commit e4b4227e9969849d181881463da29b9f3cc373fd (patch)
tree b9892d4f9d51cb95d5347bed4a7e400de0570f4b
parent 6e4c78da70c84cceb94532cc9886577507ae565f (diff)
bcachefs: Device removal work
-rw-r--r-- fs/bcachefs/btree_gc.c 3
-rw-r--r-- fs/bcachefs/btree_io.c 71
-rw-r--r-- fs/bcachefs/btree_iter.c 19
-rw-r--r-- fs/bcachefs/btree_locking.h 1
-rw-r--r-- fs/bcachefs/btree_types.h 2
-rw-r--r-- fs/bcachefs/btree_update.h 4
-rw-r--r-- fs/bcachefs/btree_update_interior.c 177
-rw-r--r-- fs/bcachefs/extents.c 29
-rw-r--r-- fs/bcachefs/extents.h 1
-rw-r--r-- fs/bcachefs/io.c 4
-rw-r--r-- fs/bcachefs/io.h 2
-rw-r--r-- fs/bcachefs/io_types.h 3
-rw-r--r-- fs/bcachefs/journal.c 69
-rw-r--r-- fs/bcachefs/journal.h 3
-rw-r--r-- fs/bcachefs/journal_types.h 1
-rw-r--r-- fs/bcachefs/migrate.c 197
-rw-r--r-- fs/bcachefs/migrate.h 5
-rw-r--r-- fs/bcachefs/super-io.c 49
-rw-r--r-- fs/bcachefs/super-io.h 2
-rw-r--r-- fs/bcachefs/super.c 72
-rw-r--r-- fs/bcachefs/super.h 8
21 files changed, 485 insertions, 237 deletions
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index ff6273737916..2294cc3adeca 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -159,7 +159,8 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
(!c->opts.nofsck &&
fsck_err_on(!bch2_sb_has_replicas(c, e, data_type), c,
- "superblock not marked as containing replicas"))) {
+ "superblock not marked as containing replicas (type %u)",
+ data_type))) {
ret = bch2_check_mark_super(c, e, data_type);
if (ret)
return ret;
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 34cf17680d6a..96484ea206ce 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1437,35 +1437,57 @@ static void bch2_btree_node_write_error(struct bch_fs *c,
struct closure *cl = wbio->cl;
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
struct bkey_i_extent *new_key;
+ struct bkey_s_extent e;
+ struct bch_extent_ptr *ptr;
+ struct btree_iter iter;
+ int ret;
- six_lock_read(&b->lock);
- bkey_copy(&tmp.k, &b->key);
- six_unlock_read(&b->lock);
+ __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
+ BTREE_MAX_DEPTH,
+ b->level, 0);
+retry:
+ ret = bch2_btree_iter_traverse(&iter);
+ if (ret)
+ goto err;
- if (!bkey_extent_is_data(&tmp.k.k) || !PTR_HASH(&tmp.k)) {
- /* Node has been freed: */
+ /* has node been freed? */
+ if (iter.nodes[b->level] != b) {
+ /* node has been freed: */
+ if (!btree_node_dying(b))
+ panic("foo4\n");
goto out;
}
- new_key = bkey_i_to_extent(&tmp.k);
+ if (!btree_node_hashed(b))
+ panic("foo5\n");
- while (wbio->replicas_failed) {
- unsigned idx = __fls(wbio->replicas_failed);
+ bkey_copy(&tmp.k, &b->key);
- bch2_extent_drop_ptr_idx(extent_i_to_s(new_key), idx);
- wbio->replicas_failed ^= 1 << idx;
- }
+ new_key = bkey_i_to_extent(&tmp.k);
+ e = extent_i_to_s(new_key);
+ extent_for_each_ptr_backwards(e, ptr)
+ if (bch2_dev_list_has_dev(wbio->failed, ptr->dev))
+ bch2_extent_drop_ptr(e, ptr);
- if (!bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)) ||
- bch2_btree_node_update_key(c, b, new_key)) {
- set_btree_node_noevict(b);
- bch2_fs_fatal_error(c, "fatal error writing btree node");
- }
+ if (!bch2_extent_nr_ptrs(e.c))
+ goto err;
+
+ ret = bch2_btree_node_update_key(c, &iter, b, new_key);
+ if (ret == -EINTR)
+ goto retry;
+ if (ret)
+ goto err;
out:
+ bch2_btree_iter_unlock(&iter);
bio_put(&wbio->bio);
btree_node_write_done(c, b);
if (cl)
closure_put(cl);
+ return;
+err:
+ set_btree_node_noevict(b);
+ bch2_fs_fatal_error(c, "fatal error writing btree node");
+ goto out;
}
void bch2_btree_write_error_work(struct work_struct *work)
@@ -1495,12 +1517,17 @@ static void btree_node_write_endio(struct bio *bio)
struct closure *cl = !wbio->split ? wbio->cl : NULL;
struct bch_fs *c = wbio->c;
struct bch_dev *ca = wbio->ca;
+ unsigned long flags;
bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
- if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
- bch2_meta_write_fault("btree"))
- set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed);
+ if (bio->bi_status == BLK_STS_REMOVED ||
+ bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
+ bch2_meta_write_fault("btree")) {
+ spin_lock_irqsave(&c->btree_write_error_lock, flags);
+ bch2_dev_list_add_dev(&orig->failed, ca->dev_idx);
+ spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+ }
if (wbio->have_io_ref)
percpu_ref_put(&ca->io_ref);
@@ -1516,12 +1543,11 @@ static void btree_node_write_endio(struct bio *bio)
wbio->used_mempool,
wbio->data);
- if (wbio->replicas_failed) {
- unsigned long flags;
-
+ if (wbio->failed.nr) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
bio_list_add(&c->btree_write_error_list, &wbio->bio);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+
queue_work(c->wq, &c->btree_write_error_work);
return;
}
@@ -1732,6 +1758,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
wbio = wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write));
wbio->cl = parent;
+ wbio->failed.nr = 0;
wbio->order = order;
wbio->used_mempool = used_mempool;
wbio->data = data;
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 819b8efc5fd8..0b505a738e86 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -108,6 +108,17 @@ success:
return true;
}
+/*
+ * Retake the node locks @iter is supposed to hold: starting at iter->level,
+ * relock every level below iter->locks_want that has a node attached.
+ *
+ * Returns false as soon as one bch2_btree_node_relock() fails, true if none
+ * did.
+ */
+bool bch2_btree_iter_relock(struct btree_iter *iter)
+{
+	unsigned level = iter->level;
+
+	while (level < iter->locks_want && iter->nodes[level]) {
+		if (!bch2_btree_node_relock(iter, level))
+			return false;
+		level++;
+	}
+
+	return true;
+}
+
/* Slowpath: */
bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
unsigned level,
@@ -214,7 +225,6 @@ bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
unsigned new_locks_want)
{
struct btree_iter *linked;
- unsigned l;
/* Drop locks we don't want anymore: */
if (new_locks_want < iter->locks_want)
@@ -228,12 +238,9 @@ bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
iter->locks_want = new_locks_want;
btree_iter_drop_extra_locks(iter);
- for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
- if (!bch2_btree_node_relock(iter, l))
- goto fail;
+ if (bch2_btree_iter_relock(iter))
+ return true;
- return true;
-fail:
/*
* Just an optimization: ancestor nodes must be locked before child
* nodes, so set locks_want on iterators that might lock ancestors
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index a000306228fa..acfe5b59df56 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -111,6 +111,7 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos,
}
bool bch2_btree_node_relock(struct btree_iter *, unsigned);
+bool bch2_btree_iter_relock(struct btree_iter *);
void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
void bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index f1e06a378c9a..f0e6896a8a5e 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -196,6 +196,7 @@ enum btree_flags {
BTREE_NODE_accessed,
BTREE_NODE_write_in_flight,
BTREE_NODE_just_written,
+ BTREE_NODE_dying,
};
BTREE_FLAG(read_in_flight);
@@ -207,6 +208,7 @@ BTREE_FLAG(write_idx);
BTREE_FLAG(accessed);
BTREE_FLAG(write_in_flight);
BTREE_FLAG(just_written);
+BTREE_FLAG(dying);
static inline struct btree_write *btree_current_write(struct btree *b)
{
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index e11fcec963ba..c7c2930650d3 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -130,7 +130,7 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
__le64, unsigned);
-int bch2_btree_node_update_key(struct bch_fs *, struct btree *,
- struct bkey_i_extent *);
+int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
+ struct btree *, struct bkey_i_extent *);
#endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 6351e9c2490f..04854532b8b4 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -914,6 +914,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
struct btree_write *w;
struct bset_tree *t;
+ set_btree_node_dying(b);
btree_interior_update_add_node_reference(as, b);
/*
@@ -1028,6 +1029,10 @@ static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
mutex_unlock(&c->btree_cache.lock);
mutex_lock(&c->btree_root_lock);
+ BUG_ON(btree_node_root(c, b) &&
+ (b->level < btree_node_root(c, b)->level ||
+ !btree_node_dying(btree_node_root(c, b))));
+
btree_node_root(c, b) = b;
mutex_unlock(&c->btree_root_lock);
@@ -1790,64 +1795,16 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
return ret;
}
-int bch2_btree_node_update_key(struct bch_fs *c, struct btree *b,
- struct bkey_i_extent *new_key)
+static void __bch2_btree_node_update_key(struct bch_fs *c,
+ struct btree_update *as,
+ struct btree_iter *iter,
+ struct btree *b, struct btree *new_hash,
+ struct bkey_i_extent *new_key)
{
- struct btree_update *as = NULL;
- struct btree *parent, *new_hash = NULL;
- struct btree_iter iter;
- struct closure cl;
+ struct btree *parent;
bool must_rewrite_parent = false;
int ret;
- __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
- BTREE_MAX_DEPTH,
- b->level, 0);
- closure_init_stack(&cl);
-
- ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
- if (ret)
- return ret;
-
-retry:
- down_read(&c->gc_lock);
- ret = bch2_btree_iter_traverse(&iter);
- if (ret)
- goto err;
-
- /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
- if (!new_hash &&
- PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
- /* bch2_btree_reserve_get will unlock */
- do {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
- closure_sync(&cl);
- } while (ret == -EAGAIN);
-
- BUG_ON(ret);
-
- new_hash = bch2_btree_node_mem_alloc(c);
- }
-
- as = bch2_btree_update_start(c, iter.btree_id,
- btree_update_reserve_required(c, b),
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_USE_ALLOC_RESERVE,
- &cl);
- if (IS_ERR(as)) {
- ret = PTR_ERR(as);
- if (ret == -EAGAIN || ret == -EINTR) {
- bch2_btree_iter_unlock(&iter);
- up_read(&c->gc_lock);
- closure_sync(&cl);
- goto retry;
- }
- goto err;
- }
-
- mutex_lock(&c->btree_interior_update_lock);
-
/*
* Two corner cases that need to be thought about here:
*
@@ -1872,22 +1829,12 @@ retry:
if (b->will_make_reachable)
must_rewrite_parent = true;
- /* other case: btree node being freed */
- if (iter.nodes[b->level] != b) {
- /* node has been freed: */
- BUG_ON(btree_node_hashed(b));
- mutex_unlock(&c->btree_interior_update_lock);
- goto err;
- }
-
- mutex_unlock(&c->btree_interior_update_lock);
-
if (must_rewrite_parent)
as->flags |= BTREE_INTERIOR_UPDATE_MUST_REWRITE;
btree_interior_update_add_node_reference(as, b);
- parent = iter.nodes[b->level + 1];
+ parent = iter->nodes[b->level + 1];
if (parent) {
if (new_hash) {
bkey_copy(&new_hash->key, &new_key->k_i);
@@ -1896,8 +1843,8 @@ retry:
BUG_ON(ret);
}
- bch2_btree_insert_node(as, parent, &iter,
- &keylist_single(&new_key->k_i));
+ bch2_keylist_add(&as->parent_keys, &new_key->k_i);
+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
@@ -1917,7 +1864,7 @@ retry:
BUG_ON(btree_node_root(c, b) != b);
- bch2_btree_node_lock_write(b, &iter);
+ bch2_btree_node_lock_write(b, iter);
bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i),
c->opts.btree_node_size, true,
@@ -1928,14 +1875,94 @@ retry:
&stats);
bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
gc_pos_btree_root(b->btree_id));
- bkey_copy(&b->key, &new_key->k_i);
+
+ if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ bkey_copy(&b->key, &new_key->k_i);
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+ BUG_ON(ret);
+ mutex_unlock(&c->btree_cache.lock);
+ } else {
+ bkey_copy(&b->key, &new_key->k_i);
+ }
btree_update_updated_root(as);
- bch2_btree_node_unlock_write(b, &iter);
+ bch2_btree_node_unlock_write(b, iter);
}
bch2_btree_update_done(as);
-out:
+}
+
+/*
+ * bch2_btree_node_update_key - replace the key (pointers) of btree node @b
+ * with @new_key.
+ *
+ * The callers in this patch hand in an iterator that already points at @b.
+ * c->gc_lock is held for read while the update runs.  Whenever this function
+ * has to sleep (gc_lock contention, waiting on @cl) it drops the iterator's
+ * locks first and relocks afterwards; if relocking fails it returns -EINTR and
+ * the caller is expected to re-traverse @iter and call again.
+ *
+ * Returns 0 on success, -EINTR as described above, or another negative error
+ * from bch2_btree_update_start() or bch2_check_mark_super().
+ */
+int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
+			       struct btree *b, struct bkey_i_extent *new_key)
+{
+	struct btree_update *as = NULL;
+	struct btree *new_hash = NULL;
+	struct closure cl;
+	int ret;
+
+	closure_init_stack(&cl);
+
+	/* Never block on gc_lock while the iterator still holds node locks: */
+	if (!down_read_trylock(&c->gc_lock)) {
+		bch2_btree_iter_unlock(iter);
+		down_read(&c->gc_lock);
+
+		if (!bch2_btree_iter_relock(iter)) {
+			ret = -EINTR;
+			goto err;
+		}
+	}
+
+	/* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
+	if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+		/* bch2_btree_reserve_get will unlock */
+		ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+		if (ret) {
+			ret = -EINTR;
+
+			bch2_btree_iter_unlock(iter);
+			up_read(&c->gc_lock);
+			closure_sync(&cl);
+			down_read(&c->gc_lock);
+
+			if (!bch2_btree_iter_relock(iter))
+				goto err;
+		}
+
+		new_hash = bch2_btree_node_mem_alloc(c);
+	}
+
+retry:
+	as = bch2_btree_update_start(c, iter->btree_id,
+				     btree_update_reserve_required(c, b),
+				     BTREE_INSERT_NOFAIL|
+				     BTREE_INSERT_USE_RESERVE|
+				     BTREE_INSERT_USE_ALLOC_RESERVE,
+				     &cl);
+	if (IS_ERR(as)) {
+		ret = PTR_ERR(as);
+		if (ret == -EAGAIN)
+			ret = -EINTR;
+
+		if (ret != -EINTR)
+			goto err;
+
+		bch2_btree_iter_unlock(iter);
+		up_read(&c->gc_lock);
+		closure_sync(&cl);
+		down_read(&c->gc_lock);
+
+		if (!bch2_btree_iter_relock(iter))
+			goto err;
+
+		/*
+		 * @as is still an ERR_PTR() at this point, so it must not fall
+		 * through to bch2_check_mark_super() and
+		 * __bch2_btree_node_update_key() below, which would dereference
+		 * it.  We have waited on @cl and got our locks back, so ask for
+		 * the update again:
+		 */
+		goto retry;
+	}
+
+	ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
+	if (ret)
+		goto err_free_update;
+
+	__bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
+err:
 	if (new_hash) {
 		mutex_lock(&c->btree_cache.lock);
 		list_move(&new_hash->list, &c->btree_cache.freeable);
@@ -1944,14 +1971,12 @@ out:
 		six_unlock_write(&new_hash->lock);
 		six_unlock_intent(&new_hash->lock);
 	}
-	bch2_btree_iter_unlock(&iter);
 	up_read(&c->gc_lock);
 	closure_sync(&cl);
 	return ret;
-err:
-	if (as)
-		bch2_btree_update_free(as);
-	goto out;
+err_free_update:
+	bch2_btree_update_free(as);
+	goto err;
 }
/* Init code: */
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 985f980c95d0..176978ca2231 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -157,6 +157,19 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
return nr_ptrs;
}
+/*
+ * Count the pointers in @e that are neither cached nor on a device whose
+ * member state is BCH_MEMBER_STATE_FAILED.
+ */
+unsigned bch2_extent_nr_good_ptrs(struct bch_fs *c, struct bkey_s_c_extent e)
+{
+	const struct bch_extent_ptr *ptr;
+	unsigned nr_good = 0;
+
+	extent_for_each_ptr(e, ptr)
+		if (!ptr->cached &&
+		    bch_dev_bkey_exists(c, ptr->dev)->mi.state !=
+		    BCH_MEMBER_STATE_FAILED)
+			nr_good++;
+
+	return nr_good;
+}
+}
+
unsigned bch2_extent_is_compressed(struct bkey_s_c k)
{
struct bkey_s_c_extent e;
@@ -435,7 +448,8 @@ static const char *extent_ptr_invalid(const struct bch_fs *c,
const struct bch_extent_ptr *ptr2;
struct bch_dev *ca;
- if (ptr->dev >= c->sb.nr_devices)
+ if (ptr->dev >= c->sb.nr_devices ||
+ !c->devs[ptr->dev])
return "pointer to invalid device";
ca = bch_dev_bkey_exists(c, ptr->dev);
@@ -490,7 +504,9 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
break;
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
- ca = bch_dev_bkey_exists(c, ptr->dev);
+ ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+ ? bch_dev_bkey_exists(c, ptr->dev)
+ : NULL;
p("ptr: %u:%llu gen %u%s", ptr->dev,
(u64) ptr->offset, ptr->gen,
@@ -1974,15 +1990,10 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
struct bkey_s_extent e)
{
struct bch_extent_ptr *ptr;
- unsigned tier = 0, nr_cached = 0, nr_good = 0;
+ unsigned tier = 0, nr_cached = 0;
+ unsigned nr_good = bch2_extent_nr_good_ptrs(c, e.c);
bool have_higher_tier;
- extent_for_each_ptr(e, ptr)
- if (!ptr->cached &&
- bch_dev_bkey_exists(c, ptr->dev)->mi.state !=
- BCH_MEMBER_STATE_FAILED)
- nr_good++;
-
if (nr_good <= c->opts.data_replicas)
return;
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index ff4ce2af16e0..ab7993abbddf 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -45,6 +45,7 @@ bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
+unsigned bch2_extent_nr_good_ptrs(struct bch_fs *, struct bkey_s_c_extent);
unsigned bch2_extent_is_compressed(struct bkey_s_c);
bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index e465533563f9..744bff0f42bb 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -140,7 +140,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
const struct bch_extent_ptr *ptr;
struct bch_write_bio *n;
struct bch_dev *ca;
- unsigned ptr_idx = 0;
BUG_ON(c->opts.nochanges);
@@ -169,7 +168,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
n->c = c;
n->ca = ca;
- n->ptr_idx = ptr_idx++;
n->submit_time_us = local_clock_us();
n->bio.bi_iter.bi_sector = ptr->offset;
@@ -185,7 +183,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
submit_bio(&n->bio);
} else {
n->have_io_ref = false;
- bcache_io_error(c, &n->bio, "device has been removed");
+ n->bio.bi_status = BLK_STS_REMOVED;
bio_endio(&n->bio);
}
}
diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h
index b3a23e821097..0c145eb67317 100644
--- a/fs/bcachefs/io.h
+++ b/fs/bcachefs/io.h
@@ -21,6 +21,8 @@ void bch2_latency_acct(struct bch_dev *, unsigned, int);
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *);
+/*
+ * Status that bch2_submit_wbio_replicas() puts on a replica's bio when it
+ * completes the bio without submitting it (the branch that clears
+ * have_io_ref); btree_node_write_endio() treats it as a failed write to that
+ * device.
+ * NOTE(review): 128 is presumably chosen to stay clear of the block layer's
+ * own BLK_STS_* values - confirm.
+ */
+#define BLK_STS_REMOVED ((__force blk_status_t)128)
+
enum bch_write_flags {
BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
BCH_WRITE_CACHED = (1 << 1),
diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h
index 5b0d7aae97c5..ff18fdc90eb7 100644
--- a/fs/bcachefs/io_types.h
+++ b/fs/bcachefs/io_types.h
@@ -72,8 +72,7 @@ struct bch_write_bio {
struct closure *cl;
};
- u8 ptr_idx;
- u8 replicas_failed;
+ struct bch_devs_list failed;
u8 order;
unsigned split:1,
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 55e5d21bd3ce..30e80409962f 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -338,8 +338,8 @@ struct journal_list {
* Given a journal entry we just read, add it to the list of journal entries to
* be replayed:
*/
-static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
- struct jset *j)
+static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
+ struct journal_list *jlist, struct jset *j)
{
struct journal_replay *i, *pos;
struct list_head *where;
@@ -347,8 +347,6 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
__le64 last_seq;
int ret;
- mutex_lock(&jlist->lock);
-
last_seq = !list_empty(jlist->head)
? list_last_entry(jlist->head, struct journal_replay,
list)->j.last_seq
@@ -376,9 +374,7 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
memcmp(j, &i->j, bytes), c,
"found duplicate but non identical journal entries (seq %llu)",
le64_to_cpu(j->seq));
-
- ret = JOURNAL_ENTRY_ADD_OK;
- goto out;
+ goto found;
}
if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
@@ -395,12 +391,16 @@ add:
goto out;
}
- memcpy(&i->j, j, bytes);
list_add(&i->list, where);
+ i->devs.nr = 0;
+ memcpy(&i->j, j, bytes);
+found:
+ if (!fsck_err_on(bch2_dev_list_has_dev(i->devs, ca->dev_idx),
+ c, "duplicate journal entries on same device"))
+ bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
ret = JOURNAL_ENTRY_ADD_OK;
out:
fsck_err:
- mutex_unlock(&jlist->lock);
return ret;
}
@@ -722,7 +722,10 @@ reread: sectors_read = min_t(unsigned,
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
- ret = journal_entry_add(c, jlist, j);
+ mutex_lock(&jlist->lock);
+ ret = journal_entry_add(c, ca, jlist, j);
+ mutex_unlock(&jlist->lock);
+
switch (ret) {
case JOURNAL_ENTRY_ADD_OK:
*entries_found = true;
@@ -1011,6 +1014,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, 0);
+ p->devs.nr = 0;
}
mutex_lock(&j->blacklist_lock);
@@ -1019,6 +1023,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
atomic_set(&p->count, 1);
+ p->devs = i->devs;
if (journal_seq_blacklist_read(j, i, p)) {
mutex_unlock(&j->blacklist_lock);
@@ -1131,6 +1136,7 @@ static void __journal_entry_new(struct journal *j, int count)
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, count);
+ p->devs.nr = 0;
}
static void __bch2_journal_next_entry(struct journal *j)
@@ -2303,6 +2309,9 @@ static void journal_write(struct closure *cl)
BCH_DATA_JOURNAL))
goto err;
+ journal_seq_pin(j, le64_to_cpu(jset->seq))->devs =
+ bch2_extent_devs(bkey_i_to_s_c_extent(&j->key));
+
/*
* XXX: we really should just disable the entire journal in nochanges
* mode
@@ -2720,6 +2729,46 @@ int bch2_journal_flush(struct journal *j)
return bch2_journal_flush_seq(j, seq);
}
+/*
+ * bch2_journal_flush_device - get journal entries that live on @dev_idx
+ * flushed, then re-mark the devices of the remaining journal entries in the
+ * superblock.
+ *
+ * Step 1 finds a pin list that names @dev_idx and hands its sequence number
+ * to bch2_journal_flush_pins().  Step 2 runs inside a
+ * bch2_replicas_gc_start()/bch2_replicas_gc_end() section for
+ * BCH_DATA_JOURNAL and marks the device list of each pin list from
+ * last_seq(j) up to, but not including, j->seq.
+ *
+ * Returns 0, or the first nonzero result of bch2_check_mark_super_devlist().
+ */
+int bch2_journal_flush_device(struct journal *j, unsigned dev_idx)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_entry_pin_list *p;
+ struct bch_devs_list devs;
+ u64 seq = 0;
+ unsigned iter;
+ int ret = 0;
+
+ /*
+ * Each match overwrites seq, so we end up with the seq of the last pin
+ * list visited that lists @dev_idx (0 if there is none):
+ */
+ spin_lock(&j->lock);
+ fifo_for_each_entry_ptr(p, &j->pin, iter)
+ if (bch2_dev_list_has_dev(p->devs, dev_idx))
+ seq = journal_pin_seq(j, p);
+ spin_unlock(&j->lock);
+
+ bch2_journal_flush_pins(j, seq);
+
+ mutex_lock(&c->replicas_gc_lock);
+ bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
+
+ seq = 0;
+
+ /*
+ * The device list is copied out under j->lock and the lock is dropped
+ * around bch2_check_mark_super_devlist() (NOTE(review): presumably
+ * because marking the superblock may sleep - confirm).  last_seq(j) is
+ * re-read on every pass, so seq jumps forward if it moved while the
+ * lock was released.
+ */
+ spin_lock(&j->lock);
+ while (!ret && seq < atomic64_read(&j->seq)) {
+ seq = max(seq, last_seq(j));
+ devs = journal_seq_pin(j, seq)->devs;
+ seq++;
+
+ spin_unlock(&j->lock);
+ ret = bch2_check_mark_super_devlist(c, &devs, BCH_DATA_JOURNAL);
+ spin_lock(&j->lock);
+ }
+ spin_unlock(&j->lock);
+
+ bch2_replicas_gc_end(c, ret);
+ mutex_unlock(&c->replicas_gc_lock);
+
+ return ret;
+}
+
ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index e6532f2f6100..5f3ece089937 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -118,6 +118,8 @@
*/
struct journal_replay {
struct list_head list;
+ /* devices this entry was found on; filled in by journal_entry_add(): */
+ struct bch_devs_list devs;
+ /* must be last: */
struct jset j;
};
@@ -357,6 +359,7 @@ void bch2_journal_meta_async(struct journal *, struct closure *);
int bch2_journal_flush_seq(struct journal *, u64);
int bch2_journal_flush(struct journal *);
int bch2_journal_meta(struct journal *);
+int bch2_journal_flush_device(struct journal *, unsigned);
void bch2_journal_halt(struct journal *);
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 55b41c56a3f2..87f378a6ac4f 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -34,6 +34,7 @@ struct journal_entry_pin_list {
struct list_head list;
struct list_head flushed;
atomic_t count;
+ struct bch_devs_list devs;
};
struct journal;
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 8d1c0ee07c24..e11ee9532483 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -27,23 +27,9 @@ static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
#define MAX_DATA_OFF_ITER 10
-/*
- * This moves only the data off, leaving the meta-data (if any) in place.
- * It walks the key space, and for any key with a valid pointer to the
- * relevant device, it copies it elsewhere, updating the key to point to
- * the copy.
- * The meta-data is moved off by bch_move_meta_data_off_device.
- *
- * Note: If the number of data replicas desired is > 1, ideally, any
- * new copies would not be made in the same device that already have a
- * copy (if there are enough devices).
- * This is _not_ currently implemented. The multiple replicas can
- * land in the same device even if there are others available.
- */
-
-int bch2_move_data_off_device(struct bch_dev *ca)
+static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca,
+ int flags)
{
- struct bch_fs *c = ca->fs;
struct btree_iter iter;
struct bkey_s_c k;
u64 keys_moved, sectors_moved;
@@ -113,10 +99,6 @@ int bch2_move_data_off_device(struct bch_dev *ca)
return ret;
}
-/*
- * This walks the btree, and for any node on the relevant device it moves the
- * node elsewhere.
- */
static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
enum btree_id id)
{
@@ -200,9 +182,9 @@ static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
* is written.
*/
-int bch2_move_metadata_off_device(struct bch_dev *ca)
+static int bch2_dev_metadata_migrate(struct bch_fs *c, struct bch_dev *ca,
+ int flags)
{
- struct bch_fs *c = ca->fs;
unsigned i;
int ret = 0;
@@ -240,37 +222,31 @@ err:
return ret;
}
-/*
- * Flagging data bad when forcibly removing a device after failing to
- * migrate the data off the device.
- */
+/*
+ * Run the user data migration for @ca and, if that returned 0, the metadata
+ * migration.  Returns the first nonzero result, else 0.
+ */
+int bch2_dev_data_migrate(struct bch_fs *c, struct bch_dev *ca, int flags)
+{
+	int ret = bch2_dev_usrdata_migrate(c, ca, flags);
+
+	if (ret)
+		return ret;
+
+	return bch2_dev_metadata_migrate(c, ca, flags);
+}
-static int bch2_flag_key_bad(struct btree_iter *iter,
- struct bch_dev *ca,
- struct bkey_s_c_extent orig)
+/*
+ * Drop every pointer to device @dev_idx from extent @e (modified in place),
+ * then check that what is left is acceptable under @flags.
+ *
+ * @metadata selects which replica count and which BCH_FORCE_IF_* bits apply:
+ * the metadata ones when true, the user data ones when false.
+ *
+ * Returns 0 if the remaining pointers are acceptable, -EINVAL if
+ *  - no good pointers remain and the *_LOST bit is not set in @flags, or
+ *  - fewer good pointers than the configured replica count remain and the
+ *    *_DEGRADED bit is not set in @flags.
+ */
+static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
+ unsigned dev_idx, int flags, bool metadata)
{
- BKEY_PADDED(key) tmp;
- struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
- struct bch_fs *c = ca->fs;
-
- bkey_reassemble(&tmp.key, orig.s_c);
- e = bkey_i_to_s_extent(&tmp.key);
+ unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
+ unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
+ unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
+ unsigned nr_good;
extent_for_each_ptr_backwards(e, ptr)
- if (ptr->dev == ca->dev_idx)
+ if (ptr->dev == dev_idx)
bch2_extent_drop_ptr(e, ptr);
- /*
- * If the new extent no longer has any pointers, bch2_extent_normalize()
- * will do the appropriate thing with it (turning it into a
- * KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
- */
- bch2_extent_normalize(c, e.s);
+ /* "good" = not cached and not on a device in BCH_MEMBER_STATE_FAILED: */
+ nr_good = bch2_extent_nr_good_ptrs(c, e.c);
+ if ((!nr_good && !(flags & lost)) ||
+ (nr_good < replicas && !(flags & degraded)))
+ return -EINVAL;
- return bch2_btree_insert_at(c, NULL, NULL, NULL,
- BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(iter, &tmp.key));
+ return 0;
}
/*
@@ -284,11 +260,11 @@ static int bch2_flag_key_bad(struct btree_iter *iter,
* that we've already tried to move the data MAX_DATA_OFF_ITER times and
* are not likely to succeed if we try again.
*/
-int bch2_flag_data_bad(struct bch_dev *ca)
+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
- struct bch_fs *c = ca->fs;
struct bkey_s_c k;
- struct bkey_s_c_extent e;
+ struct bkey_s_extent e;
+ BKEY_PADDED(key) tmp;
struct btree_iter iter;
int ret = 0;
@@ -303,11 +279,33 @@ int bch2_flag_data_bad(struct bch_dev *ca)
if (!bkey_extent_is_data(k.k))
goto advance;
- e = bkey_s_c_to_extent(k);
- if (!bch2_extent_has_device(e, ca->dev_idx))
+ if (!bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx))
goto advance;
- ret = bch2_flag_key_bad(&iter, ca, e);
+ bkey_reassemble(&tmp.key, k);
+ e = bkey_i_to_s_extent(&tmp.key);
+
+ ret = drop_dev_ptrs(c, e, dev_idx, flags, false);
+ if (ret)
+ break;
+
+ /*
+ * If the new extent no longer has any pointers, bch2_extent_normalize()
+ * will do the appropriate thing with it (turning it into a
+ * KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
+ */
+ bch2_extent_normalize(c, e.s);
+
+ if (bkey_extent_is_data(e.k) &&
+ (ret = bch2_check_mark_super(c, e.c, BCH_DATA_USER)))
+ break;
+
+ iter.pos = bkey_start_pos(&tmp.key.k);
+
+ ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL,
+ BTREE_INSERT_ENTRY(&iter, &tmp.key));
/*
* don't want to leave ret == -EINTR, since if we raced and
@@ -319,26 +317,6 @@ int bch2_flag_data_bad(struct bch_dev *ca)
if (ret)
break;
- /*
- * If the replica we're dropping was dirty and there is an
- * additional cached replica, the cached replica will now be
- * considered dirty - upon inserting the new version of the key,
- * the bucket accounting will be updated to reflect the fact
- * that the cached data is now dirty and everything works out as
- * if by magic without us having to do anything.
- *
- * The one thing we need to be concerned with here is there's a
- * race between when we drop any stale pointers from the key
- * we're about to insert, and when the key actually gets
- * inserted and the cached data is marked as dirty - we could
- * end up trying to insert a key with a pointer that should be
- * dirty, but points to stale data.
- *
- * If that happens the insert code just bails out and doesn't do
- * the insert - however, it doesn't return an error. Hence we
- * need to always recheck the current key before advancing to
- * the next:
- */
continue;
advance:
if (bkey_extent_is_data(k.k)) {
@@ -357,3 +335,80 @@ advance:
return ret;
}
+
+/*
+ * Walk every node of every btree and drop its pointers to device @dev_idx.
+ *
+ * Nodes whose key does not reference @dev_idx only have their replicas
+ * re-marked in the superblock.  For the others, a copy of the key with the
+ * pointers dropped (see drop_dev_ptrs()) is installed with
+ * bch2_btree_node_update_key().  The whole walk runs inside a
+ * bch2_replicas_gc_start()/bch2_replicas_gc_end() section for BCH_DATA_BTREE.
+ *
+ * Returns 0 on success, -EINVAL if @flags contains BCH_FORCE_IF_METADATA_LOST
+ * (not handled yet) or if drop_dev_ptrs() rejects a key, or the error from
+ * bch2_check_mark_super() / bch2_btree_node_update_key().
+ */
+static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+ struct btree_iter iter;
+ /* NOTE(review): cl is initialised below but not used anywhere else here */
+ struct closure cl;
+ struct btree *b;
+ unsigned id;
+ int ret;
+
+ /* don't handle this yet: */
+ if (flags & BCH_FORCE_IF_METADATA_LOST)
+ return -EINVAL;
+
+ closure_init_stack(&cl);
+
+ mutex_lock(&c->replicas_gc_lock);
+ bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
+
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+ struct bkey_i_extent *new_key;
+ /* we come back here with a freshly peeked @b after losing our locks: */
+retry:
+ if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key),
+ dev_idx)) {
+ bch2_btree_iter_set_locks_want(&iter, 0);
+
+ ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key),
+ BCH_DATA_BTREE);
+ if (ret)
+ goto err;
+ } else {
+ /* work on a copy; b->key itself is changed by the update below */
+ bkey_copy(&tmp.k, &b->key);
+ new_key = bkey_i_to_extent(&tmp.k);
+
+ ret = drop_dev_ptrs(c, extent_i_to_s(new_key),
+ dev_idx, flags, true);
+ if (ret)
+ goto err;
+
+ if (!bch2_btree_iter_set_locks_want(&iter, U8_MAX)) {
+ b = bch2_btree_iter_peek_node(&iter);
+ goto retry;
+ }
+
+ ret = bch2_btree_node_update_key(c, &iter, b, new_key);
+ if (ret == -EINTR) {
+ b = bch2_btree_iter_peek_node(&iter);
+ goto retry;
+ }
+ if (ret)
+ goto err;
+ }
+ }
+ bch2_btree_iter_unlock(&iter);
+
+ /* btree root */
+ /* NOTE(review): nothing is done between this lock/unlock pair yet */
+ mutex_lock(&c->btree_root_lock);
+ mutex_unlock(&c->btree_root_lock);
+ }
+
+ ret = 0;
+out:
+ bch2_replicas_gc_end(c, ret);
+ mutex_unlock(&c->replicas_gc_lock);
+
+ return ret;
+err:
+ bch2_btree_iter_unlock(&iter);
+ goto out;
+}
+
+/*
+ * Drop pointers to device @dev_idx from user data and, if that returned 0,
+ * from btree nodes.  Returns the first nonzero result, else 0.
+ */
+int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+	int ret = bch2_dev_usrdata_drop(c, dev_idx, flags);
+
+	if (ret)
+		return ret;
+
+	return bch2_dev_metadata_drop(c, dev_idx, flags);
+}
diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h
index 9bdaa79290a1..6db7b9111bf2 100644
--- a/fs/bcachefs/migrate.h
+++ b/fs/bcachefs/migrate.h
@@ -1,8 +1,7 @@
#ifndef _BCACHEFS_MIGRATE_H
#define _BCACHEFS_MIGRATE_H
-int bch2_move_data_off_device(struct bch_dev *);
-int bch2_move_metadata_off_device(struct bch_dev *);
-int bch2_flag_data_bad(struct bch_dev *);
+int bch2_dev_data_migrate(struct bch_fs *, struct bch_dev *, int);
+int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
#endif /* _BCACHEFS_MIGRATE_H */
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index b2b510c55793..42ce031d1799 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -877,12 +877,13 @@ static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
}
-static void bkey_to_replicas(struct bkey_s_c_extent e,
+static unsigned bkey_to_replicas(struct bkey_s_c_extent e,
enum bch_data_type data_type,
struct bch_replicas_cpu_entry *r,
unsigned *max_dev)
{
const struct bch_extent_ptr *ptr;
+ unsigned nr = 0;
BUG_ON(!data_type ||
data_type == BCH_DATA_SB ||
@@ -897,7 +898,9 @@ static void bkey_to_replicas(struct bkey_s_c_extent e,
if (!ptr->cached) {
*max_dev = max_t(unsigned, *max_dev, ptr->dev);
replicas_set_dev(r, ptr->dev);
+ nr++;
}
+ return nr;
}
static struct bch_replicas_cpu *
@@ -992,16 +995,13 @@ err:
return ret;
}
-int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
- enum bch_data_type data_type)
+static inline int __bch2_check_mark_super(struct bch_fs *c,
+ struct bch_replicas_cpu_entry search,
+ unsigned max_dev)
{
struct bch_replicas_cpu *r, *gc_r;
- struct bch_replicas_cpu_entry search;
- unsigned max_dev;
bool marked;
- bkey_to_replicas(e, data_type, &search, &max_dev);
-
rcu_read_lock();
r = rcu_dereference(c->replicas);
gc_r = rcu_dereference(c->replicas_gc);
@@ -1009,10 +1009,38 @@ int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
(!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
rcu_read_unlock();
- if (likely(marked))
+ return likely(marked) ? 0
+ : bch2_check_mark_super_slowpath(c, search, max_dev);
+}
+
+int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
+ enum bch_data_type data_type)
+{
+ struct bch_replicas_cpu_entry search;
+ unsigned max_dev;
+
+ if (!bkey_to_replicas(e, data_type, &search, &max_dev))
return 0;
- return bch2_check_mark_super_slowpath(c, search, max_dev);
+ return __bch2_check_mark_super(c, search, max_dev);
+}
+
+int bch2_check_mark_super_devlist(struct bch_fs *c,
+ struct bch_devs_list *devs,
+ enum bch_data_type data_type)
+{
+ struct bch_replicas_cpu_entry search = { .data_type = data_type };
+ unsigned i, max_dev = 0;
+
+ if (!devs->nr)
+ return 0;
+
+ for (i = 0; i < devs->nr; i++) {
+ max_dev = max_t(unsigned, max_dev, devs->devs[i]);
+ replicas_set_dev(&search, devs->devs[i]);
+ }
+
+ return __bch2_check_mark_super(c, search, max_dev);
}
int bch2_replicas_gc_end(struct bch_fs *c, int err)
@@ -1292,7 +1320,8 @@ bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
unsigned max_dev;
bool ret;
- bkey_to_replicas(e, data_type, &search, &max_dev);
+ if (!bkey_to_replicas(e, data_type, &search, &max_dev))
+ return true;
rcu_read_lock();
ret = replicas_has_entry(rcu_dereference(c->replicas),
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index 8233763257e4..725d2f1487ec 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -129,6 +129,8 @@ bool bch2_sb_has_replicas(struct bch_fs *, struct bkey_s_c_extent,
enum bch_data_type);
int bch2_check_mark_super(struct bch_fs *, struct bkey_s_c_extent,
enum bch_data_type);
+int bch2_check_mark_super_devlist(struct bch_fs *, struct bch_devs_list *,
+ enum bch_data_type);
struct replicas_status {
struct {
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index c343d9f29ed9..59245b24cbcc 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -992,6 +992,9 @@ static void __bch2_dev_offline(struct bch_dev *ca)
lockdep_assert_held(&c->state_lock);
+ if (percpu_ref_is_zero(&ca->io_ref))
+ return;
+
__bch2_dev_read_only(c, ca);
reinit_completion(&ca->io_ref_completion);
@@ -1169,6 +1172,8 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
return -EINVAL;
}
+ BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
+
ret = bch2_dev_journal_init(ca, sb->sb);
if (ret)
return ret;
@@ -1195,7 +1200,7 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
if (bch2_dev_sysfs_online(ca))
pr_warn("error creating sysfs objects");
- bch2_mark_dev_superblock(c, ca, 0);
+ bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
if (ca->mi.state == BCH_MEMBER_STATE_RW)
bch2_dev_allocator_add(c, ca);
@@ -1398,19 +1403,49 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
*
* flag_data_bad() does not check btree pointers
*/
- ret = bch2_flag_data_bad(ca);
+ ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
+ if (ret) {
+ bch_err(ca, "Remove failed: error %i dropping data", ret);
+ goto err;
+ }
+
+ ret = bch2_journal_flush_device(&c->journal, ca->dev_idx);
if (ret) {
- bch_err(ca, "Remove failed");
+ bch_err(ca, "Remove failed: error %i flushing journal", ret);
goto err;
}
data = bch2_dev_has_data(c, ca);
if (data) {
- bch_err(ca, "Remove failed, still has data (%x)", data);
+ char data_has_str[100];
+ bch2_scnprint_flag_list(data_has_str,
+ sizeof(data_has_str),
+ bch2_data_types,
+ data);
+ bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
+ ret = -EBUSY;
goto err;
}
- bch2_journal_meta(&c->journal);
+ ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC,
+ POS(ca->dev_idx, 0),
+ POS(ca->dev_idx + 1, 0),
+ ZERO_VERSION,
+ NULL, NULL, NULL);
+ if (ret) {
+ bch_err(ca, "Remove failed, error deleting alloc info");
+ goto err;
+ }
+
+ /*
+ * must flush all existing journal entries, they might have
+ * (overwritten) keys that point to the device we're removing:
+ */
+ ret = bch2_journal_flush_all_pins(&c->journal);
+ if (ret) {
+ bch_err(ca, "Remove failed, journal error");
+ goto err;
+ }
__bch2_dev_offline(ca);
@@ -1605,7 +1640,6 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
return -EINVAL;
}
- __bch2_dev_read_only(c, ca);
__bch2_dev_offline(ca);
mutex_unlock(&c->state_lock);
@@ -1615,37 +1649,31 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
{
unsigned data;
- int ret;
+ int ret = 0;
mutex_lock(&c->state_lock);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
bch_err(ca, "Cannot migrate data off RW device");
- mutex_unlock(&c->state_lock);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
- mutex_unlock(&c->state_lock);
-
- ret = bch2_move_data_off_device(ca);
+ ret = bch2_dev_data_migrate(c, ca, 0);
if (ret) {
bch_err(ca, "Error migrating data: %i", ret);
- return ret;
- }
-
- ret = bch2_move_metadata_off_device(ca);
- if (ret) {
- bch_err(ca, "Error migrating metadata: %i", ret);
- return ret;
+ goto err;
}
data = bch2_dev_has_data(c, ca);
if (data) {
bch_err(ca, "Migrate error: data still present (%x)", data);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
-
- return 0;
+err:
+ mutex_unlock(&c->state_lock);
+ return ret;
}
/* Filesystem open: */
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index b3c0ef50a4ff..7ebe5981bf45 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -59,6 +59,14 @@ static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
}
}
+static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
+ unsigned dev)
+{
+ BUG_ON(bch2_dev_list_has_dev(*devs, dev));
+ BUG_ON(devs->nr >= BCH_REPLICAS_MAX);
+ devs->devs[devs->nr++] = dev;
+}
+
static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
struct bch_devs_mask *mask)
{