summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Kent Overstreet <kent.overstreet@gmail.com> 2017-12-24 18:36:00 -0500
committer Kent Overstreet <kent.overstreet@gmail.com> 2018-05-22 00:44:18 -0400
commit c37101792786a50468fe64b911a80953374cfc29 (patch)
tree 1b151988bf4361d373fd6912c916a5de1ef4e6d1
parent b444278c7ea416fa94c031cc20977824daa12a62 (diff)
bcachefs: allow RW devices to be removed
-rw-r--r-- fs/bcachefs/extents.c 30
-rw-r--r-- fs/bcachefs/extents.h 2
-rw-r--r-- fs/bcachefs/journal.c 326
-rw-r--r-- fs/bcachefs/journal.h 3
-rw-r--r-- fs/bcachefs/journal_types.h 5
-rw-r--r-- fs/bcachefs/migrate.c 20
-rw-r--r-- fs/bcachefs/super.c 33
7 files changed, 172 insertions, 247 deletions
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 51262d6f5afb..e055ee93f586 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -123,6 +123,22 @@ bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
return NULL;
}
+bool bch2_extent_drop_device(struct bkey_s_extent e, unsigned dev)
+{
+ struct bch_extent_ptr *ptr;
+ bool dropped = false;
+
+ extent_for_each_ptr_backwards(e, ptr)
+ if (ptr->dev == dev) {
+ __bch2_extent_drop_ptr(e, ptr);
+ dropped = true;
+ }
+
+ if (dropped)
+ bch2_extent_drop_redundant_crcs(e);
+ return dropped;
+}
+
unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent e)
{
const struct bch_extent_ptr *ptr;
@@ -225,20 +241,6 @@ void bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
bch2_extent_drop_redundant_crcs(e);
}
-void bch2_extent_drop_ptr_idx(struct bkey_s_extent e, unsigned idx)
-{
- struct bch_extent_ptr *ptr;
- unsigned i = 0;
-
- extent_for_each_ptr(e, ptr)
- if (i++ == idx)
- goto found;
-
- BUG();
-found:
- bch2_extent_drop_ptr(e, ptr);
-}
-
static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
struct bch_extent_crc_unpacked n)
{
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index eb81b74a9696..aeae361d9e28 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -42,6 +42,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent);
const struct bch_extent_ptr *
bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
+bool bch2_extent_drop_device(struct bkey_s_extent, unsigned);
unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
@@ -432,7 +433,6 @@ void bch2_extent_drop_redundant_crcs(struct bkey_s_extent);
void __bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
void bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
-void bch2_extent_drop_ptr_idx(struct bkey_s_extent, unsigned);
bool bch2_cut_front(struct bpos, struct bkey_i *);
bool bch2_cut_back(struct bpos, struct bkey *);
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 914bc4539523..07b4d67dc33b 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -2022,10 +2022,11 @@ static void journal_reclaim_work(struct work_struct *work)
/**
* journal_next_bucket - move on to the next journal bucket if possible
*/
-static int journal_write_alloc(struct journal *j, unsigned sectors)
+static int journal_write_alloc(struct journal *j, struct journal_buf *w,
+ unsigned sectors)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
+ struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
struct journal_device *ja;
struct bch_dev *ca;
@@ -2034,6 +2035,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
READ_ONCE(c->opts.metadata_replicas);
spin_lock(&j->lock);
+ e = bkey_i_to_s_extent(&j->key);
/*
* Drop any pointers to devices that have been removed, are no longer
@@ -2099,6 +2101,8 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
rcu_read_unlock();
j->prev_buf_sectors = 0;
+
+ bkey_copy(&w->key, &j->key);
spin_unlock(&j->lock);
if (replicas < c->opts.metadata_replicas_required)
@@ -2174,13 +2178,26 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
static void journal_write_done(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *w = journal_prev_buf(j);
+ struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&w->key);
+
+ if (!bch2_extent_nr_ptrs(e)) {
+ bch_err(c, "unable to write journal to sufficient devices");
+ goto err;
+ }
+ if (bch2_check_mark_super(c, e, BCH_DATA_JOURNAL))
+ goto err;
+out:
__bch2_time_stats_update(j->write_time, j->write_start_time);
spin_lock(&j->lock);
j->last_seq_ondisk = le64_to_cpu(w->data->last_seq);
+ journal_seq_pin(j, le64_to_cpu(w->data->seq))->devs =
+ bch2_extent_devs(bkey_i_to_s_c_extent(&w->key));
+
/*
* Updating last_seq_ondisk may let journal_reclaim_work() discard more
* buckets:
@@ -2203,31 +2220,6 @@ static void journal_write_done(struct closure *cl)
if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
mod_delayed_work(system_freezable_wq, &j->write_work, 0);
spin_unlock(&j->lock);
-}
-
-static void journal_write_error(struct closure *cl)
-{
- struct journal *j = container_of(cl, struct journal, io);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
-
- while (j->replicas_failed) {
- unsigned idx = __fls(j->replicas_failed);
-
- bch2_extent_drop_ptr_idx(e, idx);
- j->replicas_failed ^= 1 << idx;
- }
-
- if (!bch2_extent_nr_ptrs(e.c)) {
- bch_err(c, "unable to write journal to sufficient devices");
- goto err;
- }
-
- if (bch2_check_mark_super(c, e.c, BCH_DATA_JOURNAL))
- goto err;
-
-out:
- journal_write_done(cl);
return;
err:
bch2_fatal_error(c);
@@ -2242,12 +2234,12 @@ static void journal_write_endio(struct bio *bio)
if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") ||
bch2_meta_write_fault("journal")) {
- /* Was this a flush or an actual journal write? */
- if (ca->journal.ptr_idx != U8_MAX) {
- set_bit(ca->journal.ptr_idx, &j->replicas_failed);
- set_closure_fn(&j->io, journal_write_error,
- system_highpri_wq);
- }
+ struct journal_buf *w = journal_prev_buf(j);
+ unsigned long flags;
+
+ spin_lock_irqsave(&j->err_lock, flags);
+ bch2_extent_drop_device(bkey_i_to_s_extent(&w->key), ca->dev_idx);
+ spin_unlock_irqrestore(&j->err_lock, flags);
}
closure_put(&j->io);
@@ -2263,7 +2255,7 @@ static void journal_write(struct closure *cl)
struct jset *jset;
struct bio *bio;
struct bch_extent_ptr *ptr;
- unsigned i, sectors, bytes, ptr_idx = 0;
+ unsigned i, sectors, bytes;
journal_buf_realloc(j, w);
jset = w->data;
@@ -2310,7 +2302,7 @@ static void journal_write(struct closure *cl)
bytes = vstruct_bytes(w->data);
memset((void *) w->data + bytes, 0, (sectors << 9) - bytes);
- if (journal_write_alloc(j, sectors)) {
+ if (journal_write_alloc(j, w, sectors)) {
bch2_journal_halt(j);
bch_err(c, "Unable to allocate journal write");
bch2_fatal_error(c);
@@ -2318,13 +2310,6 @@ static void journal_write(struct closure *cl)
return;
}
- if (bch2_check_mark_super(c, bkey_i_to_s_c_extent(&j->key),
- BCH_DATA_JOURNAL))
- goto err;
-
- journal_seq_pin(j, le64_to_cpu(jset->seq))->devs =
- bch2_extent_devs(bkey_i_to_s_c_extent(&j->key));
-
/*
* XXX: we really should just disable the entire journal in nochanges
* mode
@@ -2332,7 +2317,7 @@ static void journal_write(struct closure *cl)
if (c->opts.nochanges)
goto no_io;
- extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) {
+ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
ca = bch_dev_bkey_exists(c, ptr->dev);
if (!percpu_ref_tryget(&ca->io_ref)) {
/* XXX: fix this */
@@ -2343,7 +2328,6 @@ static void journal_write(struct closure *cl)
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL],
sectors);
- ca->journal.ptr_idx = ptr_idx++;
bio = ca->journal.bio;
bio_reset(bio);
bio_set_dev(bio, ca->disk_sb.bdev);
@@ -2363,10 +2347,9 @@ static void journal_write(struct closure *cl)
for_each_rw_member(ca, c, i)
if (journal_flushes_device(ca) &&
- !bch2_extent_has_device(bkey_i_to_s_c_extent(&j->key), i)) {
+ !bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), i)) {
percpu_ref_get(&ca->io_ref);
- ca->journal.ptr_idx = U8_MAX;
bio = ca->journal.bio;
bio_reset(bio);
bio_set_dev(bio, ca->disk_sb.bdev);
@@ -2377,7 +2360,7 @@ static void journal_write(struct closure *cl)
}
no_io:
- extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr)
+ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr)
ptr->offset += sectors;
continue_at(cl, journal_write_done, system_highpri_wq);
@@ -2782,163 +2765,32 @@ int bch2_journal_flush_device(struct journal *j, unsigned dev_idx)
return ret;
}
-ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- union journal_res_state *s = &j->reservations;
- struct bch_dev *ca;
- unsigned iter;
- ssize_t ret = 0;
-
- rcu_read_lock();
- spin_lock(&j->lock);
-
- ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "active journal entries:\t%zu\n"
- "seq:\t\t\t%llu\n"
- "last_seq:\t\t%llu\n"
- "last_seq_ondisk:\t%llu\n"
- "reservation count:\t%u\n"
- "reservation offset:\t%u\n"
- "current entry u64s:\t%u\n"
- "io in flight:\t\t%i\n"
- "need write:\t\t%i\n"
- "dirty:\t\t\t%i\n"
- "replay done:\t\t%i\n",
- fifo_used(&j->pin),
- (u64) atomic64_read(&j->seq),
- last_seq(j),
- j->last_seq_ondisk,
- journal_state_count(*s, s->idx),
- s->cur_entry_offset,
- j->cur_entry_u64s,
- s->prev_buf_unwritten,
- test_bit(JOURNAL_NEED_WRITE, &j->flags),
- journal_entry_is_open(j),
- test_bit(JOURNAL_REPLAY_DONE, &j->flags));
-
- for_each_member_device_rcu(ca, c, iter,
- &c->rw_devs[BCH_DATA_JOURNAL]) {
- struct journal_device *ja = &ca->journal;
-
- if (!ja->nr)
- continue;
-
- ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "dev %u:\n"
- "\tnr\t\t%u\n"
- "\tcur_idx\t\t%u (seq %llu)\n"
- "\tlast_idx\t%u (seq %llu)\n",
- iter, ja->nr,
- ja->cur_idx, ja->bucket_seq[ja->cur_idx],
- ja->last_idx, ja->bucket_seq[ja->last_idx]);
- }
-
- spin_unlock(&j->lock);
- rcu_read_unlock();
-
- return ret;
-}
-
-ssize_t bch2_journal_print_pins(struct journal *j, char *buf)
-{
- struct journal_entry_pin_list *pin_list;
- struct journal_entry_pin *pin;
- ssize_t ret = 0;
- unsigned i;
-
- spin_lock_irq(&j->pin_lock);
- fifo_for_each_entry_ptr(pin_list, &j->pin, i) {
- ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "%llu: count %u\n",
- journal_pin_seq(j, pin_list),
- atomic_read(&pin_list->count));
-
- list_for_each_entry(pin, &pin_list->list, list)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "\t%p %pf\n",
- pin, pin->flush);
-
- if (!list_empty(&pin_list->flushed))
- ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "flushed:\n");
-
- list_for_each_entry(pin, &pin_list->flushed, list)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "\t%p %pf\n",
- pin, pin->flush);
- }
- spin_unlock_irq(&j->pin_lock);
-
- return ret;
-}
+/* startup/shutdown: */
-static bool bch2_journal_writing_to_device(struct bch_dev *ca)
+static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
{
- struct journal *j = &ca->fs->journal;
+ union journal_res_state state;
+ struct journal_buf *w;
bool ret;
spin_lock(&j->lock);
- ret = bch2_extent_has_device(bkey_i_to_s_c_extent(&j->key),
- ca->dev_idx);
+ state = READ_ONCE(j->reservations);
+ w = j->buf + !state.idx;
+
+ ret = state.prev_buf_unwritten &&
+ bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), dev_idx);
spin_unlock(&j->lock);
return ret;
}
-/*
- * This asumes that ca has already been marked read-only so that
- * journal_next_bucket won't pick buckets out of ca any more.
- * Hence, if the journal is not currently pointing to ca, there
- * will be no new writes to journal entries in ca after all the
- * pending ones have been flushed to disk.
- *
- * If the journal is being written to ca, write a new record, and
- * journal_next_bucket will notice that the device is no longer
- * writeable and pick a new set of devices to write to.
- */
-
-int bch2_journal_move(struct bch_dev *ca)
+void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
{
- struct journal_device *ja = &ca->journal;
- struct journal *j = &ca->fs->journal;
- u64 seq_to_flush = 0;
- unsigned i;
- int ret;
-
- if (bch2_journal_writing_to_device(ca)) {
- /*
- * bch_journal_meta will write a record and we'll wait
- * for the write to complete.
- * Actually writing the journal (journal_write_locked)
- * will call journal_next_bucket which notices that the
- * device is no longer writeable, and picks a new one.
- */
- bch2_journal_meta(j);
- BUG_ON(bch2_journal_writing_to_device(ca));
- }
-
- for (i = 0; i < ja->nr; i++)
- seq_to_flush = max(seq_to_flush, ja->bucket_seq[i]);
-
- bch2_journal_flush_pins(j, seq_to_flush);
-
- /*
- * Force a meta-data journal entry to be written so that
- * we have newer journal entries in devices other than ca,
- * and wait for the meta data write to complete.
- */
- bch2_journal_meta(j);
-
- /*
- * Verify that we no longer need any of the journal entries in
- * the device
- */
spin_lock(&j->lock);
- ret = j->last_seq_ondisk > seq_to_flush ? 0 : -EIO;
+ bch2_extent_drop_device(bkey_i_to_s_extent(&j->key), ca->dev_idx);
spin_unlock(&j->lock);
- return ret;
+ wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx));
}
void bch2_fs_journal_stop(struct journal *j)
@@ -3009,6 +2861,7 @@ int bch2_fs_journal_init(struct journal *j)
spin_lock_init(&j->lock);
spin_lock_init(&j->pin_lock);
+ spin_lock_init(&j->err_lock);
init_waitqueue_head(&j->wait);
INIT_DELAYED_WORK(&j->write_work, journal_write_work);
INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work);
@@ -3038,3 +2891,96 @@ int bch2_fs_journal_init(struct journal *j)
return 0;
}
+
+/* debug: */
+
+ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ union journal_res_state *s = &j->reservations;
+ struct bch_dev *ca;
+ unsigned iter;
+ ssize_t ret = 0;
+
+ rcu_read_lock();
+ spin_lock(&j->lock);
+
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+ "active journal entries:\t%zu\n"
+ "seq:\t\t\t%llu\n"
+ "last_seq:\t\t%llu\n"
+ "last_seq_ondisk:\t%llu\n"
+ "reservation count:\t%u\n"
+ "reservation offset:\t%u\n"
+ "current entry u64s:\t%u\n"
+ "io in flight:\t\t%i\n"
+ "need write:\t\t%i\n"
+ "dirty:\t\t\t%i\n"
+ "replay done:\t\t%i\n",
+ fifo_used(&j->pin),
+ (u64) atomic64_read(&j->seq),
+ last_seq(j),
+ j->last_seq_ondisk,
+ journal_state_count(*s, s->idx),
+ s->cur_entry_offset,
+ j->cur_entry_u64s,
+ s->prev_buf_unwritten,
+ test_bit(JOURNAL_NEED_WRITE, &j->flags),
+ journal_entry_is_open(j),
+ test_bit(JOURNAL_REPLAY_DONE, &j->flags));
+
+ for_each_member_device_rcu(ca, c, iter,
+ &c->rw_devs[BCH_DATA_JOURNAL]) {
+ struct journal_device *ja = &ca->journal;
+
+ if (!ja->nr)
+ continue;
+
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+ "dev %u:\n"
+ "\tnr\t\t%u\n"
+ "\tcur_idx\t\t%u (seq %llu)\n"
+ "\tlast_idx\t%u (seq %llu)\n",
+ iter, ja->nr,
+ ja->cur_idx, ja->bucket_seq[ja->cur_idx],
+ ja->last_idx, ja->bucket_seq[ja->last_idx]);
+ }
+
+ spin_unlock(&j->lock);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+ssize_t bch2_journal_print_pins(struct journal *j, char *buf)
+{
+ struct journal_entry_pin_list *pin_list;
+ struct journal_entry_pin *pin;
+ ssize_t ret = 0;
+ unsigned i;
+
+ spin_lock_irq(&j->pin_lock);
+ fifo_for_each_entry_ptr(pin_list, &j->pin, i) {
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+ "%llu: count %u\n",
+ journal_pin_seq(j, pin_list),
+ atomic_read(&pin_list->count));
+
+ list_for_each_entry(pin, &pin_list->list, list)
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+ "\t%p %pf\n",
+ pin, pin->flush);
+
+ if (!list_empty(&pin_list->flushed))
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+ "flushed:\n");
+
+ list_for_each_entry(pin, &pin_list->flushed, list)
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+ "\t%p %pf\n",
+ pin, pin->flush);
+ }
+ spin_unlock_irq(&j->pin_lock);
+
+ return ret;
+}
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 5f3ece089937..b3e6b2bc1f9b 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -398,8 +398,7 @@ static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
: 0;
}
-int bch2_journal_move(struct bch_dev *);
-
+void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
void bch2_fs_journal_stop(struct journal *);
void bch2_dev_journal_exit(struct bch_dev *);
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 87f378a6ac4f..66923cf43a83 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -17,6 +17,8 @@ struct journal_res;
struct journal_buf {
struct jset *data;
+ BKEY_PADDED(key);
+
struct closure_waitlist wait;
unsigned size;
@@ -141,7 +143,6 @@ struct journal {
struct closure io;
struct delayed_work write_work;
- unsigned long replicas_failed;
/* Sequence number of most recent journal entry (last entry in @pin) */
atomic64_t seq;
@@ -179,6 +180,7 @@ struct journal {
BKEY_PADDED(key);
struct write_point wp;
+ spinlock_t err_lock;
struct delayed_work reclaim_work;
unsigned long last_flushed;
@@ -230,7 +232,6 @@ struct journal_device {
/* Bio for journal reads/writes to this device */
struct bio *bio;
- u8 ptr_idx;
/* for bch_journal_read_device */
struct closure read;
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index e11ee9532483..328316a10887 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -196,26 +196,13 @@ static int bch2_dev_metadata_migrate(struct bch_fs *c, struct bch_dev *ca,
return 0;
mutex_lock(&c->replicas_gc_lock);
- bch2_replicas_gc_start(c,
- (1 << BCH_DATA_JOURNAL)|
- (1 << BCH_DATA_BTREE));
-
- /* 1st, Move the btree nodes off the device */
+ bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
for (i = 0; i < BTREE_ID_NR; i++) {
ret = bch2_move_btree_off(c, ca, i);
if (ret)
goto err;
}
-
- /* There are no prios/gens to move -- they are already in the device. */
-
- /* 2nd. Move the journal off the device */
-
- ret = bch2_journal_move(ca);
- if (ret)
- goto err;
-
err:
bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
@@ -231,15 +218,12 @@ int bch2_dev_data_migrate(struct bch_fs *c, struct bch_dev *ca, int flags)
static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
unsigned dev_idx, int flags, bool metadata)
{
- struct bch_extent_ptr *ptr;
unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
unsigned nr_good;
- extent_for_each_ptr_backwards(e, ptr)
- if (ptr->dev == dev_idx)
- bch2_extent_drop_ptr(e, ptr);
+ bch2_extent_drop_device(e, dev_idx);
nr_good = bch2_extent_nr_good_ptrs(c, e.c);
if ((!nr_good && !(flags & lost)) ||
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 60a2d83ed8c1..61acff4c873f 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -88,7 +88,7 @@ static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
static void bch2_dev_free(struct bch_dev *);
static int bch2_dev_alloc(struct bch_fs *, unsigned);
-static int bch2_dev_sysfs_online(struct bch_dev *);
+static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
struct bch_fs *bch2_bdev_to_fs(struct block_device *bdev)
@@ -649,7 +649,7 @@ static const char *__bch2_fs_online(struct bch_fs *c)
err = "error creating sysfs objects";
__for_each_member_device(ca, c, i, NULL)
- if (bch2_dev_sysfs_online(ca))
+ if (bch2_dev_sysfs_online(c, ca))
goto err;
list_add(&c->list, &bch_fs_list);
@@ -991,9 +991,8 @@ static void bch2_dev_free(struct bch_dev *ca)
kobject_put(&ca->kobj);
}
-static void __bch2_dev_offline(struct bch_dev *ca)
+static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
{
- struct bch_fs *c = ca->fs;
lockdep_assert_held(&c->state_lock);
@@ -1032,9 +1031,8 @@ static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
complete(&ca->io_ref_completion);
}
-static int bch2_dev_sysfs_online(struct bch_dev *ca)
+static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
{
- struct bch_fs *c = ca->fs;
int ret;
if (!c->kobj.state_in_sysfs)
@@ -1149,7 +1147,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
ca->fs = c;
rcu_assign_pointer(c->devs[ca->dev_idx], ca);
- if (bch2_dev_sysfs_online(ca))
+ if (bch2_dev_sysfs_online(c, ca))
pr_warn("error creating sysfs objects");
return 0;
@@ -1202,9 +1200,6 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
bdevname(ca->disk_sb.bdev, c->name);
bdevname(ca->disk_sb.bdev, ca->name);
- if (bch2_dev_sysfs_online(ca))
- pr_warn("error creating sysfs objects");
-
bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
if (ca->mi.state == BCH_MEMBER_STATE_RW)
@@ -1311,12 +1306,11 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
bch2_copygc_stop(ca);
/*
- * This stops new data writes (e.g. to existing open data
- * buckets) and then waits for all existing writes to
- * complete.
+ * The allocator thread itself allocates btree nodes, so stop it first:
*/
bch2_dev_allocator_stop(ca);
bch2_dev_allocator_remove(c, ca);
+ bch2_dev_journal_stop(&c->journal, ca);
}
static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
@@ -1393,16 +1387,13 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
percpu_ref_put(&ca->ref); /* XXX */
- if (ca->mi.state == BCH_MEMBER_STATE_RW) {
- bch_err(ca, "Cannot remove RW device");
- goto err;
- }
-
if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
bch_err(ca, "Cannot remove without losing data");
goto err;
}
+ __bch2_dev_read_only(c, ca);
+
/*
* XXX: verify that dev_idx is really not in use anymore, anywhere
*
@@ -1452,7 +1443,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
goto err;
}
- __bch2_dev_offline(ca);
+ __bch2_dev_offline(c, ca);
mutex_lock(&c->sb_lock);
rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
@@ -1477,6 +1468,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
mutex_unlock(&c->state_lock);
return 0;
err:
+ if (ca->mi.state == BCH_MEMBER_STATE_RW)
+ __bch2_dev_read_write(c, ca);
mutex_unlock(&c->state_lock);
return ret;
}
@@ -1645,7 +1638,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
return -EINVAL;
}
- __bch2_dev_offline(ca);
+ __bch2_dev_offline(c, ca);
mutex_unlock(&c->state_lock);
return 0;