diff options
Diffstat (limited to 'fs')
127 files changed, 1703 insertions, 1132 deletions
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index e51e7d88980a..1d847a939f29 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -98,14 +98,25 @@ static struct file_system_type anon_inode_fs_type = { .kill_sb = kill_anon_super, }; -static struct inode *anon_inode_make_secure_inode( - const char *name, - const struct inode *context_inode) +/** + * anon_inode_make_secure_inode - allocate an anonymous inode with security context + * @sb: [in] Superblock to allocate from + * @name: [in] Name of the class of the newfile (e.g., "secretmem") + * @context_inode: + * [in] Optional parent inode for security inheritance + * + * The function ensures proper security initialization through the LSM hook + * security_inode_init_security_anon(). + * + * Return: Pointer to new inode on success, ERR_PTR on failure. + */ +struct inode *anon_inode_make_secure_inode(struct super_block *sb, const char *name, + const struct inode *context_inode) { struct inode *inode; int error; - inode = alloc_anon_inode(anon_inode_mnt->mnt_sb); + inode = alloc_anon_inode(sb); if (IS_ERR(inode)) return inode; inode->i_flags &= ~S_PRIVATE; @@ -118,6 +129,7 @@ static struct inode *anon_inode_make_secure_inode( } return inode; } +EXPORT_SYMBOL_GPL_FOR_MODULES(anon_inode_make_secure_inode, "kvm"); static struct file *__anon_inode_getfile(const char *name, const struct file_operations *fops, @@ -132,7 +144,8 @@ static struct file *__anon_inode_getfile(const char *name, return ERR_PTR(-ENOENT); if (make_inode) { - inode = anon_inode_make_secure_inode(name, context_inode); + inode = anon_inode_make_secure_inode(anon_inode_mnt->mnt_sb, + name, context_inode); if (IS_ERR(inode)) { file = ERR_CAST(inode); goto err; diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index b375ad610acd..b58525ec7b4d 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -511,7 +511,8 @@ again: bch2_dev_usage_read_fast(ca, &req->usage); avail = dev_buckets_free(ca, req->usage, req->watermark); - if (req->usage.buckets[BCH_DATA_need_discard] > avail) + if (req->usage.buckets[BCH_DATA_need_discard] > + min(avail, ca->mi.nbuckets >> 7)) bch2_dev_do_discards(ca); if (req->usage.buckets[BCH_DATA_need_gc_gens] > avail) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 8043943cdf6a..ddfacad0f70c 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -863,9 +863,7 @@ struct bch_fs { DARRAY(enum bcachefs_metadata_version) incompat_versions_requested; -#ifdef CONFIG_UNICODE struct unicode_map *cf_encoding; -#endif struct bch_sb_handle disk_sb; @@ -1285,4 +1283,13 @@ static inline bool bch2_discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca : ca->mi.discard; } +static inline bool bch2_fs_casefold_enabled(struct bch_fs *c) +{ +#ifdef CONFIG_UNICODE + return !c->opts.casefold_disabled; +#else + return false; +#endif +} + #endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 91e0aa796e6b..83c9860e6b82 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -85,7 +85,7 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) six_unlock_intent(&b->c.lock); } -static void __btree_node_data_free(struct btree_cache *bc, struct btree *b) +void __btree_node_data_free(struct btree *b) { BUG_ON(!list_empty(&b->list)); BUG_ON(btree_node_hashed(b)); @@ -112,16 +112,17 @@ static void __btree_node_data_free(struct btree_cache *bc, struct btree *b) munmap(b->aux_data, btree_aux_data_bytes(b)); #endif b->aux_data = NULL; - - btree_node_to_freedlist(bc, b); } static void btree_node_data_free(struct btree_cache *bc, struct btree *b) { BUG_ON(list_empty(&b->list)); list_del_init(&b->list); + + __btree_node_data_free(b); + --bc->nr_freeable; - __btree_node_data_free(bc, b); + btree_node_to_freedlist(bc, b); } static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, @@ -185,10 +186,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) { - struct btree_cache *bc = &c->btree_cache; - struct btree *b; - - b = __btree_node_mem_alloc(c, GFP_KERNEL); + struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL); if (!b) return NULL; @@ -198,8 +196,6 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) } bch2_btree_lock_init(&b->c, 0, GFP_KERNEL); - - __bch2_btree_node_to_freelist(bc, b); return b; } @@ -524,7 +520,8 @@ restart: --touched;; } else if (!btree_node_reclaim(c, b)) { __bch2_btree_node_hash_remove(bc, b); - __btree_node_data_free(bc, b); + __btree_node_data_free(b); + btree_node_to_freedlist(bc, b); freed++; bc->nr_freed++; @@ -652,9 +649,12 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bch2_recalc_btree_reserve(c); - for (i = 0; i < bc->nr_reserve; i++) - if (!__bch2_btree_node_mem_alloc(c)) + for (i = 0; i < bc->nr_reserve; i++) { + struct btree *b = __bch2_btree_node_mem_alloc(c); + if (!b) goto err; + __bch2_btree_node_to_freelist(bc, b); + } list_splice_init(&bc->live[0].list, &bc->freeable); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index ca3c1b145330..be275f87a60e 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -30,6 +30,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsig void bch2_btree_cache_cannibalize_unlock(struct btree_trans *); int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *); +void __btree_node_data_free(struct btree *); struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 08b22bddd747..590cd29f3e86 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -568,9 +568,9 @@ static int __btree_err(int ret, bch2_mark_btree_validate_failure(failed, ca->dev_idx); struct extent_ptr_decoded pick; - have_retry = !bch2_bkey_pick_read_device(c, + have_retry = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), - failed, &pick, -1); + failed, &pick, -1) == 1; } if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) @@ -615,7 +615,6 @@ static int __btree_err(int ret, goto out; case -BCH_ERR_btree_node_read_err_bad_node: prt_str(&out, ", "); - ret = __bch2_topology_error(c, &out); break; } @@ -644,7 +643,6 @@ static int __btree_err(int ret, goto out; case -BCH_ERR_btree_node_read_err_bad_node: prt_str(&out, ", "); - ret = __bch2_topology_error(c, &out); break; } print: @@ -1297,9 +1295,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted); - if (updated_range) - bch2_btree_node_drop_keys_outside_node(b); - i = &b->data->keys; for (k = i->start; k != vstruct_last(i);) { struct bkey tmp; @@ -1337,15 +1332,45 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_node_reset_sib_u64s(b); - scoped_guard(rcu) - bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { - struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); + if (updated_range) + bch2_btree_node_drop_keys_outside_node(b); - if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) { - set_btree_node_need_rewrite(b); - set_btree_node_need_rewrite_degraded(b); + /* + * XXX: + * + * We deadlock if too many btree updates require node rewrites while + * we're still in journal replay. + * + * This is because btree node rewrites generate more updates for the + * interior updates (alloc, backpointers), and if those updates touch + * new nodes and generate more rewrites - well, you see the problem. + * + * The biggest cause is that we don't use the btree write buffer (for + * the backpointer updates - this needs some real thought on locking in + * order to fix. + * + * The problem with this workaround (not doing the rewrite for degraded + * nodes in journal replay) is that those degraded nodes persist, and we + * don't want that (this is a real bug when a btree node write completes + * with fewer replicas than we wanted and leaves a degraded node due to + * device _removal_, i.e. the device went away mid write). + * + * It's less of a bug here, but still a problem because we don't yet + * have a way of tracking degraded data - we another index (all + * extents/btree nodes, by replicas entry) in order to fix properly + * (re-replicate degraded data at the earliest possible time). + */ + if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) { + scoped_guard(rcu) + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { + struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); + + if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) { + set_btree_node_need_rewrite(b); + set_btree_node_need_rewrite_degraded(b); + } } - } + } if (!ptr_written) { set_btree_node_need_rewrite(b); @@ -1381,7 +1406,7 @@ static void btree_node_read_work(struct work_struct *work) ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), &failed, &rb->pick, -1); - if (ret) { + if (ret <= 0) { set_btree_node_read_error(b); break; } diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 23d8c62ea4b6..a3fb07c60e25 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -75,39 +75,6 @@ static inline u64 bkey_journal_seq(struct bkey_s_c k) } } -static bool found_btree_node_is_readable(struct btree_trans *trans, - struct found_btree_node *f) -{ - struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; - - found_btree_node_to_key(&tmp.k, f); - - struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false); - bool ret = !IS_ERR_OR_NULL(b); - if (!ret) - return ret; - - f->sectors_written = b->written; - f->journal_seq = le64_to_cpu(b->data->keys.journal_seq); - - struct bkey_s_c k; - struct bkey unpacked; - struct btree_node_iter iter; - for_each_btree_node_key_unpack(b, k, &iter, &unpacked) - f->journal_seq = max(f->journal_seq, bkey_journal_seq(k)); - - six_unlock_read(&b->c.lock); - - /* - * We might update this node's range; if that happens, we need the node - * to be re-read so the read path can trim keys that are no longer in - * this node - */ - if (b != btree_node_root(trans->c, b)) - bch2_btree_node_evict(trans, &tmp.k); - return ret; -} - static int found_btree_node_cmp_cookie(const void *_l, const void *_r) { const struct found_btree_node *l = _l; @@ -159,17 +126,17 @@ static const struct min_heap_callbacks found_btree_node_heap_cbs = { }; static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, - struct bio *bio, struct btree_node *bn, u64 offset) + struct btree *b, struct bio *bio, u64 offset) { struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); + struct btree_node *bn = b->data; bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); bio->bi_iter.bi_sector = offset; - bch2_bio_map(bio, bn, PAGE_SIZE); + bch2_bio_map(bio, b->data, c->opts.block_size); u64 submit_time = local_clock(); submit_bio_wait(bio); - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); if (bio->bi_status) { @@ -217,7 +184,28 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, }; rcu_read_unlock(); - if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) { + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); + bio->bi_iter.bi_sector = offset; + bch2_bio_map(bio, b->data, c->opts.btree_node_size); + + submit_time = local_clock(); + submit_bio_wait(bio); + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); + + found_btree_node_to_key(&b->key, &n); + + CLASS(printbuf, buf)(); + if (!bch2_btree_node_read_done(c, ca, b, NULL, &buf)) { + /* read_done will swap out b->data for another buffer */ + bn = b->data; + /* + * Grab journal_seq here because we want the max journal_seq of + * any bset; read_done sorts down to a single set and picks the + * max journal_seq + */ + n.journal_seq = le64_to_cpu(bn->keys.journal_seq), + n.sectors_written = b->written; + mutex_lock(&f->lock); if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) { bch_err(c, "try_read_btree_node() can't handle endian conversion"); @@ -237,12 +225,20 @@ static int read_btree_nodes_worker(void *p) struct find_btree_nodes_worker *w = p; struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes); struct bch_dev *ca = w->ca; - void *buf = (void *) __get_free_page(GFP_KERNEL); - struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL); unsigned long last_print = jiffies; + struct btree *b = NULL; + struct bio *bio = NULL; + + b = __bch2_btree_node_mem_alloc(c); + if (!b) { + bch_err(c, "read_btree_nodes_worker: error allocating buf"); + w->f->ret = -ENOMEM; + goto err; + } - if (!buf || !bio) { - bch_err(c, "read_btree_nodes_worker: error allocating bio/buf"); + bio = bio_alloc(NULL, buf_pages(b->data, c->opts.btree_node_size), 0, GFP_KERNEL); + if (!bio) { + bch_err(c, "read_btree_nodes_worker: error allocating bio"); w->f->ret = -ENOMEM; goto err; } @@ -266,11 +262,13 @@ static int read_btree_nodes_worker(void *p) !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c))) continue; - try_read_btree_node(w->f, ca, bio, buf, sector); + try_read_btree_node(w->f, ca, b, bio, sector); } err: + if (b) + __btree_node_data_free(b); + kfree(b); bio_put(bio); - free_page((unsigned long) buf); enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); closure_put(w->cl); kfree(w); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 901f643ead83..07c2a0f73cc2 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -153,8 +153,6 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) c->verify_data = __bch2_btree_node_mem_alloc(c); if (!c->verify_data) goto out; - - list_del_init(&c->verify_data->list); } BUG_ON(b->nsets != 1); @@ -586,6 +584,8 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, i->ubuf = buf; i->size = size; i->ret = 0; + + int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); restart: seqmutex_lock(&c->btree_trans_lock); list_sort(&c->btree_trans_list, list_ptr_order_cmp); @@ -599,6 +599,11 @@ restart: if (!closure_get_not_zero(&trans->ref)) continue; + if (!trans->srcu_held) { + closure_put(&trans->ref); + continue; + } + u32 seq = seqmutex_unlock(&c->btree_trans_lock); bch2_btree_trans_to_text(&i->buf, trans); @@ -620,6 +625,8 @@ restart: } seqmutex_unlock(&c->btree_trans_lock); unlocked: + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + if (i->buf.allocation_failure) ret = -ENOMEM; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 300f7cc8abdf..28875c5c86ad 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -13,12 +13,15 @@ #include <linux/dcache.h> +#ifdef CONFIG_UNICODE int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, const struct qstr *str, struct qstr *out_cf) { *out_cf = (struct qstr) QSTR_INIT(NULL, 0); -#ifdef CONFIG_UNICODE + if (!bch2_fs_casefold_enabled(trans->c)) + return -EOPNOTSUPP; + unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1); int ret = PTR_ERR_OR_ZERO(buf); if (ret) @@ -30,10 +33,8 @@ int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, *out_cf = (struct qstr) QSTR_INIT(buf, ret); return 0; -#else - return -EOPNOTSUPP; -#endif } +#endif static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) { @@ -231,7 +232,8 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type)); } -int bch2_dirent_init_name(struct bkey_i_dirent *dirent, +int bch2_dirent_init_name(struct bch_fs *c, + struct bkey_i_dirent *dirent, const struct bch_hash_info *hash_info, const struct qstr *name, const struct qstr *cf_name) @@ -251,6 +253,9 @@ int bch2_dirent_init_name(struct bkey_i_dirent *dirent, offsetof(struct bch_dirent, d_name) - name->len); } else { + if (!bch2_fs_casefold_enabled(c)) + return -EOPNOTSUPP; + #ifdef CONFIG_UNICODE memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); @@ -277,8 +282,6 @@ int bch2_dirent_init_name(struct bkey_i_dirent *dirent, dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_len); EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_len); -#else - return -EOPNOTSUPP; #endif } @@ -313,7 +316,7 @@ struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *trans, dirent->v.d_type = type; dirent->v.d_unused = 0; - int ret = bch2_dirent_init_name(dirent, hash_info, name, cf_name); + int ret = bch2_dirent_init_name(trans->c, dirent, hash_info, name, cf_name); if (ret) return ERR_PTR(ret); diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 70fb0b581221..0417608c18d5 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -23,8 +23,16 @@ struct bch_fs; struct bch_hash_info; struct bch_inode_info; +#ifdef CONFIG_UNICODE int bch2_casefold(struct btree_trans *, const struct bch_hash_info *, const struct qstr *, struct qstr *); +#else +static inline int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, + const struct qstr *str, struct qstr *out_cf) +{ + return -EOPNOTSUPP; +} +#endif static inline int bch2_maybe_casefold(struct btree_trans *trans, const struct bch_hash_info *info, @@ -59,7 +67,8 @@ static inline void dirent_copy_target(struct bkey_i_dirent *dst, dst->v.d_type = src.v->d_type; } -int bch2_dirent_init_name(struct bkey_i_dirent *, +int bch2_dirent_init_name(struct bch_fs *, + struct bkey_i_dirent *, const struct bch_hash_info *, const struct qstr *, const struct qstr *); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 86a842f1e88e..acc3b7b67704 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -282,7 +282,6 @@ x(EIO, sb_not_downgraded) \ x(EIO, btree_node_write_all_failed) \ x(EIO, btree_node_read_error) \ - x(EIO, btree_node_read_validate_error) \ x(EIO, btree_need_topology_repair) \ x(EIO, bucket_ref_update) \ x(EIO, trigger_alloc) \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index b2a6c041e165..267e73d9d7e6 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -103,7 +103,7 @@ int __bch2_topology_error(struct bch_fs *c, struct printbuf *out) return bch_err_throw(c, btree_need_topology_repair); } else { return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?: - bch_err_throw(c, btree_node_read_validate_error); + bch_err_throw(c, btree_need_topology_repair); } } @@ -633,7 +633,9 @@ err: * log_fsck_err()s: that would require us to track for every error type * which recovery pass corrects it, to get the fsck exit status correct: */ - if (bch2_err_matches(ret, BCH_ERR_fsck_fix)) { + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + /* nothing */ + } else if (bch2_err_matches(ret, BCH_ERR_fsck_fix)) { set_bit(BCH_FS_errors_fixed, &c->flags); } else { set_bit(BCH_FS_errors_not_fixed, &c->flags); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 036e4ad95987..83cbd77dcb9c 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -50,19 +50,17 @@ void bch2_io_failures_to_text(struct printbuf *out, struct bch_io_failures *failed) { static const char * const error_types[] = { - "io", "checksum", "ec reconstruct", NULL + "btree validate", "io", "checksum", "ec reconstruct", NULL }; for (struct bch_dev_io_failures *f = failed->devs; f < failed->devs + failed->nr; f++) { unsigned errflags = - ((!!f->failed_io) << 0) | - ((!!f->failed_csum_nr) << 1) | - ((!!f->failed_ec) << 2); - - if (!errflags) - continue; + ((!!f->failed_btree_validate) << 0) | + ((!!f->failed_io) << 1) | + ((!!f->failed_csum_nr) << 2) | + ((!!f->failed_ec) << 3); bch2_printbuf_make_room(out, 1024); out->atomic++; @@ -77,7 +75,9 @@ void bch2_io_failures_to_text(struct printbuf *out, prt_char(out, ' '); - if (is_power_of_2(errflags)) { + if (!errflags) { + prt_str(out, "no error - confused"); + } else if (is_power_of_2(errflags)) { prt_bitflags(out, error_types, errflags); prt_str(out, " error"); } else { diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index db24a76563f8..e54e4f255b22 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -722,7 +722,6 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, if (IS_ERR(inode)) inode = NULL; -#ifdef CONFIG_UNICODE if (!inode && IS_CASEFOLDED(vdir)) { /* * Do not cache a negative dentry in casefolded directories @@ -737,7 +736,6 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, */ return NULL; } -#endif return d_splice_alias(&inode->v, dentry); } @@ -2566,9 +2564,10 @@ got_sb: sb->s_shrink->seeks = 0; #ifdef CONFIG_UNICODE - sb->s_encoding = c->cf_encoding; -#endif + if (bch2_fs_casefold_enabled(c)) + sb->s_encoding = c->cf_encoding; generic_set_sb_d_ops(sb); +#endif vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); ret = PTR_ERR_OR_ZERO(vinode); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 63093def85e3..856eb2b41896 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -12,6 +12,7 @@ #include "fs.h" #include "fsck.h" #include "inode.h" +#include "io_misc.h" #include "keylist.h" #include "namei.h" #include "recovery_passes.h" @@ -1919,33 +1920,11 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, "extent type past end of inode %llu:%u, i_size %llu\n%s", i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - struct bkey_i *whiteout = bch2_trans_kmalloc(trans, sizeof(*whiteout)); - ret = PTR_ERR_OR_ZERO(whiteout); - if (ret) - goto err; - - bkey_init(&whiteout->k); - whiteout->k.p = SPOS(k.k->p.inode, - last_block, - i->inode.bi_snapshot); - bch2_key_resize(&whiteout->k, - min(KEY_SIZE_MAX & (~0 << c->block_bits), - U64_MAX - whiteout->k.p.offset)); - - - /* - * Need a normal (not BTREE_ITER_all_snapshots) - * iterator, if we're deleting in a different - * snapshot and need to emit a whiteout - */ - struct btree_iter iter2; - bch2_trans_iter_init(trans, &iter2, BTREE_ID_extents, - bkey_start_pos(&whiteout->k), - BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(trans, &iter2) ?: - bch2_trans_update(trans, &iter2, whiteout, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &iter2); + ret = bch2_fpunch_snapshot(trans, + SPOS(i->inode.bi_inum, + last_block, + i->inode.bi_snapshot), + POS(i->inode.bi_inum, U64_MAX)); if (ret) goto err; @@ -2302,9 +2281,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); dir->first_this_inode = false; -#ifdef CONFIG_UNICODE hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? c->cf_encoding : NULL; -#endif ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k, need_second_pass); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 53e5dc1f6ac1..ef4cc7395b86 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1265,7 +1265,14 @@ int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum, { struct bch_fs *c = trans->c; -#ifdef CONFIG_UNICODE +#ifndef CONFIG_UNICODE + bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE"); + return -EOPNOTSUPP; +#endif + + if (c->opts.casefold_disabled) + return -EOPNOTSUPP; + int ret = 0; /* Not supported on individual files. */ if (!S_ISDIR(bi->bi_mode)) @@ -1289,10 +1296,6 @@ int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum, bi->bi_fields_set |= BIT(Inode_opt_casefold); return bch2_maybe_propagate_has_case_insensitive(trans, inum, bi); -#else - bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE"); - return -EOPNOTSUPP; -#endif } static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index bf72b1d2e2cb..07023667a475 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -135,6 +135,33 @@ err_noprint: return ret; } +/* For fsck */ +int bch2_fpunch_snapshot(struct btree_trans *trans, struct bpos start, struct bpos end) +{ + u32 restart_count = trans->restart_count; + struct bch_fs *c = trans->c; + struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct bkey_i delete; + + int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, + start, end, 0, k, + &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + bkey_init(&delete.k); + delete.k.p = iter.pos; + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end, &delete); + + bch2_extent_trim_atomic(trans, &iter, &delete) ?: + bch2_trans_update(trans, &iter, &delete, 0); + })); + + bch2_disk_reservation_put(c, &disk_res); + return ret ?: trans_was_restarted(trans, restart_count); +} + /* * Returns -BCH_ERR_transacton_restart if we had to drop locks: */ diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h index 9cb44a7c43c1..b93e4d4b3c0c 100644 --- a/fs/bcachefs/io_misc.h +++ b/fs/bcachefs/io_misc.h @@ -5,6 +5,8 @@ int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, u64, struct bch_io_opts, s64 *, struct write_point_specifier); + +int bch2_fpunch_snapshot(struct btree_trans *, struct bpos, struct bpos); int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, subvol_inum, u64, s64 *); int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index cd184b219a65..e0874ad9a6cf 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -166,6 +166,7 @@ static noinline void promote_free(struct bch_read_bio *rbio) BUG_ON(ret); async_object_list_del(c, promote, op->list_idx); + async_object_list_del(c, rbio, rbio->list_idx); bch2_data_update_exit(&op->write); @@ -456,6 +457,10 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) if (rbio->start_time) bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], rbio->start_time); +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + if (rbio->list_idx) + async_object_list_del(rbio->c, rbio, rbio->list_idx); +#endif bio_endio(&rbio->bio); } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index f22b05e02c1e..014adbcb404b 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -189,6 +189,8 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) void bch2_journal_do_writes(struct journal *j) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + for (u64 seq = journal_last_unwritten_seq(j); seq <= journal_cur_seq(j); seq++) { @@ -203,6 +205,7 @@ void bch2_journal_do_writes(struct journal *j) if (!journal_state_seq_count(j, j->reservations, seq)) { j->seq_write_started = seq; w->write_started = true; + closure_get(&c->cl); closure_call(&w->io, bch2_journal_write, j->wq, NULL); } @@ -1082,6 +1085,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou if (open && !*blocked) { __bch2_journal_block(j); + s.v = atomic64_read_acquire(&j->reservations.counter); *blocked = true; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index dd3f3434c1b0..343f1daf5da7 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1767,6 +1767,7 @@ static CLOSURE_CALLBACK(journal_write_done) closure_wake_up(&c->freelist_wait); bch2_reset_alloc_cursors(c); + do_discards = true; } j->seq_ondisk = seq; @@ -1818,6 +1819,8 @@ static CLOSURE_CALLBACK(journal_write_done) if (do_discards) bch2_do_discards(c); + + closure_put(&c->cl); } static void journal_write_endio(struct bio *bio) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index cd6201741c59..0042d43b8e57 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -170,6 +170,12 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne return (struct journal_space) { 0, 0 }; /* + * It's possible for bucket size to be misaligned w.r.t. the filesystem + * block size: + */ + min_bucket_size = round_down(min_bucket_size, block_sectors(c)); + + /* * We sorted largest to smallest, and we want the smallest out of the * @nr_devs_want largest devices: */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 27e68d470ad0..5e6de91a8763 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -71,7 +71,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, if (ret) return ret; - struct bch_dev *ca = bch2_dev_tryget(c, k.k->p.inode); + struct bch_dev *ca = bch2_dev_bucket_tryget(c, k.k->p); if (!ca) goto out; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index b0a76bd6d6f5..63f8e254495c 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -234,6 +234,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH_SB_CASEFOLD, false, \ NULL, "Dirent lookups are casefolded") \ + x(casefold_disabled, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Disable casefolding filesystem wide") \ x(inodes_32bit, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d0b7e3a36a54..c94debb12d2f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -273,24 +273,35 @@ static int bch2_journal_replay_key(struct btree_trans *trans, goto out; struct btree_path *path = btree_iter_path(trans, &iter); - if (unlikely(!btree_path_node(path, k->level) && - !k->allocated)) { + if (unlikely(!btree_path_node(path, k->level))) { struct bch_fs *c = trans->c; + CLASS(printbuf, buf)(); + prt_str(&buf, "btree="); + bch2_btree_id_to_text(&buf, k->btree_id); + prt_printf(&buf, " level=%u ", k->level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k)); + if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)| BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) { - bch_err(c, "have key in journal replay for btree depth that does not exist, confused"); + bch_err(c, "have key in journal replay for btree depth that does not exist, confused\n%s", + buf.buf); ret = -EINVAL; } -#if 0 + + if (!k->allocated) { + bch_notice(c, "dropping key in journal replay for depth that does not exist because we're recovering from scan\n%s", + buf.buf); + k->overwritten = true; + goto out; + } + bch2_trans_iter_exit(trans, &iter); bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, 0, iter_flags); ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_btree_increase_depth(trans, iter.path, 0) ?: -BCH_ERR_transaction_restart_nested; -#endif - k->overwritten = true; goto out; } diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index c09ed2dd4639..6a039e011064 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -360,7 +360,7 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c, !(r->passes_complete & BIT_ULL(pass)); bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit; - if (!(in_recovery && (flags & RUN_RECOVERY_PASS_nopersistent))) { + if (!(flags & RUN_RECOVERY_PASS_nopersistent)) { struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); } diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index 71b735a85026..3e9f59226bdf 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -38,6 +38,7 @@ static int bch2_fsck_rename_dirent(struct btree_trans *trans, struct bkey_s_c_dirent old, bool *updated_before_k_pos) { + struct bch_fs *c = trans->c; struct qstr old_name = bch2_dirent_get_name(old); struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64)); int ret = PTR_ERR_OR_ZERO(new); @@ -60,7 +61,7 @@ static int bch2_fsck_rename_dirent(struct btree_trans *trans, sprintf(renamed_buf, "%.*s.fsck_renamed-%u", old_name.len, old_name.name, i)); - ret = bch2_dirent_init_name(new, hash_info, &renamed_name, NULL); + ret = bch2_dirent_init_name(c, new, hash_info, &renamed_name, NULL); if (ret) return ret; @@ -79,7 +80,7 @@ static int bch2_fsck_rename_dirent(struct btree_trans *trans, } ret = ret ?: bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); - bch_err_fn(trans->c, ret); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 79d51aef70aa..8979ac2d7a3b 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -48,9 +48,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) struct bch_hash_info info = { .inum_snapshot = bi->bi_snapshot, .type = INODE_STR_HASH(bi), -#ifdef CONFIG_UNICODE .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL, -#endif .siphash_key = { .k0 = bi->bi_hash_seed } }; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 69c097ff54e7..c46b1053a02c 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1025,15 +1025,17 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, } #ifdef CONFIG_UNICODE - /* Default encoding until we can potentially have more as an option. */ - c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); - if (IS_ERR(c->cf_encoding)) { - printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u", - unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); - ret = -EINVAL; - goto err; + if (bch2_fs_casefold_enabled(c)) { + /* Default encoding until we can potentially have more as an option. */ + c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); + if (IS_ERR(c->cf_encoding)) { + printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u", + unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); + ret = -EINVAL; + goto err; + } } #else if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) { @@ -1160,12 +1162,11 @@ int bch2_fs_start(struct bch_fs *c) print_mount_opts(c); -#ifdef CONFIG_UNICODE - bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", - unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); -#endif + if (c->cf_encoding) + bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", + unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); if (!bch2_fs_may_start(c)) return bch_err_throw(c, insufficient_devices_to_start); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 9de356bcb411..aa176cc9a324 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -83,6 +83,8 @@ enum btrfs_block_group_flags { BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, /* Does the block group need to be added to the free space tree? */ BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, + /* Set after we add a new block group to the free space tree. */ + BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, /* Indicate that the block group is placed on a sequential zone */ BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, /* diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index c7cc24a5dd5e..8c597fa60523 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1377,7 +1377,10 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info) { - WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root)); + struct btrfs_delayed_node *node = btrfs_first_delayed_node(fs_info->delayed_root); + + if (WARN_ON(node)) + refcount_dec(&node->refs); } static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1beb9458f622..0d6ad7512f21 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1835,6 +1835,8 @@ void btrfs_put_root(struct btrfs_root *root) if (refcount_dec_and_test(&root->refs)) { if (WARN_ON(!xa_empty(&root->inodes))) xa_destroy(&root->inodes); + if (WARN_ON(!xa_empty(&root->delayed_nodes))) + xa_destroy(&root->delayed_nodes); WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)); if (root->anon_dev) free_anon_bdev(root->anon_dev); @@ -2156,8 +2158,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, found = true; root = read_tree_root_path(tree_root, path, &key); if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) - ret = PTR_ERR(root); + ret = PTR_ERR(root); break; } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); @@ -4310,8 +4311,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) * * So wait for all ongoing ordered extents to complete and then run * delayed iputs. This works because once we reach this point no one - * can either create new ordered extents nor create delayed iputs - * through some other means. + * can create new ordered extents, but delayed iputs can still be added + * by a reclaim worker (see comments further below). * * Also note that btrfs_wait_ordered_roots() is not safe here, because * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent, @@ -4322,15 +4323,29 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_flush_workqueue(fs_info->endio_write_workers); /* Ordered extents for free space inodes. */ btrfs_flush_workqueue(fs_info->endio_freespace_worker); + /* + * Run delayed iputs in case an async reclaim worker is waiting for them + * to be run as mentioned above. + */ btrfs_run_delayed_iputs(fs_info); - /* There should be no more workload to generate new delayed iputs. */ - set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state); cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); cancel_work_sync(&fs_info->em_shrinker_work); + /* + * Run delayed iputs again because an async reclaim worker may have + * added new ones if it was flushing delalloc: + * + * shrink_delalloc() -> btrfs_start_delalloc_roots() -> + * start_delalloc_inodes() -> btrfs_add_delayed_iput() + */ + btrfs_run_delayed_iputs(fs_info); + + /* There should be no more workload to generate new delayed iputs. */ + set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state); + /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 849199768664..1dc931c4937f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4312,7 +4312,6 @@ static int try_release_subpage_extent_buffer(struct folio *folio) spin_unlock(&eb->refs_lock); continue; } - xa_unlock_irq(&fs_info->buffer_tree); /* * If tree ref isn't set then we know the ref on this eb is a @@ -4329,6 +4328,7 @@ static int try_release_subpage_extent_buffer(struct folio *folio) * check the folio private at the end. And * release_extent_buffer() will release the refs_lock. */ + xa_unlock_irq(&fs_info->buffer_tree); release_extent_buffer(eb); xa_lock_irq(&fs_info->buffer_tree); } diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 0c573d46639a..a83c268f7f87 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1115,11 +1115,21 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0); if (ret < 0) goto out_locked; - ASSERT(ret == 0); + /* + * If ret is 1 (no key found), it means this is an empty block group, + * without any extents allocated from it and there's no block group + * item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree + * because we are using the block group tree feature, so block group + * items are stored in the block group tree. It also means there are no + * extents allocated for block groups with a start offset beyond this + * block group's end offset (this is the last, highest, block group). + */ + if (!btrfs_fs_compat_ro(trans->fs_info, BLOCK_GROUP_TREE)) + ASSERT(ret == 0); start = block_group->start; end = block_group->start + block_group->length; - while (1) { + while (ret == 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.type == BTRFS_EXTENT_ITEM_KEY || @@ -1149,8 +1159,6 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = btrfs_next_item(extent_root, path); if (ret < 0) goto out_locked; - if (ret) - break; } if (start < end) { ret = __add_to_free_space_tree(trans, block_group, path2, @@ -1233,6 +1241,7 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, { BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; + struct rb_node *node; int nr; int ret; @@ -1261,6 +1270,16 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, btrfs_release_path(path); } + node = rb_first_cached(&trans->fs_info->block_group_cache_tree); + while (node) { + struct btrfs_block_group *bg; + + bg = rb_entry(node, struct btrfs_block_group, cache_node); + clear_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &bg->runtime_flags); + node = rb_next(node); + cond_resched(); + } + return 0; } @@ -1350,12 +1369,18 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) block_group = rb_entry(node, struct btrfs_block_group, cache_node); + + if (test_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, + &block_group->runtime_flags)) + goto next; + ret = populate_free_space_tree(trans, block_group); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } +next: if (btrfs_should_end_transaction(trans)) { btrfs_end_transaction(trans); trans = btrfs_start_transaction(free_space_root, 1); @@ -1382,6 +1407,29 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags); + /* + * While rebuilding the free space tree we may allocate new metadata + * block groups while modifying the free space tree. + * + * Because during the rebuild (at btrfs_rebuild_free_space_tree()) we + * can use multiple transactions, every time btrfs_end_transaction() is + * called at btrfs_rebuild_free_space_tree() we finish the creation of + * new block groups by calling btrfs_create_pending_block_groups(), and + * that in turn calls us, through add_block_group_free_space(), to add + * a free space info item and a free space extent item for the block + * group. + * + * Then later btrfs_rebuild_free_space_tree() may find such new block + * groups and processes them with populate_free_space_tree(), which can + * fail with EEXIST since there are already items for the block group in + * the free space tree. Notice that we say "may find" because a new + * block group may be added to the block groups rbtree in a node before + * or after the block group currently being processed by the rebuild + * process. So signal the rebuild process to skip such new block groups + * if it finds them. + */ + set_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &block_group->runtime_flags); + ret = add_new_free_space_info(trans, block_group, path); if (ret) return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c0c778243bf1..fc66872b4c74 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4250,9 +4250,9 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); if (ret) { - btrfs_info(fs_info, - "failed to delete reference to %.*s, inode %llu parent %llu", - name->len, name->name, ino, dir_ino); + btrfs_crit(fs_info, + "failed to delete reference to %.*s, root %llu inode %llu parent %llu", + name->len, name->name, btrfs_root_id(root), ino, dir_ino); btrfs_abort_transaction(trans, ret); goto err; } @@ -4710,7 +4710,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; int ret = 0; struct btrfs_trans_handle *trans; - u64 last_unlink_trans; struct fscrypt_name fname; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) @@ -4736,6 +4735,23 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) goto out_notrans; } + /* + * Propagate the last_unlink_trans value of the deleted dir to its + * parent directory. This is to prevent an unrecoverable log tree in the + * case we do something like this: + * 1) create dir foo + * 2) create snapshot under dir foo + * 3) delete the snapshot + * 4) rmdir foo + * 5) mkdir foo + * 6) fsync foo or some file inside foo + * + * This is because we can't unlink other roots when replaying the dir + * deletes for directory foo. + */ + if (BTRFS_I(inode)->last_unlink_trans >= trans->transid) + btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); + if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); goto out; @@ -4745,27 +4761,11 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (ret) goto out; - last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; - /* now the directory is empty */ ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), &fname.disk_name); - if (!ret) { + if (!ret) btrfs_i_size_write(BTRFS_I(inode), 0); - /* - * Propagate the last_unlink_trans value of the deleted dir to - * its parent directory. This is to prevent an unrecoverable - * log tree in the case we do something like this: - * 1) create dir foo - * 2) create snapshot under dir foo - * 3) delete the snapshot - * 4) rmdir foo - * 5) mkdir foo - * 6) fsync foo or some file inside foo - */ - if (last_unlink_trans >= trans->transid) - BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; - } out: btrfs_end_transaction(trans); out_notrans: @@ -8059,6 +8059,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, int ret; int ret2; bool need_abort = false; + bool logs_pinned = false; struct fscrypt_name old_fname, new_fname; struct fscrypt_str *old_name, *new_name; @@ -8182,6 +8183,31 @@ static int btrfs_rename_exchange(struct inode *old_dir, inode_inc_iversion(new_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && + new_ino != BTRFS_FIRST_FREE_OBJECTID) { + /* + * If we are renaming in the same directory (and it's not for + * root entries) pin the log early to prevent any concurrent + * task from logging the directory after we removed the old + * entries and before we add the new entries, otherwise that + * task can sync a log without any entry for the inodes we are + * renaming and therefore replaying that log, if a power failure + * happens after syncing the log, would result in deleting the + * inodes. + * + * If the rename affects two different directories, we want to + * make sure the that there's no log commit that contains + * updates for only one of the directories but not for the + * other. + * + * If we are renaming an entry for a root, we don't care about + * log updates since we called btrfs_set_log_full_commit(). + */ + btrfs_pin_log_trans(root); + btrfs_pin_log_trans(dest); + logs_pinned = true; + } + if (old_dentry->d_parent != new_dentry->d_parent) { btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); @@ -8253,30 +8279,23 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode)->dir_index = new_idx; /* - * Now pin the logs of the roots. We do it to ensure that no other task - * can sync the logs while we are in progress with the rename, because - * that could result in an inconsistency in case any of the inodes that - * are part of this rename operation were logged before. + * Do the log updates for all inodes. + * + * If either entry is for a root we don't need to update the logs since + * we've called btrfs_set_log_full_commit() before. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_pin_log_trans(root); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_pin_log_trans(dest); - - /* Do the log updates for all inodes. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + if (logs_pinned) { btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), old_rename_ctx.index, new_dentry->d_parent); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), new_rename_ctx.index, old_dentry->d_parent); + } - /* Now unpin the logs. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) +out_fail: + if (logs_pinned) { btrfs_end_log_trans(root); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_end_log_trans(dest); -out_fail: + } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: @@ -8326,6 +8345,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); struct fscrypt_name old_fname, new_fname; + bool logs_pinned = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -8460,6 +8480,29 @@ static int btrfs_rename(struct mnt_idmap *idmap, inode_inc_iversion(old_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + /* + * If we are renaming in the same directory (and it's not a + * root entry) pin the log to prevent any concurrent task from + * logging the directory after we removed the old entry and + * before we add the new entry, otherwise that task can sync + * a log without any entry for the inode we are renaming and + * therefore replaying that log, if a power failure happens + * after syncing the log, would result in deleting the inode. + * + * If the rename affects two different directories, we want to + * make sure the that there's no log commit that contains + * updates for only one of the directories but not for the + * other. + * + * If we are renaming an entry for a root, we don't care about + * log updates since we called btrfs_set_log_full_commit(). + */ + btrfs_pin_log_trans(root); + btrfs_pin_log_trans(dest); + logs_pinned = true; + } + if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); @@ -8524,7 +8567,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + if (logs_pinned) btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), rename_ctx.index, new_dentry->d_parent); @@ -8540,6 +8583,10 @@ static int btrfs_rename(struct mnt_idmap *idmap, } } out_fail: + if (logs_pinned) { + btrfs_end_log_trans(root); + btrfs_end_log_trans(dest); + } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 913acef3f0a9..8a60983a697c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -666,14 +666,14 @@ static noinline int create_subvol(struct mnt_idmap *idmap, goto out; } + btrfs_record_new_subvolume(trans, BTRFS_I(dir)); + ret = btrfs_create_new_inode(trans, &new_inode_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - btrfs_record_new_subvolume(trans, BTRFS_I(dir)); - d_instantiate_new(dentry, new_inode_args.inode); new_inode_args.inode = NULL; @@ -3139,7 +3139,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) return -EPERM; if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { - btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet"); + btrfs_err(fs_info, "scrub: extent tree v2 not yet supported"); return -EINVAL; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ce36fafc771e..7cd5e76a783c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -557,7 +557,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) btrfs_warn_in_rcu(fs_info, -"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)", +"scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu length %u links %u (path: %s)", swarn->errstr, swarn->logical, btrfs_dev_name(swarn->dev), swarn->physical, @@ -571,7 +571,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, err: btrfs_warn_in_rcu(fs_info, - "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", + "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu: path resolving failed with ret=%d", swarn->errstr, swarn->logical, btrfs_dev_name(swarn->dev), swarn->physical, @@ -596,7 +596,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * /* Super block error, no need to search extent tree. */ if (is_super) { - btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu", + btrfs_warn_in_rcu(fs_info, "scrub: %s on device %s, physical %llu", errstr, btrfs_dev_name(dev), physical); return; } @@ -631,14 +631,14 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * &ref_level); if (ret < 0) { btrfs_warn(fs_info, - "failed to resolve tree backref for logical %llu: %d", - swarn.logical, ret); + "scrub: failed to resolve tree backref for logical %llu: %d", + swarn.logical, ret); break; } if (ret > 0) break; btrfs_warn_in_rcu(fs_info, -"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", +"scrub: %s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", errstr, swarn.logical, btrfs_dev_name(dev), swarn.physical, (ref_level ? "node" : "leaf"), ref_level, ref_root); @@ -718,7 +718,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad bytenr, has %llu want %llu", + "scrub: tree block %llu mirror %u has bad bytenr, has %llu want %llu", logical, stripe->mirror_num, btrfs_stack_header_bytenr(header), logical); return; @@ -728,7 +728,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad fsid, has %pU want %pU", + "scrub: tree block %llu mirror %u has bad fsid, has %pU want %pU", logical, stripe->mirror_num, header->fsid, fs_info->fs_devices->fsid); return; @@ -738,7 +738,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", + "scrub: tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", logical, stripe->mirror_num, header->chunk_tree_uuid, fs_info->chunk_tree_uuid); return; @@ -760,7 +760,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, +"scrub: tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, logical, stripe->mirror_num, CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); @@ -771,7 +771,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad generation, has %llu want %llu", + "scrub: tree block %llu mirror %u has bad generation, has %llu want %llu", logical, stripe->mirror_num, btrfs_stack_header_generation(header), stripe->sectors[sector_nr].generation); @@ -814,7 +814,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) */ if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) { btrfs_warn_rl(fs_info, - "tree block at %llu crosses stripe boundary %llu", + "scrub: tree block at %llu crosses stripe boundary %llu", stripe->logical + (sector_nr << fs_info->sectorsize_bits), stripe->logical); @@ -1046,12 +1046,12 @@ skip: if (repaired) { if (dev) { btrfs_err_rl_in_rcu(fs_info, - "fixed up error at logical %llu on dev %s physical %llu", + "scrub: fixed up error at logical %llu on dev %s physical %llu", stripe->logical, btrfs_dev_name(dev), physical); } else { btrfs_err_rl_in_rcu(fs_info, - "fixed up error at logical %llu on mirror %u", + "scrub: fixed up error at logical %llu on mirror %u", stripe->logical, stripe->mirror_num); } continue; @@ -1060,12 +1060,12 @@ skip: /* The remaining are all for unrepaired. */ if (dev) { btrfs_err_rl_in_rcu(fs_info, - "unable to fixup (regular) error at logical %llu on dev %s physical %llu", +"scrub: unable to fixup (regular) error at logical %llu on dev %s physical %llu", stripe->logical, btrfs_dev_name(dev), physical); } else { btrfs_err_rl_in_rcu(fs_info, - "unable to fixup (regular) error at logical %llu on mirror %u", + "scrub: unable to fixup (regular) error at logical %llu on mirror %u", stripe->logical, stripe->mirror_num); } @@ -1593,8 +1593,7 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, physical, sctx->write_pointer); if (ret) - btrfs_err(fs_info, - "zoned: failed to recover write pointer"); + btrfs_err(fs_info, "scrub: zoned: failed to recover write pointer"); } mutex_unlock(&sctx->wr_lock); btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); @@ -1658,7 +1657,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, int ret; if (unlikely(!extent_root || !csum_root)) { - btrfs_err(fs_info, "no valid extent or csum root for scrub"); + btrfs_err(fs_info, "scrub: no valid extent or csum root found"); return -EUCLEAN; } memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * @@ -1907,7 +1906,7 @@ static bool stripe_has_metadata_error(struct scrub_stripe *stripe) struct btrfs_fs_info *fs_info = stripe->bg->fs_info; btrfs_err(fs_info, - "stripe %llu has unrepaired metadata sector at %llu", + "scrub: stripe %llu has unrepaired metadata sector at logical %llu", stripe->logical, stripe->logical + (i << fs_info->sectorsize_bits)); return true; @@ -2167,7 +2166,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, bitmap_and(&error, &error, &has_extent, stripe->nr_sectors); if (!bitmap_empty(&error, stripe->nr_sectors)) { btrfs_err(fs_info, -"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", +"scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", full_stripe_start, i, stripe->nr_sectors, &error); ret = -EIO; @@ -2789,14 +2788,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, ro_set = 0; } else if (ret == -ETXTBSY) { btrfs_warn(fs_info, - "skipping scrub of block group %llu due to active swapfile", + "scrub: skipping scrub of block group %llu due to active swapfile", cache->start); scrub_pause_off(fs_info); ret = 0; goto skip_unfreeze; } else { - btrfs_warn(fs_info, - "failed setting block group ro: %d", ret); + btrfs_warn(fs_info, "scrub: failed setting block group ro: %d", + ret); btrfs_unfreeze_block_group(cache); btrfs_put_block_group(cache); scrub_pause_off(fs_info); @@ -2892,13 +2891,13 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, ret = btrfs_check_super_csum(fs_info, sb); if (ret != 0) { btrfs_err_rl(fs_info, - "super block at physical %llu devid %llu has bad csum", + "scrub: super block at physical %llu devid %llu has bad csum", physical, dev->devid); return -EIO; } if (btrfs_super_generation(sb) != generation) { btrfs_err_rl(fs_info, -"super block at physical %llu devid %llu has bad generation %llu expect %llu", +"scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu", physical, dev->devid, btrfs_super_generation(sb), generation); return -EUCLEAN; @@ -3059,7 +3058,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_err_in_rcu(fs_info, - "scrub on devid %llu: filesystem on %s is not writable", + "scrub: devid %llu: filesystem on %s is not writable", devid, btrfs_dev_name(dev)); ret = -EROFS; goto out; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 97e933113b82..cea8a7e9d6d3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -143,6 +143,9 @@ static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *r unsigned int nofs_flag; struct btrfs_inode *inode; + /* Only meant to be called for subvolume roots and not for log roots. */ + ASSERT(is_fstree(btrfs_root_id(root))); + /* * We're holding a transaction handle whether we are logging or * replaying a log tree, so we must make sure NOFS semantics apply @@ -604,21 +607,6 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, return 0; } -/* - * simple helper to read an inode off the disk from a given root - * This can only be called for subvolume roots and not for the log - */ -static noinline struct btrfs_inode *read_one_inode(struct btrfs_root *root, - u64 objectid) -{ - struct btrfs_inode *inode; - - inode = btrfs_iget_logging(objectid, root); - if (IS_ERR(inode)) - return NULL; - return inode; -} - /* replays a single extent in 'eb' at 'slot' with 'key' into the * subvolume 'root'. path is released on entry and should be released * on exit. @@ -668,15 +656,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, extent_end = ALIGN(start + size, fs_info->sectorsize); } else { - ret = 0; - goto out; + btrfs_err(fs_info, + "unexpected extent type=%d root=%llu inode=%llu offset=%llu", + found_type, btrfs_root_id(root), key->objectid, key->offset); + return -EUCLEAN; } - inode = read_one_inode(root, key->objectid); - if (!inode) { - ret = -EIO; - goto out; - } + inode = btrfs_iget_logging(key->objectid, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); /* * first check to see if we already have this extent in the @@ -948,9 +936,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, btrfs_release_path(path); - inode = read_one_inode(root, location.objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(location.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; goto out; } @@ -961,7 +950,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, ret = unlink_inode_for_log_replay(trans, dir, inode, &name); out: kfree(name.name); - iput(&inode->vfs_inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -1072,7 +1062,9 @@ again: search_key.type = BTRFS_INODE_REF_KEY; search_key.offset = parent_objectid; ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); - if (ret == 0) { + if (ret < 0) { + return ret; + } else if (ret == 0) { struct btrfs_inode_ref *victim_ref; unsigned long ptr; unsigned long ptr_end; @@ -1145,13 +1137,13 @@ again: struct fscrypt_str victim_name; extref = (struct btrfs_inode_extref *)(base + cur_offset); + victim_name.len = btrfs_inode_extref_name_len(leaf, extref); if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) goto next; ret = read_alloc_one_name(leaf, &extref->name, - btrfs_inode_extref_name_len(leaf, extref), - &victim_name); + victim_name.len, &victim_name); if (ret) return ret; @@ -1166,18 +1158,18 @@ again: kfree(victim_name.name); return ret; } else if (!ret) { - ret = -ENOENT; - victim_parent = read_one_inode(root, - parent_objectid); - if (victim_parent) { + victim_parent = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(victim_parent)) { + ret = PTR_ERR(victim_parent); + } else { inc_nlink(&inode->vfs_inode); btrfs_release_path(path); ret = unlink_inode_for_log_replay(trans, victim_parent, inode, &victim_name); + iput(&victim_parent->vfs_inode); } - iput(&victim_parent->vfs_inode); kfree(victim_name.name); if (ret) return ret; @@ -1314,9 +1306,9 @@ again: struct btrfs_inode *dir; btrfs_release_path(path); - dir = read_one_inode(root, parent_id); - if (!dir) { - ret = -ENOENT; + dir = btrfs_iget_logging(parent_id, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); kfree(name.name); goto out; } @@ -1388,15 +1380,17 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * copy the back ref in. The link count fixup code will take * care of the rest */ - dir = read_one_inode(root, parent_objectid); - if (!dir) { - ret = -ENOENT; + dir = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + dir = NULL; goto out; } - inode = read_one_inode(root, inode_objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(inode_objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; goto out; } @@ -1408,11 +1402,13 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * parent object can change from one array * item to another. */ - if (!dir) - dir = read_one_inode(root, parent_objectid); if (!dir) { - ret = -ENOENT; - goto out; + dir = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + dir = NULL; + goto out; + } } } else { ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); @@ -1681,9 +1677,9 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, break; btrfs_release_path(path); - inode = read_one_inode(root, key.offset); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(key.offset, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); break; } @@ -1719,9 +1715,9 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_inode *inode; struct inode *vfs_inode; - inode = read_one_inode(root, objectid); - if (!inode) - return -EIO; + inode = btrfs_iget_logging(objectid, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); vfs_inode = &inode->vfs_inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; @@ -1760,14 +1756,14 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, struct btrfs_inode *dir; int ret; - inode = read_one_inode(root, location->objectid); - if (!inode) - return -ENOENT; + inode = btrfs_iget_logging(location->objectid, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); - dir = read_one_inode(root, dirid); - if (!dir) { + dir = btrfs_iget_logging(dirid, root); + if (IS_ERR(dir)) { iput(&inode->vfs_inode); - return -EIO; + return PTR_ERR(dir); } ret = btrfs_add_link(trans, dir, inode, name, 1, index); @@ -1844,9 +1840,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, bool update_size = true; bool name_added = false; - dir = read_one_inode(root, key->objectid); - if (!dir) - return -EIO; + dir = btrfs_iget_logging(key->objectid, root); + if (IS_ERR(dir)) + return PTR_ERR(dir); ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); if (ret) @@ -2146,9 +2142,10 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, btrfs_dir_item_key_to_cpu(eb, di, &location); btrfs_release_path(path); btrfs_release_path(log_path); - inode = read_one_inode(root, location.objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(location.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; goto out; } @@ -2300,14 +2297,17 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, if (!log_path) return -ENOMEM; - dir = read_one_inode(root, dirid); - /* it isn't an error if the inode isn't there, that can happen - * because we replay the deletes before we copy in the inode item - * from the log + dir = btrfs_iget_logging(dirid, root); + /* + * It isn't an error if the inode isn't there, that can happen because + * we replay the deletes before we copy in the inode item from the log. */ - if (!dir) { + if (IS_ERR(dir)) { btrfs_free_path(log_path); - return 0; + ret = PTR_ERR(dir); + if (ret == -ENOENT) + ret = 0; + return ret; } range_start = 0; @@ -2466,9 +2466,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct btrfs_inode *inode; u64 from; - inode = read_one_inode(root, key.objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(key.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); break; } from = ALIGN(i_size_read(&inode->vfs_inode), @@ -7447,6 +7447,8 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, * full log sync. * Also we don't need to worry with renames, since btrfs_rename() marks the log * for full commit when renaming a subvolume. + * + * Must be called before creating the subvolume entry in its parent directory. */ void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans, struct btrfs_inode *dir) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 89835071cfea..f475b4b7c457 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3282,6 +3282,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) device->bytes_used - dev_extent_len); atomic64_add(dev_extent_len, &fs_info->free_chunk_space); btrfs_clear_space_info_full(fs_info); + + if (list_empty(&device->post_commit_list)) { + list_add_tail(&device->post_commit_list, + &trans->transaction->dev_update_list); + } + mutex_unlock(&fs_info->chunk_mutex); } } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index b5b0156d5b95..9430b34d3cbb 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1403,7 +1403,8 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg, static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; @@ -1426,6 +1427,13 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, zone_info[1].physical); return -EIO; } + + if (zone_info[0].alloc_offset == WP_CONVENTIONAL) + zone_info[0].alloc_offset = last_alloc; + + if (zone_info[1].alloc_offset == WP_CONVENTIONAL) + zone_info[1].alloc_offset = last_alloc; + if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) { btrfs_err(bg->fs_info, "zoned: write pointer offset mismatch of zones in DUP profile"); @@ -1446,7 +1454,8 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; int i; @@ -1461,10 +1470,12 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); for (i = 0; i < map->num_stripes; i++) { - if (zone_info[i].alloc_offset == WP_MISSING_DEV || - zone_info[i].alloc_offset == WP_CONVENTIONAL) + if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) + zone_info[i].alloc_offset = last_alloc; + if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) && !btrfs_test_opt(fs_info, DEGRADED)) { btrfs_err(fs_info, @@ -1494,7 +1505,8 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; @@ -1505,10 +1517,29 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, } for (int i = 0; i < map->num_stripes; i++) { - if (zone_info[i].alloc_offset == WP_MISSING_DEV || - zone_info[i].alloc_offset == WP_CONVENTIONAL) + if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { + u64 stripe_nr, full_stripe_nr; + u64 stripe_offset; + int stripe_index; + + stripe_nr = div64_u64(last_alloc, map->stripe_size); + stripe_offset = stripe_nr * map->stripe_size; + full_stripe_nr = div_u64(stripe_nr, map->num_stripes); + div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); + + zone_info[i].alloc_offset = + full_stripe_nr * map->stripe_size; + + if (stripe_index > i) + zone_info[i].alloc_offset += map->stripe_size; + else if (stripe_index == i) + zone_info[i].alloc_offset += + (last_alloc - stripe_offset); + } + if (test_bit(0, active) != test_bit(i, active)) { if (!btrfs_zone_activate(bg)) return -EIO; @@ -1526,7 +1557,8 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; @@ -1537,8 +1569,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, } for (int i = 0; i < map->num_stripes; i++) { - if (zone_info[i].alloc_offset == WP_MISSING_DEV || - zone_info[i].alloc_offset == WP_CONVENTIONAL) + if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; if (test_bit(0, active) != test_bit(i, active)) { @@ -1549,6 +1580,29 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); } + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { + u64 stripe_nr, full_stripe_nr; + u64 stripe_offset; + int stripe_index; + + stripe_nr = div64_u64(last_alloc, map->stripe_size); + stripe_offset = stripe_nr * map->stripe_size; + full_stripe_nr = div_u64(stripe_nr, + map->num_stripes / map->sub_stripes); + div_u64_rem(stripe_nr, + (map->num_stripes / map->sub_stripes), + &stripe_index); + + zone_info[i].alloc_offset = + full_stripe_nr * map->stripe_size; + + if (stripe_index > (i / map->sub_stripes)) + zone_info[i].alloc_offset += map->stripe_size; + else if (stripe_index == (i / map->sub_stripes)) + zone_info[i].alloc_offset += + (last_alloc - stripe_offset); + } + if ((i % map->sub_stripes) == 0) { bg->zone_capacity += zone_info[i].capacity; bg->alloc_offset += zone_info[i].alloc_offset; @@ -1637,18 +1691,22 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) ret = btrfs_load_block_group_single(cache, &zone_info[0], active); break; case BTRFS_BLOCK_GROUP_DUP: - ret = btrfs_load_block_group_dup(cache, map, zone_info, active); + ret = btrfs_load_block_group_dup(cache, map, zone_info, active, + last_alloc); break; case BTRFS_BLOCK_GROUP_RAID1: case BTRFS_BLOCK_GROUP_RAID1C3: case BTRFS_BLOCK_GROUP_RAID1C4: - ret = btrfs_load_block_group_raid1(cache, map, zone_info, active); + ret = btrfs_load_block_group_raid1(cache, map, zone_info, + active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID0: - ret = btrfs_load_block_group_raid0(cache, map, zone_info, active); + ret = btrfs_load_block_group_raid0(cache, map, zone_info, + active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID10: - ret = btrfs_load_block_group_raid10(cache, map, zone_info, active); + ret = btrfs_load_block_group_raid10(cache, map, zone_info, + active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID5: case BTRFS_BLOCK_GROUP_RAID6: diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 6a329c329f43..16e4a6bd9b97 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -214,9 +214,11 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) /* * bit 30: I/O error occurred on this folio + * bit 29: CPU has dirty data in D-cache (needs aliasing handling); * bit 0 - 29: remaining parts to complete this folio */ -#define EROFS_ONLINEFOLIO_EIO (1 << 30) +#define EROFS_ONLINEFOLIO_EIO 30 +#define EROFS_ONLINEFOLIO_DIRTY 29 void erofs_onlinefolio_init(struct folio *folio) { @@ -233,19 +235,23 @@ void erofs_onlinefolio_split(struct folio *folio) atomic_inc((atomic_t *)&folio->private); } -void erofs_onlinefolio_end(struct folio *folio, int err) +void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty) { int orig, v; do { orig = atomic_read((atomic_t *)&folio->private); - v = (orig - 1) | (err ? EROFS_ONLINEFOLIO_EIO : 0); + DBG_BUGON(orig <= 0); + v = dirty << EROFS_ONLINEFOLIO_DIRTY; + v |= (orig - 1) | (!!err << EROFS_ONLINEFOLIO_EIO); } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig); - if (v & ~EROFS_ONLINEFOLIO_EIO) + if (v & (BIT(EROFS_ONLINEFOLIO_DIRTY) - 1)) return; folio->private = 0; - folio_end_read(folio, !(v & EROFS_ONLINEFOLIO_EIO)); + if (v & BIT(EROFS_ONLINEFOLIO_DIRTY)) + flush_dcache_folio(folio); + folio_end_read(folio, !(v & BIT(EROFS_ONLINEFOLIO_EIO))); } static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, @@ -351,11 +357,16 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, */ static int erofs_read_folio(struct file *file, struct folio *folio) { + trace_erofs_read_folio(folio, true); + return iomap_read_folio(folio, &erofs_iomap_ops); } static void erofs_readahead(struct readahead_control *rac) { + trace_erofs_readahead(rac->mapping->host, readahead_index(rac), + readahead_count(rac), true); + return iomap_readahead(rac, &erofs_iomap_ops); } diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index bf62e2836b60..358061d7b660 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -301,13 +301,11 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, cur = min(cur, rq->outputsize); if (cur && rq->out[0]) { kin = kmap_local_page(rq->in[nrpages_in - 1]); - if (rq->out[0] == rq->in[nrpages_in - 1]) { + if (rq->out[0] == rq->in[nrpages_in - 1]) memmove(kin + rq->pageofs_out, kin + pi, cur); - flush_dcache_page(rq->out[0]); - } else { + else memcpy_to_page(rq->out[0], rq->pageofs_out, kin + pi, cur); - } kunmap_local(kin); } rq->outputsize -= cur; @@ -325,14 +323,12 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, po = (rq->pageofs_out + cur + pi) & ~PAGE_MASK; DBG_BUGON(no >= nrpages_out); cnt = min(insz - pi, PAGE_SIZE - po); - if (rq->out[no] == rq->in[ni]) { + if (rq->out[no] == rq->in[ni]) memmove(kin + po, kin + rq->pageofs_in + pi, cnt); - flush_dcache_page(rq->out[no]); - } else if (rq->out[no]) { + else if (rq->out[no]) memcpy_to_page(rq->out[no], po, kin + rq->pageofs_in + pi, cnt); - } pi += cnt; } while (pi < insz); kunmap_local(kin); diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 2fae209d0274..3e4b38bec0aa 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -58,6 +58,11 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) struct erofs_dirent *de; unsigned int nameoff, maxsize; + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + break; + } + de = erofs_bread(&buf, dbstart, true); if (IS_ERR(de)) { erofs_err(sb, "failed to readdir of logical block %llu of nid %llu", @@ -88,6 +93,7 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) break; ctx->pos = dbstart + maxsize; ofs = 0; + cond_resched(); } erofs_put_metabuf(&buf); if (EROFS_I(dir)->dot_omitted && ctx->pos == dir->i_size) { diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index 7d81f504bff0..91781718199e 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -38,7 +38,7 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret) } else { bio_for_each_folio_all(fi, &rq->bio) { DBG_BUGON(folio_test_uptodate(fi.folio)); - erofs_onlinefolio_end(fi.folio, ret); + erofs_onlinefolio_end(fi.folio, ret, false); } } bio_uninit(&rq->bio); @@ -47,6 +47,7 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret) static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) { + const struct cred *old_cred; struct iov_iter iter; int ret; @@ -60,7 +61,9 @@ static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) rq->iocb.ki_flags = IOCB_DIRECT; iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt, rq->bio.bi_iter.bi_size); + old_cred = override_creds(rq->iocb.ki_filp->f_cred); ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); + revert_creds(old_cred); if (ret != -EIOCBQUEUED) erofs_fileio_ki_complete(&rq->iocb, ret); } @@ -93,8 +96,6 @@ static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio) struct erofs_map_blocks *map = &io->map; unsigned int cur = 0, end = folio_size(folio), len, attached = 0; loff_t pos = folio_pos(folio), ofs; - struct iov_iter iter; - struct bio_vec bv; int err = 0; erofs_onlinefolio_init(folio); @@ -119,13 +120,7 @@ static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio) err = PTR_ERR(src); break; } - bvec_set_folio(&bv, folio, len, cur); - iov_iter_bvec(&iter, ITER_DEST, &bv, 1, len); - if (copy_to_iter(src, len, &iter) != len) { - erofs_put_metabuf(&buf); - err = -EIO; - break; - } + memcpy_to_folio(folio, cur, src, len); erofs_put_metabuf(&buf); } else if (!(map->m_flags & EROFS_MAP_MAPPED)) { folio_zero_segment(folio, cur, cur + len); @@ -159,7 +154,7 @@ io_retry: } cur += len; } - erofs_onlinefolio_end(folio, err); + erofs_onlinefolio_end(folio, err, false); return err; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index a32c03a80c70..06b867d2fc3b 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -315,10 +315,12 @@ static inline struct folio *erofs_grab_folio_nowait(struct address_space *as, /* The length of extent is full */ #define EROFS_MAP_FULL_MAPPED 0x0008 /* Located in the special packed inode */ -#define EROFS_MAP_FRAGMENT 0x0010 +#define __EROFS_MAP_FRAGMENT 0x0010 /* The extent refers to partial decompressed data */ #define EROFS_MAP_PARTIAL_REF 0x0020 +#define EROFS_MAP_FRAGMENT (EROFS_MAP_MAPPED | __EROFS_MAP_FRAGMENT) + struct erofs_map_blocks { struct erofs_buf buf; @@ -390,7 +392,7 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map); void erofs_onlinefolio_init(struct folio *folio); void erofs_onlinefolio_split(struct folio *folio); -void erofs_onlinefolio_end(struct folio *folio, int err); +void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty); struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid); int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index fe8071844724..e3f28a1bb945 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1034,7 +1034,7 @@ static int z_erofs_scan_folio(struct z_erofs_frontend *f, if (!(map->m_flags & EROFS_MAP_MAPPED)) { folio_zero_segment(folio, cur, end); tight = false; - } else if (map->m_flags & EROFS_MAP_FRAGMENT) { + } else if (map->m_flags & __EROFS_MAP_FRAGMENT) { erofs_off_t fpos = offset + cur - map->m_la; err = z_erofs_read_fragment(inode->i_sb, folio, cur, @@ -1091,7 +1091,7 @@ static int z_erofs_scan_folio(struct z_erofs_frontend *f, tight = (bs == PAGE_SIZE); } } while ((end = cur) > 0); - erofs_onlinefolio_end(folio, err); + erofs_onlinefolio_end(folio, err, false); return err; } @@ -1196,7 +1196,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_backend *be, int err) cur += len; } kunmap_local(dst); - erofs_onlinefolio_end(page_folio(bvi->bvec.page), err); + erofs_onlinefolio_end(page_folio(bvi->bvec.page), err, true); list_del(p); kfree(bvi); } @@ -1355,7 +1355,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err) DBG_BUGON(z_erofs_page_is_invalidated(page)); if (!z_erofs_is_shortlived_page(page)) { - erofs_onlinefolio_end(page_folio(page), err); + erofs_onlinefolio_end(page_folio(page), err, true); continue; } if (pcl->algorithmformat != Z_EROFS_COMPRESSION_LZ4) { diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 14ea47f954f5..f1a15ff22147 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -413,8 +413,7 @@ static int z_erofs_map_blocks_fo(struct inode *inode, !vi->z_tailextent_headlcn) { map->m_la = 0; map->m_llen = inode->i_size; - map->m_flags = EROFS_MAP_MAPPED | - EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT; + map->m_flags = EROFS_MAP_FRAGMENT; return 0; } initial_lcn = ofs >> lclusterbits; @@ -489,7 +488,7 @@ static int z_erofs_map_blocks_fo(struct inode *inode, goto unmap_out; } } else if (fragment && m.lcn == vi->z_tailextent_headlcn) { - map->m_flags |= EROFS_MAP_FRAGMENT; + map->m_flags = EROFS_MAP_FRAGMENT; } else { map->m_pa = erofs_pos(sb, m.pblk); err = z_erofs_get_extent_compressedlen(&m, initial_lcn); @@ -597,6 +596,10 @@ static int z_erofs_map_blocks_ext(struct inode *inode, if (la > map->m_la) { r = mid; + if (la > lend) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } lend = la; } else { l = mid + 1; @@ -613,7 +616,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode, if (lstart < lend) { map->m_la = lstart; if (last && (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) { - map->m_flags |= EROFS_MAP_MAPPED | EROFS_MAP_FRAGMENT; + map->m_flags = EROFS_MAP_FRAGMENT; vi->z_fragmentoff = map->m_plen; if (recsz > offsetof(struct z_erofs_extent, pstart_lo)) vi->z_fragmentoff |= map->m_pa << 32; @@ -635,12 +638,6 @@ static int z_erofs_map_blocks_ext(struct inode *inode, } } map->m_llen = lend - map->m_la; - if (!last && map->m_llen < sb->s_blocksize) { - erofs_err(sb, "extent too small %llu @ offset %llu of nid %llu", - map->m_llen, map->m_la, vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; - } return 0; } @@ -799,7 +796,7 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset, iomap->length = map.m_llen; if (map.m_flags & EROFS_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; - iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ? + iomap->addr = map.m_flags & __EROFS_MAP_FRAGMENT ? IOMAP_NULL_ADDR : map.m_pa; } else { iomap->type = IOMAP_HOLE; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index d4dbffdedd08..0fbf5dfedb24 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -883,7 +883,7 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) kfree_rcu(epi, rcu); percpu_counter_dec(&ep->user->epoll_watches); - return ep_refcount_dec_and_test(ep); + return true; } /* @@ -891,14 +891,14 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) */ static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) { - WARN_ON_ONCE(__ep_remove(ep, epi, false)); + if (__ep_remove(ep, epi, false)) + WARN_ON_ONCE(ep_refcount_dec_and_test(ep)); } static void ep_clear_and_put(struct eventpoll *ep) { struct rb_node *rbp, *next; struct epitem *epi; - bool dispose; /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) @@ -931,10 +931,8 @@ static void ep_clear_and_put(struct eventpoll *ep) cond_resched(); } - dispose = ep_refcount_dec_and_test(ep); mutex_unlock(&ep->mtx); - - if (dispose) + if (ep_refcount_dec_and_test(ep)) ep_free(ep); } @@ -1137,7 +1135,7 @@ again: dispose = __ep_remove(ep, epi, true); mutex_unlock(&ep->mtx); - if (dispose) + if (dispose && ep_refcount_dec_and_test(ep)) ep_free(ep); goto again; } diff --git a/fs/exec.c b/fs/exec.c index 1f5fdd2e096e..ba400aafd640 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -114,6 +114,9 @@ static inline void put_binfmt(struct linux_binfmt * fmt) bool path_noexec(const struct path *path) { + /* If it's an anonymous inode make sure that we catch any shenanigans. */ + VFS_WARN_ON_ONCE(IS_ANON_FILE(d_inode(path->dentry)) && + !(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC)); return (path->mnt->mnt_flags & MNT_NOEXEC) || (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC); } @@ -781,13 +784,15 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) if (IS_ERR(file)) return file; + if (path_noexec(&file->f_path)) + return ERR_PTR(-EACCES); + /* * In the past the regular type check was here. It moved to may_open() in * 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is * an invariant that all non-regular files error out before we get here. */ - if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || - path_noexec(&file->f_path)) + if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode))) return ERR_PTR(-EACCES); err = exe_file_deny_write_access(file); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6bd3de64f2a8..696131e655ed 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -35,6 +35,17 @@ #include <trace/events/f2fs.h> #include <uapi/linux/f2fs.h> +static void f2fs_zero_post_eof_page(struct inode *inode, loff_t new_size) +{ + loff_t old_size = i_size_read(inode); + + if (old_size >= new_size) + return; + + /* zero or drop pages only in range of [old_size, new_size] */ + truncate_pagecache(inode, old_size); +} + static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); @@ -103,8 +114,13 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); + filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT); + filemap_invalidate_unlock(inode->i_mapping); + file_update_time(vmf->vma->vm_file); filemap_invalidate_lock_shared(inode->i_mapping); + folio_lock(folio); if (unlikely(folio->mapping != inode->i_mapping || folio_pos(folio) > i_size_read(inode) || @@ -1109,6 +1125,8 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, f2fs_down_write(&fi->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); + if (attr->ia_size > old_size) + f2fs_zero_post_eof_page(inode, attr->ia_size); truncate_setsize(inode, attr->ia_size); if (attr->ia_size <= old_size) @@ -1227,6 +1245,10 @@ static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; + filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len); + filemap_invalidate_unlock(inode->i_mapping); + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1510,6 +1532,8 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len); + f2fs_lock_op(sbi); f2fs_drop_extent_tree(inode); truncate_pagecache(inode, offset); @@ -1631,6 +1655,10 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; + filemap_invalidate_lock(mapping); + f2fs_zero_post_eof_page(inode, offset + len); + filemap_invalidate_unlock(mapping); + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1762,6 +1790,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) /* avoid gc operation during block exchange */ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(mapping); + + f2fs_zero_post_eof_page(inode, offset + len); truncate_pagecache(inode, offset); while (!ret && idx > pg_start) { @@ -1819,6 +1849,10 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset, if (err) return err; + filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len); + filemap_invalidate_unlock(inode->i_mapping); + f2fs_balance_fs(sbi, true); pg_start = ((unsigned long long)offset) >> PAGE_SHIFT; @@ -4860,6 +4894,10 @@ static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from) err = file_modified(file); if (err) return err; + + filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, iocb->ki_pos + iov_iter_count(from)); + filemap_invalidate_unlock(inode->i_mapping); return count; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1cb4cba7f961..bfe104db284e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2078,7 +2078,6 @@ write_node: if (!__write_node_folio(folio, false, &submitted, wbc, do_balance, io_type, NULL)) { - folio_unlock(folio); folio_batch_release(&fbatch); ret = -EIO; goto out; diff --git a/fs/file.c b/fs/file.c index 3a3146664cf3..b6db031545e6 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1198,8 +1198,12 @@ bool file_seek_cur_needs_f_lock(struct file *file) if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared) return false; - VFS_WARN_ON_ONCE((file_count(file) > 1) && - !mutex_is_locked(&file->f_pos_lock)); + /* + * Note that we are not guaranteed to be called after fdget_pos() on + * this file obj, in which case the caller is expected to provide the + * appropriate locking. + */ + return true; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f102afc03359..47006d0753f1 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1147,7 +1147,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, struct address_space *mapping, struct iov_iter *ii, loff_t pos, - unsigned int max_pages) + unsigned int max_folios) { struct fuse_args_pages *ap = &ia->ap; struct fuse_conn *fc = get_fuse_conn(mapping->host); @@ -1157,12 +1157,11 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, int err = 0; num = min(iov_iter_count(ii), fc->max_write); - num = min(num, max_pages << PAGE_SHIFT); ap->args.in_pages = true; ap->descs[0].offset = offset; - while (num) { + while (num && ap->num_folios < max_folios) { size_t tmp; struct folio *folio; pgoff_t index = pos >> PAGE_SHIFT; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index bfe8d8af46f3..9572bdef49ee 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -9,6 +9,7 @@ #include "fuse_i.h" #include "dev_uring_i.h" +#include <linux/dax.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/file.h> @@ -162,6 +163,9 @@ static void fuse_evict_inode(struct inode *inode) /* Will write inode on close/munmap and in all other dirtiers */ WARN_ON(inode->i_state & I_DIRTY_INODE); + if (FUSE_IS_DAX(inode)) + dax_break_layout_final(inode); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_sb->s_flags & SB_ACTIVE) { diff --git a/fs/libfs.c b/fs/libfs.c index 9ea0ecc325a8..6f487fc6be34 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1649,12 +1649,10 @@ struct inode *alloc_anon_inode(struct super_block *s) */ inode->i_state = I_DIRTY; /* - * Historically anonymous inodes didn't have a type at all and - * userspace has come to rely on this. Internally they're just - * regular files but S_IFREG is masked off when reporting - * information to userspace. + * Historically anonymous inodes don't have a type at all and + * userspace has come to rely on this. */ - inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; + inode->i_mode = S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_flags |= S_PRIVATE | S_ANON_INODE; diff --git a/fs/namei.c b/fs/namei.c index 4bb889fc980b..c26a7ee42184 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2917,7 +2917,8 @@ static int lookup_one_common(struct mnt_idmap *idmap, * @base: base directory to lookup from * * Look up a dentry by name in the dcache, returning NULL if it does not - * currently exist. The function does not try to create a dentry. + * currently exist. The function does not try to create a dentry and if one + * is found it doesn't try to revalidate it. * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. It does no permission checking. @@ -2933,7 +2934,7 @@ struct dentry *try_lookup_noperm(struct qstr *name, struct dentry *base) if (err) return ERR_PTR(err); - return lookup_dcache(name, base, 0); + return d_lookup(base, name); } EXPORT_SYMBOL(try_lookup_noperm); @@ -3057,14 +3058,22 @@ EXPORT_SYMBOL(lookup_one_positive_unlocked); * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. It does no permission checking. * - * Unlike lookup_noperm, it should be called without the parent + * Unlike lookup_noperm(), it should be called without the parent * i_rwsem held, and will take the i_rwsem itself if necessary. + * + * Unlike try_lookup_noperm() it *does* revalidate the dentry if it already + * existed. */ struct dentry *lookup_noperm_unlocked(struct qstr *name, struct dentry *base) { struct dentry *ret; + int err; - ret = try_lookup_noperm(name, base); + err = lookup_noperm_common(name, base); + if (err) + return ERR_PTR(err); + + ret = lookup_dcache(name, base, 0); if (!ret) ret = lookup_slow(name, base, 0); return ret; @@ -3471,7 +3480,7 @@ static int may_open(struct mnt_idmap *idmap, const struct path *path, return -EACCES; break; default: - VFS_BUG_ON_INODE(1, inode); + VFS_BUG_ON_INODE(!IS_ANON_FILE(inode), inode); } error = inode_permission(idmap, inode, MAY_OPEN | acc_mode); diff --git a/fs/namespace.c b/fs/namespace.c index e13d9ab4f564..54c59e091919 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2310,21 +2310,62 @@ out: return dst_mnt; } -/* Caller should check returned pointer for errors */ +static inline bool extend_array(struct path **res, struct path **to_free, + unsigned n, unsigned *count, unsigned new_count) +{ + struct path *p; -struct vfsmount *collect_mounts(const struct path *path) + if (likely(n < *count)) + return true; + p = kmalloc_array(new_count, sizeof(struct path), GFP_KERNEL); + if (p && *count) + memcpy(p, *res, *count * sizeof(struct path)); + *count = new_count; + kfree(*to_free); + *to_free = *res = p; + return p; +} + +struct path *collect_paths(const struct path *path, + struct path *prealloc, unsigned count) { - struct mount *tree; - namespace_lock(); - if (!check_mnt(real_mount(path->mnt))) - tree = ERR_PTR(-EINVAL); - else - tree = copy_tree(real_mount(path->mnt), path->dentry, - CL_COPY_ALL | CL_PRIVATE); - namespace_unlock(); - if (IS_ERR(tree)) - return ERR_CAST(tree); - return &tree->mnt; + struct mount *root = real_mount(path->mnt); + struct mount *child; + struct path *res = prealloc, *to_free = NULL; + unsigned n = 0; + + guard(rwsem_read)(&namespace_sem); + + if (!check_mnt(root)) + return ERR_PTR(-EINVAL); + if (!extend_array(&res, &to_free, 0, &count, 32)) + return ERR_PTR(-ENOMEM); + res[n++] = *path; + list_for_each_entry(child, &root->mnt_mounts, mnt_child) { + if (!is_subdir(child->mnt_mountpoint, path->dentry)) + continue; + for (struct mount *m = child; m; m = next_mnt(m, child)) { + if (!extend_array(&res, &to_free, n, &count, 2 * count)) + return ERR_PTR(-ENOMEM); + res[n].mnt = &m->mnt; + res[n].dentry = m->mnt.mnt_root; + n++; + } + } + if (!extend_array(&res, &to_free, n, &count, count + 1)) + return ERR_PTR(-ENOMEM); + memset(res + n, 0, (count - n) * sizeof(struct path)); + for (struct path *p = res; p->mnt; p++) + path_get(p); + return res; +} + +void drop_collected_paths(struct path *paths, struct path *prealloc) +{ + for (struct path *p = paths; p->mnt; p++) + path_put(p); + if (paths != prealloc) + kfree(paths); } static void free_mnt_ns(struct mnt_namespace *); @@ -2401,15 +2442,6 @@ void dissolve_on_fput(struct vfsmount *mnt) free_mnt_ns(ns); } -void drop_collected_mounts(struct vfsmount *mnt) -{ - namespace_lock(); - lock_mount_hash(); - umount_tree(real_mount(mnt), 0); - unlock_mount_hash(); - namespace_unlock(); -} - static bool __has_locked_children(struct mount *mnt, struct dentry *dentry) { struct mount *child; @@ -2511,21 +2543,6 @@ struct vfsmount *clone_private_mount(const struct path *path) } EXPORT_SYMBOL_GPL(clone_private_mount); -int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, - struct vfsmount *root) -{ - struct mount *mnt; - int res = f(root, arg); - if (res) - return res; - list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { - res = f(&mnt->mnt, arg); - if (res) - return res; - } - return 0; -} - static void lock_mnt_tree(struct mount *mnt) { struct mount *p; @@ -2751,14 +2768,14 @@ static int attach_recursive_mnt(struct mount *source_mnt, hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { struct mount *q; hlist_del_init(&child->mnt_hash); - q = __lookup_mnt(&child->mnt_parent->mnt, - child->mnt_mountpoint); - if (q) - mnt_change_mountpoint(child, smp, q); /* Notice when we are propagating across user namespaces */ if (child->mnt_parent->mnt_ns->user_ns != user_ns) lock_mnt_tree(child); child->mnt.mnt_flags &= ~MNT_LOCKED; + q = __lookup_mnt(&child->mnt_parent->mnt, + child->mnt_mountpoint); + if (q) + mnt_change_mountpoint(child, smp, q); commit_tree(child); } put_mountpoint(smp); @@ -5290,16 +5307,12 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, kattr.kflags |= MOUNT_KATTR_RECURSE; ret = wants_mount_setattr(uattr, usize, &kattr); - if (ret < 0) - return ret; - - if (ret) { + if (ret > 0) { ret = do_mount_setattr(&file->f_path, &kattr); - if (ret) - return ret; - finish_mount_kattr(&kattr); } + if (ret) + return ret; } fd = get_unused_fd_flags(flags & O_CLOEXEC); @@ -6262,7 +6275,11 @@ void put_mnt_ns(struct mnt_namespace *ns) { if (!refcount_dec_and_test(&ns->ns.count)) return; - drop_collected_mounts(&ns->root->mnt); + namespace_lock(); + lock_mount_hash(); + umount_tree(ns->root, 0); + unlock_mount_hash(); + namespace_unlock(); free_mnt_ns(ns); } diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 72a3e6db2524..f27ea5099a68 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -53,30 +53,40 @@ static struct folio *netfs_grab_folio_for_write(struct address_space *mapping, * data written into the pagecache until we can find out from the server what * the values actually are. */ -static void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode, - loff_t i_size, loff_t pos, size_t copied) +void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode, + loff_t pos, size_t copied) { + loff_t i_size, end = pos + copied; blkcnt_t add; size_t gap; + if (end <= i_size_read(inode)) + return; + if (ctx->ops->update_i_size) { - ctx->ops->update_i_size(inode, pos); + ctx->ops->update_i_size(inode, end); return; } - i_size_write(inode, pos); + spin_lock(&inode->i_lock); + + i_size = i_size_read(inode); + if (end > i_size) { + i_size_write(inode, end); #if IS_ENABLED(CONFIG_FSCACHE) - fscache_update_cookie(ctx->cache, NULL, &pos); + fscache_update_cookie(ctx->cache, NULL, &end); #endif - gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1)); - if (copied > gap) { - add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE); + gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1)); + if (copied > gap) { + add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE); - inode->i_blocks = min_t(blkcnt_t, - DIV_ROUND_UP(pos, SECTOR_SIZE), - inode->i_blocks + add); + inode->i_blocks = min_t(blkcnt_t, + DIV_ROUND_UP(end, SECTOR_SIZE), + inode->i_blocks + add); + } } + spin_unlock(&inode->i_lock); } /** @@ -111,7 +121,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, struct folio *folio = NULL, *writethrough = NULL; unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0; ssize_t written = 0, ret, ret2; - loff_t i_size, pos = iocb->ki_pos; + loff_t pos = iocb->ki_pos; size_t max_chunk = mapping_max_folio_size(mapping); bool maybe_trouble = false; @@ -344,10 +354,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, flush_dcache_folio(folio); /* Update the inode size if we moved the EOF marker */ + netfs_update_i_size(ctx, inode, pos, copied); pos += copied; - i_size = i_size_read(inode); - if (pos > i_size) - netfs_update_i_size(ctx, inode, i_size, pos, copied); written += copied; if (likely(!wreq)) { diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index fa9a5bf3c6d5..a16660ab7f83 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -9,20 +9,6 @@ #include <linux/uio.h> #include "internal.h" -static void netfs_cleanup_dio_write(struct netfs_io_request *wreq) -{ - struct inode *inode = wreq->inode; - unsigned long long end = wreq->start + wreq->transferred; - - if (!wreq->error && - i_size_read(inode) < end) { - if (wreq->netfs_ops->update_i_size) - wreq->netfs_ops->update_i_size(inode, end); - else - i_size_write(inode, end); - } -} - /* * Perform an unbuffered write where we may have to do an RMW operation on an * encrypted file. This can also be used for direct I/O writes. @@ -98,7 +84,6 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * if (async) wreq->iocb = iocb; wreq->len = iov_iter_count(&wreq->buffer.iter); - wreq->cleanup = netfs_cleanup_dio_write; ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len); if (ret < 0) { _debug("begin = %zd", ret); @@ -106,7 +91,6 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * } if (!async) { - trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip); ret = netfs_wait_for_write(wreq); if (ret > 0) iocb->ki_pos += ret; diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index e2ee9183392b..d4f16fefd965 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -28,6 +28,12 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, size_t offset, size_t len); /* + * buffered_write.c + */ +void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode, + loff_t pos, size_t copied); + +/* * main.c */ extern unsigned int netfs_debug; @@ -267,14 +273,32 @@ static inline void netfs_wake_rreq_flag(struct netfs_io_request *rreq, enum netfs_rreq_trace trace) { if (test_bit(rreq_flag, &rreq->flags)) { - trace_netfs_rreq(rreq, trace); clear_bit_unlock(rreq_flag, &rreq->flags); smp_mb__after_atomic(); /* Set flag before task state */ + trace_netfs_rreq(rreq, trace); wake_up(&rreq->waitq); } } /* + * Test the NETFS_RREQ_IN_PROGRESS flag, inserting an appropriate barrier. + */ +static inline bool netfs_check_rreq_in_progress(const struct netfs_io_request *rreq) +{ + /* Order read of flags before read of anything else, such as error. */ + return test_bit_acquire(NETFS_RREQ_IN_PROGRESS, &rreq->flags); +} + +/* + * Test the NETFS_SREQ_IN_PROGRESS flag, inserting an appropriate barrier. + */ +static inline bool netfs_check_subreq_in_progress(const struct netfs_io_subrequest *subreq) +{ + /* Order read of flags before read of anything else, such as error. */ + return test_bit_acquire(NETFS_SREQ_IN_PROGRESS, &subreq->flags); +} + +/* * fscache-cache.c */ #ifdef CONFIG_PROC_FS diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 3db401d269e7..73da6c9f5777 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -58,15 +58,15 @@ static int netfs_requests_seq_show(struct seq_file *m, void *v) if (v == &netfs_io_requests) { seq_puts(m, - "REQUEST OR REF FL ERR OPS COVERAGE\n" - "======== == === == ==== === =========\n" + "REQUEST OR REF FLAG ERR OPS COVERAGE\n" + "======== == === ==== ==== === =========\n" ); return 0; } rreq = list_entry(v, struct netfs_io_request, proc_link); seq_printf(m, - "%08x %s %3d %2lx %4ld %3d @%04llx %llx/%llx", + "%08x %s %3d %4lx %4ld %3d @%04llx %llx/%llx", rreq->debug_id, netfs_origins[rreq->origin], refcount_read(&rreq->ref), diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 43b67a28a8fa..20748bcfbf59 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -356,22 +356,22 @@ void netfs_wait_for_in_progress_stream(struct netfs_io_request *rreq, DEFINE_WAIT(myself); list_for_each_entry(subreq, &stream->subrequests, rreq_link) { - if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) + if (!netfs_check_subreq_in_progress(subreq)) continue; - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_quiesce); for (;;) { prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); - if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) + if (!netfs_check_subreq_in_progress(subreq)) break; trace_netfs_sreq(subreq, netfs_sreq_trace_wait_for); schedule(); - trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); } } + trace_netfs_rreq(rreq, netfs_rreq_trace_waited_quiesce); finish_wait(&rreq->waitq, &myself); } @@ -381,7 +381,12 @@ void netfs_wait_for_in_progress_stream(struct netfs_io_request *rreq, static int netfs_collect_in_app(struct netfs_io_request *rreq, bool (*collector)(struct netfs_io_request *rreq)) { - bool need_collect = false, inactive = true; + bool need_collect = false, inactive = true, done = true; + + if (!netfs_check_rreq_in_progress(rreq)) { + trace_netfs_rreq(rreq, netfs_rreq_trace_recollect); + return 1; /* Done */ + } for (int i = 0; i < NR_IO_STREAMS; i++) { struct netfs_io_subrequest *subreq; @@ -395,14 +400,16 @@ static int netfs_collect_in_app(struct netfs_io_request *rreq, struct netfs_io_subrequest, rreq_link); if (subreq && - (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) || + (!netfs_check_subreq_in_progress(subreq) || test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) { need_collect = true; break; } + if (subreq || !test_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags)) + done = false; } - if (!need_collect && !inactive) + if (!need_collect && !inactive && !done) return 0; /* Sleep */ __set_current_state(TASK_RUNNING); @@ -423,14 +430,13 @@ static int netfs_collect_in_app(struct netfs_io_request *rreq, /* * Wait for a request to complete, successfully or otherwise. */ -static ssize_t netfs_wait_for_request(struct netfs_io_request *rreq, - bool (*collector)(struct netfs_io_request *rreq)) +static ssize_t netfs_wait_for_in_progress(struct netfs_io_request *rreq, + bool (*collector)(struct netfs_io_request *rreq)) { DEFINE_WAIT(myself); ssize_t ret; for (;;) { - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { @@ -440,18 +446,22 @@ static ssize_t netfs_wait_for_request(struct netfs_io_request *rreq, case 1: goto all_collected; case 2: + if (!netfs_check_rreq_in_progress(rreq)) + break; + cond_resched(); continue; } } - if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) + if (!netfs_check_rreq_in_progress(rreq)) break; + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); schedule(); - trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); } all_collected: + trace_netfs_rreq(rreq, netfs_rreq_trace_waited_ip); finish_wait(&rreq->waitq, &myself); ret = rreq->error; @@ -478,12 +488,12 @@ all_collected: ssize_t netfs_wait_for_read(struct netfs_io_request *rreq) { - return netfs_wait_for_request(rreq, netfs_read_collection); + return netfs_wait_for_in_progress(rreq, netfs_read_collection); } ssize_t netfs_wait_for_write(struct netfs_io_request *rreq) { - return netfs_wait_for_request(rreq, netfs_write_collection); + return netfs_wait_for_in_progress(rreq, netfs_write_collection); } /* @@ -494,10 +504,8 @@ static void netfs_wait_for_pause(struct netfs_io_request *rreq, { DEFINE_WAIT(myself); - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_pause); - for (;;) { - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_pause); prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { @@ -507,19 +515,23 @@ static void netfs_wait_for_pause(struct netfs_io_request *rreq, case 1: goto all_collected; case 2: + if (!netfs_check_rreq_in_progress(rreq) || + !test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) + break; + cond_resched(); continue; } } - if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags) || + if (!netfs_check_rreq_in_progress(rreq) || !test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) break; schedule(); - trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); } all_collected: + trace_netfs_rreq(rreq, netfs_rreq_trace_waited_pause); finish_wait(&rreq->waitq, &myself); } diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 96ee18af28ef..3e804da1e1eb 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -218,7 +218,7 @@ reassess: stream->collected_to = front->start; } - if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) + if (netfs_check_subreq_in_progress(front)) notes |= HIT_PENDING; smp_rmb(); /* Read counters after IN_PROGRESS flag. */ transferred = READ_ONCE(front->transferred); @@ -293,7 +293,9 @@ reassess: spin_lock(&rreq->lock); remove = front; - trace_netfs_sreq(front, netfs_sreq_trace_discard); + trace_netfs_sreq(front, + notes & ABANDON_SREQ ? + netfs_sreq_trace_abandoned : netfs_sreq_trace_consumed); list_del_init(&front->rreq_link); front = list_first_entry_or_null(&stream->subrequests, struct netfs_io_subrequest, rreq_link); @@ -353,9 +355,11 @@ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) if (rreq->iocb) { rreq->iocb->ki_pos += rreq->transferred; - if (rreq->iocb->ki_complete) + if (rreq->iocb->ki_complete) { + trace_netfs_rreq(rreq, netfs_rreq_trace_ki_complete); rreq->iocb->ki_complete( rreq->iocb, rreq->error ? rreq->error : rreq->transferred); + } } if (rreq->netfs_ops->done) rreq->netfs_ops->done(rreq); @@ -379,9 +383,11 @@ static void netfs_rreq_assess_single(struct netfs_io_request *rreq) if (rreq->iocb) { rreq->iocb->ki_pos += rreq->transferred; - if (rreq->iocb->ki_complete) + if (rreq->iocb->ki_complete) { + trace_netfs_rreq(rreq, netfs_rreq_trace_ki_complete); rreq->iocb->ki_complete( rreq->iocb, rreq->error ? rreq->error : rreq->transferred); + } } if (rreq->netfs_ops->done) rreq->netfs_ops->done(rreq); @@ -445,7 +451,7 @@ void netfs_read_collection_worker(struct work_struct *work) struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); netfs_see_request(rreq, netfs_rreq_trace_see_work); - if (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) { + if (netfs_check_rreq_in_progress(rreq)) { if (netfs_read_collection(rreq)) /* Drop the ref from the IN_PROGRESS flag. */ netfs_put_request(rreq, netfs_rreq_trace_put_work_ip); diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index e2b102ffb768..0f3a36852a4d 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -240,7 +240,7 @@ reassess_streams: } /* Stall if the front is still undergoing I/O. */ - if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) { + if (netfs_check_subreq_in_progress(front)) { notes |= HIT_PENDING; break; } @@ -393,8 +393,10 @@ bool netfs_write_collection(struct netfs_io_request *wreq) ictx->ops->invalidate_cache(wreq); } - if (wreq->cleanup) - wreq->cleanup(wreq); + if ((wreq->origin == NETFS_UNBUFFERED_WRITE || + wreq->origin == NETFS_DIO_WRITE) && + !wreq->error) + netfs_update_i_size(ictx, &ictx->inode, wreq->start, wreq->transferred); if (wreq->origin == NETFS_DIO_WRITE && wreq->mapping->nrpages) { @@ -419,9 +421,11 @@ bool netfs_write_collection(struct netfs_io_request *wreq) if (wreq->iocb) { size_t written = min(wreq->transferred, wreq->len); wreq->iocb->ki_pos += written; - if (wreq->iocb->ki_complete) + if (wreq->iocb->ki_complete) { + trace_netfs_rreq(wreq, netfs_rreq_trace_ki_complete); wreq->iocb->ki_complete( wreq->iocb, wreq->error ? wreq->error : written); + } wreq->iocb = VFS_PTR_POISON; } @@ -434,7 +438,7 @@ void netfs_write_collection_worker(struct work_struct *work) struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); netfs_see_request(rreq, netfs_rreq_trace_see_work); - if (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) { + if (netfs_check_rreq_in_progress(rreq)) { if (netfs_write_collection(rreq)) /* Drop the ref from the IN_PROGRESS flag. */ netfs_put_request(rreq, netfs_rreq_trace_put_work_ip); diff --git a/fs/netfs/write_retry.c b/fs/netfs/write_retry.c index 9d1d8a8bab72..fc9c3e0d34d8 100644 --- a/fs/netfs/write_retry.c +++ b/fs/netfs/write_retry.c @@ -146,14 +146,13 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, subreq = netfs_alloc_subrequest(wreq); subreq->source = to->source; subreq->start = start; - subreq->debug_index = atomic_inc_return(&wreq->subreq_counter); subreq->stream_nr = to->stream_nr; subreq->retry_count = 1; trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, refcount_read(&subreq->ref), netfs_sreq_trace_new); - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + trace_netfs_sreq(subreq, netfs_sreq_trace_split); list_add(&subreq->rreq_link, &to->rreq_link); to = list_next_entry(to, rreq_link); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index df4807460596..4bea008dbebd 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1105,6 +1105,7 @@ static void ff_layout_reset_read(struct nfs_pgio_header *hdr) } static int ff_layout_async_handle_error_v4(struct rpc_task *task, + u32 op_status, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, @@ -1115,34 +1116,42 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; - switch (task->tk_status) { - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_DEADSESSION: - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - case -NFS4ERR_SEQ_FALSE_RETRY: - case -NFS4ERR_SEQ_MISORDERED: + switch (op_status) { + case NFS4_OK: + case NFS4ERR_NXIO: + break; + case NFSERR_PERM: + if (!task->tk_xprt) + break; + xprt_force_disconnect(task->tk_xprt); + goto out_retry; + case NFS4ERR_BADSESSION: + case NFS4ERR_BADSLOT: + case NFS4ERR_BAD_HIGH_SLOT: + case NFS4ERR_DEADSESSION: + case NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case NFS4ERR_SEQ_FALSE_RETRY: + case NFS4ERR_SEQ_MISORDERED: dprintk("%s ERROR %d, Reset session. Exchangeid " "flags 0x%x\n", __func__, task->tk_status, clp->cl_exchange_flags); nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); - break; - case -NFS4ERR_DELAY: + goto out_retry; + case NFS4ERR_DELAY: nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); fallthrough; - case -NFS4ERR_GRACE: + case NFS4ERR_GRACE: rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX); - break; - case -NFS4ERR_RETRY_UNCACHED_REP: - break; + goto out_retry; + case NFS4ERR_RETRY_UNCACHED_REP: + goto out_retry; /* Invalidate Layout errors */ - case -NFS4ERR_PNFS_NO_LAYOUT: - case -ESTALE: /* mapped NFS4ERR_STALE */ - case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */ - case -EISDIR: /* mapped NFS4ERR_ISDIR */ - case -NFS4ERR_FHEXPIRED: - case -NFS4ERR_WRONG_TYPE: + case NFS4ERR_PNFS_NO_LAYOUT: + case NFS4ERR_STALE: + case NFS4ERR_BADHANDLE: + case NFS4ERR_ISDIR: + case NFS4ERR_FHEXPIRED: + case NFS4ERR_WRONG_TYPE: dprintk("%s Invalid layout error %d\n", __func__, task->tk_status); /* @@ -1155,6 +1164,11 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, pnfs_destroy_layout(NFS_I(inode)); rpc_wake_up(&tbl->slot_tbl_waitq); goto reset; + default: + break; + } + + switch (task->tk_status) { /* RPC connection errors */ case -ENETDOWN: case -ENETUNREACH: @@ -1174,27 +1188,56 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid); rpc_wake_up(&tbl->slot_tbl_waitq); - fallthrough; + break; default: - if (ff_layout_avoid_mds_available_ds(lseg)) - return -NFS4ERR_RESET_TO_PNFS; -reset: - dprintk("%s Retry through MDS. Error %d\n", __func__, - task->tk_status); - return -NFS4ERR_RESET_TO_MDS; + break; } + + if (ff_layout_avoid_mds_available_ds(lseg)) + return -NFS4ERR_RESET_TO_PNFS; +reset: + dprintk("%s Retry through MDS. Error %d\n", __func__, + task->tk_status); + return -NFS4ERR_RESET_TO_MDS; + +out_retry: task->tk_status = 0; return -EAGAIN; } /* Retry all errors through either pNFS or MDS except for -EJUKEBOX */ static int ff_layout_async_handle_error_v3(struct rpc_task *task, + u32 op_status, struct nfs_client *clp, struct pnfs_layout_segment *lseg, u32 idx) { struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + switch (op_status) { + case NFS_OK: + case NFSERR_NXIO: + break; + case NFSERR_PERM: + if (!task->tk_xprt) + break; + xprt_force_disconnect(task->tk_xprt); + goto out_retry; + case NFSERR_ACCES: + case NFSERR_BADHANDLE: + case NFSERR_FBIG: + case NFSERR_IO: + case NFSERR_NOSPC: + case NFSERR_ROFS: + case NFSERR_STALE: + goto out_reset_to_pnfs; + case NFSERR_JUKEBOX: + nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); + goto out_retry; + default: + break; + } + switch (task->tk_status) { /* File access problems. Don't mark the device as unavailable */ case -EACCES: @@ -1218,6 +1261,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid); } +out_reset_to_pnfs: /* FIXME: Need to prevent infinite looping here. */ return -NFS4ERR_RESET_TO_PNFS; out_retry: @@ -1228,6 +1272,7 @@ out_retry: } static int ff_layout_async_handle_error(struct rpc_task *task, + u32 op_status, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, @@ -1246,10 +1291,11 @@ static int ff_layout_async_handle_error(struct rpc_task *task, switch (vers) { case 3: - return ff_layout_async_handle_error_v3(task, clp, lseg, idx); - case 4: - return ff_layout_async_handle_error_v4(task, state, clp, + return ff_layout_async_handle_error_v3(task, op_status, clp, lseg, idx); + case 4: + return ff_layout_async_handle_error_v4(task, op_status, state, + clp, lseg, idx); default: /* should never happen */ WARN_ON_ONCE(1); @@ -1302,6 +1348,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, switch (status) { case NFS4ERR_DELAY: case NFS4ERR_GRACE: + case NFS4ERR_PERM: break; case NFS4ERR_NXIO: ff_layout_mark_ds_unreachable(lseg, idx); @@ -1334,7 +1381,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task, trace_ff_layout_read_error(hdr, task->tk_status); } - err = ff_layout_async_handle_error(task, hdr->args.context->state, + err = ff_layout_async_handle_error(task, hdr->res.op_status, + hdr->args.context->state, hdr->ds_clp, hdr->lseg, hdr->pgio_mirror_idx); @@ -1507,7 +1555,8 @@ static int ff_layout_write_done_cb(struct rpc_task *task, trace_ff_layout_write_error(hdr, task->tk_status); } - err = ff_layout_async_handle_error(task, hdr->args.context->state, + err = ff_layout_async_handle_error(task, hdr->res.op_status, + hdr->args.context->state, hdr->ds_clp, hdr->lseg, hdr->pgio_mirror_idx); @@ -1556,8 +1605,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, trace_ff_layout_commit_error(data, task->tk_status); } - err = ff_layout_async_handle_error(task, NULL, data->ds_clp, - data->lseg, data->ds_commit_index); + err = ff_layout_async_handle_error(task, data->res.op_status, + NULL, data->ds_clp, data->lseg, + data->ds_commit_index); trace_nfs4_pnfs_commit_ds(data, err); switch (err) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 8ab7868807a7..a2fa6bc4d74e 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2589,15 +2589,26 @@ EXPORT_SYMBOL_GPL(nfs_net_id); static int nfs_net_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); + int err; nfs_clients_init(net); if (!rpc_proc_register(net, &nn->rpcstats)) { - nfs_clients_exit(net); - return -ENOMEM; + err = -ENOMEM; + goto err_proc_rpc; } - return nfs_fs_proc_net_init(net); + err = nfs_fs_proc_net_init(net); + if (err) + goto err_proc_nfs; + + return 0; + +err_proc_nfs: + rpc_proc_unregister(net, "nfs"); +err_proc_rpc: + nfs_clients_exit(net); + return err; } static void nfs_net_exit(struct net *net) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 3adb7d0dbec7..1a7ec68bde15 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2059,8 +2059,10 @@ static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo) static void nfs_layoutget_end(struct pnfs_layout_hdr *lo) { if (atomic_dec_and_test(&lo->plh_outstanding) && - test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) + test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) { + smp_mb__after_atomic(); wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN); + } } static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index ccb00aa93be0..e00b2aea8da2 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1409,6 +1409,7 @@ void nfsd41_cb_referring_call(struct nfsd4_callback *cb, out: if (!rcl->__nr_referring_calls) { cb->cb_nr_referring_call_list--; + list_del(&rcl->__list); kfree(rcl); } } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 3f3e9f6c4250..6a42cc7a845a 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1611,7 +1611,7 @@ out_unlock: */ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) { - int *nthreads, count = 0, nrpools, i, ret = -EOPNOTSUPP, rem; + int *nthreads, nrpools = 0, i, ret = -EOPNOTSUPP, rem; struct net *net = genl_info_net(info); struct nfsd_net *nn = net_generic(net, nfsd_net_id); const struct nlattr *attr; @@ -1623,12 +1623,11 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) /* count number of SERVER_THREADS values */ nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { if (nla_type(attr) == NFSD_A_SERVER_THREADS) - count++; + nrpools++; } mutex_lock(&nfsd_mutex); - nrpools = max(count, nfsd_nrpools(net)); nthreads = kcalloc(nrpools, sizeof(int), GFP_KERNEL); if (!nthreads) { ret = -ENOMEM; diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 0b8b28392eb7..2043f0369059 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -1393,7 +1393,7 @@ out: bool ovl_lower_positive(struct dentry *dentry) { struct ovl_entry *poe = OVL_E(dentry->d_parent); - struct qstr *name = &dentry->d_name; + const struct qstr *name = &dentry->d_name; const struct cred *old_cred; unsigned int i; bool positive = false; @@ -1416,9 +1416,15 @@ bool ovl_lower_positive(struct dentry *dentry) struct dentry *this; struct ovl_path *parentpath = &ovl_lowerstack(poe)[i]; + /* + * We need to make a non-const copy of dentry->d_name, + * because lookup_one_positive_unlocked() will hash name + * with parentpath base, which is on another (lower fs). + */ this = lookup_one_positive_unlocked( mnt_idmap(parentpath->layer->mnt), - name, parentpath->dentry); + &QSTR_LEN(name->name, name->len), + parentpath->dentry); if (IS_ERR(this)) { switch (PTR_ERR(this)) { case -ENOENT: diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 8baaba0a3fe5..497323128e5f 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -246,9 +246,11 @@ static inline struct dentry *ovl_do_mkdir(struct ovl_fs *ofs, struct dentry *dentry, umode_t mode) { - dentry = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); - pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, PTR_ERR_OR_ZERO(dentry)); - return dentry; + struct dentry *ret; + + ret = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); + pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, PTR_ERR_OR_ZERO(ret)); + return ret; } static inline int ovl_do_mknod(struct ovl_fs *ofs, diff --git a/fs/pidfs.c b/fs/pidfs.c index c1f0a067be40..69919be1c9d8 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -366,7 +366,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) kinfo.pid = task_pid_vnr(task); kinfo.mask |= PIDFD_INFO_PID; - if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1)) + if (kinfo.pid == 0 || kinfo.tgid == 0) return -ESRCH; copy_out: diff --git a/fs/pnode.h b/fs/pnode.h index 34b6247af01d..2d026fb98b18 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -28,8 +28,6 @@ #define CL_SHARED_TO_SLAVE 0x20 #define CL_COPY_MNT_NS_FILE 0x40 -#define CL_COPY_ALL (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE) - static inline void set_mnt_shared(struct mount *mnt) { mnt->mnt.mnt_flags &= ~MNT_SHARED_MASK; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index a3eb3b740f76..3604b616311c 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -42,7 +42,7 @@ static void proc_evict_inode(struct inode *inode) head = ei->sysctl; if (head) { - RCU_INIT_POINTER(ei->sysctl, NULL); + WRITE_ONCE(ei->sysctl, NULL); proc_sys_evict_inode(inode, head); } } diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index cc9d74a06ff0..08b78150cdde 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -918,17 +918,21 @@ static int proc_sys_compare(const struct dentry *dentry, struct ctl_table_header *head; struct inode *inode; - /* Although proc doesn't have negative dentries, rcu-walk means - * that inode here can be NULL */ - /* AV: can it, indeed? */ - inode = d_inode_rcu(dentry); - if (!inode) - return 1; if (name->len != len) return 1; if (memcmp(name->name, str, len)) return 1; - head = rcu_dereference(PROC_I(inode)->sysctl); + + // false positive is fine here - we'll recheck anyway + if (d_in_lookup(dentry)) + return 0; + + inode = d_inode_rcu(dentry); + // we just might have run into dentry in the middle of __dentry_kill() + if (!inode) + return 1; + + head = READ_ONCE(PROC_I(inode)->sysctl); return !head || !sysctl_is_seen(head); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 27972c0749e7..751479eb128f 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -36,9 +36,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) unsigned long text, lib, swap, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; - anon = get_mm_counter(mm, MM_ANONPAGES); - file = get_mm_counter(mm, MM_FILEPAGES); - shmem = get_mm_counter(mm, MM_SHMEMPAGES); + anon = get_mm_counter_sum(mm, MM_ANONPAGES); + file = get_mm_counter_sum(mm, MM_FILEPAGES); + shmem = get_mm_counter_sum(mm, MM_SHMEMPAGES); /* * Note: to minimize their overhead, mm maintains hiwater_vm and @@ -59,7 +59,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = min(text, mm->exec_vm << PAGE_SHIFT); lib = (mm->exec_vm << PAGE_SHIFT) - text; - swap = get_mm_counter(mm, MM_SWAPENTS); + swap = get_mm_counter_sum(mm, MM_SWAPENTS); SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); @@ -92,12 +92,12 @@ unsigned long task_statm(struct mm_struct *mm, unsigned long *shared, unsigned long *text, unsigned long *data, unsigned long *resident) { - *shared = get_mm_counter(mm, MM_FILEPAGES) + - get_mm_counter(mm, MM_SHMEMPAGES); + *shared = get_mm_counter_sum(mm, MM_FILEPAGES) + + get_mm_counter_sum(mm, MM_SHMEMPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; *data = mm->data_vm + mm->stack_vm; - *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); + *resident = *shared + get_mm_counter_sum(mm, MM_ANONPAGES); return mm->total_vm; } @@ -2182,7 +2182,7 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, categories |= PAGE_IS_FILE; } - if (is_zero_pfn(pmd_pfn(pmd))) + if (is_huge_zero_pmd(pmd)) categories |= PAGE_IS_PFNZERO; if (pmd_soft_dirty(pmd)) categories |= PAGE_IS_SOFT_DIRTY; diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 6ed2dfd4dbbd..d98e0d2de09f 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -594,9 +594,10 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) struct rmid_read rr = {0}; struct rdt_mon_domain *d; struct rdtgroup *rdtgrp; + int domid, cpu, ret = 0; struct rdt_resource *r; + struct cacheinfo *ci; struct mon_data *md; - int domid, ret = 0; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { @@ -623,10 +624,14 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) * one that matches this cache id. */ list_for_each_entry(d, &r->mon_domains, hdr.list) { - if (d->ci->id == domid) { - rr.ci = d->ci; + if (d->ci_id == domid) { + rr.ci_id = d->ci_id; + cpu = cpumask_any(&d->hdr.cpu_mask); + ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); + if (!ci) + continue; mon_event_read(&rr, r, NULL, rdtgrp, - &d->ci->shared_cpu_map, evtid, false); + &ci->shared_cpu_map, evtid, false); goto checkresult; } } diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 9a8cf6f11151..0a1eedba2b03 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -98,7 +98,7 @@ struct mon_data { * domains in @r sharing L3 @ci.id * @evtid: Which monitor event to read. * @first: Initialize MBM counter when true. - * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. + * @ci_id: Cacheinfo id for L3. Only set when @d is NULL. Used when summing domains. * @err: Error encountered when reading counter. * @val: Returned value of event counter. If @rgrp is a parent resource group, * @val includes the sum of event counts from its child resource groups. @@ -112,7 +112,7 @@ struct rmid_read { struct rdt_mon_domain *d; enum resctrl_event_id evtid; bool first; - struct cacheinfo *ci; + unsigned int ci_id; int err; u64 val; void *arch_mon_ctx; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index bde2801289d3..f5637855c3ac 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -361,6 +361,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) { int cpu = smp_processor_id(); struct rdt_mon_domain *d; + struct cacheinfo *ci; struct mbm_state *m; int err, ret; u64 tval = 0; @@ -388,7 +389,8 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) } /* Summing domains that share a cache, must be on a CPU for that cache. */ - if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) + ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); + if (!ci || ci->id != rr->ci_id) return -EINVAL; /* @@ -400,7 +402,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) */ ret = -EINVAL; list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { - if (d->ci->id != rr->ci->id) + if (d->ci_id != rr->ci_id) continue; err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, rr->evtid, &tval, rr->arch_mon_ctx); diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 1beb124e25f6..77d08229d855 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -3036,7 +3036,7 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, char name[32]; snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id); if (snc_mode) sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); @@ -3061,7 +3061,7 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, return -EPERM; list_for_each_entry(mevt, &r->evt_list, list) { - domid = do_sum ? d->ci->id : d->hdr.id; + domid = do_sum ? d->ci_id : d->hdr.id; priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum); if (WARN_ON_ONCE(!priv)) return -EINVAL; @@ -3089,7 +3089,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, lockdep_assert_held(&rdtgroup_mutex); snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id); kn = kernfs_find_and_get(parent_kn, name); if (kn) { /* diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index 5200a0f3cafc..368e870624da 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -509,8 +509,17 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb) spin_lock(&cfids->cfid_list_lock); list_for_each_entry(cfid, &cfids->entries, entry) { tmp_list = kmalloc(sizeof(*tmp_list), GFP_ATOMIC); - if (tmp_list == NULL) - break; + if (tmp_list == NULL) { + /* + * If the malloc() fails, we won't drop all + * dentries, and unmounting is likely to trigger + * a 'Dentry still in use' error. + */ + cifs_tcon_dbg(VFS, "Out of memory while dropping dentries\n"); + spin_unlock(&cfids->cfid_list_lock); + spin_unlock(&cifs_sb->tlink_tree_lock); + goto done; + } spin_lock(&cfid->fid_lock); tmp_list->dentry = cfid->dentry; cfid->dentry = NULL; @@ -522,6 +531,7 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb) } spin_unlock(&cifs_sb->tlink_tree_lock); +done: list_for_each_entry_safe(tmp_list, q, &entry, entry) { list_del(&tmp_list->entry); dput(tmp_list->dentry); diff --git a/fs/smb/client/cached_dir.h b/fs/smb/client/cached_dir.h index bc8a812ff95f..a28f7cae3caa 100644 --- a/fs/smb/client/cached_dir.h +++ b/fs/smb/client/cached_dir.h @@ -26,7 +26,7 @@ struct cached_dirents { * open file instance. */ struct mutex de_mutex; - int pos; /* Expected ctx->pos */ + loff_t pos; /* Expected ctx->pos */ struct list_head entries; }; diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index c0196be0e65f..3fdf75737d43 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -1105,7 +1105,7 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, if ((count < 1) || (count > 11)) return -EINVAL; - memset(flags_string, 0, 12); + memset(flags_string, 0, sizeof(flags_string)); if (copy_from_user(flags_string, buffer, count)) return -EFAULT; diff --git a/fs/smb/client/cifs_ioctl.h b/fs/smb/client/cifs_ioctl.h index 26327442e383..b51ce64fcccf 100644 --- a/fs/smb/client/cifs_ioctl.h +++ b/fs/smb/client/cifs_ioctl.h @@ -61,7 +61,7 @@ struct smb_query_info { struct smb3_key_debug_info { __u64 Suid; __u16 cipher_type; - __u8 auth_key[16]; /* SMB2_NTLMV2_SESSKEY_SIZE */ + __u8 auth_key[SMB2_NTLMV2_SESSKEY_SIZE]; __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE]; __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; } __packed; diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 45e94e18f4d5..89160bc34d35 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -709,6 +709,7 @@ inc_rfc1001_len(void *buf, int count) struct TCP_Server_Info { struct list_head tcp_ses_list; struct list_head smb_ses_list; + struct list_head rlist; /* reconnect list */ spinlock_t srv_lock; /* protect anything here that is not protected */ __u64 conn_id; /* connection identifier (useful for debugging) */ int srv_count; /* reference counter */ @@ -776,6 +777,7 @@ struct TCP_Server_Info { __le32 session_key_id; /* retrieved from negotiate response and send in session setup request */ struct session_key session_key; unsigned long lstrp; /* when we got last response from this server */ + unsigned long neg_start; /* when negotiate started (jiffies) */ struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */ #define CIFS_NEGFLAVOR_UNENCAP 1 /* wct == 17, but no ext_sec */ #define CIFS_NEGFLAVOR_EXTENDED 2 /* wct == 17, ext_sec bit set */ @@ -1302,6 +1304,7 @@ struct cifs_tcon { bool use_persistent:1; /* use persistent instead of durable handles */ bool no_lease:1; /* Do not request leases on files or directories */ bool use_witness:1; /* use witness protocol */ + bool dummy:1; /* dummy tcon used for reconnecting channels */ __le32 capabilities; __u32 share_flags; __u32 maximal_access; diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 66093fa78aed..045227ed4efc 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -136,6 +136,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid, struct smb_hdr *out_buf, int *bytes_returned); +void smb2_query_server_interfaces(struct work_struct *work); void cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server, bool all_channels); diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 7216fcec79e8..75142f49d65d 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1334,7 +1334,12 @@ cifs_readv_callback(struct mid_q_entry *mid) cifs_stats_bytes_read(tcon, rdata->got_bytes); break; case MID_REQUEST_SUBMITTED: + trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_req_submitted); + goto do_retry; case MID_RETRY_NEEDED: + trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_retry_needed); +do_retry: + __set_bit(NETFS_SREQ_NEED_RETRY, &rdata->subreq.flags); rdata->result = -EAGAIN; if (server->sign && rdata->got_bytes) /* reset bytes number since we can not check a sign */ @@ -1343,8 +1348,14 @@ cifs_readv_callback(struct mid_q_entry *mid) task_io_account_read(rdata->got_bytes); cifs_stats_bytes_read(tcon, rdata->got_bytes); break; + case MID_RESPONSE_MALFORMED: + trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_malformed); + rdata->result = -EIO; + break; default: + trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_unknown); rdata->result = -EIO; + break; } if (rdata->result == -ENODATA) { @@ -1713,10 +1724,21 @@ cifs_writev_callback(struct mid_q_entry *mid) } break; case MID_REQUEST_SUBMITTED: + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_req_submitted); + __set_bit(NETFS_SREQ_NEED_RETRY, &wdata->subreq.flags); + result = -EAGAIN; + break; case MID_RETRY_NEEDED: + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_retry_needed); + __set_bit(NETFS_SREQ_NEED_RETRY, &wdata->subreq.flags); result = -EAGAIN; break; + case MID_RESPONSE_MALFORMED: + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_malformed); + result = -EIO; + break; default: + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_unknown); result = -EIO; break; } diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index c4fb80b37738..205f547ca49e 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -97,7 +97,7 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) return rc; } -static void smb2_query_server_interfaces(struct work_struct *work) +void smb2_query_server_interfaces(struct work_struct *work) { int rc; int xid; @@ -124,6 +124,14 @@ static void smb2_query_server_interfaces(struct work_struct *work) (SMB_INTERFACE_POLL_INTERVAL * HZ)); } +#define set_need_reco(server) \ +do { \ + spin_lock(&server->srv_lock); \ + if (server->tcpStatus != CifsExiting) \ + server->tcpStatus = CifsNeedReconnect; \ + spin_unlock(&server->srv_lock); \ +} while (0) + /* * Update the tcpStatus for the server. * This is used to signal the cifsd thread to call cifs_reconnect @@ -137,39 +145,45 @@ void cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server, bool all_channels) { - struct TCP_Server_Info *pserver; + struct TCP_Server_Info *nserver; struct cifs_ses *ses; + LIST_HEAD(reco); int i; - /* If server is a channel, select the primary channel */ - pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; - /* if we need to signal just this channel */ if (!all_channels) { - spin_lock(&server->srv_lock); - if (server->tcpStatus != CifsExiting) - server->tcpStatus = CifsNeedReconnect; - spin_unlock(&server->srv_lock); + set_need_reco(server); return; } - spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { - if (cifs_ses_exiting(ses)) - continue; - spin_lock(&ses->chan_lock); - for (i = 0; i < ses->chan_count; i++) { - if (!ses->chans[i].server) + if (SERVER_IS_CHAN(server)) + server = server->primary_server; + scoped_guard(spinlock, &cifs_tcp_ses_lock) { + set_need_reco(server); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + spin_lock(&ses->ses_lock); + if (ses->ses_status == SES_EXITING) { + spin_unlock(&ses->ses_lock); continue; - - spin_lock(&ses->chans[i].server->srv_lock); - if (ses->chans[i].server->tcpStatus != CifsExiting) - ses->chans[i].server->tcpStatus = CifsNeedReconnect; - spin_unlock(&ses->chans[i].server->srv_lock); + } + spin_lock(&ses->chan_lock); + for (i = 1; i < ses->chan_count; i++) { + nserver = ses->chans[i].server; + if (!nserver) + continue; + nserver->srv_count++; + list_add(&nserver->rlist, &reco); + } + spin_unlock(&ses->chan_lock); + spin_unlock(&ses->ses_lock); } - spin_unlock(&ses->chan_lock); } - spin_unlock(&cifs_tcp_ses_lock); + + list_for_each_entry_safe(server, nserver, &reco, rlist) { + list_del_init(&server->rlist); + set_need_reco(server); + cifs_put_tcp_session(server, 0); + } } /* @@ -665,12 +679,12 @@ server_unresponsive(struct TCP_Server_Info *server) /* * If we're in the process of mounting a share or reconnecting a session * and the server abruptly shut down (e.g. socket wasn't closed, packet - * had been ACK'ed but no SMB response), don't wait longer than 20s to - * negotiate protocol. + * had been ACK'ed but no SMB response), don't wait longer than 20s from + * when negotiate actually started. */ spin_lock(&server->srv_lock); if (server->tcpStatus == CifsInNegotiate && - time_after(jiffies, server->lstrp + 20 * HZ)) { + time_after(jiffies, server->neg_start + 20 * HZ)) { spin_unlock(&server->srv_lock); cifs_reconnect(server, false); return true; @@ -2866,20 +2880,14 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) tcon->max_cached_dirs = ctx->max_cached_dirs; tcon->nodelete = ctx->nodelete; tcon->local_lease = ctx->local_lease; - INIT_LIST_HEAD(&tcon->pending_opens); tcon->status = TID_GOOD; - INIT_DELAYED_WORK(&tcon->query_interfaces, - smb2_query_server_interfaces); if (ses->server->dialect >= SMB30_PROT_ID && (ses->server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { /* schedule query interfaces poll */ queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, (SMB_INTERFACE_POLL_INTERVAL * HZ)); } -#ifdef CONFIG_CIFS_DFS_UPCALL - INIT_DELAYED_WORK(&tcon->dfs_cache_work, dfs_cache_refresh); -#endif spin_lock(&cifs_tcp_ses_lock); list_add(&tcon->tcon_list, &ses->tcon_list); spin_unlock(&cifs_tcp_ses_lock); @@ -4199,7 +4207,9 @@ retry: return 0; } + server->lstrp = jiffies; server->tcpStatus = CifsInNegotiate; + server->neg_start = jiffies; spin_unlock(&server->srv_lock); rc = server->ops->negotiate(xid, ses, server); diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 9835672267d2..e9212da32f01 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -52,6 +52,7 @@ static void cifs_prepare_write(struct netfs_io_subrequest *subreq) struct netfs_io_stream *stream = &req->rreq.io_streams[subreq->stream_nr]; struct TCP_Server_Info *server; struct cifsFileInfo *open_file = req->cfile; + struct cifs_sb_info *cifs_sb = CIFS_SB(wdata->rreq->inode->i_sb); size_t wsize = req->rreq.wsize; int rc; @@ -63,6 +64,10 @@ static void cifs_prepare_write(struct netfs_io_subrequest *subreq) server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses); wdata->server = server; + if (cifs_sb->ctx->wsize == 0) + cifs_negotiate_wsize(server, cifs_sb->ctx, + tlink_tcon(req->cfile->tlink)); + retry: if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, false); @@ -160,10 +165,9 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq) server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); rdata->server = server; - if (cifs_sb->ctx->rsize == 0) { + if (cifs_sb->ctx->rsize == 0) cifs_negotiate_rsize(server, cifs_sb->ctx, tlink_tcon(req->cfile->tlink)); - } rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, &size, &rdata->credits); diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index a634a34d4086..59ccc2229ab3 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -1824,10 +1824,14 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, cifs_errorf(fc, "symlinkroot mount options must be absolute path\n"); goto cifs_parse_mount_err; } - kfree(ctx->symlinkroot); - ctx->symlinkroot = kstrdup(param->string, GFP_KERNEL); - if (!ctx->symlinkroot) + if (strnlen(param->string, PATH_MAX) == PATH_MAX) { + cifs_errorf(fc, "symlinkroot path too long (max path length: %u)\n", + PATH_MAX - 1); goto cifs_parse_mount_err; + } + kfree(ctx->symlinkroot); + ctx->symlinkroot = param->string; + param->string = NULL; break; } /* case Opt_ignore: - is ignored as expected ... */ @@ -1837,13 +1841,6 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, goto cifs_parse_mount_err; } - /* - * By default resolve all native absolute symlinks relative to "/mnt/". - * Same default has drvfs driver running in WSL for resolving SMB shares. - */ - if (!ctx->symlinkroot) - ctx->symlinkroot = kstrdup("/mnt/", GFP_KERNEL); - return 0; cifs_parse_mount_err: diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c index 56439da4f119..0a9935ce05a5 100644 --- a/fs/smb/client/ioctl.c +++ b/fs/smb/client/ioctl.c @@ -506,7 +506,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) le16_to_cpu(tcon->ses->server->cipher_type); pkey_inf.Suid = tcon->ses->Suid; memcpy(pkey_inf.auth_key, tcon->ses->auth_key.response, - 16 /* SMB2_NTLMV2_SESSKEY_SIZE */); + SMB2_NTLMV2_SESSKEY_SIZE); memcpy(pkey_inf.smb3decryptionkey, tcon->ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE); memcpy(pkey_inf.smb3encryptionkey, diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index e77017f47084..da23cc12a52c 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -151,6 +151,12 @@ tcon_info_alloc(bool dir_leases_enabled, enum smb3_tcon_ref_trace trace) #ifdef CONFIG_CIFS_DFS_UPCALL INIT_LIST_HEAD(&ret_buf->dfs_ses_list); #endif + INIT_LIST_HEAD(&ret_buf->pending_opens); + INIT_DELAYED_WORK(&ret_buf->query_interfaces, + smb2_query_server_interfaces); +#ifdef CONFIG_CIFS_DFS_UPCALL + INIT_DELAYED_WORK(&ret_buf->dfs_cache_work, dfs_cache_refresh); +#endif return ret_buf; } diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index ba0193cf9033..4e5460206397 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -264,7 +264,7 @@ cifs_posix_to_fattr(struct cifs_fattr *fattr, struct smb2_posix_info *info, /* The Mode field in the response can now include the file type as well */ fattr->cf_mode = wire_mode_to_posix(le32_to_cpu(info->Mode), fattr->cf_cifsattrs & ATTR_DIRECTORY); - fattr->cf_dtype = S_DT(le32_to_cpu(info->Mode)); + fattr->cf_dtype = S_DT(fattr->cf_mode); switch (fattr->cf_mode & S_IFMT) { case S_IFLNK: diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index bb25e77c5540..5fa29a97ac15 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -57,6 +57,7 @@ static int create_native_symlink(const unsigned int xid, struct inode *inode, struct reparse_symlink_data_buffer *buf = NULL; struct cifs_open_info_data data = {}; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + const char *symroot = cifs_sb->ctx->symlinkroot; struct inode *new; struct kvec iov; __le16 *path = NULL; @@ -82,7 +83,8 @@ static int create_native_symlink(const unsigned int xid, struct inode *inode, .symlink_target = symlink_target, }; - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') { + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && + symroot && symname[0] == '/') { /* * This is a request to create an absolute symlink on the server * which does not support POSIX paths, and expects symlink in @@ -92,7 +94,7 @@ static int create_native_symlink(const unsigned int xid, struct inode *inode, * ensure compatibility of this symlink stored in absolute form * on the SMB server. */ - if (!strstarts(symname, cifs_sb->ctx->symlinkroot)) { + if (!strstarts(symname, symroot)) { /* * If the absolute Linux symlink target path is not * inside "symlinkroot" location then there is no way @@ -101,12 +103,12 @@ static int create_native_symlink(const unsigned int xid, struct inode *inode, cifs_dbg(VFS, "absolute symlink '%s' cannot be converted to NT format " "because it is outside of symlinkroot='%s'\n", - symname, cifs_sb->ctx->symlinkroot); + symname, symroot); rc = -EINVAL; goto out; } - len = strlen(cifs_sb->ctx->symlinkroot); - if (cifs_sb->ctx->symlinkroot[len-1] != '/') + len = strlen(symroot); + if (symroot[len - 1] != '/') len++; if (symname[len] >= 'a' && symname[len] <= 'z' && (symname[len+1] == '/' || symname[len+1] == '\0')) { @@ -782,6 +784,7 @@ int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len, const char *full_path, struct cifs_sb_info *cifs_sb) { + const char *symroot = cifs_sb->ctx->symlinkroot; char sep = CIFS_DIR_SEP(cifs_sb); char *linux_target = NULL; char *smb_target = NULL; @@ -815,7 +818,8 @@ int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len, goto out; } - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && !relative) { + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && + symroot && !relative) { /* * This is an absolute symlink from the server which does not * support POSIX paths, so the symlink is in NT-style path. @@ -875,15 +879,8 @@ globalroot: abs_path += sizeof("\\DosDevices\\")-1; else if (strstarts(abs_path, "\\GLOBAL??\\")) abs_path += sizeof("\\GLOBAL??\\")-1; - else { - /* Unhandled absolute symlink, points outside of DOS/Win32 */ - cifs_dbg(VFS, - "absolute symlink '%s' cannot be converted from NT format " - "because points to unknown target\n", - smb_target); - rc = -EIO; - goto out; - } + else + goto out_unhandled_target; /* Sometimes path separator after \?? is double backslash */ if (abs_path[0] == '\\') @@ -910,25 +907,19 @@ globalroot: abs_path++; abs_path[0] = drive_letter; } else { - /* Unhandled absolute symlink. Report an error. */ - cifs_dbg(VFS, - "absolute symlink '%s' cannot be converted from NT format " - "because points to unknown target\n", - smb_target); - rc = -EIO; - goto out; + goto out_unhandled_target; } abs_path_len = strlen(abs_path)+1; - symlinkroot_len = strlen(cifs_sb->ctx->symlinkroot); - if (cifs_sb->ctx->symlinkroot[symlinkroot_len-1] == '/') + symlinkroot_len = strlen(symroot); + if (symroot[symlinkroot_len - 1] == '/') symlinkroot_len--; linux_target = kmalloc(symlinkroot_len + 1 + abs_path_len, GFP_KERNEL); if (!linux_target) { rc = -ENOMEM; goto out; } - memcpy(linux_target, cifs_sb->ctx->symlinkroot, symlinkroot_len); + memcpy(linux_target, symroot, symlinkroot_len); linux_target[symlinkroot_len] = '/'; memcpy(linux_target + symlinkroot_len + 1, abs_path, abs_path_len); } else if (smb_target[0] == sep && relative) { @@ -966,6 +957,7 @@ globalroot: * These paths have same format as Linux symlinks, so no * conversion is needed. */ +out_unhandled_target: linux_target = smb_target; smb_target = NULL; } @@ -1172,7 +1164,6 @@ out: if (!have_xattr_dev && (tag == IO_REPARSE_TAG_LX_CHR || tag == IO_REPARSE_TAG_LX_BLK)) return false; - fattr->cf_dtype = S_DT(fattr->cf_mode); return true; } diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index ec0db32c7d98..330bc3d25bad 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -498,8 +498,7 @@ cifs_ses_add_channel(struct cifs_ses *ses, ctx->domainauto = ses->domainAuto; ctx->domainname = ses->domainName; - /* no hostname for extra channels */ - ctx->server_hostname = ""; + ctx->server_hostname = ses->server->hostname; ctx->username = ses->user_name; ctx->password = ses->password; diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index a717be1626a3..2df93a75e3b8 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -424,9 +424,9 @@ skip_sess_setup: free_xid(xid); ses->flags &= ~CIFS_SES_FLAGS_PENDING_QUERY_INTERFACES; - /* regardless of rc value, setup polling */ - queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, - (SMB_INTERFACE_POLL_INTERVAL * HZ)); + if (!tcon->ipc && !tcon->dummy) + queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, + (SMB_INTERFACE_POLL_INTERVAL * HZ)); mutex_unlock(&ses->session_mutex); @@ -4229,10 +4229,8 @@ void smb2_reconnect_server(struct work_struct *work) } goto done; } - tcon->status = TID_GOOD; - tcon->retry = false; - tcon->need_reconnect = false; + tcon->dummy = true; /* now reconnect sessions for necessary channels */ list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) { @@ -4567,7 +4565,11 @@ smb2_readv_callback(struct mid_q_entry *mid) cifs_stats_bytes_read(tcon, rdata->got_bytes); break; case MID_REQUEST_SUBMITTED: + trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_req_submitted); + goto do_retry; case MID_RETRY_NEEDED: + trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_retry_needed); +do_retry: __set_bit(NETFS_SREQ_NEED_RETRY, &rdata->subreq.flags); rdata->result = -EAGAIN; if (server->sign && rdata->got_bytes) @@ -4578,11 +4580,15 @@ smb2_readv_callback(struct mid_q_entry *mid) cifs_stats_bytes_read(tcon, rdata->got_bytes); break; case MID_RESPONSE_MALFORMED: + trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_malformed); credits.value = le16_to_cpu(shdr->CreditRequest); credits.instance = server->reconnect_instance; - fallthrough; + rdata->result = -EIO; + break; default: + trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_unknown); rdata->result = -EIO; + break; } #ifdef CONFIG_CIFS_SMB_DIRECT /* @@ -4835,11 +4841,14 @@ smb2_writev_callback(struct mid_q_entry *mid) switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_progress); credits.value = le16_to_cpu(rsp->hdr.CreditRequest); credits.instance = server->reconnect_instance; result = smb2_check_receive(mid, server, 0); - if (result != 0) + if (result != 0) { + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_bad); break; + } written = le32_to_cpu(rsp->DataLength); /* @@ -4861,14 +4870,23 @@ smb2_writev_callback(struct mid_q_entry *mid) } break; case MID_REQUEST_SUBMITTED: + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_req_submitted); + __set_bit(NETFS_SREQ_NEED_RETRY, &wdata->subreq.flags); + result = -EAGAIN; + break; case MID_RETRY_NEEDED: + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_retry_needed); + __set_bit(NETFS_SREQ_NEED_RETRY, &wdata->subreq.flags); result = -EAGAIN; break; case MID_RESPONSE_MALFORMED: + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_malformed); credits.value = le16_to_cpu(rsp->hdr.CreditRequest); credits.instance = server->reconnect_instance; - fallthrough; + result = -EIO; + break; default: + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_unknown); result = -EIO; break; } @@ -4908,7 +4926,6 @@ smb2_writev_callback(struct mid_q_entry *mid) server->credits, server->in_flight, 0, cifs_trace_rw_credits_write_response_clear); wdata->credits.value = 0; - trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_progress); cifs_write_subrequest_terminated(wdata, result ?: written); release_mid(mid); trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0, diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 5ae847919da5..754e94a0e07f 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -907,8 +907,10 @@ wait_send_queue: .local_dma_lkey = sc->ib.pd->local_dma_lkey, .direction = DMA_TO_DEVICE, }; + size_t payload_len = umin(*_remaining_data_length, + sp->max_send_size - sizeof(*packet)); - rc = smb_extract_iter_to_rdma(iter, *_remaining_data_length, + rc = smb_extract_iter_to_rdma(iter, payload_len, &extract); if (rc < 0) goto err_dma; @@ -1013,6 +1015,27 @@ static int smbd_post_send_empty(struct smbd_connection *info) return smbd_post_send_iter(info, NULL, &remaining_data_length); } +static int smbd_post_send_full_iter(struct smbd_connection *info, + struct iov_iter *iter, + int *_remaining_data_length) +{ + int rc = 0; + + /* + * smbd_post_send_iter() respects the + * negotiated max_send_size, so we need to + * loop until the full iter is posted + */ + + while (iov_iter_count(iter) > 0) { + rc = smbd_post_send_iter(info, iter, _remaining_data_length); + if (rc < 0) + break; + } + + return rc; +} + /* * Post a receive request to the transport * The remote peer can only send data when a receive request is posted @@ -1452,6 +1475,9 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info) char name[MAX_NAME_LEN]; int rc; + if (WARN_ON_ONCE(sp->max_recv_size < sizeof(struct smbdirect_data_transfer))) + return -ENOMEM; + scnprintf(name, MAX_NAME_LEN, "smbd_request_%p", info); info->request_cache = kmem_cache_create( @@ -1469,12 +1495,17 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info) goto out1; scnprintf(name, MAX_NAME_LEN, "smbd_response_%p", info); + + struct kmem_cache_args response_args = { + .align = __alignof__(struct smbd_response), + .useroffset = (offsetof(struct smbd_response, packet) + + sizeof(struct smbdirect_data_transfer)), + .usersize = sp->max_recv_size - sizeof(struct smbdirect_data_transfer), + }; info->response_cache = - kmem_cache_create( - name, - sizeof(struct smbd_response) + - sp->max_recv_size, - 0, SLAB_HWCACHE_ALIGN, NULL); + kmem_cache_create(name, + sizeof(struct smbd_response) + sp->max_recv_size, + &response_args, SLAB_HWCACHE_ALIGN); if (!info->response_cache) goto out2; @@ -1747,35 +1778,39 @@ try_again: } /* - * Receive data from receive reassembly queue + * Receive data from the transport's receive reassembly queue * All the incoming data packets are placed in reassembly queue - * buf: the buffer to read data into + * iter: the buffer to read data into * size: the length of data to read * return value: actual data read - * Note: this implementation copies the data from reassebmly queue to receive + * + * Note: this implementation copies the data from reassembly queue to receive * buffers used by upper layer. This is not the optimal code path. A better way * to do it is to not have upper layer allocate its receive buffers but rather * borrow the buffer from reassembly queue, and return it after data is * consumed. But this will require more changes to upper layer code, and also * need to consider packet boundaries while they still being reassembled. */ -static int smbd_recv_buf(struct smbd_connection *info, char *buf, - unsigned int size) +int smbd_recv(struct smbd_connection *info, struct msghdr *msg) { struct smbdirect_socket *sc = &info->socket; struct smbd_response *response; struct smbdirect_data_transfer *data_transfer; + size_t size = iov_iter_count(&msg->msg_iter); int to_copy, to_read, data_read, offset; u32 data_length, remaining_data_length, data_offset; int rc; + if (WARN_ON_ONCE(iov_iter_rw(&msg->msg_iter) == WRITE)) + return -EINVAL; /* It's a bug in upper layer to get there */ + again: /* * No need to hold the reassembly queue lock all the time as we are * the only one reading from the front of the queue. The transport * may add more entries to the back of the queue at the same time */ - log_read(INFO, "size=%d info->reassembly_data_length=%d\n", size, + log_read(INFO, "size=%zd info->reassembly_data_length=%d\n", size, info->reassembly_data_length); if (info->reassembly_data_length >= size) { int queue_length; @@ -1813,7 +1848,10 @@ again: if (response->first_segment && size == 4) { unsigned int rfc1002_len = data_length + remaining_data_length; - *((__be32 *)buf) = cpu_to_be32(rfc1002_len); + __be32 rfc1002_hdr = cpu_to_be32(rfc1002_len); + if (copy_to_iter(&rfc1002_hdr, sizeof(rfc1002_hdr), + &msg->msg_iter) != sizeof(rfc1002_hdr)) + return -EFAULT; data_read = 4; response->first_segment = false; log_read(INFO, "returning rfc1002 length %d\n", @@ -1822,10 +1860,9 @@ again: } to_copy = min_t(int, data_length - offset, to_read); - memcpy( - buf + data_read, - (char *)data_transfer + data_offset + offset, - to_copy); + if (copy_to_iter((char *)data_transfer + data_offset + offset, + to_copy, &msg->msg_iter) != to_copy) + return -EFAULT; /* move on to the next buffer? */ if (to_copy == data_length - offset) { @@ -1891,90 +1928,6 @@ read_rfc1002_done: } /* - * Receive a page from receive reassembly queue - * page: the page to read data into - * to_read: the length of data to read - * return value: actual data read - */ -static int smbd_recv_page(struct smbd_connection *info, - struct page *page, unsigned int page_offset, - unsigned int to_read) -{ - struct smbdirect_socket *sc = &info->socket; - int ret; - char *to_address; - void *page_address; - - /* make sure we have the page ready for read */ - ret = wait_event_interruptible( - info->wait_reassembly_queue, - info->reassembly_data_length >= to_read || - sc->status != SMBDIRECT_SOCKET_CONNECTED); - if (ret) - return ret; - - /* now we can read from reassembly queue and not sleep */ - page_address = kmap_atomic(page); - to_address = (char *) page_address + page_offset; - - log_read(INFO, "reading from page=%p address=%p to_read=%d\n", - page, to_address, to_read); - - ret = smbd_recv_buf(info, to_address, to_read); - kunmap_atomic(page_address); - - return ret; -} - -/* - * Receive data from transport - * msg: a msghdr point to the buffer, can be ITER_KVEC or ITER_BVEC - * return: total bytes read, or 0. SMB Direct will not do partial read. - */ -int smbd_recv(struct smbd_connection *info, struct msghdr *msg) -{ - char *buf; - struct page *page; - unsigned int to_read, page_offset; - int rc; - - if (iov_iter_rw(&msg->msg_iter) == WRITE) { - /* It's a bug in upper layer to get there */ - cifs_dbg(VFS, "Invalid msg iter dir %u\n", - iov_iter_rw(&msg->msg_iter)); - rc = -EINVAL; - goto out; - } - - switch (iov_iter_type(&msg->msg_iter)) { - case ITER_KVEC: - buf = msg->msg_iter.kvec->iov_base; - to_read = msg->msg_iter.kvec->iov_len; - rc = smbd_recv_buf(info, buf, to_read); - break; - - case ITER_BVEC: - page = msg->msg_iter.bvec->bv_page; - page_offset = msg->msg_iter.bvec->bv_offset; - to_read = msg->msg_iter.bvec->bv_len; - rc = smbd_recv_page(info, page, page_offset, to_read); - break; - - default: - /* It's a bug in upper layer to get there */ - cifs_dbg(VFS, "Invalid msg type %d\n", - iov_iter_type(&msg->msg_iter)); - rc = -EINVAL; - } - -out: - /* SMBDirect will read it all or nothing */ - if (rc > 0) - msg->msg_iter.count = 0; - return rc; -} - -/* * Send data to transport * Each rqst is transported as a SMBDirect payload * rqst: the data to write @@ -2032,14 +1985,14 @@ int smbd_send(struct TCP_Server_Info *server, klen += rqst->rq_iov[i].iov_len; iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen); - rc = smbd_post_send_iter(info, &iter, &remaining_data_length); + rc = smbd_post_send_full_iter(info, &iter, &remaining_data_length); if (rc < 0) break; if (iov_iter_count(&rqst->rq_iter) > 0) { /* And then the data pages if there are any */ - rc = smbd_post_send_iter(info, &rqst->rq_iter, - &remaining_data_length); + rc = smbd_post_send_full_iter(info, &rqst->rq_iter, + &remaining_data_length); if (rc < 0) break; } @@ -2589,13 +2542,14 @@ static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter, size_t fsize = folioq_folio_size(folioq, slot); if (offset < fsize) { - size_t part = umin(maxsize - ret, fsize - offset); + size_t part = umin(maxsize, fsize - offset); if (!smb_set_sge(rdma, folio_page(folio, 0), offset, part)) return -EIO; offset += part; ret += part; + maxsize -= part; } if (offset >= fsize) { @@ -2610,7 +2564,7 @@ static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter, slot = 0; } } - } while (rdma->nr_sge < rdma->max_sge || maxsize > 0); + } while (rdma->nr_sge < rdma->max_sge && maxsize > 0); iter->folioq = folioq; iter->folioq_slot = slot; diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index 52bcb55d9952..93e5b2bb9f28 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -140,7 +140,7 @@ DECLARE_EVENT_CLASS(smb3_rw_err_class, __entry->len = len; __entry->rc = rc; ), - TP_printk("\tR=%08x[%x] xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d", + TP_printk("R=%08x[%x] xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d", __entry->rreq_debug_id, __entry->rreq_debug_index, __entry->xid, __entry->sesid, __entry->tid, __entry->fid, __entry->offset, __entry->len, __entry->rc) @@ -190,7 +190,7 @@ DECLARE_EVENT_CLASS(smb3_other_err_class, __entry->len = len; __entry->rc = rc; ), - TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d", + TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d", __entry->xid, __entry->sesid, __entry->tid, __entry->fid, __entry->offset, __entry->len, __entry->rc) ) @@ -247,7 +247,7 @@ DECLARE_EVENT_CLASS(smb3_copy_range_err_class, __entry->len = len; __entry->rc = rc; ), - TP_printk("\txid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x rc=%d", + TP_printk("xid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x rc=%d", __entry->xid, __entry->sesid, __entry->tid, __entry->target_fid, __entry->src_offset, __entry->target_fid, __entry->target_offset, __entry->len, __entry->rc) ) @@ -298,7 +298,7 @@ DECLARE_EVENT_CLASS(smb3_copy_range_done_class, __entry->target_offset = target_offset; __entry->len = len; ), - TP_printk("\txid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x", + TP_printk("xid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x", __entry->xid, __entry->sesid, __entry->tid, __entry->target_fid, __entry->src_offset, __entry->target_fid, __entry->target_offset, __entry->len) ) @@ -482,7 +482,7 @@ DECLARE_EVENT_CLASS(smb3_fd_class, __entry->tid = tid; __entry->sesid = sesid; ), - TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx", + TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx", __entry->xid, __entry->sesid, __entry->tid, __entry->fid) ) @@ -521,7 +521,7 @@ DECLARE_EVENT_CLASS(smb3_fd_err_class, __entry->sesid = sesid; __entry->rc = rc; ), - TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx rc=%d", + TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx rc=%d", __entry->xid, __entry->sesid, __entry->tid, __entry->fid, __entry->rc) ) @@ -794,7 +794,7 @@ DECLARE_EVENT_CLASS(smb3_cmd_err_class, __entry->status = status; __entry->rc = rc; ), - TP_printk("\tsid=0x%llx tid=0x%x cmd=%u mid=%llu status=0x%x rc=%d", + TP_printk("sid=0x%llx tid=0x%x cmd=%u mid=%llu status=0x%x rc=%d", __entry->sesid, __entry->tid, __entry->cmd, __entry->mid, __entry->status, __entry->rc) ) @@ -829,7 +829,7 @@ DECLARE_EVENT_CLASS(smb3_cmd_done_class, __entry->cmd = cmd; __entry->mid = mid; ), - TP_printk("\tsid=0x%llx tid=0x%x cmd=%u mid=%llu", + TP_printk("sid=0x%llx tid=0x%x cmd=%u mid=%llu", __entry->sesid, __entry->tid, __entry->cmd, __entry->mid) ) @@ -867,7 +867,7 @@ DECLARE_EVENT_CLASS(smb3_mid_class, __entry->when_sent = when_sent; __entry->when_received = when_received; ), - TP_printk("\tcmd=%u mid=%llu pid=%u, when_sent=%lu when_rcv=%lu", + TP_printk("cmd=%u mid=%llu pid=%u, when_sent=%lu when_rcv=%lu", __entry->cmd, __entry->mid, __entry->pid, __entry->when_sent, __entry->when_received) ) @@ -898,7 +898,7 @@ DECLARE_EVENT_CLASS(smb3_exit_err_class, __assign_str(func_name); __entry->rc = rc; ), - TP_printk("\t%s: xid=%u rc=%d", + TP_printk("%s: xid=%u rc=%d", __get_str(func_name), __entry->xid, __entry->rc) ) @@ -924,7 +924,7 @@ DECLARE_EVENT_CLASS(smb3_sync_err_class, __entry->ino = ino; __entry->rc = rc; ), - TP_printk("\tino=%lu rc=%d", + TP_printk("ino=%lu rc=%d", __entry->ino, __entry->rc) ) @@ -950,7 +950,7 @@ DECLARE_EVENT_CLASS(smb3_enter_exit_class, __entry->xid = xid; __assign_str(func_name); ), - TP_printk("\t%s: xid=%u", + TP_printk("%s: xid=%u", __get_str(func_name), __entry->xid) ) diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 83764c230e9d..3f04a2977ba8 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -40,7 +40,7 @@ void ksmbd_conn_free(struct ksmbd_conn *conn) kvfree(conn->request_buf); kfree(conn->preauth_info); if (atomic_dec_and_test(&conn->refcnt)) { - ksmbd_free_transport(conn->transport); + conn->transport->ops->free_transport(conn->transport); kfree(conn); } } diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 6efed923bd68..dd3e0e3f7bf0 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -133,6 +133,7 @@ struct ksmbd_transport_ops { void *buf, unsigned int len, struct smb2_buffer_desc_v1 *desc, unsigned int desc_len); + void (*free_transport)(struct ksmbd_transport *kt); }; struct ksmbd_transport { diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 1a308171b599..63d17cea2e95 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1607,17 +1607,18 @@ static int krb5_authenticate(struct ksmbd_work *work, out_len = work->response_sz - (le16_to_cpu(rsp->SecurityBufferOffset) + 4); - /* Check previous session */ - prev_sess_id = le64_to_cpu(req->PreviousSessionId); - if (prev_sess_id && prev_sess_id != sess->id) - destroy_previous_session(conn, sess->user, prev_sess_id); - retval = ksmbd_krb5_authenticate(sess, in_blob, in_len, out_blob, &out_len); if (retval) { ksmbd_debug(SMB, "krb5 authentication failed\n"); return -EINVAL; } + + /* Check previous session */ + prev_sess_id = le64_to_cpu(req->PreviousSessionId); + if (prev_sess_id && prev_sess_id != sess->id) + destroy_previous_session(conn, sess->user, prev_sess_id); + rsp->SecurityBufferLength = cpu_to_le16(out_len); if ((conn->sign || server_conf.enforced_signing) || @@ -4871,8 +4872,13 @@ static int get_file_standard_info(struct smb2_query_info_rsp *rsp, sinfo = (struct smb2_file_standard_info *)rsp->Buffer; delete_pending = ksmbd_inode_pending_delete(fp); - sinfo->AllocationSize = cpu_to_le64(stat.blocks << 9); - sinfo->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); + if (ksmbd_stream_fd(fp) == false) { + sinfo->AllocationSize = cpu_to_le64(stat.blocks << 9); + sinfo->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); + } else { + sinfo->AllocationSize = cpu_to_le64(fp->stream.size); + sinfo->EndOfFile = cpu_to_le64(fp->stream.size); + } sinfo->NumberOfLinks = cpu_to_le32(get_nlink(&stat) - delete_pending); sinfo->DeletePending = delete_pending; sinfo->Directory = S_ISDIR(stat.mode) ? 1 : 0; @@ -4935,9 +4941,14 @@ static int get_file_all_info(struct ksmbd_work *work, file_info->ChangeTime = cpu_to_le64(time); file_info->Attributes = fp->f_ci->m_fattr; file_info->Pad1 = 0; - file_info->AllocationSize = - cpu_to_le64(stat.blocks << 9); - file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); + if (ksmbd_stream_fd(fp) == false) { + file_info->AllocationSize = + cpu_to_le64(stat.blocks << 9); + file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); + } else { + file_info->AllocationSize = cpu_to_le64(fp->stream.size); + file_info->EndOfFile = cpu_to_le64(fp->stream.size); + } file_info->NumberOfLinks = cpu_to_le32(get_nlink(&stat) - delete_pending); file_info->DeletePending = delete_pending; @@ -4946,7 +4957,10 @@ static int get_file_all_info(struct ksmbd_work *work, file_info->IndexNumber = cpu_to_le64(stat.ino); file_info->EASize = 0; file_info->AccessFlags = fp->daccess; - file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos); + if (ksmbd_stream_fd(fp) == false) + file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos); + else + file_info->CurrentByteOffset = cpu_to_le64(fp->stream.pos); file_info->Mode = fp->coption; file_info->AlignmentRequirement = 0; conv_len = smbConvertToUTF16((__le16 *)file_info->FileName, filename, @@ -5134,8 +5148,13 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp, time = ksmbd_UnixTimeToNT(stat.ctime); file_info->ChangeTime = cpu_to_le64(time); file_info->Attributes = fp->f_ci->m_fattr; - file_info->AllocationSize = cpu_to_le64(stat.blocks << 9); - file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); + if (ksmbd_stream_fd(fp) == false) { + file_info->AllocationSize = cpu_to_le64(stat.blocks << 9); + file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); + } else { + file_info->AllocationSize = cpu_to_le64(fp->stream.size); + file_info->EndOfFile = cpu_to_le64(fp->stream.size); + } file_info->Reserved = cpu_to_le32(0); rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_ntwrk_info)); @@ -5158,7 +5177,11 @@ static void get_file_position_info(struct smb2_query_info_rsp *rsp, struct smb2_file_pos_info *file_info; file_info = (struct smb2_file_pos_info *)rsp->Buffer; - file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos); + if (ksmbd_stream_fd(fp) == false) + file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos); + else + file_info->CurrentByteOffset = cpu_to_le64(fp->stream.pos); + rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_pos_info)); } @@ -5247,8 +5270,13 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp, file_info->ChangeTime = cpu_to_le64(time); file_info->DosAttributes = fp->f_ci->m_fattr; file_info->Inode = cpu_to_le64(stat.ino); - file_info->EndOfFile = cpu_to_le64(stat.size); - file_info->AllocationSize = cpu_to_le64(stat.blocks << 9); + if (ksmbd_stream_fd(fp) == false) { + file_info->EndOfFile = cpu_to_le64(stat.size); + file_info->AllocationSize = cpu_to_le64(stat.blocks << 9); + } else { + file_info->EndOfFile = cpu_to_le64(fp->stream.size); + file_info->AllocationSize = cpu_to_le64(fp->stream.size); + } file_info->HardLinks = cpu_to_le32(stat.nlink); file_info->Mode = cpu_to_le32(stat.mode & 0777); switch (stat.mode & S_IFMT) { @@ -6190,6 +6218,9 @@ static int set_file_allocation_info(struct ksmbd_work *work, if (!(fp->daccess & FILE_WRITE_DATA_LE)) return -EACCES; + if (ksmbd_stream_fd(fp) == true) + return 0; + rc = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); if (rc) @@ -6248,7 +6279,8 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp, * truncate of some filesystem like FAT32 fill zero data in * truncated range. */ - if (inode->i_sb->s_magic != MSDOS_SUPER_MAGIC) { + if (inode->i_sb->s_magic != MSDOS_SUPER_MAGIC && + ksmbd_stream_fd(fp) == false) { ksmbd_debug(SMB, "truncated to newsize %lld\n", newsize); rc = ksmbd_vfs_truncate(work, fp, newsize); if (rc) { @@ -6321,7 +6353,13 @@ static int set_file_position_info(struct ksmbd_file *fp, return -EINVAL; } - fp->filp->f_pos = current_byte_offset; + if (ksmbd_stream_fd(fp) == false) + fp->filp->f_pos = current_byte_offset; + else { + if (current_byte_offset > XATTR_SIZE_MAX) + current_byte_offset = XATTR_SIZE_MAX; + fp->stream.pos = current_byte_offset; + } return 0; } @@ -8535,11 +8573,6 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work) goto err_out; } - opinfo->op_state = OPLOCK_STATE_NONE; - wake_up_interruptible_all(&opinfo->oplock_q); - opinfo_put(opinfo); - ksmbd_fd_put(work, fp); - rsp->StructureSize = cpu_to_le16(24); rsp->OplockLevel = rsp_oplevel; rsp->Reserved = 0; @@ -8547,16 +8580,15 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work) rsp->VolatileFid = volatile_id; rsp->PersistentFid = persistent_id; ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_oplock_break)); - if (!ret) - return; - + if (ret) { err_out: + smb2_set_err_rsp(work); + } + opinfo->op_state = OPLOCK_STATE_NONE; wake_up_interruptible_all(&opinfo->oplock_q); - opinfo_put(opinfo); ksmbd_fd_put(work, fp); - smb2_set_err_rsp(work); } static int check_lease_state(struct lease *lease, __le32 req_state) @@ -8686,11 +8718,6 @@ static void smb21_lease_break_ack(struct ksmbd_work *work) } lease_state = lease->state; - opinfo->op_state = OPLOCK_STATE_NONE; - wake_up_interruptible_all(&opinfo->oplock_q); - atomic_dec(&opinfo->breaking_cnt); - wake_up_interruptible_all(&opinfo->oplock_brk); - opinfo_put(opinfo); rsp->StructureSize = cpu_to_le16(36); rsp->Reserved = 0; @@ -8699,16 +8726,16 @@ static void smb21_lease_break_ack(struct ksmbd_work *work) rsp->LeaseState = lease_state; rsp->LeaseDuration = 0; ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_lease_ack)); - if (!ret) - return; - + if (ret) { err_out: + smb2_set_err_rsp(work); + } + + opinfo->op_state = OPLOCK_STATE_NONE; wake_up_interruptible_all(&opinfo->oplock_q); atomic_dec(&opinfo->breaking_cnt); wake_up_interruptible_all(&opinfo->oplock_brk); - opinfo_put(opinfo); - smb2_set_err_rsp(work); } /** diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 4998df04ab95..c6cbe0d56e32 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -159,7 +159,8 @@ struct smb_direct_transport { }; #define KSMBD_TRANS(t) ((struct ksmbd_transport *)&((t)->transport)) - +#define SMBD_TRANS(t) ((struct smb_direct_transport *)container_of(t, \ + struct smb_direct_transport, transport)) enum { SMB_DIRECT_MSG_NEGOTIATE_REQ = 0, SMB_DIRECT_MSG_DATA_TRANSFER @@ -410,6 +411,11 @@ err: return NULL; } +static void smb_direct_free_transport(struct ksmbd_transport *kt) +{ + kfree(SMBD_TRANS(kt)); +} + static void free_transport(struct smb_direct_transport *t) { struct smb_direct_recvmsg *recvmsg; @@ -427,7 +433,8 @@ static void free_transport(struct smb_direct_transport *t) if (t->qp) { ib_drain_qp(t->qp); ib_mr_pool_destroy(t->qp, &t->qp->rdma_mrs); - ib_destroy_qp(t->qp); + t->qp = NULL; + rdma_destroy_qp(t->cm_id); } ksmbd_debug(RDMA, "drain the reassembly queue\n"); @@ -455,7 +462,6 @@ static void free_transport(struct smb_direct_transport *t) smb_direct_destroy_pools(t); ksmbd_conn_free(KSMBD_TRANS(t)->conn); - kfree(t); } static struct smb_direct_sendmsg @@ -1935,8 +1941,8 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, return 0; err: if (t->qp) { - ib_destroy_qp(t->qp); t->qp = NULL; + rdma_destroy_qp(t->cm_id); } if (t->recv_cq) { ib_destroy_cq(t->recv_cq); @@ -2281,4 +2287,5 @@ static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = { .read = smb_direct_read, .rdma_read = smb_direct_rdma_read, .rdma_write = smb_direct_rdma_write, + .free_transport = smb_direct_free_transport, }; diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index abedf510899a..4e9f98db9ff4 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -93,7 +93,7 @@ static struct tcp_transport *alloc_transport(struct socket *client_sk) return t; } -void ksmbd_free_transport(struct ksmbd_transport *kt) +static void ksmbd_tcp_free_transport(struct ksmbd_transport *kt) { struct tcp_transport *t = TCP_TRANS(kt); @@ -656,4 +656,5 @@ static const struct ksmbd_transport_ops ksmbd_tcp_transport_ops = { .read = ksmbd_tcp_read, .writev = ksmbd_tcp_writev, .disconnect = ksmbd_tcp_disconnect, + .free_transport = ksmbd_tcp_free_transport, }; diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index ba45e809555a..d3437f6644e3 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -293,6 +293,7 @@ static int ksmbd_vfs_stream_read(struct ksmbd_file *fp, char *buf, loff_t *pos, if (v_len - *pos < count) count = v_len - *pos; + fp->stream.pos = v_len; memcpy(buf, &stream_buf[*pos], count); @@ -456,8 +457,8 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, true); if (err < 0) goto out; - - fp->filp->f_pos = *pos; + else + fp->stream.pos = size; err = 0; out: kvfree(stream_buf); @@ -1281,6 +1282,7 @@ out1: err = ksmbd_vfs_lock_parent(parent_path->dentry, path->dentry); if (err) { + mnt_drop_write(parent_path->mnt); path_put(path); path_put(parent_path); } diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h index 5bbb179736c2..0708155b5caf 100644 --- a/fs/smb/server/vfs_cache.h +++ b/fs/smb/server/vfs_cache.h @@ -44,6 +44,7 @@ struct ksmbd_lock { struct stream { char *name; ssize_t size; + loff_t pos; }; struct ksmbd_inode { diff --git a/fs/super.c b/fs/super.c index 21799e213fd7..80418ca8e215 100644 --- a/fs/super.c +++ b/fs/super.c @@ -964,8 +964,10 @@ void iterate_supers_type(struct file_system_type *type, spin_unlock(&sb_lock); locked = super_lock_shared(sb); - if (locked) + if (locked) { f(sb, arg); + super_unlock_shared(sb); + } spin_lock(&sb_lock); if (p) diff --git a/fs/xattr.c b/fs/xattr.c index 8ec5b0204bfd..600ae97969cf 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1479,6 +1479,7 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, buffer += err; } remaining_size -= err; + err = 0; read_lock(&xattrs->lock); for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) { diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 7839efe050bf..000cc7f4a3ce 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -3444,16 +3444,41 @@ xfs_alloc_read_agf( set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); } + #ifdef DEBUG - else if (!xfs_is_shutdown(mp)) { - ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); - ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); - ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); - ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); - ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level)); - ASSERT(pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level)); + /* + * It's possible for the AGF to be out of sync if the block device is + * silently dropping writes. This can happen in fstests with dmflakey + * enabled, which allows the buffer to be cleaned and reclaimed by + * memory pressure and then re-read from disk here. We will get a + * stale version of the AGF from disk, and nothing good can happen from + * here. Hence if we detect this situation, immediately shut down the + * filesystem. + * + * This can also happen if we are already in the middle of a forced + * shutdown, so don't bother checking if we are already shut down. + */ + if (!xfs_is_shutdown(pag_mount(pag))) { + bool ok = true; + + ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks); + ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks); + ok &= pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks); + ok &= pag->pagf_flcount == be32_to_cpu(agf->agf_flcount); + ok &= pag->pagf_longest == be32_to_cpu(agf->agf_longest); + ok &= pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level); + ok &= pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level); + + if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF); + xfs_trans_brelse(tp, agfbp); + xfs_force_shutdown(pag_mount(pag), + SHUTDOWN_CORRUPT_ONDISK); + return -EFSCORRUPTED; + } } -#endif +#endif /* DEBUG */ + if (agfbpp) *agfbpp = agfbp; else diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 0c47b5c6ca7d..750111634d9f 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2801,12 +2801,35 @@ xfs_ialloc_read_agi( set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); } +#ifdef DEBUG /* - * It's possible for these to be out of sync if - * we are in the middle of a forced shutdown. + * It's possible for the AGF to be out of sync if the block device is + * silently dropping writes. This can happen in fstests with dmflakey + * enabled, which allows the buffer to be cleaned and reclaimed by + * memory pressure and then re-read from disk here. We will get a + * stale version of the AGF from disk, and nothing good can happen from + * here. Hence if we detect this situation, immediately shut down the + * filesystem. + * + * This can also happen if we are already in the middle of a forced + * shutdown, so don't bother checking if we are already shut down. */ - ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || - xfs_is_shutdown(pag_mount(pag))); + if (!xfs_is_shutdown(pag_mount(pag))) { + bool ok = true; + + ok &= pag->pagi_freecount == be32_to_cpu(agi->agi_freecount); + ok &= pag->pagi_count == be32_to_cpu(agi->agi_count); + + if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); + xfs_trans_brelse(tp, agibp); + xfs_force_shutdown(pag_mount(pag), + SHUTDOWN_CORRUPT_ONDISK); + return -EFSCORRUPTED; + } + } +#endif /* DEBUG */ + if (agibpp) *agibpp = agibp; else diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8af83bd161f9..ba5bd6031ece 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2082,44 +2082,6 @@ xfs_buf_delwri_submit( return error; } -/* - * Push a single buffer on a delwri queue. - * - * The purpose of this function is to submit a single buffer of a delwri queue - * and return with the buffer still on the original queue. - * - * The buffer locking and queue management logic between _delwri_pushbuf() and - * _delwri_queue() guarantee that the buffer cannot be queued to another list - * before returning. - */ -int -xfs_buf_delwri_pushbuf( - struct xfs_buf *bp, - struct list_head *buffer_list) -{ - int error; - - ASSERT(bp->b_flags & _XBF_DELWRI_Q); - - trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); - - xfs_buf_lock(bp); - bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC); - bp->b_flags |= XBF_WRITE; - xfs_buf_submit(bp); - - /* - * The buffer is now locked, under I/O but still on the original delwri - * queue. Wait for I/O completion, restore the DELWRI_Q flag and - * return with the buffer unlocked and still on the original queue. - */ - error = xfs_buf_iowait(bp); - bp->b_flags |= _XBF_DELWRI_Q; - xfs_buf_unlock(bp); - - return error; -} - void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { /* diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 9d2ab567cf81..15fc56948346 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -326,7 +326,6 @@ extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); void xfs_buf_delwri_queue_here(struct xfs_buf *bp, struct list_head *bl); extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); -extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 90139e0f3271..7fc54725c5f6 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -32,6 +32,61 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_buf_log_item, bli_item); } +static void +xfs_buf_item_get_format( + struct xfs_buf_log_item *bip, + int count) +{ + ASSERT(bip->bli_formats == NULL); + bip->bli_format_count = count; + + if (count == 1) { + bip->bli_formats = &bip->__bli_format; + return; + } + + bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format), + GFP_KERNEL | __GFP_NOFAIL); +} + +static void +xfs_buf_item_free_format( + struct xfs_buf_log_item *bip) +{ + if (bip->bli_formats != &bip->__bli_format) { + kfree(bip->bli_formats); + bip->bli_formats = NULL; + } +} + +static void +xfs_buf_item_free( + struct xfs_buf_log_item *bip) +{ + xfs_buf_item_free_format(bip); + kvfree(bip->bli_item.li_lv_shadow); + kmem_cache_free(xfs_buf_item_cache, bip); +} + +/* + * xfs_buf_item_relse() is called when the buf log item is no longer needed. + */ +static void +xfs_buf_item_relse( + struct xfs_buf_log_item *bip) +{ + struct xfs_buf *bp = bip->bli_buf; + + trace_xfs_buf_item_relse(bp, _RET_IP_); + + ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); + ASSERT(atomic_read(&bip->bli_refcount) == 0); + + bp->b_log_item = NULL; + xfs_buf_rele(bp); + xfs_buf_item_free(bip); +} + /* Is this log iovec plausibly large enough to contain the buffer log format? */ bool xfs_buf_log_check_iovec( @@ -390,6 +445,42 @@ xfs_buf_item_pin( } /* + * For a stale BLI, process all the necessary completions that must be + * performed when the final BLI reference goes away. The buffer will be + * referenced and locked here - we return to the caller with the buffer still + * referenced and locked for them to finalise processing of the buffer. + */ +static void +xfs_buf_item_finish_stale( + struct xfs_buf_log_item *bip) +{ + struct xfs_buf *bp = bip->bli_buf; + struct xfs_log_item *lip = &bip->bli_item; + + ASSERT(bip->bli_flags & XFS_BLI_STALE); + ASSERT(xfs_buf_islocked(bp)); + ASSERT(bp->b_flags & XBF_STALE); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(list_empty(&lip->li_trans)); + ASSERT(!bp->b_transp); + + if (bip->bli_flags & XFS_BLI_STALE_INODE) { + xfs_buf_item_done(bp); + xfs_buf_inode_iodone(bp); + ASSERT(list_empty(&bp->b_li_list)); + return; + } + + /* + * We may or may not be on the AIL here, xfs_trans_ail_delete() will do + * the right thing regardless of the situation in which we are called. + */ + xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR); + xfs_buf_item_relse(bip); + ASSERT(bp->b_log_item == NULL); +} + +/* * This is called to unpin the buffer associated with the buf log item which was * previously pinned with a call to xfs_buf_item_pin(). We enter this function * with a buffer pin count, a buffer reference and a BLI reference. @@ -438,13 +529,6 @@ xfs_buf_item_unpin( } if (stale) { - ASSERT(bip->bli_flags & XFS_BLI_STALE); - ASSERT(xfs_buf_islocked(bp)); - ASSERT(bp->b_flags & XBF_STALE); - ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); - ASSERT(list_empty(&lip->li_trans)); - ASSERT(!bp->b_transp); - trace_xfs_buf_item_unpin_stale(bip); /* @@ -455,22 +539,7 @@ xfs_buf_item_unpin( * processing is complete. */ xfs_buf_rele(bp); - - /* - * If we get called here because of an IO error, we may or may - * not have the item on the AIL. xfs_trans_ail_delete() will - * take care of that situation. xfs_trans_ail_delete() drops - * the AIL lock. - */ - if (bip->bli_flags & XFS_BLI_STALE_INODE) { - xfs_buf_item_done(bp); - xfs_buf_inode_iodone(bp); - ASSERT(list_empty(&bp->b_li_list)); - } else { - xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR); - xfs_buf_item_relse(bp); - ASSERT(bp->b_log_item == NULL); - } + xfs_buf_item_finish_stale(bip); xfs_buf_relse(bp); return; } @@ -543,43 +612,42 @@ xfs_buf_item_push( * Drop the buffer log item refcount and take appropriate action. This helper * determines whether the bli must be freed or not, since a decrement to zero * does not necessarily mean the bli is unused. - * - * Return true if the bli is freed, false otherwise. */ -bool +void xfs_buf_item_put( struct xfs_buf_log_item *bip) { - struct xfs_log_item *lip = &bip->bli_item; - bool aborted; - bool dirty; + + ASSERT(xfs_buf_islocked(bip->bli_buf)); /* drop the bli ref and return if it wasn't the last one */ if (!atomic_dec_and_test(&bip->bli_refcount)) - return false; + return; - /* - * We dropped the last ref and must free the item if clean or aborted. - * If the bli is dirty and non-aborted, the buffer was clean in the - * transaction but still awaiting writeback from previous changes. In - * that case, the bli is freed on buffer writeback completion. - */ - aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) || - xlog_is_shutdown(lip->li_log); - dirty = bip->bli_flags & XFS_BLI_DIRTY; - if (dirty && !aborted) - return false; + /* If the BLI is in the AIL, then it is still dirty and in use */ + if (test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)) { + ASSERT(bip->bli_flags & XFS_BLI_DIRTY); + return; + } /* - * The bli is aborted or clean. An aborted item may be in the AIL - * regardless of dirty state. For example, consider an aborted - * transaction that invalidated a dirty bli and cleared the dirty - * state. + * In shutdown conditions, we can be asked to free a dirty BLI that + * isn't in the AIL. This can occur due to a checkpoint aborting a BLI + * instead of inserting it into the AIL at checkpoint IO completion. If + * there's another bli reference (e.g. a btree cursor holds a clean + * reference) and it is released via xfs_trans_brelse(), we can get here + * with that aborted, dirty BLI. In this case, it is safe to free the + * dirty BLI immediately, as it is not in the AIL and there are no + * other references to it. + * + * We should never get here with a stale BLI via that path as + * xfs_trans_brelse() specifically holds onto stale buffers rather than + * releasing them. */ - if (aborted) - xfs_trans_ail_delete(lip, 0); - xfs_buf_item_relse(bip->bli_buf); - return true; + ASSERT(!(bip->bli_flags & XFS_BLI_DIRTY) || + test_bit(XFS_LI_ABORTED, &bip->bli_item.li_flags)); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + xfs_buf_item_relse(bip); } /* @@ -600,6 +668,15 @@ xfs_buf_item_put( * if necessary but do not unlock the buffer. This is for support of * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't * free the item. + * + * If the XFS_BLI_STALE flag is set, the last reference to the BLI *must* + * perform a completion abort of any objects attached to the buffer for IO + * tracking purposes. This generally only happens in shutdown situations, + * normally xfs_buf_item_unpin() will drop the last BLI reference and perform + * completion processing. However, because transaction completion can race with + * checkpoint completion during a shutdown, this release context may end up + * being the last active reference to the BLI and so needs to perform this + * cleanup. */ STATIC void xfs_buf_item_release( @@ -607,18 +684,19 @@ xfs_buf_item_release( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - bool released; bool hold = bip->bli_flags & XFS_BLI_HOLD; bool stale = bip->bli_flags & XFS_BLI_STALE; -#if defined(DEBUG) || defined(XFS_WARN) - bool ordered = bip->bli_flags & XFS_BLI_ORDERED; - bool dirty = bip->bli_flags & XFS_BLI_DIRTY; bool aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags); + bool dirty = bip->bli_flags & XFS_BLI_DIRTY; +#if defined(DEBUG) || defined(XFS_WARN) + bool ordered = bip->bli_flags & XFS_BLI_ORDERED; #endif trace_xfs_buf_item_release(bip); + ASSERT(xfs_buf_islocked(bp)); + /* * The bli dirty state should match whether the blf has logged segments * except for ordered buffers, where only the bli should be dirty. @@ -634,16 +712,56 @@ xfs_buf_item_release( bp->b_transp = NULL; bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); + /* If there are other references, then we have nothing to do. */ + if (!atomic_dec_and_test(&bip->bli_refcount)) + goto out_release; + + /* + * Stale buffer completion frees the BLI, unlocks and releases the + * buffer. Neither the BLI or buffer are safe to reference after this + * call, so there's nothing more we need to do here. + * + * If we get here with a stale buffer and references to the BLI remain, + * we must not unlock the buffer as the last BLI reference owns lock + * context, not us. + */ + if (stale) { + xfs_buf_item_finish_stale(bip); + xfs_buf_relse(bp); + ASSERT(!hold); + return; + } + /* - * Unref the item and unlock the buffer unless held or stale. Stale - * buffers remain locked until final unpin unless the bli is freed by - * the unref call. The latter implies shutdown because buffer - * invalidation dirties the bli and transaction. + * Dirty or clean, aborted items are done and need to be removed from + * the AIL and released. This frees the BLI, but leaves the buffer + * locked and referenced. */ - released = xfs_buf_item_put(bip); - if (hold || (stale && !released)) + if (aborted || xlog_is_shutdown(lip->li_log)) { + ASSERT(list_empty(&bip->bli_buf->b_li_list)); + xfs_buf_item_done(bp); + goto out_release; + } + + /* + * Clean, unreferenced BLIs can be immediately freed, leaving the buffer + * locked and referenced. + * + * Dirty, unreferenced BLIs *must* be in the AIL awaiting writeback. + */ + if (!dirty) + xfs_buf_item_relse(bip); + else + ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags)); + + /* Not safe to reference the BLI from here */ +out_release: + /* + * If we get here with a stale buffer, we must not unlock the + * buffer as the last BLI reference owns lock context, not us. + */ + if (stale || hold) return; - ASSERT(!stale || aborted); xfs_buf_relse(bp); } @@ -729,33 +847,6 @@ static const struct xfs_item_ops xfs_buf_item_ops = { .iop_push = xfs_buf_item_push, }; -STATIC void -xfs_buf_item_get_format( - struct xfs_buf_log_item *bip, - int count) -{ - ASSERT(bip->bli_formats == NULL); - bip->bli_format_count = count; - - if (count == 1) { - bip->bli_formats = &bip->__bli_format; - return; - } - - bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format), - GFP_KERNEL | __GFP_NOFAIL); -} - -STATIC void -xfs_buf_item_free_format( - struct xfs_buf_log_item *bip) -{ - if (bip->bli_formats != &bip->__bli_format) { - kfree(bip->bli_formats); - bip->bli_formats = NULL; - } -} - /* * Allocate a new buf log item to go with the given buffer. * Set the buffer's b_log_item field to point to the new @@ -976,34 +1067,6 @@ xfs_buf_item_dirty_format( return false; } -STATIC void -xfs_buf_item_free( - struct xfs_buf_log_item *bip) -{ - xfs_buf_item_free_format(bip); - kvfree(bip->bli_item.li_lv_shadow); - kmem_cache_free(xfs_buf_item_cache, bip); -} - -/* - * xfs_buf_item_relse() is called when the buf log item is no longer needed. - */ -void -xfs_buf_item_relse( - struct xfs_buf *bp) -{ - struct xfs_buf_log_item *bip = bp->b_log_item; - - trace_xfs_buf_item_relse(bp, _RET_IP_); - ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); - - if (atomic_read(&bip->bli_refcount)) - return; - bp->b_log_item = NULL; - xfs_buf_rele(bp); - xfs_buf_item_free(bip); -} - void xfs_buf_item_done( struct xfs_buf *bp) @@ -1023,5 +1086,5 @@ xfs_buf_item_done( xfs_trans_ail_delete(&bp->b_log_item->bli_item, (bp->b_flags & _XBF_LOGRECOVERY) ? 0 : SHUTDOWN_CORRUPT_INCORE); - xfs_buf_item_relse(bp); + xfs_buf_item_relse(bp->b_log_item); } diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index e10e324cd245..416890b84f8c 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -49,8 +49,7 @@ struct xfs_buf_log_item { int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_done(struct xfs_buf *bp); -void xfs_buf_item_relse(struct xfs_buf *); -bool xfs_buf_item_put(struct xfs_buf_log_item *); +void xfs_buf_item_put(struct xfs_buf_log_item *bip); void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint); bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_inode_iodone(struct xfs_buf *); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index b4e32f0860b7..0bd8022e47b4 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1398,11 +1398,9 @@ xfs_qm_dqflush( ASSERT(XFS_DQ_IS_LOCKED(dqp)); ASSERT(!completion_done(&dqp->q_flush)); + ASSERT(atomic_read(&dqp->q_pincount) == 0); trace_xfs_dqflush(dqp); - - xfs_qm_dqunpin_wait(dqp); - fa = xfs_qm_dqflush_check(dqp); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 48254a72071b..0b41b18debf3 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1335,9 +1335,10 @@ xfs_falloc_allocate_range( } #define XFS_FALLOC_FL_SUPPORTED \ - (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ - FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ - FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE) + (FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE | \ + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | \ + FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | \ + FALLOC_FL_UNSHARE_RANGE) STATIC long __xfs_file_fallocate( diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 726e29b837e6..bbc2f2973dcc 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -979,7 +979,15 @@ xfs_reclaim_inode( */ if (xlog_is_shutdown(ip->i_mount->m_log)) { xfs_iunpin_wait(ip); + /* + * Avoid a ABBA deadlock on the inode cluster buffer vs + * concurrent xfs_ifree_cluster() trying to mark the inode + * stale. We don't need the inode locked to run the flush abort + * code, but the flush abort needs to lock the cluster buffer. + */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_iflush_shutdown_abort(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); goto reclaim; } if (xfs_ipincount(ip)) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ee3e0f284287..761a996a857c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1635,7 +1635,7 @@ retry: iip = ip->i_itemp; if (__xfs_iflags_test(ip, XFS_IFLUSHING)) { ASSERT(!list_empty(&iip->ili_item.li_bio_list)); - ASSERT(iip->ili_last_fields); + ASSERT(iip->ili_last_fields || xlog_is_shutdown(mp->m_log)); goto out_iunlock; } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index c6cb0b6b9e46..285e27ff89e2 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -758,11 +758,14 @@ xfs_inode_item_push( * completed and items removed from the AIL before the next push * attempt. */ + trace_xfs_inode_push_stale(ip, _RET_IP_); return XFS_ITEM_PINNED; } - if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp)) + if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp)) { + trace_xfs_inode_push_pinned(ip, _RET_IP_); return XFS_ITEM_PINNED; + } if (xfs_iflags_test(ip, XFS_IFLUSHING)) return XFS_ITEM_FLUSHING; diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index f66d2d430e4f..a80cb6b9969a 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -793,8 +793,10 @@ xlog_cil_ail_insert( struct xfs_log_item *lip = lv->lv_item; xfs_lsn_t item_lsn; - if (aborted) + if (aborted) { + trace_xlog_ail_insert_abort(lip); set_bit(XFS_LI_ABORTED, &lip->li_flags); + } if (lip->li_ops->flags & XFS_ITEM_RELEASE_WHEN_COMMITTED) { lip->li_ops->iop_release(lip); diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 08443ceec329..866c71d9fbae 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -320,7 +320,7 @@ xfs_mru_cache_create( xfs_mru_cache_free_func_t free_func) { struct xfs_mru_cache *mru = NULL; - int err = 0, grp; + int grp; unsigned int grp_time; if (mrup) @@ -341,8 +341,8 @@ xfs_mru_cache_create( mru->lists = kzalloc(mru->grp_count * sizeof(*mru->lists), GFP_KERNEL | __GFP_NOFAIL); if (!mru->lists) { - err = -ENOMEM; - goto exit; + kfree(mru); + return -ENOMEM; } for (grp = 0; grp < mru->grp_count; grp++) @@ -361,14 +361,7 @@ xfs_mru_cache_create( mru->free_func = free_func; mru->data = data; *mrup = mru; - -exit: - if (err && mru && mru->lists) - kfree(mru->lists); - if (err && mru) - kfree(mru); - - return err; + return 0; } /* @@ -425,10 +418,6 @@ xfs_mru_cache_insert( { int error = -EINVAL; - ASSERT(mru && mru->lists); - if (!mru || !mru->lists) - goto out_free; - error = -ENOMEM; if (radix_tree_preload(GFP_KERNEL)) goto out_free; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 417439b58785..fa135ac26471 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -134,6 +134,7 @@ xfs_qm_dqpurge( dqp->q_flags |= XFS_DQFLAG_FREEING; + xfs_qm_dqunpin_wait(dqp); xfs_dqflock(dqp); /* @@ -465,6 +466,7 @@ xfs_qm_dquot_isolate( struct xfs_dquot *dqp = container_of(item, struct xfs_dquot, q_lru); struct xfs_qm_isolate *isol = arg; + enum lru_status ret = LRU_SKIP; if (!xfs_dqlock_nowait(dqp)) goto out_miss_busy; @@ -478,6 +480,16 @@ xfs_qm_dquot_isolate( goto out_miss_unlock; /* + * If the dquot is pinned or dirty, rotate it to the end of the LRU to + * give some time for it to be cleaned before we try to isolate it + * again. + */ + ret = LRU_ROTATE; + if (XFS_DQ_IS_DIRTY(dqp) || atomic_read(&dqp->q_pincount) > 0) { + goto out_miss_unlock; + } + + /* * This dquot has acquired a reference in the meantime remove it from * the freelist and try again. */ @@ -492,41 +504,14 @@ xfs_qm_dquot_isolate( } /* - * If the dquot is dirty, flush it. If it's already being flushed, just - * skip it so there is time for the IO to complete before we try to - * reclaim it again on the next LRU pass. + * The dquot may still be under IO, in which case the flush lock will be + * held. If we can't get the flush lock now, just skip over the dquot as + * if it was dirty. */ if (!xfs_dqflock_nowait(dqp)) goto out_miss_unlock; - if (XFS_DQ_IS_DIRTY(dqp)) { - struct xfs_buf *bp = NULL; - int error; - - trace_xfs_dqreclaim_dirty(dqp); - - /* we have to drop the LRU lock to flush the dquot */ - spin_unlock(&lru->lock); - - error = xfs_dquot_use_attached_buf(dqp, &bp); - if (!bp || error == -EAGAIN) { - xfs_dqfunlock(dqp); - goto out_unlock_dirty; - } - - /* - * dqflush completes dqflock on error, and the delwri ioend - * does it on success. - */ - error = xfs_qm_dqflush(dqp, bp); - if (error) - goto out_unlock_dirty; - - xfs_buf_delwri_queue(bp, &isol->buffers); - xfs_buf_relse(bp); - goto out_unlock_dirty; - } - + ASSERT(!XFS_DQ_IS_DIRTY(dqp)); xfs_dquot_detach_buf(dqp); xfs_dqfunlock(dqp); @@ -548,13 +533,7 @@ out_miss_unlock: out_miss_busy: trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); - return LRU_SKIP; - -out_unlock_dirty: - trace_xfs_dqreclaim_busy(dqp); - XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); - xfs_dqunlock(dqp); - return LRU_RETRY; + return ret; } static unsigned long @@ -1486,7 +1465,6 @@ xfs_qm_flush_one( struct xfs_dquot *dqp, void *data) { - struct xfs_mount *mp = dqp->q_mount; struct list_head *buffer_list = data; struct xfs_buf *bp = NULL; int error = 0; @@ -1497,34 +1475,8 @@ xfs_qm_flush_one( if (!XFS_DQ_IS_DIRTY(dqp)) goto out_unlock; - /* - * The only way the dquot is already flush locked by the time quotacheck - * gets here is if reclaim flushed it before the dqadjust walk dirtied - * it for the final time. Quotacheck collects all dquot bufs in the - * local delwri queue before dquots are dirtied, so reclaim can't have - * possibly queued it for I/O. The only way out is to push the buffer to - * cycle the flush lock. - */ - if (!xfs_dqflock_nowait(dqp)) { - /* buf is pinned in-core by delwri list */ - error = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0, &bp); - if (error) - goto out_unlock; - - if (!(bp->b_flags & _XBF_DELWRI_Q)) { - error = -EAGAIN; - xfs_buf_relse(bp); - goto out_unlock; - } - xfs_buf_unlock(bp); - - xfs_buf_delwri_pushbuf(bp, buffer_list); - xfs_buf_rele(bp); - - error = -EAGAIN; - goto out_unlock; - } + xfs_qm_dqunpin_wait(dqp); + xfs_dqflock(dqp); error = xfs_dquot_use_attached_buf(dqp, &bp); if (error) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 6484c596ecea..736eb0924573 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1259,6 +1259,8 @@ xfs_growfs_check_rtgeom( kfree(nmp); + trace_xfs_growfs_check_rtgeom(mp, min_logfsbs); + if (min_logfsbs > mp->m_sb.sb_logblocks) return -EINVAL; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0bc4b5489078..bb0a82635a77 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -2020,14 +2020,13 @@ xfs_remount_rw( int error; if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp && - bdev_read_only(mp->m_logdev_targp->bt_bdev)) { + xfs_readonly_buftarg(mp->m_logdev_targp)) { xfs_warn(mp, "ro->rw transition prohibited by read-only logdev"); return -EACCES; } - if (mp->m_rtdev_targp && - bdev_read_only(mp->m_rtdev_targp->bt_bdev)) { + if (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp)) { xfs_warn(mp, "ro->rw transition prohibited by read-only rtdev"); return -EACCES; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 01d284a1c759..ba45d801df1c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -778,7 +778,6 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done); DEFINE_BUF_EVENT(xfs_buf_delwri_queue); DEFINE_BUF_EVENT(xfs_buf_delwri_queued); DEFINE_BUF_EVENT(xfs_buf_delwri_split); -DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf); DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_buf_item_relse); DEFINE_BUF_EVENT(xfs_buf_iodone_async); @@ -1147,6 +1146,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class, __field(xfs_ino_t, ino) __field(int, count) __field(int, pincount) + __field(unsigned long, iflags) __field(unsigned long, caller_ip) ), TP_fast_assign( @@ -1154,13 +1154,15 @@ DECLARE_EVENT_CLASS(xfs_iref_class, __entry->ino = ip->i_ino; __entry->count = atomic_read(&VFS_I(ip)->i_count); __entry->pincount = atomic_read(&ip->i_pincount); + __entry->iflags = ip->i_flags; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pS", + TP_printk("dev %d:%d ino 0x%llx count %d pincount %d iflags 0x%lx caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->count, __entry->pincount, + __entry->iflags, (char *)__entry->caller_ip) ) @@ -1250,6 +1252,8 @@ DEFINE_IREF_EVENT(xfs_irele); DEFINE_IREF_EVENT(xfs_inode_pin); DEFINE_IREF_EVENT(xfs_inode_unpin); DEFINE_IREF_EVENT(xfs_inode_unpin_nowait); +DEFINE_IREF_EVENT(xfs_inode_push_pinned); +DEFINE_IREF_EVENT(xfs_inode_push_stale); DECLARE_EVENT_CLASS(xfs_namespace_class, TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name), @@ -1654,6 +1658,8 @@ DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing); DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_mark); DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_skip); DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_unpin); +DEFINE_LOG_ITEM_EVENT(xlog_ail_insert_abort); +DEFINE_LOG_ITEM_EVENT(xfs_trans_free_abort); DECLARE_EVENT_CLASS(xfs_ail_class, TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index c6657072361a..b4a07af513ba 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -742,8 +742,10 @@ xfs_trans_free_items( list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) { xfs_trans_del_item(lip); - if (abort) + if (abort) { + trace_xfs_trans_free_abort(lip); set_bit(XFS_LI_ABORTED, &lip->li_flags); + } if (lip->li_ops->iop_release) lip->li_ops->iop_release(lip); } diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 80add26c0111..01315ed75502 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -727,7 +727,7 @@ xfs_select_zone( for (;;) { prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE); oz = xfs_select_zone_nowait(mp, write_hint, pack_tight); - if (oz) + if (oz || xfs_is_shutdown(mp)) break; schedule(); } @@ -777,26 +777,6 @@ xfs_mark_rtg_boundary( ioend->io_flags |= IOMAP_IOEND_BOUNDARY; } -static void -xfs_submit_zoned_bio( - struct iomap_ioend *ioend, - struct xfs_open_zone *oz, - bool is_seq) -{ - ioend->io_bio.bi_iter.bi_sector = ioend->io_sector; - ioend->io_private = oz; - atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */ - - if (is_seq) { - ioend->io_bio.bi_opf &= ~REQ_OP_WRITE; - ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND; - } else { - xfs_mark_rtg_boundary(ioend); - } - - submit_bio(&ioend->io_bio); -} - /* * Cache the last zone written to for an inode so that it is considered first * for subsequent writes. @@ -891,6 +871,26 @@ xfs_zone_cache_create_association( xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru); } +static void +xfs_submit_zoned_bio( + struct iomap_ioend *ioend, + struct xfs_open_zone *oz, + bool is_seq) +{ + ioend->io_bio.bi_iter.bi_sector = ioend->io_sector; + ioend->io_private = oz; + atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */ + + if (is_seq) { + ioend->io_bio.bi_opf &= ~REQ_OP_WRITE; + ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND; + } else { + xfs_mark_rtg_boundary(ioend); + } + + submit_bio(&ioend->io_bio); +} + void xfs_zone_alloc_and_submit( struct iomap_ioend *ioend, |