Diffstat (limited to 'libbcachefs/fs.c')
-rw-r--r--   libbcachefs/fs.c   308
1 file changed, 223 insertions, 85 deletions
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 0422dcab..f1472e91 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -53,7 +53,7 @@ static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
                                struct bch_subvolume *);
 
 /* Set VFS inode flags from bcachefs inode: */
-static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
+static inline void bch2_inode_flags_to_vfs(struct bch_fs *c, struct bch_inode_info *inode)
 {
         static const __maybe_unused unsigned bch_flags_to_vfs[] = {
                 [__BCH_INODE_sync]      = S_SYNC,
@@ -64,8 +64,10 @@ static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
 
         set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
 
-        if (inode->ei_inode.bi_casefold)
+        if (bch2_inode_casefold(c, &inode->ei_inode))
                 inode->v.i_flags |= S_CASEFOLD;
+        else
+                inode->v.i_flags &= ~S_CASEFOLD;
 }
 
 void bch2_inode_update_after_write(struct btree_trans *trans,
@@ -96,7 +98,7 @@ void bch2_inode_update_after_write(struct btree_trans *trans,
 
         inode->ei_inode = *bi;
 
-        bch2_inode_flags_to_vfs(inode);
+        bch2_inode_flags_to_vfs(c, inode);
 }
 
 int __must_check bch2_write_inode(struct bch_fs *c,
@@ -647,13 +649,18 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
                           const struct qstr *name)
 {
         struct bch_fs *c = trans->c;
-        struct btree_iter dirent_iter = {};
         subvol_inum inum = {};
         struct printbuf buf = PRINTBUF;
 
+        struct qstr lookup_name;
+        int ret = bch2_maybe_casefold(trans, dir_hash_info, name, &lookup_name);
+        if (ret)
+                return ERR_PTR(ret);
+
+        struct btree_iter dirent_iter = {};
         struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
-                                             dir_hash_info, dir, name, 0);
-        int ret = bkey_err(k);
+                                             dir_hash_info, dir, &lookup_name, 0);
+        ret = bkey_err(k);
         if (ret)
                 return ERR_PTR(ret);
 
@@ -841,6 +848,9 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
                  */
                 set_nlink(&inode->v, 0);
         }
+
+        if (IS_CASEFOLDED(vdir))
+                d_invalidate(dentry);
 err:
         bch2_trans_put(trans);
         bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
@@ -1251,10 +1261,20 @@ static int bch2_tmpfile(struct mnt_idmap *idmap,
         return finish_open_simple(file, 0);
 }
 
+struct bch_fiemap_extent {
+        struct bkey_buf kbuf;
+        unsigned        flags;
+};
+
 static int bch2_fill_extent(struct bch_fs *c,
                             struct fiemap_extent_info *info,
-                            struct bkey_s_c k, unsigned flags)
+                            struct bch_fiemap_extent *fe)
 {
+        struct bkey_s_c k = bkey_i_to_s_c(fe->kbuf.k);
+        unsigned flags = fe->flags;
+
+        BUG_ON(!k.k->size);
+
         if (bkey_extent_is_direct_data(k.k)) {
                 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
                 const union bch_extent_entry *entry;
@@ -1307,110 +1327,223 @@ static int bch2_fill_extent(struct bch_fs *c,
         }
 }
 
-static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
-                       u64 start, u64 len)
+/*
+ * Scan a range of an inode for data in pagecache.
+ *
+ * Intended to be retryable, so don't modify the output params until success is
+ * imminent.
+ */
+static int
+bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end,
+                           bool nonblock)
 {
-        struct bch_fs *c = vinode->i_sb->s_fs_info;
-        struct bch_inode_info *ei = to_bch_ei(vinode);
-        struct btree_trans *trans;
-        struct btree_iter iter;
-        struct bkey_s_c k;
-        struct bkey_buf cur, prev;
-        bool have_extent = false;
-        int ret = 0;
+        loff_t dstart, dend;
 
-        ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
-        if (ret)
+        dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock);
+        if (dstart < 0)
+                return dstart;
+
+        if (dstart == *end) {
+                *start = dstart;
+                return 0;
+        }
+
+        dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock);
+        if (dend < 0)
+                return dend;
+
+        /* race */
+        BUG_ON(dstart == dend);
+
+        *start = dstart;
+        *end = dend;
+        return 0;
+}
+
+/*
+ * Scan a range of pagecache that corresponds to a file mapping hole in the
+ * extent btree. If data is found, fake up an extent key so it looks like a
+ * delalloc extent to the rest of the fiemap processing code.
+ */
+static int
+bch2_next_fiemap_pagecache_extent(struct btree_trans *trans, struct bch_inode_info *inode,
+                                  u64 start, u64 end, struct bch_fiemap_extent *cur)
+{
+        struct bch_fs *c = trans->c;
+        struct bkey_i_extent *delextent;
+        struct bch_extent_ptr ptr = {};
+        loff_t dstart = start << 9, dend = end << 9;
+        int ret;
+
+        /*
+         * We hold btree locks here so we cannot block on folio locks without
+         * dropping trans locks first. Run a nonblocking scan for the common
+         * case of no folios over holes and fall back on failure.
+         *
+         * Note that dropping locks like this is technically racy against
+         * writeback inserting to the extent tree, but a non-sync fiemap scan is
+         * fundamentally racy with writeback anyways. Therefore, just report the
+         * range as delalloc regardless of whether we have to cycle trans locks.
+         */
+        ret = bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, true);
+        if (ret == -EAGAIN)
+                ret = drop_locks_do(trans,
+                        bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, false));
+        if (ret < 0)
                 return ret;
 
-        struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
-        if (start + len < start)
-                return -EINVAL;
+        /*
+         * Create a fake extent key in the buffer. We have to add a dummy extent
+         * pointer for the fill code to add an extent entry. It's explicitly
+         * zeroed to reflect delayed allocation (i.e. phys offset 0).
+         */
+        bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64));
+        delextent = bkey_extent_init(cur->kbuf.k);
+        delextent->k.p = POS(inode->ei_inum.inum, dend >> 9);
+        delextent->k.size = (dend - dstart) >> 9;
+        bch2_bkey_append_ptr(&delextent->k_i, ptr);
 
-        start >>= 9;
+        cur->flags = FIEMAP_EXTENT_DELALLOC;
 
-        bch2_bkey_buf_init(&cur);
-        bch2_bkey_buf_init(&prev);
-        trans = bch2_trans_get(c);
+        return 0;
+}
 
+static int bch2_next_fiemap_extent(struct btree_trans *trans,
+                                   struct bch_inode_info *inode,
+                                   u64 start, u64 end,
+                                   struct bch_fiemap_extent *cur)
+{
+        u32 snapshot;
+        int ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot);
+        if (ret)
+                return ret;
+
+        struct btree_iter iter;
         bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-                             POS(ei->v.i_ino, start), 0);
+                             SPOS(inode->ei_inum.inum, start, snapshot), 0);
 
-        while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-                enum btree_id data_btree = BTREE_ID_extents;
+        struct bkey_s_c k =
+                bch2_btree_iter_peek_max(trans, &iter, POS(inode->ei_inum.inum, end));
+        ret = bkey_err(k);
+        if (ret)
+                goto err;
 
-                bch2_trans_begin(trans);
+        ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, end, cur);
+        if (ret)
+                goto err;
 
-                u32 snapshot;
-                ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
-                if (ret)
-                        continue;
+        struct bpos pagecache_start = bkey_start_pos(&cur->kbuf.k->k);
 
-                bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
+        /*
+         * Does the pagecache or the btree take precedence?
+         *
+         * It _should_ be the pagecache, so that we correctly report delalloc
+         * extents when dirty in the pagecache (we're COW, after all).
+         *
+         * But we'd have to add per-sector writeback tracking to
+         * bch_folio_state, otherwise we report delalloc extents for clean
+         * cached data in the pagecache.
+         *
+         * We should do this, but even then fiemap won't report stable mappings:
+         * on bcachefs data moves around in the background (copygc, rebalance)
+         * and we don't provide a way for userspace to lock that out.
+         */
+        if (k.k &&
+            bkey_le(bpos_max(iter.pos, bkey_start_pos(k.k)),
                    pagecache_start)) {
+                bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k);
+                bch2_cut_front(iter.pos, cur->kbuf.k);
+                bch2_cut_back(POS(inode->ei_inum.inum, end), cur->kbuf.k);
+                cur->flags = 0;
+        } else if (k.k) {
+                bch2_cut_back(bkey_start_pos(k.k), cur->kbuf.k);
+        }
 
-                k = bch2_btree_iter_peek_max(trans, &iter, end);
-                ret = bkey_err(k);
+        if (cur->kbuf.k->k.type == KEY_TYPE_reflink_p) {
+                unsigned sectors = cur->kbuf.k->k.size;
+                s64 offset_into_extent = 0;
+                enum btree_id data_btree = BTREE_ID_extents;
+                ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent,
+                                                &cur->kbuf);
                 if (ret)
-                        continue;
+                        goto err;
 
-                if (!k.k)
-                        break;
+                struct bkey_i *k = cur->kbuf.k;
+                sectors = min_t(unsigned, sectors, k->k.size - offset_into_extent);
 
-                if (!bkey_extent_is_data(k.k) &&
-                    k.k->type != KEY_TYPE_reservation) {
-                        bch2_btree_iter_advance(trans, &iter);
-                        continue;
-                }
+                bch2_cut_front(POS(k->k.p.inode,
                                   bkey_start_offset(&k->k) + offset_into_extent),
                               k);
+                bch2_key_resize(&k->k, sectors);
+                k->k.p = iter.pos;
+                k->k.p.offset += k->k.size;
+        }
+err:
+        bch2_trans_iter_exit(trans, &iter);
+        return ret;
+}
 
-                s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
-                unsigned sectors = k.k->size - offset_into_extent;
+static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
+                       u64 start, u64 len)
+{
+        struct bch_fs *c = vinode->i_sb->s_fs_info;
+        struct bch_inode_info *ei = to_bch_ei(vinode);
+        struct btree_trans *trans;
+        struct bch_fiemap_extent cur, prev;
+        int ret = 0;
+
+        ret = fiemap_prep(&ei->v, info, start, &len, 0);
+        if (ret)
+                return ret;
+
+        if (start + len < start)
+                return -EINVAL;
+
+        start >>= 9;
+        u64 end = (start + len) >> 9;
 
-                bch2_bkey_buf_reassemble(&cur, c, k);
+        bch2_bkey_buf_init(&cur.kbuf);
+        bch2_bkey_buf_init(&prev.kbuf);
+        bkey_init(&prev.kbuf.k->k);
 
-                ret = bch2_read_indirect_extent(trans, &data_btree,
-                                                &offset_into_extent, &cur);
+        trans = bch2_trans_get(c);
+
+        while (start < end) {
+                ret = lockrestart_do(trans,
+                        bch2_next_fiemap_extent(trans, ei, start, end, &cur));
                 if (ret)
-                        continue;
+                        goto err;
 
-                k = bkey_i_to_s_c(cur.k);
-                bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
+                BUG_ON(bkey_start_offset(&cur.kbuf.k->k) < start);
+                BUG_ON(cur.kbuf.k->k.p.offset > end);
 
-                sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
+                if (bkey_start_offset(&cur.kbuf.k->k) == end)
+                        break;
 
-                bch2_cut_front(POS(k.k->p.inode,
-                                   bkey_start_offset(k.k) +
-                                   offset_into_extent),
-                               cur.k);
-                bch2_key_resize(&cur.k->k, sectors);
-                cur.k->k.p = iter.pos;
-                cur.k->k.p.offset += cur.k->k.size;
+                start = cur.kbuf.k->k.p.offset;
 
-                if (have_extent) {
+                if (!bkey_deleted(&prev.kbuf.k->k)) {
                         bch2_trans_unlock(trans);
-                        ret = bch2_fill_extent(c, info,
-                                        bkey_i_to_s_c(prev.k), 0);
+                        ret = bch2_fill_extent(c, info, &prev);
                         if (ret)
-                                break;
+                                goto err;
                 }
 
-                bkey_copy(prev.k, cur.k);
-                have_extent = true;
-
-                bch2_btree_iter_set_pos(trans, &iter,
-                        POS(iter.pos.inode, iter.pos.offset + sectors));
+                bch2_bkey_buf_copy(&prev.kbuf, c, cur.kbuf.k);
+                prev.flags = cur.flags;
         }
-        bch2_trans_iter_exit(trans, &iter);
 
-        if (!ret && have_extent) {
+        if (!bkey_deleted(&prev.kbuf.k->k)) {
                 bch2_trans_unlock(trans);
-                ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
-                                       FIEMAP_EXTENT_LAST);
+                prev.flags |= FIEMAP_EXTENT_LAST;
+                ret = bch2_fill_extent(c, info, &prev);
         }
-
+err:
         bch2_trans_put(trans);
-        bch2_bkey_buf_exit(&cur, c);
-        bch2_bkey_buf_exit(&prev, c);
-        return ret < 0 ? ret : 0;
+        bch2_bkey_buf_exit(&cur.kbuf, c);
+        bch2_bkey_buf_exit(&prev.kbuf, c);
+
+        return bch2_err_class(ret < 0 ? ret : 0);
 }
 
 static const struct vm_operations_struct bch_vm_ops = {
@@ -1487,13 +1620,14 @@ static int bch2_fileattr_get(struct dentry *dentry,
                              struct fileattr *fa)
 {
         struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
+        struct bch_fs *c = inode->v.i_sb->s_fs_info;
 
         fileattr_fill_xflags(fa, map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags));
 
         if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
                 fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;
 
-        if (inode->ei_inode.bi_casefold)
+        if (bch2_inode_casefold(c, &inode->ei_inode))
                 fa->flags |= FS_CASEFOLD_FL;
 
         fa->fsx_projid  = inode->ei_qid.q[QTYP_PRJ];
@@ -1526,7 +1660,7 @@ static int fssetxattr_inode_update_fn(struct btree_trans *trans,
             (s->flags & (BCH_INODE_nodump|BCH_INODE_noatime)) != s->flags)
                 return -EINVAL;
 
-        if (s->casefold != bi->bi_casefold) {
+        if (s->casefold != bch2_inode_casefold(c, bi)) {
 #ifdef CONFIG_UNICODE
                 int ret = 0;
                 /* Not supported on individual files. */
@@ -1547,9 +1681,8 @@ static int fssetxattr_inode_update_fn(struct btree_trans *trans,
 
                 bch2_check_set_feature(c, BCH_FEATURE_casefolding);
 
-                bi->bi_casefold = s->casefold;
-                bi->bi_fields_set &= ~BIT(Inode_opt_casefold);
-                bi->bi_fields_set |= s->casefold << Inode_opt_casefold;
+                bi->bi_casefold = s->casefold + 1;
+                bi->bi_fields_set |= BIT(Inode_opt_casefold);
 #else
                 printk(KERN_ERR "Cannot use casefolding on a kernel without CONFIG_UNICODE\n");
@@ -2330,7 +2463,7 @@ static int bch2_fs_get_tree(struct fs_context *fc)
         struct inode *vinode;
         struct bch2_opts_parse *opts_parse = fc->fs_private;
         struct bch_opts opts = opts_parse->opts;
-        darray_str devs;
+        darray_const_str devs;
         darray_fs devs_to_fs = {};
         int ret;
@@ -2354,7 +2487,7 @@ static int bch2_fs_get_tree(struct fs_context *fc)
         if (!IS_ERR(sb))
                 goto got_sb;
 
-        c = bch2_fs_open(devs.data, devs.nr, opts);
+        c = bch2_fs_open(&devs, &opts);
         ret = PTR_ERR_OR_ZERO(c);
         if (ret)
                 goto err;
@@ -2445,6 +2578,11 @@ got_sb:
         if (ret)
                 goto err_put_super;
 
+#ifdef CONFIG_UNICODE
+        sb->s_encoding = c->cf_encoding;
+#endif
+        generic_set_sb_d_ops(sb);
+
         vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
         ret = PTR_ERR_OR_ZERO(vinode);
         bch_err_msg(c, ret, "mounting: error getting root inode");
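
A few notes on the mechanisms in this patch, with illustrative sketches (the sketches are not part of the diff).

The fssetxattr path now stores bi_casefold with a +1 bias, so that 0 can mean "unset: inherit the filesystem-wide option". The bch2_inode_casefold() helper the patch calls is then roughly the following (a sketch inferred from the bias logic visible above; the real definition lives in the bcachefs headers, and the opts.casefold fallback is an assumption about the fs-wide option name):

static inline bool bch2_inode_casefold(struct bch_fs *c,
                                       const struct bch_inode_unpacked *bi)
{
        /* per-inode setting is stored biased by 1; 0 means "unset" */
        return bi->bi_casefold
                ? bi->bi_casefold - 1
                : c->opts.casefold;
}

This is also why bch2_inode_flags_to_vfs() now takes the bch_fs pointer, and why it gained an else branch: an inode with no explicit setting must track the mount-wide option, including clearing S_CASEFOLD when that option is off.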
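The rewritten fiemap loop moves all btree work into bch2_next_fiemap_extent() and wraps it in lockrestart_do(), which reruns its body on transaction restart, while bch2_next_fiemap_pagecache_extent() uses drop_locks_do() to retry the pagecache scan in blocking mode with trans locks dropped. Behaviorally, the two helpers are roughly the following (simplified sketches, not the exact bcachefs definitions):

/* retry _do from the top whenever the btree transaction restarts */
#define lockrestart_do(_trans, _do)                                     \
({                                                                      \
        int _ret;                                                       \
        do {                                                            \
                bch2_trans_begin(_trans);                               \
                _ret = (_do);                                           \
        } while (bch2_err_matches(_ret, BCH_ERR_transaction_restart));  \
        _ret;                                                           \
})

/* run _do with trans locks dropped, then relock on success */
#define drop_locks_do(_trans, _do)                                      \
({                                                                      \
        bch2_trans_unlock(_trans);                                      \
        (_do) ?: bch2_trans_relock(_trans);                             \
})

This pairing is what makes bch2_fiemap_hole_pagecache()'s contract work: since it never touches its output parameters until success is imminent, a restarted iteration simply reruns with the original start/end values.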
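Note also that fiemap_prep() is now called with flags 0 rather than FIEMAP_FLAG_SYNC, so dirty pagecache is no longer forcibly written back first; it is reported through the new FIEMAP_EXTENT_DELALLOC path instead. A minimal userspace check of that behavior might look like this (an illustrative test program using the standard FIEMAP ioctl, not something from this patch; error handling trimmed):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
        /* room for the header plus 32 returned extents */
        struct fiemap *fm = calloc(1, sizeof(*fm) +
                                   32 * sizeof(struct fiemap_extent));
        int fd = open(argv[1], O_RDONLY);

        if (fd < 0 || !fm)
                return 1;

        fm->fm_length       = ~0ULL;    /* map the whole file */
        fm->fm_extent_count = 32;       /* fm_flags left 0: no FIEMAP_FLAG_SYNC */

        if (ioctl(fd, FS_IOC_FIEMAP, fm))
                return 1;

        for (unsigned i = 0; i < fm->fm_mapped_extents; i++)
                printf("%llu+%llu%s\n",
                       (unsigned long long) fm->fm_extents[i].fe_logical,
                       (unsigned long long) fm->fm_extents[i].fe_length,
                       fm->fm_extents[i].fe_flags & FIEMAP_EXTENT_DELALLOC ?
                       " (delalloc)" : "");
        return 0;
}

Writing to a file and running this before and after an fsync should show the dirty range flip from a delalloc extent to an ordinary mapped one.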
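Finally, the mount path now publishes the filesystem's unicode encoding on the superblock and installs case-insensitive dentry operations. generic_set_sb_d_ops() is an existing VFS helper in fs/libfs.c; its relevant behavior is approximately the following (trimmed sketch; the real helper also handles the fscrypt case, omitted here):

void generic_set_sb_d_ops(struct super_block *sb)
{
#if IS_ENABLED(CONFIG_UNICODE)
        if (sb->s_encoding) {
                /* casefold-aware d_hash()/d_compare() for the whole sb */
                sb->s_d_op = &generic_ci_dentry_ops;
                return;
        }
#endif
}

With these ops in place, dentries under casefolded directories hash and compare case-insensitively. The d_invalidate() added in __bch2_unlink() fits the same picture: it drops the dentry on unlink in a casefolded directory so no stale case variant lingers in the dcache.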