summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDarrick J. Wong <djwong@kernel.org>2021-09-01 11:15:56 -0700
committerDarrick J. Wong <djwong@kernel.org>2021-12-15 17:29:11 -0800
commitcf2256c853689eff006fcaaf8100c2d2052671d4 (patch)
tree3dc848f07d254993fc19aa979a6ed9a796ee8105
parent80e8ea43e8818125fdeb254f1f89f6482b1f33bb (diff)
xfs: allow inode-based btrees to reserve space in the data devicereserve-rt-metadata-space_2021-12-15
Create a new space reservation scheme so that btree metadata for the realtime volume can reserve space in the data device to avoid space underruns. Signed-off-by: Darrick J. Wong <djwong@kernel.org>
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c3
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h4
-rw-r--r--fs/xfs/libxfs/xfs_imeta.c189
-rw-r--r--fs/xfs/libxfs/xfs_imeta.h12
-rw-r--r--fs/xfs/libxfs/xfs_types.h7
-rw-r--r--fs/xfs/xfs_error.c3
-rw-r--r--fs/xfs/xfs_fsops.c18
-rw-r--r--fs/xfs/xfs_inode.h3
-rw-r--r--fs/xfs/xfs_mount.c10
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_rtalloc.c23
-rw-r--r--fs/xfs/xfs_rtalloc.h5
-rw-r--r--fs/xfs/xfs_trace.h45
13 files changed, 322 insertions, 1 deletions
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index 209c22e2a19d..662d48a560b8 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -113,6 +113,7 @@ xfs_ag_resv_needed(
case XFS_AG_RESV_RMAPBT:
len -= xfs_perag_resv(pag, type)->ar_reserved;
break;
+ case XFS_AG_RESV_IMETA:
case XFS_AG_RESV_NONE:
/* empty */
break;
@@ -347,6 +348,7 @@ xfs_ag_resv_alloc_extent(
switch (type) {
case XFS_AG_RESV_AGFL:
+ case XFS_AG_RESV_IMETA:
return;
case XFS_AG_RESV_METADATA:
case XFS_AG_RESV_RMAPBT:
@@ -389,6 +391,7 @@ xfs_ag_resv_free_extent(
switch (type) {
case XFS_AG_RESV_AGFL:
+ case XFS_AG_RESV_IMETA:
return;
case XFS_AG_RESV_METADATA:
case XFS_AG_RESV_RMAPBT:
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index f5fa2151e05d..f61559a022ca 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -60,7 +60,8 @@
#define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT 37
#define XFS_ERRTAG_AG_RESV_FAIL 38
#define XFS_ERRTAG_SWAPEXT_FINISH_ONE 39
-#define XFS_ERRTAG_MAX 40
+#define XFS_ERRTAG_IMETA_RESV_CRITICAL 40
+#define XFS_ERRTAG_MAX 41
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -105,5 +106,6 @@
#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1
#define XFS_RANDOM_AG_RESV_FAIL 1
#define XFS_RANDOM_SWAPEXT_FINISH_ONE 1
+#define XFS_RANDOM_IMETA_RESV_CRITICAL 4
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_imeta.c b/fs/xfs/libxfs/xfs_imeta.c
index 4b383bc7d7cd..0c143874ef81 100644
--- a/fs/xfs/libxfs/xfs_imeta.c
+++ b/fs/xfs/libxfs/xfs_imeta.c
@@ -25,6 +25,10 @@
#include "xfs_trans_space.h"
#include "xfs_dir2.h"
#include "xfs_ag.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_btree.h"
+#include "xfs_alloc.h"
/*
* Metadata Inode Number Management
@@ -1034,3 +1038,188 @@ xfs_imeta_droplink(
xfs_is_metadata_inode(ip))
ip->i_diflags2 &= ~XFS_DIFLAG2_METADATA;
}
+
+/*
+ * Is the amount of space that could be allocated towards a given metadata
+ * file at or beneath a certain threshold?
+ */
+static inline bool
+xfs_imeta_resv_can_cover(
+ struct xfs_inode *ip,
+ int64_t rhs)
+{
+ /*
+ * The amount of space that can be allocated to this metadata file is
+ * the remaining reservation for the particular metadata file + the
+ * global free block count. Take care of the first case to avoid
+ * touching the per-cpu counter.
+ */
+ if (ip->i_delayed_blks >= rhs)
+ return true;
+
+ /*
+ * There aren't enough blocks left in the inode's reservation, but it
+ * isn't critical unless there also isn't enough free space.
+ */
+ return __percpu_counter_compare(&ip->i_mount->m_fdblocks,
+ rhs - ip->i_delayed_blks, 2048) >= 0;
+}
+
+/*
+ * Is this metadata file critically low on blocks? For now we'll define that
+ * as the number of blocks we can get our hands on being less than 10% of what
+ * we reserved or less than some arbitrary number (maximum btree height).
+ */
+bool
+xfs_imeta_resv_critical(
+ struct xfs_mount *mp,
+ struct xfs_inode *ip)
+{
+ uint64_t asked_low_water;
+
+ if (!ip)
+ return false;
+
+ ASSERT(xfs_is_metadata_inode(ip));
+ trace_xfs_imeta_resv_critical(ip, 0);
+
+ if (!xfs_imeta_resv_can_cover(ip, mp->m_rtbtree_maxlevels))
+ return true;
+
+ asked_low_water = div_u64(ip->i_meta_resv_asked, 10);
+ if (!xfs_imeta_resv_can_cover(ip, asked_low_water))
+ return true;
+
+ return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_IMETA_RESV_CRITICAL);
+}
+
+/* Allocate a block from the metadata file's reservation. */
+void
+xfs_imeta_resv_alloc_extent(
+ struct xfs_inode *ip,
+ struct xfs_alloc_arg *args)
+{
+ int64_t len = args->len;
+
+ ASSERT(xfs_is_metadata_inode(ip));
+ ASSERT(args->resv == XFS_AG_RESV_IMETA);
+
+ trace_xfs_imeta_resv_alloc_extent(ip, args->len);
+
+ /*
+ * Allocate the blocks from the metadata inode's block reservation
+ * and update the ondisk sb counter.
+ */
+ if (ip->i_delayed_blks > 0) {
+ int64_t from_resv;
+
+ from_resv = min_t(int64_t, len, ip->i_delayed_blks);
+ ip->i_delayed_blks -= from_resv;
+ xfs_mod_delalloc(ip->i_mount, -from_resv);
+ xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS,
+ -from_resv);
+ len -= from_resv;
+ }
+
+ /*
+ * Any allocation in excess of the reservation requires in-core and
+ * on-disk fdblocks updates.
+ */
+ if (len)
+ xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS, -len);
+
+ ip->i_nblocks += args->len;
+}
+
+/* Free a block to the metadata file's reservation. */
+void
+xfs_imeta_resv_free_extent(
+ struct xfs_inode *ip,
+ struct xfs_trans *tp,
+ xfs_filblks_t len)
+{
+ int64_t to_resv;
+
+ ASSERT(xfs_is_metadata_inode(ip));
+ trace_xfs_imeta_resv_free_extent(ip, len);
+
+ ip->i_nblocks -= len;
+
+ /*
+ * Add the freed blocks back into the inode's delalloc reservation
+ * until it reaches the maximum size. Update the ondisk fdblocks only.
+ */
+ to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks);
+ if (to_resv > 0) {
+ to_resv = min_t(int64_t, to_resv, len);
+ ip->i_delayed_blks += to_resv;
+ xfs_mod_delalloc(ip->i_mount, to_resv);
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv);
+ len -= to_resv;
+ }
+
+ /*
+ * Everything else goes back to the filesystem, so update the in-core
+ * and on-disk counters.
+ */
+ if (len)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len);
+}
+
+/* Release a metadata file's space reservation. */
+void
+xfs_imeta_resv_free_inode(
+ struct xfs_mount *mp,
+ struct xfs_inode *ip)
+{
+ if (!ip)
+ return;
+
+ ASSERT(xfs_is_metadata_inode(ip));
+ trace_xfs_imeta_resv_free(ip, 0);
+
+ xfs_mod_delalloc(ip->i_mount, -ip->i_delayed_blks);
+ xfs_mod_fdblocks(ip->i_mount, ip->i_delayed_blks, true);
+ ip->i_delayed_blks = 0;
+ ip->i_meta_resv_asked = 0;
+}
+
+/* Set up a metadata file's space reservation. */
+int
+xfs_imeta_resv_init_inode(
+ struct xfs_mount *mp,
+ struct xfs_inode *ip,
+ xfs_filblks_t ask)
+{
+ xfs_filblks_t hidden_space;
+ xfs_filblks_t used;
+ int error;
+
+ if (!ip || ip->i_meta_resv_asked > 0)
+ return 0;
+
+ ASSERT(xfs_is_metadata_inode(ip));
+
+ /*
+ * Space taken by all other metadata btrees are accounted on-disk as
+ * used space. We therefore only hide the space that is reserved but
+ * not used by the trees.
+ */
+ used = ip->i_nblocks;
+ if (used > ask)
+ ask = used;
+ hidden_space = ask - used;
+
+ error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true);
+ if (error) {
+ trace_xfs_imeta_resv_init_error(ip, error, _RET_IP_);
+ return error;
+ }
+
+ xfs_mod_delalloc(mp, hidden_space);
+ ip->i_delayed_blks = hidden_space;
+ ip->i_meta_resv_asked = ask;
+
+ trace_xfs_imeta_resv_init(ip, ask);
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_imeta.h b/fs/xfs/libxfs/xfs_imeta.h
index 9a5fc2e66036..9633351b8839 100644
--- a/fs/xfs/libxfs/xfs_imeta.h
+++ b/fs/xfs/libxfs/xfs_imeta.h
@@ -66,6 +66,18 @@ void xfs_imeta_droplink(struct xfs_inode *ip);
unsigned int xfs_imeta_create_space_res(struct xfs_mount *mp);
unsigned int xfs_imeta_unlink_space_res(struct xfs_mount *mp);
+/* Space reservations for metadata inodes. */
+struct xfs_alloc_arg;
+
+bool xfs_imeta_resv_critical(struct xfs_mount *mp, struct xfs_inode *ip);
+void xfs_imeta_resv_alloc_extent(struct xfs_inode *ip,
+ struct xfs_alloc_arg *args);
+void xfs_imeta_resv_free_extent(struct xfs_inode *ip, struct xfs_trans *tp,
+ xfs_filblks_t len);
+void xfs_imeta_resv_free_inode(struct xfs_mount *mp, struct xfs_inode *ip);
+int xfs_imeta_resv_init_inode(struct xfs_mount *mp, struct xfs_inode *ip,
+ xfs_filblks_t ask);
+
/* Must be implemented by the libxfs client */
int xfs_imeta_iget(struct xfs_mount *mp, xfs_ino_t ino, unsigned char ftype,
struct xfs_inode **ipp);
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index d0afc3d11e37..ec8b1e941709 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -186,6 +186,13 @@ enum xfs_ag_resv_type {
* altering fdblocks. If you think you need this you're wrong.
*/
XFS_AG_RESV_IGNORE,
+
+ /*
+ * This allocation activity is being done on behalf of a metadata file.
+ * These files maintain their own permanent space reservations and are
+ * required to adjust fdblocks using the xfs_imeta_resv_* helpers.
+ */
+ XFS_AG_RESV_IMETA,
};
/*
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index cd17682869e3..19b46636825e 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -58,6 +58,7 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT,
XFS_RANDOM_AG_RESV_FAIL,
XFS_RANDOM_SWAPEXT_FINISH_ONE,
+ XFS_RANDOM_IMETA_RESV_CRITICAL,
};
struct xfs_errortag_attr {
@@ -172,6 +173,7 @@ XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS);
XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT);
XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL);
XFS_ERRORTAG_ATTR_RW(swapext_finish_one, XFS_RANDOM_SWAPEXT_FINISH_ONE);
+XFS_ERRORTAG_ATTR_RW(imeta_resv_critical, XFS_RANDOM_IMETA_RESV_CRITICAL);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -214,6 +216,7 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent),
XFS_ERRORTAG_ATTR_LIST(ag_resv_fail),
XFS_ERRORTAG_ATTR_LIST(swapext_finish_one),
+ XFS_ERRORTAG_ATTR_LIST(imeta_resv_critical),
NULL,
};
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 126f3474c1f6..e528a0cd845f 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -20,6 +20,7 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_trace.h"
+#include "xfs_rtalloc.h"
/*
* Write new AG headers to disk. Non-transactional, but need to be
@@ -580,6 +581,20 @@ xfs_fs_reserve_ag_blocks(
xfs_warn(mp,
"Error %d reserving per-AG metadata reserve pool.", error);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return error;
+ }
+
+ if (xfs_has_realtime(mp)) {
+ int err2 = xfs_rt_resv_init(mp);
+
+ if (err2 && err2 != -ENOSPC) {
+ xfs_warn(mp,
+ "Error %d reserving realtime metadata reserve pool.", err2);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ }
+
+ if (err2 && !error)
+ error = err2;
}
return error;
@@ -595,6 +610,9 @@ xfs_fs_unreserve_ag_blocks(
struct xfs_perag *pag;
xfs_agnumber_t agno;
+ if (xfs_has_realtime(mp))
+ xfs_rt_resv_free(mp);
+
for_each_perag(mp, agno, pag)
xfs_ag_resv_free(pag);
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 88de1287bce0..7bf707010219 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -55,6 +55,9 @@ typedef struct xfs_inode {
/* Miscellaneous state. */
unsigned long i_flags; /* see defined flags below */
uint64_t i_delayed_blks; /* count of delay alloc blks */
+ /* Space that has been set aside to root a btree in this file. */
+ uint64_t i_meta_resv_asked;
+
xfs_fsize_t i_disk_size; /* number of bytes in file */
xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */
prid_t i_projid; /* owner's project id */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 66b85d14b80f..727545ca2348 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -608,6 +608,15 @@ xfs_agbtree_compute_maxlevels(
mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
}
+/* Compute maximum possible height for realtime btree types for this fs. */
+static inline void
+xfs_rtbtree_compute_maxlevels(
+ struct xfs_mount *mp)
+{
+ /* This will be filled in later. */
+ mp->m_rtbtree_maxlevels = 0;
+}
+
/*
* This function does the following on an initial mount of a file system:
* - reads the superblock from disk and init the mount struct
@@ -680,6 +689,7 @@ xfs_mountfs(
xfs_refcountbt_compute_maxlevels(mp);
xfs_agbtree_compute_maxlevels(mp);
+ xfs_rtbtree_compute_maxlevels(mp);
/*
* Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index cb3d618a58b1..fb3d139f5812 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -151,6 +151,7 @@ typedef struct xfs_mount {
uint m_rmap_maxlevels; /* max rmap btree levels */
uint m_refc_maxlevels; /* max refcount btree level */
unsigned int m_agbtree_maxlevels; /* max level of all AG btrees */
+ unsigned int m_rtbtree_maxlevels; /* max level of all rt btrees */
xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */
uint m_alloc_set_aside; /* space we can't use */
uint m_ag_max_usable; /* max space per AG */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index fa30856a746f..072aa25fda38 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1138,6 +1138,14 @@ error_cancel:
/* Update secondary superblocks now the physical grow has completed */
error = xfs_update_secondary_sbs(mp);
+ if (error)
+ goto out_free;
+
+ /* Reset the rt metadata btree space reservations. */
+ xfs_rt_resv_free(mp);
+ error = xfs_rt_resv_init(mp);
+ if (error == -ENOSPC)
+ error = 0;
out_free:
/*
@@ -1283,6 +1291,21 @@ xfs_rtmount_init(
return 0;
}
+/* Free space reservations for rt metadata inodes. */
+void
+xfs_rt_resv_free(
+ struct xfs_mount *mp)
+{
+}
+
+/* Reserve space for rt metadata inodes' space expansion. */
+int
+xfs_rt_resv_init(
+ struct xfs_mount *mp)
+{
+ return 0;
+}
+
/*
* Realtime metadata files are not quite regular files because userspace can't
* access the realtime bitmap directly, and because we take the ILOCK of the rt
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index a0d0e161d804..7a459c250aa6 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -79,6 +79,9 @@ int /* error */
xfs_rtmount_inodes(
struct xfs_mount *mp); /* file system mount structure */
+void xfs_rt_resv_free(struct xfs_mount *mp);
+int xfs_rt_resv_init(struct xfs_mount *mp);
+
/*
* Pick an extent for allocation at the start of a new realtime file.
* Use the sequence number stored in the atime field of the bitmap inode.
@@ -168,6 +171,8 @@ xfs_rtmount_init(
xfs_warn(mp, "Not built with CONFIG_XFS_RT");
return -ENOSYS;
}
+# define xfs_rt_resv_free(mp) ((void)0)
+# define xfs_rt_resv_init(mp) (0)
# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
# define xfs_rtunmount_inodes(m)
# define xfs_rtfile_convert_unwritten(ip, pos, len) (0)
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index fa271eb5e078..6d66506475f8 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -4655,6 +4655,51 @@ DEFINE_IMETA_DIR_EVENT(xfs_imeta_dir_created);
DEFINE_IMETA_DIR_EVENT(xfs_imeta_dir_unlinked);
DEFINE_IMETA_DIR_EVENT(xfs_imeta_dir_link);
+/* metadata inode space reservations */
+
+DECLARE_EVENT_CLASS(xfs_imeta_resv_class,
+ TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len),
+ TP_ARGS(ip, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned long long, freeblks)
+ __field(unsigned long long, reserved)
+ __field(unsigned long long, asked)
+ __field(unsigned long long, used)
+ __field(unsigned long long, len)
+ ),
+ TP_fast_assign(
+ struct xfs_mount *mp = ip->i_mount;
+
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->freeblks = percpu_counter_sum(&mp->m_fdblocks);
+ __entry->reserved = ip->i_delayed_blks;
+ __entry->asked = ip->i_meta_resv_asked;
+ __entry->used = ip->i_nblocks;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx freeblks %llu resv %llu ask %llu used %llu len %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->freeblks,
+ __entry->reserved,
+ __entry->asked,
+ __entry->used,
+ __entry->len)
+)
+#define DEFINE_IMETA_RESV_EVENT(name) \
+DEFINE_EVENT(xfs_imeta_resv_class, name, \
+ TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), \
+ TP_ARGS(ip, len))
+DEFINE_IMETA_RESV_EVENT(xfs_imeta_resv_init);
+DEFINE_IMETA_RESV_EVENT(xfs_imeta_resv_free);
+DEFINE_IMETA_RESV_EVENT(xfs_imeta_resv_alloc_extent);
+DEFINE_IMETA_RESV_EVENT(xfs_imeta_resv_free_extent);
+DEFINE_IMETA_RESV_EVENT(xfs_imeta_resv_critical);
+DEFINE_INODE_ERROR_EVENT(xfs_imeta_resv_init_error);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH