From 23dec8a6ab94fad2b266f65b7c0786fb289cd5aa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 1 Dec 2024 21:44:38 -0500 Subject: [PATCH] bcachefs: bcachefs_metadata_version_persistent_inode_cursors Persistent cursors for inode allocation. A free inodes btree would add substantial overhead to inode allocation and freeing - a "next num to allocate" cursor is always going to be faster. We just need it to be persistent, to avoid scanning the inodes btree from the start on startup. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 - fs/bcachefs/bcachefs_format.h | 10 ++- fs/bcachefs/btree_update.c | 2 +- fs/bcachefs/inode.c | 121 +++++++++++++++++++++++--------- fs/bcachefs/inode.h | 10 +++ fs/bcachefs/inode_format.h | 8 +++ fs/bcachefs/logged_ops.c | 5 +- fs/bcachefs/logged_ops_format.h | 5 +- fs/bcachefs/opts.h | 9 ++- fs/bcachefs/sb-errors_format.h | 3 +- fs/bcachefs/super-io.c | 5 ++ fs/bcachefs/super.c | 7 +- 12 files changed, 135 insertions(+), 53 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 0e823a1e043f..d27550ef83a8 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1064,9 +1064,6 @@ struct bch_fs { struct btree_node *verify_ondisk; struct mutex verify_lock; - u64 *unused_inode_hints; - unsigned inode_shard_bits; - /* * A btree node on disk could have too many bsets for an iterator to fit * on the stack - have to dynamically allocate them diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index f140c3366e65..09e53bef1a30 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -418,7 +418,8 @@ static inline void bkey_init(struct bkey *k) x(snapshot_tree, 31) \ x(logged_op_truncate, 32) \ x(logged_op_finsert, 33) \ - x(accounting, 34) + x(accounting, 34) \ + x(inode_alloc_cursor, 35) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -682,7 +683,8 @@ struct bch_sb_field_ext { x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \ x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \ x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \ - x(inode_depth, BCH_VERSION(1, 17)) + x(inode_depth, BCH_VERSION(1, 17)) \ + x(persistent_inode_cursors, BCH_VERSION(1, 18)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -850,6 +852,7 @@ LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT, LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48); LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, struct bch_sb, flags[5], 48, 64); +LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { @@ -1347,7 +1350,8 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_set)) \ x(logged_ops, 17, 0, \ BIT_ULL(KEY_TYPE_logged_op_truncate)| \ - BIT_ULL(KEY_TYPE_logged_op_finsert)) \ + BIT_ULL(KEY_TYPE_logged_op_finsert)| \ + BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \ x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ x(subvolume_children, 19, 0, \ diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 06fd5aa62296..b1da6dba8bf4 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -588,7 +588,7 @@ struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsi int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, enum btree_id btree, struct bpos end) { - bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_intent); + bch2_trans_iter_init(trans, iter, btree, 
end, BTREE_ITER_intent); struct bkey_s_c k = bch2_btree_iter_peek_prev(iter); int ret = bkey_err(k); if (ret) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 8818e41883f2..d1fc44a7b3a6 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -799,6 +799,28 @@ void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); } +int bch2_inode_alloc_cursor_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) +{ + int ret = 0; + + bkey_fsck_err_on(k.k->p.inode != LOGGED_OPS_INUM_inode_cursors, + c, inode_alloc_cursor_inode_bad, + "k.p.inode bad"); +fsck_err: + return ret; +} + +void bch2_inode_alloc_cursor_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_inode_alloc_cursor i = bkey_s_c_to_inode_alloc_cursor(k); + + prt_printf(out, "idx %llu generation %llu", + le64_to_cpu(i.v->idx), + le64_to_cpu(i.v->gen)); +} + void bch2_inode_init_early(struct bch_fs *c, struct bch_inode_unpacked *inode_u) { @@ -859,43 +881,81 @@ static inline u32 bkey_generation(struct bkey_s_c k) } } -/* - * This just finds an empty slot: - */ -int bch2_inode_create(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode_u, - u32 snapshot, u64 cpu) +static struct bkey_i_inode_alloc_cursor * +bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max) { struct bch_fs *c = trans->c; - struct bkey_s_c k; - u64 min, max, start, pos, *hint; - int ret = 0; - unsigned bits = (c->opts.inodes_32bit ? 31 : 63); + u64 cursor_idx = c->opts.shard_inode_numbers ? cpu : 0; + + cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits); + + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_logged_ops, + POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx), + BTREE_ITER_cached); + int ret = bkey_err(k); + if (ret) + return ERR_PTR(ret); + + struct bkey_i_inode_alloc_cursor *cursor = + k.k->type == KEY_TYPE_inode_alloc_cursor + ? bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor) + : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor); + ret = PTR_ERR_OR_ZERO(cursor); + if (ret) + goto err; + + cursor->v.bits = c->opts.shard_inode_numbers_bits; + + unsigned bits = (c->opts.inodes_32bit ? 31 : 63); if (c->opts.shard_inode_numbers) { - bits -= c->inode_shard_bits; + bits -= cursor->v.bits; - min = (cpu << bits); - max = (cpu << bits) | ~(ULLONG_MAX << bits); + *min = (cpu << bits); + *max = (cpu << bits) | ~(ULLONG_MAX << bits); - min = max_t(u64, min, BLOCKDEV_INODE_MAX); - hint = c->unused_inode_hints + cpu; + *min = max_t(u64, *min, BLOCKDEV_INODE_MAX); } else { - min = BLOCKDEV_INODE_MAX; - max = ~(ULLONG_MAX << bits); - hint = c->unused_inode_hints; + *min = BLOCKDEV_INODE_MAX; + *max = ~(ULLONG_MAX << bits); } - start = READ_ONCE(*hint); + if (le64_to_cpu(cursor->v.idx) < *min) + cursor->v.idx = cpu_to_le64(*min); + + if (le64_to_cpu(cursor->v.idx) >= *max) { + cursor->v.idx = cpu_to_le64(*min); + le32_add_cpu(&cursor->v.gen, 1); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret ? 
ERR_PTR(ret) : cursor; +} + +/* + * This just finds an empty slot: + */ +int bch2_inode_create(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode_u, + u32 snapshot, u64 cpu) +{ + u64 min, max; + struct bkey_i_inode_alloc_cursor *cursor = + bch2_inode_alloc_cursor_get(trans, cpu, &min, &max); + int ret = PTR_ERR_OR_ZERO(cursor); + if (ret) + return ret; - if (start >= max || start < min) - start = min; + u64 start = le64_to_cpu(cursor->v.idx); + u64 pos = start; - pos = start; bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), BTREE_ITER_all_snapshots| BTREE_ITER_intent); + struct bkey_s_c k; again: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && @@ -925,6 +985,7 @@ int bch2_inode_create(struct btree_trans *trans, /* Retry from start */ pos = start = min; bch2_btree_iter_set_pos(iter, POS(0, pos)); + le32_add_cpu(&cursor->v.gen, 1); goto again; found_slot: bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); @@ -935,9 +996,9 @@ int bch2_inode_create(struct btree_trans *trans, return ret; } - *hint = k.k->p.offset; inode_u->bi_inum = k.k->p.offset; - inode_u->bi_generation = bkey_generation(k); + inode_u->bi_generation = le64_to_cpu(cursor->v.gen); + cursor->v.idx = cpu_to_le64(k.k->p.offset + 1); return 0; } @@ -999,8 +1060,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) { struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; - struct bkey_i_inode_generation delete; - struct bch_inode_unpacked inode_u; struct bkey_s_c k; u32 snapshot; int ret; @@ -1040,13 +1099,7 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) goto err; } - bch2_inode_unpack(k, &inode_u); - - bkey_inode_generation_init(&delete.k_i); - delete.k.p = iter.pos; - delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); - - ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: + ret = bch2_btree_delete_at(trans, &iter, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); err: diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 5bca6950f20e..d2e134528f0e 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -68,6 +68,16 @@ void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bk .min_val_size = 8, \ }) +int bch2_inode_alloc_cursor_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +void bch2_inode_alloc_cursor_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_inode_alloc_cursor ((struct bkey_ops) { \ + .key_validate = bch2_inode_alloc_cursor_validate, \ + .val_to_text = bch2_inode_alloc_cursor_to_text, \ + .min_val_size = 16, \ +}) + #if 0 typedef struct { u64 lo; diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h index be1e747629d2..1b93e189b745 100644 --- a/fs/bcachefs/inode_format.h +++ b/fs/bcachefs/inode_format.h @@ -165,4 +165,12 @@ LE64_BITMASK(INODEv3_FIELDS_START, struct bch_inode_v3, bi_flags, 31, 36); LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); +struct bch_inode_alloc_cursor { + struct bch_val v; + __u8 bits; + __u8 pad; + __le32 gen; + __le64 idx; +}; + #endif /* _BCACHEFS_INODE_FORMAT_H */ diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index 1ac51af16299..75f27ec26f85 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -65,7 +65,8 @@ int bch2_resume_logged_ops(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_max(trans, iter, BTREE_ID_logged_ops, - POS(LOGGED_OPS_INUM, 0), 
POS(LOGGED_OPS_INUM, U64_MAX), + POS(LOGGED_OPS_INUM_logged_ops, 0), + POS(LOGGED_OPS_INUM_logged_ops, U64_MAX), BTREE_ITER_prefetch, k, resume_logged_op(trans, &iter, k))); bch_err_fn(c, ret); @@ -76,7 +77,7 @@ static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) { struct btree_iter iter; int ret = bch2_bkey_get_empty_slot(trans, &iter, - BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM, U64_MAX)); + BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM_logged_ops, U64_MAX)); if (ret) return ret; diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h index 0b370a963ac6..cfb67c95d4c8 100644 --- a/fs/bcachefs/logged_ops_format.h +++ b/fs/bcachefs/logged_ops_format.h @@ -2,7 +2,10 @@ #ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H #define _BCACHEFS_LOGGED_OPS_FORMAT_H -#define LOGGED_OPS_INUM 0 +enum logged_ops_inums { + LOGGED_OPS_INUM_logged_ops, + LOGGED_OPS_INUM_inode_cursors, +}; struct bch_logged_op_truncate { struct bch_val v; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index ea69099e681d..2df5c8f7d0b4 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -222,15 +222,20 @@ enum fsck_err_opts { BCH_SB_ERASURE_CODE, false, \ NULL, "Enable erasure coding (DO NOT USE YET)") \ x(inodes_32bit, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT, \ OPT_BOOL(), \ BCH_SB_INODE_32BIT, true, \ NULL, "Constrain inode numbers to 32 bits") \ x(shard_inode_numbers, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT, \ OPT_BOOL(), \ BCH_SB_SHARD_INUMS, true, \ NULL, "Shard new inode numbers by CPU id") \ + x(shard_inode_numbers_bits, u8, \ + OPT_FS|OPT_FORMAT, \ + OPT_UINT(0, 8), \ + BCH_SB_SHARD_INUMS_NBITS, 0, \ + NULL, "Shard new inode numbers by CPU id") \ x(inodes_use_key_cache, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 806486635075..e26317c367f7 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -211,6 +211,7 @@ enum bch_fsck_flags { x(bkey_in_missing_snapshot, 190, 0) \ x(inode_pos_inode_nonzero, 191, 0) \ x(inode_pos_blockdev_range, 192, 0) \ + x(inode_alloc_cursor_inode_bad, 301, 0) \ x(inode_unpack_error, 193, 0) \ x(inode_str_hash_invalid, 194, 0) \ x(inode_v3_fields_start_bad, 195, 0) \ @@ -311,7 +312,7 @@ enum bch_fsck_flags { x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ - x(MAX, 301, 0) + x(MAX, 302, 0) enum bch_sb_error_id { #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index b0d52b6ccad4..dbc09e305c27 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -460,6 +460,11 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true); } +#ifdef __KERNEL__ + if (!BCH_SB_SHARD_INUMS_NBITS(sb)) + SET_BCH_SB_SHARD_INUMS_NBITS(sb, ilog2(roundup_pow_of_two(num_online_cpus()))); +#endif + for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { const struct bch_option *opt = bch2_opt_table + opt_id; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 14157820705d..bd6aeceae901 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -586,7 +586,6 @@ static void __bch2_fs_free(struct bch_fs *c) #endif kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); - kfree(c->unused_inode_hints); if (c->write_ref_wq) destroy_workqueue(c->write_ref_wq); @@ -872,8 +871,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) (btree_blocks(c) + 1) * 2 * sizeof(struct sort_iter_set); - c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); - if (!(c->btree_update_wq = alloc_workqueue("bcachefs", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", @@ -900,9 +897,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) !(c->online_reserved = alloc_percpu(u64)) || mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, c->opts.btree_node_size) || - mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || - !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, - sizeof(u64), GFP_KERNEL))) { + mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) { ret = -BCH_ERR_ENOMEM_fs_other_alloc; goto err; }
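
A minimal, standalone sketch (not part of the patch) of the inode-number
sharding arithmetic that bch2_inode_alloc_cursor_get() above performs: the
CPU/shard id occupies the top shard_inode_numbers_bits of the inode number,
so each shard allocates from a disjoint contiguous range and keeps its own
persistent cursor in the logged_ops btree.  The helper name, the standalone
form and the BLOCKDEV_INODE_MAX value used here are assumptions for
illustration only; the real code operates on struct bch_fs, the filesystem
options and the cursor key.

#include <stdint.h>
#include <stdio.h>

#define BLOCKDEV_INODE_MAX	4096ULL		/* assumed value, for illustration */

/* Compute the [min, max] inode number range for a given shard. */
static void shard_range(uint64_t cpu, unsigned shard_bits, int inodes_32bit,
			uint64_t *min, uint64_t *max)
{
	unsigned bits = inodes_32bit ? 31 : 63;

	if (shard_bits) {
		bits -= shard_bits;
		/* shard id goes in the high bits of the inode number */
		*min = cpu << bits;
		*max = (cpu << bits) | ~(~0ULL << bits);
		if (*min < BLOCKDEV_INODE_MAX)
			*min = BLOCKDEV_INODE_MAX;
	} else {
		*min = BLOCKDEV_INODE_MAX;
		*max = ~(~0ULL << bits);
	}
}

int main(void)
{
	uint64_t min, max;

	/* e.g. cpu 3, 4 shard bits, 32-bit inode numbers */
	shard_range(3, 4, 1, &min, &max);
	printf("shard 3: inums [%llu, %llu]\n",
	       (unsigned long long) min, (unsigned long long) max);
	return 0;
}

Because each shard's cursor only ever moves forward within its own range (and
wraps back to min while bumping the generation), allocation stays O(1) in the
common case and startup never has to rescan the inodes btree to rediscover
where to allocate from.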