Skip to content

Commit

Permalink
bcachefs: bcachefs_metadata_version_persistent_inode_cursors
Browse files Browse the repository at this point in the history
Persistent cursors for inode allocation.

A free inodes btree would add substantial overhead to inode allocation
and freeing - a "next num to allocate" cursor is always going to be
faster.

We just need it to be persistent, to avoid scanning the inodes btree
from the start on startup.

Signed-off-by: Kent Overstreet <[email protected]>
  • Loading branch information
Kent Overstreet committed Dec 9, 2024
1 parent ff6916c commit 23dec8a
Show file tree
Hide file tree
Showing 12 changed files with 135 additions and 53 deletions.
3 changes: 0 additions & 3 deletions fs/bcachefs/bcachefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -1064,9 +1064,6 @@ struct bch_fs {
struct btree_node *verify_ondisk;
struct mutex verify_lock;

u64 *unused_inode_hints;
unsigned inode_shard_bits;

/*
* A btree node on disk could have too many bsets for an iterator to fit
* on the stack - have to dynamically allocate them
Expand Down
10 changes: 7 additions & 3 deletions fs/bcachefs/bcachefs_format.h
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,8 @@ static inline void bkey_init(struct bkey *k)
x(snapshot_tree, 31) \
x(logged_op_truncate, 32) \
x(logged_op_finsert, 33) \
x(accounting, 34)
x(accounting, 34) \
x(inode_alloc_cursor, 35)

enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
Expand Down Expand Up @@ -682,7 +683,8 @@ struct bch_sb_field_ext {
x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \
x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \
x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \
x(inode_depth, BCH_VERSION(1, 17))
x(inode_depth, BCH_VERSION(1, 17)) \
x(persistent_inode_cursors, BCH_VERSION(1, 18))

enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
Expand Down Expand Up @@ -850,6 +852,7 @@ LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT,
LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48);
LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
struct bch_sb, flags[5], 48, 64);
LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);

static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
Expand Down Expand Up @@ -1347,7 +1350,8 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_set)) \
x(logged_ops, 17, 0, \
BIT_ULL(KEY_TYPE_logged_op_truncate)| \
BIT_ULL(KEY_TYPE_logged_op_finsert)) \
BIT_ULL(KEY_TYPE_logged_op_finsert)| \
BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \
x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \
BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \
x(subvolume_children, 19, 0, \
Expand Down
2 changes: 1 addition & 1 deletion fs/bcachefs/btree_update.c
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsi
int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
enum btree_id btree, struct bpos end)
{
bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_intent);
bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent);
struct bkey_s_c k = bch2_btree_iter_peek_prev(iter);
int ret = bkey_err(k);
if (ret)
Expand Down
121 changes: 87 additions & 34 deletions fs/bcachefs/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -799,6 +799,28 @@ void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
}

int bch2_inode_alloc_cursor_validate(struct bch_fs *c, struct bkey_s_c k,
struct bkey_validate_context from)
{
int ret = 0;

bkey_fsck_err_on(k.k->p.inode != LOGGED_OPS_INUM_inode_cursors,
c, inode_alloc_cursor_inode_bad,
"k.p.inode bad");
fsck_err:
return ret;
}

void bch2_inode_alloc_cursor_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
struct bkey_s_c_inode_alloc_cursor i = bkey_s_c_to_inode_alloc_cursor(k);

prt_printf(out, "idx %llu generation %llu",
le64_to_cpu(i.v->idx),
le64_to_cpu(i.v->gen));
}

void bch2_inode_init_early(struct bch_fs *c,
struct bch_inode_unpacked *inode_u)
{
Expand Down Expand Up @@ -859,43 +881,81 @@ static inline u32 bkey_generation(struct bkey_s_c k)
}
}

/*
* This just finds an empty slot:
*/
int bch2_inode_create(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode_u,
u32 snapshot, u64 cpu)
static struct bkey_i_inode_alloc_cursor *
bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k;
u64 min, max, start, pos, *hint;
int ret = 0;
unsigned bits = (c->opts.inodes_32bit ? 31 : 63);

u64 cursor_idx = c->opts.shard_inode_numbers ? cpu : 0;

cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits);

struct btree_iter iter;
struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
BTREE_ID_logged_ops,
POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx),
BTREE_ITER_cached);
int ret = bkey_err(k);
if (ret)
return ERR_PTR(ret);

struct bkey_i_inode_alloc_cursor *cursor =
k.k->type == KEY_TYPE_inode_alloc_cursor
? bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor)
: bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor);
ret = PTR_ERR_OR_ZERO(cursor);
if (ret)
goto err;

cursor->v.bits = c->opts.shard_inode_numbers_bits;

unsigned bits = (c->opts.inodes_32bit ? 31 : 63);
if (c->opts.shard_inode_numbers) {
bits -= c->inode_shard_bits;
bits -= cursor->v.bits;

min = (cpu << bits);
max = (cpu << bits) | ~(ULLONG_MAX << bits);
*min = (cpu << bits);
*max = (cpu << bits) | ~(ULLONG_MAX << bits);

min = max_t(u64, min, BLOCKDEV_INODE_MAX);
hint = c->unused_inode_hints + cpu;
*min = max_t(u64, *min, BLOCKDEV_INODE_MAX);
} else {
min = BLOCKDEV_INODE_MAX;
max = ~(ULLONG_MAX << bits);
hint = c->unused_inode_hints;
*min = BLOCKDEV_INODE_MAX;
*max = ~(ULLONG_MAX << bits);
}

start = READ_ONCE(*hint);
if (le64_to_cpu(cursor->v.idx) < *min)
cursor->v.idx = cpu_to_le64(*min);

if (le64_to_cpu(cursor->v.idx) >= *max) {
cursor->v.idx = cpu_to_le64(*min);
le32_add_cpu(&cursor->v.gen, 1);
}
err:
bch2_trans_iter_exit(trans, &iter);
return ret ? ERR_PTR(ret) : cursor;
}

/*
* This just finds an empty slot:
*/
int bch2_inode_create(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode_u,
u32 snapshot, u64 cpu)
{
u64 min, max;
struct bkey_i_inode_alloc_cursor *cursor =
bch2_inode_alloc_cursor_get(trans, cpu, &min, &max);
int ret = PTR_ERR_OR_ZERO(cursor);
if (ret)
return ret;

if (start >= max || start < min)
start = min;
u64 start = le64_to_cpu(cursor->v.idx);
u64 pos = start;

pos = start;
bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
BTREE_ITER_all_snapshots|
BTREE_ITER_intent);
struct bkey_s_c k;
again:
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k)) &&
Expand Down Expand Up @@ -925,6 +985,7 @@ int bch2_inode_create(struct btree_trans *trans,
/* Retry from start */
pos = start = min;
bch2_btree_iter_set_pos(iter, POS(0, pos));
le32_add_cpu(&cursor->v.gen, 1);
goto again;
found_slot:
bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
Expand All @@ -935,9 +996,9 @@ int bch2_inode_create(struct btree_trans *trans,
return ret;
}

*hint = k.k->p.offset;
inode_u->bi_inum = k.k->p.offset;
inode_u->bi_generation = bkey_generation(k);
inode_u->bi_generation = le64_to_cpu(cursor->v.gen);
cursor->v.idx = cpu_to_le64(k.k->p.offset + 1);
return 0;
}

Expand Down Expand Up @@ -999,8 +1060,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
{
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
struct bkey_i_inode_generation delete;
struct bch_inode_unpacked inode_u;
struct bkey_s_c k;
u32 snapshot;
int ret;
Expand Down Expand Up @@ -1040,13 +1099,7 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
goto err;
}

bch2_inode_unpack(k, &inode_u);

bkey_inode_generation_init(&delete.k_i);
delete.k.p = iter.pos;
delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);

ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
ret = bch2_btree_delete_at(trans, &iter, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc);
err:
Expand Down
10 changes: 10 additions & 0 deletions fs/bcachefs/inode.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,16 @@ void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bk
.min_val_size = 8, \
})

int bch2_inode_alloc_cursor_validate(struct bch_fs *, struct bkey_s_c,
struct bkey_validate_context);
void bch2_inode_alloc_cursor_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);

#define bch2_bkey_ops_inode_alloc_cursor ((struct bkey_ops) { \
.key_validate = bch2_inode_alloc_cursor_validate, \
.val_to_text = bch2_inode_alloc_cursor_to_text, \
.min_val_size = 16, \
})

#if 0
typedef struct {
u64 lo;
Expand Down
8 changes: 8 additions & 0 deletions fs/bcachefs/inode_format.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,4 +165,12 @@ LE64_BITMASK(INODEv3_FIELDS_START,
struct bch_inode_v3, bi_flags, 31, 36);
LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);

struct bch_inode_alloc_cursor {
struct bch_val v;
__u8 bits;
__u8 pad;
__le32 gen;
__le64 idx;
};

#endif /* _BCACHEFS_INODE_FORMAT_H */
5 changes: 3 additions & 2 deletions fs/bcachefs/logged_ops.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ int bch2_resume_logged_ops(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_max(trans, iter,
BTREE_ID_logged_ops,
POS(LOGGED_OPS_INUM, 0), POS(LOGGED_OPS_INUM, U64_MAX),
POS(LOGGED_OPS_INUM_logged_ops, 0),
POS(LOGGED_OPS_INUM_logged_ops, U64_MAX),
BTREE_ITER_prefetch, k,
resume_logged_op(trans, &iter, k)));
bch_err_fn(c, ret);
Expand All @@ -76,7 +77,7 @@ static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
{
struct btree_iter iter;
int ret = bch2_bkey_get_empty_slot(trans, &iter,
BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM, U64_MAX));
BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM_logged_ops, U64_MAX));
if (ret)
return ret;

Expand Down
5 changes: 4 additions & 1 deletion fs/bcachefs/logged_ops_format.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H
#define _BCACHEFS_LOGGED_OPS_FORMAT_H

#define LOGGED_OPS_INUM 0
enum logged_ops_inums {
LOGGED_OPS_INUM_logged_ops,
LOGGED_OPS_INUM_inode_cursors,
};

struct bch_logged_op_truncate {
struct bch_val v;
Expand Down
9 changes: 7 additions & 2 deletions fs/bcachefs/opts.h
Original file line number Diff line number Diff line change
Expand Up @@ -222,15 +222,20 @@ enum fsck_err_opts {
BCH_SB_ERASURE_CODE, false, \
NULL, "Enable erasure coding (DO NOT USE YET)") \
x(inodes_32bit, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FS|OPT_FORMAT, \
OPT_BOOL(), \
BCH_SB_INODE_32BIT, true, \
NULL, "Constrain inode numbers to 32 bits") \
x(shard_inode_numbers, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FS|OPT_FORMAT, \
OPT_BOOL(), \
BCH_SB_SHARD_INUMS, true, \
NULL, "Shard new inode numbers by CPU id") \
x(shard_inode_numbers_bits, u8, \
OPT_FS|OPT_FORMAT, \
OPT_UINT(0, 8), \
BCH_SB_SHARD_INUMS_NBITS, 0, \
NULL, "Shard new inode numbers by CPU id") \
x(inodes_use_key_cache, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT, \
OPT_BOOL(), \
Expand Down
3 changes: 2 additions & 1 deletion fs/bcachefs/sb-errors_format.h
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ enum bch_fsck_flags {
x(bkey_in_missing_snapshot, 190, 0) \
x(inode_pos_inode_nonzero, 191, 0) \
x(inode_pos_blockdev_range, 192, 0) \
x(inode_alloc_cursor_inode_bad, 301, 0) \
x(inode_unpack_error, 193, 0) \
x(inode_str_hash_invalid, 194, 0) \
x(inode_v3_fields_start_bad, 195, 0) \
Expand Down Expand Up @@ -311,7 +312,7 @@ enum bch_fsck_flags {
x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \
x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \
x(MAX, 301, 0)
x(MAX, 302, 0)

enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
Expand Down
5 changes: 5 additions & 0 deletions fs/bcachefs/super-io.c
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,11 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
}

#ifdef __KERNEL__
if (!BCH_SB_SHARD_INUMS_NBITS(sb))
SET_BCH_SB_SHARD_INUMS_NBITS(sb, ilog2(roundup_pow_of_two(num_online_cpus())));
#endif

for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
const struct bch_option *opt = bch2_opt_table + opt_id;

Expand Down
7 changes: 1 addition & 6 deletions fs/bcachefs/super.c
Original file line number Diff line number Diff line change
Expand Up @@ -586,7 +586,6 @@ static void __bch2_fs_free(struct bch_fs *c)
#endif
kfree(rcu_dereference_protected(c->disk_groups, 1));
kfree(c->journal_seq_blacklist_table);
kfree(c->unused_inode_hints);

if (c->write_ref_wq)
destroy_workqueue(c->write_ref_wq);
Expand Down Expand Up @@ -872,8 +871,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
(btree_blocks(c) + 1) * 2 *
sizeof(struct sort_iter_set);

c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));

if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
!(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
Expand All @@ -900,9 +897,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
!(c->online_reserved = alloc_percpu(u64)) ||
mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
c->opts.btree_node_size) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
!(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
sizeof(u64), GFP_KERNEL))) {
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) {
ret = -BCH_ERR_ENOMEM_fs_other_alloc;
goto err;
}
Expand Down

0 comments on commit 23dec8a

Please sign in to comment.