author		David Hildenbrand <david@redhat.com>	2024-12-04 13:54:41 +0100
committer	Michael S. Tsirkin <mst@redhat.com>	2025-01-27 09:39:25 -0500
commit		fe1d79dc54ac4f599a8739e4e6f6082ea5629006 (patch)
tree		ea3cd02ea8a8cfd48b840351d7ac2097387a79b5 /tools/lib/python/kdoc/python_version.py
parent		a9403425b3cf704ab79eeb2c19cf00b2a3462834 (diff)

virtio-mem: remember usable region size

Let's remember the usable region size, which will be helpful in kdump
mode next.

Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20241204125444.1734652-11-david@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
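For context, a minimal self-contained C model of what the change buys (the
struct layout and read_usable_region_size() here are illustrative stand-ins,
not the driver's actual code): the device-reported value is cached in driver
state once, so later consumers such as the kdump path can use it without
re-reading device config.

    #include <stdint.h>
    #include <stdio.h>

    struct virtio_mem_state {
            uint64_t region_size;         /* size of the device-managed region */
            uint64_t usable_region_size;  /* the newly remembered value */
    };

    /* Stand-in for reading the virtio device config space. */
    static uint64_t read_usable_region_size(void)
    {
            return 1ULL << 30; /* pretend the device reports 1 GiB */
    }

    static void refresh_config(struct virtio_mem_state *vm)
    {
            /* remember the value rather than using it once and dropping it */
            vm->usable_region_size = read_usable_region_size();
    }

    int main(void)
    {
            struct virtio_mem_state vm = { .region_size = 2ULL << 30 };

            refresh_config(&vm);
            printf("usable region: %llu bytes\n",
                   (unsigned long long)vm.usable_region_size);
            return 0;
    }
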
Diffstat (limited to 'tools/lib/python/kdoc/python_version.py')
0 files changed, 0 insertions, 0 deletions
-rw-r--r--fs/9p/acl.c1
-rw-r--r--fs/9p/v9fs.c540
-rw-r--r--fs/9p/v9fs.h7
-rw-r--r--fs/9p/vfs_addr.c6
-rw-r--r--fs/9p/vfs_dentry.c25
-rw-r--r--fs/9p/vfs_file.c41
-rw-r--r--fs/9p/vfs_inode.c39
-rw-r--r--fs/9p/vfs_inode_dotl.c21
-rw-r--r--fs/9p/vfs_super.c142
-rw-r--r--fs/Kconfig16
-rw-r--r--fs/Kconfig.binfmt9
-rw-r--r--fs/Makefile7
-rw-r--r--fs/adfs/file.c2
-rw-r--r--fs/adfs/inode.c9
-rw-r--r--fs/adfs/super.c2
-rw-r--r--fs/affs/file.c28
-rw-r--r--fs/affs/inode.c2
-rw-r--r--fs/affs/super.c4
-rw-r--r--fs/afs/Kconfig1
-rw-r--r--fs/afs/Makefile1
-rw-r--r--fs/afs/addr_prefs.c2
-rw-r--r--fs/afs/callback.c4
-rw-r--r--fs/afs/cell.c122
-rw-r--r--fs/afs/cm_security.c340
-rw-r--r--fs/afs/dir.c229
-rw-r--r--fs/afs/dir_edit.c22
-rw-r--r--fs/afs/dir_search.c2
-rw-r--r--fs/afs/dir_silly.c17
-rw-r--r--fs/afs/dynroot.c13
-rw-r--r--fs/afs/file.c12
-rw-r--r--fs/afs/fs_probe.c2
-rw-r--r--fs/afs/inode.c12
-rw-r--r--fs/afs/internal.h54
-rw-r--r--fs/afs/main.c5
-rw-r--r--fs/afs/misc.c28
-rw-r--r--fs/afs/mntpt.c7
-rw-r--r--fs/afs/proc.c3
-rw-r--r--fs/afs/protocol_yfs.h3
-rw-r--r--fs/afs/rotate.c17
-rw-r--r--fs/afs/rxrpc.c46
-rw-r--r--fs/afs/security.c49
-rw-r--r--fs/afs/server.c7
-rw-r--r--fs/afs/super.c6
-rw-r--r--fs/afs/vl_alias.c3
-rw-r--r--fs/afs/write.c11
-rw-r--r--fs/afs/yfsclient.c249
-rw-r--r--fs/aio.c19
-rw-r--r--fs/anon_inodes.c91
-rw-r--r--fs/attr.c56
-rw-r--r--fs/autofs/autofs_i.h5
-rw-r--r--fs/autofs/dev-ioctl.c56
-rw-r--r--fs/autofs/inode.c5
-rw-r--r--fs/autofs/root.c19
-rw-r--r--fs/backing-file.c159
-rw-r--r--fs/bcachefs/Kconfig113
-rw-r--r--fs/bcachefs/Makefile103
-rw-r--r--fs/bcachefs/acl.c445
-rw-r--r--fs/bcachefs/acl.h60
-rw-r--r--fs/bcachefs/alloc_background.c2656
-rw-r--r--fs/bcachefs/alloc_background.h361
-rw-r--r--fs/bcachefs/alloc_background_format.h95
-rw-r--r--fs/bcachefs/alloc_foreground.c1717
-rw-r--r--fs/bcachefs/alloc_foreground.h257
-rw-r--r--fs/bcachefs/alloc_types.h137
-rw-r--r--fs/bcachefs/backpointers.c1229
-rw-r--r--fs/bcachefs/backpointers.h189
-rw-r--r--fs/bcachefs/bbpos.h37
-rw-r--r--fs/bcachefs/bbpos_types.h18
-rw-r--r--fs/bcachefs/bcachefs.h1256
-rw-r--r--fs/bcachefs/bcachefs_format.h1517
-rw-r--r--fs/bcachefs/bcachefs_ioctl.h473
-rw-r--r--fs/bcachefs/bkey.c1117
-rw-r--r--fs/bcachefs/bkey.h605
-rw-r--r--fs/bcachefs/bkey_buf.h61
-rw-r--r--fs/bcachefs/bkey_cmp.h129
-rw-r--r--fs/bcachefs/bkey_methods.c481
-rw-r--r--fs/bcachefs/bkey_methods.h139
-rw-r--r--fs/bcachefs/bkey_sort.c214
-rw-r--r--fs/bcachefs/bkey_sort.h54
-rw-r--r--fs/bcachefs/bkey_types.h241
-rw-r--r--fs/bcachefs/bset.c1570
-rw-r--r--fs/bcachefs/bset.h544
-rw-r--r--fs/bcachefs/btree_cache.c1515
-rw-r--r--fs/bcachefs/btree_cache.h156
-rw-r--r--fs/bcachefs/btree_gc.c1259
-rw-r--r--fs/bcachefs/btree_gc.h89
-rw-r--r--fs/bcachefs/btree_gc_types.h34
-rw-r--r--fs/bcachefs/btree_io.c2635
-rw-r--r--fs/bcachefs/btree_io.h229
-rw-r--r--fs/bcachefs/btree_iter.c3673
-rw-r--r--fs/bcachefs/btree_iter.h955
-rw-r--r--fs/bcachefs/btree_journal_iter.c806
-rw-r--r--fs/bcachefs/btree_journal_iter.h102
-rw-r--r--fs/bcachefs/btree_journal_iter_types.h36
-rw-r--r--fs/bcachefs/btree_key_cache.c849
-rw-r--r--fs/bcachefs/btree_key_cache.h60
-rw-r--r--fs/bcachefs/btree_key_cache_types.h34
-rw-r--r--fs/bcachefs/btree_locking.c902
-rw-r--r--fs/bcachefs/btree_locking.h450
-rw-r--r--fs/bcachefs/btree_node_scan.c603
-rw-r--r--fs/bcachefs/btree_node_scan.h11
-rw-r--r--fs/bcachefs/btree_node_scan_types.h31
-rw-r--r--fs/bcachefs/btree_trans_commit.c1086
-rw-r--r--fs/bcachefs/btree_types.h895
-rw-r--r--fs/bcachefs/btree_update.c908
-rw-r--r--fs/bcachefs/btree_update.h378
-rw-r--r--fs/bcachefs/btree_update_interior.c2763
-rw-r--r--fs/bcachefs/btree_update_interior.h354
-rw-r--r--fs/bcachefs/btree_write_buffer.c883
-rw-r--r--fs/bcachefs/btree_write_buffer.h106
-rw-r--r--fs/bcachefs/btree_write_buffer_types.h59
-rw-r--r--fs/bcachefs/buckets.c1322
-rw-r--r--fs/bcachefs/buckets.h368
-rw-r--r--fs/bcachefs/buckets_types.h95
-rw-r--r--fs/bcachefs/buckets_waiting_for_journal.c173
-rw-r--r--fs/bcachefs/buckets_waiting_for_journal.h15
-rw-r--r--fs/bcachefs/buckets_waiting_for_journal_types.h23
-rw-r--r--fs/bcachefs/chardev.c831
-rw-r--r--fs/bcachefs/chardev.h31
-rw-r--r--fs/bcachefs/checksum.c831
-rw-r--r--fs/bcachefs/checksum.h239
-rw-r--r--fs/bcachefs/clock.c192
-rw-r--r--fs/bcachefs/clock.h28
-rw-r--r--fs/bcachefs/clock_types.h38
-rw-r--r--fs/bcachefs/compress.c772
-rw-r--r--fs/bcachefs/compress.h73
-rw-r--r--fs/bcachefs/darray.c38
-rw-r--r--fs/bcachefs/darray.h103
-rw-r--r--fs/bcachefs/data_update.c907
-rw-r--r--fs/bcachefs/data_update.h66
-rw-r--r--fs/bcachefs/debug.c980
-rw-r--r--fs/bcachefs/debug.h32
-rw-r--r--fs/bcachefs/dirent.c782
-rw-r--r--fs/bcachefs/dirent.h87
-rw-r--r--fs/bcachefs/dirent_format.h58
-rw-r--r--fs/bcachefs/disk_accounting.c1012
-rw-r--r--fs/bcachefs/disk_accounting.h293
-rw-r--r--fs/bcachefs/disk_accounting_format.h167
-rw-r--r--fs/bcachefs/disk_accounting_types.h19
-rw-r--r--fs/bcachefs/disk_groups.c616
-rw-r--r--fs/bcachefs/disk_groups.h111
-rw-r--r--fs/bcachefs/disk_groups_format.h21
-rw-r--r--fs/bcachefs/disk_groups_types.h18
-rw-r--r--fs/bcachefs/ec.c2347
-rw-r--r--fs/bcachefs/ec.h305
-rw-r--r--fs/bcachefs/ec_format.h43
-rw-r--r--fs/bcachefs/ec_types.h34
-rw-r--r--fs/bcachefs/errcode.c71
-rw-r--r--fs/bcachefs/errcode.h378
-rw-r--r--fs/bcachefs/error.c604
-rw-r--r--fs/bcachefs/error.h258
-rw-r--r--fs/bcachefs/extent_update.c173
-rw-r--r--fs/bcachefs/extent_update.h12
-rw-r--r--fs/bcachefs/extents.c1662
-rw-r--r--fs/bcachefs/extents.h772
-rw-r--r--fs/bcachefs/extents_format.h304
-rw-r--r--fs/bcachefs/extents_types.h41
-rw-r--r--fs/bcachefs/eytzinger.c315
-rw-r--r--fs/bcachefs/eytzinger.h300
-rw-r--r--fs/bcachefs/fifo.h127
-rw-r--r--fs/bcachefs/fs-io-buffered.c1102
-rw-r--r--fs/bcachefs/fs-io-buffered.h27
-rw-r--r--fs/bcachefs/fs-io-direct.c703
-rw-r--r--fs/bcachefs/fs-io-direct.h16
-rw-r--r--fs/bcachefs/fs-io-pagecache.c823
-rw-r--r--fs/bcachefs/fs-io-pagecache.h176
-rw-r--r--fs/bcachefs/fs-io.c1065
-rw-r--r--fs/bcachefs/fs-io.h184
-rw-r--r--fs/bcachefs/fs-ioctl.c651
-rw-r--r--fs/bcachefs/fs-ioctl.h83
-rw-r--r--fs/bcachefs/fs.c2450
-rw-r--r--fs/bcachefs/fs.h215
-rw-r--r--fs/bcachefs/fsck.c3152
-rw-r--r--fs/bcachefs/fsck.h28
-rw-r--r--fs/bcachefs/inode.c1451
-rw-r--r--fs/bcachefs/inode.h302
-rw-r--r--fs/bcachefs/inode_format.h179
-rw-r--r--fs/bcachefs/io_misc.c543
-rw-r--r--fs/bcachefs/io_misc.h34
-rw-r--r--fs/bcachefs/io_read.c1387
-rw-r--r--fs/bcachefs/io_read.h196
-rw-r--r--fs/bcachefs/io_write.c1727
-rw-r--r--fs/bcachefs/io_write.h105
-rw-r--r--fs/bcachefs/io_write_types.h97
-rw-r--r--fs/bcachefs/journal.c1717
-rw-r--r--fs/bcachefs/journal.h468
-rw-r--r--fs/bcachefs/journal_io.c2137
-rw-r--r--fs/bcachefs/journal_io.h93
-rw-r--r--fs/bcachefs/journal_reclaim.c1025
-rw-r--r--fs/bcachefs/journal_reclaim.h84
-rw-r--r--fs/bcachefs/journal_sb.c232
-rw-r--r--fs/bcachefs/journal_sb.h24
-rw-r--r--fs/bcachefs/journal_seq_blacklist.c254
-rw-r--r--fs/bcachefs/journal_seq_blacklist.h22
-rw-r--r--fs/bcachefs/journal_seq_blacklist_format.h15
-rw-r--r--fs/bcachefs/journal_types.h344
-rw-r--r--fs/bcachefs/keylist.c50
-rw-r--r--fs/bcachefs/keylist.h72
-rw-r--r--fs/bcachefs/keylist_types.h16
-rw-r--r--fs/bcachefs/logged_ops.c119
-rw-r--r--fs/bcachefs/logged_ops.h20
-rw-r--r--fs/bcachefs/logged_ops_format.h35
-rw-r--r--fs/bcachefs/lru.c226
-rw-r--r--fs/bcachefs/lru.h70
-rw-r--r--fs/bcachefs/lru_format.h27
-rw-r--r--fs/bcachefs/mean_and_variance.c173
-rw-r--r--fs/bcachefs/mean_and_variance.h203
-rw-r--r--fs/bcachefs/mean_and_variance_test.c221
-rw-r--r--fs/bcachefs/migrate.c188
-rw-r--r--fs/bcachefs/migrate.h7
-rw-r--r--fs/bcachefs/move.c1329
-rw-r--r--fs/bcachefs/move.h154
-rw-r--r--fs/bcachefs/move_types.h50
-rw-r--r--fs/bcachefs/movinggc.c462
-rw-r--r--fs/bcachefs/movinggc.h12
-rw-r--r--fs/bcachefs/namei.c834
-rw-r--r--fs/bcachefs/namei.h72
-rw-r--r--fs/bcachefs/nocow_locking.c144
-rw-r--r--fs/bcachefs/nocow_locking.h50
-rw-r--r--fs/bcachefs/nocow_locking_types.h20
-rw-r--r--fs/bcachefs/opts.c737
-rw-r--r--fs/bcachefs/opts.h667
-rw-r--r--fs/bcachefs/printbuf.c509
-rw-r--r--fs/bcachefs/printbuf.h289
-rw-r--r--fs/bcachefs/progress.c63
-rw-r--r--fs/bcachefs/progress.h29
-rw-r--r--fs/bcachefs/quota.c892
-rw-r--r--fs/bcachefs/quota.h73
-rw-r--r--fs/bcachefs/quota_format.h47
-rw-r--r--fs/bcachefs/quota_types.h43
-rw-r--r--fs/bcachefs/rcu_pending.c666
-rw-r--r--fs/bcachefs/rcu_pending.h27
-rw-r--r--fs/bcachefs/rebalance.c700
-rw-r--r--fs/bcachefs/rebalance.h57
-rw-r--r--fs/bcachefs/rebalance_format.h53
-rw-r--r--fs/bcachefs/rebalance_types.h35
-rw-r--r--fs/bcachefs/recovery.c1217
-rw-r--r--fs/bcachefs/recovery.h12
-rw-r--r--fs/bcachefs/recovery_passes.c316
-rw-r--r--fs/bcachefs/recovery_passes.h18
-rw-r--r--fs/bcachefs/recovery_passes_types.h80
-rw-r--r--fs/bcachefs/reflink.c860
-rw-r--r--fs/bcachefs/reflink.h87
-rw-r--r--fs/bcachefs/reflink_format.h38
-rw-r--r--fs/bcachefs/replicas.c919
-rw-r--r--fs/bcachefs/replicas.h83
-rw-r--r--fs/bcachefs/replicas_format.h36
-rw-r--r--fs/bcachefs/replicas_types.h11
-rw-r--r--fs/bcachefs/sb-clean.c340
-rw-r--r--fs/bcachefs/sb-clean.h16
-rw-r--r--fs/bcachefs/sb-counters.c147
-rw-r--r--fs/bcachefs/sb-counters.h20
-rw-r--r--fs/bcachefs/sb-counters_format.h114
-rw-r--r--fs/bcachefs/sb-downgrade.c443
-rw-r--r--fs/bcachefs/sb-downgrade.h12
-rw-r--r--fs/bcachefs/sb-downgrade_format.h17
-rw-r--r--fs/bcachefs/sb-errors.c176
-rw-r--r--fs/bcachefs/sb-errors.h21
-rw-r--r--fs/bcachefs/sb-errors_format.h339
-rw-r--r--fs/bcachefs/sb-errors_types.h15
-rw-r--r--fs/bcachefs/sb-members.c532
-rw-r--r--fs/bcachefs/sb-members.h381
-rw-r--r--fs/bcachefs/sb-members_format.h122
-rw-r--r--fs/bcachefs/sb-members_types.h21
-rw-r--r--fs/bcachefs/seqmutex.h45
-rw-r--r--fs/bcachefs/siphash.c173
-rw-r--r--fs/bcachefs/siphash.h87
-rw-r--r--fs/bcachefs/six.c881
-rw-r--r--fs/bcachefs/six.h388
-rw-r--r--fs/bcachefs/snapshot.c1749
-rw-r--r--fs/bcachefs/snapshot.h265
-rw-r--r--fs/bcachefs/snapshot_format.h36
-rw-r--r--fs/bcachefs/str_hash.c295
-rw-r--r--fs/bcachefs/str_hash.h418
-rw-r--r--fs/bcachefs/subvolume.c724
-rw-r--r--fs/bcachefs/subvolume.h91
-rw-r--r--fs/bcachefs/subvolume_format.h35
-rw-r--r--fs/bcachefs/subvolume_types.h38
-rw-r--r--fs/bcachefs/super-io.c1503
-rw-r--r--fs/bcachefs/super-io.h118
-rw-r--r--fs/bcachefs/super.c2265
-rw-r--r--fs/bcachefs/super.h47
-rw-r--r--fs/bcachefs/super_types.h35
-rw-r--r--fs/bcachefs/sysfs.c888
-rw-r--r--fs/bcachefs/sysfs.h49
-rw-r--r--fs/bcachefs/tests.c887
-rw-r--r--fs/bcachefs/tests.h15
-rw-r--r--fs/bcachefs/thread_with_file.c492
-rw-r--r--fs/bcachefs/thread_with_file.h81
-rw-r--r--fs/bcachefs/thread_with_file_types.h20
-rw-r--r--fs/bcachefs/time_stats.c179
-rw-r--r--fs/bcachefs/time_stats.h160
-rw-r--r--fs/bcachefs/trace.c18
-rw-r--r--fs/bcachefs/trace.h1949
-rw-r--r--fs/bcachefs/two_state_shared_lock.c8
-rw-r--r--fs/bcachefs/two_state_shared_lock.h58
-rw-r--r--fs/bcachefs/util.c1040
-rw-r--r--fs/bcachefs/util.h741
-rw-r--r--fs/bcachefs/varint.c130
-rw-r--r--fs/bcachefs/varint.h11
-rw-r--r--fs/bcachefs/vstructs.h63
-rw-r--r--fs/bcachefs/xattr.c631
-rw-r--r--fs/bcachefs/xattr.h50
-rw-r--r--fs/bcachefs/xattr_format.h19
-rw-r--r--fs/befs/linuxvfs.c2
-rw-r--r--fs/bfs/file.c9
-rw-r--r--fs/bfs/inode.c51
-rw-r--r--fs/binfmt_elf.c239
-rw-r--r--fs/binfmt_elf_fdpic.c19
-rw-r--r--fs/binfmt_misc.c124
-rw-r--r--fs/bpf_fs_kfuncs.c261
-rw-r--r--fs/btrfs/Kconfig46
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/accessors.c164
-rw-r--r--fs/btrfs/accessors.h38
-rw-r--r--fs/btrfs/acl.c25
-rw-r--r--fs/btrfs/async-thread.c3
-rw-r--r--fs/btrfs/backref.c122
-rw-r--r--fs/btrfs/backref.h34
-rw-r--r--fs/btrfs/bio.c373
-rw-r--r--fs/btrfs/bio.h42
-rw-r--r--fs/btrfs/block-group.c382
-rw-r--r--fs/btrfs/block-group.h22
-rw-r--r--fs/btrfs/block-rsv.c25
-rw-r--r--fs/btrfs/block-rsv.h1
-rw-r--r--fs/btrfs/btrfs_inode.h45
-rw-r--r--fs/btrfs/compression.c397
-rw-r--r--fs/btrfs/compression.h82
-rw-r--r--fs/btrfs/ctree.c558
-rw-r--r--fs/btrfs/ctree.h55
-rw-r--r--fs/btrfs/defrag.c220
-rw-r--r--fs/btrfs/delalloc-space.c55
-rw-r--r--fs/btrfs/delalloc-space.h4
-rw-r--r--fs/btrfs/delayed-inode.c389
-rw-r--r--fs/btrfs/delayed-inode.h107
-rw-r--r--fs/btrfs/delayed-ref.c77
-rw-r--r--fs/btrfs/delayed-ref.h16
-rw-r--r--fs/btrfs/dev-replace.c54
-rw-r--r--fs/btrfs/dev-replace.h2
-rw-r--r--fs/btrfs/dir-item.c8
-rw-r--r--fs/btrfs/dir-item.h2
-rw-r--r--fs/btrfs/direct-io.c97
-rw-r--r--fs/btrfs/discard.c19
-rw-r--r--fs/btrfs/disk-io.c407
-rw-r--r--fs/btrfs/disk-io.h11
-rw-r--r--fs/btrfs/export.c10
-rw-r--r--fs/btrfs/extent-io-tree.c522
-rw-r--r--fs/btrfs/extent-io-tree.h166
-rw-r--r--fs/btrfs/extent-tree.c564
-rw-r--r--fs/btrfs/extent-tree.h40
-rw-r--r--fs/btrfs/extent_io.c1367
-rw-r--r--fs/btrfs/extent_io.h21
-rw-r--r--fs/btrfs/extent_map.c199
-rw-r--r--fs/btrfs/extent_map.h50
-rw-r--r--fs/btrfs/fiemap.c13
-rw-r--r--fs/btrfs/file-item.c178
-rw-r--r--fs/btrfs/file-item.h8
-rw-r--r--fs/btrfs/file.c1029
-rw-r--r--fs/btrfs/free-space-cache.c90
-rw-r--r--fs/btrfs/free-space-tree.c571
-rw-r--r--fs/btrfs/free-space-tree.h52
-rw-r--r--fs/btrfs/fs.c48
-rw-r--r--fs/btrfs/fs.h97
-rw-r--r--fs/btrfs/inode-item.c68
-rw-r--r--fs/btrfs/inode-item.h11
-rw-r--r--fs/btrfs/inode.c1974
-rw-r--r--fs/btrfs/ioctl.c446
-rw-r--r--fs/btrfs/ioctl.h6
-rw-r--r--fs/btrfs/locking.c10
-rw-r--r--fs/btrfs/locking.h4
-rw-r--r--fs/btrfs/lzo.c98
-rw-r--r--fs/btrfs/messages.c2
-rw-r--r--fs/btrfs/messages.h188
-rw-r--r--fs/btrfs/misc.h89
-rw-r--r--fs/btrfs/ordered-data.c149
-rw-r--r--fs/btrfs/print-tree.c268
-rw-r--r--fs/btrfs/qgroup.c650
-rw-r--r--fs/btrfs/raid-stripe-tree.c42
-rw-r--r--fs/btrfs/raid56.c969
-rw-r--r--fs/btrfs/raid56.h107
-rw-r--r--fs/btrfs/rcu-string.h58
-rw-r--r--fs/btrfs/ref-verify.c158
-rw-r--r--fs/btrfs/ref-verify.h8
-rw-r--r--fs/btrfs/reflink.c59
-rw-r--r--fs/btrfs/relocation.c442
-rw-r--r--fs/btrfs/relocation.h3
-rw-r--r--fs/btrfs/root-tree.c70
-rw-r--r--fs/btrfs/scrub.c870
-rw-r--r--fs/btrfs/scrub.h2
-rw-r--r--fs/btrfs/send.c698
-rw-r--r--fs/btrfs/space-info.c642
-rw-r--r--fs/btrfs/space-info.h58
-rw-r--r--fs/btrfs/subpage.c304
-rw-r--r--fs/btrfs/subpage.h62
-rw-r--r--fs/btrfs/super.c473
-rw-r--r--fs/btrfs/sysfs.c179
-rw-r--r--fs/btrfs/sysfs.h3
-rw-r--r--fs/btrfs/tests/btrfs-tests.c32
-rw-r--r--fs/btrfs/tests/delayed-refs-tests.c4
-rw-r--r--fs/btrfs/tests/extent-io-tests.c88
-rw-r--r--fs/btrfs/tests/extent-map-tests.c110
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c93
-rw-r--r--fs/btrfs/tests/inode-tests.c107
-rw-r--r--fs/btrfs/tests/qgroup-tests.c16
-rw-r--r--fs/btrfs/transaction.c215
-rw-r--r--fs/btrfs/transaction.h4
-rw-r--r--fs/btrfs/tree-checker.c102
-rw-r--r--fs/btrfs/tree-log.c2622
-rw-r--r--fs/btrfs/tree-log.h8
-rw-r--r--fs/btrfs/tree-mod-log.c81
-rw-r--r--fs/btrfs/ulist.c59
-rw-r--r--fs/btrfs/uuid-tree.c120
-rw-r--r--fs/btrfs/verity.c44
-rw-r--r--fs/btrfs/volumes.c846
-rw-r--r--fs/btrfs/volumes.h63
-rw-r--r--fs/btrfs/xattr.c50
-rw-r--r--fs/btrfs/zlib.c95
-rw-r--r--fs/btrfs/zoned.c497
-rw-r--r--fs/btrfs/zoned.h19
-rw-r--r--fs/btrfs/zstd.c211
-rw-r--r--fs/buffer.c168
-rw-r--r--fs/cachefiles/interface.c11
-rw-r--r--fs/cachefiles/internal.h1
-rw-r--r--fs/cachefiles/io.c18
-rw-r--r--fs/cachefiles/key.c3
-rw-r--r--fs/cachefiles/namei.c114
-rw-r--r--fs/cachefiles/ondemand.c4
-rw-r--r--fs/cachefiles/volume.c9
-rw-r--r--fs/ceph/Kconfig2
-rw-r--r--fs/ceph/addr.c46
-rw-r--r--fs/ceph/cache.c2
-rw-r--r--fs/ceph/caps.c18
-rw-r--r--fs/ceph/crypto.c159
-rw-r--r--fs/ceph/crypto.h34
-rw-r--r--fs/ceph/debugfs.c14
-rw-r--r--fs/ceph/dir.c37
-rw-r--r--fs/ceph/export.c21
-rw-r--r--fs/ceph/file.c87
-rw-r--r--fs/ceph/inode.c184
-rw-r--r--fs/ceph/io.c100
-rw-r--r--fs/ceph/io.h8
-rw-r--r--fs/ceph/ioctl.c17
-rw-r--r--fs/ceph/locks.c5
-rw-r--r--fs/ceph/mds_client.c200
-rw-r--r--fs/ceph/mds_client.h18
-rw-r--r--fs/ceph/mdsmap.c14
-rw-r--r--fs/ceph/super.c26
-rw-r--r--fs/ceph/super.h20
-rw-r--r--fs/ceph/xattr.c6
-rw-r--r--fs/coda/cnode.c4
-rw-r--r--fs/coda/dir.c12
-rw-r--r--fs/coda/file.c6
-rw-r--r--fs/coda/inode.c2
-rw-r--r--fs/configfs/Kconfig1
-rw-r--r--fs/configfs/dir.c22
-rw-r--r--fs/configfs/file.c2
-rw-r--r--fs/configfs/inode.c3
-rw-r--r--fs/configfs/item.c2
-rw-r--r--fs/configfs/mount.c7
-rw-r--r--fs/configfs/symlink.c33
-rw-r--r--fs/coredump.c1022
-rw-r--r--fs/cramfs/inode.c18
-rw-r--r--fs/crypto/Kconfig5
-rw-r--r--fs/crypto/bio.c13
-rw-r--r--fs/crypto/crypto.c62
-rw-r--r--fs/crypto/fname.c170
-rw-r--r--fs/crypto/fscrypt_private.h128
-rw-r--r--fs/crypto/hkdf.c113
-rw-r--r--fs/crypto/hooks.c6
-rw-r--r--fs/crypto/inline_crypt.c60
-rw-r--r--fs/crypto/keyring.c155
-rw-r--r--fs/crypto/keysetup.c196
-rw-r--r--fs/crypto/keysetup_v1.c59
-rw-r--r--fs/crypto/policy.c15
-rw-r--r--fs/d_path.c8
-rw-r--r--fs/dax.c500
-rw-r--r--fs/dcache.c376
-rw-r--r--fs/debugfs/file.c89
-rw-r--r--fs/debugfs/inode.c170
-rw-r--r--fs/debugfs/internal.h15
-rw-r--r--fs/devpts/inode.c63
-rw-r--r--fs/direct-io.c10
-rw-r--r--fs/dlm/Kconfig1
-rw-r--r--fs/dlm/config.c67
-rw-r--r--fs/dlm/config.h2
-rw-r--r--fs/dlm/lock.c4
-rw-r--r--fs/dlm/lockspace.c46
-rw-r--r--fs/dlm/lowcomms.c17
-rw-r--r--fs/dlm/main.c2
-rw-r--r--fs/dlm/member.c27
-rw-r--r--fs/dlm/recover.c2
-rw-r--r--fs/dlm/user.c6
-rw-r--r--fs/drop_caches.c2
-rw-r--r--fs/ecryptfs/Kconfig2
-rw-r--r--fs/ecryptfs/crypto.c90
-rw-r--r--fs/ecryptfs/dentry.c14
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h40
-rw-r--r--fs/ecryptfs/file.c17
-rw-r--r--fs/ecryptfs/inode.c209
-rw-r--r--fs/ecryptfs/keystore.c65
-rw-r--r--fs/ecryptfs/main.c36
-rw-r--r--fs/ecryptfs/mmap.c10
-rw-r--r--fs/ecryptfs/super.c5
-rw-r--r--fs/efivarfs/file.c10
-rw-r--r--fs/efivarfs/inode.c11
-rw-r--r--fs/efivarfs/internal.h1
-rw-r--r--fs/efivarfs/super.c228
-rw-r--r--fs/efs/inode.c2
-rw-r--r--fs/erofs/Kconfig34
-rw-r--r--fs/erofs/Makefile1
-rw-r--r--fs/erofs/compress.h22
-rw-r--r--fs/erofs/data.c113
-rw-r--r--fs/erofs/decompressor.c163
-rw-r--r--fs/erofs/decompressor_crypto.c182
-rw-r--r--fs/erofs/decompressor_deflate.c47
-rw-r--r--fs/erofs/decompressor_lzma.c26
-rw-r--r--fs/erofs/decompressor_zstd.c35
-rw-r--r--fs/erofs/dir.c27
-rw-r--r--fs/erofs/erofs_fs.h33
-rw-r--r--fs/erofs/fileio.c32
-rw-r--r--fs/erofs/fscache.c13
-rw-r--r--fs/erofs/inode.c63
-rw-r--r--fs/erofs/internal.h55
-rw-r--r--fs/erofs/super.c151
-rw-r--r--fs/erofs/sysfs.c71
-rw-r--r--fs/erofs/xattr.c69
-rw-r--r--fs/erofs/xattr.h3
-rw-r--r--fs/erofs/zdata.c182
-rw-r--r--fs/erofs/zmap.c272
-rw-r--r--fs/eventfd.c31
-rw-r--r--fs/eventpoll.c254
-rw-r--r--fs/exec.c171
-rw-r--r--fs/exfat/balloc.c123
-rw-r--r--fs/exfat/dir.c177
-rw-r--r--fs/exfat/exfat_fs.h15
-rw-r--r--fs/exfat/exfat_raw.h6
-rw-r--r--fs/exfat/fatent.c58
-rw-r--r--fs/exfat/file.c111
-rw-r--r--fs/exfat/inode.c160
-rw-r--r--fs/exfat/namei.c27
-rw-r--r--fs/exfat/nls.c6
-rw-r--r--fs/exfat/super.c171
-rw-r--r--fs/exportfs/expfs.c9
-rw-r--r--fs/ext2/dir.c2
-rw-r--r--fs/ext2/ext2.h5
-rw-r--r--fs/ext2/file.c12
-rw-r--r--fs/ext2/inode.c25
-rw-r--r--fs/ext2/ioctl.c4
-rw-r--r--fs/ext2/super.c596
-rw-r--r--fs/ext4/Kconfig27
-rw-r--r--fs/ext4/balloc.c4
-rw-r--r--fs/ext4/bitmap.c8
-rw-r--r--fs/ext4/block_validity.c5
-rw-r--r--fs/ext4/crypto.c2
-rw-r--r--fs/ext4/dir.c8
-rw-r--r--fs/ext4/ext4.h253
-rw-r--r--fs/ext4/ext4_extents.h7
-rw-r--r--fs/ext4/ext4_jbd2.c11
-rw-r--r--fs/ext4/ext4_jbd2.h4
-rw-r--r--fs/ext4/extents.c275
-rw-r--r--fs/ext4/extents_status.c66
-rw-r--r--fs/ext4/extents_status.h2
-rw-r--r--fs/ext4/fast_commit.c462
-rw-r--r--fs/ext4/file.c34
-rw-r--r--fs/ext4/fsmap.c37
-rw-r--r--fs/ext4/hash.c2
-rw-r--r--fs/ext4/ialloc.c15
-rw-r--r--fs/ext4/indirect.c6
-rw-r--r--fs/ext4/inline.c110
-rw-r--r--fs/ext4/inode.c1126
-rw-r--r--fs/ext4/ioctl.c346
-rw-r--r--fs/ext4/mballoc-test.c5
-rw-r--r--fs/ext4/mballoc.c1114
-rw-r--r--fs/ext4/mballoc.h9
-rw-r--r--fs/ext4/mmp.c16
-rw-r--r--fs/ext4/move_extent.c788
-rw-r--r--fs/ext4/namei.c103
-rw-r--r--fs/ext4/orphan.c45
-rw-r--r--fs/ext4/page-io.c18
-rw-r--r--fs/ext4/readpage.c35
-rw-r--r--fs/ext4/resize.c2
-rw-r--r--fs/ext4/super.c214
-rw-r--r--fs/ext4/sysfs.c6
-rw-r--r--fs/ext4/verity.c4
-rw-r--r--fs/ext4/xattr.c41
-rw-r--r--fs/f2fs/acl.c34
-rw-r--r--fs/f2fs/acl.h10
-rw-r--r--fs/f2fs/checkpoint.c311
-rw-r--r--fs/f2fs/compress.c340
-rw-r--r--fs/f2fs/data.c525
-rw-r--r--fs/f2fs/debug.c50
-rw-r--r--fs/f2fs/dir.c258
-rw-r--r--fs/f2fs/extent_cache.c36
-rw-r--r--fs/f2fs/f2fs.h690
-rw-r--r--fs/f2fs/file.c379
-rw-r--r--fs/f2fs/gc.c363
-rw-r--r--fs/f2fs/gc.h7
-rw-r--r--fs/f2fs/inline.c318
-rw-r--r--fs/f2fs/inode.c179
-rw-r--r--fs/f2fs/namei.c172
-rw-r--r--fs/f2fs/node.c857
-rw-r--r--fs/f2fs/node.h84
-rw-r--r--fs/f2fs/recovery.c265
-rw-r--r--fs/f2fs/segment.c366
-rw-r--r--fs/f2fs/segment.h200
-rw-r--r--fs/f2fs/shrinker.c13
-rw-r--r--fs/f2fs/super.c2566
-rw-r--r--fs/f2fs/sysfs.c217
-rw-r--r--fs/f2fs/verity.c4
-rw-r--r--fs/f2fs/xattr.c148
-rw-r--r--fs/f2fs/xattr.h30
-rw-r--r--fs/fat/dir.c7
-rw-r--r--fs/fat/fatent.c2
-rw-r--r--fs/fat/file.c2
-rw-r--r--fs/fat/inode.c25
-rw-r--r--fs/fat/misc.c6
-rw-r--r--fs/fat/namei_msdos.c2
-rw-r--r--fs/fat/namei_vfat.c4
-rw-r--r--fs/fcntl.c23
-rw-r--r--fs/fhandle.c106
-rw-r--r--fs/file.c84
-rw-r--r--fs/file_attr.c490
-rw-r--r--fs/file_table.c23
-rw-r--r--fs/filesystems.c14
-rw-r--r--fs/freevxfs/vxfs_inode.c2
-rw-r--r--fs/fs-writeback.c337
-rw-r--r--fs/fs_context.c23
-rw-r--r--fs/fs_dirent.c (renamed from fs/fs_types.c)2
-rw-r--r--fs/fs_parser.c55
-rw-r--r--fs/fs_struct.c42
-rw-r--r--fs/fsopen.c70
-rw-r--r--fs/fuse/Kconfig3
-rw-r--r--fs/fuse/Makefile5
-rw-r--r--fs/fuse/backing.c179
-rw-r--r--fs/fuse/control.c58
-rw-r--r--fs/fuse/cuse.c3
-rw-r--r--fs/fuse/dax.c33
-rw-r--r--fs/fuse/dev.c556
-rw-r--r--fs/fuse/dev_uring.c110
-rw-r--r--fs/fuse/dev_uring_i.h18
-rw-r--r--fs/fuse/dir.c361
-rw-r--r--fs/fuse/file.c1075
-rw-r--r--fs/fuse/fuse_dev_i.h25
-rw-r--r--fs/fuse/fuse_i.h172
-rw-r--r--fs/fuse/inode.c200
-rw-r--r--fs/fuse/ioctl.c4
-rw-r--r--fs/fuse/iomode.c3
-rw-r--r--fs/fuse/passthrough.c162
-rw-r--r--fs/fuse/readdir.c40
-rw-r--r--fs/fuse/sysctl.c24
-rw-r--r--fs/fuse/trace.c13
-rw-r--r--fs/fuse/virtio_fs.c29
-rw-r--r--fs/gfs2/Kconfig1
-rw-r--r--fs/gfs2/aops.c110
-rw-r--r--fs/gfs2/aops.h3
-rw-r--r--fs/gfs2/bmap.c57
-rw-r--r--fs/gfs2/bmap.h1
-rw-r--r--fs/gfs2/dir.c6
-rw-r--r--fs/gfs2/file.c34
-rw-r--r--fs/gfs2/glock.c412
-rw-r--r--fs/gfs2/glock.h26
-rw-r--r--fs/gfs2/glops.c117
-rw-r--r--fs/gfs2/incore.h37
-rw-r--r--fs/gfs2/inode.c141
-rw-r--r--fs/gfs2/inode.h12
-rw-r--r--fs/gfs2/lock_dlm.c177
-rw-r--r--fs/gfs2/log.c66
-rw-r--r--fs/gfs2/log.h11
-rw-r--r--fs/gfs2/lops.c29
-rw-r--r--fs/gfs2/lops.h2
-rw-r--r--fs/gfs2/main.c5
-rw-r--r--fs/gfs2/meta_io.c25
-rw-r--r--fs/gfs2/meta_io.h4
-rw-r--r--fs/gfs2/ops_fstype.c124
-rw-r--r--fs/gfs2/quota.c66
-rw-r--r--fs/gfs2/recovery.c36
-rw-r--r--fs/gfs2/recovery.h2
-rw-r--r--fs/gfs2/super.c159
-rw-r--r--fs/gfs2/super.h1
-rw-r--r--fs/gfs2/sys.c69
-rw-r--r--fs/gfs2/trace_gfs2.h2
-rw-r--r--fs/gfs2/trans.c51
-rw-r--r--fs/gfs2/trans.h2
-rw-r--r--fs/gfs2/util.c373
-rw-r--r--fs/gfs2/util.h92
-rw-r--r--fs/gfs2/xattr.c11
-rw-r--r--fs/gfs2/xattr.h2
-rw-r--r--fs/hfs/.kunitconfig7
-rw-r--r--fs/hfs/Kconfig15
-rw-r--r--fs/hfs/Makefile2
-rw-r--r--fs/hfs/bfind.c17
-rw-r--r--fs/hfs/bitmap.c4
-rw-r--r--fs/hfs/bnode.c159
-rw-r--r--fs/hfs/brec.c37
-rw-r--r--fs/hfs/btree.c63
-rw-r--r--fs/hfs/btree.h113
-rw-r--r--fs/hfs/catalog.c129
-rw-r--r--fs/hfs/extent.c21
-rw-r--r--fs/hfs/hfs.h269
-rw-r--r--fs/hfs/hfs_fs.h129
-rw-r--r--fs/hfs/inode.c37
-rw-r--r--fs/hfs/mdb.c20
-rw-r--r--fs/hfs/string.c5
-rw-r--r--fs/hfs/string_test.c133
-rw-r--r--fs/hfs/super.c6
-rw-r--r--fs/hfsplus/.kunitconfig8
-rw-r--r--fs/hfsplus/Kconfig15
-rw-r--r--fs/hfsplus/Makefile3
-rw-r--r--fs/hfsplus/attributes.c8
-rw-r--r--fs/hfsplus/bfind.c14
-rw-r--r--fs/hfsplus/bitmap.c10
-rw-r--r--fs/hfsplus/bnode.c141
-rw-r--r--fs/hfsplus/brec.c12
-rw-r--r--fs/hfsplus/btree.c12
-rw-r--r--fs/hfsplus/catalog.c6
-rw-r--r--fs/hfsplus/dir.c9
-rw-r--r--fs/hfsplus/extents.c30
-rw-r--r--fs/hfsplus/hfsplus_fs.h122
-rw-r--r--fs/hfsplus/hfsplus_raw.h394
-rw-r--r--fs/hfsplus/inode.c56
-rw-r--r--fs/hfsplus/options.c1
-rw-r--r--fs/hfsplus/super.c132
-rw-r--r--fs/hfsplus/unicode.c63
-rw-r--r--fs/hfsplus/unicode_test.c1579
-rw-r--r--fs/hfsplus/wrapper.c46
-rw-r--r--fs/hfsplus/xattr.c38
-rw-r--r--fs/hostfs/hostfs.h36
-rw-r--r--fs/hostfs/hostfs_kern.c52
-rw-r--r--fs/hostfs/hostfs_user.c59
-rw-r--r--fs/hpfs/anode.c43
-rw-r--r--fs/hpfs/dir.c2
-rw-r--r--fs/hpfs/ea.c2
-rw-r--r--fs/hpfs/file.c24
-rw-r--r--fs/hpfs/hpfs.h44
-rw-r--r--fs/hpfs/inode.c4
-rw-r--r--fs/hpfs/map.c8
-rw-r--r--fs/hpfs/namei.c18
-rw-r--r--fs/hpfs/super.c11
-rw-r--r--fs/hugetlbfs/inode.c157
-rw-r--r--fs/init.c23
-rw-r--r--fs/inode.c433
-rw-r--r--fs/internal.h23
-rw-r--r--fs/ioctl.c329
-rw-r--r--fs/iomap/Makefile9
-rw-r--r--fs/iomap/bio.c88
-rw-r--r--fs/iomap/buffered-io.c1193
-rw-r--r--fs/iomap/direct-io.c270
-rw-r--r--fs/iomap/fiemap.c3
-rw-r--r--fs/iomap/internal.h13
-rw-r--r--fs/iomap/ioend.c218
-rw-r--r--fs/iomap/iter.c21
-rw-r--r--fs/iomap/seek.c12
-rw-r--r--fs/iomap/swapfile.c3
-rw-r--r--fs/iomap/trace.c1
-rw-r--r--fs/iomap/trace.h39
-rw-r--r--fs/isofs/dir.c3
-rw-r--r--fs/isofs/export.c2
-rw-r--r--fs/isofs/inode.c25
-rw-r--r--fs/isofs/isofs.h4
-rw-r--r--fs/isofs/rock.c40
-rw-r--r--fs/isofs/rock.h6
-rw-r--r--fs/isofs/util.c49
-rw-r--r--fs/jbd2/checkpoint.c5
-rw-r--r--fs/jbd2/commit.c6
-rw-r--r--fs/jbd2/journal.c64
-rw-r--r--fs/jbd2/recovery.c10
-rw-r--r--fs/jbd2/revoke.c15
-rw-r--r--fs/jbd2/transaction.c44
-rw-r--r--fs/jffs2/erase.c4
-rw-r--r--fs/jffs2/file.c34
-rw-r--r--fs/jffs2/fs.c4
-rw-r--r--fs/jffs2/scan.c4
-rw-r--r--fs/jffs2/summary.c7
-rw-r--r--fs/jffs2/wbuf.c2
-rw-r--r--fs/jfs/file.c9
-rw-r--r--fs/jfs/inode.c28
-rw-r--r--fs/jfs/ioctl.c4
-rw-r--r--fs/jfs/jfs_discard.c3
-rw-r--r--fs/jfs/jfs_dmap.c16
-rw-r--r--fs/jfs/jfs_dtree.c22
-rw-r--r--fs/jfs/jfs_incore.h6
-rw-r--r--fs/jfs/jfs_inode.h4
-rw-r--r--fs/jfs/jfs_logmgr.c1
-rw-r--r--fs/jfs/jfs_metapage.c114
-rw-r--r--fs/jfs/jfs_mount.c10
-rw-r--r--fs/jfs/jfs_txnmgr.c11
-rw-r--r--fs/jfs/jfs_xtree.c142
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/kernfs/dir.c239
-rw-r--r--fs/kernfs/file.c67
-rw-r--r--fs/kernfs/inode.c72
-rw-r--r--fs/kernfs/kernfs-internal.h45
-rw-r--r--fs/kernfs/mount.c64
-rw-r--r--fs/kernfs/symlink.c30
-rw-r--r--fs/libfs.c210
-rw-r--r--fs/lockd/Makefile2
-rw-r--r--fs/lockd/netlink.c45
-rw-r--r--fs/lockd/netlink.h20
-rw-r--r--fs/lockd/netns.h3
-rw-r--r--fs/lockd/svc.c129
-rw-r--r--fs/lockd/svclock.c14
-rw-r--r--fs/lockd/svcshare.c6
-rw-r--r--fs/locks.c111
-rw-r--r--fs/minix/dir.c2
-rw-r--r--fs/minix/file.c2
-rw-r--r--fs/minix/inode.c33
-rw-r--r--fs/minix/minix.h9
-rw-r--r--fs/minix/namei.c39
-rw-r--r--fs/mount.h97
-rw-r--r--fs/mpage.c27
-rw-r--r--fs/namei.c1564
-rw-r--r--fs/namespace.c2422
-rw-r--r--fs/netfs/buffered_read.c56
-rw-r--r--fs/netfs/buffered_write.c47
-rw-r--r--fs/netfs/direct_read.c23
-rw-r--r--fs/netfs/direct_write.c34
-rw-r--r--fs/netfs/fscache_cache.c2
-rw-r--r--fs/netfs/fscache_cookie.c2
-rw-r--r--fs/netfs/fscache_io.c10
-rw-r--r--fs/netfs/internal.h67
-rw-r--r--fs/netfs/main.c11
-rw-r--r--fs/netfs/misc.c241
-rw-r--r--fs/netfs/objects.c76
-rw-r--r--fs/netfs/read_collect.c217
-rw-r--r--fs/netfs/read_pgpriv2.c9
-rw-r--r--fs/netfs/read_retry.c26
-rw-r--r--fs/netfs/read_single.c12
-rw-r--r--fs/netfs/write_collect.c105
-rw-r--r--fs/netfs/write_issue.c43
-rw-r--r--fs/netfs/write_retry.c22
-rw-r--r--fs/nfs/Kconfig2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c12
-rw-r--r--fs/nfs/blocklayout/dev.c13
-rw-r--r--fs/nfs/blocklayout/extent_tree.c104
-rw-r--r--fs/nfs/blocklayout/rpc_pipefs.c53
-rw-r--r--fs/nfs/callback.c10
-rw-r--r--fs/nfs/client.c77
-rw-r--r--fs/nfs/delegation.c205
-rw-r--r--fs/nfs/delegation.h3
-rw-r--r--fs/nfs/dir.c54
-rw-r--r--fs/nfs/direct.c2
-rw-r--r--fs/nfs/export.c14
-rw-r--r--fs/nfs/file.c90
-rw-r--r--fs/nfs/filelayout/filelayout.c10
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c16
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c988
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h64
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c129
-rw-r--r--fs/nfs/fs_context.c116
-rw-r--r--fs/nfs/fscache.c1
-rw-r--r--fs/nfs/inode.c181
-rw-r--r--fs/nfs/internal.h45
-rw-r--r--fs/nfs/io.c13
-rw-r--r--fs/nfs/localio.c545
-rw-r--r--fs/nfs/mount_clnt.c68
-rw-r--r--fs/nfs/namespace.c6
-rw-r--r--fs/nfs/netns.h6
-rw-r--r--fs/nfs/nfs2xdr.c2
-rw-r--r--fs/nfs/nfs3acl.c2
-rw-r--r--fs/nfs/nfs3client.c16
-rw-r--r--fs/nfs/nfs3proc.c2
-rw-r--r--fs/nfs/nfs3xdr.c2
-rw-r--r--fs/nfs/nfs42.h1
-rw-r--r--fs/nfs/nfs42proc.c236
-rw-r--r--fs/nfs/nfs42xdr.c152
-rw-r--r--fs/nfs/nfs4_fs.h8
-rw-r--r--fs/nfs/nfs4client.c205
-rw-r--r--fs/nfs/nfs4file.c42
-rw-r--r--fs/nfs/nfs4getroot.c14
-rw-r--r--fs/nfs/nfs4idmap.c21
-rw-r--r--fs/nfs/nfs4proc.c280
-rw-r--r--fs/nfs/nfs4renewd.c2
-rw-r--r--fs/nfs/nfs4session.h4
-rw-r--r--fs/nfs/nfs4state.c17
-rw-r--r--fs/nfs/nfs4super.c44
-rw-r--r--fs/nfs/nfs4trace.c2
-rw-r--r--fs/nfs/nfs4trace.h213
-rw-r--r--fs/nfs/nfs4xdr.c48
-rw-r--r--fs/nfs/nfstrace.h227
-rw-r--r--fs/nfs/pagelist.c9
-rw-r--r--fs/nfs/pnfs.c94
-rw-r--r--fs/nfs/pnfs.h4
-rw-r--r--fs/nfs/pnfs_nfs.c107
-rw-r--r--fs/nfs/read.c3
-rw-r--r--fs/nfs/super.c25
-rw-r--r--fs/nfs/symlink.c20
-rw-r--r--fs/nfs/sysfs.c111
-rw-r--r--fs/nfs/unlink.c11
-rw-r--r--fs/nfs/write.c187
-rw-r--r--fs/nfs_common/nfsacl.c8
-rw-r--r--fs/nfs_common/nfslocalio.c113
-rw-r--r--fs/nfsd/Kconfig21
-rw-r--r--fs/nfsd/Makefile1
-rw-r--r--fs/nfsd/blocklayout.c192
-rw-r--r--fs/nfsd/blocklayoutxdr.c207
-rw-r--r--fs/nfsd/blocklayoutxdr.h22
-rw-r--r--fs/nfsd/debugfs.c143
-rw-r--r--fs/nfsd/export.c93
-rw-r--r--fs/nfsd/export.h7
-rw-r--r--fs/nfsd/filecache.c228
-rw-r--r--fs/nfsd/filecache.h15
-rw-r--r--fs/nfsd/flexfilelayout.c12
-rw-r--r--fs/nfsd/flexfilelayoutxdr.c3
-rw-r--r--fs/nfsd/localio.c81
-rw-r--r--fs/nfsd/lockd.c15
-rw-r--r--fs/nfsd/netlink.c1
-rw-r--r--fs/nfsd/netlink.h1
-rw-r--r--fs/nfsd/nfs3proc.c84
-rw-r--r--fs/nfsd/nfs3xdr.c4
-rw-r--r--fs/nfsd/nfs4callback.c277
-rw-r--r--fs/nfsd/nfs4layouts.c12
-rw-r--r--fs/nfsd/nfs4proc.c210
-rw-r--r--fs/nfsd/nfs4recover.c373
-rw-r--r--fs/nfsd/nfs4state.c614
-rw-r--r--fs/nfsd/nfs4xdr.c107
-rw-r--r--fs/nfsd/nfscache.c15
-rw-r--r--fs/nfsd/nfsctl.c295
-rw-r--r--fs/nfsd/nfsd.h61
-rw-r--r--fs/nfsd/nfsfh.c77
-rw-r--r--fs/nfsd/nfsfh.h71
-rw-r--r--fs/nfsd/nfsproc.c65
-rw-r--r--fs/nfsd/nfssvc.c43
-rw-r--r--fs/nfsd/nfsxdr.c4
-rw-r--r--fs/nfsd/pnfs.h5
-rw-r--r--fs/nfsd/state.h65
-rw-r--r--fs/nfsd/stats.c4
-rw-r--r--fs/nfsd/stats.h2
-rw-r--r--fs/nfsd/trace.h419
-rw-r--r--fs/nfsd/vfs.c669
-rw-r--r--fs/nfsd/vfs.h47
-rw-r--r--fs/nfsd/xdr4.h67
-rw-r--r--fs/nfsd/xdr4cb.h5
-rw-r--r--fs/nilfs2/btree.c4
-rw-r--r--fs/nilfs2/cpfile.c2
-rw-r--r--fs/nilfs2/dat.c2
-rw-r--r--fs/nilfs2/dir.c2
-rw-r--r--fs/nilfs2/direct.c3
-rw-r--r--fs/nilfs2/file.c8
-rw-r--r--fs/nilfs2/ifile.c2
-rw-r--r--fs/nilfs2/inode.c27
-rw-r--r--fs/nilfs2/ioctl.c39
-rw-r--r--fs/nilfs2/mdt.c2
-rw-r--r--fs/nilfs2/nilfs.h5
-rw-r--r--fs/nilfs2/page.c2
-rw-r--r--fs/nilfs2/recovery.c3
-rw-r--r--fs/nilfs2/segment.c27
-rw-r--r--fs/nilfs2/segment.h1
-rw-r--r--fs/nilfs2/sufile.c2
-rw-r--r--fs/nilfs2/sysfs.c4
-rw-r--r--fs/nilfs2/sysfs.h8
-rw-r--r--fs/nilfs2/the_nilfs.c3
-rw-r--r--fs/nls/nls_base.c27
-rw-r--r--fs/notify/dnotify/dnotify.c8
-rw-r--r--fs/notify/fanotify/fanotify.c11
-rw-r--r--fs/notify/fanotify/fanotify.h11
-rw-r--r--fs/notify/fanotify/fanotify_user.c222
-rw-r--r--fs/notify/fdinfo.c6
-rw-r--r--fs/notify/fsnotify.c91
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c2
-rw-r--r--fs/notify/mark.c4
-rw-r--r--fs/nsfs.c355
-rw-r--r--fs/ntfs3/attrib.c159
-rw-r--r--fs/ntfs3/bitmap.c1
-rw-r--r--fs/ntfs3/dir.c7
-rw-r--r--fs/ntfs3/file.c267
-rw-r--r--fs/ntfs3/frecord.c373
-rw-r--r--fs/ntfs3/fslog.c32
-rw-r--r--fs/ntfs3/fsntfs.c166
-rw-r--r--fs/ntfs3/index.c25
-rw-r--r--fs/ntfs3/inode.c182
-rw-r--r--fs/ntfs3/namei.c34
-rw-r--r--fs/ntfs3/ntfs.h5
-rw-r--r--fs/ntfs3/ntfs_fs.h72
-rw-r--r--fs/ntfs3/record.c2
-rw-r--r--fs/ntfs3/run.c27
-rw-r--r--fs/ntfs3/super.c161
-rw-r--r--fs/ntfs3/xattr.c40
-rw-r--r--fs/ocfs2/acl.c1
-rw-r--r--fs/ocfs2/alloc.c14
-rw-r--r--fs/ocfs2/aops.c28
-rw-r--r--fs/ocfs2/cluster/tcp.c13
-rw-r--r--fs/ocfs2/dir.c50
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c3
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c11
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c3
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c13
-rw-r--r--fs/ocfs2/dlmglue.c2
-rw-r--r--fs/ocfs2/extent_map.c10
-rw-r--r--fs/ocfs2/file.c6
-rw-r--r--fs/ocfs2/filecheck.c2
-rw-r--r--fs/ocfs2/inode.c157
-rw-r--r--fs/ocfs2/inode.h1
-rw-r--r--fs/ocfs2/ioctl.c22
-rw-r--r--fs/ocfs2/ioctl.h4
-rw-r--r--fs/ocfs2/journal.c93
-rw-r--r--fs/ocfs2/journal.h1
-rw-r--r--fs/ocfs2/mmap.c5
-rw-r--r--fs/ocfs2/mmap.h2
-rw-r--r--fs/ocfs2/move_extents.c46
-rw-r--r--fs/ocfs2/namei.c11
-rw-r--r--fs/ocfs2/ocfs2.h17
-rw-r--r--fs/ocfs2/ocfs2_fs.h24
-rw-r--r--fs/ocfs2/ocfs2_trace.h2
-rw-r--r--fs/ocfs2/quota_global.c2
-rw-r--r--fs/ocfs2/quota_local.c11
-rw-r--r--fs/ocfs2/refcounttree.c9
-rw-r--r--fs/ocfs2/stack_user.c18
-rw-r--r--fs/ocfs2/stackglue.c3
-rw-r--r--fs/ocfs2/suballoc.c38
-rw-r--r--fs/ocfs2/suballoc.h1
-rw-r--r--fs/ocfs2/super.c7
-rw-r--r--fs/ocfs2/sysfile.c12
-rw-r--r--fs/ocfs2/xattr.c4
-rw-r--r--fs/omfs/file.c9
-rw-r--r--fs/omfs/inode.c179
-rw-r--r--fs/open.c90
-rw-r--r--fs/openpromfs/inode.c2
-rw-r--r--fs/orangefs/file.c10
-rw-r--r--fs/orangefs/inode.c35
-rw-r--r--fs/orangefs/namei.c10
-rw-r--r--fs/orangefs/orangefs-debugfs.c19
-rw-r--r--fs/orangefs/orangefs-kernel.h10
-rw-r--r--fs/orangefs/orangefs-mod.c3
-rw-r--r--fs/orangefs/orangefs-sysfs.c28
-rw-r--r--fs/orangefs/orangefs-utils.c6
-rw-r--r--fs/orangefs/super.c193
-rw-r--r--fs/orangefs/xattr.c12
-rw-r--r--fs/overlayfs/copy_up.c185
-rw-r--r--fs/overlayfs/dir.c720
-rw-r--r--fs/overlayfs/export.c6
-rw-r--r--fs/overlayfs/file.c104
-rw-r--r--fs/overlayfs/inode.c137
-rw-r--r--fs/overlayfs/namei.c498
-rw-r--r--fs/overlayfs/overlayfs.h133
-rw-r--r--fs/overlayfs/ovl_entry.h4
-rw-r--r--fs/overlayfs/params.c61
-rw-r--r--fs/overlayfs/params.h1
-rw-r--r--fs/overlayfs/readdir.c313
-rw-r--r--fs/overlayfs/super.c245
-rw-r--r--fs/overlayfs/util.c83
-rw-r--r--fs/overlayfs/xattrs.c35
-rw-r--r--fs/pidfs.c581
-rw-r--r--fs/pipe.c47
-rw-r--r--fs/pnode.c751
-rw-r--r--fs/pnode.h32
-rw-r--r--fs/posix_acl.c8
-rw-r--r--fs/proc/array.c53
-rw-r--r--fs/proc/base.c64
-rw-r--r--fs/proc/fd.c11
-rw-r--r--fs/proc/generic.c59
-rw-r--r--fs/proc/inode.c27
-rw-r--r--fs/proc/internal.h67
-rw-r--r--fs/proc/meminfo.c8
-rw-r--r--fs/proc/namespaces.c9
-rw-r--r--fs/proc/page.c206
-rw-r--r--fs/proc/proc_sysctl.c25
-rw-r--r--fs/proc/root.c122
-rw-r--r--fs/proc/self.c10
-rw-r--r--fs/proc/task_mmu.c714
-rw-r--r--fs/proc/task_nommu.c14
-rw-r--r--fs/proc/thread_self.c11
-rw-r--r--fs/proc/vmcore.c29
-rw-r--r--fs/proc_namespace.c12
-rw-r--r--fs/pstore/inode.c14
-rw-r--r--fs/pstore/platform.c2
-rw-r--r--fs/pstore/ram.c2
-rw-r--r--fs/pstore/zone.c21
-rw-r--r--fs/qnx4/inode.c2
-rw-r--r--fs/qnx6/inode.c2
-rw-r--r--fs/quota/dquot.c14
-rw-r--r--fs/ramfs/file-mmu.c4
-rw-r--r--fs/ramfs/file-nommu.c12
-rw-r--r--fs/ramfs/inode.c11
-rw-r--r--fs/read_write.c22
-rw-r--r--fs/readdir.c47
-rw-r--r--fs/resctrl/Kconfig39
-rw-r--r--fs/resctrl/Makefile6
-rw-r--r--fs/resctrl/ctrlmondata.c959
-rw-r--r--fs/resctrl/internal.h495
-rw-r--r--fs/resctrl/monitor.c1811
-rw-r--r--fs/resctrl/monitor_trace.h33
-rw-r--r--fs/resctrl/pseudo_lock.c1099
-rw-r--r--fs/resctrl/rdtgroup.c4584
-rw-r--r--fs/romfs/mmap-nommu.c6
-rw-r--r--fs/romfs/super.c2
-rw-r--r--fs/select.c20
-rw-r--r--fs/signalfd.c29
-rw-r--r--fs/smb/client/Kconfig8
-rw-r--r--fs/smb/client/Makefile2
-rw-r--r--fs/smb/client/cached_dir.c199
-rw-r--r--fs/smb/client/cached_dir.h31
-rw-r--r--fs/smb/client/cifs_debug.c268
-rw-r--r--fs/smb/client/cifs_debug.h6
-rw-r--r--fs/smb/client/cifs_fs_sb.h1
-rw-r--r--fs/smb/client/cifs_ioctl.h2
-rw-r--r--fs/smb/client/cifs_spnego.c66
-rw-r--r--fs/smb/client/cifs_spnego.h2
-rw-r--r--fs/smb/client/cifs_swn.c20
-rw-r--r--fs/smb/client/cifs_unicode.c3
-rw-r--r--fs/smb/client/cifs_unicode.h3
-rw-r--r--fs/smb/client/cifsacl.c36
-rw-r--r--fs/smb/client/cifsencrypt.c377
-rw-r--r--fs/smb/client/cifsfs.c150
-rw-r--r--fs/smb/client/cifsfs.h9
-rw-r--r--fs/smb/client/cifsglob.h341
-rw-r--r--fs/smb/client/cifspdu.h612
-rw-r--r--fs/smb/client/cifsproto.h242
-rw-r--r--fs/smb/client/cifssmb.c1210
-rw-r--r--fs/smb/client/cifstransport.c263
-rw-r--r--fs/smb/client/compress.c94
-rw-r--r--fs/smb/client/compress.h19
-rw-r--r--fs/smb/client/connect.c687
-rw-r--r--fs/smb/client/dfs_cache.c55
-rw-r--r--fs/smb/client/dir.c117
-rw-r--r--fs/smb/client/dns_resolve.h4
-rw-r--r--fs/smb/client/file.c231
-rw-r--r--fs/smb/client/fs_context.c259
-rw-r--r--fs/smb/client/fs_context.h74
-rw-r--r--fs/smb/client/inode.c368
-rw-r--r--fs/smb/client/ioctl.c2
-rw-r--r--fs/smb/client/link.c72
-rw-r--r--fs/smb/client/misc.c126
-rw-r--r--fs/smb/client/namespace.c4
-rw-r--r--fs/smb/client/netmisc.c11
-rw-r--r--fs/smb/client/ntlmssp.h8
-rw-r--r--fs/smb/client/readdir.c88
-rw-r--r--fs/smb/client/reparse.c171
-rw-r--r--fs/smb/client/reparse.h17
-rw-r--r--fs/smb/client/rfc1002pdu.h8
-rw-r--r--fs/smb/client/sess.c160
-rw-r--r--fs/smb/client/smb1ops.c528
-rw-r--r--fs/smb/client/smb2file.c30
-rw-r--r--fs/smb/client/smb2glob.h4
-rw-r--r--fs/smb/client/smb2inode.c419
-rw-r--r--fs/smb/client/smb2maperror.c52
-rw-r--r--fs/smb/client/smb2misc.c84
-rw-r--r--fs/smb/client/smb2ops.c717
-rw-r--r--fs/smb/client/smb2pdu.c555
-rw-r--r--fs/smb/client/smb2pdu.h108
-rw-r--r--fs/smb/client/smb2proto.h41
-rw-r--r--fs/smb/client/smb2transport.c251
-rw-r--r--fs/smb/client/smbdirect.c2287
-rw-r--r--fs/smb/client/smbdirect.h259
-rw-r--r--fs/smb/client/trace.c2
-rw-r--r--fs/smb/client/trace.h279
-rw-r--r--fs/smb/client/transport.c803
-rw-r--r--fs/smb/client/xattr.c52
-rw-r--r--fs/smb/common/Makefile1
-rw-r--r--fs/smb/common/arc4.h23
-rw-r--r--fs/smb/common/cifs_arc4.c75
-rw-r--r--fs/smb/common/fscc.h174
-rw-r--r--fs/smb/common/smb2pdu.h285
-rw-r--r--fs/smb/common/smb2status.h5
-rw-r--r--fs/smb/common/smbacl.h8
-rw-r--r--fs/smb/common/smbdirect/smbdirect.h44
-rw-r--r--fs/smb/common/smbdirect/smbdirect_pdu.h55
-rw-r--r--fs/smb/common/smbdirect/smbdirect_socket.h547
-rw-r--r--fs/smb/common/smbglob.h71
-rw-r--r--fs/smb/server/Kconfig10
-rw-r--r--fs/smb/server/auth.c453
-rw-r--r--fs/smb/server/auth.h12
-rw-r--r--fs/smb/server/connection.c34
-rw-r--r--fs/smb/server/connection.h35
-rw-r--r--fs/smb/server/crypto_ctx.c32
-rw-r--r--fs/smb/server/crypto_ctx.h19
-rw-r--r--fs/smb/server/ksmbd_netlink.h5
-rw-r--r--fs/smb/server/ksmbd_work.c2
-rw-r--r--fs/smb/server/mgmt/share_config.c2
-rw-r--r--fs/smb/server/mgmt/tree_connect.c18
-rw-r--r--fs/smb/server/mgmt/tree_connect.h1
-rw-r--r--fs/smb/server/mgmt/user_session.c74
-rw-r--r--fs/smb/server/mgmt/user_session.h3
-rw-r--r--fs/smb/server/misc.c15
-rw-r--r--fs/smb/server/oplock.c70
-rw-r--r--fs/smb/server/oplock.h1
-rw-r--r--fs/smb/server/server.c6
-rw-r--r--fs/smb/server/server.h1
-rw-r--r--fs/smb/server/smb2misc.c2
-rw-r--r--fs/smb/server/smb2ops.c38
-rw-r--r--fs/smb/server/smb2pdu.c611
-rw-r--r--fs/smb/server/smb2pdu.h114
-rw-r--r--fs/smb/server/smb_common.c2
-rw-r--r--fs/smb/server/smb_common.h286
-rw-r--r--fs/smb/server/smbacl.c21
-rw-r--r--fs/smb/server/transport_ipc.c37
-rw-r--r--fs/smb/server/transport_rdma.c2239
-rw-r--r--fs/smb/server/transport_rdma.h49
-rw-r--r--fs/smb/server/transport_tcp.c172
-rw-r--r--fs/smb/server/transport_tcp.h1
-rw-r--r--fs/smb/server/vfs.c407
-rw-r--r--fs/smb/server/vfs.h15
-rw-r--r--fs/smb/server/vfs_cache.c129
-rw-r--r--fs/smb/server/vfs_cache.h3
-rw-r--r--fs/splice.c7
-rw-r--r--fs/squashfs/Kconfig21
-rw-r--r--fs/squashfs/block.c57
-rw-r--r--fs/squashfs/file.c144
-rw-r--r--fs/squashfs/inode.c41
-rw-r--r--fs/squashfs/squashfs.h1
-rw-r--r--fs/squashfs/squashfs_fs.h1
-rw-r--r--fs/squashfs/squashfs_fs_i.h2
-rw-r--r--fs/squashfs/super.c9
-rw-r--r--fs/stack.c4
-rw-r--r--fs/stat.c75
-rw-r--r--fs/super.c427
-rw-r--r--fs/sync.c19
-rw-r--r--fs/sysfs/dir.c2
-rw-r--r--fs/sysfs/file.c56
-rw-r--r--fs/sysfs/group.c42
-rw-r--r--fs/timerfd.c29
-rw-r--r--fs/tracefs/event_inode.c7
-rw-r--r--fs/tracefs/inode.c41
-rw-r--r--fs/ubifs/compress.c243
-rw-r--r--fs/ubifs/crypto.c4
-rw-r--r--fs/ubifs/file.c110
-rw-r--r--fs/ubifs/io.c13
-rw-r--r--fs/ubifs/ioctl.c4
-rw-r--r--fs/ubifs/journal.c13
-rw-r--r--fs/ubifs/lpt.c12
-rw-r--r--fs/ubifs/recovery.c4
-rw-r--r--fs/ubifs/super.c6
-rw-r--r--fs/ubifs/tnc_misc.c9
-rw-r--r--fs/ubifs/ubifs.h36
-rw-r--r--fs/udf/file.c2
-rw-r--r--fs/udf/inode.c45
-rw-r--r--fs/udf/super.c13
-rw-r--r--fs/udf/truncate.c2
-rw-r--r--fs/ufs/dir.c2
-rw-r--r--fs/ufs/file.c2
-rw-r--r--fs/ufs/inode.c18
-rw-r--r--fs/ufs/super.c307
-rw-r--r--fs/ufs/ufs.h9
-rw-r--r--fs/userfaultfd.c323
-rw-r--r--fs/utimes.c5
-rw-r--r--fs/vboxsf/dir.c25
-rw-r--r--fs/vboxsf/file.c60
-rw-r--r--fs/vboxsf/super.c2
-rw-r--r--fs/verity/Kconfig6
-rw-r--r--fs/verity/enable.c23
-rw-r--r--fs/verity/fsverity_private.h33
-rw-r--r--fs/verity/hash_algs.c195
-rw-r--r--fs/verity/measure.c1
-rw-r--r--fs/verity/open.c60
-rw-r--r--fs/verity/read_metadata.c1
-rw-r--r--fs/verity/verify.c181
-rw-r--r--fs/xattr.c43
-rw-r--r--fs/xfs/Kconfig36
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c7
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c46
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c52
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c25
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c7
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c36
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h6
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c32
-rw-r--r--fs/xfs/libxfs/xfs_btree.c35
-rw-r--r--fs/xfs/libxfs/xfs_btree.h41
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c8
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c2
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h118
-rw-r--r--fs/xfs/libxfs/xfs_exchmaps.c4
-rw-r--r--fs/xfs/libxfs/xfs_format.h2
-rw-r--r--fs/xfs/libxfs/xfs_group.c17
-rw-r--r--fs/xfs/libxfs/xfs_group.h9
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c37
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c24
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c4
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c3
-rw-r--r--fs/xfs/libxfs/xfs_inode_util.c11
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h180
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h6
-rw-r--r--fs/xfs/libxfs/xfs_log_rlimit.c4
-rw-r--r--fs/xfs/libxfs/xfs_metafile.c2
-rw-r--r--fs/xfs/libxfs/xfs_ondisk.h4
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h4
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c11
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c18
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c2
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c67
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c2
-rw-r--r--fs/xfs/libxfs/xfs_rtgroup.h20
-rw-r--r--fs/xfs/libxfs/xfs_rtrefcount_btree.c18
-rw-r--r--fs/xfs/libxfs/xfs_rtrmap_btree.c67
-rw-r--r--fs/xfs/libxfs/xfs_sb.c9
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c343
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.h25
-rw-r--r--fs/xfs/libxfs/xfs_zones.c1
-rw-r--r--fs/xfs/libxfs/xfs_zones.h7
-rw-r--r--fs/xfs/scrub/btree.c2
-rw-r--r--fs/xfs/scrub/common.c9
-rw-r--r--fs/xfs/scrub/common.h2
-rw-r--r--fs/xfs/scrub/cow_repair.c4
-rw-r--r--fs/xfs/scrub/dir_repair.c8
-rw-r--r--fs/xfs/scrub/fscounters.c7
-rw-r--r--fs/xfs/scrub/inode_repair.c2
-rw-r--r--fs/xfs/scrub/metapath.c16
-rw-r--r--fs/xfs/scrub/newbt.c9
-rw-r--r--fs/xfs/scrub/nlinks.c42
-rw-r--r--fs/xfs/scrub/nlinks_repair.c4
-rw-r--r--fs/xfs/scrub/orphanage.c18
-rw-r--r--fs/xfs/scrub/parent.c2
-rw-r--r--fs/xfs/scrub/parent_repair.c12
-rw-r--r--fs/xfs/scrub/quota.c8
-rw-r--r--fs/xfs/scrub/quota_repair.c18
-rw-r--r--fs/xfs/scrub/quotacheck.c15
-rw-r--r--fs/xfs/scrub/quotacheck_repair.c21
-rw-r--r--fs/xfs/scrub/rcbag_btree.c38
-rw-r--r--fs/xfs/scrub/reap.c620
-rw-r--r--fs/xfs/scrub/repair.c38
-rw-r--r--fs/xfs/scrub/repair.h12
-rw-r--r--fs/xfs/scrub/rmap_repair.c14
-rw-r--r--fs/xfs/scrub/rtrmap_repair.c14
-rw-r--r--fs/xfs/scrub/scrub.c7
-rw-r--r--fs/xfs/scrub/symlink_repair.c4
-rw-r--r--fs/xfs/scrub/trace.c1
-rw-r--r--fs/xfs/scrub/trace.h49
-rw-r--r--fs/xfs/scrub/xfarray.c2
-rw-r--r--fs/xfs/xfs_aops.c236
-rw-r--r--fs/xfs/xfs_attr_item.c150
-rw-r--r--fs/xfs/xfs_attr_item.h8
-rw-r--r--fs/xfs/xfs_bio_io.c30
-rw-r--r--fs/xfs/xfs_bmap_item.c28
-rw-r--r--fs/xfs/xfs_bmap_item.h3
-rw-r--r--fs/xfs/xfs_bmap_util.c2
-rw-r--r--fs/xfs/xfs_buf.c188
-rw-r--r--fs/xfs/xfs_buf.h14
-rw-r--r--fs/xfs/xfs_buf_item.c322
-rw-r--r--fs/xfs/xfs_buf_item.h8
-rw-r--r--fs/xfs/xfs_buf_item_recover.c48
-rw-r--r--fs/xfs/xfs_buf_mem.c2
-rw-r--r--fs/xfs/xfs_discard.c62
-rw-r--r--fs/xfs/xfs_dquot.c150
-rw-r--r--fs/xfs/xfs_dquot.h22
-rw-r--r--fs/xfs/xfs_dquot_item.c6
-rw-r--r--fs/xfs/xfs_dquot_item_recover.c20
-rw-r--r--fs/xfs/xfs_error.c216
-rw-r--r--fs/xfs/xfs_error.h47
-rw-r--r--fs/xfs/xfs_exchmaps_item.c8
-rw-r--r--fs/xfs/xfs_extent_busy.h8
-rw-r--r--fs/xfs/xfs_extfree_item.c69
-rw-r--r--fs/xfs/xfs_extfree_item.h7
-rw-r--r--fs/xfs/xfs_file.c272
-rw-r--r--fs/xfs/xfs_filestream.c15
-rw-r--r--fs/xfs/xfs_fsmap.c55
-rw-r--r--fs/xfs/xfs_globals.c4
-rw-r--r--fs/xfs/xfs_handle.c56
-rw-r--r--fs/xfs/xfs_health.c4
-rw-r--r--fs/xfs/xfs_icache.c58
-rw-r--r--fs/xfs/xfs_icreate_item.c2
-rw-r--r--fs/xfs/xfs_inode.c165
-rw-r--r--fs/xfs/xfs_inode.h21
-rw-r--r--fs/xfs/xfs_inode_item.c146
-rw-r--r--fs/xfs/xfs_inode_item.h14
-rw-r--r--fs/xfs/xfs_inode_item_recover.c26
-rw-r--r--fs/xfs/xfs_ioctl.c53
-rw-r--r--fs/xfs/xfs_ioctl.h4
-rw-r--r--fs/xfs/xfs_iomap.c315
-rw-r--r--fs/xfs/xfs_iomap.h2
-rw-r--r--fs/xfs/xfs_iops.c97
-rw-r--r--fs/xfs/xfs_iops.h3
-rw-r--r--fs/xfs/xfs_itable.c26
-rw-r--r--fs/xfs/xfs_itable.h10
-rw-r--r--fs/xfs/xfs_iwalk.c11
-rw-r--r--fs/xfs/xfs_linux.h2
-rw-r--r--fs/xfs/xfs_log.c286
-rw-r--r--fs/xfs/xfs_log.h53
-rw-r--r--fs/xfs/xfs_log_cil.c85
-rw-r--r--fs/xfs/xfs_log_priv.h54
-rw-r--r--fs/xfs/xfs_log_recover.c95
-rw-r--r--fs/xfs/xfs_message.c16
-rw-r--r--fs/xfs/xfs_message.h4
-rw-r--r--fs/xfs/xfs_mount.c189
-rw-r--r--fs/xfs/xfs_mount.h56
-rw-r--r--fs/xfs/xfs_mru_cache.c35
-rw-r--r--fs/xfs/xfs_notify_failure.c17
-rw-r--r--fs/xfs/xfs_pnfs.c2
-rw-r--r--fs/xfs/xfs_qm.c246
-rw-r--r--fs/xfs/xfs_qm.h2
-rw-r--r--fs/xfs/xfs_qm_bhv.c4
-rw-r--r--fs/xfs/xfs_qm_syscalls.c10
-rw-r--r--fs/xfs/xfs_quotaops.c2
-rw-r--r--fs/xfs/xfs_refcount_item.c44
-rw-r--r--fs/xfs/xfs_refcount_item.h3
-rw-r--r--fs/xfs/xfs_reflink.c149
-rw-r--r--fs/xfs/xfs_reflink.h8
-rw-r--r--fs/xfs/xfs_rmap_item.c44
-rw-r--r--fs/xfs/xfs_rmap_item.h3
-rw-r--r--fs/xfs/xfs_rtalloc.c15
-rw-r--r--fs/xfs/xfs_super.c257
-rw-r--r--fs/xfs/xfs_sysctl.c29
-rw-r--r--fs/xfs/xfs_sysctl.h5
-rw-r--r--fs/xfs/xfs_sysfs.c40
-rw-r--r--fs/xfs/xfs_trace.h213
-rw-r--r--fs/xfs/xfs_trans.c234
-rw-r--r--fs/xfs/xfs_trans.h4
-rw-r--r--fs/xfs/xfs_trans_ail.c41
-rw-r--r--fs/xfs/xfs_trans_dquot.c18
-rw-r--r--fs/xfs/xfs_trans_priv.h28
-rw-r--r--fs/xfs/xfs_xattr.c2
-rw-r--r--fs/xfs/xfs_zone_alloc.c428
-rw-r--r--fs/xfs/xfs_zone_alloc.h4
-rw-r--r--fs/xfs/xfs_zone_gc.c169
-rw-r--r--fs/xfs/xfs_zone_info.c2
-rw-r--r--fs/xfs/xfs_zone_priv.h19
-rw-r--r--fs/xfs/xfs_zone_space_resv.c33
-rw-r--r--fs/zonefs/file.c57
-rw-r--r--fs/zonefs/super.c43
1407 files changed, 79462 insertions, 160512 deletions
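
The bulk of the diff below converts fs/9p's mount-option parsing from the
legacy match_token parser to the new mount API (fs_context/fs_parser). For
orientation, here is a minimal sketch of that pattern for a hypothetical
filesystem: the fsparam_*() helpers, fs_parse(), and struct fs_parse_result
are the real <linux/fs_parser.h> API, while the "myfs" names and options are
invented for illustration.

    #include <linux/fs_context.h>
    #include <linux/fs_parser.h>

    enum { Opt_mode, Opt_verbose };

    static const struct fs_parameter_spec myfs_param_spec[] = {
            fsparam_u32 ("mode",    Opt_mode),
            fsparam_flag("verbose", Opt_verbose),
            {}
    };

    struct myfs_context {
            u32  mode;
            bool verbose;
    };

    static int myfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
    {
            struct myfs_context *ctx = fc->fs_private;
            struct fs_parse_result result;
            int opt;

            /* fs_parse() matches @param against the spec table and performs
             * the type conversion declared there (u32, flag, ...). */
            opt = fs_parse(fc, myfs_param_spec, param, &result);
            if (opt < 0)
                    return opt;

            switch (opt) {
            case Opt_mode:
                    ctx->mode = result.uint_32;
                    break;
            case Opt_verbose:
                    ctx->verbose = true;
                    break;
            }
            return 0;
    }

As in the 9p conversion below, a filesystem that wants to tolerate unknown
options can treat -ENOPARAM from fs_parse() as success instead of an error.
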
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index eed551d8555f..633da5e37299 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -6,6 +6,7 @@
#include <linux/module.h>
#include <linux/fs.h>
+#include <linux/fs_struct.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
#include <linux/slab.h>
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 77e9c4387c1d..057487efaaeb 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -13,7 +13,8 @@
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/cred.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
+#include <linux/fs_context.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <net/9p/9p.h>
@@ -33,6 +34,10 @@ struct kmem_cache *v9fs_inode_cache;
*/
enum {
+ /* Mount-point source; we need to handle this explicitly because
+ * the code below accepts unknown args and the vfs layer only handles
+ * source if we rejected it as EINVAL */
+ Opt_source,
/* Options that take integer arguments */
Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid,
/* String options */
@@ -43,27 +48,71 @@ enum {
Opt_access, Opt_posixacl,
/* Lock timeout option */
Opt_locktimeout,
- /* Error token */
- Opt_err
+
+ /* Client options */
+ Opt_msize, Opt_trans, Opt_legacy, Opt_version,
+
+ /* fd transport options */
+ /* Options that take integer arguments */
+ Opt_rfdno, Opt_wfdno,
+ /* Options that take no arguments */
+
+ /* rdma transport options */
+ /* Options that take integer arguments */
+ Opt_rq_depth, Opt_sq_depth, Opt_timeout,
+
+ /* Options for both fd and rdma transports */
+ Opt_port, Opt_privport,
+};
+
+static const struct constant_table p9_versions[] = {
+ { "9p2000", p9_proto_legacy },
+ { "9p2000.u", p9_proto_2000u },
+ { "9p2000.L", p9_proto_2000L },
+ {}
};
-static const match_table_t tokens = {
- {Opt_debug, "debug=%x"},
- {Opt_dfltuid, "dfltuid=%u"},
- {Opt_dfltgid, "dfltgid=%u"},
- {Opt_afid, "afid=%u"},
- {Opt_uname, "uname=%s"},
- {Opt_remotename, "aname=%s"},
- {Opt_nodevmap, "nodevmap"},
- {Opt_noxattr, "noxattr"},
- {Opt_directio, "directio"},
- {Opt_ignoreqv, "ignoreqv"},
- {Opt_cache, "cache=%s"},
- {Opt_cachetag, "cachetag=%s"},
- {Opt_access, "access=%s"},
- {Opt_posixacl, "posixacl"},
- {Opt_locktimeout, "locktimeout=%u"},
- {Opt_err, NULL}
+/*
+ * This structure contains all parameters used for the core code,
+ * the client, and all the transports.
+ */
+const struct fs_parameter_spec v9fs_param_spec[] = {
+ fsparam_string ("source", Opt_source),
+ fsparam_u32hex ("debug", Opt_debug),
+ fsparam_uid ("dfltuid", Opt_dfltuid),
+ fsparam_gid ("dfltgid", Opt_dfltgid),
+ fsparam_u32 ("afid", Opt_afid),
+ fsparam_string ("uname", Opt_uname),
+ fsparam_string ("aname", Opt_remotename),
+ fsparam_flag ("nodevmap", Opt_nodevmap),
+ fsparam_flag ("noxattr", Opt_noxattr),
+ fsparam_flag ("directio", Opt_directio),
+ fsparam_flag ("ignoreqv", Opt_ignoreqv),
+ fsparam_string ("cache", Opt_cache),
+ fsparam_string ("cachetag", Opt_cachetag),
+ fsparam_string ("access", Opt_access),
+ fsparam_flag ("posixacl", Opt_posixacl),
+ fsparam_u32 ("locktimeout", Opt_locktimeout),
+
+ /* client options */
+ fsparam_u32 ("msize", Opt_msize),
+ fsparam_flag ("noextend", Opt_legacy),
+ fsparam_string ("trans", Opt_trans),
+ fsparam_enum ("version", Opt_version, p9_versions),
+
+ /* fd transport options */
+ fsparam_u32 ("rfdno", Opt_rfdno),
+ fsparam_u32 ("wfdno", Opt_wfdno),
+
+ /* rdma transport options */
+ fsparam_u32 ("sq", Opt_sq_depth),
+ fsparam_u32 ("rq", Opt_rq_depth),
+ fsparam_u32 ("timeout", Opt_timeout),
+
+ /* fd and rdma transport options */
+ fsparam_u32 ("port", Opt_port),
+ fsparam_flag ("privport", Opt_privport),
+ {}
};
/* Interpret mount options for cache mode */
@@ -101,7 +150,7 @@ int v9fs_show_options(struct seq_file *m, struct dentry *root)
struct v9fs_session_info *v9ses = root->d_sb->s_fs_info;
if (v9ses->debug)
- seq_printf(m, ",debug=%x", v9ses->debug);
+ seq_printf(m, ",debug=%#x", v9ses->debug);
if (!uid_eq(v9ses->dfltuid, V9FS_DEFUID))
seq_printf(m, ",dfltuid=%u",
from_kuid_munged(&init_user_ns, v9ses->dfltuid));
@@ -117,7 +166,7 @@ int v9fs_show_options(struct seq_file *m, struct dentry *root)
if (v9ses->nodev)
seq_puts(m, ",nodevmap");
if (v9ses->cache)
- seq_printf(m, ",cache=%x", v9ses->cache);
+ seq_printf(m, ",cache=%#x", v9ses->cache);
#ifdef CONFIG_9P_FSCACHE
if (v9ses->cachetag && (v9ses->cache & CACHE_FSCACHE))
seq_printf(m, ",cachetag=%s", v9ses->cachetag);
@@ -153,267 +202,254 @@ int v9fs_show_options(struct seq_file *m, struct dentry *root)
}
/**
- * v9fs_parse_options - parse mount options into session structure
- * @v9ses: existing v9fs session information
- * @opts: The mount option string
+ * v9fs_parse_param - parse a mount option into the filesystem context
+ * @fc: the filesystem context
+ * @param: the parameter to parse
*
* Return 0 upon success, -ERRNO upon failure.
*/
-
-static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
+int v9fs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *options, *tmp_options;
- substring_t args[MAX_OPT_ARGS];
- char *p;
- int option = 0;
+ struct v9fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
char *s;
- int ret = 0;
-
- /* setup defaults */
- v9ses->afid = ~0;
- v9ses->debug = 0;
- v9ses->cache = CACHE_NONE;
-#ifdef CONFIG_9P_FSCACHE
- v9ses->cachetag = NULL;
-#endif
- v9ses->session_lock_timeout = P9_LOCK_TIMEOUT;
-
- if (!opts)
- return 0;
+ int r;
+ int opt;
+ struct p9_client_opts *clnt = &ctx->client_opts;
+ struct p9_fd_opts *fd_opts = &ctx->fd_opts;
+ struct p9_rdma_opts *rdma_opts = &ctx->rdma_opts;
+ struct p9_session_opts *session_opts = &ctx->session_opts;
+
+ opt = fs_parse(fc, v9fs_param_spec, param, &result);
+ if (opt < 0) {
+ /*
+ * We might like to report bad mount options here, but
+ * traditionally 9p has ignored unknown mount options
+ */
+ if (opt == -ENOPARAM)
+ return 0;
- tmp_options = kstrdup(opts, GFP_KERNEL);
- if (!tmp_options) {
- ret = -ENOMEM;
- goto fail_option_alloc;
+ return opt;
}
- options = tmp_options;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token, r;
-
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_debug:
- r = match_int(&args[0], &option);
- if (r < 0) {
- p9_debug(P9_DEBUG_ERROR,
- "integer field, but no integer?\n");
- ret = r;
- } else {
- v9ses->debug = option;
+
+ switch (opt) {
+ case Opt_source:
+ if (fc->source) {
+ pr_info("p9: multiple sources not supported\n");
+ return -EINVAL;
+ }
+ fc->source = param->string;
+ param->string = NULL;
+ break;
+ case Opt_debug:
+ session_opts->debug = result.uint_32;
#ifdef CONFIG_NET_9P_DEBUG
- p9_debug_level = option;
+ p9_debug_level = result.uint_32;
#endif
- }
- break;
-
- case Opt_dfltuid:
- r = match_int(&args[0], &option);
- if (r < 0) {
- p9_debug(P9_DEBUG_ERROR,
- "integer field, but no integer?\n");
- ret = r;
- continue;
- }
- v9ses->dfltuid = make_kuid(current_user_ns(), option);
- if (!uid_valid(v9ses->dfltuid)) {
- p9_debug(P9_DEBUG_ERROR,
- "uid field, but not a uid?\n");
- ret = -EINVAL;
- }
- break;
- case Opt_dfltgid:
- r = match_int(&args[0], &option);
- if (r < 0) {
- p9_debug(P9_DEBUG_ERROR,
- "integer field, but no integer?\n");
- ret = r;
- continue;
- }
- v9ses->dfltgid = make_kgid(current_user_ns(), option);
- if (!gid_valid(v9ses->dfltgid)) {
- p9_debug(P9_DEBUG_ERROR,
- "gid field, but not a gid?\n");
- ret = -EINVAL;
- }
- break;
- case Opt_afid:
- r = match_int(&args[0], &option);
- if (r < 0) {
- p9_debug(P9_DEBUG_ERROR,
- "integer field, but no integer?\n");
- ret = r;
- } else {
- v9ses->afid = option;
- }
- break;
- case Opt_uname:
- kfree(v9ses->uname);
- v9ses->uname = match_strdup(&args[0]);
- if (!v9ses->uname) {
- ret = -ENOMEM;
- goto free_and_return;
- }
- break;
- case Opt_remotename:
- kfree(v9ses->aname);
- v9ses->aname = match_strdup(&args[0]);
- if (!v9ses->aname) {
- ret = -ENOMEM;
- goto free_and_return;
- }
- break;
- case Opt_nodevmap:
- v9ses->nodev = 1;
- break;
- case Opt_noxattr:
- v9ses->flags |= V9FS_NO_XATTR;
- break;
- case Opt_directio:
- v9ses->flags |= V9FS_DIRECT_IO;
- break;
- case Opt_ignoreqv:
- v9ses->flags |= V9FS_IGNORE_QV;
- break;
- case Opt_cachetag:
+ break;
+
+ case Opt_dfltuid:
+ session_opts->dfltuid = result.uid;
+ break;
+ case Opt_dfltgid:
+ session_opts->dfltgid = result.gid;
+ break;
+ case Opt_afid:
+ session_opts->afid = result.uint_32;
+ break;
+ case Opt_uname:
+ kfree(session_opts->uname);
+ session_opts->uname = param->string;
+ param->string = NULL;
+ break;
+ case Opt_remotename:
+ kfree(session_opts->aname);
+ session_opts->aname = param->string;
+ param->string = NULL;
+ break;
+ case Opt_nodevmap:
+ session_opts->nodev = 1;
+ break;
+ case Opt_noxattr:
+ session_opts->flags |= V9FS_NO_XATTR;
+ break;
+ case Opt_directio:
+ session_opts->flags |= V9FS_DIRECT_IO;
+ break;
+ case Opt_ignoreqv:
+ session_opts->flags |= V9FS_IGNORE_QV;
+ break;
+ case Opt_cachetag:
#ifdef CONFIG_9P_FSCACHE
- kfree(v9ses->cachetag);
- v9ses->cachetag = match_strdup(&args[0]);
- if (!v9ses->cachetag) {
- ret = -ENOMEM;
- goto free_and_return;
- }
+ kfree(session_opts->cachetag);
+ session_opts->cachetag = param->string;
+ param->string = NULL;
#endif
- break;
- case Opt_cache:
- s = match_strdup(&args[0]);
- if (!s) {
- ret = -ENOMEM;
- p9_debug(P9_DEBUG_ERROR,
- "problem allocating copy of cache arg\n");
- goto free_and_return;
- }
- r = get_cache_mode(s);
- if (r < 0)
- ret = r;
- else
- v9ses->cache = r;
-
- kfree(s);
- break;
-
- case Opt_access:
- s = match_strdup(&args[0]);
- if (!s) {
- ret = -ENOMEM;
- p9_debug(P9_DEBUG_ERROR,
- "problem allocating copy of access arg\n");
- goto free_and_return;
+ break;
+ case Opt_cache:
+ r = get_cache_mode(param->string);
+ if (r < 0)
+ return r;
+ session_opts->cache = r;
+ break;
+ case Opt_access:
+ s = param->string;
+ session_opts->flags &= ~V9FS_ACCESS_MASK;
+ if (strcmp(s, "user") == 0) {
+ session_opts->flags |= V9FS_ACCESS_USER;
+ } else if (strcmp(s, "any") == 0) {
+ session_opts->flags |= V9FS_ACCESS_ANY;
+ } else if (strcmp(s, "client") == 0) {
+ session_opts->flags |= V9FS_ACCESS_CLIENT;
+ } else {
+ uid_t uid;
+
+ session_opts->flags |= V9FS_ACCESS_SINGLE;
+ r = kstrtouint(s, 10, &uid);
+ if (r) {
+ pr_info("Unknown access argument %s: %d\n",
+ param->string, r);
+ return r;
}
-
- v9ses->flags &= ~V9FS_ACCESS_MASK;
- if (strcmp(s, "user") == 0)
- v9ses->flags |= V9FS_ACCESS_USER;
- else if (strcmp(s, "any") == 0)
- v9ses->flags |= V9FS_ACCESS_ANY;
- else if (strcmp(s, "client") == 0) {
- v9ses->flags |= V9FS_ACCESS_CLIENT;
- } else {
- uid_t uid;
-
- v9ses->flags |= V9FS_ACCESS_SINGLE;
- r = kstrtouint(s, 10, &uid);
- if (r) {
- ret = r;
- pr_info("Unknown access argument %s: %d\n",
- s, r);
- kfree(s);
- continue;
- }
- v9ses->uid = make_kuid(current_user_ns(), uid);
- if (!uid_valid(v9ses->uid)) {
- ret = -EINVAL;
- pr_info("Unknown uid %s\n", s);
- }
+ session_opts->uid = make_kuid(current_user_ns(), uid);
+ if (!uid_valid(session_opts->uid)) {
+ pr_info("Unknown uid %s\n", s);
+ return -EINVAL;
}
+ }
+ break;
- kfree(s);
- break;
-
- case Opt_posixacl:
+ case Opt_posixacl:
#ifdef CONFIG_9P_FS_POSIX_ACL
- v9ses->flags |= V9FS_POSIX_ACL;
+ session_opts->flags |= V9FS_POSIX_ACL;
#else
- p9_debug(P9_DEBUG_ERROR,
- "Not defined CONFIG_9P_FS_POSIX_ACL. Ignoring posixacl option\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "Not defined CONFIG_9P_FS_POSIX_ACL. Ignoring posixacl option\n");
#endif
- break;
-
- case Opt_locktimeout:
- r = match_int(&args[0], &option);
- if (r < 0) {
- p9_debug(P9_DEBUG_ERROR,
- "integer field, but no integer?\n");
- ret = r;
- continue;
- }
- if (option < 1) {
- p9_debug(P9_DEBUG_ERROR,
- "locktimeout must be a greater than zero integer.\n");
- ret = -EINVAL;
- continue;
- }
- v9ses->session_lock_timeout = (long)option * HZ;
- break;
+ break;
+
+ case Opt_locktimeout:
+ if (result.uint_32 < 1) {
+ p9_debug(P9_DEBUG_ERROR,
+			 "locktimeout must be an integer greater than zero.\n");
+ return -EINVAL;
+ }
+ session_opts->session_lock_timeout = (long)result.uint_32 * HZ;
+ break;
- default:
- continue;
+ /* Options for client */
+ case Opt_msize:
+ if (result.uint_32 < 4096) {
+ p9_debug(P9_DEBUG_ERROR, "msize should be at least 4k\n");
+ return -EINVAL;
+ }
+ if (result.uint_32 > INT_MAX) {
+ p9_debug(P9_DEBUG_ERROR, "msize too big\n");
+ return -EINVAL;
}
+ clnt->msize = result.uint_32;
+ break;
+ case Opt_trans:
+ v9fs_put_trans(clnt->trans_mod);
+ clnt->trans_mod = v9fs_get_trans_by_name(param->string);
+ if (!clnt->trans_mod) {
+ pr_info("Could not find request transport: %s\n",
+ param->string);
+ return -EINVAL;
+ }
+ break;
+ case Opt_legacy:
+ clnt->proto_version = p9_proto_legacy;
+ break;
+ case Opt_version:
+ clnt->proto_version = result.uint_32;
+ p9_debug(P9_DEBUG_9P, "Protocol version: %s\n", param->string);
+ break;
+ /* Options for fd transport */
+ case Opt_rfdno:
+ fd_opts->rfd = result.uint_32;
+ break;
+ case Opt_wfdno:
+ fd_opts->wfd = result.uint_32;
+ break;
+ /* Options for rdma transport */
+ case Opt_sq_depth:
+ rdma_opts->sq_depth = result.uint_32;
+ break;
+ case Opt_rq_depth:
+ rdma_opts->rq_depth = result.uint_32;
+ break;
+ case Opt_timeout:
+ rdma_opts->timeout = result.uint_32;
+ break;
+ /* Options for both fd and rdma transports */
+ case Opt_port:
+ fd_opts->port = result.uint_32;
+ rdma_opts->port = result.uint_32;
+ break;
+ case Opt_privport:
+ fd_opts->privport = true;
+		rdma_opts->privport = true;
+ break;
}
-free_and_return:
- kfree(tmp_options);
-fail_option_alloc:
- return ret;
+ return 0;
+}
+
+static void v9fs_apply_options(struct v9fs_session_info *v9ses,
+ struct fs_context *fc)
+{
+ struct v9fs_context *ctx = fc->fs_private;
+
+ v9ses->debug = ctx->session_opts.debug;
+ v9ses->dfltuid = ctx->session_opts.dfltuid;
+ v9ses->dfltgid = ctx->session_opts.dfltgid;
+ v9ses->afid = ctx->session_opts.afid;
+ v9ses->uname = ctx->session_opts.uname;
+ ctx->session_opts.uname = NULL;
+ v9ses->aname = ctx->session_opts.aname;
+ ctx->session_opts.aname = NULL;
+ v9ses->nodev = ctx->session_opts.nodev;
+ /*
+ * Note that we must |= flags here as session_init already
+ * set basic flags. This adds in flags from parsed options.
+ */
+ v9ses->flags |= ctx->session_opts.flags;
+#ifdef CONFIG_9P_FSCACHE
+ v9ses->cachetag = ctx->session_opts.cachetag;
+ ctx->session_opts.cachetag = NULL;
+#endif
+ v9ses->cache = ctx->session_opts.cache;
+ v9ses->uid = ctx->session_opts.uid;
+ v9ses->session_lock_timeout = ctx->session_opts.session_lock_timeout;
}
/**
* v9fs_session_init - initialize session
* @v9ses: session information structure
- * @dev_name: device being mounted
- * @data: options
+ * @fc: the filesystem mount context
*
*/
struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
- const char *dev_name, char *data)
+ struct fs_context *fc)
{
struct p9_fid *fid;
int rc = -ENOMEM;
- v9ses->uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL);
- if (!v9ses->uname)
- goto err_names;
-
- v9ses->aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL);
- if (!v9ses->aname)
- goto err_names;
init_rwsem(&v9ses->rename_sem);
- v9ses->uid = INVALID_UID;
- v9ses->dfltuid = V9FS_DEFUID;
- v9ses->dfltgid = V9FS_DEFGID;
-
- v9ses->clnt = p9_client_create(dev_name, data);
+ v9ses->clnt = p9_client_create(fc);
if (IS_ERR(v9ses->clnt)) {
rc = PTR_ERR(v9ses->clnt);
p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n");
goto err_names;
}
+ /*
+ * Initialize flags on the real v9ses. v9fs_apply_options below
+ * will |= the additional flags from parsed options.
+ */
v9ses->flags = V9FS_ACCESS_USER;
if (p9_is_proto_dotl(v9ses->clnt)) {
@@ -423,9 +459,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
v9ses->flags |= V9FS_PROTO_2000U;
}
- rc = v9fs_parse_options(v9ses, data);
- if (rc < 0)
- goto err_clnt;
+ v9fs_apply_options(v9ses, fc);
v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
@@ -438,8 +472,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
v9ses->flags &= ~V9FS_ACCESS_MASK;
v9ses->flags |= V9FS_ACCESS_USER;
}
- /*FIXME !! */
- /* for legacy mode, fall back to V9FS_ACCESS_ANY */
+ /* FIXME: for legacy mode, fall back to V9FS_ACCESS_ANY */
if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) &&
((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
@@ -450,7 +483,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
if (!v9fs_proto_dotl(v9ses) ||
!((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
/*
- * We support ACL checks on clinet only if the protocol is
+ * We support ACL checks on client only if the protocol is
* 9P2000.L and access is V9FS_ACCESS_CLIENT.
*/
v9ses->flags &= ~V9FS_ACL_MASK;
@@ -472,7 +505,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
#ifdef CONFIG_9P_FSCACHE
/* register the session for caching */
if (v9ses->cache & CACHE_FSCACHE) {
- rc = v9fs_cache_session_get_cookie(v9ses, dev_name);
+ rc = v9fs_cache_session_get_cookie(v9ses, fc->source);
if (rc < 0)
goto err_clnt;
}
@@ -561,7 +594,7 @@ static ssize_t caches_show(struct kobject *kobj,
spin_lock(&v9fs_sessionlist_lock);
list_for_each_entry(v9ses, &v9fs_sessionlist, slist) {
if (v9ses->cachetag) {
- n = snprintf(buf, limit, "%s\n", v9ses->cachetag);
+ n = snprintf(buf + count, limit, "%s\n", v9ses->cachetag);
if (n < 0) {
count = n;
break;
@@ -597,13 +630,16 @@ static const struct attribute_group v9fs_attr_group = {
static int __init v9fs_sysfs_init(void)
{
+ int ret;
+
v9fs_kobj = kobject_create_and_add("9p", fs_kobj);
if (!v9fs_kobj)
return -ENOMEM;
- if (sysfs_create_group(v9fs_kobj, &v9fs_attr_group)) {
+ ret = sysfs_create_group(v9fs_kobj, &v9fs_attr_group);
+ if (ret) {
kobject_put(v9fs_kobj);
- return -ENOMEM;
+ return ret;
}
return 0;
@@ -669,7 +705,7 @@ static int __init init_v9fs(void)
int err;
pr_info("Installing v9fs 9p2000 file system support\n");
- /* TODO: Setup list of registered trasnport modules */
+ /* TODO: Setup list of registered transport modules */
err = v9fs_init_inode_cache();
if (err < 0) {
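
The v9fs_parse_param() switch above pairs with a v9fs_param_spec table that
fs_parse() consults to tokenise each mount option; the table itself sits
outside this hunk. As a minimal sketch (illustrative only; entry names mirror
the Opt_* cases handled above, and the real table in this patch may differ),
such a spec for a subset of the options could look like:

	static const struct fs_parameter_spec v9fs_param_spec_sketch[] = {
		fsparam_string	("source",	Opt_source),
		fsparam_u32	("debug",	Opt_debug),
		fsparam_uid	("dfltuid",	Opt_dfltuid),	/* feeds result.uid */
		fsparam_gid	("dfltgid",	Opt_dfltgid),	/* feeds result.gid */
		fsparam_u32	("afid",	Opt_afid),
		fsparam_string	("uname",	Opt_uname),
		fsparam_string	("aname",	Opt_remotename),
		fsparam_flag	("nodevmap",	Opt_nodevmap),
		fsparam_string	("cache",	Opt_cache),
		fsparam_string	("access",	Opt_access),
		fsparam_u32	("msize",	Opt_msize),
		fsparam_string	("trans",	Opt_trans),
		fsparam_u32	("port",	Opt_port),
		fsparam_flag	("privport",	Opt_privport),
		{}
	};
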
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index f28bc763847a..6a12445d3858 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -10,6 +10,9 @@
#include <linux/backing-dev.h>
#include <linux/netfs.h>
+#include <linux/fs_parser.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
/**
* enum p9_session_flags - option flags for each 9P session
@@ -163,11 +166,13 @@ static inline struct fscache_volume *v9fs_session_cache(struct v9fs_session_info
#endif
}
+extern const struct fs_parameter_spec v9fs_param_spec[];
+extern int v9fs_parse_param(struct fs_context *fc, struct fs_parameter *param);
extern int v9fs_show_options(struct seq_file *m, struct dentry *root);
struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
- const char *dev_name, char *data);
+ struct fs_context *fc);
extern void v9fs_session_close(struct v9fs_session_info *v9ses);
extern void v9fs_session_cancel(struct v9fs_session_info *v9ses);
extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 32619d146cbc..862164181bac 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -59,7 +59,7 @@ static void v9fs_issue_write(struct netfs_io_subrequest *subreq)
len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err);
if (len > 0)
__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
- netfs_write_subrequest_terminated(subreq, len ?: err, false);
+ netfs_write_subrequest_terminated(subreq, len ?: err);
}
/**
@@ -77,7 +77,8 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
/* if we just extended the file size, any portion not in
* cache won't be on server and is zeroes */
- if (subreq->rreq->origin != NETFS_DIO_READ)
+ if (subreq->rreq->origin != NETFS_UNBUFFERED_READ &&
+ subreq->rreq->origin != NETFS_DIO_READ)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
if (pos + total >= i_size_read(rreq->inode))
__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
@@ -164,4 +165,5 @@ const struct address_space_operations v9fs_addr_operations = {
.invalidate_folio = netfs_invalidate_folio,
.direct_IO = noop_direct_IO,
.writepages = netfs_writepages,
+ .migrate_folio = filemap_migrate_folio,
};
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 5061f192eafd..c5bf74d547e8 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -80,8 +80,13 @@ static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
struct v9fs_session_info *v9ses;
fid = v9fs_fid_lookup(dentry);
- if (IS_ERR(fid))
+ if (IS_ERR(fid)) {
+		p9_debug(P9_DEBUG_VFS,
+			 "v9fs_fid_lookup: dentry = %pd (%p), got error %pe\n",
+			 dentry, dentry, fid);
return PTR_ERR(fid);
+ }
v9ses = v9fs_inode2v9ses(inode);
if (v9fs_proto_dotl(v9ses))
@@ -90,12 +95,25 @@ static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
retval = v9fs_refresh_inode(fid, inode);
p9_fid_put(fid);
- if (retval == -ENOENT)
+ if (retval == -ENOENT) {
+ p9_debug(P9_DEBUG_VFS, "dentry: %pd (%p) invalidated due to ENOENT\n",
+ dentry, dentry);
return 0;
- if (retval < 0)
+ }
+ if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) {
+ p9_debug(P9_DEBUG_VFS, "dentry: %pd (%p) invalidated due to type change\n",
+ dentry, dentry);
+ return 0;
+ }
+ if (retval < 0) {
+ p9_debug(P9_DEBUG_VFS,
+ "refresh inode: dentry = %pd (%p), got error %pe\n",
+ dentry, dentry, ERR_PTR(retval));
return retval;
+ }
}
out_valid:
+ p9_debug(P9_DEBUG_VFS, "dentry: %pd (%p) is valid\n", dentry, dentry);
return 1;
}
@@ -127,7 +145,6 @@ const struct dentry_operations v9fs_cached_dentry_operations = {
};
const struct dentry_operations v9fs_dentry_operations = {
- .d_delete = always_delete_dentry,
.d_release = v9fs_dentry_release,
.d_unalias_trylock = v9fs_dentry_unalias_trylock,
.d_unalias_unlock = v9fs_dentry_unalias_unlock,
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 348cc90bf9c5..6f3880208587 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -43,14 +43,18 @@ int v9fs_file_open(struct inode *inode, struct file *file)
struct v9fs_session_info *v9ses;
struct p9_fid *fid;
int omode;
+ int o_append;
p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
v9ses = v9fs_inode2v9ses(inode);
- if (v9fs_proto_dotl(v9ses))
+ if (v9fs_proto_dotl(v9ses)) {
omode = v9fs_open_to_dotl_flags(file->f_flags);
- else
+ o_append = P9_DOTL_APPEND;
+ } else {
omode = v9fs_uflags2omode(file->f_flags,
v9fs_proto_dotu(v9ses));
+ o_append = P9_OAPPEND;
+ }
fid = file->private_data;
if (!fid) {
fid = v9fs_fid_clone(file_dentry(file));
@@ -58,9 +62,10 @@ int v9fs_file_open(struct inode *inode, struct file *file)
return PTR_ERR(fid);
if ((v9ses->cache & CACHE_WRITEBACK) && (omode & P9_OWRITE)) {
- int writeback_omode = (omode & ~P9_OWRITE) | P9_ORDWR;
+ int writeback_omode = (omode & ~(P9_OWRITE | o_append)) | P9_ORDWR;
p9_debug(P9_DEBUG_CACHE, "write-only file with writeback enabled, try opening O_RDWR\n");
+
err = p9_client_open(fid, writeback_omode);
if (err < 0) {
p9_debug(P9_DEBUG_CACHE, "could not open O_RDWR, disabling caches\n");
@@ -454,9 +459,10 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
}
static int
-v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma)
+v9fs_file_mmap_prepare(struct vm_area_desc *desc)
{
int retval;
+ struct file *filp = desc->file;
struct inode *inode = file_inode(filp);
struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
@@ -464,12 +470,12 @@ v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma)
if (!(v9ses->cache & CACHE_WRITEBACK)) {
p9_debug(P9_DEBUG_CACHE, "(read-only mmap mode)");
- return generic_file_readonly_mmap(filp, vma);
+ return generic_file_readonly_mmap_prepare(desc);
}
- retval = generic_file_mmap(filp, vma);
+ retval = generic_file_mmap_prepare(desc);
if (!retval)
- vma->vm_ops = &v9fs_mmap_file_vm_ops;
+ desc->vm_ops = &v9fs_mmap_file_vm_ops;
return retval;
}
@@ -482,24 +488,15 @@ v9fs_vm_page_mkwrite(struct vm_fault *vmf)
static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
{
- struct inode *inode;
-
- struct writeback_control wbc = {
- .nr_to_write = LONG_MAX,
- .sync_mode = WB_SYNC_ALL,
- .range_start = (loff_t)vma->vm_pgoff * PAGE_SIZE,
- /* absolute end, byte at end included */
- .range_end = (loff_t)vma->vm_pgoff * PAGE_SIZE +
- (vma->vm_end - vma->vm_start - 1),
- };
-
if (!(vma->vm_flags & VM_SHARED))
return;
p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma);
- inode = file_inode(vma->vm_file);
- filemap_fdatawrite_wbc(inode->i_mapping, &wbc);
+ filemap_fdatawrite_range(file_inode(vma->vm_file)->i_mapping,
+ (loff_t)vma->vm_pgoff * PAGE_SIZE,
+ (loff_t)vma->vm_pgoff * PAGE_SIZE +
+ (vma->vm_end - vma->vm_start - 1));
}
static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
@@ -516,7 +513,7 @@ const struct file_operations v9fs_file_operations = {
.open = v9fs_file_open,
.release = v9fs_dir_release,
.lock = v9fs_file_lock,
- .mmap = generic_file_readonly_mmap,
+ .mmap_prepare = generic_file_readonly_mmap_prepare,
.splice_read = v9fs_file_splice_read,
.splice_write = iter_file_splice_write,
.fsync = v9fs_file_fsync,
@@ -531,7 +528,7 @@ const struct file_operations v9fs_file_operations_dotl = {
.release = v9fs_dir_release,
.lock = v9fs_file_lock_dotl,
.flock = v9fs_file_flock_dotl,
- .mmap = v9fs_file_mmap,
+ .mmap_prepare = v9fs_file_mmap_prepare,
.splice_read = v9fs_file_splice_read,
.splice_write = iter_file_splice_write,
.fsync = v9fs_file_fsync_dotl,
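
The two file_operations changes above replace .mmap with .mmap_prepare. The
new hook runs before the VMA exists: it receives a struct vm_area_desc
describing the mapping-to-be, so vm_ops are assigned on the descriptor rather
than patched into a live vm_area_struct, and the core can still back out
cheaply on failure. A hedged sketch of the general conversion pattern (not
v9fs code; example_vm_ops is a hypothetical vm_operations_struct):

	static int example_mmap_prepare(struct vm_area_desc *desc)
	{
		int ret;

		/* was: generic_file_mmap(filp, vma) in the old ->mmap hook */
		ret = generic_file_mmap_prepare(desc);
		if (!ret)
			desc->vm_ops = &example_vm_ops;
		return ret;
	}
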
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 399d455d50d6..97abe65bf7c1 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -422,7 +422,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode, st);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
/*
* initialize the inode with the stat info
@@ -768,44 +768,40 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
struct v9fs_inode __maybe_unused *v9inode;
struct v9fs_session_info *v9ses;
struct p9_fid *fid;
- struct dentry *res = NULL;
struct inode *inode;
int p9_omode;
if (d_in_lookup(dentry)) {
- res = v9fs_vfs_lookup(dir, dentry, 0);
- if (IS_ERR(res))
- return PTR_ERR(res);
-
- if (res)
- dentry = res;
+ struct dentry *res = v9fs_vfs_lookup(dir, dentry, 0);
+ if (res || d_really_is_positive(dentry))
+ return finish_no_open(file, res);
}
/* Only creates */
- if (!(flags & O_CREAT) || d_really_is_positive(dentry))
- return finish_no_open(file, res);
+ if (!(flags & O_CREAT))
+ return finish_no_open(file, NULL);
v9ses = v9fs_inode2v9ses(dir);
perm = unixmode2p9mode(v9ses, mode);
p9_omode = v9fs_uflags2omode(flags, v9fs_proto_dotu(v9ses));
if ((v9ses->cache & CACHE_WRITEBACK) && (p9_omode & P9_OWRITE)) {
- p9_omode = (p9_omode & ~P9_OWRITE) | P9_ORDWR;
+ p9_omode = (p9_omode & ~(P9_OWRITE | P9_OAPPEND)) | P9_ORDWR;
p9_debug(P9_DEBUG_CACHE,
"write-only file with writeback enabled, creating w/ O_RDWR\n");
}
fid = v9fs_create(v9ses, dir, dentry, NULL, perm, p9_omode);
- if (IS_ERR(fid)) {
- err = PTR_ERR(fid);
- goto error;
- }
+ if (IS_ERR(fid))
+ return PTR_ERR(fid);
v9fs_invalidate_inode_attr(dir);
inode = d_inode(dentry);
v9inode = V9FS_I(inode);
err = finish_open(file, dentry, generic_file_open);
- if (err)
- goto error;
+ if (unlikely(err)) {
+ p9_fid_put(fid);
+ return err;
+ }
file->private_data = fid;
#ifdef CONFIG_9P_FSCACHE
@@ -818,13 +814,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
v9fs_open_fid_add(inode, &fid);
file->f_mode |= FMODE_CREATED;
-out:
- dput(res);
- return err;
-
-error:
- p9_fid_put(fid);
- goto out;
+ return 0;
}
/**
@@ -1403,4 +1393,3 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
.getattr = v9fs_vfs_getattr,
.setattr = v9fs_vfs_setattr,
};
-
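
The v9fs_vfs_atomic_open() rework above leans on finish_no_open() taking
ownership of whatever dentry the lookup produced -- NULL included, and here
apparently an ERR_PTR as well, which the VFS caller is then expected to
propagate -- which is what lets the res/dput()/goto bookkeeping disappear. A
compressed sketch of the resulting control flow, with hypothetical helper
names (example_lookup(), server-side create elided):

	static int example_atomic_open(struct inode *dir, struct dentry *dentry,
				       struct file *file, unsigned int flags,
				       umode_t mode)
	{
		if (d_in_lookup(dentry)) {
			struct dentry *res = example_lookup(dir, dentry, 0);

			/* finish_no_open() takes over the reference in res */
			if (res || d_really_is_positive(dentry))
				return finish_no_open(file, res);
		}
		if (!(flags & O_CREAT))		/* lookup-only open */
			return finish_no_open(file, NULL);

		/* ... create on the server, then open the new dentry: */
		return finish_open(file, dentry, generic_file_open);
	}
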
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index cc2007be2173..643e759eacb2 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -112,7 +112,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode_dotl, st);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
/*
* initialize the inode with the stat info
@@ -238,20 +238,16 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
struct p9_fid *dfid = NULL, *ofid = NULL;
struct v9fs_session_info *v9ses;
struct posix_acl *pacl = NULL, *dacl = NULL;
- struct dentry *res = NULL;
if (d_in_lookup(dentry)) {
- res = v9fs_vfs_lookup(dir, dentry, 0);
- if (IS_ERR(res))
- return PTR_ERR(res);
-
- if (res)
- dentry = res;
+ struct dentry *res = v9fs_vfs_lookup(dir, dentry, 0);
+ if (res || d_really_is_positive(dentry))
+ return finish_no_open(file, res);
}
/* Only creates */
- if (!(flags & O_CREAT) || d_really_is_positive(dentry))
- return finish_no_open(file, res);
+ if (!(flags & O_CREAT))
+ return finish_no_open(file, NULL);
v9ses = v9fs_inode2v9ses(dir);
@@ -286,7 +282,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
}
if ((v9ses->cache & CACHE_WRITEBACK) && (p9_omode & P9_OWRITE)) {
- p9_omode = (p9_omode & ~P9_OWRITE) | P9_ORDWR;
+ p9_omode = (p9_omode & ~(P9_OWRITE | P9_DOTL_APPEND)) | P9_ORDWR;
p9_debug(P9_DEBUG_CACHE,
"write-only file with writeback enabled, creating w/ O_RDWR\n");
}
@@ -337,7 +333,6 @@ out:
p9_fid_put(ofid);
p9_fid_put(fid);
v9fs_put_acl(dacl, pacl);
- dput(res);
return err;
}
@@ -407,8 +402,8 @@ static struct dentry *v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
err);
goto error;
}
- v9fs_fid_add(dentry, &fid);
v9fs_set_create_acl(inode, fid, dacl, pacl);
+ v9fs_fid_add(dentry, &fid);
d_instantiate(dentry, inode);
err = 0;
inc_nlink(dir);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 489db161abc9..315336de6f02 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -19,6 +19,7 @@
#include <linux/statfs.h>
#include <linux/magic.h>
#include <linux/fscache.h>
+#include <linux/fs_context.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
@@ -30,32 +31,10 @@
static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
-/**
- * v9fs_set_super - set the superblock
- * @s: super block
- * @data: file system specific data
- *
- */
-
-static int v9fs_set_super(struct super_block *s, void *data)
-{
- s->s_fs_info = data;
- return set_anon_super(s, data);
-}
-
-/**
- * v9fs_fill_super - populate superblock with info
- * @sb: superblock
- * @v9ses: session information
- * @flags: flags propagated from v9fs_mount()
- *
- */
-
-static int
-v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
- int flags)
+static int v9fs_fill_super(struct super_block *sb)
{
int ret;
+	struct v9fs_session_info *v9ses = sb->s_fs_info;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
@@ -95,16 +74,12 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
}
/**
- * v9fs_mount - mount a superblock
- * @fs_type: file system type
- * @flags: mount flags
- * @dev_name: device name that was mounted
- * @data: mount options
+ * v9fs_get_tree - create the mountable root and superblock
+ * @fc: the filesystem context
*
*/
-static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data)
+static int v9fs_get_tree(struct fs_context *fc)
{
struct super_block *sb = NULL;
struct inode *inode = NULL;
@@ -117,27 +92,30 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
if (!v9ses)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
- fid = v9fs_session_init(v9ses, dev_name, data);
+ fid = v9fs_session_init(v9ses, fc);
if (IS_ERR(fid)) {
retval = PTR_ERR(fid);
goto free_session;
}
- sb = sget(fs_type, NULL, v9fs_set_super, flags, v9ses);
+ fc->s_fs_info = v9ses;
+ sb = sget_fc(fc, NULL, set_anon_super_fc);
if (IS_ERR(sb)) {
retval = PTR_ERR(sb);
goto clunk_fid;
}
- retval = v9fs_fill_super(sb, v9ses, flags);
+ retval = v9fs_fill_super(sb);
if (retval)
goto release_sb;
- if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
- sb->s_d_op = &v9fs_cached_dentry_operations;
- else
- sb->s_d_op = &v9fs_dentry_operations;
+ if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
+ set_default_d_op(sb, &v9fs_cached_dentry_operations);
+ } else {
+ set_default_d_op(sb, &v9fs_dentry_operations);
+ sb->s_d_flags |= DCACHE_DONTCACHE;
+ }
inode = v9fs_get_new_inode_from_fid(v9ses, fid, sb);
if (IS_ERR(inode)) {
@@ -157,14 +135,15 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
v9fs_fid_add(root, &fid);
p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n");
- return dget(sb->s_root);
+ fc->root = dget(sb->s_root);
+ return 0;
clunk_fid:
p9_fid_put(fid);
v9fs_session_close(v9ses);
free_session:
kfree(v9ses);
- return ERR_PTR(retval);
+ return retval;
release_sb:
/*
@@ -175,7 +154,7 @@ release_sb:
*/
p9_fid_put(fid);
deactivate_locked_super(sb);
- return ERR_PTR(retval);
+ return retval;
}
/**
@@ -250,7 +229,7 @@ static int v9fs_drop_inode(struct inode *inode)
v9ses = v9fs_inode2v9ses(inode);
if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
/*
* in case of non cached mode always drop the
* inode because we want the inode attribute
@@ -301,11 +280,86 @@ static const struct super_operations v9fs_super_ops_dotl = {
.write_inode = v9fs_write_inode_dotl,
};
+static void v9fs_free_fc(struct fs_context *fc)
+{
+ struct v9fs_context *ctx = fc->fs_private;
+
+ if (!ctx)
+ return;
+
+ /* These should be NULL by now but guard against leaks */
+ kfree(ctx->session_opts.uname);
+ kfree(ctx->session_opts.aname);
+#ifdef CONFIG_9P_FSCACHE
+ kfree(ctx->session_opts.cachetag);
+#endif
+ if (ctx->client_opts.trans_mod)
+ v9fs_put_trans(ctx->client_opts.trans_mod);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations v9fs_context_ops = {
+ .parse_param = v9fs_parse_param,
+ .get_tree = v9fs_get_tree,
+ .free = v9fs_free_fc,
+};
+
+static int v9fs_init_fs_context(struct fs_context *fc)
+{
+ struct v9fs_context *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ /* initialize core options */
+ ctx->session_opts.afid = ~0;
+ ctx->session_opts.cache = CACHE_NONE;
+ ctx->session_opts.session_lock_timeout = P9_LOCK_TIMEOUT;
+ ctx->session_opts.uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL);
+ if (!ctx->session_opts.uname)
+ goto error;
+
+ ctx->session_opts.aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL);
+ if (!ctx->session_opts.aname)
+ goto error;
+
+ ctx->session_opts.uid = INVALID_UID;
+ ctx->session_opts.dfltuid = V9FS_DEFUID;
+ ctx->session_opts.dfltgid = V9FS_DEFGID;
+
+ /* initialize client options */
+ ctx->client_opts.proto_version = p9_proto_2000L;
+ ctx->client_opts.msize = DEFAULT_MSIZE;
+
+ /* initialize fd transport options */
+ ctx->fd_opts.port = P9_FD_PORT;
+ ctx->fd_opts.rfd = ~0;
+ ctx->fd_opts.wfd = ~0;
+ ctx->fd_opts.privport = false;
+
+ /* initialize rdma transport options */
+ ctx->rdma_opts.port = P9_RDMA_PORT;
+ ctx->rdma_opts.sq_depth = P9_RDMA_SQ_DEPTH;
+ ctx->rdma_opts.rq_depth = P9_RDMA_RQ_DEPTH;
+ ctx->rdma_opts.timeout = P9_RDMA_TIMEOUT;
+ ctx->rdma_opts.privport = false;
+
+ fc->ops = &v9fs_context_ops;
+ fc->fs_private = ctx;
+
+ return 0;
+error:
+	/* Point fc at the context ops so that ->free() releases ctx and
+	 * any partially allocated strings via v9fs_free_fc().
+	 */
+	fc->ops = &v9fs_context_ops;
+	fc->fs_private = ctx;
+	fc->need_free = 1;
+	return -ENOMEM;
+}
+
struct file_system_type v9fs_fs_type = {
.name = "9p",
- .mount = v9fs_mount,
.kill_sb = v9fs_kill_super,
.owner = THIS_MODULE,
.fs_flags = FS_RENAME_DOES_D_MOVE,
+ .init_fs_context = v9fs_init_fs_context,
+ .parameters = v9fs_param_spec,
};
MODULE_ALIAS_FS("9p");
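
With .init_fs_context and .parameters wired into v9fs_fs_type, the legacy
.mount entry point is gone and the kernel drives the context operations in a
fixed sequence. A hedged userspace sketch of that sequence using the new
mount API syscalls (error handling elided; the server address, transport and
msize values are placeholders; glibc 2.36+ provides the wrappers):

	#include <sys/mount.h>
	#include <unistd.h>

	int mount_9p_sketch(void)
	{
		int fsfd, mntfd;

		fsfd = fsopen("9p", FSOPEN_CLOEXEC);	/* v9fs_init_fs_context() */
		fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "10.0.0.1", 0);
		fsconfig(fsfd, FSCONFIG_SET_STRING, "trans", "tcp", 0);
		fsconfig(fsfd, FSCONFIG_SET_STRING, "msize", "65536", 0);
						/* each -> v9fs_parse_param() */
		fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
						/* -> v9fs_get_tree() */
		mntfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
		close(fsfd);			/* -> v9fs_free_fc() */
		return mntfd;			/* attach with move_mount(2) */
	}
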
diff --git a/fs/Kconfig b/fs/Kconfig
index afe21866d6b4..0bfdaecaa877 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -51,7 +51,6 @@ source "fs/ocfs2/Kconfig"
source "fs/btrfs/Kconfig"
source "fs/nilfs2/Kconfig"
source "fs/f2fs/Kconfig"
-source "fs/bcachefs/Kconfig"
source "fs/zonefs/Kconfig"
endif # BLOCK
@@ -59,7 +58,7 @@ endif # BLOCK
config FS_DAX
bool "File system based Direct Access (DAX) support"
depends on MMU
- depends on ZONE_DEVICE || FS_DAX_LIMITED
+ depends on ZONE_DEVICE
select FS_IOMAP
select DAX
help
@@ -95,13 +94,6 @@ config FS_DAX_PMD
depends on ZONE_DEVICE
depends on TRANSPARENT_HUGEPAGE
-# Selected by DAX drivers that do not expect filesystem DAX to support
-# get_user_pages() of DAX mappings. I.e. "limited" indicates no support
-# for fork() of processes with MAP_SHARED mappings or support for
-# direct-I/O to a DAX mapping.
-config FS_DAX_LIMITED
- bool
-
# Posix ACL utility routines
#
# Note: Posix ACLs can be implemented without these helpers. Never use
@@ -256,8 +248,7 @@ config ARCH_SUPPORTS_HUGETLBFS
menuconfig HUGETLBFS
bool "HugeTLB file system support"
- depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
- depends on (SYSFS || SYSCTL)
+ depends on ARCH_SUPPORTS_HUGETLBFS
select MEMFD_CREATE
select PADATA if SMP
help
@@ -286,6 +277,7 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
def_bool HUGETLB_PAGE
depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
depends on SPARSEMEM_VMEMMAP
+ select SPARSEMEM_VMEMMAP_PREINIT if ARCH_WANT_HUGETLB_VMEMMAP_PREINIT
config HUGETLB_PMD_PAGE_TABLE_SHARING
def_bool HUGETLB_PAGE
@@ -334,6 +326,7 @@ source "fs/omfs/Kconfig"
source "fs/hpfs/Kconfig"
source "fs/qnx4/Kconfig"
source "fs/qnx6/Kconfig"
+source "fs/resctrl/Kconfig"
source "fs/romfs/Kconfig"
source "fs/pstore/Kconfig"
source "fs/ufs/Kconfig"
@@ -367,6 +360,7 @@ config GRACE_PERIOD
config LOCKD
tristate
depends on FILE_LOCKING
+ select CRC32
select GRACE_PERIOD
config LOCKD_V4
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index bd2f530e5740..1949e25c7741 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -184,4 +184,13 @@ config EXEC_KUNIT_TEST
This builds the exec KUnit tests, which tests boundary conditions
of various aspects of the exec internals.
+config ARCH_HAS_ELF_CORE_EFLAGS
+ bool
+ depends on BINFMT_ELF && ELF_CORE
+ default n
+ help
+ Select this option if the architecture makes use of the e_flags
+ field in the ELF header to store ABI or other architecture-specific
+ information that should be preserved in core dumps.
+
endmenu
diff --git a/fs/Makefile b/fs/Makefile
index 77fd7f7b5d02..a04274a3c854 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -14,8 +14,9 @@ obj-y := open.o read_write.o file_table.o super.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o d_path.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
- fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
- kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o
+		fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
+ kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \
+ file_attr.o
obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o
obj-$(CONFIG_PROC_FS) += proc_namespace.o
@@ -120,7 +121,6 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
obj-$(CONFIG_F2FS_FS) += f2fs/
-obj-$(CONFIG_BCACHEFS_FS) += bcachefs/
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
obj-$(CONFIG_EFIVAR_FS) += efivarfs/
@@ -128,3 +128,4 @@ obj-$(CONFIG_EROFS_FS) += erofs/
obj-$(CONFIG_VBOXSF_FS) += vboxsf/
obj-$(CONFIG_ZONEFS_FS) += zonefs/
obj-$(CONFIG_BPF_LSM) += bpf_fs_kfuncs.o
+obj-$(CONFIG_RESCTRL_FS) += resctrl/
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index ee80718aaeec..cd13165fd904 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -25,7 +25,7 @@
const struct file_operations adfs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.fsync = generic_file_fsync,
.write_iter = generic_file_write_iter,
.splice_read = filemap_splice_read,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 21527189e430..6830f8bc8d4e 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -53,13 +53,14 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to)
truncate_pagecache(inode, inode->i_size);
}
-static int adfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int adfs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int ret;
- ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
+ ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata,
adfs_get_block,
&ADFS_I(mapping->host)->mmu_private);
if (unlikely(ret))
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 017c48a80203..fdccdbbfc213 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -397,7 +397,7 @@ static int adfs_fill_super(struct super_block *sb, struct fs_context *fc)
if (asb->s_ftsuffix)
asb->s_namelen += 4;
- sb->s_d_op = &adfs_dentry_operations;
+ set_default_d_op(sb, &adfs_dentry_operations);
root = adfs_iget(sb, &root_obj);
sb->s_root = d_make_root(root);
if (!sb->s_root) {
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 7a71018e3f67..765c3443663e 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -415,13 +415,14 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
return ret;
}
-static int affs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int affs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int ret;
- ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
+ ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata,
affs_get_block,
&AFFS_I(mapping->host)->mmu_private);
if (unlikely(ret))
@@ -430,14 +431,15 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
-static int affs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned int len, unsigned int copied,
+static int affs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos,
+ unsigned int len, unsigned int copied,
struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
int ret;
- ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
/* Clear Archived bit on file writes, as AmigaOS would do */
if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) {
@@ -645,7 +647,8 @@ static int affs_read_folio_ofs(struct file *file, struct folio *folio)
return err;
}
-static int affs_write_begin_ofs(struct file *file, struct address_space *mapping,
+static int affs_write_begin_ofs(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
@@ -684,9 +687,10 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
return err;
}
-static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+static int affs_write_end_ofs(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
struct super_block *sb = inode->i_sb;
@@ -999,7 +1003,7 @@ const struct file_operations affs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.open = affs_file_open,
.release = affs_file_release,
.fsync = affs_file_fsync,
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 0210df8d3500..0bfc7d151dcd 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -29,7 +29,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
pr_debug("affs_iget(%lu)\n", inode->i_ino);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 2fa40337776d..44f8aa883100 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -500,9 +500,9 @@ got_root:
return PTR_ERR(root_inode);
if (affs_test_opt(AFFS_SB(sb)->s_flags, SF_INTL))
- sb->s_d_op = &affs_intl_dentry_operations;
+ set_default_d_op(sb, &affs_intl_dentry_operations);
else
- sb->s_d_op = &affs_dentry_operations;
+ set_default_d_op(sb, &affs_dentry_operations);
sb->s_root = d_make_root(root_inode);
if (!sb->s_root) {
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index fc8ba9142f2f..682bd8ec2c10 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -5,6 +5,7 @@ config AFS_FS
select AF_RXRPC
select DNS_RESOLVER
select NETFS_SUPPORT
+ select CRYPTO_KRB5
help
If you say Y here, you will get an experimental Andrew File System
driver. It currently only supports unsecured read-only AFS access.
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 5efd7e13b304..b49b8fe682f3 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -8,6 +8,7 @@ kafs-y := \
addr_prefs.o \
callback.o \
cell.o \
+ cm_security.o \
cmservice.o \
dir.o \
dir_edit.o \
diff --git a/fs/afs/addr_prefs.c b/fs/afs/addr_prefs.c
index c0384201b8fe..133736412c3d 100644
--- a/fs/afs/addr_prefs.c
+++ b/fs/afs/addr_prefs.c
@@ -48,7 +48,7 @@ static int afs_split_string(char **pbuf, char *strv[], unsigned int maxstrv)
strv[count++] = p;
/* Skip over word */
- while (!isspace(*p))
+ while (!isspace(*p) && *p)
p++;
if (!*p)
break;
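
The one-line change above adds a NUL check to the word-skipping loop in
afs_split_string(). isspace('\0') is false, so the old loop never stopped at
the terminator and could walk past the end of the buffer whenever the string
ended in a non-space character:

	/* For buf = "a b":
	 *
	 *	while (!isspace(*p))		old: on the final word "b", *p
	 *		p++;			reaches '\0', isspace() stays
	 *					false, and p runs off the buffer.
	 *
	 *	while (!isspace(*p) && *p)	new: '\0' also stops the scan.
	 *		p++;
	 */
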
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 69e1dd55b160..894d2bad6b6c 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -42,7 +42,7 @@ static void afs_volume_init_callback(struct afs_volume *volume)
list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) {
if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) {
afs_clear_cb_promise(vnode, afs_cb_promise_clear_vol_init_cb);
- queue_work(system_unbound_wq, &vnode->cb_work);
+ queue_work(system_dfl_wq, &vnode->cb_work);
}
}
@@ -90,7 +90,7 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas
if (reason != afs_cb_break_for_deleted &&
vnode->status.type == AFS_FTYPE_FILE &&
atomic_read(&vnode->cb_nr_mmap))
- queue_work(system_unbound_wq, &vnode->cb_work);
+ queue_work(system_dfl_wq, &vnode->cb_work);
trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, true);
} else {
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 0168bbf53fe0..71c10a05cebe 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -140,7 +140,9 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
return ERR_PTR(-ENOMEM);
}
- cell->name = kmalloc(1 + namelen + 1, GFP_KERNEL);
+ /* Allocate the cell name and the key name in one go. */
+ cell->name = kmalloc(1 + namelen + 1 +
+ 4 + namelen + 1, GFP_KERNEL);
if (!cell->name) {
kfree(cell);
return ERR_PTR(-ENOMEM);
@@ -151,7 +153,11 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
cell->name_len = namelen;
for (i = 0; i < namelen; i++)
cell->name[i] = tolower(name[i]);
- cell->name[i] = 0;
+ cell->name[i++] = 0;
+
+ cell->key_desc = cell->name + i;
+ memcpy(cell->key_desc, "afs@", 4);
+ memcpy(cell->key_desc + 4, cell->name, cell->name_len + 1);
cell->net = net;
refcount_set(&cell->ref, 1);
@@ -177,6 +183,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
VL_SERVICE, AFS_VL_PORT);
if (IS_ERR(vllist)) {
ret = PTR_ERR(vllist);
+ vllist = NULL;
goto parse_failed;
}
@@ -228,7 +235,7 @@ error:
* @name: The name of the cell.
* @namesz: The strlen of the cell name.
* @vllist: A colon/comma separated list of numeric IP addresses or NULL.
- * @excl: T if an error should be given if the cell name already exists.
+ * @reason: The reason we're doing the lookup
* @trace: The reason to be logged if the lookup is successful.
*
* Look up a cell record by name and query the DNS for VL server addresses if
@@ -238,7 +245,8 @@ error:
*/
struct afs_cell *afs_lookup_cell(struct afs_net *net,
const char *name, unsigned int namesz,
- const char *vllist, bool excl,
+ const char *vllist,
+ enum afs_lookup_cell_for reason,
enum afs_cell_trace trace)
{
struct afs_cell *cell, *candidate, *cursor;
@@ -246,12 +254,18 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
enum afs_cell_state state;
int ret, n;
- _enter("%s,%s", name, vllist);
+ _enter("%s,%s,%u", name, vllist, reason);
- if (!excl) {
+ if (reason != AFS_LOOKUP_CELL_PRELOAD) {
cell = afs_find_cell(net, name, namesz, trace);
- if (!IS_ERR(cell))
+ if (!IS_ERR(cell)) {
+ if (reason == AFS_LOOKUP_CELL_DYNROOT)
+ goto no_wait;
+ if (cell->state == AFS_CELL_SETTING_UP ||
+ cell->state == AFS_CELL_UNLOOKED)
+ goto lookup_cell;
goto wait_for_cell;
+ }
}
/* Assume we're probably going to create a cell and preallocate and
@@ -297,26 +311,69 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
rb_insert_color(&cell->net_node, &net->cells);
up_write(&net->cells_lock);
- afs_queue_cell(cell, afs_cell_trace_queue_new);
+lookup_cell:
+ if (reason != AFS_LOOKUP_CELL_PRELOAD &&
+ reason != AFS_LOOKUP_CELL_ROOTCELL) {
+ set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags);
+ afs_queue_cell(cell, afs_cell_trace_queue_new);
+ }
wait_for_cell:
- _debug("wait_for_cell");
state = smp_load_acquire(&cell->state); /* vs error */
- if (state != AFS_CELL_ACTIVE &&
- state != AFS_CELL_DEAD) {
+ switch (state) {
+ case AFS_CELL_ACTIVE:
+ case AFS_CELL_DEAD:
+ break;
+ case AFS_CELL_UNLOOKED:
+ default:
+ if (reason == AFS_LOOKUP_CELL_PRELOAD ||
+ reason == AFS_LOOKUP_CELL_ROOTCELL)
+ break;
+ _debug("wait_for_cell");
afs_see_cell(cell, afs_cell_trace_wait);
wait_var_event(&cell->state,
({
state = smp_load_acquire(&cell->state); /* vs error */
state == AFS_CELL_ACTIVE || state == AFS_CELL_DEAD;
}));
+ _debug("waited_for_cell %d %d", cell->state, cell->error);
}
+no_wait:
/* Check the state obtained from the wait check. */
+ state = smp_load_acquire(&cell->state); /* vs error */
if (state == AFS_CELL_DEAD) {
ret = cell->error;
goto error;
}
+ if (state == AFS_CELL_ACTIVE) {
+ switch (cell->dns_status) {
+ case DNS_LOOKUP_NOT_DONE:
+ if (cell->dns_source == DNS_RECORD_FROM_CONFIG) {
+ ret = 0;
+ break;
+ }
+ fallthrough;
+ default:
+ ret = -EIO;
+ goto error;
+ case DNS_LOOKUP_GOOD:
+ case DNS_LOOKUP_GOOD_WITH_BAD:
+ ret = 0;
+ break;
+ case DNS_LOOKUP_GOT_NOT_FOUND:
+ ret = -ENOENT;
+ goto error;
+ case DNS_LOOKUP_BAD:
+ ret = -EREMOTEIO;
+ goto error;
+ case DNS_LOOKUP_GOT_LOCAL_FAILURE:
+ case DNS_LOOKUP_GOT_TEMP_FAILURE:
+ case DNS_LOOKUP_GOT_NS_FAILURE:
+ ret = -EDESTADDRREQ;
+ goto error;
+ }
+ }
_leave(" = %p [cell]", cell);
return cell;
@@ -324,7 +381,7 @@ wait_for_cell:
cell_already_exists:
_debug("cell exists");
cell = cursor;
- if (excl) {
+ if (reason == AFS_LOOKUP_CELL_PRELOAD) {
ret = -EEXIST;
} else {
afs_use_cell(cursor, trace);
@@ -383,7 +440,8 @@ int afs_cell_init(struct afs_net *net, const char *rootcell)
return -EINVAL;
/* allocate a cell record for the root/workstation cell */
- new_root = afs_lookup_cell(net, rootcell, len, vllist, false,
+ new_root = afs_lookup_cell(net, rootcell, len, vllist,
+ AFS_LOOKUP_CELL_ROOTCELL,
afs_cell_trace_use_lookup_ws);
if (IS_ERR(new_root)) {
_leave(" = %ld", PTR_ERR(new_root));
@@ -659,33 +717,6 @@ void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs)
}
/*
- * Allocate a key to use as a placeholder for anonymous user security.
- */
-static int afs_alloc_anon_key(struct afs_cell *cell)
-{
- struct key *key;
- char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp;
-
- /* Create a key to represent an anonymous user. */
- memcpy(keyname, "afs@", 4);
- dp = keyname + 4;
- cp = cell->name;
- do {
- *dp++ = tolower(*cp);
- } while (*cp++);
-
- key = rxrpc_get_null_key(keyname);
- if (IS_ERR(key))
- return PTR_ERR(key);
-
- cell->anonymous_key = key;
-
- _debug("anon key %p{%x}",
- cell->anonymous_key, key_serial(cell->anonymous_key));
- return 0;
-}
-
-/*
* Activate a cell.
*/
static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell)
@@ -694,12 +725,6 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell)
struct afs_cell *pcell;
int ret;
- if (!cell->anonymous_key) {
- ret = afs_alloc_anon_key(cell);
- if (ret < 0)
- return ret;
- }
-
ret = afs_proc_cell_setup(cell);
if (ret < 0)
return ret;
@@ -776,6 +801,7 @@ static bool afs_manage_cell(struct afs_cell *cell)
switch (cell->state) {
case AFS_CELL_SETTING_UP:
goto set_up_cell;
+ case AFS_CELL_UNLOOKED:
case AFS_CELL_ACTIVE:
goto cell_is_active;
case AFS_CELL_REMOVING:
@@ -796,7 +822,7 @@ set_up_cell:
goto remove_cell;
}
- afs_set_cell_state(cell, AFS_CELL_ACTIVE);
+ afs_set_cell_state(cell, AFS_CELL_UNLOOKED);
cell_is_active:
if (afs_has_cell_expired(cell, &next_manage))
@@ -806,6 +832,8 @@ cell_is_active:
ret = afs_update_cell(cell);
if (ret < 0)
cell->error = ret;
+ if (cell->state == AFS_CELL_UNLOOKED)
+ afs_set_cell_state(cell, AFS_CELL_ACTIVE);
}
if (next_manage < TIME64_MAX && cell->net->live) {
diff --git a/fs/afs/cm_security.c b/fs/afs/cm_security.c
new file mode 100644
index 000000000000..edcbd249d202
--- /dev/null
+++ b/fs/afs/cm_security.c
@@ -0,0 +1,340 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Cache manager security.
+ *
+ * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/slab.h>
+#include <crypto/krb5.h>
+#include "internal.h"
+#include "afs_cm.h"
+#include "afs_fs.h"
+#include "protocol_yfs.h"
+#define RXRPC_TRACE_ONLY_DEFINE_ENUMS
+#include <trace/events/rxrpc.h>
+
+#define RXGK_SERVER_ENC_TOKEN 1036U // 0x40c
+#define xdr_round_up(x) (round_up((x), sizeof(__be32)))
+#define xdr_len_object(x) (4 + round_up((x), sizeof(__be32)))
+
+#ifdef CONFIG_RXGK
+static int afs_create_yfs_cm_token(struct sk_buff *challenge,
+ struct afs_server *server);
+#endif
+
+/*
+ * Respond to an RxGK challenge, adding appdata.
+ */
+static int afs_respond_to_challenge(struct sk_buff *challenge)
+{
+#ifdef CONFIG_RXGK
+ struct krb5_buffer appdata = {};
+ struct afs_server *server;
+#endif
+ struct rxrpc_peer *peer;
+ unsigned long peer_data;
+ u16 service_id;
+ u8 security_index;
+
+ rxrpc_kernel_query_challenge(challenge, &peer, &peer_data,
+ &service_id, &security_index);
+
+ _enter("%u,%u", service_id, security_index);
+
+ switch (service_id) {
+ /* We don't send CM_SERVICE RPCs, so don't expect a challenge
+ * therefrom.
+ */
+ case FS_SERVICE:
+ case VL_SERVICE:
+ case YFS_FS_SERVICE:
+ case YFS_VL_SERVICE:
+ break;
+ default:
+		pr_warn("Can't respond to unknown challenge %u:%u\n",
+ service_id, security_index);
+ return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO,
+ afs_abort_unsupported_sec_class);
+ }
+
+ switch (security_index) {
+#ifdef CONFIG_RXKAD
+ case RXRPC_SECURITY_RXKAD:
+ return rxkad_kernel_respond_to_challenge(challenge);
+#endif
+
+#ifdef CONFIG_RXGK
+ case RXRPC_SECURITY_RXGK:
+ return rxgk_kernel_respond_to_challenge(challenge, &appdata);
+
+ case RXRPC_SECURITY_YFS_RXGK:
+ switch (service_id) {
+ case FS_SERVICE:
+ case YFS_FS_SERVICE:
+ server = (struct afs_server *)peer_data;
+ if (!server->cm_rxgk_appdata.data) {
+ mutex_lock(&server->cm_token_lock);
+ if (!server->cm_rxgk_appdata.data)
+ afs_create_yfs_cm_token(challenge, server);
+ mutex_unlock(&server->cm_token_lock);
+ }
+ if (server->cm_rxgk_appdata.data)
+ appdata = server->cm_rxgk_appdata;
+ break;
+ }
+ return rxgk_kernel_respond_to_challenge(challenge, &appdata);
+#endif
+
+ default:
+ return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO,
+ afs_abort_unsupported_sec_class);
+ }
+}
+
+/*
+ * Process the OOB message queue, processing challenge packets.
+ */
+void afs_process_oob_queue(struct work_struct *work)
+{
+ struct afs_net *net = container_of(work, struct afs_net, rx_oob_work);
+ struct sk_buff *oob;
+ enum rxrpc_oob_type type;
+
+ while ((oob = rxrpc_kernel_dequeue_oob(net->socket, &type))) {
+ switch (type) {
+ case RXRPC_OOB_CHALLENGE:
+ afs_respond_to_challenge(oob);
+ break;
+ }
+ rxrpc_kernel_free_oob(oob);
+ }
+}
+
+#ifdef CONFIG_RXGK
+/*
+ * Create a securities keyring for the cache manager and attach a key to it for
+ * the RxGK tokens we want to use to secure the callback connection back from
+ * the fileserver.
+ */
+int afs_create_token_key(struct afs_net *net, struct socket *socket)
+{
+ const struct krb5_enctype *krb5;
+ struct key *ring;
+ key_ref_t key;
+ char K0[32], *desc;
+ int ret;
+
+ ring = keyring_alloc("kafs",
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(),
+ KEY_POS_SEARCH | KEY_POS_WRITE |
+ KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH,
+ KEY_ALLOC_NOT_IN_QUOTA,
+ NULL, NULL);
+ if (IS_ERR(ring))
+ return PTR_ERR(ring);
+
+ ret = rxrpc_sock_set_security_keyring(socket->sk, ring);
+ if (ret < 0)
+ goto out;
+
+ ret = -ENOPKG;
+ krb5 = crypto_krb5_find_enctype(KRB5_ENCTYPE_AES128_CTS_HMAC_SHA1_96);
+ if (!krb5)
+ goto out;
+
+ if (WARN_ON_ONCE(krb5->key_len > sizeof(K0)))
+ goto out;
+
+ ret = -ENOMEM;
+ desc = kasprintf(GFP_KERNEL, "%u:%u:%u:%u",
+ YFS_CM_SERVICE, RXRPC_SECURITY_YFS_RXGK, 1, krb5->etype);
+ if (!desc)
+ goto out;
+
+ wait_for_random_bytes();
+ get_random_bytes(K0, krb5->key_len);
+
+ key = key_create(make_key_ref(ring, true),
+ "rxrpc_s", desc,
+ K0, krb5->key_len,
+ KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | KEY_USR_VIEW,
+ KEY_ALLOC_NOT_IN_QUOTA);
+ kfree(desc);
+ if (IS_ERR(key)) {
+ ret = PTR_ERR(key);
+ goto out;
+ }
+
+ net->fs_cm_token_key = key_ref_to_ptr(key);
+ ret = 0;
+out:
+ key_put(ring);
+ return ret;
+}
+
+/*
+ * Create an YFS RxGK GSS token to use as a ticket to the specified fileserver.
+ */
+static int afs_create_yfs_cm_token(struct sk_buff *challenge,
+ struct afs_server *server)
+{
+ const struct krb5_enctype *conn_krb5, *token_krb5;
+ const struct krb5_buffer *token_key;
+ struct crypto_aead *aead;
+ struct scatterlist sg;
+ struct afs_net *net = server->cell->net;
+ const struct key *key = net->fs_cm_token_key;
+ size_t keysize, uuidsize, authsize, toksize, encsize, contsize, adatasize, offset;
+ __be32 caps[1] = {
+ [0] = htonl(AFS_CAP_ERROR_TRANSLATION),
+ };
+ __be32 *xdr;
+ void *appdata, *K0, *encbase;
+ u32 enctype;
+ int ret;
+
+ if (!key)
+ return -ENOKEY;
+
+ /* Assume that the fileserver is happy to use the same encoding type as
+ * we were told to use by the token obtained by the user.
+ */
+ enctype = rxgk_kernel_query_challenge(challenge);
+
+ conn_krb5 = crypto_krb5_find_enctype(enctype);
+ if (!conn_krb5)
+ return -ENOPKG;
+ token_krb5 = key->payload.data[0];
+ token_key = (const struct krb5_buffer *)&key->payload.data[2];
+
+ /* struct rxgk_key {
+ * afs_uint32 enctype;
+ * opaque key<>;
+ * };
+ */
+ keysize = 4 + xdr_len_object(conn_krb5->key_len);
+
+ /* struct RXGK_AuthName {
+ * afs_int32 kind;
+ * opaque data<AUTHDATAMAX>;
+ * opaque display<AUTHPRINTABLEMAX>;
+ * };
+ */
+ uuidsize = sizeof(server->uuid);
+ authsize = 4 + xdr_len_object(uuidsize) + xdr_len_object(0);
+
+ /* struct RXGK_Token {
+ * rxgk_key K0;
+ * RXGK_Level level;
+ * rxgkTime starttime;
+ * afs_int32 lifetime;
+ * afs_int32 bytelife;
+ * rxgkTime expirationtime;
+ * struct RXGK_AuthName identities<>;
+ * };
+ */
+ toksize = keysize + 8 + 4 + 4 + 8 + xdr_len_object(authsize);
+
+ offset = 0;
+ encsize = crypto_krb5_how_much_buffer(token_krb5, KRB5_ENCRYPT_MODE, toksize, &offset);
+
+ /* struct RXGK_TokenContainer {
+ * afs_int32 kvno;
+ * afs_int32 enctype;
+ * opaque encrypted_token<>;
+ * };
+ */
+ contsize = 4 + 4 + xdr_len_object(encsize);
+
+ /* struct YFSAppData {
+ * opr_uuid initiatorUuid;
+ * opr_uuid acceptorUuid;
+ * Capabilities caps;
+ * afs_int32 enctype;
+ * opaque callbackKey<>;
+ * opaque callbackToken<>;
+ * };
+ */
+ adatasize = 16 + 16 +
+ xdr_len_object(sizeof(caps)) +
+ 4 +
+ xdr_len_object(conn_krb5->key_len) +
+ xdr_len_object(contsize);
+
+ ret = -ENOMEM;
+ appdata = kzalloc(adatasize, GFP_KERNEL);
+ if (!appdata)
+ goto out;
+ xdr = appdata;
+
+ memcpy(xdr, &net->uuid, 16); /* appdata.initiatorUuid */
+ xdr += 16 / 4;
+ memcpy(xdr, &server->uuid, 16); /* appdata.acceptorUuid */
+ xdr += 16 / 4;
+ *xdr++ = htonl(ARRAY_SIZE(caps)); /* appdata.caps.len */
+ memcpy(xdr, &caps, sizeof(caps)); /* appdata.caps */
+ xdr += ARRAY_SIZE(caps);
+ *xdr++ = htonl(conn_krb5->etype); /* appdata.enctype */
+
+ *xdr++ = htonl(conn_krb5->key_len); /* appdata.callbackKey.len */
+ K0 = xdr;
+ get_random_bytes(K0, conn_krb5->key_len); /* appdata.callbackKey.data */
+ xdr += xdr_round_up(conn_krb5->key_len) / 4;
+
+ *xdr++ = htonl(contsize); /* appdata.callbackToken.len */
+ *xdr++ = htonl(1); /* cont.kvno */
+ *xdr++ = htonl(token_krb5->etype); /* cont.enctype */
+ *xdr++ = htonl(encsize); /* cont.encrypted_token.len */
+
+ encbase = xdr;
+ xdr += offset / 4;
+ *xdr++ = htonl(conn_krb5->etype); /* token.K0.enctype */
+ *xdr++ = htonl(conn_krb5->key_len); /* token.K0.key.len */
+ memcpy(xdr, K0, conn_krb5->key_len); /* token.K0.key.data */
+ xdr += xdr_round_up(conn_krb5->key_len) / 4;
+
+ *xdr++ = htonl(RXRPC_SECURITY_ENCRYPT); /* token.level */
+ *xdr++ = htonl(0); /* token.starttime */
+ *xdr++ = htonl(0); /* " */
+ *xdr++ = htonl(0); /* token.lifetime */
+ *xdr++ = htonl(0); /* token.bytelife */
+ *xdr++ = htonl(0); /* token.expirationtime */
+ *xdr++ = htonl(0); /* " */
+ *xdr++ = htonl(1); /* token.identities.count */
+ *xdr++ = htonl(0); /* token.identities[0].kind */
+ *xdr++ = htonl(uuidsize); /* token.identities[0].data.len */
+ memcpy(xdr, &server->uuid, uuidsize);
+ xdr += xdr_round_up(uuidsize) / 4;
+ *xdr++ = htonl(0); /* token.identities[0].display.len */
+
+ xdr = encbase + xdr_round_up(encsize);
+
+ if ((unsigned long)xdr - (unsigned long)appdata != adatasize)
+ pr_err("Appdata size incorrect %lx != %zx\n",
+ (unsigned long)xdr - (unsigned long)appdata, adatasize);
+
+ aead = crypto_krb5_prepare_encryption(token_krb5, token_key, RXGK_SERVER_ENC_TOKEN,
+ GFP_KERNEL);
+ if (IS_ERR(aead)) {
+ ret = PTR_ERR(aead);
+ goto out_token;
+ }
+
+ sg_init_one(&sg, encbase, encsize);
+ ret = crypto_krb5_encrypt(token_krb5, aead, &sg, 1, encsize, offset, toksize, false);
+ if (ret < 0)
+ goto out_aead;
+
+ server->cm_rxgk_appdata.len = adatasize;
+ server->cm_rxgk_appdata.data = appdata;
+ appdata = NULL;
+
+out_aead:
+ crypto_free_aead(aead);
+out_token:
+ kfree(appdata);
+out:
+ return ret;
+}
+#endif /* CONFIG_RXGK */
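
The sizing arithmetic in afs_create_yfs_cm_token() rests entirely on the two
macros defined at the top of this file, which encode XDR's 4-byte alignment
rule and the cost of an opaque<> (a 4-byte length word plus padded payload).
A short worked example, assuming the 16-byte server uuid used above:

	/* xdr_round_up(5)	== 8	payload padded to a 4-byte boundary
	 * xdr_len_object(5)	== 12	4-byte length word + 8 padded bytes
	 * xdr_len_object(0)	== 4	an empty opaque<> is just its length word
	 *
	 * so for an RXGK_AuthName carrying a 16-byte uuid and an empty
	 * display string:
	 *	authsize = 4 + xdr_len_object(16) + xdr_len_object(0)
	 *		 = 4 + 20 + 4 = 28 bytes
	 */
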
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 9e7b1fe82c27..f4e9e12373ac 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -779,7 +779,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry)
struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode;
struct inode *inode = NULL, *ti;
afs_dataversion_t data_version = READ_ONCE(dvnode->status.data_version);
- bool supports_ibulk;
+ bool supports_ibulk, isnew;
long ret;
int i;
@@ -850,7 +850,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry)
* callback counters.
*/
ti = ilookup5_nowait(dir->i_sb, vp->fid.vnode,
- afs_ilookup5_test_by_fid, &vp->fid);
+ afs_ilookup5_test_by_fid, &vp->fid, &isnew);
if (!IS_ERR_OR_NULL(ti)) {
vnode = AFS_FS_I(ti);
vp->dv_before = vnode->status.data_version;
@@ -943,7 +943,7 @@ static struct dentry *afs_lookup_atsys(struct inode *dir, struct dentry *dentry)
}
strcpy(p, name);
- ret = lookup_one_len(buf, dentry->d_parent, len);
+ ret = lookup_noperm(&QSTR(buf), dentry->d_parent);
if (IS_ERR(ret) || d_is_positive(ret))
goto out_s;
dput(ret);
@@ -1823,7 +1823,8 @@ error:
static void afs_rename_success(struct afs_operation *op)
{
- struct afs_vnode *vnode = AFS_FS_I(d_inode(op->dentry));
+ struct afs_vnode *vnode = op->more_files[0].vnode;
+ struct afs_vnode *new_vnode = op->more_files[1].vnode;
_enter("op=%08x", op->debug_id);
@@ -1834,22 +1835,40 @@ static void afs_rename_success(struct afs_operation *op)
op->ctime = op->file[1].scb.status.mtime_client;
afs_vnode_commit_status(op, &op->file[1]);
}
+ if (op->more_files[0].scb.have_status)
+ afs_vnode_commit_status(op, &op->more_files[0]);
+ if (op->more_files[1].scb.have_status)
+ afs_vnode_commit_status(op, &op->more_files[1]);
/* If we're moving a subdir between dirs, we need to update
* its DV counter too as the ".." will be altered.
*/
- if (S_ISDIR(vnode->netfs.inode.i_mode) &&
- op->file[0].vnode != op->file[1].vnode) {
- u64 new_dv;
+ if (op->file[0].vnode != op->file[1].vnode) {
+ if (S_ISDIR(vnode->netfs.inode.i_mode)) {
+ u64 new_dv;
- write_seqlock(&vnode->cb_lock);
+ write_seqlock(&vnode->cb_lock);
- new_dv = vnode->status.data_version + 1;
- trace_afs_set_dv(vnode, new_dv);
- vnode->status.data_version = new_dv;
- inode_set_iversion_raw(&vnode->netfs.inode, new_dv);
+ new_dv = vnode->status.data_version + 1;
+ trace_afs_set_dv(vnode, new_dv);
+ vnode->status.data_version = new_dv;
+ inode_set_iversion_raw(&vnode->netfs.inode, new_dv);
- write_sequnlock(&vnode->cb_lock);
+ write_sequnlock(&vnode->cb_lock);
+ }
+
+ if ((op->rename.rename_flags & RENAME_EXCHANGE) &&
+ S_ISDIR(new_vnode->netfs.inode.i_mode)) {
+ u64 new_dv;
+
+ write_seqlock(&new_vnode->cb_lock);
+
+ new_dv = new_vnode->status.data_version + 1;
+ new_vnode->status.data_version = new_dv;
+ inode_set_iversion_raw(&new_vnode->netfs.inode, new_dv);
+
+ write_sequnlock(&new_vnode->cb_lock);
+ }
}
}
@@ -1900,8 +1919,8 @@ static void afs_rename_edit_dir(struct afs_operation *op)
if (S_ISDIR(vnode->netfs.inode.i_mode) &&
new_dvnode != orig_dvnode &&
test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
- afs_edit_dir_update_dotdot(vnode, new_dvnode,
- afs_edit_dir_for_rename_sub);
+ afs_edit_dir_update(vnode, &dotdot_name, new_dvnode,
+ afs_edit_dir_for_rename_sub);
new_inode = d_inode(new_dentry);
if (new_inode) {
@@ -1915,9 +1934,6 @@ static void afs_rename_edit_dir(struct afs_operation *op)
/* Now we can update d_fsdata on the dentries to reflect their
* new parent's data_version.
- *
- * Note that if we ever implement RENAME_EXCHANGE, we'll have
- * to update both dentries with opposing dir versions.
*/
afs_update_dentry_version(op, new_dvp, op->dentry);
afs_update_dentry_version(op, new_dvp, op->dentry_2);
@@ -1930,6 +1946,67 @@ static void afs_rename_edit_dir(struct afs_operation *op)
fscache_end_operation(&new_cres);
}
+static void afs_rename_exchange_edit_dir(struct afs_operation *op)
+{
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ struct afs_vnode *orig_dvnode = orig_dvp->vnode;
+ struct afs_vnode *new_dvnode = new_dvp->vnode;
+ struct afs_vnode *old_vnode = op->more_files[0].vnode;
+ struct afs_vnode *new_vnode = op->more_files[1].vnode;
+ struct dentry *old_dentry = op->dentry;
+ struct dentry *new_dentry = op->dentry_2;
+
+ _enter("op=%08x", op->debug_id);
+
+ if (new_dvnode == orig_dvnode) {
+ down_write(&orig_dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) &&
+ orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) {
+ afs_edit_dir_update(orig_dvnode, &old_dentry->d_name,
+ new_vnode, afs_edit_dir_for_rename_0);
+ afs_edit_dir_update(orig_dvnode, &new_dentry->d_name,
+ old_vnode, afs_edit_dir_for_rename_1);
+ }
+
+ d_exchange(old_dentry, new_dentry);
+ up_write(&orig_dvnode->validate_lock);
+ } else {
+ down_write(&orig_dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) &&
+ orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta)
+ afs_edit_dir_update(orig_dvnode, &old_dentry->d_name,
+ new_vnode, afs_edit_dir_for_rename_0);
+
+ up_write(&orig_dvnode->validate_lock);
+ down_write(&new_dvnode->validate_lock);
+
+ if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags) &&
+ new_dvnode->status.data_version == new_dvp->dv_before + new_dvp->dv_delta)
+ afs_edit_dir_update(new_dvnode, &new_dentry->d_name,
+ old_vnode, afs_edit_dir_for_rename_1);
+
+ if (S_ISDIR(old_vnode->netfs.inode.i_mode) &&
+ test_bit(AFS_VNODE_DIR_VALID, &old_vnode->flags))
+ afs_edit_dir_update(old_vnode, &dotdot_name, new_dvnode,
+ afs_edit_dir_for_rename_sub);
+
+ if (S_ISDIR(new_vnode->netfs.inode.i_mode) &&
+ test_bit(AFS_VNODE_DIR_VALID, &new_vnode->flags))
+ afs_edit_dir_update(new_vnode, &dotdot_name, orig_dvnode,
+ afs_edit_dir_for_rename_sub);
+
+ /* Now we can update d_fsdata on the dentries to reflect their
+ * new parents' data_version.
+ */
+ afs_update_dentry_version(op, new_dvp, old_dentry);
+ afs_update_dentry_version(op, orig_dvp, new_dentry);
+
+ d_exchange(old_dentry, new_dentry);
+ up_write(&new_dvnode->validate_lock);
+ }
+}
+
static void afs_rename_put(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
@@ -1948,6 +2025,32 @@ static const struct afs_operation_ops afs_rename_operation = {
.put = afs_rename_put,
};
+#if 0 /* Unused: yfs_fs_rename() autoswitches to yfs_fs_rename_replace(). */
+static const struct afs_operation_ops afs_rename_replace_operation = {
+ .issue_afs_rpc = NULL,
+ .issue_yfs_rpc = yfs_fs_rename_replace,
+ .success = afs_rename_success,
+ .edit_dir = afs_rename_edit_dir,
+ .put = afs_rename_put,
+};
+#endif
+
+static const struct afs_operation_ops afs_rename_noreplace_operation = {
+ .issue_afs_rpc = NULL,
+ .issue_yfs_rpc = yfs_fs_rename_noreplace,
+ .success = afs_rename_success,
+ .edit_dir = afs_rename_edit_dir,
+ .put = afs_rename_put,
+};
+
+static const struct afs_operation_ops afs_rename_exchange_operation = {
+ .issue_afs_rpc = NULL,
+ .issue_yfs_rpc = yfs_fs_rename_exchange,
+ .success = afs_rename_success,
+ .edit_dir = afs_rename_exchange_edit_dir,
+ .put = afs_rename_put,
+};
+
/*
* rename a file in an AFS filesystem and/or move it between directories
*/
@@ -1956,10 +2059,10 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct dentry *new_dentry, unsigned int flags)
{
struct afs_operation *op;
- struct afs_vnode *orig_dvnode, *new_dvnode, *vnode;
+ struct afs_vnode *orig_dvnode, *new_dvnode, *vnode, *new_vnode = NULL;
int ret;
- if (flags)
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
return -EINVAL;
/* Don't allow silly-rename files be moved around. */
@@ -1969,6 +2072,8 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
vnode = AFS_FS_I(d_inode(old_dentry));
orig_dvnode = AFS_FS_I(old_dir);
new_dvnode = AFS_FS_I(new_dir);
+ if (d_is_positive(new_dentry))
+ new_vnode = AFS_FS_I(d_inode(new_dentry));
_enter("{%llx:%llu},{%llx:%llu},{%llx:%llu},{%pd}",
orig_dvnode->fid.vid, orig_dvnode->fid.vnode,
@@ -1989,6 +2094,11 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (ret < 0)
goto error;
+ ret = -ENOMEM;
+ op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL);
+ if (!op->more_files)
+ goto error;
+
afs_op_set_vnode(op, 0, orig_dvnode);
afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */
op->file[0].dv_delta = 1;
@@ -1997,46 +2107,63 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
op->file[1].modification = true;
op->file[0].update_ctime = true;
op->file[1].update_ctime = true;
+ op->more_files[0].vnode = vnode;
+ op->more_files[0].speculative = true;
+ op->more_files[1].vnode = new_vnode;
+ op->more_files[1].speculative = true;
+ op->nr_files = 4;
op->dentry = old_dentry;
op->dentry_2 = new_dentry;
+ op->rename.rename_flags = flags;
op->rename.new_negative = d_is_negative(new_dentry);
- op->ops = &afs_rename_operation;
- /* For non-directories, check whether the target is busy and if so,
- * make a copy of the dentry and then do a silly-rename. If the
- * silly-rename succeeds, the copied dentry is hashed and becomes the
- * new target.
- */
- if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) {
- /* To prevent any new references to the target during the
- * rename, we unhash the dentry in advance.
+ if (flags & RENAME_NOREPLACE) {
+ op->ops = &afs_rename_noreplace_operation;
+ } else if (flags & RENAME_EXCHANGE) {
+ op->ops = &afs_rename_exchange_operation;
+ d_drop(new_dentry);
+ } else {
+		/* If we might displace the target, we may need to do a
+		 * silly rename.
*/
- if (!d_unhashed(new_dentry)) {
- d_drop(new_dentry);
- op->rename.rehash = new_dentry;
- }
+ op->ops = &afs_rename_operation;
- if (d_count(new_dentry) > 2) {
- /* copy the target dentry's name */
- op->rename.tmp = d_alloc(new_dentry->d_parent,
- &new_dentry->d_name);
- if (!op->rename.tmp) {
- afs_op_nomem(op);
- goto error;
+ /* For non-directories, check whether the target is busy and if
+ * so, make a copy of the dentry and then do a silly-rename.
+ * If the silly-rename succeeds, the copied dentry is hashed
+ * and becomes the new target.
+ */
+ if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) {
+ /* To prevent any new references to the target during
+ * the rename, we unhash the dentry in advance.
+ */
+ if (!d_unhashed(new_dentry)) {
+ d_drop(new_dentry);
+ op->rename.rehash = new_dentry;
}
- ret = afs_sillyrename(new_dvnode,
- AFS_FS_I(d_inode(new_dentry)),
- new_dentry, op->key);
- if (ret) {
- afs_op_set_error(op, ret);
- goto error;
+ if (d_count(new_dentry) > 2) {
+ /* copy the target dentry's name */
+ op->rename.tmp = d_alloc(new_dentry->d_parent,
+ &new_dentry->d_name);
+ if (!op->rename.tmp) {
+ afs_op_nomem(op);
+ goto error;
+ }
+
+ ret = afs_sillyrename(new_dvnode,
+ AFS_FS_I(d_inode(new_dentry)),
+ new_dentry, op->key);
+ if (ret) {
+ afs_op_set_error(op, ret);
+ goto error;
+ }
+
+ op->dentry_2 = op->rename.tmp;
+ op->rename.rehash = NULL;
+ op->rename.new_negative = true;
}
-
- op->dentry_2 = op->rename.tmp;
- op->rename.rehash = NULL;
- op->rename.new_negative = true;
}
}
@@ -2052,6 +2179,8 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
d_drop(old_dentry);
ret = afs_do_sync_operation(op);
+ if (ret == -ENOTSUPP)
+ ret = -EINVAL;
out:
afs_dir_unuse_cookie(orig_dvnode, ret);
if (new_dvnode != orig_dvnode)
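
The net effect of the afs_rename() changes above is a three-way dispatch on the VFS rename flags. Below is a minimal userspace sketch of that dispatch; the flag values are the standard VFS ones, but the names and enum are illustrative rather than the kernel API:

#include <errno.h>
#include <stdio.h>

#define RENAME_NOREPLACE 0x1
#define RENAME_EXCHANGE  0x2

enum rename_op { OP_REPLACE, OP_NOREPLACE, OP_EXCHANGE };

/* Mirror the flag checks added to afs_rename(): reject unknown flags,
 * then pick the operation variant to issue. */
static int pick_rename_op(unsigned int flags, enum rename_op *op)
{
	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
		return -EINVAL;
	if (flags & RENAME_NOREPLACE)
		*op = OP_NOREPLACE;
	else if (flags & RENAME_EXCHANGE)
		*op = OP_EXCHANGE;
	else
		*op = OP_REPLACE;	/* may fall back to silly-rename */
	return 0;
}

int main(void)
{
	enum rename_op op;

	if (pick_rename_op(RENAME_EXCHANGE, &op) == 0)
		printf("op = %d\n", op);	/* prints "op = 2" */
	return 0;
}

The VFS core already rejects RENAME_NOREPLACE combined with RENAME_EXCHANGE, so testing the flags in sequence is sufficient here.
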
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index 60a549f1d9c5..fd3aa9f97ce6 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -239,7 +239,7 @@ static void afs_edit_init_block(union afs_xdr_dir_block *meta,
* The caller must hold the inode locked.
*/
void afs_edit_dir_add(struct afs_vnode *vnode,
- struct qstr *name, struct afs_fid *new_fid,
+ const struct qstr *name, struct afs_fid *new_fid,
enum afs_edit_dir_reason why)
{
union afs_xdr_dir_block *meta, *block;
@@ -391,7 +391,7 @@ error:
* The caller must hold the inode locked.
*/
void afs_edit_dir_remove(struct afs_vnode *vnode,
- struct qstr *name, enum afs_edit_dir_reason why)
+ const struct qstr *name, enum afs_edit_dir_reason why)
{
union afs_xdr_dir_block *meta, *block, *pblock;
union afs_xdr_dirent *de, *pde;
@@ -522,11 +522,11 @@ error:
}
/*
- * Edit a subdirectory that has been moved between directories to update the
- * ".." entry.
+ * Edit an entry in a directory to update the vnode it refers to. This is also
+ * used to update the ".." entry in a directory.
*/
-void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode,
- enum afs_edit_dir_reason why)
+void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name,
+ struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why)
{
union afs_xdr_dir_block *block;
union afs_xdr_dirent *de;
@@ -557,7 +557,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d
if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
goto already_invalidated;
- slot = afs_dir_scan_block(block, &dotdot_name, b);
+ slot = afs_dir_scan_block(block, name, b);
if (slot >= 0)
goto found_dirent;
@@ -566,7 +566,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d
/* Didn't find the dirent to clobber. Download the directory again. */
trace_afs_edit_dir(vnode, why, afs_edit_dir_update_nodd,
- 0, 0, 0, 0, "..");
+ 0, 0, 0, 0, name->name);
afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_no_dd);
goto out;
@@ -576,7 +576,7 @@ found_dirent:
de->u.unique = htonl(new_dvnode->fid.unique);
trace_afs_edit_dir(vnode, why, afs_edit_dir_update_dd, b, slot,
- ntohl(de->u.vnode), ntohl(de->u.unique), "..");
+ ntohl(de->u.vnode), ntohl(de->u.unique), name->name);
kunmap_local(block);
netfs_single_mark_inode_dirty(&vnode->netfs.inode);
@@ -589,12 +589,12 @@ out:
already_invalidated:
kunmap_local(block);
trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval,
- 0, 0, 0, 0, "..");
+ 0, 0, 0, 0, name->name);
goto out;
error:
trace_afs_edit_dir(vnode, why, afs_edit_dir_update_error,
- 0, 0, 0, 0, "..");
+ 0, 0, 0, 0, name->name);
goto out;
}
diff --git a/fs/afs/dir_search.c b/fs/afs/dir_search.c
index b25bd892db4d..d2516e55b5ed 100644
--- a/fs/afs/dir_search.c
+++ b/fs/afs/dir_search.c
@@ -188,7 +188,7 @@ bad:
/*
* Search the appropriate hash chain in the contents of an AFS directory.
*/
-int afs_dir_search(struct afs_vnode *dvnode, struct qstr *name,
+int afs_dir_search(struct afs_vnode *dvnode, const struct qstr *name,
struct afs_fid *_fid, afs_dataversion_t *_dir_version)
{
struct afs_dir_iter iter = { .dvnode = dvnode, };
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index a1e581946b93..014495d4b868 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -69,6 +69,12 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
if (IS_ERR(op))
return PTR_ERR(op);
+ op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL);
+ if (!op->more_files) {
+ afs_put_operation(op);
+ return -ENOMEM;
+ }
+
afs_op_set_vnode(op, 0, dvnode);
afs_op_set_vnode(op, 1, dvnode);
op->file[0].dv_delta = 1;
@@ -77,6 +83,11 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
op->file[1].modification = true;
op->file[0].update_ctime = true;
op->file[1].update_ctime = true;
+ op->more_files[0].vnode = AFS_FS_I(d_inode(old));
+ op->more_files[0].speculative = true;
+ op->more_files[1].vnode = AFS_FS_I(d_inode(new));
+ op->more_files[1].speculative = true;
+ op->nr_files = 4;
op->dentry = old;
op->dentry_2 = new;
@@ -113,16 +124,14 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode,
sdentry = NULL;
do {
- int slen;
-
dput(sdentry);
sillycounter++;
/* Create a silly name. Note that the ".__afs" prefix is
* understood by the salvager and must not be changed.
*/
- slen = scnprintf(silly, sizeof(silly), ".__afs%04X", sillycounter);
- sdentry = lookup_one_len(silly, dentry->d_parent, slen);
+ scnprintf(silly, sizeof(silly), ".__afs%04X", sillycounter);
+ sdentry = lookup_noperm(&QSTR(silly), dentry->d_parent);
/* N.B. Better to return EBUSY here ... it could be dangerous
* to delete the file while it's in use.
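
The silly-rename loop above generates candidate names until an unused one is found. A standalone sketch of the name generator, where name_in_use() stands in for the dentry lookup:

#include <stdio.h>

static int name_in_use(const char *name)
{
	(void)name;		/* stub: the kernel does a dentry lookup here */
	return 0;
}

int main(void)
{
	char silly[16];
	unsigned int sillycounter = 0;

	/* The ".__afs" prefix is fixed: the salvager recognises it. */
	do {
		sillycounter++;
		snprintf(silly, sizeof(silly), ".__afs%04X", sillycounter);
	} while (name_in_use(silly));

	printf("%s\n", silly);		/* prints ".__afs0001" */
	return 0;
}
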
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 691e0ae607a1..aa56e8951e03 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -64,7 +64,7 @@ static struct inode *afs_iget_pseudo_dir(struct super_block *sb, ino_t ino)
vnode = AFS_FS_I(inode);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
netfs_inode_init(&vnode->netfs, NULL, false);
simple_inode_init_ts(inode);
set_nlink(inode, 2);
@@ -108,7 +108,8 @@ static struct dentry *afs_dynroot_lookup_cell(struct inode *dir, struct dentry *
dotted = true;
}
- cell = afs_lookup_cell(net, name, len, NULL, false,
+ cell = afs_lookup_cell(net, name, len, NULL,
+ AFS_LOOKUP_CELL_DYNROOT,
afs_cell_trace_use_lookup_dynroot);
if (IS_ERR(cell)) {
ret = PTR_ERR(cell);
@@ -258,7 +259,7 @@ static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry
vnode = AFS_FS_I(inode);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
netfs_inode_init(&vnode->netfs, NULL, false);
simple_inode_init_ts(inode);
set_nlink(inode, 1);
@@ -348,9 +349,9 @@ static int afs_dynroot_readdir(struct file *file, struct dir_context *ctx)
}
if ((unsigned long long)ctx->pos <= AFS_MAX_DYNROOT_CELL_INO) {
- rcu_read_lock();
+ down_read(&net->cells_lock);
ret = afs_dynroot_readdir_cells(net, ctx);
- rcu_read_unlock();
+ up_read(&net->cells_lock);
}
return ret;
}
@@ -383,7 +384,7 @@ struct inode *afs_dynroot_iget_root(struct super_block *sb)
vnode = AFS_FS_I(inode);
/* there shouldn't be an existing inode */
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
netfs_inode_init(&vnode->netfs, NULL, false);
simple_inode_init_ts(inode);
set_nlink(inode, 2);
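
The direct inode->i_state reads here and in the other hunks are converted to inode_state_read_once(). A userspace analogue of the pattern, assuming an illustrative flag value (the kernel helper wraps READ_ONCE() on the state field):

#include <stdio.h>

#define I_NEW 0x8			/* illustrative value */

struct inode_stub {
	unsigned long i_state;
};

/* One explicit read of a flags word that other contexts may update
 * concurrently, modelled here with a volatile access. */
static unsigned long state_read_once(const struct inode_stub *inode)
{
	return *(const volatile unsigned long *)&inode->i_state;
}

int main(void)
{
	struct inode_stub inode = { .i_state = I_NEW };

	if (state_read_once(&inode) & I_NEW)
		printf("new inode: run one-time initialisation\n");
	return 0;
}
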
diff --git a/fs/afs/file.c b/fs/afs/file.c
index fc15497608c6..f66a92294284 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -19,7 +19,7 @@
#include <trace/events/netfs.h>
#include "internal.h"
-static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
+static int afs_file_mmap_prepare(struct vm_area_desc *desc);
static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos,
@@ -35,7 +35,7 @@ const struct file_operations afs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = afs_file_read_iter,
.write_iter = netfs_file_write_iter,
- .mmap = afs_file_mmap,
+ .mmap_prepare = afs_file_mmap_prepare,
.splice_read = afs_file_splice_read,
.splice_write = iter_file_splice_write,
.fsync = afs_fsync,
@@ -492,16 +492,16 @@ static void afs_drop_open_mmap(struct afs_vnode *vnode)
/*
* Handle setting up a memory mapping on an AFS file.
*/
-static int afs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int afs_file_mmap_prepare(struct vm_area_desc *desc)
{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(desc->file));
int ret;
afs_add_open_mmap(vnode);
- ret = generic_file_mmap(file, vma);
+ ret = generic_file_mmap_prepare(desc);
if (ret == 0)
- vma->vm_ops = &afs_vm_ops;
+ desc->vm_ops = &afs_vm_ops;
else
afs_drop_open_mmap(vnode);
return ret;
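
The .mmap to .mmap_prepare conversion swaps mutation of a live VMA for filling in a descriptor that the core applies afterwards. A self-contained sketch of the shape of the change, using stand-in types (the real struct vm_area_desc lives in the mm headers):

#include <stdio.h>

struct vm_ops_stub {
	const char *name;
};

struct vm_area_desc_stub {
	unsigned long vm_flags;
	const struct vm_ops_stub *vm_ops;
};

static const struct vm_ops_stub afs_vm_ops_stub = { .name = "afs" };

/* Shape of an .mmap_prepare hook: fill in the descriptor; the core
 * applies it to the VMA later, instead of the hook mutating a live
 * vm_area_struct itself. */
static int file_mmap_prepare(struct vm_area_desc_stub *desc)
{
	desc->vm_flags |= 0x1;		/* e.g. VM_DONTEXPAND in the aio hunk */
	desc->vm_ops = &afs_vm_ops_stub;
	return 0;
}

int main(void)
{
	struct vm_area_desc_stub desc = { 0 };

	if (file_mmap_prepare(&desc) == 0)
		printf("ops=%s flags=%#lx\n", desc.vm_ops->name, desc.vm_flags);
	return 0;
}
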
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index 07a8bfbdd9b9..e0030ac74ea0 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -534,6 +534,6 @@ dont_wait:
*/
void afs_fs_probe_cleanup(struct afs_net *net)
{
- if (del_timer_sync(&net->fs_probe_timer))
+ if (timer_delete_sync(&net->fs_probe_timer))
afs_dec_servers_outstanding(net);
}
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index e9538e91f848..dde1857fcabb 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -427,7 +427,7 @@ static void afs_fetch_status_success(struct afs_operation *op)
struct afs_vnode *vnode = vp->vnode;
int ret;
- if (vnode->netfs.inode.i_state & I_NEW) {
+ if (inode_state_read_once(&vnode->netfs.inode) & I_NEW) {
ret = afs_inode_init_from_status(op, vp, vnode);
afs_op_set_error(op, ret);
if (ret == 0)
@@ -579,7 +579,7 @@ struct inode *afs_iget(struct afs_operation *op, struct afs_vnode_param *vp)
inode, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
/* deal with an existing inode */
- if (!(inode->i_state & I_NEW)) {
+ if (!(inode_state_read_once(inode) & I_NEW)) {
_leave(" = %p", inode);
return inode;
}
@@ -639,7 +639,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key)
_debug("GOT ROOT INODE %p { vl=%llx }", inode, as->volume->vid);
- BUG_ON(!(inode->i_state & I_NEW));
+ BUG_ON(!(inode_state_read_once(inode) & I_NEW));
vnode = AFS_FS_I(inode);
vnode->cb_v_check = atomic_read(&as->volume->cb_v_break);
@@ -723,9 +723,9 @@ int afs_drop_inode(struct inode *inode)
_enter("");
if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags))
- return generic_delete_inode(inode);
+ return inode_just_drop(inode);
else
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
}
/*
@@ -748,7 +748,7 @@ void afs_evict_inode(struct inode *inode)
if ((S_ISDIR(inode->i_mode) ||
S_ISLNK(inode->i_mode)) &&
- (inode->i_state & I_DIRTY) &&
+ (inode_state_read_once(inode) & I_DIRTY) &&
!sbi->dyn_root) {
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 440b0e731093..009064b8d661 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -20,6 +20,7 @@
#include <linux/uuid.h>
#include <linux/mm_types.h>
#include <linux/dns_resolver.h>
+#include <crypto/krb5.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/sock.h>
@@ -176,8 +177,10 @@ struct afs_call {
bool intr; /* T if interruptible */
bool unmarshalling_error; /* T if an unmarshalling error occurred */
bool responded; /* Got a response from the call (may be abort) */
+ u8 security_ix; /* Security class */
u16 service_id; /* Actual service ID (after upgrade) */
unsigned int debug_id; /* Trace ID */
+ u32 enctype; /* Security encoding type */
u32 operation_ID; /* operation ID for an incoming call */
u32 count; /* count for use in unmarshalling */
union { /* place to extract temporary data */
@@ -281,6 +284,7 @@ struct afs_net {
struct socket *socket;
struct afs_call *spare_incoming_call;
struct work_struct charge_preallocation_work;
+ struct work_struct rx_oob_work;
struct mutex socket_mutex;
atomic_t nr_outstanding_calls;
atomic_t nr_superblocks;
@@ -305,6 +309,7 @@ struct afs_net {
struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */
struct hlist_head fs_proc; /* procfs servers list */
+ struct key *fs_cm_token_key; /* Key for creating CM tokens */
struct work_struct fs_prober;
struct timer_list fs_probe_timer;
atomic_t servers_outstanding;
@@ -338,6 +343,7 @@ extern const char afs_init_sysname[];
enum afs_cell_state {
AFS_CELL_SETTING_UP,
+ AFS_CELL_UNLOOKED,
AFS_CELL_ACTIVE,
AFS_CELL_REMOVING,
AFS_CELL_DEAD,
@@ -407,6 +413,7 @@ struct afs_cell {
u8 name_len; /* Length of name */
char *name; /* Cell name, case-flattened and NUL-padded */
+ char *key_desc; /* Authentication key description */
};
/*
@@ -540,6 +547,8 @@ struct afs_server {
struct list_head volumes; /* RCU list of afs_server_entry objects */
struct work_struct destroyer; /* Work item to try and destroy a server */
struct timer_list timer; /* Management timer */
+ struct mutex cm_token_lock; /* Lock governing creation of appdata */
+ struct krb5_buffer cm_rxgk_appdata; /* Appdata to be included in RESPONSE packet */
time64_t unuse_time; /* Time at which last unused */
unsigned long flags;
#define AFS_SERVER_FL_RESPONDING 0 /* The server is responding */
@@ -555,6 +564,7 @@ struct afs_server {
#define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */
#define AFS_SERVER_FL_NO_RM2 18 /* Fileserver doesn't support YFS.RemoveFile2 */
#define AFS_SERVER_FL_HAS_FS64 19 /* Fileserver supports FS.{Fetch,Store}Data64 */
+#define AFS_SERVER_FL_NO_RENAME2 20 /* YFS Fileserver doesn't support enhanced rename */
refcount_t ref; /* Object refcount */
atomic_t active; /* Active user count */
u32 addr_version; /* Address list version */
@@ -884,9 +894,10 @@ struct afs_operation {
bool need_rehash;
} unlink;
struct {
- struct dentry *rehash;
- struct dentry *tmp;
- bool new_negative;
+ struct dentry *rehash;
+ struct dentry *tmp;
+ unsigned int rename_flags;
+ bool new_negative;
} rename;
struct {
struct netfs_io_subrequest *subreq;
@@ -1040,9 +1051,18 @@ static inline bool afs_cb_is_broken(unsigned int cb_break,
extern int afs_cell_init(struct afs_net *, const char *);
extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned,
enum afs_cell_trace);
+enum afs_lookup_cell_for {
+ AFS_LOOKUP_CELL_DYNROOT,
+ AFS_LOOKUP_CELL_MOUNTPOINT,
+ AFS_LOOKUP_CELL_DIRECT_MOUNT,
+ AFS_LOOKUP_CELL_PRELOAD,
+ AFS_LOOKUP_CELL_ROOTCELL,
+ AFS_LOOKUP_CELL_ALIAS_CHECK,
+};
struct afs_cell *afs_lookup_cell(struct afs_net *net,
const char *name, unsigned int namesz,
- const char *vllist, bool excl,
+ const char *vllist,
+ enum afs_lookup_cell_for reason,
enum afs_cell_trace trace);
extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace);
void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason);
@@ -1059,6 +1079,19 @@ extern void __net_exit afs_cell_purge(struct afs_net *);
extern bool afs_cm_incoming_call(struct afs_call *);
/*
+ * cm_security.c
+ */
+void afs_process_oob_queue(struct work_struct *work);
+#ifdef CONFIG_RXGK
+int afs_create_token_key(struct afs_net *net, struct socket *socket);
+#else
+static inline int afs_create_token_key(struct afs_net *net, struct socket *socket)
+{
+ return 0;
+}
+#endif
+
+/*
* dir.c
*/
extern const struct file_operations afs_dir_file_operations;
@@ -1077,11 +1110,11 @@ int afs_single_writepages(struct address_space *mapping,
/*
* dir_edit.c
*/
-extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *,
+extern void afs_edit_dir_add(struct afs_vnode *, const struct qstr *, struct afs_fid *,
enum afs_edit_dir_reason);
-extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason);
-void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode,
- enum afs_edit_dir_reason why);
+extern void afs_edit_dir_remove(struct afs_vnode *, const struct qstr *, enum afs_edit_dir_reason);
+void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name,
+ struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why);
void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode);
/*
@@ -1092,7 +1125,7 @@ bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name);
union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block);
int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name,
struct afs_fid *_fid);
-int afs_dir_search(struct afs_vnode *dvnode, struct qstr *name,
+int afs_dir_search(struct afs_vnode *dvnode, const struct qstr *name,
struct afs_fid *_fid, afs_dataversion_t *_dir_version);
/*
@@ -1673,6 +1706,9 @@ extern void yfs_fs_remove_dir(struct afs_operation *);
extern void yfs_fs_link(struct afs_operation *);
extern void yfs_fs_symlink(struct afs_operation *);
extern void yfs_fs_rename(struct afs_operation *);
+void yfs_fs_rename_replace(struct afs_operation *op);
+void yfs_fs_rename_noreplace(struct afs_operation *op);
+void yfs_fs_rename_exchange(struct afs_operation *op);
extern void yfs_fs_store_data(struct afs_operation *);
extern void yfs_fs_setattr(struct afs_operation *);
extern void yfs_fs_get_volume_status(struct afs_operation *);
diff --git a/fs/afs/main.c b/fs/afs/main.c
index c845c5daaeba..e6bb8237db98 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -73,6 +73,7 @@ static int __net_init afs_net_init(struct net *net_ns)
generate_random_uuid((unsigned char *)&net->uuid);
INIT_WORK(&net->charge_preallocation_work, afs_charge_preallocation);
+ INIT_WORK(&net->rx_oob_work, afs_process_oob_queue);
mutex_init(&net->socket_mutex);
net->cells = RB_ROOT;
@@ -168,13 +169,13 @@ static int __init afs_init(void)
printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n");
- afs_wq = alloc_workqueue("afs", 0, 0);
+ afs_wq = alloc_workqueue("afs", WQ_PERCPU, 0);
if (!afs_wq)
goto error_afs_wq;
afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
if (!afs_async_calls)
goto error_async;
- afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0);
+ afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!afs_lock_manager)
goto error_lockmgr;
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index b8180bf2281f..c8a7f266080d 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -8,6 +8,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/errno.h>
+#include <crypto/krb5.h>
#include "internal.h"
#include "afs_fs.h"
#include "protocol_uae.h"
@@ -103,7 +104,34 @@ int afs_abort_to_error(u32 abort_code)
case RXKADDATALEN: return -EKEYREJECTED;
case RXKADILLEGALLEVEL: return -EKEYREJECTED;
+ case RXGK_INCONSISTENCY: return -EPROTO;
+ case RXGK_PACKETSHORT: return -EPROTO;
+ case RXGK_BADCHALLENGE: return -EPROTO;
+ case RXGK_SEALEDINCON: return -EKEYREJECTED;
+ case RXGK_NOTAUTH: return -EKEYREJECTED;
+ case RXGK_EXPIRED: return -EKEYEXPIRED;
+ case RXGK_BADLEVEL: return -EKEYREJECTED;
+ case RXGK_BADKEYNO: return -EKEYREJECTED;
+ case RXGK_NOTRXGK: return -EKEYREJECTED;
+ case RXGK_UNSUPPORTED: return -EKEYREJECTED;
+ case RXGK_GSSERROR: return -EKEYREJECTED;
+#ifdef RXGK_BADETYPE
+ case RXGK_BADETYPE: return -ENOPKG;
+#endif
+#ifdef RXGK_BADTOKEN
+ case RXGK_BADTOKEN: return -EKEYREJECTED;
+#endif
+#ifdef RXGK_DATALEN
+ case RXGK_DATALEN: return -EPROTO;
+#endif
+#ifdef RXGK_BADQOP
+ case RXGK_BADQOP: return -EKEYREJECTED;
+#endif
+
+ case KRB5_PROG_KEYTYPE_NOSUPP: return -ENOPKG;
+
case RXGEN_OPCODE: return -ENOTSUPP;
+ case RX_INVALID_OPERATION: return -ENOTSUPP;
default: return -EREMOTEIO;
}
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 45cee6534122..57c204a3c04e 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -107,7 +107,8 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
if (size > AFS_MAXCELLNAME)
return -ENAMETOOLONG;
- cell = afs_lookup_cell(ctx->net, p, size, NULL, false,
+ cell = afs_lookup_cell(ctx->net, p, size, NULL,
+ AFS_LOOKUP_CELL_MOUNTPOINT,
afs_cell_trace_use_lookup_mntpt);
if (IS_ERR(cell)) {
pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt);
@@ -137,7 +138,8 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
ret = -EINVAL;
if (content[size - 1] == '.')
- ret = vfs_parse_fs_string(fc, "source", content, size - 1);
+ ret = vfs_parse_fs_qstr(fc, "source",
+ &QSTR_LEN(content, size - 1));
do_delayed_call(&cleanup);
if (ret < 0)
return ret;
@@ -189,7 +191,6 @@ struct vfsmount *afs_d_automount(struct path *path)
if (IS_ERR(newmnt))
return newmnt;
- mntget(newmnt); /* prevent immediate expiration */
mnt_set_expiry(newmnt, &afs_vfsmounts);
queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
afs_mntpt_expiry_timeout * HZ);
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 40e879c8ca77..44520549b509 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -122,7 +122,8 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size)
if (strcmp(buf, "add") == 0) {
struct afs_cell *cell;
- cell = afs_lookup_cell(net, name, strlen(name), args, true,
+ cell = afs_lookup_cell(net, name, strlen(name), args,
+ AFS_LOOKUP_CELL_PRELOAD,
afs_cell_trace_use_lookup_add);
if (IS_ERR(cell)) {
ret = PTR_ERR(cell);
diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h
index e4cd89c44c46..b2f06c1917c2 100644
--- a/fs/afs/protocol_yfs.h
+++ b/fs/afs/protocol_yfs.h
@@ -50,6 +50,9 @@ enum YFS_FS_Operations {
YFSREMOVEACL = 64171,
YFSREMOVEFILE2 = 64173,
YFSSTOREOPAQUEACL2 = 64174,
+ YFSRENAME_REPLACE = 64176,
+ YFSRENAME_NOREPLACE = 64177,
+ YFSRENAME_EXCHANGE = 64187,
YFSINLINEBULKSTATUS = 64536, /* YFS Fetch multiple file statuses with errors */
YFSFETCHDATA64 = 64537, /* YFS Fetch file data */
YFSSTOREDATA64 = 64538, /* YFS Store file data */
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index a1c24f589d9e..6a4e7da10fc4 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -432,6 +432,16 @@ bool afs_select_fileserver(struct afs_operation *op)
afs_op_set_error(op, -EDQUOT);
goto failed_but_online;
+ case RX_INVALID_OPERATION:
+ case RXGEN_OPCODE:
+ /* Handle downgrading to an older operation. */
+ afs_op_set_error(op, -ENOTSUPP);
+ if (op->flags & AFS_OPERATION_DOWNGRADE) {
+ op->flags &= ~AFS_OPERATION_DOWNGRADE;
+ goto go_again;
+ }
+ goto failed_but_online;
+
default:
afs_op_accumulate_error(op, error, abort_code);
failed_but_online:
@@ -620,12 +630,13 @@ iterate_address:
op->addr_index = addr_index;
set_bit(addr_index, &op->addr_tried);
- op->volsync.creation = TIME64_MIN;
- op->volsync.update = TIME64_MIN;
- op->call_responded = false;
_debug("address [%u] %u/%u %pISp",
op->server_index, addr_index, alist->nr_addrs,
rxrpc_kernel_remote_addr(alist->addrs[op->addr_index].peer));
+go_again:
+ op->volsync.creation = TIME64_MIN;
+ op->volsync.update = TIME64_MIN;
+ op->call_responded = false;
_leave(" = t");
return true;
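
The new RX_INVALID_OPERATION/RXGEN_OPCODE handling lets an operation marked for downgrade retry the same server with an older RPC variant. A minimal sketch of that decision, with stand-in values for the abort code and flag:

#include <stdio.h>

#define RXGEN_OPCODE	  (-455)	/* stand-in numeric value */
#define OP_FLAG_DOWNGRADE 0x1

/* On "unknown opcode", a downgradable operation clears its flag and
 * goes again against the same server so an older RPC can be issued. */
static int handle_abort(int abort_code, unsigned int *op_flags)
{
	switch (abort_code) {
	case RXGEN_OPCODE:
		if (*op_flags & OP_FLAG_DOWNGRADE) {
			*op_flags &= ~OP_FLAG_DOWNGRADE;
			return 1;	/* retry with the older operation */
		}
		return 0;		/* fail; server is still online */
	default:
		return 0;
	}
}

int main(void)
{
	unsigned int op_flags = OP_FLAG_DOWNGRADE;

	printf("retry=%d flags=%#x\n",
	       handle_abort(RXGEN_OPCODE, &op_flags), op_flags);
	return 0;
}
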
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index d5e480a33859..bf0e4ea0aafd 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -24,8 +24,17 @@ static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned
static void afs_process_async_call(struct work_struct *);
static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long);
static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long);
+static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID);
+static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob);
static int afs_deliver_cm_op_id(struct afs_call *);
+static const struct rxrpc_kernel_ops afs_rxrpc_callback_ops = {
+ .notify_new_call = afs_rx_new_call,
+ .discard_new_call = afs_rx_discard_new_call,
+ .user_attach_call = afs_rx_attach,
+ .notify_oob = afs_rx_notify_oob,
+};
+
/* asynchronous incoming call initial processing */
static const struct afs_call_type afs_RXCMxxxx = {
.name = "CB.xxxx",
@@ -49,6 +58,7 @@ int afs_open_socket(struct afs_net *net)
goto error_1;
socket->sk->sk_allocation = GFP_NOFS;
+ socket->sk->sk_user_data = net;
/* bind the callback manager's address to make this a server socket */
memset(&srx, 0, sizeof(srx));
@@ -64,16 +74,24 @@ int afs_open_socket(struct afs_net *net)
if (ret < 0)
goto error_2;
- ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
+ ret = rxrpc_sock_set_manage_response(socket->sk, true);
+ if (ret < 0)
+ goto error_2;
+
+ ret = afs_create_token_key(net, socket);
+ if (ret < 0)
+ pr_err("Couldn't create RxGK CM key: %d\n", ret);
+
+ ret = kernel_bind(socket, (struct sockaddr_unsized *) &srx, sizeof(srx));
if (ret == -EADDRINUSE) {
srx.transport.sin6.sin6_port = 0;
- ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
+ ret = kernel_bind(socket, (struct sockaddr_unsized *) &srx, sizeof(srx));
}
if (ret < 0)
goto error_2;
srx.srx_service = YFS_CM_SERVICE;
- ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
+ ret = kernel_bind(socket, (struct sockaddr_unsized *) &srx, sizeof(srx));
if (ret < 0)
goto error_2;
@@ -84,8 +102,7 @@ int afs_open_socket(struct afs_net *net)
* it sends back to us.
*/
- rxrpc_kernel_new_call_notification(socket, afs_rx_new_call,
- afs_rx_discard_new_call);
+ rxrpc_kernel_set_notifications(socket, &afs_rxrpc_callback_ops);
ret = kernel_listen(socket, INT_MAX);
if (ret < 0)
@@ -125,7 +142,9 @@ void afs_close_socket(struct afs_net *net)
kernel_sock_shutdown(net->socket, SHUT_RDWR);
flush_workqueue(afs_async_calls);
+ net->socket->sk->sk_user_data = NULL;
sock_release(net->socket);
+ key_put(net->fs_cm_token_key);
_debug("dework");
_leave("");
@@ -738,7 +757,6 @@ void afs_charge_preallocation(struct work_struct *work)
if (rxrpc_kernel_charge_accept(net->socket,
afs_wake_up_async_call,
- afs_rx_attach,
(unsigned long)call,
GFP_KERNEL,
call->debug_id) < 0)
@@ -800,10 +818,14 @@ static int afs_deliver_cm_op_id(struct afs_call *call)
if (!afs_cm_incoming_call(call))
return -ENOTSUPP;
+ call->security_ix = rxrpc_kernel_query_call_security(call->rxcall,
+ &call->service_id,
+ &call->enctype);
+
trace_afs_cb_call(call);
call->work.func = call->type->work;
- /* pass responsibility for the remainer of this message off to the
+ /* pass responsibility for the remainder of this message off to the
* cache manager op */
return call->type->deliver(call);
}
@@ -952,3 +974,13 @@ noinline int afs_protocol_error(struct afs_call *call,
call->unmarshalling_error = true;
return -EBADMSG;
}
+
+/*
+ * Wake up OOB notification processing.
+ */
+static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob)
+{
+ struct afs_net *net = sk->sk_user_data;
+
+ schedule_work(&net->rx_oob_work);
+}
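
afs_rx_notify_oob() only recovers the owning afs_net from sk_user_data, which afs_open_socket() now sets, and defers the real work to a workqueue. A userspace analogue of the back-pointer pattern with stub types:

#include <stdio.h>

struct net_stub {
	const char *name;
	int oob_work_queued;
};

struct sock_stub {
	void *sk_user_data;
};

/* The callback recovers the owning net from the socket's user-data
 * slot; the heavy lifting happens later in afs_process_oob_queue(). */
static void rx_notify_oob(struct sock_stub *sk)
{
	struct net_stub *net = sk->sk_user_data;

	net->oob_work_queued = 1;	/* stands in for schedule_work() */
}

int main(void)
{
	struct net_stub net = { .name = "afs" };
	struct sock_stub sk = { .sk_user_data = &net };

	rx_notify_oob(&sk);
	printf("%s: oob work queued = %d\n", net.name, net.oob_work_queued);
	return 0;
}
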
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 6a7744c9e2a2..55ddce94af03 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -16,6 +16,31 @@
static DEFINE_HASHTABLE(afs_permits_cache, 10);
static DEFINE_SPINLOCK(afs_permits_lock);
+static DEFINE_MUTEX(afs_key_lock);
+
+/*
+ * Allocate a key to use as a placeholder for anonymous user security.
+ */
+static int afs_alloc_anon_key(struct afs_cell *cell)
+{
+ struct key *key;
+
+ mutex_lock(&afs_key_lock);
+ key = cell->anonymous_key;
+ if (!key) {
+ key = rxrpc_get_null_key(cell->key_desc);
+ if (!IS_ERR(key))
+ cell->anonymous_key = key;
+ }
+ mutex_unlock(&afs_key_lock);
+
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+
+ _debug("anon key %p{%x}",
+ cell->anonymous_key, key_serial(cell->anonymous_key));
+ return 0;
+}
/*
* get a key
@@ -23,11 +48,12 @@ static DEFINE_SPINLOCK(afs_permits_lock);
struct key *afs_request_key(struct afs_cell *cell)
{
struct key *key;
+ int ret;
- _enter("{%x}", key_serial(cell->anonymous_key));
+ _enter("{%s}", cell->key_desc);
- _debug("key %s", cell->anonymous_key->description);
- key = request_key_net(&key_type_rxrpc, cell->anonymous_key->description,
+ _debug("key %s", cell->key_desc);
+ key = request_key_net(&key_type_rxrpc, cell->key_desc,
cell->net->net, NULL);
if (IS_ERR(key)) {
if (PTR_ERR(key) != -ENOKEY) {
@@ -35,6 +61,12 @@ struct key *afs_request_key(struct afs_cell *cell)
return key;
}
+ if (!cell->anonymous_key) {
+ ret = afs_alloc_anon_key(cell);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ }
+
/* act as anonymous user */
_leave(" = {%x} [anon]", key_serial(cell->anonymous_key));
return key_get(cell->anonymous_key);
@@ -52,11 +84,10 @@ struct key *afs_request_key_rcu(struct afs_cell *cell)
{
struct key *key;
- _enter("{%x}", key_serial(cell->anonymous_key));
+ _enter("{%s}", cell->key_desc);
- _debug("key %s", cell->anonymous_key->description);
- key = request_key_net_rcu(&key_type_rxrpc,
- cell->anonymous_key->description,
+ _debug("key %s", cell->key_desc);
+ key = request_key_net_rcu(&key_type_rxrpc, cell->key_desc,
cell->net->net);
if (IS_ERR(key)) {
if (PTR_ERR(key) != -ENOKEY) {
@@ -65,6 +96,8 @@ struct key *afs_request_key_rcu(struct afs_cell *cell)
}
/* act as anonymous user */
+ if (!cell->anonymous_key)
+ return NULL; /* Need to allocate */
_leave(" = {%x} [anon]", key_serial(cell->anonymous_key));
return key_get(cell->anonymous_key);
} else {
@@ -408,7 +441,7 @@ int afs_permission(struct mnt_idmap *idmap, struct inode *inode,
if (mask & MAY_NOT_BLOCK) {
key = afs_request_key_rcu(vnode->volume->cell);
- if (IS_ERR(key))
+ if (IS_ERR_OR_NULL(key))
return -ECHILD;
ret = -ECHILD;
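
afs_alloc_anon_key() is a lazy, lock-protected allocation: afs_request_key() checks cell->anonymous_key outside the mutex and the allocator re-checks under it, so only one null key is ever created. A userspace sketch of the same double-checked pattern (malloc() stands in for rxrpc_get_null_key(); a real concurrent version would also want an atomic for the unlocked read):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t key_lock = PTHREAD_MUTEX_INITIALIZER;
static void *anon_key;

/* Check outside the lock, allocate under it, re-check inside: at most
 * one key is created however many callers race here. */
static void *get_anon_key(void)
{
	if (!anon_key) {
		pthread_mutex_lock(&key_lock);
		if (!anon_key)
			anon_key = malloc(1);	/* rxrpc_get_null_key() stand-in */
		pthread_mutex_unlock(&key_lock);
	}
	return anon_key;
}

int main(void)
{
	printf("key=%p\n", get_anon_key());
	return 0;
}
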
diff --git a/fs/afs/server.c b/fs/afs/server.c
index c530d1ca15df..c4428ebddb1d 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -131,6 +131,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell, const uuid_t *
timer_setup(&server->timer, afs_server_timer, 0);
INIT_LIST_HEAD(&server->volumes);
init_waitqueue_head(&server->probe_wq);
+ mutex_init(&server->cm_token_lock);
INIT_LIST_HEAD(&server->probe_link);
INIT_HLIST_NODE(&server->proc_link);
spin_lock_init(&server->probe_lock);
@@ -318,7 +319,7 @@ struct afs_server *afs_use_server(struct afs_server *server, bool activate,
a = atomic_inc_return(&server->active);
if (a == 1 && activate &&
!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags))
- del_timer(&server->timer);
+ timer_delete(&server->timer);
trace_afs_server(server->debug_id, r + 1, a, reason);
return server;
@@ -330,13 +331,14 @@ struct afs_server *afs_use_server(struct afs_server *server, bool activate,
void afs_put_server(struct afs_net *net, struct afs_server *server,
enum afs_server_trace reason)
{
- unsigned int a, debug_id = server->debug_id;
+ unsigned int a, debug_id;
bool zero;
int r;
if (!server)
return;
+ debug_id = server->debug_id;
a = atomic_read(&server->active);
zero = __refcount_dec_and_test(&server->ref, &r);
trace_afs_server(debug_id, r - 1, a, reason);
@@ -396,6 +398,7 @@ static void afs_server_rcu(struct rcu_head *rcu)
afs_put_endpoint_state(rcu_access_pointer(server->endpoint_state),
afs_estate_trace_put_server);
afs_put_cell(server->cell, afs_cell_trace_put_server);
+ kfree(server->cm_rxgk_appdata.data);
kfree(server);
}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 25b306db6992..d672b7ab57ae 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -290,7 +290,7 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param)
/* lookup the cell record */
if (cellname) {
cell = afs_lookup_cell(ctx->net, cellname, cellnamesz,
- NULL, false,
+ NULL, AFS_LOOKUP_CELL_DIRECT_MOUNT,
afs_cell_trace_use_lookup_mount);
if (IS_ERR(cell)) {
pr_err("kAFS: unable to lookup cell '%*.*s'\n",
@@ -483,9 +483,9 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
goto error;
if (as->dyn_root) {
- sb->s_d_op = &afs_dynroot_dentry_operations;
+ set_default_d_op(sb, &afs_dynroot_dentry_operations);
} else {
- sb->s_d_op = &afs_fs_dentry_operations;
+ set_default_d_op(sb, &afs_fs_dentry_operations);
rcu_assign_pointer(as->volume->sb, sb);
}
diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c
index 709b4cdb723e..fc9676abd252 100644
--- a/fs/afs/vl_alias.c
+++ b/fs/afs/vl_alias.c
@@ -269,7 +269,8 @@ static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key)
if (!name_len || name_len > AFS_MAXCELLNAME)
master = ERR_PTR(-EOPNOTSUPP);
else
- master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false,
+ master = afs_lookup_cell(cell->net, cell_name, name_len, NULL,
+ AFS_LOOKUP_CELL_ALIAS_CHECK,
afs_cell_trace_use_lookup_canonical);
kfree(cell_name);
if (IS_ERR(master))
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 18b0a9f1615e..93ad86ff3345 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -120,17 +120,17 @@ static void afs_issue_write_worker(struct work_struct *work)
#if 0 // Error injection
if (subreq->debug_index == 3)
- return netfs_write_subrequest_terminated(subreq, -ENOANO, false);
+ return netfs_write_subrequest_terminated(subreq, -ENOANO);
if (!subreq->retry_count) {
set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
- return netfs_write_subrequest_terminated(subreq, -EAGAIN, false);
+ return netfs_write_subrequest_terminated(subreq, -EAGAIN);
}
#endif
op = afs_alloc_operation(wreq->netfs_priv, vnode->volume);
if (IS_ERR(op))
- return netfs_write_subrequest_terminated(subreq, -EAGAIN, false);
+ return netfs_write_subrequest_terminated(subreq, -EAGAIN);
afs_op_set_vnode(op, 0, vnode);
op->file[0].dv_delta = 1;
@@ -166,13 +166,13 @@ static void afs_issue_write_worker(struct work_struct *work)
break;
}
- netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len, false);
+ netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len);
}
void afs_issue_write(struct netfs_io_subrequest *subreq)
{
subreq->work.func = afs_issue_write_worker;
- if (!queue_work(system_unbound_wq, &subreq->work))
+ if (!queue_work(system_dfl_wq, &subreq->work))
WARN_ON_ONCE(1);
}
@@ -202,6 +202,7 @@ void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *st
case NETFS_READ_GAPS:
case NETFS_READ_SINGLE:
case NETFS_READ_FOR_WRITE:
+ case NETFS_UNBUFFERED_READ:
case NETFS_DIO_READ:
return;
default:
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 257af259c04a..febf13a49f0b 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -1042,6 +1042,9 @@ void yfs_fs_rename(struct afs_operation *op)
_enter("");
+ if (!test_bit(AFS_SERVER_FL_NO_RENAME2, &op->server->flags))
+ return yfs_fs_rename_replace(op);
+
call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename,
sizeof(__be32) +
sizeof(struct yfs_xdr_RPCFlags) +
@@ -1071,6 +1074,252 @@ void yfs_fs_rename(struct afs_operation *op)
}
/*
+ * Deliver reply data to a YFS.Rename_NoReplace operation. This does not
+ * return the status of a displaced target inode as there cannot be one.
+ */
+static int yfs_deliver_fs_rename_1(struct afs_call *call)
+{
+ struct afs_operation *op = call->op;
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ struct afs_vnode_param *old_vp = &op->more_files[0];
+ const __be32 *bp;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ bp = call->buffer;
+ /* If the two dirs are the same, we have two copies of the same status
+ * report, so we just decode it twice.
+ */
+ xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb);
+ xdr_decode_YFSFid(&bp, &old_vp->fid);
+ xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb);
+ xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb);
+ xdr_decode_YFSVolSync(&bp, &op->volsync);
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * Deliver reply data to a YFS.Rename_Replace or a YFS.Rename_Exchange
+ * operation. These return the status of the displaced target inode if there
+ * was one.
+ */
+static int yfs_deliver_fs_rename_2(struct afs_call *call)
+{
+ struct afs_operation *op = call->op;
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ struct afs_vnode_param *old_vp = &op->more_files[0];
+ struct afs_vnode_param *new_vp = &op->more_files[1];
+ const __be32 *bp;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ bp = call->buffer;
+ /* If the two dirs are the same, we have two copies of the same status
+ * report, so we just decode it twice.
+ */
+ xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb);
+ xdr_decode_YFSFid(&bp, &old_vp->fid);
+ xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb);
+ xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb);
+ xdr_decode_YFSFid(&bp, &new_vp->fid);
+ xdr_decode_YFSFetchStatus(&bp, call, &new_vp->scb);
+ xdr_decode_YFSVolSync(&bp, &op->volsync);
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+static void yfs_done_fs_rename_replace(struct afs_call *call)
+{
+ if (call->error == -ECONNABORTED &&
+ (call->abort_code == RX_INVALID_OPERATION ||
+ call->abort_code == RXGEN_OPCODE)) {
+ set_bit(AFS_SERVER_FL_NO_RENAME2, &call->op->server->flags);
+ call->op->flags |= AFS_OPERATION_DOWNGRADE;
+ }
+}
+
+/*
+ * YFS.Rename_Replace operation type
+ */
+static const struct afs_call_type yfs_RXYFSRename_Replace = {
+ .name = "FS.Rename_Replace",
+ .op = yfs_FS_Rename_Replace,
+ .deliver = yfs_deliver_fs_rename_2,
+ .done = yfs_done_fs_rename_replace,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * YFS.Rename_NoReplace operation type
+ */
+static const struct afs_call_type yfs_RXYFSRename_NoReplace = {
+ .name = "FS.Rename_NoReplace",
+ .op = yfs_FS_Rename_NoReplace,
+ .deliver = yfs_deliver_fs_rename_1,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * YFS.Rename_Exchange operation type
+ */
+static const struct afs_call_type yfs_RXYFSRename_Exchange = {
+ .name = "FS.Rename_Exchange",
+ .op = yfs_FS_Rename_Exchange,
+ .deliver = yfs_deliver_fs_rename_2,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Rename a file or directory, replacing the target if it exists. The status
+ * of a displaced target is returned.
+ */
+void yfs_fs_rename_replace(struct afs_operation *op)
+{
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ const struct qstr *orig_name = &op->dentry->d_name;
+ const struct qstr *new_name = &op->dentry_2->d_name;
+ struct afs_call *call;
+ __be32 *bp;
+
+ _enter("");
+
+ call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Replace,
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_RPCFlags) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(orig_name->len) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(new_name->len),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return afs_op_nomem(op);
+
+ /* Marshall the parameters. */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSRENAME_REPLACE);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &orig_dvp->fid);
+ bp = xdr_encode_name(bp, orig_name);
+ bp = xdr_encode_YFSFid(bp, &new_dvp->fid);
+ bp = xdr_encode_name(bp, new_name);
+ yfs_check_req(call, bp);
+
+ call->fid = orig_dvp->fid;
+ trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
+ afs_make_op_call(op, call, GFP_NOFS);
+}
+
+/*
+ * Rename a file or directory, failing if the target dirent exists.
+ */
+void yfs_fs_rename_noreplace(struct afs_operation *op)
+{
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ const struct qstr *orig_name = &op->dentry->d_name;
+ const struct qstr *new_name = &op->dentry_2->d_name;
+ struct afs_call *call;
+ __be32 *bp;
+
+ _enter("");
+
+ call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_NoReplace,
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_RPCFlags) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(orig_name->len) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(new_name->len),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return afs_op_nomem(op);
+
+ /* Marshall the parameters. */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSRENAME_NOREPLACE);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &orig_dvp->fid);
+ bp = xdr_encode_name(bp, orig_name);
+ bp = xdr_encode_YFSFid(bp, &new_dvp->fid);
+ bp = xdr_encode_name(bp, new_name);
+ yfs_check_req(call, bp);
+
+ call->fid = orig_dvp->fid;
+ trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
+ afs_make_op_call(op, call, GFP_NOFS);
+}
+
+/*
+ * Exchange a pair of files or directories.
+ */
+void yfs_fs_rename_exchange(struct afs_operation *op)
+{
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ const struct qstr *orig_name = &op->dentry->d_name;
+ const struct qstr *new_name = &op->dentry_2->d_name;
+ struct afs_call *call;
+ __be32 *bp;
+
+ _enter("");
+
+ call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Exchange,
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_RPCFlags) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(orig_name->len) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(new_name->len),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return afs_op_nomem(op);
+
+ /* Marshall the parameters. */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSRENAME_EXCHANGE);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &orig_dvp->fid);
+ bp = xdr_encode_name(bp, orig_name);
+ bp = xdr_encode_YFSFid(bp, &new_dvp->fid);
+ bp = xdr_encode_name(bp, new_name);
+ yfs_check_req(call, bp);
+
+ call->fid = orig_dvp->fid;
+ trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
+ afs_make_op_call(op, call, GFP_NOFS);
+}
+
+/*
* YFS.StoreData64 operation type.
*/
static const struct afs_call_type yfs_RXYFSStoreData64 = {
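
The request and reply sizes passed to afs_alloc_flat_call() in the three rename RPCs are sums of fixed-size XDR structs plus padded string lengths. A sketch of the string sizing, assuming xdr_strlen() is a 4-byte length word plus the data rounded up to a 4-byte boundary:

#include <stdio.h>

/* Mirrors what the kernel's xdr_strlen() feeds into the
 * afs_alloc_flat_call() size sums above. */
static unsigned int xdr_strlen(unsigned int len)
{
	return 4 + ((len + 3) & ~3u);
}

int main(void)
{
	printf("%u\n", xdr_strlen(5));	/* 12: length word + 8 padded bytes */
	return 0;
}
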
diff --git a/fs/aio.c b/fs/aio.c
index 7b976b564cfc..0a23a8c0717f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -392,15 +392,15 @@ static const struct vm_operations_struct aio_ring_vm_ops = {
#endif
};
-static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+static int aio_ring_mmap_prepare(struct vm_area_desc *desc)
{
- vm_flags_set(vma, VM_DONTEXPAND);
- vma->vm_ops = &aio_ring_vm_ops;
+ desc->vm_flags |= VM_DONTEXPAND;
+ desc->vm_ops = &aio_ring_vm_ops;
return 0;
}
static const struct file_operations aio_ring_fops = {
- .mmap = aio_ring_mmap,
+ .mmap_prepare = aio_ring_mmap_prepare,
};
#if IS_ENABLED(CONFIG_MIGRATION)
@@ -445,7 +445,7 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
folio_get(dst);
rc = folio_migrate_mapping(mapping, dst, src, 1);
- if (rc != MIGRATEPAGE_SUCCESS) {
+ if (rc) {
folio_put(dst);
goto out_unlock;
}
@@ -636,7 +636,7 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
/* Synchronize against RCU protected table->table[] dereferences */
INIT_RCU_WORK(&ctx->free_rwork, free_ioctx);
- queue_rcu_work(system_wq, &ctx->free_rwork);
+ queue_rcu_work(system_percpu_wq, &ctx->free_rwork);
}
/*
@@ -1511,6 +1511,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type)
{
int ret;
+ req->ki_write_stream = 0;
req->ki_complete = aio_complete_rw;
req->private = NULL;
req->ki_pos = iocb->aio_offset;
@@ -1639,10 +1640,10 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
static void aio_fsync_work(struct work_struct *work)
{
struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work);
- const struct cred *old_cred = override_creds(iocb->fsync.creds);
- iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync);
- revert_creds(old_cred);
+ scoped_with_creds(iocb->fsync.creds)
+ iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync);
+
put_cred(iocb->fsync.creds);
iocb_put(iocb);
}
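
The aio_fsync_work() change replaces a manual override_creds()/revert_creds() pair with scoped_with_creds(), which reverts automatically when the block ends. A userspace analogue built on GNU C's cleanup attribute; the names and the credential model here are illustrative only:

#include <stdio.h>

static int current_cred;		/* stands in for the task's creds */

static void revert_cred(int *old)
{
	current_cred = *old;
}

/* One-iteration for-loop whose control variable carries a cleanup
 * handler, so the override is reverted however the block is left. */
#define scoped_with_cred(cred)						\
	for (int _old __attribute__((cleanup(revert_cred))) = current_cred, \
	     _once = (current_cred = (cred), 1); _once; _once = 0)

int main(void)
{
	scoped_with_cred(42)
		printf("inside: %d\n", current_cred);	/* 42 */
	printf("after:  %d\n", current_cred);		/* 0 */
	return 0;
}
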
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 583ac81669c2..b8381c7fb636 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -24,10 +24,51 @@
#include <linux/uaccess.h>
+#include "internal.h"
+
static struct vfsmount *anon_inode_mnt __ro_after_init;
static struct inode *anon_inode_inode __ro_after_init;
/*
+ * User space expects anonymous inodes to have no file type in st_mode.
+ *
+ * In particular, 'lsof' has this legacy logic:
+ *
+ * type = s->st_mode & S_IFMT;
+ * switch (type) {
+ * ...
+ * case 0:
+ * if (!strcmp(p, "anon_inode"))
+ * Lf->ntype = Ntype = N_ANON_INODE;
+ *
+ * to detect our old anon_inode logic.
+ *
+ * Rather than mess with our internal sane inode data, just fix it
+ * up here in getattr() by masking off the format bits.
+ */
+int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ stat->mode &= ~S_IFMT;
+ return 0;
+}
+
+int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
+{
+ return -EOPNOTSUPP;
+}
+
+static const struct inode_operations anon_inode_operations = {
+ .getattr = anon_inode_getattr,
+ .setattr = anon_inode_setattr,
+};
+
+/*
* anon_inodefs_dname() is called from d_path().
*/
static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
@@ -45,6 +86,8 @@ static int anon_inodefs_init_fs_context(struct fs_context *fc)
struct pseudo_fs_context *ctx = init_pseudo(fc, ANON_INODE_FS_MAGIC);
if (!ctx)
return -ENOMEM;
+ fc->s_iflags |= SB_I_NOEXEC;
+ fc->s_iflags |= SB_I_NODEV;
ctx->dops = &anon_inodefs_dentry_operations;
return 0;
}
@@ -55,17 +98,29 @@ static struct file_system_type anon_inode_fs_type = {
.kill_sb = kill_anon_super,
};
-static struct inode *anon_inode_make_secure_inode(
- const char *name,
- const struct inode *context_inode)
+/**
+ * anon_inode_make_secure_inode - allocate an anonymous inode with security context
+ * @sb: [in] Superblock to allocate from
+ * @name: [in] Name of the class of the newfile (e.g., "secretmem")
+ * @context_inode:
+ * [in] Optional parent inode for security inheritance
+ *
+ * The function ensures proper security initialization through the LSM hook
+ * security_inode_init_security_anon().
+ *
+ * Return: Pointer to new inode on success, ERR_PTR on failure.
+ */
+struct inode *anon_inode_make_secure_inode(struct super_block *sb, const char *name,
+ const struct inode *context_inode)
{
struct inode *inode;
int error;
- inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
+ inode = alloc_anon_inode(sb);
if (IS_ERR(inode))
return inode;
inode->i_flags &= ~S_PRIVATE;
+ inode->i_op = &anon_inode_operations;
error = security_inode_init_security_anon(inode, &QSTR(name),
context_inode);
if (error) {
@@ -74,6 +129,7 @@ static struct inode *anon_inode_make_secure_inode(
}
return inode;
}
+EXPORT_SYMBOL_FOR_MODULES(anon_inode_make_secure_inode, "kvm");
static struct file *__anon_inode_getfile(const char *name,
const struct file_operations *fops,
@@ -88,7 +144,8 @@ static struct file *__anon_inode_getfile(const char *name,
return ERR_PTR(-ENOENT);
if (make_inode) {
- inode = anon_inode_make_secure_inode(name, context_inode);
+ inode = anon_inode_make_secure_inode(anon_inode_mnt->mnt_sb,
+ name, context_inode);
if (IS_ERR(inode)) {
file = ERR_CAST(inode);
goto err;
@@ -223,27 +280,8 @@ static int __anon_inode_getfd(const char *name,
const struct inode *context_inode,
bool make_inode)
{
- int error, fd;
- struct file *file;
-
- error = get_unused_fd_flags(flags);
- if (error < 0)
- return error;
- fd = error;
-
- file = __anon_inode_getfile(name, fops, priv, flags, context_inode,
- make_inode);
- if (IS_ERR(file)) {
- error = PTR_ERR(file);
- goto err_put_unused_fd;
- }
- fd_install(fd, file);
-
- return fd;
-
-err_put_unused_fd:
- put_unused_fd(fd);
- return error;
+ return FD_ADD(flags, __anon_inode_getfile(name, fops, priv, flags,
+ context_inode, make_inode));
}
/**
@@ -313,6 +351,7 @@ static int __init anon_inode_init(void)
anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
if (IS_ERR(anon_inode_inode))
panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode));
+ anon_inode_inode->i_op = &anon_inode_operations;
return 0;
}
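
The effect of the new getattr() is that stat() on an anonymous inode reports a zero file type while keeping the permission bits. A small demonstration of the masking:

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	mode_t mode = S_IFREG | 0600;	/* as the inode stores it */

	mode &= ~S_IFMT;		/* as the new getattr() reports it */
	printf("type=%o perms=%o\n", (unsigned)(mode & S_IFMT),
	       (unsigned)(mode & 07777));	/* type=0 perms=600 */
	return 0;
}
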
diff --git a/fs/attr.c b/fs/attr.c
index 9caf63d20d03..b9ec6b47bab2 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -230,7 +230,7 @@ EXPORT_SYMBOL(setattr_prepare);
* @inode: the inode to be truncated
* @offset: the new size to assign to the inode
*
- * inode_newsize_ok must be called with i_mutex held.
+ * inode_newsize_ok must be called with i_rwsem held exclusively.
*
* inode_newsize_ok will check filesystem limits and ulimits to check that the
* new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
@@ -286,20 +286,12 @@ static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr)
unsigned int ia_valid = attr->ia_valid;
struct timespec64 now;
- if (ia_valid & ATTR_CTIME) {
- /*
- * In the case of an update for a write delegation, we must respect
- * the value in ia_ctime and not use the current time.
- */
- if (ia_valid & ATTR_DELEG)
- now = inode_set_ctime_deleg(inode, attr->ia_ctime);
- else
- now = inode_set_ctime_current(inode);
- } else {
- /* If ATTR_CTIME isn't set, then ATTR_MTIME shouldn't be either. */
- WARN_ON_ONCE(ia_valid & ATTR_MTIME);
+ if (ia_valid & ATTR_CTIME_SET)
+ now = inode_set_ctime_deleg(inode, attr->ia_ctime);
+ else if (ia_valid & ATTR_CTIME)
+ now = inode_set_ctime_current(inode);
+ else
now = current_time(inode);
- }
if (ia_valid & ATTR_ATIME_SET)
inode_set_atime_to_ts(inode, attr->ia_atime);
@@ -318,7 +310,7 @@ static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr)
* @inode: the inode to be updated
* @attr: the new attributes
*
- * setattr_copy must be called with i_mutex held.
+ * setattr_copy must be called with i_rwsem held exclusively.
*
* setattr_copy updates the inode's metadata with that specified
* in attr on idmapped mounts. Necessary permission checks to determine
@@ -359,12 +351,11 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode,
inode_set_atime_to_ts(inode, attr->ia_atime);
if (ia_valid & ATTR_MTIME)
inode_set_mtime_to_ts(inode, attr->ia_mtime);
- if (ia_valid & ATTR_CTIME) {
- if (ia_valid & ATTR_DELEG)
- inode_set_ctime_deleg(inode, attr->ia_ctime);
- else
- inode_set_ctime_to_ts(inode, attr->ia_ctime);
- }
+
+ if (ia_valid & ATTR_CTIME_SET)
+ inode_set_ctime_deleg(inode, attr->ia_ctime);
+ else if (ia_valid & ATTR_CTIME)
+ inode_set_ctime_to_ts(inode, attr->ia_ctime);
}
EXPORT_SYMBOL(setattr_copy);
@@ -403,13 +394,13 @@ EXPORT_SYMBOL(may_setattr);
* @attr: new attributes
* @delegated_inode: returns inode, if the inode is delegated
*
- * The caller must hold the i_mutex on the affected object.
+ * The caller must hold the i_rwsem exclusively on the affected object.
*
* If notify_change discovers a delegation in need of breaking,
* it will return -EWOULDBLOCK and return a reference to the inode in
* delegated_inode. The caller should then break the delegation and
* retry. Because breaking a delegation may take a long time, the
- * caller should drop the i_mutex before doing so.
+ * caller should drop the i_rwsem before doing so.
*
* Alternatively, a caller may pass NULL for delegated_inode. This may
* be appropriate for callers that expect the underlying filesystem not
@@ -424,7 +415,7 @@ EXPORT_SYMBOL(may_setattr);
* performed on the raw inode simply pass @nop_mnt_idmap.
*/
int notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
- struct iattr *attr, struct inode **delegated_inode)
+ struct iattr *attr, struct delegated_inode *delegated_inode)
{
struct inode *inode = dentry->d_inode;
umode_t mode = inode->i_mode;
@@ -456,22 +447,25 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
- /* Flag setting protected by i_mutex */
+ /* Flag setting protected by i_rwsem */
if (is_sxid(attr->ia_mode))
inode->i_flags &= ~S_NOSEC;
}
now = current_time(inode);
- attr->ia_ctime = now;
- if (!(ia_valid & ATTR_ATIME_SET))
- attr->ia_atime = now;
- else
+ if (ia_valid & ATTR_ATIME_SET)
attr->ia_atime = timestamp_truncate(attr->ia_atime, inode);
- if (!(ia_valid & ATTR_MTIME_SET))
- attr->ia_mtime = now;
else
+ attr->ia_atime = now;
+ if (ia_valid & ATTR_CTIME_SET)
+ attr->ia_ctime = timestamp_truncate(attr->ia_ctime, inode);
+ else
+ attr->ia_ctime = now;
+ if (ia_valid & ATTR_MTIME_SET)
attr->ia_mtime = timestamp_truncate(attr->ia_mtime, inode);
+ else
+ attr->ia_mtime = now;
if (ia_valid & ATTR_KILL_PRIV) {
error = security_inode_need_killpriv(dentry);
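
[Editor's note] The attr.c changes replace the ATTR_DELEG special-casing with a three-way precedence keyed on ATTR_CTIME_SET: an explicitly supplied (delegated) ctime wins, then a request for the current ctime, then a plain timestamp snapshot. The hypothetical helper pick_ctime() below condenses the order used in setattr_copy_mgtime() above; it is not a helper added by this patch, and notify_change() additionally runs the caller-supplied value through timestamp_truncate().

#include <linux/fs.h>

/* Hypothetical condensation of the ctime selection order above. */
static struct timespec64 pick_ctime(struct inode *inode,
				    const struct iattr *attr)
{
	if (attr->ia_valid & ATTR_CTIME_SET)		/* delegated timestamp */
		return inode_set_ctime_deleg(inode, attr->ia_ctime);
	if (attr->ia_valid & ATTR_CTIME)		/* "now", multigrain-aware */
		return inode_set_ctime_current(inode);
	return current_time(inode);			/* no ctime update requested */
}
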
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 23cea74f9933..4fd555528c5d 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -16,6 +16,7 @@
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
+#include <uapi/linux/mount.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/uaccess.h>
@@ -27,6 +28,9 @@
#include <linux/magic.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
+#include "../mount.h"
+#include <linux/ns_common.h>
+
/* This is the range of ioctl() numbers we claim as ours */
#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY
@@ -114,6 +118,7 @@ struct autofs_sb_info {
int pipefd;
struct file *pipe;
struct pid *oz_pgrp;
+ u64 mnt_ns_id;
int version;
int sub_version;
int min_proto;
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index c5a6aae12d2c..6743b3b64217 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -231,32 +231,14 @@ static int test_by_type(const struct path *path, void *p)
*/
static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)
{
- int err, fd;
-
- fd = get_unused_fd_flags(O_CLOEXEC);
- if (likely(fd >= 0)) {
- struct file *filp;
- struct path path;
-
- err = find_autofs_mount(name, &path, test_by_dev, &devid);
- if (err)
- goto out;
-
- filp = dentry_open(&path, O_RDONLY, current_cred());
- path_put(&path);
- if (IS_ERR(filp)) {
- err = PTR_ERR(filp);
- goto out;
- }
-
- fd_install(fd, filp);
- }
+ struct path path __free(path_put) = {};
+ int err;
- return fd;
+ err = find_autofs_mount(name, &path, test_by_dev, &devid);
+ if (err)
+ return err;
-out:
- put_unused_fd(fd);
- return err;
+ return FD_ADD(O_CLOEXEC, dentry_open(&path, O_RDONLY, current_cred()));
}
/* Open a file descriptor on an autofs mount point */
@@ -381,6 +363,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
swap(sbi->oz_pgrp, new_pid);
sbi->pipefd = pipefd;
sbi->pipe = pipe;
+ sbi->mnt_ns_id = to_ns_common(current->nsproxy->mnt_ns)->ns_id;
sbi->flags &= ~AUTOFS_SBI_CATATONIC;
}
out:
@@ -449,17 +432,8 @@ static int autofs_dev_ioctl_timeout(struct file *fp,
if (!autofs_type_indirect(sbi->type))
return -EINVAL;
- /* An expire timeout greater than the superblock timeout
- * could be a problem at shutdown but the super block
- * timeout itself can change so all we can really do is
- * warn the user.
- */
- if (timeout >= sbi->exp_timeout)
- pr_warn("per-mount expire timeout is greater than "
- "the parent autofs mount timeout which could "
- "prevent shutdown\n");
-
- dentry = try_lookup_one_len(param->path, base, path_len);
+ dentry = try_lookup_noperm(&QSTR_LEN(param->path, path_len),
+ base);
if (IS_ERR_OR_NULL(dentry))
return dentry ? PTR_ERR(dentry) : -ENOENT;
ino = autofs_dentry_ino(dentry);
@@ -486,6 +460,18 @@ static int autofs_dev_ioctl_timeout(struct file *fp,
ino->flags |= AUTOFS_INF_EXPIRE_SET;
ino->exp_timeout = timeout * HZ;
}
+
+ /* An expire timeout greater than the superblock timeout
+ * could be a problem at shutdown but the super block
+ * timeout itself can change so all we can really do is
+ * warn the user.
+ */
+ if (ino->flags & AUTOFS_INF_EXPIRE_SET &&
+ ino->exp_timeout > sbi->exp_timeout)
+ pr_warn("per-mount expire timeout is greater than "
+ "the parent autofs mount timeout which could "
+ "prevent shutdown\n");
+
dput(dentry);
}
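
[Editor's note] autofs_dev_ioctl_open_mountpoint() now combines two idioms: scope-based cleanup via __free(path_put), which guarantees the path reference is dropped on every return, and FD_ADD() for descriptor installation as above. The timeout ioctl change also moves the shutdown warning so it fires only after the per-dentry timeout has actually been set, and only when that timeout really exceeds the superblock timeout. A minimal sketch of the cleanup idiom, assuming the DEFINE_FREE(path_put, ...) declaration from <linux/path.h>; the resolve/use callbacks are purely illustrative:

#include <linux/cleanup.h>
#include <linux/path.h>

/*
 * Sketch of the scope-based cleanup used above. path_put() runs
 * automatically when 'path' leaves scope; on the zero-initialized
 * error path it degenerates to dput(NULL)/mntput(NULL), which is safe.
 */
static int with_resolved_path(int (*resolve)(struct path *),
			      int (*use)(const struct path *))
{
	struct path path __free(path_put) = {};
	int err;

	err = resolve(&path);		/* hypothetical lookup callback */
	if (err)
		return err;		/* cleanup still runs here */

	return use(&path);		/* no explicit path_put() anywhere */
}
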
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index ee2edccaef70..b932b1719dfc 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -55,7 +55,7 @@ void autofs_kill_sb(struct super_block *sb)
}
pr_debug("shutting down\n");
- kill_litter_super(sb);
+ kill_anon_super(sb);
if (sbi)
kfree_rcu(sbi, rcu);
}
@@ -251,6 +251,7 @@ static struct autofs_sb_info *autofs_alloc_sbi(void)
sbi->min_proto = AUTOFS_MIN_PROTO_VERSION;
sbi->max_proto = AUTOFS_MAX_PROTO_VERSION;
sbi->pipefd = -1;
+ sbi->mnt_ns_id = to_ns_common(current->nsproxy->mnt_ns)->ns_id;
set_autofs_type_indirect(&sbi->type);
mutex_init(&sbi->wq_mutex);
@@ -311,7 +312,7 @@ static int autofs_fill_super(struct super_block *s, struct fs_context *fc)
s->s_blocksize_bits = 10;
s->s_magic = AUTOFS_SUPER_MAGIC;
s->s_op = &autofs_sops;
- s->s_d_op = &autofs_dentry_operations;
+ set_default_d_op(s, &autofs_dentry_operations);
s->s_time_gran = 1;
/*
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 174c7205fee4..2c31002b314a 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -341,6 +341,14 @@ static struct vfsmount *autofs_d_automount(struct path *path)
if (autofs_oz_mode(sbi))
return NULL;
+ /* Refuse to trigger mount if current namespace is not the owner
+ * and the mount is propagation private.
+ */
+ if (sbi->mnt_ns_id != to_ns_common(current->nsproxy->mnt_ns)->ns_id) {
+ if (vfsmount_to_propagation_flags(path->mnt) & MS_PRIVATE)
+ return ERR_PTR(-EPERM);
+ }
+
/*
* If an expire request is pending everyone must wait.
* If the expire fails we're still mounted so continue
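
[Editor's note] The new check above refuses to trigger an automount from a foreign mount namespace when the mount's propagation is private, since the resulting mount could never propagate back to the namespace that owns the autofs daemon. A hypothetical condensation of the inline test (autofs_may_automount() is not a real helper; the functions it calls all appear in this diff):

/* Hypothetical condensation of the ownership check added above. */
static bool autofs_may_automount(struct autofs_sb_info *sbi,
				 const struct path *path)
{
	/* the owning mount namespace may always trigger mounts */
	if (sbi->mnt_ns_id == to_ns_common(current->nsproxy->mnt_ns)->ns_id)
		return true;

	/* foreign namespace: allow only if propagation can reach the owner */
	return !(vfsmount_to_propagation_flags(path->mnt) & MS_PRIVATE);
}
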
@@ -594,9 +602,8 @@ static int autofs_dir_symlink(struct mnt_idmap *idmap,
}
inode->i_private = cp;
inode->i_size = size;
- d_add(dentry, inode);
- dget(dentry);
+ d_make_persistent(dentry, inode);
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count++;
@@ -623,12 +630,11 @@ static int autofs_dir_symlink(struct mnt_idmap *idmap,
static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
- struct autofs_info *ino = autofs_dentry_ino(dentry);
struct autofs_info *p_ino;
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count--;
- dput(ino->dentry);
+ d_make_discardable(dentry);
d_inode(dentry)->i_size = 0;
clear_nlink(d_inode(dentry));
@@ -710,7 +716,7 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count--;
- dput(ino->dentry);
+ d_make_discardable(dentry);
d_inode(dentry)->i_size = 0;
clear_nlink(d_inode(dentry));
@@ -740,12 +746,11 @@ static struct dentry *autofs_dir_mkdir(struct mnt_idmap *idmap,
inode = autofs_get_inode(dir->i_sb, S_IFDIR | mode);
if (!inode)
return ERR_PTR(-ENOMEM);
- d_add(dentry, inode);
if (sbi->version < 5)
autofs_set_leaf_automount_flags(dentry);
- dget(dentry);
+ d_make_persistent(dentry, inode);
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count++;
inc_nlink(dir);
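
[Editor's note] autofs pins the dentries it creates for symlinks and directories by holding an extra reference, dropped again on unlink/rmdir. The new d_make_persistent()/d_make_discardable() helpers bundle the two halves of that pattern; their definitions are not in this excerpt. An illustrative sketch of the pairing visible in the removed lines (the sketch_* names are hypothetical and the real helpers may do more):

static void sketch_make_persistent(struct dentry *dentry, struct inode *inode)
{
	d_add(dentry, inode);	/* attach the inode and hash the dentry */
	dget(dentry);		/* extra ref pins it in the dcache ... */
}

static void sketch_make_discardable(struct dentry *dentry)
{
	dput(dentry);		/* ... dropped here so it can be evicted */
}
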
diff --git a/fs/backing-file.c b/fs/backing-file.c
index 763fbe9b72b2..45da8600d564 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -41,7 +41,7 @@ struct file *backing_file_open(const struct path *user_path, int flags,
return f;
path_get(user_path);
- *backing_file_user_path(f) = *user_path;
+ backing_file_set_user_path(f, user_path);
error = vfs_open(real_path, f);
if (error) {
fput(f);
@@ -65,7 +65,7 @@ struct file *backing_tmpfile_open(const struct path *user_path, int flags,
return f;
path_get(user_path);
- *backing_file_user_path(f) = *user_path;
+ backing_file_set_user_path(f, user_path);
error = vfs_tmpfile(real_idmap, real_parentpath, f, mode);
if (error) {
fput(f);
@@ -157,13 +157,37 @@ static int backing_aio_init_wq(struct kiocb *iocb)
return sb_init_dio_done_wq(sb);
}
+static int do_backing_file_read_iter(struct file *file, struct iov_iter *iter,
+ struct kiocb *iocb, int flags)
+{
+ struct backing_aio *aio = NULL;
+ int ret;
+
+ if (is_sync_kiocb(iocb)) {
+ rwf_t rwf = iocb_to_rw_flags(flags);
+
+ return vfs_iter_read(file, iter, &iocb->ki_pos, rwf);
+ }
+
+ aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
+ if (!aio)
+ return -ENOMEM;
+
+ aio->orig_iocb = iocb;
+ kiocb_clone(&aio->iocb, iocb, get_file(file));
+ aio->iocb.ki_complete = backing_aio_rw_complete;
+ refcount_set(&aio->ref, 2);
+ ret = vfs_iocb_iter_read(file, &aio->iocb, iter);
+ backing_aio_put(aio);
+ if (ret != -EIOCBQUEUED)
+ backing_aio_cleanup(aio, ret);
+ return ret;
+}
ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
struct kiocb *iocb, int flags,
struct backing_file_ctx *ctx)
{
- struct backing_aio *aio = NULL;
- const struct cred *old_cred;
ssize_t ret;
if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
@@ -176,41 +200,57 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
!(file->f_mode & FMODE_CAN_ODIRECT))
return -EINVAL;
- old_cred = override_creds(ctx->cred);
+ scoped_with_creds(ctx->cred)
+ ret = do_backing_file_read_iter(file, iter, iocb, flags);
+
+ if (ctx->accessed)
+ ctx->accessed(iocb->ki_filp);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_read_iter);
+
+static int do_backing_file_write_iter(struct file *file, struct iov_iter *iter,
+ struct kiocb *iocb, int flags,
+ void (*end_write)(struct kiocb *, ssize_t))
+{
+ struct backing_aio *aio;
+ int ret;
+
if (is_sync_kiocb(iocb)) {
rwf_t rwf = iocb_to_rw_flags(flags);
- ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf);
- } else {
- ret = -ENOMEM;
- aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
- if (!aio)
- goto out;
-
- aio->orig_iocb = iocb;
- kiocb_clone(&aio->iocb, iocb, get_file(file));
- aio->iocb.ki_complete = backing_aio_rw_complete;
- refcount_set(&aio->ref, 2);
- ret = vfs_iocb_iter_read(file, &aio->iocb, iter);
- backing_aio_put(aio);
- if (ret != -EIOCBQUEUED)
- backing_aio_cleanup(aio, ret);
+ ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf);
+ if (end_write)
+ end_write(iocb, ret);
+ return ret;
}
-out:
- revert_creds(old_cred);
- if (ctx->accessed)
- ctx->accessed(iocb->ki_filp);
+ ret = backing_aio_init_wq(iocb);
+ if (ret)
+ return ret;
+
+ aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
+ if (!aio)
+ return -ENOMEM;
+ aio->orig_iocb = iocb;
+ aio->end_write = end_write;
+ kiocb_clone(&aio->iocb, iocb, get_file(file));
+ aio->iocb.ki_flags = flags;
+ aio->iocb.ki_complete = backing_aio_queue_completion;
+ refcount_set(&aio->ref, 2);
+ ret = vfs_iocb_iter_write(file, &aio->iocb, iter);
+ backing_aio_put(aio);
+ if (ret != -EIOCBQUEUED)
+ backing_aio_cleanup(aio, ret);
return ret;
}
-EXPORT_SYMBOL_GPL(backing_file_read_iter);
ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
struct kiocb *iocb, int flags,
struct backing_file_ctx *ctx)
{
- const struct cred *old_cred;
ssize_t ret;
if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
@@ -227,46 +267,8 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
!(file->f_mode & FMODE_CAN_ODIRECT))
return -EINVAL;
- /*
- * Stacked filesystems don't support deferred completions, don't copy
- * this property in case it is set by the issuer.
- */
- flags &= ~IOCB_DIO_CALLER_COMP;
-
- old_cred = override_creds(ctx->cred);
- if (is_sync_kiocb(iocb)) {
- rwf_t rwf = iocb_to_rw_flags(flags);
-
- ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf);
- if (ctx->end_write)
- ctx->end_write(iocb, ret);
- } else {
- struct backing_aio *aio;
-
- ret = backing_aio_init_wq(iocb);
- if (ret)
- goto out;
-
- ret = -ENOMEM;
- aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
- if (!aio)
- goto out;
-
- aio->orig_iocb = iocb;
- aio->end_write = ctx->end_write;
- kiocb_clone(&aio->iocb, iocb, get_file(file));
- aio->iocb.ki_flags = flags;
- aio->iocb.ki_complete = backing_aio_queue_completion;
- refcount_set(&aio->ref, 2);
- ret = vfs_iocb_iter_write(file, &aio->iocb, iter);
- backing_aio_put(aio);
- if (ret != -EIOCBQUEUED)
- backing_aio_cleanup(aio, ret);
- }
-out:
- revert_creds(old_cred);
-
- return ret;
+ scoped_with_creds(ctx->cred)
+ return do_backing_file_write_iter(file, iter, iocb, flags, ctx->end_write);
}
EXPORT_SYMBOL_GPL(backing_file_write_iter);
@@ -275,15 +277,13 @@ ssize_t backing_file_splice_read(struct file *in, struct kiocb *iocb,
unsigned int flags,
struct backing_file_ctx *ctx)
{
- const struct cred *old_cred;
ssize_t ret;
if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING)))
return -EIO;
- old_cred = override_creds(ctx->cred);
- ret = vfs_splice_read(in, &iocb->ki_pos, pipe, len, flags);
- revert_creds(old_cred);
+ scoped_with_creds(ctx->cred)
+ ret = vfs_splice_read(in, &iocb->ki_pos, pipe, len, flags);
if (ctx->accessed)
ctx->accessed(iocb->ki_filp);
@@ -297,7 +297,6 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
size_t len, unsigned int flags,
struct backing_file_ctx *ctx)
{
- const struct cred *old_cred;
ssize_t ret;
if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING)))
@@ -310,11 +309,11 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
if (ret)
return ret;
- old_cred = override_creds(ctx->cred);
- file_start_write(out);
- ret = out->f_op->splice_write(pipe, out, &iocb->ki_pos, len, flags);
- file_end_write(out);
- revert_creds(old_cred);
+ scoped_with_creds(ctx->cred) {
+ file_start_write(out);
+ ret = out->f_op->splice_write(pipe, out, &iocb->ki_pos, len, flags);
+ file_end_write(out);
+ }
if (ctx->end_write)
ctx->end_write(iocb, ret);
@@ -326,21 +325,19 @@ EXPORT_SYMBOL_GPL(backing_file_splice_write);
int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
struct backing_file_ctx *ctx)
{
- const struct cred *old_cred;
struct file *user_file = vma->vm_file;
int ret;
if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
return -EIO;
- if (!file->f_op->mmap)
+ if (!can_mmap_file(file))
return -ENODEV;
vma_set_file(vma, file);
- old_cred = override_creds(ctx->cred);
- ret = call_mmap(vma->vm_file, vma);
- revert_creds(old_cred);
+ scoped_with_creds(ctx->cred)
+ ret = vfs_mmap(vma->vm_file, vma);
if (ctx->accessed)
ctx->accessed(user_file);
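
[Editor's note] Every override_creds()/revert_creds() pair in backing-file.c becomes a scoped_with_creds() block, and the aio setup moves into do_backing_file_{read,write}_iter() helpers so early returns no longer need a shared out: label. A minimal sketch of the guard idiom, assuming scoped_with_creds() overrides on entry and reverts on scope exit like the other guards built on <linux/cleanup.h>; this is not the guard's real definition:

#include <linux/cred.h>
#include <linux/fs.h>

/* Sketch of the credential-scoping idiom adopted above. */
static ssize_t read_with_creds(const struct cred *cred, struct file *file,
			       struct iov_iter *iter, loff_t *pos)
{
	ssize_t ret;

	scoped_with_creds(cred)		/* override_creds(cred) */
		ret = vfs_iter_read(file, iter, pos, 0);
					/* revert_creds() has run by here */
	return ret;
}
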
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
deleted file mode 100644
index c9798750202d..000000000000
--- a/fs/bcachefs/Kconfig
+++ /dev/null
@@ -1,113 +0,0 @@
-
-config BCACHEFS_FS
- tristate "bcachefs filesystem support (EXPERIMENTAL)"
- depends on BLOCK
- select EXPORTFS
- select CLOSURES
- select LIBCRC32C
- select CRC64
- select FS_POSIX_ACL
- select LZ4_COMPRESS
- select LZ4_DECOMPRESS
- select LZ4HC_COMPRESS
- select LZ4HC_DECOMPRESS
- select ZLIB_DEFLATE
- select ZLIB_INFLATE
- select ZSTD_COMPRESS
- select ZSTD_DECOMPRESS
- select CRYPTO
- select CRYPTO_LIB_SHA256
- select CRYPTO_CHACHA20
- select CRYPTO_POLY1305
- select KEYS
- select RAID6_PQ
- select XOR_BLOCKS
- select XXHASH
- select SRCU
- select SYMBOLIC_ERRNAME
- select MIN_HEAP
- help
- The bcachefs filesystem - a modern, copy on write filesystem, with
- support for multiple devices, compression, checksumming, etc.
-
-config BCACHEFS_QUOTA
- bool "bcachefs quota support"
- depends on BCACHEFS_FS
- select QUOTACTL
-
-config BCACHEFS_ERASURE_CODING
- bool "bcachefs erasure coding (RAID5/6) support (EXPERIMENTAL)"
- depends on BCACHEFS_FS
- select QUOTACTL
- help
-	  This enables the "erasure_code" filesystem and inode option, which
-	  organizes data into Reed-Solomon stripes instead of ordinary
- replication.
-
- WARNING: this feature is still undergoing on disk format changes, and
- should only be enabled for testing purposes.
-
-config BCACHEFS_POSIX_ACL
- bool "bcachefs POSIX ACL support"
- depends on BCACHEFS_FS
- select FS_POSIX_ACL
-
-config BCACHEFS_DEBUG
- bool "bcachefs debugging"
- depends on BCACHEFS_FS
- help
- Enables many extra debugging checks and assertions.
-
- The resulting code will be significantly slower than normal; you
- probably shouldn't select this option unless you're a developer.
-
-config BCACHEFS_INJECT_TRANSACTION_RESTARTS
- bool "Randomly inject transaction restarts"
- depends on BCACHEFS_DEBUG
- help
- Randomly inject transaction restarts in a few core paths - may have a
- significant performance penalty
-
-config BCACHEFS_TESTS
- bool "bcachefs unit and performance tests"
- depends on BCACHEFS_FS
- help
- Include some unit and performance tests for the core btree code
-
-config BCACHEFS_LOCK_TIME_STATS
- bool "bcachefs lock time statistics"
- depends on BCACHEFS_FS
- help
- Expose statistics for how long we held a lock in debugfs
-
-config BCACHEFS_NO_LATENCY_ACCT
- bool "disable latency accounting and time stats"
- depends on BCACHEFS_FS
- help
- This disables device latency tracking and time stats, only for performance testing
-
-config BCACHEFS_SIX_OPTIMISTIC_SPIN
- bool "Optimistic spinning for six locks"
- depends on BCACHEFS_FS
- depends on SMP
- default y
- help
- Instead of immediately sleeping when attempting to take a six lock that
- is held by another thread, spin for a short while, as long as the
- thread owning the lock is running.
-
-config BCACHEFS_PATH_TRACEPOINTS
- bool "Extra btree_path tracepoints"
- depends on BCACHEFS_FS && TRACING
- help
- Enable extra tracepoints for debugging btree_path operations; we don't
- normally want these enabled because they happen at very high rates.
-
-config MEAN_AND_VARIANCE_UNIT_TEST
- tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
- depends on KUNIT
- depends on BCACHEFS_FS
- default KUNIT_ALL_TESTS
- help
- This option enables the kunit tests for mean_and_variance module.
- If unsure, say N.
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
deleted file mode 100644
index 9af65079374f..000000000000
--- a/fs/bcachefs/Makefile
+++ /dev/null
@@ -1,103 +0,0 @@
-
-obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o
-
-bcachefs-y := \
- acl.o \
- alloc_background.o \
- alloc_foreground.o \
- backpointers.o \
- bkey.o \
- bkey_methods.o \
- bkey_sort.o \
- bset.o \
- btree_cache.o \
- btree_gc.o \
- btree_io.o \
- btree_iter.o \
- btree_journal_iter.o \
- btree_key_cache.o \
- btree_locking.o \
- btree_node_scan.o \
- btree_trans_commit.o \
- btree_update.o \
- btree_update_interior.o \
- btree_write_buffer.o \
- buckets.o \
- buckets_waiting_for_journal.o \
- chardev.o \
- checksum.o \
- clock.o \
- compress.o \
- darray.o \
- data_update.o \
- debug.o \
- dirent.o \
- disk_accounting.o \
- disk_groups.o \
- ec.o \
- errcode.o \
- error.o \
- extents.o \
- extent_update.o \
- eytzinger.o \
- fs.o \
- fs-ioctl.o \
- fs-io.o \
- fs-io-buffered.o \
- fs-io-direct.o \
- fs-io-pagecache.o \
- fsck.o \
- inode.o \
- io_read.o \
- io_misc.o \
- io_write.o \
- journal.o \
- journal_io.o \
- journal_reclaim.o \
- journal_sb.o \
- journal_seq_blacklist.o \
- keylist.o \
- logged_ops.o \
- lru.o \
- mean_and_variance.o \
- migrate.o \
- move.o \
- movinggc.o \
- namei.o \
- nocow_locking.o \
- opts.o \
- printbuf.o \
- progress.o \
- quota.o \
- rebalance.o \
- rcu_pending.o \
- recovery.o \
- recovery_passes.o \
- reflink.o \
- replicas.o \
- sb-clean.o \
- sb-counters.o \
- sb-downgrade.o \
- sb-errors.o \
- sb-members.o \
- siphash.o \
- six.o \
- snapshot.o \
- str_hash.o \
- subvolume.o \
- super.o \
- super-io.o \
- sysfs.o \
- tests.o \
- time_stats.o \
- thread_with_file.o \
- trace.o \
- two_state_shared_lock.o \
- util.o \
- varint.o \
- xattr.o
-
-obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
-
-# Silence "note: xyz changed in GCC X.X" messages
-subdir-ccflags-y += $(call cc-disable-warning, psabi)
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
deleted file mode 100644
index 99487727ae64..000000000000
--- a/fs/bcachefs/acl.c
+++ /dev/null
@@ -1,445 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-
-#include "acl.h"
-#include "xattr.h"
-
-#include <linux/posix_acl.h>
-
-static const char * const acl_types[] = {
- [ACL_USER_OBJ] = "user_obj",
- [ACL_USER] = "user",
- [ACL_GROUP_OBJ] = "group_obj",
- [ACL_GROUP] = "group",
- [ACL_MASK] = "mask",
- [ACL_OTHER] = "other",
- NULL,
-};
-
-void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size)
-{
- const void *p, *end = value + size;
-
- if (!value ||
- size < sizeof(bch_acl_header) ||
- ((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION))
- return;
-
- p = value + sizeof(bch_acl_header);
- while (p < end) {
- const bch_acl_entry *in = p;
- unsigned tag = le16_to_cpu(in->e_tag);
-
- prt_str(out, acl_types[tag]);
-
- switch (tag) {
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- p += sizeof(bch_acl_entry_short);
- break;
- case ACL_USER:
- prt_printf(out, " uid %u", le32_to_cpu(in->e_id));
- p += sizeof(bch_acl_entry);
- break;
- case ACL_GROUP:
- prt_printf(out, " gid %u", le32_to_cpu(in->e_id));
- p += sizeof(bch_acl_entry);
- break;
- }
-
- prt_printf(out, " %o", le16_to_cpu(in->e_perm));
-
- if (p != end)
- prt_char(out, ' ');
- }
-}
-
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-
-#include "fs.h"
-
-#include <linux/fs.h>
-#include <linux/posix_acl_xattr.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-
-static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
-{
- return sizeof(bch_acl_header) +
- sizeof(bch_acl_entry_short) * nr_short +
- sizeof(bch_acl_entry) * nr_long;
-}
-
-static inline int acl_to_xattr_type(int type)
-{
- switch (type) {
- case ACL_TYPE_ACCESS:
- return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS;
- case ACL_TYPE_DEFAULT:
- return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT;
- default:
- BUG();
- }
-}
-
-/*
- * Convert from filesystem to in-memory representation.
- */
-static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans,
- const void *value, size_t size)
-{
- const void *p, *end = value + size;
- struct posix_acl *acl;
- struct posix_acl_entry *out;
- unsigned count = 0;
- int ret;
-
- if (!value)
- return NULL;
- if (size < sizeof(bch_acl_header))
- goto invalid;
- if (((bch_acl_header *)value)->a_version !=
- cpu_to_le32(BCH_ACL_VERSION))
- goto invalid;
-
- p = value + sizeof(bch_acl_header);
- while (p < end) {
- const bch_acl_entry *entry = p;
-
- if (p + sizeof(bch_acl_entry_short) > end)
- goto invalid;
-
- switch (le16_to_cpu(entry->e_tag)) {
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- p += sizeof(bch_acl_entry_short);
- break;
- case ACL_USER:
- case ACL_GROUP:
- p += sizeof(bch_acl_entry);
- break;
- default:
- goto invalid;
- }
-
- count++;
- }
-
- if (p > end)
- goto invalid;
-
- if (!count)
- return NULL;
-
- acl = allocate_dropping_locks(trans, ret,
- posix_acl_alloc(count, _gfp));
- if (!acl)
- return ERR_PTR(-ENOMEM);
- if (ret) {
- kfree(acl);
- return ERR_PTR(ret);
- }
-
- out = acl->a_entries;
-
- p = value + sizeof(bch_acl_header);
- while (p < end) {
- const bch_acl_entry *in = p;
-
- out->e_tag = le16_to_cpu(in->e_tag);
- out->e_perm = le16_to_cpu(in->e_perm);
-
- switch (out->e_tag) {
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- p += sizeof(bch_acl_entry_short);
- break;
- case ACL_USER:
- out->e_uid = make_kuid(&init_user_ns,
- le32_to_cpu(in->e_id));
- p += sizeof(bch_acl_entry);
- break;
- case ACL_GROUP:
- out->e_gid = make_kgid(&init_user_ns,
- le32_to_cpu(in->e_id));
- p += sizeof(bch_acl_entry);
- break;
- }
-
- out++;
- }
-
- BUG_ON(out != acl->a_entries + acl->a_count);
-
- return acl;
-invalid:
- pr_err("invalid acl entry");
- return ERR_PTR(-EINVAL);
-}
-
-/*
- * Convert from in-memory to filesystem representation.
- */
-static struct bkey_i_xattr *
-bch2_acl_to_xattr(struct btree_trans *trans,
- const struct posix_acl *acl,
- int type)
-{
- struct bkey_i_xattr *xattr;
- bch_acl_header *acl_header;
- const struct posix_acl_entry *acl_e, *pe;
- void *outptr;
- unsigned nr_short = 0, nr_long = 0, acl_len, u64s;
-
- FOREACH_ACL_ENTRY(acl_e, acl, pe) {
- switch (acl_e->e_tag) {
- case ACL_USER:
- case ACL_GROUP:
- nr_long++;
- break;
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- nr_short++;
- break;
- default:
- return ERR_PTR(-EINVAL);
- }
- }
-
- acl_len = bch2_acl_size(nr_short, nr_long);
- u64s = BKEY_U64s + xattr_val_u64s(0, acl_len);
-
- if (u64s > U8_MAX)
- return ERR_PTR(-E2BIG);
-
- xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
- if (IS_ERR(xattr))
- return xattr;
-
- bkey_xattr_init(&xattr->k_i);
- xattr->k.u64s = u64s;
- xattr->v.x_type = acl_to_xattr_type(type);
- xattr->v.x_name_len = 0;
- xattr->v.x_val_len = cpu_to_le16(acl_len);
-
- acl_header = xattr_val(&xattr->v);
- acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION);
-
- outptr = (void *) acl_header + sizeof(*acl_header);
-
- FOREACH_ACL_ENTRY(acl_e, acl, pe) {
- bch_acl_entry *entry = outptr;
-
- entry->e_tag = cpu_to_le16(acl_e->e_tag);
- entry->e_perm = cpu_to_le16(acl_e->e_perm);
- switch (acl_e->e_tag) {
- case ACL_USER:
- entry->e_id = cpu_to_le32(
- from_kuid(&init_user_ns, acl_e->e_uid));
- outptr += sizeof(bch_acl_entry);
- break;
- case ACL_GROUP:
- entry->e_id = cpu_to_le32(
- from_kgid(&init_user_ns, acl_e->e_gid));
- outptr += sizeof(bch_acl_entry);
- break;
-
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- outptr += sizeof(bch_acl_entry_short);
- break;
- }
- }
-
- BUG_ON(outptr != xattr_val(&xattr->v) + acl_len);
-
- return xattr;
-}
-
-struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu)
-{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
- struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
- struct btree_iter iter = { NULL };
- struct posix_acl *acl = NULL;
-
- if (rcu)
- return ERR_PTR(-ECHILD);
-
- struct btree_trans *trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
- &hash, inode_inum(inode), &search, 0);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
- acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
- le16_to_cpu(xattr.v->x_val_len));
- ret = PTR_ERR_OR_ZERO(acl);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- if (ret)
- acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL;
-
- if (!IS_ERR_OR_NULL(acl))
- set_cached_acl(&inode->v, type, acl);
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return acl;
-}
-
-int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
- struct bch_inode_unpacked *inode_u,
- struct posix_acl *acl, int type)
-{
- struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u);
- int ret;
-
- if (type == ACL_TYPE_DEFAULT &&
- !S_ISDIR(inode_u->bi_mode))
- return acl ? -EACCES : 0;
-
- if (acl) {
- struct bkey_i_xattr *xattr =
- bch2_acl_to_xattr(trans, acl, type);
- if (IS_ERR(xattr))
- return PTR_ERR(xattr);
-
- ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info,
- inum, &xattr->k_i, 0);
- } else {
- struct xattr_search_key search =
- X_SEARCH(acl_to_xattr_type(type), "", 0);
-
- ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info,
- inum, &search);
- }
-
- return bch2_err_matches(ret, ENOENT) ? 0 : ret;
-}
-
-int bch2_set_acl(struct mnt_idmap *idmap,
- struct dentry *dentry,
- struct posix_acl *_acl, int type)
-{
- struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_iter inode_iter = { NULL };
- struct bch_inode_unpacked inode_u;
- struct posix_acl *acl;
- umode_t mode;
- int ret;
-
- mutex_lock(&inode->ei_update_lock);
- struct btree_trans *trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
- acl = _acl;
-
- ret = bch2_subvol_is_ro_trans(trans, inode->ei_inum.subvol) ?:
- bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
- BTREE_ITER_intent);
- if (ret)
- goto btree_err;
-
- mode = inode_u.bi_mode;
-
- if (type == ACL_TYPE_ACCESS) {
- ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl);
- if (ret)
- goto btree_err;
- }
-
- ret = bch2_set_acl_trans(trans, inode_inum(inode), &inode_u, acl, type);
- if (ret)
- goto btree_err;
-
- inode_u.bi_ctime = bch2_current_time(c);
- inode_u.bi_mode = mode;
-
- ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
- bch2_trans_commit(trans, NULL, NULL, 0);
-btree_err:
- bch2_trans_iter_exit(trans, &inode_iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- if (unlikely(ret))
- goto err;
-
- bch2_inode_update_after_write(trans, inode, &inode_u,
- ATTR_CTIME|ATTR_MODE);
-
- set_cached_acl(&inode->v, type, acl);
-err:
- bch2_trans_put(trans);
- mutex_unlock(&inode->ei_update_lock);
-
- return ret;
-}
-
-int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
- struct bch_inode_unpacked *inode,
- umode_t mode,
- struct posix_acl **new_acl)
-{
- struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode);
- struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0);
- struct btree_iter iter;
- struct posix_acl *acl = NULL;
-
- struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
- &hash_info, inum, &search, BTREE_ITER_intent);
- int ret = bkey_err(k);
- if (ret)
- return bch2_err_matches(ret, ENOENT) ? 0 : ret;
-
- struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
-
- acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
- le16_to_cpu(xattr.v->x_val_len));
- ret = PTR_ERR_OR_ZERO(acl);
- if (ret)
- goto err;
-
- ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode));
- if (ret)
- goto err;
-
- struct bkey_i_xattr *new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto err;
-
- new->k.p = iter.pos;
- ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
- *new_acl = acl;
- acl = NULL;
-err:
- bch2_trans_iter_exit(trans, &iter);
- if (!IS_ERR_OR_NULL(acl))
- kfree(acl);
- return ret;
-}
-
-#endif /* CONFIG_BCACHEFS_POSIX_ACL */
diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h
deleted file mode 100644
index fe730a6bf0c1..000000000000
--- a/fs/bcachefs/acl.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ACL_H
-#define _BCACHEFS_ACL_H
-
-struct bch_inode_unpacked;
-struct bch_hash_info;
-struct bch_inode_info;
-struct posix_acl;
-
-#define BCH_ACL_VERSION 0x0001
-
-typedef struct {
- __le16 e_tag;
- __le16 e_perm;
- __le32 e_id;
-} bch_acl_entry;
-
-typedef struct {
- __le16 e_tag;
- __le16 e_perm;
-} bch_acl_entry_short;
-
-typedef struct {
- __le32 a_version;
-} bch_acl_header;
-
-void bch2_acl_to_text(struct printbuf *, const void *, size_t);
-
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-
-struct posix_acl *bch2_get_acl(struct inode *, int, bool);
-
-int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *,
- struct posix_acl *, int);
-int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int);
-int bch2_acl_chmod(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *,
- umode_t, struct posix_acl **);
-
-#else
-
-static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
- struct bch_inode_unpacked *inode_u,
- struct posix_acl *acl, int type)
-{
- return 0;
-}
-
-static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
- struct bch_inode_unpacked *inode,
- umode_t mode,
- struct posix_acl **new_acl)
-{
- return 0;
-}
-
-#endif /* CONFIG_BCACHEFS_POSIX_ACL */
-
-#endif /* _BCACHEFS_ACL_H */
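
[Editor's note] The on-disk ACL format deleted above is a 4-byte bch_acl_header followed by 4-byte short entries (USER_OBJ/GROUP_OBJ/MASK/OTHER) and 8-byte long entries (named USER/GROUP, which carry an e_id). A worked check of the size arithmetic from the removed bch2_acl_size(), assuming those packed little-endian layouts:

#include <stdio.h>

/* Sizes assumed from the deleted acl.h layouts. */
#define ACL_HEADER_SZ	4	/* __le32 a_version */
#define ACL_SHORT_SZ	4	/* __le16 e_tag + __le16 e_perm */
#define ACL_LONG_SZ	8	/* short fields plus __le32 e_id */

static unsigned long acl_size(unsigned nr_short, unsigned nr_long)
{
	return ACL_HEADER_SZ + ACL_SHORT_SZ * nr_short + ACL_LONG_SZ * nr_long;
}

int main(void)
{
	/* "user::rw-,group::r--,mask::rw-,other::r--,user:1000:rw-":
	 * four short entries plus one named user = 4 + 16 + 8 = 28 bytes
	 */
	printf("%lu\n", acl_size(4, 1));
	return 0;
}
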
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
deleted file mode 100644
index 5fb396be9127..000000000000
--- a/fs/bcachefs/alloc_background.c
+++ /dev/null
@@ -1,2656 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_gc.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "buckets_waiting_for_journal.h"
-#include "clock.h"
-#include "debug.h"
-#include "disk_accounting.h"
-#include "ec.h"
-#include "error.h"
-#include "lru.h"
-#include "recovery.h"
-#include "trace.h"
-#include "varint.h"
-
-#include <linux/kthread.h>
-#include <linux/math64.h>
-#include <linux/random.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-#include <linux/sched/task.h>
-#include <linux/sort.h>
-#include <linux/jiffies.h>
-
-static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
-
-/* Persistent alloc info: */
-
-static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
-#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
- BCH_ALLOC_FIELDS_V1()
-#undef x
-};
-
-struct bkey_alloc_unpacked {
- u64 journal_seq;
- u8 gen;
- u8 oldest_gen;
- u8 data_type;
- bool need_discard:1;
- bool need_inc_gen:1;
-#define x(_name, _bits) u##_bits _name;
- BCH_ALLOC_FIELDS_V2()
-#undef x
-};
-
-static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
- const void **p, unsigned field)
-{
- unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
- u64 v;
-
- if (!(a->fields & (1 << field)))
- return 0;
-
- switch (bytes) {
- case 1:
- v = *((const u8 *) *p);
- break;
- case 2:
- v = le16_to_cpup(*p);
- break;
- case 4:
- v = le32_to_cpup(*p);
- break;
- case 8:
- v = le64_to_cpup(*p);
- break;
- default:
- BUG();
- }
-
- *p += bytes;
- return v;
-}
-
-static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
- struct bkey_s_c k)
-{
- const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
- const void *d = in->data;
- unsigned idx = 0;
-
- out->gen = in->gen;
-
-#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
- BCH_ALLOC_FIELDS_V1()
-#undef x
-}
-
-static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
- struct bkey_s_c k)
-{
- struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
- const u8 *in = a.v->data;
- const u8 *end = bkey_val_end(a);
- unsigned fieldnr = 0;
- int ret;
- u64 v;
-
- out->gen = a.v->gen;
- out->oldest_gen = a.v->oldest_gen;
- out->data_type = a.v->data_type;
-
-#define x(_name, _bits) \
- if (fieldnr < a.v->nr_fields) { \
- ret = bch2_varint_decode_fast(in, end, &v); \
- if (ret < 0) \
- return ret; \
- in += ret; \
- } else { \
- v = 0; \
- } \
- out->_name = v; \
- if (v != out->_name) \
- return -1; \
- fieldnr++;
-
- BCH_ALLOC_FIELDS_V2()
-#undef x
- return 0;
-}
-
-static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
- struct bkey_s_c k)
-{
- struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
- const u8 *in = a.v->data;
- const u8 *end = bkey_val_end(a);
- unsigned fieldnr = 0;
- int ret;
- u64 v;
-
- out->gen = a.v->gen;
- out->oldest_gen = a.v->oldest_gen;
- out->data_type = a.v->data_type;
- out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
- out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
- out->journal_seq = le64_to_cpu(a.v->journal_seq);
-
-#define x(_name, _bits) \
- if (fieldnr < a.v->nr_fields) { \
- ret = bch2_varint_decode_fast(in, end, &v); \
- if (ret < 0) \
- return ret; \
- in += ret; \
- } else { \
- v = 0; \
- } \
- out->_name = v; \
- if (v != out->_name) \
- return -1; \
- fieldnr++;
-
- BCH_ALLOC_FIELDS_V2()
-#undef x
- return 0;
-}
-
-static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
-{
- struct bkey_alloc_unpacked ret = { .gen = 0 };
-
- switch (k.k->type) {
- case KEY_TYPE_alloc:
- bch2_alloc_unpack_v1(&ret, k);
- break;
- case KEY_TYPE_alloc_v2:
- bch2_alloc_unpack_v2(&ret, k);
- break;
- case KEY_TYPE_alloc_v3:
- bch2_alloc_unpack_v3(&ret, k);
- break;
- }
-
- return ret;
-}
-
-static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
-{
- unsigned i, bytes = offsetof(struct bch_alloc, data);
-
- for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
- if (a->fields & (1 << i))
- bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
-
- return DIV_ROUND_UP(bytes, sizeof(u64));
-}
-
-int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
- int ret = 0;
-
- /* allow for unknown fields */
- bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v),
- c, alloc_v1_val_size_bad,
- "incorrect value size (%zu < %u)",
- bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
-fsck_err:
- return ret;
-}
-
-int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_alloc_unpacked u;
- int ret = 0;
-
- bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k),
- c, alloc_v2_unpack_error,
- "unpack error");
-fsck_err:
- return ret;
-}
-
-int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_alloc_unpacked u;
- int ret = 0;
-
- bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k),
- c, alloc_v3_unpack_error,
- "unpack error");
-fsck_err:
- return ret;
-}
-
-int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bch_alloc_v4 a;
- int ret = 0;
-
- bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k));
-
- bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k),
- c, alloc_v4_val_size_bad,
- "bad val size (%u > %zu)",
- alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k));
-
- bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) &&
- BCH_ALLOC_V4_NR_BACKPOINTERS(&a),
- c, alloc_v4_backpointers_start_bad,
- "invalid backpointers_start");
-
- bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type,
- c, alloc_key_data_type_bad,
- "invalid data type (got %u should be %u)",
- a.data_type, alloc_data_type(a, a.data_type));
-
- for (unsigned i = 0; i < 2; i++)
- bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX,
- c, alloc_key_io_time_bad,
- "invalid io_time[%s]: %llu, max %llu",
- i == READ ? "read" : "write",
- a.io_time[i], LRU_TIME_MAX);
-
- unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) >
- offsetof(struct bch_alloc_v4, stripe_sectors)
- ? a.stripe_sectors
- : 0;
-
- switch (a.data_type) {
- case BCH_DATA_free:
- case BCH_DATA_need_gc_gens:
- case BCH_DATA_need_discard:
- bkey_fsck_err_on(stripe_sectors ||
- a.dirty_sectors ||
- a.cached_sectors ||
- a.stripe,
- c, alloc_key_empty_but_have_data,
- "empty data type free but have data %u.%u.%u %u",
- stripe_sectors,
- a.dirty_sectors,
- a.cached_sectors,
- a.stripe);
- break;
- case BCH_DATA_sb:
- case BCH_DATA_journal:
- case BCH_DATA_btree:
- case BCH_DATA_user:
- case BCH_DATA_parity:
- bkey_fsck_err_on(!a.dirty_sectors &&
- !stripe_sectors,
- c, alloc_key_dirty_sectors_0,
- "data_type %s but dirty_sectors==0",
- bch2_data_type_str(a.data_type));
- break;
- case BCH_DATA_cached:
- bkey_fsck_err_on(!a.cached_sectors ||
- a.dirty_sectors ||
- stripe_sectors ||
- a.stripe,
- c, alloc_key_cached_inconsistency,
- "data type inconsistency");
-
- bkey_fsck_err_on(!a.io_time[READ] &&
- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
- c, alloc_key_cached_but_read_time_zero,
- "cached bucket with read_time == 0");
- break;
- case BCH_DATA_stripe:
- break;
- }
-fsck_err:
- return ret;
-}
-
-void bch2_alloc_v4_swab(struct bkey_s k)
-{
- struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
-
- a->journal_seq_nonempty = swab64(a->journal_seq_nonempty);
- a->journal_seq_empty = swab64(a->journal_seq_empty);
- a->flags = swab32(a->flags);
- a->dirty_sectors = swab32(a->dirty_sectors);
- a->cached_sectors = swab32(a->cached_sectors);
- a->io_time[0] = swab64(a->io_time[0]);
- a->io_time[1] = swab64(a->io_time[1]);
- a->stripe = swab32(a->stripe);
- a->nr_external_backpointers = swab32(a->nr_external_backpointers);
- a->stripe_sectors = swab32(a->stripe_sectors);
-}
-
-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bch_alloc_v4 _a;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
- struct bch_dev *ca = c ? bch2_dev_bucket_tryget_noerror(c, k.k->p) : NULL;
-
- prt_newline(out);
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
- bch2_prt_data_type(out, a->data_type);
- prt_newline(out);
- prt_printf(out, "journal_seq_nonempty %llu\n", a->journal_seq_nonempty);
- prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty);
- prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a));
- prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a));
- prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors);
- prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors);
- prt_printf(out, "cached_sectors %u\n", a->cached_sectors);
- prt_printf(out, "stripe %u\n", a->stripe);
- prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
- prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]);
- prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]);
-
- if (ca)
- prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca));
- prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a));
- printbuf_indent_sub(out, 2);
-
- bch2_dev_put(ca);
-}
-
-void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
-{
- if (k.k->type == KEY_TYPE_alloc_v4) {
- void *src, *dst;
-
- *out = *bkey_s_c_to_alloc_v4(k).v;
-
- src = alloc_v4_backpointers(out);
- SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
- dst = alloc_v4_backpointers(out);
-
- if (src < dst)
- memset(src, 0, dst - src);
-
- SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
- } else {
- struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
-
- *out = (struct bch_alloc_v4) {
- .journal_seq_nonempty = u.journal_seq,
- .flags = u.need_discard,
- .gen = u.gen,
- .oldest_gen = u.oldest_gen,
- .data_type = u.data_type,
- .stripe_redundancy = u.stripe_redundancy,
- .dirty_sectors = u.dirty_sectors,
- .cached_sectors = u.cached_sectors,
- .io_time[READ] = u.read_time,
- .io_time[WRITE] = u.write_time,
- .stripe = u.stripe,
- };
-
- SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
- }
-}
-
-static noinline struct bkey_i_alloc_v4 *
-__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
-{
- struct bkey_i_alloc_v4 *ret;
-
- ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4)));
- if (IS_ERR(ret))
- return ret;
-
- if (k.k->type == KEY_TYPE_alloc_v4) {
- void *src, *dst;
-
- bkey_reassemble(&ret->k_i, k);
-
- src = alloc_v4_backpointers(&ret->v);
- SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
- dst = alloc_v4_backpointers(&ret->v);
-
- if (src < dst)
- memset(src, 0, dst - src);
-
- SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
- set_alloc_v4_u64s(ret);
- } else {
- bkey_alloc_v4_init(&ret->k_i);
- ret->k.p = k.k->p;
- bch2_alloc_to_v4(k, &ret->v);
- }
- return ret;
-}
-
-static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
-{
- struct bkey_s_c_alloc_v4 a;
-
- if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
- ((a = bkey_s_c_to_alloc_v4(k), true) &&
- BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0))
- return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4);
-
- return __bch2_alloc_to_v4_mut(trans, k);
-}
-
-struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
-{
- return bch2_alloc_to_v4_mut_inlined(trans, k);
-}
-
-struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos pos)
-{
- struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
- BTREE_ITER_with_updates|
- BTREE_ITER_cached|
- BTREE_ITER_intent);
- int ret = bkey_err(k);
- if (unlikely(ret))
- return ERR_PTR(ret);
-
- struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
- ret = PTR_ERR_OR_ZERO(a);
- if (unlikely(ret))
- goto err;
- return a;
-err:
- bch2_trans_iter_exit(trans, iter);
- return ERR_PTR(ret);
-}
-
-__flatten
-struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos,
- enum btree_iter_update_trigger_flags flags)
-{
- struct btree_iter iter;
- struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos);
- int ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- return ERR_PTR(ret);
-
- ret = bch2_trans_update(trans, &iter, &a->k_i, flags);
- bch2_trans_iter_exit(trans, &iter);
- return unlikely(ret) ? ERR_PTR(ret) : a;
-}
-
-static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
-{
- *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;
-
- pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS;
- return pos;
-}
-
-static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset)
-{
- pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS;
- pos.offset += offset;
- return pos;
-}
-
-static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
-{
- return k.k->type == KEY_TYPE_bucket_gens
- ? bkey_s_c_to_bucket_gens(k).v->gens[offset]
- : 0;
-}
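
[Editor's note] alloc_gens_pos() and bucket_gens_pos_to_alloc() above split a bucket number into a bucket_gens key position (high bits) and an index into that key's gens[] array (low bits). A round-trip check of the packing, assuming KEY_TYPE_BUCKET_GENS_BITS is 8, i.e. 256 generations per key; that constant lives in the (also removed) format headers, not in this file:

#include <assert.h>
#include <stdint.h>

/* Assumption: 8 low bits index gens[], per KEY_TYPE_BUCKET_GENS_BITS. */
#define GENS_BITS 8
#define GENS_MASK ((1u << GENS_BITS) - 1)

int main(void)
{
	uint64_t bucket = 0x12345;

	unsigned offset  = bucket & GENS_MASK;	/* alloc_gens_pos() */
	uint64_t key_off = bucket >> GENS_BITS;

	/* bucket_gens_pos_to_alloc() reverses the split */
	assert((key_off << GENS_BITS) + offset == bucket);
	return 0;
}
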
-
-int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens),
- c, bucket_gens_val_size_bad,
- "bad val size (%zu != %zu)",
- bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
-fsck_err:
- return ret;
-}
-
-void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k);
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) {
- if (i)
- prt_char(out, ' ');
- prt_printf(out, "%u", g.v->gens[i]);
- }
-}
-
-int bch2_bucket_gens_init(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bkey_i_bucket_gens g;
- bool have_bucket_gens_key = false;
- int ret;
-
- ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_prefetch, k, ({
- /*
- * Not a fsck error because this is checked/repaired by
- * bch2_check_alloc_key() which runs later:
- */
- if (!bch2_dev_bucket_exists(c, k.k->p))
- continue;
-
- struct bch_alloc_v4 a;
- u8 gen = bch2_alloc_to_v4(k, &a)->gen;
- unsigned offset;
- struct bpos pos = alloc_gens_pos(iter.pos, &offset);
- int ret2 = 0;
-
- if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) {
- ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- if (ret2)
- goto iter_err;
- have_bucket_gens_key = false;
- }
-
- if (!have_bucket_gens_key) {
- bkey_bucket_gens_init(&g.k_i);
- g.k.p = pos;
- have_bucket_gens_key = true;
- }
-
- g.v.gens[offset] = gen;
-iter_err:
- ret2;
- }));
-
- if (have_bucket_gens_key && !ret)
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
-
- bch2_trans_put(trans);
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_alloc_read(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bch_dev *ca = NULL;
- int ret;
-
- if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
- ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
- BTREE_ITER_prefetch, k, ({
- u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
- u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
-
- if (k.k->type != KEY_TYPE_bucket_gens)
- continue;
-
- ca = bch2_dev_iterate(c, ca, k.k->p.inode);
- /*
- * Not a fsck error because this is checked/repaired by
- * bch2_check_alloc_key() which runs later:
- */
- if (!ca) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
- continue;
- }
-
- const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;
-
- for (u64 b = max_t(u64, ca->mi.first_bucket, start);
- b < min_t(u64, ca->mi.nbuckets, end);
- b++)
- *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
- 0;
- }));
- } else {
- ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_prefetch, k, ({
- ca = bch2_dev_iterate(c, ca, k.k->p.inode);
- /*
- * Not a fsck error because this is checked/repaired by
- * bch2_check_alloc_key() which runs later:
- */
- if (!ca) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
- continue;
- }
-
- if (k.k->p.offset < ca->mi.first_bucket) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket));
- continue;
- }
-
- if (k.k->p.offset >= ca->mi.nbuckets) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
- continue;
- }
-
- struct bch_alloc_v4 a;
- *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
- 0;
- }));
- }
-
- bch2_dev_put(ca);
- bch2_trans_put(trans);
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-/* Free space/discard btree: */
-
-static int __need_discard_or_freespace_err(struct btree_trans *trans,
- struct bkey_s_c alloc_k,
- bool set, bool discard, bool repair)
-{
- struct bch_fs *c = trans->c;
- enum bch_fsck_flags flags = FSCK_CAN_IGNORE|(repair ? FSCK_CAN_FIX : 0);
- enum bch_sb_error_id err_id = discard
- ? BCH_FSCK_ERR_need_discard_key_wrong
- : BCH_FSCK_ERR_freespace_key_wrong;
- enum btree_id btree = discard ? BTREE_ID_need_discard : BTREE_ID_freespace;
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, alloc_k);
-
- int ret = __bch2_fsck_err(NULL, trans, flags, err_id,
- "bucket incorrectly %sset in %s btree\n"
- " %s",
- set ? "" : "un",
- bch2_btree_id_str(btree),
- buf.buf);
- if (ret == -BCH_ERR_fsck_ignore ||
- ret == -BCH_ERR_fsck_errors_not_fixed)
- ret = 0;
-
- printbuf_exit(&buf);
- return ret;
-}
-
-#define need_discard_or_freespace_err(...) \
- fsck_err_wrap(__need_discard_or_freespace_err(__VA_ARGS__))
-
-#define need_discard_or_freespace_err_on(cond, ...) \
- (unlikely(cond) ? need_discard_or_freespace_err(__VA_ARGS__) : false)
-
-static int bch2_bucket_do_index(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bkey_s_c alloc_k,
- const struct bch_alloc_v4 *a,
- bool set)
-{
- enum btree_id btree;
- struct bpos pos;
-
- if (a->data_type != BCH_DATA_free &&
- a->data_type != BCH_DATA_need_discard)
- return 0;
-
- switch (a->data_type) {
- case BCH_DATA_free:
- btree = BTREE_ID_freespace;
- pos = alloc_freespace_pos(alloc_k.k->p, *a);
- break;
- case BCH_DATA_need_discard:
- btree = BTREE_ID_need_discard;
- pos = alloc_k.k->p;
- break;
- default:
- return 0;
- }
-
- struct btree_iter iter;
- struct bkey_s_c old = bch2_bkey_get_iter(trans, &iter, btree, pos, BTREE_ITER_intent);
- int ret = bkey_err(old);
- if (ret)
- return ret;
-
- need_discard_or_freespace_err_on(ca->mi.freespace_initialized &&
- !old.k->type != set,
- trans, alloc_k, set,
- btree == BTREE_ID_need_discard, false);
-
- ret = bch2_btree_bit_mod_iter(trans, &iter, set);
-fsck_err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
- struct bpos bucket, u8 gen)
-{
- struct btree_iter iter;
- unsigned offset;
- struct bpos pos = alloc_gens_pos(bucket, &offset);
- struct bkey_i_bucket_gens *g;
- struct bkey_s_c k;
- int ret;
-
- g = bch2_trans_kmalloc(trans, sizeof(*g));
- ret = PTR_ERR_OR_ZERO(g);
- if (ret)
- return ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
- BTREE_ITER_intent|
- BTREE_ITER_with_updates);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (k.k->type != KEY_TYPE_bucket_gens) {
- bkey_bucket_gens_init(&g->k_i);
- g->k.p = iter.pos;
- } else {
- bkey_reassemble(&g->k_i, k);
- }
-
- g->v.gens[offset] = gen;
-
- ret = bch2_trans_update(trans, &iter, &g->k_i, 0);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca,
- enum bch_data_type data_type,
- s64 delta_buckets,
- s64 delta_sectors,
- s64 delta_fragmented, unsigned flags)
-{
- s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };
-
- return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
- d, dev_data_type,
- .dev = ca->dev_idx,
- .data_type = data_type);
-}
-
-int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
- const struct bch_alloc_v4 *old,
- const struct bch_alloc_v4 *new,
- unsigned flags)
-{
- s64 old_sectors = bch2_bucket_sectors(*old);
- s64 new_sectors = bch2_bucket_sectors(*new);
- if (old->data_type != new->data_type) {
- int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
- 1, new_sectors, bch2_bucket_sectors_fragmented(ca, *new), flags) ?:
- bch2_dev_data_type_accounting_mod(trans, ca, old->data_type,
- -1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags);
- if (ret)
- return ret;
- } else if (old_sectors != new_sectors) {
- int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
- 0,
- new_sectors - old_sectors,
- bch2_bucket_sectors_fragmented(ca, *new) -
- bch2_bucket_sectors_fragmented(ca, *old), flags);
- if (ret)
- return ret;
- }
-
- s64 old_unstriped = bch2_bucket_sectors_unstriped(*old);
- s64 new_unstriped = bch2_bucket_sectors_unstriped(*new);
- if (old_unstriped != new_unstriped) {
- int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped,
- !!new_unstriped - !!old_unstriped,
- new_unstriped - old_unstriped,
- 0,
- flags);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-int bch2_trigger_alloc(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
- if (!ca)
- return -BCH_ERR_trigger_alloc;
-
- struct bch_alloc_v4 old_a_convert;
- const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
-
- struct bch_alloc_v4 *new_a;
- if (likely(new.k->type == KEY_TYPE_alloc_v4)) {
- new_a = bkey_s_to_alloc_v4(new).v;
- } else {
- BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair)));
-
- struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c);
- ret = PTR_ERR_OR_ZERO(new_ka);
- if (unlikely(ret))
- goto err;
- new_a = &new_ka->v;
- }
-
- if (flags & BTREE_TRIGGER_transactional) {
- alloc_data_type_set(new_a, new_a->data_type);
-
- int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
- (int) data_type_is_empty(old_a->data_type);
-
- if (is_empty_delta < 0) {
- new_a->io_time[READ] = bch2_current_io_time(c, READ);
- new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE);
- SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
- SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
- }
-
- if (data_type_is_empty(new_a->data_type) &&
- BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
- !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
- if (new_a->oldest_gen == new_a->gen &&
- !bch2_bucket_sectors_total(*new_a))
- new_a->oldest_gen++;
- new_a->gen++;
- SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
- alloc_data_type_set(new_a, new_a->data_type);
- }
-
- if (old_a->data_type != new_a->data_type ||
- (new_a->data_type == BCH_DATA_free &&
- alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
- ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?:
- bch2_bucket_do_index(trans, ca, new.s_c, new_a, true);
- if (ret)
- goto err;
- }
-
- if (new_a->data_type == BCH_DATA_cached &&
- !new_a->io_time[READ])
- new_a->io_time[READ] = bch2_current_io_time(c, READ);
-
- ret = bch2_lru_change(trans, new.k->p.inode,
- bucket_to_u64(new.k->p),
- alloc_lru_idx_read(*old_a),
- alloc_lru_idx_read(*new_a));
- if (ret)
- goto err;
-
- ret = bch2_lru_change(trans,
- BCH_LRU_BUCKET_FRAGMENTATION,
- bucket_to_u64(new.k->p),
- alloc_lru_idx_fragmentation(*old_a, ca),
- alloc_lru_idx_fragmentation(*new_a, ca));
- if (ret)
- goto err;
-
- if (old_a->gen != new_a->gen) {
- ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
- if (ret)
- goto err;
- }
-
- if ((flags & BTREE_TRIGGER_bucket_invalidate) &&
- old_a->cached_sectors) {
- ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx,
- -((s64) old_a->cached_sectors),
- flags & BTREE_TRIGGER_gc);
- if (ret)
- goto err;
- }
-
- ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags);
- if (ret)
- goto err;
- }
-
- if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
- u64 transaction_seq = trans->journal_res.seq;
- BUG_ON(!transaction_seq);
-
- if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq,
- trans, alloc_key_journal_seq_in_future,
- "bucket journal seq in future (currently at %llu)\n%s",
- journal_cur_seq(&c->journal),
- (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)))
- new_a->journal_seq_nonempty = transaction_seq;
-
- int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
- (int) data_type_is_empty(old_a->data_type);
-
- /*
- * Record journal sequence number of empty -> nonempty transition:
- * Note that there may be multiple empty -> nonempty
- * transitions, data in a bucket may be overwritten while we're
- * still writing to it - so be careful to only record the first:
-		 */
- if (is_empty_delta < 0 &&
- new_a->journal_seq_empty <= c->journal.flushed_seq_ondisk) {
- new_a->journal_seq_nonempty = transaction_seq;
- new_a->journal_seq_empty = 0;
- }
-
- /*
- * Bucket becomes empty: mark it as waiting for a journal flush,
- * unless updates since empty -> nonempty transition were never
- * flushed - we may need to ask the journal not to flush
- * intermediate sequence numbers:
- */
- if (is_empty_delta > 0) {
- if (new_a->journal_seq_nonempty == transaction_seq ||
- bch2_journal_noflush_seq(&c->journal,
- new_a->journal_seq_nonempty,
- transaction_seq)) {
- new_a->journal_seq_nonempty = new_a->journal_seq_empty = 0;
- } else {
- new_a->journal_seq_empty = transaction_seq;
-
- ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
- c->journal.flushed_seq_ondisk,
- new.k->p.inode, new.k->p.offset,
- transaction_seq);
- if (bch2_fs_fatal_err_on(ret, c,
- "setting bucket_needs_journal_commit: %s",
- bch2_err_str(ret)))
- goto err;
- }
- }
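-
-		/*
-		 * Illustrative timeline of the above (editor's note, not in
-		 * the original source): a bucket first written at journal seq
-		 * 100 records journal_seq_nonempty = 100. If it empties again
-		 * at seq 120 and seqs 100..120 can all be marked noflush, no
-		 * flush is needed and both fields are cleared; otherwise
-		 * journal_seq_empty = 120 is recorded and reuse waits until
-		 * seq 120 has been flushed.
-		 */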
-
- if (new_a->gen != old_a->gen) {
- rcu_read_lock();
- u8 *gen = bucket_gen(ca, new.k->p.offset);
- if (unlikely(!gen)) {
- rcu_read_unlock();
- goto invalid_bucket;
- }
- *gen = new_a->gen;
- rcu_read_unlock();
- }
-
-#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; })
-#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr)
-#define bucket_flushed(a) (a->journal_seq_empty <= c->journal.flushed_seq_ondisk)
-
- if (statechange(a->data_type == BCH_DATA_free) &&
- bucket_flushed(new_a))
- closure_wake_up(&c->freelist_wait);
-
- if (statechange(a->data_type == BCH_DATA_need_discard) &&
- !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) &&
- bucket_flushed(new_a))
- bch2_discard_one_bucket_fast(ca, new.k->p.offset);
-
- if (statechange(a->data_type == BCH_DATA_cached) &&
- !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
- should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
- bch2_dev_do_invalidates(ca);
-
- if (statechange(a->data_type == BCH_DATA_need_gc_gens))
- bch2_gc_gens_async(c);
- }
-
- if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) {
- rcu_read_lock();
- struct bucket *g = gc_bucket(ca, new.k->p.offset);
- if (unlikely(!g)) {
- rcu_read_unlock();
- goto invalid_bucket;
- }
- g->gen_valid = 1;
- g->gen = new_a->gen;
- rcu_read_unlock();
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- bch2_dev_put(ca);
- return ret;
-invalid_bucket:
- bch2_fs_inconsistent(c, "reference to invalid bucket\n %s",
- (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
- ret = -BCH_ERR_trigger_alloc;
- goto err;
-}
-
-/*
- * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
- * extents style btrees, but works on non-extents btrees:
- */
-static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
-{
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
-
- if (bkey_err(k))
- return k;
-
- if (k.k->type) {
- return k;
- } else {
- struct btree_iter iter2;
- struct bpos next;
-
- bch2_trans_copy_iter(&iter2, iter);
-
- struct btree_path *path = btree_iter_path(iter->trans, iter);
- if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
- end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));
-
- end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));
-
- /*
-		 * btree node min/max is a closed interval, while
-		 * bch2_btree_iter_peek_max() takes a half-open interval:
- */
- k = bch2_btree_iter_peek_max(&iter2, end);
- next = iter2.pos;
- bch2_trans_iter_exit(iter->trans, &iter2);
-
- BUG_ON(next.offset >= iter->pos.offset + U32_MAX);
-
- if (bkey_err(k))
- return k;
-
- bkey_init(hole);
- hole->p = iter->pos;
-
- bch2_key_resize(hole, next.offset - iter->pos.offset);
- return (struct bkey_s_c) { hole, NULL };
- }
-}
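-
-/*
- * Editor's example (not in the original source): with alloc keys at
- * offsets 10 and 20 and the iterator positioned at 11, the helper above
- * returns a synthesized hole starting at 11 with size 9 (covering
- * [11, 20)), so callers can walk keys and holes in a single pass.
- */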
-
-static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket)
-{
- if (*ca) {
- if (bucket->offset < (*ca)->mi.first_bucket)
- bucket->offset = (*ca)->mi.first_bucket;
-
- if (bucket->offset < (*ca)->mi.nbuckets)
- return true;
-
- bch2_dev_put(*ca);
- *ca = NULL;
- bucket->inode++;
- bucket->offset = 0;
- }
-
- rcu_read_lock();
- *ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
- if (*ca) {
- *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
- bch2_dev_get(*ca);
- }
- rcu_read_unlock();
-
- return *ca != NULL;
-}
-
-static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter,
- struct bch_dev **ca, struct bkey *hole)
-{
- struct bch_fs *c = iter->trans->c;
- struct bkey_s_c k;
-again:
- k = bch2_get_key_or_hole(iter, POS_MAX, hole);
- if (bkey_err(k))
- return k;
-
- *ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode);
-
- if (!k.k->type) {
- struct bpos hole_start = bkey_start_pos(k.k);
-
- if (!*ca || !bucket_valid(*ca, hole_start.offset)) {
- if (!next_bucket(c, ca, &hole_start))
- return bkey_s_c_null;
-
- bch2_btree_iter_set_pos(iter, hole_start);
- goto again;
- }
-
- if (k.k->p.offset > (*ca)->mi.nbuckets)
- bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset);
- }
-
- return k;
-}
-
-static noinline_for_stack
-int bch2_check_alloc_key(struct btree_trans *trans,
- struct bkey_s_c alloc_k,
- struct btree_iter *alloc_iter,
- struct btree_iter *discard_iter,
- struct btree_iter *freespace_iter,
- struct btree_iter *bucket_gens_iter)
-{
- struct bch_fs *c = trans->c;
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
- unsigned gens_offset;
- struct bkey_s_c k;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p);
- if (fsck_err_on(!ca,
- trans, alloc_key_to_missing_dev_bucket,
- "alloc key for invalid device:bucket %llu:%llu",
- alloc_k.k->p.inode, alloc_k.k->p.offset))
- ret = bch2_btree_delete_at(trans, alloc_iter, 0);
- if (!ca)
- return ret;
-
- if (!ca->mi.freespace_initialized)
- goto out;
-
- a = bch2_alloc_to_v4(alloc_k, &a_convert);
-
- bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
- k = bch2_btree_iter_peek_slot(discard_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- bool is_discarded = a->data_type == BCH_DATA_need_discard;
- if (need_discard_or_freespace_err_on(!!k.k->type != is_discarded,
- trans, alloc_k, !is_discarded, true, true)) {
- ret = bch2_btree_bit_mod_iter(trans, discard_iter, is_discarded);
- if (ret)
- goto err;
- }
-
- bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
- k = bch2_btree_iter_peek_slot(freespace_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- bool is_free = a->data_type == BCH_DATA_free;
- if (need_discard_or_freespace_err_on(!!k.k->type != is_free,
- trans, alloc_k, !is_free, false, true)) {
- ret = bch2_btree_bit_mod_iter(trans, freespace_iter, is_free);
- if (ret)
- goto err;
- }
-
- bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
- k = bch2_btree_iter_peek_slot(bucket_gens_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
- trans, bucket_gens_key_wrong,
- "incorrect gen in bucket_gens btree (got %u should be %u)\n"
- " %s",
- alloc_gen(k, gens_offset), a->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
- struct bkey_i_bucket_gens *g =
- bch2_trans_kmalloc(trans, sizeof(*g));
-
- ret = PTR_ERR_OR_ZERO(g);
- if (ret)
- goto err;
-
- if (k.k->type == KEY_TYPE_bucket_gens) {
- bkey_reassemble(&g->k_i, k);
- } else {
- bkey_bucket_gens_init(&g->k_i);
- g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset);
- }
-
- g->v.gens[gens_offset] = a->gen;
-
- ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0);
- if (ret)
- goto err;
- }
-out:
-err:
-fsck_err:
- bch2_dev_put(ca);
- printbuf_exit(&buf);
- return ret;
-}
-
-static noinline_for_stack
-int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bpos start,
- struct bpos *end,
- struct btree_iter *freespace_iter)
-{
- struct bkey_s_c k;
- struct printbuf buf = PRINTBUF;
- int ret;
-
- if (!ca->mi.freespace_initialized)
- return 0;
-
- bch2_btree_iter_set_pos(freespace_iter, start);
-
- k = bch2_btree_iter_peek_slot(freespace_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- *end = bkey_min(k.k->p, *end);
-
- if (fsck_err_on(k.k->type != KEY_TYPE_set,
- trans, freespace_hole_missing,
- "hole in alloc btree missing in freespace btree\n"
- " device %llu buckets %llu-%llu",
- freespace_iter->pos.inode,
- freespace_iter->pos.offset,
- end->offset)) {
- struct bkey_i *update =
- bch2_trans_kmalloc(trans, sizeof(*update));
-
- ret = PTR_ERR_OR_ZERO(update);
- if (ret)
- goto err;
-
- bkey_init(&update->k);
- update->k.type = KEY_TYPE_set;
- update->k.p = freespace_iter->pos;
- bch2_key_resize(&update->k,
- min_t(u64, U32_MAX, end->offset -
- freespace_iter->pos.offset));
-
- ret = bch2_trans_update(trans, freespace_iter, update, 0);
- if (ret)
- goto err;
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static noinline_for_stack
-int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
- struct bpos start,
- struct bpos *end,
- struct btree_iter *bucket_gens_iter)
-{
- struct bkey_s_c k;
- struct printbuf buf = PRINTBUF;
- unsigned i, gens_offset, gens_end_offset;
- int ret;
-
- bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
-
- k = bch2_btree_iter_peek_slot(bucket_gens_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (bkey_cmp(alloc_gens_pos(start, &gens_offset),
- alloc_gens_pos(*end, &gens_end_offset)))
- gens_end_offset = KEY_TYPE_BUCKET_GENS_NR;
-
- if (k.k->type == KEY_TYPE_bucket_gens) {
- struct bkey_i_bucket_gens g;
- bool need_update = false;
-
- bkey_reassemble(&g.k_i, k);
-
- for (i = gens_offset; i < gens_end_offset; i++) {
- if (fsck_err_on(g.v.gens[i], trans,
- bucket_gens_hole_wrong,
- "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
- bucket_gens_pos_to_alloc(k.k->p, i).inode,
- bucket_gens_pos_to_alloc(k.k->p, i).offset,
- g.v.gens[i])) {
- g.v.gens[i] = 0;
- need_update = true;
- }
- }
-
- if (need_update) {
- struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
-
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- memcpy(u, &g, sizeof(g));
-
- ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
- if (ret)
- goto err;
- }
- }
-
- *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0));
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-struct check_discard_freespace_key_async {
- struct work_struct work;
- struct bch_fs *c;
- struct bbpos pos;
-};
-
-static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct bbpos pos)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, pos.btree, pos.pos, 0);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- u8 gen;
- ret = k.k->type != KEY_TYPE_set
- ? bch2_check_discard_freespace_key(trans, &iter, &gen, false)
- : 0;
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static void check_discard_freespace_key_work(struct work_struct *work)
-{
- struct check_discard_freespace_key_async *w =
- container_of(work, struct check_discard_freespace_key_async, work);
-
- bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos));
- bch2_write_ref_put(w->c, BCH_WRITE_REF_check_discard_freespace_key);
- kfree(w);
-}
-
-int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen,
- bool async_repair)
-{
- struct bch_fs *c = trans->c;
- enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
- ? BCH_DATA_need_discard
- : BCH_DATA_free;
- struct printbuf buf = PRINTBUF;
-
- struct bpos bucket = iter->pos;
- bucket.offset &= ~(~0ULL << 56);
- u64 genbits = iter->pos.offset & (~0ULL << 56);
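-	/* freespace keys pack genbits into the top 8 bits of the offset; the low 56 bits are the bucket */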
-
- struct btree_iter alloc_iter;
- struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter,
- BTREE_ID_alloc, bucket,
- async_repair ? BTREE_ITER_cached : 0);
- int ret = bkey_err(alloc_k);
- if (ret)
- return ret;
-
- if (!bch2_dev_bucket_exists(c, bucket)) {
- if (fsck_err(trans, need_discard_freespace_key_to_invalid_dev_bucket,
-			     "entry in %s btree for nonexistent dev:bucket %llu:%llu",
- bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset))
- goto delete;
- ret = 1;
- goto out;
- }
-
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
-
- if (a->data_type != state ||
- (state == BCH_DATA_free &&
- genbits != alloc_freespace_genbits(*a))) {
- if (fsck_err(trans, need_discard_freespace_key_bad,
- "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
- bch2_btree_id_str(iter->btree_id),
- iter->pos.inode,
- iter->pos.offset,
- a->data_type == state,
- genbits >> 56, alloc_freespace_genbits(*a) >> 56))
- goto delete;
- ret = 1;
- goto out;
- }
-
- *gen = a->gen;
-out:
-fsck_err:
- bch2_set_btree_iter_dontneed(&alloc_iter);
- bch2_trans_iter_exit(trans, &alloc_iter);
- printbuf_exit(&buf);
- return ret;
-delete:
- if (!async_repair) {
- ret = bch2_btree_bit_mod_iter(trans, iter, false) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_commit;
- goto out;
- } else {
- /*
- * We can't repair here when called from the allocator path: the
- * commit will recurse back into the allocator
- */
- struct check_discard_freespace_key_async *w =
- kzalloc(sizeof(*w), GFP_KERNEL);
- if (!w)
- goto out;
-
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_check_discard_freespace_key)) {
- kfree(w);
- goto out;
- }
-
- INIT_WORK(&w->work, check_discard_freespace_key_work);
- w->c = c;
- w->pos = BBPOS(iter->btree_id, iter->pos);
- queue_work(c->write_ref_wq, &w->work);
- goto out;
- }
-}
-
-static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter)
-{
- u8 gen;
- int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false);
- return ret < 0 ? ret : 0;
-}
-
-/*
- * We've already checked that generation numbers in the bucket_gens btree are
- * valid for buckets that exist; this just checks for keys for nonexistent
- * buckets.
- */
-static noinline_for_stack
-int bch2_check_bucket_gens_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_i_bucket_gens g;
- u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
- u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
- u64 b;
- bool need_update = false;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
- bkey_reassemble(&g.k_i, k);
-
- struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
- if (!ca) {
- if (fsck_err(trans, bucket_gens_to_invalid_dev,
- "bucket_gens key for invalid device:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- ret = bch2_btree_delete_at(trans, iter, 0);
- goto out;
- }
-
- if (fsck_err_on(end <= ca->mi.first_bucket ||
- start >= ca->mi.nbuckets,
- trans, bucket_gens_to_invalid_buckets,
- "bucket_gens key for invalid buckets:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = bch2_btree_delete_at(trans, iter, 0);
- goto out;
- }
-
- for (b = start; b < ca->mi.first_bucket; b++)
- if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
- trans, bucket_gens_nonzero_for_invalid_buckets,
- "bucket_gens key has nonzero gen for invalid bucket")) {
- g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
- need_update = true;
- }
-
- for (b = ca->mi.nbuckets; b < end; b++)
- if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
- trans, bucket_gens_nonzero_for_invalid_buckets,
- "bucket_gens key has nonzero gen for invalid bucket")) {
- g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
- need_update = true;
- }
-
- if (need_update) {
- struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
-
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto out;
-
- memcpy(u, &g, sizeof(g));
- ret = bch2_trans_update(trans, iter, u, 0);
- }
-out:
-fsck_err:
- bch2_dev_put(ca);
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_check_alloc_info(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
- struct bch_dev *ca = NULL;
- struct bkey hole;
- struct bkey_s_c k;
- int ret = 0;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_prefetch);
- bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
- BTREE_ITER_prefetch);
- bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
- BTREE_ITER_prefetch);
- bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
- BTREE_ITER_prefetch);
-
- while (1) {
- struct bpos next;
-
- bch2_trans_begin(trans);
-
- k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole);
- ret = bkey_err(k);
- if (ret)
- goto bkey_err;
-
- if (!k.k)
- break;
-
- if (k.k->type) {
- next = bpos_nosnap_successor(k.k->p);
-
- ret = bch2_check_alloc_key(trans,
- k, &iter,
- &discard_iter,
- &freespace_iter,
- &bucket_gens_iter);
- if (ret)
- goto bkey_err;
- } else {
- next = k.k->p;
-
- ret = bch2_check_alloc_hole_freespace(trans, ca,
- bkey_start_pos(k.k),
- &next,
- &freespace_iter) ?:
- bch2_check_alloc_hole_bucket_gens(trans,
- bkey_start_pos(k.k),
- &next,
- &bucket_gens_iter);
- if (ret)
- goto bkey_err;
- }
-
- ret = bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto bkey_err;
-
- bch2_btree_iter_set_pos(&iter, next);
-bkey_err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &bucket_gens_iter);
- bch2_trans_iter_exit(trans, &freespace_iter);
- bch2_trans_iter_exit(trans, &discard_iter);
- bch2_trans_iter_exit(trans, &iter);
- bch2_dev_put(ca);
- ca = NULL;
-
- if (ret < 0)
- goto err;
-
- ret = for_each_btree_key(trans, iter,
- BTREE_ID_need_discard, POS_MIN,
- BTREE_ITER_prefetch, k,
- bch2_check_discard_freespace_key_fsck(trans, &iter));
- if (ret)
- goto err;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
- BTREE_ITER_prefetch);
- while (1) {
- bch2_trans_begin(trans);
- k = bch2_btree_iter_peek(&iter);
- if (!k.k)
- break;
-
- ret = bkey_err(k) ?:
- bch2_check_discard_freespace_key_fsck(trans, &iter);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- ret = 0;
- continue;
- }
- if (ret) {
- struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, k);
-
- bch_err(c, "while checking %s", buf.buf);
- printbuf_exit(&buf);
- break;
- }
-
- bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
- }
- bch2_trans_iter_exit(trans, &iter);
- if (ret)
- goto err;
-
- ret = for_each_btree_key_commit(trans, iter,
- BTREE_ID_bucket_gens, POS_MIN,
- BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_bucket_gens_key(trans, &iter, k));
-err:
- bch2_trans_put(trans);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
- struct btree_iter *alloc_iter,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
- struct bkey_s_c alloc_k;
- struct printbuf buf = PRINTBUF;
- int ret;
-
- alloc_k = bch2_btree_iter_peek(alloc_iter);
- if (!alloc_k.k)
- return 0;
-
- ret = bkey_err(alloc_k);
- if (ret)
- return ret;
-
- struct bch_dev *ca = bch2_dev_tryget_noerror(c, alloc_k.k->p.inode);
- if (!ca)
- return 0;
-
- a = bch2_alloc_to_v4(alloc_k, &a_convert);
-
- u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
- if (lru_idx) {
- ret = bch2_lru_check_set(trans, BCH_LRU_BUCKET_FRAGMENTATION,
- bucket_to_u64(alloc_k.k->p),
- lru_idx, alloc_k, last_flushed);
- if (ret)
- goto err;
- }
-
- if (a->data_type != BCH_DATA_cached)
- goto err;
-
- if (fsck_err_on(!a->io_time[READ],
- trans, alloc_key_cached_but_read_time_zero,
- "cached bucket with read_time 0\n"
- " %s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
- struct bkey_i_alloc_v4 *a_mut =
- bch2_alloc_to_v4_mut(trans, alloc_k);
- ret = PTR_ERR_OR_ZERO(a_mut);
- if (ret)
- goto err;
-
- a_mut->v.io_time[READ] = bch2_current_io_time(c, READ);
- ret = bch2_trans_update(trans, alloc_iter,
- &a_mut->k_i, BTREE_TRIGGER_norun);
- if (ret)
- goto err;
-
- a = &a_mut->v;
- }
-
- ret = bch2_lru_check_set(trans, alloc_k.k->p.inode,
- bucket_to_u64(alloc_k.k->p),
- a->io_time[READ],
- alloc_k, last_flushed);
- if (ret)
- goto err;
-err:
-fsck_err:
- bch2_dev_put(ca);
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
-{
- struct bkey_buf last_flushed;
-
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
- POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))) ?:
- bch2_check_stripe_to_lru_refs(c);
-
- bch2_bkey_buf_exit(&last_flushed, c);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress)
-{
- int ret;
-
- mutex_lock(&ca->discard_buckets_in_flight_lock);
- darray_for_each(ca->discard_buckets_in_flight, i)
- if (i->bucket == bucket) {
- ret = -BCH_ERR_EEXIST_discard_in_flight_add;
- goto out;
- }
-
- ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) {
- .in_progress = in_progress,
- .bucket = bucket,
- }));
-out:
- mutex_unlock(&ca->discard_buckets_in_flight_lock);
- return ret;
-}
-
-static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
-{
- mutex_lock(&ca->discard_buckets_in_flight_lock);
- darray_for_each(ca->discard_buckets_in_flight, i)
- if (i->bucket == bucket) {
- BUG_ON(!i->in_progress);
- darray_remove_item(&ca->discard_buckets_in_flight, i);
- goto found;
- }
- BUG();
-found:
- mutex_unlock(&ca->discard_buckets_in_flight_lock);
-}
-
-struct discard_buckets_state {
- u64 seen;
- u64 open;
- u64 need_journal_commit;
- u64 discarded;
-};
-
-/*
- * This is needed because discard is both a filesystem option and a device
- * option, and mount options are supposed to apply to that mount and not be
- * persisted, i.e. if it's set as a mount option we can't propagate it to the
- * device.
- */
-static inline bool discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca)
-{
- return test_bit(BCH_FS_discard_mount_opt_set, &c->flags)
- ? c->opts.discard
- : ca->mi.discard;
-}
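-
-/*
- * Editor's illustration: mounting with -o discard makes every device
- * issue discards for this mount even if its member flag is clear, and
- * -o nodiscard suppresses them even where the flag is set; absent an
- * explicit mount option, the per-device flag decides.
- */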
-
-static int bch2_discard_one_bucket(struct btree_trans *trans,
- struct bch_dev *ca,
- struct btree_iter *need_discard_iter,
- struct bpos *discard_pos_done,
- struct discard_buckets_state *s,
- bool fastpath)
-{
- struct bch_fs *c = trans->c;
- struct bpos pos = need_discard_iter->pos;
- struct btree_iter iter = { NULL };
- struct bkey_s_c k;
- struct bkey_i_alloc_v4 *a;
- struct printbuf buf = PRINTBUF;
- bool discard_locked = false;
- int ret = 0;
-
- if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
- s->open++;
- goto out;
- }
-
- u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
- pos.inode, pos.offset);
- if (seq_ready > c->journal.flushed_seq_ondisk) {
- if (seq_ready > c->journal.flushing_seq)
- s->need_journal_commit++;
- goto out;
- }
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
- need_discard_iter->pos,
- BTREE_ITER_cached);
- ret = bkey_err(k);
- if (ret)
- goto out;
-
- a = bch2_alloc_to_v4_mut(trans, k);
- ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- goto out;
-
- if (a->v.data_type != BCH_DATA_need_discard) {
- if (need_discard_or_freespace_err(trans, k, true, true, true)) {
- ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false);
- if (ret)
- goto out;
- goto commit;
- }
-
- goto out;
- }
-
- if (!fastpath) {
- if (discard_in_flight_add(ca, iter.pos.offset, true))
- goto out;
-
- discard_locked = true;
- }
-
- if (!bkey_eq(*discard_pos_done, iter.pos)) {
- s->discarded++;
- *discard_pos_done = iter.pos;
-
- if (discard_opt_enabled(c, ca) && !c->opts.nochanges) {
- /*
- * This works without any other locks because this is the only
- * thread that removes items from the need_discard tree
- */
- bch2_trans_unlock_long(trans);
- blkdev_issue_discard(ca->disk_sb.bdev,
- k.k->p.offset * ca->mi.bucket_size,
- ca->mi.bucket_size,
- GFP_KERNEL);
- ret = bch2_trans_relock_notrace(trans);
- if (ret)
- goto out;
- }
- }
-
- SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
- alloc_data_type_set(&a->v, a->v.data_type);
-
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
- if (ret)
- goto out;
-commit:
- ret = bch2_trans_commit(trans, NULL, NULL,
- BCH_WATERMARK_btree|
- BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto out;
-
- if (!fastpath)
- count_event(c, bucket_discard);
- else
- count_event(c, bucket_discard_fast);
-out:
-fsck_err:
- if (discard_locked)
- discard_in_flight_remove(ca, iter.pos.offset);
- if (!ret)
- s->seen++;
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-static void bch2_do_discards_work(struct work_struct *work)
-{
- struct bch_dev *ca = container_of(work, struct bch_dev, discard_work);
- struct bch_fs *c = ca->fs;
- struct discard_buckets_state s = {};
- struct bpos discard_pos_done = POS_MAX;
- int ret;
-
- /*
- * We're doing the commit in bch2_discard_one_bucket instead of using
- * for_each_btree_key_commit() so that we can increment counters after
- * successful commit:
- */
- ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter,
- BTREE_ID_need_discard,
- POS(ca->dev_idx, 0),
- POS(ca->dev_idx, U64_MAX), 0, k,
- bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false)));
-
- if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal))
- bch2_journal_flush_async(&c->journal, NULL);
-
- trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
- bch2_err_str(ret));
-
- percpu_ref_put(&ca->io_ref);
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
-}
-
-void bch2_dev_do_discards(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
-
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
- return;
-
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
- goto put_write_ref;
-
- if (queue_work(c->write_ref_wq, &ca->discard_work))
- return;
-
- percpu_ref_put(&ca->io_ref);
-put_write_ref:
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
-}
-
-void bch2_do_discards(struct bch_fs *c)
-{
- for_each_member_device(c, ca)
- bch2_dev_do_discards(ca);
-}
-
-static int bch2_do_discards_fast_one(struct btree_trans *trans,
- struct bch_dev *ca,
- u64 bucket,
- struct bpos *discard_pos_done,
- struct discard_buckets_state *s)
-{
- struct btree_iter need_discard_iter;
- struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter,
- BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0);
- int ret = bkey_err(discard_k);
- if (ret)
- return ret;
-
- if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set,
- trans, discarding_bucket_not_in_need_discard_btree,
- "attempting to discard bucket %u:%llu not in need_discard btree",
- ca->dev_idx, bucket))
- goto out;
-
- ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true);
-out:
-fsck_err:
- bch2_trans_iter_exit(trans, &need_discard_iter);
- return ret;
-}
-
-static void bch2_do_discards_fast_work(struct work_struct *work)
-{
- struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work);
- struct bch_fs *c = ca->fs;
- struct discard_buckets_state s = {};
- struct bpos discard_pos_done = POS_MAX;
- struct btree_trans *trans = bch2_trans_get(c);
- int ret = 0;
-
- while (1) {
- bool got_bucket = false;
- u64 bucket;
-
- mutex_lock(&ca->discard_buckets_in_flight_lock);
- darray_for_each(ca->discard_buckets_in_flight, i) {
- if (i->in_progress)
- continue;
-
- got_bucket = true;
- bucket = i->bucket;
- i->in_progress = true;
- break;
- }
- mutex_unlock(&ca->discard_buckets_in_flight_lock);
-
- if (!got_bucket)
- break;
-
- ret = lockrestart_do(trans,
- bch2_do_discards_fast_one(trans, ca, bucket, &discard_pos_done, &s));
- bch_err_fn(c, ret);
-
- discard_in_flight_remove(ca, bucket);
-
- if (ret)
- break;
- }
-
- trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));
-
- bch2_trans_put(trans);
- percpu_ref_put(&ca->io_ref);
- bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
-}
-
-static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
-{
- struct bch_fs *c = ca->fs;
-
- if (discard_in_flight_add(ca, bucket, false))
- return;
-
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
- return;
-
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
- goto put_ref;
-
- if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
- return;
-
- percpu_ref_put(&ca->io_ref);
-put_ref:
- bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
-}
-
-static int invalidate_one_bp(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bkey_s_c_backpointer bp,
- struct bkey_buf *last_flushed)
-{
- struct btree_iter extent_iter;
- struct bkey_s_c extent_k =
- bch2_backpointer_get_key(trans, bp, &extent_iter, 0, last_flushed);
- int ret = bkey_err(extent_k);
- if (ret)
- return ret;
-
- struct bkey_i *n =
- bch2_bkey_make_mut(trans, &extent_iter, &extent_k,
- BTREE_UPDATE_internal_snapshot_node);
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto err;
-
- bch2_bkey_drop_device(bkey_i_to_s(n), ca->dev_idx);
-err:
- bch2_trans_iter_exit(trans, &extent_iter);
- return ret;
-}
-
-static int invalidate_one_bucket_by_bps(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bpos bucket,
- u8 gen,
- struct bkey_buf *last_flushed)
-{
- struct bpos bp_start = bucket_pos_to_bp_start(ca, bucket);
- struct bpos bp_end = bucket_pos_to_bp_end(ca, bucket);
-
- return for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
- bp_start, bp_end, 0, k,
- NULL, NULL,
- BCH_WATERMARK_btree|
- BCH_TRANS_COMMIT_no_enospc, ({
- if (k.k->type != KEY_TYPE_backpointer)
- continue;
-
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
-
-		/* filter out bps with gens that don't match */
-		if (bp.v->bucket_gen != gen)
-			continue;
-
-		invalidate_one_bp(trans, ca, bp, last_flushed);
- }));
-}
-
-noinline_for_stack
-static int invalidate_one_bucket(struct btree_trans *trans,
- struct bch_dev *ca,
- struct btree_iter *lru_iter,
- struct bkey_s_c lru_k,
- struct bkey_buf *last_flushed,
- s64 *nr_to_invalidate)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
- struct btree_iter alloc_iter = {};
- int ret = 0;
-
- if (*nr_to_invalidate <= 0)
- return 1;
-
- if (!bch2_dev_bucket_exists(c, bucket)) {
- if (fsck_err(trans, lru_entry_to_invalid_bucket,
- "lru key points to nonexistent device:bucket %llu:%llu",
- bucket.inode, bucket.offset))
- return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
- goto out;
- }
-
- if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
- return 0;
-
- struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter,
- BTREE_ID_alloc, bucket,
- BTREE_ITER_cached);
- ret = bkey_err(alloc_k);
- if (ret)
- return ret;
-
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
-
- /* We expect harmless races here due to the btree write buffer: */
- if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a))
- goto out;
-
- /*
- * Impossible since alloc_lru_idx_read() only returns nonzero if the
- * bucket is supposed to be on the cached bucket LRU (i.e.
- * BCH_DATA_cached)
- *
- * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0
- */
- BUG_ON(a->data_type != BCH_DATA_cached);
- BUG_ON(a->dirty_sectors);
-
- if (!a->cached_sectors)
- bch_err(c, "invalidating empty bucket, confused");
-
- unsigned cached_sectors = a->cached_sectors;
- u8 gen = a->gen;
-
- ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed);
- if (ret)
- goto out;
-
- trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
- --*nr_to_invalidate;
-out:
-fsck_err:
- bch2_trans_iter_exit(trans, &alloc_iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter,
- struct bch_dev *ca, bool *wrapped)
-{
- struct bkey_s_c k;
-again:
- k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
- if (!k.k && !*wrapped) {
- bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0));
- *wrapped = true;
- goto again;
- }
-
- return k;
-}
-
-static void bch2_do_invalidates_work(struct work_struct *work)
-{
- struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work);
- struct bch_fs *c = ca->fs;
- struct btree_trans *trans = bch2_trans_get(c);
- int ret = 0;
-
- struct bkey_buf last_flushed;
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- ret = bch2_btree_write_buffer_tryflush(trans);
- if (ret)
- goto err;
-
- s64 nr_to_invalidate =
- should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
- struct btree_iter iter;
- bool wrapped = false;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
- lru_pos(ca->dev_idx, 0,
- ((bch2_current_io_time(c, READ) + U32_MAX) &
- LRU_TIME_MAX)), 0);
-
- while (true) {
- bch2_trans_begin(trans);
-
- struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
- ret = bkey_err(k);
- if (ret)
- goto restart_err;
- if (!k.k)
- break;
-
- ret = invalidate_one_bucket(trans, ca, &iter, k, &last_flushed, &nr_to_invalidate);
-restart_err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-
- bch2_btree_iter_advance(&iter);
- }
- bch2_trans_iter_exit(trans, &iter);
-err:
- bch2_trans_put(trans);
- percpu_ref_put(&ca->io_ref);
- bch2_bkey_buf_exit(&last_flushed, c);
- bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
-}
-
-void bch2_dev_do_invalidates(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
-
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
- return;
-
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
- goto put_ref;
-
- if (queue_work(c->write_ref_wq, &ca->invalidate_work))
- return;
-
- percpu_ref_put(&ca->io_ref);
-put_ref:
- bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
-}
-
-void bch2_do_invalidates(struct bch_fs *c)
-{
- for_each_member_device(c, ca)
- bch2_dev_do_invalidates(ca);
-}
-
-int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
- u64 bucket_start, u64 bucket_end)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey hole;
- struct bpos end = POS(ca->dev_idx, bucket_end);
- struct bch_member *m;
- unsigned long last_updated = jiffies;
- int ret;
-
- BUG_ON(bucket_start > bucket_end);
- BUG_ON(bucket_end > ca->mi.nbuckets);
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
- BTREE_ITER_prefetch);
- /*
- * Scan the alloc btree for every bucket on @ca, and add buckets to the
- * freespace/need_discard/need_gc_gens btrees as needed:
- */
- while (1) {
- if (time_after(jiffies, last_updated + HZ * 10)) {
- bch_info(ca, "%s: currently at %llu/%llu",
- __func__, iter.pos.offset, ca->mi.nbuckets);
- last_updated = jiffies;
- }
-
- bch2_trans_begin(trans);
-
- if (bkey_ge(iter.pos, end)) {
- ret = 0;
- break;
- }
-
- k = bch2_get_key_or_hole(&iter, end, &hole);
- ret = bkey_err(k);
- if (ret)
- goto bkey_err;
-
- if (k.k->type) {
- /*
- * We process live keys in the alloc btree one at a
- * time:
- */
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
-
- ret = bch2_bucket_do_index(trans, ca, k, a, true) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto bkey_err;
-
- bch2_btree_iter_advance(&iter);
- } else {
- struct bkey_i *freespace;
-
- freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
- ret = PTR_ERR_OR_ZERO(freespace);
- if (ret)
- goto bkey_err;
-
- bkey_init(&freespace->k);
- freespace->k.type = KEY_TYPE_set;
- freespace->k.p = k.k->p;
- freespace->k.size = k.k->size;
-
- ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto bkey_err;
-
- bch2_btree_iter_set_pos(&iter, k.k->p);
- }
-bkey_err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
- }
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
-
- if (ret < 0) {
- bch_err_msg(ca, ret, "initializing free space");
- return ret;
- }
-
- mutex_lock(&c->sb_lock);
- m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
- mutex_unlock(&c->sb_lock);
-
- return 0;
-}
-
-int bch2_fs_freespace_init(struct bch_fs *c)
-{
- int ret = 0;
- bool doing_init = false;
-
- /*
- * We can crash during the device add path, so we need to check this on
- * every mount:
- */
-
- for_each_member_device(c, ca) {
- if (ca->mi.freespace_initialized)
- continue;
-
- if (!doing_init) {
- bch_info(c, "initializing freespace");
- doing_init = true;
- }
-
- ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
- if (ret) {
- bch2_dev_put(ca);
- bch_err_fn(c, ret);
- return ret;
- }
- }
-
- if (doing_init) {
- mutex_lock(&c->sb_lock);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- bch_verbose(c, "done initializing freespace");
- }
-
- return 0;
-}
-
-/* device removal */
-
-int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bpos start = POS(ca->dev_idx, 0);
- struct bpos end = POS(ca->dev_idx, U64_MAX);
- int ret;
-
- /*
- * We clear the LRU and need_discard btrees first so that we don't race
- * with bch2_do_invalidates() and bch2_do_discards()
- */
- ret = bch2_dev_remove_stripes(c, ca->dev_idx) ?:
- bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_dev_usage_remove(c, ca->dev_idx);
- bch_err_msg(ca, ret, "removing dev alloc info");
- return ret;
-}
-
-/* Bucket IO clocks: */
-
-static int __bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
- size_t bucket_nr, int rw)
-{
- struct bch_fs *c = trans->c;
-
- struct btree_iter iter;
- struct bkey_i_alloc_v4 *a =
- bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
- int ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- return ret;
-
- u64 now = bch2_current_io_time(c, rw);
- if (a->v.io_time[rw] == now)
- goto out;
-
- a->v.io_time[rw] = now;
-
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
- bch2_trans_commit(trans, NULL, NULL, 0);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
- size_t bucket_nr, int rw)
-{
- if (bch2_trans_relock(trans))
- bch2_trans_begin(trans);
-
- return nested_lockrestart_do(trans, __bch2_bucket_io_time_reset(trans, dev, bucket_nr, rw));
-}
-
-/* Startup/shutdown (ro/rw): */
-
-void bch2_recalc_capacity(struct bch_fs *c)
-{
- u64 capacity = 0, reserved_sectors = 0, gc_reserve;
- unsigned bucket_size_max = 0;
- unsigned long ra_pages = 0;
-
- lockdep_assert_held(&c->state_lock);
-
- for_each_online_member(c, ca) {
- struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
-
- ra_pages += bdi->ra_pages;
- }
-
- bch2_set_ra_pages(c, ra_pages);
-
- for_each_rw_member(c, ca) {
- u64 dev_reserve = 0;
-
- /*
- * We need to reserve buckets (from the number
- * of currently available buckets) against
- * foreground writes so that mainly copygc can
- * make forward progress.
- *
- * We need enough to refill the various reserves
- * from scratch - copygc will use its entire
-	 * reserve all at once, then run again when
- * its reserve is refilled (from the formerly
- * available buckets).
- *
- * This reserve is just used when considering if
- * allocations for foreground writes must wait -
- * not -ENOSPC calculations.
- */
-
- dev_reserve += ca->nr_btree_reserve * 2;
- dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
-
- dev_reserve += 1; /* btree write point */
- dev_reserve += 1; /* copygc write point */
- dev_reserve += 1; /* rebalance write point */
-
- dev_reserve *= ca->mi.bucket_size;
-
- capacity += bucket_to_sector(ca, ca->mi.nbuckets -
- ca->mi.first_bucket);
-
- reserved_sectors += dev_reserve * 2;
-
- bucket_size_max = max_t(unsigned, bucket_size_max,
- ca->mi.bucket_size);
- }
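-
-	/*
-	 * Editor's worked example (assumes nr_btree_reserve == 3): a device
-	 * with 2^20 buckets of 512 sectors contributes
-	 * (3*2 + 2^20/64 + 3) * 512 = 8,393,216 reserved sectors here,
-	 * counted twice when added to reserved_sectors above.
-	 */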
-
- gc_reserve = c->opts.gc_reserve_bytes
- ? c->opts.gc_reserve_bytes >> 9
- : div64_u64(capacity * c->opts.gc_reserve_percent, 100);
-
- reserved_sectors = max(gc_reserve, reserved_sectors);
-
- reserved_sectors = min(reserved_sectors, capacity);
-
- c->reserved = reserved_sectors;
- c->capacity = capacity - reserved_sectors;
-
- c->bucket_size_max = bucket_size_max;
-
-	/* Wake up in case someone was waiting for buckets */
- closure_wake_up(&c->freelist_wait);
-}
-
-u64 bch2_min_rw_member_capacity(struct bch_fs *c)
-{
- u64 ret = U64_MAX;
-
- for_each_rw_member(c, ca)
- ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
- return ret;
-}
-
-static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
-{
- struct open_bucket *ob;
- bool ret = false;
-
- for (ob = c->open_buckets;
- ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
- ob++) {
- spin_lock(&ob->lock);
- if (ob->valid && !ob->on_partial_list &&
- ob->dev == ca->dev_idx)
- ret = true;
- spin_unlock(&ob->lock);
- }
-
- return ret;
-}
-
-/* device goes ro: */
-void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
-{
- lockdep_assert_held(&c->state_lock);
-
- /* First, remove device from allocation groups: */
-
- for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
- clear_bit(ca->dev_idx, c->rw_devs[i].d);
-
- c->rw_devs_change_count++;
-
- /*
-	 * Capacity is calculated based on devices in allocation groups:
- */
- bch2_recalc_capacity(c);
-
- bch2_open_buckets_stop(c, ca, false);
-
- /*
- * Wake up threads that were blocked on allocation, so they can notice
- * the device can no longer be removed and the capacity has changed:
- */
- closure_wake_up(&c->freelist_wait);
-
- /*
- * journal_res_get() can block waiting for free space in the journal -
- * it needs to notice there may not be devices to allocate from anymore:
- */
- wake_up(&c->journal.wait);
-
- /* Now wait for any in flight writes: */
-
- closure_wait_event(&c->open_buckets_wait,
- !bch2_dev_has_open_write_point(c, ca));
-}
-
-/* device goes rw: */
-void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
-{
- lockdep_assert_held(&c->state_lock);
-
- for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
- if (ca->mi.data_allowed & (1 << i))
- set_bit(ca->dev_idx, c->rw_devs[i].d);
-
- c->rw_devs_change_count++;
-}
-
-void bch2_dev_allocator_background_exit(struct bch_dev *ca)
-{
- darray_exit(&ca->discard_buckets_in_flight);
-}
-
-void bch2_dev_allocator_background_init(struct bch_dev *ca)
-{
- mutex_init(&ca->discard_buckets_in_flight_lock);
- INIT_WORK(&ca->discard_work, bch2_do_discards_work);
- INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work);
- INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work);
-}
-
-void bch2_fs_allocator_background_init(struct bch_fs *c)
-{
- spin_lock_init(&c->freelist_lock);
-}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
deleted file mode 100644
index c556ccaffe89..000000000000
--- a/fs/bcachefs/alloc_background.h
+++ /dev/null
@@ -1,361 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ALLOC_BACKGROUND_H
-#define _BCACHEFS_ALLOC_BACKGROUND_H
-
-#include "bcachefs.h"
-#include "alloc_types.h"
-#include "buckets.h"
-#include "debug.h"
-#include "super.h"
-
-/* How out of date a pointer gen is allowed to be: */
-#define BUCKET_GC_GEN_MAX 96U
-
-static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
-{
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
- bool ret = ca && bucket_valid(ca, pos.offset);
- rcu_read_unlock();
- return ret;
-}
-
-static inline u64 bucket_to_u64(struct bpos bucket)
-{
- return (bucket.inode << 48) | bucket.offset;
-}
-
-static inline struct bpos u64_to_bucket(u64 bucket)
-{
- return POS(bucket >> 48, bucket & ~(~0ULL << 48));
-}
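-
-/*
- * Editor's example round trip (assumes bucket offsets fit in 48 bits):
- * bucket_to_u64(POS(2, 100)) == (2ULL << 48) | 100, and u64_to_bucket()
- * splits at bit 48 to recover inode 2, offset 100.
- */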
-
-static inline u8 alloc_gc_gen(struct bch_alloc_v4 a)
-{
- return a.gen - a.oldest_gen;
-}
-
-static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src)
-{
- dst->gen = src.gen;
- dst->data_type = src.data_type;
- dst->stripe_sectors = src.stripe_sectors;
- dst->dirty_sectors = src.dirty_sectors;
- dst->cached_sectors = src.cached_sectors;
- dst->stripe = src.stripe;
-}
-
-static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket src)
-{
- dst->gen = src.gen;
- dst->data_type = src.data_type;
- dst->stripe_sectors = src.stripe_sectors;
- dst->dirty_sectors = src.dirty_sectors;
- dst->cached_sectors = src.cached_sectors;
- dst->stripe = src.stripe;
-}
-
-static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b)
-{
- struct bch_alloc_v4 ret = {};
- __bucket_m_to_alloc(&ret, b);
- return ret;
-}
-
-static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type)
-{
- switch (data_type) {
- case BCH_DATA_cached:
- case BCH_DATA_stripe:
- return BCH_DATA_user;
- default:
- return data_type;
- }
-}
-
-static inline bool bucket_data_type_mismatch(enum bch_data_type bucket,
- enum bch_data_type ptr)
-{
- return !data_type_is_empty(bucket) &&
- bucket_data_type(bucket) != bucket_data_type(ptr);
-}
-
-/*
- * It is my general preference to use unsigned types for unsigned quantities -
- * however, these helpers are used in disk accounting calculations run by
- * triggers where the output will be negated and added to an s64. unsigned is
- * right out even though all these quantities will fit in 32 bits, since it
- * won't be sign extended correctly; u64 will negate "correctly", but s64 is the
- * simpler option here.
- */
-static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a)
-{
- return a.stripe_sectors + a.dirty_sectors + a.cached_sectors;
-}
-
-static inline s64 bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
-{
- return a.stripe_sectors + a.dirty_sectors;
-}
-
-static inline s64 bch2_bucket_sectors(struct bch_alloc_v4 a)
-{
- return a.data_type == BCH_DATA_cached
- ? a.cached_sectors
- : bch2_bucket_sectors_dirty(a);
-}
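-
-/*
- * Editor's example of the sign-extension hazard described above: with a
- * u32 return type, a trigger computing -(u32)512 and widening to s64
- * would see 4294966784 instead of -512; returning s64 keeps negation
- * well behaved.
- */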
-
-static inline s64 bch2_bucket_sectors_fragmented(struct bch_dev *ca,
- struct bch_alloc_v4 a)
-{
- int d = bch2_bucket_sectors(a);
-
- return d ? max(0, ca->mi.bucket_size - d) : 0;
-}
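-
-/*
- * Editor's worked example: with bucket_size == 512 sectors, a bucket
- * holding 200 live sectors counts 512 - 200 = 312 fragmented sectors;
- * a bucket with no data at all counts 0, since there is nothing to move.
- */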
-
-static inline s64 bch2_gc_bucket_sectors_fragmented(struct bch_dev *ca, struct bucket a)
-{
- int d = a.stripe_sectors + a.dirty_sectors;
-
- return d ? max(0, ca->mi.bucket_size - d) : 0;
-}
-
-static inline s64 bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a)
-{
- return a.data_type == BCH_DATA_stripe ? a.dirty_sectors : 0;
-}
-
-static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
- enum bch_data_type data_type)
-{
- if (a.stripe)
- return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
- if (bch2_bucket_sectors_dirty(a))
- return bucket_data_type(data_type);
- if (a.cached_sectors)
- return BCH_DATA_cached;
- if (BCH_ALLOC_V4_NEED_DISCARD(&a))
- return BCH_DATA_need_discard;
- if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX)
- return BCH_DATA_need_gc_gens;
- return BCH_DATA_free;
-}
-
-static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_type data_type)
-{
- a->data_type = alloc_data_type(*a, data_type);
-}
-
-static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a)
-{
- return a.data_type == BCH_DATA_cached
- ? a.io_time[READ] & LRU_TIME_MAX
- : 0;
-}
-
-#define DATA_TYPES_MOVABLE \
- ((1U << BCH_DATA_btree)| \
- (1U << BCH_DATA_user)| \
- (1U << BCH_DATA_stripe))
-
-static inline bool data_type_movable(enum bch_data_type type)
-{
- return (1U << type) & DATA_TYPES_MOVABLE;
-}
-
-static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
- struct bch_dev *ca)
-{
- if (a.data_type >= BCH_DATA_NR)
- return 0;
-
- if (!data_type_movable(a.data_type) ||
- !bch2_bucket_sectors_fragmented(ca, a))
- return 0;
-
- /*
- * avoid overflowing LRU_TIME_BITS on a corrupted fs, when
- * bucket_sectors_dirty is (much) bigger than bucket_size
- */
- u64 d = min_t(s64, bch2_bucket_sectors_dirty(a),
- ca->mi.bucket_size);
-
- return div_u64(d * (1ULL << 31), ca->mi.bucket_size);
-}
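-
-/*
- * Editor's example: with bucket_size == 512 and 128 dirty sectors the
- * index is 128 * 2^31 / 512 == 2^29, i.e. proportional to how full the
- * bucket is, so emptier (more fragmented) buckets sort toward the front
- * of the fragmentation LRU.
- */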
-
-static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a)
-{
- return ((u64) alloc_gc_gen(a) >> 4) << 56;
-}
-
-static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a)
-{
- pos.offset |= alloc_freespace_genbits(a);
- return pos;
-}
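-
-/*
- * Editor's example: a bucket whose gen is 37 ahead of oldest_gen yields
- * genbits (37 >> 4) << 56 == 2ULL << 56, ORed into the high bits of the
- * freespace key offset; the low bits still identify the bucket.
- */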
-
-static inline unsigned alloc_v4_u64s_noerror(const struct bch_alloc_v4 *a)
-{
- return (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
- BCH_ALLOC_V4_U64s_V0) +
- BCH_ALLOC_V4_NR_BACKPOINTERS(a) *
- (sizeof(struct bch_backpointer) / sizeof(u64));
-}
-
-static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
-{
- unsigned ret = alloc_v4_u64s_noerror(a);
- BUG_ON(ret > U8_MAX - BKEY_U64s);
- return ret;
-}
-
-static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a)
-{
- set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v));
-}
-
-struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos);
-struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update(struct btree_trans *, struct bpos,
- enum btree_iter_update_trigger_flags);
-
-void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *);
-
-static inline const struct bch_alloc_v4 *bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *convert)
-{
- const struct bch_alloc_v4 *ret;
-
- if (unlikely(k.k->type != KEY_TYPE_alloc_v4))
- goto slowpath;
-
- ret = bkey_s_c_to_alloc_v4(k).v;
- if (BCH_ALLOC_V4_BACKPOINTERS_START(ret) != BCH_ALLOC_V4_U64s)
- goto slowpath;
-
- return ret;
-slowpath:
- __bch2_alloc_to_v4(k, convert);
- return convert;
-}
-
-struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c);
-
-int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
-
-int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_alloc_v4_swab(struct bkey_s);
-void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_alloc ((struct bkey_ops) { \
- .key_validate = bch2_alloc_v1_validate, \
- .val_to_text = bch2_alloc_to_text, \
- .trigger = bch2_trigger_alloc, \
- .min_val_size = 8, \
-})
-
-#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \
- .key_validate = bch2_alloc_v2_validate, \
- .val_to_text = bch2_alloc_to_text, \
- .trigger = bch2_trigger_alloc, \
- .min_val_size = 8, \
-})
-
-#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \
- .key_validate = bch2_alloc_v3_validate, \
- .val_to_text = bch2_alloc_to_text, \
- .trigger = bch2_trigger_alloc, \
- .min_val_size = 16, \
-})
-
-#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \
- .key_validate = bch2_alloc_v4_validate, \
- .val_to_text = bch2_alloc_to_text, \
- .swab = bch2_alloc_v4_swab, \
- .trigger = bch2_trigger_alloc, \
- .min_val_size = 48, \
-})
-
-int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \
- .key_validate = bch2_bucket_gens_validate, \
- .val_to_text = bch2_bucket_gens_to_text, \
-})
-
-int bch2_bucket_gens_init(struct bch_fs *);
-
-static inline bool bkey_is_alloc(const struct bkey *k)
-{
- return k->type == KEY_TYPE_alloc ||
- k->type == KEY_TYPE_alloc_v2 ||
- k->type == KEY_TYPE_alloc_v3;
-}
-
-int bch2_alloc_read(struct bch_fs *);
-
-int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *,
- const struct bch_alloc_v4 *,
- const struct bch_alloc_v4 *, unsigned);
-int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, bool);
-int bch2_check_alloc_info(struct bch_fs *);
-int bch2_check_alloc_to_lru_refs(struct bch_fs *);
-void bch2_dev_do_discards(struct bch_dev *);
-void bch2_do_discards(struct bch_fs *);
-
-static inline u64 should_invalidate_buckets(struct bch_dev *ca,
- struct bch_dev_usage u)
-{
- u64 want_free = ca->mi.nbuckets >> 7;
- u64 free = max_t(s64, 0,
- u.d[BCH_DATA_free].buckets
- + u.d[BCH_DATA_need_discard].buckets
- - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe));
-
- return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);
-}
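-
-/*
- * Editor's worked example: with nbuckets == 1 << 20, want_free is 8192
- * buckets; if free + need_discard minus the stripe reserve comes to
- * 5000, the helper asks for 8192 - 5000 = 3192 invalidations, clamped
- * to the number of cached buckets actually present.
- */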
-
-void bch2_dev_do_invalidates(struct bch_dev *);
-void bch2_do_invalidates(struct bch_fs *);
-
-static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a)
-{
- return (void *) ((u64 *) &a->v +
- (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
- BCH_ALLOC_V4_U64s_V0));
-}
-
-static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a)
-{
- return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a));
-}
-
-int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64);
-int bch2_fs_freespace_init(struct bch_fs *);
-int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *);
-
-void bch2_recalc_capacity(struct bch_fs *);
-u64 bch2_min_rw_member_capacity(struct bch_fs *);
-
-void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
-void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
-
-void bch2_dev_allocator_background_exit(struct bch_dev *);
-void bch2_dev_allocator_background_init(struct bch_dev *);
-
-void bch2_fs_allocator_background_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h
deleted file mode 100644
index 740238369a5a..000000000000
--- a/fs/bcachefs/alloc_background_format.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
-#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
-
-struct bch_alloc {
- struct bch_val v;
- __u8 fields;
- __u8 gen;
- __u8 data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V1() \
- x(read_time, 16) \
- x(write_time, 16) \
- x(data_type, 8) \
- x(dirty_sectors, 16) \
- x(cached_sectors, 16) \
- x(oldest_gen, 8) \
- x(stripe, 32) \
- x(stripe_redundancy, 8)
-
-enum {
-#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
- BCH_ALLOC_FIELDS_V1()
-#undef x
-};
-
-struct bch_alloc_v2 {
- struct bch_val v;
- __u8 nr_fields;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V2() \
- x(read_time, 64) \
- x(write_time, 64) \
- x(dirty_sectors, 32) \
- x(cached_sectors, 32) \
- x(stripe, 32) \
- x(stripe_redundancy, 8)
-
-struct bch_alloc_v3 {
- struct bch_val v;
- __le64 journal_seq;
- __le32 flags;
- __u8 nr_fields;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 data[];
-} __packed __aligned(8);
-
-LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
-LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
-
-struct bch_alloc_v4 {
- struct bch_val v;
- __u64 journal_seq_nonempty;
- __u32 flags;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 stripe_redundancy;
- __u32 dirty_sectors;
- __u32 cached_sectors;
- __u64 io_time[2];
- __u32 stripe;
- __u32 nr_external_backpointers;
- /* end of fields in original version of alloc_v4 */
- __u64 journal_seq_empty;
- __u32 stripe_sectors;
- __u32 pad;
-} __packed __aligned(8);
-
-#define BCH_ALLOC_V4_U64s_V0 6
-#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64))
-
-BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
-BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
-BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
-BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
-
-#define KEY_TYPE_BUCKET_GENS_BITS 8
-#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS)
-#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1)
-
-struct bch_bucket_gens {
- struct bch_val v;
- u8 gens[KEY_TYPE_BUCKET_GENS_NR];
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */
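The BCH_ALLOC_FIELDS_V1()/V2() lists above are x-macros: each use site defines x() to stamp out an enum, a name table, or a parser from a single field list. A self-contained illustration of the pattern:

/* X-macro demo: one field list drives both the enum and the name table */
#include <stdio.h>

#define FIELDS()	\
	x(read_time)	\
	x(write_time)	\
	x(data_type)

enum field {
#define x(name) FIELD_##name,
	FIELDS()
#undef x
	FIELD_NR
};

static const char * const field_names[] = {
#define x(name) #name,
	FIELDS()
#undef x
};

int main(void)
{
	for (int i = 0; i < FIELD_NR; i++)
		printf("%d: %s\n", i, field_names[i]);
	return 0;
}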
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
deleted file mode 100644
index 0cac65347a5d..000000000000
--- a/fs/bcachefs/alloc_foreground.c
+++ /dev/null
@@ -1,1717 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 2012 Google, Inc.
- *
- * Foreground allocator code: allocate buckets from freelist, and allocate in
- * sector granularity from writepoints.
- *
- * bch2_bucket_alloc() allocates a single bucket from a specific device.
- *
- * bch2_bucket_alloc_set() allocates one or more buckets from different devices
- * in a given filesystem.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "btree_gc.h"
-#include "buckets.h"
-#include "buckets_waiting_for_journal.h"
-#include "clock.h"
-#include "debug.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "error.h"
-#include "io_write.h"
-#include "journal.h"
-#include "movinggc.h"
-#include "nocow_locking.h"
-#include "trace.h"
-
-#include <linux/math64.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-
-static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
- struct mutex *lock)
-{
- if (!mutex_trylock(lock)) {
- bch2_trans_unlock(trans);
- mutex_lock(lock);
- }
-}
-
-const char * const bch2_watermarks[] = {
-#define x(t) #t,
- BCH_WATERMARKS()
-#undef x
- NULL
-};
-
-/*
- * Open buckets represent a bucket that's currently being allocated from. They
- * serve two purposes:
- *
- * - They track buckets that have been partially allocated, allowing for
- * sub-bucket sized allocations - they're used by the sector allocator below
- *
- * - They provide a reference to the buckets they own that mark and sweep GC
- * can find, until the new allocation has a pointer to it inserted into the
- * btree
- *
- * When allocating some space with the sector allocator, the allocation comes
- * with a reference to an open bucket - the caller is required to put that
- * reference _after_ doing the index update that makes its allocation reachable.
- */
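That last point is the key invariant: an open_bucket is a pin that keeps an in-flight allocation visible to GC, and the caller drops its reference only after the index update. The pin discipline reduced to a runnable sketch (plain C, not the bcachefs API):

/* Pin discipline: the last reference is dropped only after the index update */
#include <stdio.h>

struct pin { int refs; };

static void pin_get(struct pin *p) { p->refs++; }

static void pin_put(struct pin *p)
{
	if (!--p->refs)
		printf("bucket returned to freelist\n");
}

int main(void)
{
	struct pin ob = { .refs = 1 };	/* ref taken at allocation */

	pin_get(&ob);			/* extra ref held across the write */
	printf("writing data...\n");
	pin_put(&ob);			/* write I/O complete */

	printf("index update makes the allocation reachable\n");
	pin_put(&ob);			/* only now is the bucket recyclable */
	return 0;
}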
-
-void bch2_reset_alloc_cursors(struct bch_fs *c)
-{
- rcu_read_lock();
- for_each_member_device_rcu(c, ca, NULL)
- memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor));
- rcu_read_unlock();
-}
-
-static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
-{
- open_bucket_idx_t idx = ob - c->open_buckets;
- open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
-
- ob->hash = *slot;
- *slot = idx;
-}
-
-static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
-{
- open_bucket_idx_t idx = ob - c->open_buckets;
- open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
-
- while (*slot != idx) {
- BUG_ON(!*slot);
- slot = &c->open_buckets[*slot].hash;
- }
-
- *slot = ob->hash;
- ob->hash = 0;
-}
-
-void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
-{
- struct bch_dev *ca = ob_dev(c, ob);
-
- if (ob->ec) {
- ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
- return;
- }
-
- spin_lock(&ob->lock);
- ob->valid = false;
- ob->data_type = 0;
- spin_unlock(&ob->lock);
-
- spin_lock(&c->freelist_lock);
- bch2_open_bucket_hash_remove(c, ob);
-
- ob->freelist = c->open_buckets_freelist;
- c->open_buckets_freelist = ob - c->open_buckets;
-
- c->open_buckets_nr_free++;
- ca->nr_open_buckets--;
- spin_unlock(&c->freelist_lock);
-
- closure_wake_up(&c->open_buckets_wait);
-}
-
-void bch2_open_bucket_write_error(struct bch_fs *c,
- struct open_buckets *obs,
- unsigned dev, int err)
-{
- struct open_bucket *ob;
- unsigned i;
-
- open_bucket_for_each(c, obs, ob, i)
- if (ob->dev == dev && ob->ec)
- bch2_ec_bucket_cancel(c, ob, err);
-}
-
-static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
-{
- struct open_bucket *ob;
-
- BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
-
- ob = c->open_buckets + c->open_buckets_freelist;
- c->open_buckets_freelist = ob->freelist;
- atomic_set(&ob->pin, 1);
- ob->data_type = 0;
-
- c->open_buckets_nr_free--;
- return ob;
-}
-
-static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
-{
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs)
- return false;
-
- return bch2_is_superblock_bucket(ca, b);
-}
-
-static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
-{
- BUG_ON(c->open_buckets_partial_nr >=
- ARRAY_SIZE(c->open_buckets_partial));
-
- spin_lock(&c->freelist_lock);
- rcu_read_lock();
- bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++;
- rcu_read_unlock();
-
- ob->on_partial_list = true;
- c->open_buckets_partial[c->open_buckets_partial_nr++] =
- ob - c->open_buckets;
- spin_unlock(&c->freelist_lock);
-
- closure_wake_up(&c->open_buckets_wait);
- closure_wake_up(&c->freelist_wait);
-}
-
-static inline bool may_alloc_bucket(struct bch_fs *c,
- struct bpos bucket,
- struct bucket_alloc_state *s)
-{
- if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) {
- s->skipped_open++;
- return false;
- }
-
- u64 journal_seq_ready =
- bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
- bucket.inode, bucket.offset);
- if (journal_seq_ready > c->journal.flushed_seq_ondisk) {
- if (journal_seq_ready > c->journal.flushing_seq)
- s->need_journal_commit++;
- s->skipped_need_journal_commit++;
- return false;
- }
-
- if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) {
- s->skipped_nocow++;
- return false;
- }
-
- return true;
-}
-
-static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
- u64 bucket, u8 gen,
- enum bch_watermark watermark,
- struct bucket_alloc_state *s,
- struct closure *cl)
-{
- if (unlikely(is_superblock_bucket(c, ca, bucket)))
- return NULL;
-
- if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
- s->skipped_nouse++;
- return NULL;
- }
-
- spin_lock(&c->freelist_lock);
-
- if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) {
- if (cl)
- closure_wait(&c->open_buckets_wait, cl);
-
- track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
- spin_unlock(&c->freelist_lock);
- return ERR_PTR(-BCH_ERR_open_buckets_empty);
- }
-
- /* Recheck under lock: */
- if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
- spin_unlock(&c->freelist_lock);
- s->skipped_open++;
- return NULL;
- }
-
- struct open_bucket *ob = bch2_open_bucket_alloc(c);
-
- spin_lock(&ob->lock);
- ob->valid = true;
- ob->sectors_free = ca->mi.bucket_size;
- ob->dev = ca->dev_idx;
- ob->gen = gen;
- ob->bucket = bucket;
- spin_unlock(&ob->lock);
-
- ca->nr_open_buckets++;
- bch2_open_bucket_hash_add(c, ob);
-
- track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
- track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
-
- spin_unlock(&c->freelist_lock);
- return ob;
-}
-
-static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
- enum bch_watermark watermark,
- struct bucket_alloc_state *s,
- struct btree_iter *freespace_iter,
- struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- u64 b = freespace_iter->pos.offset & ~(~0ULL << 56);
-
- if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s))
- return NULL;
-
- u8 gen;
- int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret)
- return NULL;
-
- return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl);
-}
-
-/*
- * This path is for before the freespace btree is initialized:
- */
-static noinline struct open_bucket *
-bch2_bucket_alloc_early(struct btree_trans *trans,
- struct bch_dev *ca,
- enum bch_watermark watermark,
- struct bucket_alloc_state *s,
- struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter, citer;
- struct bkey_s_c k, ck;
- struct open_bucket *ob = NULL;
- u64 first_bucket = ca->mi.first_bucket;
- u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
- u64 alloc_start = max(first_bucket, *dev_alloc_cursor);
- u64 alloc_cursor = alloc_start;
- int ret;
-
- /*
- * Scan with an uncached iterator to avoid polluting the key cache. An
- * uncached iter will return a cached key if one exists, but if not
- * there is no other underlying protection for the associated key cache
- * slot. To avoid racing bucket allocations, look up the cached key slot
- * of any likely allocation candidate before attempting to proceed with
- * the allocation. This provides proper exclusion on the associated
- * bucket.
- */
-again:
- for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
- BTREE_ITER_slots, k, ret) {
- u64 bucket = k.k->p.offset;
-
- if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
- break;
-
- if (s->btree_bitmap != BTREE_BITMAP_ANY &&
- s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
- bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
- if (s->btree_bitmap == BTREE_BITMAP_YES &&
- bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
- break;
-
- bucket = sector_to_bucket(ca,
- round_up(bucket_to_sector(ca, bucket) + 1,
- 1ULL << ca->mi.btree_bitmap_shift));
- bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket));
- s->buckets_seen++;
- s->skipped_mi_btree_bitmap++;
- continue;
- }
-
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
- if (a->data_type != BCH_DATA_free)
- continue;
-
- /* now check the cached key to serialize concurrent allocs of the bucket */
- ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached);
- ret = bkey_err(ck);
- if (ret)
- break;
-
- a = bch2_alloc_to_v4(ck, &a_convert);
- if (a->data_type != BCH_DATA_free)
- goto next;
-
- s->buckets_seen++;
-
- ob = may_alloc_bucket(c, k.k->p, s)
- ? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen,
- watermark, s, cl)
- : NULL;
-next:
- bch2_set_btree_iter_dontneed(&citer);
- bch2_trans_iter_exit(trans, &citer);
- if (ob)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- alloc_cursor = iter.pos.offset;
-
- if (!ob && ret)
- ob = ERR_PTR(ret);
-
- if (!ob && alloc_start > first_bucket) {
- alloc_cursor = alloc_start = first_bucket;
- goto again;
- }
-
- *dev_alloc_cursor = alloc_cursor;
-
- return ob;
-}
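Both allocation paths remember a per-device cursor and, on failure, retry once from the first bucket, so successive allocations don't rescan a region that was just found full. The wrap-around scan reduced to its skeleton (standalone sketch with a toy predicate):

/* Cursor scan with a single wrap-around retry */
#include <stdio.h>

static int usable(int b) { return b == 7; }	/* pretend bucket 7 is free */

static int scan(int first, int nbuckets, int *cursor)
{
	int start = *cursor > first ? *cursor : first;
again:
	for (int b = start; b < nbuckets; b++)
		if (usable(b)) {
			*cursor = b + 1;
			return b;
		}
	if (start > first) {		/* wrap once to the beginning */
		start = first;
		goto again;
	}
	return -1;
}

int main(void)
{
	int cursor = 12;

	printf("allocated bucket %d\n", scan(0, 16, &cursor));
	return 0;
}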
-
-static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
- struct bch_dev *ca,
- enum bch_watermark watermark,
- struct bucket_alloc_state *s,
- struct closure *cl)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct open_bucket *ob = NULL;
- u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
- u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor));
- u64 alloc_cursor = alloc_start;
- int ret;
-again:
- for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace,
- POS(ca->dev_idx, alloc_cursor),
- POS(ca->dev_idx, U64_MAX),
- 0, k, ret) {
- /*
- * peek normally doesn't trim extents - they can span iter.pos,
- * which is not what we want here:
- */
- iter.k.size = iter.k.p.offset - iter.pos.offset;
-
- while (iter.k.size) {
- s->buckets_seen++;
-
- u64 bucket = iter.pos.offset & ~(~0ULL << 56);
- if (s->btree_bitmap != BTREE_BITMAP_ANY &&
- s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
- bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
- if (s->btree_bitmap == BTREE_BITMAP_YES &&
- bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
- goto fail;
-
- bucket = sector_to_bucket(ca,
- round_up(bucket_to_sector(ca, bucket + 1),
- 1ULL << ca->mi.btree_bitmap_shift));
- alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56));
-
- bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor));
- s->skipped_mi_btree_bitmap++;
- goto next;
- }
-
- ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl);
- if (ob) {
- if (!IS_ERR(ob))
- *dev_alloc_cursor = iter.pos.offset;
- bch2_set_btree_iter_dontneed(&iter);
- break;
- }
-
- iter.k.size--;
- iter.pos.offset++;
- }
-next:
- if (ob || ret)
- break;
- }
-fail:
- bch2_trans_iter_exit(trans, &iter);
-
- BUG_ON(ob && ret);
-
- if (ret)
- ob = ERR_PTR(ret);
-
- if (!ob && alloc_start > ca->mi.first_bucket) {
- alloc_cursor = alloc_start = ca->mi.first_bucket;
- goto again;
- }
-
- return ob;
-}
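The `& ~(~0ULL << 56)` above extracts the bucket number from a freespace key: the bucket lives in the low 56 bits of the key's offset, and the allocator only needs to mask off whatever state the top bits carry. The mask in isolation:

/* Extracting a 56-bit bucket number from a packed 64-bit key offset */
#include <stdint.h>
#include <stdio.h>

#define LO56_MASK (~(~0ULL << 56))

int main(void)
{
	uint64_t packed = (3ULL << 56) | 12345;	/* high bits | bucket */

	printf("bucket = %llu, high bits = %llu\n",
	       (unsigned long long) (packed & LO56_MASK),
	       (unsigned long long) (packed >> 56));
	return 0;
}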
-
-static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca,
- enum bch_watermark watermark,
- enum bch_data_type data_type,
- struct closure *cl,
- struct bch_dev_usage *usage,
- struct bucket_alloc_state *s,
- struct open_bucket *ob)
-{
- struct printbuf buf = PRINTBUF;
-
- printbuf_tabstop_push(&buf, 24);
-
- prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx);
- prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]);
- prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]);
- prt_printf(&buf, "blocking\t%u\n", cl != NULL);
- prt_printf(&buf, "free\t%llu\n", usage->d[BCH_DATA_free].buckets);
- prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark));
- prt_printf(&buf, "copygc_wait\t%lu/%lli\n",
- bch2_copygc_wait_amount(c),
- c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now));
- prt_printf(&buf, "seen\t%llu\n", s->buckets_seen);
- prt_printf(&buf, "open\t%llu\n", s->skipped_open);
- prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit);
- prt_printf(&buf, "nocow\t%llu\n", s->skipped_nocow);
- prt_printf(&buf, "nouse\t%llu\n", s->skipped_nouse);
- prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap);
-
- if (!IS_ERR(ob)) {
- prt_printf(&buf, "allocated\t%llu\n", ob->bucket);
- trace_bucket_alloc(c, buf.buf);
- } else {
- prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob)));
- trace_bucket_alloc_fail(c, buf.buf);
- }
-
- printbuf_exit(&buf);
-}
-
-/**
- * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
- * @trans: transaction object
- * @ca: device to allocate from
- * @watermark: how important is this allocation?
- * @data_type: BCH_DATA_journal, btree, user...
- * @cl: if not NULL, closure to be used to wait if buckets not available
- * @nowait: if true, do not wait for buckets to become available
- * @usage: also filled in with the current device usage
- *
- * Returns: an open_bucket on success, or an ERR_PTR() on failure.
- */
-static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
- struct bch_dev *ca,
- enum bch_watermark watermark,
- enum bch_data_type data_type,
- struct closure *cl,
- bool nowait,
- struct bch_dev_usage *usage)
-{
- struct bch_fs *c = trans->c;
- struct open_bucket *ob = NULL;
- bool freespace = READ_ONCE(ca->mi.freespace_initialized);
- u64 avail;
- struct bucket_alloc_state s = {
- .btree_bitmap = data_type == BCH_DATA_btree,
- };
- bool waiting = nowait;
-again:
- bch2_dev_usage_read_fast(ca, usage);
- avail = dev_buckets_free(ca, *usage, watermark);
-
- if (usage->d[BCH_DATA_need_discard].buckets > avail)
- bch2_dev_do_discards(ca);
-
- if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
- bch2_gc_gens_async(c);
-
- if (should_invalidate_buckets(ca, *usage))
- bch2_dev_do_invalidates(ca);
-
- if (!avail) {
- if (watermark > BCH_WATERMARK_normal &&
- c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
- goto alloc;
-
- if (cl && !waiting) {
- closure_wait(&c->freelist_wait, cl);
- waiting = true;
- goto again;
- }
-
- track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
-
- ob = ERR_PTR(-BCH_ERR_freelist_empty);
- goto err;
- }
-
- if (waiting)
- closure_wake_up(&c->freelist_wait);
-alloc:
- ob = likely(freespace)
- ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
- : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);
-
- if (s.need_journal_commit * 2 > avail)
- bch2_journal_flush_async(&c->journal, NULL);
-
- if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) {
- s.btree_bitmap = BTREE_BITMAP_ANY;
- goto alloc;
- }
-
- if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
- freespace = false;
- goto alloc;
- }
-err:
- if (!ob)
- ob = ERR_PTR(-BCH_ERR_no_buckets_found);
-
- if (!IS_ERR(ob))
- ob->data_type = data_type;
-
- if (!IS_ERR(ob))
- count_event(c, bucket_alloc);
- else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
- count_event(c, bucket_alloc_fail);
-
- if (!IS_ERR(ob)
- ? trace_bucket_alloc_enabled()
- : trace_bucket_alloc_fail_enabled())
- trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob);
-
- return ob;
-}
-
-struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
- enum bch_watermark watermark,
- enum bch_data_type data_type,
- struct closure *cl)
-{
- struct bch_dev_usage usage;
- struct open_bucket *ob;
-
- bch2_trans_do(c,
- PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
- data_type, cl, false, &usage)));
- return ob;
-}
-
-static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
- unsigned l, unsigned r)
-{
- return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
- (stripe->next_alloc[l] < stripe->next_alloc[r]));
-}
-
-#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
-
-struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
- struct dev_stripe_state *stripe,
- struct bch_devs_mask *devs)
-{
- struct dev_alloc_list ret = { .nr = 0 };
- unsigned i;
-
- for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
- ret.data[ret.nr++] = i;
-
- bubble_sort(ret.data, ret.nr, dev_stripe_cmp);
- return ret;
-}
-
-static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
- struct dev_stripe_state *stripe,
- struct bch_dev_usage *usage)
-{
- u64 *v = stripe->next_alloc + ca->dev_idx;
- u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal);
- u64 free_space_inv = free_space
- ? div64_u64(1ULL << 48, free_space)
- : 1ULL << 48;
- u64 scale = *v / 4;
-
- if (*v + free_space_inv >= *v)
- *v += free_space_inv;
- else
- *v = U64_MAX;
-
- for (v = stripe->next_alloc;
- v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
- *v = *v < scale ? 0 : *v - scale;
-}
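The striping state gives each device a counter that grows by roughly 2^48 divided by its free space whenever it is allocated from (with periodic rescaling via `scale`), and bch2_dev_alloc_list() tries devices in increasing counter order, so emptier devices win proportionally more allocations. The increment sizes with toy numbers:

/* Increment sizes for the striping counters, with toy free-space numbers */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t inc_empty = (1ULL << 48) / 1000; /* device with 1000 free buckets */
	uint64_t inc_full  = (1ULL << 48) / 100;  /* device with only 100 free */

	printf("emptier device: +%llu per alloc\n",
	       (unsigned long long) inc_empty);
	printf("fuller device:  +%llu per alloc\n",
	       (unsigned long long) inc_full);
	printf("-> emptier device wins ~%llu allocations per fuller-device pick\n",
	       (unsigned long long) (inc_full / inc_empty));
	return 0;
}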
-
-void bch2_dev_stripe_increment(struct bch_dev *ca,
- struct dev_stripe_state *stripe)
-{
- struct bch_dev_usage usage;
-
- bch2_dev_usage_read_fast(ca, &usage);
- bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
-}
-
-static int add_new_bucket(struct bch_fs *c,
- struct open_buckets *ptrs,
- struct bch_devs_mask *devs_may_alloc,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- struct open_bucket *ob)
-{
- unsigned durability = ob_dev(c, ob)->mi.durability;
-
- BUG_ON(*nr_effective >= nr_replicas);
-
- __clear_bit(ob->dev, devs_may_alloc->d);
- *nr_effective += durability;
- *have_cache |= !durability;
-
- ob_push(c, ptrs, ob);
-
- if (*nr_effective >= nr_replicas)
- return 1;
- if (ob->ec)
- return 1;
- return 0;
-}
-
-int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
- struct open_buckets *ptrs,
- struct dev_stripe_state *stripe,
- struct bch_devs_mask *devs_may_alloc,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- enum bch_write_flags flags,
- enum bch_data_type data_type,
- enum bch_watermark watermark,
- struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- int ret = -BCH_ERR_insufficient_devices;
-
- BUG_ON(*nr_effective >= nr_replicas);
-
- struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc);
- darray_for_each(devs_sorted, i) {
- struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i);
- if (!ca)
- continue;
-
- if (!ca->mi.durability && *have_cache) {
- bch2_dev_put(ca);
- continue;
- }
-
- struct bch_dev_usage usage;
- struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type,
- cl, flags & BCH_WRITE_alloc_nowait, &usage);
- if (!IS_ERR(ob))
- bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
- bch2_dev_put(ca);
-
- if (IS_ERR(ob)) {
- ret = PTR_ERR(ob);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
- break;
- continue;
- }
-
- if (add_new_bucket(c, ptrs, devs_may_alloc,
- nr_replicas, nr_effective,
- have_cache, ob)) {
- ret = 0;
- break;
- }
- }
-
- return ret;
-}
-
-/* Allocate from stripes: */
-
-/*
- * if we can't allocate a new stripe because there are already too many
- * partially filled stripes, force allocating from an existing stripe even when
- * it's to a device we don't want:
- */
-
-static int bucket_alloc_from_stripe(struct btree_trans *trans,
- struct open_buckets *ptrs,
- struct write_point *wp,
- struct bch_devs_mask *devs_may_alloc,
- u16 target,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- enum bch_watermark watermark,
- enum bch_write_flags flags,
- struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- if (nr_replicas < 2)
- return 0;
-
- if (ec_open_bucket(c, ptrs))
- return 0;
-
- struct ec_stripe_head *h =
- bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
- if (IS_ERR(h))
- return PTR_ERR(h);
- if (!h)
- return 0;
-
- struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
- darray_for_each(devs_sorted, i)
- for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
- if (!h->s->blocks[ec_idx])
- continue;
-
- struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx];
- if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) {
- ob->ec_idx = ec_idx;
- ob->ec = h->s;
- ec_stripe_new_get(h->s, STRIPE_REF_io);
-
- ret = add_new_bucket(c, ptrs, devs_may_alloc,
- nr_replicas, nr_effective,
- have_cache, ob);
- goto out;
- }
- }
-out:
- bch2_ec_stripe_head_put(c, h);
- return ret;
-}
-
-/* Sector allocator */
-
-static bool want_bucket(struct bch_fs *c,
- struct write_point *wp,
- struct bch_devs_mask *devs_may_alloc,
- bool *have_cache, bool ec,
- struct open_bucket *ob)
-{
- struct bch_dev *ca = ob_dev(c, ob);
-
- if (!test_bit(ob->dev, devs_may_alloc->d))
- return false;
-
- if (ob->data_type != wp->data_type)
- return false;
-
- if (!ca->mi.durability &&
- (wp->data_type == BCH_DATA_btree || ec || *have_cache))
- return false;
-
- if (ec != (ob->ec != NULL))
- return false;
-
- return true;
-}
-
-static int bucket_alloc_set_writepoint(struct bch_fs *c,
- struct open_buckets *ptrs,
- struct write_point *wp,
- struct bch_devs_mask *devs_may_alloc,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- bool ec)
-{
- struct open_buckets ptrs_skip = { .nr = 0 };
- struct open_bucket *ob;
- unsigned i;
- int ret = 0;
-
- open_bucket_for_each(c, &wp->ptrs, ob, i) {
- if (!ret && want_bucket(c, wp, devs_may_alloc,
- have_cache, ec, ob))
- ret = add_new_bucket(c, ptrs, devs_may_alloc,
- nr_replicas, nr_effective,
- have_cache, ob);
- else
- ob_push(c, &ptrs_skip, ob);
- }
- wp->ptrs = ptrs_skip;
-
- return ret;
-}
-
-static int bucket_alloc_set_partial(struct bch_fs *c,
- struct open_buckets *ptrs,
- struct write_point *wp,
- struct bch_devs_mask *devs_may_alloc,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache, bool ec,
- enum bch_watermark watermark)
-{
- int i, ret = 0;
-
- if (!c->open_buckets_partial_nr)
- return 0;
-
- spin_lock(&c->freelist_lock);
-
- if (!c->open_buckets_partial_nr)
- goto unlock;
-
- for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
- struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
-
- if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
- struct bch_dev *ca = ob_dev(c, ob);
- struct bch_dev_usage usage;
- u64 avail;
-
- bch2_dev_usage_read_fast(ca, &usage);
- avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets;
- if (!avail)
- continue;
-
- array_remove_item(c->open_buckets_partial,
- c->open_buckets_partial_nr,
- i);
- ob->on_partial_list = false;
-
- rcu_read_lock();
- bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
- rcu_read_unlock();
-
- ret = add_new_bucket(c, ptrs, devs_may_alloc,
- nr_replicas, nr_effective,
- have_cache, ob);
- if (ret)
- break;
- }
- }
-unlock:
- spin_unlock(&c->freelist_lock);
- return ret;
-}
-
-static int __open_bucket_add_buckets(struct btree_trans *trans,
- struct open_buckets *ptrs,
- struct write_point *wp,
- struct bch_devs_list *devs_have,
- u16 target,
- bool erasure_code,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- enum bch_watermark watermark,
- enum bch_write_flags flags,
- struct closure *_cl)
-{
- struct bch_fs *c = trans->c;
- struct bch_devs_mask devs;
- struct open_bucket *ob;
- struct closure *cl = NULL;
- unsigned i;
- int ret;
-
- devs = target_rw_devs(c, wp->data_type, target);
-
- /* Don't allocate from devices we already have pointers to: */
- darray_for_each(*devs_have, i)
- __clear_bit(*i, devs.d);
-
- open_bucket_for_each(c, ptrs, ob, i)
- __clear_bit(ob->dev, devs.d);
-
- ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
- nr_replicas, nr_effective,
- have_cache, erasure_code);
- if (ret)
- return ret;
-
- ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
- nr_replicas, nr_effective,
- have_cache, erasure_code, watermark);
- if (ret)
- return ret;
-
- if (erasure_code) {
- ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
- target,
- nr_replicas, nr_effective,
- have_cache,
- watermark, flags, _cl);
- } else {
-retry_blocking:
- /*
- * Try nonblocking first, so that if one device is full we'll try from
- * other devices:
- */
- ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
- nr_replicas, nr_effective, have_cache,
- flags, wp->data_type, watermark, cl);
- if (ret &&
- !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
- !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
- !cl && _cl) {
- cl = _cl;
- goto retry_blocking;
- }
- }
-
- return ret;
-}
-
-static int open_bucket_add_buckets(struct btree_trans *trans,
- struct open_buckets *ptrs,
- struct write_point *wp,
- struct bch_devs_list *devs_have,
- u16 target,
- unsigned erasure_code,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- enum bch_watermark watermark,
- enum bch_write_flags flags,
- struct closure *cl)
-{
- int ret;
-
- if (erasure_code && !ec_open_bucket(trans->c, ptrs)) {
- ret = __open_bucket_add_buckets(trans, ptrs, wp,
- devs_have, target, erasure_code,
- nr_replicas, nr_effective, have_cache,
- watermark, flags, cl);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
- bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
- bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
- bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
- return ret;
- if (*nr_effective >= nr_replicas)
- return 0;
- }
-
- ret = __open_bucket_add_buckets(trans, ptrs, wp,
- devs_have, target, false,
- nr_replicas, nr_effective, have_cache,
- watermark, flags, cl);
- return ret < 0 ? ret : 0;
-}
-
-/**
- * should_drop_bucket - check if this open_bucket should go away
- * @ob: open_bucket to predicate on
- * @c: filesystem handle
- * @ca: if set, we're killing buckets for a particular device
- * @ec: if true, we're shutting down erasure coding and killing all ec
- * open_buckets; if neither @ca nor @ec is set, every bucket is dropped
- * Returns: true if we should kill this open_bucket
- *
- * We're killing open_buckets because we're shutting down a device, erasure
- * coding, or the entire filesystem - check if this open_bucket matches:
- */
-static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
- struct bch_dev *ca, bool ec)
-{
- if (ec) {
- return ob->ec != NULL;
- } else if (ca) {
- bool drop = ob->dev == ca->dev_idx;
- struct open_bucket *ob2;
- unsigned i;
-
- if (!drop && ob->ec) {
- unsigned nr_blocks;
-
- mutex_lock(&ob->ec->lock);
- nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;
-
- for (i = 0; i < nr_blocks; i++) {
- if (!ob->ec->blocks[i])
- continue;
-
- ob2 = c->open_buckets + ob->ec->blocks[i];
- drop |= ob2->dev == ca->dev_idx;
- }
- mutex_unlock(&ob->ec->lock);
- }
-
- return drop;
- } else {
- return true;
- }
-}
-
-static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
- bool ec, struct write_point *wp)
-{
- struct open_buckets ptrs = { .nr = 0 };
- struct open_bucket *ob;
- unsigned i;
-
- mutex_lock(&wp->lock);
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- if (should_drop_bucket(ob, c, ca, ec))
- bch2_open_bucket_put(c, ob);
- else
- ob_push(c, &ptrs, ob);
- wp->ptrs = ptrs;
- mutex_unlock(&wp->lock);
-}
-
-void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
- bool ec)
-{
- unsigned i;
-
- /* Next, close write points that point to this device... */
- for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
- bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
-
- bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
- bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
- bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
-
- mutex_lock(&c->btree_reserve_cache_lock);
- while (c->btree_reserve_cache_nr) {
- struct btree_alloc *a =
- &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
-
- bch2_open_buckets_put(c, &a->ob);
- }
- mutex_unlock(&c->btree_reserve_cache_lock);
-
- spin_lock(&c->freelist_lock);
- i = 0;
- while (i < c->open_buckets_partial_nr) {
- struct open_bucket *ob =
- c->open_buckets + c->open_buckets_partial[i];
-
- if (should_drop_bucket(ob, c, ca, ec)) {
- --c->open_buckets_partial_nr;
- swap(c->open_buckets_partial[i],
- c->open_buckets_partial[c->open_buckets_partial_nr]);
-
- ob->on_partial_list = false;
-
- rcu_read_lock();
- bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
- rcu_read_unlock();
-
- spin_unlock(&c->freelist_lock);
- bch2_open_bucket_put(c, ob);
- spin_lock(&c->freelist_lock);
- } else {
- i++;
- }
- }
- spin_unlock(&c->freelist_lock);
-
- bch2_ec_stop_dev(c, ca);
-}
-
-static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
- unsigned long write_point)
-{
- unsigned hash =
- hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
-
- return &c->write_points_hash[hash];
-}
-
-static struct write_point *__writepoint_find(struct hlist_head *head,
- unsigned long write_point)
-{
- struct write_point *wp;
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(wp, head, node)
- if (wp->write_point == write_point)
- goto out;
- wp = NULL;
-out:
- rcu_read_unlock();
- return wp;
-}
-
-static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
-{
- u64 stranded = c->write_points_nr * c->bucket_size_max;
- u64 free = bch2_fs_usage_read_short(c).free;
-
- return stranded * factor > free;
-}
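Each write point can strand up to bucket_size_max sectors in partially filled buckets, so the write point count is only grown while stranded space stays under 1/32 of free space (factor 32 in try_increase_writepoints()) and only shrunk once it exceeds 1/8 (factor 8 in try_decrease_writepoints()), giving the count some hysteresis. Checking both thresholds with hypothetical numbers:

/* Hysteresis thresholds for the write point count, with made-up numbers */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t bucket_size_max = 1024;	/* sectors, hypothetical */
	uint64_t free = 4 << 20;		/* free sectors */
	uint64_t nr = 64;			/* current write points */
	uint64_t stranded = nr * bucket_size_max;

	printf("may grow:   %s (stranded*32 %s free)\n",
	       stranded * 32 <= free ? "yes" : "no",
	       stranded * 32 <= free ? "<=" : ">");
	printf("may shrink: %s (stranded*8 %s free)\n",
	       stranded * 8 > free ? "yes" : "no",
	       stranded * 8 > free ? ">" : "<=");
	return 0;
}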
-
-static bool try_increase_writepoints(struct bch_fs *c)
-{
- struct write_point *wp;
-
- if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
- too_many_writepoints(c, 32))
- return false;
-
- wp = c->write_points + c->write_points_nr++;
- hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
- return true;
-}
-
-static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
-{
- struct bch_fs *c = trans->c;
- struct write_point *wp;
- struct open_bucket *ob;
- unsigned i;
-
- mutex_lock(&c->write_points_hash_lock);
- if (c->write_points_nr < old_nr) {
- mutex_unlock(&c->write_points_hash_lock);
- return true;
- }
-
- if (c->write_points_nr == 1 ||
- !too_many_writepoints(c, 8)) {
- mutex_unlock(&c->write_points_hash_lock);
- return false;
- }
-
- wp = c->write_points + --c->write_points_nr;
-
- hlist_del_rcu(&wp->node);
- mutex_unlock(&c->write_points_hash_lock);
-
- bch2_trans_mutex_lock_norelock(trans, &wp->lock);
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- open_bucket_free_unused(c, ob);
- wp->ptrs.nr = 0;
- mutex_unlock(&wp->lock);
- return true;
-}
-
-static struct write_point *writepoint_find(struct btree_trans *trans,
- unsigned long write_point)
-{
- struct bch_fs *c = trans->c;
- struct write_point *wp, *oldest;
- struct hlist_head *head;
-
- if (!(write_point & 1UL)) {
- wp = (struct write_point *) write_point;
- bch2_trans_mutex_lock_norelock(trans, &wp->lock);
- return wp;
- }
-
- head = writepoint_hash(c, write_point);
-restart_find:
- wp = __writepoint_find(head, write_point);
- if (wp) {
-lock_wp:
- bch2_trans_mutex_lock_norelock(trans, &wp->lock);
- if (wp->write_point == write_point)
- goto out;
- mutex_unlock(&wp->lock);
- goto restart_find;
- }
-restart_find_oldest:
- oldest = NULL;
- for (wp = c->write_points;
- wp < c->write_points + c->write_points_nr; wp++)
- if (!oldest || time_before64(wp->last_used, oldest->last_used))
- oldest = wp;
-
- bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
- bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
- if (oldest >= c->write_points + c->write_points_nr ||
- try_increase_writepoints(c)) {
- mutex_unlock(&c->write_points_hash_lock);
- mutex_unlock(&oldest->lock);
- goto restart_find_oldest;
- }
-
- wp = __writepoint_find(head, write_point);
- if (wp && wp != oldest) {
- mutex_unlock(&c->write_points_hash_lock);
- mutex_unlock(&oldest->lock);
- goto lock_wp;
- }
-
- wp = oldest;
- hlist_del_rcu(&wp->node);
- wp->write_point = write_point;
- hlist_add_head_rcu(&wp->node, head);
- mutex_unlock(&c->write_points_hash_lock);
-out:
- wp->last_used = local_clock();
- return wp;
-}
-
-static noinline void
-deallocate_extra_replicas(struct bch_fs *c,
- struct open_buckets *ptrs,
- struct open_buckets *ptrs_no_use,
- unsigned extra_replicas)
-{
- struct open_buckets ptrs2 = { 0 };
- struct open_bucket *ob;
- unsigned i;
-
- open_bucket_for_each(c, ptrs, ob, i) {
- unsigned d = ob_dev(c, ob)->mi.durability;
-
- if (d && d <= extra_replicas) {
- extra_replicas -= d;
- ob_push(c, ptrs_no_use, ob);
- } else {
- ob_push(c, &ptrs2, ob);
- }
- }
-
- *ptrs = ptrs2;
-}
-
-/*
- * Get us an open_bucket we can allocate from, return with it locked:
- */
-int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
- unsigned target,
- unsigned erasure_code,
- struct write_point_specifier write_point,
- struct bch_devs_list *devs_have,
- unsigned nr_replicas,
- unsigned nr_replicas_required,
- enum bch_watermark watermark,
- enum bch_write_flags flags,
- struct closure *cl,
- struct write_point **wp_ret)
-{
- struct bch_fs *c = trans->c;
- struct write_point *wp;
- struct open_bucket *ob;
- struct open_buckets ptrs;
- unsigned nr_effective, write_points_nr;
- bool have_cache;
- int ret;
- int i;
-
- if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
- erasure_code = false;
-
- BUG_ON(!nr_replicas || !nr_replicas_required);
-retry:
- ptrs.nr = 0;
- nr_effective = 0;
- write_points_nr = c->write_points_nr;
- have_cache = false;
-
- *wp_ret = wp = writepoint_find(trans, write_point.v);
-
- ret = bch2_trans_relock(trans);
- if (ret)
- goto err;
-
- /* metadata may not allocate on cache devices: */
- if (wp->data_type != BCH_DATA_user)
- have_cache = true;
-
- if (target && !(flags & BCH_WRITE_only_specified_devs)) {
- ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
- target, erasure_code,
- nr_replicas, &nr_effective,
- &have_cache, watermark,
- flags, NULL);
- if (!ret ||
- bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto alloc_done;
-
- /* Don't retry from all devices if we're out of open buckets: */
- if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
- int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
- target, erasure_code,
- nr_replicas, &nr_effective,
- &have_cache, watermark,
- flags, cl);
- if (!ret2 ||
- bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
- bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
- ret = ret2;
- goto alloc_done;
- }
- }
-
- /*
- * Only try to allocate cache (durability = 0 devices) from the
- * specified target:
- */
- have_cache = true;
-
- ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
- 0, erasure_code,
- nr_replicas, &nr_effective,
- &have_cache, watermark,
- flags, cl);
- } else {
- ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
- target, erasure_code,
- nr_replicas, &nr_effective,
- &have_cache, watermark,
- flags, cl);
- }
-alloc_done:
- BUG_ON(!ret && nr_effective < nr_replicas);
-
- if (erasure_code && !ec_open_bucket(c, &ptrs))
- pr_debug("failed to get ec bucket: ret %i", ret);
-
- if (ret == -BCH_ERR_insufficient_devices &&
- nr_effective >= nr_replicas_required)
- ret = 0;
-
- if (ret)
- goto err;
-
- if (nr_effective > nr_replicas)
- deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
-
- /* Free buckets we didn't use: */
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- open_bucket_free_unused(c, ob);
-
- wp->ptrs = ptrs;
-
- wp->sectors_free = UINT_MAX;
-
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
-
- BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
-
- return 0;
-err:
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- if (ptrs.nr < ARRAY_SIZE(ptrs.v))
- ob_push(c, &ptrs, ob);
- else
- open_bucket_free_unused(c, ob);
- wp->ptrs = ptrs;
-
- mutex_unlock(&wp->lock);
-
- if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
- try_decrease_writepoints(trans, write_points_nr))
- goto retry;
-
- if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
- ret = -BCH_ERR_bucket_alloc_blocked;
-
- if (cl && !(flags & BCH_WRITE_alloc_nowait) &&
- bch2_err_matches(ret, BCH_ERR_freelist_empty))
- ret = -BCH_ERR_bucket_alloc_blocked;
-
- return ret;
-}
-
-struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
-{
- struct bch_dev *ca = ob_dev(c, ob);
-
- return (struct bch_extent_ptr) {
- .type = 1 << BCH_EXTENT_ENTRY_ptr,
- .gen = ob->gen,
- .dev = ob->dev,
- .offset = bucket_to_sector(ca, ob->bucket) +
- ca->mi.bucket_size -
- ob->sectors_free,
- };
-}
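bch2_ob_ptr() places the new pointer at the bucket's first sector plus however much of the bucket has already been consumed, i.e. bucket_size - sectors_free. The same arithmetic with toy values:

/* Offset-into-bucket arithmetic from bch2_ob_ptr(), with toy values */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t bucket = 42, bucket_size = 1024;	/* sectors */
	uint64_t sectors_free = 256;			/* 768 already used */

	uint64_t offset = bucket * bucket_size		/* bucket_to_sector() */
			+ bucket_size - sectors_free;

	printf("next write lands at device sector %llu\n",
	       (unsigned long long) offset);
	return 0;
}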
-
-void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
- struct bkey_i *k, unsigned sectors,
- bool cached)
-{
- bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
-}
-
-/*
- * We're done writing to this write point: unlock it and release any open
- * buckets that are now full
- */
-void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
-{
- bch2_alloc_sectors_done_inlined(c, wp);
-}
-
-static inline void writepoint_init(struct write_point *wp,
- enum bch_data_type type)
-{
- mutex_init(&wp->lock);
- wp->data_type = type;
-
- INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
- INIT_LIST_HEAD(&wp->writes);
- spin_lock_init(&wp->writes_lock);
-}
-
-void bch2_fs_allocator_foreground_init(struct bch_fs *c)
-{
- struct open_bucket *ob;
- struct write_point *wp;
-
- mutex_init(&c->write_points_hash_lock);
- c->write_points_nr = ARRAY_SIZE(c->write_points);
-
- /* open bucket 0 is a sentinel NULL: */
- spin_lock_init(&c->open_buckets[0].lock);
-
- for (ob = c->open_buckets + 1;
- ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
- spin_lock_init(&ob->lock);
- c->open_buckets_nr_free++;
-
- ob->freelist = c->open_buckets_freelist;
- c->open_buckets_freelist = ob - c->open_buckets;
- }
-
- writepoint_init(&c->btree_write_point, BCH_DATA_btree);
- writepoint_init(&c->rebalance_write_point, BCH_DATA_user);
- writepoint_init(&c->copygc_write_point, BCH_DATA_user);
-
- for (wp = c->write_points;
- wp < c->write_points + c->write_points_nr; wp++) {
- writepoint_init(wp, BCH_DATA_user);
-
- wp->last_used = local_clock();
- wp->write_point = (unsigned long) wp;
- hlist_add_head_rcu(&wp->node,
- writepoint_hash(c, wp->write_point));
- }
-}
-
-void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
-{
- struct bch_dev *ca = ob_dev(c, ob);
- unsigned data_type = ob->data_type;
- barrier(); /* READ_ONCE() doesn't work on bitfields */
-
- prt_printf(out, "%zu ref %u ",
- ob - c->open_buckets,
- atomic_read(&ob->pin));
- bch2_prt_data_type(out, data_type);
- prt_printf(out, " %u:%llu gen %u allocated %u/%u",
- ob->dev, ob->bucket, ob->gen,
- ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
- if (ob->ec)
- prt_printf(out, " ec idx %llu", ob->ec->idx);
- if (ob->on_partial_list)
- prt_str(out, " partial");
- prt_newline(out);
-}
-
-void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c,
- struct bch_dev *ca)
-{
- struct open_bucket *ob;
-
- out->atomic++;
-
- for (ob = c->open_buckets;
- ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
- ob++) {
- spin_lock(&ob->lock);
- if (ob->valid && (!ca || ob->dev == ca->dev_idx))
- bch2_open_bucket_to_text(out, c, ob);
- spin_unlock(&ob->lock);
- }
-
- --out->atomic;
-}
-
-void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
-{
- unsigned i;
-
- out->atomic++;
- spin_lock(&c->freelist_lock);
-
- for (i = 0; i < c->open_buckets_partial_nr; i++)
- bch2_open_bucket_to_text(out, c,
- c->open_buckets + c->open_buckets_partial[i]);
-
- spin_unlock(&c->freelist_lock);
- --out->atomic;
-}
-
-static const char * const bch2_write_point_states[] = {
-#define x(n) #n,
- WRITE_POINT_STATES()
-#undef x
- NULL
-};
-
-static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
- struct write_point *wp)
-{
- struct open_bucket *ob;
- unsigned i;
-
- prt_printf(out, "%lu: ", wp->write_point);
- prt_human_readable_u64(out, wp->sectors_allocated);
-
- prt_printf(out, " last wrote: ");
- bch2_pr_time_units(out, sched_clock() - wp->last_used);
-
- for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
- prt_printf(out, " %s: ", bch2_write_point_states[i]);
- bch2_pr_time_units(out, wp->time[i]);
- }
-
- prt_newline(out);
-
- printbuf_indent_add(out, 2);
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- bch2_open_bucket_to_text(out, c, ob);
- printbuf_indent_sub(out, 2);
-}
-
-void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct write_point *wp;
-
- prt_str(out, "Foreground write points\n");
- for (wp = c->write_points;
- wp < c->write_points + ARRAY_SIZE(c->write_points);
- wp++)
- bch2_write_point_to_text(out, c, wp);
-
- prt_str(out, "Copygc write point\n");
- bch2_write_point_to_text(out, c, &c->copygc_write_point);
-
- prt_str(out, "Rebalance write point\n");
- bch2_write_point_to_text(out, c, &c->rebalance_write_point);
-
- prt_str(out, "Btree write point\n");
- bch2_write_point_to_text(out, c, &c->btree_write_point);
-}
-
-void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
-{
- unsigned nr[BCH_DATA_NR];
-
- memset(nr, 0, sizeof(nr));
-
- for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
- nr[c->open_buckets[i].data_type]++;
-
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 24);
-
- prt_printf(out, "capacity\t%llu\n", c->capacity);
- prt_printf(out, "reserved\t%llu\n", c->reserved);
- prt_printf(out, "hidden\t%llu\n", percpu_u64_get(&c->usage->hidden));
- prt_printf(out, "btree\t%llu\n", percpu_u64_get(&c->usage->btree));
- prt_printf(out, "data\t%llu\n", percpu_u64_get(&c->usage->data));
- prt_printf(out, "cached\t%llu\n", percpu_u64_get(&c->usage->cached));
- prt_printf(out, "reserved\t%llu\n", percpu_u64_get(&c->usage->reserved));
- prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved));
- prt_printf(out, "nr_inodes\t%llu\n", percpu_u64_get(&c->usage->nr_inodes));
-
- prt_newline(out);
- prt_printf(out, "freelist_wait\t%s\n", c->freelist_wait.list.first ? "waiting" : "empty");
- prt_printf(out, "open buckets allocated\t%i\n", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
- prt_printf(out, "open buckets total\t%u\n", OPEN_BUCKETS_COUNT);
- prt_printf(out, "open_buckets_wait\t%s\n", c->open_buckets_wait.list.first ? "waiting" : "empty");
- prt_printf(out, "open_buckets_btree\t%u\n", nr[BCH_DATA_btree]);
- prt_printf(out, "open_buckets_user\t%u\n", nr[BCH_DATA_user]);
- prt_printf(out, "btree reserve cache\t%u\n", c->btree_reserve_cache_nr);
-}
-
-void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- struct bch_dev_usage stats = bch2_dev_usage_read(ca);
- unsigned nr[BCH_DATA_NR];
-
- memset(nr, 0, sizeof(nr));
-
- for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
- nr[c->open_buckets[i].data_type]++;
-
- bch2_dev_usage_to_text(out, ca, &stats);
-
- prt_newline(out);
-
- prt_printf(out, "reserves:\n");
- for (unsigned i = 0; i < BCH_WATERMARK_NR; i++)
- prt_printf(out, "%s\t%llu\r\n", bch2_watermarks[i], bch2_dev_buckets_reserved(ca, i));
-
- prt_newline(out);
-
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 12);
- printbuf_tabstop_push(out, 16);
-
- prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets);
- prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats));
-}
-
-static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
-{
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "Allocator stuck? Waited for %u seconds\n",
- c->opts.allocator_stuck_timeout);
-
- prt_printf(&buf, "Allocator debug:\n");
- printbuf_indent_add(&buf, 2);
- bch2_fs_alloc_debug_to_text(&buf, c);
- printbuf_indent_sub(&buf, 2);
- prt_newline(&buf);
-
- for_each_online_member(c, ca) {
- prt_printf(&buf, "Dev %u:\n", ca->dev_idx);
- printbuf_indent_add(&buf, 2);
- bch2_dev_alloc_debug_to_text(&buf, ca);
- printbuf_indent_sub(&buf, 2);
- prt_newline(&buf);
- }
-
- prt_printf(&buf, "Copygc debug:\n");
- printbuf_indent_add(&buf, 2);
- bch2_copygc_wait_to_text(&buf, c);
- printbuf_indent_sub(&buf, 2);
- prt_newline(&buf);
-
- prt_printf(&buf, "Journal debug:\n");
- printbuf_indent_add(&buf, 2);
- bch2_journal_debug_to_text(&buf, &c->journal);
- printbuf_indent_sub(&buf, 2);
-
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-}
-
-static inline unsigned allocator_wait_timeout(struct bch_fs *c)
-{
- if (c->allocator_last_stuck &&
- time_after(c->allocator_last_stuck + HZ * 60 * 2, jiffies))
- return 0;
-
- return c->opts.allocator_stuck_timeout * HZ;
-}
-
-void __bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
-{
- unsigned t = allocator_wait_timeout(c);
-
- if (t && closure_sync_timeout(cl, t)) {
- c->allocator_last_stuck = jiffies;
- bch2_print_allocator_stuck(c);
- }
-
- closure_sync(cl);
-}
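allocator_wait_timeout() rate-limits the stuck-allocator report: once it has fired, it returns 0 (wait silently, with no timeout) until two minutes have elapsed. The same once-per-cooldown shape in userspace terms (seconds instead of jiffies):

/* Rate-limited warning: fire at most once per cooldown window */
#include <stdio.h>
#include <time.h>

static time_t last_fired;

static int should_warn(time_t now, int cooldown_s)
{
	if (last_fired && now < last_fired + cooldown_s)
		return 0;
	last_fired = now;
	return 1;
}

int main(void)
{
	time_t t = time(NULL);

	/* fires once, then stays quiet inside the 120s window */
	printf("%d %d\n", should_warn(t, 120), should_warn(t + 10, 120));
	return 0;
}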
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
deleted file mode 100644
index 69ec6a012898..000000000000
--- a/fs/bcachefs/alloc_foreground.h
+++ /dev/null
@@ -1,257 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ALLOC_FOREGROUND_H
-#define _BCACHEFS_ALLOC_FOREGROUND_H
-
-#include "bcachefs.h"
-#include "alloc_types.h"
-#include "extents.h"
-#include "sb-members.h"
-
-#include <linux/hash.h>
-
-struct bkey;
-struct bch_dev;
-struct bch_fs;
-struct bch_devs_list;
-
-extern const char * const bch2_watermarks[];
-
-void bch2_reset_alloc_cursors(struct bch_fs *);
-
-struct dev_alloc_list {
- unsigned nr;
- u8 data[BCH_SB_MEMBERS_MAX];
-};
-
-struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
- struct dev_stripe_state *,
- struct bch_devs_mask *);
-void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
-
-static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob)
-{
- return bch2_dev_have_ref(c, ob->dev);
-}
-
-static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark)
-{
- switch (watermark) {
- case BCH_WATERMARK_interior_updates:
- return 0;
- case BCH_WATERMARK_reclaim:
- return OPEN_BUCKETS_COUNT / 6;
- case BCH_WATERMARK_btree:
- case BCH_WATERMARK_btree_copygc:
- return OPEN_BUCKETS_COUNT / 4;
- case BCH_WATERMARK_copygc:
- return OPEN_BUCKETS_COUNT / 3;
- default:
- return OPEN_BUCKETS_COUNT / 2;
- }
-}
-
-struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
- enum bch_watermark, enum bch_data_type,
- struct closure *);
-
-static inline void ob_push(struct bch_fs *c, struct open_buckets *obs,
- struct open_bucket *ob)
-{
- BUG_ON(obs->nr >= ARRAY_SIZE(obs->v));
-
- obs->v[obs->nr++] = ob - c->open_buckets;
-}
-
-#define open_bucket_for_each(_c, _obs, _ob, _i) \
- for ((_i) = 0; \
- (_i) < (_obs)->nr && \
- ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \
- (_i)++)
-
-static inline struct open_bucket *ec_open_bucket(struct bch_fs *c,
- struct open_buckets *obs)
-{
- struct open_bucket *ob;
- unsigned i;
-
- open_bucket_for_each(c, obs, ob, i)
- if (ob->ec)
- return ob;
-
- return NULL;
-}
-
-void bch2_open_bucket_write_error(struct bch_fs *,
- struct open_buckets *, unsigned, int);
-
-void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
-
-static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
-{
- if (atomic_dec_and_test(&ob->pin))
- __bch2_open_bucket_put(c, ob);
-}
-
-static inline void bch2_open_buckets_put(struct bch_fs *c,
- struct open_buckets *ptrs)
-{
- struct open_bucket *ob;
- unsigned i;
-
- open_bucket_for_each(c, ptrs, ob, i)
- bch2_open_bucket_put(c, ob);
- ptrs->nr = 0;
-}
-
-static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct write_point *wp)
-{
- struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 };
- struct open_bucket *ob;
- unsigned i;
-
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob);
- wp->ptrs = keep;
-
- mutex_unlock(&wp->lock);
-
- bch2_open_buckets_put(c, &ptrs);
-}
-
-static inline void bch2_open_bucket_get(struct bch_fs *c,
- struct write_point *wp,
- struct open_buckets *ptrs)
-{
- struct open_bucket *ob;
- unsigned i;
-
- open_bucket_for_each(c, &wp->ptrs, ob, i) {
- ob->data_type = wp->data_type;
- atomic_inc(&ob->pin);
- ob_push(c, ptrs, ob);
- }
-}
-
-static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c,
- unsigned dev, u64 bucket)
-{
- return c->open_buckets_hash +
- (jhash_3words(dev, bucket, bucket >> 32, 0) &
- (OPEN_BUCKETS_COUNT - 1));
-}
-
-static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket)
-{
- open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket);
-
- while (slot) {
- struct open_bucket *ob = &c->open_buckets[slot];
-
- if (ob->dev == dev && ob->bucket == bucket)
- return true;
-
- slot = ob->hash;
- }
-
- return false;
-}
-
-static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket)
-{
- bool ret;
-
- if (bch2_bucket_is_open(c, dev, bucket))
- return true;
-
- spin_lock(&c->freelist_lock);
- ret = bch2_bucket_is_open(c, dev, bucket);
- spin_unlock(&c->freelist_lock);
-
- return ret;
-}
-
-enum bch_write_flags;
-int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *,
- struct dev_stripe_state *, struct bch_devs_mask *,
- unsigned, unsigned *, bool *, enum bch_write_flags,
- enum bch_data_type, enum bch_watermark,
- struct closure *);
-
-int bch2_alloc_sectors_start_trans(struct btree_trans *,
- unsigned, unsigned,
- struct write_point_specifier,
- struct bch_devs_list *,
- unsigned, unsigned,
- enum bch_watermark,
- enum bch_write_flags,
- struct closure *,
- struct write_point **);
-
-struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *);
-
-/*
- * Append pointers to the space we just allocated to @k, and mark @sectors space
- * as allocated out of @ob
- */
-static inline void
-bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp,
- struct bkey_i *k, unsigned sectors,
- bool cached)
-{
- struct open_bucket *ob;
- unsigned i;
-
- BUG_ON(sectors > wp->sectors_free);
- wp->sectors_free -= sectors;
- wp->sectors_allocated += sectors;
-
- open_bucket_for_each(c, &wp->ptrs, ob, i) {
- struct bch_dev *ca = ob_dev(c, ob);
- struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob);
-
- ptr.cached = cached ||
- (!ca->mi.durability &&
- wp->data_type == BCH_DATA_user);
-
- bch2_bkey_append_ptr(k, ptr);
-
- BUG_ON(sectors > ob->sectors_free);
- ob->sectors_free -= sectors;
- }
-}
-
-void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
- struct bkey_i *, unsigned, bool);
-void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
-
-void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool);
-
-static inline struct write_point_specifier writepoint_hashed(unsigned long v)
-{
- return (struct write_point_specifier) { .v = v | 1 };
-}
-
-static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
-{
- return (struct write_point_specifier) { .v = (unsigned long) wp };
-}
-
-void bch2_fs_allocator_foreground_init(struct bch_fs *);
-
-void bch2_open_bucket_to_text(struct printbuf *, struct bch_fs *, struct open_bucket *);
-void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *, struct bch_dev *);
-void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *);
-void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *);
-
-void __bch2_wait_on_allocator(struct bch_fs *, struct closure *);
-static inline void bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
-{
- if (cl->closure_get_happened)
- __bch2_wait_on_allocator(c, cl);
-}
-
-#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
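write_point_specifier packs two cases into one unsigned long: a direct write_point pointer (aligned, so bit 0 is clear) or a hashed identifier with bit 0 set via `v | 1`, which is exactly what writepoint_find() tests first. The tagging trick in isolation:

/* Low-bit tagging of an aligned pointer vs. a small integer id */
#include <stdint.h>
#include <stdio.h>

struct write_point_like { long pad; };	/* aligned, so bit 0 is free */

static unsigned long hashed(unsigned long v) { return v | 1; }

static unsigned long by_ptr(struct write_point_like *wp)
{
	return (unsigned long) wp;
}

int main(void)
{
	static struct write_point_like wp;
	unsigned long a = hashed(0x1234), b = by_ptr(&wp);

	printf("0x%lx -> %s\n", a, a & 1 ? "hashed id" : "direct pointer");
	printf("0x%lx -> %s\n", b, b & 1 ? "hashed id" : "direct pointer");
	return 0;
}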
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
deleted file mode 100644
index 8f79f46c2a78..000000000000
--- a/fs/bcachefs/alloc_types.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ALLOC_TYPES_H
-#define _BCACHEFS_ALLOC_TYPES_H
-
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-
-#include "clock_types.h"
-#include "fifo.h"
-
-struct bucket_alloc_state {
- enum {
- BTREE_BITMAP_NO,
- BTREE_BITMAP_YES,
- BTREE_BITMAP_ANY,
- } btree_bitmap;
-
- u64 buckets_seen;
- u64 skipped_open;
- u64 skipped_need_journal_commit;
- u64 need_journal_commit;
- u64 skipped_nocow;
- u64 skipped_nouse;
- u64 skipped_mi_btree_bitmap;
-};
-
-#define BCH_WATERMARKS() \
- x(stripe) \
- x(normal) \
- x(copygc) \
- x(btree) \
- x(btree_copygc) \
- x(reclaim) \
- x(interior_updates)
-
-enum bch_watermark {
-#define x(name) BCH_WATERMARK_##name,
- BCH_WATERMARKS()
-#undef x
- BCH_WATERMARK_NR,
-};
-
-#define BCH_WATERMARK_BITS 3
-#define BCH_WATERMARK_MASK ~(~0U << BCH_WATERMARK_BITS)
-
-#define OPEN_BUCKETS_COUNT 1024
-
-#define WRITE_POINT_HASH_NR 32
-#define WRITE_POINT_MAX 32
-
-/*
- * 0 is never a valid open_bucket_idx_t:
- */
-typedef u16 open_bucket_idx_t;
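Open buckets link to one another by array index rather than by pointer (the freelist and hash fields below), with index 0 reserved as the NULL sentinel; this is also why open bucket 0 is never handed out in bch2_fs_allocator_foreground_init(). A minimal index-linked freelist (standalone sketch):

/* Index-linked freelist: 0 is the NULL sentinel, slot 0 never used */
#include <stdint.h>
#include <stdio.h>

#define NR 8
static uint16_t freelist_next[NR];
static uint16_t freelist_head;

static void freelist_init(void)
{
	for (uint16_t i = 1; i < NR; i++) {
		freelist_next[i] = freelist_head;
		freelist_head = i;
	}
}

static uint16_t alloc_slot(void)
{
	uint16_t idx = freelist_head;

	if (idx)
		freelist_head = freelist_next[idx];
	return idx;	/* 0 means empty */
}

int main(void)
{
	freelist_init();

	uint16_t a = alloc_slot(), b = alloc_slot();

	printf("got slots %u, %u\n", a, b);
	return 0;
}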
-
-struct open_bucket {
- spinlock_t lock;
- atomic_t pin;
- open_bucket_idx_t freelist;
- open_bucket_idx_t hash;
-
- /*
- * When an open bucket has an ec_stripe attached, this is the index of
- * the block in the stripe this open_bucket corresponds to:
- */
- u8 ec_idx;
- enum bch_data_type data_type:6;
- unsigned valid:1;
- unsigned on_partial_list:1;
-
- u8 dev;
- u8 gen;
- u32 sectors_free;
- u64 bucket;
- struct ec_stripe_new *ec;
-};
-
-#define OPEN_BUCKET_LIST_MAX 15
-
-struct open_buckets {
- open_bucket_idx_t nr;
- open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX];
-};
-
-struct dev_stripe_state {
- u64 next_alloc[BCH_SB_MEMBERS_MAX];
-};
-
-#define WRITE_POINT_STATES() \
- x(stopped) \
- x(waiting_io) \
- x(waiting_work) \
- x(runnable) \
- x(running)
-
-enum write_point_state {
-#define x(n) WRITE_POINT_##n,
- WRITE_POINT_STATES()
-#undef x
- WRITE_POINT_STATE_NR
-};
-
-struct write_point {
- struct {
- struct hlist_node node;
- struct mutex lock;
- u64 last_used;
- unsigned long write_point;
- enum bch_data_type data_type;
-
- /* calculated based on how many pointers we're actually going to use: */
- unsigned sectors_free;
-
- struct open_buckets ptrs;
- struct dev_stripe_state stripe;
-
- u64 sectors_allocated;
- } __aligned(SMP_CACHE_BYTES);
-
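-	/*
-	 * Index update and state tracking fields, presumably kept on their
-	 * own cacheline so they don't bounce with the allocation fast path
-	 * above:
-	 */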
- struct {
- struct work_struct index_update_work;
-
- struct list_head writes;
- spinlock_t writes_lock;
-
- enum write_point_state state;
- u64 last_state_change;
- u64 time[WRITE_POINT_STATE_NR];
- u64 last_runtime;
- } __aligned(SMP_CACHE_BYTES);
-};
-
-struct write_point_specifier {
- unsigned long v;
-};
-
-#endif /* _BCACHEFS_ALLOC_TYPES_H */
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
deleted file mode 100644
index 20c497f0c2cb..000000000000
--- a/fs/bcachefs/backpointers.c
+++ /dev/null
@@ -1,1229 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "bbpos.h"
-#include "alloc_background.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "checksum.h"
-#include "disk_accounting.h"
-#include "error.h"
-#include "progress.h"
-
-#include <linux/mm.h>
-
-int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
- int ret = 0;
-
- bkey_fsck_err_on(bp.v->level > BTREE_MAX_DEPTH,
- c, backpointer_level_bad,
- "backpointer level bad: %u >= %u",
- bp.v->level, BTREE_MAX_DEPTH);
-
- bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID,
- c, backpointer_dev_bad,
- "backpointer for BCH_SB_MEMBER_INVALID");
-fsck_err:
- return ret;
-}
-
-void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
-
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode);
- if (ca) {
- u32 bucket_offset;
- struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset);
- rcu_read_unlock();
- prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset);
- } else {
- rcu_read_unlock();
- prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT);
- }
-
- bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level);
- prt_str(out, " data_type=");
- bch2_prt_data_type(out, bp.v->data_type);
- prt_printf(out, " suboffset=%u len=%u gen=%u pos=",
- (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
- bp.v->bucket_len,
- bp.v->bucket_gen);
- bch2_bpos_to_text(out, bp.v->pos);
-}
-
-void bch2_backpointer_swab(struct bkey_s k)
-{
- struct bkey_s_backpointer bp = bkey_s_to_backpointer(k);
-
- bp.v->bucket_len = swab32(bp.v->bucket_len);
- bch2_bpos_swab(&bp.v->pos);
-}
-
-static bool extent_matches_bp(struct bch_fs *c,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k,
- struct bkey_s_c_backpointer bp)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- struct bkey_i_backpointer bp2;
- bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp2);
-
- if (bpos_eq(bp.k->p, bp2.k.p) &&
- !memcmp(bp.v, &bp2.v, sizeof(bp2.v)))
- return true;
- }
-
- return false;
-}
-
-static noinline int backpointer_mod_err(struct btree_trans *trans,
- struct bkey_s_c orig_k,
- struct bkey_i_backpointer *new_bp,
- struct bkey_s_c found_bp,
- bool insert)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
-
- if (insert) {
- prt_printf(&buf, "existing backpointer found when inserting ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i));
- prt_newline(&buf);
- printbuf_indent_add(&buf, 2);
-
- prt_printf(&buf, "found ");
- bch2_bkey_val_to_text(&buf, c, found_bp);
- prt_newline(&buf);
-
- prt_printf(&buf, "for ");
- bch2_bkey_val_to_text(&buf, c, orig_k);
-
- bch_err(c, "%s", buf.buf);
- } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
- prt_printf(&buf, "backpointer not found when deleting\n");
- printbuf_indent_add(&buf, 2);
-
- prt_printf(&buf, "searching for ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i));
- prt_newline(&buf);
-
- prt_printf(&buf, "got ");
- bch2_bkey_val_to_text(&buf, c, found_bp);
- prt_newline(&buf);
-
- prt_printf(&buf, "for ");
- bch2_bkey_val_to_text(&buf, c, orig_k);
-
- bch_err(c, "%s", buf.buf);
- }
-
- printbuf_exit(&buf);
-
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
- return bch2_inconsistent_error(c) ? BCH_ERR_erofs_unfixed_errors : 0;
- } else {
- return 0;
- }
-}
-
-int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
- struct bkey_s_c orig_k,
- struct bkey_i_backpointer *bp,
- bool insert)
-{
- struct btree_iter bp_iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
- bp->k.p,
- BTREE_ITER_intent|
- BTREE_ITER_slots|
- BTREE_ITER_with_updates);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (insert
- ? k.k->type
- : (k.k->type != KEY_TYPE_backpointer ||
- memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) {
- ret = backpointer_mod_err(trans, orig_k, bp, k, insert);
- if (ret)
- goto err;
- }
-
- if (!insert) {
- bp->k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&bp->k, 0);
- }
-
- ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0);
-err:
- bch2_trans_iter_exit(trans, &bp_iter);
- return ret;
-}
-
-static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos)
-{
- return (likely(!bch2_backpointers_no_use_write_buffer)
- ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos)
- : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-}
-
-static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans,
- struct bkey_s_c visiting_k,
- struct bkey_buf *last_flushed)
-{
- return likely(!bch2_backpointers_no_use_write_buffer)
- ? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed)
- : 0;
-}
-
-static int backpointer_target_not_found(struct btree_trans *trans,
- struct bkey_s_c_backpointer bp,
- struct bkey_s_c target_k,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- /*
- * If we're using the btree write buffer, the backpointer we were
- * looking at may have already been deleted - failure to find what it
- * pointed to is not an error:
- */
- ret = last_flushed
- ? bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed)
- : 0;
- if (ret)
- return ret;
-
- prt_printf(&buf, "backpointer doesn't match %s it points to:\n ",
- bp.v->level ? "btree node" : "extent");
- bch2_bkey_val_to_text(&buf, c, bp.s_c);
-
- prt_printf(&buf, "\n ");
- bch2_bkey_val_to_text(&buf, c, target_k);
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry)
- if (p.ptr.dev == bp.k->p.inode) {
- prt_printf(&buf, "\n ");
- struct bkey_i_backpointer bp2;
- bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i));
- }
-
- if (fsck_err(trans, backpointer_to_missing_ptr,
- "%s", buf.buf))
- ret = bch2_backpointer_del(trans, bp.k->p);
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
- struct bkey_s_c_backpointer bp,
- struct btree_iter *iter,
- unsigned iter_flags,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
-
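-	/* Backpointer into a btree that no longer exists: nothing to find: */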
- if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c)))
- return bkey_s_c_null;
-
- bch2_trans_node_iter_init(trans, iter,
- bp.v->btree_id,
- bp.v->pos,
- 0,
- bp.v->level,
- iter_flags);
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
- if (bkey_err(k)) {
- bch2_trans_iter_exit(trans, iter);
- return k;
- }
-
- if (k.k &&
- extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp))
- return k;
-
- bch2_trans_iter_exit(trans, iter);
-
- if (!bp.v->level) {
- int ret = backpointer_target_not_found(trans, bp, k, last_flushed);
- return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
- } else {
- struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed);
- if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node))
- return bkey_s_c_null;
- if (IS_ERR_OR_NULL(b))
- return ((struct bkey_s_c) { .k = ERR_CAST(b) });
-
- return bkey_i_to_s_c(&b->key);
- }
-}
-
-struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
- struct bkey_s_c_backpointer bp,
- struct btree_iter *iter,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
-
- BUG_ON(!bp.v->level);
-
- bch2_trans_node_iter_init(trans, iter,
- bp.v->btree_id,
- bp.v->pos,
- 0,
- bp.v->level - 1,
- 0);
- struct btree *b = bch2_btree_iter_peek_node(iter);
- if (IS_ERR_OR_NULL(b))
- goto err;
-
- BUG_ON(b->c.level != bp.v->level - 1);
-
- if (extent_matches_bp(c, bp.v->btree_id, bp.v->level,
- bkey_i_to_s_c(&b->key), bp))
- return b;
-
- if (btree_node_will_make_reachable(b)) {
- b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
- } else {
- int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed);
- b = ret ? ERR_PTR(ret) : NULL;
- }
-err:
- bch2_trans_iter_exit(trans, iter);
- return b;
-}
-
-static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k,
- struct bkey_buf *last_flushed)
-{
- if (k.k->type != KEY_TYPE_backpointer)
- return 0;
-
- struct bch_fs *c = trans->c;
- struct btree_iter alloc_iter = { NULL };
- struct bkey_s_c alloc_k;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct bpos bucket;
- if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) {
- ret = bch2_backpointers_maybe_flush(trans, k, last_flushed);
- if (ret)
- goto out;
-
- if (fsck_err(trans, backpointer_to_missing_device,
- "backpointer for missing device:\n%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- ret = bch2_backpointer_del(trans, k.k->p);
- goto out;
- }
-
- alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0);
- ret = bkey_err(alloc_k);
- if (ret)
- goto out;
-
- if (alloc_k.k->type != KEY_TYPE_alloc_v4) {
- ret = bch2_backpointers_maybe_flush(trans, k, last_flushed);
- if (ret)
- goto out;
-
- if (fsck_err(trans, backpointer_to_missing_alloc,
- "backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
- alloc_iter.pos.inode, alloc_iter.pos.offset,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- ret = bch2_backpointer_del(trans, k.k->p);
- }
-out:
-fsck_err:
- bch2_trans_iter_exit(trans, &alloc_iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-/* verify that every backpointer has a corresponding alloc key */
-int bch2_check_btree_backpointers(struct bch_fs *c)
-{
- struct bkey_buf last_flushed;
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_backpointers, POS_MIN, 0, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed)));
-
- bch2_bkey_buf_exit(&last_flushed, c);
- bch_err_fn(c, ret);
- return ret;
-}
-
-struct extents_to_bp_state {
- struct bpos bp_start;
- struct bpos bp_end;
- struct bkey_buf last_flushed;
-};
-
-static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree,
- struct bkey_s_c extent, unsigned dev)
-{
- struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent);
- int ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- return ret;
-
- bch2_bkey_drop_device(bkey_i_to_s(n), dev);
- return bch2_btree_insert_trans(trans, btree, n, 0);
-}
-
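-/*
- * Two extents claim the same space on one device: read the first extent back
- * and verify its checksum, so we know which of the two copies to drop.
- * Returns 1 if the checksum was bad and the pointer was dropped, 0 otherwise,
- * negative on error:
- */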
-static int check_extent_checksum(struct btree_trans *trans,
- enum btree_id btree, struct bkey_s_c extent,
- enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev)
-{
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- struct printbuf buf = PRINTBUF;
- void *data_buf = NULL;
- struct bio *bio = NULL;
- size_t bytes;
- int ret = 0;
-
- if (bkey_is_btree_ptr(extent.k))
- return false;
-
- bkey_for_each_ptr_decode(extent.k, ptrs, p, entry)
- if (p.ptr.dev == dev)
- goto found;
- BUG();
-found:
- if (!p.crc.csum_type)
- return false;
-
- bytes = p.crc.compressed_size << 9;
-
- struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ);
- if (!ca)
- return false;
-
- data_buf = kvmalloc(bytes, GFP_KERNEL);
- if (!data_buf) {
- ret = -ENOMEM;
- goto err;
- }
-
- bio = bio_alloc(ca->disk_sb.bdev, buf_pages(data_buf, bytes), REQ_OP_READ, GFP_KERNEL);
- bio->bi_iter.bi_sector = p.ptr.offset;
- bch2_bio_map(bio, data_buf, bytes);
- ret = submit_bio_wait(bio);
- if (ret)
- goto err;
-
- prt_str(&buf, "extents pointing to same space, but first extent checksum bad:");
- prt_printf(&buf, "\n ");
- bch2_btree_id_to_text(&buf, btree);
- prt_str(&buf, " ");
- bch2_bkey_val_to_text(&buf, c, extent);
- prt_printf(&buf, "\n ");
- bch2_btree_id_to_text(&buf, o_btree);
- prt_str(&buf, " ");
- bch2_bkey_val_to_text(&buf, c, extent2);
-
- struct nonce nonce = extent_nonce(extent.k->bversion, p.crc);
- struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
- if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
- trans, dup_backpointer_to_bad_csum_extent,
- "%s", buf.buf))
- ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1;
-fsck_err:
-err:
- if (bio)
- bio_put(bio);
- kvfree(data_buf);
- percpu_ref_put(&ca->io_ref);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int check_bp_exists(struct btree_trans *trans,
- struct extents_to_bp_state *s,
- struct bkey_i_backpointer *bp,
- struct bkey_s_c orig_k)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter other_extent_iter = {};
- struct printbuf buf = PRINTBUF;
-
- if (bpos_lt(bp->k.p, s->bp_start) ||
- bpos_gt(bp->k.p, s->bp_end))
- return 0;
-
- struct btree_iter bp_iter;
- struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0);
- int ret = bkey_err(bp_k);
- if (ret)
- goto err;
-
- if (bp_k.k->type != KEY_TYPE_backpointer ||
- memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) {
- ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed);
- if (ret)
- goto err;
-
- goto check_existing_bp;
- }
-out:
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &other_extent_iter);
- bch2_trans_iter_exit(trans, &bp_iter);
- printbuf_exit(&buf);
- return ret;
-check_existing_bp:
- /* Do we have a backpointer for a different extent? */
- if (bp_k.k->type != KEY_TYPE_backpointer)
- goto missing;
-
- struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k);
-
- struct bkey_s_c other_extent =
- bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL);
- ret = bkey_err(other_extent);
- if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
- ret = 0;
- if (ret)
- goto err;
-
- if (!other_extent.k)
- goto missing;
-
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp->k.p.inode);
- if (ca) {
- struct bkey_ptrs_c other_extent_ptrs = bch2_bkey_ptrs_c(other_extent);
- bkey_for_each_ptr(other_extent_ptrs, ptr)
- if (ptr->dev == bp->k.p.inode &&
- dev_ptr_stale_rcu(ca, ptr)) {
-				/* can't do btree updates under rcu_read_lock: */
-				rcu_read_unlock();
-				ret = drop_dev_and_update(trans, other_bp.v->btree_id,
- other_extent, bp->k.p.inode);
- if (ret)
- goto err;
- goto out;
- }
- }
- rcu_read_unlock();
-
- if (bch2_extents_match(orig_k, other_extent)) {
- printbuf_reset(&buf);
- prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n ");
- bch2_bkey_val_to_text(&buf, c, orig_k);
- prt_str(&buf, "\n ");
- bch2_bkey_val_to_text(&buf, c, other_extent);
- bch_err(c, "%s", buf.buf);
-
- if (other_extent.k->size <= orig_k.k->size) {
- ret = drop_dev_and_update(trans, other_bp.v->btree_id,
- other_extent, bp->k.p.inode);
- if (ret)
- goto err;
- goto out;
- } else {
- ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bp->k.p.inode);
- if (ret)
- goto err;
- goto missing;
- }
- }
-
- ret = check_extent_checksum(trans,
- other_bp.v->btree_id, other_extent,
- bp->v.btree_id, orig_k,
- bp->k.p.inode);
- if (ret < 0)
- goto err;
- if (ret) {
- ret = 0;
- goto missing;
- }
-
- ret = check_extent_checksum(trans, bp->v.btree_id, orig_k,
- other_bp.v->btree_id, other_extent, bp->k.p.inode);
- if (ret < 0)
- goto err;
- if (ret) {
- ret = 0;
- goto out;
- }
-
- printbuf_reset(&buf);
- prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bp->k.p.inode);
- bch2_bkey_val_to_text(&buf, c, orig_k);
- prt_str(&buf, "\n ");
- bch2_bkey_val_to_text(&buf, c, other_extent);
- bch_err(c, "%s", buf.buf);
- ret = -BCH_ERR_fsck_repair_unimplemented;
- goto err;
-missing:
- printbuf_reset(&buf);
- prt_str(&buf, "missing backpointer\n for: ");
- bch2_bkey_val_to_text(&buf, c, orig_k);
- prt_printf(&buf, "\n want: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i));
- prt_printf(&buf, "\n got: ");
- bch2_bkey_val_to_text(&buf, c, bp_k);
-
- if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf))
- ret = bch2_bucket_backpointer_mod(trans, orig_k, bp, true);
-
- goto out;
-}
-
-static int check_extent_to_backpointers(struct btree_trans *trans,
- struct extents_to_bp_state *s,
- enum btree_id btree, unsigned level,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.ptr.dev == BCH_SB_MEMBER_INVALID)
- continue;
-
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
- bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches);
- bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty);
-
- bool stale = p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr));
- rcu_read_unlock();
-
- if ((check || empty) && !stale) {
- struct bkey_i_backpointer bp;
- bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
-
- int ret = check
- ? check_bp_exists(trans, s, &bp, k)
- : bch2_bucket_backpointer_mod(trans, k, &bp, true);
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-
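-/*
- * Check the backpointer for a btree root; retries if the root was replaced
- * underneath us while we were looking it up:
- */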
-static int check_btree_root_to_backpointers(struct btree_trans *trans,
- struct extents_to_bp_state *s,
- enum btree_id btree_id,
- int *level)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct btree *b;
- struct bkey_s_c k;
- int ret;
-retry:
- bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN,
- 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0);
- b = bch2_btree_iter_peek_node(&iter);
- ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- goto err;
-
- if (b != btree_node_root(c, b)) {
- bch2_trans_iter_exit(trans, &iter);
- goto retry;
- }
-
- *level = b->c.level;
-
- k = bkey_i_to_s_c(&b->key);
- ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
-{
- return (struct bbpos) {
- .btree = bp.btree_id,
- .pos = bp.pos,
- };
-}
-
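-/*
- * Memory budget for pinning btree nodes in the node cache during fsck,
- * derived from the fsck_memory_usage_percent option:
- */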
-static u64 mem_may_pin_bytes(struct bch_fs *c)
-{
- struct sysinfo i;
- si_meminfo(&i);
-
- u64 mem_bytes = i.totalram * i.mem_unit;
- return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100);
-}
-
-static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
-{
- return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size);
-}
-
-static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
- u64 btree_leaf_mask,
- u64 btree_interior_mask,
- struct bbpos start, struct bbpos *end)
-{
- struct bch_fs *c = trans->c;
- s64 mem_may_pin = mem_may_pin_bytes(c);
- int ret = 0;
-
- bch2_btree_cache_unpin(c);
-
- btree_interior_mask |= btree_leaf_mask;
-
- c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask;
- c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask;
- c->btree_cache.pinned_nodes_start = start;
- c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX;
-
- for (enum btree_id btree = start.btree;
- btree < BTREE_ID_NR && !ret;
- btree++) {
- unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 0 : 1;
-
- if (!(BIT_ULL(btree) & btree_leaf_mask) &&
- !(BIT_ULL(btree) & btree_interior_mask))
- continue;
-
- ret = __for_each_btree_node(trans, iter, btree,
- btree == start.btree ? start.pos : POS_MIN,
- 0, depth, BTREE_ITER_prefetch, b, ({
- mem_may_pin -= btree_buf_bytes(b);
- if (mem_may_pin <= 0) {
- c->btree_cache.pinned_nodes_end = *end =
- BBPOS(btree, b->key.k.p);
- break;
- }
- bch2_node_pin(c, b);
- 0;
- }));
- }
-
- return ret;
-}
-
-static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
- struct extents_to_bp_state *s)
-{
- struct bch_fs *c = trans->c;
- struct progress_indicator_state progress;
- int ret = 0;
-
- bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
-
- for (enum btree_id btree_id = 0;
- btree_id < btree_id_nr_alive(c);
- btree_id++) {
- int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
-
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- check_btree_root_to_backpointers(trans, s, btree_id, &level));
- if (ret)
- return ret;
-
- while (level >= depth) {
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level,
- BTREE_ITER_prefetch);
-
- ret = for_each_btree_key_continue(trans, iter, 0, k, ({
- bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
- check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- }));
- if (ret)
- return ret;
-
- --level;
- }
- }
-
- return 0;
-}
-
-enum alloc_sector_counter {
- ALLOC_dirty,
- ALLOC_cached,
- ALLOC_stripe,
- ALLOC_SECTORS_NR
-};
-
-static int data_type_to_alloc_counter(enum bch_data_type t)
-{
- switch (t) {
- case BCH_DATA_btree:
- case BCH_DATA_user:
- return ALLOC_dirty;
- case BCH_DATA_cached:
- return ALLOC_cached;
- case BCH_DATA_stripe:
- case BCH_DATA_parity:
- return ALLOC_stripe;
- default:
- return -1;
- }
-}
-
-static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos);
-
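-/*
- * Sum this bucket's backpointers by data type and compare the totals against
- * the alloc key's sector counters; buckets that don't match get flagged in
- * the per-device bitmaps consumed by the missing-backpointer scan:
- */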
-static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
- bool need_commit = false;
-
- if (a->data_type == BCH_DATA_sb ||
- a->data_type == BCH_DATA_journal ||
- a->data_type == BCH_DATA_parity)
- return 0;
-
- u32 sectors[ALLOC_SECTORS_NR];
- memset(sectors, 0, sizeof(sectors));
-
- struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(trans->c, alloc_k.k->p);
- if (!ca)
- return 0;
-
- struct btree_iter iter;
- struct bkey_s_c bp_k;
- int ret = 0;
- for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers,
- bucket_pos_to_bp_start(ca, alloc_k.k->p),
- bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) {
- if (bp_k.k->type != KEY_TYPE_backpointer)
- continue;
-
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
-
- if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen &&
- (bp.v->bucket_gen != a->gen ||
- bp.v->pad)) {
- ret = bch2_backpointer_del(trans, bp_k.k->p);
- if (ret)
- break;
-
- need_commit = true;
- continue;
- }
-
- if (bp.v->bucket_gen != a->gen)
- continue;
-
- int alloc_counter = data_type_to_alloc_counter(bp.v->data_type);
- if (alloc_counter < 0)
- continue;
-
- sectors[alloc_counter] += bp.v->bucket_len;
-	}
- bch2_trans_iter_exit(trans, &iter);
- if (ret)
- goto err;
-
- if (need_commit) {
- ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto err;
- }
-
- if (sectors[ALLOC_dirty] != a->dirty_sectors ||
- sectors[ALLOC_cached] != a->cached_sectors ||
- sectors[ALLOC_stripe] != a->stripe_sectors) {
- if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) {
- ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed);
- if (ret)
- goto err;
- }
-
- if (sectors[ALLOC_dirty] > a->dirty_sectors ||
- sectors[ALLOC_cached] > a->cached_sectors ||
- sectors[ALLOC_stripe] > a->stripe_sectors) {
- ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?:
- -BCH_ERR_transaction_restart_nested;
- goto err;
- }
-
- if (!sectors[ALLOC_dirty] &&
- !sectors[ALLOC_stripe] &&
- !sectors[ALLOC_cached])
- __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty);
- else
- __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches);
- }
-err:
- bch2_dev_put(ca);
- return ret;
-}
-
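-/*
- * Does this interior node's key range cover any bucket flagged as having a
- * backpointer mismatch? (btree_ptr v1 keys carry no min_key, so we must
- * conservatively assume they do):
- */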
-static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr_v2: {
- bool ret = false;
-
- rcu_read_lock();
- struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key;
- while (pos.inode <= k.k->p.inode) {
- if (pos.inode >= c->sb.nr_devices)
- break;
-
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
- if (!ca)
- goto next;
-
- struct bpos bucket = bp_pos_to_bucket(ca, pos);
- bucket.offset = find_next_bit(ca->bucket_backpointer_mismatches,
- ca->mi.nbuckets, bucket.offset);
- if (bucket.offset == ca->mi.nbuckets)
- goto next;
-
- ret = bpos_le(bucket_pos_to_bp_end(ca, bucket), k.k->p);
- if (ret)
- break;
-next:
- pos = SPOS(pos.inode + 1, 0, 0);
- }
- rcu_read_unlock();
-
- return ret;
- }
- case KEY_TYPE_btree_ptr:
- return true;
- default:
- return false;
- }
-}
-
-static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k,
- enum btree_id btree, unsigned level)
-{
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0);
- struct btree *b = bch2_btree_iter_peek_node(&iter);
- int ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- goto err;
-
- if (b)
- bch2_node_pin(trans->c, b);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans,
- struct bpos start, struct bpos *end)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- struct bkey_buf tmp;
- bch2_bkey_buf_init(&tmp);
-
- bch2_btree_cache_unpin(c);
-
- *end = SPOS_MAX;
-
- s64 mem_may_pin = mem_may_pin_bytes(c);
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start,
- 0, 1, BTREE_ITER_prefetch);
- ret = for_each_btree_key_continue(trans, iter, 0, k, ({
- if (!backpointer_node_has_missing(c, k))
- continue;
-
- mem_may_pin -= c->opts.btree_node_size;
- if (mem_may_pin <= 0)
- break;
-
- bch2_bkey_buf_reassemble(&tmp, c, k);
- struct btree_path *path = btree_iter_path(trans, &iter);
-
- BUG_ON(path->level != 1);
-
- bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1);
- }));
-	if (ret)
-		goto err;
-
- struct bpos pinned = SPOS_MAX;
- mem_may_pin = mem_may_pin_bytes(c);
- bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start,
- 0, 1, BTREE_ITER_prefetch);
- ret = for_each_btree_key_continue(trans, iter, 0, k, ({
- if (!backpointer_node_has_missing(c, k))
- continue;
-
- mem_may_pin -= c->opts.btree_node_size;
- if (mem_may_pin <= 0) {
- *end = pinned;
- break;
- }
-
- bch2_bkey_buf_reassemble(&tmp, c, k);
- struct btree_path *path = btree_iter_path(trans, &iter);
-
- BUG_ON(path->level != 1);
-
- int ret2 = btree_node_get_and_pin(trans, tmp.k, path->btree_id, path->level - 1);
-
- if (!ret2)
- pinned = tmp.k->k.p;
-
- ret;
- }));
-err:
-	bch2_bkey_buf_exit(&tmp, c);
-	return ret;
-}
-
-int bch2_check_extents_to_backpointers(struct bch_fs *c)
-{
- int ret = 0;
-
- /*
- * Can't allow devices to come/go/resize while we have bucket bitmaps
- * allocated
- */
- lockdep_assert_held(&c->state_lock);
-
- for_each_member_device(c, ca) {
- BUG_ON(ca->bucket_backpointer_mismatches);
- ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
- sizeof(unsigned long),
- GFP_KERNEL);
- ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
- sizeof(unsigned long),
- GFP_KERNEL);
- if (!ca->bucket_backpointer_mismatches ||
- !ca->bucket_backpointer_empty) {
- bch2_dev_put(ca);
- ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap;
- goto err_free_bitmaps;
- }
- }
-
- struct btree_trans *trans = bch2_trans_get(c);
- struct extents_to_bp_state s = { .bp_start = POS_MIN };
-
- bch2_bkey_buf_init(&s.last_flushed);
- bkey_init(&s.last_flushed.k->k);
-
- ret = for_each_btree_key(trans, iter, BTREE_ID_alloc,
- POS_MIN, BTREE_ITER_prefetch, k, ({
- check_bucket_backpointer_mismatch(trans, k, &s.last_flushed);
- }));
- if (ret)
- goto err;
-
- u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0;
- for_each_member_device(c, ca) {
- nr_buckets += ca->mi.nbuckets;
- nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets);
- nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets);
- }
-
- if (!nr_mismatches && !nr_empty)
- goto err;
-
- bch_info(c, "scanning for missing backpointers in %llu/%llu buckets",
- nr_mismatches + nr_empty, nr_buckets);
-
- while (1) {
- ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end);
- if (ret)
- break;
-
- if ( bpos_eq(s.bp_start, POS_MIN) &&
- !bpos_eq(s.bp_end, SPOS_MAX))
- bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
- __func__, btree_nodes_fit_in_ram(c));
-
- if (!bpos_eq(s.bp_start, POS_MIN) ||
- !bpos_eq(s.bp_end, SPOS_MAX)) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "check_extents_to_backpointers(): ");
- bch2_bpos_to_text(&buf, s.bp_start);
- prt_str(&buf, "-");
- bch2_bpos_to_text(&buf, s.bp_end);
-
- bch_verbose(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- ret = bch2_check_extents_to_backpointers_pass(trans, &s);
- if (ret || bpos_eq(s.bp_end, SPOS_MAX))
- break;
-
- s.bp_start = bpos_successor(s.bp_end);
- }
-err:
- bch2_trans_put(trans);
- bch2_bkey_buf_exit(&s.last_flushed, c);
- bch2_btree_cache_unpin(c);
-err_free_bitmaps:
- for_each_member_device(c, ca) {
- kvfree(ca->bucket_backpointer_empty);
- ca->bucket_backpointer_empty = NULL;
- kvfree(ca->bucket_backpointer_mismatches);
- ca->bucket_backpointer_mismatches = NULL;
- }
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_one_backpointer(struct btree_trans *trans,
- struct bbpos start,
- struct bbpos end,
- struct bkey_s_c bp_k,
- struct bkey_buf *last_flushed)
-{
- if (bp_k.k->type != KEY_TYPE_backpointer)
- return 0;
-
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
- struct bbpos pos = bp_to_bbpos(*bp.v);
-
- if (bbpos_cmp(pos, start) < 0 ||
- bbpos_cmp(pos, end) > 0)
- return 0;
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed);
- int ret = bkey_err(k);
- if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
- return 0;
- if (ret)
- return ret;
-
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int check_bucket_backpointers_to_extents(struct btree_trans *trans,
- struct bch_dev *ca, struct bpos bucket)
-{
- u32 restart_count = trans->restart_count;
- struct bkey_buf last_flushed;
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers,
- bucket_pos_to_bp_start(ca, bucket),
- bucket_pos_to_bp_end(ca, bucket),
- 0, k,
- check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed)
- );
-
- bch2_bkey_buf_exit(&last_flushed, trans->c);
- return ret ?: trans_was_restarted(trans, restart_count);
-}
-
-static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
- struct bbpos start,
- struct bbpos end)
-{
- struct bch_fs *c = trans->c;
- struct bkey_buf last_flushed;
- struct progress_indicator_state progress;
-
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
- bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
-
- int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers,
- POS_MIN, BTREE_ITER_prefetch, k, ({
- bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
- check_one_backpointer(trans, start, end, k, &last_flushed);
- }));
-
- bch2_bkey_buf_exit(&last_flushed, c);
- return ret;
-}
-
-int bch2_check_backpointers_to_extents(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end;
- int ret;
-
- while (1) {
- ret = bch2_get_btree_in_memory_pos(trans,
- BIT_ULL(BTREE_ID_extents)|
- BIT_ULL(BTREE_ID_reflink),
- ~0,
- start, &end);
- if (ret)
- break;
-
- if (!bbpos_cmp(start, BBPOS_MIN) &&
- bbpos_cmp(end, BBPOS_MAX))
- bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass",
- __func__, btree_nodes_fit_in_ram(c));
-
- if (bbpos_cmp(start, BBPOS_MIN) ||
- bbpos_cmp(end, BBPOS_MAX)) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "check_backpointers_to_extents(): ");
- bch2_bbpos_to_text(&buf, start);
- prt_str(&buf, "-");
- bch2_bbpos_to_text(&buf, end);
-
- bch_verbose(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- ret = bch2_check_backpointers_to_extents_pass(trans, start, end);
- if (ret || !bbpos_cmp(end, BBPOS_MAX))
- break;
-
- start = bbpos_successor(end);
- }
- bch2_trans_put(trans);
-
- bch2_btree_cache_unpin(c);
-
- bch_err_fn(c, ret);
- return ret;
-}
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
deleted file mode 100644
index 16575dbc5736..000000000000
--- a/fs/bcachefs/backpointers.h
+++ /dev/null
@@ -1,189 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BACKPOINTERS_H
-#define _BCACHEFS_BACKPOINTERS_H
-
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "error.h"
-#include "super.h"
-
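-/* Byte-swap a 40-bit (five byte) integer: */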
-static inline u64 swab40(u64 x)
-{
- return (((x & 0x00000000ffULL) << 32)|
- ((x & 0x000000ff00ULL) << 16)|
- ((x & 0x0000ff0000ULL) >> 0)|
- ((x & 0x00ff000000ULL) >> 16)|
- ((x & 0xff00000000ULL) >> 32));
-}
-
-int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k,
- struct bkey_validate_context);
-void bch2_backpointer_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-void bch2_backpointer_swab(struct bkey_s);
-
-#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \
- .key_validate = bch2_backpointer_validate, \
- .val_to_text = bch2_backpointer_to_text, \
- .swab = bch2_backpointer_swab, \
- .min_val_size = 32, \
-})
-
-#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10
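-
-/*
- * Backpointer btree positions are device sector numbers scaled up by 2^10:
- * the low bits keep backpointers within a bucket distinct and ordered even
- * for compressed extents (hence the name - up to a 1024:1 ratio):
- */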
-
-/*
- * Convert from pos in backpointer btree to pos of corresponding bucket in alloc
- * btree:
- */
-static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos bp_pos)
-{
- u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
-
- return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector));
-}
-
-static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, struct bpos bp_pos,
- u32 *bucket_offset)
-{
- u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
-
- return POS(bp_pos.inode, sector_to_bucket_and_offset(ca, bucket_sector, bucket_offset));
-}
-
-static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket)
-{
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode);
- if (ca)
- *bucket = bp_pos_to_bucket(ca, bp_pos);
- rcu_read_unlock();
- return ca != NULL;
-}
-
-static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca,
- struct bpos bucket,
- u64 bucket_offset)
-{
- return POS(bucket.inode,
- (bucket_to_sector(ca, bucket.offset) <<
- MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
-}
-
-/*
- * Convert from pos in alloc btree + bucket offset to pos in backpointer btree:
- */
-static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca,
- struct bpos bucket,
- u64 bucket_offset)
-{
- struct bpos ret = bucket_pos_to_bp_noerror(ca, bucket, bucket_offset);
- EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(ca, ret)));
- return ret;
-}
-
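-/*
- * Inclusive [start, end] bounds covering every backpointer belonging to a
- * single bucket, for ranged iteration over the backpointers btree:
- */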
-static inline struct bpos bucket_pos_to_bp_start(const struct bch_dev *ca, struct bpos bucket)
-{
- return bucket_pos_to_bp(ca, bucket, 0);
-}
-
-static inline struct bpos bucket_pos_to_bp_end(const struct bch_dev *ca, struct bpos bucket)
-{
- return bpos_nosnap_predecessor(bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0));
-}
-
-int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *,
- struct bkey_s_c,
- struct bkey_i_backpointer *,
- bool);
-
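-/*
- * Add or delete the backpointer for @orig_k: in the normal write buffer path
- * a deletion is just the insertion of a whiteout - a KEY_TYPE_deleted key
- * with an empty value:
- */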
-static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
- struct bkey_s_c orig_k,
- struct bkey_i_backpointer *bp,
- bool insert)
-{
- if (unlikely(bch2_backpointers_no_use_write_buffer))
- return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert);
-
- if (!insert) {
- bp->k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&bp->k, 0);
- }
-
- return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp->k_i);
-}
-
-static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
- struct extent_ptr_decoded p,
- const union bch_extent_entry *entry)
-{
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- return BCH_DATA_btree;
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- if (p.has_ec)
- return BCH_DATA_stripe;
- if (p.ptr.cached)
- return BCH_DATA_cached;
- else
- return BCH_DATA_user;
- case KEY_TYPE_stripe: {
- const struct bch_extent_ptr *ptr = &entry->ptr;
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
- BUG_ON(ptr < s.v->ptrs ||
- ptr >= s.v->ptrs + s.v->nr_blocks);
-
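-		/* The last nr_redundant blocks of a stripe hold parity: */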
- return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant
- ? BCH_DATA_parity
- : BCH_DATA_user;
- }
- default:
- BUG();
- }
-}
-
-static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, struct extent_ptr_decoded p,
- const union bch_extent_entry *entry,
- struct bkey_i_backpointer *bp)
-{
- bkey_backpointer_init(&bp->k_i);
- bp->k.p.inode = p.ptr.dev;
-
- if (k.k->type != KEY_TYPE_stripe)
- bp->k.p.offset = ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset;
- else {
- /*
- * Put stripe backpointers where they won't collide with the
- * extent backpointers within the stripe:
- */
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
- bp->k.p.offset = ((u64) (p.ptr.offset + le16_to_cpu(s.v->sectors)) <<
- MAX_EXTENT_COMPRESS_RATIO_SHIFT) - 1;
- }
-
- bp->v = (struct bch_backpointer) {
- .btree_id = btree_id,
- .level = level,
- .data_type = bch2_bkey_ptr_data_type(k, p, entry),
- .bucket_gen = p.ptr.gen,
- .bucket_len = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p),
- .pos = k.k->p,
- };
-}
-
-struct bkey_buf;
-struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer,
- struct btree_iter *, unsigned, struct bkey_buf *);
-struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer,
- struct btree_iter *, struct bkey_buf *);
-
-int bch2_check_btree_backpointers(struct bch_fs *);
-int bch2_check_extents_to_backpointers(struct bch_fs *);
-int bch2_check_backpointers_to_extents(struct bch_fs *);
-
-#endif /* _BCACHEFS_BACKPOINTERS_H */
diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h
deleted file mode 100644
index 63abe17f35ea..000000000000
--- a/fs/bcachefs/bbpos.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BBPOS_H
-#define _BCACHEFS_BBPOS_H
-
-#include "bbpos_types.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-
-static inline int bbpos_cmp(struct bbpos l, struct bbpos r)
-{
- return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos);
-}
-
-static inline struct bbpos bbpos_successor(struct bbpos pos)
-{
- if (bpos_cmp(pos.pos, SPOS_MAX)) {
- pos.pos = bpos_successor(pos.pos);
- return pos;
- }
-
- if (pos.btree != BTREE_ID_NR) {
- pos.btree++;
- pos.pos = POS_MIN;
- return pos;
- }
-
- BUG();
-}
-
-static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos)
-{
- bch2_btree_id_to_text(out, pos.btree);
- prt_char(out, ':');
- bch2_bpos_to_text(out, pos.pos);
-}
-
-#endif /* _BCACHEFS_BBPOS_H */
diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h
deleted file mode 100644
index f63893344f80..000000000000
--- a/fs/bcachefs/bbpos_types.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BBPOS_TYPES_H
-#define _BCACHEFS_BBPOS_TYPES_H
-
-struct bbpos {
- enum btree_id btree;
- struct bpos pos;
-};
-
-static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
-{
- return (struct bbpos) { btree, pos };
-}
-
-#define BBPOS_MIN BBPOS(0, POS_MIN)
-#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, SPOS_MAX)
-
-#endif /* _BCACHEFS_BBPOS_TYPES_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
deleted file mode 100644
index f52311017aee..000000000000
--- a/fs/bcachefs/bcachefs.h
+++ /dev/null
@@ -1,1256 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_H
-#define _BCACHEFS_H
-
-/*
- * SOME HIGH LEVEL CODE DOCUMENTATION:
- *
- * Bcache mostly works with cache sets, cache devices, and backing devices.
- *
- * Support for multiple cache devices hasn't quite been finished off yet, but
- * it's about 95% plumbed through. A cache set and its cache devices is sort of
- * like a md raid array and its component devices. Most of the code doesn't care
- * about individual cache devices, the main abstraction is the cache set.
- *
- * Multiple cache devices is intended to give us the ability to mirror dirty
- * cached data and metadata, without mirroring clean cached data.
- *
- * Backing devices are different, in that they have a lifetime independent of a
- * cache set. When you register a newly formatted backing device it'll come up
- * in passthrough mode, and then you can attach and detach a backing device from
- * a cache set at runtime - while it's mounted and in use. Detaching implicitly
- * invalidates any cached data for that backing device.
- *
- * A cache set can have multiple (many) backing devices attached to it.
- *
- * There's also flash only volumes - this is the reason for the distinction
- * between struct cached_dev and struct bcache_device. A flash only volume
- * works much like a bcache device that has a backing device, except the
- * "cached" data is always dirty. The end result is that we get thin
- * provisioning with very little additional code.
- *
- * Flash only volumes work but they're not production ready because the moving
- * garbage collector needs more work. More on that later.
- *
- * BUCKETS/ALLOCATION:
- *
- * Bcache is primarily designed for caching, which means that in normal
- * operation all of our available space will be allocated. Thus, we need an
- * efficient way of deleting things from the cache so we can write new things to
- * it.
- *
- * To do this, we first divide the cache device up into buckets. A bucket is the
- * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
- * works efficiently.
- *
- * Each bucket has a 16 bit priority, and an 8 bit generation associated with
- * it. The gens and priorities for all the buckets are stored contiguously and
- * packed on disk (in a linked list of buckets - aside from the superblock, all
- * of bcache's metadata is stored in buckets).
- *
- * The priority is used to implement an LRU. We reset a bucket's priority when
- * we allocate it or on a cache hit, and every so often we decrement the priority
- * of each bucket. It could be used to implement something more sophisticated,
- * if anyone ever gets around to it.
- *
- * The generation is used for invalidating buckets. Each pointer also has an 8
- * bit generation embedded in it; for a pointer to be considered valid, its gen
- * must match the gen of the bucket it points into. Thus, to reuse a bucket all
- * we have to do is increment its gen (and write its new gen to disk; we batch
- * this up).
- *
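- * (Pointer validity is thus a single comparison: a pointer with gen 3 into a
- * bucket whose gen has since been bumped to 4 is stale, and simply ignored.)
- *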
- * Bcache is entirely COW - we never write twice to a bucket, even buckets that
- * contain metadata (including btree nodes).
- *
- * THE BTREE:
- *
- * Bcache is in large part designed around the btree.
- *
- * At a high level, the btree is just an index of key -> ptr tuples.
- *
- * Keys represent extents, and thus have a size field. Keys also have a variable
- * number of pointers attached to them (potentially zero, which is handy for
- * invalidating the cache).
- *
- * The key itself is an inode:offset pair. The inode number corresponds to a
- * backing device or a flash only volume. The offset is the ending offset of the
- * extent within the inode - not the starting offset; this makes lookups
- * slightly more convenient.
- *
- * Pointers contain the cache device id, the offset on that device, and an 8 bit
- * generation number. More on the gen later.
- *
- * Index lookups are not fully abstracted - cache lookups in particular are
- * still somewhat mixed in with the btree code, but things are headed in that
- * direction.
- *
- * Updates are fairly well abstracted, though. There are two different ways of
- * updating the btree; insert and replace.
- *
- * BTREE_INSERT will just take a list of keys and insert them into the btree -
- * overwriting (possibly only partially) any extents they overlap with. This is
- * used to update the index after a write.
- *
- * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
- * overwriting a key that matches another given key. This is used for inserting
- * data into the cache after a cache miss, and for background writeback, and for
- * the moving garbage collector.
- *
- * There is no "delete" operation; deleting things from the index is
- * accomplished either by invalidating pointers (by incrementing a bucket's
- * gen) or by inserting a key with 0 pointers - which will overwrite anything
- * previously present at that location in the index.
- *
- * This means that there are always stale/invalid keys in the btree. They're
- * filtered out by the code that iterates through a btree node, and removed when
- * a btree node is rewritten.
- *
- * BTREE NODES:
- *
- * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
- * free smaller than a bucket - so, that's how big our btree nodes are.
- *
- * (If buckets are really big we'll only use part of the bucket for a btree node
- * - no less than 1/4th - but a bucket still contains no more than a single
- * btree node. I'd actually like to change this, but for now we rely on the
- * bucket's gen for deleting btree nodes when we rewrite/split a node.)
- *
- * Anyways, btree nodes are big - big enough to be inefficient with a textbook
- * btree implementation.
- *
- * The way this is solved is that btree nodes are internally log structured; we
- * can append new keys to an existing btree node without rewriting it. This
- * means each set of keys we write is sorted, but the node is not.
- *
- * We maintain this log structure in memory - keeping 1Mb of keys sorted would
- * be expensive, and we have to distinguish between the keys we have written and
- * the keys we haven't. So to do a lookup in a btree node, we have to search
- * each sorted set. But we do merge written sets together lazily, so the cost of
- * these extra searches is quite low (normally most of the keys in a btree node
- * will be in one big set, and then there'll be one or two sets that are much
- * smaller).
- *
- * This log structure makes bcache's btree more of a hybrid between a
- * conventional btree and a compacting data structure, with some of the
- * advantages of both.
- *
- * GARBAGE COLLECTION:
- *
- * We can't just invalidate any bucket - it might contain dirty data or
- * metadata. If it once contained dirty data, other writes might overwrite it
- * later, leaving no valid pointers into that bucket in the index.
- *
- * Thus, the primary purpose of garbage collection is to find buckets to reuse.
- * It also counts how much valid data each bucket currently contains, so that
- * allocation can reuse buckets sooner when they've been mostly overwritten.
- *
- * It also does some things that are really internal to the btree
- * implementation. If a btree node contains pointers that are stale by more than
- * some threshold, it rewrites the btree node to avoid the bucket's generation
- * wrapping around. It also merges adjacent btree nodes if they're empty enough.
- *
- * THE JOURNAL:
- *
- * Bcache's journal is not necessary for consistency; we always strictly
- * order metadata writes so that the btree and everything else is consistent on
- * disk in the event of an unclean shutdown, and in fact bcache had writeback
- * caching (with recovery from unclean shutdown) before journalling was
- * implemented.
- *
- * Rather, the journal is purely a performance optimization; we can't complete a
- * write until we've updated the index on disk, otherwise the cache would be
- * inconsistent in the event of an unclean shutdown. This means that without the
- * journal, on random write workloads we constantly have to update all the leaf
- * nodes in the btree, and those writes will be mostly empty (appending at most
- * a few keys each) - highly inefficient in terms of amount of metadata writes,
- * and it puts more strain on the various btree resorting/compacting code.
- *
- * The journal is just a log of keys we've inserted; on startup we just reinsert
- * all the keys in the open journal entries. That means that when we're updating
- * a node in the btree, we can wait until a 4k block of keys fills up before
- * writing them out.
- *
- * For simplicity, we only journal updates to leaf nodes; updates to parent
- * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
- * the complexity to deal with journalling them (in particular, journal replay)
- * - updates to non leaf nodes just happen synchronously (see btree_split()).
- */
-
-#undef pr_fmt
-#ifdef __KERNEL__
-#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
-#else
-#define pr_fmt(fmt) "%s() " fmt "\n", __func__
-#endif
-
-#include <linux/backing-dev-defs.h>
-#include <linux/bug.h>
-#include <linux/bio.h>
-#include <linux/closure.h>
-#include <linux/kobject.h>
-#include <linux/list.h>
-#include <linux/math64.h>
-#include <linux/mutex.h>
-#include <linux/percpu-refcount.h>
-#include <linux/percpu-rwsem.h>
-#include <linux/refcount.h>
-#include <linux/rhashtable.h>
-#include <linux/rwsem.h>
-#include <linux/semaphore.h>
-#include <linux/seqlock.h>
-#include <linux/shrinker.h>
-#include <linux/srcu.h>
-#include <linux/types.h>
-#include <linux/workqueue.h>
-#include <linux/zstd.h>
-#include <linux/unicode.h>
-
-#include "bcachefs_format.h"
-#include "btree_journal_iter_types.h"
-#include "disk_accounting_types.h"
-#include "errcode.h"
-#include "fifo.h"
-#include "nocow_locking_types.h"
-#include "opts.h"
-#include "recovery_passes_types.h"
-#include "sb-errors_types.h"
-#include "seqmutex.h"
-#include "time_stats.h"
-#include "util.h"
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define BCH_WRITE_REF_DEBUG
-#endif
-
-#ifndef dynamic_fault
-#define dynamic_fault(...) 0
-#endif
-
-#define race_fault(...) dynamic_fault("bcachefs:race")
-
-#define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name])
-
-#define trace_and_count(_c, _name, ...) \
-do { \
- count_event(_c, _name); \
- trace_##_name(__VA_ARGS__); \
-} while (0)
-
-#define bch2_fs_init_fault(name) \
- dynamic_fault("bcachefs:bch_fs_init:" name)
-#define bch2_meta_read_fault(name) \
- dynamic_fault("bcachefs:meta:read:" name)
-#define bch2_meta_write_fault(name) \
- dynamic_fault("bcachefs:meta:write:" name)
-
-#ifdef __KERNEL__
-#define BCACHEFS_LOG_PREFIX
-#endif
-
-#ifdef BCACHEFS_LOG_PREFIX
-
-#define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name)
-#define bch2_fmt_dev(_ca, fmt) "bcachefs (%s): " fmt "\n", ((_ca)->name)
-#define bch2_fmt_dev_offset(_ca, _offset, fmt) "bcachefs (%s sector %llu): " fmt "\n", ((_ca)->name), (_offset)
-#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
-#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \
- "bcachefs (%s inum %llu offset %llu): " fmt "\n", ((_c)->name), (_inum), (_offset)
-
-#else
-
-#define bch2_log_msg(_c, fmt) fmt
-#define bch2_fmt_dev(_ca, fmt) "%s: " fmt "\n", ((_ca)->name)
-#define bch2_fmt_dev_offset(_ca, _offset, fmt) "%s sector %llu: " fmt "\n", ((_ca)->name), (_offset)
-#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum)
-#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \
- "inum %llu offset %llu: " fmt "\n", (_inum), (_offset)
-
-#endif
-
-#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
-
-void bch2_print_str(struct bch_fs *, const char *);
-
-__printf(2, 3)
-void bch2_print_opts(struct bch_opts *, const char *, ...);
-
-__printf(2, 3)
-void __bch2_print(struct bch_fs *c, const char *fmt, ...);
-
-#define maybe_dev_to_fs(_c) _Generic((_c), \
- struct bch_dev *: ((struct bch_dev *) (_c))->fs, \
- struct bch_fs *: (_c))
-
-#define bch2_print(_c, ...) __bch2_print(maybe_dev_to_fs(_c), __VA_ARGS__)
-
-#define bch2_print_ratelimited(_c, ...) \
-do { \
- static DEFINE_RATELIMIT_STATE(_rs, \
- DEFAULT_RATELIMIT_INTERVAL, \
- DEFAULT_RATELIMIT_BURST); \
- \
- if (__ratelimit(&_rs)) \
- bch2_print(_c, __VA_ARGS__); \
-} while (0)
-
-#define bch_info(c, fmt, ...) \
- bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_info_ratelimited(c, fmt, ...) \
- bch2_print_ratelimited(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_notice(c, fmt, ...) \
- bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_warn(c, fmt, ...) \
- bch2_print(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_warn_ratelimited(c, fmt, ...) \
- bch2_print_ratelimited(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
-
-#define bch_err(c, fmt, ...) \
- bch2_print(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_err_dev(ca, fmt, ...) \
- bch2_print(c, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
-#define bch_err_dev_offset(ca, _offset, fmt, ...) \
- bch2_print(c, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
-#define bch_err_inum(c, _inum, fmt, ...) \
- bch2_print(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
-#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \
- bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
-
-#define bch_err_ratelimited(c, fmt, ...) \
- bch2_print_ratelimited(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_err_dev_ratelimited(ca, fmt, ...) \
- bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
-#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \
- bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
-#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
- bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
-#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \
- bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
-
-static inline bool should_print_err(int err)
-{
- return err && !bch2_err_matches(err, BCH_ERR_transaction_restart);
-}
-
-#define bch_err_fn(_c, _ret) \
-do { \
- if (should_print_err(_ret)) \
- bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
-} while (0)
-
-#define bch_err_fn_ratelimited(_c, _ret) \
-do { \
- if (should_print_err(_ret)) \
- bch_err_ratelimited(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
-} while (0)
-
-#define bch_err_msg(_c, _ret, _msg, ...) \
-do { \
- if (should_print_err(_ret)) \
- bch_err(_c, "%s(): error " _msg " %s", __func__, \
- ##__VA_ARGS__, bch2_err_str(_ret)); \
-} while (0)
-
-#define bch_verbose(c, fmt, ...) \
-do { \
- if ((c)->opts.verbose) \
- bch_info(c, fmt, ##__VA_ARGS__); \
-} while (0)
-
-#define bch_verbose_ratelimited(c, fmt, ...) \
-do { \
- if ((c)->opts.verbose) \
- bch_info_ratelimited(c, fmt, ##__VA_ARGS__); \
-} while (0)
-
-#define pr_verbose_init(opts, fmt, ...) \
-do { \
- if (opt_get(opts, verbose)) \
- pr_info(fmt, ##__VA_ARGS__); \
-} while (0)
-
-/* Parameters that are useful for debugging, but should always be compiled in: */
-#define BCH_DEBUG_PARAMS_ALWAYS() \
- BCH_DEBUG_PARAM(key_merging_disabled, \
- "Disables merging of extents") \
- BCH_DEBUG_PARAM(btree_node_merging_disabled, \
- "Disables merging of btree nodes") \
- BCH_DEBUG_PARAM(btree_gc_always_rewrite, \
- "Causes mark and sweep to compact and rewrite every " \
- "btree node it traverses") \
- BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \
- "Disables rewriting of btree nodes during mark and sweep")\
- BCH_DEBUG_PARAM(btree_shrinker_disabled, \
- "Disables the shrinker callback for the btree node cache")\
- BCH_DEBUG_PARAM(verify_btree_ondisk, \
- "Reread btree nodes at various points to verify the " \
- "mergesort in the read path against modifications " \
- "done in memory") \
- BCH_DEBUG_PARAM(verify_all_btree_replicas, \
- "When reading btree nodes, read all replicas and " \
- "compare them") \
- BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \
- "Don't use the write buffer for backpointers, enabling "\
- "extra runtime checks")
-
-/* Parameters that should only be compiled in debug mode: */
-#define BCH_DEBUG_PARAMS_DEBUG() \
- BCH_DEBUG_PARAM(expensive_debug_checks, \
- "Enables various runtime debugging checks that " \
- "significantly affect performance") \
- BCH_DEBUG_PARAM(debug_check_iterators, \
- "Enables extra verification for btree iterators") \
- BCH_DEBUG_PARAM(debug_check_btree_accounting, \
- "Verify btree accounting for keys within a node") \
- BCH_DEBUG_PARAM(journal_seq_verify, \
- "Store the journal sequence number in the version " \
- "number of every btree key, and verify that btree " \
- "update ordering is preserved during recovery") \
- BCH_DEBUG_PARAM(inject_invalid_keys, \
- "Store the journal sequence number in the version " \
- "number of every btree key, and verify that btree " \
- "update ordering is preserved during recovery") \
- BCH_DEBUG_PARAM(test_alloc_startup, \
- "Force allocator startup to use the slowpath where it" \
- "can't find enough free buckets without invalidating" \
- "cached data") \
- BCH_DEBUG_PARAM(force_reconstruct_read, \
- "Force reads to use the reconstruct path, when reading" \
- "from erasure coded extents") \
- BCH_DEBUG_PARAM(test_restart_gc, \
- "Test restarting mark and sweep gc when bucket gens change")
-
-#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL()
-#else
-#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
-#endif
-
-#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
-BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
-#ifndef CONFIG_BCACHEFS_DEBUG
-#define BCH_DEBUG_PARAM(name, description) static const __maybe_unused bool bch2_##name;
-BCH_DEBUG_PARAMS_DEBUG()
-#undef BCH_DEBUG_PARAM
-#endif
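The effect of the two expansions above: in debug builds each parameter is a real, writable bool; in non-debug builds the debug-only ones become const false, so any `if (bch2_foo)` branch is dead code the compiler can drop. A standalone sketch of the same trick, with illustrative names:

    #include <stdbool.h>
    #include <stdio.h>

    /* #define MYFS_DEBUG */
    #ifdef MYFS_DEBUG
    extern bool myfs_expensive_checks;              /* real flag, toggleable at runtime */
    #else
    static const bool myfs_expensive_checks;        /* always false: branch compiles away */
    #endif

    int main(void)
    {
            if (myfs_expensive_checks)
                    printf("running expensive checks\n");
            else
                    printf("checks elided\n");
            return 0;
    }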
-
-#define BCH_TIME_STATS() \
- x(btree_node_mem_alloc) \
- x(btree_node_split) \
- x(btree_node_compact) \
- x(btree_node_merge) \
- x(btree_node_sort) \
- x(btree_node_read) \
- x(btree_node_read_done) \
- x(btree_node_write) \
- x(btree_interior_update_foreground) \
- x(btree_interior_update_total) \
- x(btree_gc) \
- x(data_write) \
- x(data_read) \
- x(data_promote) \
- x(journal_flush_write) \
- x(journal_noflush_write) \
- x(journal_flush_seq) \
- x(blocked_journal_low_on_space) \
- x(blocked_journal_low_on_pin) \
- x(blocked_journal_max_in_flight) \
- x(blocked_journal_max_open) \
- x(blocked_key_cache_flush) \
- x(blocked_allocate) \
- x(blocked_allocate_open_bucket) \
- x(blocked_write_buffer_full) \
- x(nocow_lock_contended)
-
-enum bch_time_stats {
-#define x(name) BCH_TIME_##name,
- BCH_TIME_STATS()
-#undef x
- BCH_TIME_STAT_NR
-};
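BCH_TIME_STATS() is the x-macro pattern used throughout this header: one list, expanded several times with different definitions of x(). A standalone sketch showing how the same list can generate both an enum and a matching name table (the names here are illustrative):

    #include <stdio.h>

    #define STATS()                 \
            x(btree_node_split)     \
            x(btree_node_merge)     \
            x(btree_gc)

    enum stat {
    #define x(name) STAT_##name,
            STATS()
    #undef x
            STAT_NR
    };

    static const char * const stat_names[] = {
    #define x(name) #name,
            STATS()
    #undef x
    };

    int main(void)
    {
            for (int i = 0; i < STAT_NR; i++)
                    printf("%d: %s\n", i, stat_names[i]);
            return 0;
    }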
-
-#include "alloc_types.h"
-#include "btree_gc_types.h"
-#include "btree_types.h"
-#include "btree_node_scan_types.h"
-#include "btree_write_buffer_types.h"
-#include "buckets_types.h"
-#include "buckets_waiting_for_journal_types.h"
-#include "clock_types.h"
-#include "disk_groups_types.h"
-#include "ec_types.h"
-#include "journal_types.h"
-#include "keylist_types.h"
-#include "quota_types.h"
-#include "rebalance_types.h"
-#include "replicas_types.h"
-#include "sb-members_types.h"
-#include "subvolume_types.h"
-#include "super_types.h"
-#include "thread_with_file_types.h"
-
-/* Number of nodes btree coalesce will try to coalesce at once */
-#define GC_MERGE_NODES 4U
-
-/* Maximum number of nodes we might need to allocate atomically: */
-#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1))
-
-/* Size of the freelist we allocate btree nodes from: */
-#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4)
-
-#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX)
-
-struct btree;
-
-struct io_count {
- u64 sectors[2][BCH_DATA_NR];
-};
-
-struct discard_in_flight {
- bool in_progress:1;
- u64 bucket:63;
-};
-
-struct bch_dev {
- struct kobject kobj;
-#ifdef CONFIG_BCACHEFS_DEBUG
- atomic_long_t ref;
- bool dying;
- unsigned long last_put;
-#else
- struct percpu_ref ref;
-#endif
- struct completion ref_completion;
- struct percpu_ref io_ref;
- struct completion io_ref_completion;
-
- struct bch_fs *fs;
-
- u8 dev_idx;
- /*
- * Cached version of this device's member info from superblock
- * Committed by bch2_write_super() -> bch_fs_mi_update()
- */
- struct bch_member_cpu mi;
- atomic64_t errors[BCH_MEMBER_ERROR_NR];
- unsigned long write_errors_start;
-
- __uuid_t uuid;
- char name[BDEVNAME_SIZE];
-
- struct bch_sb_handle disk_sb;
- struct bch_sb *sb_read_scratch;
- int sb_write_error;
- dev_t dev;
- atomic_t flush_seq;
-
- struct bch_devs_mask self;
-
- /*
- * Buckets:
- * Per-bucket arrays are protected by either rcu_read_lock or
- * state_lock, for device resize.
- */
- GENRADIX(struct bucket) buckets_gc;
- struct bucket_gens __rcu *bucket_gens;
- u8 *oldest_gen;
- unsigned long *buckets_nouse;
-
- unsigned long *bucket_backpointer_mismatches;
- unsigned long *bucket_backpointer_empty;
-
- struct bch_dev_usage __percpu *usage;
-
- /* Allocator: */
- u64 alloc_cursor[3];
-
- unsigned nr_open_buckets;
- unsigned nr_partial_buckets;
- unsigned nr_btree_reserve;
-
- size_t inc_gen_needs_gc;
- size_t inc_gen_really_needs_gc;
- size_t buckets_waiting_on_journal;
-
- struct work_struct invalidate_work;
- struct work_struct discard_work;
- struct mutex discard_buckets_in_flight_lock;
- DARRAY(struct discard_in_flight) discard_buckets_in_flight;
- struct work_struct discard_fast_work;
-
- atomic64_t rebalance_work;
-
- struct journal_device journal;
- u64 prev_journal_sector;
-
- struct work_struct io_error_work;
-
- /* The rest of this all shows up in sysfs */
- atomic64_t cur_latency[2];
- struct bch2_time_stats_quantiles io_latency[2];
-
-#define CONGESTED_MAX 1024
- atomic_t congested;
- u64 congested_last;
-
- struct io_count __percpu *io_done;
-};
-
-/*
- * initial_gc_unfixed
- * error
- * topology error
- */
-
-#define BCH_FS_FLAGS() \
- x(new_fs) \
- x(started) \
- x(clean_recovery) \
- x(btree_running) \
- x(accounting_replay_done) \
- x(may_go_rw) \
- x(rw) \
- x(was_rw) \
- x(stopping) \
- x(emergency_ro) \
- x(going_ro) \
- x(write_disable_complete) \
- x(clean_shutdown) \
- x(recovery_running) \
- x(fsck_running) \
- x(initial_gc_unfixed) \
- x(need_delete_dead_snapshots) \
- x(error) \
- x(topology_error) \
- x(errors_fixed) \
- x(errors_not_fixed) \
- x(no_invalid_checks) \
- x(discard_mount_opt_set) \
-
-enum bch_fs_flags {
-#define x(n) BCH_FS_##n,
- BCH_FS_FLAGS()
-#undef x
-};
-
-struct btree_debug {
- unsigned id;
-};
-
-#define BCH_TRANSACTIONS_NR 128
-
-struct btree_transaction_stats {
- struct bch2_time_stats duration;
- struct bch2_time_stats lock_hold_times;
- struct mutex lock;
- unsigned nr_max_paths;
- unsigned journal_entries_size;
- unsigned max_mem;
- char *max_paths_text;
-};
-
-struct bch_fs_pcpu {
- u64 sectors_available;
-};
-
-struct journal_seq_blacklist_table {
- size_t nr;
- struct journal_seq_blacklist_table_entry {
- u64 start;
- u64 end;
- bool dirty;
- } entries[];
-};
-
-struct btree_trans_buf {
- struct btree_trans *trans;
-};
-
-#define BCACHEFS_ROOT_SUBVOL_INUM \
- ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
-
-#define BCH_WRITE_REFS() \
- x(journal) \
- x(trans) \
- x(write) \
- x(promote) \
- x(node_rewrite) \
- x(stripe_create) \
- x(stripe_delete) \
- x(reflink) \
- x(fallocate) \
- x(fsync) \
- x(dio_write) \
- x(discard) \
- x(discard_fast) \
- x(check_discard_freespace_key) \
- x(invalidate) \
- x(delete_dead_snapshots) \
- x(gc_gens) \
- x(snapshot_delete_pagecache) \
- x(sysfs) \
- x(btree_write_buffer) \
- x(btree_node_scrub)
-
-enum bch_write_ref {
-#define x(n) BCH_WRITE_REF_##n,
- BCH_WRITE_REFS()
-#undef x
- BCH_WRITE_REF_NR,
-};
-
-#define BCH_FS_DEFAULT_UTF8_ENCODING UNICODE_AGE(12, 1, 0)
-
-struct bch_fs {
- struct closure cl;
-
- struct list_head list;
- struct kobject kobj;
- struct kobject counters_kobj;
- struct kobject internal;
- struct kobject opts_dir;
- struct kobject time_stats;
- unsigned long flags;
-
- int minor;
- struct device *chardev;
- struct super_block *vfs_sb;
- dev_t dev;
- char name[40];
- struct stdio_redirect *stdio;
- struct task_struct *stdio_filter;
-
- /* ro/rw, add/remove/resize devices: */
- struct rw_semaphore state_lock;
-
- /* Counts outstanding writes, for clean transition to read-only */
-#ifdef BCH_WRITE_REF_DEBUG
- atomic_long_t writes[BCH_WRITE_REF_NR];
-#else
- struct percpu_ref writes;
-#endif
- /*
- * Certain operations are only allowed in single threaded mode, during
- * recovery, and we want to assert that this is the case:
- */
- struct task_struct *recovery_task;
-
- /*
- * Analogous to c->writes, for asynchronous ops that don't necessarily
- * need fs to be read-write
- */
- refcount_t ro_ref;
- wait_queue_head_t ro_ref_wait;
-
- struct work_struct read_only_work;
-
- struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
-
- struct bch_accounting_mem accounting;
-
- struct bch_replicas_cpu replicas;
- struct bch_replicas_cpu replicas_gc;
- struct mutex replicas_gc_lock;
-
- struct journal_entry_res btree_root_journal_res;
- struct journal_entry_res clock_journal_res;
-
- struct bch_disk_groups_cpu __rcu *disk_groups;
-
- struct bch_opts opts;
-
- /* Updated by bch2_sb_update():*/
- struct {
- __uuid_t uuid;
- __uuid_t user_uuid;
-
- u16 version;
- u16 version_incompat;
- u16 version_incompat_allowed;
- u16 version_min;
- u16 version_upgrade_complete;
-
- u8 nr_devices;
- u8 clean;
-
- u8 encryption_type;
-
- u64 time_base_lo;
- u32 time_base_hi;
- unsigned time_units_per_sec;
- unsigned nsec_per_time_unit;
- u64 features;
- u64 compat;
- unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)];
- u64 btrees_lost_data;
- } sb;
-
-#ifdef CONFIG_UNICODE
- struct unicode_map *cf_encoding;
-#endif
-
- struct bch_sb_handle disk_sb;
-
- unsigned short block_bits; /* ilog2(block_size) */
-
- u16 btree_foreground_merge_threshold;
-
- struct closure sb_write;
- struct mutex sb_lock;
-
- /* snapshot.c: */
- struct snapshot_table __rcu *snapshots;
- struct mutex snapshot_table_lock;
- struct rw_semaphore snapshot_create_lock;
-
- struct work_struct snapshot_delete_work;
- struct work_struct snapshot_wait_for_pagecache_and_delete_work;
- snapshot_id_list snapshots_unlinked;
- struct mutex snapshots_unlinked_lock;
-
- /* BTREE CACHE */
- struct bio_set btree_bio;
- struct workqueue_struct *btree_read_complete_wq;
- struct workqueue_struct *btree_write_submit_wq;
-
- struct btree_root btree_roots_known[BTREE_ID_NR];
- DARRAY(struct btree_root) btree_roots_extra;
- struct mutex btree_root_lock;
-
- struct btree_cache btree_cache;
-
- /*
- * Cache of allocated btree nodes - if we allocate a btree node and
- * don't use it, then when we free it that space can't be reused until
- * going _all_ the way through the allocator (which exposes us to a
- * livelock when allocating btree reserves fails halfway through) -
- * instead, we can stick them here:
- */
- struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2];
- unsigned btree_reserve_cache_nr;
- struct mutex btree_reserve_cache_lock;
-
- mempool_t btree_interior_update_pool;
- struct list_head btree_interior_update_list;
- struct list_head btree_interior_updates_unwritten;
- struct mutex btree_interior_update_lock;
- struct closure_waitlist btree_interior_update_wait;
-
- struct workqueue_struct *btree_interior_update_worker;
- struct work_struct btree_interior_update_work;
-
- struct workqueue_struct *btree_node_rewrite_worker;
- struct list_head btree_node_rewrites;
- struct list_head btree_node_rewrites_pending;
- spinlock_t btree_node_rewrites_lock;
- struct closure_waitlist btree_node_rewrites_wait;
-
- /* btree_io.c: */
- spinlock_t btree_write_error_lock;
- struct btree_write_stats {
- atomic64_t nr;
- atomic64_t bytes;
- } btree_write_stats[BTREE_WRITE_TYPE_NR];
-
- /* btree_iter.c: */
- struct seqmutex btree_trans_lock;
- struct list_head btree_trans_list;
- mempool_t btree_trans_pool;
- mempool_t btree_trans_mem_pool;
- struct btree_trans_buf __percpu *btree_trans_bufs;
-
- struct srcu_struct btree_trans_barrier;
- bool btree_trans_barrier_initialized;
-
- struct btree_key_cache btree_key_cache;
- unsigned btree_key_cache_btrees;
-
- struct btree_write_buffer btree_write_buffer;
-
- struct workqueue_struct *btree_update_wq;
- struct workqueue_struct *btree_io_complete_wq;
- /* copygc needs its own workqueue for index updates.. */
- struct workqueue_struct *copygc_wq;
- /*
- * Use a dedicated wq for write ref holder tasks. Required to avoid
- * dependency problems with other wq tasks that can block on ref
- * draining, such as read-only transition.
- */
- struct workqueue_struct *write_ref_wq;
-
- /* ALLOCATION */
- struct bch_devs_mask rw_devs[BCH_DATA_NR];
- unsigned long rw_devs_change_count;
-
- u64 capacity; /* sectors */
- u64 reserved; /* sectors */
-
- /*
- * When capacity _decreases_ (due to a disk being removed), we
- * increment capacity_gen - this invalidates outstanding reservations
- * and forces them to be revalidated
- */
- u32 capacity_gen;
- unsigned bucket_size_max;
-
- atomic64_t sectors_available;
- struct mutex sectors_available_lock;
-
- struct bch_fs_pcpu __percpu *pcpu;
-
- struct percpu_rw_semaphore mark_lock;
-
- seqcount_t usage_lock;
- struct bch_fs_usage_base __percpu *usage;
- u64 __percpu *online_reserved;
-
- unsigned long allocator_last_stuck;
-
- struct io_clock io_clock[2];
-
- /* JOURNAL SEQ BLACKLIST */
- struct journal_seq_blacklist_table *
- journal_seq_blacklist_table;
-
- /* ALLOCATOR */
- spinlock_t freelist_lock;
- struct closure_waitlist freelist_wait;
-
- open_bucket_idx_t open_buckets_freelist;
- open_bucket_idx_t open_buckets_nr_free;
- struct closure_waitlist open_buckets_wait;
- struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
- open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT];
-
- open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT];
- open_bucket_idx_t open_buckets_partial_nr;
-
- struct write_point btree_write_point;
- struct write_point rebalance_write_point;
-
- struct write_point write_points[WRITE_POINT_MAX];
- struct hlist_head write_points_hash[WRITE_POINT_HASH_NR];
- struct mutex write_points_hash_lock;
- unsigned write_points_nr;
-
- struct buckets_waiting_for_journal buckets_waiting_for_journal;
-
- /* GARBAGE COLLECTION */
- struct work_struct gc_gens_work;
- unsigned long gc_count;
-
- enum btree_id gc_gens_btree;
- struct bpos gc_gens_pos;
-
- /*
- * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
- * has been marked by GC.
- *
- * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.)
- *
- * Protected by gc_pos_lock. Only written to by GC thread, so GC thread
- * can read without a lock.
- */
- seqcount_t gc_pos_lock;
- struct gc_pos gc_pos;
-
- /*
- * The allocation code needs gc_mark in struct bucket to be correct, but
- * it's not while a gc is in progress.
- */
- struct rw_semaphore gc_lock;
- struct mutex gc_gens_lock;
-
- /* IO PATH */
- struct semaphore io_in_flight;
- struct bio_set bio_read;
- struct bio_set bio_read_split;
- struct bio_set bio_write;
- struct bio_set replica_set;
- struct mutex bio_bounce_pages_lock;
- mempool_t bio_bounce_pages;
- struct bucket_nocow_lock_table
- nocow_locks;
- struct rhashtable promote_table;
-
- mempool_t compression_bounce[2];
- mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR];
- size_t zstd_workspace_size;
-
- struct crypto_sync_skcipher *chacha20;
- struct crypto_shash *poly1305;
-
- atomic64_t key_version;
-
- mempool_t large_bkey_pool;
-
- /* MOVE.C */
- struct list_head moving_context_list;
- struct mutex moving_context_lock;
-
- /* REBALANCE */
- struct bch_fs_rebalance rebalance;
-
- /* COPYGC */
- struct task_struct *copygc_thread;
- struct write_point copygc_write_point;
- s64 copygc_wait_at;
- s64 copygc_wait;
- bool copygc_running;
- wait_queue_head_t copygc_running_wq;
-
- /* STRIPES: */
- GENRADIX(struct gc_stripe) gc_stripes;
-
- struct hlist_head ec_stripes_new[32];
- spinlock_t ec_stripes_new_lock;
-
- /* ERASURE CODING */
- struct list_head ec_stripe_head_list;
- struct mutex ec_stripe_head_lock;
-
- struct list_head ec_stripe_new_list;
- struct mutex ec_stripe_new_lock;
- wait_queue_head_t ec_stripe_new_wait;
-
- struct work_struct ec_stripe_create_work;
- u64 ec_stripe_hint;
-
- struct work_struct ec_stripe_delete_work;
-
- struct bio_set ec_bioset;
-
- /* REFLINK */
- reflink_gc_table reflink_gc_table;
- size_t reflink_gc_nr;
-
- /* fs.c */
- struct list_head vfs_inodes_list;
- struct mutex vfs_inodes_lock;
- struct rhashtable vfs_inodes_table;
- struct rhltable vfs_inodes_by_inum_table;
-
- /* VFS IO PATH - fs-io.c */
- struct bio_set writepage_bioset;
- struct bio_set dio_write_bioset;
- struct bio_set dio_read_bioset;
- struct bio_set nocow_flush_bioset;
-
- /* QUOTAS */
- struct bch_memquota_type quotas[QTYP_NR];
-
- /* RECOVERY */
- u64 journal_replay_seq_start;
- u64 journal_replay_seq_end;
- /*
- * Two different uses:
- * "Has this fsck pass?" - i.e. should this type of error be an
- * emergency read-only
- * And, in certain situations fsck will rewind to an earlier pass: used
- * for signaling to the toplevel code which pass we want to run now.
- */
- enum bch_recovery_pass curr_recovery_pass;
- enum bch_recovery_pass next_recovery_pass;
- /* bitmask of recovery passes that we actually ran */
- u64 recovery_passes_complete;
- /* never rewinds version of curr_recovery_pass */
- enum bch_recovery_pass recovery_pass_done;
- spinlock_t recovery_pass_lock;
- struct semaphore online_fsck_mutex;
-
- /* DEBUG JUNK */
- struct dentry *fs_debug_dir;
- struct dentry *btree_debug_dir;
- struct btree_debug btree_debug[BTREE_ID_NR];
- struct btree *verify_data;
- struct btree_node *verify_ondisk;
- struct mutex verify_lock;
-
- /*
- * A btree node on disk could have too many bsets for an iterator to fit
- * on the stack - have to dynamically allocate them
- */
- mempool_t fill_iter;
-
- mempool_t btree_bounce_pool;
-
- struct journal journal;
- GENRADIX(struct journal_replay *) journal_entries;
- u64 journal_entries_base_seq;
- struct journal_keys journal_keys;
- struct list_head journal_iters;
-
- struct find_btree_nodes found_btree_nodes;
-
- u64 last_bucket_seq_cleanup;
-
- u64 counters_on_mount[BCH_COUNTER_NR];
- u64 __percpu *counters;
-
- struct bch2_time_stats times[BCH_TIME_STAT_NR];
-
- struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
-
- /* ERRORS */
- struct list_head fsck_error_msgs;
- struct mutex fsck_error_msgs_lock;
- bool fsck_alloc_msgs_err;
-
- bch_sb_errors_cpu fsck_error_counts;
- struct mutex fsck_error_counts_lock;
-};
-
-extern struct wait_queue_head bch2_read_only_wait;
-
-static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
-{
-#ifdef BCH_WRITE_REF_DEBUG
- atomic_long_inc(&c->writes[ref]);
-#else
- percpu_ref_get(&c->writes);
-#endif
-}
-
-static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
-{
-#ifdef BCH_WRITE_REF_DEBUG
- return !test_bit(BCH_FS_going_ro, &c->flags) &&
- atomic_long_inc_not_zero(&c->writes[ref]);
-#else
- return percpu_ref_tryget(&c->writes);
-#endif
-}
-
-static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
-{
-#ifdef BCH_WRITE_REF_DEBUG
- return !test_bit(BCH_FS_going_ro, &c->flags) &&
- atomic_long_inc_not_zero(&c->writes[ref]);
-#else
- return percpu_ref_tryget_live(&c->writes);
-#endif
-}
-
-static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref)
-{
-#ifdef BCH_WRITE_REF_DEBUG
- long v = atomic_long_dec_return(&c->writes[ref]);
-
- BUG_ON(v < 0);
- if (v)
- return;
- for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
- if (atomic_long_read(&c->writes[i]))
- return;
-
- set_bit(BCH_FS_write_disable_complete, &c->flags);
- wake_up(&bch2_read_only_wait);
-#else
- percpu_ref_put(&c->writes);
-#endif
-}
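A hypothetical caller, sketching how the write-ref helpers above bracket a write (illustrative only, not from the source): tryget fails once the filesystem is going read-only, and the last put wakes the read-only transition.

    /* Hypothetical call site -- illustrative only: */
    static int do_some_write(struct bch_fs *c)
    {
            if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_write))
                    return -EROFS;          /* going read-only: refuse new writes */

            /* ... perform the write ... */

            bch2_write_ref_put(c, BCH_WRITE_REF_write);
            return 0;
    }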
-
-static inline bool bch2_ro_ref_tryget(struct bch_fs *c)
-{
- if (test_bit(BCH_FS_stopping, &c->flags))
- return false;
-
- return refcount_inc_not_zero(&c->ro_ref);
-}
-
-static inline void bch2_ro_ref_put(struct bch_fs *c)
-{
- if (refcount_dec_and_test(&c->ro_ref))
- wake_up(&c->ro_ref_wait);
-}
-
-static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
-{
-#ifndef NO_BCACHEFS_FS
- if (c->vfs_sb)
- c->vfs_sb->s_bdi->ra_pages = ra_pages;
-#endif
-}
-
-static inline unsigned bucket_bytes(const struct bch_dev *ca)
-{
- return ca->mi.bucket_size << 9;
-}
-
-static inline unsigned block_bytes(const struct bch_fs *c)
-{
- return c->opts.block_size;
-}
-
-static inline unsigned block_sectors(const struct bch_fs *c)
-{
- return c->opts.block_size >> 9;
-}
-
-static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
-{
- return c->btree_key_cache_btrees & (1U << btree);
-}
-
-static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
-{
- struct timespec64 t;
- s64 sec;
- s32 rem;
-
- time += c->sb.time_base_lo;
-
- sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
-
- set_normalized_timespec64(&t, sec, rem * (s64)c->sb.nsec_per_time_unit);
-
- return t;
-}
-
-static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts)
-{
- return (ts.tv_sec * c->sb.time_units_per_sec +
- (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo;
-}
-
-static inline s64 bch2_current_time(const struct bch_fs *c)
-{
- struct timespec64 now;
-
- ktime_get_coarse_real_ts64(&now);
- return timespec_to_bch2_time(c, now);
-}
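The conversions above shift by the per-filesystem time base, then split into seconds and sub-second units. A standalone worked example of the same arithmetic, assuming (purely for illustration) one time unit per nanosecond:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* assume 1 time unit == 1 nanosecond: */
            uint64_t time_base_lo       = 1700000000ULL * 1000000000ULL;
            uint64_t time_units_per_sec = 1000000000ULL;
            uint64_t nsec_per_time_unit = 1;

            int64_t t = 5 * 1000000000LL + 250;     /* 5s + 250ns after the fs epoch */
            uint64_t abs_time = time_base_lo + (uint64_t) t;

            printf("tv_sec=%llu tv_nsec=%llu\n",
                   (unsigned long long) (abs_time / time_units_per_sec),
                   (unsigned long long) ((abs_time % time_units_per_sec) *
                                         nsec_per_time_unit));
            return 0;
    }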
-
-static inline u64 bch2_current_io_time(const struct bch_fs *c, int rw)
-{
- return max(1ULL, (u64) atomic64_read(&c->io_clock[rw].now) & LRU_TIME_MAX);
-}
-
-static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
-{
- struct stdio_redirect *stdio = c->stdio;
-
- if (c->stdio_filter && c->stdio_filter != current)
- stdio = NULL;
- return stdio;
-}
-
-static inline unsigned metadata_replicas_required(struct bch_fs *c)
-{
- return min(c->opts.metadata_replicas,
- c->opts.metadata_replicas_required);
-}
-
-static inline unsigned data_replicas_required(struct bch_fs *c)
-{
- return min(c->opts.data_replicas,
- c->opts.data_replicas_required);
-}
-
-#define BKEY_PADDED_ONSTACK(key, pad) \
- struct { struct bkey_i key; __u64 key ## _pad[pad]; }
-
-#endif /* _BCACHEFS_H */
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
deleted file mode 100644
index e96d87767020..000000000000
--- a/fs/bcachefs/bcachefs_format.h
+++ /dev/null
@@ -1,1517 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FORMAT_H
-#define _BCACHEFS_FORMAT_H
-
-/*
- * bcachefs on disk data structures
- *
- * OVERVIEW:
- *
- * There are three main types of on disk data structures in bcachefs (this is
- * reduced from 5 in bcache)
- *
- * - superblock
- * - journal
- * - btree
- *
- * The btree is the primary structure; most metadata exists as keys in the
- * various btrees. There are only a small number of btrees, they're not
- * sharded - we have one btree for extents, another for inodes, et cetera.
- *
- * SUPERBLOCK:
- *
- * The superblock contains the location of the journal, the list of devices in
- * the filesystem, and in general any metadata we need in order to decide
- * whether we can start a filesystem, or that we need prior to reading the
- * journal/btree roots.
- *
- * The superblock is extensible, and most of the contents of the superblock are
- * in variable length, type tagged fields; see struct bch_sb_field.
- *
- * Backup superblocks do not reside in a fixed location; also, superblocks do
- * not have a fixed size. To locate backup superblocks we have struct
- * bch_sb_layout; we store a copy of this inside every superblock, and also
- * before the first superblock.
- *
- * JOURNAL:
- *
- * The journal primarily records btree updates in the order they occurred;
- * journal replay consists of just iterating over all the keys in the open
- * journal entries and re-inserting them into the btrees.
- *
- * The journal also contains entry types for the btree roots, and blacklisted
- * journal sequence numbers (see journal_seq_blacklist.c).
- *
- * BTREE:
- *
- * bcachefs btrees are copy on write b+ trees, where nodes are big (typically
- * 128k-256k) and log structured. We use struct btree_node for writing the first
- * entry in a given node (offset 0), and struct btree_node_entry for all
- * subsequent writes.
- *
- * After the header, btree node entries contain a list of keys in sorted order.
- * Values are stored inline with the keys; since values are variable length (and
- * keys effectively are variable length too, due to packing) we can't do random
- * access without building up additional in memory tables in the btree node read
- * path.
- *
- * BTREE KEYS (struct bkey):
- *
- * The various btrees share a common format for the key - so as to avoid
- * switching in fastpath lookup/comparison code - but define their own
- * structures for the key values.
- *
- * The size of a key/value pair is stored as a u8 in units of u64s, so the max
- * size is just under 2k. The common part also contains a type tag for the
- * value, and a format field indicating whether the key is packed or not (and
- * also meant to allow adding new key fields in the future, if desired).
- *
- * bkeys, when stored within a btree node, may also be packed. In that case, the
- * bkey_format in that node is used to unpack it. Packed bkeys mean that we can
- * be generous with field sizes in the common part of the key format (64 bit
- * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
- */
-
-#include <asm/types.h>
-#include <asm/byteorder.h>
-#include <linux/kernel.h>
-#include <linux/uuid.h>
-#include <uapi/linux/magic.h>
-#include "vstructs.h"
-
-#ifdef __KERNEL__
-typedef uuid_t __uuid_t;
-#endif
-
-#define BITMASK(name, type, field, offset, end) \
-static const __maybe_unused unsigned name##_OFFSET = offset; \
-static const __maybe_unused unsigned name##_BITS = (end - offset); \
- \
-static inline __u64 name(const type *k) \
-{ \
- return (k->field >> offset) & ~(~0ULL << (end - offset)); \
-} \
- \
-static inline void SET_##name(type *k, __u64 v) \
-{ \
- k->field &= ~(~(~0ULL << (end - offset)) << offset); \
- k->field |= (v & ~(~0ULL << (end - offset))) << offset; \
-}
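What BITMASK() expands to for one concrete case, a 4-bit field occupying bits 2..6 of a flags word, written out by hand as a standalone sketch with illustrative names:

    #include <stdio.h>

    struct thing { unsigned long long flags; };

    static unsigned long long THING_CSUM(const struct thing *k)
    {
            return (k->flags >> 2) & ~(~0ULL << (6 - 2));
    }

    static void SET_THING_CSUM(struct thing *k, unsigned long long v)
    {
            k->flags &= ~(~(~0ULL << (6 - 2)) << 2);
            k->flags |= (v & ~(~0ULL << (6 - 2))) << 2;
    }

    int main(void)
    {
            struct thing t = { 0 };

            SET_THING_CSUM(&t, 0xb);
            printf("flags=%#llx field=%llu\n", t.flags, THING_CSUM(&t));  /* 0x2c, 11 */
            return 0;
    }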
-
-#define LE_BITMASK(_bits, name, type, field, offset, end) \
-static const __maybe_unused unsigned name##_OFFSET = offset; \
-static const __maybe_unused unsigned name##_BITS = (end - offset); \
-static const __maybe_unused __u##_bits name##_MAX = (1ULL << (end - offset)) - 1;\
- \
-static inline __u64 name(const type *k) \
-{ \
- return (__le##_bits##_to_cpu(k->field) >> offset) & \
- ~(~0ULL << (end - offset)); \
-} \
- \
-static inline void SET_##name(type *k, __u64 v) \
-{ \
- __u##_bits new = __le##_bits##_to_cpu(k->field); \
- \
- new &= ~(~(~0ULL << (end - offset)) << offset); \
- new |= (v & ~(~0ULL << (end - offset))) << offset; \
- k->field = __cpu_to_le##_bits(new); \
-}
-
-#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e)
-#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e)
-#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e)
-
-struct bkey_format {
- __u8 key_u64s;
- __u8 nr_fields;
- /* One unused slot for now: */
- __u8 bits_per_field[6];
- __le64 field_offset[6];
-};
-
-/* Btree keys - all units are in sectors */
-
-struct bpos {
- /*
- * Word order matches machine byte order - btree code treats a bpos as a
- * single large integer, for search/comparison purposes
- *
- * Note that wherever a bpos is embedded in another on disk data
- * structure, it has to be byte swabbed when reading in metadata that
- * wasn't written in native endian order:
- */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- __u32 snapshot;
- __u64 offset;
- __u64 inode;
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- __u64 inode;
- __u64 offset; /* Points to end of extent - sectors */
- __u32 snapshot;
-#else
-#error edit for your odd byteorder.
-#endif
-} __packed
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-__aligned(4)
-#endif
-;
-
-#define KEY_INODE_MAX ((__u64)~0ULL)
-#define KEY_OFFSET_MAX ((__u64)~0ULL)
-#define KEY_SNAPSHOT_MAX ((__u32)~0U)
-#define KEY_SIZE_MAX ((__u32)~0U)
-
-static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
-{
- return (struct bpos) {
- .inode = inode,
- .offset = offset,
- .snapshot = snapshot,
- };
-}
-
-#define POS_MIN SPOS(0, 0, 0)
-#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0)
-#define SPOS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
-#define POS(_inode, _offset) SPOS(_inode, _offset, 0)
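Illustrative construction of positions using the helpers above; a key in the extents btree, for instance, is indexed by (inode, offset, snapshot):

    /* e.g., inside some function: */
    struct bpos a = SPOS(4096, 128, 1);     /* inode 4096, offset 128 sectors, snapshot 1 */
    struct bpos b = POS(4096, 128);         /* same inode/offset, snapshot 0 */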
-
-/* Empty placeholder struct, for container_of() */
-struct bch_val {
- __u64 __nothing[0];
-};
-
-struct bversion {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- __u64 lo;
- __u32 hi;
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- __u32 hi;
- __u64 lo;
-#endif
-} __packed
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-__aligned(4)
-#endif
-;
-
-struct bkey {
- /* Size of combined key and value, in u64s */
- __u8 u64s;
-
- /* Format of key (0 for format local to btree node) */
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u8 format:7,
- needs_whiteout:1;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u8 needs_whiteout:1,
- format:7;
-#else
-#error edit for your odd byteorder.
-#endif
-
- /* Type of the value */
- __u8 type;
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- __u8 pad[1];
-
- struct bversion bversion;
- __u32 size; /* extent size, in sectors */
- struct bpos p;
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- struct bpos p;
- __u32 size; /* extent size, in sectors */
- struct bversion bversion;
-
- __u8 pad[1];
-#endif
-} __packed
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-/*
- * The big-endian version of bkey can't be compiled by rustc with the "aligned"
- * attr since it doesn't allow types to have both "packed" and "aligned" attrs.
- * So for Rust compatibility, don't include this. It can be included in the LE
- * version because the "packed" attr is redundant in that case.
- *
- * History: (quoting Kent)
- *
- * Specifically, when I was designing bkey, I wanted the header to be no
- * bigger than necessary so that bkey_packed could use the rest. That means that
- * decently often extent keys will fit into only 8 bytes, instead of spilling over
- * to 16.
- *
- * But packed_bkey treats the part after the header - the packed section -
- * as a single multi word, variable length integer. And bkey, the unpacked
- * version, is just a special case version of a bkey_packed; all the packed
- * bkey code will work on keys in any packed format, the in-memory
- * representation of an unpacked key also is just one type of packed key...
- *
- * So that constrains the key part of a big endian bkey to start right
- * after the header.
- *
- * If we ever do a bkey_v2 and need to expand the header by another byte for
- * some reason - that will clean up this wart.
- */
-__aligned(8)
-#endif
-;
-
-struct bkey_packed {
- __u64 _data[0];
-
- /* Size of combined key and value, in u64s */
- __u8 u64s;
-
- /* Format of key (0 for format local to btree node) */
-
- /*
- * XXX: next incompat on disk format change, switch format and
- * needs_whiteout - bkey_packed() will be cheaper if format is the high
- * bits of the bitfield
- */
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u8 format:7,
- needs_whiteout:1;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u8 needs_whiteout:1,
- format:7;
-#endif
-
- /* Type of the value */
- __u8 type;
- __u8 key_start[0];
-
- /*
- * We copy bkeys with struct assignment in various places, and while
- * that shouldn't be done with packed bkeys we can't disallow it in C,
- * and it's legal to cast a bkey to a bkey_packed - so padding it out
- * to the same size as struct bkey should hopefully be safest.
- */
- __u8 pad[sizeof(struct bkey) - 3];
-} __packed __aligned(8);
-
-typedef struct {
- __le64 lo;
- __le64 hi;
-} bch_le128;
-
-#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64))
-#define BKEY_U64s_MAX U8_MAX
-#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s)
-
-#define KEY_PACKED_BITS_START 24
-
-#define KEY_FORMAT_LOCAL_BTREE 0
-#define KEY_FORMAT_CURRENT 1
-
-enum bch_bkey_fields {
- BKEY_FIELD_INODE,
- BKEY_FIELD_OFFSET,
- BKEY_FIELD_SNAPSHOT,
- BKEY_FIELD_SIZE,
- BKEY_FIELD_VERSION_HI,
- BKEY_FIELD_VERSION_LO,
- BKEY_NR_FIELDS,
-};
-
-#define bkey_format_field(name, field) \
- [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8)
-
-#define BKEY_FORMAT_CURRENT \
-((struct bkey_format) { \
- .key_u64s = BKEY_U64s, \
- .nr_fields = BKEY_NR_FIELDS, \
- .bits_per_field = { \
- bkey_format_field(INODE, p.inode), \
- bkey_format_field(OFFSET, p.offset), \
- bkey_format_field(SNAPSHOT, p.snapshot), \
- bkey_format_field(SIZE, size), \
- bkey_format_field(VERSION_HI, bversion.hi), \
- bkey_format_field(VERSION_LO, bversion.lo), \
- }, \
-})
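bkey_format_field() measures a field's width in bits by applying sizeof through a NULL struct pointer, which is evaluated at compile time and never dereferenced. A standalone sketch of the trick, with an illustrative struct:

    #include <stdio.h>
    #include <stdint.h>

    struct k { uint64_t inode; uint32_t size; };

    #define field_bits(f) (sizeof(((struct k *) NULL)->f) * 8)

    int main(void)
    {
            printf("inode: %zu bits, size: %zu bits\n",
                   field_bits(inode), field_bits(size));    /* 64 and 32 */
            return 0;
    }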
-
-/* bkey with inline value */
-struct bkey_i {
- __u64 _data[0];
-
- struct bkey k;
- struct bch_val v;
-};
-
-#define POS_KEY(_pos) \
-((struct bkey) { \
- .u64s = BKEY_U64s, \
- .format = KEY_FORMAT_CURRENT, \
- .p = _pos, \
-})
-
-#define KEY(_inode, _offset, _size) \
-((struct bkey) { \
- .u64s = BKEY_U64s, \
- .format = KEY_FORMAT_CURRENT, \
- .p = POS(_inode, _offset), \
- .size = _size, \
-})
-
-static inline void bkey_init(struct bkey *k)
-{
- *k = KEY(0, 0, 0);
-}
-
-#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64))
-
-#define __BKEY_PADDED(key, pad) \
- struct bkey_i key; __u64 key ## _pad[pad]
-
-/*
- * - DELETED keys are used internally to mark keys that should be ignored but
- * override keys in composition order. Their version number is ignored.
- *
- * - DISCARDED keys indicate that the data is all 0s because it has been
- * discarded. DISCARDs may have a version; if the version is nonzero the key
- * will be persistent, otherwise the key will be dropped whenever the btree
- * node is rewritten (like DELETED keys).
- *
- * - ERROR: any read of the data returns a read error, as the data was lost due
- * to a failing device. Like DISCARDED keys, they can be removed (overridden)
- * by new writes or cluster-wide GC. Node repair can also overwrite them with
- * the same or a more recent version number, but not with an older version
- * number.
- *
- * - WHITEOUT: for hash table btrees
- */
-#define BCH_BKEY_TYPES() \
- x(deleted, 0) \
- x(whiteout, 1) \
- x(error, 2) \
- x(cookie, 3) \
- x(hash_whiteout, 4) \
- x(btree_ptr, 5) \
- x(extent, 6) \
- x(reservation, 7) \
- x(inode, 8) \
- x(inode_generation, 9) \
- x(dirent, 10) \
- x(xattr, 11) \
- x(alloc, 12) \
- x(quota, 13) \
- x(stripe, 14) \
- x(reflink_p, 15) \
- x(reflink_v, 16) \
- x(inline_data, 17) \
- x(btree_ptr_v2, 18) \
- x(indirect_inline_data, 19) \
- x(alloc_v2, 20) \
- x(subvolume, 21) \
- x(snapshot, 22) \
- x(inode_v2, 23) \
- x(alloc_v3, 24) \
- x(set, 25) \
- x(lru, 26) \
- x(alloc_v4, 27) \
- x(backpointer, 28) \
- x(inode_v3, 29) \
- x(bucket_gens, 30) \
- x(snapshot_tree, 31) \
- x(logged_op_truncate, 32) \
- x(logged_op_finsert, 33) \
- x(accounting, 34) \
- x(inode_alloc_cursor, 35)
-
-enum bch_bkey_type {
-#define x(name, nr) KEY_TYPE_##name = nr,
- BCH_BKEY_TYPES()
-#undef x
- KEY_TYPE_MAX,
-};
-
-struct bch_deleted {
- struct bch_val v;
-};
-
-struct bch_whiteout {
- struct bch_val v;
-};
-
-struct bch_error {
- struct bch_val v;
-};
-
-struct bch_cookie {
- struct bch_val v;
- __le64 cookie;
-};
-
-struct bch_hash_whiteout {
- struct bch_val v;
-};
-
-struct bch_set {
- struct bch_val v;
-};
-
-/* 128 bits, sufficient for cryptographic MACs: */
-struct bch_csum {
- __le64 lo;
- __le64 hi;
-} __packed __aligned(8);
-
-struct bch_backpointer {
- struct bch_val v;
- __u8 btree_id;
- __u8 level;
- __u8 data_type;
- __u8 bucket_gen;
- __u32 pad;
- __u32 bucket_len;
- struct bpos pos;
-} __packed __aligned(8);
-
-/* Optional/variable size superblock sections: */
-
-struct bch_sb_field {
- __u64 _data[0];
- __le32 u64s;
- __le32 type;
-};
-
-#define BCH_SB_FIELDS() \
- x(journal, 0) \
- x(members_v1, 1) \
- x(crypt, 2) \
- x(replicas_v0, 3) \
- x(quota, 4) \
- x(disk_groups, 5) \
- x(clean, 6) \
- x(replicas, 7) \
- x(journal_seq_blacklist, 8) \
- x(journal_v2, 9) \
- x(counters, 10) \
- x(members_v2, 11) \
- x(errors, 12) \
- x(ext, 13) \
- x(downgrade, 14)
-
-#include "alloc_background_format.h"
-#include "dirent_format.h"
-#include "disk_accounting_format.h"
-#include "disk_groups_format.h"
-#include "extents_format.h"
-#include "ec_format.h"
-#include "inode_format.h"
-#include "journal_seq_blacklist_format.h"
-#include "logged_ops_format.h"
-#include "lru_format.h"
-#include "quota_format.h"
-#include "reflink_format.h"
-#include "replicas_format.h"
-#include "snapshot_format.h"
-#include "subvolume_format.h"
-#include "sb-counters_format.h"
-#include "sb-downgrade_format.h"
-#include "sb-errors_format.h"
-#include "sb-members_format.h"
-#include "xattr_format.h"
-
-enum bch_sb_field_type {
-#define x(f, nr) BCH_SB_FIELD_##f = nr,
- BCH_SB_FIELDS()
-#undef x
- BCH_SB_FIELD_NR
-};
-
-/*
- * Most superblock fields are replicated in all devices' superblocks - a few are
- * not:
- */
-#define BCH_SINGLE_DEVICE_SB_FIELDS \
- ((1U << BCH_SB_FIELD_journal)| \
- (1U << BCH_SB_FIELD_journal_v2))
-
-/* BCH_SB_FIELD_journal: */
-
-struct bch_sb_field_journal {
- struct bch_sb_field field;
- __le64 buckets[];
-};
-
-struct bch_sb_field_journal_v2 {
- struct bch_sb_field field;
-
- struct bch_sb_field_journal_v2_entry {
- __le64 start;
- __le64 nr;
- } d[];
-};
-
-/* BCH_SB_FIELD_crypt: */
-
-struct nonce {
- __le32 d[4];
-};
-
-struct bch_key {
- __le64 key[4];
-};
-
-#define BCH_KEY_MAGIC \
- (((__u64) 'b' << 0)|((__u64) 'c' << 8)| \
- ((__u64) 'h' << 16)|((__u64) '*' << 24)| \
- ((__u64) '*' << 32)|((__u64) 'k' << 40)| \
- ((__u64) 'e' << 48)|((__u64) 'y' << 56))
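The shifts above assemble the ASCII bytes "bch**key" into a little-endian u64. A standalone check of that claim (it assumes a little-endian host, as noted in the comment):

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t magic =
                    ((uint64_t) 'b' << 0)  | ((uint64_t) 'c' << 8)  |
                    ((uint64_t) 'h' << 16) | ((uint64_t) '*' << 24) |
                    ((uint64_t) '*' << 32) | ((uint64_t) 'k' << 40) |
                    ((uint64_t) 'e' << 48) | ((uint64_t) 'y' << 56);
            char buf[9] = { 0 };

            memcpy(buf, &magic, 8);         /* assumes a little-endian host */
            printf("%s\n", buf);            /* prints "bch**key" */
            return 0;
    }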
-
-struct bch_encrypted_key {
- __le64 magic;
- struct bch_key key;
-};
-
-/*
- * If this field is present in the superblock, it stores an encryption key which
- * is used to encrypt all other data/metadata. The key will normally be encrypted
- * with the key userspace provides, but if encryption has been turned off we'll
- * just store the master key unencrypted in the superblock so we can access the
- * previously encrypted data.
- */
-struct bch_sb_field_crypt {
- struct bch_sb_field field;
-
- __le64 flags;
- __le64 kdf_flags;
- struct bch_encrypted_key key;
-};
-
-LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4);
-
-enum bch_kdf_types {
- BCH_KDF_SCRYPT = 0,
- BCH_KDF_NR = 1,
-};
-
-/* stored as base 2 log of scrypt params: */
-LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16);
-LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
-LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
-
-/*
- * On clean shutdown, store btree roots and current journal sequence number in
- * the superblock:
- */
-struct jset_entry {
- __le16 u64s;
- __u8 btree_id;
- __u8 level;
- __u8 type; /* designates what this jset holds */
- __u8 pad[3];
-
- struct bkey_i start[0];
- __u64 _data[];
-};
-
-struct bch_sb_field_clean {
- struct bch_sb_field field;
-
- __le32 flags;
- __le16 _read_clock; /* no longer used */
- __le16 _write_clock;
- __le64 journal_seq;
-
- struct jset_entry start[0];
- __u64 _data[];
-};
-
-struct bch_sb_field_ext {
- struct bch_sb_field field;
- __le64 recovery_passes_required[2];
- __le64 errors_silent[8];
- __le64 btrees_lost_data;
-};
-
-/* Superblock: */
-
-/*
- * New versioning scheme:
- * One common version number for all on disk data structures - superblock, btree
- * nodes, journal entries
- */
-#define BCH_VERSION_MAJOR(_v) ((__u16) ((_v) >> 10))
-#define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10)))
-#define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0)
-
-/*
- * field 1: version name
- * field 2: BCH_VERSION(major, minor)
- * field 3: recovery passes required on upgrade
- */
-#define BCH_METADATA_VERSIONS() \
- x(bkey_renumber, BCH_VERSION(0, 10)) \
- x(inode_btree_change, BCH_VERSION(0, 11)) \
- x(snapshot, BCH_VERSION(0, 12)) \
- x(inode_backpointers, BCH_VERSION(0, 13)) \
- x(btree_ptr_sectors_written, BCH_VERSION(0, 14)) \
- x(snapshot_2, BCH_VERSION(0, 15)) \
- x(reflink_p_fix, BCH_VERSION(0, 16)) \
- x(subvol_dirent, BCH_VERSION(0, 17)) \
- x(inode_v2, BCH_VERSION(0, 18)) \
- x(freespace, BCH_VERSION(0, 19)) \
- x(alloc_v4, BCH_VERSION(0, 20)) \
- x(new_data_types, BCH_VERSION(0, 21)) \
- x(backpointers, BCH_VERSION(0, 22)) \
- x(inode_v3, BCH_VERSION(0, 23)) \
- x(unwritten_extents, BCH_VERSION(0, 24)) \
- x(bucket_gens, BCH_VERSION(0, 25)) \
- x(lru_v2, BCH_VERSION(0, 26)) \
- x(fragmentation_lru, BCH_VERSION(0, 27)) \
- x(no_bps_in_alloc_keys, BCH_VERSION(0, 28)) \
- x(snapshot_trees, BCH_VERSION(0, 29)) \
- x(major_minor, BCH_VERSION(1, 0)) \
- x(snapshot_skiplists, BCH_VERSION(1, 1)) \
- x(deleted_inodes, BCH_VERSION(1, 2)) \
- x(rebalance_work, BCH_VERSION(1, 3)) \
- x(member_seq, BCH_VERSION(1, 4)) \
- x(subvolume_fs_parent, BCH_VERSION(1, 5)) \
- x(btree_subvolume_children, BCH_VERSION(1, 6)) \
- x(mi_btree_bitmap, BCH_VERSION(1, 7)) \
- x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \
- x(disk_accounting_v2, BCH_VERSION(1, 9)) \
- x(disk_accounting_v3, BCH_VERSION(1, 10)) \
- x(disk_accounting_inum, BCH_VERSION(1, 11)) \
- x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \
- x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \
- x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \
- x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \
- x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \
- x(inode_depth, BCH_VERSION(1, 17)) \
- x(persistent_inode_cursors, BCH_VERSION(1, 18)) \
- x(autofix_errors, BCH_VERSION(1, 19)) \
- x(directory_size, BCH_VERSION(1, 20)) \
- x(cached_backpointers, BCH_VERSION(1, 21)) \
- x(stripe_backpointers, BCH_VERSION(1, 22)) \
- x(stripe_lru, BCH_VERSION(1, 23)) \
- x(casefolding, BCH_VERSION(1, 24)) \
- x(extent_flags, BCH_VERSION(1, 25))
-
-enum bcachefs_metadata_version {
- bcachefs_metadata_version_min = 9,
-#define x(t, n) bcachefs_metadata_version_##t = n,
- BCH_METADATA_VERSIONS()
-#undef x
- bcachefs_metadata_version_max
-};
-
-static const __maybe_unused
-unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;
-
-#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
-
-#define BCH_SB_SECTOR 8
-
-#define BCH_SB_LAYOUT_SIZE_BITS_MAX 16 /* 32 MB */
-
-struct bch_sb_layout {
- __uuid_t magic; /* bcachefs superblock UUID */
- __u8 layout_type;
- __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */
- __u8 nr_superblocks;
- __u8 pad[5];
- __le64 sb_offset[61];
-} __packed __aligned(8);
-
-#define BCH_SB_LAYOUT_SECTOR 7
-
-/*
- * @offset - sector where this sb was written
- * @version - on disk format version
- * @version_min - Oldest metadata version this filesystem contains; so we can
- * safely drop compatibility code and refuse to mount filesystems
- * we'd need it for
- * @magic - identifies as a bcachefs superblock (BCHFS_MAGIC)
- * @uuid - used for generating various magic numbers and identifying
- * member devices, never changes
- * @user_uuid - user visible UUID, may be changed
- * @label - filesystem label
- * @seq - identifies most recent superblock, incremented each time
- * superblock is written
- * @features - enabled incompatible features
- */
-struct bch_sb {
- struct bch_csum csum;
- __le16 version;
- __le16 version_min;
- __le16 pad[2];
- __uuid_t magic;
- __uuid_t uuid;
- __uuid_t user_uuid;
- __u8 label[BCH_SB_LABEL_SIZE];
- __le64 offset;
- __le64 seq;
-
- __le16 block_size;
- __u8 dev_idx;
- __u8 nr_devices;
- __le32 u64s;
-
- __le64 time_base_lo;
- __le32 time_base_hi;
- __le32 time_precision;
-
- __le64 flags[7];
- __le64 write_time;
- __le64 features[2];
- __le64 compat[2];
-
- struct bch_sb_layout layout;
-
- struct bch_sb_field start[0];
- __le64 _data[];
-} __packed __aligned(8);
-
-/*
- * Flags:
- * BCH_SB_INITIALIZED - set on first mount
- * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect
- * behaviour of mount/recovery path:
- * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits
- * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80
- * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides
- * DATA/META_CSUM_TYPE. Also indicates encryption
- * algorithm in use, if/when we get more than one
- */
-
-LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16);
-
-LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1);
-LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2);
-LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8);
-LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12);
-
-LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28);
-
-LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33);
-LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40);
-
-LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44);
-LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48);
-
-LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52);
-LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56);
-
-LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57);
-LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58);
-LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59);
-LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60);
-
-LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61);
-LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);
-
-LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63);
-LE64_BITMASK(BCH_SB_PROMOTE_WHOLE_EXTENTS,
- struct bch_sb, flags[0], 63, 64);
-
-LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4);
-LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8);
-LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9);
-
-LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10);
-LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14);
-
-/*
- * Max size of an extent that may require bouncing to read or write
- * (checksummed, compressed): 64k
- */
-LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
- struct bch_sb, flags[1], 14, 20);
-
-LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24);
-LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28);
-
-LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40);
-LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52);
-LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64);
-
-LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO,
- struct bch_sb, flags[2], 0, 4);
-LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64);
-
-LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
-LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
-LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29);
-LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
-LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
-LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
-/* one free bit */
-LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
-LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
-LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34);
-LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54);
-LE64_BITMASK(BCH_SB_VERSION_UPGRADE, struct bch_sb, flags[4], 54, 56);
-
-LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60);
-LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI,
- struct bch_sb, flags[4], 60, 64);
-
-LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE,
- struct bch_sb, flags[5], 0, 16);
-LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT,
- struct bch_sb, flags[5], 16, 32);
-LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48);
-LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
- struct bch_sb, flags[5], 48, 64);
-LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);
-LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14);
-LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20);
-
-static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
-{
- return BCH_SB_COMPRESSION_TYPE_LO(sb) | (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4);
-}
-
-static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
-{
- SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v);
- SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4);
-}
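The helpers above stitch an 8-bit compression type out of two 4-bit superblock bitfields, added at different times for compatibility. The split and reassembly, as a standalone sketch:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t v  = 0x2d;             /* full 8-bit value */
            uint64_t lo = v & 0xf;          /* stored via SET_..._LO()      */
            uint64_t hi = v >> 4;           /* stored via SET_..._HI()      */

            printf("reassembled: %#llx\n",
                   (unsigned long long) (lo | (hi << 4)));  /* 0x2d */
            return 0;
    }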
-
-static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb)
-{
- return BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb) |
- (BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4);
-}
-
-static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
-{
- SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v);
- SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4);
-}
-
-/*
- * Features:
- *
- * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist
- * reflink: gates KEY_TYPE_reflink
- * inline_data: gates KEY_TYPE_inline_data
- * new_siphash: gates BCH_STR_HASH_siphash
- * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE
- */
-#define BCH_SB_FEATURES() \
- x(lz4, 0) \
- x(gzip, 1) \
- x(zstd, 2) \
- x(atomic_nlink, 3) \
- x(ec, 4) \
- x(journal_seq_blacklist_v3, 5) \
- x(reflink, 6) \
- x(new_siphash, 7) \
- x(inline_data, 8) \
- x(new_extent_overwrite, 9) \
- x(incompressible, 10) \
- x(btree_ptr_v2, 11) \
- x(extents_above_btree_updates, 12) \
- x(btree_updates_journalled, 13) \
- x(reflink_inline_data, 14) \
- x(new_varint, 15) \
- x(journal_no_flush, 16) \
- x(alloc_v2, 17) \
- x(extents_across_btree_nodes, 18) \
- x(incompat_version_field, 19) \
- x(casefolding, 20)
-
-#define BCH_SB_FEATURES_ALWAYS \
- (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \
- BIT_ULL(BCH_FEATURE_extents_above_btree_updates)|\
- BIT_ULL(BCH_FEATURE_btree_updates_journalled)|\
- BIT_ULL(BCH_FEATURE_alloc_v2)|\
- BIT_ULL(BCH_FEATURE_extents_across_btree_nodes))
-
-#define BCH_SB_FEATURES_ALL \
- (BCH_SB_FEATURES_ALWAYS| \
- BIT_ULL(BCH_FEATURE_new_siphash)| \
- BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \
- BIT_ULL(BCH_FEATURE_new_varint)| \
- BIT_ULL(BCH_FEATURE_journal_no_flush)| \
- BIT_ULL(BCH_FEATURE_incompat_version_field))
-
-enum bch_sb_feature {
-#define x(f, n) BCH_FEATURE_##f,
- BCH_SB_FEATURES()
-#undef x
- BCH_FEATURE_NR,
-};
-
-#define BCH_SB_COMPAT() \
- x(alloc_info, 0) \
- x(alloc_metadata, 1) \
- x(extents_above_btree_updates_done, 2) \
- x(bformat_overflow_done, 3)
-
-enum bch_sb_compat {
-#define x(f, n) BCH_COMPAT_##f,
- BCH_SB_COMPAT()
-#undef x
- BCH_COMPAT_NR,
-};
-
-/* options: */
-
-#define BCH_VERSION_UPGRADE_OPTS() \
- x(compatible, 0) \
- x(incompatible, 1) \
- x(none, 2)
-
-enum bch_version_upgrade_opts {
-#define x(t, n) BCH_VERSION_UPGRADE_##t = n,
- BCH_VERSION_UPGRADE_OPTS()
-#undef x
-};
-
-#define BCH_REPLICAS_MAX 4U
-
-#define BCH_BKEY_PTRS_MAX 16U
-
-#define BCH_ERROR_ACTIONS() \
- x(continue, 0) \
- x(fix_safe, 1) \
- x(panic, 2) \
- x(ro, 3)
-
-enum bch_error_actions {
-#define x(t, n) BCH_ON_ERROR_##t = n,
- BCH_ERROR_ACTIONS()
-#undef x
- BCH_ON_ERROR_NR
-};
-
-#define BCH_STR_HASH_TYPES() \
- x(crc32c, 0) \
- x(crc64, 1) \
- x(siphash_old, 2) \
- x(siphash, 3)
-
-enum bch_str_hash_type {
-#define x(t, n) BCH_STR_HASH_##t = n,
- BCH_STR_HASH_TYPES()
-#undef x
- BCH_STR_HASH_NR
-};
-
-#define BCH_STR_HASH_OPTS() \
- x(crc32c, 0) \
- x(crc64, 1) \
- x(siphash, 2)
-
-enum bch_str_hash_opts {
-#define x(t, n) BCH_STR_HASH_OPT_##t = n,
- BCH_STR_HASH_OPTS()
-#undef x
- BCH_STR_HASH_OPT_NR
-};
-
-#define BCH_CSUM_TYPES() \
- x(none, 0) \
- x(crc32c_nonzero, 1) \
- x(crc64_nonzero, 2) \
- x(chacha20_poly1305_80, 3) \
- x(chacha20_poly1305_128, 4) \
- x(crc32c, 5) \
- x(crc64, 6) \
- x(xxhash, 7)
-
-enum bch_csum_type {
-#define x(t, n) BCH_CSUM_##t = n,
- BCH_CSUM_TYPES()
-#undef x
- BCH_CSUM_NR
-};
-
-static const __maybe_unused unsigned bch_crc_bytes[] = {
- [BCH_CSUM_none] = 0,
- [BCH_CSUM_crc32c_nonzero] = 4,
- [BCH_CSUM_crc32c] = 4,
- [BCH_CSUM_crc64_nonzero] = 8,
- [BCH_CSUM_crc64] = 8,
- [BCH_CSUM_xxhash] = 8,
- [BCH_CSUM_chacha20_poly1305_80] = 10,
- [BCH_CSUM_chacha20_poly1305_128] = 16,
-};
-
-static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
-{
- switch (type) {
- case BCH_CSUM_chacha20_poly1305_80:
- case BCH_CSUM_chacha20_poly1305_128:
- return true;
- default:
- return false;
- }
-}
-
-#define BCH_CSUM_OPTS() \
- x(none, 0) \
- x(crc32c, 1) \
- x(crc64, 2) \
- x(xxhash, 3)
-
-enum bch_csum_opt {
-#define x(t, n) BCH_CSUM_OPT_##t = n,
- BCH_CSUM_OPTS()
-#undef x
- BCH_CSUM_OPT_NR
-};
-
-#define BCH_COMPRESSION_TYPES() \
- x(none, 0) \
- x(lz4_old, 1) \
- x(gzip, 2) \
- x(lz4, 3) \
- x(zstd, 4) \
- x(incompressible, 5)
-
-enum bch_compression_type {
-#define x(t, n) BCH_COMPRESSION_TYPE_##t = n,
- BCH_COMPRESSION_TYPES()
-#undef x
- BCH_COMPRESSION_TYPE_NR
-};
-
-#define BCH_COMPRESSION_OPTS() \
- x(none, 0) \
- x(lz4, 1) \
- x(gzip, 2) \
- x(zstd, 3)
-
-enum bch_compression_opts {
-#define x(t, n) BCH_COMPRESSION_OPT_##t = n,
- BCH_COMPRESSION_OPTS()
-#undef x
- BCH_COMPRESSION_OPT_NR
-};
-
-/*
- * Magic numbers
- *
- * The various other data structures have their own magic numbers, which are
- * xored with the first part of the filesystem's UUID
- */
-
-#define BCACHE_MAGIC \
- UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca, \
- 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
-#define BCHFS_MAGIC \
- UUID_INIT(0xc68573f6, 0x66ce, 0x90a9, \
- 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
-
-#define BCACHEFS_STATFS_MAGIC BCACHEFS_SUPER_MAGIC
-
-#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL)
-#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL)
-
-static inline __le64 __bch2_sb_magic(struct bch_sb *sb)
-{
- __le64 ret;
-
- memcpy(&ret, &sb->uuid, sizeof(ret));
- return ret;
-}
-
-static inline __u64 __jset_magic(struct bch_sb *sb)
-{
- return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC);
-}
-
-static inline __u64 __bset_magic(struct bch_sb *sb)
-{
- return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC);
-}
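Each on-disk structure's magic is the first 64 bits of this filesystem's UUID xored with a structure-specific constant, so journal and bset blocks from a different filesystem never validate. A standalone sketch (the UUID values here are made up):

    #include <stdio.h>
    #include <stdint.h>

    #define JSET_MAGIC_C 0x245235c1a3625032ULL

    int main(void)
    {
            uint64_t sb_magic   = 0x0123456789abcdefULL;    /* first 8 bytes of sb->uuid */
            uint64_t jset_magic = sb_magic ^ JSET_MAGIC_C;

            /* a journal block validates only if its magic matches this fs: */
            printf("valid here: %d\n", jset_magic == (sb_magic ^ JSET_MAGIC_C));
            printf("valid on another fs: %d\n",
                   jset_magic == (0xfeedfacecafebeefULL ^ JSET_MAGIC_C));
            return 0;
    }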
-
-/* Journal */
-
-#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
-
-#define BCH_JSET_ENTRY_TYPES() \
- x(btree_keys, 0) \
- x(btree_root, 1) \
- x(prio_ptrs, 2) \
- x(blacklist, 3) \
- x(blacklist_v2, 4) \
- x(usage, 5) \
- x(data_usage, 6) \
- x(clock, 7) \
- x(dev_usage, 8) \
- x(log, 9) \
- x(overwrite, 10) \
- x(write_buffer_keys, 11) \
- x(datetime, 12)
-
-enum bch_jset_entry_type {
-#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
- BCH_JSET_ENTRY_TYPES()
-#undef x
- BCH_JSET_ENTRY_NR
-};
-
-static inline bool jset_entry_is_key(struct jset_entry *e)
-{
- switch (e->type) {
- case BCH_JSET_ENTRY_btree_keys:
- case BCH_JSET_ENTRY_btree_root:
- case BCH_JSET_ENTRY_write_buffer_keys:
- return true;
- }
-
- return false;
-}
-
-/*
- * Journal sequence numbers can be blacklisted: bsets record the max sequence
- * number of all the journal entries they contain updates for, so that on
- * recovery we can ignore those bsets that contain index updates newer than what
- * made it into the journal.
- *
- * This means that we can't reuse that journal_seq - we have to skip it, and
- * then record that we skipped it so that the next time we crash and recover we
- * don't think there was a missing journal entry.
- */
-struct jset_entry_blacklist {
- struct jset_entry entry;
- __le64 seq;
-};
-
-struct jset_entry_blacklist_v2 {
- struct jset_entry entry;
- __le64 start;
- __le64 end;
-};
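-
-/*
- * A sketch of the recovery-side check this enables - a hypothetical
- * helper, not part of the on-disk format, assuming the blacklist entries
- * have been collected into an array: a bset whose journal_seq falls
- * inside any blacklisted [start, end] range is ignored.
- */
-static inline _Bool
-bch2_example_journal_seq_blacklisted(__u64 seq,
-				     const struct jset_entry_blacklist_v2 *bl,
-				     unsigned nr)
-{
-	for (unsigned i = 0; i < nr; i++)
-		if (seq >= __le64_to_cpu(bl[i].start) &&
-		    seq <= __le64_to_cpu(bl[i].end))
-			return true;
-	return false;
-}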
-
-#define BCH_FS_USAGE_TYPES() \
- x(reserved, 0) \
- x(inodes, 1) \
- x(key_version, 2)
-
-enum bch_fs_usage_type {
-#define x(f, nr) BCH_FS_USAGE_##f = nr,
- BCH_FS_USAGE_TYPES()
-#undef x
- BCH_FS_USAGE_NR
-};
-
-struct jset_entry_usage {
- struct jset_entry entry;
- __le64 v;
-} __packed;
-
-struct jset_entry_data_usage {
- struct jset_entry entry;
- __le64 v;
- struct bch_replicas_entry_v1 r;
-} __packed;
-
-struct jset_entry_clock {
- struct jset_entry entry;
- __u8 rw;
- __u8 pad[7];
- __le64 time;
-} __packed;
-
-struct jset_entry_dev_usage_type {
- __le64 buckets;
- __le64 sectors;
- __le64 fragmented;
-} __packed;
-
-struct jset_entry_dev_usage {
- struct jset_entry entry;
- __le32 dev;
- __u32 pad;
-
- __le64 _buckets_ec; /* No longer used */
- __le64 _buckets_unavailable; /* No longer used */
-
- struct jset_entry_dev_usage_type d[];
-};
-
-static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u)
-{
- return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) /
- sizeof(struct jset_entry_dev_usage_type);
-}
-
-struct jset_entry_log {
- struct jset_entry entry;
- u8 d[];
-} __packed __aligned(8);
-
-static inline unsigned jset_entry_log_msg_bytes(struct jset_entry_log *l)
-{
- unsigned b = vstruct_bytes(&l->entry) - offsetof(struct jset_entry_log, d);
-
- while (b && !l->d[b - 1])
- --b;
- return b;
-}
-
-struct jset_entry_datetime {
- struct jset_entry entry;
- __le64 seconds;
-} __packed __aligned(8);
-
-/*
- * On disk format for a journal entry:
- * seq is monotonically increasing; every journal entry has its own unique
- * sequence number.
- *
- * last_seq is the oldest journal entry that still has keys the btree hasn't
- * flushed to disk yet.
- *
- * version is for on disk format changes.
- */
-struct jset {
- struct bch_csum csum;
-
- __le64 magic;
- __le64 seq;
- __le32 version;
- __le32 flags;
-
-	__le32			u64s; /* size of _data[] in u64s */
-
- __u8 encrypted_start[0];
-
- __le16 _read_clock; /* no longer used */
- __le16 _write_clock;
-
- /* Sequence number of oldest dirty journal entry */
- __le64 last_seq;
-
- struct jset_entry start[0];
- __u64 _data[];
-} __packed __aligned(8);
-
-LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4);
-LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
-LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
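-
-/*
- * Sketch of the last_seq invariant (hypothetical helper, not part of the
- * format): once seq is below the newest entry's last_seq, that entry
- * holds no keys still awaiting a btree flush, so its journal buckets may
- * be reclaimed.
- */
-static inline _Bool
-bch2_example_jset_reclaimable(const struct jset *newest, __u64 seq)
-{
-	return seq < __le64_to_cpu(newest->last_seq);
-}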
-
-#define BCH_JOURNAL_BUCKETS_MIN 8
-
-/* Btree: */
-
-enum btree_id_flags {
- BTREE_IS_extents = BIT(0),
- BTREE_IS_snapshots = BIT(1),
- BTREE_IS_snapshot_field = BIT(2),
- BTREE_IS_data = BIT(3),
- BTREE_IS_write_buffer = BIT(4),
-};
-
-#define BCH_BTREE_IDS() \
- x(extents, 0, \
- BTREE_IS_extents| \
- BTREE_IS_snapshots| \
- BTREE_IS_data, \
- BIT_ULL(KEY_TYPE_whiteout)| \
- BIT_ULL(KEY_TYPE_error)| \
- BIT_ULL(KEY_TYPE_cookie)| \
- BIT_ULL(KEY_TYPE_extent)| \
- BIT_ULL(KEY_TYPE_reservation)| \
- BIT_ULL(KEY_TYPE_reflink_p)| \
- BIT_ULL(KEY_TYPE_inline_data)) \
- x(inodes, 1, \
- BTREE_IS_snapshots, \
- BIT_ULL(KEY_TYPE_whiteout)| \
- BIT_ULL(KEY_TYPE_inode)| \
- BIT_ULL(KEY_TYPE_inode_v2)| \
- BIT_ULL(KEY_TYPE_inode_v3)| \
- BIT_ULL(KEY_TYPE_inode_generation)) \
- x(dirents, 2, \
- BTREE_IS_snapshots, \
- BIT_ULL(KEY_TYPE_whiteout)| \
- BIT_ULL(KEY_TYPE_hash_whiteout)| \
- BIT_ULL(KEY_TYPE_dirent)) \
- x(xattrs, 3, \
- BTREE_IS_snapshots, \
- BIT_ULL(KEY_TYPE_whiteout)| \
- BIT_ULL(KEY_TYPE_cookie)| \
- BIT_ULL(KEY_TYPE_hash_whiteout)| \
- BIT_ULL(KEY_TYPE_xattr)) \
- x(alloc, 4, 0, \
- BIT_ULL(KEY_TYPE_alloc)| \
- BIT_ULL(KEY_TYPE_alloc_v2)| \
- BIT_ULL(KEY_TYPE_alloc_v3)| \
- BIT_ULL(KEY_TYPE_alloc_v4)) \
- x(quotas, 5, 0, \
- BIT_ULL(KEY_TYPE_quota)) \
- x(stripes, 6, 0, \
- BIT_ULL(KEY_TYPE_stripe)) \
- x(reflink, 7, \
- BTREE_IS_extents| \
- BTREE_IS_data, \
- BIT_ULL(KEY_TYPE_reflink_v)| \
- BIT_ULL(KEY_TYPE_indirect_inline_data)| \
- BIT_ULL(KEY_TYPE_error)) \
- x(subvolumes, 8, 0, \
- BIT_ULL(KEY_TYPE_subvolume)) \
- x(snapshots, 9, 0, \
- BIT_ULL(KEY_TYPE_snapshot)) \
- x(lru, 10, \
- BTREE_IS_write_buffer, \
- BIT_ULL(KEY_TYPE_set)) \
- x(freespace, 11, \
- BTREE_IS_extents, \
- BIT_ULL(KEY_TYPE_set)) \
- x(need_discard, 12, 0, \
- BIT_ULL(KEY_TYPE_set)) \
- x(backpointers, 13, \
- BTREE_IS_write_buffer, \
- BIT_ULL(KEY_TYPE_backpointer)) \
- x(bucket_gens, 14, 0, \
- BIT_ULL(KEY_TYPE_bucket_gens)) \
- x(snapshot_trees, 15, 0, \
- BIT_ULL(KEY_TYPE_snapshot_tree)) \
- x(deleted_inodes, 16, \
- BTREE_IS_snapshot_field| \
- BTREE_IS_write_buffer, \
- BIT_ULL(KEY_TYPE_set)) \
- x(logged_ops, 17, 0, \
- BIT_ULL(KEY_TYPE_logged_op_truncate)| \
- BIT_ULL(KEY_TYPE_logged_op_finsert)| \
- BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \
- x(rebalance_work, 18, \
- BTREE_IS_snapshot_field| \
- BTREE_IS_write_buffer, \
- BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \
- x(subvolume_children, 19, 0, \
- BIT_ULL(KEY_TYPE_set)) \
- x(accounting, 20, \
- BTREE_IS_snapshot_field| \
- BTREE_IS_write_buffer, \
-	  BIT_ULL(KEY_TYPE_accounting))
-
-enum btree_id {
-#define x(name, nr, ...) BTREE_ID_##name = nr,
- BCH_BTREE_IDS()
-#undef x
- BTREE_ID_NR
-};
-
-/*
- * Maximum number of btrees that we will _ever_ have under the current scheme,
- * where we refer to them with 64 bit bitfields - and we also need a bit for
- * the interior btree node type:
- */
-#define BTREE_ID_NR_MAX 63
-
-static inline bool btree_id_is_alloc(enum btree_id id)
-{
- switch (id) {
- case BTREE_ID_alloc:
- case BTREE_ID_backpointers:
- case BTREE_ID_need_discard:
- case BTREE_ID_freespace:
- case BTREE_ID_bucket_gens:
- case BTREE_ID_lru:
- case BTREE_ID_accounting:
- return true;
- default:
- return false;
- }
-}
-
-#define BTREE_MAX_DEPTH 4U
-
-/* Btree nodes */
-
-/*
- * Btree nodes
- *
- * On disk a btree node is a list/log of these; within each set the keys are
- * sorted
- */
-struct bset {
- __le64 seq;
-
- /*
- * Highest journal entry this bset contains keys for.
- * If on recovery we don't see that journal entry, this bset is ignored:
- * this allows us to preserve the order of all index updates after a
- * crash, since the journal records a total order of all index updates
- * and anything that didn't make it to the journal doesn't get used.
- */
- __le64 journal_seq;
-
- __le32 flags;
- __le16 version;
-	__le16		u64s; /* count of _data[] in u64s */
-
- struct bkey_packed start[0];
- __u64 _data[];
-} __packed __aligned(8);
-
-LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4);
-
-LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5);
-LE32_BITMASK(BSET_SEPARATE_WHITEOUTS,
- struct bset, flags, 5, 6);
-
-/* Sector offset within the btree node: */
-LE32_BITMASK(BSET_OFFSET, struct bset, flags, 16, 32);
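-
-/*
- * The LE32_BITMASK() lines above generate endian-safe accessor pairs; a
- * minimal usage sketch, assuming i points at a struct bset being
- * initialized:
- *
- *	SET_BSET_CSUM_TYPE(i, BCH_CSUM_crc32c);
- *	unsigned csum_type = BSET_CSUM_TYPE(i);
- */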
-
-struct btree_node {
- struct bch_csum csum;
- __le64 magic;
-
- /* this flags field is encrypted, unlike bset->flags: */
- __le64 flags;
-
- /* Closed interval: */
- struct bpos min_key;
- struct bpos max_key;
- struct bch_extent_ptr _ptr; /* not used anymore */
- struct bkey_format format;
-
- union {
- struct bset keys;
- struct {
- __u8 pad[22];
- __le16 u64s;
- __u64 _data[0];
- };
- };
-} __packed __aligned(8);
-
-LE64_BITMASK(BTREE_NODE_ID_LO, struct btree_node, flags, 0, 4);
-LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8);
-LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE,
- struct btree_node, flags, 8, 9);
-LE64_BITMASK(BTREE_NODE_ID_HI, struct btree_node, flags, 9, 25);
-/* 25-32 unused */
-LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64);
-
-static inline __u64 BTREE_NODE_ID(struct btree_node *n)
-{
- return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4);
-}
-
-static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v)
-{
- SET_BTREE_NODE_ID_LO(n, v);
- SET_BTREE_NODE_ID_HI(n, v >> 4);
-}
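-
-/*
- * Worked example of the lo/hi split: for v = 0x123, SET_BTREE_NODE_ID()
- * stores ID_LO = 0x3 (low 4 bits) and ID_HI = 0x12 (v >> 4), and
- * BTREE_NODE_ID() reassembles (0x12 << 4) | 0x3 == 0x123.
- */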
-
-struct btree_node_entry {
- struct bch_csum csum;
-
- union {
- struct bset keys;
- struct {
- __u8 pad[22];
- __le16 u64s;
- __u64 _data[0];
- };
- };
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_FORMAT_H */
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
deleted file mode 100644
index 52594e925eb7..000000000000
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ /dev/null
@@ -1,473 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IOCTL_H
-#define _BCACHEFS_IOCTL_H
-
-#include <linux/uuid.h>
-#include <asm/ioctl.h>
-#include "bcachefs_format.h"
-#include "bkey_types.h"
-
-/*
- * Flags common to multiple ioctls:
- */
-#define BCH_FORCE_IF_DATA_LOST (1 << 0)
-#define BCH_FORCE_IF_METADATA_LOST (1 << 1)
-#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
-#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3)
-
-#define BCH_FORCE_IF_LOST \
- (BCH_FORCE_IF_DATA_LOST| \
- BCH_FORCE_IF_METADATA_LOST)
-#define BCH_FORCE_IF_DEGRADED \
- (BCH_FORCE_IF_DATA_DEGRADED| \
- BCH_FORCE_IF_METADATA_DEGRADED)
-
-/*
- * If cleared, ioctls that refer to a device pass it as a pointer to a pathname
- * (e.g. /dev/sda1); if set, the dev field is the device's index within the
- * filesystem:
- */
-#define BCH_BY_INDEX (1 << 4)
-
-/*
- * For BCH_IOCTL_READ_SUPER: get the superblock of a specific device, not the
- * filesystem-wide superblock:
- */
-#define BCH_READ_DEV (1 << 5)
-
-/* global control dev: */
-
-/* These are currently broken, and probably unnecessary: */
-#if 0
-#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble)
-#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental)
-
-struct bch_ioctl_assemble {
- __u32 flags;
- __u32 nr_devs;
- __u64 pad;
- __u64 devs[];
-};
-
-struct bch_ioctl_incremental {
- __u32 flags;
- __u64 pad;
- __u64 dev;
-};
-#endif
-
-/* filesystem ioctls: */
-
-#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid)
-
-/* These only make sense when we also have incremental assembly */
-#if 0
-#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start)
-#define BCH_IOCTL_STOP _IO(0xbc, 3)
-#endif
-
-#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state)
-#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data)
-#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage)
-#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage)
-#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super)
-#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
-#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize)
-#define BCH_IOCTL_DISK_RESIZE_JOURNAL	_IOW(0xbc, 15, struct bch_ioctl_disk_resize_journal)
-
-#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume)
-#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume)
-
-#define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc, 18, struct bch_ioctl_dev_usage_v2)
-
-#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
-#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online)
-#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting)
-#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters)
-
-/* ioctls below act on a particular file, not the filesystem as a whole: */
-
-#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *)
-
-/*
- * BCH_IOCTL_QUERY_UUID: get filesystem UUID
- *
- * Returns the user-visible UUID, not the internal UUID (which may never be
- * changed);
- * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
- * this UUID.
- */
-struct bch_ioctl_query_uuid {
- __uuid_t uuid;
-};
-
-#if 0
-struct bch_ioctl_start {
- __u32 flags;
- __u32 pad;
-};
-#endif
-
-/*
- * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
- *
- * The specified device must not be open or in use. On success, the new device
- * will be an online member of the filesystem just like any other member.
- *
- * The device must first be prepared by userspace by formatting with a bcachefs
- * superblock, which is only used for passing in superblock options/parameters
- * for that device (in struct bch_member). The new device's superblock should
- * not claim to be a member of any existing filesystem - UUIDs on it will be
- * ignored.
- */
-
-/*
- * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
- *
- * Any data present on @dev will be permanently deleted, and @dev will be
- * removed from its slot in the filesystem's list of member devices. The device
- * may be either offline or online.
- *
- * Will fail if removing @dev would leave us with insufficient read write
- * devices or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_*
- * flags are set.
- */
-
-/*
- * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
- * but is not open (e.g. because we started in degraded mode), bring it online
- *
- * All existing data on @dev will be available once the device is online,
- * exactly as if @dev was present when the filesystem was first mounted.
- */
-
-/*
- * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
- * block device, without removing it from the filesystem (so it can be brought
- * back online later)
- *
- * Data present on @dev will be unavailable while @dev is offline (unless
- * replicated), but will still be intact and untouched if @dev is brought back
- * online
- *
- * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
- * leave us with insufficient read write devices or degraded/unavailable data,
- * unless the appropriate BCH_FORCE_IF_* flags are set.
- */
-
-struct bch_ioctl_disk {
- __u32 flags;
- __u32 pad;
- __u64 dev;
-};
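-
-/*
- * Usage sketch (userspace, error handling elided): offline device 2 by
- * index, forcing through degraded data.  fs_fd is assumed to be a file
- * descriptor these filesystem ioctls accept:
- *
- *	struct bch_ioctl_disk i = {
- *		.flags	= BCH_BY_INDEX|BCH_FORCE_IF_DEGRADED,
- *		.dev	= 2,
- *	};
- *
- *	ioctl(fs_fd, BCH_IOCTL_DISK_OFFLINE, &i);
- */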
-
-/*
- * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
- *
- * @new_state - one of the bch_member_state states (rw, ro, failed,
- * spare)
- *
- * Will refuse to change member state if we would then have insufficient devices
- * to write to, or if it would result in degraded data (when @new_state is
- * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
- */
-struct bch_ioctl_disk_set_state {
- __u32 flags;
- __u8 new_state;
- __u8 pad[3];
- __u64 dev;
-};
-
-#define BCH_DATA_OPS() \
- x(scrub, 0) \
- x(rereplicate, 1) \
- x(migrate, 2) \
- x(rewrite_old_nodes, 3) \
- x(drop_extra_replicas, 4)
-
-enum bch_data_ops {
-#define x(t, n) BCH_DATA_OP_##t = n,
- BCH_DATA_OPS()
-#undef x
- BCH_DATA_OP_NR
-};
-
-/*
- * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
- * scrub, rereplicate, migrate).
- *
- * This ioctl kicks off a job in the background, and returns a file descriptor.
- * Reading from the file descriptor returns a struct bch_ioctl_data_event,
- * indicating current progress, and closing the file descriptor will stop the
- * job. The file descriptor is O_CLOEXEC.
- */
-struct bch_ioctl_data {
- __u16 op;
- __u8 start_btree;
- __u8 end_btree;
- __u32 flags;
-
- struct bpos start_pos;
- struct bpos end_pos;
-
- union {
- struct {
- __u32 dev;
- __u32 data_types;
- } scrub;
- struct {
- __u32 dev;
- __u32 pad;
- } migrate;
- struct {
- __u64 pad[8];
- };
- };
-} __packed __aligned(8);
-
-enum bch_data_event {
- BCH_DATA_EVENT_PROGRESS = 0,
- /* XXX: add an event for reporting errors */
- BCH_DATA_EVENT_NR = 1,
-};
-
-enum data_progress_data_type_special {
- DATA_PROGRESS_DATA_TYPE_phys = 254,
- DATA_PROGRESS_DATA_TYPE_done = 255,
-};
-
-struct bch_ioctl_data_progress {
- __u8 data_type;
- __u8 btree_id;
- __u8 pad[2];
- struct bpos pos;
-
- __u64 sectors_done;
- __u64 sectors_total;
- __u64 sectors_error_corrected;
- __u64 sectors_error_uncorrected;
-} __packed __aligned(8);
-
-enum bch_ioctl_data_event_ret {
- BCH_IOCTL_DATA_EVENT_RET_done = 1,
- BCH_IOCTL_DATA_EVENT_RET_device_offline = 2,
-};
-
-struct bch_ioctl_data_event {
- __u8 type;
- __u8 ret;
- __u8 pad[6];
- union {
- struct bch_ioctl_data_progress p;
- __u64 pad2[15];
- };
-} __packed __aligned(8);
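-
-/*
- * Usage sketch (userspace, error handling elided): kick off a
- * rereplicate job over every btree and poll its progress by reading the
- * returned descriptor.  fs_fd is an assumed filesystem fd:
- *
- *	struct bch_ioctl_data a = {
- *		.op		= BCH_DATA_OP_rereplicate,
- *		.end_btree	= BTREE_ID_NR,
- *		.start_pos	= POS_MIN,
- *		.end_pos	= POS_MAX,
- *	};
- *	struct bch_ioctl_data_event e;
- *
- *	int job = ioctl(fs_fd, BCH_IOCTL_DATA, &a);
- *
- *	while (read(job, &e, sizeof(e)) == sizeof(e) &&
- *	       e.ret != BCH_IOCTL_DATA_EVENT_RET_done)
- *		;	// e.p.sectors_done / e.p.sectors_total
- *	close(job);
- */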
-
-struct bch_replicas_usage {
- __u64 sectors;
- struct bch_replicas_entry_v1 r;
-} __packed;
-
-static inline unsigned replicas_usage_bytes(struct bch_replicas_usage *u)
-{
- return offsetof(struct bch_replicas_usage, r) + replicas_entry_bytes(&u->r);
-}
-
-static inline struct bch_replicas_usage *
-replicas_usage_next(struct bch_replicas_usage *u)
-{
- return (void *) u + replicas_usage_bytes(u);
-}
-
-/* Obsolete */
-/*
- * BCH_IOCTL_FS_USAGE: query filesystem disk space usage
- *
- * Returns disk space usage broken out by data type, number of replicas, and
- * by component device
- *
- * @replica_entries_bytes - size, in bytes, allocated for replica usage entries
- *
- * On success, @replica_entries_bytes will be changed to indicate the number of
- * bytes actually used.
- *
- * Returns -ERANGE if @replica_entries_bytes was too small
- */
-struct bch_ioctl_fs_usage {
- __u64 capacity;
- __u64 used;
- __u64 online_reserved;
- __u64 persistent_reserved[BCH_REPLICAS_MAX];
-
- __u32 replica_entries_bytes;
- __u32 pad;
-
- struct bch_replicas_usage replicas[];
-};
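-
-/*
- * Sketch of walking the variable-length replicas[] array with the
- * helpers above, assuming u points at a successfully filled-in struct:
- *
- *	struct bch_replicas_usage *r = u->replicas;
- *	void *end = (void *) u->replicas + u->replica_entries_bytes;
- *
- *	while ((void *) r < end) {
- *		// r->sectors used by the entry described by r->r
- *		r = replicas_usage_next(r);
- *	}
- */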
-
-/* Obsolete */
-/*
- * BCH_IOCTL_DEV_USAGE: query device disk space usage
- *
- * Returns disk space usage broken out by data type - both by buckets and
- * sectors.
- */
-struct bch_ioctl_dev_usage {
- __u64 dev;
- __u32 flags;
- __u8 state;
- __u8 pad[7];
-
- __u32 bucket_size;
- __u64 nr_buckets;
-
- __u64 buckets_ec;
-
- struct bch_ioctl_dev_usage_type {
- __u64 buckets;
- __u64 sectors;
- __u64 fragmented;
- } d[10];
-};
-
-/* Obsolete */
-struct bch_ioctl_dev_usage_v2 {
- __u64 dev;
- __u32 flags;
- __u8 state;
- __u8 nr_data_types;
- __u8 pad[6];
-
- __u32 bucket_size;
- __u64 nr_buckets;
-
- struct bch_ioctl_dev_usage_type d[];
-};
-
-/*
- * BCH_IOCTL_READ_SUPER: read filesystem superblock
- *
- * Equivalent to reading the superblock directly from the block device, except
- * avoids racing with the kernel writing the superblock or having to figure out
- * which block device to read
- *
- * @sb - buffer to read into
- * @size - size of userspace allocated buffer
- * @dev - device to read superblock for, if BCH_READ_DEV flag is
- * specified
- *
- * Returns -ERANGE if buffer provided is too small
- */
-struct bch_ioctl_read_super {
- __u32 flags;
- __u32 pad;
- __u64 dev;
- __u64 size;
- __u64 sb;
-};
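-
-/*
- * Usage sketch (userspace): read the superblock into a buffer, growing
- * it while the kernel reports ERANGE.  fs_fd assumed as above:
- *
- *	__u64 size = 4096;
- *	void *buf = malloc(size);
- *	struct bch_ioctl_read_super s = {
- *		.size	= size,
- *		.sb	= (__u64) (unsigned long) buf,
- *	};
- *
- *	while (ioctl(fs_fd, BCH_IOCTL_READ_SUPER, &s) < 0 &&
- *	       errno == ERANGE) {
- *		s.size	*= 2;
- *		buf	 = realloc(buf, s.size);
- *		s.sb	 = (__u64) (unsigned long) buf;
- *	}
- */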
-
-/*
- * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to
- * determine if the disk is an (online) member - if so, returns the device's
- * index
- *
- * Returns -ENOENT if not found
- */
-struct bch_ioctl_disk_get_idx {
- __u64 dev;
-};
-
-/*
- * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
- *
- * @dev - member to resize
- * @nbuckets - new number of buckets
- */
-struct bch_ioctl_disk_resize {
- __u32 flags;
- __u32 pad;
- __u64 dev;
- __u64 nbuckets;
-};
-
-/*
- * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device
- *
- * @dev - member to resize
- * @nbuckets - new number of buckets
- */
-struct bch_ioctl_disk_resize_journal {
- __u32 flags;
- __u32 pad;
- __u64 dev;
- __u64 nbuckets;
-};
-
-struct bch_ioctl_subvolume {
- __u32 flags;
- __u32 dirfd;
- __u16 mode;
- __u16 pad[3];
- __u64 dst_ptr;
- __u64 src_ptr;
-};
-
-#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0)
-#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1)
-
-/*
- * BCH_IOCTL_FSCK_OFFLINE: run fsck from the 'bcachefs fsck' userspace command,
- * but with the kernel's implementation of fsck:
- */
-struct bch_ioctl_fsck_offline {
- __u64 flags;
- __u64 opts; /* string */
- __u64 nr_devs;
- __u64 devs[] __counted_by(nr_devs);
-};
-
-/*
- * BCH_IOCTL_FSCK_ONLINE: run fsck from the 'bcachefs fsck' userspace command,
- * but with the kernel's implementation of fsck:
- */
-struct bch_ioctl_fsck_online {
- __u64 flags;
- __u64 opts; /* string */
-};
-
-/*
- * BCH_IOCTL_QUERY_ACCOUNTING: query filesystem disk accounting
- *
- * Returns disk space usage broken out by data type, number of replicas, and
- * by component device
- *
- * @replica_entries_bytes - size, in bytes, allocated for replica usage entries
- *
- * On success, @replica_entries_bytes will be changed to indicate the number of
- * bytes actually used.
- *
- * Returns -ERANGE if @replica_entries_bytes was too small
- */
-struct bch_ioctl_query_accounting {
- __u64 capacity;
- __u64 used;
- __u64 online_reserved;
-
- __u32 accounting_u64s; /* input parameter */
- __u32 accounting_types_mask; /* input parameter */
-
- struct bkey_i_accounting accounting[];
-};
-
-#define BCH_IOCTL_QUERY_COUNTERS_MOUNT (1 << 0)
-
-struct bch_ioctl_query_counters {
- __u16 nr;
- __u16 flags;
- __u32 pad;
- __u64 d[];
-};
-
-#endif /* _BCACHEFS_IOCTL_H */
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
deleted file mode 100644
index 995ba32e9b6e..000000000000
--- a/fs/bcachefs/bkey.c
+++ /dev/null
@@ -1,1117 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey.h"
-#include "bkey_cmp.h"
-#include "bkey_methods.h"
-#include "bset.h"
-#include "util.h"
-
-const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
-
-void bch2_bkey_packed_to_binary_text(struct printbuf *out,
- const struct bkey_format *f,
- const struct bkey_packed *k)
-{
- const u64 *p = high_word(f, k);
- unsigned word_bits = 64 - high_bit_offset;
- unsigned nr_key_bits = bkey_format_key_bits(f) + high_bit_offset;
- u64 v = *p & (~0ULL >> high_bit_offset);
-
- if (!nr_key_bits) {
- prt_str(out, "(empty)");
- return;
- }
-
- while (1) {
- unsigned next_key_bits = nr_key_bits;
-
- if (nr_key_bits < 64) {
- v >>= 64 - nr_key_bits;
- next_key_bits = 0;
- } else {
- next_key_bits -= 64;
- }
-
- bch2_prt_u64_base2_nbits(out, v, min(word_bits, nr_key_bits));
-
- if (!next_key_bits)
- break;
-
- prt_char(out, ' ');
-
- p = next_word(p);
- v = *p;
- word_bits = 64;
- nr_key_bits = next_key_bits;
- }
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
- const struct bkey *unpacked,
- const struct bkey_format *format)
-{
- struct bkey tmp;
-
- BUG_ON(bkeyp_val_u64s(format, packed) !=
- bkey_val_u64s(unpacked));
-
- BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed));
-
- tmp = __bch2_bkey_unpack_key(format, packed);
-
- if (memcmp(&tmp, unpacked, sizeof(struct bkey))) {
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "keys differ: format u64s %u fields %u %u %u %u %u\n",
- format->key_u64s,
- format->bits_per_field[0],
- format->bits_per_field[1],
- format->bits_per_field[2],
- format->bits_per_field[3],
- format->bits_per_field[4]);
-
- prt_printf(&buf, "compiled unpack: ");
- bch2_bkey_to_text(&buf, unpacked);
- prt_newline(&buf);
-
- prt_printf(&buf, "c unpack: ");
- bch2_bkey_to_text(&buf, &tmp);
- prt_newline(&buf);
-
- prt_printf(&buf, "compiled unpack: ");
- bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
- (struct bkey_packed *) unpacked);
- prt_newline(&buf);
-
- prt_printf(&buf, "c unpack: ");
- bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
- (struct bkey_packed *) &tmp);
- prt_newline(&buf);
-
- panic("%s", buf.buf);
- }
-}
-
-#else
-static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed,
- const struct bkey *unpacked,
- const struct bkey_format *format) {}
-#endif
-
-struct pack_state {
- const struct bkey_format *format;
- unsigned bits; /* bits remaining in current word */
- u64 w; /* current word */
- u64 *p; /* pointer to next word */
-};
-
-__always_inline
-static struct pack_state pack_state_init(const struct bkey_format *format,
- struct bkey_packed *k)
-{
- u64 *p = high_word(format, k);
-
- return (struct pack_state) {
- .format = format,
- .bits = 64 - high_bit_offset,
- .w = 0,
- .p = p,
- };
-}
-
-__always_inline
-static void pack_state_finish(struct pack_state *state,
- struct bkey_packed *k)
-{
- EBUG_ON(state->p < k->_data);
- EBUG_ON(state->p >= (u64 *) k->_data + state->format->key_u64s);
-
- *state->p = state->w;
-}
-
-struct unpack_state {
- const struct bkey_format *format;
- unsigned bits; /* bits remaining in current word */
- u64 w; /* current word */
- const u64 *p; /* pointer to next word */
-};
-
-__always_inline
-static struct unpack_state unpack_state_init(const struct bkey_format *format,
- const struct bkey_packed *k)
-{
- const u64 *p = high_word(format, k);
-
- return (struct unpack_state) {
- .format = format,
- .bits = 64 - high_bit_offset,
- .w = *p << high_bit_offset,
- .p = p,
- };
-}
-
-__always_inline
-static u64 get_inc_field(struct unpack_state *state, unsigned field)
-{
- unsigned bits = state->format->bits_per_field[field];
- u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]);
-
- if (bits >= state->bits) {
- v = state->w >> (64 - bits);
- bits -= state->bits;
-
- state->p = next_word(state->p);
- state->w = *state->p;
- state->bits = 64;
- }
-
- /* avoid shift by 64 if bits is 0 - bits is never 64 here: */
- v |= (state->w >> 1) >> (63 - bits);
- state->w <<= bits;
- state->bits -= bits;
-
- return v + offset;
-}
-
-__always_inline
-static void __set_inc_field(struct pack_state *state, unsigned field, u64 v)
-{
- unsigned bits = state->format->bits_per_field[field];
-
- if (bits) {
- if (bits > state->bits) {
- bits -= state->bits;
- /* avoid shift by 64 if bits is 64 - bits is never 0 here: */
- state->w |= (v >> 1) >> (bits - 1);
-
- *state->p = state->w;
- state->p = next_word(state->p);
- state->w = 0;
- state->bits = 64;
- }
-
- state->bits -= bits;
- state->w |= v << state->bits;
- }
-}
-
-__always_inline
-static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
-{
- unsigned bits = state->format->bits_per_field[field];
- u64 offset = le64_to_cpu(state->format->field_offset[field]);
-
- if (v < offset)
- return false;
-
- v -= offset;
-
- if (fls64(v) > bits)
- return false;
-
- __set_inc_field(state, field, v);
- return true;
-}
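-
-/*
- * Worked example of the encoding get_inc_field()/set_inc_field()
- * implement: with field_offset = 100 and bits_per_field = 4, values
- * 100..115 pack losslessly (v - 100 fits in 4 bits); set_inc_field()
- * returns false for v < 100 or v > 115, and the caller falls back to
- * the unpacked format.
- */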
-
-/*
- * Note: does NOT set out->format (we don't know what it should be here!)
- *
- * Also: doesn't work on extents - it doesn't preserve the invariant that
- * if k is packed bkey_start_pos(k) will successfully pack
- */
-static bool bch2_bkey_transform_key(const struct bkey_format *out_f,
- struct bkey_packed *out,
- const struct bkey_format *in_f,
- const struct bkey_packed *in)
-{
- struct pack_state out_s = pack_state_init(out_f, out);
- struct unpack_state in_s = unpack_state_init(in_f, in);
- u64 *w = out->_data;
- unsigned i;
-
- *w = 0;
-
- for (i = 0; i < BKEY_NR_FIELDS; i++)
- if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i)))
- return false;
-
- /* Can't happen because the val would be too big to unpack: */
- EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX);
-
- pack_state_finish(&out_s, out);
- out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s;
- out->needs_whiteout = in->needs_whiteout;
- out->type = in->type;
-
- return true;
-}
-
-bool bch2_bkey_transform(const struct bkey_format *out_f,
- struct bkey_packed *out,
- const struct bkey_format *in_f,
- const struct bkey_packed *in)
-{
- if (!bch2_bkey_transform_key(out_f, out, in_f, in))
- return false;
-
- memcpy_u64s((u64 *) out + out_f->key_u64s,
- (u64 *) in + in_f->key_u64s,
- (in->u64s - in_f->key_u64s));
- return true;
-}
-
-struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format,
- const struct bkey_packed *in)
-{
- struct unpack_state state = unpack_state_init(format, in);
- struct bkey out;
-
- EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
- EBUG_ON(in->u64s < format->key_u64s);
- EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
- EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX);
-
- out.u64s = BKEY_U64s + in->u64s - format->key_u64s;
- out.format = KEY_FORMAT_CURRENT;
- out.needs_whiteout = in->needs_whiteout;
- out.type = in->type;
- out.pad[0] = 0;
-
-#define x(id, field) out.field = get_inc_field(&state, id);
- bkey_fields()
-#undef x
-
- return out;
-}
-
-#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
-struct bpos __bkey_unpack_pos(const struct bkey_format *format,
- const struct bkey_packed *in)
-{
- struct unpack_state state = unpack_state_init(format, in);
- struct bpos out;
-
- EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
- EBUG_ON(in->u64s < format->key_u64s);
- EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
-
- out.inode = get_inc_field(&state, BKEY_FIELD_INODE);
- out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET);
- out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT);
-
- return out;
-}
-#endif
-
-/**
- * bch2_bkey_pack_key -- pack just the key, not the value
- * @out: packed result
- * @in: key to pack
- * @format: format of packed result
- *
- * Returns: true on success, false on failure
- */
-bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
- const struct bkey_format *format)
-{
- struct pack_state state = pack_state_init(format, out);
- u64 *w = out->_data;
-
- EBUG_ON((void *) in == (void *) out);
- EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
- EBUG_ON(in->format != KEY_FORMAT_CURRENT);
-
- *w = 0;
-
-#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false;
- bkey_fields()
-#undef x
- pack_state_finish(&state, out);
- out->u64s = format->key_u64s + in->u64s - BKEY_U64s;
- out->format = KEY_FORMAT_LOCAL_BTREE;
- out->needs_whiteout = in->needs_whiteout;
- out->type = in->type;
-
- bch2_bkey_pack_verify(out, in, format);
- return true;
-}
-
-/**
- * bch2_bkey_unpack -- unpack the key and the value
- * @b: btree node of @src key (for packed format)
- * @dst: unpacked result
- * @src: packed input
- */
-void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
- const struct bkey_packed *src)
-{
- __bkey_unpack_key(b, &dst->k, src);
-
- memcpy_u64s(&dst->v,
- bkeyp_val(&b->format, src),
- bkeyp_val_u64s(&b->format, src));
-}
-
-/**
- * bch2_bkey_pack -- pack the key and the value
- * @dst: packed result
- * @src: unpacked input
- * @format: format of packed result
- *
- * Returns: true on success, false on failure
- */
-bool bch2_bkey_pack(struct bkey_packed *dst, const struct bkey_i *src,
- const struct bkey_format *format)
-{
- struct bkey_packed tmp;
-
- if (!bch2_bkey_pack_key(&tmp, &src->k, format))
- return false;
-
- memmove_u64s((u64 *) dst + format->key_u64s,
- &src->v,
- bkey_val_u64s(&src->k));
- memcpy_u64s_small(dst, &tmp, format->key_u64s);
-
- return true;
-}
-
-__always_inline
-static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v)
-{
- unsigned bits = state->format->bits_per_field[field];
- u64 offset = le64_to_cpu(state->format->field_offset[field]);
- bool ret = true;
-
- EBUG_ON(v < offset);
- v -= offset;
-
- if (fls64(v) > bits) {
- v = ~(~0ULL << bits);
- ret = false;
- }
-
- __set_inc_field(state, field, v);
- return ret;
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-static bool bkey_packed_successor(struct bkey_packed *out,
- const struct btree *b,
- struct bkey_packed k)
-{
- const struct bkey_format *f = &b->format;
- unsigned nr_key_bits = b->nr_key_bits;
- unsigned first_bit, offset;
- u64 *p;
-
- EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
-
- if (!nr_key_bits)
- return false;
-
- *out = k;
-
- first_bit = high_bit_offset + nr_key_bits - 1;
- p = nth_word(high_word(f, out), first_bit >> 6);
- offset = 63 - (first_bit & 63);
-
- while (nr_key_bits) {
- unsigned bits = min(64 - offset, nr_key_bits);
- u64 mask = (~0ULL >> (64 - bits)) << offset;
-
- if ((*p & mask) != mask) {
- *p += 1ULL << offset;
- EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0);
- return true;
- }
-
- *p &= ~mask;
- p = prev_word(p);
- nr_key_bits -= bits;
- offset = 0;
- }
-
- return false;
-}
-
-static bool bkey_format_has_too_big_fields(const struct bkey_format *f)
-{
- for (unsigned i = 0; i < f->nr_fields; i++) {
- unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
- u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
- u64 packed_max = f->bits_per_field[i]
- ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
- : 0;
- u64 field_offset = le64_to_cpu(f->field_offset[i]);
-
- if (packed_max + field_offset < packed_max ||
- packed_max + field_offset > unpacked_max)
- return true;
- }
-
- return false;
-}
-#endif
-
-/*
- * Returns a packed key that compares <= in
- *
- * This is used in bset_search_tree(), where we need a packed pos in order to be
- * able to compare against the keys in the auxiliary search tree - and it's
- * legal to use a packed pos that isn't equivalent to the original pos,
- * _provided_ it compares <= to the original pos.
- */
-enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
- struct bpos in,
- const struct btree *b)
-{
- const struct bkey_format *f = &b->format;
- struct pack_state state = pack_state_init(f, out);
- u64 *w = out->_data;
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct bpos orig = in;
-#endif
- bool exact = true;
- unsigned i;
-
- /*
- * bch2_bkey_pack_key() will write to all of f->key_u64s, minus the 3
- * byte header, but pack_pos() won't if the len/version fields are big
- * enough - we need to make sure to zero them out:
- */
- for (i = 0; i < f->key_u64s; i++)
- w[i] = 0;
-
- if (unlikely(in.snapshot <
- le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) {
- if (!in.offset-- &&
- !in.inode--)
- return BKEY_PACK_POS_FAIL;
- in.snapshot = KEY_SNAPSHOT_MAX;
- exact = false;
- }
-
- if (unlikely(in.offset <
- le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) {
- if (!in.inode--)
- return BKEY_PACK_POS_FAIL;
- in.offset = KEY_OFFSET_MAX;
- in.snapshot = KEY_SNAPSHOT_MAX;
- exact = false;
- }
-
- if (unlikely(in.inode <
- le64_to_cpu(f->field_offset[BKEY_FIELD_INODE])))
- return BKEY_PACK_POS_FAIL;
-
- if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode))) {
- in.offset = KEY_OFFSET_MAX;
- in.snapshot = KEY_SNAPSHOT_MAX;
- exact = false;
- }
-
- if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset))) {
- in.snapshot = KEY_SNAPSHOT_MAX;
- exact = false;
- }
-
- if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)))
- exact = false;
-
- pack_state_finish(&state, out);
- out->u64s = f->key_u64s;
- out->format = KEY_FORMAT_LOCAL_BTREE;
- out->type = KEY_TYPE_deleted;
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- if (exact) {
- BUG_ON(bkey_cmp_left_packed(b, out, &orig));
- } else {
- struct bkey_packed successor;
-
- BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
- BUG_ON(bkey_packed_successor(&successor, b, *out) &&
- bkey_cmp_left_packed(b, &successor, &orig) < 0 &&
- !bkey_format_has_too_big_fields(f));
- }
-#endif
-
- return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER;
-}
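-
-/*
- * Usage sketch: callers searching within a btree node pack the search
- * position first and handle all three return codes (b is an assumed
- * btree node):
- *
- *	struct bkey_packed search;
- *
- *	switch (bch2_bkey_pack_pos_lossy(&search, pos, b)) {
- *	case BKEY_PACK_POS_EXACT:
- *	case BKEY_PACK_POS_SMALLER:
- *		// binary search on packed keys, then scan forward
- *		break;
- *	case BKEY_PACK_POS_FAIL:
- *		// compare against unpacked positions instead
- *		break;
- *	}
- */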
-
-void bch2_bkey_format_init(struct bkey_format_state *s)
-{
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(s->field_min); i++)
- s->field_min[i] = U64_MAX;
-
- for (i = 0; i < ARRAY_SIZE(s->field_max); i++)
- s->field_max[i] = 0;
-
- /* Make sure we can store a size of 0: */
- s->field_min[BKEY_FIELD_SIZE] = 0;
-}
-
-void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p)
-{
- unsigned field = 0;
-
- __bkey_format_add(s, field++, p.inode);
- __bkey_format_add(s, field++, p.offset);
- __bkey_format_add(s, field++, p.snapshot);
-}
-
-/*
- * We don't want it to be possible for the packed format to represent fields
- * bigger than a u64... that will cause confusion and issues (like with
- * bkey_packed_successor())
- */
-static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i,
- unsigned bits, u64 offset)
-{
- unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
- u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
-
- bits = min(bits, unpacked_bits);
-
- offset = bits == unpacked_bits ? 0 : min(offset, unpacked_max - ((1ULL << bits) - 1));
-
- f->bits_per_field[i] = bits;
- f->field_offset[i] = cpu_to_le64(offset);
-}
-
-struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
-{
- unsigned i, bits = KEY_PACKED_BITS_START;
- struct bkey_format ret = {
- .nr_fields = BKEY_NR_FIELDS,
- };
-
- for (i = 0; i < ARRAY_SIZE(s->field_min); i++) {
- s->field_min[i] = min(s->field_min[i], s->field_max[i]);
-
- set_format_field(&ret, i,
- fls64(s->field_max[i] - s->field_min[i]),
- s->field_min[i]);
-
- bits += ret.bits_per_field[i];
- }
-
- /* allow for extent merging: */
- if (ret.bits_per_field[BKEY_FIELD_SIZE]) {
- unsigned b = min(4U, 32U - ret.bits_per_field[BKEY_FIELD_SIZE]);
-
- ret.bits_per_field[BKEY_FIELD_SIZE] += b;
- bits += b;
- }
-
- ret.key_u64s = DIV_ROUND_UP(bits, 64);
-
- /* if we have enough spare bits, round fields up to nearest byte */
- bits = ret.key_u64s * 64 - bits;
-
- for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) {
- unsigned r = round_up(ret.bits_per_field[i], 8) -
- ret.bits_per_field[i];
-
- if (r <= bits) {
- set_format_field(&ret, i,
- ret.bits_per_field[i] + r,
- le64_to_cpu(ret.field_offset[i]));
- bits -= r;
- }
- }
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- {
- struct printbuf buf = PRINTBUF;
-
- BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf));
- printbuf_exit(&buf);
- }
-#endif
- return ret;
-}
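-
-/*
- * Usage sketch tying the format-state helpers together - feed in the
- * positions a new node will hold, then pack with the resulting format
- * (k is an assumed struct bkey_i; keys with fields the format can't
- * represent, e.g. a nonzero size here, simply stay unpacked):
- *
- *	struct bkey_format_state s;
- *	struct bkey_packed p;
- *
- *	bch2_bkey_format_init(&s);
- *	bch2_bkey_format_add_pos(&s, k->k.p);
- *	struct bkey_format f = bch2_bkey_format_done(&s);
- *
- *	if (!bch2_bkey_pack(&p, k, &f))
- *		;	// key didn't fit - keep it unpacked
- */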
-
-int bch2_bkey_format_invalid(struct bch_fs *c,
- struct bkey_format *f,
- enum bch_validate_flags flags,
- struct printbuf *err)
-{
- unsigned bits = KEY_PACKED_BITS_START;
-
- if (f->nr_fields != BKEY_NR_FIELDS) {
- prt_printf(err, "incorrect number of fields: got %u, should be %u",
- f->nr_fields, BKEY_NR_FIELDS);
- return -BCH_ERR_invalid;
- }
-
- /*
- * Verify that the packed format can't represent fields larger than the
- * unpacked format:
- */
- for (unsigned i = 0; i < f->nr_fields; i++) {
- if (bch2_bkey_format_field_overflows(f, i)) {
- unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
- u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
- unsigned packed_bits = min(64, f->bits_per_field[i]);
- u64 packed_max = packed_bits
- ? ~((~0ULL << 1) << (packed_bits - 1))
- : 0;
-
- prt_printf(err, "field %u too large: %llu + %llu > %llu",
- i, packed_max, le64_to_cpu(f->field_offset[i]), unpacked_max);
- return -BCH_ERR_invalid;
- }
-
- bits += f->bits_per_field[i];
- }
-
- if (f->key_u64s != DIV_ROUND_UP(bits, 64)) {
- prt_printf(err, "incorrect key_u64s: got %u, should be %u",
- f->key_u64s, DIV_ROUND_UP(bits, 64));
- return -BCH_ERR_invalid;
- }
-
- return 0;
-}
-
-void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f)
-{
- prt_printf(out, "u64s %u fields ", f->key_u64s);
-
- for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) {
- if (i)
- prt_str(out, ", ");
- prt_printf(out, "%u:%llu",
- f->bits_per_field[i],
- le64_to_cpu(f->field_offset[i]));
- }
-}
-
-/*
- * Most significant differing bit
- * Bits are indexed from 0 - return is [0, nr_key_bits)
- */
-__pure
-unsigned bch2_bkey_greatest_differing_bit(const struct btree *b,
- const struct bkey_packed *l_k,
- const struct bkey_packed *r_k)
-{
- const u64 *l = high_word(&b->format, l_k);
- const u64 *r = high_word(&b->format, r_k);
- unsigned nr_key_bits = b->nr_key_bits;
- unsigned word_bits = 64 - high_bit_offset;
- u64 l_v, r_v;
-
- EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
-
- /* for big endian, skip past header */
- l_v = *l & (~0ULL >> high_bit_offset);
- r_v = *r & (~0ULL >> high_bit_offset);
-
- while (nr_key_bits) {
- if (nr_key_bits < word_bits) {
- l_v >>= word_bits - nr_key_bits;
- r_v >>= word_bits - nr_key_bits;
- nr_key_bits = 0;
- } else {
- nr_key_bits -= word_bits;
- }
-
- if (l_v != r_v)
- return fls64(l_v ^ r_v) - 1 + nr_key_bits;
-
- l = next_word(l);
- r = next_word(r);
-
- l_v = *l;
- r_v = *r;
- word_bits = 64;
- }
-
- return 0;
-}
-
-/*
- * First set bit
- * Bits are indexed from 0 - return is [0, nr_key_bits)
- */
-__pure
-unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k)
-{
- const u64 *p = high_word(&b->format, k);
- unsigned nr_key_bits = b->nr_key_bits;
- unsigned ret = 0, offset;
-
- EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
-
- offset = nr_key_bits;
- while (offset > 64) {
- p = next_word(p);
- offset -= 64;
- }
-
- offset = 64 - offset;
-
- while (nr_key_bits) {
- unsigned bits = nr_key_bits + offset < 64
- ? nr_key_bits
- : 64 - offset;
-
- u64 mask = (~0ULL >> (64 - bits)) << offset;
-
- if (*p & mask)
- return ret + __ffs64(*p & mask) - offset;
-
- p = prev_word(p);
- nr_key_bits -= bits;
- ret += bits;
- offset = 0;
- }
-
- return 0;
-}
-
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
-
-#define I(_x) (*(out)++ = (_x))
-#define I1(i0) I(i0)
-#define I2(i0, i1) (I1(i0), I(i1))
-#define I3(i0, i1, i2) (I2(i0, i1), I(i2))
-#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3))
-#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4))
-
-static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
- enum bch_bkey_fields field,
- unsigned dst_offset, unsigned dst_size,
- bool *eax_zeroed)
-{
- unsigned bits = format->bits_per_field[field];
- u64 offset = le64_to_cpu(format->field_offset[field]);
- unsigned i, byte, bit_offset, align, shl, shr;
-
- if (!bits && !offset) {
- if (!*eax_zeroed) {
- /* xor eax, eax */
- I2(0x31, 0xc0);
- }
-
- *eax_zeroed = true;
- goto set_field;
- }
-
- if (!bits) {
- /* just return offset: */
-
- switch (dst_size) {
- case 8:
- if (offset > S32_MAX) {
- /* mov [rdi + dst_offset], offset */
- I3(0xc7, 0x47, dst_offset);
- memcpy(out, &offset, 4);
- out += 4;
-
- I3(0xc7, 0x47, dst_offset + 4);
- memcpy(out, (void *) &offset + 4, 4);
- out += 4;
- } else {
- /* mov [rdi + dst_offset], offset */
- /* sign extended */
- I4(0x48, 0xc7, 0x47, dst_offset);
- memcpy(out, &offset, 4);
- out += 4;
- }
- break;
- case 4:
- /* mov [rdi + dst_offset], offset */
- I3(0xc7, 0x47, dst_offset);
- memcpy(out, &offset, 4);
- out += 4;
- break;
- default:
- BUG();
- }
-
- return out;
- }
-
- bit_offset = format->key_u64s * 64;
- for (i = 0; i <= field; i++)
- bit_offset -= format->bits_per_field[i];
-
- byte = bit_offset / 8;
- bit_offset -= byte * 8;
-
- *eax_zeroed = false;
-
- if (bit_offset == 0 && bits == 8) {
- /* movzx eax, BYTE PTR [rsi + imm8] */
- I4(0x0f, 0xb6, 0x46, byte);
- } else if (bit_offset == 0 && bits == 16) {
- /* movzx eax, WORD PTR [rsi + imm8] */
- I4(0x0f, 0xb7, 0x46, byte);
- } else if (bit_offset + bits <= 32) {
- align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
- byte -= align;
- bit_offset += align * 8;
-
- BUG_ON(bit_offset + bits > 32);
-
- /* mov eax, [rsi + imm8] */
- I3(0x8b, 0x46, byte);
-
- if (bit_offset) {
- /* shr eax, imm8 */
- I3(0xc1, 0xe8, bit_offset);
- }
-
- if (bit_offset + bits < 32) {
- unsigned mask = ~0U >> (32 - bits);
-
- /* and eax, imm32 */
- I1(0x25);
- memcpy(out, &mask, 4);
- out += 4;
- }
- } else if (bit_offset + bits <= 64) {
- align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7);
- byte -= align;
- bit_offset += align * 8;
-
- BUG_ON(bit_offset + bits > 64);
-
- /* mov rax, [rsi + imm8] */
- I4(0x48, 0x8b, 0x46, byte);
-
- shl = 64 - bit_offset - bits;
- shr = bit_offset + shl;
-
- if (shl) {
- /* shl rax, imm8 */
- I4(0x48, 0xc1, 0xe0, shl);
- }
-
- if (shr) {
- /* shr rax, imm8 */
- I4(0x48, 0xc1, 0xe8, shr);
- }
- } else {
- align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
- byte -= align;
- bit_offset += align * 8;
-
- BUG_ON(bit_offset + bits > 96);
-
- /* mov rax, [rsi + byte] */
- I4(0x48, 0x8b, 0x46, byte);
-
- /* mov edx, [rsi + byte + 8] */
- I3(0x8b, 0x56, byte + 8);
-
- /* bits from next word: */
- shr = bit_offset + bits - 64;
- BUG_ON(shr > bit_offset);
-
- /* shr rax, bit_offset */
- I4(0x48, 0xc1, 0xe8, shr);
-
- /* shl rdx, imm8 */
- I4(0x48, 0xc1, 0xe2, 64 - shr);
-
- /* or rax, rdx */
- I3(0x48, 0x09, 0xd0);
-
- shr = bit_offset - shr;
-
- if (shr) {
- /* shr rax, imm8 */
- I4(0x48, 0xc1, 0xe8, shr);
- }
- }
-
- /* rax += offset: */
- if (offset > S32_MAX) {
- /* mov rdx, imm64 */
- I2(0x48, 0xba);
- memcpy(out, &offset, 8);
- out += 8;
- /* add %rdx, %rax */
- I3(0x48, 0x01, 0xd0);
- } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) {
- /* add rax, imm32 */
- I2(0x48, 0x05);
- memcpy(out, &offset, 4);
- out += 4;
- } else if (offset) {
- /* add eax, imm32 */
- I1(0x05);
- memcpy(out, &offset, 4);
- out += 4;
- }
-set_field:
- switch (dst_size) {
- case 8:
- /* mov [rdi + dst_offset], rax */
- I4(0x48, 0x89, 0x47, dst_offset);
- break;
- case 4:
- /* mov [rdi + dst_offset], eax */
- I3(0x89, 0x47, dst_offset);
- break;
- default:
- BUG();
- }
-
- return out;
-}
-
-int bch2_compile_bkey_format(const struct bkey_format *format, void *_out)
-{
- bool eax_zeroed = false;
- u8 *out = _out;
-
- /*
- * rdi: dst - unpacked key
- * rsi: src - packed key
- */
-
- /* k->u64s, k->format, k->type */
-
- /* mov eax, [rsi] */
- I2(0x8b, 0x06);
-
- /* add eax, BKEY_U64s - format->key_u64s */
- I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0);
-
- /* and eax, imm32: mask out k->pad: */
- I5(0x25, 0xff, 0xff, 0xff, 0);
-
- /* mov [rdi], eax */
- I2(0x89, 0x07);
-
-#define x(id, field) \
- out = compile_bkey_field(format, out, id, \
- offsetof(struct bkey, field), \
- sizeof(((struct bkey *) NULL)->field), \
- &eax_zeroed);
- bkey_fields()
-#undef x
-
- /* retq */
- I1(0xc3);
-
- return (void *) out - _out;
-}
-
-#endif
-
-__pure
-int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l,
- const struct bkey_packed *r,
- const struct btree *b)
-{
- return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
-}
-
-__pure __flatten
-int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b,
- const struct bkey_packed *l,
- const struct bpos *r)
-{
- return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r);
-}
-
-__pure __flatten
-int bch2_bkey_cmp_packed(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r)
-{
- return bch2_bkey_cmp_packed_inlined(b, l, r);
-}
-
-__pure __flatten
-int __bch2_bkey_cmp_left_packed(const struct btree *b,
- const struct bkey_packed *l,
- const struct bpos *r)
-{
- const struct bkey *l_unpacked;
-
- return unlikely(l_unpacked = packed_to_bkey_c(l))
- ? bpos_cmp(l_unpacked->p, *r)
- : __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
-}
-
-void bch2_bpos_swab(struct bpos *p)
-{
- u8 *l = (u8 *) p;
- u8 *h = ((u8 *) &p[1]) - 1;
-
- while (l < h) {
- swap(*l, *h);
- l++;
- --h;
- }
-}
-
-void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k)
-{
- const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current;
- u8 *l = k->key_start;
- u8 *h = (u8 *) ((u64 *) k->_data + f->key_u64s) - 1;
-
- while (l < h) {
- swap(*l, *h);
- l++;
- --h;
- }
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_bkey_pack_test(void)
-{
- struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0);
- struct bkey_packed p;
-
- struct bkey_format test_format = {
- .key_u64s = 3,
- .nr_fields = BKEY_NR_FIELDS,
- .bits_per_field = {
- 13,
- 64,
- 32,
- },
- };
-
- struct unpack_state in_s =
- unpack_state_init(&bch2_bkey_format_current, (void *) &t);
- struct pack_state out_s = pack_state_init(&test_format, &p);
- unsigned i;
-
- for (i = 0; i < out_s.format->nr_fields; i++) {
- u64 a, v = get_inc_field(&in_s, i);
-
- switch (i) {
-#define x(id, field) case id: a = t.field; break;
- bkey_fields()
-#undef x
- default:
- BUG();
- }
-
- if (a != v)
- panic("got %llu actual %llu i %u\n", v, a, i);
-
- if (!set_inc_field(&out_s, i, v))
- panic("failed at %u\n", i);
- }
-
- BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format));
-}
-#endif
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
deleted file mode 100644
index 054e2d5e8448..000000000000
--- a/fs/bcachefs/bkey.h
+++ /dev/null
@@ -1,605 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_H
-#define _BCACHEFS_BKEY_H
-
-#include <linux/bug.h>
-#include "bcachefs_format.h"
-#include "bkey_types.h"
-#include "btree_types.h"
-#include "util.h"
-#include "vstructs.h"
-
-#if 0
-
-/*
- * compiled unpack functions are disabled, pending a new interface for
- * dynamically allocating executable memory:
- */
-
-#ifdef CONFIG_X86_64
-#define HAVE_BCACHEFS_COMPILED_UNPACK 1
-#endif
-#endif
-
-void bch2_bkey_packed_to_binary_text(struct printbuf *,
- const struct bkey_format *,
- const struct bkey_packed *);
-
-enum bkey_lr_packed {
- BKEY_PACKED_BOTH,
- BKEY_PACKED_RIGHT,
- BKEY_PACKED_LEFT,
- BKEY_PACKED_NONE,
-};
-
-#define bkey_lr_packed(_l, _r) \
- ((_l)->format + ((_r)->format << 1))
-
-static inline void bkey_p_copy(struct bkey_packed *dst, const struct bkey_packed *src)
-{
- memcpy_u64s_small(dst, src, src->u64s);
-}
-
-static inline void bkey_copy(struct bkey_i *dst, const struct bkey_i *src)
-{
- memcpy_u64s_small(dst, src, src->k.u64s);
-}
-
-struct btree;
-
-__pure
-unsigned bch2_bkey_greatest_differing_bit(const struct btree *,
- const struct bkey_packed *,
- const struct bkey_packed *);
-__pure
-unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *);
-
-__pure
-int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *,
- const struct bkey_packed *,
- const struct btree *);
-
-__pure
-int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *,
- const struct bkey_packed *,
- const struct bpos *);
-
-__pure
-int bch2_bkey_cmp_packed(const struct btree *,
- const struct bkey_packed *,
- const struct bkey_packed *);
-
-__pure
-int __bch2_bkey_cmp_left_packed(const struct btree *,
- const struct bkey_packed *,
- const struct bpos *);
-
-static inline __pure
-int bkey_cmp_left_packed(const struct btree *b,
- const struct bkey_packed *l, const struct bpos *r)
-{
- return __bch2_bkey_cmp_left_packed(b, l, r);
-}
-
-/*
- * The compiler generates better code when we pass bpos by ref, but it's
- * often convenient enough to pass it by val... as much as I hate c++, a
- * const ref would be nice here:
- */
-__pure __flatten
-static inline int bkey_cmp_left_packed_byval(const struct btree *b,
- const struct bkey_packed *l,
- struct bpos r)
-{
- return bkey_cmp_left_packed(b, l, &r);
-}
-
-static __always_inline bool bpos_eq(struct bpos l, struct bpos r)
-{
- return !((l.inode ^ r.inode) |
- (l.offset ^ r.offset) |
- (l.snapshot ^ r.snapshot));
-}
-
-static __always_inline bool bpos_lt(struct bpos l, struct bpos r)
-{
- return l.inode != r.inode ? l.inode < r.inode :
- l.offset != r.offset ? l.offset < r.offset :
- l.snapshot != r.snapshot ? l.snapshot < r.snapshot : false;
-}
-
-static __always_inline bool bpos_le(struct bpos l, struct bpos r)
-{
- return l.inode != r.inode ? l.inode < r.inode :
- l.offset != r.offset ? l.offset < r.offset :
- l.snapshot != r.snapshot ? l.snapshot < r.snapshot : true;
-}
-
-static __always_inline bool bpos_gt(struct bpos l, struct bpos r)
-{
- return bpos_lt(r, l);
-}
-
-static __always_inline bool bpos_ge(struct bpos l, struct bpos r)
-{
- return bpos_le(r, l);
-}
-
-static __always_inline int bpos_cmp(struct bpos l, struct bpos r)
-{
- return cmp_int(l.inode, r.inode) ?:
- cmp_int(l.offset, r.offset) ?:
- cmp_int(l.snapshot, r.snapshot);
-}
-
-static inline struct bpos bpos_min(struct bpos l, struct bpos r)
-{
- return bpos_lt(l, r) ? l : r;
-}
-
-static inline struct bpos bpos_max(struct bpos l, struct bpos r)
-{
- return bpos_gt(l, r) ? l : r;
-}
-
-static __always_inline bool bkey_eq(struct bpos l, struct bpos r)
-{
- return !((l.inode ^ r.inode) |
- (l.offset ^ r.offset));
-}
-
-static __always_inline bool bkey_lt(struct bpos l, struct bpos r)
-{
- return l.inode != r.inode
- ? l.inode < r.inode
- : l.offset < r.offset;
-}
-
-static __always_inline bool bkey_le(struct bpos l, struct bpos r)
-{
- return l.inode != r.inode
- ? l.inode < r.inode
- : l.offset <= r.offset;
-}
-
-static __always_inline bool bkey_gt(struct bpos l, struct bpos r)
-{
- return bkey_lt(r, l);
-}
-
-static __always_inline bool bkey_ge(struct bpos l, struct bpos r)
-{
- return bkey_le(r, l);
-}
-
-static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
-{
- return cmp_int(l.inode, r.inode) ?:
- cmp_int(l.offset, r.offset);
-}
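-
-/*
- * Note the bpos_*()/bkey_*() split above: the bkey_*() comparisons
- * ignore the snapshot field, so two positions at the same inode:offset
- * in different snapshots are bkey_eq() but not bpos_eq().
- */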
-
-static inline struct bpos bkey_min(struct bpos l, struct bpos r)
-{
- return bkey_lt(l, r) ? l : r;
-}
-
-static inline struct bpos bkey_max(struct bpos l, struct bpos r)
-{
- return bkey_gt(l, r) ? l : r;
-}
-
-static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
-{
- return bpos_eq(l.k->p, r.k->p) &&
- bkey_bytes(l.k) == bkey_bytes(r.k) &&
- !memcmp(l.v, r.v, bkey_val_bytes(l.k));
-}
-
-void bch2_bpos_swab(struct bpos *);
-void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
-
-static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
-{
- return cmp_int(l.hi, r.hi) ?:
- cmp_int(l.lo, r.lo);
-}
-
-#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 })
-#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL })
-
-static __always_inline bool bversion_zero(struct bversion v)
-{
- return bversion_cmp(v, ZERO_VERSION) == 0;
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-/* statement expressions confusing unlikely()? */
-#define bkey_packed(_k) \
- ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \
- (_k)->format != KEY_FORMAT_CURRENT; })
-#else
-#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT)
-#endif
-
-/*
- * It's safe to treat an unpacked bkey as a packed one, but not the reverse
- */
-static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k)
-{
- return (struct bkey_packed *) k;
-}
-
-static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k)
-{
- return (const struct bkey_packed *) k;
-}
-
-static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k)
-{
- return bkey_packed(k) ? NULL : (struct bkey_i *) k;
-}
-
-static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k)
-{
- return bkey_packed(k) ? NULL : (const struct bkey *) k;
-}
-
-static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
-{
- return format->bits_per_field[BKEY_FIELD_INODE] +
- format->bits_per_field[BKEY_FIELD_OFFSET] +
- format->bits_per_field[BKEY_FIELD_SNAPSHOT];
-}
-
-static inline struct bpos bpos_successor(struct bpos p)
-{
- if (!++p.snapshot &&
- !++p.offset &&
- !++p.inode)
- BUG();
-
- return p;
-}
-
-static inline struct bpos bpos_predecessor(struct bpos p)
-{
- if (!p.snapshot-- &&
- !p.offset-- &&
- !p.inode--)
- BUG();
-
- return p;
-}
-
-static inline struct bpos bpos_nosnap_successor(struct bpos p)
-{
- p.snapshot = 0;
-
- if (!++p.offset &&
- !++p.inode)
- BUG();
-
- return p;
-}
-
-static inline struct bpos bpos_nosnap_predecessor(struct bpos p)
-{
- p.snapshot = 0;
-
- if (!p.offset-- &&
- !p.inode--)
- BUG();
-
- return p;
-}
-
-static inline u64 bkey_start_offset(const struct bkey *k)
-{
- return k->p.offset - k->size;
-}
-
-static inline struct bpos bkey_start_pos(const struct bkey *k)
-{
- return (struct bpos) {
- .inode = k->p.inode,
- .offset = bkey_start_offset(k),
- .snapshot = k->p.snapshot,
- };
-}
-
-/* Packed helpers */
-
-static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
- const struct bkey_packed *k)
-{
- return bkey_packed(k) ? format->key_u64s : BKEY_U64s;
-}
-
-static inline bool bkeyp_u64s_valid(const struct bkey_format *f,
- const struct bkey_packed *k)
-{
- return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s);
-}
-
-static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
- const struct bkey_packed *k)
-{
- return bkeyp_key_u64s(format, k) * sizeof(u64);
-}
-
-static inline unsigned bkeyp_val_u64s(const struct bkey_format *format,
- const struct bkey_packed *k)
-{
- return k->u64s - bkeyp_key_u64s(format, k);
-}
-
-static inline size_t bkeyp_val_bytes(const struct bkey_format *format,
- const struct bkey_packed *k)
-{
- return bkeyp_val_u64s(format, k) * sizeof(u64);
-}
-
-static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
- struct bkey_packed *k, unsigned val_u64s)
-{
- k->u64s = bkeyp_key_u64s(format, k) + val_u64s;
-}
-
-#define bkeyp_val(_format, _k) \
- ((struct bch_val *) ((u64 *) (_k)->_data + bkeyp_key_u64s(_format, _k)))
-
-extern const struct bkey_format bch2_bkey_format_current;
-
-bool bch2_bkey_transform(const struct bkey_format *,
- struct bkey_packed *,
- const struct bkey_format *,
- const struct bkey_packed *);
-
-struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
- const struct bkey_packed *);
-
-#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
-struct bpos __bkey_unpack_pos(const struct bkey_format *,
- const struct bkey_packed *);
-#endif
-
-bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *,
- const struct bkey_format *);
-
-enum bkey_pack_pos_ret {
- BKEY_PACK_POS_EXACT,
- BKEY_PACK_POS_SMALLER,
- BKEY_PACK_POS_FAIL,
-};
-
-enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos,
- const struct btree *);
-
-static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in,
- const struct btree *b)
-{
- return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT;
-}
-
-void bch2_bkey_unpack(const struct btree *, struct bkey_i *,
- const struct bkey_packed *);
-bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *,
- const struct bkey_format *);
-
-typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
-
-static inline void
-__bkey_unpack_key_format_checked(const struct btree *b,
- struct bkey *dst,
- const struct bkey_packed *src)
-{
- if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) {
- compiled_unpack_fn unpack_fn = b->aux_data;
- unpack_fn(dst, src);
-
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- bch2_expensive_debug_checks) {
- struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
-
- BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
- }
- } else {
- *dst = __bch2_bkey_unpack_key(&b->format, src);
- }
-}
-
-static inline struct bkey
-bkey_unpack_key_format_checked(const struct btree *b,
- const struct bkey_packed *src)
-{
- struct bkey dst;
-
- __bkey_unpack_key_format_checked(b, &dst, src);
- return dst;
-}
-
-static inline void __bkey_unpack_key(const struct btree *b,
- struct bkey *dst,
- const struct bkey_packed *src)
-{
- if (likely(bkey_packed(src)))
- __bkey_unpack_key_format_checked(b, dst, src);
- else
- *dst = *packed_to_bkey_c(src);
-}
-
-/*
- * bkey_unpack_key - unpack just the key, not the value
- */
-static inline struct bkey bkey_unpack_key(const struct btree *b,
- const struct bkey_packed *src)
-{
- return likely(bkey_packed(src))
- ? bkey_unpack_key_format_checked(b, src)
- : *packed_to_bkey_c(src);
-}
-
-static inline struct bpos
-bkey_unpack_pos_format_checked(const struct btree *b,
- const struct bkey_packed *src)
-{
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
- return bkey_unpack_key_format_checked(b, src).p;
-#else
- return __bkey_unpack_pos(&b->format, src);
-#endif
-}
-
-static inline struct bpos bkey_unpack_pos(const struct btree *b,
- const struct bkey_packed *src)
-{
- return likely(bkey_packed(src))
- ? bkey_unpack_pos_format_checked(b, src)
- : packed_to_bkey_c(src)->p;
-}
-
-/* Disassembled bkeys */
-
-static inline struct bkey_s_c bkey_disassemble(const struct btree *b,
- const struct bkey_packed *k,
- struct bkey *u)
-{
- __bkey_unpack_key(b, u, k);
-
- return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
-}
-
-/* non const version: */
-static inline struct bkey_s __bkey_disassemble(const struct btree *b,
- struct bkey_packed *k,
- struct bkey *u)
-{
- __bkey_unpack_key(b, u, k);
-
- return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
-}
-
-static inline u64 bkey_field_max(const struct bkey_format *f,
- enum bch_bkey_fields nr)
-{
- return f->bits_per_field[nr] < 64
- ? (le64_to_cpu(f->field_offset[nr]) +
- ~(~0ULL << f->bits_per_field[nr]))
- : U64_MAX;
-}
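-
-/*
- * Worked example (illustrative, not from the original header): with
- * bits_per_field[nr] == 8 and field_offset[nr] == 100, the largest
- * representable value is 100 + 0xff == 355; with bits_per_field[nr]
- * == 64 it is U64_MAX.
- */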
-
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
-
-int bch2_compile_bkey_format(const struct bkey_format *, void *);
-
-#else
-
-static inline int bch2_compile_bkey_format(const struct bkey_format *format,
- void *out) { return 0; }
-
-#endif
-
-static inline void bkey_reassemble(struct bkey_i *dst,
- struct bkey_s_c src)
-{
- dst->k = *src.k;
- memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
-}
-
-/* byte order helpers */
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-
-static inline unsigned high_word_offset(const struct bkey_format *f)
-{
- return f->key_u64s - 1;
-}
-
-#define high_bit_offset 0
-#define nth_word(p, n) ((p) - (n))
-
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-
-static inline unsigned high_word_offset(const struct bkey_format *f)
-{
- return 0;
-}
-
-#define high_bit_offset KEY_PACKED_BITS_START
-#define nth_word(p, n) ((p) + (n))
-
-#else
-#error edit for your odd byteorder.
-#endif
-
-#define high_word(f, k) ((u64 *) (k)->_data + high_word_offset(f))
-#define next_word(p) nth_word(p, 1)
-#define prev_word(p) nth_word(p, -1)
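-
-/*
- * Illustrative note (an addition, not from the original header): with
- * key_u64s == 3, comparisons walk from the most significant word down -
- *
- *	little endian: _data[2], then next_word -> _data[1], _data[0]
- *	big endian:    _data[0], then next_word -> _data[1], _data[2]
- */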
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_bkey_pack_test(void);
-#else
-static inline void bch2_bkey_pack_test(void) {}
-#endif
-
-#define bkey_fields() \
- x(BKEY_FIELD_INODE, p.inode) \
- x(BKEY_FIELD_OFFSET, p.offset) \
- x(BKEY_FIELD_SNAPSHOT, p.snapshot) \
- x(BKEY_FIELD_SIZE, size) \
- x(BKEY_FIELD_VERSION_HI, bversion.hi) \
- x(BKEY_FIELD_VERSION_LO, bversion.lo)
-
-struct bkey_format_state {
- u64 field_min[BKEY_NR_FIELDS];
- u64 field_max[BKEY_NR_FIELDS];
-};
-
-void bch2_bkey_format_init(struct bkey_format_state *);
-
-static inline void __bkey_format_add(struct bkey_format_state *s, unsigned field, u64 v)
-{
- s->field_min[field] = min(s->field_min[field], v);
- s->field_max[field] = max(s->field_max[field], v);
-}
-
-/*
- * Widens @s so that the format generated from it can successfully pack @k
- */
-static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k)
-{
-#define x(id, field) __bkey_format_add(s, id, k->field);
- bkey_fields()
-#undef x
-}
-
-void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
-struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
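-
-/*
- * Typical construction sequence (an illustrative sketch, not from the
- * original header):
- *
- *	struct bkey_format_state s;
- *
- *	bch2_bkey_format_init(&s);
- *	bch2_bkey_format_add_pos(&s, b->data->min_key);
- *	bch2_bkey_format_add_pos(&s, b->data->max_key);
- *	// for each key k in the node (pseudocode):
- *	bch2_bkey_format_add_key(&s, k);
- *
- *	struct bkey_format f = bch2_bkey_format_done(&s);
- */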
-
-static inline bool bch2_bkey_format_field_overflows(struct bkey_format *f, unsigned i)
-{
- unsigned f_bits = f->bits_per_field[i];
- unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
- u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
- u64 field_offset = le64_to_cpu(f->field_offset[i]);
-
- if (f_bits > unpacked_bits)
- return true;
-
- if ((f_bits == unpacked_bits) && field_offset)
- return true;
-
- u64 f_mask = f_bits
- ? ~((~0ULL << (f_bits - 1)) << 1)
- : 0;
-
- if (((field_offset + f_mask) & unpacked_mask) < field_offset)
- return true;
- return false;
-}
-
-int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *,
- enum bch_validate_flags, struct printbuf *);
-void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *);
-
-#endif /* _BCACHEFS_BKEY_H */
diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h
deleted file mode 100644
index a30c4ae8eb36..000000000000
--- a/fs/bcachefs/bkey_buf.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_BUF_H
-#define _BCACHEFS_BKEY_BUF_H
-
-#include "bcachefs.h"
-#include "bkey.h"
-
-struct bkey_buf {
- struct bkey_i *k;
- u64 onstack[12];
-};
-
-static inline void bch2_bkey_buf_realloc(struct bkey_buf *s,
- struct bch_fs *c, unsigned u64s)
-{
- if (s->k == (void *) s->onstack &&
- u64s > ARRAY_SIZE(s->onstack)) {
- s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
- memcpy(s->k, s->onstack, sizeof(s->onstack));
- }
-}
-
-static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s,
- struct bch_fs *c,
- struct bkey_s_c k)
-{
- bch2_bkey_buf_realloc(s, c, k.k->u64s);
- bkey_reassemble(s->k, k);
-}
-
-static inline void bch2_bkey_buf_copy(struct bkey_buf *s,
- struct bch_fs *c,
- struct bkey_i *src)
-{
- bch2_bkey_buf_realloc(s, c, src->k.u64s);
- bkey_copy(s->k, src);
-}
-
-static inline void bch2_bkey_buf_unpack(struct bkey_buf *s,
- struct bch_fs *c,
- struct btree *b,
- struct bkey_packed *src)
-{
- bch2_bkey_buf_realloc(s, c, BKEY_U64s +
- bkeyp_val_u64s(&b->format, src));
- bch2_bkey_unpack(b, s->k, src);
-}
-
-static inline void bch2_bkey_buf_init(struct bkey_buf *s)
-{
- s->k = (void *) s->onstack;
-}
-
-static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c)
-{
- if (s->k != (void *) s->onstack)
- mempool_free(s->k, &c->large_bkey_pool);
- s->k = NULL;
-}
-
-#endif /* _BCACHEFS_BKEY_BUF_H */
diff --git a/fs/bcachefs/bkey_cmp.h b/fs/bcachefs/bkey_cmp.h
deleted file mode 100644
index 5f42a6e69360..000000000000
--- a/fs/bcachefs/bkey_cmp.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_CMP_H
-#define _BCACHEFS_BKEY_CMP_H
-
-#include "bkey.h"
-
-#ifdef CONFIG_X86_64
-static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
- unsigned nr_key_bits)
-{
- long d0, d1, d2, d3;
- int cmp;
-
-	/* we shouldn't need asm for this, but gcc generates poor code without it: */
-
- asm(".intel_syntax noprefix;"
- "xor eax, eax;"
- "xor edx, edx;"
- "1:;"
- "mov r8, [rdi];"
- "mov r9, [rsi];"
- "sub ecx, 64;"
- "jl 2f;"
-
- "cmp r8, r9;"
- "jnz 3f;"
-
- "lea rdi, [rdi - 8];"
- "lea rsi, [rsi - 8];"
- "jmp 1b;"
-
- "2:;"
- "not ecx;"
- "shr r8, 1;"
- "shr r9, 1;"
- "shr r8, cl;"
- "shr r9, cl;"
- "cmp r8, r9;"
-
- "3:\n"
- "seta al;"
- "setb dl;"
- "sub eax, edx;"
- ".att_syntax prefix;"
- : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
- : "0" (l), "1" (r), "3" (nr_key_bits)
- : "r8", "r9", "cc", "memory");
-
- return cmp;
-}
-#else
-static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
- unsigned nr_key_bits)
-{
- u64 l_v, r_v;
-
- if (!nr_key_bits)
- return 0;
-
- /* for big endian, skip past header */
- nr_key_bits += high_bit_offset;
- l_v = *l & (~0ULL >> high_bit_offset);
- r_v = *r & (~0ULL >> high_bit_offset);
-
- while (1) {
- if (nr_key_bits < 64) {
- l_v >>= 64 - nr_key_bits;
- r_v >>= 64 - nr_key_bits;
- nr_key_bits = 0;
- } else {
- nr_key_bits -= 64;
- }
-
- if (!nr_key_bits || l_v != r_v)
- break;
-
- l = next_word(l);
- r = next_word(r);
-
- l_v = *l;
- r_v = *r;
- }
-
- return cmp_int(l_v, r_v);
-}
-#endif
-
-static inline __pure __flatten
-int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l,
- const struct bkey_packed *r,
- const struct btree *b)
-{
- const struct bkey_format *f = &b->format;
- int ret;
-
- EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
- EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
-
- ret = __bkey_cmp_bits(high_word(f, l),
- high_word(f, r),
- b->nr_key_bits);
-
- EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l),
- bkey_unpack_pos(b, r)));
- return ret;
-}
-
-static inline __pure __flatten
-int bch2_bkey_cmp_packed_inlined(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r)
-{
- struct bkey unpacked;
-
- if (likely(bkey_packed(l) && bkey_packed(r)))
- return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
-
- if (bkey_packed(l)) {
- __bkey_unpack_key_format_checked(b, &unpacked, l);
- l = (void *) &unpacked;
- } else if (bkey_packed(r)) {
- __bkey_unpack_key_format_checked(b, &unpacked, r);
- r = (void *) &unpacked;
- }
-
- return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
-}
-
-#endif /* _BCACHEFS_BKEY_CMP_H */
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
deleted file mode 100644
index 15c93576b5c2..000000000000
--- a/fs/bcachefs/bkey_methods.c
+++ /dev/null
@@ -1,481 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "backpointers.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_types.h"
-#include "alloc_background.h"
-#include "dirent.h"
-#include "disk_accounting.h"
-#include "ec.h"
-#include "error.h"
-#include "extents.h"
-#include "inode.h"
-#include "io_misc.h"
-#include "lru.h"
-#include "quota.h"
-#include "reflink.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "xattr.h"
-
-const char * const bch2_bkey_types[] = {
-#define x(name, nr) #name,
- BCH_BKEY_TYPES()
-#undef x
- NULL
-};
-
-static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- return 0;
-}
-
-#define bch2_bkey_ops_deleted ((struct bkey_ops) { \
- .key_validate = deleted_key_validate, \
-})
-
-#define bch2_bkey_ops_whiteout ((struct bkey_ops) { \
- .key_validate = deleted_key_validate, \
-})
-
-static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(bkey_val_bytes(k.k),
- c, bkey_val_size_nonzero,
- "incorrect value size (%zu != 0)",
- bkey_val_bytes(k.k));
-fsck_err:
- return ret;
-}
-
-#define bch2_bkey_ops_error ((struct bkey_ops) { \
- .key_validate = empty_val_key_validate, \
-})
-
-static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- return 0;
-}
-
-static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_cookie ck = bkey_s_c_to_cookie(k);
-
- prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie));
-}
-
-#define bch2_bkey_ops_cookie ((struct bkey_ops) { \
- .key_validate = key_type_cookie_validate, \
- .val_to_text = key_type_cookie_to_text, \
- .min_val_size = 8, \
-})
-
-#define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\
- .key_validate = empty_val_key_validate, \
-})
-
-static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- return 0;
-}
-
-static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
- unsigned datalen = bkey_inline_data_bytes(k.k);
-
- prt_printf(out, "datalen %u: %*phN",
- datalen, min(datalen, 32U), d.v->data);
-}
-
-#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \
- .key_validate = key_type_inline_data_validate, \
- .val_to_text = key_type_inline_data_to_text, \
-})
-
-static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
-{
- bch2_key_resize(l.k, l.k->size + r.k->size);
- return true;
-}
-
-#define bch2_bkey_ops_set ((struct bkey_ops) { \
- .key_validate = empty_val_key_validate, \
- .key_merge = key_type_set_merge, \
-})
-
-const struct bkey_ops bch2_bkey_ops[] = {
-#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
- BCH_BKEY_TYPES()
-#undef x
-};
-
-const struct bkey_ops bch2_bkey_null_ops = {
-};
-
-int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
- return 0;
-
- const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
- int ret = 0;
-
- bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size,
- c, bkey_val_size_too_small,
- "bad val size (%zu < %u)",
- bkey_val_bytes(k.k), ops->min_val_size);
-
- if (!ops->key_validate)
- return 0;
-
- ret = ops->key_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-static u64 bch2_key_types_allowed[] = {
- [BKEY_TYPE_btree] =
- BIT_ULL(KEY_TYPE_deleted)|
- BIT_ULL(KEY_TYPE_btree_ptr)|
- BIT_ULL(KEY_TYPE_btree_ptr_v2),
-#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys,
- BCH_BTREE_IDS()
-#undef x
-};
-
-const char *bch2_btree_node_type_str(enum btree_node_type type)
-{
- return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1);
-}
-
-int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- enum btree_node_type type = __btree_node_type(from.level, from.btree);
-
- if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
- return 0;
-
- int ret = 0;
-
- bkey_fsck_err_on(k.k->u64s < BKEY_U64s,
- c, bkey_u64s_too_small,
- "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
-
- if (type >= BKEY_TYPE_NR)
- return 0;
-
- bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX &&
- (type == BKEY_TYPE_btree || (from.flags & BCH_VALIDATE_commit)) &&
- !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)),
- c, bkey_invalid_type_for_btree,
- "invalid key type for btree %s (%s)",
- bch2_btree_node_type_str(type),
- k.k->type < KEY_TYPE_MAX
- ? bch2_bkey_types[k.k->type]
- : "(unknown)");
-
- if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
- bkey_fsck_err_on(k.k->size == 0,
- c, bkey_extent_size_zero,
- "size == 0");
-
- bkey_fsck_err_on(k.k->size > k.k->p.offset,
- c, bkey_extent_size_greater_than_offset,
- "size greater than offset (%u > %llu)",
- k.k->size, k.k->p.offset);
- } else {
- bkey_fsck_err_on(k.k->size,
- c, bkey_size_nonzero,
- "size != 0");
- }
-
- if (type != BKEY_TYPE_btree) {
- enum btree_id btree = type - 1;
-
- if (btree_type_has_snapshots(btree)) {
- bkey_fsck_err_on(!k.k->p.snapshot,
- c, bkey_snapshot_zero,
- "snapshot == 0");
- } else if (!btree_type_has_snapshot_field(btree)) {
- bkey_fsck_err_on(k.k->p.snapshot,
- c, bkey_snapshot_nonzero,
- "nonzero snapshot");
- } else {
- /*
- * btree uses snapshot field but it's not required to be
- * nonzero
- */
- }
-
- bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX),
- c, bkey_at_pos_max,
- "key at POS_MAX");
- }
-fsck_err:
- return ret;
-}
-
-int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- return __bch2_bkey_validate(c, k, from) ?:
- bch2_bkey_val_validate(c, k, from);
-}
-
-int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key),
- c, bkey_before_start_of_btree_node,
- "key before start of btree node");
-
- bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key),
- c, bkey_after_end_of_btree_node,
- "key past end of btree node");
-fsck_err:
- return ret;
-}
-
-void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
-{
- if (bpos_eq(pos, POS_MIN))
- prt_printf(out, "POS_MIN");
- else if (bpos_eq(pos, POS_MAX))
- prt_printf(out, "POS_MAX");
- else if (bpos_eq(pos, SPOS_MAX))
- prt_printf(out, "SPOS_MAX");
- else {
- if (pos.inode == U64_MAX)
- prt_printf(out, "U64_MAX");
- else
- prt_printf(out, "%llu", pos.inode);
- prt_printf(out, ":");
- if (pos.offset == U64_MAX)
- prt_printf(out, "U64_MAX");
- else
- prt_printf(out, "%llu", pos.offset);
- prt_printf(out, ":");
- if (pos.snapshot == U32_MAX)
- prt_printf(out, "U32_MAX");
- else
- prt_printf(out, "%u", pos.snapshot);
- }
-}
-
-void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
-{
- if (k) {
- prt_printf(out, "u64s %u type ", k->u64s);
-
- if (k->type < KEY_TYPE_MAX)
- prt_printf(out, "%s ", bch2_bkey_types[k->type]);
- else
- prt_printf(out, "%u ", k->type);
-
- bch2_bpos_to_text(out, k->p);
-
- prt_printf(out, " len %u ver %llu", k->size, k->bversion.lo);
- } else {
- prt_printf(out, "(null)");
- }
-}
-
-void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
-
- if (likely(ops->val_to_text))
- ops->val_to_text(out, c, k);
-}
-
-void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- bch2_bkey_to_text(out, k.k);
-
- if (bkey_val_bytes(k.k)) {
- prt_printf(out, ": ");
- bch2_val_to_text(out, c, k);
- }
-}
-
-void bch2_bkey_swab_val(struct bkey_s k)
-{
- const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
-
- if (ops->swab)
- ops->swab(k);
-}
-
-bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k)
-{
- const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
-
- return ops->key_normalize
- ? ops->key_normalize(c, k)
- : false;
-}
-
-bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
-{
- const struct bkey_ops *ops = bch2_bkey_type_ops(l.k->type);
-
- return ops->key_merge &&
- bch2_bkey_maybe_mergable(l.k, r.k) &&
- (u64) l.k->size + r.k->size <= KEY_SIZE_MAX &&
- !bch2_key_merging_disabled &&
- ops->key_merge(c, l, r);
-}
-
-static const struct old_bkey_type {
- u8 btree_node_type;
- u8 old;
- u8 new;
-} bkey_renumber_table[] = {
- {BKEY_TYPE_btree, 128, KEY_TYPE_btree_ptr },
- {BKEY_TYPE_extents, 128, KEY_TYPE_extent },
- {BKEY_TYPE_extents, 129, KEY_TYPE_extent },
- {BKEY_TYPE_extents, 130, KEY_TYPE_reservation },
- {BKEY_TYPE_inodes, 128, KEY_TYPE_inode },
- {BKEY_TYPE_inodes, 130, KEY_TYPE_inode_generation },
- {BKEY_TYPE_dirents, 128, KEY_TYPE_dirent },
- {BKEY_TYPE_dirents, 129, KEY_TYPE_hash_whiteout },
- {BKEY_TYPE_xattrs, 128, KEY_TYPE_xattr },
- {BKEY_TYPE_xattrs, 129, KEY_TYPE_hash_whiteout },
- {BKEY_TYPE_alloc, 128, KEY_TYPE_alloc },
- {BKEY_TYPE_quotas, 128, KEY_TYPE_quota },
-};
-
-void bch2_bkey_renumber(enum btree_node_type btree_node_type,
- struct bkey_packed *k,
- int write)
-{
- const struct old_bkey_type *i;
-
- for (i = bkey_renumber_table;
- i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table);
- i++)
- if (btree_node_type == i->btree_node_type &&
- k->type == (write ? i->new : i->old)) {
- k->type = write ? i->old : i->new;
- break;
- }
-}
-
-void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
- unsigned version, unsigned big_endian,
- int write,
- struct bkey_format *f,
- struct bkey_packed *k)
-{
- const struct bkey_ops *ops;
- struct bkey uk;
- unsigned nr_compat = 5;
- int i;
-
- /*
- * Do these operations in reverse order in the write path:
- */
-
- for (i = 0; i < nr_compat; i++)
- switch (!write ? i : nr_compat - 1 - i) {
- case 0:
- if (big_endian != CPU_BIG_ENDIAN) {
- bch2_bkey_swab_key(f, k);
- } else if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
- bch2_bkey_swab_key(f, k);
- bch2_bkey_swab_key(f, k);
- }
- break;
- case 1:
- if (version < bcachefs_metadata_version_bkey_renumber)
- bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write);
- break;
- case 2:
- if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id == BTREE_ID_inodes) {
- if (!bkey_packed(k)) {
- struct bkey_i *u = packed_to_bkey(k);
-
- swap(u->k.p.inode, u->k.p.offset);
- } else if (f->bits_per_field[BKEY_FIELD_INODE] &&
- f->bits_per_field[BKEY_FIELD_OFFSET]) {
- struct bkey_format tmp = *f, *in = f, *out = &tmp;
-
- swap(tmp.bits_per_field[BKEY_FIELD_INODE],
- tmp.bits_per_field[BKEY_FIELD_OFFSET]);
- swap(tmp.field_offset[BKEY_FIELD_INODE],
- tmp.field_offset[BKEY_FIELD_OFFSET]);
-
- if (!write)
- swap(in, out);
-
- uk = __bch2_bkey_unpack_key(in, k);
- swap(uk.p.inode, uk.p.offset);
- BUG_ON(!bch2_bkey_pack_key(k, &uk, out));
- }
- }
- break;
- case 3:
- if (version < bcachefs_metadata_version_snapshot &&
- (level || btree_type_has_snapshots(btree_id))) {
- struct bkey_i *u = packed_to_bkey(k);
-
- if (u) {
- u->k.p.snapshot = write
- ? 0 : U32_MAX;
- } else {
- u64 min_packed = le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]);
- u64 max_packed = min_packed +
- ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
-
- uk = __bch2_bkey_unpack_key(f, k);
- uk.p.snapshot = write
- ? min_packed : min_t(u64, U32_MAX, max_packed);
-
- BUG_ON(!bch2_bkey_pack_key(k, &uk, f));
- }
- }
-
- break;
- case 4: {
- struct bkey_s u;
-
- if (!bkey_packed(k)) {
- u = bkey_i_to_s(packed_to_bkey(k));
- } else {
- uk = __bch2_bkey_unpack_key(f, k);
- u.k = &uk;
- u.v = bkeyp_val(f, k);
- }
-
- if (big_endian != CPU_BIG_ENDIAN)
- bch2_bkey_swab_val(u);
-
- ops = bch2_bkey_type_ops(k->type);
-
- if (ops->compat)
- ops->compat(btree_id, version, big_endian, write, u);
- break;
- }
- default:
- BUG();
- }
-}
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
deleted file mode 100644
index bf34111cdf00..000000000000
--- a/fs/bcachefs/bkey_methods.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_METHODS_H
-#define _BCACHEFS_BKEY_METHODS_H
-
-#include "bkey.h"
-
-struct bch_fs;
-struct btree;
-struct btree_trans;
-struct bkey;
-enum btree_node_type;
-
-extern const char * const bch2_bkey_types[];
-extern const struct bkey_ops bch2_bkey_null_ops;
-
-/*
- * key_validate: checks validity of @k; returns 0 if good, or a negative
- * error code if bad, in which case the entire key will be dropped.
- *
- * Errors are reported via fsck_err(); @from describes the context the key is
- * being read in, so more aggressive checks can be enabled for some contexts.
- */
-struct bkey_ops {
- int (*key_validate)(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from);
- void (*val_to_text)(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
- void (*swab)(struct bkey_s);
- bool (*key_normalize)(struct bch_fs *, struct bkey_s);
- bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
- int (*trigger)(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
- void (*compat)(enum btree_id id, unsigned version,
- unsigned big_endian, int write,
- struct bkey_s);
-
- /* Size of value type when first created: */
- unsigned min_val_size;
-};
-
-extern const struct bkey_ops bch2_bkey_ops[];
-
-static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type)
-{
- return likely(type < KEY_TYPE_MAX)
- ? &bch2_bkey_ops[type]
- : &bch2_bkey_null_ops;
-}
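-
-/*
- * Illustrative note (an addition, not from the original header):
- * out-of-range types resolve to bch2_bkey_null_ops, so callers can
- * dispatch without bounds-checking first:
- *
- *	const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
- *
- *	if (ops->val_to_text)
- *		ops->val_to_text(out, c, k);
- */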
-
-int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c,
- struct bkey_validate_context from);
-
-void bch2_bpos_to_text(struct printbuf *, struct bpos);
-void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
-void bch2_val_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
-void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
-
-void bch2_bkey_swab_val(struct bkey_s);
-
-bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
-
-static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r)
-{
- return l->type == r->type &&
- !bversion_cmp(l->bversion, r->bversion) &&
- bpos_eq(l->p, bkey_start_pos(r));
-}
-
-bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-
-static inline int bch2_key_trigger(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type);
-
- return ops->trigger
- ? ops->trigger(trans, btree, level, old, new, flags)
- : 0;
-}
-
-static inline int bch2_key_trigger_old(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bkey_i deleted;
-
- bkey_init(&deleted.k);
- deleted.k.p = old.k->p;
-
- return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted),
- BTREE_TRIGGER_overwrite|flags);
-}
-
-static inline int bch2_key_trigger_new(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bkey_i deleted;
-
- bkey_init(&deleted.k);
- deleted.k.p = new.k->p;
-
- return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
- BTREE_TRIGGER_insert|flags);
-}
-
-void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
-
-void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned,
- int, struct bkey_format *, struct bkey_packed *);
-
-static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id,
- unsigned version, unsigned big_endian,
- int write,
- struct bkey_format *f,
- struct bkey_packed *k)
-{
- if (version < bcachefs_metadata_version_current ||
- big_endian != CPU_BIG_ENDIAN ||
- IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
- __bch2_bkey_compat(level, btree_id, version,
- big_endian, write, f, k);
-
-}
-
-#endif /* _BCACHEFS_BKEY_METHODS_H */
diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c
deleted file mode 100644
index 4536eb50fc40..000000000000
--- a/fs/bcachefs/bkey_sort.c
+++ /dev/null
@@ -1,214 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "bkey_cmp.h"
-#include "bkey_sort.h"
-#include "bset.h"
-#include "extents.h"
-
-typedef int (*sort_cmp_fn)(const struct btree *,
- const struct bkey_packed *,
- const struct bkey_packed *);
-
-static inline bool sort_iter_end(struct sort_iter *iter)
-{
- return !iter->used;
-}
-
-static inline void sort_iter_sift(struct sort_iter *iter, unsigned from,
- sort_cmp_fn cmp)
-{
- unsigned i;
-
- for (i = from;
- i + 1 < iter->used &&
- cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
- i++)
- swap(iter->data[i], iter->data[i + 1]);
-}
-
-static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
-{
- unsigned i = iter->used;
-
- while (i--)
- sort_iter_sift(iter, i, cmp);
-}
-
-static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
-{
- return !sort_iter_end(iter) ? iter->data->k : NULL;
-}
-
-static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
-{
- struct sort_iter_set *i = iter->data;
-
- BUG_ON(!iter->used);
-
- i->k = bkey_p_next(i->k);
-
- BUG_ON(i->k > i->end);
-
- if (i->k == i->end)
- array_remove_item(iter->data, iter->used, 0);
- else
- sort_iter_sift(iter, 0, cmp);
-}
-
-static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
- sort_cmp_fn cmp)
-{
- struct bkey_packed *ret = sort_iter_peek(iter);
-
- if (ret)
- sort_iter_advance(iter, cmp);
-
- return ret;
-}
-
-/*
- * If keys compare equal, compare by pointer order:
- */
-static inline int key_sort_fix_overlapping_cmp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r)
-{
- return bch2_bkey_cmp_packed(b, l, r) ?:
- cmp_int((unsigned long) l, (unsigned long) r);
-}
-
-static inline bool should_drop_next_key(struct sort_iter *iter)
-{
- /*
-	 * key_sort_fix_overlapping_cmp() ensures that when keys compare equal
-	 * the older key comes first; so if l->k compares equal to r->k then
-	 * l->k is older and should be dropped.
- */
- return iter->used >= 2 &&
- !bch2_bkey_cmp_packed(iter->b,
- iter->data[0].k,
- iter->data[1].k);
-}
-
-struct btree_nr_keys
-bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
- struct sort_iter *iter)
-{
- struct bkey_packed *out = dst->start;
- struct bkey_packed *k;
- struct btree_nr_keys nr;
-
- memset(&nr, 0, sizeof(nr));
-
- sort_iter_sort(iter, key_sort_fix_overlapping_cmp);
-
- while ((k = sort_iter_peek(iter))) {
- if (!bkey_deleted(k) &&
- !should_drop_next_key(iter)) {
- bkey_p_copy(out, k);
- btree_keys_account_key_add(&nr, 0, out);
- out = bkey_p_next(out);
- }
-
- sort_iter_advance(iter, key_sort_fix_overlapping_cmp);
- }
-
- dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
- return nr;
-}
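-
-/*
- * Illustrative example (an addition, not from the original source): if
- * an older bset and a newer bset both contain a key at the same
- * position, the sort above emits only the newer version - the older
- * duplicate is dropped by should_drop_next_key().
- */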
-
-/* Sort + repack in a new format: */
-struct btree_nr_keys
-bch2_sort_repack(struct bset *dst, struct btree *src,
- struct btree_node_iter *src_iter,
- struct bkey_format *out_f,
- bool filter_whiteouts)
-{
- struct bkey_format *in_f = &src->format;
- struct bkey_packed *in, *out = vstruct_last(dst);
- struct btree_nr_keys nr;
- bool transform = memcmp(out_f, &src->format, sizeof(*out_f));
-
- memset(&nr, 0, sizeof(nr));
-
- while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
- if (filter_whiteouts && bkey_deleted(in))
- continue;
-
- if (!transform)
- bkey_p_copy(out, in);
- else if (bch2_bkey_transform(out_f, out, bkey_packed(in)
- ? in_f : &bch2_bkey_format_current, in))
- out->format = KEY_FORMAT_LOCAL_BTREE;
- else
- bch2_bkey_unpack(src, (void *) out, in);
-
- out->needs_whiteout = false;
-
- btree_keys_account_key_add(&nr, 0, out);
- out = bkey_p_next(out);
- }
-
- dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
- return nr;
-}
-
-static inline int keep_unwritten_whiteouts_cmp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r)
-{
- return bch2_bkey_cmp_packed_inlined(b, l, r) ?:
- (int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
- (long) l - (long) r;
-}
-
-#include "btree_update_interior.h"
-
-/*
- * For sorting in the btree node write path: whiteouts not in the unwritten
- * whiteouts area are dropped, whiteouts in the unwritten whiteouts area are
- * dropped if overwritten by real keys:
- */
-unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *dst, struct sort_iter *iter)
-{
- struct bkey_packed *in, *next, *out = dst;
-
- sort_iter_sort(iter, keep_unwritten_whiteouts_cmp);
-
- while ((in = sort_iter_next(iter, keep_unwritten_whiteouts_cmp))) {
- if (bkey_deleted(in) && in < unwritten_whiteouts_start(iter->b))
- continue;
-
- if ((next = sort_iter_peek(iter)) &&
- !bch2_bkey_cmp_packed_inlined(iter->b, in, next))
- continue;
-
- bkey_p_copy(out, in);
- out = bkey_p_next(out);
- }
-
- return (u64 *) out - (u64 *) dst;
-}
-
-/*
- * Main sort routine for compacting a btree node in memory: we always drop
- * whiteouts because any whiteouts that need to be written are in the unwritten
- * whiteouts area:
- */
-unsigned bch2_sort_keys(struct bkey_packed *dst, struct sort_iter *iter)
-{
- struct bkey_packed *in, *out = dst;
-
- sort_iter_sort(iter, bch2_bkey_cmp_packed_inlined);
-
- while ((in = sort_iter_next(iter, bch2_bkey_cmp_packed_inlined))) {
- if (bkey_deleted(in))
- continue;
-
- bkey_p_copy(out, in);
- out = bkey_p_next(out);
- }
-
- return (u64 *) out - (u64 *) dst;
-}
diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h
deleted file mode 100644
index 9be969d46890..000000000000
--- a/fs/bcachefs/bkey_sort.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_SORT_H
-#define _BCACHEFS_BKEY_SORT_H
-
-struct sort_iter {
- struct btree *b;
- unsigned used;
- unsigned size;
-
- struct sort_iter_set {
- struct bkey_packed *k, *end;
- } data[];
-};
-
-static inline void sort_iter_init(struct sort_iter *iter, struct btree *b, unsigned size)
-{
- iter->b = b;
- iter->used = 0;
- iter->size = size;
-}
-
-struct sort_iter_stack {
- struct sort_iter iter;
- struct sort_iter_set sets[MAX_BSETS + 1];
-};
-
-static inline void sort_iter_stack_init(struct sort_iter_stack *iter, struct btree *b)
-{
- sort_iter_init(&iter->iter, b, ARRAY_SIZE(iter->sets));
-}
-
-static inline void sort_iter_add(struct sort_iter *iter,
- struct bkey_packed *k,
- struct bkey_packed *end)
-{
- BUG_ON(iter->used >= iter->size);
-
- if (k != end)
- iter->data[iter->used++] = (struct sort_iter_set) { k, end };
-}
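-
-/*
- * Illustrative usage (a sketch, not from the original header):
- * compacting all bsets of a node into one sorted output:
- *
- *	struct sort_iter_stack stack;
- *	unsigned u64s;
- *
- *	sort_iter_stack_init(&stack, b);
- *	for_each_bset(b, t)
- *		sort_iter_add(&stack.iter,
- *			      btree_bkey_first(b, t),
- *			      btree_bkey_last(b, t));
- *	u64s = bch2_sort_keys(dst, &stack.iter);
- */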
-
-struct btree_nr_keys
-bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *,
- struct sort_iter *);
-
-struct btree_nr_keys
-bch2_sort_repack(struct bset *, struct btree *,
- struct btree_node_iter *,
- struct bkey_format *, bool);
-
-unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *, struct sort_iter *);
-unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *);
-
-#endif /* _BCACHEFS_BKEY_SORT_H */
diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h
deleted file mode 100644
index b4f328f9853c..000000000000
--- a/fs/bcachefs/bkey_types.h
+++ /dev/null
@@ -1,241 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_TYPES_H
-#define _BCACHEFS_BKEY_TYPES_H
-
-#include "bcachefs_format.h"
-
-/*
- * bkey_i - bkey with inline value
- * bkey_s - bkey with split value
- * bkey_s_c - bkey with split value, const
- */
-
-#define bkey_p_next(_k) vstruct_next(_k)
-
-static inline struct bkey_i *bkey_next(struct bkey_i *k)
-{
- return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
-}
-
-#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
-
-static inline size_t bkey_val_bytes(const struct bkey *k)
-{
- return bkey_val_u64s(k) * sizeof(u64);
-}
-
-static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
-{
- unsigned u64s = BKEY_U64s + val_u64s;
-
- BUG_ON(u64s > U8_MAX);
- k->u64s = u64s;
-}
-
-static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
-{
- set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
-}
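-
-/*
- * Illustrative note (an addition, not from the original header): value
- * sizes are stored in units of u64s, so set_bkey_val_bytes(k, 10)
- * rounds up to 2 u64s and a subsequent bkey_val_bytes(k) reports 16.
- */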
-
-#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
-
-#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
-
-#define bkey_whiteout(_k) \
- ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
-
-/* bkey with split value, const */
-struct bkey_s_c {
- const struct bkey *k;
- const struct bch_val *v;
-};
-
-/* bkey with split value */
-struct bkey_s {
- union {
- struct {
- struct bkey *k;
- struct bch_val *v;
- };
- struct bkey_s_c s_c;
- };
-};
-
-#define bkey_s_null ((struct bkey_s) { .k = NULL })
-#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
-
-#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
-#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
-
-static inline struct bkey_s bkey_to_s(struct bkey *k)
-{
- return (struct bkey_s) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
-{
- return (struct bkey_s_c) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
-{
- return (struct bkey_s) { .k = &k->k, .v = &k->v };
-}
-
-static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
-{
- return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
-}
-
-/*
- * For a given type of value (e.g. struct bch_extent), generates the types for
- * bkey + bch_extent - inline, split, split const - and also all the conversion
- * functions, which also check that the value is of the correct type.
- *
- * We use anonymous unions for upcasting - e.g. converting from a
- * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
- * functions.
- */
-#define x(name, ...) \
-struct bkey_i_##name { \
- union { \
- struct bkey k; \
- struct bkey_i k_i; \
- }; \
- struct bch_##name v; \
-}; \
- \
-struct bkey_s_c_##name { \
- union { \
- struct { \
- const struct bkey *k; \
- const struct bch_##name *v; \
- }; \
- struct bkey_s_c s_c; \
- }; \
-}; \
- \
-struct bkey_s_##name { \
- union { \
- struct { \
- struct bkey *k; \
- struct bch_##name *v; \
- }; \
- struct bkey_s_c_##name c; \
- struct bkey_s s; \
- struct bkey_s_c s_c; \
- }; \
-}; \
- \
-static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return container_of(&k->k, struct bkey_i_##name, k); \
-} \
- \
-static inline const struct bkey_i_##name * \
-bkey_i_to_##name##_c(const struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return container_of(&k->k, struct bkey_i_##name, k); \
-} \
- \
-static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
- return (struct bkey_s_##name) { \
- .k = k.k, \
- .v = container_of(k.v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
- return (struct bkey_s_c_##name) { \
- .k = k.k, \
- .v = container_of(k.v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
-{ \
- return (struct bkey_s_##name) { \
- .k = &k->k, \
- .v = &k->v, \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name \
-name##_i_to_s_c(const struct bkey_i_##name *k) \
-{ \
- return (struct bkey_s_c_##name) { \
- .k = &k->k, \
- .v = &k->v, \
- }; \
-} \
- \
-static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return (struct bkey_s_##name) { \
- .k = &k->k, \
- .v = container_of(&k->v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name \
-bkey_i_to_s_c_##name(const struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return (struct bkey_s_c_##name) { \
- .k = &k->k, \
- .v = container_of(&k->v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
-{ \
- struct bkey_i_##name *k = \
- container_of(&_k->k, struct bkey_i_##name, k); \
- \
- bkey_init(&k->k); \
- memset(&k->v, 0, sizeof(k->v)); \
- k->k.type = KEY_TYPE_##name; \
- set_bkey_val_bytes(&k->k, sizeof(k->v)); \
- \
- return k; \
-}
-
-BCH_BKEY_TYPES();
-#undef x
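-
-/*
- * For example (illustrative, assuming KEY_TYPE_cookie is among
- * BCH_BKEY_TYPES()), the expansion above generates struct bkey_i_cookie,
- * struct bkey_s_c_cookie, and helpers used like so:
- *
- *	struct bkey_i_cookie k;
- *
- *	bkey_cookie_init(&k.k_i);
- *	k.v.cookie = cpu_to_le64(42);
- *	// bkey_i_to_s_c(&k.k_i) upcasts for generic code
- */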
-
-enum bch_validate_flags {
- BCH_VALIDATE_write = BIT(0),
- BCH_VALIDATE_commit = BIT(1),
- BCH_VALIDATE_silent = BIT(2),
-};
-
-#define BKEY_VALIDATE_CONTEXTS() \
- x(unknown) \
- x(superblock) \
- x(journal) \
- x(btree_root) \
- x(btree_node) \
- x(commit)
-
-struct bkey_validate_context {
- enum {
-#define x(n) BKEY_VALIDATE_##n,
- BKEY_VALIDATE_CONTEXTS()
-#undef x
- } from:8;
- enum bch_validate_flags flags:8;
- u8 level;
- enum btree_id btree;
- bool root:1;
- unsigned journal_offset;
- u64 journal_seq;
-};
-
-#endif /* _BCACHEFS_BKEY_TYPES_H */
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
deleted file mode 100644
index 9a4a83d6fd2d..000000000000
--- a/fs/bcachefs/bset.c
+++ /dev/null
@@ -1,1570 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Code for working with individual keys, and sorted sets of keys within a
- * btree node
- *
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "bset.h"
-#include "eytzinger.h"
-#include "trace.h"
-#include "util.h"
-
-#include <linux/unaligned.h>
-#include <linux/console.h>
-#include <linux/random.h>
-#include <linux/prefetch.h>
-
-static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
- struct btree *);
-
-static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
-{
- unsigned n = ARRAY_SIZE(iter->data);
-
- while (n && __btree_node_iter_set_end(iter, n - 1))
- --n;
-
- return n;
-}
-
-struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
-{
- return bch2_bkey_to_bset_inlined(b, k);
-}
-
-/*
- * There are never duplicate live keys in the btree - but including keys that
- * have been flagged as deleted (and will be cleaned up later) we _will_ see
- * duplicates.
- *
- * Thus the sort order is: usual key comparison first, but for keys that compare
- * equal the deleted key(s) come first, and the (at most one) live version comes
- * last.
- *
- * The main reason for this is insertion: to handle overwrites, we first iterate
- * over keys that compare equal to our insert key, and then insert immediately
- * prior to the first key greater than the key we're inserting - our insert
- * position will be after all keys that compare equal to our insert key, which
- * by the time we actually do the insert will all be deleted.
- */
-
-void bch2_dump_bset(struct bch_fs *c, struct btree *b,
- struct bset *i, unsigned set)
-{
- struct bkey_packed *_k, *_n;
- struct bkey uk, n;
- struct bkey_s_c k;
- struct printbuf buf = PRINTBUF;
-
- if (!i->u64s)
- return;
-
- for (_k = i->start;
- _k < vstruct_last(i);
- _k = _n) {
- _n = bkey_p_next(_k);
-
- if (!_k->u64s) {
- printk(KERN_ERR "block %u key %5zu - u64s 0? aieee!\n", set,
- _k->_data - i->_data);
- break;
- }
-
- k = bkey_disassemble(b, _k, &uk);
-
- printbuf_reset(&buf);
- if (c)
- bch2_bkey_val_to_text(&buf, c, k);
- else
- bch2_bkey_to_text(&buf, k.k);
- printk(KERN_ERR "block %u key %5zu: %s\n", set,
- _k->_data - i->_data, buf.buf);
-
- if (_n == vstruct_last(i))
- continue;
-
- n = bkey_unpack_key(b, _n);
-
- if (bpos_lt(n.p, k.k->p)) {
- printk(KERN_ERR "Key skipped backwards\n");
- continue;
- }
-
- if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p))
- printk(KERN_ERR "Duplicate keys\n");
- }
-
- printbuf_exit(&buf);
-}
-
-void bch2_dump_btree_node(struct bch_fs *c, struct btree *b)
-{
- console_lock();
- for_each_bset(b, t)
- bch2_dump_bset(c, b, bset(b, t), t - b->set);
- console_unlock();
-}
-
-void bch2_dump_btree_node_iter(struct btree *b,
- struct btree_node_iter *iter)
-{
- struct btree_node_iter_set *set;
- struct printbuf buf = PRINTBUF;
-
- printk(KERN_ERR "btree node iter with %u/%u sets:\n",
- __btree_node_iter_used(iter), b->nsets);
-
- btree_node_iter_for_each(iter, set) {
- struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
- struct bset_tree *t = bch2_bkey_to_bset(b, k);
- struct bkey uk = bkey_unpack_key(b, k);
-
- printbuf_reset(&buf);
- bch2_bkey_to_text(&buf, &uk);
- printk(KERN_ERR "set %zu key %u: %s\n",
- t - b->set, set->k, buf.buf);
- }
-
- printbuf_exit(&buf);
-}
-
-struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b)
-{
- struct bkey_packed *k;
- struct btree_nr_keys nr = {};
-
- for_each_bset(b, t)
- bset_tree_for_each_key(b, t, k)
- if (!bkey_deleted(k))
- btree_keys_account_key_add(&nr, t - b->set, k);
- return nr;
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-void __bch2_verify_btree_nr_keys(struct btree *b)
-{
- struct btree_nr_keys nr = bch2_btree_node_count_keys(b);
-
- BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
-}
-
-static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
- struct btree *b)
-{
- struct btree_node_iter iter = *_iter;
- const struct bkey_packed *k, *n;
-
- k = bch2_btree_node_iter_peek_all(&iter, b);
- __bch2_btree_node_iter_advance(&iter, b);
- n = bch2_btree_node_iter_peek_all(&iter, b);
-
- bkey_unpack_key(b, k);
-
- if (n &&
- bkey_iter_cmp(b, k, n) > 0) {
- struct btree_node_iter_set *set;
- struct bkey ku = bkey_unpack_key(b, k);
- struct bkey nu = bkey_unpack_key(b, n);
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
-
- bch2_dump_btree_node(NULL, b);
- bch2_bkey_to_text(&buf1, &ku);
- bch2_bkey_to_text(&buf2, &nu);
- printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
- buf1.buf, buf2.buf);
- printk(KERN_ERR "iter was:");
-
- btree_node_iter_for_each(_iter, set) {
- struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k);
- struct bset_tree *t = bch2_bkey_to_bset(b, k2);
- printk(" [%zi %zi]", t - b->set,
- k2->_data - bset(b, t)->_data);
- }
- panic("\n");
- }
-}
-
-void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
- struct btree *b)
-{
- struct btree_node_iter_set *set, *s2;
- struct bkey_packed *k, *p;
-
- if (bch2_btree_node_iter_end(iter))
- return;
-
- /* Verify no duplicates: */
- btree_node_iter_for_each(iter, set) {
- BUG_ON(set->k > set->end);
- btree_node_iter_for_each(iter, s2)
- BUG_ON(set != s2 && set->end == s2->end);
- }
-
- /* Verify that set->end is correct: */
- btree_node_iter_for_each(iter, set) {
- for_each_bset(b, t)
- if (set->end == t->end_offset) {
- BUG_ON(set->k < btree_bkey_first_offset(t) ||
- set->k >= t->end_offset);
- goto found;
- }
- BUG();
-found:
-		do {} while (0); /* a label must be followed by a statement */
- }
-
- /* Verify iterator is sorted: */
- btree_node_iter_for_each(iter, set)
- BUG_ON(set != iter->data &&
- btree_node_iter_cmp(b, set[-1], set[0]) > 0);
-
- k = bch2_btree_node_iter_peek_all(iter, b);
-
- for_each_bset(b, t) {
- if (iter->data[0].end == t->end_offset)
- continue;
-
- p = bch2_bkey_prev_all(b, t,
- bch2_btree_node_iter_bset_pos(iter, b, t));
-
- BUG_ON(p && bkey_iter_cmp(b, k, p) < 0);
- }
-}
-
-void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
- struct bkey_packed *insert, unsigned clobber_u64s)
-{
- struct bset_tree *t = bch2_bkey_to_bset(b, where);
- struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
- struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s);
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
-#if 0
- BUG_ON(prev &&
- bkey_iter_cmp(b, prev, insert) > 0);
-#else
- if (prev &&
- bkey_iter_cmp(b, prev, insert) > 0) {
- struct bkey k1 = bkey_unpack_key(b, prev);
- struct bkey k2 = bkey_unpack_key(b, insert);
-
- bch2_dump_btree_node(NULL, b);
- bch2_bkey_to_text(&buf1, &k1);
- bch2_bkey_to_text(&buf2, &k2);
-
- panic("prev > insert:\n"
- "prev key %s\n"
- "insert key %s\n",
- buf1.buf, buf2.buf);
- }
-#endif
-#if 0
- BUG_ON(next != btree_bkey_last(b, t) &&
- bkey_iter_cmp(b, insert, next) > 0);
-#else
- if (next != btree_bkey_last(b, t) &&
- bkey_iter_cmp(b, insert, next) > 0) {
- struct bkey k1 = bkey_unpack_key(b, insert);
- struct bkey k2 = bkey_unpack_key(b, next);
-
- bch2_dump_btree_node(NULL, b);
- bch2_bkey_to_text(&buf1, &k1);
- bch2_bkey_to_text(&buf2, &k2);
-
- panic("insert > next:\n"
- "insert key %s\n"
- "next key %s\n",
- buf1.buf, buf2.buf);
- }
-#endif
-}
-
-#else
-
-static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
- struct btree *b) {}
-
-#endif
-
-/* Auxiliary search trees */
-
-#define BFLOAT_FAILED_UNPACKED U8_MAX
-#define BFLOAT_FAILED U8_MAX
-
-struct bkey_float {
- u8 exponent;
- u8 key_offset;
- u16 mantissa;
-};
-#define BKEY_MANTISSA_BITS 16
-
-struct ro_aux_tree {
- u8 nothing[0];
- struct bkey_float f[];
-};
-
-struct rw_aux_tree {
- u16 offset;
- struct bpos k;
-};
-
-static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
-{
- BUG_ON(t->aux_data_offset == U16_MAX);
-
- switch (bset_aux_tree_type(t)) {
- case BSET_NO_AUX_TREE:
- return t->aux_data_offset;
- case BSET_RO_AUX_TREE:
- return t->aux_data_offset +
- DIV_ROUND_UP(t->size * sizeof(struct bkey_float), 8);
- case BSET_RW_AUX_TREE:
- return t->aux_data_offset +
- DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
- default:
- BUG();
- }
-}
-
-static unsigned bset_aux_tree_buf_start(const struct btree *b,
- const struct bset_tree *t)
-{
- return t == b->set
- ? DIV_ROUND_UP(b->unpack_fn_len, 8)
- : bset_aux_tree_buf_end(t - 1);
-}
-
-static void *__aux_tree_base(const struct btree *b,
- const struct bset_tree *t)
-{
- return b->aux_data + t->aux_data_offset * 8;
-}
-
-static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
- const struct bset_tree *t)
-{
- EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
-
- return __aux_tree_base(b, t);
-}
-
-static struct bkey_float *bkey_float(const struct btree *b,
- const struct bset_tree *t,
- unsigned idx)
-{
- return ro_aux_tree_base(b, t)->f + idx;
-}
-
-static void bset_aux_tree_verify(struct btree *b)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- for_each_bset(b, t) {
- if (t->aux_data_offset == U16_MAX)
- continue;
-
- BUG_ON(t != b->set &&
- t[-1].aux_data_offset == U16_MAX);
-
- BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
- BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
- BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
- }
-#endif
-}
-
-void bch2_btree_keys_init(struct btree *b)
-{
- unsigned i;
-
- b->nsets = 0;
- memset(&b->nr, 0, sizeof(b->nr));
-
- for (i = 0; i < MAX_BSETS; i++)
- b->set[i].data_offset = U16_MAX;
-
- bch2_bset_set_no_aux_tree(b, b->set);
-}
-
-/* Binary tree stuff for auxiliary search trees */
-
-/*
- * Cacheline/offset <-> bkey pointer arithmetic:
- *
- * t->tree is a binary search tree in an array; each node corresponds to a key
- * in one cacheline in t->set (BSET_CACHELINE bytes).
- *
- * This means we don't have to store the full index of the key that a node in
- * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline,
- * and then bkey_float->key_offset gives us the offset within that cacheline,
- * in units of 8 bytes.
- *
- * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
- * make this work.
- *
- * To construct the bfloat for an arbitrary key we need to know what the key
- * immediately preceding it is: we have to check if the two keys differ in the
- * bits we're going to store in bkey_float->mantissa.
- */
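-
-/*
- * Worked example (illustrative; assumes BSET_CACHELINE == 256): a key
- * 600 bytes past the start of cacheline 0 lies in cacheline
- * 600/256 == 2, at offset (600 - 512)/8 == 11 u64s, so
- * cacheline_to_bkey(b, t, 2, 11) recovers its address.
- */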
-
-static inline void *bset_cacheline(const struct btree *b,
- const struct bset_tree *t,
- unsigned cacheline)
-{
- return (void *) round_down((unsigned long) btree_bkey_first(b, t),
- L1_CACHE_BYTES) +
- cacheline * BSET_CACHELINE;
-}
-
-static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
- const struct bset_tree *t,
- unsigned cacheline,
- unsigned offset)
-{
- return bset_cacheline(b, t, cacheline) + offset * 8;
-}
-
-static unsigned bkey_to_cacheline(const struct btree *b,
- const struct bset_tree *t,
- const struct bkey_packed *k)
-{
- return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;
-}
-
-static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
- const struct bset_tree *t,
- unsigned cacheline,
- const struct bkey_packed *k)
-{
- return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);
-}
-
-static unsigned bkey_to_cacheline_offset(const struct btree *b,
- const struct bset_tree *t,
- unsigned cacheline,
- const struct bkey_packed *k)
-{
- size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k);
-
- EBUG_ON(m > U8_MAX);
- return m;
-}
-
-static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
- const struct bset_tree *t,
- unsigned j)
-{
- return cacheline_to_bkey(b, t,
- __eytzinger1_to_inorder(j, t->size - 1, t->extra),
- bkey_float(b, t, j)->key_offset);
-}
-
-static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
- const struct bset_tree *t)
-{
- EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
-
- return __aux_tree_base(b, t);
-}
-
-/*
- * For the write set - the one we're currently inserting keys into - we don't
- * maintain a full search tree, we just keep a simple lookup table of offsets
- * (the rw aux tree).
- */
-static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
- struct bset_tree *t,
- unsigned j)
-{
- return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
-}
-
-static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
- unsigned j, struct bkey_packed *k)
-{
- EBUG_ON(k >= btree_bkey_last(b, t));
-
- rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
- .offset = __btree_node_key_to_offset(b, k),
- .k = bkey_unpack_pos(b, k),
- };
-}
-
-static void bch2_bset_verify_rw_aux_tree(struct btree *b,
- struct bset_tree *t)
-{
- struct bkey_packed *k = btree_bkey_first(b, t);
- unsigned j = 0;
-
- if (!bch2_expensive_debug_checks)
- return;
-
- BUG_ON(bset_has_ro_aux_tree(t));
-
- if (!bset_has_rw_aux_tree(t))
- return;
-
- BUG_ON(t->size < 1);
- BUG_ON(rw_aux_to_bkey(b, t, j) != k);
-
- goto start;
- while (1) {
- if (rw_aux_to_bkey(b, t, j) == k) {
- BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k,
- bkey_unpack_pos(b, k)));
-start:
- if (++j == t->size)
- break;
-
- BUG_ON(rw_aux_tree(b, t)[j].offset <=
- rw_aux_tree(b, t)[j - 1].offset);
- }
-
- k = bkey_p_next(k);
- BUG_ON(k >= btree_bkey_last(b, t));
- }
-}
-
-/* returns idx of first entry >= offset: */
-static unsigned rw_aux_tree_bsearch(struct btree *b,
- struct bset_tree *t,
- unsigned offset)
-{
- unsigned bset_offs = offset - btree_bkey_first_offset(t);
- unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t);
- unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0;
-
- EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
- EBUG_ON(!t->size);
- EBUG_ON(idx > t->size);
-
- while (idx < t->size &&
- rw_aux_tree(b, t)[idx].offset < offset)
- idx++;
-
- while (idx &&
- rw_aux_tree(b, t)[idx - 1].offset >= offset)
- idx--;
-
- EBUG_ON(idx < t->size &&
- rw_aux_tree(b, t)[idx].offset < offset);
- EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset);
- EBUG_ON(idx + 1 < t->size &&
- rw_aux_tree(b, t)[idx].offset ==
- rw_aux_tree(b, t)[idx + 1].offset);
-
- return idx;
-}
-
-static inline unsigned bkey_mantissa(const struct bkey_packed *k,
- const struct bkey_float *f)
-{
- u64 v;
-
- EBUG_ON(!bkey_packed(k));
-
- v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));
-
- /*
- * In little endian, we're shifting off low bits (and then the bits we
- * want are at the low end), in big endian we're shifting off high bits
- * (and then the bits we want are at the high end, so we shift them
- * back down):
- */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- v >>= f->exponent & 7;
-#else
- v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS;
-#endif
- return (u16) v;
-}
-
-static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
- unsigned j,
- struct bkey_packed *min_key,
- struct bkey_packed *max_key)
-{
- struct bkey_float *f = bkey_float(b, t, j);
- struct bkey_packed *m = tree_to_bkey(b, t, j);
- struct bkey_packed *l = is_power_of_2(j)
- ? min_key
- : tree_to_bkey(b, t, j >> ffs(j));
- struct bkey_packed *r = is_power_of_2(j + 1)
- ? max_key
- : tree_to_bkey(b, t, j >> (ffz(j) + 1));
- unsigned mantissa;
- int shift, exponent, high_bit;
-
- /*
- * for failed bfloats, the lookup code falls back to comparing against
- * the original key.
- */
-
- if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) ||
- !b->nr_key_bits) {
- f->exponent = BFLOAT_FAILED_UNPACKED;
- return;
- }
-
- /*
- * The greatest differing bit of l and r is the first bit we must
- * include in the bfloat mantissa we're creating in order to do
- * comparisons - that bit always becomes the high bit of
- * bfloat->mantissa, and thus the exponent we're calculating here is
- * the position of what will become the low bit in bfloat->mantissa:
- *
- * Note that this may be negative - we may be running off the low end
- * of the key: we handle this later:
- */
- high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
- min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1);
- exponent = high_bit - (BKEY_MANTISSA_BITS - 1);
-
- /*
- * Then we calculate the actual shift value, from the start of the key
- * (k->_data), to get the key bits starting at exponent:
- */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
-
- EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64);
-#else
- shift = high_bit_offset +
- b->nr_key_bits -
- exponent -
- BKEY_MANTISSA_BITS;
-
- EBUG_ON(shift < KEY_PACKED_BITS_START);
-#endif
- EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
-
- f->exponent = shift;
- mantissa = bkey_mantissa(m, f);
-
- /*
- * If we've got garbage bits, set them to all 1s - it's legal for the
- * bfloat to compare larger than the original key, but not smaller:
- */
- if (exponent < 0)
- mantissa |= ~(~0U << -exponent);
-
- f->mantissa = mantissa;
-}
-
-/* bytes remaining - only valid for last bset: */
-static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t)
-{
- bset_aux_tree_verify(b);
-
- return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
-}
-
-static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t)
-{
- return __bset_tree_capacity(b, t) / sizeof(struct bkey_float);
-}
-
-static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t)
-{
- return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
-}
-
-static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
-{
- struct bkey_packed *k;
-
- t->size = 1;
- t->extra = BSET_RW_AUX_TREE_VAL;
- rw_aux_tree(b, t)[0].offset =
- __btree_node_key_to_offset(b, btree_bkey_first(b, t));
-
- bset_tree_for_each_key(b, t, k) {
- if (t->size == bset_rw_tree_capacity(b, t))
- break;
-
- if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
- L1_CACHE_BYTES)
- rw_aux_tree_set(b, t, t->size++, k);
- }
-}
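
The build loop above records one lookup-table entry each time the walk has advanced more than an L1 cacheline past the previously indexed key. A toy sketch of that sparse-index construction over variable-length records (record sizes and the cacheline constant are illustrative):

#include <stdio.h>

#define CACHELINE 64

int main(void)
{
	unsigned sizes[] = { 24, 40, 16, 56, 32 }; /* record sizes in bytes */
	unsigned index[8], nr = 1, offset = 0, last = 0;

	index[0] = 0; /* like the kernel code, always index the first key */
	for (unsigned i = 0; i < 5; i++) {
		if (offset - last > CACHELINE) {
			index[nr++] = offset;
			last = offset;
		}
		offset += sizes[i];
	}

	for (unsigned i = 0; i < nr; i++)
		printf("index[%u] = %u\n", i, index[i]); /* 0, then 80 */
	return 0;
}
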
-
-static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
-{
- struct bkey_packed *k = btree_bkey_first(b, t);
- struct bkey_i min_key, max_key;
- unsigned cacheline = 1;
-
- t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
- bset_ro_tree_capacity(b, t));
-retry:
- if (t->size < 2) {
- t->size = 0;
- t->extra = BSET_NO_AUX_TREE_VAL;
- return;
- }
-
- t->extra = eytzinger1_extra(t->size - 1);
-
- /* First we figure out where the first key in each cacheline is */
- eytzinger1_for_each(j, t->size - 1) {
- while (bkey_to_cacheline(b, t, k) < cacheline)
- k = bkey_p_next(k);
-
- if (k >= btree_bkey_last(b, t)) {
- /* XXX: this path sucks */
- t->size--;
- goto retry;
- }
-
- bkey_float(b, t, j)->key_offset =
- bkey_to_cacheline_offset(b, t, cacheline++, k);
-
- EBUG_ON(tree_to_bkey(b, t, j) != k);
- }
-
- if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
- bkey_init(&min_key.k);
- min_key.k.p = b->data->min_key;
- }
-
- if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) {
- bkey_init(&max_key.k);
- max_key.k.p = b->data->max_key;
- }
-
- /* Then we build the tree */
- eytzinger1_for_each(j, t->size - 1)
- make_bfloat(b, t, j,
- bkey_to_packed(&min_key),
- bkey_to_packed(&max_key));
-}
-
-static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
-{
- struct bset_tree *i;
-
- for (i = b->set; i != t; i++)
- BUG_ON(bset_has_rw_aux_tree(i));
-
- bch2_bset_set_no_aux_tree(b, t);
-
- /* round up to next cacheline: */
- t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
- SMP_CACHE_BYTES / sizeof(u64));
-
- bset_aux_tree_verify(b);
-}
-
-void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
- bool writeable)
-{
- if (writeable
- ? bset_has_rw_aux_tree(t)
- : bset_has_ro_aux_tree(t))
- return;
-
- bset_alloc_tree(b, t);
-
- if (!__bset_tree_capacity(b, t))
- return;
-
- if (writeable)
- __build_rw_aux_tree(b, t);
- else
- __build_ro_aux_tree(b, t);
-
- bset_aux_tree_verify(b);
-}
-
-void bch2_bset_init_first(struct btree *b, struct bset *i)
-{
- struct bset_tree *t;
-
- BUG_ON(b->nsets);
-
- memset(i, 0, sizeof(*i));
- get_random_bytes(&i->seq, sizeof(i->seq));
- SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-
- t = &b->set[b->nsets++];
- set_btree_bset(b, t, i);
-}
-
-void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne)
-{
- struct bset *i = &bne->keys;
- struct bset_tree *t;
-
- BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b));
- BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
- BUG_ON(b->nsets >= MAX_BSETS);
-
- memset(i, 0, sizeof(*i));
- i->seq = btree_bset_first(b)->seq;
- SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-
- t = &b->set[b->nsets++];
- set_btree_bset(b, t, i);
-}
-
-/*
- * find _some_ key in the same bset as @k that precedes @k - not necessarily the
- * immediate predecessor:
- */
-static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
- struct bkey_packed *k)
-{
- struct bkey_packed *p;
- unsigned offset;
- int j;
-
- EBUG_ON(k < btree_bkey_first(b, t) ||
- k > btree_bkey_last(b, t));
-
- if (k == btree_bkey_first(b, t))
- return NULL;
-
- switch (bset_aux_tree_type(t)) {
- case BSET_NO_AUX_TREE:
- p = btree_bkey_first(b, t);
- break;
- case BSET_RO_AUX_TREE:
- j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));
-
- do {
- p = j ? tree_to_bkey(b, t,
- __inorder_to_eytzinger1(j--,
- t->size - 1, t->extra))
- : btree_bkey_first(b, t);
- } while (p >= k);
- break;
- case BSET_RW_AUX_TREE:
- offset = __btree_node_key_to_offset(b, k);
- j = rw_aux_tree_bsearch(b, t, offset);
- p = j ? rw_aux_to_bkey(b, t, j - 1)
- : btree_bkey_first(b, t);
- break;
- }
-
- return p;
-}
-
-struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
- struct bset_tree *t,
- struct bkey_packed *k,
- unsigned min_key_type)
-{
- struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
-
- while ((p = __bkey_prev(b, t, k)) && !ret) {
- for (i = p; i != k; i = bkey_p_next(i))
- if (i->type >= min_key_type)
- ret = i;
-
- k = p;
- }
-
- if (bch2_expensive_debug_checks) {
- BUG_ON(ret >= orig_k);
-
- for (i = ret
- ? bkey_p_next(ret)
- : btree_bkey_first(b, t);
- i != orig_k;
- i = bkey_p_next(i))
- BUG_ON(i->type >= min_key_type);
- }
-
- return ret;
-}
-
-/* Insert */
-
-static void rw_aux_tree_insert_entry(struct btree *b,
- struct bset_tree *t,
- unsigned idx)
-{
- EBUG_ON(!idx || idx > t->size);
- struct bkey_packed *start = rw_aux_to_bkey(b, t, idx - 1);
- struct bkey_packed *end = idx < t->size
- ? rw_aux_to_bkey(b, t, idx)
- : btree_bkey_last(b, t);
-
- if (t->size < bset_rw_tree_capacity(b, t) &&
- (void *) end - (void *) start > L1_CACHE_BYTES) {
- struct bkey_packed *k = start;
-
- while (1) {
- k = bkey_p_next(k);
- if (k == end)
- break;
-
- if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
- memmove(&rw_aux_tree(b, t)[idx + 1],
- &rw_aux_tree(b, t)[idx],
- (void *) &rw_aux_tree(b, t)[t->size] -
- (void *) &rw_aux_tree(b, t)[idx]);
- t->size++;
- rw_aux_tree_set(b, t, idx, k);
- break;
- }
- }
- }
-}
-
-static void bch2_bset_fix_lookup_table(struct btree *b,
- struct bset_tree *t,
- struct bkey_packed *_where,
- unsigned clobber_u64s,
- unsigned new_u64s)
-{
- int shift = new_u64s - clobber_u64s;
- unsigned idx, j, where = __btree_node_key_to_offset(b, _where);
-
- EBUG_ON(bset_has_ro_aux_tree(t));
-
- if (!bset_has_rw_aux_tree(t))
- return;
-
- if (where > rw_aux_tree(b, t)[t->size - 1].offset) {
- rw_aux_tree_insert_entry(b, t, t->size);
- goto verify;
- }
-
- /* returns first entry >= where */
- idx = rw_aux_tree_bsearch(b, t, where);
-
- if (rw_aux_tree(b, t)[idx].offset == where) {
- if (!idx) { /* never delete first entry */
- idx++;
- } else if (where < t->end_offset) {
- rw_aux_tree_set(b, t, idx++, _where);
- } else {
- EBUG_ON(where != t->end_offset);
- rw_aux_tree_insert_entry(b, t, --t->size);
- goto verify;
- }
- }
-
- EBUG_ON(idx < t->size && rw_aux_tree(b, t)[idx].offset <= where);
- if (idx < t->size &&
- rw_aux_tree(b, t)[idx].offset + shift ==
- rw_aux_tree(b, t)[idx - 1].offset) {
- memmove(&rw_aux_tree(b, t)[idx],
- &rw_aux_tree(b, t)[idx + 1],
- (void *) &rw_aux_tree(b, t)[t->size] -
- (void *) &rw_aux_tree(b, t)[idx + 1]);
- t->size -= 1;
- }
-
- for (j = idx; j < t->size; j++)
- rw_aux_tree(b, t)[j].offset += shift;
-
- EBUG_ON(idx < t->size &&
- rw_aux_tree(b, t)[idx].offset ==
- rw_aux_tree(b, t)[idx - 1].offset);
-
- rw_aux_tree_insert_entry(b, t, idx);
-
-verify:
- bch2_bset_verify_rw_aux_tree(b, t);
- bset_aux_tree_verify(b);
-}
-
-void bch2_bset_insert(struct btree *b,
- struct bkey_packed *where,
- struct bkey_i *insert,
- unsigned clobber_u64s)
-{
- struct bkey_format *f = &b->format;
- struct bset_tree *t = bset_tree_last(b);
- struct bkey_packed packed, *src = bkey_to_packed(insert);
-
- bch2_bset_verify_rw_aux_tree(b, t);
- bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s);
-
- if (bch2_bkey_pack_key(&packed, &insert->k, f))
- src = &packed;
-
- if (!bkey_deleted(&insert->k))
- btree_keys_account_key_add(&b->nr, t - b->set, src);
-
- if (src->u64s != clobber_u64s) {
- u64 *src_p = (u64 *) where->_data + clobber_u64s;
- u64 *dst_p = (u64 *) where->_data + src->u64s;
-
- EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
- (int) clobber_u64s - src->u64s);
-
- memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
- le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
- set_btree_bset_end(b, t);
- }
-
- memcpy_u64s_small(where, src,
- bkeyp_key_u64s(f, src));
- memcpy_u64s(bkeyp_val(f, where), &insert->v,
- bkeyp_val_u64s(f, src));
-
- if (src->u64s != clobber_u64s)
- bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
-
- bch2_verify_btree_nr_keys(b);
-}
-
-void bch2_bset_delete(struct btree *b,
- struct bkey_packed *where,
- unsigned clobber_u64s)
-{
- struct bset_tree *t = bset_tree_last(b);
- u64 *src_p = (u64 *) where->_data + clobber_u64s;
- u64 *dst_p = where->_data;
-
- bch2_bset_verify_rw_aux_tree(b, t);
-
- EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
-
- memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
- le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
- set_btree_bset_end(b, t);
-
- bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);
-}
-
-/* Lookup */
-
-__flatten
-static struct bkey_packed *bset_search_write_set(const struct btree *b,
- struct bset_tree *t,
- struct bpos *search)
-{
- unsigned l = 0, r = t->size;
-
- while (l + 1 != r) {
- unsigned m = (l + r) >> 1;
-
- if (bpos_lt(rw_aux_tree(b, t)[m].k, *search))
- l = m;
- else
- r = m;
- }
-
- return rw_aux_to_bkey(b, t, l);
-}
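
The loop above maintains the invariant that entry l compares less than the search key while entry r does not, and returns l - the last entry strictly less than the search. A minimal standalone version of the same branch style (note that, as in the kernel helper, index 0 is returned even when the target precedes every entry):

#include <stdio.h>

static unsigned search_floor(const int *arr, unsigned size, int target)
{
	unsigned l = 0, r = size;

	while (l + 1 != r) {
		unsigned m = (l + r) >> 1;

		if (arr[m] < target)
			l = m;
		else
			r = m;
	}
	return l;
}

int main(void)
{
	int arr[] = { 10, 20, 30, 40 };

	printf("%u\n", search_floor(arr, 4, 35)); /* prints 2 (arr[2] == 30) */
	return 0;
}
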
-
-static inline void prefetch_four_cachelines(void *p)
-{
-#ifdef CONFIG_X86_64
-	/*
-	 * Biasing the base register by +127 keeps all four displacements in
-	 * signed-byte range (-127..65); plain 0/64/128/192 offsets would need
-	 * 4-byte displacements for the last two, making the encoding longer:
-	 */
-	asm("prefetcht0 (-127 + 64 * 0)(%0);"
- "prefetcht0 (-127 + 64 * 1)(%0);"
- "prefetcht0 (-127 + 64 * 2)(%0);"
- "prefetcht0 (-127 + 64 * 3)(%0);"
- :
- : "r" (p + 127));
-#else
- prefetch(p + L1_CACHE_BYTES * 0);
- prefetch(p + L1_CACHE_BYTES * 1);
- prefetch(p + L1_CACHE_BYTES * 2);
- prefetch(p + L1_CACHE_BYTES * 3);
-#endif
-}
-
-static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
- const struct bkey_float *f)
-{
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits;
-
- return f->exponent > key_bits_start;
-#else
- unsigned key_bits_end = high_bit_offset + b->nr_key_bits;
-
- return f->exponent + BKEY_MANTISSA_BITS < key_bits_end;
-#endif
-}
-
-__flatten
-static struct bkey_packed *bset_search_tree(const struct btree *b,
- const struct bset_tree *t,
- const struct bpos *search,
- const struct bkey_packed *packed_search)
-{
- struct ro_aux_tree *base = ro_aux_tree_base(b, t);
- struct bkey_float *f;
- struct bkey_packed *k;
- unsigned inorder, n = 1, l, r;
- int cmp;
-
- do {
- if (likely(n << 4 < t->size))
- prefetch(&base->f[n << 4]);
-
- f = &base->f[n];
- if (unlikely(f->exponent >= BFLOAT_FAILED))
- goto slowpath;
-
- l = f->mantissa;
- r = bkey_mantissa(packed_search, f);
-
- if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f))
- goto slowpath;
-
- n = n * 2 + (l < r);
- continue;
-slowpath:
- k = tree_to_bkey(b, t, n);
- cmp = bkey_cmp_p_or_unp(b, k, packed_search, search);
- if (!cmp)
- return k;
-
- n = n * 2 + (cmp < 0);
- } while (n < t->size);
-
- inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra);
-
- /*
- * n would have been the node we recursed to - the low bit tells us if
- * we recursed left or recursed right.
- */
- if (likely(!(n & 1))) {
- --inorder;
- if (unlikely(!inorder))
- return btree_bkey_first(b, t);
-
- f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)];
- }
-
- return cacheline_to_bkey(b, t, inorder, f->key_offset);
-}
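
The descent above never follows pointers: in the eytzinger (breadth-first) array layout, node n's children live at indices 2n and 2n + 1, so each step is just arithmetic on n, and nodes a few levels down (n << 4 above) can be prefetched by index alone. A tiny sketch of the layout and descent, with made-up data:

#include <stdio.h>

int main(void)
{
	/* 1-indexed eytzinger layout of the sorted keys 1..7 */
	int tree[] = { 0, 4, 2, 6, 1, 3, 5, 7 };
	unsigned size = 8, n = 1;
	int search = 5;

	while (n < size)
		n = n * 2 + (tree[n] < search);

	/* n's low bits now encode the comparison path taken */
	printf("leaf path %u\n", n); /* prints 12 */
	return 0;
}
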
-
-static __always_inline __flatten
-struct bkey_packed *__bch2_bset_search(struct btree *b,
- struct bset_tree *t,
- struct bpos *search,
- const struct bkey_packed *lossy_packed_search)
-{
-
- /*
-	 * First we search for a cacheline, then we do a linear search within
-	 * that cacheline.
-	 *
-	 * To search for the cacheline, there are three different possibilities:
- * * The set is too small to have a search tree, so we just do a linear
- * search over the whole set.
- * * The set is the one we're currently inserting into; keeping a full
- * auxiliary search tree up to date would be too expensive, so we
- * use a much simpler lookup table to do a binary search -
- * bset_search_write_set().
- * * Or we use the auxiliary search tree we constructed earlier -
- * bset_search_tree()
- */
-
- switch (bset_aux_tree_type(t)) {
- case BSET_NO_AUX_TREE:
- return btree_bkey_first(b, t);
- case BSET_RW_AUX_TREE:
- return bset_search_write_set(b, t, search);
- case BSET_RO_AUX_TREE:
- return bset_search_tree(b, t, search, lossy_packed_search);
- default:
- BUG();
- }
-}
-
-static __always_inline __flatten
-struct bkey_packed *bch2_bset_search_linear(struct btree *b,
- struct bset_tree *t,
- struct bpos *search,
- struct bkey_packed *packed_search,
- const struct bkey_packed *lossy_packed_search,
- struct bkey_packed *m)
-{
- if (lossy_packed_search)
- while (m != btree_bkey_last(b, t) &&
- bkey_iter_cmp_p_or_unp(b, m,
- lossy_packed_search, search) < 0)
- m = bkey_p_next(m);
-
- if (!packed_search)
- while (m != btree_bkey_last(b, t) &&
- bkey_iter_pos_cmp(b, m, search) < 0)
- m = bkey_p_next(m);
-
- if (bch2_expensive_debug_checks) {
- struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
-
- BUG_ON(prev &&
- bkey_iter_cmp_p_or_unp(b, prev,
- packed_search, search) >= 0);
- }
-
- return m;
-}
-
-/* Btree node iterator */
-
-static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
- struct btree *b,
- const struct bkey_packed *k,
- const struct bkey_packed *end)
-{
- if (k != end) {
- struct btree_node_iter_set *pos;
-
- btree_node_iter_for_each(iter, pos)
- ;
-
- BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data));
- *pos = (struct btree_node_iter_set) {
- __btree_node_key_to_offset(b, k),
- __btree_node_key_to_offset(b, end)
- };
- }
-}
-
-void bch2_btree_node_iter_push(struct btree_node_iter *iter,
- struct btree *b,
- const struct bkey_packed *k,
- const struct bkey_packed *end)
-{
- __bch2_btree_node_iter_push(iter, b, k, end);
- bch2_btree_node_iter_sort(iter, b);
-}
-
-noinline __flatten __cold
-static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
- struct btree *b, struct bpos *search)
-{
- struct bkey_packed *k;
-
- trace_bkey_pack_pos_fail(search);
-
- bch2_btree_node_iter_init_from_start(iter, b);
-
- while ((k = bch2_btree_node_iter_peek(iter, b)) &&
- bkey_iter_pos_cmp(b, k, search) < 0)
- bch2_btree_node_iter_advance(iter, b);
-}
-
-/**
- * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
- * given position
- *
- * @iter: iterator to initialize
- * @b: btree node to search
- * @search: search key
- *
- * Main entry point to the lookup code for individual btree nodes:
- *
- * NOTE:
- *
- * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
- * keys. This doesn't matter for most code, but it does matter for lookups.
- *
- * Some adjacent keys with a string of equal keys:
- * i j k k k k l m
- *
- * If you search for k, the lookup code isn't guaranteed to return you any
- * specific k. The lookup code is conceptually doing a binary search, and
- * iterating backwards is very expensive, so if the pivot happens to land at
- * the last k, that's what you'll get.
- *
- * This works out ok, but it's something to be aware of:
- *
- * - For non extents, we guarantee that the live key comes last - see
- * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
- * see will only be deleted keys you don't care about.
- *
- * - For extents, deleted keys sort last (see the comment at the top of this
- * file). But when you're searching for extents, you actually want the first
- * key strictly greater than your search key - an extent that compares equal
- * to the search key is going to have 0 sectors after the search key.
- *
- * But this does mean that we can't just search for
- * bpos_successor(start_of_range) to get the first extent that overlaps with
- * the range we want - if we're unlucky and there's an extent that ends
- * exactly where we searched, then there could be a deleted key at the same
- * position and we'd get that when we search instead of the preceding extent
- * we needed.
- *
- * So we've got to search for start_of_range, then after the lookup iterate
- * past any extents that compare equal to the position we searched for.
- */
-__flatten
-void bch2_btree_node_iter_init(struct btree_node_iter *iter,
- struct btree *b, struct bpos *search)
-{
- struct bkey_packed p, *packed_search = NULL;
- struct btree_node_iter_set *pos = iter->data;
- struct bkey_packed *k[MAX_BSETS];
- unsigned i;
-
- EBUG_ON(bpos_lt(*search, b->data->min_key));
- EBUG_ON(bpos_gt(*search, b->data->max_key));
- bset_aux_tree_verify(b);
-
- memset(iter, 0, sizeof(*iter));
-
- switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) {
- case BKEY_PACK_POS_EXACT:
- packed_search = &p;
- break;
- case BKEY_PACK_POS_SMALLER:
- packed_search = NULL;
- break;
- case BKEY_PACK_POS_FAIL:
- btree_node_iter_init_pack_failed(iter, b, search);
- return;
- }
-
- for (i = 0; i < b->nsets; i++) {
- k[i] = __bch2_bset_search(b, b->set + i, search, &p);
- prefetch_four_cachelines(k[i]);
- }
-
- for (i = 0; i < b->nsets; i++) {
- struct bset_tree *t = b->set + i;
- struct bkey_packed *end = btree_bkey_last(b, t);
-
- k[i] = bch2_bset_search_linear(b, t, search,
- packed_search, &p, k[i]);
- if (k[i] != end)
- *pos++ = (struct btree_node_iter_set) {
- __btree_node_key_to_offset(b, k[i]),
- __btree_node_key_to_offset(b, end)
- };
- }
-
- bch2_btree_node_iter_sort(iter, b);
-}
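
A toy illustration of the extent-lookup rule described in the comment above - search for the start of the range, then step past any keys that compare equal to it - using a plain sorted array in place of a btree node:

#include <stdio.h>

int main(void)
{
	int keys[] = { 10, 20, 20, 30 }; /* 20 appears twice (deleted dup) */
	unsigned i = 0, nr = 4;
	int start_of_range = 20;

	/* the lookup lands on _some_ key == 20; skip all of them */
	while (i < nr && keys[i] < start_of_range)
		i++;
	while (i < nr && keys[i] == start_of_range)
		i++;

	printf("first overlapping key: %d\n", keys[i]); /* prints 30 */
	return 0;
}
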
-
-void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
- struct btree *b)
-{
- memset(iter, 0, sizeof(*iter));
-
- for_each_bset(b, t)
- __bch2_btree_node_iter_push(iter, b,
- btree_bkey_first(b, t),
- btree_bkey_last(b, t));
- bch2_btree_node_iter_sort(iter, b);
-}
-
-struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter,
- struct btree *b,
- struct bset_tree *t)
-{
- struct btree_node_iter_set *set;
-
- btree_node_iter_for_each(iter, set)
- if (set->end == t->end_offset)
- return __btree_node_offset_to_key(b, set->k);
-
- return btree_bkey_last(b, t);
-}
-
-static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
- struct btree *b,
- unsigned first)
-{
- bool ret;
-
- if ((ret = (btree_node_iter_cmp(b,
- iter->data[first],
- iter->data[first + 1]) > 0)))
- swap(iter->data[first], iter->data[first + 1]);
- return ret;
-}
-
-void bch2_btree_node_iter_sort(struct btree_node_iter *iter,
- struct btree *b)
-{
- /* unrolled bubble sort: */
-
- if (!__btree_node_iter_set_end(iter, 2)) {
- btree_node_iter_sort_two(iter, b, 0);
- btree_node_iter_sort_two(iter, b, 1);
- }
-
- if (!__btree_node_iter_set_end(iter, 1))
- btree_node_iter_sort_two(iter, b, 0);
-}
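
For at most three iterator sets, the passes above - compare-and-swap on (0,1), then (1,2), then (0,1) again - are a complete sort. A standalone sketch of the same unrolled network on plain ints:

#include <stdio.h>

static void swap_int(int *a, int *b) { int t = *a; *a = *b; *b = t; }

static void sort3(int v[3])
{
	if (v[0] > v[1]) swap_int(&v[0], &v[1]);
	if (v[1] > v[2]) swap_int(&v[1], &v[2]);
	if (v[0] > v[1]) swap_int(&v[0], &v[1]);
}

int main(void)
{
	int v[3] = { 3, 1, 2 };

	sort3(v);
	printf("%d %d %d\n", v[0], v[1], v[2]); /* prints 1 2 3 */
	return 0;
}

Three fixed compare-and-swaps suffice for any permutation of three elements, which is why the kernel version can skip a general-purpose sort.
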
-
-void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter,
- struct btree_node_iter_set *set)
-{
- struct btree_node_iter_set *last =
- iter->data + ARRAY_SIZE(iter->data) - 1;
-
- memmove(&set[0], &set[1], (void *) last - (void *) set);
- *last = (struct btree_node_iter_set) { 0, 0 };
-}
-
-static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
- struct btree *b)
-{
- iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s;
-
- EBUG_ON(iter->data->k > iter->data->end);
-
- if (unlikely(__btree_node_iter_set_end(iter, 0))) {
- /* avoid an expensive memmove call: */
- iter->data[0] = iter->data[1];
- iter->data[1] = iter->data[2];
- iter->data[2] = (struct btree_node_iter_set) { 0, 0 };
- return;
- }
-
- if (__btree_node_iter_set_end(iter, 1))
- return;
-
- if (!btree_node_iter_sort_two(iter, b, 0))
- return;
-
- if (__btree_node_iter_set_end(iter, 2))
- return;
-
- btree_node_iter_sort_two(iter, b, 1);
-}
-
-void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
- struct btree *b)
-{
- if (bch2_expensive_debug_checks) {
- bch2_btree_node_iter_verify(iter, b);
- bch2_btree_node_iter_next_check(iter, b);
- }
-
- __bch2_btree_node_iter_advance(iter, b);
-}
-
-/*
- * Expensive:
- */
-struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
- struct btree *b)
-{
- struct bkey_packed *k, *prev = NULL;
- struct btree_node_iter_set *set;
- unsigned end = 0;
-
- if (bch2_expensive_debug_checks)
- bch2_btree_node_iter_verify(iter, b);
-
- for_each_bset(b, t) {
- k = bch2_bkey_prev_all(b, t,
- bch2_btree_node_iter_bset_pos(iter, b, t));
- if (k &&
- (!prev || bkey_iter_cmp(b, k, prev) > 0)) {
- prev = k;
- end = t->end_offset;
- }
- }
-
- if (!prev)
- return NULL;
-
- /*
- * We're manually memmoving instead of just calling sort() to ensure the
- * prev we picked ends up in slot 0 - sort won't necessarily put it
- * there because of duplicate deleted keys:
- */
- btree_node_iter_for_each(iter, set)
- if (set->end == end)
- goto found;
-
- BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]);
-found:
- BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data));
-
- memmove(&iter->data[1],
- &iter->data[0],
- (void *) set - (void *) &iter->data[0]);
-
- iter->data[0].k = __btree_node_key_to_offset(b, prev);
- iter->data[0].end = end;
-
- if (bch2_expensive_debug_checks)
- bch2_btree_node_iter_verify(iter, b);
- return prev;
-}
-
-struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter,
- struct btree *b)
-{
- struct bkey_packed *prev;
-
- do {
- prev = bch2_btree_node_iter_prev_all(iter, b);
- } while (prev && bkey_deleted(prev));
-
- return prev;
-}
-
-struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
- struct btree *b,
- struct bkey *u)
-{
- struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b);
-
- return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
-}
-
-/* Mergesort */
-
-void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats)
-{
- for_each_bset_c(b, t) {
- enum bset_aux_tree_type type = bset_aux_tree_type(t);
- size_t j;
-
- stats->sets[type].nr++;
- stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
- sizeof(u64);
-
- if (bset_has_ro_aux_tree(t)) {
- stats->floats += t->size - 1;
-
- for (j = 1; j < t->size; j++)
- stats->failed +=
- bkey_float(b, t, j)->exponent ==
- BFLOAT_FAILED;
- }
- }
-}
-
-void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
- struct bkey_packed *k)
-{
- struct bset_tree *t = bch2_bkey_to_bset(b, k);
- struct bkey uk;
- unsigned j, inorder;
-
- if (!bset_has_ro_aux_tree(t))
- return;
-
- inorder = bkey_to_cacheline(b, t, k);
- if (!inorder || inorder >= t->size)
- return;
-
- j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra);
- if (k != tree_to_bkey(b, t, j))
- return;
-
- switch (bkey_float(b, t, j)->exponent) {
- case BFLOAT_FAILED:
- uk = bkey_unpack_key(b, k);
- prt_printf(out,
- " failed unpacked at depth %u\n"
- "\t",
- ilog2(j));
- bch2_bpos_to_text(out, uk.p);
- prt_printf(out, "\n");
- break;
- }
-}
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
deleted file mode 100644
index 6953d55b72cc..000000000000
--- a/fs/bcachefs/bset.h
+++ /dev/null
@@ -1,544 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BSET_H
-#define _BCACHEFS_BSET_H
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-
-#include "bcachefs.h"
-#include "bkey.h"
-#include "bkey_methods.h"
-#include "btree_types.h"
-#include "util.h" /* for time_stats */
-#include "vstructs.h"
-
-/*
- * BKEYS:
- *
- * A bkey contains a key, a size field, a variable number of pointers, and some
- * ancillary flag bits.
- *
- * We use two different functions for validating bkeys, bkey_invalid() and
- * bkey_deleted().
- *
- * The one exception to the rule that ptr_invalid() filters out invalid keys is
- * that it also filters out keys of size 0 - these are keys that have been
- * completely overwritten. It'd be safe to delete these in memory while leaving
- * them on disk, just unnecessary work - so we filter them out when resorting
- * instead.
- *
- * We can't filter out stale keys when we're resorting, because garbage
- * collection needs to find them to ensure bucket gens don't wrap around -
- * unless we're rewriting the btree node those stale keys still exist on disk.
- *
- * We also implement functions here for removing some number of sectors from the
- * front or the back of a bkey - this is mainly used for fixing overlapping
- * extents, by removing the overlapping sectors from the older key.
- *
- * BSETS:
- *
- * A bset is an array of bkeys laid out contiguously in memory in sorted order,
- * along with a header. A btree node is made up of a number of these, written at
- * different times.
- *
- * There could be many of them on disk, but we never allow there to be more than
- * 4 in memory - we lazily resort as needed.
- *
- * We implement code here for creating and maintaining auxiliary search trees
- * (described below) for searching an individual bset, and on top of that we
- * implement a btree iterator.
- *
- * BTREE ITERATOR:
- *
- * Most of the code in bcache doesn't care about an individual bset - it needs
- * to search entire btree nodes and iterate over them in sorted order.
- *
- * The btree iterator code serves both functions; it iterates through the keys
- * in a btree node in sorted order, starting from either keys after a specific
- * point (if you pass it a search key) or the start of the btree node.
- *
- * AUXILIARY SEARCH TREES:
- *
- * Since keys are variable length, we can't use a binary search on a bset - we
- * wouldn't be able to find the start of the next key. But binary searches are
- * slow anyways, due to terrible cache behaviour; bcache originally used binary
- * searches and that code topped out at under 50k lookups/second.
- *
- * So we need to construct some sort of lookup table. Since we only insert keys
- * into the last (unwritten) set, most of the keys within a given btree node are
- * usually in sets that are mostly constant. We use two different types of
- * lookup tables to take advantage of this.
- *
- * Both lookup tables share in common that they don't index every key in the
- * set; they index one key every BSET_CACHELINE bytes, and then a linear search
- * is used for the rest.
- *
- * For sets that have been written to disk and are no longer being inserted
- * into, we construct a binary search tree in an array - traversing a binary
- * search tree in an array gives excellent locality of reference and is very
- * fast, since both children of any node are adjacent to each other in memory
- * (and their grandchildren, and great grandchildren...) - this means
- * prefetching can be used to great effect.
- *
- * It's quite useful performance-wise to keep these nodes small - not just
- * because they're more likely to be in L2, but also because we can prefetch
- * more nodes on a single cacheline and thus prefetch more iterations in advance
- * when traversing this tree.
- *
- * Nodes in the auxiliary search tree must contain both a key to compare against
- * (we don't want to fetch the key from the set, that would defeat the purpose),
- * and a pointer to the key. We use a few tricks to compress both of these.
- *
- * To compress the pointer, we take advantage of the fact that one node in the
- * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
- * a function (to_inorder()) that takes the index of a node in a binary tree and
- * returns what its index would be in an inorder traversal, so we only have to
- * store the low bits of the offset.
- *
- * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
- * compress that, we take advantage of the fact that when we're traversing the
- * search tree at every iteration we know that both our search key and the key
- * we're looking for lie within some range - bounded by our previous
- * comparisons. (We special case the start of a search so that this is true even
- * at the root of the tree).
- *
- * So we know the key we're looking for is between a and b, and a and b don't
- * differ higher than bit 50, we don't need to check anything higher than bit
- * 50.
- *
- * We don't usually need the rest of the bits, either; we only need enough bits
- * to partition the key range we're currently checking. Consider key n - the
- * key our auxiliary search tree node corresponds to, and key p, the key
- * immediately preceding n. The lowest bit we need to store in the auxiliary
- * search tree is the highest bit that differs between n and p.
- *
- * Note that this could be bit 0 - we might sometimes need all 80 bits to do the
- * comparison. But we'd really like our nodes in the auxiliary search tree to be
- * of fixed size.
- *
- * The solution is to make them fixed size, and when we're constructing a node
- * check if p and n differed in the bits we needed them to. If they don't we
- * flag that node, and when doing lookups we fallback to comparing against the
- * real key. As long as this doesn't happen to often (and it seems to reliably
- * happen a bit less than 1% of the time), we win - even on failures, that key
- * is then more likely to be in cache than if we were doing binary searches all
- * the way, since we're touching so much less memory.
- *
- * The keys in the auxiliary search tree are stored in (software) floating
- * point, with an exponent and a mantissa. The exponent needs to be big enough
- * to address all the bits in the original key, but the number of bits in the
- * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
- *
- * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
- * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
- * We need one node per BSET_CACHELINE bytes in the btree node (currently 256),
- * which means the auxiliary search trees take up about 1.5% as much memory as
- * the btree itself.
- *
- * Constructing these auxiliary search trees is moderately expensive, and we
- * don't want to be constantly rebuilding the search tree for the last set
- * whenever we insert another key into it. For the unwritten set, we use a much
- * simpler lookup table - it's just a flat array, so index i in the lookup table
- * corresponds to the i'th range of BSET_CACHELINE bytes in the set. Indexing
- * within each byte range works the same as with the auxiliary search trees.
- *
- * These are much easier to keep up to date when we insert a key - we do it
- * somewhat lazily; when we shift a key up we usually just increment the pointer
- * to it, only when it would overflow do we go to the trouble of finding the
- * first key in that range of bytes again.
- */
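
A sketch of the fixed-size node this comment describes: exponent, key offset, and mantissa packed into four bytes, with out-of-range exponent values reserved to flag "failed" nodes. The field widths follow the numbers given above (7 + 3 + 22 bits); this is an editor's illustration, not the actual struct bkey_float definition:

#include <stdint.h>
#include <stdio.h>

struct float_node {
	uint32_t exponent:7;	/* bit position the mantissa starts at */
	uint32_t key_offset:3;	/* low bits of the key's byte offset */
	uint32_t mantissa:22;	/* truncated key bits used for compares */
};

int main(void)
{
	struct float_node n = { .exponent = 42, .mantissa = 0x3ffff };

	printf("node size: %zu bytes, mantissa %#x\n",
	       sizeof(n), n.mantissa); /* 4 bytes */
	return 0;
}
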
-
-enum bset_aux_tree_type {
- BSET_NO_AUX_TREE,
- BSET_RO_AUX_TREE,
- BSET_RW_AUX_TREE,
-};
-
-#define BSET_TREE_NR_TYPES 3
-
-#define BSET_NO_AUX_TREE_VAL (U16_MAX)
-#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1)
-
-static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t)
-{
- switch (t->extra) {
- case BSET_NO_AUX_TREE_VAL:
- EBUG_ON(t->size);
- return BSET_NO_AUX_TREE;
- case BSET_RW_AUX_TREE_VAL:
- EBUG_ON(!t->size);
- return BSET_RW_AUX_TREE;
- default:
- EBUG_ON(!t->size);
- return BSET_RO_AUX_TREE;
- }
-}
-
-/*
- * BSET_CACHELINE was originally intended to match the hardware cacheline size -
- * it used to be 64, but the lookup code touches slightly less memory with a
- * larger value; it's currently 256.
- *
- * It defines the number of bytes (in struct bset) per struct bkey_float in
- * the auxiliary search tree - when we're done searching the bset_float tree we
- * have this many bytes left that we do a linear search over.
- *
- * Since (after level 5) every level of the bset_tree is on a new cacheline,
- * we're touching one fewer cacheline in the bset tree in exchange for one more
- * cacheline in the linear search - but the linear search might stop before it
- * gets to the second cacheline.
- */
-
-#define BSET_CACHELINE 256
-
-static inline size_t btree_keys_cachelines(const struct btree *b)
-{
- return (1U << b->byte_order) / BSET_CACHELINE;
-}
-
-static inline size_t btree_aux_data_bytes(const struct btree *b)
-{
- return btree_keys_cachelines(b) * 8;
-}
-
-static inline size_t btree_aux_data_u64s(const struct btree *b)
-{
- return btree_aux_data_bytes(b) / sizeof(u64);
-}
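
A quick arithmetic check of the two helpers above, under the assumption of a 256KiB btree node (byte_order = 18) and BSET_CACHELINE = 256 as defined earlier:

#include <stdio.h>

int main(void)
{
	unsigned byte_order = 18;			/* 1 << 18 = 256KiB */
	unsigned cachelines = (1U << byte_order) / 256;	/* BSET_CACHELINE */
	unsigned aux_bytes  = cachelines * 8;

	/* prints: 1024 cachelines, 8192 aux bytes, 1024 aux u64s */
	printf("%u cachelines, %u aux bytes, %u aux u64s\n",
	       cachelines, aux_bytes, aux_bytes / 8);
	return 0;
}
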
-
-#define for_each_bset(_b, _t) \
- for (struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
-
-#define for_each_bset_c(_b, _t) \
- for (const struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
-
-#define bset_tree_for_each_key(_b, _t, _k) \
- for (_k = btree_bkey_first(_b, _t); \
- _k != btree_bkey_last(_b, _t); \
- _k = bkey_p_next(_k))
-
-static inline bool bset_has_ro_aux_tree(const struct bset_tree *t)
-{
- return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;
-}
-
-static inline bool bset_has_rw_aux_tree(struct bset_tree *t)
-{
- return bset_aux_tree_type(t) == BSET_RW_AUX_TREE;
-}
-
-static inline void bch2_bset_set_no_aux_tree(struct btree *b,
- struct bset_tree *t)
-{
- BUG_ON(t < b->set);
-
- for (; t < b->set + ARRAY_SIZE(b->set); t++) {
- t->size = 0;
- t->extra = BSET_NO_AUX_TREE_VAL;
- t->aux_data_offset = U16_MAX;
- }
-}
-
-static inline void btree_node_set_format(struct btree *b,
- struct bkey_format f)
-{
- int len;
-
- b->format = f;
- b->nr_key_bits = bkey_format_key_bits(&f);
-
- len = bch2_compile_bkey_format(&b->format, b->aux_data);
- BUG_ON(len < 0 || len > U8_MAX);
-
- b->unpack_fn_len = len;
-
- bch2_bset_set_no_aux_tree(b, b->set);
-}
-
-static inline struct bset *bset_next_set(struct btree *b,
- unsigned block_bytes)
-{
- struct bset *i = btree_bset_last(b);
-
- EBUG_ON(!is_power_of_2(block_bytes));
-
- return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
-}
-
-void bch2_btree_keys_init(struct btree *);
-
-void bch2_bset_init_first(struct btree *, struct bset *);
-void bch2_bset_init_next(struct btree *, struct btree_node_entry *);
-void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
-
-void bch2_bset_insert(struct btree *, struct bkey_packed *, struct bkey_i *,
- unsigned);
-void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
-
-/* Bkey utility code */
-
-/* packed or unpacked */
-static inline int bkey_cmp_p_or_unp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r_packed,
- const struct bpos *r)
-{
- EBUG_ON(r_packed && !bkey_packed(r_packed));
-
- if (unlikely(!bkey_packed(l)))
- return bpos_cmp(packed_to_bkey_c(l)->p, *r);
-
- if (likely(r_packed))
- return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b);
-
- return __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
-}
-
-static inline struct bset_tree *
-bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k)
-{
- unsigned offset = __btree_node_key_to_offset(b, k);
-
- for_each_bset(b, t)
- if (offset <= t->end_offset) {
- EBUG_ON(offset < btree_bkey_first_offset(t));
- return t;
- }
-
- BUG();
-}
-
-struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *);
-
-struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *,
- struct bkey_packed *, unsigned);
-
-static inline struct bkey_packed *
-bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
-{
- return bch2_bkey_prev_filter(b, t, k, 0);
-}
-
-static inline struct bkey_packed *
-bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
-{
- return bch2_bkey_prev_filter(b, t, k, 1);
-}
-
-/* Btree key iteration */
-
-void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *,
- const struct bkey_packed *,
- const struct bkey_packed *);
-void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *,
- struct bpos *);
-void bch2_btree_node_iter_init_from_start(struct btree_node_iter *,
- struct btree *);
-struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *,
- struct btree *,
- struct bset_tree *);
-
-void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *);
-void bch2_btree_node_iter_set_drop(struct btree_node_iter *,
- struct btree_node_iter_set *);
-void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *);
-
-#define btree_node_iter_for_each(_iter, _set) \
- for (_set = (_iter)->data; \
- _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \
- (_set)->k != (_set)->end; \
- _set++)
-
-static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter,
- unsigned i)
-{
- return iter->data[i].k == iter->data[i].end;
-}
-
-static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter)
-{
- return __btree_node_iter_set_end(iter, 0);
-}
-
-/*
- * When keys compare equal, deleted keys compare first:
- *
- * XXX: only need to compare pointers for keys that are both within a
- * btree_node_iterator - we need to break ties for prev() to work correctly
- */
-static inline int bkey_iter_cmp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r)
-{
- return bch2_bkey_cmp_packed(b, l, r)
- ?: (int) bkey_deleted(r) - (int) bkey_deleted(l)
- ?: cmp_int(l, r);
-}
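
The a ?: b chains above are the GCC "elvis" extension used as a tie-break ladder: each later comparison applies only when the earlier ones return zero. A portable sketch of the same three-level ordering, with a simplified item type (illustrative only):

#include <stdint.h>
#include <stdio.h>

struct item { int key; int deleted; const void *addr; };

static int cmp_vals(intptr_t l, intptr_t r) { return (l > r) - (l < r); }

static int item_cmp(const struct item *l, const struct item *r)
{
	int c = cmp_vals(l->key, r->key);

	if (!c)	/* equal keys: deleted items sort first */
		c = r->deleted - l->deleted;
	if (!c)	/* still equal: break ties by address for a total order */
		c = cmp_vals((intptr_t) l->addr, (intptr_t) r->addr);
	return c;
}

int main(void)
{
	struct item a = { 5, 1, &a }, b = { 5, 0, &b };

	printf("%d\n", item_cmp(&a, &b)); /* -1: the deleted key comes first */
	return 0;
}
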
-
-static inline int btree_node_iter_cmp(const struct btree *b,
- struct btree_node_iter_set l,
- struct btree_node_iter_set r)
-{
- return bkey_iter_cmp(b,
- __btree_node_offset_to_key(b, l.k),
- __btree_node_offset_to_key(b, r.k));
-}
-
-/* These assume r (the search key) is not a deleted key: */
-static inline int bkey_iter_pos_cmp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bpos *r)
-{
- return bkey_cmp_left_packed(b, l, r)
- ?: -((int) bkey_deleted(l));
-}
-
-static inline int bkey_iter_cmp_p_or_unp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r_packed,
- const struct bpos *r)
-{
- return bkey_cmp_p_or_unp(b, l, r_packed, r)
- ?: -((int) bkey_deleted(l));
-}
-
-static inline struct bkey_packed *
-__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
- struct btree *b)
-{
- return __btree_node_offset_to_key(b, iter->data->k);
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b)
-{
- return !bch2_btree_node_iter_end(iter)
- ? __btree_node_offset_to_key(b, iter->data->k)
- : NULL;
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b)
-{
- struct bkey_packed *k;
-
- while ((k = bch2_btree_node_iter_peek_all(iter, b)) &&
- bkey_deleted(k))
- bch2_btree_node_iter_advance(iter, b);
-
- return k;
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b)
-{
- struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b);
-
- if (ret)
- bch2_btree_node_iter_advance(iter, b);
-
- return ret;
-}
-
-struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *,
- struct btree *);
-struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *,
- struct btree *);
-
-struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
- struct btree *,
- struct bkey *);
-
-#define for_each_btree_node_key(b, k, iter) \
- for (bch2_btree_node_iter_init_from_start((iter), (b)); \
- (k = bch2_btree_node_iter_peek((iter), (b))); \
- bch2_btree_node_iter_advance(iter, b))
-
-#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \
- for (bch2_btree_node_iter_init_from_start((iter), (b)); \
- (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
- bch2_btree_node_iter_advance(iter, b))
-
-/* Accounting: */
-
-struct btree_nr_keys bch2_btree_node_count_keys(struct btree *);
-
-static inline void btree_keys_account_key(struct btree_nr_keys *n,
- unsigned bset,
- struct bkey_packed *k,
- int sign)
-{
- n->live_u64s += k->u64s * sign;
- n->bset_u64s[bset] += k->u64s * sign;
-
- if (bkey_packed(k))
- n->packed_keys += sign;
- else
- n->unpacked_keys += sign;
-}
-
-static inline void btree_keys_account_val_delta(struct btree *b,
- struct bkey_packed *k,
- int delta)
-{
- struct bset_tree *t = bch2_bkey_to_bset(b, k);
-
- b->nr.live_u64s += delta;
- b->nr.bset_u64s[t - b->set] += delta;
-}
-
-#define btree_keys_account_key_add(_nr, _bset_idx, _k) \
- btree_keys_account_key(_nr, _bset_idx, _k, 1)
-#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \
- btree_keys_account_key(_nr, _bset_idx, _k, -1)
-
-#define btree_account_key_add(_b, _k) \
- btree_keys_account_key(&(_b)->nr, \
- bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1)
-#define btree_account_key_drop(_b, _k) \
- btree_keys_account_key(&(_b)->nr, \
- bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1)
-
-struct bset_stats {
- struct {
- size_t nr, bytes;
- } sets[BSET_TREE_NR_TYPES];
-
- size_t floats;
- size_t failed;
-};
-
-void bch2_btree_keys_stats(const struct btree *, struct bset_stats *);
-void bch2_bfloat_to_text(struct printbuf *, struct btree *,
- struct bkey_packed *);
-
-/* Debug stuff */
-
-void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned);
-void bch2_dump_btree_node(struct bch_fs *, struct btree *);
-void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-void __bch2_verify_btree_nr_keys(struct btree *);
-void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
-void bch2_verify_insert_pos(struct btree *, struct bkey_packed *,
- struct bkey_packed *, unsigned);
-
-#else
-
-static inline void __bch2_verify_btree_nr_keys(struct btree *b) {}
-static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
- struct btree *b) {}
-static inline void bch2_verify_insert_pos(struct btree *b,
- struct bkey_packed *where,
- struct bkey_packed *insert,
- unsigned clobber_u64s) {}
-#endif
-
-static inline void bch2_verify_btree_nr_keys(struct btree *b)
-{
- if (bch2_debug_check_btree_accounting)
- __bch2_verify_btree_nr_keys(b);
-}
-
-#endif /* _BCACHEFS_BSET_H */
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
deleted file mode 100644
index 54666027aa85..000000000000
--- a/fs/bcachefs/btree_cache.c
+++ /dev/null
@@ -1,1515 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bbpos.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "debug.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal.h"
-#include "trace.h"
-
-#include <linux/prefetch.h>
-#include <linux/sched/mm.h>
-#include <linux/swap.h>
-
-#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
-do { \
- if (shrinker_counter) \
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \
-} while (0)
-
-const char * const bch2_btree_node_flags[] = {
- "typebit",
- "typebit",
- "typebit",
-#define x(f) [BTREE_NODE_##f] = #f,
- BTREE_FLAGS()
-#undef x
- NULL
-};
-
-void bch2_recalc_btree_reserve(struct bch_fs *c)
-{
- unsigned reserve = 16;
-
- if (!c->btree_roots_known[0].b)
- reserve += 8;
-
- for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
- struct btree_root *r = bch2_btree_id_root(c, i);
-
- if (r->b)
- reserve += min_t(unsigned, 1, r->b->c.level) * 8;
- }
-
- c->btree_cache.nr_reserve = reserve;
-}
-
-static inline size_t btree_cache_can_free(struct btree_cache_list *list)
-{
- struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
-
- size_t can_free = list->nr;
- if (!list->idx)
- can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
- return can_free;
-}
-
-static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
-{
- BUG_ON(!list_empty(&b->list));
-
- if (b->c.lock.readers)
- list_add(&b->list, &bc->freed_pcpu);
- else
- list_add(&b->list, &bc->freed_nonpcpu);
-}
-
-static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b)
-{
- BUG_ON(!list_empty(&b->list));
- BUG_ON(!b->data);
-
- bc->nr_freeable++;
- list_add(&b->list, &bc->freeable);
-}
-
-void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
-{
- struct btree_cache *bc = &c->btree_cache;
-
- mutex_lock(&bc->lock);
- __bch2_btree_node_to_freelist(bc, b);
- mutex_unlock(&bc->lock);
-
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
-}
-
-static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
-{
- BUG_ON(!list_empty(&b->list));
- BUG_ON(btree_node_hashed(b));
-
- /*
- * This should really be done in slub/vmalloc, but we're using the
- * kmalloc_large() path, so we're working around a slub bug by doing
- * this here:
- */
- if (b->data)
- mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
- if (b->aux_data)
- mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);
-
- EBUG_ON(btree_node_write_in_flight(b));
-
- clear_btree_node_just_written(b);
-
- kvfree(b->data);
- b->data = NULL;
-#ifdef __KERNEL__
- kvfree(b->aux_data);
-#else
- munmap(b->aux_data, btree_aux_data_bytes(b));
-#endif
- b->aux_data = NULL;
-
- btree_node_to_freedlist(bc, b);
-}
-
-static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
-{
- BUG_ON(list_empty(&b->list));
- list_del_init(&b->list);
- --bc->nr_freeable;
- __btree_node_data_free(bc, b);
-}
-
-static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
- const void *obj)
-{
- const struct btree *b = obj;
- const u64 *v = arg->key;
-
- return b->hash_val == *v ? 0 : 1;
-}
-
-static const struct rhashtable_params bch_btree_cache_params = {
- .head_offset = offsetof(struct btree, hash),
- .key_offset = offsetof(struct btree, hash_val),
- .key_len = sizeof(u64),
- .obj_cmpfn = bch2_btree_cache_cmp_fn,
- .automatic_shrinking = true,
-};
-
-static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
-{
- BUG_ON(b->data || b->aux_data);
-
- gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
-
- b->data = kvmalloc(btree_buf_bytes(b), gfp);
- if (!b->data)
- return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
-#ifdef __KERNEL__
- b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
-#else
- b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
- PROT_READ|PROT_WRITE|PROT_EXEC,
- MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
- if (b->aux_data == MAP_FAILED)
- b->aux_data = NULL;
-#endif
- if (!b->aux_data) {
- kvfree(b->data);
- b->data = NULL;
- return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
- }
-
- return 0;
-}
-
-static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
-{
- struct btree *b;
-
- b = kzalloc(sizeof(struct btree), gfp);
- if (!b)
- return NULL;
-
- bkey_btree_ptr_init(&b->key);
- INIT_LIST_HEAD(&b->list);
- INIT_LIST_HEAD(&b->write_blocked);
- b->byte_order = ilog2(c->opts.btree_node_size);
- return b;
-}
-
-struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
-{
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
-
- b = __btree_node_mem_alloc(c, GFP_KERNEL);
- if (!b)
- return NULL;
-
- if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
- kfree(b);
- return NULL;
- }
-
- bch2_btree_lock_init(&b->c, 0, GFP_KERNEL);
-
- __bch2_btree_node_to_freelist(bc, b);
- return b;
-}
-
-static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
-{
- struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
-
- u64 mask = bc->pinned_nodes_mask[!!b->c.level];
-
- return ((mask & BIT_ULL(b->c.btree_id)) &&
- bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
- bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
-}
-
-void bch2_node_pin(struct bch_fs *c, struct btree *b)
-{
- struct btree_cache *bc = &c->btree_cache;
-
- mutex_lock(&bc->lock);
- if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
- set_btree_node_pinned(b);
- list_move(&b->list, &bc->live[1].list);
- bc->live[0].nr--;
- bc->live[1].nr++;
- }
- mutex_unlock(&bc->lock);
-}
-
-void bch2_btree_cache_unpin(struct bch_fs *c)
-{
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b, *n;
-
- mutex_lock(&bc->lock);
- c->btree_cache.pinned_nodes_mask[0] = 0;
- c->btree_cache.pinned_nodes_mask[1] = 0;
-
- list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
- clear_btree_node_pinned(b);
- list_move(&b->list, &bc->live[0].list);
- bc->live[0].nr++;
- bc->live[1].nr--;
- }
-
- mutex_unlock(&bc->lock);
-}
-
-/* Btree in memory cache - hash table */
-
-void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
-{
- lockdep_assert_held(&bc->lock);
-
- int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
- BUG_ON(ret);
-
- /* Cause future lookups for this node to fail: */
- b->hash_val = 0;
-
- if (b->c.btree_id < BTREE_ID_NR)
- --bc->nr_by_btree[b->c.btree_id];
- --bc->live[btree_node_pinned(b)].nr;
- list_del_init(&b->list);
-}
-
-void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
-{
- __bch2_btree_node_hash_remove(bc, b);
- __bch2_btree_node_to_freelist(bc, b);
-}
-
-int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
-{
- BUG_ON(!list_empty(&b->list));
- BUG_ON(b->hash_val);
-
- b->hash_val = btree_ptr_hash_val(&b->key);
- int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
- bch_btree_cache_params);
- if (ret)
- return ret;
-
- if (b->c.btree_id < BTREE_ID_NR)
- bc->nr_by_btree[b->c.btree_id]++;
-
- bool p = __btree_node_pinned(bc, b);
- mod_bit(BTREE_NODE_pinned, &b->flags, p);
-
- list_add_tail(&b->list, &bc->live[p].list);
- bc->live[p].nr++;
- return 0;
-}
-
-int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
- unsigned level, enum btree_id id)
-{
- b->c.level = level;
- b->c.btree_id = id;
-
- mutex_lock(&bc->lock);
- int ret = __bch2_btree_node_hash_insert(bc, b);
- mutex_unlock(&bc->lock);
-
- return ret;
-}
-
-void bch2_btree_node_update_key_early(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_i *new)
-{
- struct bch_fs *c = trans->c;
- struct btree *b;
- struct bkey_buf tmp;
- int ret;
-
- bch2_bkey_buf_init(&tmp);
- bch2_bkey_buf_reassemble(&tmp, c, old);
-
- b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
- if (!IS_ERR_OR_NULL(b)) {
- mutex_lock(&c->btree_cache.lock);
-
- __bch2_btree_node_hash_remove(&c->btree_cache, b);
-
- bkey_copy(&b->key, new);
- ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
- BUG_ON(ret);
-
- mutex_unlock(&c->btree_cache.lock);
- six_unlock_read(&b->c.lock);
- }
-
- bch2_bkey_buf_exit(&tmp, c);
-}
-
-__flatten
-static inline struct btree *btree_cache_find(struct btree_cache *bc,
- const struct bkey_i *k)
-{
- u64 v = btree_ptr_hash_val(k);
-
- return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
-}
-
-/*
- * this version is for btree nodes that have already been freed (we're not
- * reaping a real btree node)
- */
-static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
-{
- struct btree_cache *bc = &c->btree_cache;
- int ret = 0;
-
- lockdep_assert_held(&bc->lock);
-wait_on_io:
- if (b->flags & ((1U << BTREE_NODE_dirty)|
- (1U << BTREE_NODE_read_in_flight)|
- (1U << BTREE_NODE_write_in_flight))) {
- if (!flush) {
- if (btree_node_dirty(b))
- BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
- else if (btree_node_read_in_flight(b))
- BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
- else if (btree_node_write_in_flight(b))
- BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
- return -BCH_ERR_ENOMEM_btree_node_reclaim;
- }
-
- /* XXX: waiting on IO with btree cache lock held */
- bch2_btree_node_wait_on_read(b);
- bch2_btree_node_wait_on_write(b);
- }
-
- if (!six_trylock_intent(&b->c.lock)) {
- BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
- return -BCH_ERR_ENOMEM_btree_node_reclaim;
- }
-
- if (!six_trylock_write(&b->c.lock)) {
- BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
- goto out_unlock_intent;
- }
-
- /* recheck under lock */
- if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
- (1U << BTREE_NODE_write_in_flight))) {
- if (!flush) {
- if (btree_node_read_in_flight(b))
- BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
- else if (btree_node_write_in_flight(b))
- BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
- goto out_unlock;
- }
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- goto wait_on_io;
- }
-
- if (btree_node_noevict(b)) {
- BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
- goto out_unlock;
- }
- if (btree_node_write_blocked(b)) {
- BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
- goto out_unlock;
- }
- if (btree_node_will_make_reachable(b)) {
- BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
- goto out_unlock;
- }
-
- if (btree_node_dirty(b)) {
- if (!flush) {
- BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
- goto out_unlock;
- }
- /*
- * Using the underscore version because we don't want to compact
- * bsets after the write, since this node is about to be evicted
- * - unless btree verify mode is enabled, since it runs out of
- * the post write cleanup:
- */
- if (bch2_verify_btree_ondisk)
- bch2_btree_node_write(c, b, SIX_LOCK_intent,
- BTREE_WRITE_cache_reclaim);
- else
- __bch2_btree_node_write(c, b,
- BTREE_WRITE_cache_reclaim);
-
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- goto wait_on_io;
- }
-out:
- if (b->hash_val && !ret)
- trace_and_count(c, btree_cache_reap, c, b);
- return ret;
-out_unlock:
- six_unlock_write(&b->c.lock);
-out_unlock_intent:
- six_unlock_intent(&b->c.lock);
- ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
- goto out;
-}
-
-static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
-{
- return __btree_node_reclaim(c, b, false, shrinker_counter);
-}
-
-static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
-{
- return __btree_node_reclaim(c, b, true, false);
-}
-
-static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- struct btree_cache_list *list = shrink->private_data;
- struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
- struct btree *b, *t;
- unsigned long nr = sc->nr_to_scan;
- unsigned long can_free = 0;
- unsigned long freed = 0;
- unsigned long touched = 0;
- unsigned i, flags;
- unsigned long ret = SHRINK_STOP;
- bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
-
- if (bch2_btree_shrinker_disabled)
- return SHRINK_STOP;
-
- mutex_lock(&bc->lock);
- flags = memalloc_nofs_save();
-
- /*
- * It's _really_ critical that we don't free too many btree nodes - we
- * have to always leave ourselves a reserve. The reserve is how we
- * guarantee that allocating memory for a new btree node can always
- * succeed, so that inserting keys into the btree can always succeed and
- * IO can always make forward progress:
- */
- can_free = btree_cache_can_free(list);
- nr = min_t(unsigned long, nr, can_free);
-
- i = 0;
- list_for_each_entry_safe(b, t, &bc->freeable, list) {
- /*
- * Leave a few nodes on the freeable list, so that a btree split
- * won't have to hit the system allocator:
- */
- if (++i <= 3)
- continue;
-
- touched++;
-
- if (touched >= nr)
- goto out;
-
- if (!btree_node_reclaim(c, b, true)) {
- btree_node_data_free(bc, b);
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- freed++;
- bc->nr_freed++;
- }
- }
-restart:
- list_for_each_entry_safe(b, t, &list->list, list) {
- touched++;
-
- if (btree_node_accessed(b)) {
- clear_btree_node_accessed(b);
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
-			--touched;
- } else if (!btree_node_reclaim(c, b, true)) {
- __bch2_btree_node_hash_remove(bc, b);
- __btree_node_data_free(bc, b);
-
- freed++;
- bc->nr_freed++;
-
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
-
- if (freed == nr)
- goto out_rotate;
- } else if (trigger_writes &&
- btree_node_dirty(b) &&
- !btree_node_will_make_reachable(b) &&
- !btree_node_write_blocked(b) &&
- six_trylock_read(&b->c.lock)) {
- list_move(&list->list, &b->list);
- mutex_unlock(&bc->lock);
- __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
- six_unlock_read(&b->c.lock);
- if (touched >= nr)
- goto out_nounlock;
- mutex_lock(&bc->lock);
- goto restart;
- }
-
- if (touched >= nr)
- break;
- }
-out_rotate:
- if (&t->list != &list->list)
- list_move_tail(&list->list, &t->list);
-out:
- mutex_unlock(&bc->lock);
-out_nounlock:
- ret = freed;
- memalloc_nofs_restore(flags);
- trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
- return ret;
-}
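
A sketch of the reserve rule from the comment inside the function above: a shrinker may only free nodes beyond the reserve, so the requested scan count is clamped first (numbers illustrative):

#include <stdio.h>

int main(void)
{
	unsigned long nr_cached = 20, reserve = 16, nr_to_scan = 10;
	long can_free = (long) nr_cached - (long) reserve;

	if (can_free < 0)
		can_free = 0;
	if (nr_to_scan > (unsigned long) can_free)
		nr_to_scan = can_free;

	printf("will scan at most %lu nodes\n", nr_to_scan); /* prints 4 */
	return 0;
}
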
-
-static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- struct btree_cache_list *list = shrink->private_data;
-
- if (bch2_btree_shrinker_disabled)
- return 0;
-
- return btree_cache_can_free(list);
-}
-
-void bch2_fs_btree_cache_exit(struct bch_fs *c)
-{
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b, *t;
- unsigned long flags;
-
- shrinker_free(bc->live[1].shrink);
- shrinker_free(bc->live[0].shrink);
-
- /* vfree() can allocate memory: */
- flags = memalloc_nofs_save();
- mutex_lock(&bc->lock);
-
- if (c->verify_data)
- list_move(&c->verify_data->list, &bc->live[0].list);
-
- kvfree(c->verify_ondisk);
-
- for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
- struct btree_root *r = bch2_btree_id_root(c, i);
-
- if (r->b)
- list_add(&r->b->list, &bc->live[0].list);
- }
-
- list_for_each_entry_safe(b, t, &bc->live[1].list, list)
- bch2_btree_node_hash_remove(bc, b);
- list_for_each_entry_safe(b, t, &bc->live[0].list, list)
- bch2_btree_node_hash_remove(bc, b);
-
- list_for_each_entry_safe(b, t, &bc->freeable, list) {
- BUG_ON(btree_node_read_in_flight(b) ||
- btree_node_write_in_flight(b));
-
- btree_node_data_free(bc, b);
- cond_resched();
- }
-
- BUG_ON(!bch2_journal_error(&c->journal) &&
- atomic_long_read(&c->btree_cache.nr_dirty));
-
- list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
-
- list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
- list_del(&b->list);
- six_lock_exit(&b->c.lock);
- kfree(b);
- }
-
- mutex_unlock(&bc->lock);
- memalloc_nofs_restore(flags);
-
- for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
- BUG_ON(bc->nr_by_btree[i]);
- BUG_ON(bc->live[0].nr);
- BUG_ON(bc->live[1].nr);
- BUG_ON(bc->nr_freeable);
-
- if (bc->table_init_done)
- rhashtable_destroy(&bc->table);
-}
-
-int bch2_fs_btree_cache_init(struct bch_fs *c)
-{
- struct btree_cache *bc = &c->btree_cache;
- struct shrinker *shrink;
- unsigned i;
- int ret = 0;
-
- ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
- if (ret)
- goto err;
-
- bc->table_init_done = true;
-
- bch2_recalc_btree_reserve(c);
-
- for (i = 0; i < bc->nr_reserve; i++)
- if (!__bch2_btree_node_mem_alloc(c))
- goto err;
-
- list_splice_init(&bc->live[0].list, &bc->freeable);
-
- mutex_init(&c->verify_lock);
-
- shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
- if (!shrink)
- goto err;
- bc->live[0].shrink = shrink;
- shrink->count_objects = bch2_btree_cache_count;
- shrink->scan_objects = bch2_btree_cache_scan;
- shrink->seeks = 2;
- shrink->private_data = &bc->live[0];
- shrinker_register(shrink);
-
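-	/*
-	 * The pinned list gets a higher seeks value: the shrinker treats its
-	 * objects as more expensive to recreate and applies less pressure:
-	 */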
- shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
- if (!shrink)
- goto err;
- bc->live[1].shrink = shrink;
- shrink->count_objects = bch2_btree_cache_count;
- shrink->scan_objects = bch2_btree_cache_scan;
- shrink->seeks = 8;
- shrink->private_data = &bc->live[1];
- shrinker_register(shrink);
-
- return 0;
-err:
- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
-}
-
-void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
-{
- mutex_init(&bc->lock);
- for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
- bc->live[i].idx = i;
- INIT_LIST_HEAD(&bc->live[i].list);
- }
- INIT_LIST_HEAD(&bc->freeable);
- INIT_LIST_HEAD(&bc->freed_pcpu);
- INIT_LIST_HEAD(&bc->freed_nonpcpu);
-}
-
-/*
- * We can only have one thread cannibalizing other cached btree nodes at a time,
- * or we'll deadlock. We use an open coded mutex to ensure that, which
- * bch2_btree_cache_cannibalize_lock() takes. This means every time we unlock
- * the root of the btree, we need to release this lock if we have it held.
- */
-void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
-
- if (bc->alloc_lock == current) {
- trace_and_count(c, btree_cache_cannibalize_unlock, trans);
- bc->alloc_lock = NULL;
- closure_wake_up(&bc->alloc_wait);
- }
-}
-
-int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
- struct task_struct *old;
-
- old = NULL;
- if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current)
- goto success;
-
- if (!cl) {
- trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
- return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
- }
-
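-	/*
-	 * Join the waitlist before retrying, so an unlock between our first
-	 * attempt and here can't result in a lost wakeup:
-	 */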
- closure_wait(&bc->alloc_wait, cl);
-
- /* Try again, after adding ourselves to waitlist */
- old = NULL;
- if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) {
- /* We raced */
- closure_wake_up(&bc->alloc_wait);
- goto success;
- }
-
- trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
- return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
-
-success:
- trace_and_count(c, btree_cache_cannibalize_lock, trans);
- return 0;
-}
-
-static struct btree *btree_node_cannibalize(struct bch_fs *c)
-{
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
-
- for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
- list_for_each_entry_reverse(b, &bc->live[i].list, list)
- if (!btree_node_reclaim(c, b, false))
- return b;
-
- while (1) {
- for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
- list_for_each_entry_reverse(b, &bc->live[i].list, list)
- if (!btree_node_write_and_reclaim(c, b))
- return b;
-
- /*
- * Rare case: all nodes were intent-locked.
- * Just busy-wait.
- */
- WARN_ONCE(1, "btree cache cannibalize failed\n");
- cond_resched();
- }
-}
-
-struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
- struct list_head *freed = pcpu_read_locks
- ? &bc->freed_pcpu
- : &bc->freed_nonpcpu;
- struct btree *b, *b2;
- u64 start_time = local_clock();
-
- mutex_lock(&bc->lock);
-
- /*
- * We never free struct btree itself, just the memory that holds the on
- * disk node. Check the freed list before allocating a new one:
- */
- list_for_each_entry(b, freed, list)
- if (!btree_node_reclaim(c, b, false)) {
- list_del_init(&b->list);
- goto got_node;
- }
-
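-	/*
-	 * Try a non-blocking allocation first so we can keep bc->lock and our
-	 * btree locks held; only fall back to GFP_KERNEL with them dropped:
-	 */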
- b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
- if (b) {
- bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT);
- } else {
- mutex_unlock(&bc->lock);
- bch2_trans_unlock(trans);
- b = __btree_node_mem_alloc(c, GFP_KERNEL);
- if (!b)
- goto err;
- bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
- mutex_lock(&bc->lock);
- }
-
- BUG_ON(!six_trylock_intent(&b->c.lock));
- BUG_ON(!six_trylock_write(&b->c.lock));
-
-got_node:
- /*
- * btree_free() doesn't free memory; it sticks the node on the end of
- * the list. Check if there's any freed nodes there:
- */
- list_for_each_entry(b2, &bc->freeable, list)
- if (!btree_node_reclaim(c, b2, false)) {
- swap(b->data, b2->data);
- swap(b->aux_data, b2->aux_data);
-
- list_del_init(&b2->list);
- --bc->nr_freeable;
- btree_node_to_freedlist(bc, b2);
- mutex_unlock(&bc->lock);
-
- six_unlock_write(&b2->c.lock);
- six_unlock_intent(&b2->c.lock);
- goto got_mem;
- }
-
- mutex_unlock(&bc->lock);
-
- if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
- bch2_trans_unlock(trans);
- if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
- goto err;
- }
-
-got_mem:
- BUG_ON(!list_empty(&b->list));
- BUG_ON(btree_node_hashed(b));
- BUG_ON(btree_node_dirty(b));
- BUG_ON(btree_node_write_in_flight(b));
-out:
- b->flags = 0;
- b->written = 0;
- b->nsets = 0;
- b->sib_u64s[0] = 0;
- b->sib_u64s[1] = 0;
- b->whiteout_u64s = 0;
- bch2_btree_keys_init(b);
- set_btree_node_accessed(b);
-
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
- start_time);
-
- int ret = bch2_trans_relock(trans);
- if (unlikely(ret)) {
- bch2_btree_node_to_freelist(c, b);
- return ERR_PTR(ret);
- }
-
- return b;
-err:
- mutex_lock(&bc->lock);
-
- /* Try to cannibalize another cached btree node: */
- if (bc->alloc_lock == current) {
- b2 = btree_node_cannibalize(c);
- clear_btree_node_just_written(b2);
- __bch2_btree_node_hash_remove(bc, b2);
-
- if (b) {
- swap(b->data, b2->data);
- swap(b->aux_data, b2->aux_data);
- btree_node_to_freedlist(bc, b2);
- six_unlock_write(&b2->c.lock);
- six_unlock_intent(&b2->c.lock);
- } else {
- b = b2;
- }
-
- BUG_ON(!list_empty(&b->list));
- mutex_unlock(&bc->lock);
-
- trace_and_count(c, btree_cache_cannibalize, trans);
- goto out;
- }
-
- mutex_unlock(&bc->lock);
- return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
-}
-
-/* Slowpath, don't want it inlined into btree_iter_traverse() */
-static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
- struct btree_path *path,
- const struct bkey_i *k,
- enum btree_id btree_id,
- unsigned level,
- enum six_lock_type lock_type,
- bool sync)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
-
- if (unlikely(level >= BTREE_MAX_DEPTH)) {
- int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
- level, BTREE_MAX_DEPTH);
- return ERR_PTR(ret);
- }
-
- if (unlikely(!bkey_is_btree_ptr(&k->k))) {
- struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
-
- int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
- printbuf_exit(&buf);
- return ERR_PTR(ret);
- }
-
- if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
- struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
-
- int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
- printbuf_exit(&buf);
- return ERR_PTR(ret);
- }
-
- /*
- * Parent node must be locked, else we could read in a btree node that's
- * been freed:
- */
- if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
- trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
- }
-
- b = bch2_btree_node_mem_alloc(trans, level != 0);
-
- if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
- if (!path)
- return b;
-
- trans->memory_allocation_failure = true;
- trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
- }
-
- if (IS_ERR(b))
- return b;
-
- bkey_copy(&b->key, k);
- if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
- /* raced with another fill: */
-
- /* mark as unhashed... */
- b->hash_val = 0;
-
- mutex_lock(&bc->lock);
- __bch2_btree_node_to_freelist(bc, b);
- mutex_unlock(&bc->lock);
-
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- return NULL;
- }
-
- set_btree_node_read_in_flight(b);
- six_unlock_write(&b->c.lock);
-
- if (path) {
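-		/*
-		 * Remember the lock sequence number: if it changes while we're
-		 * unlocked for IO, the relock below fails and the caller must
-		 * retry:
-		 */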
- u32 seq = six_lock_seq(&b->c.lock);
-
- /* Unlock before doing IO: */
- six_unlock_intent(&b->c.lock);
- bch2_trans_unlock_noassert(trans);
-
- bch2_btree_node_read(trans, b, sync);
-
- int ret = bch2_trans_relock(trans);
- if (ret)
- return ERR_PTR(ret);
-
- if (!sync)
- return NULL;
-
- if (!six_relock_type(&b->c.lock, lock_type, seq))
- b = NULL;
- } else {
- bch2_btree_node_read(trans, b, sync);
- if (lock_type == SIX_LOCK_read)
- six_lock_downgrade(&b->c.lock);
- }
-
- return b;
-}
-
-static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
-{
- struct printbuf buf = PRINTBUF;
-
- if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
- return;
-
- prt_printf(&buf,
- "btree node header doesn't match ptr: ");
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_str(&buf, "\nptr: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
- prt_str(&buf, "\nheader: ");
- bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data));
- prt_str(&buf, "\nmin ");
- bch2_bpos_to_text(&buf, b->data->min_key);
-
- prt_printf(&buf, "\nmax ");
- bch2_bpos_to_text(&buf, b->data->max_key);
-
- bch2_fs_topology_error(c, "%s", buf.buf);
-
- printbuf_exit(&buf);
-}
-
-static inline void btree_check_header(struct bch_fs *c, struct btree *b)
-{
- if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
- b->c.level != BTREE_NODE_LEVEL(b->data) ||
- !bpos_eq(b->data->max_key, b->key.k.p) ||
- (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
- !bpos_eq(b->data->min_key,
- bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
- btree_bad_header(c, b);
-}
-
-static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
- const struct bkey_i *k, unsigned level,
- enum six_lock_type lock_type,
- unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
- bool need_relock = false;
- int ret;
-
- EBUG_ON(level >= BTREE_MAX_DEPTH);
-retry:
- b = btree_cache_find(bc, k);
- if (unlikely(!b)) {
- /*
- * We must have the parent locked to call bch2_btree_node_fill(),
- * else we could read in a btree node from disk that's been
- * freed:
- */
- b = bch2_btree_node_fill(trans, path, k, path->btree_id,
- level, lock_type, true);
- need_relock = true;
-
- /* We raced and found the btree node in the cache */
- if (!b)
- goto retry;
-
- if (IS_ERR(b))
- return b;
- } else {
- if (btree_node_read_locked(path, level + 1))
- btree_node_unlock(trans, path, level + 1);
-
- ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ERR_PTR(ret);
-
- BUG_ON(ret);
-
- if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
- b->c.level != level ||
- race_fault())) {
- six_unlock_type(&b->c.lock, lock_type);
- if (bch2_btree_node_relock(trans, path, level + 1))
- goto retry;
-
- trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
- }
-
- /* avoid atomic set bit if it's not needed: */
- if (!btree_node_accessed(b))
- set_btree_node_accessed(b);
- }
-
- if (unlikely(btree_node_read_in_flight(b))) {
- u32 seq = six_lock_seq(&b->c.lock);
-
- six_unlock_type(&b->c.lock, lock_type);
- bch2_trans_unlock(trans);
- need_relock = true;
-
- bch2_btree_node_wait_on_read(b);
-
- ret = bch2_trans_relock(trans);
- if (ret)
- return ERR_PTR(ret);
-
- /*
- * should_be_locked is not set on this path yet, so we need to
- * relock it specifically:
- */
- if (!six_relock_type(&b->c.lock, lock_type, seq))
- goto retry;
- }
-
- if (unlikely(need_relock)) {
- ret = bch2_trans_relock(trans) ?:
- bch2_btree_path_relock_intent(trans, path);
- if (ret) {
- six_unlock_type(&b->c.lock, lock_type);
- return ERR_PTR(ret);
- }
- }
-
- prefetch(b->aux_data);
-
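-	/*
-	 * Prefetch the start of each bset's auxiliary search tree data, to
-	 * hide cache misses in the upcoming key lookups:
-	 */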
- for_each_bset(b, t) {
- void *p = (u64 *) b->aux_data + t->aux_data_offset;
-
- prefetch(p + L1_CACHE_BYTES * 0);
- prefetch(p + L1_CACHE_BYTES * 1);
- prefetch(p + L1_CACHE_BYTES * 2);
- }
-
- if (unlikely(btree_node_read_error(b))) {
- six_unlock_type(&b->c.lock, lock_type);
- return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
- }
-
- EBUG_ON(b->c.btree_id != path->btree_id);
- EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
- btree_check_header(c, b);
-
- return b;
-}
-
-/**
- * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
- * in from disk if necessary.
- *
- * @trans: btree transaction object
- * @path: btree_path being traversed
- * @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
- * @level: level of btree node being looked up (0 == leaf node)
- * @lock_type: SIX_LOCK_read or SIX_LOCK_intent
- * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
- *
- * The btree node will be returned with either a read or an intent lock held,
- * depending on @lock_type.
- *
- * Returns: btree node or ERR_PTR()
- */
-struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
- const struct bkey_i *k, unsigned level,
- enum six_lock_type lock_type,
- unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- struct btree *b;
- int ret;
-
- EBUG_ON(level >= BTREE_MAX_DEPTH);
-
- b = btree_node_mem_ptr(k);
-
- /*
- * Check b->hash_val _before_ calling btree_node_lock() - this might not
- * be the node we want anymore, and trying to lock the wrong node could
-	 * cause an unnecessary transaction restart:
- */
- if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
- !b ||
- b->hash_val != btree_ptr_hash_val(k)))
- return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
-
- if (btree_node_read_locked(path, level + 1))
- btree_node_unlock(trans, path, level + 1);
-
- ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ERR_PTR(ret);
-
- BUG_ON(ret);
-
- if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
- b->c.level != level ||
- race_fault())) {
- six_unlock_type(&b->c.lock, lock_type);
- if (bch2_btree_node_relock(trans, path, level + 1))
- return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
-
- trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
- }
-
- if (unlikely(btree_node_read_in_flight(b))) {
- six_unlock_type(&b->c.lock, lock_type);
- return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
- }
-
- prefetch(b->aux_data);
-
- for_each_bset(b, t) {
- void *p = (u64 *) b->aux_data + t->aux_data_offset;
-
- prefetch(p + L1_CACHE_BYTES * 0);
- prefetch(p + L1_CACHE_BYTES * 1);
- prefetch(p + L1_CACHE_BYTES * 2);
- }
-
- /* avoid atomic set bit if it's not needed: */
- if (!btree_node_accessed(b))
- set_btree_node_accessed(b);
-
- if (unlikely(btree_node_read_error(b))) {
- six_unlock_type(&b->c.lock, lock_type);
- return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
- }
-
- EBUG_ON(b->c.btree_id != path->btree_id);
- EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
- btree_check_header(c, b);
-
- return b;
-}
-
-struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
- const struct bkey_i *k,
- enum btree_id btree_id,
- unsigned level,
- bool nofill)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
- int ret;
-
- EBUG_ON(level >= BTREE_MAX_DEPTH);
-
- if (c->opts.btree_node_mem_ptr_optimization) {
- b = btree_node_mem_ptr(k);
- if (b)
- goto lock_node;
- }
-retry:
- b = btree_cache_find(bc, k);
- if (unlikely(!b)) {
- if (nofill)
- goto out;
-
- b = bch2_btree_node_fill(trans, NULL, k, btree_id,
- level, SIX_LOCK_read, true);
-
- /* We raced and found the btree node in the cache */
- if (!b)
- goto retry;
-
- if (IS_ERR(b) &&
- !bch2_btree_cache_cannibalize_lock(trans, NULL))
- goto retry;
-
- if (IS_ERR(b))
- goto out;
- } else {
-lock_node:
- ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ERR_PTR(ret);
-
- BUG_ON(ret);
-
- if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
- b->c.btree_id != btree_id ||
- b->c.level != level)) {
- six_unlock_read(&b->c.lock);
- goto retry;
- }
- }
-
- /* XXX: waiting on IO with btree locks held: */
- __bch2_btree_node_wait_on_read(b);
-
- prefetch(b->aux_data);
-
- for_each_bset(b, t) {
- void *p = (u64 *) b->aux_data + t->aux_data_offset;
-
- prefetch(p + L1_CACHE_BYTES * 0);
- prefetch(p + L1_CACHE_BYTES * 1);
- prefetch(p + L1_CACHE_BYTES * 2);
- }
-
- /* avoid atomic set bit if it's not needed: */
- if (!btree_node_accessed(b))
- set_btree_node_accessed(b);
-
- if (unlikely(btree_node_read_error(b))) {
- six_unlock_read(&b->c.lock);
- b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
- goto out;
- }
-
- EBUG_ON(b->c.btree_id != btree_id);
- EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
- btree_check_header(c, b);
-out:
- bch2_btree_cache_cannibalize_unlock(trans);
- return b;
-}
-
-int bch2_btree_node_prefetch(struct btree_trans *trans,
- struct btree_path *path,
- const struct bkey_i *k,
- enum btree_id btree_id, unsigned level)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
-
- BUG_ON(path && !btree_node_locked(path, level + 1));
- BUG_ON(level >= BTREE_MAX_DEPTH);
-
- struct btree *b = btree_cache_find(bc, k);
- if (b)
- return 0;
-
- b = bch2_btree_node_fill(trans, path, k, btree_id,
- level, SIX_LOCK_read, false);
- int ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- return ret;
- if (b)
- six_unlock_read(&b->c.lock);
- return 0;
-}
-
-void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
-
- b = btree_cache_find(bc, k);
- if (!b)
- return;
-
- BUG_ON(b == btree_node_root(trans->c, b));
-wait_on_io:
- /* not allowed to wait on io with btree locks held: */
-
- /* XXX we're called from btree_gc which will be holding other btree
- * nodes locked
- */
- __bch2_btree_node_wait_on_read(b);
- __bch2_btree_node_wait_on_write(b);
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
- if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
- goto out;
-
- if (btree_node_dirty(b)) {
- __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- goto wait_on_io;
- }
-
- BUG_ON(btree_node_dirty(b));
-
- mutex_lock(&bc->lock);
- bch2_btree_node_hash_remove(bc, b);
- btree_node_data_free(bc, b);
- mutex_unlock(&bc->lock);
-out:
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
-}
-
-const char *bch2_btree_id_str(enum btree_id btree)
-{
- return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
-}
-
-void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree)
-{
- if (btree < BTREE_ID_NR)
- prt_str(out, __bch2_btree_ids[btree]);
- else
- prt_printf(out, "(unknown btree %u)", btree);
-}
-
-void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level)
-{
- prt_str(out, "btree=");
- bch2_btree_id_to_text(out, btree);
- prt_printf(out, " level=%u", level);
-}
-
-void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
- enum btree_id btree, unsigned level, struct bkey_s_c k)
-{
- bch2_btree_id_to_text(out, btree);
- prt_printf(out, " level %u/", level);
- struct btree_root *r = bch2_btree_id_root(c, btree);
- if (r)
- prt_printf(out, "%u", r->level);
- else
- prt_printf(out, "(unknown)");
- prt_printf(out, "\n ");
-
- bch2_bkey_val_to_text(out, c, k);
-}
-
-void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
-{
- __bch2_btree_pos_to_text(out, c, b->c.btree_id, b->c.level, bkey_i_to_s_c(&b->key));
-}
-
-void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
-{
- struct bset_stats stats;
-
- memset(&stats, 0, sizeof(stats));
-
- bch2_btree_keys_stats(b, &stats);
-
- prt_printf(out, "l %u ", b->c.level);
- bch2_bpos_to_text(out, b->data->min_key);
- prt_printf(out, " - ");
- bch2_bpos_to_text(out, b->data->max_key);
- prt_printf(out, ":\n"
- " ptrs: ");
- bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
- prt_newline(out);
-
- prt_printf(out,
- " format: ");
- bch2_bkey_format_to_text(out, &b->format);
-
- prt_printf(out,
- " unpack fn len: %u\n"
- " bytes used %zu/%zu (%zu%% full)\n"
- " sib u64s: %u, %u (merge threshold %u)\n"
- " nr packed keys %u\n"
- " nr unpacked keys %u\n"
- " floats %zu\n"
- " failed unpacked %zu\n",
- b->unpack_fn_len,
- b->nr.live_u64s * sizeof(u64),
- btree_buf_bytes(b) - sizeof(struct btree_node),
- b->nr.live_u64s * 100 / btree_max_u64s(c),
- b->sib_u64s[0],
- b->sib_u64s[1],
- c->btree_foreground_merge_threshold,
- b->nr.packed_keys,
- b->nr.unpacked_keys,
- stats.floats,
- stats.failed);
-}
-
-static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
- const char *label, size_t nr)
-{
- prt_printf(out, "%s\t", label);
- prt_human_readable_u64(out, nr * c->opts.btree_node_size);
- prt_printf(out, " (%zu)\n", nr);
-}
-
-static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
-#define x(n) #n,
- BCH_BTREE_CACHE_NOT_FREED_REASONS()
-#undef x
- NULL
-};
-
-void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
-{
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
-
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 32);
-
- prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
- prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
- prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable);
- prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
- prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
- prt_newline(out);
-
- for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) {
- bch2_btree_id_to_text(out, i);
- prt_printf(out, "\t");
- prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size);
- prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]);
- }
-
- prt_newline(out);
- prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
- prt_printf(out, "not freed:\n");
-
- for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++)
- prt_printf(out, " %s\t%llu\n",
- bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]);
-}
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
deleted file mode 100644
index ca3c1b145330..000000000000
--- a/fs/bcachefs/btree_cache.h
+++ /dev/null
@@ -1,156 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_CACHE_H
-#define _BCACHEFS_BTREE_CACHE_H
-
-#include "bcachefs.h"
-#include "btree_types.h"
-#include "bkey_methods.h"
-
-extern const char * const bch2_btree_node_flags[];
-
-struct btree_iter;
-
-void bch2_recalc_btree_reserve(struct bch_fs *);
-
-void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *);
-
-void __bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
-void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
-
-int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
-int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
- unsigned, enum btree_id);
-
-void bch2_node_pin(struct bch_fs *, struct btree *);
-void bch2_btree_cache_unpin(struct bch_fs *);
-
-void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_i *);
-
-void bch2_btree_cache_cannibalize_unlock(struct btree_trans *);
-int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *);
-
-struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
-struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool);
-
-struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
- const struct bkey_i *, unsigned,
- enum six_lock_type, unsigned long);
-
-struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *,
- enum btree_id, unsigned, bool);
-
-int bch2_btree_node_prefetch(struct btree_trans *, struct btree_path *,
- const struct bkey_i *, enum btree_id, unsigned);
-
-void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *);
-
-void bch2_fs_btree_cache_exit(struct bch_fs *);
-int bch2_fs_btree_cache_init(struct bch_fs *);
-void bch2_fs_btree_cache_init_early(struct btree_cache *);
-
-static inline u64 btree_ptr_hash_val(const struct bkey_i *k)
-{
- switch (k->k.type) {
- case KEY_TYPE_btree_ptr:
- return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start);
- case KEY_TYPE_btree_ptr_v2:
- /*
- * The cast/deref is only necessary to avoid sparse endianness
- * warnings:
- */
- return *((u64 *) &bkey_i_to_btree_ptr_v2_c(k)->v.seq);
- default:
- return 0;
- }
-}
-
-static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k)
-{
- return k->k.type == KEY_TYPE_btree_ptr_v2
- ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr
- : NULL;
-}
-
-/* is btree node in hash table? */
-static inline bool btree_node_hashed(struct btree *b)
-{
- return b->hash_val != 0;
-}
-
-#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \
- for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \
- &(_c)->btree_cache.table), \
- _iter = 0; _iter < (_tbl)->size; _iter++) \
- rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
-
-static inline size_t btree_buf_bytes(const struct btree *b)
-{
- return 1UL << b->byte_order;
-}
-
-static inline size_t btree_buf_max_u64s(const struct btree *b)
-{
- return (btree_buf_bytes(b) - sizeof(struct btree_node)) / sizeof(u64);
-}
-
-static inline size_t btree_max_u64s(const struct bch_fs *c)
-{
- return (c->opts.btree_node_size - sizeof(struct btree_node)) / sizeof(u64);
-}
-
-static inline size_t btree_sectors(const struct bch_fs *c)
-{
- return c->opts.btree_node_size >> SECTOR_SHIFT;
-}
-
-static inline unsigned btree_blocks(const struct bch_fs *c)
-{
- return btree_sectors(c) >> c->block_bits;
-}
-
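-/*
- * Nodes are split when more than 2/3 full; siblings may be merged below 1/3
- * full, with some hysteresis to avoid split/merge thrashing:
- */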
-#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3)
-
-#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3)
-#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \
- (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \
- (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2))
-
-static inline unsigned btree_id_nr_alive(struct bch_fs *c)
-{
- return BTREE_ID_NR + c->btree_roots_extra.nr;
-}
-
-static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned id)
-{
- if (likely(id < BTREE_ID_NR)) {
- return &c->btree_roots_known[id];
- } else {
- unsigned idx = id - BTREE_ID_NR;
-
- /* This can happen when we're called from btree_node_scan */
- if (idx >= c->btree_roots_extra.nr)
- return NULL;
-
- return &c->btree_roots_extra.data[idx];
- }
-}
-
-static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b)
-{
- struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id);
-
- return r ? r->b : NULL;
-}
-
-const char *bch2_btree_id_str(enum btree_id); /* avoid */
-void bch2_btree_id_to_text(struct printbuf *, enum btree_id);
-void bch2_btree_id_level_to_text(struct printbuf *, enum btree_id, unsigned);
-
-void __bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *,
- enum btree_id, unsigned, struct bkey_s_c);
-void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
-void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
-void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *);
-
-#endif /* _BCACHEFS_BTREE_CACHE_H */
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
deleted file mode 100644
index ff681e733598..000000000000
--- a/fs/bcachefs/btree_gc.c
+++ /dev/null
@@ -1,1259 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright (C) 2014 Datera Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "bkey_methods.h"
-#include "bkey_buf.h"
-#include "btree_journal_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_node_scan.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
-#include "btree_gc.h"
-#include "buckets.h"
-#include "clock.h"
-#include "debug.h"
-#include "disk_accounting.h"
-#include "ec.h"
-#include "error.h"
-#include "extents.h"
-#include "journal.h"
-#include "keylist.h"
-#include "move.h"
-#include "progress.h"
-#include "recovery_passes.h"
-#include "reflink.h"
-#include "recovery.h"
-#include "replicas.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/slab.h>
-#include <linux/bitops.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/preempt.h>
-#include <linux/rcupdate.h>
-#include <linux/sched/task.h>
-
-#define DROP_THIS_NODE 10
-#define DROP_PREV_NODE 11
-#define DID_FILL_FROM_SCAN 12
-
-static const char * const bch2_gc_phase_strs[] = {
-#define x(n) #n,
- GC_PHASES()
-#undef x
- NULL
-};
-
-void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p)
-{
- prt_str(out, bch2_gc_phase_strs[p->phase]);
- prt_char(out, ' ');
- bch2_btree_id_level_to_text(out, p->btree, p->level);
- prt_char(out, ' ');
- bch2_bpos_to_text(out, p->pos);
-}
-
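-/*
- * Cast away const: the GC trigger path takes a bkey_s, but never actually
- * modifies the key through it, so passing a bkey_s_c here is safe:
- */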
-static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
-{
- return (struct bkey_s) {{{
- (struct bkey *) k.k,
- (struct bch_val *) k.v
- }}};
-}
-
-static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
-{
- preempt_disable();
- write_seqcount_begin(&c->gc_pos_lock);
- c->gc_pos = new_pos;
- write_seqcount_end(&c->gc_pos_lock);
- preempt_enable();
-}
-
-static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
-{
- BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0);
- __gc_pos_set(c, new_pos);
-}
-
-static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
-{
- switch (b->key.k.type) {
- case KEY_TYPE_btree_ptr: {
- struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);
-
- dst->k.p = src->k.p;
- dst->v.mem_ptr = 0;
- dst->v.seq = b->data->keys.seq;
- dst->v.sectors_written = 0;
- dst->v.flags = 0;
- dst->v.min_key = b->data->min_key;
- set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
- memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
- break;
- }
- case KEY_TYPE_btree_ptr_v2:
- bkey_copy(&dst->k_i, &b->key);
- break;
- default:
- BUG();
- }
-}
-
-static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
-{
- struct bkey_i_btree_ptr_v2 *new;
- int ret;
-
- if (c->opts.verbose) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- prt_str(&buf, " -> ");
- bch2_bpos_to_text(&buf, new_min);
-
- bch_info(c, "%s(): %s", __func__, buf.buf);
- printbuf_exit(&buf);
- }
-
- new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
- if (!new)
- return -BCH_ERR_ENOMEM_gc_repair_key;
-
- btree_ptr_to_v2(b, new);
- b->data->min_key = new_min;
- new->v.min_key = new_min;
- SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
-
- ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
- if (ret) {
- kfree(new);
- return ret;
- }
-
- bch2_btree_node_drop_keys_outside_node(b);
- bkey_copy(&b->key, &new->k_i);
- return 0;
-}
-
-static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
-{
- struct bkey_i_btree_ptr_v2 *new;
- int ret;
-
- if (c->opts.verbose) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- prt_str(&buf, " -> ");
- bch2_bpos_to_text(&buf, new_max);
-
- bch_info(c, "%s(): %s", __func__, buf.buf);
- printbuf_exit(&buf);
- }
-
- ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
- if (ret)
- return ret;
-
- new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
- if (!new)
- return -BCH_ERR_ENOMEM_gc_repair_key;
-
- btree_ptr_to_v2(b, new);
- b->data->max_key = new_max;
- new->k.p = new_max;
- SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
-
- ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
- if (ret) {
- kfree(new);
- return ret;
- }
-
- bch2_btree_node_drop_keys_outside_node(b);
-
- mutex_lock(&c->btree_cache.lock);
- __bch2_btree_node_hash_remove(&c->btree_cache, b);
-
- bkey_copy(&b->key, &new->k_i);
- ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
- BUG_ON(ret);
- mutex_unlock(&c->btree_cache.lock);
- return 0;
-}
-
-static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *b,
- struct btree *prev, struct btree *cur,
- struct bpos *pulled_from_scan)
-{
- struct bch_fs *c = trans->c;
- struct bpos expected_start = !prev
- ? b->data->min_key
- : bpos_successor(prev->key.k.p);
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
- !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
- b->data->min_key));
-
- if (bpos_eq(expected_start, cur->data->min_key))
- return 0;
-
- prt_printf(&buf, " at ");
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_printf(&buf, ":\n parent: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
- if (prev) {
- prt_printf(&buf, "\n prev: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
- }
-
- prt_str(&buf, "\n next: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));
-
- if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */
- if (b->c.level == 1 &&
- bpos_lt(*pulled_from_scan, cur->data->min_key)) {
- ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
- expected_start,
- bpos_predecessor(cur->data->min_key));
- if (ret)
- goto err;
-
- *pulled_from_scan = cur->data->min_key;
- ret = DID_FILL_FROM_SCAN;
- } else {
- if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
- "btree node with incorrect min_key%s", buf.buf))
- ret = set_node_min(c, cur, expected_start);
- }
- } else { /* overlap */
- if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */
- if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */
- if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node,
- "btree node overwritten by next node%s", buf.buf))
- ret = DROP_PREV_NODE;
- } else {
- if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
- "btree node with incorrect max_key%s", buf.buf))
- ret = set_node_max(c, prev,
- bpos_predecessor(cur->data->min_key));
- }
- } else {
- if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */
- if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node,
- "btree node overwritten by prev node%s", buf.buf))
- ret = DROP_THIS_NODE;
- } else {
- if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
- "btree node with incorrect min_key%s", buf.buf))
- ret = set_node_min(c, cur, expected_start);
- }
- }
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static int btree_repair_node_end(struct btree_trans *trans, struct btree *b,
- struct btree *child, struct bpos *pulled_from_scan)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- if (bpos_eq(child->key.k.p, b->key.k.p))
- return 0;
-
- prt_printf(&buf, " at ");
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_printf(&buf, ":\n parent: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
- prt_str(&buf, "\n child: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
-
- if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
- "btree node with incorrect max_key%s", buf.buf)) {
- if (b->c.level == 1 &&
- bpos_lt(*pulled_from_scan, b->key.k.p)) {
- ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
- bpos_successor(child->key.k.p), b->key.k.p);
- if (ret)
- goto err;
-
- *pulled_from_scan = b->key.k.p;
- ret = DID_FILL_FROM_SCAN;
- } else {
- ret = set_node_max(c, child, b->key.k.p);
- }
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b,
- struct bpos *pulled_from_scan)
-{
- struct bch_fs *c = trans->c;
- struct btree_and_journal_iter iter;
- struct bkey_s_c k;
- struct bkey_buf prev_k, cur_k;
- struct btree *prev = NULL, *cur = NULL;
- bool have_child, new_pass = false;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- if (!b->c.level)
- return 0;
-
- bch2_bkey_buf_init(&prev_k);
- bch2_bkey_buf_init(&cur_k);
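-	/*
-	 * Two passes over this node's children: the first checks and repairs
-	 * min/max key boundaries between siblings, the second recurses into
-	 * each child; restart from the top whenever the topology changes:
-	 */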
-again:
- cur = prev = NULL;
- have_child = new_pass = false;
- bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
- iter.prefetch = true;
-
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- BUG_ON(bpos_lt(k.k->p, b->data->min_key));
- BUG_ON(bpos_gt(k.k->p, b->data->max_key));
-
- bch2_btree_and_journal_iter_advance(&iter);
- bch2_bkey_buf_reassemble(&cur_k, c, k);
-
- cur = bch2_btree_node_get_noiter(trans, cur_k.k,
- b->c.btree_id, b->c.level - 1,
- false);
- ret = PTR_ERR_OR_ZERO(cur);
-
- printbuf_reset(&buf);
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1);
- prt_char(&buf, ' ');
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
-
- if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
- trans, btree_node_read_error,
- "Topology repair: unreadable btree node at\n"
- " %s",
- buf.buf)) {
- bch2_btree_node_evict(trans, cur_k.k);
- cur = NULL;
- ret = bch2_journal_key_delete(c, b->c.btree_id,
- b->c.level, cur_k.k->k.p);
- if (ret)
- break;
-
- ret = bch2_btree_lost_data(c, b->c.btree_id);
- if (ret)
- break;
- continue;
- }
-
- bch_err_msg(c, ret, "getting btree node");
- if (ret)
- break;
-
- if (bch2_btree_node_is_stale(c, cur)) {
- bch_info(c, "btree node older than nodes found by scanning\n %s", buf.buf);
- six_unlock_read(&cur->c.lock);
- bch2_btree_node_evict(trans, cur_k.k);
- ret = bch2_journal_key_delete(c, b->c.btree_id,
- b->c.level, cur_k.k->k.p);
- cur = NULL;
- if (ret)
- break;
- continue;
- }
-
- ret = btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan);
- if (ret == DID_FILL_FROM_SCAN) {
- new_pass = true;
- ret = 0;
- }
-
- if (ret == DROP_THIS_NODE) {
- six_unlock_read(&cur->c.lock);
- bch2_btree_node_evict(trans, cur_k.k);
- ret = bch2_journal_key_delete(c, b->c.btree_id,
- b->c.level, cur_k.k->k.p);
- cur = NULL;
- if (ret)
- break;
- continue;
- }
-
- if (prev)
- six_unlock_read(&prev->c.lock);
- prev = NULL;
-
- if (ret == DROP_PREV_NODE) {
- bch_info(c, "dropped prev node");
- bch2_btree_node_evict(trans, prev_k.k);
- ret = bch2_journal_key_delete(c, b->c.btree_id,
- b->c.level, prev_k.k->k.p);
- if (ret)
- break;
-
- bch2_btree_and_journal_iter_exit(&iter);
- goto again;
- } else if (ret)
- break;
-
- prev = cur;
- cur = NULL;
- bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
- }
-
- if (!ret && !IS_ERR_OR_NULL(prev)) {
- BUG_ON(cur);
- ret = btree_repair_node_end(trans, b, prev, pulled_from_scan);
- if (ret == DID_FILL_FROM_SCAN) {
- new_pass = true;
- ret = 0;
- }
- }
-
- if (!IS_ERR_OR_NULL(prev))
- six_unlock_read(&prev->c.lock);
- prev = NULL;
- if (!IS_ERR_OR_NULL(cur))
- six_unlock_read(&cur->c.lock);
- cur = NULL;
-
- if (ret)
- goto err;
-
- bch2_btree_and_journal_iter_exit(&iter);
-
- if (new_pass)
- goto again;
-
- bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
- iter.prefetch = true;
-
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- bch2_bkey_buf_reassemble(&cur_k, c, k);
- bch2_btree_and_journal_iter_advance(&iter);
-
- cur = bch2_btree_node_get_noiter(trans, cur_k.k,
- b->c.btree_id, b->c.level - 1,
- false);
- ret = PTR_ERR_OR_ZERO(cur);
-
- bch_err_msg(c, ret, "getting btree node");
- if (ret)
- goto err;
-
- ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan);
- six_unlock_read(&cur->c.lock);
- cur = NULL;
-
- if (ret == DROP_THIS_NODE) {
- bch2_btree_node_evict(trans, cur_k.k);
- ret = bch2_journal_key_delete(c, b->c.btree_id,
- b->c.level, cur_k.k->k.p);
- new_pass = true;
- }
-
- if (ret)
- goto err;
-
- have_child = true;
- }
-
- printbuf_reset(&buf);
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
- if (mustfix_fsck_err_on(!have_child,
- trans, btree_node_topology_interior_node_empty,
- "empty interior btree node at %s", buf.buf))
- ret = DROP_THIS_NODE;
-err:
-fsck_err:
- if (!IS_ERR_OR_NULL(prev))
- six_unlock_read(&prev->c.lock);
- if (!IS_ERR_OR_NULL(cur))
- six_unlock_read(&cur->c.lock);
-
- bch2_btree_and_journal_iter_exit(&iter);
-
- if (!ret && new_pass)
- goto again;
-
- BUG_ON(!ret && bch2_btree_node_check_topology(trans, b));
-
- bch2_bkey_buf_exit(&prev_k, c);
- bch2_bkey_buf_exit(&cur_k, c);
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_check_topology(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bpos pulled_from_scan = POS_MIN;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- bch2_trans_srcu_unlock(trans);
-
- for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
- struct btree_root *r = bch2_btree_id_root(c, i);
- bool reconstructed_root = false;
-
- printbuf_reset(&buf);
- bch2_btree_id_to_text(&buf, i);
-
- if (r->error) {
- ret = bch2_btree_lost_data(c, i);
- if (ret)
- break;
-reconstruct_root:
- bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);
-
- r->alive = false;
- r->error = 0;
-
- if (!bch2_btree_has_scanned_nodes(c, i)) {
- mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing,
- "no nodes found for btree %s, continue?", buf.buf);
- bch2_btree_root_alloc_fake_trans(trans, i, 0);
- } else {
- bch2_btree_root_alloc_fake_trans(trans, i, 1);
- bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
- ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
- if (ret)
- break;
- }
-
- reconstructed_root = true;
- }
-
- struct btree *b = r->b;
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
- six_unlock_read(&b->c.lock);
-
- if (ret == DROP_THIS_NODE) {
- mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
- mutex_unlock(&c->btree_cache.lock);
-
- r->b = NULL;
-
- if (!reconstructed_root)
- goto reconstruct_root;
-
- bch_err(c, "empty btree root %s", buf.buf);
- bch2_btree_root_alloc_fake_trans(trans, i, 0);
- r->alive = false;
- ret = 0;
- }
- }
-fsck_err:
- printbuf_exit(&buf);
- bch2_trans_put(trans);
- return ret;
-}
-
-/* marking of btree keys/nodes: */
-
-static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
- unsigned level, struct btree **prev,
- struct btree_iter *iter, struct bkey_s_c k,
- bool initial)
-{
- struct bch_fs *c = trans->c;
-
- if (iter) {
- struct btree_path *path = btree_iter_path(trans, iter);
- struct btree *b = path_l(path)->b;
-
- if (*prev != b) {
- int ret = bch2_btree_node_check_topology(trans, b);
- if (ret)
- return ret;
- }
- *prev = b;
- }
-
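-	/*
-	 * Triggers are run as if inserting k over a deleted key, so the GC
-	 * trigger sees a pure insert and counts each reference exactly once:
-	 */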
- struct bkey deleted = KEY(0, 0, 0);
- struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- deleted.p = k.k->p;
-
- if (initial) {
- BUG_ON(bch2_journal_seq_verify &&
- k.k->bversion.lo > atomic64_read(&c->journal.seq));
-
- if (fsck_err_on(btree_id != BTREE_ID_accounting &&
- k.k->bversion.lo > atomic64_read(&c->key_version),
- trans, bkey_version_in_future,
- "key version number higher than recorded %llu\n %s",
- atomic64_read(&c->key_version),
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- atomic64_set(&c->key_version, k.k->bversion.lo);
- }
-
- if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
- trans, btree_bitmap_not_marked,
- "btree ptr not marked in member info btree allocated bitmap\n %s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k),
- buf.buf))) {
- mutex_lock(&c->sb_lock);
- bch2_dev_btree_bitmap_mark(c, k);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
-
- /*
- * We require a commit before key_trigger() because
-	 * key_trigger(BTREE_TRIGGER_GC) is not idempotent; we'll calculate the
- * wrong result if we run it multiple times.
- */
- unsigned flags = !iter ? BTREE_TRIGGER_is_root : 0;
-
- ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
- BTREE_TRIGGER_check_repair|flags);
- if (ret)
- goto out;
-
- if (trans->nr_updates) {
- ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
- -BCH_ERR_transaction_restart_nested;
- goto out;
- }
-
- ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
- BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags);
-out:
-fsck_err:
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_gc_btree(struct btree_trans *trans,
- struct progress_indicator_state *progress,
- enum btree_id btree, bool initial)
-{
- struct bch_fs *c = trans->c;
- unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
- int ret = 0;
-
- /* We need to make sure every leaf node is readable before going RW */
- if (initial)
- target_depth = 0;
-
- for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) {
- struct btree *prev = NULL;
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level,
- BTREE_ITER_prefetch);
-
- ret = for_each_btree_key_continue(trans, iter, 0, k, ({
- bch2_progress_update_iter(trans, progress, &iter, "check_allocations");
- gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
- bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
- }));
- if (ret)
- goto err;
- }
-
- /* root */
- do {
-retry_root:
- bch2_trans_begin(trans);
-
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN,
- 0, bch2_btree_id_root(c, btree)->b->c.level, 0);
- struct btree *b = bch2_btree_iter_peek_node(&iter);
- ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- goto err_root;
-
- if (b != btree_node_root(c, b)) {
- bch2_trans_iter_exit(trans, &iter);
- goto retry_root;
- }
-
- gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
- struct bkey_s_c k = bkey_i_to_s_c(&b->key);
- ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial);
-err_root:
- bch2_trans_iter_exit(trans, &iter);
- } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
-err:
- bch_err_fn(c, ret);
- return ret;
-}
-
-static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
-{
- return cmp_int(gc_btree_order(l), gc_btree_order(r));
-}
-
-static int bch2_gc_btrees(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct progress_indicator_state progress;
- bch2_progress_init(&progress, c, ~0ULL);
-
- enum btree_id ids[BTREE_ID_NR];
- for (unsigned i = 0; i < BTREE_ID_NR; i++)
- ids[i] = i;
- bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
-
- for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
- unsigned btree = i < BTREE_ID_NR ? ids[i] : i;
-
- if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
- continue;
-
- ret = bch2_gc_btree(trans, &progress, btree, true);
- }
-
- printbuf_exit(&buf);
- bch2_trans_put(trans);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_mark_superblocks(struct bch_fs *c)
-{
- gc_pos_set(c, gc_phase(GC_PHASE_sb));
-
- return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc);
-}
-
-static void bch2_gc_free(struct bch_fs *c)
-{
- bch2_accounting_gc_free(c);
-
- genradix_free(&c->reflink_gc_table);
- genradix_free(&c->gc_stripes);
-
- for_each_member_device(c, ca)
- genradix_free(&ca->buckets_gc);
-}
-
-static int bch2_gc_start(struct bch_fs *c)
-{
- for_each_member_device(c, ca) {
- int ret = bch2_dev_usage_init(ca, true);
- if (ret) {
- bch2_dev_put(ca);
- return ret;
- }
- }
-
- return 0;
-}
-
-/* returns true if not equal */
-static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
- struct bch_alloc_v4 r)
-{
- return l.gen != r.gen ||
- l.oldest_gen != r.oldest_gen ||
- l.data_type != r.data_type ||
- l.dirty_sectors != r.dirty_sectors ||
- l.stripe_sectors != r.stripe_sectors ||
- l.cached_sectors != r.cached_sectors ||
- l.stripe_redundancy != r.stripe_redundancy ||
- l.stripe != r.stripe;
-}
-
-static int bch2_alloc_write_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_dev *ca,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_i_alloc_v4 *a;
- struct bch_alloc_v4 old_gc, gc, old_convert, new;
- const struct bch_alloc_v4 *old;
- int ret;
-
- if (!bucket_valid(ca, k.k->p.offset))
- return 0;
-
- old = bch2_alloc_to_v4(k, &old_convert);
- gc = new = *old;
-
- __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset));
-
- old_gc = gc;
-
- if ((old->data_type == BCH_DATA_sb ||
- old->data_type == BCH_DATA_journal) &&
- !bch2_dev_is_online(ca)) {
- gc.data_type = old->data_type;
- gc.dirty_sectors = old->dirty_sectors;
- }
-
- /*
- * gc.data_type doesn't yet include need_discard & need_gc_gen states -
- * fix that here:
- */
- alloc_data_type_set(&gc, gc.data_type);
- if (gc.data_type != old_gc.data_type ||
- gc.dirty_sectors != old_gc.dirty_sectors) {
- ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc);
- if (ret)
- return ret;
-
- /*
- * Ugly: alloc_key_to_dev_counters(..., BTREE_TRIGGER_gc) is not
-		 * safe w.r.t. transaction restarts, so fix up the gc_bucket so
- * we don't run it twice:
- */
- struct bucket *gc_m = gc_bucket(ca, iter->pos.offset);
- gc_m->data_type = gc.data_type;
- gc_m->dirty_sectors = gc.dirty_sectors;
- }
-
- if (fsck_err_on(new.data_type != gc.data_type,
- trans, alloc_key_data_type_wrong,
- "bucket %llu:%llu gen %u has wrong data_type"
- ": got %s, should be %s",
- iter->pos.inode, iter->pos.offset,
- gc.gen,
- bch2_data_type_str(new.data_type),
- bch2_data_type_str(gc.data_type)))
- new.data_type = gc.data_type;
-
-#define copy_bucket_field(_errtype, _f) \
- if (fsck_err_on(new._f != gc._f, \
- trans, _errtype, \
- "bucket %llu:%llu gen %u data type %s has wrong " #_f \
- ": got %llu, should be %llu", \
- iter->pos.inode, iter->pos.offset, \
- gc.gen, \
- bch2_data_type_str(gc.data_type), \
- (u64) new._f, (u64) gc._f)) \
- new._f = gc._f; \
-
- copy_bucket_field(alloc_key_gen_wrong, gen);
- copy_bucket_field(alloc_key_dirty_sectors_wrong, dirty_sectors);
- copy_bucket_field(alloc_key_stripe_sectors_wrong, stripe_sectors);
- copy_bucket_field(alloc_key_cached_sectors_wrong, cached_sectors);
- copy_bucket_field(alloc_key_stripe_wrong, stripe);
- copy_bucket_field(alloc_key_stripe_redundancy_wrong, stripe_redundancy);
-#undef copy_bucket_field
-
- if (!bch2_alloc_v4_cmp(*old, new))
- return 0;
-
- a = bch2_alloc_to_v4_mut(trans, k);
- ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- return ret;
-
- a->v = new;
-
- /*
- * The trigger normally makes sure these are set, but we're not running
- * triggers:
- */
- if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
- a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
-
- ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun);
-fsck_err:
- return ret;
-}
-
-static int bch2_gc_alloc_done(struct bch_fs *c)
-{
- int ret = 0;
-
- for_each_member_device(c, ca) {
- ret = bch2_trans_run(c,
- for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc,
- POS(ca->dev_idx, ca->mi.first_bucket),
- POS(ca->dev_idx, ca->mi.nbuckets - 1),
- BTREE_ITER_slots|BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_alloc_write_key(trans, &iter, ca, k)));
- if (ret) {
- bch2_dev_put(ca);
- break;
- }
- }
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_gc_alloc_start(struct bch_fs *c)
-{
- int ret = 0;
-
- for_each_member_device(c, ca) {
- ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
- if (ret) {
- bch2_dev_put(ca);
- ret = -BCH_ERR_ENOMEM_gc_alloc_start;
- break;
- }
- }
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_gc_write_stripes_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- const struct bch_stripe *s;
- struct gc_stripe *m;
- bool bad = false;
- unsigned i;
- int ret = 0;
-
- if (k.k->type != KEY_TYPE_stripe)
- return 0;
-
- s = bkey_s_c_to_stripe(k).v;
- m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
-
- for (i = 0; i < s->nr_blocks; i++) {
- u32 old = stripe_blockcount_get(s, i);
- u32 new = (m ? m->block_sectors[i] : 0);
-
- if (old != new) {
- prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
- i, old, new);
- bad = true;
- }
- }
-
- if (bad)
- bch2_bkey_val_to_text(&buf, c, k);
-
- if (fsck_err_on(bad,
- trans, stripe_sector_count_wrong,
- "%s", buf.buf)) {
- struct bkey_i_stripe *new;
-
- new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- return ret;
-
- bkey_reassemble(&new->k_i, k);
-
- for (i = 0; i < new->v.nr_blocks; i++)
- stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
-
- ret = bch2_trans_update(trans, iter, &new->k_i, 0);
- }
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static int bch2_gc_stripes_done(struct bch_fs *c)
-{
- return bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_gc_write_stripes_key(trans, &iter, k)));
-}
-
-/**
- * bch2_check_allocations - walk all references to buckets, and recompute them:
- *
- * @c: filesystem object
- *
- * Returns: 0 on success, or standard errcode on failure
- *
- * Order matters here:
- * - Concurrent GC relies on the fact that we have a total ordering for
- * everything that GC walks - see gc_will_visit_node(),
- * gc_will_visit_root()
- *
- * - also, references move around in the course of index updates and
- * various other crap: everything needs to agree on the ordering
- * references are allowed to move around in - e.g., we're allowed to
- * start with a reference owned by an open_bucket (the allocator) and
- * move it to the btree, but not the reverse.
- *
- * This is necessary to ensure that gc doesn't miss references that
- * move around - if references move backwards in the ordering GC
- * uses, GC could skip past them
- */
-int bch2_check_allocations(struct bch_fs *c)
-{
- int ret;
-
- lockdep_assert_held(&c->state_lock);
-
- down_write(&c->gc_lock);
-
- bch2_btree_interior_updates_flush(c);
-
- ret = bch2_gc_accounting_start(c) ?:
- bch2_gc_start(c) ?:
- bch2_gc_alloc_start(c) ?:
- bch2_gc_reflink_start(c);
- if (ret)
- goto out;
-
- gc_pos_set(c, gc_phase(GC_PHASE_start));
-
- ret = bch2_mark_superblocks(c);
- bch_err_msg(c, ret, "marking superblocks");
- if (ret)
- goto out;
-
- ret = bch2_gc_btrees(c);
- if (ret)
- goto out;
-
- c->gc_count++;
-
- ret = bch2_gc_alloc_done(c) ?:
- bch2_gc_accounting_done(c) ?:
- bch2_gc_stripes_done(c) ?:
- bch2_gc_reflink_done(c);
-out:
- percpu_down_write(&c->mark_lock);
- /* Indicates that gc is no longer in progress: */
- __gc_pos_set(c, gc_phase(GC_PHASE_not_running));
-
- bch2_gc_free(c);
- percpu_up_write(&c->mark_lock);
-
- up_write(&c->gc_lock);
-
- /*
- * At startup, allocations can happen directly instead of via the
- * allocator thread - issue wakeup in case they blocked on gc_lock:
- */
- closure_wake_up(&c->freelist_wait);
- bch_err_fn(c, ret);
- return ret;
-}
-
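-/*
- * If any pointer in @k is badly stale (bucket gen more than 16 ahead of the
- * pointer's gen), rewrite the key - bch2_extent_normalize() below drops stale
- * cached pointers; otherwise, just record the oldest pointer gen seen for
- * each bucket:
- */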
-static int gc_btree_gens_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bkey_i *u;
- int ret;
-
- if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
- return -EROFS;
-
- rcu_read_lock();
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ca)
- continue;
-
- if (dev_ptr_stale(ca, ptr) > 16) {
- rcu_read_unlock();
- goto update;
- }
- }
-
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ca)
- continue;
-
- u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
- if (gen_after(*gen, ptr->gen))
- *gen = ptr->gen;
- }
- rcu_read_unlock();
- return 0;
-update:
- u = bch2_bkey_make_mut(trans, iter, &k, 0);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- return ret;
-
- bch2_extent_normalize(c, bkey_i_to_s(u));
- return 0;
-}
-
-static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca,
- struct btree_iter *iter, struct bkey_s_c k)
-{
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
- struct bkey_i_alloc_v4 *a_mut;
- int ret;
-
- if (a->oldest_gen == ca->oldest_gen[iter->pos.offset])
- return 0;
-
- a_mut = bch2_alloc_to_v4_mut(trans, k);
- ret = PTR_ERR_OR_ZERO(a_mut);
- if (ret)
- return ret;
-
- a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
-
- return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
-}
-
-int bch2_gc_gens(struct bch_fs *c)
-{
- u64 b, start_time = local_clock();
- int ret;
-
- if (!mutex_trylock(&c->gc_gens_lock))
- return 0;
-
- trace_and_count(c, gc_gens_start, c);
-
- /*
- * We have to use trylock here. Otherwise, we would
- * introduce a deadlock in the RO path - we take the
- * state lock at the start of going RO.
- */
- if (!down_read_trylock(&c->state_lock)) {
- mutex_unlock(&c->gc_gens_lock);
- return 0;
- }
-
- for_each_member_device(c, ca) {
- struct bucket_gens *gens = bucket_gens(ca);
-
- BUG_ON(ca->oldest_gen);
-
- ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
- if (!ca->oldest_gen) {
- bch2_dev_put(ca);
- ret = -BCH_ERR_ENOMEM_gc_gens;
- goto err;
- }
-
- for (b = gens->first_bucket;
- b < gens->nbuckets; b++)
- ca->oldest_gen[b] = gens->b[b];
- }
-
- for (unsigned i = 0; i < BTREE_ID_NR; i++)
- if (btree_type_has_ptrs(i)) {
- c->gc_gens_btree = i;
- c->gc_gens_pos = POS_MIN;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, i,
- POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
- k,
- NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- gc_btree_gens_key(trans, &iter, k)));
- if (ret)
- goto err;
- }
-
- struct bch_dev *ca = NULL;
- ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
- POS_MIN,
- BTREE_ITER_prefetch,
- k,
- NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc, ({
- ca = bch2_dev_iterate(c, ca, k.k->p.inode);
- if (!ca) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
- continue;
- }
- bch2_alloc_write_oldest_gen(trans, ca, &iter, k);
- })));
- bch2_dev_put(ca);
-
- if (ret)
- goto err;
-
- c->gc_gens_btree = 0;
- c->gc_gens_pos = POS_MIN;
-
- c->gc_count++;
-
- bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
- trace_and_count(c, gc_gens_end, c);
-err:
- for_each_member_device(c, ca) {
- kvfree(ca->oldest_gen);
- ca->oldest_gen = NULL;
- }
-
- up_read(&c->state_lock);
- mutex_unlock(&c->gc_gens_lock);
- if (!bch2_err_matches(ret, EROFS))
- bch_err_fn(c, ret);
- return ret;
-}
-
-static void bch2_gc_gens_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work);
- bch2_gc_gens(c);
- bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
-}
-
-void bch2_gc_gens_async(struct bch_fs *c)
-{
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) &&
- !queue_work(c->write_ref_wq, &c->gc_gens_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
-}
-
-void bch2_fs_btree_gc_exit(struct bch_fs *c)
-{
-}
-
-int bch2_fs_btree_gc_init(struct bch_fs *c)
-{
- seqcount_init(&c->gc_pos_lock);
- INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);
-
- init_rwsem(&c->gc_lock);
- mutex_init(&c->gc_gens_lock);
- return 0;
-}
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
deleted file mode 100644
index 9693a90a48a2..000000000000
--- a/fs/bcachefs/btree_gc.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_GC_H
-#define _BCACHEFS_BTREE_GC_H
-
-#include "bkey.h"
-#include "btree_gc_types.h"
-#include "btree_types.h"
-
-int bch2_check_topology(struct bch_fs *);
-int bch2_check_allocations(struct bch_fs *);
-
-/*
- * For concurrent mark and sweep (with other index updates), we define a total
- * ordering of _all_ references GC walks:
- *
- * Note that some references will have the same GC position as others - e.g.
- * everything within the same btree node; in those cases we're relying on
- * whatever locking exists for where those references live, i.e. the write lock
- * on a btree node.
- *
- * That locking is also required to ensure GC doesn't pass the updater in
- * between the updater adding/removing the reference and updating the GC marks;
- * without that, we would at best double count sometimes.
- *
- * That part is important - whenever calling bch2_mark_pointers(), a lock _must_
- * be held that prevents GC from passing the position the updater is at.
- *
- * (What about the start of gc, when we're clearing all the marks? GC clears the
- * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc
- * position inside its cmpxchg loop, so crap magically works).
- */
-
-/* Position of (the start of) a gc phase: */
-static inline struct gc_pos gc_phase(enum gc_phase phase)
-{
- return (struct gc_pos) { .phase = phase, };
-}
-
-static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level,
- struct bpos pos)
-{
- return (struct gc_pos) {
- .phase = GC_PHASE_btree,
- .btree = btree,
- .level = level,
- .pos = pos,
- };
-}
-
-static inline int gc_btree_order(enum btree_id btree)
-{
- if (btree == BTREE_ID_alloc)
- return -2;
- if (btree == BTREE_ID_stripes)
- return -1;
- return btree;
-}
-
-static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
-{
- return cmp_int(l.phase, r.phase) ?:
- cmp_int(gc_btree_order(l.btree),
- gc_btree_order(r.btree)) ?:
- cmp_int(l.level, r.level) ?:
- bpos_cmp(l.pos, r.pos);
-}
-
-static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
-{
- unsigned seq;
- bool ret;
-
- do {
- seq = read_seqcount_begin(&c->gc_pos_lock);
- ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
- } while (read_seqcount_retry(&c->gc_pos_lock, seq));
-
- return ret;
-}
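-
-/*
- * Illustrative usage sketch (assumed typical usage, not part of this header):
- * an updater that has just added or removed a bucket reference checks whether
- * GC has already walked past that position, and if so also applies the change
- * to GC's copy of the bucket counters:
- *
- *	if (gc_visited(c, gc_pos_btree(btree, level, pos)))
- *		update_gc_bucket_counters(...);	/* hypothetical helper */
- */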
-
-void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *);
-
-int bch2_gc_gens(struct bch_fs *);
-void bch2_gc_gens_async(struct bch_fs *);
-
-void bch2_fs_btree_gc_exit(struct bch_fs *);
-int bch2_fs_btree_gc_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_GC_H */
diff --git a/fs/bcachefs/btree_gc_types.h b/fs/bcachefs/btree_gc_types.h
deleted file mode 100644
index c24dd6edf377..000000000000
--- a/fs/bcachefs/btree_gc_types.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_GC_TYPES_H
-#define _BCACHEFS_BTREE_GC_TYPES_H
-
-#include <linux/generic-radix-tree.h>
-
-#define GC_PHASES() \
- x(not_running) \
- x(start) \
- x(sb) \
- x(btree)
-
-enum gc_phase {
-#define x(n) GC_PHASE_##n,
- GC_PHASES()
-#undef x
-};
-
-struct gc_pos {
- enum gc_phase phase:8;
- enum btree_id btree:8;
- u16 level;
- struct bpos pos;
-};
-
-struct reflink_gc {
- u64 offset;
- u32 size;
- u32 refcount;
-};
-
-typedef GENRADIX(struct reflink_gc) reflink_gc_table;
-
-#endif /* _BCACHEFS_BTREE_GC_TYPES_H */
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
deleted file mode 100644
index 2ba33ffc9795..000000000000
--- a/fs/bcachefs/btree_io.c
+++ /dev/null
@@ -1,2635 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "bkey_methods.h"
-#include "bkey_sort.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "io_write.h"
-#include "journal_reclaim.h"
-#include "journal_seq_blacklist.h"
-#include "recovery.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/sched/mm.h>
-
-static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
-{
- bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn));
- prt_printf(out, " seq %llx %llu\n", bn->keys.seq, BTREE_NODE_SEQ(bn));
- prt_str(out, "min: ");
- bch2_bpos_to_text(out, bn->min_key);
- prt_newline(out);
- prt_str(out, "max: ");
- bch2_bpos_to_text(out, bn->max_key);
-}
-
-void bch2_btree_node_io_unlock(struct btree *b)
-{
- EBUG_ON(!btree_node_write_in_flight(b));
-
- clear_btree_node_write_in_flight_inner(b);
- clear_btree_node_write_in_flight(b);
- wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
-}
-
-void bch2_btree_node_io_lock(struct btree *b)
-{
- wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
-
-void __bch2_btree_node_wait_on_read(struct btree *b)
-{
- wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
-
-void __bch2_btree_node_wait_on_write(struct btree *b)
-{
- wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
-
-void bch2_btree_node_wait_on_read(struct btree *b)
-{
- wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
-
-void bch2_btree_node_wait_on_write(struct btree *b)
-{
- wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
-
-static void verify_no_dups(struct btree *b,
- struct bkey_packed *start,
- struct bkey_packed *end)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct bkey_packed *k, *p;
-
- if (start == end)
- return;
-
- for (p = start, k = bkey_p_next(start);
- k != end;
- p = k, k = bkey_p_next(k)) {
- struct bkey l = bkey_unpack_key(b, p);
- struct bkey r = bkey_unpack_key(b, k);
-
- BUG_ON(bpos_ge(l.p, bkey_start_pos(&r)));
- }
-#endif
-}
-
-static void set_needs_whiteout(struct bset *i, int v)
-{
- struct bkey_packed *k;
-
- for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
- k->needs_whiteout = v;
-}
-
-static void btree_bounce_free(struct bch_fs *c, size_t size,
- bool used_mempool, void *p)
-{
- if (used_mempool)
- mempool_free(p, &c->btree_bounce_pool);
- else
- kvfree(p);
-}
-
-static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
- bool *used_mempool)
-{
- unsigned flags = memalloc_nofs_save();
- void *p;
-
- BUG_ON(size > c->opts.btree_node_size);
-
- *used_mempool = false;
- p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
- if (!p) {
- *used_mempool = true;
- p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
- }
- memalloc_nofs_restore(flags);
- return p;
-}
-
-static void sort_bkey_ptrs(const struct btree *bt,
- struct bkey_packed **ptrs, unsigned nr)
-{
- unsigned n = nr, a = nr / 2, b, c, d;
-
- if (!a)
- return;
-
-	/*
-	 * Bottom-up heap sort, as in lib/sort.c: the first branch builds the
-	 * heap (a counts down), then we repeatedly swap the max to the end
-	 * (n counts down). Each pass sifts down by following the larger child
-	 * to a leaf, backtracks up to where the displaced element belongs,
-	 * then rotates it into place.
-	 */
- while (1) {
- if (a)
- a--;
- else if (--n)
- swap(ptrs[0], ptrs[n]);
- else
- break;
-
- for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
- b = bch2_bkey_cmp_packed(bt,
- ptrs[c],
- ptrs[d]) >= 0 ? c : d;
- if (d == n)
- b = c;
-
- while (b != a &&
- bch2_bkey_cmp_packed(bt,
- ptrs[a],
- ptrs[b]) >= 0)
- b = (b - 1) / 2;
- c = b;
- while (b != a) {
- b = (b - 1) / 2;
- swap(ptrs[b], ptrs[c]);
- }
- }
-}
-
-static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
-{
- struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k;
- bool used_mempool = false;
- size_t bytes = b->whiteout_u64s * sizeof(u64);
-
- if (!b->whiteout_u64s)
- return;
-
- new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool);
-
- ptrs = ptrs_end = ((void *) new_whiteouts + bytes);
-
- for (k = unwritten_whiteouts_start(b);
- k != unwritten_whiteouts_end(b);
- k = bkey_p_next(k))
- *--ptrs = k;
-
- sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs);
-
- k = new_whiteouts;
-
- while (ptrs != ptrs_end) {
- bkey_p_copy(k, *ptrs);
- k = bkey_p_next(k);
- ptrs++;
- }
-
- verify_no_dups(b, new_whiteouts,
- (void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
-
- memcpy_u64s(unwritten_whiteouts_start(b),
- new_whiteouts, b->whiteout_u64s);
-
- btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
-}
-
-static bool should_compact_bset(struct btree *b, struct bset_tree *t,
- bool compacting, enum compact_mode mode)
-{
- if (!bset_dead_u64s(b, t))
- return false;
-
- switch (mode) {
- case COMPACT_LAZY:
- return should_compact_bset_lazy(b, t) ||
- (compacting && !bset_written(b, bset(b, t)));
- case COMPACT_ALL:
- return true;
- default:
- BUG();
- }
-}
-
-static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
-{
- bool ret = false;
-
- for_each_bset(b, t) {
- struct bset *i = bset(b, t);
- struct bkey_packed *k, *n, *out, *start, *end;
- struct btree_node_entry *src = NULL, *dst = NULL;
-
- if (t != b->set && !bset_written(b, i)) {
- src = container_of(i, struct btree_node_entry, keys);
- dst = max(write_block(b),
- (void *) btree_bkey_last(b, t - 1));
- }
-
- if (src != dst)
- ret = true;
-
- if (!should_compact_bset(b, t, ret, mode)) {
- if (src != dst) {
- memmove(dst, src, sizeof(*src) +
- le16_to_cpu(src->keys.u64s) *
- sizeof(u64));
- i = &dst->keys;
- set_btree_bset(b, t, i);
- }
- continue;
- }
-
- start = btree_bkey_first(b, t);
- end = btree_bkey_last(b, t);
-
- if (src != dst) {
- memmove(dst, src, sizeof(*src));
- i = &dst->keys;
- set_btree_bset(b, t, i);
- }
-
- out = i->start;
-
- for (k = start; k != end; k = n) {
- n = bkey_p_next(k);
-
- if (!bkey_deleted(k)) {
- bkey_p_copy(out, k);
- out = bkey_p_next(out);
- } else {
- BUG_ON(k->needs_whiteout);
- }
- }
-
- i->u64s = cpu_to_le16((u64 *) out - i->_data);
- set_btree_bset_end(b, t);
- bch2_bset_set_no_aux_tree(b, t);
- ret = true;
- }
-
- bch2_verify_btree_nr_keys(b);
-
- bch2_btree_build_aux_trees(b);
-
- return ret;
-}
-
-bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
- enum compact_mode mode)
-{
- return bch2_drop_whiteouts(b, mode);
-}
-
-static void btree_node_sort(struct bch_fs *c, struct btree *b,
- unsigned start_idx,
- unsigned end_idx)
-{
- struct btree_node *out;
- struct sort_iter_stack sort_iter;
- struct bset_tree *t;
- struct bset *start_bset = bset(b, &b->set[start_idx]);
- bool used_mempool = false;
- u64 start_time, seq = 0;
- unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1;
- bool sorting_entire_node = start_idx == 0 &&
- end_idx == b->nsets;
-
- sort_iter_stack_init(&sort_iter, b);
-
- for (t = b->set + start_idx;
- t < b->set + end_idx;
- t++) {
- u64s += le16_to_cpu(bset(b, t)->u64s);
- sort_iter_add(&sort_iter.iter,
- btree_bkey_first(b, t),
- btree_bkey_last(b, t));
- }
-
- bytes = sorting_entire_node
- ? btree_buf_bytes(b)
- : __vstruct_bytes(struct btree_node, u64s);
-
- out = btree_bounce_alloc(c, bytes, &used_mempool);
-
- start_time = local_clock();
-
- u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter);
-
- out->keys.u64s = cpu_to_le16(u64s);
-
- BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);
-
- if (sorting_entire_node)
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
- start_time);
-
- /* Make sure we preserve bset journal_seq: */
- for (t = b->set + start_idx; t < b->set + end_idx; t++)
- seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
- start_bset->journal_seq = cpu_to_le64(seq);
-
- if (sorting_entire_node) {
- u64s = le16_to_cpu(out->keys.u64s);
-
- BUG_ON(bytes != btree_buf_bytes(b));
-
- /*
- * Our temporary buffer is the same size as the btree node's
-		 * buffer, so we can just swap buffers instead of doing a big
- * memcpy()
- */
- *out = *b->data;
- out->keys.u64s = cpu_to_le16(u64s);
- swap(out, b->data);
- set_btree_bset(b, b->set, &b->data->keys);
- } else {
- start_bset->u64s = out->keys.u64s;
- memcpy_u64s(start_bset->start,
- out->keys.start,
- le16_to_cpu(out->keys.u64s));
- }
-
- for (i = start_idx + 1; i < end_idx; i++)
- b->nr.bset_u64s[start_idx] +=
- b->nr.bset_u64s[i];
-
- b->nsets -= shift;
-
- for (i = start_idx + 1; i < b->nsets; i++) {
- b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift];
- b->set[i] = b->set[i + shift];
- }
-
- for (i = b->nsets; i < MAX_BSETS; i++)
- b->nr.bset_u64s[i] = 0;
-
- set_btree_bset_end(b, &b->set[start_idx]);
- bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);
-
- btree_bounce_free(c, bytes, used_mempool, out);
-
- bch2_verify_btree_nr_keys(b);
-}
-
-void bch2_btree_sort_into(struct bch_fs *c,
- struct btree *dst,
- struct btree *src)
-{
- struct btree_nr_keys nr;
- struct btree_node_iter src_iter;
- u64 start_time = local_clock();
-
- BUG_ON(dst->nsets != 1);
-
- bch2_bset_set_no_aux_tree(dst, dst->set);
-
- bch2_btree_node_iter_init_from_start(&src_iter, src);
-
- nr = bch2_sort_repack(btree_bset_first(dst),
- src, &src_iter,
- &dst->format,
- true);
-
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
- start_time);
-
- set_btree_bset_end(dst, dst->set);
-
- dst->nr.live_u64s += nr.live_u64s;
- dst->nr.bset_u64s[0] += nr.bset_u64s[0];
- dst->nr.packed_keys += nr.packed_keys;
- dst->nr.unpacked_keys += nr.unpacked_keys;
-
- bch2_verify_btree_nr_keys(dst);
-}
-
-/*
- * We're about to add another bset to the btree node, so if there's currently
- * too many bsets - sort some of them together:
- */
-static bool btree_node_compact(struct bch_fs *c, struct btree *b)
-{
- unsigned unwritten_idx;
- bool ret = false;
-
- for (unwritten_idx = 0;
- unwritten_idx < b->nsets;
- unwritten_idx++)
- if (!bset_written(b, bset(b, &b->set[unwritten_idx])))
- break;
-
- if (b->nsets - unwritten_idx > 1) {
- btree_node_sort(c, b, unwritten_idx, b->nsets);
- ret = true;
- }
-
- if (unwritten_idx > 1) {
- btree_node_sort(c, b, 0, unwritten_idx);
- ret = true;
- }
-
- return ret;
-}
-
-void bch2_btree_build_aux_trees(struct btree *b)
-{
- for_each_bset(b, t)
- bch2_bset_build_aux_tree(b, t,
- !bset_written(b, bset(b, t)) &&
- t == bset_tree_last(b));
-}
-
-/*
- * If we have MAX_BSETS (3) bsets, should we sort them all down to just one?
- *
- * The first bset is going to be of similar order to the size of the node, the
- * last bset is bounded by btree_write_set_buffer(), which is set to keep the
- * memmove on insert from being too expensive: the middle bset should, ideally,
- * be the geometric mean of the first and the last.
- *
- * Returns true if the middle bset is greater than that geometric mean:
- */
-static inline bool should_compact_all(struct bch_fs *c, struct btree *b)
-{
- unsigned mid_u64s_bits =
- (ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2;
-
- return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits;
-}
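-
-/*
- * Worked example with illustrative numbers: if btree_max_u64s(c) == 2^13 and
- * BTREE_WRITE_SET_U64s_BITS == 9, then mid_u64s_bits = (13 + 9) / 2 = 11 and
- * we compact once the middle bset exceeds 2048 u64s - the geometric mean of
- * 2^13 and 2^9.
- */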
-
-/*
- * bch2_btree_init_next - initialize a new (unwritten) bset that can then be
- * inserted into
- *
- * Safe to call if there already is an unwritten bset - will only add a new bset
- * if @b doesn't already have one.
- *
- * If a sort was required (invalidating iterators), the trans's iterators are
- * reinitialized via bch2_trans_node_reinit_iter().
- */
-void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
-{
- struct bch_fs *c = trans->c;
- struct btree_node_entry *bne;
- bool reinit_iter = false;
-
- EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]);
- BUG_ON(bset_written(b, bset(b, &b->set[1])));
- BUG_ON(btree_node_just_written(b));
-
- if (b->nsets == MAX_BSETS &&
- !btree_node_write_in_flight(b) &&
- should_compact_all(c, b)) {
- bch2_btree_node_write_trans(trans, b, SIX_LOCK_write,
- BTREE_WRITE_init_next_bset);
- reinit_iter = true;
- }
-
- if (b->nsets == MAX_BSETS &&
- btree_node_compact(c, b))
- reinit_iter = true;
-
- BUG_ON(b->nsets >= MAX_BSETS);
-
- bne = want_new_bset(c, b);
- if (bne)
- bch2_bset_init_next(b, bne);
-
- bch2_btree_build_aux_trees(b);
-
- if (reinit_iter)
- bch2_trans_node_reinit_iter(trans, b);
-}
-
-static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
- struct bch_dev *ca,
- struct btree *b, struct bset *i, struct bkey_packed *k,
- unsigned offset, int write)
-{
- prt_printf(out, bch2_log_msg(c, "%s"),
- write == READ
- ? "error validating btree node "
- : "corrupt btree node before write ");
- if (ca)
- prt_printf(out, "on %s ", ca->name);
- prt_printf(out, "at btree ");
- bch2_btree_pos_to_text(out, c, b);
-
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "\nnode offset %u/%u",
- b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)));
- if (i)
- prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
- if (k)
- prt_printf(out, " bset byte offset %lu",
- (unsigned long)(void *)k -
- ((unsigned long)(void *)i & ~511UL));
- prt_str(out, ": ");
-}
-
-__printf(10, 11)
-static int __btree_err(int ret,
- struct bch_fs *c,
- struct bch_dev *ca,
- struct btree *b,
- struct bset *i,
- struct bkey_packed *k,
- int write,
- bool have_retry,
- enum bch_sb_error_id err_type,
- const char *fmt, ...)
-{
- struct printbuf out = PRINTBUF;
- bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes;
- va_list args;
-
- btree_err_msg(&out, c, ca, b, i, k, b->written, write);
-
- va_start(args, fmt);
- prt_vprintf(&out, fmt, args);
- va_end(args);
-
- if (write == WRITE) {
- bch2_print_string_as_lines(KERN_ERR, out.buf);
- ret = c->opts.errors == BCH_ON_ERROR_continue
- ? 0
- : -BCH_ERR_fsck_errors_not_fixed;
- goto out;
- }
-
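-	/* Without the ability to retry the read, degrade retryable errors: */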
- if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
- ret = -BCH_ERR_btree_node_read_err_fixable;
- if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
- ret = -BCH_ERR_btree_node_read_err_bad_node;
-
- if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable)
- bch2_sb_error_count(c, err_type);
-
- switch (ret) {
- case -BCH_ERR_btree_node_read_err_fixable:
- ret = !silent
- ? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf)
- : -BCH_ERR_fsck_fix;
- if (ret != -BCH_ERR_fsck_fix &&
- ret != -BCH_ERR_fsck_ignore)
- goto fsck_err;
- ret = -BCH_ERR_fsck_fix;
- break;
- case -BCH_ERR_btree_node_read_err_want_retry:
- case -BCH_ERR_btree_node_read_err_must_retry:
- if (!silent)
- bch2_print_string_as_lines(KERN_ERR, out.buf);
- break;
- case -BCH_ERR_btree_node_read_err_bad_node:
- if (!silent)
- bch2_print_string_as_lines(KERN_ERR, out.buf);
- ret = bch2_topology_error(c);
- break;
- case -BCH_ERR_btree_node_read_err_incompatible:
- if (!silent)
- bch2_print_string_as_lines(KERN_ERR, out.buf);
- ret = -BCH_ERR_fsck_errors_not_fixed;
- break;
- default:
- BUG();
- }
-out:
-fsck_err:
- printbuf_exit(&out);
- return ret;
-}
-
-#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \
-({ \
- int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \
- BCH_FSCK_ERR_##_err_type, \
- msg, ##__VA_ARGS__); \
- \
- if (_ret != -BCH_ERR_fsck_fix) { \
- ret = _ret; \
- goto fsck_err; \
- } \
- \
- *saw_error = true; \
-})
-
-#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false)
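-
-/*
- * Note: btree_err()/btree_err_on() expand to code that jumps to a local
- * fsck_err label unless the error was fixed; they also reference write,
- * have_retry, ret and saw_error from the enclosing scope, so callers must
- * declare all of these.
- */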
-
-/*
- * When btree topology repair changes the start or end of a node, that might
- * mean we have to drop keys that are no longer inside the node:
- */
-__cold
-void bch2_btree_node_drop_keys_outside_node(struct btree *b)
-{
- for_each_bset(b, t) {
- struct bset *i = bset(b, t);
- struct bkey_packed *k;
-
- for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
- if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
- break;
-
- if (k != i->start) {
- unsigned shift = (u64 *) k - (u64 *) i->start;
-
- memmove_u64s_down(i->start, k,
- (u64 *) vstruct_end(i) - (u64 *) k);
- i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
- set_btree_bset_end(b, t);
- }
-
- for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
- if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0)
- break;
-
- if (k != vstruct_last(i)) {
- i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
- set_btree_bset_end(b, t);
- }
- }
-
- /*
- * Always rebuild search trees: eytzinger search tree nodes directly
- * depend on the values of min/max key:
- */
- bch2_bset_set_no_aux_tree(b, b->set);
- bch2_btree_build_aux_trees(b);
- b->nr = bch2_btree_node_count_keys(b);
-
- struct bkey_s_c k;
- struct bkey unpacked;
- struct btree_node_iter iter;
- for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
- BUG_ON(bpos_lt(k.k->p, b->data->min_key));
- BUG_ON(bpos_gt(k.k->p, b->data->max_key));
- }
-}
-
-static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
- struct btree *b, struct bset *i,
- unsigned offset, unsigned sectors,
- int write, bool have_retry, bool *saw_error)
-{
- unsigned version = le16_to_cpu(i->version);
- unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
- int ret = 0;
-
- btree_err_on(!bch2_version_compatible(version),
- -BCH_ERR_btree_node_read_err_incompatible,
- c, ca, b, i, NULL,
- btree_node_unsupported_version,
- "unsupported bset version %u.%u",
- BCH_VERSION_MAJOR(version),
- BCH_VERSION_MINOR(version));
-
- if (btree_err_on(version < c->sb.version_min,
- -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i, NULL,
- btree_node_bset_older_than_sb_min,
- "bset version %u older than superblock version_min %u",
- version, c->sb.version_min)) {
- mutex_lock(&c->sb_lock);
- c->disk_sb.sb->version_min = cpu_to_le16(version);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
-
- if (btree_err_on(BCH_VERSION_MAJOR(version) >
- BCH_VERSION_MAJOR(c->sb.version),
- -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i, NULL,
- btree_node_bset_newer_than_sb,
- "bset version %u newer than superblock version %u",
- version, c->sb.version)) {
- mutex_lock(&c->sb_lock);
- c->disk_sb.sb->version = cpu_to_le16(version);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
-
- btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
- -BCH_ERR_btree_node_read_err_incompatible,
- c, ca, b, i, NULL,
- btree_node_unsupported_version,
- "BSET_SEPARATE_WHITEOUTS no longer supported");
-
- if (!write &&
- btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)),
- -BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i, NULL,
- bset_past_end_of_btree_node,
- "bset past end of btree node (offset %u len %u but written %zu)",
- offset, sectors, ptr_written ?: btree_sectors(c)))
- i->u64s = 0;
-
- btree_err_on(offset && !i->u64s,
- -BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i, NULL,
- bset_empty,
- "empty bset");
-
- btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i, NULL,
- bset_wrong_sector_offset,
- "bset at wrong sector offset");
-
- if (!offset) {
- struct btree_node *bn =
- container_of(i, struct btree_node, keys);
- /* These indicate that we read the wrong btree node: */
-
- if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
- struct bch_btree_ptr_v2 *bp =
- &bkey_i_to_btree_ptr_v2(&b->key)->v;
-
- /* XXX endianness */
- btree_err_on(bp->seq != bn->keys.seq,
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL, NULL,
- bset_bad_seq,
- "incorrect sequence number (wrong btree node)");
- }
-
- btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, i, NULL,
- btree_node_bad_btree,
- "incorrect btree id");
-
- btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, i, NULL,
- btree_node_bad_level,
- "incorrect level");
-
- if (!write)
- compat_btree_node(b->c.level, b->c.btree_id, version,
- BSET_BIG_ENDIAN(i), write, bn);
-
- if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
- struct bch_btree_ptr_v2 *bp =
- &bkey_i_to_btree_ptr_v2(&b->key)->v;
-
- if (BTREE_PTR_RANGE_UPDATED(bp)) {
- b->data->min_key = bp->min_key;
- b->data->max_key = b->key.k.p;
- }
-
- btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL, NULL,
- btree_node_bad_min_key,
- "incorrect min_key: got %s should be %s",
- (printbuf_reset(&buf1),
- bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
- (printbuf_reset(&buf2),
- bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf));
- }
-
- btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, i, NULL,
- btree_node_bad_max_key,
- "incorrect max key %s",
- (printbuf_reset(&buf1),
- bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));
-
- if (write)
- compat_btree_node(b->c.level, b->c.btree_id, version,
- BSET_BIG_ENDIAN(i), write, bn);
-
- btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
- -BCH_ERR_btree_node_read_err_bad_node,
- c, ca, b, i, NULL,
- btree_node_bad_format,
- "invalid bkey format: %s\n %s", buf1.buf,
- (printbuf_reset(&buf2),
- bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
- printbuf_reset(&buf1);
-
- compat_bformat(b->c.level, b->c.btree_id, version,
- BSET_BIG_ENDIAN(i), write,
- &bn->format);
- }
-fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- return ret;
-}
-
-static int btree_node_bkey_val_validate(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k,
- enum bch_validate_flags flags)
-{
- return bch2_bkey_val_validate(c, k, (struct bkey_validate_context) {
- .from = BKEY_VALIDATE_btree_node,
- .level = b->c.level,
- .btree = b->c.btree_id,
- .flags = flags
- });
-}
-
-static int bset_key_validate(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k,
- bool updated_range,
- enum bch_validate_flags flags)
-{
- struct bkey_validate_context from = (struct bkey_validate_context) {
- .from = BKEY_VALIDATE_btree_node,
- .level = b->c.level,
- .btree = b->c.btree_id,
- .flags = flags,
- };
- return __bch2_bkey_validate(c, k, from) ?:
- (!updated_range ? bch2_bkey_in_btree_node(c, b, k, from) : 0) ?:
- (flags & BCH_VALIDATE_write ? btree_node_bkey_val_validate(c, b, k, flags) : 0);
-}
-
-static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
- struct bset *i, struct bkey_packed *k)
-{
- if (bkey_p_next(k) > vstruct_last(i))
- return false;
-
- if (k->format > KEY_FORMAT_CURRENT)
- return false;
-
- if (!bkeyp_u64s_valid(&b->format, k))
- return false;
-
- struct bkey tmp;
- struct bkey_s u = __bkey_disassemble(b, k, &tmp);
- return !__bch2_bkey_validate(c, u.s_c,
- (struct bkey_validate_context) {
- .from = BKEY_VALIDATE_btree_node,
- .level = b->c.level,
- .btree = b->c.btree_id,
- .flags = BCH_VALIDATE_silent
- });
-}
-
-static inline int btree_node_read_bkey_cmp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r)
-{
- return bch2_bkey_cmp_packed(b, l, r)
- ?: (int) bkey_deleted(r) - (int) bkey_deleted(l);
-}
-
-static int validate_bset_keys(struct bch_fs *c, struct btree *b,
- struct bset *i, int write,
- bool have_retry, bool *saw_error)
-{
- unsigned version = le16_to_cpu(i->version);
- struct bkey_packed *k, *prev = NULL;
- struct printbuf buf = PRINTBUF;
- bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
- BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
- int ret = 0;
-
- for (k = i->start;
- k != vstruct_last(i);) {
- struct bkey_s u;
- struct bkey tmp;
- unsigned next_good_key;
-
- if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
- -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i, k,
- btree_node_bkey_past_bset_end,
- "key extends past end of bset")) {
- i->u64s = cpu_to_le16((u64 *) k - i->_data);
- break;
- }
-
- if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
- -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i, k,
- btree_node_bkey_bad_format,
- "invalid bkey format %u", k->format))
- goto drop_this_key;
-
- if (btree_err_on(!bkeyp_u64s_valid(&b->format, k),
- -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i, k,
- btree_node_bkey_bad_u64s,
- "bad k->u64s %u (min %u max %zu)", k->u64s,
- bkeyp_key_u64s(&b->format, k),
- U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k)))
- goto drop_this_key;
-
- if (!write)
- bch2_bkey_compat(b->c.level, b->c.btree_id, version,
- BSET_BIG_ENDIAN(i), write,
- &b->format, k);
-
- u = __bkey_disassemble(b, k, &tmp);
-
- ret = bset_key_validate(c, b, u.s_c, updated_range, write);
- if (ret == -BCH_ERR_fsck_delete_bkey)
- goto drop_this_key;
- if (ret)
- goto fsck_err;
-
- if (write)
- bch2_bkey_compat(b->c.level, b->c.btree_id, version,
- BSET_BIG_ENDIAN(i), write,
- &b->format, k);
-
- if (prev && btree_node_read_bkey_cmp(b, prev, k) >= 0) {
- struct bkey up = bkey_unpack_key(b, prev);
-
- printbuf_reset(&buf);
- prt_printf(&buf, "keys out of order: ");
- bch2_bkey_to_text(&buf, &up);
- prt_printf(&buf, " > ");
- bch2_bkey_to_text(&buf, u.k);
-
- if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i, k,
- btree_node_bkey_out_of_order,
- "%s", buf.buf))
- goto drop_this_key;
- }
-
- prev = k;
- k = bkey_p_next(k);
- continue;
-drop_this_key:
- next_good_key = k->u64s;
-
- if (!next_good_key ||
- (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN &&
- version >= bcachefs_metadata_version_snapshot)) {
- /*
- * only do scanning if bch2_bkey_compat() has nothing to
- * do
- */
-
- if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
- for (next_good_key = 1;
- next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
- next_good_key++)
- if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
- goto got_good_key;
- }
-
- /*
- * didn't find a good key, have to truncate the rest of
- * the bset
- */
- next_good_key = (u64 *) vstruct_last(i) - (u64 *) k;
- }
-got_good_key:
- le16_add_cpu(&i->u64s, -next_good_key);
- memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k);
- set_btree_node_need_rewrite(b);
- }
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
- struct btree *b, bool have_retry, bool *saw_error)
-{
- struct btree_node_entry *bne;
- struct sort_iter *iter;
- struct btree_node *sorted;
- struct bkey_packed *k;
- struct bset *i;
- bool used_mempool, blacklisted;
- bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
- BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
- unsigned u64s;
- unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
- u64 max_journal_seq = 0;
- struct printbuf buf = PRINTBUF;
- int ret = 0, retry_read = 0, write = READ;
- u64 start_time = local_clock();
-
- b->version_ondisk = U16_MAX;
- /* We might get called multiple times on read retry: */
- b->written = 0;
-
- iter = mempool_alloc(&c->fill_iter, GFP_NOFS);
- sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);
-
- if (bch2_meta_read_fault("btree"))
- btree_err(-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL, NULL,
- btree_node_fault_injected,
- "dynamic fault");
-
- btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL, NULL,
- btree_node_bad_magic,
- "bad magic: want %llx, got %llx",
- bset_magic(c), le64_to_cpu(b->data->magic));
-
- if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
- struct bch_btree_ptr_v2 *bp =
- &bkey_i_to_btree_ptr_v2(&b->key)->v;
-
- bch2_bpos_to_text(&buf, b->data->min_key);
- prt_str(&buf, "-");
- bch2_bpos_to_text(&buf, b->data->max_key);
-
- btree_err_on(b->data->keys.seq != bp->seq,
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL, NULL,
- btree_node_bad_seq,
- "got wrong btree node: got\n%s",
- (printbuf_reset(&buf),
- bch2_btree_node_header_to_text(&buf, b->data),
- buf.buf));
- } else {
- btree_err_on(!b->data->keys.seq,
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL, NULL,
- btree_node_bad_seq,
- "bad btree header: seq 0\n%s",
- (printbuf_reset(&buf),
- bch2_btree_node_header_to_text(&buf, b->data),
- buf.buf));
- }
-
- while (b->written < (ptr_written ?: btree_sectors(c))) {
- unsigned sectors;
- bool first = !b->written;
-
- if (first) {
- bne = NULL;
- i = &b->data->keys;
- } else {
- bne = write_block(b);
- i = &bne->keys;
-
- if (i->seq != b->data->keys.seq)
- break;
- }
-
- struct nonce nonce = btree_nonce(i, b->written << 9);
- bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));
-
- btree_err_on(!good_csum_type,
- bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))
- ? -BCH_ERR_btree_node_read_err_must_retry
- : -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i, NULL,
- bset_unknown_csum,
- "unknown checksum type %llu", BSET_CSUM_TYPE(i));
-
- if (first) {
- if (good_csum_type) {
- struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
- bool csum_bad = bch2_crc_cmp(b->data->csum, csum);
- if (csum_bad)
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
-
- btree_err_on(csum_bad,
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i, NULL,
- bset_bad_csum,
- "%s",
- (printbuf_reset(&buf),
- bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
- buf.buf));
-
- ret = bset_encrypt(c, i, b->written << 9);
- if (bch2_fs_fatal_err_on(ret, c,
- "decrypting btree node: %s", bch2_err_str(ret)))
- goto fsck_err;
- }
-
- btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
- !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
- -BCH_ERR_btree_node_read_err_incompatible,
- c, NULL, b, NULL, NULL,
- btree_node_unsupported_version,
- "btree node does not have NEW_EXTENT_OVERWRITE set");
-
- sectors = vstruct_sectors(b->data, c->block_bits);
- } else {
- if (good_csum_type) {
- struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
- bool csum_bad = bch2_crc_cmp(bne->csum, csum);
- if (ca && csum_bad)
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
-
- btree_err_on(csum_bad,
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i, NULL,
- bset_bad_csum,
- "%s",
- (printbuf_reset(&buf),
- bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
- buf.buf));
-
- ret = bset_encrypt(c, i, b->written << 9);
- if (bch2_fs_fatal_err_on(ret, c,
- "decrypting btree node: %s", bch2_err_str(ret)))
- goto fsck_err;
- }
-
- sectors = vstruct_sectors(bne, c->block_bits);
- }
-
- b->version_ondisk = min(b->version_ondisk,
- le16_to_cpu(i->version));
-
- ret = validate_bset(c, ca, b, i, b->written, sectors,
- READ, have_retry, saw_error);
- if (ret)
- goto fsck_err;
-
- if (!b->written)
- btree_node_set_format(b, b->data->format);
-
- ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error);
- if (ret)
- goto fsck_err;
-
- SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-
- blacklisted = bch2_journal_seq_is_blacklisted(c,
- le64_to_cpu(i->journal_seq),
- true);
-
- btree_err_on(blacklisted && first,
- -BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i, NULL,
- bset_blacklisted_journal_seq,
- "first btree node bset has blacklisted journal seq (%llu)",
- le64_to_cpu(i->journal_seq));
-
- btree_err_on(blacklisted && ptr_written,
- -BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i, NULL,
- first_bset_blacklisted_journal_seq,
- "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
- le64_to_cpu(i->journal_seq),
- b->written, b->written + sectors, ptr_written);
-
- b->written = min(b->written + sectors, btree_sectors(c));
-
- if (blacklisted && !first)
- continue;
-
- sort_iter_add(iter,
- vstruct_idx(i, 0),
- vstruct_last(i));
-
- max_journal_seq = max(max_journal_seq, le64_to_cpu(i->journal_seq));
- }
-
- if (ptr_written) {
- btree_err_on(b->written < ptr_written,
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, NULL, NULL,
- btree_node_data_missing,
- "btree node data missing: expected %u sectors, found %u",
- ptr_written, b->written);
- } else {
- for (bne = write_block(b);
- bset_byte_offset(b, bne) < btree_buf_bytes(b);
- bne = (void *) bne + block_bytes(c))
- btree_err_on(bne->keys.seq == b->data->keys.seq &&
- !bch2_journal_seq_is_blacklisted(c,
- le64_to_cpu(bne->keys.journal_seq),
- true),
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, NULL, NULL,
- btree_node_bset_after_end,
- "found bset signature after last bset");
- }
-
- sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
- sorted->keys.u64s = 0;
-
- set_btree_bset(b, b->set, &b->data->keys);
-
- b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
- memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0,
- btree_buf_bytes(b) -
- sizeof(struct btree_node) -
- b->nr.live_u64s * sizeof(u64));
-
- u64s = le16_to_cpu(sorted->keys.u64s);
- *sorted = *b->data;
- sorted->keys.u64s = cpu_to_le16(u64s);
- swap(sorted, b->data);
- set_btree_bset(b, b->set, &b->data->keys);
- b->nsets = 1;
- b->data->keys.journal_seq = cpu_to_le64(max_journal_seq);
-
- BUG_ON(b->nr.live_u64s != u64s);
-
- btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
-
- if (updated_range)
- bch2_btree_node_drop_keys_outside_node(b);
-
- i = &b->data->keys;
- for (k = i->start; k != vstruct_last(i);) {
- struct bkey tmp;
- struct bkey_s u = __bkey_disassemble(b, k, &tmp);
-
- ret = btree_node_bkey_val_validate(c, b, u.s_c, READ);
- if (ret == -BCH_ERR_fsck_delete_bkey ||
- (bch2_inject_invalid_keys &&
- !bversion_cmp(u.k->bversion, MAX_VERSION))) {
- btree_keys_account_key_drop(&b->nr, 0, k);
-
- i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_p_next(k),
- (u64 *) vstruct_end(i) - (u64 *) k);
- set_btree_bset_end(b, b->set);
- set_btree_node_need_rewrite(b);
- continue;
- }
- if (ret)
- goto fsck_err;
-
- if (u.k->type == KEY_TYPE_btree_ptr_v2) {
- struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);
-
- bp.v->mem_ptr = 0;
- }
-
- k = bkey_p_next(k);
- }
-
- bch2_bset_build_aux_tree(b, b->set, false);
-
- set_needs_whiteout(btree_bset_first(b), true);
-
- btree_node_reset_sib_u64s(b);
-
- rcu_read_lock();
- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
- struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
-
- if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw)
- set_btree_node_need_rewrite(b);
- }
- rcu_read_unlock();
-
- if (!ptr_written)
- set_btree_node_need_rewrite(b);
-out:
- mempool_free(iter, &c->fill_iter);
- printbuf_exit(&buf);
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
- return retry_read;
-fsck_err:
- if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
- ret == -BCH_ERR_btree_node_read_err_must_retry) {
- retry_read = 1;
- } else {
- set_btree_node_read_error(b);
- bch2_btree_lost_data(c, b->c.btree_id);
- }
- goto out;
-}
-
-static void btree_node_read_work(struct work_struct *work)
-{
- struct btree_read_bio *rb =
- container_of(work, struct btree_read_bio, work);
- struct bch_fs *c = rb->c;
- struct bch_dev *ca = rb->have_ioref ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
- struct btree *b = rb->b;
- struct bio *bio = &rb->bio;
- struct bch_io_failures failed = { .nr = 0 };
- struct printbuf buf = PRINTBUF;
- bool saw_error = false;
- bool retry = false;
- bool can_retry;
-
- goto start;
- while (1) {
- retry = true;
- bch_info(c, "retrying read");
- ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ);
- rb->have_ioref = ca != NULL;
- rb->start_time = local_clock();
- bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
- bio->bi_iter.bi_sector = rb->pick.ptr.offset;
- bio->bi_iter.bi_size = btree_buf_bytes(b);
-
- if (rb->have_ioref) {
- bio_set_dev(bio, ca->disk_sb.bdev);
- submit_bio_wait(bio);
- } else {
- bio->bi_status = BLK_STS_REMOVED;
- }
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
- rb->start_time, !bio->bi_status);
-start:
- printbuf_reset(&buf);
- bch2_btree_pos_to_text(&buf, c, b);
-
- if (ca && bio->bi_status)
- bch_err_dev_ratelimited(ca,
- "btree read error %s for %s",
- bch2_blk_status_to_str(bio->bi_status), buf.buf);
- if (rb->have_ioref)
- percpu_ref_put(&ca->io_ref);
- rb->have_ioref = false;
-
- bch2_mark_io_failure(&failed, &rb->pick, false);
-
- can_retry = bch2_bkey_pick_read_device(c,
- bkey_i_to_s_c(&b->key),
- &failed, &rb->pick, -1) > 0;
-
- if (!bio->bi_status &&
- !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {
- if (retry)
- bch_info(c, "retry success");
- break;
- }
-
- saw_error = true;
-
- if (!can_retry) {
- set_btree_node_read_error(b);
- bch2_btree_lost_data(c, b->c.btree_id);
- break;
- }
- }
-
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
- rb->start_time);
- bio_put(&rb->bio);
-
- if ((saw_error ||
- btree_node_need_rewrite(b)) &&
- !btree_node_read_error(b) &&
- c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
- if (saw_error) {
- printbuf_reset(&buf);
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_str(&buf, " ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-			bch_err_ratelimited(c, "%s: rewriting btree node due to error\n  %s",
- __func__, buf.buf);
- }
-
- bch2_btree_node_rewrite_async(c, b);
- }
-
- printbuf_exit(&buf);
- clear_btree_node_read_in_flight(b);
- wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
-}
-
-static void btree_node_read_endio(struct bio *bio)
-{
- struct btree_read_bio *rb =
- container_of(bio, struct btree_read_bio, bio);
- struct bch_fs *c = rb->c;
- struct bch_dev *ca = rb->have_ioref
- ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
- rb->start_time, !bio->bi_status);
-
- queue_work(c->btree_read_complete_wq, &rb->work);
-}
-
-struct btree_node_read_all {
- struct closure cl;
- struct bch_fs *c;
- struct btree *b;
- unsigned nr;
- void *buf[BCH_REPLICAS_MAX];
- struct bio *bio[BCH_REPLICAS_MAX];
- blk_status_t err[BCH_REPLICAS_MAX];
-};
-
-static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
-{
- struct btree_node *bn = data;
- struct btree_node_entry *bne;
- unsigned offset = 0;
-
- if (le64_to_cpu(bn->magic) != bset_magic(c))
- return 0;
-
- while (offset < btree_sectors(c)) {
- if (!offset) {
- offset += vstruct_sectors(bn, c->block_bits);
- } else {
- bne = data + (offset << 9);
- if (bne->keys.seq != bn->keys.seq)
- break;
- offset += vstruct_sectors(bne, c->block_bits);
- }
- }
-
- return offset;
-}
-
-static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
-{
- struct btree_node *bn = data;
- struct btree_node_entry *bne;
-
- if (!offset)
- return false;
-
- while (offset < btree_sectors(c)) {
- bne = data + (offset << 9);
- if (bne->keys.seq == bn->keys.seq)
- return true;
- offset++;
- }
-
- return false;
-}
-
-static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
-{
- closure_type(ra, struct btree_node_read_all, cl);
- struct bch_fs *c = ra->c;
- struct btree *b = ra->b;
- struct printbuf buf = PRINTBUF;
- bool dump_bset_maps = false;
- bool have_retry = false;
- int ret = 0, best = -1, write = READ;
- unsigned i, written = 0, written2 = 0;
- __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
- ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
- bool _saw_error = false, *saw_error = &_saw_error;
-
- for (i = 0; i < ra->nr; i++) {
- struct btree_node *bn = ra->buf[i];
-
- if (ra->err[i])
- continue;
-
- if (le64_to_cpu(bn->magic) != bset_magic(c) ||
- (seq && seq != bn->keys.seq))
- continue;
-
- if (best < 0) {
- best = i;
- written = btree_node_sectors_written(c, bn);
- continue;
- }
-
- written2 = btree_node_sectors_written(c, ra->buf[i]);
- if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, NULL, NULL,
- btree_node_replicas_sectors_written_mismatch,
- "btree node sectors written mismatch: %u != %u",
- written, written2) ||
- btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
- -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, NULL, NULL,
- btree_node_bset_after_end,
- "found bset signature after last bset") ||
- btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
- -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, NULL, NULL,
- btree_node_replicas_data_mismatch,
- "btree node replicas content mismatch"))
- dump_bset_maps = true;
-
- if (written2 > written) {
- written = written2;
- best = i;
- }
- }
-fsck_err:
- if (dump_bset_maps) {
- for (i = 0; i < ra->nr; i++) {
- struct btree_node *bn = ra->buf[i];
- struct btree_node_entry *bne = NULL;
- unsigned offset = 0, sectors;
- bool gap = false;
-
- if (ra->err[i])
- continue;
-
- printbuf_reset(&buf);
-
- while (offset < btree_sectors(c)) {
- if (!offset) {
- sectors = vstruct_sectors(bn, c->block_bits);
- } else {
- bne = ra->buf[i] + (offset << 9);
- if (bne->keys.seq != bn->keys.seq)
- break;
- sectors = vstruct_sectors(bne, c->block_bits);
- }
-
- prt_printf(&buf, " %u-%u", offset, offset + sectors);
- if (bne && bch2_journal_seq_is_blacklisted(c,
- le64_to_cpu(bne->keys.journal_seq), false))
- prt_printf(&buf, "*");
- offset += sectors;
- }
-
- while (offset < btree_sectors(c)) {
- bne = ra->buf[i] + (offset << 9);
- if (bne->keys.seq == bn->keys.seq) {
- if (!gap)
- prt_printf(&buf, " GAP");
- gap = true;
-
- sectors = vstruct_sectors(bne, c->block_bits);
- prt_printf(&buf, " %u-%u", offset, offset + sectors);
- if (bch2_journal_seq_is_blacklisted(c,
- le64_to_cpu(bne->keys.journal_seq), false))
- prt_printf(&buf, "*");
- }
- offset++;
- }
-
- bch_err(c, "replica %u:%s", i, buf.buf);
- }
- }
-
- if (best >= 0) {
- memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
- ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
- } else {
- ret = -1;
- }
-
- if (ret) {
- set_btree_node_read_error(b);
- bch2_btree_lost_data(c, b->c.btree_id);
- } else if (*saw_error)
- bch2_btree_node_rewrite_async(c, b);
-
- for (i = 0; i < ra->nr; i++) {
- mempool_free(ra->buf[i], &c->btree_bounce_pool);
- bio_put(ra->bio[i]);
- }
-
- closure_debug_destroy(&ra->cl);
- kfree(ra);
- printbuf_exit(&buf);
-
- clear_btree_node_read_in_flight(b);
- wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
-}
-
-static void btree_node_read_all_replicas_endio(struct bio *bio)
-{
- struct btree_read_bio *rb =
- container_of(bio, struct btree_read_bio, bio);
- struct bch_fs *c = rb->c;
- struct btree_node_read_all *ra = rb->ra;
-
- if (rb->have_ioref) {
- struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
-
- bch2_latency_acct(ca, rb->start_time, READ);
- }
-
- ra->err[rb->idx] = bio->bi_status;
- closure_put(&ra->cl);
-}
-
-/*
- * XXX This allocates multiple times from the same mempools, and can deadlock
- * under sufficient memory pressure (but is only a debug path)
- */
-static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync)
-{
- struct bkey_s_c k = bkey_i_to_s_c(&b->key);
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded pick;
- struct btree_node_read_all *ra;
- unsigned i;
-
- ra = kzalloc(sizeof(*ra), GFP_NOFS);
- if (!ra)
- return -BCH_ERR_ENOMEM_btree_node_read_all_replicas;
-
- closure_init(&ra->cl, NULL);
- ra->c = c;
- ra->b = b;
- ra->nr = bch2_bkey_nr_ptrs(k);
-
- for (i = 0; i < ra->nr; i++) {
- ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
- ra->bio[i] = bio_alloc_bioset(NULL,
- buf_pages(ra->buf[i], btree_buf_bytes(b)),
- REQ_OP_READ|REQ_SYNC|REQ_META,
- GFP_NOFS,
- &c->btree_bio);
- }
-
- i = 0;
- bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
- struct btree_read_bio *rb =
- container_of(ra->bio[i], struct btree_read_bio, bio);
- rb->c = c;
- rb->b = b;
- rb->ra = ra;
- rb->start_time = local_clock();
- rb->have_ioref = ca != NULL;
- rb->idx = i;
- rb->pick = pick;
- rb->bio.bi_iter.bi_sector = pick.ptr.offset;
- rb->bio.bi_end_io = btree_node_read_all_replicas_endio;
- bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b));
-
- if (rb->have_ioref) {
- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
- bio_sectors(&rb->bio));
- bio_set_dev(&rb->bio, ca->disk_sb.bdev);
-
- closure_get(&ra->cl);
- submit_bio(&rb->bio);
- } else {
- ra->err[i] = BLK_STS_REMOVED;
- }
-
- i++;
- }
-
- if (sync) {
- closure_sync(&ra->cl);
- btree_node_read_all_replicas_done(&ra->cl.work);
- } else {
- continue_at(&ra->cl, btree_node_read_all_replicas_done,
- c->btree_read_complete_wq);
- }
-
- return 0;
-}
-
-void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
- bool sync)
-{
- struct bch_fs *c = trans->c;
- struct extent_ptr_decoded pick;
- struct btree_read_bio *rb;
- struct bch_dev *ca;
- struct bio *bio;
- int ret;
-
- trace_and_count(c, btree_node_read, trans, b);
-
- if (bch2_verify_all_btree_replicas &&
- !btree_node_read_all_replicas(c, b, sync))
- return;
-
- ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
- NULL, &pick, -1);
-
- if (ret <= 0) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "btree node read error: no device to read from\n at ");
- bch2_btree_pos_to_text(&buf, c, b);
- bch_err_ratelimited(c, "%s", buf.buf);
-
- if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
- bch2_fatal_error(c);
-
- set_btree_node_read_error(b);
- bch2_btree_lost_data(c, b->c.btree_id);
- clear_btree_node_read_in_flight(b);
- wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
- printbuf_exit(&buf);
- return;
- }
-
- ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
-
- bio = bio_alloc_bioset(NULL,
- buf_pages(b->data, btree_buf_bytes(b)),
- REQ_OP_READ|REQ_SYNC|REQ_META,
- GFP_NOFS,
- &c->btree_bio);
- rb = container_of(bio, struct btree_read_bio, bio);
- rb->c = c;
- rb->b = b;
- rb->ra = NULL;
- rb->start_time = local_clock();
- rb->have_ioref = ca != NULL;
- rb->pick = pick;
- INIT_WORK(&rb->work, btree_node_read_work);
- bio->bi_iter.bi_sector = pick.ptr.offset;
- bio->bi_end_io = btree_node_read_endio;
- bch2_bio_map(bio, b->data, btree_buf_bytes(b));
-
- if (rb->have_ioref) {
- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
- bio_sectors(bio));
- bio_set_dev(bio, ca->disk_sb.bdev);
-
- if (sync) {
- submit_bio_wait(bio);
- bch2_latency_acct(ca, rb->start_time, READ);
- btree_node_read_work(&rb->work);
- } else {
- submit_bio(bio);
- }
- } else {
- bio->bi_status = BLK_STS_REMOVED;
-
- if (sync)
- btree_node_read_work(&rb->work);
- else
- queue_work(c->btree_read_complete_wq, &rb->work);
- }
-}
-
-static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
- const struct bkey_i *k, unsigned level)
-{
- struct bch_fs *c = trans->c;
- struct closure cl;
- struct btree *b;
- int ret;
-
- closure_init_stack(&cl);
-
- do {
- ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
- closure_sync(&cl);
- } while (ret);
-
- b = bch2_btree_node_mem_alloc(trans, level != 0);
- bch2_btree_cache_cannibalize_unlock(trans);
-
- BUG_ON(IS_ERR(b));
-
- bkey_copy(&b->key, k);
- BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
-
- set_btree_node_read_in_flight(b);
-
- /* we can't pass the trans to read_done() for fsck errors, so it must be unlocked */
- bch2_trans_unlock(trans);
- bch2_btree_node_read(trans, b, true);
-
- if (btree_node_read_error(b)) {
- mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
- mutex_unlock(&c->btree_cache.lock);
-
- ret = -BCH_ERR_btree_node_read_error;
- goto err;
- }
-
- bch2_btree_set_root_for_read(c, b);
-err:
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
-
- return ret;
-}
-
-int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
- const struct bkey_i *k, unsigned level)
-{
- return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
-}
-
-struct btree_node_scrub {
- struct bch_fs *c;
- struct bch_dev *ca;
- void *buf;
- bool used_mempool;
- unsigned written;
-
- enum btree_id btree;
- unsigned level;
- struct bkey_buf key;
- __le64 seq;
-
- struct work_struct work;
- struct bio bio;
-};
-
-static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written,
- struct printbuf *err)
-{
- unsigned written = 0;
-
- if (le64_to_cpu(data->magic) != bset_magic(c)) {
- prt_printf(err, "bad magic: want %llx, got %llx",
- bset_magic(c), le64_to_cpu(data->magic));
- return false;
- }
-
- while (written < (ptr_written ?: btree_sectors(c))) {
- struct btree_node_entry *bne;
- struct bset *i;
- bool first = !written;
-
- if (first) {
- bne = NULL;
- i = &data->keys;
- } else {
- bne = (void *) data + (written << 9);
- i = &bne->keys;
-
- if (!ptr_written && i->seq != data->keys.seq)
- break;
- }
-
- struct nonce nonce = btree_nonce(i, written << 9);
- bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));
-
- if (first) {
- if (good_csum_type) {
- struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data);
- if (bch2_crc_cmp(data->csum, csum)) {
- bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum);
- return false;
- }
- }
-
- written += vstruct_sectors(data, c->block_bits);
- } else {
- if (good_csum_type) {
- struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
- if (bch2_crc_cmp(bne->csum, csum)) {
- bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum);
- return false;
- }
- }
-
- written += vstruct_sectors(bne, c->block_bits);
- }
- }
-
- return true;
-}
-
-static void btree_node_scrub_work(struct work_struct *work)
-{
- struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work);
- struct bch_fs *c = scrub->c;
- struct printbuf err = PRINTBUF;
-
- __bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level,
- bkey_i_to_s_c(scrub->key.k));
- prt_newline(&err);
-
- if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) {
- struct btree_trans *trans = bch2_trans_get(c);
-
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, scrub->btree,
- scrub->key.k->k.p, 0, scrub->level - 1, 0);
-
- struct btree *b;
- int ret = lockrestart_do(trans, PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(&iter)));
- if (ret)
- goto err;
-
- if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) {
- bch_err(c, "error validating btree node during scrub on %s at btree %s",
- scrub->ca->name, err.buf);
-
- ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_begin(trans);
- bch2_trans_put(trans);
- }
-
- printbuf_exit(&err);
- bch2_bkey_buf_exit(&scrub->key, c);
- btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf);
- percpu_ref_put(&scrub->ca->io_ref);
- kfree(scrub);
- bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub);
-}
-
-static void btree_node_scrub_endio(struct bio *bio)
-{
- struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio);
-
- queue_work(scrub->c->btree_read_complete_wq, &scrub->work);
-}
-
-int bch2_btree_node_scrub(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c k, unsigned dev)
-{
- if (k.k->type != KEY_TYPE_btree_ptr_v2)
- return 0;
-
- struct bch_fs *c = trans->c;
-
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_node_scrub))
- return -BCH_ERR_erofs_no_writes;
-
- struct extent_ptr_decoded pick;
- int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev);
- if (ret <= 0)
- goto err;
-
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
- if (!ca) {
- ret = -BCH_ERR_device_offline;
- goto err;
- }
-
- bool used_mempool = false;
- void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool);
-
- unsigned vecs = buf_pages(buf, c->opts.btree_node_size);
-
- struct btree_node_scrub *scrub =
- kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL);
- if (!scrub) {
- ret = -ENOMEM;
- goto err_free;
- }
-
- scrub->c = c;
- scrub->ca = ca;
- scrub->buf = buf;
- scrub->used_mempool = used_mempool;
- scrub->written = btree_ptr_sectors_written(k);
-
- scrub->btree = btree;
- scrub->level = level;
- bch2_bkey_buf_init(&scrub->key);
- bch2_bkey_buf_reassemble(&scrub->key, c, k);
- scrub->seq = bkey_s_c_to_btree_ptr_v2(k).v->seq;
-
- INIT_WORK(&scrub->work, btree_node_scrub_work);
-
- bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ);
- bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size);
- scrub->bio.bi_iter.bi_sector = pick.ptr.offset;
- scrub->bio.bi_end_io = btree_node_scrub_endio;
- submit_bio(&scrub->bio);
- return 0;
-err_free:
- btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf);
- percpu_ref_put(&ca->io_ref);
-err:
- bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub);
- return ret;
-}
-
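- /* Write done: clear the will_make_reachable flag, put the interior update's closure ref, and drop the journal pin: */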
-static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
- struct btree_write *w)
-{
- unsigned long old, new;
-
- old = READ_ONCE(b->will_make_reachable);
- do {
- new = old;
- if (!(old & 1))
- break;
-
- new &= ~1UL;
- } while (!try_cmpxchg(&b->will_make_reachable, &old, new));
-
- if (old & 1)
- closure_put(&((struct btree_update *) new)->cl);
-
- bch2_journal_pin_drop(&c->journal, &w->journal);
-}
-
-static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time)
-{
- struct btree_write *w = btree_prev_write(b);
- unsigned long old, new;
- unsigned type = 0;
-
- bch2_btree_complete_write(c, b, w);
-
- if (start_time)
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time);
-
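- /* If the node was redirtied and a write is still needed, start the next write immediately; otherwise clear write_in_flight and wake waiters: */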
- old = READ_ONCE(b->flags);
- do {
- new = old;
-
- if ((old & (1U << BTREE_NODE_dirty)) &&
- (old & (1U << BTREE_NODE_need_write)) &&
- !(old & (1U << BTREE_NODE_never_write)) &&
- !(old & (1U << BTREE_NODE_write_blocked)) &&
- !(old & (1U << BTREE_NODE_will_make_reachable))) {
- new &= ~(1U << BTREE_NODE_dirty);
- new &= ~(1U << BTREE_NODE_need_write);
- new |= (1U << BTREE_NODE_write_in_flight);
- new |= (1U << BTREE_NODE_write_in_flight_inner);
- new |= (1U << BTREE_NODE_just_written);
- new ^= (1U << BTREE_NODE_write_idx);
-
- type = new & BTREE_WRITE_TYPE_MASK;
- new &= ~BTREE_WRITE_TYPE_MASK;
- } else {
- new &= ~(1U << BTREE_NODE_write_in_flight);
- new &= ~(1U << BTREE_NODE_write_in_flight_inner);
- }
- } while (!try_cmpxchg(&b->flags, &old, new));
-
- if (new & (1U << BTREE_NODE_write_in_flight))
- __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
- else
- wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
-}
-
-static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time)
-{
- struct btree_trans *trans = bch2_trans_get(c);
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
-
- /* we don't need transaction context anymore after we got the lock. */
- bch2_trans_put(trans);
- __btree_node_write_done(c, b, start_time);
- six_unlock_read(&b->c.lock);
-}
-
-static void btree_node_write_work(struct work_struct *work)
-{
- struct btree_write_bio *wbio =
- container_of(work, struct btree_write_bio, work);
- struct bch_fs *c = wbio->wbio.c;
- struct btree *b = wbio->wbio.bio.bi_private;
- u64 start_time = wbio->start_time;
- int ret = 0;
-
- btree_bounce_free(c,
- wbio->data_bytes,
- wbio->wbio.used_mempool,
- wbio->data);
-
- bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
- bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
-
- if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
- ret = -BCH_ERR_btree_node_write_all_failed;
- goto err;
- }
-
- if (wbio->wbio.first_btree_write) {
- if (wbio->wbio.failed.nr) {
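- /* XXX: a first write that failed on some replicas is currently unhandled */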
-
- }
- } else {
- ret = bch2_trans_do(c,
- bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
- BCH_WATERMARK_interior_updates|
- BCH_TRANS_COMMIT_journal_reclaim|
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_no_check_rw,
- !wbio->wbio.failed.nr));
- if (ret)
- goto err;
- }
-out:
- bio_put(&wbio->wbio.bio);
- btree_node_write_done(c, b, start_time);
- return;
-err:
- set_btree_node_noevict(b);
-
- if (!bch2_err_matches(ret, EROFS)) {
- struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret));
- bch2_btree_pos_to_text(&buf, c, b);
- bch2_fs_fatal_error(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
- goto out;
-}
-
-static void btree_node_write_endio(struct bio *bio)
-{
- struct bch_write_bio *wbio = to_wbio(bio);
- struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
- struct bch_write_bio *orig = parent ?: wbio;
- struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio);
- struct bch_fs *c = wbio->c;
- struct btree *b = wbio->bio.bi_private;
- struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL;
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
- wbio->submit_time, !bio->bi_status);
-
- if (ca && bio->bi_status) {
- struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "btree write error: %s\n ",
- bch2_blk_status_to_str(bio->bi_status));
- bch2_btree_pos_to_text(&buf, c, b);
- bch_err_dev_ratelimited(ca, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
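- /* Record the failed device on the original (unsplit) bio so its pointer can be dropped later: */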
- if (bio->bi_status) {
- unsigned long flags;
- spin_lock_irqsave(&c->btree_write_error_lock, flags);
- bch2_dev_list_add_dev(&orig->failed, wbio->dev);
- spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
- }
-
- if (wbio->have_ioref)
- percpu_ref_put(&ca->io_ref);
-
- if (parent) {
- bio_put(bio);
- bio_endio(&parent->bio);
- return;
- }
-
- clear_btree_node_write_in_flight_inner(b);
- wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
- INIT_WORK(&wb->work, btree_node_write_work);
- queue_work(c->btree_io_complete_wq, &wb->work);
-}
-
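- /* Validate the node's key and the bset before writing; failure marks the filesystem inconsistent: */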
-static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
- struct bset *i, unsigned sectors)
-{
- bool saw_error;
-
- int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key),
- (struct bkey_validate_context) {
- .from = BKEY_VALIDATE_btree_node,
- .level = b->c.level + 1,
- .btree = b->c.btree_id,
- .flags = BCH_VALIDATE_write,
- });
- if (ret) {
- bch2_fs_inconsistent(c, "invalid btree node key before write");
- return ret;
- }
-
- ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
- validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
- if (ret) {
- bch2_inconsistent_error(c);
- dump_stack();
- }
-
- return ret;
-}
-
-static void btree_write_submit(struct work_struct *work)
-{
- struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
- BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
-
- bkey_copy(&tmp.k, &wbio->key);
-
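- /* The key's pointers are to the start of the node; offset them by the sectors already written: */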
- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr)
- ptr->offset += wbio->sector_offset;
-
- bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree,
- &tmp.k, false);
-}
-
-void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
-{
- struct btree_write_bio *wbio;
- struct bset *i;
- struct btree_node *bn = NULL;
- struct btree_node_entry *bne = NULL;
- struct sort_iter_stack sort_iter;
- struct nonce nonce;
- unsigned bytes_to_write, sectors_to_write, bytes, u64s;
- u64 seq = 0;
- bool used_mempool;
- unsigned long old, new;
- bool validate_before_checksum = false;
- enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
- void *data;
- u64 start_time = local_clock();
- int ret;
-
- if (flags & BTREE_WRITE_ALREADY_STARTED)
- goto do_write;
-
- /*
- * We may only have a read lock on the btree node - the dirty bit is our
- * "lock" against racing with other threads that may be trying to start
- * a write, we do a write iff we clear the dirty bit. Since setting the
- * dirty bit requires a write lock, we can't race with other threads
- * redirtying it:
- */
- old = READ_ONCE(b->flags);
- do {
- new = old;
-
- if (!(old & (1 << BTREE_NODE_dirty)))
- return;
-
- if ((flags & BTREE_WRITE_ONLY_IF_NEED) &&
- !(old & (1 << BTREE_NODE_need_write)))
- return;
-
- if (old &
- ((1 << BTREE_NODE_never_write)|
- (1 << BTREE_NODE_write_blocked)))
- return;
-
- if (b->written &&
- (old & (1 << BTREE_NODE_will_make_reachable)))
- return;
-
- if (old & (1 << BTREE_NODE_write_in_flight))
- return;
-
- if (flags & BTREE_WRITE_ONLY_IF_NEED)
- type = new & BTREE_WRITE_TYPE_MASK;
- new &= ~BTREE_WRITE_TYPE_MASK;
-
- new &= ~(1 << BTREE_NODE_dirty);
- new &= ~(1 << BTREE_NODE_need_write);
- new |= (1 << BTREE_NODE_write_in_flight);
- new |= (1 << BTREE_NODE_write_in_flight_inner);
- new |= (1 << BTREE_NODE_just_written);
- new ^= (1 << BTREE_NODE_write_idx);
- } while (!try_cmpxchg_acquire(&b->flags, &old, new));
-
- if (new & (1U << BTREE_NODE_need_write))
- return;
-do_write:
- BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));
-
- atomic_long_dec(&c->btree_cache.nr_dirty);
-
- BUG_ON(btree_node_fake(b));
- BUG_ON((b->will_make_reachable != 0) != !b->written);
-
- BUG_ON(b->written >= btree_sectors(c));
- BUG_ON(b->written & (block_sectors(c) - 1));
- BUG_ON(bset_written(b, btree_bset_last(b)));
- BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
- BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
-
- bch2_sort_whiteouts(c, b);
-
- sort_iter_stack_init(&sort_iter, b);
-
- bytes = !b->written
- ? sizeof(struct btree_node)
- : sizeof(struct btree_node_entry);
-
- bytes += b->whiteout_u64s * sizeof(u64);
-
- for_each_bset(b, t) {
- i = bset(b, t);
-
- if (bset_written(b, i))
- continue;
-
- bytes += le16_to_cpu(i->u64s) * sizeof(u64);
- sort_iter_add(&sort_iter.iter,
- btree_bkey_first(b, t),
- btree_bkey_last(b, t));
- seq = max(seq, le64_to_cpu(i->journal_seq));
- }
-
- BUG_ON(b->written && !seq);
-
- /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
- bytes += 8;
-
- /* buffer must be a multiple of the block size */
- bytes = round_up(bytes, block_bytes(c));
-
- data = btree_bounce_alloc(c, bytes, &used_mempool);
-
- if (!b->written) {
- bn = data;
- *bn = *b->data;
- i = &bn->keys;
- } else {
- bne = data;
- bne->keys = b->data->keys;
- i = &bne->keys;
- }
-
- i->journal_seq = cpu_to_le64(seq);
- i->u64s = 0;
-
- sort_iter_add(&sort_iter.iter,
- unwritten_whiteouts_start(b),
- unwritten_whiteouts_end(b));
- SET_BSET_SEPARATE_WHITEOUTS(i, false);
-
- u64s = bch2_sort_keys_keep_unwritten_whiteouts(i->start, &sort_iter.iter);
- le16_add_cpu(&i->u64s, u64s);
-
- b->whiteout_u64s = 0;
-
- BUG_ON(!b->written && i->u64s != b->data->keys.u64s);
-
- set_needs_whiteout(i, false);
-
- /* do we have data to write? */
- if (b->written && !i->u64s)
- goto nowrite;
-
- bytes_to_write = vstruct_end(i) - data;
- sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
-
- if (!b->written &&
- b->key.k.type == KEY_TYPE_btree_ptr_v2)
- BUG_ON(btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)) != sectors_to_write);
-
- memset(data + bytes_to_write, 0,
- (sectors_to_write << 9) - bytes_to_write);
-
- BUG_ON(b->written + sectors_to_write > btree_sectors(c));
- BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
- BUG_ON(i->seq != b->data->keys.seq);
-
- i->version = cpu_to_le16(c->sb.version);
- SET_BSET_OFFSET(i, b->written);
- SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
-
- if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
- validate_before_checksum = true;
-
- /* validate_bset will be modifying: */
- if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
- validate_before_checksum = true;
-
- /* if we're going to be encrypting, check metadata validity first: */
- if (validate_before_checksum &&
- validate_bset_for_write(c, b, i, sectors_to_write))
- goto err;
-
- ret = bset_encrypt(c, i, b->written << 9);
- if (bch2_fs_fatal_err_on(ret, c,
- "encrypting btree node: %s", bch2_err_str(ret)))
- goto err;
-
- nonce = btree_nonce(i, b->written << 9);
-
- if (bn)
- bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
- else
- bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
-
- /* if we're not encrypting, check metadata after checksumming: */
- if (!validate_before_checksum &&
- validate_bset_for_write(c, b, i, sectors_to_write))
- goto err;
-
- /*
- * We handle btree write errors by immediately halting the journal -
- * after we've done that, we can't issue any subsequent btree writes
- * because they might have pointers to new nodes that failed to write.
- *
- * Furthermore, there's no point in doing any more btree writes because
- * with the journal stopped, we're never going to update the journal to
- * reflect that those writes were done and the data flushed from the
- * journal:
- *
- * Also on journal error, the pending write may have updates that were
- * never journalled (interior nodes, see btree_update_nodes_written()) -
- * it's critical that we don't do the write in that case otherwise we
- * will have updates visible that weren't in the journal:
- *
- * Make sure to update b->written so bch2_btree_init_next() doesn't
- * break:
- */
- if (bch2_journal_error(&c->journal) ||
- c->opts.nochanges)
- goto err;
-
- trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);
-
- wbio = container_of(bio_alloc_bioset(NULL,
- buf_pages(data, sectors_to_write << 9),
- REQ_OP_WRITE|REQ_META,
- GFP_NOFS,
- &c->btree_bio),
- struct btree_write_bio, wbio.bio);
- wbio_init(&wbio->wbio.bio);
- wbio->data = data;
- wbio->data_bytes = bytes;
- wbio->sector_offset = b->written;
- wbio->start_time = start_time;
- wbio->wbio.c = c;
- wbio->wbio.used_mempool = used_mempool;
- wbio->wbio.first_btree_write = !b->written;
- wbio->wbio.bio.bi_end_io = btree_node_write_endio;
- wbio->wbio.bio.bi_private = b;
-
- bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
-
- bkey_copy(&wbio->key, &b->key);
-
- b->written += sectors_to_write;
-
- if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
- bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
- cpu_to_le16(b->written);
-
- atomic64_inc(&c->btree_write_stats[type].nr);
- atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
-
- INIT_WORK(&wbio->work, btree_write_submit);
- queue_work(c->btree_write_submit_wq, &wbio->work);
- return;
-err:
- set_btree_node_noevict(b);
- b->written += sectors_to_write;
-nowrite:
- btree_bounce_free(c, bytes, used_mempool, data);
- __btree_node_write_done(c, b, 0);
-}
-
-/*
- * Work that must be done with write lock held:
- */
-bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
-{
- bool invalidated_iter = false;
- struct btree_node_entry *bne;
-
- if (!btree_node_just_written(b))
- return false;
-
- BUG_ON(b->whiteout_u64s);
-
- clear_btree_node_just_written(b);
-
- /*
- * Note: immediately after write, bset_written() doesn't work - the
- * amount of data we had to write after compaction might have been
- * smaller than the offset of the last bset.
- *
- * However, we know that all bsets have been written here, as long as
- * we're still holding the write lock:
- */
-
- /*
- * XXX: decide if we really want to unconditionally sort down to a
- * single bset:
- */
- if (b->nsets > 1) {
- btree_node_sort(c, b, 0, b->nsets);
- invalidated_iter = true;
- } else {
- invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);
- }
-
- for_each_bset(b, t)
- set_needs_whiteout(bset(b, t), true);
-
- bch2_btree_verify(c, b);
-
- /*
- * If later we don't unconditionally sort down to a single bset, we have
- * to ensure this is still true:
- */
- BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));
-
- bne = want_new_bset(c, b);
- if (bne)
- bch2_bset_init_next(b, bne);
-
- bch2_btree_build_aux_trees(b);
-
- return invalidated_iter;
-}
-
-/*
- * Use this one if the node is intent locked:
- */
-void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
- enum six_lock_type lock_type_held,
- unsigned flags)
-{
- if (lock_type_held == SIX_LOCK_intent ||
- (lock_type_held == SIX_LOCK_read &&
- six_lock_tryupgrade(&b->c.lock))) {
- __bch2_btree_node_write(c, b, flags);
-
- /* don't cycle lock unnecessarily: */
- if (btree_node_just_written(b) &&
- six_trylock_write(&b->c.lock)) {
- bch2_btree_post_write_cleanup(c, b);
- six_unlock_write(&b->c.lock);
- }
-
- if (lock_type_held == SIX_LOCK_read)
- six_lock_downgrade(&b->c.lock);
- } else {
- __bch2_btree_node_write(c, b, flags);
- if (lock_type_held == SIX_LOCK_write &&
- btree_node_just_written(b))
- bch2_btree_post_write_cleanup(c, b);
- }
-}
-
-void bch2_btree_node_write_trans(struct btree_trans *trans, struct btree *b,
- enum six_lock_type lock_type_held,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
-
- if (lock_type_held == SIX_LOCK_intent ||
- (lock_type_held == SIX_LOCK_read &&
- six_lock_tryupgrade(&b->c.lock))) {
- __bch2_btree_node_write(c, b, flags);
-
- /* don't cycle lock unnecessarily: */
- if (btree_node_just_written(b) &&
- six_trylock_write(&b->c.lock)) {
- bch2_btree_post_write_cleanup(c, b);
- __bch2_btree_node_unlock_write(trans, b);
- }
-
- if (lock_type_held == SIX_LOCK_read)
- six_lock_downgrade(&b->c.lock);
- } else {
- __bch2_btree_node_write(c, b, flags);
- if (lock_type_held == SIX_LOCK_write &&
- btree_node_just_written(b))
- bch2_btree_post_write_cleanup(c, b);
- }
-}
-
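- /* Wait for in-flight reads or writes (per @flag) on all cached btree nodes; returns true if we had to wait: */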
-static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
-{
- struct bucket_table *tbl;
- struct rhash_head *pos;
- struct btree *b;
- unsigned i;
- bool ret = false;
-restart:
- rcu_read_lock();
- for_each_cached_btree(b, c, tbl, i, pos)
- if (test_bit(flag, &b->flags)) {
- rcu_read_unlock();
- wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
- ret = true;
- goto restart;
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-bool bch2_btree_flush_all_reads(struct bch_fs *c)
-{
- return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
-}
-
-bool bch2_btree_flush_all_writes(struct bch_fs *c)
-{
- return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
-}
-
-static const char * const bch2_btree_write_types[] = {
-#define x(t, n) [n] = #t,
- BCH_BTREE_WRITE_TYPES()
- NULL
-};
-
-void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
-{
- printbuf_tabstop_push(out, 20);
- printbuf_tabstop_push(out, 10);
-
- prt_printf(out, "\tnr\tsize\n");
-
- for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) {
- u64 nr = atomic64_read(&c->btree_write_stats[i].nr);
- u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes);
-
- prt_printf(out, "%s:\t%llu\t", bch2_btree_write_types[i], nr);
- prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0);
- prt_newline(out);
- }
-}
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
deleted file mode 100644
index dbf76d22c660..000000000000
--- a/fs/bcachefs/btree_io.h
+++ /dev/null
@@ -1,229 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_IO_H
-#define _BCACHEFS_BTREE_IO_H
-
-#include "bkey_methods.h"
-#include "bset.h"
-#include "btree_locking.h"
-#include "checksum.h"
-#include "extents.h"
-#include "io_write_types.h"
-
-struct bch_fs;
-struct btree_write;
-struct btree;
-struct btree_iter;
-struct btree_node_read_all;
-
-static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
-{
- if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
- atomic_long_inc(&c->btree_cache.nr_dirty);
-}
-
-static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
-{
- if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
- atomic_long_dec(&c->btree_cache.nr_dirty);
-}
-
-static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k)
-{
- return k.k->type == KEY_TYPE_btree_ptr_v2
- ? le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors_written)
- : 0;
-}
-
-struct btree_read_bio {
- struct bch_fs *c;
- struct btree *b;
- struct btree_node_read_all *ra;
- u64 start_time;
- unsigned have_ioref:1;
- unsigned idx:7;
- struct extent_ptr_decoded pick;
- struct work_struct work;
- struct bio bio;
-};
-
-struct btree_write_bio {
- struct work_struct work;
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
- void *data;
- unsigned data_bytes;
- unsigned sector_offset;
- u64 start_time;
- struct bch_write_bio wbio;
-};
-
-void bch2_btree_node_io_unlock(struct btree *);
-void bch2_btree_node_io_lock(struct btree *);
-void __bch2_btree_node_wait_on_read(struct btree *);
-void __bch2_btree_node_wait_on_write(struct btree *);
-void bch2_btree_node_wait_on_read(struct btree *);
-void bch2_btree_node_wait_on_write(struct btree *);
-
-enum compact_mode {
- COMPACT_LAZY,
- COMPACT_ALL,
-};
-
-bool bch2_compact_whiteouts(struct bch_fs *, struct btree *,
- enum compact_mode);
-
-static inline bool should_compact_bset_lazy(struct btree *b,
- struct bset_tree *t)
-{
- unsigned total_u64s = bset_u64s(t);
- unsigned dead_u64s = bset_dead_u64s(b, t);
-
- return dead_u64s > 64 && dead_u64s * 3 > total_u64s;
-}
-
-static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
-{
- for_each_bset(b, t)
- if (should_compact_bset_lazy(b, t))
- return bch2_compact_whiteouts(c, b, COMPACT_LAZY);
-
- return false;
-}
-
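- /* Per-bset nonce, derived from the bset's seq/journal_seq and its byte offset within the node: */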
-static inline struct nonce btree_nonce(struct bset *i, unsigned offset)
-{
- return (struct nonce) {{
- [0] = cpu_to_le32(offset),
- [1] = ((__le32 *) &i->seq)[0],
- [2] = ((__le32 *) &i->seq)[1],
- [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
- }};
-}
-
-static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
-{
- struct nonce nonce = btree_nonce(i, offset);
- int ret;
-
- if (!offset) {
- struct btree_node *bn = container_of(i, struct btree_node, keys);
- unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
-
- ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
- &bn->flags, bytes);
- if (ret)
- return ret;
-
- nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
- }
-
- return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
- vstruct_end(i) - (void *) i->_data);
-}
-
-void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
-
-void bch2_btree_node_drop_keys_outside_node(struct btree *);
-
-void bch2_btree_build_aux_trees(struct btree *);
-void bch2_btree_init_next(struct btree_trans *, struct btree *);
-
-int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
- struct btree *, bool, bool *);
-void bch2_btree_node_read(struct btree_trans *, struct btree *, bool);
-int bch2_btree_root_read(struct bch_fs *, enum btree_id,
- const struct bkey_i *, unsigned);
-
-int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, unsigned);
-
-bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
-
-enum btree_write_flags {
- __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS,
- __BTREE_WRITE_ALREADY_STARTED,
-};
-#define BTREE_WRITE_ONLY_IF_NEED BIT(__BTREE_WRITE_ONLY_IF_NEED)
-#define BTREE_WRITE_ALREADY_STARTED BIT(__BTREE_WRITE_ALREADY_STARTED)
-
-void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
-void bch2_btree_node_write(struct bch_fs *, struct btree *,
- enum six_lock_type, unsigned);
-void bch2_btree_node_write_trans(struct btree_trans *, struct btree *,
- enum six_lock_type, unsigned);
-
-static inline void btree_node_write_if_need(struct btree_trans *trans, struct btree *b,
- enum six_lock_type lock_held)
-{
- bch2_btree_node_write_trans(trans, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
-}
-
-bool bch2_btree_flush_all_reads(struct bch_fs *);
-bool bch2_btree_flush_all_writes(struct bch_fs *);
-
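- /* Helpers for reading/writing btree nodes in older on-disk format versions: */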
-static inline void compat_bformat(unsigned level, enum btree_id btree_id,
- unsigned version, unsigned big_endian,
- int write, struct bkey_format *f)
-{
- if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id == BTREE_ID_inodes) {
- swap(f->bits_per_field[BKEY_FIELD_INODE],
- f->bits_per_field[BKEY_FIELD_OFFSET]);
- swap(f->field_offset[BKEY_FIELD_INODE],
- f->field_offset[BKEY_FIELD_OFFSET]);
- }
-
- if (version < bcachefs_metadata_version_snapshot &&
- (level || btree_type_has_snapshots(btree_id))) {
- u64 max_packed =
- ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
-
- f->field_offset[BKEY_FIELD_SNAPSHOT] = write
- ? 0
- : cpu_to_le64(U32_MAX - max_packed);
- }
-}
-
-static inline void compat_bpos(unsigned level, enum btree_id btree_id,
- unsigned version, unsigned big_endian,
- int write, struct bpos *p)
-{
- if (big_endian != CPU_BIG_ENDIAN)
- bch2_bpos_swab(p);
-
- if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id == BTREE_ID_inodes)
- swap(p->inode, p->offset);
-}
-
-static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
- unsigned version, unsigned big_endian,
- int write,
- struct btree_node *bn)
-{
- if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id_is_extents(btree_id) &&
- !bpos_eq(bn->min_key, POS_MIN) &&
- write)
- bn->min_key = bpos_nosnap_predecessor(bn->min_key);
-
- if (version < bcachefs_metadata_version_snapshot &&
- write)
- bn->max_key.snapshot = 0;
-
- compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key);
- compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key);
-
- if (version < bcachefs_metadata_version_snapshot &&
- !write)
- bn->max_key.snapshot = U32_MAX;
-
- if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id_is_extents(btree_id) &&
- !bpos_eq(bn->min_key, POS_MIN) &&
- !write)
- bn->min_key = bpos_nosnap_successor(bn->min_key);
-}
-
-void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_IO_H */
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
deleted file mode 100644
index 7542c6f9c88e..000000000000
--- a/fs/bcachefs/btree_iter.c
+++ /dev/null
@@ -1,3673 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_methods.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_journal_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "replicas.h"
-#include "snapshot.h"
-#include "trace.h"
-
-#include <linux/random.h>
-#include <linux/prefetch.h>
-
-static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
-static inline void btree_path_list_add(struct btree_trans *,
- btree_path_idx_t, btree_path_idx_t);
-
-static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
-{
-#ifdef TRACK_PATH_ALLOCATED
- return iter->ip_allocated;
-#else
- return 0;
-#endif
-}
-
-static btree_path_idx_t btree_path_alloc(struct btree_trans *, btree_path_idx_t);
-static void bch2_trans_srcu_lock(struct btree_trans *);
-
-static inline int __btree_path_cmp(const struct btree_path *l,
- enum btree_id r_btree_id,
- bool r_cached,
- struct bpos r_pos,
- unsigned r_level)
-{
- /*
- * Must match lock ordering as defined by __bch2_btree_node_lock:
- */
- return cmp_int(l->btree_id, r_btree_id) ?:
- cmp_int((int) l->cached, (int) r_cached) ?:
- bpos_cmp(l->pos, r_pos) ?:
- -cmp_int(l->level, r_level);
-}
-
-static inline int btree_path_cmp(const struct btree_path *l,
- const struct btree_path *r)
-{
- return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
-}
-
-static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
-{
- /* Are we iterating over keys in all snapshots? */
- if (iter->flags & BTREE_ITER_all_snapshots) {
- p = bpos_successor(p);
- } else {
- p = bpos_nosnap_successor(p);
- p.snapshot = iter->snapshot;
- }
-
- return p;
-}
-
-static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
-{
- /* Are we iterating over keys in all snapshots? */
- if (iter->flags & BTREE_ITER_all_snapshots) {
- p = bpos_predecessor(p);
- } else {
- p = bpos_nosnap_predecessor(p);
- p.snapshot = iter->snapshot;
- }
-
- return p;
-}
-
-static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
-{
- struct bpos pos = iter->pos;
-
- if ((iter->flags & BTREE_ITER_is_extents) &&
- !bkey_eq(pos, POS_MAX))
- pos = bkey_successor(iter, pos);
- return pos;
-}
-
-static inline bool btree_path_pos_before_node(struct btree_path *path,
- struct btree *b)
-{
- return bpos_lt(path->pos, b->data->min_key);
-}
-
-static inline bool btree_path_pos_after_node(struct btree_path *path,
- struct btree *b)
-{
- return bpos_gt(path->pos, b->key.k.p);
-}
-
-static inline bool btree_path_pos_in_node(struct btree_path *path,
- struct btree *b)
-{
- return path->btree_id == b->c.btree_id &&
- !btree_path_pos_before_node(path, b) &&
- !btree_path_pos_after_node(path, b);
-}
-
-/* Btree iterator: */
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-static void bch2_btree_path_verify_cached(struct btree_trans *trans,
- struct btree_path *path)
-{
- struct bkey_cached *ck;
- bool locked = btree_node_locked(path, 0);
-
- if (!bch2_btree_node_relock(trans, path, 0))
- return;
-
- ck = (void *) path->l[0].b;
- BUG_ON(ck->key.btree_id != path->btree_id ||
- !bkey_eq(ck->key.pos, path->pos));
-
- if (!locked)
- btree_node_unlock(trans, path, 0);
-}
-
-static void bch2_btree_path_verify_level(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
- struct btree_path_level *l;
- struct btree_node_iter tmp;
- bool locked;
- struct bkey_packed *p, *k;
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
- struct printbuf buf3 = PRINTBUF;
- const char *msg;
-
- if (!bch2_debug_check_iterators)
- return;
-
- l = &path->l[level];
- tmp = l->iter;
- locked = btree_node_locked(path, level);
-
- if (path->cached) {
- if (!level)
- bch2_btree_path_verify_cached(trans, path);
- return;
- }
-
- if (!btree_path_node(path, level))
- return;
-
- if (!bch2_btree_node_relock_notrace(trans, path, level))
- return;
-
- BUG_ON(!btree_path_pos_in_node(path, l->b));
-
- bch2_btree_node_iter_verify(&l->iter, l->b);
-
- /*
- * For interior nodes, the iterator will have skipped past deleted keys:
- */
- p = level
- ? bch2_btree_node_iter_prev(&tmp, l->b)
- : bch2_btree_node_iter_prev_all(&tmp, l->b);
- k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
-
- if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
- msg = "before";
- goto err;
- }
-
- if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
- msg = "after";
- goto err;
- }
-
- if (!locked)
- btree_node_unlock(trans, path, level);
- return;
-err:
- bch2_bpos_to_text(&buf1, path->pos);
-
- if (p) {
- struct bkey uk = bkey_unpack_key(l->b, p);
-
- bch2_bkey_to_text(&buf2, &uk);
- } else {
- prt_printf(&buf2, "(none)");
- }
-
- if (k) {
- struct bkey uk = bkey_unpack_key(l->b, k);
-
- bch2_bkey_to_text(&buf3, &uk);
- } else {
- prt_printf(&buf3, "(none)");
- }
-
- panic("path should be %s key at level %u:\n"
- "path pos %s\n"
- "prev key %s\n"
- "cur key %s\n",
- msg, level, buf1.buf, buf2.buf, buf3.buf);
-}
-
-static void bch2_btree_path_verify(struct btree_trans *trans,
- struct btree_path *path)
-{
- struct bch_fs *c = trans->c;
-
- for (unsigned i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
- if (!path->l[i].b) {
- BUG_ON(!path->cached &&
- bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
- break;
- }
-
- bch2_btree_path_verify_level(trans, path, i);
- }
-
- bch2_btree_path_verify_locks(path);
-}
-
-void bch2_trans_verify_paths(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned iter;
-
- trans_for_each_path(trans, path, iter)
- bch2_btree_path_verify(trans, path);
-}
-
-static void bch2_btree_iter_verify(struct btree_iter *iter)
-{
- struct btree_trans *trans = iter->trans;
-
- BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);
-
- BUG_ON((iter->flags & BTREE_ITER_is_extents) &&
- (iter->flags & BTREE_ITER_all_snapshots));
-
- BUG_ON(!(iter->flags & BTREE_ITER_snapshot_field) &&
- (iter->flags & BTREE_ITER_all_snapshots) &&
- !btree_type_has_snapshot_field(iter->btree_id));
-
- if (iter->update_path)
- bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
- bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
-}
-
-static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
-{
- BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) &&
- !iter->pos.snapshot);
-
- BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) &&
- iter->pos.snapshot != iter->snapshot);
-
- BUG_ON(iter->flags & BTREE_ITER_all_snapshots ? !bpos_eq(iter->pos, iter->k.p) :
- !(iter->flags & BTREE_ITER_is_extents) ? !bkey_eq(iter->pos, iter->k.p) :
- (bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
- bkey_gt(iter->pos, iter->k.p)));
-}
-
-static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
-{
- struct btree_trans *trans = iter->trans;
- struct btree_iter copy;
- struct bkey_s_c prev;
- int ret = 0;
-
- if (!bch2_debug_check_iterators)
- return 0;
-
- if (!(iter->flags & BTREE_ITER_filter_snapshots))
- return 0;
-
- if (bkey_err(k) || !k.k)
- return 0;
-
- BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
- iter->snapshot,
- k.k->p.snapshot));
-
- bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
- BTREE_ITER_nopreserve|
- BTREE_ITER_all_snapshots);
- prev = bch2_btree_iter_prev(&copy);
- if (!prev.k)
- goto out;
-
- ret = bkey_err(prev);
- if (ret)
- goto out;
-
- if (bkey_eq(prev.k->p, k.k->p) &&
- bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
- prev.k->p.snapshot) > 0) {
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
-
- bch2_bkey_to_text(&buf1, k.k);
- bch2_bkey_to_text(&buf2, prev.k);
-
- panic("iter snap %u\n"
- "k %s\n"
- "prev %s\n",
- iter->snapshot,
- buf1.buf, buf2.buf);
- }
-out:
- bch2_trans_iter_exit(trans, &copy);
- return ret;
-}
-
-void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
- struct bpos pos)
-{
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- struct btree_path *path;
- struct trans_for_each_path_inorder_iter iter;
- struct printbuf buf = PRINTBUF;
-
- btree_trans_sort_paths(trans);
-
- trans_for_each_path_inorder(trans, path, iter) {
- if (path->btree_id != id ||
- !btree_node_locked(path, 0) ||
- !path->should_be_locked)
- continue;
-
- if (!path->cached) {
- if (bkey_ge(pos, path->l[0].b->data->min_key) &&
- bkey_le(pos, path->l[0].b->key.k.p))
- return;
- } else {
- if (bkey_eq(pos, path->pos))
- return;
- }
- }
-
- bch2_dump_trans_paths_updates(trans);
- bch2_bpos_to_text(&buf, pos);
-
- panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf);
-}
-
-#else
-
-static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
- struct btree_path *path, unsigned l) {}
-static inline void bch2_btree_path_verify(struct btree_trans *trans,
- struct btree_path *path) {}
-static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
-static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
-static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
-
-#endif
-
-/* Btree path: fixups after btree updates */
-
-static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
- struct btree *b,
- struct bset_tree *t,
- struct bkey_packed *k)
-{
- struct btree_node_iter_set *set;
-
- btree_node_iter_for_each(iter, set)
- if (set->end == t->end_offset) {
- set->k = __btree_node_key_to_offset(b, k);
- bch2_btree_node_iter_sort(iter, b);
- return;
- }
-
- bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
-}
-
-static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
- struct btree *b,
- struct bkey_packed *where)
-{
- struct btree_path_level *l = &path->l[b->c.level];
-
- if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
- return;
-
- if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
- bch2_btree_node_iter_advance(&l->iter, l->b);
-}
-
-void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
- struct btree *b,
- struct bkey_packed *where)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path_with_node(trans, b, path, i) {
- __bch2_btree_path_fix_key_modified(path, b, where);
- bch2_btree_path_verify_level(trans, path, b->c.level);
- }
-}
-
-static void __bch2_btree_node_iter_fix(struct btree_path *path,
- struct btree *b,
- struct btree_node_iter *node_iter,
- struct bset_tree *t,
- struct bkey_packed *where,
- unsigned clobber_u64s,
- unsigned new_u64s)
-{
- const struct bkey_packed *end = btree_bkey_last(b, t);
- struct btree_node_iter_set *set;
- unsigned offset = __btree_node_key_to_offset(b, where);
- int shift = new_u64s - clobber_u64s;
- unsigned old_end = t->end_offset - shift;
- unsigned orig_iter_pos = node_iter->data[0].k;
- bool iter_current_key_modified =
- orig_iter_pos >= offset &&
- orig_iter_pos <= offset + clobber_u64s;
-
- btree_node_iter_for_each(node_iter, set)
- if (set->end == old_end)
- goto found;
-
- /* didn't find the bset in the iterator - might have to re-add it: */
- if (new_u64s &&
- bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
- bch2_btree_node_iter_push(node_iter, b, where, end);
- goto fixup_done;
- } else {
- /* Iterator is after key that changed */
- return;
- }
-found:
- set->end = t->end_offset;
-
- /* Iterator hasn't gotten to the key that changed yet: */
- if (set->k < offset)
- return;
-
- if (new_u64s &&
- bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
- set->k = offset;
- } else if (set->k < offset + clobber_u64s) {
- set->k = offset + new_u64s;
- if (set->k == set->end)
- bch2_btree_node_iter_set_drop(node_iter, set);
- } else {
- /* Iterator is after key that changed */
- set->k = (int) set->k + shift;
- return;
- }
-
- bch2_btree_node_iter_sort(node_iter, b);
-fixup_done:
- if (node_iter->data[0].k != orig_iter_pos)
- iter_current_key_modified = true;
-
- /*
- * When a new key is added, and the node iterator now points to that
- * key, the iterator might have skipped past deleted keys that should
- * come after the key the iterator now points to. We have to rewind to
- * before those deleted keys - otherwise
- * bch2_btree_node_iter_prev_all() breaks:
- */
- if (!bch2_btree_node_iter_end(node_iter) &&
- iter_current_key_modified &&
- b->c.level) {
- struct bkey_packed *k, *k2, *p;
-
- k = bch2_btree_node_iter_peek_all(node_iter, b);
-
- for_each_bset(b, t) {
- bool set_pos = false;
-
- if (node_iter->data[0].end == t->end_offset)
- continue;
-
- k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
-
- while ((p = bch2_bkey_prev_all(b, t, k2)) &&
- bkey_iter_cmp(b, k, p) < 0) {
- k2 = p;
- set_pos = true;
- }
-
- if (set_pos)
- btree_node_iter_set_set_pos(node_iter,
- b, t, k2);
- }
- }
-}
-
-void bch2_btree_node_iter_fix(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct btree_node_iter *node_iter,
- struct bkey_packed *where,
- unsigned clobber_u64s,
- unsigned new_u64s)
-{
- struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
- struct btree_path *linked;
- unsigned i;
-
- if (node_iter != &path->l[b->c.level].iter) {
- __bch2_btree_node_iter_fix(path, b, node_iter, t,
- where, clobber_u64s, new_u64s);
-
- if (bch2_debug_check_iterators)
- bch2_btree_node_iter_verify(node_iter, b);
- }
-
- trans_for_each_path_with_node(trans, b, linked, i) {
- __bch2_btree_node_iter_fix(linked, b,
- &linked->l[b->c.level].iter, t,
- where, clobber_u64s, new_u64s);
- bch2_btree_path_verify_level(trans, linked, b->c.level);
- }
-}
-
-/* Btree path level: pointer to a particular btree node and node iter */
-
-static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
- struct btree_path_level *l,
- struct bkey *u,
- struct bkey_packed *k)
-{
- if (unlikely(!k)) {
- /*
- * signal to bch2_btree_iter_peek_slot() that we're currently at
- * a hole
- */
- u->type = KEY_TYPE_deleted;
- return bkey_s_c_null;
- }
-
- return bkey_disassemble(l->b, k, u);
-}
-
-static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
- struct btree_path_level *l,
- struct bkey *u)
-{
- return __btree_iter_unpack(c, l, u,
- bch2_btree_node_iter_peek_all(&l->iter, l->b));
-}
-
-static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_path_level *l,
- struct bkey *u)
-{
- struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
- bch2_btree_node_iter_prev(&l->iter, l->b));
-
- path->pos = k.k ? k.k->p : l->b->data->min_key;
- trans->paths_sorted = false;
- bch2_btree_path_verify_level(trans, path, l - path->l);
- return k;
-}
-
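- /* Advance the node iterator to the path's position; gives up (returns false) after max_advance steps if positive: */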
-static inline bool btree_path_advance_to_pos(struct btree_path *path,
- struct btree_path_level *l,
- int max_advance)
-{
- struct bkey_packed *k;
- int nr_advanced = 0;
-
- while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
- bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
- if (max_advance > 0 && nr_advanced >= max_advance)
- return false;
-
- bch2_btree_node_iter_advance(&l->iter, l->b);
- nr_advanced++;
- }
-
- return true;
-}
-
-static inline void __btree_path_level_init(struct btree_path *path,
- unsigned level)
-{
- struct btree_path_level *l = &path->l[level];
-
- bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
-
- /*
- * Iterators to interior nodes should always be pointed at the first
- * non-whiteout:
- */
- if (level)
- bch2_btree_node_iter_peek(&l->iter, l->b);
-}
-
-void bch2_btree_path_level_init(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
-{
- BUG_ON(path->cached);
-
- EBUG_ON(!btree_path_pos_in_node(path, b));
-
- path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
- path->l[b->c.level].b = b;
- __btree_path_level_init(path, b->c.level);
-}
-
-/* Btree path: fixups after btree node updates: */
-
-static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
-{
- struct bch_fs *c = trans->c;
-
- trans_for_each_update(trans, i)
- if (!i->cached &&
- i->level == b->c.level &&
- i->btree_id == b->c.btree_id &&
- bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
- bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
- i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v;
-
- if (unlikely(trans->journal_replay_not_finished)) {
- struct bkey_i *j_k =
- bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
- i->k->k.p);
-
- if (j_k) {
- i->old_k = j_k->k;
- i->old_v = &j_k->v;
- }
- }
- }
-}
-
-/*
- * A btree node is being replaced - update the iterator to point to the new
- * node:
- */
-void bch2_trans_node_add(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
-{
- struct btree_path *prev;
-
- BUG_ON(!btree_path_pos_in_node(path, b));
-
- while ((prev = prev_btree_path(trans, path)) &&
- btree_path_pos_in_node(prev, b))
- path = prev;
-
- for (;
- path && btree_path_pos_in_node(path, b);
- path = next_btree_path(trans, path))
- if (path->uptodate == BTREE_ITER_UPTODATE && !path->cached) {
- enum btree_node_locked_type t =
- btree_lock_want(path, b->c.level);
-
- if (t != BTREE_NODE_UNLOCKED) {
- btree_node_unlock(trans, path, b->c.level);
- six_lock_increment(&b->c.lock, (enum six_lock_type) t);
- mark_btree_node_locked(trans, path, b->c.level, t);
- }
-
- bch2_btree_path_level_init(trans, path, b);
- }
-
- bch2_trans_revalidate_updates_in_node(trans, b);
-}
-
-void bch2_trans_node_drop(struct btree_trans *trans,
- struct btree *b)
-{
- struct btree_path *path;
- unsigned i, level = b->c.level;
-
- trans_for_each_path(trans, path, i)
- if (path->l[level].b == b) {
- btree_node_unlock(trans, path, level);
- path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
- }
-}
-
-/*
- * A btree node has been modified in such a way as to invalidate iterators - fix
- * them:
- */
-void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path_with_node(trans, b, path, i)
- __btree_path_level_init(path, b->c.level);
-
- bch2_trans_revalidate_updates_in_node(trans, b);
-}
-
-/* Btree path: traverse, set_pos: */
-
-static inline int btree_path_lock_root(struct btree_trans *trans,
- struct btree_path *path,
- unsigned depth_want,
- unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- struct btree_root *r = bch2_btree_id_root(c, path->btree_id);
- enum six_lock_type lock_type;
- unsigned i;
- int ret;
-
- EBUG_ON(path->nodes_locked);
-
- while (1) {
- struct btree *b = READ_ONCE(r->b);
- if (unlikely(!b)) {
- BUG_ON(!r->error);
- return r->error;
- }
-
- path->level = READ_ONCE(b->c.level);
-
- if (unlikely(path->level < depth_want)) {
- /*
- * the root is at a lower depth than the depth we want:
- * got to the end of the btree, or we're walking nodes
- * greater than some depth and there are no nodes >=
- * that depth
- */
- path->level = depth_want;
- for (i = path->level; i < BTREE_MAX_DEPTH; i++)
- path->l[i].b = NULL;
- return 1;
- }
-
- lock_type = __btree_lock_want(path, path->level);
- ret = btree_node_lock(trans, path, &b->c,
- path->level, lock_type, trace_ip);
- if (unlikely(ret)) {
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ret;
- BUG();
- }
-
- if (likely(b == READ_ONCE(r->b) &&
- b->c.level == path->level &&
- !race_fault())) {
- for (i = 0; i < path->level; i++)
- path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
- path->l[path->level].b = b;
- for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
- path->l[i].b = NULL;
-
- mark_btree_node_locked(trans, path, path->level,
- (enum btree_node_locked_type) lock_type);
- bch2_btree_path_level_init(trans, path, b);
- return 0;
- }
-
- six_unlock_type(&b->c.lock, lock_type);
- }
-}
-
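- /* Prefetch the next few child nodes; prefetch more aggressively before the filesystem has finished starting: */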
-noinline
-static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
-{
- struct bch_fs *c = trans->c;
- struct btree_path_level *l = path_l(path);
- struct btree_node_iter node_iter = l->iter;
- struct bkey_packed *k;
- struct bkey_buf tmp;
- unsigned nr = test_bit(BCH_FS_started, &c->flags)
- ? (path->level > 1 ? 0 : 2)
- : (path->level > 1 ? 1 : 16);
- bool was_locked = btree_node_locked(path, path->level);
- int ret = 0;
-
- bch2_bkey_buf_init(&tmp);
-
- while (nr-- && !ret) {
- if (!bch2_btree_node_relock(trans, path, path->level))
- break;
-
- bch2_btree_node_iter_advance(&node_iter, l->b);
- k = bch2_btree_node_iter_peek(&node_iter, l->b);
- if (!k)
- break;
-
- bch2_bkey_buf_unpack(&tmp, c, l->b, k);
- ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
- path->level - 1);
- }
-
- if (!was_locked)
- btree_node_unlock(trans, path, path->level);
-
- bch2_bkey_buf_exit(&tmp, c);
- return ret;
-}
-
-static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
- struct btree_and_journal_iter *jiter)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c k;
- struct bkey_buf tmp;
- unsigned nr = test_bit(BCH_FS_started, &c->flags)
- ? (path->level > 1 ? 0 : 2)
- : (path->level > 1 ? 1 : 16);
- bool was_locked = btree_node_locked(path, path->level);
- int ret = 0;
-
- bch2_bkey_buf_init(&tmp);
-
- jiter->fail_if_too_many_whiteouts = true;
-
- while (nr-- && !ret) {
- if (!bch2_btree_node_relock(trans, path, path->level))
- break;
-
- bch2_btree_and_journal_iter_advance(jiter);
- k = bch2_btree_and_journal_iter_peek(jiter);
- if (!k.k)
- break;
-
- bch2_bkey_buf_reassemble(&tmp, c, k);
- ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
- path->level - 1);
- }
-
- if (!was_locked)
- btree_node_unlock(trans, path, path->level);
-
- bch2_bkey_buf_exit(&tmp, c);
- return ret;
-}
-
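- /* Cache the in-memory node pointer in the parent's btree_ptr_v2, to speed up future lookups: */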
-static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
- struct btree_path *path,
- unsigned plevel, struct btree *b)
-{
- struct btree_path_level *l = &path->l[plevel];
- bool locked = btree_node_locked(path, plevel);
- struct bkey_packed *k;
- struct bch_btree_ptr_v2 *bp;
-
- if (!bch2_btree_node_relock(trans, path, plevel))
- return;
-
- k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
- BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
-
- bp = (void *) bkeyp_val(&l->b->format, k);
- bp->mem_ptr = (unsigned long)b;
-
- if (!locked)
- btree_node_unlock(trans, path, plevel);
-}
-
-static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
- struct btree_path *path,
- unsigned flags,
- struct bkey_buf *out)
-{
- struct bch_fs *c = trans->c;
- struct btree_path_level *l = path_l(path);
- struct btree_and_journal_iter jiter;
- struct bkey_s_c k;
- int ret = 0;
-
- __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
-
- k = bch2_btree_and_journal_iter_peek(&jiter);
- if (!k.k) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "node not found at pos ");
- bch2_bpos_to_text(&buf, path->pos);
- prt_str(&buf, " at btree ");
- bch2_btree_pos_to_text(&buf, c, l->b);
-
- ret = bch2_fs_topology_error(c, "%s", buf.buf);
- printbuf_exit(&buf);
- goto err;
- }
-
- bch2_bkey_buf_reassemble(out, c, k);
-
- if ((flags & BTREE_ITER_prefetch) &&
- c->opts.btree_node_prefetch)
- ret = btree_path_prefetch_j(trans, path, &jiter);
-
-err:
- bch2_btree_and_journal_iter_exit(&jiter);
- return ret;
-}
-
-static __always_inline int btree_path_down(struct btree_trans *trans,
- struct btree_path *path,
- unsigned flags,
- unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- struct btree_path_level *l = path_l(path);
- struct btree *b;
- unsigned level = path->level - 1;
- enum six_lock_type lock_type = __btree_lock_want(path, level);
- struct bkey_buf tmp;
- int ret;
-
- EBUG_ON(!btree_node_locked(path, path->level));
-
- bch2_bkey_buf_init(&tmp);
-
- if (unlikely(trans->journal_replay_not_finished)) {
- ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
- if (ret)
- goto err;
- } else {
- struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
- if (!k) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "node not found at pos ");
- bch2_bpos_to_text(&buf, path->pos);
- prt_str(&buf, " within parent node ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key));
-
- bch2_fs_fatal_error(c, "%s", buf.buf);
- printbuf_exit(&buf);
- ret = -BCH_ERR_btree_need_topology_repair;
- goto err;
- }
-
- bch2_bkey_buf_unpack(&tmp, c, l->b, k);
-
- if ((flags & BTREE_ITER_prefetch) &&
- c->opts.btree_node_prefetch) {
- ret = btree_path_prefetch(trans, path);
- if (ret)
- goto err;
- }
- }
-
- b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
- ret = PTR_ERR_OR_ZERO(b);
- if (unlikely(ret))
- goto err;
-
- if (likely(!trans->journal_replay_not_finished &&
- tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
- unlikely(b != btree_node_mem_ptr(tmp.k)))
- btree_node_mem_ptr_set(trans, path, level + 1, b);
-
- if (btree_node_read_locked(path, level + 1))
- btree_node_unlock(trans, path, level + 1);
-
- mark_btree_node_locked(trans, path, level,
- (enum btree_node_locked_type) lock_type);
- path->level = level;
- bch2_btree_path_level_init(trans, path, b);
-
- bch2_btree_path_verify_locks(path);
-err:
- bch2_bkey_buf_exit(&tmp, c);
- return ret;
-}
-
-static int bch2_btree_path_traverse_all(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct btree_path *path;
- unsigned long trace_ip = _RET_IP_;
- unsigned i;
- int ret = 0;
-
- if (trans->in_traverse_all)
- return -BCH_ERR_transaction_restart_in_traverse_all;
-
- trans->in_traverse_all = true;
-retry_all:
- trans->restarted = 0;
- trans->last_restarted_ip = 0;
-
- trans_for_each_path(trans, path, i)
- path->should_be_locked = false;
-
- btree_trans_sort_paths(trans);
-
- bch2_trans_unlock(trans);
- cond_resched();
- trans_set_locked(trans, false);
-
- if (unlikely(trans->memory_allocation_failure)) {
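- /* memory allocation failed during a previous traversal: take the cannibalize lock so allocation can't fail this time: */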
- struct closure cl;
-
- closure_init_stack(&cl);
-
- do {
- ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
- closure_sync(&cl);
- } while (ret);
- }
-
- /* Now, redo traversals in correct order: */
- i = 0;
- while (i < trans->nr_sorted) {
- btree_path_idx_t idx = trans->sorted[i];
-
- /*
- * Traversing a path can cause another path to be added at about
- * the same position:
- */
- if (trans->paths[idx].uptodate) {
- __btree_path_get(trans, &trans->paths[idx], false);
- ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_);
- __btree_path_put(trans, &trans->paths[idx], false);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
- bch2_err_matches(ret, ENOMEM))
- goto retry_all;
- if (ret)
- goto err;
- } else {
- i++;
- }
- }
-
- /*
- * We used to assert that all paths had been traversed here
- * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
- * path->should_be_locked is not set yet, we might have unlocked and
- * then failed to relock a path - that's fine.
- */
-err:
- bch2_btree_cache_cannibalize_unlock(trans);
-
- trans->in_traverse_all = false;
-
- trace_and_count(c, trans_traverse_all, trans, trace_ip);
- return ret;
-}
-
-static inline bool btree_path_check_pos_in_node(struct btree_path *path,
- unsigned l, int check_pos)
-{
- if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
- return false;
- if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
- return false;
- return true;
-}
-
-static inline bool btree_path_good_node(struct btree_trans *trans,
- struct btree_path *path,
- unsigned l, int check_pos)
-{
- return is_btree_node(path, l) &&
- bch2_btree_node_relock(trans, path, l) &&
- btree_path_check_pos_in_node(path, l, check_pos);
-}
-
-static void btree_path_set_level_down(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_level)
-{
- unsigned l;
-
- path->level = new_level;
-
- for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
- if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
- btree_node_unlock(trans, path, l);
-
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
- bch2_btree_path_verify(trans, path);
-}
-
-static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
- struct btree_path *path,
- int check_pos)
-{
- unsigned i, l = path->level;
-again:
- while (btree_path_node(path, l) &&
- !btree_path_good_node(trans, path, l, check_pos))
- __btree_path_set_level_up(trans, path, l++);
-
- /* If we need intent locks, take them too: */
- for (i = l + 1;
- i < path->locks_want && btree_path_node(path, i);
- i++)
- if (!bch2_btree_node_relock(trans, path, i)) {
- while (l <= i)
- __btree_path_set_level_up(trans, path, l++);
- goto again;
- }
-
- return l;
-}
-
-static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
- struct btree_path *path,
- int check_pos)
-{
- return likely(btree_node_locked(path, path->level) &&
- btree_path_check_pos_in_node(path, path->level, check_pos))
- ? path->level
- : __btree_path_up_until_good_node(trans, path, check_pos);
-}
-
-/*
- * This is the main state machine for walking down the btree - walks down to a
- * specified depth
- *
- * Returns 0 on success, -EIO on error (error reading in a btree node).
- *
- * On error, caller (peek_node()/peek_key()) must return NULL; the error is
- * stashed in the iterator and returned from bch2_trans_exit().
- */
-int bch2_btree_path_traverse_one(struct btree_trans *trans,
- btree_path_idx_t path_idx,
- unsigned flags,
- unsigned long trace_ip)
-{
- struct btree_path *path = &trans->paths[path_idx];
- unsigned depth_want = path->level;
- int ret = -((int) trans->restarted);
-
- if (unlikely(ret))
- goto out;
-
- if (unlikely(!trans->srcu_held))
- bch2_trans_srcu_lock(trans);
-
- trace_btree_path_traverse_start(trans, path);
-
- /*
- * Ensure we obey path->should_be_locked: if it's set, we can't unlock
- * and re-traverse the path without a transaction restart:
- */
- if (path->should_be_locked) {
- ret = bch2_btree_path_relock(trans, path, trace_ip);
- goto out;
- }
-
- if (path->cached) {
- ret = bch2_btree_path_traverse_cached(trans, path, flags);
- goto out;
- }
-
- path = &trans->paths[path_idx];
-
- if (unlikely(path->level >= BTREE_MAX_DEPTH))
- goto out_uptodate;
-
- path->level = btree_path_up_until_good_node(trans, path, 0);
- unsigned max_level = path->level;
-
- EBUG_ON(btree_path_node(path, path->level) &&
- !btree_node_locked(path, path->level));
-
-	/*
-	 * Note: path->l[path->level].b may be temporarily NULL here - that
-	 * would normally indicate to other code that we got to the end of the
-	 * btree; here it indicates that relocking the root failed - it's
-	 * critical that btree_path_lock_root() comes next and that it can't
-	 * fail.
-	 */
- while (path->level > depth_want) {
- ret = btree_path_node(path, path->level)
- ? btree_path_down(trans, path, flags, trace_ip)
- : btree_path_lock_root(trans, path, depth_want, trace_ip);
- if (unlikely(ret)) {
- if (ret == 1) {
- /*
- * No nodes at this level - got to the end of
- * the btree:
- */
- ret = 0;
- goto out;
- }
-
- __bch2_btree_path_unlock(trans, path);
- path->level = depth_want;
- path->l[path->level].b = ERR_PTR(ret);
- goto out;
- }
- }
-
- if (unlikely(max_level > path->level)) {
- struct btree_path *linked;
- unsigned iter;
-
- trans_for_each_path_with_node(trans, path_l(path)->b, linked, iter)
- for (unsigned j = path->level + 1; j < max_level; j++)
- linked->l[j] = path->l[j];
- }
-
-out_uptodate:
- path->uptodate = BTREE_ITER_UPTODATE;
- trace_btree_path_traverse_end(trans, path);
-out:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
- panic("ret %s (%i) trans->restarted %s (%i)\n",
- bch2_err_str(ret), ret,
- bch2_err_str(trans->restarted), trans->restarted);
- bch2_btree_path_verify(trans, path);
- return ret;
-}
-
-static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
- struct btree_path *src)
-{
- unsigned i, offset = offsetof(struct btree_path, pos);
-
- memcpy((void *) dst + offset,
- (void *) src + offset,
- sizeof(struct btree_path) - offset);
-
- for (i = 0; i < BTREE_MAX_DEPTH; i++) {
- unsigned t = btree_node_locked_type(dst, i);
-
- if (t != BTREE_NODE_UNLOCKED)
- six_lock_increment(&dst->l[i].b->c.lock, t);
- }
-}
-
-static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src,
- bool intent, unsigned long ip)
-{
- btree_path_idx_t new = btree_path_alloc(trans, src);
- btree_path_copy(trans, trans->paths + new, trans->paths + src);
- __btree_path_get(trans, trans->paths + new, intent);
-#ifdef TRACK_PATH_ALLOCATED
- trans->paths[new].ip_allocated = ip;
-#endif
- return new;
-}
-
-__flatten
-btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans,
- btree_path_idx_t path, bool intent, unsigned long ip)
-{
- struct btree_path *old = trans->paths + path;
- __btree_path_put(trans, trans->paths + path, intent);
- path = btree_path_clone(trans, path, intent, ip);
- trace_btree_path_clone(trans, old, trans->paths + path);
- trans->paths[path].preserve = false;
- return path;
-}
-
-btree_path_idx_t __must_check
-__bch2_btree_path_set_pos(struct btree_trans *trans,
- btree_path_idx_t path_idx, struct bpos new_pos,
- bool intent, unsigned long ip)
-{
- int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos);
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- EBUG_ON(!trans->paths[path_idx].ref);
-
- trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos);
-
- path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip);
-
- struct btree_path *path = trans->paths + path_idx;
- path->pos = new_pos;
- trans->paths_sorted = false;
-
- if (unlikely(path->cached)) {
- btree_node_unlock(trans, path, 0);
- path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
- goto out;
- }
-
- unsigned level = btree_path_up_until_good_node(trans, path, cmp);
-
- if (btree_path_node(path, level)) {
- struct btree_path_level *l = &path->l[level];
-
- BUG_ON(!btree_node_locked(path, level));
- /*
- * We might have to skip over many keys, or just a few: try
- * advancing the node iterator, and if we have to skip over too
- * many keys just reinit it (or if we're rewinding, since that
- * is expensive).
- */
- if (cmp < 0 ||
- !btree_path_advance_to_pos(path, l, 8))
- bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
-
-		/*
-		 * Iterators to interior nodes should always be pointed at the
-		 * first non-whiteout:
-		 */
- if (unlikely(level))
- bch2_btree_node_iter_peek(&l->iter, l->b);
- }
-
- if (unlikely(level != path->level)) {
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
- __bch2_btree_path_unlock(trans, path);
- }
-out:
- bch2_btree_path_verify(trans, path);
- return path_idx;
-}
-
-/* Btree path: main interface: */
-
-static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
-{
- struct btree_path *sib;
-
- sib = prev_btree_path(trans, path);
- if (sib && !btree_path_cmp(sib, path))
- return sib;
-
- sib = next_btree_path(trans, path);
- if (sib && !btree_path_cmp(sib, path))
- return sib;
-
- return NULL;
-}
-
-static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
-{
- struct btree_path *sib;
-
- sib = prev_btree_path(trans, path);
- if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
- return sib;
-
- sib = next_btree_path(trans, path);
- if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
- return sib;
-
- return NULL;
-}
-
-static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t path)
-{
- __bch2_btree_path_unlock(trans, trans->paths + path);
- btree_path_list_remove(trans, trans->paths + path);
- __clear_bit(path, trans->paths_allocated);
-}
-
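-/*
- * Check whether a path could be relocked: every level up to locks_want must
- * still point at a live btree node whose lock sequence number hasn't changed
- * since we last held the lock:
- */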
-static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_path *path)
-{
- unsigned l = path->level;
-
- do {
- if (!btree_path_node(path, l))
- break;
-
- if (!is_btree_node(path, l))
- return false;
-
- if (path->l[l].lock_seq != path->l[l].b->c.lock.seq)
- return false;
-
- l++;
- } while (l < path->locks_want);
-
- return true;
-}
-
-void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent)
-{
- struct btree_path *path = trans->paths + path_idx, *dup;
-
- if (!__btree_path_put(trans, path, intent))
- return;
-
- dup = path->preserve
- ? have_path_at_pos(trans, path)
- : have_node_at_pos(trans, path);
-
- trace_btree_path_free(trans, path_idx, dup);
-
- if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
- return;
-
- if (path->should_be_locked && !trans->restarted) {
- if (!dup)
- return;
-
- if (!(trans->locked
- ? bch2_btree_path_relock_norestart(trans, dup)
- : bch2_btree_path_can_relock(trans, dup)))
- return;
- }
-
- if (dup) {
- dup->preserve |= path->preserve;
- dup->should_be_locked |= path->should_be_locked;
- }
-
- __bch2_path_free(trans, path_idx);
-}
-
-static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path,
- bool intent)
-{
- if (!__btree_path_put(trans, trans->paths + path, intent))
- return;
-
- __bch2_path_free(trans, path);
-}
-
-void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
-{
- panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
- trans->restart_count, restart_count,
- (void *) trans->last_begin_ip);
-}
-
-static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct printbuf buf = PRINTBUF;
- bch2_prt_backtrace(&buf, &trans->last_restarted_trace);
- panic("in transaction restart: %s, last restarted by\n%s",
- bch2_err_str(trans->restarted),
- buf.buf);
-#else
- panic("in transaction restart: %s, last restarted by %pS\n",
- bch2_err_str(trans->restarted),
- (void *) trans->last_restarted_ip);
-#endif
-}
-
-void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *trans)
-{
- if (trans->restarted)
- bch2_trans_in_restart_error(trans);
-
- if (!trans->locked)
- panic("trans should be locked, unlocked by %pS\n",
- (void *) trans->last_unlock_ip);
-
- BUG();
-}
-
-noinline __cold
-void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
-{
- prt_printf(buf, "%u transaction updates for %s journal seq %llu\n",
- trans->nr_updates, trans->fn, trans->journal_res.seq);
- printbuf_indent_add(buf, 2);
-
- trans_for_each_update(trans, i) {
- struct bkey_s_c old = { &i->old_k, i->old_v };
-
- prt_str(buf, "update: btree=");
- bch2_btree_id_to_text(buf, i->btree_id);
- prt_printf(buf, " cached=%u %pS\n",
- i->cached,
- (void *) i->ip_allocated);
-
- prt_printf(buf, " old ");
- bch2_bkey_val_to_text(buf, trans->c, old);
- prt_newline(buf);
-
- prt_printf(buf, " new ");
- bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
- prt_newline(buf);
- }
-
- for (struct jset_entry *e = trans->journal_entries;
- e != btree_trans_journal_entries_top(trans);
- e = vstruct_next(e))
- bch2_journal_entry_to_text(buf, trans->c, e);
-
- printbuf_indent_sub(buf, 2);
-}
-
-noinline __cold
-void bch2_dump_trans_updates(struct btree_trans *trans)
-{
- struct printbuf buf = PRINTBUF;
-
- bch2_trans_updates_to_text(&buf, trans);
- bch2_print_str(trans->c, buf.buf);
- printbuf_exit(&buf);
-}
-
-static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
-{
- struct btree_path *path = trans->paths + path_idx;
-
- prt_printf(out, "path: idx %3u ref %u:%u %c %c %c ",
- path_idx, path->ref, path->intent_ref,
- path->preserve ? 'P' : ' ',
- path->should_be_locked ? 'S' : ' ',
- path->cached ? 'C' : 'B');
- bch2_btree_id_level_to_text(out, path->btree_id, path->level);
- prt_str(out, " pos ");
- bch2_bpos_to_text(out, path->pos);
-
- if (!path->cached && btree_node_locked(path, path->level)) {
- prt_char(out, ' ');
- struct btree *b = path_l(path)->b;
- bch2_bpos_to_text(out, b->data->min_key);
- prt_char(out, '-');
- bch2_bpos_to_text(out, b->key.k.p);
- }
-
-#ifdef TRACK_PATH_ALLOCATED
- prt_printf(out, " %pS", (void *) path->ip_allocated);
-#endif
-}
-
-static const char *btree_node_locked_str(enum btree_node_locked_type t)
-{
- switch (t) {
- case BTREE_NODE_UNLOCKED:
- return "unlocked";
- case BTREE_NODE_READ_LOCKED:
- return "read";
- case BTREE_NODE_INTENT_LOCKED:
- return "intent";
- case BTREE_NODE_WRITE_LOCKED:
- return "write";
- default:
- return NULL;
- }
-}
-
-void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
-{
- bch2_btree_path_to_text_short(out, trans, path_idx);
-
- struct btree_path *path = trans->paths + path_idx;
-
- prt_printf(out, " uptodate %u locks_want %u", path->uptodate, path->locks_want);
- prt_newline(out);
-
- printbuf_indent_add(out, 2);
- for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
- prt_printf(out, "l=%u locks %s seq %u node ", l,
- btree_node_locked_str(btree_node_locked_type(path, l)),
- path->l[l].lock_seq);
-
- int ret = PTR_ERR_OR_ZERO(path->l[l].b);
- if (ret)
- prt_str(out, bch2_err_str(ret));
- else
- prt_printf(out, "%px", path->l[l].b);
- prt_newline(out);
- }
- printbuf_indent_sub(out, 2);
-}
-
-static noinline __cold
-void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
- bool nosort)
-{
- struct trans_for_each_path_inorder_iter iter;
-
- if (!nosort)
- btree_trans_sort_paths(trans);
-
- trans_for_each_path_idx_inorder(trans, iter) {
- bch2_btree_path_to_text_short(out, trans, iter.path_idx);
- prt_newline(out);
- }
-}
-
-noinline __cold
-void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
-{
- __bch2_trans_paths_to_text(out, trans, false);
-}
-
-static noinline __cold
-void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
-{
- struct printbuf buf = PRINTBUF;
-
- __bch2_trans_paths_to_text(&buf, trans, nosort);
- bch2_trans_updates_to_text(&buf, trans);
-
- bch2_print_str(trans->c, buf.buf);
- printbuf_exit(&buf);
-}
-
-noinline __cold
-void bch2_dump_trans_paths_updates(struct btree_trans *trans)
-{
- __bch2_dump_trans_paths_updates(trans, false);
-}
-
-noinline __cold
-static void bch2_trans_update_max_paths(struct btree_trans *trans)
-{
- struct btree_transaction_stats *s = btree_trans_stats(trans);
- struct printbuf buf = PRINTBUF;
- size_t nr = bitmap_weight(trans->paths_allocated, trans->nr_paths);
-
- bch2_trans_paths_to_text(&buf, trans);
-
- if (!buf.allocation_failure) {
- mutex_lock(&s->lock);
- if (nr > s->nr_max_paths) {
- s->nr_max_paths = nr;
- swap(s->max_paths_text, buf.buf);
- }
- mutex_unlock(&s->lock);
- }
-
- printbuf_exit(&buf);
-
- trans->nr_paths_max = nr;
-}
-
-noinline __cold
-int __bch2_btree_trans_too_many_iters(struct btree_trans *trans)
-{
- if (trace_trans_restart_too_many_iters_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_trans_paths_to_text(&buf, trans);
- trace_trans_restart_too_many_iters(trans, _THIS_IP_, buf.buf);
- printbuf_exit(&buf);
- }
-
- count_event(trans->c, trans_restart_too_many_iters);
-
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
-}
-
-static noinline void btree_path_overflow(struct btree_trans *trans)
-{
- bch2_dump_trans_paths_updates(trans);
- bch_err(trans->c, "trans path overflow");
-}
-
-static noinline void btree_paths_realloc(struct btree_trans *trans)
-{
- unsigned nr = trans->nr_paths * 2;
-
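-	/*
-	 * The paths bitmap, the paths array (preceded by a header recording
-	 * its size), the sorted index array and the updates array all live in
-	 * one allocation, which is carved up below:
-	 */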
- void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
- sizeof(struct btree_trans_paths) +
- nr * sizeof(struct btree_path) +
- nr * sizeof(btree_path_idx_t) + 8 +
- nr * sizeof(struct btree_insert_entry), GFP_KERNEL|__GFP_NOFAIL);
-
- unsigned long *paths_allocated = p;
- memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long));
- p += BITS_TO_LONGS(nr) * sizeof(unsigned long);
-
- p += sizeof(struct btree_trans_paths);
- struct btree_path *paths = p;
- *trans_paths_nr(paths) = nr;
- memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path));
- p += nr * sizeof(struct btree_path);
-
- btree_path_idx_t *sorted = p;
- memcpy(sorted, trans->sorted, trans->nr_sorted * sizeof(btree_path_idx_t));
- p += nr * sizeof(btree_path_idx_t) + 8;
-
- struct btree_insert_entry *updates = p;
- memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_insert_entry));
-
- unsigned long *old = trans->paths_allocated;
-
- rcu_assign_pointer(trans->paths_allocated, paths_allocated);
- rcu_assign_pointer(trans->paths, paths);
- rcu_assign_pointer(trans->sorted, sorted);
- rcu_assign_pointer(trans->updates, updates);
-
- trans->nr_paths = nr;
-
- if (old != trans->_paths_allocated)
- kfree_rcu_mightsleep(old);
-}
-
-static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans,
- btree_path_idx_t pos)
-{
- btree_path_idx_t idx = find_first_zero_bit(trans->paths_allocated, trans->nr_paths);
-
- if (unlikely(idx == trans->nr_paths)) {
- if (trans->nr_paths == BTREE_ITER_MAX) {
- btree_path_overflow(trans);
- return 0;
- }
-
- btree_paths_realloc(trans);
- }
-
- /*
- * Do this before marking the new path as allocated, since it won't be
- * initialized yet:
- */
- if (unlikely(idx > trans->nr_paths_max))
- bch2_trans_update_max_paths(trans);
-
- __set_bit(idx, trans->paths_allocated);
-
- struct btree_path *path = &trans->paths[idx];
- path->ref = 0;
- path->intent_ref = 0;
- path->nodes_locked = 0;
-
- btree_path_list_add(trans, pos, idx);
- trans->paths_sorted = false;
- return idx;
-}
-
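-/*
- * Get a path to @pos: if we already have a path at the same position (same
- * btree, cached/uncached, level), take a new reference to it instead of
- * allocating a new one:
- */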
-btree_path_idx_t bch2_path_get(struct btree_trans *trans,
- enum btree_id btree_id, struct bpos pos,
- unsigned locks_want, unsigned level,
- unsigned flags, unsigned long ip)
-{
- struct btree_path *path;
- bool cached = flags & BTREE_ITER_cached;
- bool intent = flags & BTREE_ITER_intent;
- struct trans_for_each_path_inorder_iter iter;
- btree_path_idx_t path_pos = 0, path_idx;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- bch2_trans_verify_locks(trans);
-
- btree_trans_sort_paths(trans);
-
- trans_for_each_path_inorder(trans, path, iter) {
- if (__btree_path_cmp(path,
- btree_id,
- cached,
- pos,
- level) > 0)
- break;
-
- path_pos = iter.path_idx;
- }
-
- if (path_pos &&
- trans->paths[path_pos].cached == cached &&
- trans->paths[path_pos].btree_id == btree_id &&
- trans->paths[path_pos].level == level) {
- trace_btree_path_get(trans, trans->paths + path_pos, &pos);
-
- __btree_path_get(trans, trans->paths + path_pos, intent);
- path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
- path = trans->paths + path_idx;
- } else {
- path_idx = btree_path_alloc(trans, path_pos);
- path = trans->paths + path_idx;
-
- __btree_path_get(trans, path, intent);
- path->pos = pos;
- path->btree_id = btree_id;
- path->cached = cached;
- path->uptodate = BTREE_ITER_NEED_TRAVERSE;
- path->should_be_locked = false;
- path->level = level;
- path->locks_want = locks_want;
- path->nodes_locked = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(path->l); i++)
- path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
-#ifdef TRACK_PATH_ALLOCATED
- path->ip_allocated = ip;
-#endif
- trans->paths_sorted = false;
-
- trace_btree_path_alloc(trans, path);
- }
-
- if (!(flags & BTREE_ITER_nopreserve))
- path->preserve = true;
-
- if (path->intent_ref)
- locks_want = max(locks_want, level + 1);
-
-	/*
-	 * If the path already has locks_want greater than requested, we don't
-	 * downgrade it here - after a transaction restart (e.g. because a
-	 * btree node split needed to upgrade locks), we might be
-	 * putting/getting the iterator again. Downgrading iterators only
-	 * happens via bch2_trans_downgrade(), after a successful transaction
-	 * commit.
-	 */
-
- locks_want = min(locks_want, BTREE_MAX_DEPTH);
- if (locks_want > path->locks_want)
- bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
-
- return path_idx;
-}
-
-btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *trans,
- enum btree_id btree_id,
- unsigned level,
- struct bpos pos)
-{
- btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level,
- BTREE_ITER_nopreserve|
- BTREE_ITER_intent, _RET_IP_);
- path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_);
-
- struct btree_path *path = trans->paths + path_idx;
- bch2_btree_path_downgrade(trans, path);
- __bch2_btree_path_unlock(trans, path);
- return path_idx;
-}
-
-struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
-{
- struct btree_path_level *l = path_l(path);
- struct bkey_packed *_k;
- struct bkey_s_c k;
-
- if (unlikely(!l->b))
- return bkey_s_c_null;
-
- EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
- EBUG_ON(!btree_node_locked(path, path->level));
-
- if (!path->cached) {
- _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
- k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
-
- EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos));
-
- if (!k.k || !bpos_eq(path->pos, k.k->p))
- goto hole;
- } else {
- struct bkey_cached *ck = (void *) path->l[0].b;
- if (!ck)
- return bkey_s_c_null;
-
- EBUG_ON(path->btree_id != ck->key.btree_id ||
- !bkey_eq(path->pos, ck->key.pos));
-
- *u = ck->k->k;
- k = (struct bkey_s_c) { u, &ck->k->v };
- }
-
- return k;
-hole:
- bkey_init(u);
- u->p = path->pos;
- return (struct bkey_s_c) { u, NULL };
-}
-
-void bch2_set_btree_iter_dontneed(struct btree_iter *iter)
-{
- struct btree_trans *trans = iter->trans;
-
- if (!iter->path || trans->restarted)
- return;
-
- struct btree_path *path = btree_iter_path(trans, iter);
- path->preserve = false;
- if (path->ref == 1)
- path->should_be_locked = false;
-}
-
-/* Btree iterators: */
-
-int __must_check
-__bch2_btree_iter_traverse(struct btree_iter *iter)
-{
- return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
-}
-
-int __must_check
-bch2_btree_iter_traverse(struct btree_iter *iter)
-{
- struct btree_trans *trans = iter->trans;
- int ret;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- iter->path = bch2_btree_path_set_pos(trans, iter->path,
- btree_iter_search_key(iter),
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
- if (ret)
- return ret;
-
- struct btree_path *path = btree_iter_path(trans, iter);
- if (btree_path_node(path, path->level))
- btree_path_set_should_be_locked(trans, path);
- return 0;
-}
-
-/* Iterate across nodes (leaf and interior nodes) */
-
-struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
-{
- struct btree_trans *trans = iter->trans;
- struct btree *b = NULL;
- int ret;
-
- EBUG_ON(trans->paths[iter->path].cached);
- bch2_btree_iter_verify(iter);
-
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (ret)
- goto err;
-
- struct btree_path *path = btree_iter_path(trans, iter);
- b = btree_path_node(path, path->level);
- if (!b)
- goto out;
-
- BUG_ON(bpos_lt(b->key.k.p, iter->pos));
-
- bkey_init(&iter->k);
- iter->k.p = iter->pos = b->key.k.p;
-
- iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
-out:
- bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
-
- return b;
-err:
- b = ERR_PTR(ret);
- goto out;
-}
-
-/* Only kept for -tools */
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
-{
- struct btree *b;
-
- while (b = bch2_btree_iter_peek_node(iter),
- bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
- bch2_trans_begin(iter->trans);
-
- return b;
-}
-
-struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
-{
- struct btree_trans *trans = iter->trans;
- struct btree *b = NULL;
- int ret;
-
- EBUG_ON(trans->paths[iter->path].cached);
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- bch2_btree_iter_verify(iter);
-
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (ret)
- goto err;
-
- struct btree_path *path = btree_iter_path(trans, iter);
-
- /* already at end? */
- if (!btree_path_node(path, path->level))
- return NULL;
-
- /* got to end? */
- if (!btree_path_node(path, path->level + 1)) {
- btree_path_set_level_up(trans, path);
- return NULL;
- }
-
- if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
- __bch2_btree_path_unlock(trans, path);
- path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
- path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
- trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
- goto err;
- }
-
- b = btree_path_node(path, path->level + 1);
-
- if (bpos_eq(iter->pos, b->key.k.p)) {
- __btree_path_set_level_up(trans, path, path->level++);
- } else {
- if (btree_lock_want(path, path->level + 1) == BTREE_NODE_UNLOCKED)
- btree_node_unlock(trans, path, path->level + 1);
-
- /*
- * Haven't gotten to the end of the parent node: go back down to
- * the next child node
- */
- iter->path = bch2_btree_path_set_pos(trans, iter->path,
- bpos_successor(iter->pos),
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- path = btree_iter_path(trans, iter);
- btree_path_set_level_down(trans, path, iter->min_depth);
-
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (ret)
- goto err;
-
- path = btree_iter_path(trans, iter);
- b = path->l[path->level].b;
- }
-
- bkey_init(&iter->k);
- iter->k.p = iter->pos = b->key.k.p;
-
- iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
- EBUG_ON(btree_iter_path(trans, iter)->uptodate);
-out:
- bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
-
- return b;
-err:
- b = ERR_PTR(ret);
- goto out;
-}
-
-/* Iterate across keys (in leaf nodes only) */
-
-inline bool bch2_btree_iter_advance(struct btree_iter *iter)
-{
- struct bpos pos = iter->k.p;
- bool ret = !(iter->flags & BTREE_ITER_all_snapshots
- ? bpos_eq(pos, SPOS_MAX)
- : bkey_eq(pos, SPOS_MAX));
-
- if (ret && !(iter->flags & BTREE_ITER_is_extents))
- pos = bkey_successor(iter, pos);
- bch2_btree_iter_set_pos(iter, pos);
- return ret;
-}
-
-inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
-{
- struct bpos pos = bkey_start_pos(&iter->k);
- bool ret = !(iter->flags & BTREE_ITER_all_snapshots
- ? bpos_eq(pos, POS_MIN)
- : bkey_eq(pos, POS_MIN));
-
- if (ret && !(iter->flags & BTREE_ITER_is_extents))
- pos = bkey_predecessor(iter, pos);
- bch2_btree_iter_set_pos(iter, pos);
- return ret;
-}
-
-static noinline
-void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c *k)
-{
- struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key;
-
- trans_for_each_update(trans, i)
- if (!i->key_cache_already_flushed &&
- i->btree_id == iter->btree_id &&
- bpos_le(i->k->k.p, iter->pos) &&
- bpos_ge(i->k->k.p, k->k ? k->k->p : end)) {
- iter->k = i->k->k;
- *k = bkey_i_to_s_c(i->k);
- }
-}
-
-static noinline
-void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c *k)
-{
- struct btree_path *path = btree_iter_path(trans, iter);
- struct bpos end = path_l(path)->b->key.k.p;
-
- trans_for_each_update(trans, i)
- if (!i->key_cache_already_flushed &&
- i->btree_id == iter->btree_id &&
- bpos_ge(i->k->k.p, path->pos) &&
- bpos_le(i->k->k.p, k->k ? k->k->p : end)) {
- iter->k = i->k->k;
- *k = bkey_i_to_s_c(i->k);
- }
-}
-
-static noinline
-void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c *k)
-{
- trans_for_each_update(trans, i)
- if (!i->key_cache_already_flushed &&
- i->btree_id == iter->btree_id &&
- bpos_eq(i->k->k.p, iter->pos)) {
- iter->k = i->k->k;
- *k = bkey_i_to_s_c(i->k);
- }
-}
-
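-/*
- * Helpers for BTREE_ITER_with_journal: keys that are still sitting in the
- * journal - i.e. not yet replayed into the btree - are overlaid on top of
- * what the btree iterator returns:
- */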
-static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos end_pos)
-{
- struct btree_path *path = btree_iter_path(trans, iter);
-
- return bch2_journal_keys_peek_max(trans->c, iter->btree_id,
- path->level,
- path->pos,
- end_pos,
- &iter->journal_idx);
-}
-
-static noinline
-struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
- struct btree_iter *iter)
-{
- struct btree_path *path = btree_iter_path(trans, iter);
- struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos);
-
- if (k) {
- iter->k = k->k;
- return bkey_i_to_s_c(k);
- } else {
- return bkey_s_c_null;
- }
-}
-
-static noinline
-void btree_trans_peek_journal(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c *k)
-{
- struct btree_path *path = btree_iter_path(trans, iter);
- struct bkey_i *next_journal =
- bch2_btree_journal_peek(trans, iter,
- k->k ? k->k->p : path_l(path)->b->key.k.p);
- if (next_journal) {
- iter->k = next_journal->k;
- *k = bkey_i_to_s_c(next_journal);
- }
-}
-
-static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos end_pos)
-{
- struct btree_path *path = btree_iter_path(trans, iter);
-
- return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id,
- path->level,
- path->pos,
- end_pos,
- &iter->journal_idx);
-}
-
-static noinline
-void btree_trans_peek_prev_journal(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c *k)
-{
- struct btree_path *path = btree_iter_path(trans, iter);
- struct bkey_i *next_journal =
- bch2_btree_journal_peek_prev(trans, iter,
- k->k ? k->k->p : path_l(path)->b->key.k.p);
-
- if (next_journal) {
- iter->k = next_journal->k;
- *k = bkey_i_to_s_c(next_journal);
- }
-}
-
-/*
- * Checks the btree key cache for a key at iter->pos and returns it if
- * present, or bkey_s_c_null:
- */
-static noinline
-struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
-{
- struct btree_trans *trans = iter->trans;
- struct bch_fs *c = trans->c;
- struct bkey u;
- struct bkey_s_c k;
- int ret;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- if ((iter->flags & BTREE_ITER_key_cache_fill) &&
- bpos_eq(iter->pos, pos))
- return bkey_s_c_null;
-
- if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
- return bkey_s_c_null;
-
- if (!iter->key_cache_path)
- iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
- iter->flags & BTREE_ITER_intent, 0,
- iter->flags|BTREE_ITER_cached|
- BTREE_ITER_cached_nofill,
- _THIS_IP_);
-
- iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
- iter->flags|BTREE_ITER_cached) ?:
- bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
-
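-/*
- * Copy everything from the pos field onwards - dst keeps its own fields that
- * precede it, such as its refcounts - then take an additional reference on
- * each node lock src holds, since dst now holds those locks too:
- */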
- k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
- if (!k.k)
- return k;
-
- if ((iter->flags & BTREE_ITER_all_snapshots) &&
- !bpos_eq(pos, k.k->p))
- return bkey_s_c_null;
-
- iter->k = u;
- k.k = &iter->k;
- btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
- return k;
-}
-
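-/*
- * Core of the forwards iteration loop: walk to the search key, peek at the
- * current leaf, then overlay newer sources (key cache, journal keys, pending
- * transaction updates); whiteouts advance the search key and retry, and
- * hitting the end of a leaf advances to the next one:
- */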
-static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
-{
- struct btree_trans *trans = iter->trans;
- struct bkey_s_c k, k2;
- int ret;
-
- EBUG_ON(btree_iter_path(trans, iter)->cached);
- bch2_btree_iter_verify(iter);
-
- while (1) {
- iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (unlikely(ret)) {
- /* ensure that iter->k is consistent with iter->pos: */
- bch2_btree_iter_set_pos(iter, iter->pos);
- k = bkey_s_c_err(ret);
- break;
- }
-
- struct btree_path *path = btree_iter_path(trans, iter);
- struct btree_path_level *l = path_l(path);
-
- if (unlikely(!l->b)) {
- /* No btree nodes at requested level: */
- bch2_btree_iter_set_pos(iter, SPOS_MAX);
- k = bkey_s_c_null;
- break;
- }
-
- btree_path_set_should_be_locked(trans, path);
-
- k = btree_path_level_peek_all(trans->c, l, &iter->k);
-
- if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
- k.k &&
- (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
- k = k2;
- if (bkey_err(k)) {
- bch2_btree_iter_set_pos(iter, iter->pos);
- break;
- }
- }
-
- if (unlikely(iter->flags & BTREE_ITER_with_journal))
- btree_trans_peek_journal(trans, iter, &k);
-
- if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
- trans->nr_updates))
- bch2_btree_trans_peek_updates(trans, iter, &k);
-
- if (k.k && bkey_deleted(k.k)) {
-			/*
-			 * If we've got a whiteout, and it's after the search
-			 * key, advance the search key to the whiteout instead
-			 * of just after the whiteout - it might be a btree
-			 * whiteout, with a real key at the same position, since
-			 * in the btree deleted keys sort before non-deleted
-			 * keys.
-			 */
- search_key = !bpos_eq(search_key, k.k->p)
- ? k.k->p
- : bpos_successor(k.k->p);
- continue;
- }
-
- if (likely(k.k)) {
- break;
- } else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) {
- /* Advance to next leaf node: */
- search_key = bpos_successor(l->b->key.k.p);
- } else {
- /* End of btree: */
- bch2_btree_iter_set_pos(iter, SPOS_MAX);
- k = bkey_s_c_null;
- break;
- }
- }
-
- bch2_btree_iter_verify(iter);
- return k;
-}
-
-/**
- * bch2_btree_iter_peek_max() - returns first key greater than or equal to
- * iterator's current position
- * @iter: iterator to peek from
- * @end: search limit: returns keys less than or equal to @end
- *
- * Returns: key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos end)
-{
- struct btree_trans *trans = iter->trans;
- struct bpos search_key = btree_iter_search_key(iter);
- struct bkey_s_c k;
- struct bpos iter_pos = iter->pos;
- int ret;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- bch2_btree_iter_verify_entry_exit(iter);
- EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
-
- ret = trans_maybe_inject_restart(trans, _RET_IP_);
- if (unlikely(ret)) {
- k = bkey_s_c_err(ret);
- goto out_no_locked;
- }
-
- if (iter->update_path) {
- bch2_path_put_nokeep(trans, iter->update_path,
- iter->flags & BTREE_ITER_intent);
- iter->update_path = 0;
- }
-
- while (1) {
- k = __bch2_btree_iter_peek(iter, search_key);
- if (unlikely(!k.k))
- goto end;
- if (unlikely(bkey_err(k)))
- goto out_no_locked;
-
- if (iter->flags & BTREE_ITER_filter_snapshots) {
- /*
- * We need to check against @end before FILTER_SNAPSHOTS because
-			 * if we get to a different inode than the one requested we might be
- * seeing keys for a different snapshot tree that will all be
- * filtered out.
- *
- * But we can't do the full check here, because bkey_start_pos()
- * isn't monotonically increasing before FILTER_SNAPSHOTS, and
- * that's what we check against in extents mode:
- */
- if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
- ? bkey_gt(k.k->p, end)
- : k.k->p.inode > end.inode))
- goto end;
-
- if (iter->update_path &&
- !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
- bch2_path_put_nokeep(trans, iter->update_path,
- iter->flags & BTREE_ITER_intent);
- iter->update_path = 0;
- }
-
- if ((iter->flags & BTREE_ITER_intent) &&
- !(iter->flags & BTREE_ITER_is_extents) &&
- !iter->update_path) {
- struct bpos pos = k.k->p;
-
- if (pos.snapshot < iter->snapshot) {
- search_key = bpos_successor(k.k->p);
- continue;
- }
-
- pos.snapshot = iter->snapshot;
-
- /*
- * advance, same as on exit for iter->path, but only up
- * to snapshot
- */
- __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
- iter->update_path = iter->path;
-
- iter->update_path = bch2_btree_path_set_pos(trans,
- iter->update_path, pos,
- iter->flags & BTREE_ITER_intent,
- _THIS_IP_);
- ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
- if (unlikely(ret)) {
- k = bkey_s_c_err(ret);
- goto out_no_locked;
- }
- }
-
- /*
- * We can never have a key in a leaf node at POS_MAX, so
- * we don't have to check these successor() calls:
- */
- if (!bch2_snapshot_is_ancestor(trans->c,
- iter->snapshot,
- k.k->p.snapshot)) {
- search_key = bpos_successor(k.k->p);
- continue;
- }
-
- if (bkey_whiteout(k.k) &&
- !(iter->flags & BTREE_ITER_key_cache_fill)) {
- search_key = bkey_successor(iter, k.k->p);
- continue;
- }
- }
-
- /*
-		 * iter->pos should be monotonically increasing, and always be
- * equal to the key we just returned - except extents can
- * straddle iter->pos:
- */
- if (!(iter->flags & BTREE_ITER_is_extents))
- iter_pos = k.k->p;
- else
- iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
-
- if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(iter_pos, end) :
- iter->flags & BTREE_ITER_is_extents ? bkey_ge(iter_pos, end) :
- bkey_gt(iter_pos, end)))
- goto end;
-
- break;
- }
-
- iter->pos = iter_pos;
-
- iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
-out_no_locked:
- if (iter->update_path) {
- ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_);
- if (unlikely(ret))
- k = bkey_s_c_err(ret);
- else
- btree_path_set_should_be_locked(trans, trans->paths + iter->update_path);
- }
-
- if (!(iter->flags & BTREE_ITER_all_snapshots))
- iter->pos.snapshot = iter->snapshot;
-
- ret = bch2_btree_iter_verify_ret(iter, k);
- if (unlikely(ret)) {
- bch2_btree_iter_set_pos(iter, iter->pos);
- k = bkey_s_c_err(ret);
- }
-
- bch2_btree_iter_verify_entry_exit(iter);
-
- return k;
-end:
- bch2_btree_iter_set_pos(iter, end);
- k = bkey_s_c_null;
- goto out_no_locked;
-}
-
-/**
- * bch2_btree_iter_next() - returns first key greater than iterator's current
- * position
- * @iter: iterator to peek from
- *
- * Returns: key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
-{
- if (!bch2_btree_iter_advance(iter))
- return bkey_s_c_null;
-
- return bch2_btree_iter_peek(iter);
-}
-
-static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key)
-{
- struct btree_trans *trans = iter->trans;
- struct bkey_s_c k, k2;
-
- bch2_btree_iter_verify(iter);
-
- while (1) {
- iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (unlikely(ret)) {
- /* ensure that iter->k is consistent with iter->pos: */
- bch2_btree_iter_set_pos(iter, iter->pos);
- k = bkey_s_c_err(ret);
- break;
- }
-
- struct btree_path *path = btree_iter_path(trans, iter);
- struct btree_path_level *l = path_l(path);
-
- if (unlikely(!l->b)) {
- /* No btree nodes at requested level: */
- bch2_btree_iter_set_pos(iter, SPOS_MAX);
- k = bkey_s_c_null;
- break;
- }
-
- btree_path_set_should_be_locked(trans, path);
-
- k = btree_path_level_peek_all(trans->c, l, &iter->k);
- if (!k.k || bpos_gt(k.k->p, search_key)) {
- k = btree_path_level_prev(trans, path, l, &iter->k);
-
- BUG_ON(k.k && bpos_gt(k.k->p, search_key));
- }
-
- if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
- k.k &&
- (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
- k = k2;
- if (bkey_err(k2)) {
- bch2_btree_iter_set_pos(iter, iter->pos);
- break;
- }
- }
-
- if (unlikely(iter->flags & BTREE_ITER_with_journal))
- btree_trans_peek_prev_journal(trans, iter, &k);
-
- if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
- trans->nr_updates))
- bch2_btree_trans_peek_prev_updates(trans, iter, &k);
-
- if (likely(k.k && !bkey_deleted(k.k))) {
- break;
- } else if (k.k) {
- search_key = bpos_predecessor(k.k->p);
- } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
- /* Advance to previous leaf node: */
- search_key = bpos_predecessor(path->l[0].b->data->min_key);
- } else {
- /* Start of btree: */
- bch2_btree_iter_set_pos(iter, POS_MIN);
- k = bkey_s_c_null;
- break;
- }
- }
-
- bch2_btree_iter_verify(iter);
- return k;
-}
-
-/**
- * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to
- * iterator's current position
- * @iter: iterator to peek from
- * @end: search limit: returns keys greater than or equal to @end
- *
- * Returns: key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end)
-{
- if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) &&
- !bkey_eq(iter->pos, POS_MAX)) {
- /*
- * bkey_start_pos(), for extents, is not monotonically
- * increasing until after filtering for snapshots:
- *
-		 * Thus, for extents we need to search forward until we find a
-		 * real, visible extent - easiest to just use peek_slot() (which
-		 * internally uses peek() for extents).
- */
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
- if (bkey_err(k))
- return k;
-
- if (!bkey_deleted(k.k) &&
- (!(iter->flags & BTREE_ITER_is_extents) ||
- bkey_lt(bkey_start_pos(k.k), iter->pos)))
- return k;
- }
-
- struct btree_trans *trans = iter->trans;
- struct bpos search_key = iter->pos;
- struct bkey_s_c k;
- btree_path_idx_t saved_path = 0;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- bch2_btree_iter_verify_entry_exit(iter);
- EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN));
-
- int ret = trans_maybe_inject_restart(trans, _RET_IP_);
- if (unlikely(ret)) {
- k = bkey_s_c_err(ret);
- goto out_no_locked;
- }
-
- while (1) {
- k = __bch2_btree_iter_peek_prev(iter, search_key);
- if (unlikely(!k.k))
- goto end;
- if (unlikely(bkey_err(k)))
- goto out_no_locked;
-
- if (iter->flags & BTREE_ITER_filter_snapshots) {
- struct btree_path *s = saved_path ? trans->paths + saved_path : NULL;
- if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) {
- /*
- * If we have a saved candidate, and we're past
- * the last possible snapshot overwrite, return
- * it:
- */
- bch2_path_put_nokeep(trans, iter->path,
- iter->flags & BTREE_ITER_intent);
- iter->path = saved_path;
- saved_path = 0;
- k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
- break;
- }
-
- /*
- * We need to check against @end before FILTER_SNAPSHOTS because
-			 * if we get to a different inode than the one requested we might be
- * seeing keys for a different snapshot tree that will all be
- * filtered out.
- */
- if (unlikely(bkey_lt(k.k->p, end)))
- goto end;
-
- if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) {
- search_key = bpos_predecessor(k.k->p);
- continue;
- }
-
- if (k.k->p.snapshot != iter->snapshot) {
-				/*
-				 * We have a key visible in iter->snapshot, but
-				 * it might have overwrites: save it and keep
-				 * searching - unless it's a whiteout, in which
-				 * case drop our previously saved candidate:
-				 */
- if (saved_path) {
- bch2_path_put_nokeep(trans, saved_path,
- iter->flags & BTREE_ITER_intent);
- saved_path = 0;
- }
-
- if (!bkey_whiteout(k.k)) {
- saved_path = btree_path_clone(trans, iter->path,
- iter->flags & BTREE_ITER_intent,
- _THIS_IP_);
- trace_btree_path_save_pos(trans,
- trans->paths + iter->path,
- trans->paths + saved_path);
- }
-
- search_key = bpos_predecessor(k.k->p);
- continue;
- }
-
- if (bkey_whiteout(k.k)) {
- search_key = bkey_predecessor(iter, k.k->p);
- search_key.snapshot = U32_MAX;
- continue;
- }
- }
-
- EBUG_ON(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(k.k->p, iter->pos) :
- iter->flags & BTREE_ITER_is_extents ? bkey_ge(bkey_start_pos(k.k), iter->pos) :
- bkey_gt(k.k->p, iter->pos));
-
- if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_lt(k.k->p, end) :
- iter->flags & BTREE_ITER_is_extents ? bkey_le(k.k->p, end) :
- bkey_lt(k.k->p, end)))
- goto end;
-
- break;
- }
-
- /* Extents can straddle iter->pos: */
-	iter->pos = bpos_min(iter->pos, k.k->p);
-
- if (iter->flags & BTREE_ITER_filter_snapshots)
- iter->pos.snapshot = iter->snapshot;
-out_no_locked:
- if (saved_path)
- bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent);
-
- bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
- return k;
-end:
- bch2_btree_iter_set_pos(iter, end);
- k = bkey_s_c_null;
- goto out_no_locked;
-}
-
-/**
- * bch2_btree_iter_prev() - returns first key less than iterator's current
- * position
- * @iter: iterator to peek from
- *
- * Returns: key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
-{
- if (!bch2_btree_iter_rewind(iter))
- return bkey_s_c_null;
-
- return bch2_btree_iter_peek_prev(iter);
-}
-
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
-{
- struct btree_trans *trans = iter->trans;
- struct bpos search_key;
- struct bkey_s_c k;
- int ret;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- bch2_btree_iter_verify(iter);
- bch2_btree_iter_verify_entry_exit(iter);
- EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
-
- ret = trans_maybe_inject_restart(trans, _RET_IP_);
- if (unlikely(ret)) {
- k = bkey_s_c_err(ret);
- goto out_no_locked;
- }
-
- /* extents can't span inode numbers: */
- if ((iter->flags & BTREE_ITER_is_extents) &&
- unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
- if (iter->pos.inode == KEY_INODE_MAX)
- return bkey_s_c_null;
-
- bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
- }
-
- search_key = btree_iter_search_key(iter);
- iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (unlikely(ret)) {
- k = bkey_s_c_err(ret);
- goto out_no_locked;
- }
-
- struct btree_path *path = btree_iter_path(trans, iter);
- if (unlikely(!btree_path_node(path, path->level)))
- return bkey_s_c_null;
-
- if ((iter->flags & BTREE_ITER_cached) ||
- !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
- k = bkey_s_c_null;
-
- if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
- trans->nr_updates)) {
- bch2_btree_trans_peek_slot_updates(trans, iter, &k);
- if (k.k)
- goto out;
- }
-
- if (unlikely(iter->flags & BTREE_ITER_with_journal) &&
- (k = btree_trans_peek_slot_journal(trans, iter)).k)
- goto out;
-
- if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
- (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
- if (!bkey_err(k))
- iter->k = *k.k;
- /* We're not returning a key from iter->path: */
- goto out_no_locked;
- }
-
- k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k);
- if (unlikely(!k.k))
- goto out_no_locked;
-
- if (unlikely(k.k->type == KEY_TYPE_whiteout &&
- (iter->flags & BTREE_ITER_filter_snapshots) &&
- !(iter->flags & BTREE_ITER_key_cache_fill)))
- iter->k.type = KEY_TYPE_deleted;
- } else {
- struct bpos next;
- struct bpos end = iter->pos;
-
- if (iter->flags & BTREE_ITER_is_extents)
- end.offset = U64_MAX;
-
- EBUG_ON(btree_iter_path(trans, iter)->level);
-
- if (iter->flags & BTREE_ITER_intent) {
- struct btree_iter iter2;
-
- bch2_trans_copy_iter(&iter2, iter);
- k = bch2_btree_iter_peek_max(&iter2, end);
-
- if (k.k && !bkey_err(k)) {
- swap(iter->key_cache_path, iter2.key_cache_path);
- iter->k = iter2.k;
- k.k = &iter->k;
- }
- bch2_trans_iter_exit(trans, &iter2);
- } else {
- struct bpos pos = iter->pos;
-
- k = bch2_btree_iter_peek_max(iter, end);
- if (unlikely(bkey_err(k)))
- bch2_btree_iter_set_pos(iter, pos);
- else
- iter->pos = pos;
- }
-
- if (unlikely(bkey_err(k)))
- goto out_no_locked;
-
- next = k.k ? bkey_start_pos(k.k) : POS_MAX;
-
- if (bkey_lt(iter->pos, next)) {
- bkey_init(&iter->k);
- iter->k.p = iter->pos;
-
- if (iter->flags & BTREE_ITER_is_extents) {
- bch2_key_resize(&iter->k,
- min_t(u64, KEY_SIZE_MAX,
- (next.inode == iter->pos.inode
- ? next.offset
- : KEY_OFFSET_MAX) -
- iter->pos.offset));
- EBUG_ON(!iter->k.size);
- }
-
- k = (struct bkey_s_c) { &iter->k, NULL };
- }
- }
-out:
- btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
-out_no_locked:
- bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
- ret = bch2_btree_iter_verify_ret(iter, k);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
-
- return k;
-}
-
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
-{
- if (!bch2_btree_iter_advance(iter))
- return bkey_s_c_null;
-
- return bch2_btree_iter_peek_slot(iter);
-}
-
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
-{
- if (!bch2_btree_iter_rewind(iter))
- return bkey_s_c_null;
-
- return bch2_btree_iter_peek_slot(iter);
-}
-
-/* Obsolete, but still used by rust wrapper in -tools */
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
-{
- struct bkey_s_c k;
-
- while (btree_trans_too_many_iters(iter->trans) ||
- (k = bch2_btree_iter_peek_type(iter, iter->flags),
- bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
- bch2_trans_begin(iter->trans);
-
- return k;
-}
-
-/* new transactional stuff: */
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
-
- BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, trans->nr_paths) - 1);
-
- trans_for_each_path(trans, path, i) {
- BUG_ON(path->sorted_idx >= trans->nr_sorted);
- BUG_ON(trans->sorted[path->sorted_idx] != i);
- }
-
- for (i = 0; i < trans->nr_sorted; i++) {
- unsigned idx = trans->sorted[i];
-
- BUG_ON(!test_bit(idx, trans->paths_allocated));
- BUG_ON(trans->paths[idx].sorted_idx != i);
- }
-}
-
-static void btree_trans_verify_sorted(struct btree_trans *trans)
-{
- struct btree_path *path, *prev = NULL;
- struct trans_for_each_path_inorder_iter iter;
-
- if (!bch2_debug_check_iterators)
- return;
-
- trans_for_each_path_inorder(trans, path, iter) {
- if (prev && btree_path_cmp(prev, path) > 0) {
- __bch2_dump_trans_paths_updates(trans, true);
- panic("trans paths out of order!\n");
- }
- prev = path;
- }
-}
-#else
-static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {}
-static inline void btree_trans_verify_sorted(struct btree_trans *trans) {}
-#endif
-
-void __bch2_btree_trans_sort_paths(struct btree_trans *trans)
-{
- int i, l = 0, r = trans->nr_sorted, inc = 1;
- bool swapped;
-
- btree_trans_verify_sorted_refs(trans);
-
- if (trans->paths_sorted)
- goto out;
-
- /*
- * Cocktail shaker sort: this is efficient because iterators will be
- * mostly sorted.
- */
- do {
- swapped = false;
-
- for (i = inc > 0 ? l : r - 2;
- i + 1 < r && i >= l;
- i += inc) {
- if (btree_path_cmp(trans->paths + trans->sorted[i],
- trans->paths + trans->sorted[i + 1]) > 0) {
- swap(trans->sorted[i], trans->sorted[i + 1]);
- trans->paths[trans->sorted[i]].sorted_idx = i;
- trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1;
- swapped = true;
- }
- }
-
- if (inc > 0)
- --r;
- else
- l++;
- inc = -inc;
- } while (swapped);
-
- trans->paths_sorted = true;
-out:
- btree_trans_verify_sorted(trans);
-}
-
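-/*
- * trans->sorted is kept densely packed; on architectures with efficient
- * unaligned access, insertion and removal move multiple btree_path_idx_t
- * entries per u64 word:
- */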
-static inline void btree_path_list_remove(struct btree_trans *trans,
- struct btree_path *path)
-{
- EBUG_ON(path->sorted_idx >= trans->nr_sorted);
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
- trans->nr_sorted--;
- memmove_u64s_down_small(trans->sorted + path->sorted_idx,
- trans->sorted + path->sorted_idx + 1,
- DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
- sizeof(u64) / sizeof(btree_path_idx_t)));
-#else
- array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
-#endif
- for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
- trans->paths[trans->sorted[i]].sorted_idx = i;
-}
-
-static inline void btree_path_list_add(struct btree_trans *trans,
- btree_path_idx_t pos,
- btree_path_idx_t path_idx)
-{
- struct btree_path *path = trans->paths + path_idx;
-
- path->sorted_idx = pos ? trans->paths[pos].sorted_idx + 1 : trans->nr_sorted;
-
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
- memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
- trans->sorted + path->sorted_idx,
- DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
- sizeof(u64) / sizeof(btree_path_idx_t)));
- trans->nr_sorted++;
- trans->sorted[path->sorted_idx] = path_idx;
-#else
- array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path_idx);
-#endif
-
- for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
- trans->paths[trans->sorted[i]].sorted_idx = i;
-
- btree_trans_verify_sorted_refs(trans);
-}
-
-void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
-{
- if (iter->update_path)
- bch2_path_put_nokeep(trans, iter->update_path,
- iter->flags & BTREE_ITER_intent);
- if (iter->path)
- bch2_path_put(trans, iter->path,
- iter->flags & BTREE_ITER_intent);
- if (iter->key_cache_path)
- bch2_path_put(trans, iter->key_cache_path,
- iter->flags & BTREE_ITER_intent);
- iter->path = 0;
- iter->update_path = 0;
- iter->key_cache_path = 0;
- iter->trans = NULL;
-}
-
-void bch2_trans_iter_init_outlined(struct btree_trans *trans,
- struct btree_iter *iter,
- enum btree_id btree_id, struct bpos pos,
- unsigned flags)
-{
- bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
- bch2_btree_iter_flags(trans, btree_id, 0, flags),
- _RET_IP_);
-}
-
-void bch2_trans_node_iter_init(struct btree_trans *trans,
- struct btree_iter *iter,
- enum btree_id btree_id,
- struct bpos pos,
- unsigned locks_want,
- unsigned depth,
- unsigned flags)
-{
- flags |= BTREE_ITER_not_extents;
- flags |= BTREE_ITER_snapshot_field;
- flags |= BTREE_ITER_all_snapshots;
-
- if (!depth && btree_id_cached(trans->c, btree_id))
- flags |= BTREE_ITER_with_key_cache;
-
- bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
- bch2_btree_iter_flags(trans, btree_id, depth, flags),
- _RET_IP_);
-
- iter->min_depth = depth;
-
- struct btree_path *path = btree_iter_path(trans, iter);
- BUG_ON(path->locks_want < min(locks_want, BTREE_MAX_DEPTH));
- BUG_ON(path->level != depth);
- BUG_ON(iter->min_depth != depth);
-}
-
-void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
-{
- struct btree_trans *trans = src->trans;
-
- *dst = *src;
-#ifdef TRACK_PATH_ALLOCATED
- dst->ip_allocated = _RET_IP_;
-#endif
- if (src->path)
- __btree_path_get(trans, trans->paths + src->path, src->flags & BTREE_ITER_intent);
- if (src->update_path)
- __btree_path_get(trans, trans->paths + src->update_path, src->flags & BTREE_ITER_intent);
- dst->key_cache_path = 0;
-}
-
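-/*
- * Bump allocator for memory that lives until the end of the current
- * transaction: allocations are carved out of trans->mem, and if the buffer
- * has to be grown the transaction is restarted, since callers may hold
- * pointers into the old buffer:
- */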
-void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
-{
- struct bch_fs *c = trans->c;
- unsigned new_top = trans->mem_top + size;
- unsigned old_bytes = trans->mem_bytes;
- unsigned new_bytes = roundup_pow_of_two(new_top);
- int ret;
- void *new_mem;
- void *p;
-
- WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
-
- ret = trans_maybe_inject_restart(trans, _RET_IP_);
- if (ret)
- return ERR_PTR(ret);
-
- struct btree_transaction_stats *s = btree_trans_stats(trans);
- s->max_mem = max(s->max_mem, new_bytes);
-
- if (trans->used_mempool) {
- if (trans->mem_bytes >= new_bytes)
- goto out_change_top;
-
-		/* No more space in the mempool item - need to allocate a new buffer */
- new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
- if (unlikely(!new_mem)) {
- bch2_trans_unlock(trans);
-
- new_mem = kmalloc(new_bytes, GFP_KERNEL);
- if (!new_mem)
- return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
-
- ret = bch2_trans_relock(trans);
- if (ret) {
- kfree(new_mem);
- return ERR_PTR(ret);
- }
- }
- memcpy(new_mem, trans->mem, trans->mem_top);
- trans->used_mempool = false;
- mempool_free(trans->mem, &c->btree_trans_mem_pool);
- goto out_new_mem;
- }
-
- new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
- if (unlikely(!new_mem)) {
- bch2_trans_unlock(trans);
-
- new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
- if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
- new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
- new_bytes = BTREE_TRANS_MEM_MAX;
- memcpy(new_mem, trans->mem, trans->mem_top);
- trans->used_mempool = true;
- kfree(trans->mem);
- }
-
- if (!new_mem)
- return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
-
- trans->mem = new_mem;
- trans->mem_bytes = new_bytes;
-
- ret = bch2_trans_relock(trans);
- if (ret)
- return ERR_PTR(ret);
- }
-out_new_mem:
- trans->mem = new_mem;
- trans->mem_bytes = new_bytes;
-
- if (old_bytes) {
- trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
- return ERR_PTR(btree_trans_restart_ip(trans,
- BCH_ERR_transaction_restart_mem_realloced, _RET_IP_));
- }
-out_change_top:
- p = trans->mem + trans->mem_top;
- trans->mem_top += size;
- memset(p, 0, size);
- return p;
-}
-
-static inline void check_srcu_held_too_long(struct btree_trans *trans)
-{
- WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
- "btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
- (jiffies - trans->srcu_lock_time) / HZ);
-}
-
-void bch2_trans_srcu_unlock(struct btree_trans *trans)
-{
- if (trans->srcu_held) {
- struct bch_fs *c = trans->c;
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- if (path->cached && !btree_node_locked(path, 0))
- path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
-
- check_srcu_held_too_long(trans);
- srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
- trans->srcu_held = false;
- }
-}
-
-static void bch2_trans_srcu_lock(struct btree_trans *trans)
-{
- if (!trans->srcu_held) {
- trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
- trans->srcu_lock_time = jiffies;
- trans->srcu_held = true;
- }
-}
-
-/**
- * bch2_trans_begin() - reset a transaction after an interrupted attempt
- * @trans: transaction to reset
- *
- * Returns: current restart counter, to be used with trans_was_restarted()
- *
- * While iterating over nodes or updating nodes, an attempt to lock a btree
- * node may return BCH_ERR_transaction_restart when the trylock fails. When
- * this occurs, bch2_trans_begin() should be called and the transaction retried.
- */
-u32 bch2_trans_begin(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
- u64 now;
-
- bch2_trans_reset_updates(trans);
-
- trans->restart_count++;
- trans->mem_top = 0;
- trans->journal_entries = NULL;
-
- trans_for_each_path(trans, path, i) {
- path->should_be_locked = false;
-
- /*
- * If the transaction wasn't restarted, we're presuming to be
-		 * doing something new: don't keep iterators except the ones
-		 * that are in use - and those on the subvolumes btree:
- */
- if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
- path->preserve = false;
-
- /*
- * XXX: we probably shouldn't be doing this if the transaction
- * was restarted, but currently we still overflow transaction
- * iterators if we do that
- */
- if (!path->ref && !path->preserve)
- __bch2_path_free(trans, i);
- else
- path->preserve = false;
- }
-
- now = local_clock();
-
- if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) &&
- time_after64(now, trans->last_begin_time + 10))
- __bch2_time_stats_update(&btree_trans_stats(trans)->duration,
- trans->last_begin_time, now);
-
- if (!trans->restarted &&
- (need_resched() ||
- time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) {
- bch2_trans_unlock(trans);
- cond_resched();
- now = local_clock();
- }
- trans->last_begin_time = now;
-
- if (unlikely(trans->srcu_held &&
- time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
- bch2_trans_srcu_unlock(trans);
-
- trans->last_begin_ip = _RET_IP_;
-
-#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
- if (trans->restarted) {
- trans->restart_count_this_trans++;
- } else {
- trans->restart_count_this_trans = 0;
- }
-#endif
-
- trans_set_locked(trans, false);
-
- if (trans->restarted) {
- bch2_btree_path_traverse_all(trans);
- trans->notrace_relock_fail = false;
- }
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- return trans->restart_count;
-}
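-
-/*
- * Illustrative sketch only (not part of the original file): the retry
- * pattern bch2_trans_begin() is designed for. example_update() is a
- * hypothetical transaction body.
- */
-#if 0
-static int example_retry_loop(struct btree_trans *trans)
-{
-	int ret;
-
-	do {
-		bch2_trans_begin(trans);
-		ret = example_update(trans);
-	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
-
-	return ret;
-}
-#endif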
-
-const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR] = { "(unknown)" };
-
-unsigned bch2_trans_get_fn_idx(const char *fn)
-{
- for (unsigned i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
- if (!bch2_btree_transaction_fns[i] ||
- bch2_btree_transaction_fns[i] == fn) {
- bch2_btree_transaction_fns[i] = fn;
- return i;
- }
-
- pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
- return 0;
-}
-
-struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
- __acquires(&c->btree_trans_barrier)
-{
- struct btree_trans *trans;
-
- if (IS_ENABLED(__KERNEL__)) {
- trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
- if (trans) {
- memset(trans, 0, offsetof(struct btree_trans, list));
- goto got_trans;
- }
- }
-
- trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
- memset(trans, 0, sizeof(*trans));
-
- seqmutex_lock(&c->btree_trans_lock);
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
- struct btree_trans *pos;
- pid_t pid = current->pid;
-
- trans->locking_wait.task = current;
-
- list_for_each_entry(pos, &c->btree_trans_list, list) {
- struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task);
- /*
- * We'd much prefer to be stricter here and completely
- * disallow multiple btree_trans in the same thread -
- * but the data move path calls bch2_write when we
- * already have a btree_trans initialized.
- */
- BUG_ON(pos_task &&
- pid == pos_task->pid &&
- pos->locked);
- }
- }
-
- list_add(&trans->list, &c->btree_trans_list);
- seqmutex_unlock(&c->btree_trans_lock);
-got_trans:
- trans->c = c;
- trans->last_begin_time = local_clock();
- trans->fn_idx = fn_idx;
- trans->locking_wait.task = current;
- trans->journal_replay_not_finished =
- unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) &&
- atomic_inc_not_zero(&c->journal_keys.ref);
- trans->nr_paths = ARRAY_SIZE(trans->_paths);
- trans->paths_allocated = trans->_paths_allocated;
- trans->sorted = trans->_sorted;
- trans->paths = trans->_paths;
- trans->updates = trans->_updates;
-
- *trans_paths_nr(trans->paths) = BTREE_ITER_INITIAL;
-
- trans->paths_allocated[0] = 1;
-
- static struct lock_class_key lockdep_key;
- lockdep_init_map(&trans->dep_map, "bcachefs_btree", &lockdep_key, 0);
-
- if (fn_idx < BCH_TRANSACTIONS_NR) {
- trans->fn = bch2_btree_transaction_fns[fn_idx];
-
- struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx];
-
- if (s->max_mem) {
- unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
-
- trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
- if (likely(trans->mem))
- trans->mem_bytes = expected_mem_bytes;
- }
-
- trans->nr_paths_max = s->nr_max_paths;
- trans->journal_entries_size = s->journal_entries_size;
- }
-
- trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
- trans->srcu_lock_time = jiffies;
- trans->srcu_held = true;
- trans_set_locked(trans, false);
-
- closure_init_stack_release(&trans->ref);
- return trans;
-}
-
-static void check_btree_paths_leaked(struct btree_trans *trans)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct bch_fs *c = trans->c;
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- if (path->ref)
- goto leaked;
- return;
-leaked:
- bch_err(c, "btree paths leaked from %s!", trans->fn);
- trans_for_each_path(trans, path, i)
- if (path->ref)
- printk(KERN_ERR " btree %s %pS\n",
- bch2_btree_id_str(path->btree_id),
- (void *) path->ip_allocated);
- /* Be noisy about this: */
- bch2_fatal_error(c);
-#endif
-}
-
-void bch2_trans_put(struct btree_trans *trans)
- __releases(&c->btree_trans_barrier)
-{
- struct bch_fs *c = trans->c;
-
- if (trans->restarted)
- bch2_trans_in_restart_error(trans);
-
- bch2_trans_unlock(trans);
-
- trans_for_each_update(trans, i)
- __btree_path_put(trans, trans->paths + i->path, true);
- trans->nr_updates = 0;
-
- check_btree_paths_leaked(trans);
-
- if (trans->srcu_held) {
- check_srcu_held_too_long(trans);
- srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
- }
-
- if (unlikely(trans->journal_replay_not_finished))
- bch2_journal_keys_put(c);
-
- /*
-	 * trans->ref protects trans->locking_wait.task and the btree_paths
-	 * array; both are used by the cycle detector
- */
- closure_return_sync(&trans->ref);
- trans->locking_wait.task = NULL;
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- darray_exit(&trans->last_restarted_trace);
-#endif
-
- unsigned long *paths_allocated = trans->paths_allocated;
- trans->paths_allocated = NULL;
- trans->paths = NULL;
-
- if (paths_allocated != trans->_paths_allocated)
- kvfree_rcu_mightsleep(paths_allocated);
-
- if (trans->used_mempool)
- mempool_free(trans->mem, &c->btree_trans_mem_pool);
- else
- kfree(trans->mem);
-
- /* Userspace doesn't have a real percpu implementation: */
- if (IS_ENABLED(__KERNEL__))
- trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
-
- if (trans) {
- seqmutex_lock(&c->btree_trans_lock);
- list_del(&trans->list);
- seqmutex_unlock(&c->btree_trans_lock);
-
- mempool_free(trans, &c->btree_trans_pool);
- }
-}
-
-bool bch2_current_has_btree_trans(struct bch_fs *c)
-{
- seqmutex_lock(&c->btree_trans_lock);
- struct btree_trans *trans;
- bool ret = false;
- list_for_each_entry(trans, &c->btree_trans_list, list)
- if (trans->locking_wait.task == current &&
- trans->locked) {
- ret = true;
- break;
- }
- seqmutex_unlock(&c->btree_trans_lock);
- return ret;
-}
-
-static void __maybe_unused
-bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
- struct btree_bkey_cached_common *b)
-{
- struct six_lock_count c = six_lock_counts(&b->lock);
- struct task_struct *owner;
- pid_t pid;
-
- rcu_read_lock();
- owner = READ_ONCE(b->lock.owner);
- pid = owner ? owner->pid : 0;
- rcu_read_unlock();
-
- prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b');
- bch2_btree_id_to_text(out, b->btree_id);
- prt_printf(out, " l=%u:", b->level);
- bch2_bpos_to_text(out, btree_node_pos(b));
-
- prt_printf(out, "\t locks %u:%u:%u held by pid %u",
- c.n[0], c.n[1], c.n[2], pid);
-}
-
-void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
-{
- struct btree_bkey_cached_common *b;
- static char lock_types[] = { 'r', 'i', 'w' };
- struct task_struct *task = READ_ONCE(trans->locking_wait.task);
- unsigned l, idx;
-
- /* before rcu_read_lock(): */
- bch2_printbuf_make_room(out, 4096);
-
- if (!out->nr_tabstops) {
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 32);
- }
-
- prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn);
-
- /* trans->paths is rcu protected vs. freeing */
- rcu_read_lock();
- out->atomic++;
-
- struct btree_path *paths = rcu_dereference(trans->paths);
- if (!paths)
- goto out;
-
- unsigned long *paths_allocated = trans_paths_allocated(paths);
-
- trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), idx, 1) {
- struct btree_path *path = paths + idx;
- if (!path->nodes_locked)
- continue;
-
- prt_printf(out, " path %u %c ",
- idx,
- path->cached ? 'c' : 'b');
- bch2_btree_id_to_text(out, path->btree_id);
- prt_printf(out, " l=%u:", path->level);
- bch2_bpos_to_text(out, path->pos);
- prt_newline(out);
-
- for (l = 0; l < BTREE_MAX_DEPTH; l++) {
- if (btree_node_locked(path, l) &&
- !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
- prt_printf(out, " %c l=%u ",
- lock_types[btree_node_locked_type(path, l)], l);
- bch2_btree_bkey_cached_common_to_text(out, b);
- prt_newline(out);
- }
- }
- }
-
- b = READ_ONCE(trans->locking);
- if (b) {
- prt_printf(out, " blocked for %lluus on\n",
- div_u64(local_clock() - trans->locking_wait.start_time, 1000));
- prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]);
- bch2_btree_bkey_cached_common_to_text(out, b);
- prt_newline(out);
- }
-out:
- --out->atomic;
- rcu_read_unlock();
-}
-
-void bch2_fs_btree_iter_exit(struct bch_fs *c)
-{
- struct btree_transaction_stats *s;
- struct btree_trans *trans;
- int cpu;
-
- if (c->btree_trans_bufs)
- for_each_possible_cpu(cpu) {
- struct btree_trans *trans =
- per_cpu_ptr(c->btree_trans_bufs, cpu)->trans;
-
- if (trans) {
- seqmutex_lock(&c->btree_trans_lock);
- list_del(&trans->list);
- seqmutex_unlock(&c->btree_trans_lock);
- }
- kfree(trans);
- }
- free_percpu(c->btree_trans_bufs);
-
- trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
- if (trans)
- panic("%s leaked btree_trans\n", trans->fn);
-
- for (s = c->btree_transaction_stats;
- s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
- s++) {
- kfree(s->max_paths_text);
- bch2_time_stats_exit(&s->lock_hold_times);
- }
-
- if (c->btree_trans_barrier_initialized) {
- synchronize_srcu_expedited(&c->btree_trans_barrier);
- cleanup_srcu_struct(&c->btree_trans_barrier);
- }
- mempool_exit(&c->btree_trans_mem_pool);
- mempool_exit(&c->btree_trans_pool);
-}
-
-void bch2_fs_btree_iter_init_early(struct bch_fs *c)
-{
- struct btree_transaction_stats *s;
-
- for (s = c->btree_transaction_stats;
- s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
- s++) {
- bch2_time_stats_init(&s->duration);
- bch2_time_stats_init(&s->lock_hold_times);
- mutex_init(&s->lock);
- }
-
- INIT_LIST_HEAD(&c->btree_trans_list);
- seqmutex_init(&c->btree_trans_lock);
-}
-
-int bch2_fs_btree_iter_init(struct bch_fs *c)
-{
- int ret;
-
- c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf);
- if (!c->btree_trans_bufs)
- return -ENOMEM;
-
- ret = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1,
- sizeof(struct btree_trans)) ?:
- mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
- BTREE_TRANS_MEM_MAX) ?:
- init_srcu_struct(&c->btree_trans_barrier);
- if (ret)
- return ret;
-
- /*
- * static annotation (hackily done) for lock ordering of reclaim vs.
- * btree node locks:
- */
-#ifdef CONFIG_LOCKDEP
- fs_reclaim_acquire(GFP_KERNEL);
- struct btree_trans *trans = bch2_trans_get(c);
- trans_set_locked(trans, false);
- bch2_trans_put(trans);
- fs_reclaim_release(GFP_KERNEL);
-#endif
-
- c->btree_trans_barrier_initialized = true;
- return 0;
-}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
deleted file mode 100644
index 8823eec6b284..000000000000
--- a/fs/bcachefs/btree_iter.h
+++ /dev/null
@@ -1,955 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_ITER_H
-#define _BCACHEFS_BTREE_ITER_H
-
-#include "bset.h"
-#include "btree_types.h"
-#include "trace.h"
-
-void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
-void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
-void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
-void bch2_dump_trans_updates(struct btree_trans *);
-void bch2_dump_trans_paths_updates(struct btree_trans *);
-
-static inline int __bkey_err(const struct bkey *k)
-{
- return PTR_ERR_OR_ZERO(k);
-}
-
-#define bkey_err(_k) __bkey_err((_k).k)
-
-static inline void __btree_path_get(struct btree_trans *trans, struct btree_path *path, bool intent)
-{
- unsigned idx = path - trans->paths;
-
- EBUG_ON(idx >= trans->nr_paths);
- EBUG_ON(!test_bit(idx, trans->paths_allocated));
- if (unlikely(path->ref == U8_MAX)) {
- bch2_dump_trans_paths_updates(trans);
- panic("path %u refcount overflow\n", idx);
- }
-
- path->ref++;
- path->intent_ref += intent;
- trace_btree_path_get_ll(trans, path);
-}
-
-static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
-{
- EBUG_ON(path - trans->paths >= trans->nr_paths);
- EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated));
- EBUG_ON(!path->ref);
- EBUG_ON(!path->intent_ref && intent);
-
- trace_btree_path_put_ll(trans, path);
- path->intent_ref -= intent;
- return --path->ref == 0;
-}
-
-static inline void btree_path_set_dirty(struct btree_path *path,
- enum btree_path_uptodate u)
-{
- path->uptodate = max_t(unsigned, path->uptodate, u);
-}
-
-static inline struct btree *btree_path_node(struct btree_path *path,
- unsigned level)
-{
- return level < BTREE_MAX_DEPTH ? path->l[level].b : NULL;
-}
-
-static inline bool btree_node_lock_seq_matches(const struct btree_path *path,
- const struct btree *b, unsigned level)
-{
- return path->l[level].lock_seq == six_lock_seq(&b->c.lock);
-}
-
-static inline struct btree *btree_node_parent(struct btree_path *path,
- struct btree *b)
-{
- return btree_path_node(path, b->c.level + 1);
-}
-
-/* Iterate over paths within a transaction: */
-
-void __bch2_btree_trans_sort_paths(struct btree_trans *);
-
-static inline void btree_trans_sort_paths(struct btree_trans *trans)
-{
- if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- trans->paths_sorted)
- return;
- __bch2_btree_trans_sort_paths(trans);
-}
-
-static inline unsigned long *trans_paths_nr(struct btree_path *paths)
-{
- return &container_of(paths, struct btree_trans_paths, paths[0])->nr_paths;
-}
-
-static inline unsigned long *trans_paths_allocated(struct btree_path *paths)
-{
- unsigned long *v = trans_paths_nr(paths);
- return v - BITS_TO_LONGS(*v);
-}
-
-#define trans_for_each_path_idx_from(_paths_allocated, _nr, _idx, _start)\
- for (_idx = _start; \
- (_idx = find_next_bit(_paths_allocated, _nr, _idx)) < _nr; \
- _idx++)
-
-static inline struct btree_path *
-__trans_next_path(struct btree_trans *trans, unsigned *idx)
-{
- unsigned long *w = trans->paths_allocated + *idx / BITS_PER_LONG;
- /*
- * Open coded find_next_bit(), because
-	 * - this is the fast path, we can't afford the function call
-	 * - we know that nr_paths is a multiple of BITS_PER_LONG
- */
- while (*idx < trans->nr_paths) {
- unsigned long v = *w >> (*idx & (BITS_PER_LONG - 1));
- if (v) {
- *idx += __ffs(v);
- return trans->paths + *idx;
- }
-
- *idx += BITS_PER_LONG;
- *idx &= ~(BITS_PER_LONG - 1);
- w++;
- }
-
- return NULL;
-}
-
-/*
- * This version is intended to be safe for use on a btree_trans that is owned by
- * another thread, e.g. for bch2_btree_trans_to_text().
- */
-#define trans_for_each_path_from(_trans, _path, _idx, _start) \
- for (_idx = _start; \
- (_path = __trans_next_path((_trans), &_idx)); \
- _idx++)
-
-#define trans_for_each_path(_trans, _path, _idx) \
- trans_for_each_path_from(_trans, _path, _idx, 1)
-
-static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
-{
- unsigned idx = path ? path->sorted_idx + 1 : 0;
-
- EBUG_ON(idx > trans->nr_sorted);
-
- return idx < trans->nr_sorted
- ? trans->paths + trans->sorted[idx]
- : NULL;
-}
-
-static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path)
-{
- unsigned idx = path ? path->sorted_idx : trans->nr_sorted;
-
- return idx
- ? trans->paths + trans->sorted[idx - 1]
- : NULL;
-}
-
-#define trans_for_each_path_idx_inorder(_trans, _iter) \
- for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \
- (_iter.path_idx = trans->sorted[_iter.sorted_idx], \
- _iter.sorted_idx < (_trans)->nr_sorted); \
- _iter.sorted_idx++)
-
-struct trans_for_each_path_inorder_iter {
- btree_path_idx_t sorted_idx;
- btree_path_idx_t path_idx;
-};
-
-#define trans_for_each_path_inorder(_trans, _path, _iter) \
- for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \
- (_iter.path_idx = trans->sorted[_iter.sorted_idx], \
- _path = (_trans)->paths + _iter.path_idx, \
- _iter.sorted_idx < (_trans)->nr_sorted); \
- _iter.sorted_idx++)
-
-#define trans_for_each_path_inorder_reverse(_trans, _path, _i) \
- for (_i = trans->nr_sorted - 1; \
- ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) >= 0;\
- --_i)
-
-static inline bool __path_has_node(const struct btree_path *path,
- const struct btree *b)
-{
- return path->l[b->c.level].b == b &&
- btree_node_lock_seq_matches(path, b, b->c.level);
-}
-
-static inline struct btree_path *
-__trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
- unsigned *idx)
-{
- struct btree_path *path;
-
- while ((path = __trans_next_path(trans, idx)) &&
- !__path_has_node(path, b))
- (*idx)++;
-
- return path;
-}
-
-#define trans_for_each_path_with_node(_trans, _b, _path, _iter) \
- for (_iter = 1; \
- (_path = __trans_next_path_with_node((_trans), (_b), &_iter));\
- _iter++)
-
-btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *, btree_path_idx_t,
- bool, unsigned long);
-
-static inline btree_path_idx_t __must_check
-bch2_btree_path_make_mut(struct btree_trans *trans,
- btree_path_idx_t path, bool intent,
- unsigned long ip)
-{
- if (trans->paths[path].ref > 1 ||
- trans->paths[path].preserve)
- path = __bch2_btree_path_make_mut(trans, path, intent, ip);
- trans->paths[path].should_be_locked = false;
- return path;
-}
-
-btree_path_idx_t __must_check
-__bch2_btree_path_set_pos(struct btree_trans *, btree_path_idx_t,
- struct bpos, bool, unsigned long);
-
-static inline btree_path_idx_t __must_check
-bch2_btree_path_set_pos(struct btree_trans *trans,
- btree_path_idx_t path, struct bpos new_pos,
- bool intent, unsigned long ip)
-{
- return !bpos_eq(new_pos, trans->paths[path].pos)
- ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip)
- : path;
-}
-
-int __must_check bch2_btree_path_traverse_one(struct btree_trans *,
- btree_path_idx_t,
- unsigned, unsigned long);
-
-static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *);
-
-static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
- btree_path_idx_t path, unsigned flags)
-{
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK)
- return 0;
-
- return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_);
-}
-
-btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
- unsigned, unsigned, unsigned, unsigned long);
-btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *, enum btree_id,
- unsigned, struct bpos);
-
-struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
-
-/*
- * bch2_btree_path_peek_slot() for a cached iterator might return a key in a
- * different snapshot:
- */
-static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
-{
- struct bkey_s_c k = bch2_btree_path_peek_slot(path, u);
-
- if (k.k && bpos_eq(path->pos, k.k->p))
- return k;
-
- bkey_init(u);
- u->p = path->pos;
- return (struct bkey_s_c) { u, NULL };
-}
-
-struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *,
- struct btree_iter *, struct bpos);
-
-void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *);
-
-int __bch2_trans_mutex_lock(struct btree_trans *, struct mutex *);
-
-static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock)
-{
- return mutex_trylock(lock)
- ? 0
- : __bch2_trans_mutex_lock(trans, lock);
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_trans_verify_paths(struct btree_trans *);
-void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos);
-#else
-static inline void bch2_trans_verify_paths(struct btree_trans *trans) {}
-static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
- struct bpos pos) {}
-#endif
-
-void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
- struct btree *, struct bkey_packed *);
-void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *,
- struct btree *, struct btree_node_iter *,
- struct bkey_packed *, unsigned, unsigned);
-
-int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
-
-void bch2_path_put(struct btree_trans *, btree_path_idx_t, bool);
-
-int bch2_trans_relock(struct btree_trans *);
-int bch2_trans_relock_notrace(struct btree_trans *);
-void bch2_trans_unlock(struct btree_trans *);
-void bch2_trans_unlock_long(struct btree_trans *);
-
-static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count)
-{
- return restart_count != trans->restart_count
- ? -BCH_ERR_transaction_restart_nested
- : 0;
-}
-
-void __noreturn bch2_trans_restart_error(struct btree_trans *, u32);
-
-static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
- u32 restart_count)
-{
- if (trans_was_restarted(trans, restart_count))
- bch2_trans_restart_error(trans, restart_count);
-}
-
-void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *);
-
-static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *trans)
-{
- if (trans->restarted || !trans->locked)
- bch2_trans_unlocked_or_in_restart_error(trans);
-}
-
-__always_inline
-static int btree_trans_restart_foreign_task(struct btree_trans *trans, int err, unsigned long ip)
-{
- BUG_ON(err <= 0);
- BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart));
-
- trans->restarted = err;
- trans->last_restarted_ip = ip;
- return -err;
-}
-
-__always_inline
-static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip)
-{
- btree_trans_restart_foreign_task(trans, err, ip);
-#ifdef CONFIG_BCACHEFS_DEBUG
- darray_exit(&trans->last_restarted_trace);
- bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT);
-#endif
- return -err;
-}
-
-__always_inline
-static int btree_trans_restart(struct btree_trans *trans, int err)
-{
- return btree_trans_restart_ip(trans, err, _THIS_IP_);
-}
-
-static inline int trans_maybe_inject_restart(struct btree_trans *trans, unsigned long ip)
-{
-#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
- if (!(ktime_get_ns() & ~(~0ULL << min(63, (10 + trans->restart_count_this_trans))))) {
- trace_and_count(trans->c, trans_restart_injected, trans, ip);
- return btree_trans_restart_ip(trans,
- BCH_ERR_transaction_restart_fault_inject, ip);
- }
-#endif
- return 0;
-}
-
-bool bch2_btree_node_upgrade(struct btree_trans *,
- struct btree_path *, unsigned);
-
-void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned);
-
-static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
- struct btree_path *path)
-{
- unsigned new_locks_want = path->level + !!path->intent_ref;
-
- if (path->locks_want > new_locks_want)
- __bch2_btree_path_downgrade(trans, path, new_locks_want);
-}
-
-void bch2_trans_downgrade(struct btree_trans *);
-
-void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *);
-void bch2_trans_node_drop(struct btree_trans *trans, struct btree *);
-void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
-
-int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);
-int __must_check bch2_btree_iter_traverse(struct btree_iter *);
-
-struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *);
-struct btree *bch2_btree_iter_next_node(struct btree_iter *);
-
-struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *, struct bpos);
-struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
-
-static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
-{
- return bch2_btree_iter_peek_max(iter, SPOS_MAX);
-}
-
-struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos);
-
-static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
-{
- return bch2_btree_iter_peek_prev_min(iter, POS_MIN);
-}
-
-struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
-
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *);
-
-bool bch2_btree_iter_advance(struct btree_iter *);
-bool bch2_btree_iter_rewind(struct btree_iter *);
-
-static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
-{
- iter->k.type = KEY_TYPE_deleted;
- iter->k.p.inode = iter->pos.inode = new_pos.inode;
- iter->k.p.offset = iter->pos.offset = new_pos.offset;
- iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot;
- iter->k.size = 0;
-}
-
-static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
-{
- struct btree_trans *trans = iter->trans;
-
- if (unlikely(iter->update_path))
- bch2_path_put(trans, iter->update_path,
- iter->flags & BTREE_ITER_intent);
- iter->update_path = 0;
-
- if (!(iter->flags & BTREE_ITER_all_snapshots))
- new_pos.snapshot = iter->snapshot;
-
- __bch2_btree_iter_set_pos(iter, new_pos);
-}
-
-static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter)
-{
- BUG_ON(!(iter->flags & BTREE_ITER_is_extents));
- iter->pos = bkey_start_pos(&iter->k);
-}
-
-static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot)
-{
- struct bpos pos = iter->pos;
-
- iter->snapshot = snapshot;
- pos.snapshot = snapshot;
- bch2_btree_iter_set_pos(iter, pos);
-}
-
-void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
-
-static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans,
- unsigned btree_id,
- unsigned level,
- unsigned flags)
-{
- if (level || !btree_id_cached(trans->c, btree_id)) {
- flags &= ~BTREE_ITER_cached;
- flags &= ~BTREE_ITER_with_key_cache;
- } else if (!(flags & BTREE_ITER_cached))
- flags |= BTREE_ITER_with_key_cache;
-
- if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) &&
- btree_id_is_extents(btree_id))
- flags |= BTREE_ITER_is_extents;
-
- if (!(flags & BTREE_ITER_snapshot_field) &&
- !btree_type_has_snapshot_field(btree_id))
- flags &= ~BTREE_ITER_all_snapshots;
-
- if (!(flags & BTREE_ITER_all_snapshots) &&
- btree_type_has_snapshots(btree_id))
- flags |= BTREE_ITER_filter_snapshots;
-
- if (trans->journal_replay_not_finished)
- flags |= BTREE_ITER_with_journal;
-
- return flags;
-}
-
-static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- unsigned locks_want,
- unsigned depth,
- unsigned flags,
- unsigned long ip)
-{
- iter->trans = trans;
- iter->update_path = 0;
- iter->key_cache_path = 0;
- iter->btree_id = btree_id;
- iter->min_depth = 0;
- iter->flags = flags;
- iter->snapshot = pos.snapshot;
- iter->pos = pos;
- iter->k = POS_KEY(pos);
- iter->journal_idx = 0;
-#ifdef CONFIG_BCACHEFS_DEBUG
- iter->ip_allocated = ip;
-#endif
- iter->path = bch2_path_get(trans, btree_id, iter->pos,
- locks_want, depth, flags, ip);
-}
-
-void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *,
- enum btree_id, struct bpos, unsigned);
-
-static inline void bch2_trans_iter_init(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- unsigned flags)
-{
- if (__builtin_constant_p(btree_id) &&
- __builtin_constant_p(flags))
- bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
- bch2_btree_iter_flags(trans, btree_id, 0, flags),
- _THIS_IP_);
- else
- bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags);
-}
-
-void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
- enum btree_id, struct bpos,
- unsigned, unsigned, unsigned);
-void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
-
-void bch2_set_btree_iter_dontneed(struct btree_iter *);
-
-void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
-
-/**
- * bch2_trans_kmalloc - allocate memory for use by the current transaction
- * @trans:	transaction to allocate from
- * @size:	number of bytes, rounded up to a multiple of 8; memory is zeroed
- *
- * Must be called after bch2_trans_begin(), which on second and subsequent
- * calls frees all memory allocated in this transaction.
- */
-static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
-{
- size = roundup(size, 8);
-
- if (likely(trans->mem_top + size <= trans->mem_bytes)) {
- void *p = trans->mem + trans->mem_top;
-
- trans->mem_top += size;
- memset(p, 0, size);
- return p;
- } else {
- return __bch2_trans_kmalloc(trans, size);
- }
-}
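-
-/*
- * Illustrative sketch only: transaction memory is bump-allocated and
- * implicitly freed by the next bch2_trans_begin(), so there is no
- * matching free. struct example_buf is hypothetical.
- */
-#if 0
-	struct example_buf *buf = bch2_trans_kmalloc(trans, sizeof(*buf));
-	if (IS_ERR(buf))
-		return PTR_ERR(buf);
-	/* use buf; it is reclaimed when the transaction restarts or is put */
-#endif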
-
-static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size)
-{
- size = round_up(size, 8);
-
- if (likely(trans->mem_top + size <= trans->mem_bytes)) {
- void *p = trans->mem + trans->mem_top;
-
- trans->mem_top += size;
- return p;
- } else {
- return __bch2_trans_kmalloc(trans, size);
- }
-}
-
-static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- unsigned flags, unsigned type)
-{
- struct bkey_s_c k;
-
- bch2_trans_iter_init(trans, iter, btree_id, pos, flags);
- k = bch2_btree_iter_peek_slot(iter);
-
- if (!bkey_err(k) && type && k.k->type != type)
- k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch);
- if (unlikely(bkey_err(k)))
- bch2_trans_iter_exit(trans, iter);
- return k;
-}
-
-static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- unsigned flags)
-{
- return __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags, 0);
-}
-
-#define bch2_bkey_get_iter_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\
- bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \
- _btree_id, _pos, _flags, KEY_TYPE_##_type))
-
-static inline void __bkey_val_copy(void *dst_v, unsigned dst_size, struct bkey_s_c src_k)
-{
- unsigned b = min_t(unsigned, dst_size, bkey_val_bytes(src_k.k));
- memcpy(dst_v, src_k.v, b);
- if (unlikely(b < dst_size))
- memset(dst_v + b, 0, dst_size - b);
-}
-
-#define bkey_val_copy(_dst_v, _src_k) \
-do { \
- BUILD_BUG_ON(!__typecheck(*_dst_v, *_src_k.v)); \
- __bkey_val_copy(_dst_v, sizeof(*_dst_v), _src_k.s_c); \
-} while (0)
-
-static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
- unsigned btree_id, struct bpos pos,
- unsigned flags, unsigned type,
- unsigned val_size, void *val)
-{
- struct btree_iter iter;
- struct bkey_s_c k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type);
- int ret = bkey_err(k);
- if (!ret) {
- __bkey_val_copy(val, val_size, k);
- bch2_trans_iter_exit(trans, &iter);
- }
-
- return ret;
-}
-
-#define bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, _type, _val)\
- __bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \
- KEY_TYPE_##_type, sizeof(*_val), _val)
-
-void bch2_trans_srcu_unlock(struct btree_trans *);
-
-u32 bch2_trans_begin(struct btree_trans *);
-
-#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \
- _locks_want, _depth, _flags, _b, _do) \
-({ \
- bch2_trans_begin((_trans)); \
- \
- struct btree_iter _iter; \
- bch2_trans_node_iter_init((_trans), &_iter, (_btree_id), \
- _start, _locks_want, _depth, _flags); \
- int _ret3 = 0; \
- do { \
- _ret3 = lockrestart_do((_trans), ({ \
- struct btree *_b = bch2_btree_iter_peek_node(&_iter); \
- if (!_b) \
- break; \
- \
- PTR_ERR_OR_ZERO(_b) ?: (_do); \
- })) ?: \
- lockrestart_do((_trans), \
- PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(&_iter))); \
- } while (!_ret3); \
- \
- bch2_trans_iter_exit((_trans), &(_iter)); \
- _ret3; \
-})
-
-#define for_each_btree_node(_trans, _iter, _btree_id, _start, \
- _flags, _b, _do) \
- __for_each_btree_node(_trans, _iter, _btree_id, _start, \
- 0, 0, _flags, _b, _do)
-
-static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
- unsigned flags)
-{
- return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) :
- bch2_btree_iter_peek_prev(iter);
-}
-
-static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
- unsigned flags)
-{
- return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) :
- bch2_btree_iter_peek(iter);
-}
-
-static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_iter *iter,
- struct bpos end,
- unsigned flags)
-{
- if (!(flags & BTREE_ITER_slots))
- return bch2_btree_iter_peek_max(iter, end);
-
- if (bkey_gt(iter->pos, end))
- return bkey_s_c_null;
-
- return bch2_btree_iter_peek_slot(iter);
-}
-
-int __bch2_btree_trans_too_many_iters(struct btree_trans *);
-
-static inline int btree_trans_too_many_iters(struct btree_trans *trans)
-{
- if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8)
- return __bch2_btree_trans_too_many_iters(trans);
-
- return 0;
-}
-
-/*
- * Implemented with a goto instead of a loop, so that break/continue work
- * correctly when used inside for_each_btree_key2().
- */
-#define lockrestart_do(_trans, _do) \
-({ \
- __label__ transaction_restart; \
- u32 _restart_count; \
- int _ret2; \
-transaction_restart: \
- _restart_count = bch2_trans_begin(_trans); \
- _ret2 = (_do); \
- \
- if (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)) \
- goto transaction_restart; \
- \
- if (!_ret2) \
- bch2_trans_verify_not_restarted(_trans, _restart_count);\
- _ret2; \
-})
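-
-/*
- * Illustrative sketch only: retry a single operation until it stops
- * returning a transaction restart; bch2_example_op() is hypothetical.
- */
-#if 0
-	int ret = lockrestart_do(trans, bch2_example_op(trans, arg));
-#endif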
-
-/*
- * nested_lockrestart_do(), nested_commit_do():
- *
- * These are like lockrestart_do() and commit_do(), with two differences:
- *
- * - We don't call bch2_trans_begin() unless we had a transaction restart
- * - We return -BCH_ERR_transaction_restart_nested if we succeeded after a
- * transaction restart
- */
-#define nested_lockrestart_do(_trans, _do) \
-({ \
- u32 _restart_count, _orig_restart_count; \
- int _ret2; \
- \
- _restart_count = _orig_restart_count = (_trans)->restart_count; \
- \
- while (bch2_err_matches(_ret2 = (_do), BCH_ERR_transaction_restart))\
- _restart_count = bch2_trans_begin(_trans); \
- \
- if (!_ret2) \
- bch2_trans_verify_not_restarted(_trans, _restart_count);\
- \
- _ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \
-})
-
-#define for_each_btree_key_max_continue(_trans, _iter, \
- _end, _flags, _k, _do) \
-({ \
- struct bkey_s_c _k; \
- int _ret3 = 0; \
- \
- do { \
- _ret3 = lockrestart_do(_trans, ({ \
- (_k) = bch2_btree_iter_peek_max_type(&(_iter), \
- _end, (_flags)); \
- if (!(_k).k) \
- break; \
- \
- bkey_err(_k) ?: (_do); \
- })); \
- } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \
- \
- bch2_trans_iter_exit((_trans), &(_iter)); \
- _ret3; \
-})
-
-#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \
- for_each_btree_key_max_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do)
-
-#define for_each_btree_key_max(_trans, _iter, _btree_id, \
- _start, _end, _flags, _k, _do) \
-({ \
- bch2_trans_begin(trans); \
- \
- struct btree_iter _iter; \
- bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- \
- for_each_btree_key_max_continue(_trans, _iter, _end, _flags, _k, _do);\
-})
-
-#define for_each_btree_key(_trans, _iter, _btree_id, \
- _start, _flags, _k, _do) \
- for_each_btree_key_max(_trans, _iter, _btree_id, _start, \
- SPOS_MAX, _flags, _k, _do)
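-
-/*
- * Illustrative sketch only: walk every key in a btree; restarts are
- * handled by the lockrestart_do() embedded in the iteration macro. The
- * btree id and body are arbitrary examples.
- */
-#if 0
-	int ret = bch2_trans_run(c,
-		for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
-				   BTREE_ITER_prefetch, k, ({
-			pr_info("found key %llu:%llu\n",
-				k.k->p.inode, k.k->p.offset);
-			0;
-		})));
-#endif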
-
-#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \
- _start, _flags, _k, _do) \
-({ \
- struct btree_iter _iter; \
- struct bkey_s_c _k; \
- int _ret3 = 0; \
- \
- bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- \
- do { \
- _ret3 = lockrestart_do(_trans, ({ \
- (_k) = bch2_btree_iter_peek_prev_type(&(_iter), \
- (_flags)); \
- if (!(_k).k) \
- break; \
- \
- bkey_err(_k) ?: (_do); \
- })); \
- } while (!_ret3 && bch2_btree_iter_rewind(&(_iter))); \
- \
- bch2_trans_iter_exit((_trans), &(_iter)); \
- _ret3; \
-})
-
-#define for_each_btree_key_commit(_trans, _iter, _btree_id, \
- _start, _iter_flags, _k, \
- _disk_res, _journal_seq, _commit_flags,\
- _do) \
- for_each_btree_key(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
- (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
- (_journal_seq), (_commit_flags)))
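-
-/*
- * Illustrative sketch only: iterate and commit after each key;
- * example_fix_key() is a hypothetical per-key repair function.
- */
-#if 0
-	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN,
-					BTREE_ITER_prefetch, k,
-					NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-		example_fix_key(trans, &iter, k));
-#endif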
-
-#define for_each_btree_key_reverse_commit(_trans, _iter, _btree_id, \
- _start, _iter_flags, _k, \
- _disk_res, _journal_seq, _commit_flags,\
- _do) \
- for_each_btree_key_reverse(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
- (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
- (_journal_seq), (_commit_flags)))
-
-#define for_each_btree_key_max_commit(_trans, _iter, _btree_id, \
- _start, _end, _iter_flags, _k, \
- _disk_res, _journal_seq, _commit_flags,\
- _do) \
- for_each_btree_key_max(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
- (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
- (_journal_seq), (_commit_flags)))
-
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
-
-#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \
- _start, _end, _flags, _k, _ret) \
- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags),\
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
-
-#define for_each_btree_key_max_continue_norestart(_iter, _end, _flags, _k, _ret)\
- for (; \
- (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
-
-#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
- _start, _flags, _k, _ret) \
- for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\
- SPOS_MAX, _flags, _k, _ret)
-
-#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \
- _start, _flags, _k, _ret) \
- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_rewind(&(_iter)))
-
-#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
- for_each_btree_key_max_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
-
-/*
- * This should not be used in a fastpath without first trying _do in
- * nonblocking mode - it will cause excessive transaction restarts and
- * potentially livelock:
- */
-#define drop_locks_do(_trans, _do) \
-({ \
- bch2_trans_unlock(_trans); \
- (_do) ?: bch2_trans_relock(_trans); \
-})
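-
-/*
- * Illustrative sketch only: drop btree locks around a blocking call and
- * relock afterwards; example_blocking_op() is hypothetical.
- */
-#if 0
-	ret = drop_locks_do(trans, example_blocking_op(c));
-#endif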
-
-#define allocate_dropping_locks_errcode(_trans, _do) \
-({ \
- gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
- int _ret = _do; \
- \
- if (bch2_err_matches(_ret, ENOMEM)) { \
- _gfp = GFP_KERNEL; \
- _ret = drop_locks_do(_trans, _do); \
- } \
- _ret; \
-})
-
-#define allocate_dropping_locks(_trans, _ret, _do) \
-({ \
- gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
- typeof(_do) _p = _do; \
- \
- _ret = 0; \
- if (unlikely(!_p)) { \
- _gfp = GFP_KERNEL; \
- _ret = drop_locks_do(_trans, ((_p = _do), 0)); \
- } \
- _p; \
-})
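-
-/*
- * Illustrative sketch only: try a nonblocking allocation first, then
- * retry with GFP_KERNEL after dropping locks. _gfp is supplied by the
- * macro; struct example_buf is hypothetical.
- */
-#if 0
-	int ret = 0;
-	struct example_buf *buf =
-		allocate_dropping_locks(trans, ret, kzalloc(sizeof(*buf), _gfp));
-	if (!buf && !ret)
-		ret = -ENOMEM;
-#endif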
-
-#define bch2_trans_run(_c, _do) \
-({ \
- struct btree_trans *trans = bch2_trans_get(_c); \
- int _ret = (_do); \
- bch2_trans_put(trans); \
- _ret; \
-})
-
-#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do))
-
-struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
-void bch2_trans_put(struct btree_trans *);
-
-bool bch2_current_has_btree_trans(struct bch_fs *);
-
-extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
-unsigned bch2_trans_get_fn_idx(const char *);
-
-#define bch2_trans_get(_c) \
-({ \
- static unsigned trans_fn_idx; \
- \
- if (unlikely(!trans_fn_idx)) \
- trans_fn_idx = bch2_trans_get_fn_idx(__func__); \
- __bch2_trans_get(_c, trans_fn_idx); \
-})
-
-void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
-
-void bch2_fs_btree_iter_exit(struct bch_fs *);
-void bch2_fs_btree_iter_init_early(struct bch_fs *);
-int bch2_fs_btree_iter_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_ITER_H */
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
deleted file mode 100644
index 6d25e3f85ce8..000000000000
--- a/fs/bcachefs/btree_journal_iter.c
+++ /dev/null
@@ -1,806 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "bset.h"
-#include "btree_cache.h"
-#include "btree_journal_iter.h"
-#include "journal_io.h"
-
-#include <linux/sort.h>
-
-/*
- * For managing keys we read from the journal: until journal replay finishes,
- * normal btree lookups need to be able to find and return keys from the
- * journal where they overwrite what's in the btree, so we have a special
- * iterator and operations for the regular btree iter code to use:
- */
-
-static inline size_t pos_to_idx(struct journal_keys *keys, size_t pos)
-{
- size_t gap_size = keys->size - keys->nr;
-
- BUG_ON(pos >= keys->gap && pos < keys->gap + gap_size);
-
- if (pos >= keys->gap)
- pos -= gap_size;
- return pos;
-}
-
-static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
-{
- size_t gap_size = keys->size - keys->nr;
-
- if (idx >= keys->gap)
- idx += gap_size;
- return idx;
-}
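-
-/*
- * Worked example (illustrative numbers): with size 8, nr 6 and gap 2 the
- * entries occupy positions 0-1 and 4-7, so idx 2 maps to pos 4 and pos 5
- * maps back to idx 3.
- */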
-
-static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
-{
- return keys->data + idx_to_pos(keys, idx);
-}
-
-static size_t __bch2_journal_key_search(struct journal_keys *keys,
- enum btree_id id, unsigned level,
- struct bpos pos)
-{
- size_t l = 0, r = keys->nr, m;
-
- while (l < r) {
- m = l + ((r - l) >> 1);
- if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
- l = m + 1;
- else
- r = m;
- }
-
- BUG_ON(l < keys->nr &&
- __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
-
- BUG_ON(l &&
- __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
-
- return l;
-}
-
-static size_t bch2_journal_key_search(struct journal_keys *keys,
- enum btree_id id, unsigned level,
- struct bpos pos)
-{
- return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
-}
-
-/* Returns first non-overwritten key >= search key: */
-struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id,
- unsigned level, struct bpos pos,
- struct bpos end_pos, size_t *idx)
-{
- struct journal_keys *keys = &c->journal_keys;
- unsigned iters = 0;
- struct journal_key *k;
-
- BUG_ON(*idx > keys->nr);
-search:
- if (!*idx)
- *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
-
- while (*idx &&
- __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
- --(*idx);
- iters++;
- if (iters == 10) {
- *idx = 0;
- goto search;
- }
- }
-
- struct bkey_i *ret = NULL;
- rcu_read_lock(); /* for overwritten_ranges */
-
- while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
- if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
- break;
-
- if (k->overwritten) {
- if (k->overwritten_range)
- *idx = rcu_dereference(k->overwritten_range)->end;
- else
- *idx += 1;
- continue;
- }
-
- if (__journal_key_cmp(btree_id, level, pos, k) <= 0) {
- ret = k->k;
- break;
- }
-
- (*idx)++;
- iters++;
- if (iters == 10) {
- *idx = 0;
- rcu_read_unlock();
- goto search;
- }
- }
-
- rcu_read_unlock();
- return ret;
-}
-
-struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id,
- unsigned level, struct bpos pos,
- struct bpos end_pos, size_t *idx)
-{
- struct journal_keys *keys = &c->journal_keys;
- unsigned iters = 0;
- struct journal_key *k;
-
- BUG_ON(*idx > keys->nr);
-search:
- if (!*idx)
- *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
-
- while (*idx &&
- __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
- (*idx)++;
- iters++;
- if (iters == 10) {
- *idx = 0;
- goto search;
- }
- }
-
- struct bkey_i *ret = NULL;
- rcu_read_lock(); /* for overwritten_ranges */
-
- while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
- if (__journal_key_cmp(btree_id, level, end_pos, k) > 0)
- break;
-
- if (k->overwritten) {
- if (k->overwritten_range)
- *idx = rcu_dereference(k->overwritten_range)->start - 1;
- else
- *idx -= 1;
- continue;
- }
-
- if (__journal_key_cmp(btree_id, level, pos, k) >= 0) {
- ret = k->k;
- break;
- }
-
- --(*idx);
- iters++;
- if (iters == 10) {
- *idx = 0;
- goto search;
- }
- }
-
- rcu_read_unlock();
- return ret;
-}
-
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
- unsigned level, struct bpos pos)
-{
- size_t idx = 0;
-
- return bch2_journal_keys_peek_max(c, btree_id, level, pos, pos, &idx);
-}
-
-static void journal_iter_verify(struct journal_iter *iter)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct journal_keys *keys = iter->keys;
- size_t gap_size = keys->size - keys->nr;
-
- BUG_ON(iter->idx >= keys->gap &&
- iter->idx < keys->gap + gap_size);
-
- if (iter->idx < keys->size) {
- struct journal_key *k = keys->data + iter->idx;
-
- int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k);
- BUG_ON(cmp > 0);
- }
-#endif
-}
-
-static void journal_iters_fix(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
- /* The key we just inserted is immediately before the gap: */
- size_t gap_end = keys->gap + (keys->size - keys->nr);
- struct journal_key *new_key = &keys->data[keys->gap - 1];
- struct journal_iter *iter;
-
- /*
- * If an iterator points one after the key we just inserted, decrement
- * the iterator so it points at the key we just inserted - if the
- * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
- * handle that:
- */
- list_for_each_entry(iter, &c->journal_iters, list) {
- journal_iter_verify(iter);
- if (iter->idx == gap_end &&
- new_key->btree_id == iter->btree_id &&
- new_key->level == iter->level)
- iter->idx = keys->gap - 1;
- journal_iter_verify(iter);
- }
-}
-
-static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
-{
- struct journal_keys *keys = &c->journal_keys;
- struct journal_iter *iter;
- size_t gap_size = keys->size - keys->nr;
-
- list_for_each_entry(iter, &c->journal_iters, list) {
- if (iter->idx > old_gap)
- iter->idx -= gap_size;
- if (iter->idx >= new_gap)
- iter->idx += gap_size;
- }
-}
-
-int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
- unsigned level, struct bkey_i *k)
-{
- struct journal_key n = {
- .btree_id = id,
- .level = level,
- .k = k,
- .allocated = true,
- /*
- * Ensure these keys are done last by journal replay, to unblock
- * journal reclaim:
- */
- .journal_seq = U64_MAX,
- };
- struct journal_keys *keys = &c->journal_keys;
- size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
-
- BUG_ON(test_bit(BCH_FS_rw, &c->flags));
-
- if (idx < keys->size &&
- journal_key_cmp(&n, &keys->data[idx]) == 0) {
- if (keys->data[idx].allocated)
- kfree(keys->data[idx].k);
- keys->data[idx] = n;
- return 0;
- }
-
- if (idx > keys->gap)
- idx -= keys->size - keys->nr;
-
- size_t old_gap = keys->gap;
-
- if (keys->nr == keys->size) {
- journal_iters_move_gap(c, old_gap, keys->size);
- old_gap = keys->size;
-
- struct journal_keys new_keys = {
- .nr = keys->nr,
- .size = max_t(size_t, keys->size, 8) * 2,
- };
-
- new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL);
- if (!new_keys.data) {
- bch_err(c, "%s: error allocating new key array (size %zu)",
- __func__, new_keys.size);
- return -BCH_ERR_ENOMEM_journal_key_insert;
- }
-
- /* Since @keys was full, there was no gap: */
- memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr);
- kvfree(keys->data);
- keys->data = new_keys.data;
- keys->nr = new_keys.nr;
- keys->size = new_keys.size;
-
- /* And now the gap is at the end: */
- keys->gap = keys->nr;
- }
-
- journal_iters_move_gap(c, old_gap, idx);
-
- move_gap(keys, idx);
-
- keys->nr++;
- keys->data[keys->gap++] = n;
-
- journal_iters_fix(c);
-
- return 0;
-}
-
-/*
- * Can only be used from the recovery thread while we're still RO - can't be
- * used once we've got RW, as journal_keys is at that point used by multiple
- * threads:
- */
-int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
- unsigned level, struct bkey_i *k)
-{
- struct bkey_i *n;
- int ret;
-
- n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
- if (!n)
- return -BCH_ERR_ENOMEM_journal_key_insert;
-
- bkey_copy(n, k);
- ret = bch2_journal_key_insert_take(c, id, level, n);
- if (ret)
- kfree(n);
- return ret;
-}
-
-int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
- unsigned level, struct bpos pos)
-{
- struct bkey_i whiteout;
-
- bkey_init(&whiteout.k);
- whiteout.k.p = pos;
-
- return bch2_journal_key_insert(c, id, level, &whiteout);
-}
-
-bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
- unsigned level, struct bpos pos)
-{
- struct journal_keys *keys = &trans->c->journal_keys;
- size_t idx = bch2_journal_key_search(keys, btree, level, pos);
-
- if (!trans->journal_replay_not_finished)
- return false;
-
- return (idx < keys->size &&
- keys->data[idx].btree_id == btree &&
- keys->data[idx].level == level &&
- bpos_eq(keys->data[idx].k->k.p, pos) &&
- bkey_deleted(&keys->data[idx].k->k));
-}
-
-static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos)
-{
- struct journal_key *k = keys->data + pos;
- size_t idx = pos_to_idx(keys, pos);
-
- k->overwritten = true;
-
- struct journal_key *prev = idx > 0 ? keys->data + idx_to_pos(keys, idx - 1) : NULL;
- struct journal_key *next = idx + 1 < keys->nr ? keys->data + idx_to_pos(keys, idx + 1) : NULL;
-
- bool prev_overwritten = prev && prev->overwritten;
- bool next_overwritten = next && next->overwritten;
-
- struct journal_key_range_overwritten *prev_range =
- prev_overwritten ? prev->overwritten_range : NULL;
- struct journal_key_range_overwritten *next_range =
- next_overwritten ? next->overwritten_range : NULL;
-
- BUG_ON(prev_range && prev_range->end != idx);
- BUG_ON(next_range && next_range->start != idx + 1);
-
- if (prev_range && next_range) {
- prev_range->end = next_range->end;
-
- keys->data[pos].overwritten_range = prev_range;
- for (size_t i = next_range->start; i < next_range->end; i++) {
- struct journal_key *ip = keys->data + idx_to_pos(keys, i);
- BUG_ON(ip->overwritten_range != next_range);
- ip->overwritten_range = prev_range;
- }
-
- kfree_rcu_mightsleep(next_range);
- } else if (prev_range) {
- prev_range->end++;
- k->overwritten_range = prev_range;
- if (next_overwritten) {
- prev_range->end++;
- next->overwritten_range = prev_range;
- }
- } else if (next_range) {
- next_range->start--;
- k->overwritten_range = next_range;
- if (prev_overwritten) {
- next_range->start--;
- prev->overwritten_range = next_range;
- }
- } else if (prev_overwritten || next_overwritten) {
- struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL);
- if (!r)
- return;
-
- r->start = idx - (size_t) prev_overwritten;
- r->end = idx + 1 + (size_t) next_overwritten;
-
- rcu_assign_pointer(k->overwritten_range, r);
- if (prev_overwritten)
- prev->overwritten_range = r;
- if (next_overwritten)
- next->overwritten_range = r;
- }
-}
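-
-/*
- * Worked example (illustrative): with overwritten ranges [3,6) and [7,10),
- * marking idx 6 overwritten merges them into a single range [3,10), and
- * every key in the merged span is pointed at the surviving struct.
- */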
-
-void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
- unsigned level, struct bpos pos)
-{
- struct journal_keys *keys = &c->journal_keys;
- size_t idx = bch2_journal_key_search(keys, btree, level, pos);
-
- if (idx < keys->size &&
- keys->data[idx].btree_id == btree &&
- keys->data[idx].level == level &&
- bpos_eq(keys->data[idx].k->k.p, pos) &&
- !keys->data[idx].overwritten) {
- mutex_lock(&keys->overwrite_lock);
- __bch2_journal_key_overwritten(keys, idx);
- mutex_unlock(&keys->overwrite_lock);
- }
-}
-
-static void bch2_journal_iter_advance(struct journal_iter *iter)
-{
- if (iter->idx < iter->keys->size) {
- iter->idx++;
- if (iter->idx == iter->keys->gap)
- iter->idx += iter->keys->size - iter->keys->nr;
- }
-}
-
-static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
-{
- struct bkey_s_c ret = bkey_s_c_null;
-
- journal_iter_verify(iter);
-
- rcu_read_lock();
- while (iter->idx < iter->keys->size) {
- struct journal_key *k = iter->keys->data + iter->idx;
-
- int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k);
- if (cmp < 0)
- break;
- BUG_ON(cmp);
-
- if (!k->overwritten) {
- ret = bkey_i_to_s_c(k->k);
- break;
- }
-
- if (k->overwritten_range)
- iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end);
- else
- bch2_journal_iter_advance(iter);
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-static void bch2_journal_iter_exit(struct journal_iter *iter)
-{
- list_del(&iter->list);
-}
-
-static void bch2_journal_iter_init(struct bch_fs *c,
- struct journal_iter *iter,
- enum btree_id id, unsigned level,
- struct bpos pos)
-{
- iter->btree_id = id;
- iter->level = level;
- iter->keys = &c->journal_keys;
- iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
-
- journal_iter_verify(iter);
-}
-
-static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
-{
- return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
- iter->b, &iter->unpacked);
-}
-
-static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
-{
- bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
-}
-
-void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
-{
- if (bpos_eq(iter->pos, SPOS_MAX))
- iter->at_end = true;
- else
- iter->pos = bpos_successor(iter->pos);
-}
-
-static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
-{
- struct btree_and_journal_iter iter = *_iter;
- struct bch_fs *c = iter.trans->c;
- unsigned level = iter.journal.level;
- struct bkey_buf tmp;
- unsigned nr = test_bit(BCH_FS_started, &c->flags)
- ? (level > 1 ? 0 : 2)
- : (level > 1 ? 1 : 16);
-
- iter.prefetch = false;
- iter.fail_if_too_many_whiteouts = true;
- bch2_bkey_buf_init(&tmp);
-
- while (nr--) {
- bch2_btree_and_journal_iter_advance(&iter);
- struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
- if (!k.k)
- break;
-
- bch2_bkey_buf_reassemble(&tmp, c, k);
- bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
- }
-
- bch2_bkey_buf_exit(&tmp, c);
-}
-
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
-{
- struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;
- size_t iters = 0;
-
- if (iter->prefetch && iter->journal.level)
- btree_and_journal_iter_prefetch(iter);
-again:
- if (iter->at_end)
- return bkey_s_c_null;
-
- iters++;
-
- if (iters > 20 && iter->fail_if_too_many_whiteouts)
- return bkey_s_c_null;
-
- while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
- bpos_lt(btree_k.k->p, iter->pos))
- bch2_journal_iter_advance_btree(iter);
-
- if (iter->trans->journal_replay_not_finished)
- while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
- bpos_lt(journal_k.k->p, iter->pos))
- bch2_journal_iter_advance(&iter->journal);
-
- ret = journal_k.k &&
- (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
- ? journal_k
- : btree_k;
-
- if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
- ret = bkey_s_c_null;
-
- if (ret.k) {
- iter->pos = ret.k->p;
- if (bkey_deleted(ret.k)) {
- bch2_btree_and_journal_iter_advance(iter);
- goto again;
- }
- } else {
- iter->pos = SPOS_MAX;
- iter->at_end = true;
- }
-
- return ret;
-}
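
The peek above is a two-way merge of the btree node's keys and the keys still sitting in the journal, with the journal side winning ties: bpos_le() picks the journal key when both iterators land on the same position, since the journal entry is newer. A minimal sketch of just that tie-break rule, using simplified stand-in types rather than bcachefs APIs:

	/* Simplified stand-ins, not bcachefs types: */
	struct demo_key { int pos; };

	static struct demo_key *merge_pick(struct demo_key *btree_k,
					   struct demo_key *journal_k)
	{
		if (!journal_k)
			return btree_k;
		if (!btree_k)
			return journal_k;
		/* <= mirrors bpos_le(): on a tie, the journal key
		 * shadows the btree key */
		return journal_k->pos <= btree_k->pos ? journal_k : btree_k;
	}
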
-
-void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
-{
- bch2_journal_iter_exit(&iter->journal);
-}
-
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
- struct btree_and_journal_iter *iter,
- struct btree *b,
- struct btree_node_iter node_iter,
- struct bpos pos)
-{
- memset(iter, 0, sizeof(*iter));
-
- iter->trans = trans;
- iter->b = b;
- iter->node_iter = node_iter;
- iter->pos = b->data->min_key;
- iter->at_end = false;
- INIT_LIST_HEAD(&iter->journal.list);
-
- if (trans->journal_replay_not_finished) {
- bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
- if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
- list_add(&iter->journal.list, &trans->c->journal_iters);
- }
-}
-
-/*
- * This version is used by btree_gc before the filesystem has gone RW and
- * multithreaded, so it uses the journal_iters list:
- */
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
- struct btree_and_journal_iter *iter,
- struct btree *b)
-{
- struct btree_node_iter node_iter;
-
- bch2_btree_node_iter_init_from_start(&node_iter, b);
- __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
-}
-
-/* sort and dedup all keys in the journal: */
-
-/*
- * When keys compare equal, oldest compares first:
- */
-static int journal_sort_key_cmp(const void *_l, const void *_r)
-{
- const struct journal_key *l = _l;
- const struct journal_key *r = _r;
-
- return journal_key_cmp(l, r) ?:
- cmp_int(l->journal_seq, r->journal_seq) ?:
- cmp_int(l->journal_offset, r->journal_offset);
-}
-
-void bch2_journal_keys_put(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
-
- BUG_ON(atomic_read(&keys->ref) <= 0);
-
- if (!atomic_dec_and_test(&keys->ref))
- return;
-
- move_gap(keys, keys->nr);
-
- darray_for_each(*keys, i) {
- if (i->overwritten_range &&
- (i == &darray_last(*keys) ||
- i->overwritten_range != i[1].overwritten_range))
- kfree(i->overwritten_range);
-
- if (i->allocated)
- kfree(i->k);
- }
-
- kvfree(keys->data);
- keys->data = NULL;
- keys->nr = keys->gap = keys->size = 0;
-
- struct journal_replay **i;
- struct genradix_iter iter;
-
- genradix_for_each(&c->journal_entries, iter, i)
- kvfree(*i);
- genradix_free(&c->journal_entries);
-}
-
-static void __journal_keys_sort(struct journal_keys *keys)
-{
- sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);
-
- cond_resched();
-
- struct journal_key *dst = keys->data;
-
- darray_for_each(*keys, src) {
- /*
- * We don't accumulate accounting keys here because we have to
- * compare each individual accounting key against the version in
- * the btree during replay:
- */
- if (src->k->k.type != KEY_TYPE_accounting &&
- src + 1 < &darray_top(*keys) &&
- !journal_key_cmp(src, src + 1))
- continue;
-
- *dst++ = *src;
- }
-
- keys->nr = dst - keys->data;
-}
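
Because the comparator orders equal keys oldest-first by journal_seq, the dedup pass above keeps only the last - i.e. newest - copy of each run of equal keys: an entry is dropped whenever the next entry compares equal. A toy illustration of the same keep-last dedup, with plain ints standing in for struct journal_key:

	/* Keep-last dedup over a sorted array, mirroring
	 * __journal_keys_sort(): among equal elements, only the final
	 * (newest) one survives. */
	static size_t dedup_keep_last(int *v, size_t nr)
	{
		size_t dst = 0;

		for (size_t src = 0; src < nr; src++) {
			if (src + 1 < nr && v[src] == v[src + 1])
				continue;	/* a newer duplicate follows */
			v[dst++] = v[src];
		}
		return dst;
	}

	/* {1, 1, 2, 3, 3, 3} -> {1, 2, 3}, keeping the last of each run */
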
-
-int bch2_journal_keys_sort(struct bch_fs *c)
-{
- struct genradix_iter iter;
- struct journal_replay *i, **_i;
- struct journal_keys *keys = &c->journal_keys;
- size_t nr_read = 0;
-
- genradix_for_each(&c->journal_entries, iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- cond_resched();
-
- for_each_jset_key(k, entry, &i->j) {
- struct journal_key n = (struct journal_key) {
- .btree_id = entry->btree_id,
- .level = entry->level,
- .k = k,
- .journal_seq = le64_to_cpu(i->j.seq),
- .journal_offset = k->_data - i->j._data,
- };
-
- if (darray_push(keys, n)) {
- __journal_keys_sort(keys);
-
- if (keys->nr * 8 > keys->size * 7) {
- bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
- keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
- return -BCH_ERR_ENOMEM_journal_keys_sort;
- }
-
- BUG_ON(darray_push(keys, n));
- }
-
- nr_read++;
- }
- }
-
- __journal_keys_sort(keys);
- keys->gap = keys->nr;
-
- bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
- return 0;
-}
-
-void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
- unsigned level_min, unsigned level_max,
- struct bpos start, struct bpos end)
-{
- struct journal_keys *keys = &c->journal_keys;
- size_t dst = 0;
-
- move_gap(keys, keys->nr);
-
- darray_for_each(*keys, i)
- if (!(i->btree_id == btree &&
- i->level >= level_min &&
- i->level <= level_max &&
- bpos_ge(i->k->k.p, start) &&
- bpos_le(i->k->k.p, end)))
- keys->data[dst++] = *i;
- keys->nr = keys->gap = dst;
-}
-
-void bch2_journal_keys_dump(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
- struct printbuf buf = PRINTBUF;
-
- pr_info("%zu keys:", keys->nr);
-
- move_gap(keys, keys->nr);
-
- darray_for_each(*keys, i) {
- printbuf_reset(&buf);
- prt_printf(&buf, "btree=");
- bch2_btree_id_to_text(&buf, i->btree_id);
- prt_printf(&buf, " l=%u ", i->level);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
- pr_err("%s", buf.buf);
- }
- printbuf_exit(&buf);
-}
-
-void bch2_fs_journal_keys_init(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
-
- atomic_set(&keys->ref, 1);
- keys->initial_ref_held = true;
- mutex_init(&keys->overwrite_lock);
-}
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
deleted file mode 100644
index 2a3082919b8d..000000000000
--- a/fs/bcachefs/btree_journal_iter.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H
-#define _BCACHEFS_BTREE_JOURNAL_ITER_H
-
-#include "bkey.h"
-
-struct journal_iter {
- struct list_head list;
- enum btree_id btree_id;
- unsigned level;
- size_t idx;
- struct journal_keys *keys;
-};
-
-/*
- * Iterate over keys in the btree, with keys from the journal overlaid on top:
- */
-
-struct btree_and_journal_iter {
- struct btree_trans *trans;
- struct btree *b;
- struct btree_node_iter node_iter;
- struct bkey unpacked;
-
- struct journal_iter journal;
- struct bpos pos;
- bool at_end;
- bool prefetch;
- bool fail_if_too_many_whiteouts;
-};
-
-static inline int __journal_key_btree_cmp(enum btree_id l_btree_id,
- unsigned l_level,
- const struct journal_key *r)
-{
- return -cmp_int(l_level, r->level) ?:
- cmp_int(l_btree_id, r->btree_id);
-}
-
-static inline int __journal_key_cmp(enum btree_id l_btree_id,
- unsigned l_level,
- struct bpos l_pos,
- const struct journal_key *r)
-{
- return __journal_key_btree_cmp(l_btree_id, l_level, r) ?:
- bpos_cmp(l_pos, r->k->k.p);
-}
-
-static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
-{
- return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
-}
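
Note the leading minus sign in __journal_key_btree_cmp() above: the sorted key array orders by level descending first, then by btree id, then by position. A tiny demonstration of that ordering, assuming cmp_int() has its usual -1/0/1 semantics:

	/* Higher level sorts first, then btree id (demo only): */
	static int demo_cmp(int l_level, int l_btree, int r_level, int r_btree)
	{
		return -cmp_int(l_level, r_level) ?: cmp_int(l_btree, r_btree);
	}

	/* demo_cmp(2, 5, 1, 0) < 0: a level 2 key sorts before a level 1
	 * key even though its btree id is larger. */
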
-
-struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id,
- unsigned, struct bpos, struct bpos, size_t *);
-struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id,
- unsigned, struct bpos, struct bpos, size_t *);
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
- unsigned, struct bpos);
-
-int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *,
- struct btree_and_journal_iter *);
-
-int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
- unsigned, struct bkey_i *);
-int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
- unsigned, struct bkey_i *);
-int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
- unsigned, struct bpos);
-bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos);
-void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos);
-
-void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
-
-void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
- struct btree_and_journal_iter *, struct btree *,
- struct btree_node_iter, struct bpos);
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
- struct btree_and_journal_iter *, struct btree *);
-
-void bch2_journal_keys_put(struct bch_fs *);
-
-static inline void bch2_journal_keys_put_initial(struct bch_fs *c)
-{
- if (c->journal_keys.initial_ref_held)
- bch2_journal_keys_put(c);
- c->journal_keys.initial_ref_held = false;
-}
-
-int bch2_journal_keys_sort(struct bch_fs *);
-
-void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id,
- unsigned, unsigned,
- struct bpos, struct bpos);
-
-void bch2_journal_keys_dump(struct bch_fs *);
-
-void bch2_fs_journal_keys_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h
deleted file mode 100644
index 8b773823704f..000000000000
--- a/fs/bcachefs/btree_journal_iter_types.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
-#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
-
-struct journal_key_range_overwritten {
- size_t start, end;
-};
-
-struct journal_key {
- u64 journal_seq;
- u32 journal_offset;
- enum btree_id btree_id:8;
- unsigned level:8;
- bool allocated;
- bool overwritten;
- struct journal_key_range_overwritten __rcu *
- overwritten_range;
- struct bkey_i *k;
-};
-
-struct journal_keys {
- /* must match layout in darray_types.h */
- size_t nr, size;
- struct journal_key *data;
- /*
- * Gap buffer: instead of all the empty space in the array being at the
- * end of the buffer - from @nr to @size - the empty space is at @gap.
- * This means that sequential insertions are O(n) instead of O(n^2).
- */
- size_t gap;
- atomic_t ref;
- bool initial_ref_held;
- struct mutex overwrite_lock;
-};
-
-#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */
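
The gap-buffer comment above is the key to journal replay performance: because the free space sits at the most recent insertion point, a run of in-order inserts only pays a memmove when the gap has to move. A minimal generic sketch of the idea - not the darray-based bcachefs implementation - with elements stored in data[0..gap) and data[gap+free..size):

	struct gap_buf {
		int	*data;
		size_t	nr, size, gap;
	};

	static void gap_move(struct gap_buf *b, size_t new_gap)
	{
		size_t free = b->size - b->nr;

		if (new_gap < b->gap)
			memmove(b->data + new_gap + free, b->data + new_gap,
				(b->gap - new_gap) * sizeof(*b->data));
		else
			memmove(b->data + b->gap, b->data + b->gap + free,
				(new_gap - b->gap) * sizeof(*b->data));
		b->gap = new_gap;
	}

	/* Caller must ensure nr < size; repeated inserts at the same
	 * index are O(1) once the gap is in place: */
	static void gap_insert(struct gap_buf *b, size_t idx, int v)
	{
		gap_move(b, idx);	/* no-op if the gap is already here */
		b->data[b->gap++] = v;
		b->nr++;
	}
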
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
deleted file mode 100644
index edce59433375..000000000000
--- a/fs/bcachefs/btree_key_cache.c
+++ /dev/null
@@ -1,849 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "trace.h"
-
-#include <linux/sched/mm.h>
-
-static inline bool btree_uses_pcpu_readers(enum btree_id id)
-{
- return id == BTREE_ID_subvolumes;
-}
-
-static struct kmem_cache *bch2_key_cache;
-
-static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
- const void *obj)
-{
- const struct bkey_cached *ck = obj;
- const struct bkey_cached_key *key = arg->key;
-
- return ck->key.btree_id != key->btree_id ||
- !bpos_eq(ck->key.pos, key->pos);
-}
-
-static const struct rhashtable_params bch2_btree_key_cache_params = {
- .head_offset = offsetof(struct bkey_cached, hash),
- .key_offset = offsetof(struct bkey_cached, key),
- .key_len = sizeof(struct bkey_cached_key),
- .obj_cmpfn = bch2_btree_key_cache_cmp_fn,
- .automatic_shrinking = true,
-};
-
-static inline void btree_path_cached_set(struct btree_trans *trans, struct btree_path *path,
- struct bkey_cached *ck,
- enum btree_node_locked_type lock_held)
-{
- path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
- path->l[0].b = (void *) ck;
- mark_btree_node_locked(trans, path, 0, lock_held);
-}
-
-__flatten
-inline struct bkey_cached *
-bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
-{
- struct bkey_cached_key key = {
- .btree_id = btree_id,
- .pos = pos,
- };
-
- return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
- bch2_btree_key_cache_params);
-}
-
-static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
-{
- if (!six_trylock_intent(&ck->c.lock))
- return false;
-
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- six_unlock_intent(&ck->c.lock);
- return false;
- }
-
- if (!six_trylock_write(&ck->c.lock)) {
- six_unlock_intent(&ck->c.lock);
- return false;
- }
-
- return true;
-}
-
-static bool bkey_cached_evict(struct btree_key_cache *c,
- struct bkey_cached *ck)
-{
- bool ret = !rhashtable_remove_fast(&c->table, &ck->hash,
- bch2_btree_key_cache_params);
- if (ret) {
- memset(&ck->key, ~0, sizeof(ck->key));
- atomic_long_dec(&c->nr_keys);
- }
-
- return ret;
-}
-
-static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu)
-{
- struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier);
- struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu);
-
- this_cpu_dec(*c->btree_key_cache.nr_pending);
- kmem_cache_free(bch2_key_cache, ck);
-}
-
-static void bkey_cached_free(struct btree_key_cache *bc,
- struct bkey_cached *ck)
-{
- kfree(ck->k);
- ck->k = NULL;
- ck->u64s = 0;
-
- six_unlock_write(&ck->c.lock);
- six_unlock_intent(&ck->c.lock);
-
- bool pcpu_readers = ck->c.lock.readers != NULL;
- rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu);
- this_cpu_inc(*bc->nr_pending);
-}
-
-static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
-{
- gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
-
- struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp);
- if (unlikely(!ck))
- return NULL;
- ck->k = kmalloc(key_u64s * sizeof(u64), gfp);
- if (unlikely(!ck->k)) {
- kmem_cache_free(bch2_key_cache, ck);
- return NULL;
- }
- ck->u64s = key_u64s;
- return ck;
-}
-
-static struct bkey_cached *
-bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s)
-{
- struct bch_fs *c = trans->c;
- struct btree_key_cache *bc = &c->btree_key_cache;
- bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
- int ret;
-
- struct bkey_cached *ck = container_of_or_null(
- rcu_pending_dequeue(&bc->pending[pcpu_readers]),
- struct bkey_cached, rcu);
- if (ck)
- goto lock;
-
- ck = allocate_dropping_locks(trans, ret,
- __bkey_cached_alloc(key_u64s, _gfp));
- if (ret) {
- if (ck)
- kfree(ck->k);
- kmem_cache_free(bch2_key_cache, ck);
- return ERR_PTR(ret);
- }
-
- if (ck) {
- bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
- ck->c.cached = true;
- goto lock;
- }
-
- ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]),
- struct bkey_cached, rcu);
-	if (ck)
-		goto lock;
-
-	return NULL;
-lock:
- six_lock_intent(&ck->c.lock, NULL, NULL);
- six_lock_write(&ck->c.lock, NULL, NULL);
- return ck;
-}
-
-static struct bkey_cached *
-bkey_cached_reuse(struct btree_key_cache *c)
-{
- struct bucket_table *tbl;
- struct rhash_head *pos;
- struct bkey_cached *ck;
- unsigned i;
-
- rcu_read_lock();
- tbl = rht_dereference_rcu(c->table.tbl, &c->table);
- for (i = 0; i < tbl->size; i++)
- rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
- if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
- bkey_cached_lock_for_evict(ck)) {
- if (bkey_cached_evict(c, ck))
- goto out;
- six_unlock_write(&ck->c.lock);
- six_unlock_intent(&ck->c.lock);
- }
- }
- ck = NULL;
-out:
- rcu_read_unlock();
- return ck;
-}
-
-static int btree_key_cache_create(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_path *ck_path,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct btree_key_cache *bc = &c->btree_key_cache;
-
- /*
- * bch2_varint_decode can read past the end of the buffer by at
- * most 7 bytes (it won't be used):
- */
- unsigned key_u64s = k.k->u64s + 1;
-
- /*
- * Allocate some extra space so that the transaction commit path is less
- * likely to have to reallocate, since that requires a transaction
- * restart:
- */
- key_u64s = min(256U, (key_u64s * 3) / 2);
- key_u64s = roundup_pow_of_two(key_u64s);
-
- struct bkey_cached *ck = bkey_cached_alloc(trans, ck_path, key_u64s);
- int ret = PTR_ERR_OR_ZERO(ck);
- if (ret)
- return ret;
-
- if (unlikely(!ck)) {
- ck = bkey_cached_reuse(bc);
- if (unlikely(!ck)) {
- bch_err(c, "error allocating memory for key cache item, btree %s",
- bch2_btree_id_str(ck_path->btree_id));
- return -BCH_ERR_ENOMEM_btree_key_cache_create;
- }
- }
-
- ck->c.level = 0;
- ck->c.btree_id = ck_path->btree_id;
- ck->key.btree_id = ck_path->btree_id;
- ck->key.pos = ck_path->pos;
- ck->flags = 1U << BKEY_CACHED_ACCESSED;
-
- if (unlikely(key_u64s > ck->u64s)) {
- mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED);
-
- struct bkey_i *new_k = allocate_dropping_locks(trans, ret,
- kmalloc(key_u64s * sizeof(u64), _gfp));
-		if (unlikely(!new_k)) {
-			bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
-				bch2_btree_id_str(ck->key.btree_id), key_u64s);
-			ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
-			goto err;
-		}
-		if (ret) {
-			kfree(new_k);
-			goto err;
-		}
-
- kfree(ck->k);
- ck->k = new_k;
- ck->u64s = key_u64s;
- }
-
- bkey_reassemble(ck->k, k);
-
- ret = bch2_btree_node_lock_write(trans, path, &path_l(path)->b->c);
- if (unlikely(ret))
- goto err;
-
- ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params);
-
- bch2_btree_node_unlock_write(trans, path, path_l(path)->b);
-
- if (unlikely(ret)) /* raced with another fill? */
- goto err;
-
- atomic_long_inc(&bc->nr_keys);
- six_unlock_write(&ck->c.lock);
-
- enum six_lock_type lock_want = __btree_lock_want(ck_path, 0);
- if (lock_want == SIX_LOCK_read)
- six_lock_downgrade(&ck->c.lock);
- btree_path_cached_set(trans, ck_path, ck, (enum btree_node_locked_type) lock_want);
- ck_path->uptodate = BTREE_ITER_UPTODATE;
- return 0;
-err:
- bkey_cached_free(bc, ck);
- mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED);
-
- return ret;
-}
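
The key_u64s sizing above deserves a worked example: the allocation grows to 1.5x the key size (capped at 256 u64s) and is then rounded up to a power of two, so most later updates through the cache fit in place without the transaction restart a reallocation would cost. With a hypothetical 41-u64 key:

	/* k.k->u64s = 41:
	 *   key_u64s = 41 + 1                 = 42  (varint decode slack)
	 *   key_u64s = min(256, 42 * 3 / 2)   = 63  (growth room, capped)
	 *   key_u64s = roundup_pow_of_two(63) = 64 u64s = 512 bytes
	 */
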
-
-static noinline int btree_key_cache_fill(struct btree_trans *trans,
- struct btree_path *ck_path,
- unsigned flags)
-{
- if (flags & BTREE_ITER_cached_nofill) {
- ck_path->l[0].b = NULL;
- return 0;
- }
-
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos,
- BTREE_ITER_intent|
- BTREE_ITER_key_cache_fill|
- BTREE_ITER_cached_nofill);
- iter.flags &= ~BTREE_ITER_with_journal;
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- /* Recheck after btree lookup, before allocating: */
- ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? -EEXIST : 0;
- if (unlikely(ret))
- goto out;
-
- ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k);
- if (ret)
- goto err;
-
- if (trace_key_cache_fill_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bpos_to_text(&buf, ck_path->pos);
- prt_char(&buf, ' ');
- bch2_bkey_val_to_text(&buf, trans->c, k);
- trace_key_cache_fill(trans, buf.buf);
- printbuf_exit(&buf);
- }
-out:
- /* We're not likely to need this iterator again: */
- bch2_set_btree_iter_dontneed(&iter);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static inline int btree_path_traverse_cached_fast(struct btree_trans *trans,
- struct btree_path *path)
-{
- struct bch_fs *c = trans->c;
- struct bkey_cached *ck;
-retry:
- ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
- if (!ck)
- return -ENOENT;
-
- enum six_lock_type lock_want = __btree_lock_want(path, 0);
-
- int ret = btree_node_lock(trans, path, (void *) ck, 0, lock_want, _THIS_IP_);
- if (ret)
- return ret;
-
- if (ck->key.btree_id != path->btree_id ||
- !bpos_eq(ck->key.pos, path->pos)) {
- six_unlock_type(&ck->c.lock, lock_want);
- goto retry;
- }
-
- if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
- set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
-
- btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want);
- path->uptodate = BTREE_ITER_UPTODATE;
- return 0;
-}
-
-int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
- unsigned flags)
-{
- EBUG_ON(path->level);
-
- path->l[1].b = NULL;
-
- int ret;
- do {
- ret = btree_path_traverse_cached_fast(trans, path);
- if (unlikely(ret == -ENOENT))
- ret = btree_key_cache_fill(trans, path, flags);
- } while (ret == -EEXIST);
-
- if (unlikely(ret)) {
- path->uptodate = BTREE_ITER_NEED_TRAVERSE;
- if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- btree_node_unlock(trans, path, 0);
- path->l[0].b = ERR_PTR(ret);
- }
- }
- return ret;
-}
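
The loop above encodes a small lookup-or-create protocol: the fast path returns -ENOENT when the key isn't cached, the fill path returns -EEXIST when another thread created the entry between our lookup and our insert, and on -EEXIST we simply retry the fast path. The same pattern in generic form - lookup() and create() here are hypothetical stand-ins, not bcachefs functions:

	int get_or_create(struct cache *c, u64 key)
	{
		int ret;

		do {
			ret = lookup(c, key);		/* -ENOENT if absent */
			if (ret == -ENOENT)
				ret = create(c, key);	/* -EEXIST if we raced */
		} while (ret == -EEXIST);		/* lost the race: retry */

		return ret;
	}
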
-
-static int btree_key_cache_flush_pos(struct btree_trans *trans,
- struct bkey_cached_key key,
- u64 journal_seq,
- unsigned commit_flags,
- bool evict)
-{
- struct bch_fs *c = trans->c;
- struct journal *j = &c->journal;
- struct btree_iter c_iter, b_iter;
- struct bkey_cached *ck = NULL;
- int ret;
-
- bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
- BTREE_ITER_slots|
- BTREE_ITER_intent|
- BTREE_ITER_all_snapshots);
- bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
- BTREE_ITER_cached|
- BTREE_ITER_intent);
- b_iter.flags &= ~BTREE_ITER_with_key_cache;
-
- ret = bch2_btree_iter_traverse(&c_iter);
- if (ret)
- goto out;
-
- ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b;
- if (!ck)
- goto out;
-
- if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- if (evict)
- goto evict;
- goto out;
- }
-
- if (journal_seq && ck->journal.seq != journal_seq)
- goto out;
-
- trans->journal_res.seq = ck->journal.seq;
-
- /*
- * If we're at the end of the journal, we really want to free up space
- * in the journal right away - we don't want to pin that old journal
- * sequence number with a new btree node write, we want to re-journal
- * the update
- */
- if (ck->journal.seq == journal_last_seq(j))
- commit_flags |= BCH_WATERMARK_reclaim;
-
- if (ck->journal.seq != journal_last_seq(j) ||
- !test_bit(JOURNAL_space_low, &c->journal.flags))
- commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
-
- struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter);
- ret = bkey_err(btree_k);
- if (ret)
- goto err;
-
-	/* Check that we're not violating cache coherency rules: */
- BUG_ON(bkey_deleted(btree_k.k));
-
- ret = bch2_trans_update(trans, &b_iter, ck->k,
- BTREE_UPDATE_key_cache_reclaim|
- BTREE_UPDATE_internal_snapshot_node|
- BTREE_TRIGGER_norun) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc|
- commit_flags);
-err:
- bch2_fs_fatal_err_on(ret &&
- !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
- !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
- !bch2_journal_error(j), c,
- "flushing key cache: %s", bch2_err_str(ret));
- if (ret)
- goto out;
-
- bch2_journal_pin_drop(j, &ck->journal);
-
- struct btree_path *path = btree_iter_path(trans, &c_iter);
- BUG_ON(!btree_node_locked(path, 0));
-
- if (!evict) {
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
- atomic_long_dec(&c->btree_key_cache.nr_dirty);
- }
- } else {
- struct btree_path *path2;
- unsigned i;
-evict:
- trans_for_each_path(trans, path2, i)
- if (path2 != path)
- __bch2_btree_path_unlock(trans, path2);
-
- bch2_btree_node_lock_write_nofail(trans, path, &ck->c);
-
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
- atomic_long_dec(&c->btree_key_cache.nr_dirty);
- }
-
- mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
- if (bkey_cached_evict(&c->btree_key_cache, ck)) {
- bkey_cached_free(&c->btree_key_cache, ck);
- } else {
- six_unlock_write(&ck->c.lock);
- six_unlock_intent(&ck->c.lock);
- }
- }
-out:
- bch2_trans_iter_exit(trans, &b_iter);
- bch2_trans_iter_exit(trans, &c_iter);
- return ret;
-}
-
-int bch2_btree_key_cache_journal_flush(struct journal *j,
- struct journal_entry_pin *pin, u64 seq)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bkey_cached *ck =
- container_of(pin, struct bkey_cached, journal);
- struct bkey_cached_key key;
- struct btree_trans *trans = bch2_trans_get(c);
- int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
- int ret = 0;
-
- btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
- key = ck->key;
-
- if (ck->journal.seq != seq ||
- !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- six_unlock_read(&ck->c.lock);
- goto unlock;
- }
-
- if (ck->seq != seq) {
- bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
- bch2_btree_key_cache_journal_flush);
- six_unlock_read(&ck->c.lock);
- goto unlock;
- }
- six_unlock_read(&ck->c.lock);
-
- ret = lockrestart_do(trans,
- btree_key_cache_flush_pos(trans, key, seq,
- BCH_TRANS_COMMIT_journal_reclaim, false));
-unlock:
- srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
-
- bch2_trans_put(trans);
- return ret;
-}
-
-bool bch2_btree_insert_key_cached(struct btree_trans *trans,
- unsigned flags,
- struct btree_insert_entry *insert_entry)
-{
- struct bch_fs *c = trans->c;
- struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b;
- struct bkey_i *insert = insert_entry->k;
- bool kick_reclaim = false;
-
- BUG_ON(insert->k.u64s > ck->u64s);
-
- bkey_copy(ck->k, insert);
-
- if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
- set_bit(BKEY_CACHED_DIRTY, &ck->flags);
- atomic_long_inc(&c->btree_key_cache.nr_dirty);
-
- if (bch2_nr_btree_keys_need_flush(c))
- kick_reclaim = true;
- }
-
- /*
- * To minimize lock contention, we only add the journal pin here and
- * defer pin updates to the flush callback via ->seq. Be careful not to
- * update ->seq on nojournal commits because we don't want to update the
- * pin to a seq that doesn't include journal updates on disk. Otherwise
- * we risk losing the update after a crash.
- *
- * The only exception is if the pin is not active in the first place. We
- * have to add the pin because journal reclaim drives key cache
- * flushing. The flush callback will not proceed unless ->seq matches
- * the latest pin, so make sure it starts with a consistent value.
- */
- if (!(insert_entry->flags & BTREE_UPDATE_nojournal) ||
- !journal_pin_active(&ck->journal)) {
- ck->seq = trans->journal_res.seq;
- }
- bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
- &ck->journal, bch2_btree_key_cache_journal_flush);
-
- if (kick_reclaim)
- journal_reclaim_kick(&c->journal);
- return true;
-}
-
-void bch2_btree_key_cache_drop(struct btree_trans *trans,
- struct btree_path *path)
-{
- struct bch_fs *c = trans->c;
- struct btree_key_cache *bc = &c->btree_key_cache;
- struct bkey_cached *ck = (void *) path->l[0].b;
-
- /*
- * We just did an update to the btree, bypassing the key cache: the key
- * cache key is now stale and must be dropped, even if dirty:
- */
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
- atomic_long_dec(&c->btree_key_cache.nr_dirty);
- bch2_journal_pin_drop(&c->journal, &ck->journal);
- }
-
- bkey_cached_evict(bc, ck);
- bkey_cached_free(bc, ck);
-
- mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
-
- struct btree_path *path2;
- unsigned i;
- trans_for_each_path(trans, path2, i)
- if (path2->l[0].b == (void *) ck) {
- __bch2_btree_path_unlock(trans, path2);
- path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop);
- path2->should_be_locked = false;
- btree_path_set_dirty(path2, BTREE_ITER_NEED_TRAVERSE);
- }
-
- bch2_trans_verify_locks(trans);
-}
-
-static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- struct bch_fs *c = shrink->private_data;
- struct btree_key_cache *bc = &c->btree_key_cache;
- struct bucket_table *tbl;
- struct bkey_cached *ck;
- size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
- unsigned iter, start;
- int srcu_idx;
-
- srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
- rcu_read_lock();
-
- tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
-
- /*
- * Scanning is expensive while a rehash is in progress - most elements
-	 * will be on the new hashtable.
- *
- * A rehash could still start while we're scanning - that's ok, we'll
- * still see most elements.
- */
- if (unlikely(tbl->nest)) {
- rcu_read_unlock();
- srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
- return SHRINK_STOP;
- }
-
- iter = bc->shrink_iter;
- if (iter >= tbl->size)
- iter = 0;
- start = iter;
-
- do {
- struct rhash_head *pos, *next;
-
- pos = rht_ptr_rcu(&tbl->buckets[iter]);
-
- while (!rht_is_a_nulls(pos)) {
- next = rht_dereference_bucket_rcu(pos->next, tbl, iter);
- ck = container_of(pos, struct bkey_cached, hash);
-
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- bc->skipped_dirty++;
- } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) {
- clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
- bc->skipped_accessed++;
- } else if (!bkey_cached_lock_for_evict(ck)) {
- bc->skipped_lock_fail++;
- } else if (bkey_cached_evict(bc, ck)) {
- bkey_cached_free(bc, ck);
- bc->freed++;
- freed++;
- } else {
- six_unlock_write(&ck->c.lock);
- six_unlock_intent(&ck->c.lock);
- }
-
- scanned++;
- if (scanned >= nr)
- goto out;
-
- pos = next;
- }
-
- iter++;
- if (iter >= tbl->size)
- iter = 0;
- } while (scanned < nr && iter != start);
-out:
- bc->shrink_iter = iter;
-
- rcu_read_unlock();
- srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
-
- return freed;
-}
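
The scan above is a classic clock / second-chance sweep: bc->shrink_iter is the clock hand, and an entry's ACCESSED bit buys it one pass - the first visit clears the bit, and the entry is only evicted on a later visit if nothing has set the bit again in between. A stripped-down version of just the policy, with a simplified stand-in entry type:

	struct entry { bool accessed, dirty, live; };

	/* Second-chance sweep over a fixed array; returns the number of
	 * entries freed. */
	static unsigned clock_sweep(struct entry *tbl, unsigned size,
				    unsigned *hand, unsigned nr_to_scan)
	{
		unsigned freed = 0;

		while (nr_to_scan--) {
			struct entry *e = &tbl[*hand];

			*hand = (*hand + 1) % size;
			if (!e->live || e->dirty)
				continue;		/* dirty: must be flushed first */
			if (e->accessed)
				e->accessed = false;	/* second chance */
			else {
				e->live = false;	/* evict */
				freed++;
			}
		}
		return freed;
	}
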
-
-static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- struct bch_fs *c = shrink->private_data;
- struct btree_key_cache *bc = &c->btree_key_cache;
- long nr = atomic_long_read(&bc->nr_keys) -
- atomic_long_read(&bc->nr_dirty);
-
- /*
- * Avoid hammering our shrinker too much if it's nearly empty - the
-	 * shrinker code doesn't take into account how big our cache is; if it's
-	 * mostly empty but the system is under memory pressure, it causes nasty
- * lock contention:
- */
- nr -= 128;
-
- return max(0L, nr);
-}
-
-void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
-{
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
- struct bucket_table *tbl;
- struct bkey_cached *ck;
- struct rhash_head *pos;
- LIST_HEAD(items);
- unsigned i;
-
- shrinker_free(bc->shrink);
-
- /*
- * The loop is needed to guard against racing with rehash:
- */
- while (atomic_long_read(&bc->nr_keys)) {
- rcu_read_lock();
- tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
- if (tbl) {
- if (tbl->nest) {
- /* wait for in progress rehash */
- rcu_read_unlock();
- mutex_lock(&bc->table.mutex);
- mutex_unlock(&bc->table.mutex);
- continue;
- }
- for (i = 0; i < tbl->size; i++)
- while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) {
- ck = container_of(pos, struct bkey_cached, hash);
- BUG_ON(!bkey_cached_evict(bc, ck));
- kfree(ck->k);
- kmem_cache_free(bch2_key_cache, ck);
- }
- }
- rcu_read_unlock();
- }
-
- if (atomic_long_read(&bc->nr_dirty) &&
- !bch2_journal_error(&c->journal) &&
- test_bit(BCH_FS_was_rw, &c->flags))
- panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
- atomic_long_read(&bc->nr_dirty));
-
- if (atomic_long_read(&bc->nr_keys))
- panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
- atomic_long_read(&bc->nr_keys));
-
- if (bc->table_init_done)
- rhashtable_destroy(&bc->table);
-
- rcu_pending_exit(&bc->pending[0]);
- rcu_pending_exit(&bc->pending[1]);
-
- free_percpu(bc->nr_pending);
-}
-
-void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
-{
-}
-
-int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
-{
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
- struct shrinker *shrink;
-
- bc->nr_pending = alloc_percpu(size_t);
- if (!bc->nr_pending)
- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
-
- if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) ||
- rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free))
- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
-
- if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
-
- bc->table_init_done = true;
-
- shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
- if (!shrink)
- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
- bc->shrink = shrink;
- shrink->count_objects = bch2_btree_key_cache_count;
- shrink->scan_objects = bch2_btree_key_cache_scan;
- shrink->batch = 1 << 14;
- shrink->seeks = 0;
- shrink->private_data = c;
- shrinker_register(shrink);
- return 0;
-}
-
-void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
-{
- printbuf_tabstop_push(out, 24);
- printbuf_tabstop_push(out, 12);
-
- prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys));
- prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty));
- prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size);
- prt_newline(out);
- prt_printf(out, "shrinker:\n");
- prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free);
- prt_printf(out, "freed:\t%lu\r\n", bc->freed);
- prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty);
- prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed);
- prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail);
- prt_newline(out);
- prt_printf(out, "pending:\t%zu\r\n", per_cpu_sum(bc->nr_pending));
-}
-
-void bch2_btree_key_cache_exit(void)
-{
- kmem_cache_destroy(bch2_key_cache);
-}
-
-int __init bch2_btree_key_cache_init(void)
-{
- bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
- if (!bch2_key_cache)
- return -ENOMEM;
-
- return 0;
-}
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
deleted file mode 100644
index 51d6289b8dee..000000000000
--- a/fs/bcachefs/btree_key_cache.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_KEY_CACHE_H
-#define _BCACHEFS_BTREE_KEY_CACHE_H
-
-static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
-{
- size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
- size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
- size_t max_dirty = 1024 + nr_keys / 2;
-
- return max_t(ssize_t, 0, nr_dirty - max_dirty);
-}
-
-static inline ssize_t __bch2_btree_key_cache_must_wait(struct bch_fs *c)
-{
- size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
- size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
- size_t max_dirty = 4096 + (nr_keys * 3) / 4;
-
- return nr_dirty - max_dirty;
-}
-
-static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
-{
- return __bch2_btree_key_cache_must_wait(c) > 0;
-}
-
-static inline bool bch2_btree_key_cache_wait_done(struct bch_fs *c)
-{
- size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
- size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
- size_t max_dirty = 2048 + (nr_keys * 5) / 8;
-
- return nr_dirty <= max_dirty;
-}
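
The three thresholds above form a hysteresis band around the dirty-key count; plugging in numbers makes the ordering clear. With a hypothetical nr_keys = 100000:

	/* nr_keys = 100000:
	 *   need_flush when nr_dirty >  1024 + nr_keys / 2     = 51024
	 *   must_wait  when nr_dirty >  4096 + nr_keys * 3 / 4 = 79096
	 *   wait_done  when nr_dirty <= 2048 + nr_keys * 5 / 8 = 64548
	 *
	 * So flushing starts well before writers are throttled, and a
	 * throttled writer resumes (64548) well below the stall point
	 * (79096), preventing ping-ponging around a single threshold.
	 */
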
-
-int bch2_btree_key_cache_journal_flush(struct journal *,
- struct journal_entry_pin *, u64);
-
-struct bkey_cached *
-bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
-
-int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *,
- unsigned);
-
-bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned,
- struct btree_insert_entry *);
-void bch2_btree_key_cache_drop(struct btree_trans *,
- struct btree_path *);
-
-void bch2_fs_btree_key_cache_exit(struct btree_key_cache *);
-void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *);
-int bch2_fs_btree_key_cache_init(struct btree_key_cache *);
-
-void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *);
-
-void bch2_btree_key_cache_exit(void);
-int __init bch2_btree_key_cache_init(void);
-
-#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */
diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h
deleted file mode 100644
index 722f1ed10551..000000000000
--- a/fs/bcachefs/btree_key_cache_types.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
-#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
-
-#include "rcu_pending.h"
-
-struct btree_key_cache {
- struct rhashtable table;
- bool table_init_done;
-
- struct shrinker *shrink;
- unsigned shrink_iter;
-
- /* 0: non pcpu reader locks, 1: pcpu reader locks */
- struct rcu_pending pending[2];
- size_t __percpu *nr_pending;
-
- atomic_long_t nr_keys;
- atomic_long_t nr_dirty;
-
- /* shrinker stats */
- unsigned long requested_to_free;
- unsigned long freed;
- unsigned long skipped_dirty;
- unsigned long skipped_accessed;
- unsigned long skipped_lock_fail;
-};
-
-struct bkey_cached_key {
- u32 btree_id;
- struct bpos pos;
-} __packed __aligned(4);
-
-#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
deleted file mode 100644
index 94eb2b73a843..000000000000
--- a/fs/bcachefs/btree_locking.c
+++ /dev/null
@@ -1,902 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_locking.h"
-#include "btree_types.h"
-
-static struct lock_class_key bch2_btree_node_lock_key;
-
-void bch2_btree_lock_init(struct btree_bkey_cached_common *b,
- enum six_lock_init_flags flags,
- gfp_t gfp)
-{
- __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags, gfp);
- lockdep_set_notrack_class(&b->lock);
-}
-
-/* Btree node locking: */
-
-struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
- struct btree_path *skip,
- struct btree_bkey_cached_common *b,
- unsigned level)
-{
- struct btree_path *path;
- struct six_lock_count ret;
- unsigned i;
-
- memset(&ret, 0, sizeof(ret));
-
- if (IS_ERR_OR_NULL(b))
- return ret;
-
- trans_for_each_path(trans, path, i)
- if (path != skip && &path->l[level].b->c == b) {
- int t = btree_node_locked_type(path, level);
-
- if (t != BTREE_NODE_UNLOCKED)
- ret.n[t]++;
- }
-
- return ret;
-}
-
-/* unlock */
-
-void bch2_btree_node_unlock_write(struct btree_trans *trans,
- struct btree_path *path, struct btree *b)
-{
- bch2_btree_node_unlock_write_inlined(trans, path, b);
-}
-
-/* lock */
-
-/*
- * @trans wants to lock @b with type @type
- */
-struct trans_waiting_for_lock {
- struct btree_trans *trans;
- struct btree_bkey_cached_common *node_want;
- enum six_lock_type lock_want;
-
-	/* for iterating over held locks: */
- u8 path_idx;
- u8 level;
- u64 lock_start_time;
-};
-
-struct lock_graph {
- struct trans_waiting_for_lock g[8];
- unsigned nr;
-};
-
-static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
-{
- struct trans_waiting_for_lock *i;
-
- prt_printf(out, "Found lock cycle (%u entries):\n", g->nr);
-
- for (i = g->g; i < g->g + g->nr; i++) {
- struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
- if (!task)
- continue;
-
- bch2_btree_trans_to_text(out, i->trans);
- bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT);
- }
-}
-
-static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
-{
- struct trans_waiting_for_lock *i;
-
- for (i = g->g; i != g->g + g->nr; i++) {
- struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
- if (i != g->g)
- prt_str(out, "<- ");
- prt_printf(out, "%u ", task ? task->pid : 0);
- }
- prt_newline(out);
-}
-
-static void lock_graph_up(struct lock_graph *g)
-{
- closure_put(&g->g[--g->nr].trans->ref);
-}
-
-static noinline void lock_graph_pop_all(struct lock_graph *g)
-{
- while (g->nr)
- lock_graph_up(g);
-}
-
-static noinline void lock_graph_pop_from(struct lock_graph *g, struct trans_waiting_for_lock *i)
-{
- while (g->g + g->nr > i)
- lock_graph_up(g);
-}
-
-static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
-{
- g->g[g->nr++] = (struct trans_waiting_for_lock) {
- .trans = trans,
- .node_want = trans->locking,
- .lock_want = trans->locking_wait.lock_want,
- };
-}
-
-static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
-{
- closure_get(&trans->ref);
- __lock_graph_down(g, trans);
-}
-
-static bool lock_graph_remove_non_waiters(struct lock_graph *g,
- struct trans_waiting_for_lock *from)
-{
- struct trans_waiting_for_lock *i;
-
- if (from->trans->locking != from->node_want) {
- lock_graph_pop_from(g, from);
- return true;
- }
-
- for (i = from + 1; i < g->g + g->nr; i++)
- if (i->trans->locking != i->node_want ||
- i->trans->locking_wait.start_time != i[-1].lock_start_time) {
- lock_graph_pop_from(g, i);
- return true;
- }
-
- return false;
-}
-
-static void trace_would_deadlock(struct lock_graph *g, struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
-
- count_event(c, trans_restart_would_deadlock);
-
- if (trace_trans_restart_would_deadlock_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- buf.atomic++;
- print_cycle(&buf, g);
-
- trace_trans_restart_would_deadlock(trans, buf.buf);
- printbuf_exit(&buf);
- }
-}
-
-static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
-{
- if (i == g->g) {
- trace_would_deadlock(g, i->trans);
- return btree_trans_restart_foreign_task(i->trans,
- BCH_ERR_transaction_restart_would_deadlock,
- _THIS_IP_);
- } else {
- i->trans->lock_must_abort = true;
- wake_up_process(i->trans->locking_wait.task);
- return 0;
- }
-}
-
-static int btree_trans_abort_preference(struct btree_trans *trans)
-{
- if (trans->lock_may_not_fail)
- return 0;
- if (trans->locking_wait.lock_want == SIX_LOCK_write)
- return 1;
- if (!trans->in_traverse_all)
- return 2;
- return 3;
-}
-
-static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle,
- struct trans_waiting_for_lock *from)
-{
- struct trans_waiting_for_lock *i, *abort = NULL;
- unsigned best = 0, pref;
- int ret;
-
- if (lock_graph_remove_non_waiters(g, from))
- return 0;
-
- /* Only checking, for debugfs: */
- if (cycle) {
- print_cycle(cycle, g);
- ret = -1;
- goto out;
- }
-
- for (i = from; i < g->g + g->nr; i++) {
- pref = btree_trans_abort_preference(i->trans);
- if (pref > best) {
- abort = i;
- best = pref;
- }
- }
-
- if (unlikely(!best)) {
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
-
- prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
-
- for (i = g->g; i < g->g + g->nr; i++) {
- struct btree_trans *trans = i->trans;
-
- bch2_btree_trans_to_text(&buf, trans);
-
- prt_printf(&buf, "backtrace:\n");
- printbuf_indent_add(&buf, 2);
- bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
- printbuf_indent_sub(&buf, 2);
- prt_newline(&buf);
- }
-
- bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- BUG();
- }
-
- ret = abort_lock(g, abort);
-out:
- if (ret)
- lock_graph_pop_all(g);
- else
- lock_graph_pop_from(g, abort);
- return ret;
-}
-
-static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
- struct printbuf *cycle)
-{
- struct btree_trans *orig_trans = g->g->trans;
- struct trans_waiting_for_lock *i;
-
- for (i = g->g; i < g->g + g->nr; i++)
- if (i->trans == trans) {
- closure_put(&trans->ref);
- return break_cycle(g, cycle, i);
- }
-
- if (g->nr == ARRAY_SIZE(g->g)) {
- closure_put(&trans->ref);
-
- if (orig_trans->lock_may_not_fail)
- return 0;
-
- lock_graph_pop_all(g);
-
- if (cycle)
- return 0;
-
- trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_);
- return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
- }
-
- __lock_graph_down(g, trans);
- return 0;
-}
-
-static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
-{
- return t1 + t2 > 1;
-}
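
The arithmetic in lock_type_conflicts() relies on the six-lock enum values (SIX_LOCK_read = 0, SIX_LOCK_intent = 1, SIX_LOCK_write = 2), which lets one comparison encode the whole conflict table:

	/* t1 + t2 > 1, enumerated:
	 *   read   (0) + read   (0) = 0  -> no conflict
	 *   read   (0) + intent (1) = 1  -> no conflict
	 *   intent (1) + intent (1) = 2  -> conflict
	 *   read   (0) + write  (2) = 2  -> conflict
	 *   intent (1) + write  (2) = 3  -> conflict
	 *   write  (2) + write  (2) = 4  -> conflict
	 *
	 * i.e. reads and intents coexist, intents exclude each other,
	 * and write excludes everything.
	 */
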
-
-int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
-{
- struct lock_graph g;
- struct trans_waiting_for_lock *top;
- struct btree_bkey_cached_common *b;
- btree_path_idx_t path_idx;
- int ret = 0;
-
- g.nr = 0;
-
- if (trans->lock_must_abort && !trans->lock_may_not_fail) {
- if (cycle)
- return -1;
-
- trace_would_deadlock(&g, trans);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
- }
-
- lock_graph_down(&g, trans);
-
- /* trans->paths is rcu protected vs. freeing */
- rcu_read_lock();
- if (cycle)
- cycle->atomic++;
-next:
- if (!g.nr)
- goto out;
-
- top = &g.g[g.nr - 1];
-
- struct btree_path *paths = rcu_dereference(top->trans->paths);
- if (!paths)
- goto up;
-
- unsigned long *paths_allocated = trans_paths_allocated(paths);
-
- trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths),
- path_idx, top->path_idx) {
- struct btree_path *path = paths + path_idx;
- if (!path->nodes_locked)
- continue;
-
- if (path_idx != top->path_idx) {
- top->path_idx = path_idx;
- top->level = 0;
- top->lock_start_time = 0;
- }
-
- for (;
- top->level < BTREE_MAX_DEPTH;
- top->level++, top->lock_start_time = 0) {
- int lock_held = btree_node_locked_type(path, top->level);
-
- if (lock_held == BTREE_NODE_UNLOCKED)
- continue;
-
- b = &READ_ONCE(path->l[top->level].b)->c;
-
- if (IS_ERR_OR_NULL(b)) {
- /*
- * If we get here, it means we raced with the
- * other thread updating its btree_path
- * structures - which means it can't be blocked
- * waiting on a lock:
- */
- if (!lock_graph_remove_non_waiters(&g, g.g)) {
- /*
- * If lock_graph_remove_non_waiters()
- * didn't do anything, it must be
- * because we're being called by debugfs
- * checking for lock cycles, which
- * invokes us on btree_transactions that
- * aren't actually waiting on anything.
- * Just bail out:
- */
- lock_graph_pop_all(&g);
- }
-
- goto next;
- }
-
- if (list_empty_careful(&b->lock.wait_list))
- continue;
-
- raw_spin_lock(&b->lock.wait_lock);
- list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
- BUG_ON(b != trans->locking);
-
- if (top->lock_start_time &&
- time_after_eq64(top->lock_start_time, trans->locking_wait.start_time))
- continue;
-
- top->lock_start_time = trans->locking_wait.start_time;
-
- /* Don't check for self deadlock: */
- if (trans == top->trans ||
- !lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
- continue;
-
- closure_get(&trans->ref);
- raw_spin_unlock(&b->lock.wait_lock);
-
- ret = lock_graph_descend(&g, trans, cycle);
- if (ret)
- goto out;
- goto next;
-
- }
- raw_spin_unlock(&b->lock.wait_lock);
- }
- }
-up:
- if (g.nr > 1 && cycle)
- print_chain(cycle, &g);
- lock_graph_up(&g);
- goto next;
-out:
- if (cycle)
- --cycle->atomic;
- rcu_read_unlock();
- return ret;
-}
-
-int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
-{
- struct btree_trans *trans = p;
-
- return bch2_check_for_deadlock(trans, NULL);
-}
-
-int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path,
- struct btree_bkey_cached_common *b,
- bool lock_may_not_fail)
-{
- int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
- int ret;
-
- /*
- * Must drop our read locks before calling six_lock_write() -
- * six_unlock() won't do wakeups until the reader count
- * goes to 0, and it's safe because we have the node intent
- * locked:
- */
- six_lock_readers_add(&b->lock, -readers);
- ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write,
- lock_may_not_fail, _RET_IP_);
- six_lock_readers_add(&b->lock, readers);
-
- if (ret)
- mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED);
-
- return ret;
-}
-
-void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_bkey_cached_common *b)
-{
- int ret = __btree_node_lock_write(trans, path, b, true);
- BUG_ON(ret);
-}
-
-/* relock */
-
-static inline bool btree_path_get_locks(struct btree_trans *trans,
- struct btree_path *path,
- bool upgrade,
- struct get_locks_fail *f)
-{
- unsigned l = path->level;
- int fail_idx = -1;
-
- do {
- if (!btree_path_node(path, l))
- break;
-
- if (!(upgrade
- ? bch2_btree_node_upgrade(trans, path, l)
- : bch2_btree_node_relock(trans, path, l))) {
- fail_idx = l;
-
- if (f) {
- f->l = l;
- f->b = path->l[l].b;
- }
- }
-
- l++;
- } while (l < path->locks_want);
-
- /*
- * When we fail to get a lock, we have to ensure that any child nodes
- * can't be relocked so bch2_btree_path_traverse has to walk back up to
- * the node that we failed to relock:
- */
- if (fail_idx >= 0) {
- __bch2_btree_path_unlock(trans, path);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
-
- do {
- path->l[fail_idx].b = upgrade
- ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
- : ERR_PTR(-BCH_ERR_no_btree_node_relock);
- --fail_idx;
- } while (fail_idx >= 0);
- }
-
- if (path->uptodate == BTREE_ITER_NEED_RELOCK)
- path->uptodate = BTREE_ITER_UPTODATE;
-
- return path->uptodate < BTREE_ITER_NEED_RELOCK;
-}
-
-bool __bch2_btree_node_relock(struct btree_trans *trans,
- struct btree_path *path, unsigned level,
- bool trace)
-{
- struct btree *b = btree_path_node(path, level);
- int want = __btree_lock_want(path, level);
-
- if (race_fault())
- goto fail;
-
- if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
- (btree_node_lock_seq_matches(path, b, level) &&
- btree_node_lock_increment(trans, &b->c, level, want))) {
- mark_btree_node_locked(trans, path, level, want);
- return true;
- }
-fail:
- if (trace && !trans->notrace_relock_fail)
- trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level);
- return false;
-}
-
-/* upgrade */
-
-bool bch2_btree_node_upgrade(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
- struct btree *b = path->l[level].b;
-
- if (!is_btree_node(path, level))
- return false;
-
- switch (btree_lock_want(path, level)) {
- case BTREE_NODE_UNLOCKED:
- BUG_ON(btree_node_locked(path, level));
- return true;
- case BTREE_NODE_READ_LOCKED:
- BUG_ON(btree_node_intent_locked(path, level));
- return bch2_btree_node_relock(trans, path, level);
- case BTREE_NODE_INTENT_LOCKED:
- break;
- case BTREE_NODE_WRITE_LOCKED:
- BUG();
- }
-
- if (btree_node_intent_locked(path, level))
- return true;
-
- if (race_fault())
- return false;
-
- if (btree_node_locked(path, level)
- ? six_lock_tryupgrade(&b->c.lock)
- : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
- goto success;
-
- if (btree_node_lock_seq_matches(path, b, level) &&
- btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) {
- btree_node_unlock(trans, path, level);
- goto success;
- }
-
- trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level);
- return false;
-success:
- mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
- return true;
-}
-
-/* Btree path locking: */
-
-/*
- * Only for btree_cache.c - only relocks intent locks
- */
-int bch2_btree_path_relock_intent(struct btree_trans *trans,
- struct btree_path *path)
-{
- unsigned l;
-
- for (l = path->level;
- l < path->locks_want && btree_path_node(path, l);
- l++) {
- if (!bch2_btree_node_relock(trans, path, l)) {
- __bch2_btree_path_unlock(trans, path);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
- trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
- }
- }
-
- return 0;
-}
-
-__flatten
-bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path)
-{
- struct get_locks_fail f;
-
- bool ret = btree_path_get_locks(trans, path, false, &f);
- bch2_trans_verify_locks(trans);
- return ret;
-}
-
-int __bch2_btree_path_relock(struct btree_trans *trans,
- struct btree_path *path, unsigned long trace_ip)
-{
- if (!bch2_btree_path_relock_norestart(trans, path)) {
- trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
- }
-
- return 0;
-}
-
-bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want,
- struct get_locks_fail *f)
-{
- EBUG_ON(path->locks_want >= new_locks_want);
-
- path->locks_want = new_locks_want;
-
- bool ret = btree_path_get_locks(trans, path, true, f);
- bch2_trans_verify_locks(trans);
- return ret;
-}
-
-bool __bch2_btree_path_upgrade(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want,
- struct get_locks_fail *f)
-{
- bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f);
- if (ret)
- goto out;
-
- /*
- * XXX: this is ugly - we'd prefer to not be mucking with other
- * iterators in the btree_trans here.
- *
- * On failure to upgrade the iterator, setting iter->locks_want and
- * calling get_locks() is sufficient to make bch2_btree_path_traverse()
- * get the locks we want on transaction restart.
- *
- * But if this iterator was a clone, on transaction restart what we did
- * to this iterator isn't going to be preserved.
- *
- * Possibly we could add an iterator field for the parent iterator when
- * an iterator is a copy - for now, we'll just upgrade any other
- * iterators with the same btree id.
- *
- * The code below used to be needed to ensure ancestor nodes get locked
- * before interior nodes - now that's handled by
- * bch2_btree_path_traverse_all().
- */
- if (!path->cached && !trans->in_traverse_all) {
- struct btree_path *linked;
- unsigned i;
-
- trans_for_each_path(trans, linked, i)
- if (linked != path &&
- linked->cached == path->cached &&
- linked->btree_id == path->btree_id &&
- linked->locks_want < new_locks_want) {
- linked->locks_want = new_locks_want;
- btree_path_get_locks(trans, linked, true, NULL);
- }
- }
-out:
- bch2_trans_verify_locks(trans);
- return ret;
-}
-
-void __bch2_btree_path_downgrade(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want)
-{
- unsigned l, old_locks_want = path->locks_want;
-
- if (trans->restarted)
- return;
-
- EBUG_ON(path->locks_want < new_locks_want);
-
- path->locks_want = new_locks_want;
-
- while (path->nodes_locked &&
- (l = btree_path_highest_level_locked(path)) >= path->locks_want) {
- if (l > path->level) {
- btree_node_unlock(trans, path, l);
- } else {
- if (btree_node_intent_locked(path, l)) {
- six_lock_downgrade(&path->l[l].b->c.lock);
- mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED);
- }
- break;
- }
- }
-
- bch2_btree_path_verify_locks(path);
-
- trace_path_downgrade(trans, _RET_IP_, path, old_locks_want);
-}
-
-/* Btree transaction locking: */
-
-void bch2_trans_downgrade(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
-
- if (trans->restarted)
- return;
-
- trans_for_each_path(trans, path, i)
- if (path->ref)
- bch2_btree_path_downgrade(trans, path);
-}
-
-static inline void __bch2_trans_unlock(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- __bch2_btree_path_unlock(trans, path);
-}
-
-static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path,
- struct get_locks_fail *f, bool trace)
-{
- if (!trace)
- goto out;
-
- if (trace_trans_restart_relock_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bpos_to_text(&buf, path->pos);
- prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq);
- if (IS_ERR_OR_NULL(f->b)) {
- prt_str(&buf, bch2_err_str(PTR_ERR(f->b)));
- } else {
- prt_printf(&buf, "%u", f->b->c.lock.seq);
-
- struct six_lock_count c =
- bch2_btree_node_lock_counts(trans, NULL, &f->b->c, f->l);
- prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
-
- c = six_lock_counts(&f->b->c.lock);
- prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
- }
-
- trace_trans_restart_relock(trans, _RET_IP_, buf.buf);
- printbuf_exit(&buf);
- }
-
- count_event(trans->c, trans_restart_relock);
-out:
- __bch2_trans_unlock(trans);
- bch2_trans_verify_locks(trans);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
-}
-
-static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
-{
- bch2_trans_verify_locks(trans);
-
- if (unlikely(trans->restarted))
- return -((int) trans->restarted);
- if (unlikely(trans->locked))
- goto out;
-
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i) {
- struct get_locks_fail f;
-
- if (path->should_be_locked &&
- !btree_path_get_locks(trans, path, false, &f))
- return bch2_trans_relock_fail(trans, path, &f, trace);
- }
-
- trans_set_locked(trans, true);
-out:
- bch2_trans_verify_locks(trans);
- return 0;
-}
-
-int bch2_trans_relock(struct btree_trans *trans)
-{
- return __bch2_trans_relock(trans, true);
-}
-
-int bch2_trans_relock_notrace(struct btree_trans *trans)
-{
- return __bch2_trans_relock(trans, false);
-}
-
-void bch2_trans_unlock_noassert(struct btree_trans *trans)
-{
- __bch2_trans_unlock(trans);
-
- trans_set_unlocked(trans);
-}
-
-void bch2_trans_unlock(struct btree_trans *trans)
-{
- __bch2_trans_unlock(trans);
-
- trans_set_unlocked(trans);
-}
-
-void bch2_trans_unlock_long(struct btree_trans *trans)
-{
- bch2_trans_unlock(trans);
- bch2_trans_srcu_unlock(trans);
-}
-
-void bch2_trans_unlock_write(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++)
- if (btree_node_write_locked(path, l))
- bch2_btree_node_unlock_write(trans, path, path->l[l].b);
-}
-
-int __bch2_trans_mutex_lock(struct btree_trans *trans,
- struct mutex *lock)
-{
- int ret = drop_locks_do(trans, (mutex_lock(lock), 0));
-
- if (ret)
- mutex_unlock(lock);
- return ret;
-}
-
-/* Debug */
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-void bch2_btree_path_verify_locks(struct btree_path *path)
-{
- /*
- * A path may be uptodate and yet have nothing locked if and only if
- * there is no node at path->level, which generally means we were
- * iterating over all nodes and got to the end of the btree
- */
- BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
- btree_path_node(path, path->level) &&
- !path->nodes_locked);
-
- if (!path->nodes_locked)
- return;
-
- for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
- int want = btree_lock_want(path, l);
- int have = btree_node_locked_type(path, l);
-
- BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED);
-
- BUG_ON(is_btree_node(path, l) &&
- (want == BTREE_NODE_UNLOCKED ||
- have != BTREE_NODE_WRITE_LOCKED) &&
- want != have);
-
- BUG_ON(btree_node_locked(path, l) &&
- path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock));
- }
-}
-
-static bool bch2_trans_locked(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- if (path->nodes_locked)
- return true;
- return false;
-}
-
-void bch2_trans_verify_locks(struct btree_trans *trans)
-{
- if (!trans->locked) {
- BUG_ON(bch2_trans_locked(trans));
- return;
- }
-
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- bch2_btree_path_verify_locks(path);
-}
-
-#endif
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
deleted file mode 100644
index b33ab7af8440..000000000000
--- a/fs/bcachefs/btree_locking.h
+++ /dev/null
@@ -1,450 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_LOCKING_H
-#define _BCACHEFS_BTREE_LOCKING_H
-
-/*
- * Only for internal btree use:
- *
- * The btree iterator tracks what locks it wants to take, and what locks it
- * currently has - here we have wrappers for locking/unlocking btree nodes and
- * updating the iterator state
- */
-
-#include "btree_iter.h"
-#include "six.h"
-
-void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp);
-
-void bch2_trans_unlock_noassert(struct btree_trans *);
-void bch2_trans_unlock_write(struct btree_trans *);
-
-static inline bool is_btree_node(struct btree_path *path, unsigned l)
-{
- return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b);
-}
-
-static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans)
-{
- return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats)
- ? &trans->c->btree_transaction_stats[trans->fn_idx]
- : NULL;
-}
-
-/* matches six lock types */
-enum btree_node_locked_type {
- BTREE_NODE_UNLOCKED = -1,
- BTREE_NODE_READ_LOCKED = SIX_LOCK_read,
- BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent,
- BTREE_NODE_WRITE_LOCKED = SIX_LOCK_write,
-};
-
-static inline int btree_node_locked_type(struct btree_path *path,
- unsigned level)
-{
- return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3);
-}
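-
-/*
- * nodes_locked packs two bits per level: 0 = unlocked, 1 = read,
- * 2 = intent, 3 = write (i.e. the six lock type + 1). For example,
- * nodes_locked == 0x24 means level 1 is read locked ((0x24 >> 2) & 3 == 1)
- * and level 2 is intent locked ((0x24 >> 4) & 3 == 2).
- */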
-
-static inline bool btree_node_write_locked(struct btree_path *path, unsigned l)
-{
- return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED;
-}
-
-static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l)
-{
- return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED;
-}
-
-static inline bool btree_node_read_locked(struct btree_path *path, unsigned l)
-{
- return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED;
-}
-
-static inline bool btree_node_locked(struct btree_path *path, unsigned level)
-{
- return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED;
-}
-
-static inline void mark_btree_node_locked_noreset(struct btree_path *path,
- unsigned level,
- enum btree_node_locked_type type)
-{
- /* relying on this to avoid a branch */
- BUILD_BUG_ON(SIX_LOCK_read != 0);
- BUILD_BUG_ON(SIX_LOCK_intent != 1);
-
- path->nodes_locked &= ~(3U << (level << 1));
- path->nodes_locked |= (type + 1) << (level << 1);
-}
-
-static inline void mark_btree_node_locked(struct btree_trans *trans,
- struct btree_path *path,
- unsigned level,
- enum btree_node_locked_type type)
-{
- mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type);
-#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
- path->l[level].lock_taken_time = local_clock();
-#endif
-}
-
-static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
-{
- return level < path->locks_want
- ? SIX_LOCK_intent
- : SIX_LOCK_read;
-}
-
-static inline enum btree_node_locked_type
-btree_lock_want(struct btree_path *path, int level)
-{
- if (level < path->level)
- return BTREE_NODE_UNLOCKED;
- if (level < path->locks_want)
- return BTREE_NODE_INTENT_LOCKED;
- if (level == path->level)
- return BTREE_NODE_READ_LOCKED;
- return BTREE_NODE_UNLOCKED;
-}
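-
-/*
- * i.e. the lock we want at each level as a function of the path: nothing
- * below path->level, intent locks on levels in [path->level, locks_want),
- * a read lock at path->level when locks_want doesn't cover it, and
- * nothing above. E.g. with path->level == 0 and locks_want == 2 we want
- * intent locks at levels 0 and 1 and no locks elsewhere.
- */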
-
-static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
-#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
- __bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
- path->l[level].lock_taken_time,
- local_clock());
-#endif
-}
-
-/* unlock: */
-
-void bch2_btree_node_unlock_write(struct btree_trans *,
- struct btree_path *, struct btree *);
-
-static inline void btree_node_unlock(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
- int lock_type = btree_node_locked_type(path, level);
-
- EBUG_ON(level >= BTREE_MAX_DEPTH);
-
- if (lock_type != BTREE_NODE_UNLOCKED) {
- if (unlikely(lock_type == BTREE_NODE_WRITE_LOCKED)) {
- bch2_btree_node_unlock_write(trans, path, path->l[level].b);
- lock_type = BTREE_NODE_INTENT_LOCKED;
- }
- six_unlock_type(&path->l[level].b->c.lock, lock_type);
- btree_trans_lock_hold_time_update(trans, path, level);
- mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
- }
-}
-
-static inline int btree_path_lowest_level_locked(struct btree_path *path)
-{
- return __ffs(path->nodes_locked) >> 1;
-}
-
-static inline int btree_path_highest_level_locked(struct btree_path *path)
-{
- return __fls(path->nodes_locked) >> 1;
-}
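-
-/*
- * __ffs()/__fls() find the lowest/highest set bit in the packed
- * nodes_locked bitfield; shifting right by one turns a bit position back
- * into a level, since each level uses two bits. With nodes_locked ==
- * 0x24 (levels 1 and 2 locked), the lowest locked level is
- * __ffs(0x24) >> 1 == 1 and the highest is __fls(0x24) >> 1 == 2.
- */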
-
-static inline void __bch2_btree_path_unlock(struct btree_trans *trans,
- struct btree_path *path)
-{
- btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK);
-
- while (path->nodes_locked)
- btree_node_unlock(trans, path, btree_path_lowest_level_locked(path));
-}
-
-/*
- * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
- * succeed:
- */
-static inline void
-__bch2_btree_node_unlock_write(struct btree_trans *trans, struct btree *b)
-{
- if (!b->c.lock.write_lock_recurse) {
- struct btree_path *linked;
- unsigned i;
-
- trans_for_each_path_with_node(trans, b, linked, i)
- linked->l[b->c.level].lock_seq++;
- }
-
- six_unlock_write(&b->c.lock);
-}
-
-static inline void
-bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
- struct btree *b)
-{
- EBUG_ON(path->l[b->c.level].b != b);
- EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock));
- EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write);
-
- mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
- __bch2_btree_node_unlock_write(trans, b);
-}
-
-int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
-
-/* lock: */
-
-static inline void trans_set_locked(struct btree_trans *trans, bool try)
-{
- if (!trans->locked) {
- lock_acquire_exclusive(&trans->dep_map, 0, try, NULL, _THIS_IP_);
- trans->locked = true;
- trans->last_unlock_ip = 0;
-
- trans->pf_memalloc_nofs = (current->flags & PF_MEMALLOC_NOFS) != 0;
- current->flags |= PF_MEMALLOC_NOFS;
- }
-}
-
-static inline void trans_set_unlocked(struct btree_trans *trans)
-{
- if (trans->locked) {
- lock_release(&trans->dep_map, _THIS_IP_);
- trans->locked = false;
- trans->last_unlock_ip = _RET_IP_;
-
- if (!trans->pf_memalloc_nofs)
- current->flags &= ~PF_MEMALLOC_NOFS;
- }
-}
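-
-/*
- * While a transaction holds btree node locks, memory reclaim must not
- * recurse into the filesystem, where it could block on those same locks:
- * hence PF_MEMALLOC_NOFS is set for the duration of the locked section,
- * and the task's previous flag state is restored on unlock.
- */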
-
-static inline int __btree_node_lock_nopath(struct btree_trans *trans,
- struct btree_bkey_cached_common *b,
- enum six_lock_type type,
- bool lock_may_not_fail,
- unsigned long ip)
-{
- trans->lock_may_not_fail = lock_may_not_fail;
- trans->lock_must_abort = false;
- trans->locking = b;
-
- int ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
- bch2_six_check_for_deadlock, trans, ip);
- WRITE_ONCE(trans->locking, NULL);
- WRITE_ONCE(trans->locking_wait.start_time, 0);
-
- if (!ret)
- trace_btree_path_lock(trans, _THIS_IP_, b);
- return ret;
-}
-
-static inline int __must_check
-btree_node_lock_nopath(struct btree_trans *trans,
- struct btree_bkey_cached_common *b,
- enum six_lock_type type,
- unsigned long ip)
-{
- return __btree_node_lock_nopath(trans, b, type, false, ip);
-}
-
-static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
- struct btree_bkey_cached_common *b,
- enum six_lock_type type)
-{
- int ret = __btree_node_lock_nopath(trans, b, type, true, _THIS_IP_);
-
- BUG_ON(ret);
-}
-
-/*
- * Lock a btree node if we already have it locked on one of our linked
- * iterators:
- */
-static inline bool btree_node_lock_increment(struct btree_trans *trans,
- struct btree_bkey_cached_common *b,
- unsigned level,
- enum btree_node_locked_type want)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- if (&path->l[level].b->c == b &&
- btree_node_locked_type(path, level) >= want) {
- six_lock_increment(&b->lock, (enum six_lock_type) want);
- return true;
- }
-
- return false;
-}
-
-static inline int btree_node_lock(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_bkey_cached_common *b,
- unsigned level,
- enum six_lock_type type,
- unsigned long ip)
-{
- int ret = 0;
-
- EBUG_ON(level >= BTREE_MAX_DEPTH);
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- if (likely(six_trylock_type(&b->lock, type)) ||
- btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||
- !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) {
-#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
- path->l[b->level].lock_taken_time = local_clock();
-#endif
- }
-
- return ret;
-}
-
-int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_path *,
- struct btree_bkey_cached_common *b, bool);
-
-static inline int __btree_node_lock_write(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_bkey_cached_common *b,
- bool lock_may_not_fail)
-{
- EBUG_ON(&path->l[b->level].b->c != b);
- EBUG_ON(path->l[b->level].lock_seq != six_lock_seq(&b->lock));
- EBUG_ON(!btree_node_intent_locked(path, b->level));
-
- /*
- * six locks are unfair, and read locks block while a thread wants a
- * write lock: thus, we need to tell the cycle detector we have a write
- * lock _before_ taking the lock:
- */
- mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_WRITE_LOCKED);
-
- return likely(six_trylock_write(&b->lock))
- ? 0
- : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail);
-}
-
-static inline int __must_check
-bch2_btree_node_lock_write(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_bkey_cached_common *b)
-{
- return __btree_node_lock_write(trans, path, b, false);
-}
-
-void bch2_btree_node_lock_write_nofail(struct btree_trans *,
- struct btree_path *,
- struct btree_bkey_cached_common *);
-
-/* relock: */
-
-bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *);
-int __bch2_btree_path_relock(struct btree_trans *,
- struct btree_path *, unsigned long);
-
-static inline int bch2_btree_path_relock(struct btree_trans *trans,
- struct btree_path *path, unsigned long trace_ip)
-{
- return btree_node_locked(path, path->level)
- ? 0
- : __bch2_btree_path_relock(trans, path, trace_ip);
-}
-
-bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace);
-
-static inline bool bch2_btree_node_relock(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
- EBUG_ON(btree_node_locked(path, level) &&
- !btree_node_write_locked(path, level) &&
- btree_node_locked_type(path, level) != __btree_lock_want(path, level));
-
- return likely(btree_node_locked(path, level)) ||
- (!IS_ERR_OR_NULL(path->l[level].b) &&
- __bch2_btree_node_relock(trans, path, level, true));
-}
-
-static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
- EBUG_ON(btree_node_locked(path, level) &&
- !btree_node_write_locked(path, level) &&
- btree_node_locked_type(path, level) != __btree_lock_want(path, level));
-
- return likely(btree_node_locked(path, level)) ||
- (!IS_ERR_OR_NULL(path->l[level].b) &&
- __bch2_btree_node_relock(trans, path, level, false));
-}
-
-/* upgrade */
-
-bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
- struct btree_path *, unsigned,
- struct get_locks_fail *);
-
-bool __bch2_btree_path_upgrade(struct btree_trans *,
- struct btree_path *, unsigned,
- struct get_locks_fail *);
-
-static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want)
-{
- struct get_locks_fail f = {};
- unsigned old_locks_want = path->locks_want;
-
- new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
-
- if (path->locks_want < new_locks_want
- ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f)
- : path->nodes_locked)
- return 0;
-
- trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
- old_locks_want, new_locks_want, &f);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
-}
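-
-/*
- * The condition above reads: if locks_want needed raising, success means
- * __bch2_btree_path_upgrade() took the additional intent locks; if it was
- * already high enough, success means we still hold our locks
- * (path->nodes_locked != 0). Anything else restarts the transaction.
- */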
-
-/* misc: */
-
-static inline void btree_path_set_should_be_locked(struct btree_trans *trans, struct btree_path *path)
-{
- EBUG_ON(!btree_node_locked(path, path->level));
- EBUG_ON(path->uptodate);
-
- path->should_be_locked = true;
- trace_btree_path_should_be_locked(trans, path);
-}
-
-static inline void __btree_path_set_level_up(struct btree_trans *trans,
- struct btree_path *path,
- unsigned l)
-{
- btree_node_unlock(trans, path, l);
- path->l[l].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
-}
-
-static inline void btree_path_set_level_up(struct btree_trans *trans,
- struct btree_path *path)
-{
- __btree_path_set_level_up(trans, path, path->level++);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
-}
-
-/* debug */
-
-struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *,
- struct btree_path *,
- struct btree_bkey_cached_common *b,
- unsigned);
-
-int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *);
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_path_verify_locks(struct btree_path *);
-void bch2_trans_verify_locks(struct btree_trans *);
-#else
-static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
-static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
-#endif
-
-#endif /* _BCACHEFS_BTREE_LOCKING_H */
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
deleted file mode 100644
index 678161321e42..000000000000
--- a/fs/bcachefs/btree_node_scan.c
+++ /dev/null
@@ -1,603 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_journal_iter.h"
-#include "btree_node_scan.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "error.h"
-#include "journal_io.h"
-#include "recovery_passes.h"
-
-#include <linux/kthread.h>
-#include <linux/min_heap.h>
-#include <linux/sort.h>
-
-struct find_btree_nodes_worker {
- struct closure *cl;
- struct find_btree_nodes *f;
- struct bch_dev *ca;
-};
-
-static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
-{
- bch2_btree_id_level_to_text(out, n->btree_id, n->level);
- prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ",
- n->seq, n->journal_seq, n->cookie);
- bch2_bpos_to_text(out, n->min_key);
- prt_str(out, "-");
- bch2_bpos_to_text(out, n->max_key);
-
- if (n->range_updated)
- prt_str(out, " range updated");
-
- for (unsigned i = 0; i < n->nr_ptrs; i++) {
- prt_char(out, ' ');
- bch2_extent_ptr_to_text(out, c, n->ptrs + i);
- }
-}
-
-static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
-{
- printbuf_indent_add(out, 2);
- darray_for_each(nodes, i) {
- found_btree_node_to_text(out, c, i);
- prt_newline(out);
- }
- printbuf_indent_sub(out, 2);
-}
-
-static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
-{
- struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
-
- set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
- bp->k.p = f->max_key;
- bp->v.seq = cpu_to_le64(f->cookie);
- bp->v.flags = 0;
- bp->v.sectors_written = cpu_to_le16(f->sectors_written);
- bp->v.min_key = f->min_key;
- SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
- memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
-}
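-
-/*
- * Reconstructs a btree_ptr_v2 key from a scanned node: the value is the
- * fixed bch_btree_ptr_v2 header followed by nr_ptrs extent pointers,
- * which is exactly the set_bkey_val_u64s() size computed above.
- */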
-
-static inline u64 bkey_journal_seq(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_inode_v3:
- return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq);
- default:
- return 0;
- }
-}
-
-static bool found_btree_node_is_readable(struct btree_trans *trans,
- struct found_btree_node *f)
-{
- struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
-
- found_btree_node_to_key(&tmp.k, f);
-
- struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false);
- bool ret = !IS_ERR_OR_NULL(b);
- if (!ret)
- return ret;
-
- f->sectors_written = b->written;
- f->journal_seq = le64_to_cpu(b->data->keys.journal_seq);
-
- struct bkey_s_c k;
- struct bkey unpacked;
- struct btree_node_iter iter;
- for_each_btree_node_key_unpack(b, k, &iter, &unpacked)
- f->journal_seq = max(f->journal_seq, bkey_journal_seq(k));
-
- six_unlock_read(&b->c.lock);
-
- /*
- * We might update this node's range; if that happens, we need the node
- * to be re-read so the read path can trim keys that are no longer in
- * this node
- */
- if (b != btree_node_root(trans->c, b))
- bch2_btree_node_evict(trans, &tmp.k);
- return ret;
-}
-
-static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
-{
- const struct found_btree_node *l = _l;
- const struct found_btree_node *r = _r;
-
- return cmp_int(l->btree_id, r->btree_id) ?:
- cmp_int(l->level, r->level) ?:
- cmp_int(l->cookie, r->cookie);
-}
-
-/*
- * Given two found btree nodes, if their sequence numbers are equal,
- * prefer the one with the higher journal_seq: an unreadable node never
- * has its journal_seq filled in, so this takes the one that's readable:
- */
-static int found_btree_node_cmp_time(const struct found_btree_node *l,
- const struct found_btree_node *r)
-{
- return cmp_int(l->seq, r->seq) ?:
- cmp_int(l->journal_seq, r->journal_seq);
-}
-
-static int found_btree_node_cmp_pos(const void *_l, const void *_r)
-{
- const struct found_btree_node *l = _l;
- const struct found_btree_node *r = _r;
-
- return cmp_int(l->btree_id, r->btree_id) ?:
- -cmp_int(l->level, r->level) ?:
- bpos_cmp(l->min_key, r->min_key) ?:
- -found_btree_node_cmp_time(l, r);
-}
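-
-/*
- * Sort order used below: ascending btree id, descending level, ascending
- * start position, with the newest version of a node (per
- * found_btree_node_cmp_time()) first, so that overwrite handling sees it
- * before any stale duplicates.
- */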
-
-static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg)
-{
- return found_btree_node_cmp_pos(l, r) < 0;
-}
-
-static inline void found_btree_node_swap(void *_l, void *_r, void *arg)
-{
- struct found_btree_node *l = _l;
- struct found_btree_node *r = _r;
-
- swap(*l, *r);
-}
-
-static const struct min_heap_callbacks found_btree_node_heap_cbs = {
- .less = found_btree_node_cmp_pos_less,
- .swp = found_btree_node_swap,
-};
-
-static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
- struct bio *bio, struct btree_node *bn, u64 offset)
-{
- struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
-
- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
- bio->bi_iter.bi_sector = offset;
- bch2_bio_map(bio, bn, PAGE_SIZE);
-
- u64 submit_time = local_clock();
- submit_bio_wait(bio);
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
-
- if (bio->bi_status) {
- bch_err_dev_ratelimited(ca,
- "IO error in try_read_btree_node() at %llu: %s",
- offset, bch2_blk_status_to_str(bio->bi_status));
- return;
- }
-
- if (le64_to_cpu(bn->magic) != bset_magic(c))
- return;
-
- if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
- if (!c->chacha20)
- return;
-
- struct nonce nonce = btree_nonce(&bn->keys, 0);
- unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
-
- bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
- }
-
- if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
- return;
-
- if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
- return;
-
- if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
- return;
-
- rcu_read_lock();
- struct found_btree_node n = {
- .btree_id = BTREE_NODE_ID(bn),
- .level = BTREE_NODE_LEVEL(bn),
- .seq = BTREE_NODE_SEQ(bn),
- .cookie = le64_to_cpu(bn->keys.seq),
- .min_key = bn->min_key,
- .max_key = bn->max_key,
- .nr_ptrs = 1,
- .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr,
- .ptrs[0].offset = offset,
- .ptrs[0].dev = ca->dev_idx,
- .ptrs[0].gen = bucket_gen_get(ca, sector_to_bucket(ca, offset)),
- };
- rcu_read_unlock();
-
- if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
- mutex_lock(&f->lock);
- if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
- bch_err(c, "try_read_btree_node() can't handle endian conversion");
- f->ret = -EINVAL;
- goto unlock;
- }
-
- if (darray_push(&f->nodes, n))
- f->ret = -ENOMEM;
-unlock:
- mutex_unlock(&f->lock);
- }
-}
-
-static int read_btree_nodes_worker(void *p)
-{
- struct find_btree_nodes_worker *w = p;
- struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
- struct bch_dev *ca = w->ca;
- void *buf = (void *) __get_free_page(GFP_KERNEL);
- struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
- unsigned long last_print = jiffies;
-
- if (!buf || !bio) {
- bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
- w->f->ret = -ENOMEM;
- goto err;
- }
-
- for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
- for (unsigned bucket_offset = 0;
- bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
- bucket_offset += btree_sectors(c)) {
- if (time_after(jiffies, last_print + HZ * 30)) {
- u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
- u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
-
- bch_info(ca, "%s: %2u%% done", __func__,
- (unsigned) div64_u64(cur_sector * 100, end_sector));
- last_print = jiffies;
- }
-
- u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
-
- if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
- !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
- continue;
-
- try_read_btree_node(w->f, ca, bio, buf, sector);
- }
-err:
- bio_put(bio);
- free_page((unsigned long) buf);
- percpu_ref_put(&ca->io_ref);
- closure_put(w->cl);
- kfree(w);
- return 0;
-}
-
-static int read_btree_nodes(struct find_btree_nodes *f)
-{
- struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
- struct closure cl;
- int ret = 0;
-
- closure_init_stack(&cl);
-
- for_each_online_member(c, ca) {
- if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
- continue;
-
- struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
- if (!w) {
- percpu_ref_put(&ca->io_ref);
- ret = -ENOMEM;
- goto err;
- }
-
- w->cl = &cl;
- w->f = f;
- w->ca = ca;
-
- struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
- ret = PTR_ERR_OR_ZERO(t);
- if (ret) {
- percpu_ref_put(&ca->io_ref);
- kfree(w);
- bch_err_msg(c, ret, "starting kthread");
- break;
- }
-
- closure_get(&cl);
- percpu_ref_get(&ca->io_ref);
- wake_up_process(t);
- }
-err:
- closure_sync(&cl);
- return f->ret ?: ret;
-}
-
-static bool nodes_overlap(const struct found_btree_node *l,
- const struct found_btree_node *r)
-{
- return (l->btree_id == r->btree_id &&
- l->level == r->level &&
- bpos_gt(l->max_key, r->min_key));
-}
-
-static int handle_overwrites(struct bch_fs *c,
- struct found_btree_node *l,
- found_btree_nodes *nodes_heap)
-{
- struct found_btree_node *r;
-
- while ((r = min_heap_peek(nodes_heap)) &&
- nodes_overlap(l, r)) {
- int cmp = found_btree_node_cmp_time(l, r);
-
- if (cmp > 0) {
- if (bpos_cmp(l->max_key, r->max_key) >= 0)
- min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
- else {
- r->range_updated = true;
- r->min_key = bpos_successor(l->max_key);
- min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
- }
- } else if (cmp < 0) {
- BUG_ON(bpos_eq(l->min_key, r->min_key));
-
- l->max_key = bpos_predecessor(r->min_key);
- l->range_updated = true;
- } else if (r->level) {
- min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
- } else {
- if (bpos_cmp(l->max_key, r->max_key) >= 0)
- min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
- else {
- r->range_updated = true;
- r->min_key = bpos_successor(l->max_key);
- min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
- }
- }
- }
-
- return 0;
-}
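-
-/*
- * A rough sketch of the resolution above: when the newer node l overlaps
- * a stale node r, r either gets popped from the heap (fully covered) or
- * survives with min_key bumped to bpos_successor(l->max_key) and
- * range_updated set; when r is the newer one, l's max_key is instead
- * trimmed back to bpos_predecessor(r->min_key).
- */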
-
-int bch2_scan_for_btree_nodes(struct bch_fs *c)
-{
- struct find_btree_nodes *f = &c->found_btree_nodes;
- struct printbuf buf = PRINTBUF;
- found_btree_nodes nodes_heap = {};
- size_t dst;
- int ret = 0;
-
- if (f->nodes.nr)
- return 0;
-
- mutex_init(&f->lock);
-
- ret = read_btree_nodes(f);
- if (ret)
- return ret;
-
- if (!f->nodes.nr) {
- bch_err(c, "%s: no btree nodes found", __func__);
- ret = -EINVAL;
- goto err;
- }
-
- if (0 && c->opts.verbose) {
- printbuf_reset(&buf);
- prt_printf(&buf, "%s: nodes found:\n", __func__);
- found_btree_nodes_to_text(&buf, c, f->nodes);
- bch2_print_string_as_lines(KERN_INFO, buf.buf);
- }
-
- sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
-
- dst = 0;
- darray_for_each(f->nodes, i) {
- struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
-
- if (prev &&
- prev->cookie == i->cookie) {
- if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
- bch_err(c, "%s: found too many replicas for btree node", __func__);
- ret = -EINVAL;
- goto err;
- }
- prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
- } else {
- f->nodes.data[dst++] = *i;
- }
- }
- f->nodes.nr = dst;
-
- sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
-
- if (0 && c->opts.verbose) {
- printbuf_reset(&buf);
- prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
- found_btree_nodes_to_text(&buf, c, f->nodes);
- bch2_print_string_as_lines(KERN_INFO, buf.buf);
- }
-
- swap(nodes_heap, f->nodes);
-
- {
- /* darray must have same layout as a heap */
- min_heap_char real_heap;
- BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr));
- BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size));
- BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr));
- BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size));
- }
-
- min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL);
-
- if (nodes_heap.nr) {
- ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
- if (ret)
- goto err;
-
- min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
- }
-
- while (true) {
- ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap);
- if (ret)
- goto err;
-
- if (!nodes_heap.nr)
- break;
-
- ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
- if (ret)
- goto err;
-
- min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
- }
-
- for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++)
- BUG_ON(nodes_overlap(n, n + 1));
-
- if (0 && c->opts.verbose) {
- printbuf_reset(&buf);
- prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
- found_btree_nodes_to_text(&buf, c, f->nodes);
- bch2_print_string_as_lines(KERN_INFO, buf.buf);
- } else {
- bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr);
- }
-
- eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
-err:
- darray_exit(&nodes_heap);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
-{
- const struct found_btree_node *l = _l;
- const struct found_btree_node *r = _r;
-
- return cmp_int(l->btree_id, r->btree_id) ?:
- -cmp_int(l->level, r->level) ?:
- bpos_cmp(l->max_key, r->min_key);
-}
-
-#define for_each_found_btree_node_in_range(_f, _search, _idx) \
- for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \
- sizeof((_f)->nodes.data[0]), \
- found_btree_node_range_start_cmp, &search); \
- _idx < (_f)->nodes.nr && \
- (_f)->nodes.data[_idx].btree_id == _search.btree_id && \
- (_f)->nodes.data[_idx].level == _search.level && \
- bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \
- _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
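-
-/*
- * f->nodes is left in eytzinger (cache friendly binary search) layout by
- * bch2_scan_for_btree_nodes(); this walks, in key order, every found node
- * whose btree/level matches and whose range intersects the search key's
- * [min_key, max_key].
- */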
-
-bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
-{
- struct find_btree_nodes *f = &c->found_btree_nodes;
-
- struct found_btree_node search = {
- .btree_id = b->c.btree_id,
- .level = b->c.level,
- .min_key = b->data->min_key,
- .max_key = b->key.k.p,
- };
-
- for_each_found_btree_node_in_range(f, search, idx)
- if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
- return true;
- return false;
-}
-
-bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
-{
- struct found_btree_node search = {
- .btree_id = btree,
- .level = 0,
- .min_key = POS_MIN,
- .max_key = SPOS_MAX,
- };
-
- for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
- return true;
- return false;
-}
-
-int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
- unsigned level, struct bpos node_min, struct bpos node_max)
-{
- if (btree_id_is_alloc(btree))
- return 0;
-
- struct find_btree_nodes *f = &c->found_btree_nodes;
-
- int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
- if (ret)
- return ret;
-
- if (c->opts.verbose) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "recovery ");
- bch2_btree_id_level_to_text(&buf, btree, level);
- prt_str(&buf, " ");
- bch2_bpos_to_text(&buf, node_min);
- prt_str(&buf, " - ");
- bch2_bpos_to_text(&buf, node_max);
-
- bch_info(c, "%s(): %s", __func__, buf.buf);
- printbuf_exit(&buf);
- }
-
- struct found_btree_node search = {
- .btree_id = btree,
- .level = level,
- .min_key = node_min,
- .max_key = node_max,
- };
-
- for_each_found_btree_node_in_range(f, search, idx) {
- struct found_btree_node n = f->nodes.data[idx];
-
- n.range_updated |= bpos_lt(n.min_key, node_min);
- n.min_key = bpos_max(n.min_key, node_min);
-
- n.range_updated |= bpos_gt(n.max_key, node_max);
- n.max_key = bpos_min(n.max_key, node_max);
-
- struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
-
- found_btree_node_to_key(&tmp.k, &n);
-
- struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
- bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
- printbuf_exit(&buf);
-
- BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k),
- (struct bkey_validate_context) {
- .from = BKEY_VALIDATE_btree_node,
- .level = level + 1,
- .btree = btree,
- }));
-
- ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
-{
- darray_exit(&f->nodes);
-}
diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h
deleted file mode 100644
index 08687b209787..000000000000
--- a/fs/bcachefs/btree_node_scan.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_NODE_SCAN_H
-#define _BCACHEFS_BTREE_NODE_SCAN_H
-
-int bch2_scan_for_btree_nodes(struct bch_fs *);
-bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *);
-bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id);
-int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos);
-void bch2_find_btree_nodes_exit(struct find_btree_nodes *);
-
-#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */
diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h
deleted file mode 100644
index 2811b6857c97..000000000000
--- a/fs/bcachefs/btree_node_scan_types.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
-#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
-
-#include "darray.h"
-
-struct found_btree_node {
- bool range_updated:1;
- u8 btree_id;
- u8 level;
- unsigned sectors_written;
- u32 seq;
- u64 journal_seq;
- u64 cookie;
-
- struct bpos min_key;
- struct bpos max_key;
-
- unsigned nr_ptrs;
- struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX];
-};
-
-typedef DARRAY(struct found_btree_node) found_btree_nodes;
-
-struct find_btree_nodes {
- int ret;
- struct mutex lock;
- found_btree_nodes nodes;
-};
-
-#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
deleted file mode 100644
index 7d7e52ddde02..000000000000
--- a/fs/bcachefs/btree_trans_commit.c
+++ /dev/null
@@ -1,1086 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "btree_gc.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_journal_iter.h"
-#include "btree_key_cache.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "disk_accounting.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-#include "replicas.h"
-#include "snapshot.h"
-
-#include <linux/prefetch.h>
-
-static const char * const trans_commit_flags_strs[] = {
-#define x(n, ...) #n,
- BCH_TRANS_COMMIT_FLAGS()
-#undef x
- NULL
-};
-
-void bch2_trans_commit_flags_to_text(struct printbuf *out, enum bch_trans_commit_flags flags)
-{
- enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
-
- prt_printf(out, "watermark=%s", bch2_watermarks[watermark]);
-
- flags >>= BCH_WATERMARK_BITS;
- if (flags) {
- prt_char(out, ' ');
- bch2_prt_bitflags(out, trans_commit_flags_strs, flags);
- }
-}
-
-static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct bch_fs *c = trans->c;
- struct bkey u;
- struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u);
-
- if (unlikely(trans->journal_replay_not_finished)) {
- struct bkey_i *j_k =
- bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p);
-
- if (j_k)
- k = bkey_i_to_s_c(j_k);
- }
-
- u = *k.k;
- u.needs_whiteout = i->old_k.needs_whiteout;
-
- BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey)));
- BUG_ON(i->old_v != k.v);
-#endif
-}
-
-static inline struct btree_path_level *insert_l(struct btree_trans *trans, struct btree_insert_entry *i)
-{
- return (trans->paths + i->path)->l + i->level;
-}
-
-static inline bool same_leaf_as_prev(struct btree_trans *trans,
- struct btree_insert_entry *i)
-{
- return i != trans->updates &&
- insert_l(trans, &i[0])->b == insert_l(trans, &i[-1])->b;
-}
-
-static inline bool same_leaf_as_next(struct btree_trans *trans,
- struct btree_insert_entry *i)
-{
- return i + 1 < trans->updates + trans->nr_updates &&
- insert_l(trans, &i[0])->b == insert_l(trans, &i[1])->b;
-}
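-
-/*
- * Updates are kept sorted, so updates going to the same leaf node are
- * adjacent: same_leaf_as_prev()/same_leaf_as_next() let us write lock
- * each node once and accumulate the u64s being inserted per leaf when
- * checking whether the inserts fit.
- */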
-
-inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
-{
- struct bch_fs *c = trans->c;
-
- if (unlikely(btree_node_just_written(b)) &&
- bch2_btree_post_write_cleanup(c, b))
- bch2_trans_node_reinit_iter(trans, b);
-
- /*
- * If the last bset has been written, or if it's gotten too big - start
- * a new bset to insert into:
- */
- if (want_new_bset(c, b))
- bch2_btree_init_next(trans, b);
-}
-
-static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
-{
- while (--i >= trans->updates) {
- if (same_leaf_as_prev(trans, i))
- continue;
-
- bch2_btree_node_unlock_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
- }
-
- trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
-}
-
-static inline int bch2_trans_lock_write(struct btree_trans *trans)
-{
- EBUG_ON(trans->write_locked);
-
- trans_for_each_update(trans, i) {
- if (same_leaf_as_prev(trans, i))
- continue;
-
- if (bch2_btree_node_lock_write(trans, trans->paths + i->path, &insert_l(trans, i)->b->c))
- return trans_lock_write_fail(trans, i);
-
- if (!i->cached)
- bch2_btree_node_prep_for_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
- }
-
- trans->write_locked = true;
- return 0;
-}
-
-static inline void bch2_trans_unlock_updates_write(struct btree_trans *trans)
-{
- if (likely(trans->write_locked)) {
- trans_for_each_update(trans, i)
- if (btree_node_locked_type(trans->paths + i->path, i->level) ==
- BTREE_NODE_WRITE_LOCKED)
- bch2_btree_node_unlock_write_inlined(trans,
- trans->paths + i->path, insert_l(trans, i)->b);
- trans->write_locked = false;
- }
-}
-
-/* Inserting into a given leaf node (last stage of insert): */
-
-/* Handle overwrites and do insert, for non extents: */
-bool bch2_btree_bset_insert_key(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct btree_node_iter *node_iter,
- struct bkey_i *insert)
-{
- struct bkey_packed *k;
- unsigned clobber_u64s = 0, new_u64s = 0;
-
- EBUG_ON(btree_node_just_written(b));
- EBUG_ON(bset_written(b, btree_bset_last(b)));
- EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
- EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
- EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
- EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b));
- EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
- kmsan_check_memory(insert, bkey_bytes(&insert->k));
-
- k = bch2_btree_node_iter_peek_all(node_iter, b);
- if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
- k = NULL;
-
- /* @k is the key being overwritten/deleted, if any: */
- EBUG_ON(k && bkey_deleted(k));
-
- /* Deleting, but not found? nothing to do: */
- if (bkey_deleted(&insert->k) && !k)
- return false;
-
- if (bkey_deleted(&insert->k)) {
- /* Deleting: */
- btree_account_key_drop(b, k);
- k->type = KEY_TYPE_deleted;
-
- if (k->needs_whiteout)
- push_whiteout(b, insert->k.p);
- k->needs_whiteout = false;
-
- if (k >= btree_bset_last(b)->start) {
- clobber_u64s = k->u64s;
- bch2_bset_delete(b, k, clobber_u64s);
- goto fix_iter;
- } else {
- bch2_btree_path_fix_key_modified(trans, b, k);
- }
-
- return true;
- }
-
- if (k) {
- /* Overwriting: */
- btree_account_key_drop(b, k);
- k->type = KEY_TYPE_deleted;
-
- insert->k.needs_whiteout = k->needs_whiteout;
- k->needs_whiteout = false;
-
- if (k >= btree_bset_last(b)->start) {
- clobber_u64s = k->u64s;
- goto overwrite;
- } else {
- bch2_btree_path_fix_key_modified(trans, b, k);
- }
- }
-
- k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
-overwrite:
- bch2_bset_insert(b, k, insert, clobber_u64s);
- new_u64s = k->u64s;
-fix_iter:
- if (clobber_u64s != new_u64s)
- bch2_btree_node_iter_fix(trans, path, b, node_iter, k,
- clobber_u64s, new_u64s);
- return true;
-}
-
-static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
- unsigned i, u64 seq)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct btree_write *w = container_of(pin, struct btree_write, journal);
- struct btree *b = container_of(w, struct btree, writes[i]);
- struct btree_trans *trans = bch2_trans_get(c);
- unsigned long old, new;
- unsigned idx = w - b->writes;
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
-
- old = READ_ONCE(b->flags);
- do {
- new = old;
-
- if (!(old & (1 << BTREE_NODE_dirty)) ||
- !!(old & (1 << BTREE_NODE_write_idx)) != idx ||
- w->journal.seq != seq)
- break;
-
- new &= ~BTREE_WRITE_TYPE_MASK;
- new |= BTREE_WRITE_journal_reclaim;
- new |= 1 << BTREE_NODE_need_write;
- } while (!try_cmpxchg(&b->flags, &old, new));
-
- btree_node_write_if_need(trans, b, SIX_LOCK_read);
- six_unlock_read(&b->c.lock);
-
- bch2_trans_put(trans);
- return 0;
-}
-
-int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
-{
- return __btree_node_flush(j, pin, 0, seq);
-}
-
-int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
-{
- return __btree_node_flush(j, pin, 1, seq);
-}
-
-inline void bch2_btree_add_journal_pin(struct bch_fs *c,
- struct btree *b, u64 seq)
-{
- struct btree_write *w = btree_current_write(b);
-
- bch2_journal_pin_add(&c->journal, seq, &w->journal,
- btree_node_write_idx(b) == 0
- ? bch2_btree_node_flush0
- : bch2_btree_node_flush1);
-}
-
-/**
- * bch2_btree_insert_key_leaf() - insert a key into a leaf node
- * @trans: btree transaction object
- * @path: path pointing to @insert's pos
- * @insert: key to insert
- * @journal_seq: sequence number of journal reservation
- */
-inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
- struct btree_path *path,
- struct bkey_i *insert,
- u64 journal_seq)
-{
- struct bch_fs *c = trans->c;
- struct btree *b = path_l(path)->b;
- struct bset_tree *t = bset_tree_last(b);
- struct bset *i = bset(b, t);
- int old_u64s = bset_u64s(t);
- int old_live_u64s = b->nr.live_u64s;
- int live_u64s_added, u64s_added;
-
- if (unlikely(!bch2_btree_bset_insert_key(trans, path, b,
- &path_l(path)->iter, insert)))
- return;
-
- i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq)));
-
- bch2_btree_add_journal_pin(c, b, journal_seq);
-
- if (unlikely(!btree_node_dirty(b))) {
- EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
- set_btree_node_dirty_acct(c, b);
- }
-
- live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
- u64s_added = (int) bset_u64s(t) - old_u64s;
-
- if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
- b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
- if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
- b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
-
- if (u64s_added > live_u64s_added &&
- bch2_maybe_compact_whiteouts(c, b))
- bch2_trans_node_reinit_iter(trans, b);
-}
-
-/* Cached btree updates: */
-
-/* Normal update interface: */
-
-static inline void btree_insert_entry_checks(struct btree_trans *trans,
- struct btree_insert_entry *i)
-{
- struct btree_path *path = trans->paths + i->path;
-
- BUG_ON(!bpos_eq(i->k->k.p, path->pos));
- BUG_ON(i->cached != path->cached);
- BUG_ON(i->level != path->level);
- BUG_ON(i->btree_id != path->btree_id);
- BUG_ON(i->bkey_type != __btree_node_type(path->level, path->btree_id));
- EBUG_ON(!i->level &&
- btree_type_has_snapshots(i->btree_id) &&
- !(i->flags & BTREE_UPDATE_internal_snapshot_node) &&
- test_bit(JOURNAL_replay_done, &trans->c->journal.flags) &&
- i->k->k.p.snapshot &&
- bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0);
-}
-
-static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
- unsigned flags)
-{
- return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
- trans->journal_u64s, flags, trans);
-}
-
-#define JSET_ENTRY_LOG_U64s 4
-
-static noinline void journal_transaction_name(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct journal *j = &c->journal;
- struct jset_entry *entry =
- bch2_journal_add_entry(j, &trans->journal_res,
- BCH_JSET_ENTRY_log, 0, 0,
- JSET_ENTRY_LOG_U64s);
- struct jset_entry_log *l =
- container_of(entry, struct jset_entry_log, entry);
-
- strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64));
-}
-
-static inline int btree_key_can_insert(struct btree_trans *trans,
- struct btree *b, unsigned u64s)
-{
- if (!bch2_btree_node_insert_fits(b, u64s))
- return -BCH_ERR_btree_insert_btree_node_full;
-
- return 0;
-}
-
-noinline static int
-btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
- struct btree_path *path, unsigned new_u64s)
-{
- struct bkey_cached *ck = (void *) path->l[0].b;
- struct bkey_i *new_k;
- int ret;
-
- bch2_trans_unlock_updates_write(trans);
- bch2_trans_unlock(trans);
-
- new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
- if (!new_k) {
- bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
- bch2_btree_id_str(path->btree_id), new_u64s);
- return -BCH_ERR_ENOMEM_btree_key_cache_insert;
- }
-
- ret = bch2_trans_relock(trans) ?:
- bch2_trans_lock_write(trans);
- if (unlikely(ret)) {
- kfree(new_k);
- return ret;
- }
-
- memcpy(new_k, ck->k, ck->u64s * sizeof(u64));
-
- trans_for_each_update(trans, i)
- if (i->old_v == &ck->k->v)
- i->old_v = &new_k->v;
-
- kfree(ck->k);
- ck->u64s = new_u64s;
- ck->k = new_k;
- return 0;
-}
-
-static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
- struct btree_path *path, unsigned u64s)
-{
- struct bch_fs *c = trans->c;
- struct bkey_cached *ck = (void *) path->l[0].b;
- unsigned new_u64s;
- struct bkey_i *new_k;
- unsigned watermark = flags & BCH_WATERMARK_MASK;
-
- EBUG_ON(path->level);
-
- if (watermark < BCH_WATERMARK_reclaim &&
- !test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
- bch2_btree_key_cache_must_wait(c))
- return -BCH_ERR_btree_insert_need_journal_reclaim;
-
- /*
- * bch2_varint_decode can read past the end of the buffer by at most 7
- * bytes (it won't be used):
- */
- u64s += 1;
-
- if (u64s <= ck->u64s)
- return 0;
-
- new_u64s = roundup_pow_of_two(u64s);
- new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
- if (unlikely(!new_k))
- return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
-
- trans_for_each_update(trans, i)
- if (i->old_v == &ck->k->v)
- i->old_v = &new_k->v;
-
- ck->u64s = new_u64s;
- ck->k = new_k;
- return 0;
-}
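-
-/*
- * The fast path grows the key cache buffer with a nonblocking
- * roundup_pow_of_two() krealloc(), since we may be holding write locks;
- * if that fails, the slowpath above drops all locks, allocates with
- * GFP_KERNEL, and relocks before swapping the new buffer in.
- */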
-
-/* Triggers: */
-
-static int run_one_mem_trigger(struct btree_trans *trans,
- struct btree_insert_entry *i,
- unsigned flags)
-{
- verify_update_old_key(trans, i);
-
- if (unlikely(flags & BTREE_TRIGGER_norun))
- return 0;
-
- struct bkey_s_c old = { &i->old_k, i->old_v };
- struct bkey_i *new = i->k;
- const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
- const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
-
- if (old_ops->trigger == new_ops->trigger)
- return bch2_key_trigger(trans, i->btree_id, i->level,
- old, bkey_i_to_s(new),
- BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags);
- else
- return bch2_key_trigger_new(trans, i->btree_id, i->level,
- bkey_i_to_s(new), flags) ?:
- bch2_key_trigger_old(trans, i->btree_id, i->level,
- old, flags);
-}
-
-static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i)
-{
- verify_update_old_key(trans, i);
-
- if ((i->flags & BTREE_TRIGGER_norun) ||
- !btree_node_type_has_trans_triggers(i->bkey_type))
- return 0;
-
- /*
- * Transactional triggers create new btree_insert_entries, so we can't
- * pass them a pointer to a btree_insert_entry; that memory is going
- * to move:
- */
- struct bkey old_k = i->old_k;
- struct bkey_s_c old = { &old_k, i->old_v };
- const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
- const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
- unsigned flags = i->flags|BTREE_TRIGGER_transactional;
-
- if (!i->insert_trigger_run &&
- !i->overwrite_trigger_run &&
- old_ops->trigger == new_ops->trigger) {
- i->overwrite_trigger_run = true;
- i->insert_trigger_run = true;
- return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k),
- BTREE_TRIGGER_insert|
- BTREE_TRIGGER_overwrite|flags) ?: 1;
- } else if (!i->overwrite_trigger_run) {
- i->overwrite_trigger_run = true;
- return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1;
- } else if (!i->insert_trigger_run) {
- i->insert_trigger_run = true;
- return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1;
- } else {
- return 0;
- }
-}
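-
-/*
- * Return convention: > 0 if a trigger actually ran (the caller must loop
- * again, since running triggers can queue up more updates), 0 if there
- * was nothing left to run, < 0 on error.
- */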
-
-static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
-{
- unsigned sort_id_start = 0;
-
- while (sort_id_start < trans->nr_updates) {
- unsigned i, sort_id = trans->updates[sort_id_start].sort_order;
- bool trans_trigger_run;
-
- /*
- * For a given btree, this algorithm runs insert triggers before
- * overwrite triggers: this is so that when extents are being
- * moved (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop
- * references before they are re-added.
- *
- * Running triggers will append more updates to the list of
- * updates as we're walking it:
- */
- do {
- trans_trigger_run = false;
-
- for (i = sort_id_start;
- i < trans->nr_updates && trans->updates[i].sort_order <= sort_id;
- i++) {
- if (trans->updates[i].sort_order < sort_id) {
- sort_id_start = i;
- continue;
- }
-
- int ret = run_one_trans_trigger(trans, trans->updates + i);
- if (ret < 0)
- return ret;
- if (ret)
- trans_trigger_run = true;
- }
- } while (trans_trigger_run);
-
- sort_id_start = i;
- }
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- trans_for_each_update(trans, i)
- BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
- btree_node_type_has_trans_triggers(i->bkey_type) &&
- (!i->insert_trigger_run || !i->overwrite_trigger_run));
-#endif
- return 0;
-}
-
-static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
-{
- trans_for_each_update(trans, i)
- if (btree_node_type_has_triggers(i->bkey_type) &&
- gc_visited(trans->c, gc_pos_btree(i->btree_id, i->level, i->k->k.p))) {
- int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static inline int
-bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
- struct btree_insert_entry **stopped_at,
- unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- struct btree_trans_commit_hook *h;
- unsigned u64s = 0;
- int ret = 0;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- if (race_fault()) {
- trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
- }
-
- /*
- * Check that the insert will fit in the leaf node while the write lock
- * is held; otherwise, another thread could write the node and change
- * the amount of space available:
- */
-
- prefetch(&trans->c->journal.flags);
-
- trans_for_each_update(trans, i) {
- /* Multiple inserts might go to same leaf: */
- if (!same_leaf_as_prev(trans, i))
- u64s = 0;
-
- u64s += i->k->k.u64s;
- ret = !i->cached
- ? btree_key_can_insert(trans, insert_l(trans, i)->b, u64s)
- : btree_key_can_insert_cached(trans, flags, trans->paths + i->path, u64s);
- if (ret) {
- *stopped_at = i;
- return ret;
- }
-
- i->k->k.needs_whiteout = false;
- }
-
- /*
- * Don't get a journal reservation until after we know the insert will
- * succeed:
- */
- if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
- ret = bch2_trans_journal_res_get(trans,
- (flags & BCH_WATERMARK_MASK)|
- JOURNAL_RES_GET_NONBLOCK);
- if (ret)
- return ret;
-
- if (unlikely(trans->journal_transaction_names))
- journal_transaction_name(trans);
- }
-
- /*
- * Not allowed to fail after we've gotten our journal reservation - we
- * have to use it:
- */
-
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- !(flags & BCH_TRANS_COMMIT_no_journal_res)) {
- if (bch2_journal_seq_verify)
- trans_for_each_update(trans, i)
- i->k->k.bversion.lo = trans->journal_res.seq;
- else if (bch2_inject_invalid_keys)
- trans_for_each_update(trans, i)
- i->k->k.bversion = MAX_VERSION;
- }
-
- h = trans->hooks;
- while (h) {
- ret = h->fn(trans, h);
- if (ret)
- return ret;
- h = h->next;
- }
-
- struct jset_entry *entry = trans->journal_entries;
-
- percpu_down_read(&c->mark_lock);
- for (entry = trans->journal_entries;
- entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
- entry = vstruct_next(entry))
- if (entry->type == BCH_JSET_ENTRY_write_buffer_keys &&
- entry->start->k.type == KEY_TYPE_accounting) {
- ret = bch2_accounting_trans_commit_hook(trans, bkey_i_to_accounting(entry->start), flags);
- if (ret)
- goto revert_fs_usage;
- }
- percpu_up_read(&c->mark_lock);
-
- /* XXX: we only want to run this if deltas are nonzero */
- bch2_trans_account_disk_usage_change(trans);
-
- trans_for_each_update(trans, i)
- if (btree_node_type_has_atomic_triggers(i->bkey_type)) {
- ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags);
- if (ret)
- goto fatal_err;
- }
-
- if (unlikely(c->gc_pos.phase)) {
- ret = bch2_trans_commit_run_gc_triggers(trans);
- if (ret)
- goto fatal_err;
- }
-
- struct bkey_validate_context validate_context = { .from = BKEY_VALIDATE_commit };
-
- if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
- validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit;
-
- for (struct jset_entry *i = trans->journal_entries;
- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
- i = vstruct_next(i)) {
- ret = bch2_journal_entry_validate(c, NULL, i,
- bcachefs_metadata_version_current,
- CPU_BIG_ENDIAN, validate_context);
- if (unlikely(ret)) {
- bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n",
- trans->fn);
- goto fatal_err;
- }
- }
-
- trans_for_each_update(trans, i) {
- validate_context.level = i->level;
- validate_context.btree = i->btree_id;
-
- ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), validate_context);
- if (unlikely(ret)){
- bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n",
- trans->fn, (void *) i->ip_allocated);
- goto fatal_err;
- }
- btree_insert_entry_checks(trans, i);
- }
-
- if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
- struct journal *j = &c->journal;
- struct jset_entry *entry;
-
- trans_for_each_update(trans, i) {
- if (i->key_cache_already_flushed)
- continue;
-
- if (i->flags & BTREE_UPDATE_nojournal)
- continue;
-
- verify_update_old_key(trans, i);
-
- if (trans->journal_transaction_names) {
- entry = bch2_journal_add_entry(j, &trans->journal_res,
- BCH_JSET_ENTRY_overwrite,
- i->btree_id, i->level,
- i->old_k.u64s);
- bkey_reassemble((struct bkey_i *) entry->start,
- (struct bkey_s_c) { &i->old_k, i->old_v });
- }
-
- entry = bch2_journal_add_entry(j, &trans->journal_res,
- BCH_JSET_ENTRY_btree_keys,
- i->btree_id, i->level,
- i->k->k.u64s);
- bkey_copy((struct bkey_i *) entry->start, i->k);
- }
-
- memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
- trans->journal_entries,
- trans->journal_entries_u64s);
-
- trans->journal_res.offset += trans->journal_entries_u64s;
- trans->journal_res.u64s -= trans->journal_entries_u64s;
-
- if (trans->journal_seq)
- *trans->journal_seq = trans->journal_res.seq;
- }
-
- trans_for_each_update(trans, i) {
- struct btree_path *path = trans->paths + i->path;
-
- if (!i->cached)
- bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq);
- else if (!i->key_cache_already_flushed)
- bch2_btree_insert_key_cached(trans, flags, i);
- else
- bch2_btree_key_cache_drop(trans, path);
- }
-
- return 0;
-fatal_err:
- bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret));
- percpu_down_read(&c->mark_lock);
-revert_fs_usage:
- for (struct jset_entry *entry2 = trans->journal_entries;
- entry2 != entry;
- entry2 = vstruct_next(entry2))
- if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys &&
- entry2->start->k.type == KEY_TYPE_accounting)
- bch2_accounting_trans_commit_revert(trans,
- bkey_i_to_accounting(entry2->start), flags);
- percpu_up_read(&c->mark_lock);
- return ret;
-}
-
-static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
-{
- /*
- * Accounting keys aren't deduped in the journal: we have to compare
- * each individual update against what's in the btree to see if it has
- * been applied yet; and accounting updates also don't overwrite:
- * they're deltas that accumulate.
- */
- trans_for_each_update(trans, i)
- if (i->k->k.type != KEY_TYPE_accounting)
- bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
-}
-
-static int bch2_trans_commit_journal_pin_flush(struct journal *j,
- struct journal_entry_pin *_pin, u64 seq)
-{
- return 0;
-}
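-
-/*
- * Editor's note: the flush callback above is deliberately a no-op. The pin
- * taken in do_bch2_trans_commit() below only needs to hold the journal
- * sequence number on behalf of the caller; there is no state to write back
- * when the pin is flushed.
- */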
-
-/*
- * Get journal reservation, take write locks, and attempt to do btree update(s):
- */
-static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags,
- struct btree_insert_entry **stopped_at,
- unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- int ret = 0, u64s_delta = 0;
-
- for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
- struct btree_insert_entry *i = trans->updates + idx;
- if (i->cached)
- continue;
-
- u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
- u64s_delta -= i->old_btree_u64s;
-
- if (!same_leaf_as_next(trans, i)) {
- if (u64s_delta <= 0) {
- ret = bch2_foreground_maybe_merge(trans, i->path,
- i->level, flags);
- if (unlikely(ret))
- return ret;
- }
-
- u64s_delta = 0;
- }
- }
-
- ret = bch2_trans_lock_write(trans);
- if (unlikely(ret))
- return ret;
-
- ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip);
-
- if (!ret && unlikely(trans->journal_replay_not_finished))
- bch2_drop_overwrites_from_journal(trans);
-
- bch2_trans_unlock_updates_write(trans);
-
- if (!ret && trans->journal_pin)
- bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
- trans->journal_pin,
- bch2_trans_commit_journal_pin_flush);
-
- /*
- * Drop journal reservation after dropping write locks, since dropping
- * the journal reservation may kick off a journal write:
- */
- if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
- bch2_journal_res_put(&c->journal, &trans->journal_res);
-
- return ret;
-}
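-
-/*
- * Editor's note: the pre-pass at the top of the function above
- * opportunistically merges a leaf with its neighbours whenever the pending
- * updates shrink it (u64s_delta <= 0), before write locks are taken for the
- * commit proper.
- */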
-
-static int journal_reclaim_wait_done(struct bch_fs *c)
-{
- int ret = bch2_journal_error(&c->journal) ?:
- bch2_btree_key_cache_wait_done(c);
-
- if (!ret)
- journal_reclaim_kick(&c->journal);
- return ret;
-}
-
-static noinline
-int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
- struct btree_insert_entry *i,
- int ret, unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
-
- if (bch2_err_matches(ret, BCH_ERR_journal_res_blocked)) {
- /*
- * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
- * flag
- */
- if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- watermark < BCH_WATERMARK_reclaim) {
- ret = -BCH_ERR_journal_reclaim_would_deadlock;
- goto out;
- }
-
- ret = drop_locks_do(trans,
- bch2_trans_journal_res_get(trans,
- (flags & BCH_WATERMARK_MASK)|
- JOURNAL_RES_GET_CHECK));
- goto out;
- }
-
- switch (ret) {
- case -BCH_ERR_btree_insert_btree_node_full:
- ret = bch2_btree_split_leaf(trans, i->path, flags);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- trace_and_count(c, trans_restart_btree_node_split, trans,
- trace_ip, trans->paths + i->path);
- break;
- case -BCH_ERR_btree_insert_need_mark_replicas:
- ret = drop_locks_do(trans,
- bch2_accounting_update_sb(trans));
- break;
- case -BCH_ERR_btree_insert_need_journal_reclaim:
- bch2_trans_unlock(trans);
-
- trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);
- track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], true);
-
- wait_event_freezable(c->journal.reclaim_wait,
- (ret = journal_reclaim_wait_done(c)));
-
- track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], false);
-
- if (ret < 0)
- break;
-
- ret = bch2_trans_relock(trans);
- break;
- default:
- BUG_ON(ret >= 0);
- break;
- }
-out:
- BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
-
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
- (flags & BCH_TRANS_COMMIT_no_enospc), c,
- "%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
-
- return ret;
-}
-
-/*
- * This is for updates done in the early part of fsck - btree_gc - before we've
- * gone RW; we only add the new key to the list of keys for journal replay to
- * do.
- */
-static noinline int
-do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
-
- BUG_ON(current != c->recovery_task);
-
- trans_for_each_update(trans, i) {
- int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
- if (ret)
- return ret;
- }
-
- for (struct jset_entry *i = trans->journal_entries;
- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
- i = vstruct_next(i))
- if (i->type == BCH_JSET_ENTRY_btree_keys ||
- i->type == BCH_JSET_ENTRY_write_buffer_keys) {
- int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->start);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
-{
- struct btree_insert_entry *errored_at = NULL;
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- ret = trans_maybe_inject_restart(trans, _RET_IP_);
- if (unlikely(ret))
- goto out_reset;
-
- if (!trans->nr_updates &&
- !trans->journal_entries_u64s)
- goto out_reset;
-
- ret = bch2_trans_commit_run_triggers(trans);
- if (ret)
- goto out_reset;
-
- if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
- unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
- if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags)))
- ret = do_bch2_trans_commit_to_journal_replay(trans);
- else
- ret = -BCH_ERR_erofs_trans_commit;
- goto out_reset;
- }
-
- EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
-
- trans->journal_u64s = trans->journal_entries_u64s;
- trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
- if (trans->journal_transaction_names)
- trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
-
- trans_for_each_update(trans, i) {
- struct btree_path *path = trans->paths + i->path;
-
- EBUG_ON(!path->should_be_locked);
-
- ret = bch2_btree_path_upgrade(trans, path, i->level + 1);
- if (unlikely(ret))
- goto out;
-
- EBUG_ON(!btree_node_intent_locked(path, i->level));
-
- if (i->key_cache_already_flushed)
- continue;
-
- if (i->flags & BTREE_UPDATE_nojournal)
- continue;
-
- /* we're going to journal the key being updated: */
- trans->journal_u64s += jset_u64s(i->k->k.u64s);
-
- /* and we're also going to log the overwrite: */
- if (trans->journal_transaction_names)
- trans->journal_u64s += jset_u64s(i->old_k.u64s);
- }
-
- if (trans->extra_disk_res) {
- ret = bch2_disk_reservation_add(c, trans->disk_res,
- trans->extra_disk_res,
- (flags & BCH_TRANS_COMMIT_no_enospc)
- ? BCH_DISK_RESERVATION_NOFAIL : 0);
- if (ret)
- goto err;
- }
-retry:
- errored_at = NULL;
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
- memset(&trans->journal_res, 0, sizeof(trans->journal_res));
- memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
-
- ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_);
-
- /* make sure we didn't drop or screw up locks: */
- bch2_trans_verify_locks(trans);
-
- if (ret)
- goto err;
-
- trace_and_count(c, transaction_commit, trans, _RET_IP_);
-out:
- if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw)))
- bch2_write_ref_put(c, BCH_WRITE_REF_trans);
-out_reset:
- if (!ret)
- bch2_trans_downgrade(trans);
- bch2_trans_reset_updates(trans);
-
- return ret;
-err:
- ret = bch2_trans_commit_error(trans, flags, errored_at, ret, _RET_IP_);
- if (ret)
- goto out;
-
- /*
- * We might have done another transaction commit in the error path -
- * i.e. btree write buffer flush - which will have made use of
- * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is
- * how the journal sequence number to pin is passed in - so we must
- * restart:
- */
- if (flags & BCH_TRANS_COMMIT_no_journal_res) {
- ret = -BCH_ERR_transaction_restart_nested;
- goto out;
- }
-
- goto retry;
-}
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
deleted file mode 100644
index 77578da2d23f..000000000000
--- a/fs/bcachefs/btree_types.h
+++ /dev/null
@@ -1,895 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_TYPES_H
-#define _BCACHEFS_BTREE_TYPES_H
-
-#include <linux/list.h>
-#include <linux/rhashtable.h>
-
-#include "bbpos_types.h"
-#include "btree_key_cache_types.h"
-#include "buckets_types.h"
-#include "darray.h"
-#include "errcode.h"
-#include "journal_types.h"
-#include "replicas_types.h"
-#include "six.h"
-
-struct open_bucket;
-struct btree_update;
-struct btree_trans;
-
-#define MAX_BSETS 3U
-
-struct btree_nr_keys {
-
- /*
- * Amount of live metadata (i.e. size of node after a compaction) in
- * units of u64s
- */
- u16 live_u64s;
- u16 bset_u64s[MAX_BSETS];
-
- /* live keys only: */
- u16 packed_keys;
- u16 unpacked_keys;
-};
-
-struct bset_tree {
- /*
- * We construct a binary tree in an array as if the array
- * started at 1, so that things line up on the same cachelines
- * better: see comments in bset.c at cacheline_to_bkey() for
- * details
- */
-
- /* size of the binary tree and prev array */
- u16 size;
-
- /* function of size - precalculated for to_inorder() */
- u16 extra;
-
- u16 data_offset;
- u16 aux_data_offset;
- u16 end_offset;
-};
-
-struct btree_write {
- struct journal_entry_pin journal;
-};
-
-struct btree_alloc {
- struct open_buckets ob;
- __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX);
-};
-
-struct btree_bkey_cached_common {
- struct six_lock lock;
- u8 level;
- u8 btree_id;
- bool cached;
-};
-
-struct btree {
- struct btree_bkey_cached_common c;
-
- struct rhash_head hash;
- u64 hash_val;
-
- unsigned long flags;
- u16 written;
- u8 nsets;
- u8 nr_key_bits;
- u16 version_ondisk;
-
- struct bkey_format format;
-
- struct btree_node *data;
- void *aux_data;
-
- /*
- * Sets of sorted keys - the real btree node - plus a binary search tree
- *
- * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
- * to the memory we have allocated for this btree node. Additionally,
- * set[0]->data points to the entire btree node as it exists on disk.
- */
- struct bset_tree set[MAX_BSETS];
-
- struct btree_nr_keys nr;
- u16 sib_u64s[2];
- u16 whiteout_u64s;
- u8 byte_order;
- u8 unpack_fn_len;
-
- struct btree_write writes[2];
-
- /* Key/pointer for this btree node */
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
-
- /*
- * XXX: add a delete sequence number, so when bch2_btree_node_relock()
- * fails because the lock sequence number has changed - i.e. the
- * contents were modified - we can still relock the node if it's still
- * the one we want, without redoing the traversal
- */
-
- /*
- * For asynchronous splits/interior node updates:
- * When we do a split, we allocate new child nodes and update the parent
- * node to point to them: we update the parent in memory immediately,
- * but then we must wait until the children have been written out before
- * the update to the parent can be written - this is a list of the
- * btree_updates that are blocking this node from being
- * written:
- */
- struct list_head write_blocked;
-
- /*
- * Also for asynchronous splits/interior node updates:
- * If a btree node isn't reachable yet, we don't want to kick off
- * another write - because that write also won't yet be reachable and
- * marking it as completed before it's reachable would be incorrect:
- */
- unsigned long will_make_reachable;
-
- struct open_buckets ob;
-
- /* lru list */
- struct list_head list;
-};
-
-#define BCH_BTREE_CACHE_NOT_FREED_REASONS() \
- x(lock_intent) \
- x(lock_write) \
- x(dirty) \
- x(read_in_flight) \
- x(write_in_flight) \
- x(noevict) \
- x(write_blocked) \
- x(will_make_reachable) \
- x(access_bit)
-
-enum bch_btree_cache_not_freed_reasons {
-#define x(n) BCH_BTREE_CACHE_NOT_FREED_##n,
- BCH_BTREE_CACHE_NOT_FREED_REASONS()
-#undef x
- BCH_BTREE_CACHE_NOT_FREED_REASONS_NR,
-};
-
-struct btree_cache_list {
- unsigned idx;
- struct shrinker *shrink;
- struct list_head list;
- size_t nr;
-};
-
-struct btree_cache {
- struct rhashtable table;
- bool table_init_done;
- /*
- * We never free a struct btree, except on shutdown - we just put it on
- * the btree_cache_freed list and reuse it later. This simplifies the
- * code, and it doesn't cost us much memory as the memory usage is
- * dominated by buffers that hold the actual btree node data and those
- * can be freed - and the number of struct btrees allocated is
- * effectively bounded.
- *
- * btree_cache_freeable effectively is a small cache - we use it because
- * high order page allocations can be rather expensive, and it's quite
- * common to delete and allocate btree nodes in quick succession. It
- * should never grow past ~2-3 nodes in practice.
- */
- struct mutex lock;
- struct list_head freeable;
- struct list_head freed_pcpu;
- struct list_head freed_nonpcpu;
- struct btree_cache_list live[2];
-
- size_t nr_freeable;
- size_t nr_reserve;
- size_t nr_by_btree[BTREE_ID_NR];
- atomic_long_t nr_dirty;
-
- /* shrinker stats */
- size_t nr_freed;
- u64 not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR];
-
- /*
- * If we need to allocate memory for a new btree node and that
- * allocation fails, we can cannibalize another node in the btree cache
- * to satisfy the allocation - lock to guarantee only one thread does
- * this at a time:
- */
- struct task_struct *alloc_lock;
- struct closure_waitlist alloc_wait;
-
- struct bbpos pinned_nodes_start;
- struct bbpos pinned_nodes_end;
- /* btree id mask: 0 for leaves, 1 for interior */
- u64 pinned_nodes_mask[2];
-};
-
-struct btree_node_iter {
- struct btree_node_iter_set {
- u16 k, end;
- } data[MAX_BSETS];
-};
-
-#define BTREE_ITER_FLAGS() \
- x(slots) \
- x(intent) \
- x(prefetch) \
- x(is_extents) \
- x(not_extents) \
- x(cached) \
- x(with_key_cache) \
- x(with_updates) \
- x(with_journal) \
- x(snapshot_field) \
- x(all_snapshots) \
- x(filter_snapshots) \
- x(nopreserve) \
- x(cached_nofill) \
- x(key_cache_fill) \
-
-#define STR_HASH_FLAGS() \
- x(must_create) \
- x(must_replace)
-
-#define BTREE_UPDATE_FLAGS() \
- x(internal_snapshot_node) \
- x(nojournal) \
- x(key_cache_reclaim)
-
-/*
- * BTREE_TRIGGER_norun - don't run triggers at all
- *
- * BTREE_TRIGGER_transactional - we're running transactional triggers as part of
- * a transaction commit: triggers may generate new updates
- *
- * BTREE_TRIGGER_atomic - we're running atomic triggers during a transaction
- * commit: we have our journal reservation, we're holding btree node write
- * locks, and we know the transaction is going to commit (returning an error
- * here is a fatal error, causing us to go emergency read-only)
- *
- * BTREE_TRIGGER_gc - we're in gc/fsck: running triggers to recalculate e.g. disk usage
- *
- * BTREE_TRIGGER_insert - @new is entering the btree
- * BTREE_TRIGGER_overwrite - @old is leaving the btree
- *
- * BTREE_TRIGGER_bucket_invalidate - signal from bucket invalidate path to alloc
- * trigger
- */
-#define BTREE_TRIGGER_FLAGS() \
- x(norun) \
- x(transactional) \
- x(atomic) \
- x(check_repair) \
- x(gc) \
- x(insert) \
- x(overwrite) \
- x(is_root) \
- x(bucket_invalidate)
-
-enum {
-#define x(n) BTREE_ITER_FLAG_BIT_##n,
- BTREE_ITER_FLAGS()
- STR_HASH_FLAGS()
- BTREE_UPDATE_FLAGS()
- BTREE_TRIGGER_FLAGS()
-#undef x
-};
-
-/* iter flags must fit in a u16: */
-//BUILD_BUG_ON(BTREE_ITER_FLAG_BIT_key_cache_fill > 15);
-
-enum btree_iter_update_trigger_flags {
-#define x(n) BTREE_ITER_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
- BTREE_ITER_FLAGS()
-#undef x
-#define x(n) STR_HASH_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
- STR_HASH_FLAGS()
-#undef x
-#define x(n) BTREE_UPDATE_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
- BTREE_UPDATE_FLAGS()
-#undef x
-#define x(n) BTREE_TRIGGER_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
- BTREE_TRIGGER_FLAGS()
-#undef x
-};
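-
-/*
- * Editor's sketch (illustrative, not from the original header): the x-macro
- * pattern above first enumerates bit positions across all four flag groups,
- * then turns each position into a flag value. A stripped-down version of the
- * same trick:
- */
-#if 0
-#define EXAMPLE_FLAGS()		\
-	x(foo)			\
-	x(bar)			\
-	x(baz)
-
-enum {
-#define x(n) EXAMPLE_FLAG_BIT_##n,
-	EXAMPLE_FLAGS()
-#undef x
-};
-
-enum example_flags {
-#define x(n) EXAMPLE_##n = 1U << EXAMPLE_FLAG_BIT_##n,
-	EXAMPLE_FLAGS()
-#undef x
-};
-/* EXAMPLE_foo == 1, EXAMPLE_bar == 2, EXAMPLE_baz == 4 */
-#endif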
-
-enum btree_path_uptodate {
- BTREE_ITER_UPTODATE = 0,
- BTREE_ITER_NEED_RELOCK = 1,
- BTREE_ITER_NEED_TRAVERSE = 2,
-};
-
-#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG)
-#define TRACK_PATH_ALLOCATED
-#endif
-
-typedef u16 btree_path_idx_t;
-
-struct btree_path {
- btree_path_idx_t sorted_idx;
- u8 ref;
- u8 intent_ref;
-
- /* btree_iter_copy starts here: */
- struct bpos pos;
-
- enum btree_id btree_id:5;
- bool cached:1;
- bool preserve:1;
- enum btree_path_uptodate uptodate:2;
- /*
- * When true, failing to relock this path will cause the transaction to
- * restart:
- */
- bool should_be_locked:1;
- unsigned level:3,
- locks_want:3;
- u8 nodes_locked;
-
- struct btree_path_level {
- struct btree *b;
- struct btree_node_iter iter;
- u32 lock_seq;
-#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
- u64 lock_taken_time;
-#endif
- } l[BTREE_MAX_DEPTH];
-#ifdef TRACK_PATH_ALLOCATED
- unsigned long ip_allocated;
-#endif
-};
-
-static inline struct btree_path_level *path_l(struct btree_path *path)
-{
- return path->l + path->level;
-}
-
-static inline unsigned long btree_path_ip_allocated(struct btree_path *path)
-{
-#ifdef TRACK_PATH_ALLOCATED
- return path->ip_allocated;
-#else
- return _THIS_IP_;
-#endif
-}
-
-/*
- * @pos - iterator's current position
- * @level - current btree depth
- * @locks_want - btree level below which we start taking intent locks
- * @nodes_locked - bitmask indicating which nodes in @nodes are locked
- * @nodes_intent_locked - bitmask indicating which locks are intent locks
- */
-struct btree_iter {
- struct btree_trans *trans;
- btree_path_idx_t path;
- btree_path_idx_t update_path;
- btree_path_idx_t key_cache_path;
-
- enum btree_id btree_id:8;
- u8 min_depth;
-
- /* btree_iter_copy starts here: */
- u16 flags;
-
- /* When we're filtering by snapshot, the snapshot ID we're looking for: */
- unsigned snapshot;
-
- struct bpos pos;
- /*
- * Current unpacked key - so that bch2_btree_iter_next()/
- * bch2_btree_iter_next_slot() can correctly advance pos.
- */
- struct bkey k;
-
- /* BTREE_ITER_with_journal: */
- size_t journal_idx;
-#ifdef TRACK_PATH_ALLOCATED
- unsigned long ip_allocated;
-#endif
-};
-
-#define BKEY_CACHED_ACCESSED 0
-#define BKEY_CACHED_DIRTY 1
-
-struct bkey_cached {
- struct btree_bkey_cached_common c;
-
- unsigned long flags;
- u16 u64s;
- struct bkey_cached_key key;
-
- struct rhash_head hash;
-
- struct journal_entry_pin journal;
- u64 seq;
-
- struct bkey_i *k;
- struct rcu_head rcu;
-};
-
-static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
-{
- return !b->cached
- ? container_of(b, struct btree, c)->key.k.p
- : container_of(b, struct bkey_cached, c)->key.pos;
-}
-
-struct btree_insert_entry {
- unsigned flags;
- u8 sort_order;
- u8 bkey_type;
- enum btree_id btree_id:8;
- u8 level:4;
- bool cached:1;
- bool insert_trigger_run:1;
- bool overwrite_trigger_run:1;
- bool key_cache_already_flushed:1;
- /*
- * @old_k may be a key from the journal; @old_btree_u64s always refers
- * to the size of the key being overwritten in the btree:
- */
- u8 old_btree_u64s;
- btree_path_idx_t path;
- struct bkey_i *k;
- /* key being overwritten: */
- struct bkey old_k;
- const struct bch_val *old_v;
- unsigned long ip_allocated;
-};
-
-/* Number of btree paths we preallocate, usually enough */
-#define BTREE_ITER_INITIAL 64
-/*
- * Limit for btree_trans_too_many_iters(); this is enough that almost all code
- * paths should run inside this limit, and if they don't it usually indicates a
- * bug (leaking/duplicated btree paths).
- *
- * exception: some fsck paths
- *
- * bugs with excessive path usage seem to have been eliminated now, so we
- * might consider eliminating this (and btree_trans_too_many_iters()) at some
- * point.
- */
-#define BTREE_ITER_NORMAL_LIMIT 256
-/* never exceed limit */
-#define BTREE_ITER_MAX (1U << 10)
-
-struct btree_trans_commit_hook;
-typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
-
-struct btree_trans_commit_hook {
- btree_trans_commit_hook_fn *fn;
- struct btree_trans_commit_hook *next;
-};
-
-#define BTREE_TRANS_MEM_MAX (1U << 16)
-
-#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000
-
-struct btree_trans_paths {
- unsigned long nr_paths;
- struct btree_path paths[];
-};
-
-struct btree_trans {
- struct bch_fs *c;
-
- unsigned long *paths_allocated;
- struct btree_path *paths;
- btree_path_idx_t *sorted;
- struct btree_insert_entry *updates;
-
- void *mem;
- unsigned mem_top;
- unsigned mem_bytes;
-
- btree_path_idx_t nr_sorted;
- btree_path_idx_t nr_paths;
- btree_path_idx_t nr_paths_max;
- btree_path_idx_t nr_updates;
- u8 fn_idx;
- u8 lock_must_abort;
- bool lock_may_not_fail:1;
- bool srcu_held:1;
- bool locked:1;
- bool pf_memalloc_nofs:1;
- bool write_locked:1;
- bool used_mempool:1;
- bool in_traverse_all:1;
- bool paths_sorted:1;
- bool memory_allocation_failure:1;
- bool journal_transaction_names:1;
- bool journal_replay_not_finished:1;
- bool notrace_relock_fail:1;
- enum bch_errcode restarted:16;
- u32 restart_count;
-#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
- u32 restart_count_this_trans;
-#endif
-
- u64 last_begin_time;
- unsigned long last_begin_ip;
- unsigned long last_restarted_ip;
-#ifdef CONFIG_BCACHEFS_DEBUG
- bch_stacktrace last_restarted_trace;
-#endif
- unsigned long last_unlock_ip;
- unsigned long srcu_lock_time;
-
- const char *fn;
- struct btree_bkey_cached_common *locking;
- struct six_lock_waiter locking_wait;
- int srcu_idx;
-
- /* update path: */
- u16 journal_entries_u64s;
- u16 journal_entries_size;
- struct jset_entry *journal_entries;
-
- struct btree_trans_commit_hook *hooks;
- struct journal_entry_pin *journal_pin;
-
- struct journal_res journal_res;
- u64 *journal_seq;
- struct disk_reservation *disk_res;
-
- struct bch_fs_usage_base fs_usage_delta;
-
- unsigned journal_u64s;
- unsigned extra_disk_res; /* XXX kill */
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
-#endif
- /* Entries before this are zeroed out on every bch2_trans_get() call */
-
- struct list_head list;
- struct closure ref;
-
- unsigned long _paths_allocated[BITS_TO_LONGS(BTREE_ITER_INITIAL)];
- struct btree_trans_paths trans_paths;
- struct btree_path _paths[BTREE_ITER_INITIAL];
- btree_path_idx_t _sorted[BTREE_ITER_INITIAL + 4];
- struct btree_insert_entry _updates[BTREE_ITER_INITIAL];
-};
-
-static inline struct btree_path *btree_iter_path(struct btree_trans *trans, struct btree_iter *iter)
-{
- return trans->paths + iter->path;
-}
-
-static inline struct btree_path *btree_iter_key_cache_path(struct btree_trans *trans, struct btree_iter *iter)
-{
- return iter->key_cache_path
- ? trans->paths + iter->key_cache_path
- : NULL;
-}
-
-#define BCH_BTREE_WRITE_TYPES() \
- x(initial, 0) \
- x(init_next_bset, 1) \
- x(cache_reclaim, 2) \
- x(journal_reclaim, 3) \
- x(interior, 4)
-
-enum btree_write_type {
-#define x(t, n) BTREE_WRITE_##t,
- BCH_BTREE_WRITE_TYPES()
-#undef x
- BTREE_WRITE_TYPE_NR,
-};
-
-#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1)
-#define BTREE_WRITE_TYPE_BITS ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR))
-
-#define BTREE_FLAGS() \
- x(read_in_flight) \
- x(read_error) \
- x(dirty) \
- x(need_write) \
- x(write_blocked) \
- x(will_make_reachable) \
- x(noevict) \
- x(write_idx) \
- x(accessed) \
- x(write_in_flight) \
- x(write_in_flight_inner) \
- x(just_written) \
- x(dying) \
- x(fake) \
- x(need_rewrite) \
- x(never_write) \
- x(pinned)
-
-enum btree_flags {
- /* First bits for btree node write type */
- BTREE_NODE_FLAGS_START = BTREE_WRITE_TYPE_BITS - 1,
-#define x(flag) BTREE_NODE_##flag,
- BTREE_FLAGS()
-#undef x
-};
-
-#define x(flag) \
-static inline bool btree_node_ ## flag(struct btree *b) \
-{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
- \
-static inline void set_btree_node_ ## flag(struct btree *b) \
-{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
- \
-static inline void clear_btree_node_ ## flag(struct btree *b) \
-{ clear_bit(BTREE_NODE_ ## flag, &b->flags); }
-
-BTREE_FLAGS()
-#undef x
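-
-/*
- * Editor's sketch: for each flag, the block above generates a test/set/clear
- * triplet; e.g. the expansion for `dirty` is equivalent to:
- */
-#if 0
-static inline bool btree_node_dirty(struct btree *b)
-{ return test_bit(BTREE_NODE_dirty, &b->flags); }
-
-static inline void set_btree_node_dirty(struct btree *b)
-{ set_bit(BTREE_NODE_dirty, &b->flags); }
-
-static inline void clear_btree_node_dirty(struct btree *b)
-{ clear_bit(BTREE_NODE_dirty, &b->flags); }
-#endif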
-
-static inline struct btree_write *btree_current_write(struct btree *b)
-{
- return b->writes + btree_node_write_idx(b);
-}
-
-static inline struct btree_write *btree_prev_write(struct btree *b)
-{
- return b->writes + (btree_node_write_idx(b) ^ 1);
-}
-
-static inline struct bset_tree *bset_tree_last(struct btree *b)
-{
- EBUG_ON(!b->nsets);
- return b->set + b->nsets - 1;
-}
-
-static inline void *
-__btree_node_offset_to_ptr(const struct btree *b, u16 offset)
-{
- return (void *) ((u64 *) b->data + 1 + offset);
-}
-
-static inline u16
-__btree_node_ptr_to_offset(const struct btree *b, const void *p)
-{
- u16 ret = (u64 *) p - 1 - (u64 *) b->data;
-
- EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p);
- return ret;
-}
-
-static inline struct bset *bset(const struct btree *b,
- const struct bset_tree *t)
-{
- return __btree_node_offset_to_ptr(b, t->data_offset);
-}
-
-static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
-{
- t->end_offset =
- __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t)));
-}
-
-static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
- const struct bset *i)
-{
- t->data_offset = __btree_node_ptr_to_offset(b, i);
- set_btree_bset_end(b, t);
-}
-
-static inline struct bset *btree_bset_first(struct btree *b)
-{
- return bset(b, b->set);
-}
-
-static inline struct bset *btree_bset_last(struct btree *b)
-{
- return bset(b, bset_tree_last(b));
-}
-
-static inline u16
-__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
-{
- return __btree_node_ptr_to_offset(b, k);
-}
-
-static inline struct bkey_packed *
-__btree_node_offset_to_key(const struct btree *b, u16 k)
-{
- return __btree_node_offset_to_ptr(b, k);
-}
-
-static inline unsigned btree_bkey_first_offset(const struct bset_tree *t)
-{
- return t->data_offset + offsetof(struct bset, _data) / sizeof(u64);
-}
-
-#define btree_bkey_first(_b, _t) \
-({ \
- EBUG_ON(bset(_b, _t)->start != \
- __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\
- \
- bset(_b, _t)->start; \
-})
-
-#define btree_bkey_last(_b, _t) \
-({ \
- EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \
- vstruct_last(bset(_b, _t))); \
- \
- __btree_node_offset_to_key(_b, (_t)->end_offset); \
-})
-
-static inline unsigned bset_u64s(struct bset_tree *t)
-{
- return t->end_offset - t->data_offset -
- sizeof(struct bset) / sizeof(u64);
-}
-
-static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t)
-{
- return bset_u64s(t) - b->nr.bset_u64s[t - b->set];
-}
-
-static inline unsigned bset_byte_offset(struct btree *b, void *i)
-{
- return i - (void *) b->data;
-}
-
-enum btree_node_type {
- BKEY_TYPE_btree,
-#define x(kwd, val, ...) BKEY_TYPE_##kwd = val + 1,
- BCH_BTREE_IDS()
-#undef x
- BKEY_TYPE_NR
-};
-
-/* Type of a key in btree @id at level @level: */
-static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
-{
- return level ? BKEY_TYPE_btree : (unsigned) id + 1;
-}
-
-/* Type of keys @b contains: */
-static inline enum btree_node_type btree_node_type(struct btree *b)
-{
- return __btree_node_type(b->c.level, b->c.btree_id);
-}
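-
-/*
- * Editor's note: interior nodes (level > 0) always contain btree pointers and
- * so share BKEY_TYPE_btree; only leaves take the per-btree key type, which is
- * why BKEY_TYPE_##kwd is offset by one from the btree id above.
- */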
-
-const char *bch2_btree_node_type_str(enum btree_node_type);
-
-#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
- (BIT_ULL(BKEY_TYPE_extents)| \
- BIT_ULL(BKEY_TYPE_alloc)| \
- BIT_ULL(BKEY_TYPE_inodes)| \
- BIT_ULL(BKEY_TYPE_stripes)| \
- BIT_ULL(BKEY_TYPE_reflink)| \
- BIT_ULL(BKEY_TYPE_subvolumes)| \
- BIT_ULL(BKEY_TYPE_btree))
-
-#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \
- (BIT_ULL(BKEY_TYPE_alloc)| \
- BIT_ULL(BKEY_TYPE_inodes)| \
- BIT_ULL(BKEY_TYPE_stripes)| \
- BIT_ULL(BKEY_TYPE_snapshots))
-
-#define BTREE_NODE_TYPE_HAS_TRIGGERS \
- (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
- BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS)
-
-static inline bool btree_node_type_has_trans_triggers(enum btree_node_type type)
-{
- return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS;
-}
-
-static inline bool btree_node_type_has_atomic_triggers(enum btree_node_type type)
-{
- return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS;
-}
-
-static inline bool btree_node_type_has_triggers(enum btree_node_type type)
-{
- return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS;
-}
-
-static inline bool btree_id_is_extents(enum btree_id btree)
-{
- const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_extents)) << nr)
- BCH_BTREE_IDS()
-#undef x
- ;
-
- return BIT_ULL(btree) & mask;
-}
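-
-/*
- * Editor's note: the mask above is built entirely from compile-time constants,
- * so btree_id_is_extents() (and the similar helpers below) folds to a single
- * test against a constant bitmap - one bit per btree id whose flags in
- * BCH_BTREE_IDS() include the queried BTREE_IS_* flag.
- */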
-
-static inline bool btree_node_type_is_extents(enum btree_node_type type)
-{
- return type != BKEY_TYPE_btree && btree_id_is_extents(type - 1);
-}
-
-static inline bool btree_type_has_snapshots(enum btree_id btree)
-{
- const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_snapshots)) << nr)
- BCH_BTREE_IDS()
-#undef x
- ;
-
- return BIT_ULL(btree) & mask;
-}
-
-static inline bool btree_type_has_snapshot_field(enum btree_id btree)
-{
- const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_IS_snapshot_field|BTREE_IS_snapshots))) << nr)
- BCH_BTREE_IDS()
-#undef x
- ;
-
- return BIT_ULL(btree) & mask;
-}
-
-static inline bool btree_type_has_ptrs(enum btree_id btree)
-{
- const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_data)) << nr)
- BCH_BTREE_IDS()
-#undef x
- ;
-
- return BIT_ULL(btree) & mask;
-}
-
-static inline bool btree_type_uses_write_buffer(enum btree_id btree)
-{
- const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_write_buffer)) << nr)
- BCH_BTREE_IDS()
-#undef x
- ;
-
- return BIT_ULL(btree) & mask;
-}
-
-static inline u8 btree_trigger_order(enum btree_id btree)
-{
- switch (btree) {
- case BTREE_ID_alloc:
- return U8_MAX;
- case BTREE_ID_stripes:
- return U8_MAX - 1;
- default:
- return btree;
- }
-}
-
-struct btree_root {
- struct btree *b;
-
- /* On disk root - see async splits: */
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
- u8 level;
- u8 alive;
- s16 error;
-};
-
-enum btree_gc_coalesce_fail_reason {
- BTREE_GC_COALESCE_FAIL_RESERVE_GET,
- BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
- BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
-};
-
-enum btree_node_sibling {
- btree_prev_sib,
- btree_next_sib,
-};
-
-struct get_locks_fail {
- unsigned l;
- struct btree *b;
-};
-
-#endif /* _BCACHEFS_BTREE_TYPES_H */
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
deleted file mode 100644
index bd2eb42edb24..000000000000
--- a/fs/bcachefs/btree_update.c
+++ /dev/null
@@ -1,908 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_update.h"
-#include "btree_iter.h"
-#include "btree_journal_iter.h"
-#include "btree_locking.h"
-#include "buckets.h"
-#include "debug.h"
-#include "errcode.h"
-#include "error.h"
-#include "extents.h"
-#include "keylist.h"
-#include "snapshot.h"
-#include "trace.h"
-
-static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
- const struct btree_insert_entry *r)
-{
- return cmp_int(l->sort_order, r->sort_order) ?:
- cmp_int(l->cached, r->cached) ?:
- -cmp_int(l->level, r->level) ?:
- bpos_cmp(l->k->k.p, r->k->k.p);
-}
-
-static int __must_check
-bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t,
- struct bkey_i *, enum btree_iter_update_trigger_flags,
- unsigned long ip);
-
-static noinline int extent_front_merge(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- struct bkey_i **insert,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_i *update;
- int ret;
-
- if (unlikely(trans->journal_replay_not_finished))
- return 0;
-
- update = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(update);
- if (ret)
- return ret;
-
- if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert)))
- return 0;
-
- ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p) ?:
- bch2_key_has_snapshot_overwrites(trans, iter->btree_id, (*insert)->k.p);
- if (ret < 0)
- return ret;
- if (ret)
- return 0;
-
- ret = bch2_btree_delete_at(trans, iter, flags);
- if (ret)
- return ret;
-
- *insert = update;
- return 0;
-}
-
-static noinline int extent_back_merge(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *insert,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- int ret;
-
- if (unlikely(trans->journal_replay_not_finished))
- return 0;
-
- ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?:
- bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p);
- if (ret < 0)
- return ret;
- if (ret)
- return 0;
-
- bch2_bkey_merge(c, bkey_i_to_s(insert), k);
- return 0;
-}
-
-/*
- * When deleting, check if we need to emit a whiteout (because we're overwriting
- * something in an ancestor snapshot)
- */
-static int need_whiteout_for_snapshot(struct btree_trans *trans,
- enum btree_id btree_id, struct bpos pos)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- u32 snapshot = pos.snapshot;
- int ret;
-
- if (!bch2_snapshot_parent(trans->c, pos.snapshot))
- return 0;
-
- pos.snapshot++;
-
- for_each_btree_key_norestart(trans, iter, btree_id, pos,
- BTREE_ITER_all_snapshots|
- BTREE_ITER_nopreserve, k, ret) {
- if (!bkey_eq(k.k->p, pos))
- break;
-
- if (bch2_snapshot_is_ancestor(trans->c, snapshot,
- k.k->p.snapshot)) {
- ret = !bkey_whiteout(k.k);
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
- enum btree_id id,
- struct bpos old_pos,
- struct bpos new_pos)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter old_iter, new_iter = { NULL };
- struct bkey_s_c old_k, new_k;
- snapshot_id_list s;
- struct bkey_i *update;
- int ret = 0;
-
- if (!bch2_snapshot_has_children(c, old_pos.snapshot))
- return 0;
-
- darray_init(&s);
-
- bch2_trans_iter_init(trans, &old_iter, id, old_pos,
- BTREE_ITER_not_extents|
- BTREE_ITER_all_snapshots);
- while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
- !(ret = bkey_err(old_k)) &&
- bkey_eq(old_pos, old_k.k->p)) {
- struct bpos whiteout_pos =
- SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
-
- if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
- snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
- continue;
-
- new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
- BTREE_ITER_not_extents|
- BTREE_ITER_intent);
- ret = bkey_err(new_k);
- if (ret)
- break;
-
- if (new_k.k->type == KEY_TYPE_deleted) {
- update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
- ret = PTR_ERR_OR_ZERO(update);
- if (ret)
- break;
-
- bkey_init(&update->k);
- update->k.p = whiteout_pos;
- update->k.type = KEY_TYPE_whiteout;
-
- ret = bch2_trans_update(trans, &new_iter, update,
- BTREE_UPDATE_internal_snapshot_node);
- }
- bch2_trans_iter_exit(trans, &new_iter);
-
- ret = snapshot_list_add(c, &s, old_k.k->p.snapshot);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &new_iter);
- bch2_trans_iter_exit(trans, &old_iter);
- darray_exit(&s);
-
- return ret;
-}
-
-int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
- struct btree_iter *iter,
- enum btree_iter_update_trigger_flags flags,
- struct bkey_s_c old,
- struct bkey_s_c new)
-{
- enum btree_id btree_id = iter->btree_id;
- struct bkey_i *update;
- struct bpos new_start = bkey_start_pos(new.k);
- unsigned front_split = bkey_lt(bkey_start_pos(old.k), new_start);
- unsigned back_split = bkey_gt(old.k->p, new.k->p);
- unsigned middle_split = (front_split || back_split) &&
- old.k->p.snapshot != new.k->p.snapshot;
- unsigned nr_splits = front_split + back_split + middle_split;
- int ret = 0, compressed_sectors;
-
- /*
- * If we're going to be splitting a compressed extent, note it
- * so that __bch2_trans_commit() can increase our disk
- * reservation:
- */
- if (nr_splits > 1 &&
- (compressed_sectors = bch2_bkey_sectors_compressed(old)))
- trans->extra_disk_res += compressed_sectors * (nr_splits - 1);
-
- if (front_split) {
- update = bch2_bkey_make_mut_noupdate(trans, old);
- if ((ret = PTR_ERR_OR_ZERO(update)))
- return ret;
-
- bch2_cut_back(new_start, update);
-
- ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
- old.k->p, update->k.p) ?:
- bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_internal_snapshot_node|flags);
- if (ret)
- return ret;
- }
-
- /* If we're overwriting in a different snapshot - middle split: */
- if (middle_split) {
- update = bch2_bkey_make_mut_noupdate(trans, old);
- if ((ret = PTR_ERR_OR_ZERO(update)))
- return ret;
-
- bch2_cut_front(new_start, update);
- bch2_cut_back(new.k->p, update);
-
- ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
- old.k->p, update->k.p) ?:
- bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_internal_snapshot_node|flags);
- if (ret)
- return ret;
- }
-
- if (bkey_le(old.k->p, new.k->p)) {
- update = bch2_trans_kmalloc(trans, sizeof(*update));
- if ((ret = PTR_ERR_OR_ZERO(update)))
- return ret;
-
- bkey_init(&update->k);
- update->k.p = old.k->p;
- update->k.p.snapshot = new.k->p.snapshot;
-
- if (new.k->p.snapshot != old.k->p.snapshot) {
- update->k.type = KEY_TYPE_whiteout;
- } else if (btree_type_has_snapshots(btree_id)) {
- ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
- if (ret < 0)
- return ret;
- if (ret)
- update->k.type = KEY_TYPE_whiteout;
- }
-
- ret = bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_internal_snapshot_node|flags);
- if (ret)
- return ret;
- }
-
- if (back_split) {
- update = bch2_bkey_make_mut_noupdate(trans, old);
- if ((ret = PTR_ERR_OR_ZERO(update)))
- return ret;
-
- bch2_cut_front(new.k->p, update);
-
- ret = bch2_trans_update_by_path(trans, iter->path, update,
- BTREE_UPDATE_internal_snapshot_node|
- flags, _RET_IP_);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static int bch2_trans_update_extent(struct btree_trans *trans,
- struct btree_iter *orig_iter,
- struct bkey_i *insert,
- enum btree_iter_update_trigger_flags flags)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- enum btree_id btree_id = orig_iter->btree_id;
- int ret = 0;
-
- bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k),
- BTREE_ITER_intent|
- BTREE_ITER_with_updates|
- BTREE_ITER_not_extents);
- k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
- if ((ret = bkey_err(k)))
- goto err;
- if (!k.k)
- goto out;
-
- if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) {
- if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
- ret = extent_front_merge(trans, &iter, k, &insert, flags);
- if (ret)
- goto err;
- }
-
- goto next;
- }
-
- while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) {
- bool done = bkey_lt(insert->k.p, k.k->p);
-
- ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
- if (ret)
- goto err;
-
- if (done)
- goto out;
-next:
- bch2_btree_iter_advance(&iter);
- k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
- if ((ret = bkey_err(k)))
- goto err;
- if (!k.k)
- goto out;
- }
-
- if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
- ret = extent_back_merge(trans, &iter, insert, k);
- if (ret)
- goto err;
- }
-out:
- if (!bkey_deleted(&insert->k))
- ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags);
-err:
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-static noinline int flush_new_cached_update(struct btree_trans *trans,
- struct btree_insert_entry *i,
- enum btree_iter_update_trigger_flags flags,
- unsigned long ip)
-{
- struct bkey k;
- int ret;
-
- btree_path_idx_t path_idx =
- bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0,
- BTREE_ITER_intent, _THIS_IP_);
- ret = bch2_btree_path_traverse(trans, path_idx, 0);
- if (ret)
- goto out;
-
- struct btree_path *btree_path = trans->paths + path_idx;
-
- /*
- * The old key in the insert entry might actually refer to an existing
- * key in the btree that has been deleted from cache and not yet
- * flushed. Check for this and skip the flush so we don't run triggers
- * against a stale key.
- */
- bch2_btree_path_peek_slot_exact(btree_path, &k);
- if (!bkey_deleted(&k))
- goto out;
-
- i->key_cache_already_flushed = true;
- i->flags |= BTREE_TRIGGER_norun;
-
- btree_path_set_should_be_locked(trans, btree_path);
- ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip);
-out:
- bch2_path_put(trans, path_idx, true);
- return ret;
-}
-
-static int __must_check
-bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
- struct bkey_i *k, enum btree_iter_update_trigger_flags flags,
- unsigned long ip)
-{
- struct bch_fs *c = trans->c;
- struct btree_insert_entry *i, n;
- int cmp;
-
- struct btree_path *path = trans->paths + path_idx;
- EBUG_ON(!path->should_be_locked);
- EBUG_ON(trans->nr_updates >= trans->nr_paths);
- EBUG_ON(!bpos_eq(k->k.p, path->pos));
-
- n = (struct btree_insert_entry) {
- .flags = flags,
- .sort_order = btree_trigger_order(path->btree_id),
- .bkey_type = __btree_node_type(path->level, path->btree_id),
- .btree_id = path->btree_id,
- .level = path->level,
- .cached = path->cached,
- .path = path_idx,
- .k = k,
- .ip_allocated = ip,
- };
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- trans_for_each_update(trans, i)
- BUG_ON(i != trans->updates &&
- btree_insert_entry_cmp(i - 1, i) >= 0);
-#endif
-
- /*
- * Pending updates are kept sorted: first, find position of new update,
- * then delete/trim any updates the new update overwrites:
- */
- for (i = trans->updates; i < trans->updates + trans->nr_updates; i++) {
- cmp = btree_insert_entry_cmp(&n, i);
- if (cmp <= 0)
- break;
- }
-
- bool overwrite = !cmp && i < trans->updates + trans->nr_updates;
-
- if (overwrite) {
- EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
-
- bch2_path_put(trans, i->path, true);
- i->flags = n.flags;
- i->cached = n.cached;
- i->k = n.k;
- i->path = n.path;
- i->ip_allocated = n.ip_allocated;
- } else {
- array_insert_item(trans->updates, trans->nr_updates,
- i - trans->updates, n);
-
- i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v;
- i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
-
- if (unlikely(trans->journal_replay_not_finished)) {
- struct bkey_i *j_k =
- bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p);
-
- if (j_k) {
- i->old_k = j_k->k;
- i->old_v = &j_k->v;
- }
- }
- }
-
- __btree_path_get(trans, trans->paths + i->path, true);
-
- trace_update_by_path(trans, path, i, overwrite);
-
- /*
- * If a key is present in the key cache, it must also exist in the
- * btree - this is necessary for cache coherency. When iterating over
- * a btree that's cached in the key cache, the btree iter code checks
- * the key cache - but the key has to exist in the btree for that to
- * work:
- */
- if (path->cached && !i->old_btree_u64s)
- return flush_new_cached_update(trans, i, flags, ip);
-
- return 0;
-}
-
-static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
- struct btree_iter *iter,
- struct btree_path *path)
-{
- struct btree_path *key_cache_path = btree_iter_key_cache_path(trans, iter);
-
- if (!key_cache_path ||
- !key_cache_path->should_be_locked ||
- !bpos_eq(key_cache_path->pos, iter->pos)) {
- struct bkey_cached *ck;
- int ret;
-
- if (!iter->key_cache_path)
- iter->key_cache_path =
- bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
- BTREE_ITER_intent|
- BTREE_ITER_cached, _THIS_IP_);
-
- iter->key_cache_path =
- bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
- iter->flags & BTREE_ITER_intent,
- _THIS_IP_);
-
- ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_cached);
- if (unlikely(ret))
- return ret;
-
- ck = (void *) trans->paths[iter->key_cache_path].l[0].b;
-
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
- }
-
- btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
- }
-
- return 0;
-}
-
-int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
-{
- kmsan_check_memory(k, bkey_bytes(&k->k));
-
- btree_path_idx_t path_idx = iter->update_path ?: iter->path;
- int ret;
-
- if (iter->flags & BTREE_ITER_is_extents)
- return bch2_trans_update_extent(trans, iter, k, flags);
-
- if (bkey_deleted(&k->k) &&
- !(flags & BTREE_UPDATE_key_cache_reclaim) &&
- (iter->flags & BTREE_ITER_filter_snapshots)) {
- ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
- if (unlikely(ret < 0))
- return ret;
-
- if (ret)
- k->k.type = KEY_TYPE_whiteout;
- }
-
- /*
- * Ensure that updates to cached btrees go to the key cache:
- */
- struct btree_path *path = trans->paths + path_idx;
- if (!(flags & BTREE_UPDATE_key_cache_reclaim) &&
- !path->cached &&
- !path->level &&
- btree_id_cached(trans->c, path->btree_id)) {
- ret = bch2_trans_update_get_key_cache(trans, iter, path);
- if (ret)
- return ret;
-
- path_idx = iter->key_cache_path;
- }
-
- return bch2_trans_update_by_path(trans, path_idx, k, flags, _RET_IP_);
-}
-
-int bch2_btree_insert_clone_trans(struct btree_trans *trans,
- enum btree_id btree,
- struct bkey_i *k)
-{
- struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k));
- int ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- return ret;
-
- bkey_copy(n, k);
- return bch2_btree_insert_trans(trans, btree, n, 0);
-}
-
-struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
-{
- unsigned new_top = trans->journal_entries_u64s + u64s;
- unsigned old_size = trans->journal_entries_size;
-
- if (new_top > trans->journal_entries_size) {
- trans->journal_entries_size = roundup_pow_of_two(new_top);
-
- btree_trans_stats(trans)->journal_entries_size = trans->journal_entries_size;
- }
-
- struct jset_entry *n =
- bch2_trans_kmalloc_nomemzero(trans,
- trans->journal_entries_size * sizeof(u64));
- if (IS_ERR(n))
- return ERR_CAST(n);
-
- if (trans->journal_entries)
- memcpy(n, trans->journal_entries, old_size * sizeof(u64));
- trans->journal_entries = n;
-
- struct jset_entry *e = btree_trans_journal_entries_top(trans);
- trans->journal_entries_u64s = new_top;
- return e;
-}
-
-int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
- enum btree_id btree, struct bpos end)
-{
- bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent);
- struct bkey_s_c k = bch2_btree_iter_peek_prev(iter);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- bch2_btree_iter_advance(iter);
- k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- BUG_ON(k.k->type != KEY_TYPE_deleted);
-
- if (bkey_gt(k.k->p, end)) {
- ret = -BCH_ERR_ENOSPC_btree_slot;
- goto err;
- }
-
- return 0;
-err:
- bch2_trans_iter_exit(trans, iter);
- return ret;
-}
-
-void bch2_trans_commit_hook(struct btree_trans *trans,
- struct btree_trans_commit_hook *h)
-{
- h->next = trans->hooks;
- trans->hooks = h;
-}
-
-int bch2_btree_insert_nonextent(struct btree_trans *trans,
- enum btree_id btree, struct bkey_i *k,
- enum btree_iter_update_trigger_flags flags)
-{
- struct btree_iter iter;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, btree, k->k.p,
- BTREE_ITER_cached|
- BTREE_ITER_not_extents|
- BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(trans, &iter, k, flags);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
- struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
-{
- struct btree_iter iter;
- bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
- BTREE_ITER_intent|flags);
- int ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(trans, &iter, k, flags);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/**
- * bch2_btree_insert - insert a key into a btree
- * @c: pointer to struct bch_fs
- * @id: btree to insert into
- * @k: key to insert
- * @disk_res: must be non-NULL whenever inserting or potentially
- * splitting data extents
- * @flags: transaction commit flags
- * @iter_flags: btree iter update trigger flags
- *
- * Returns: 0 on success, error code on failure
- */
-int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k,
- struct disk_reservation *disk_res, int flags,
- enum btree_iter_update_trigger_flags iter_flags)
-{
- return bch2_trans_commit_do(c, disk_res, NULL, flags,
- bch2_btree_insert_trans(trans, id, k, iter_flags));
-}
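-
-/*
- * Editor's sketch (hypothetical caller; the btree id and commit flags are
- * assumptions): committing a single non-extent key from process context,
- * relying on bch2_trans_commit_do() to retry on transaction restart:
- */
-#if 0
-static int example_insert_one(struct bch_fs *c, struct bkey_i *k)
-{
-	return bch2_btree_insert(c, BTREE_ID_xattrs, k, NULL,
-				 BCH_TRANS_COMMIT_no_enospc, 0);
-}
-#endif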
-
-int bch2_btree_delete_at(struct btree_trans *trans,
- struct btree_iter *iter, unsigned update_flags)
-{
- struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
- int ret = PTR_ERR_OR_ZERO(k);
- if (ret)
- return ret;
-
- bkey_init(&k->k);
- k->k.p = iter->pos;
- return bch2_trans_update(trans, iter, k, update_flags);
-}
-
-int bch2_btree_delete(struct btree_trans *trans,
- enum btree_id btree, struct bpos pos,
- unsigned update_flags)
-{
- struct btree_iter iter;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, btree, pos,
- BTREE_ITER_cached|
- BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_btree_delete_at(trans, &iter, update_flags);
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
- struct bpos start, struct bpos end,
- unsigned update_flags,
- u64 *journal_seq)
-{
- u32 restart_count = trans->restart_count;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent);
- while ((k = bch2_btree_iter_peek_max(&iter, end)).k) {
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(trans->c, 0);
- struct bkey_i delete;
-
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- bkey_init(&delete.k);
-
- /*
- * This could probably be more efficient for extents:
- */
-
- /*
- * For extents, iter.pos won't necessarily be the same as
- * bkey_start_pos(k.k) (for non extents they always will be the
- * same). It's important that we delete starting from iter.pos
- * because the range we want to delete could start in the middle
- * of k.
- *
- * (bch2_btree_iter_peek() does guarantee that iter.pos >=
- * bkey_start_pos(k.k)).
- */
- delete.k.p = iter.pos;
-
- if (iter.flags & BTREE_ITER_is_extents)
- bch2_key_resize(&delete.k,
- bpos_min(end, k.k->p).offset -
- iter.pos.offset);
-
- ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
- bch2_trans_commit(trans, &disk_res, journal_seq,
- BCH_TRANS_COMMIT_no_enospc);
- bch2_disk_reservation_put(trans->c, &disk_res);
-err:
- /*
- * the bch2_trans_begin() call is in a weird place because we
- * need to call it after every transaction commit, to avoid path
- * overflow, but don't want to call it if the delete operation
- * is a no-op and we have no work to do:
- */
- bch2_trans_begin(trans);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- ret = 0;
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return ret ?: trans_was_restarted(trans, restart_count);
-}
-
-/*
- * bch2_btree_delete_range - delete everything within a given range
- *
- * Range is a half open interval - [start, end)
- */
-int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
- struct bpos start, struct bpos end,
- unsigned update_flags,
- u64 *journal_seq)
-{
- int ret = bch2_trans_run(c,
- bch2_btree_delete_range_trans(trans, id, start, end,
- update_flags, journal_seq));
- if (ret == -BCH_ERR_transaction_restart_nested)
- ret = 0;
- return ret;
-}
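-
-/*
- * Editor's sketch (hypothetical): because the interval is half open, deleting
- * every key belonging to inode `inum` in a btree could be written as:
- */
-#if 0
-	ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
-				      POS(inum, 0), POS(inum + 1, 0),
-				      0, NULL);
-#endif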
-
-int bch2_btree_bit_mod_iter(struct btree_trans *trans, struct btree_iter *iter, bool set)
-{
- struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
- int ret = PTR_ERR_OR_ZERO(k);
- if (ret)
- return ret;
-
- bkey_init(&k->k);
- k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
- k->k.p = iter->pos;
- if (iter->flags & BTREE_ITER_is_extents)
- bch2_key_resize(&k->k, 1);
-
- return bch2_trans_update(trans, iter, k, 0);
-}
-
-int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
- struct bpos pos, bool set)
-{
- struct btree_iter iter;
- bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
-
- int ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_btree_bit_mod_iter(trans, &iter, set);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree,
- struct bpos pos, bool set)
-{
- struct bkey_i k;
-
- bkey_init(&k.k);
- k.k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
- k.k.p = pos;
-
- return bch2_trans_update_buffered(trans, btree, &k);
-}
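-
-/*
- * Editor's note: unlike bch2_btree_bit_mod(), the buffered variant above
- * stages the key through the btree write buffer (see
- * bch2_trans_update_buffered() in btree_update.h), so it needs neither an
- * iterator nor node locks at this point.
- */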
-
-int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf)
-{
- unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64));
- prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos);
-
- int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
- if (ret)
- return ret;
-
- struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s));
- ret = PTR_ERR_OR_ZERO(e);
- if (ret)
- return ret;
-
- struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry);
- journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s);
- memcpy(l->d, buf->buf, buf->pos);
- return 0;
-}
-
-__printf(3, 0)
-static int
-__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
- va_list args)
-{
- struct printbuf buf = PRINTBUF;
- prt_vprintf(&buf, fmt, args);
-
- unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
- prt_chars(&buf, '\0', u64s * sizeof(u64) - buf.pos);
-
- int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
- if (ret)
- goto err;
-
- if (!test_bit(JOURNAL_running, &c->journal.flags)) {
- ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s));
- if (ret)
- goto err;
-
- struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries);
- journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s);
- memcpy(l->d, buf.buf, buf.pos);
- c->journal.early_journal_entries.nr += jset_u64s(u64s);
- } else {
- ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags,
- bch2_trans_log_msg(trans, &buf));
- }
-err:
- printbuf_exit(&buf);
- return ret;
-}
-
-__printf(2, 3)
-int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
-{
- va_list args;
- int ret;
-
- va_start(args, fmt);
- ret = __bch2_fs_log_msg(c, 0, fmt, args);
- va_end(args);
- return ret;
-}
-
-/*
- * Use for logging messages during recovery to enable reserved space and avoid
- * blocking.
- */
-__printf(2, 3)
-int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...)
-{
- va_list args;
- int ret;
-
- va_start(args, fmt);
- ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args);
- va_end(args);
- return ret;
-}
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
deleted file mode 100644
index d2e1c04353f6..000000000000
--- a/fs/bcachefs/btree_update.h
+++ /dev/null
@@ -1,378 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_UPDATE_H
-#define _BCACHEFS_BTREE_UPDATE_H
-
-#include "btree_iter.h"
-#include "journal.h"
-
-struct bch_fs;
-struct btree;
-
-void bch2_btree_node_prep_for_write(struct btree_trans *,
- struct btree_path *, struct btree *);
-bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
- struct btree *, struct btree_node_iter *,
- struct bkey_i *);
-
-int bch2_btree_node_flush0(struct journal *, struct journal_entry_pin *, u64);
-int bch2_btree_node_flush1(struct journal *, struct journal_entry_pin *, u64);
-void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
-
-void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
- struct bkey_i *, u64);
-
-#define BCH_TRANS_COMMIT_FLAGS() \
- x(no_enospc, "don't check for enospc") \
- x(no_check_rw, "don't attempt to take a ref on c->writes") \
- x(no_journal_res, "don't take a journal reservation, instead " \
- "pin journal entry referred to by trans->journal_res.seq") \
-	x(journal_reclaim,	"operation required for journal reclaim; may return error " \
-				"instead of deadlocking if BCH_WATERMARK_reclaim not specified")\
- x(skip_accounting_apply, "we're in journal replay - accounting updates have already been applied")
-
-enum __bch_trans_commit_flags {
- /* First bits for bch_watermark: */
- __BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS,
-#define x(n, ...) __BCH_TRANS_COMMIT_##n,
- BCH_TRANS_COMMIT_FLAGS()
-#undef x
-};
-
-enum bch_trans_commit_flags {
-#define x(n, ...) BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n),
- BCH_TRANS_COMMIT_FLAGS()
-#undef x
-};
-
-void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags);
-
-int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
-int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned);
-
-int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
- struct bkey_i *, enum btree_iter_update_trigger_flags);
-
-int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *,
- enum btree_iter_update_trigger_flags);
-int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct
- disk_reservation *, int flags, enum
- btree_iter_update_trigger_flags iter_flags);
-
-int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
- struct bpos, struct bpos, unsigned, u64 *);
-int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
- struct bpos, struct bpos, unsigned, u64 *);
-
-int bch2_btree_bit_mod_iter(struct btree_trans *, struct btree_iter *, bool);
-int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool);
-int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool);
-
-static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans,
- enum btree_id btree, struct bpos pos)
-{
- return bch2_btree_bit_mod_buffered(trans, btree, pos, false);
-}
-
-int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
- struct bpos, struct bpos);
-
-/*
- * For use when splitting extents in existing snapshots:
- *
- * If @old_pos is an interior snapshot node, iterate over descendant snapshot
- * nodes: for every descendant snapshot in which @old_pos is overwritten and
- * not visible, emit a whiteout at @new_pos.
- */
-static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
- enum btree_id btree,
- struct bpos old_pos,
- struct bpos new_pos)
-{
- if (!btree_type_has_snapshots(btree) ||
- bkey_eq(old_pos, new_pos))
- return 0;
-
- return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos);
-}
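-
-/*
- * Illustrative call site (not from this header; @old_k and @split_pos
- * assumed): when splitting an extent in a snapshotted btree, emit whiteouts
- * so descendant snapshots in which the old extent was overwritten also see
- * the new position as overwritten:
- *
- *	ret = bch2_insert_snapshot_whiteouts(trans, BTREE_ID_extents,
- *					     old_k.k->p, split_pos);
- */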
-
-int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *,
- enum btree_iter_update_trigger_flags,
- struct bkey_s_c, struct bkey_s_c);
-
-int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *,
- enum btree_id, struct bpos);
-
-int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, enum btree_iter_update_trigger_flags);
-
-struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned);
-
-static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans)
-{
- return (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
-}
-
-static inline struct jset_entry *
-bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
-{
- if (!trans->journal_entries ||
- trans->journal_entries_u64s + u64s > trans->journal_entries_size)
- return __bch2_trans_jset_entry_alloc(trans, u64s);
-
- struct jset_entry *e = btree_trans_journal_entries_top(trans);
- trans->journal_entries_u64s += u64s;
- return e;
-}
-
-int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *);
-
-int bch2_btree_write_buffer_insert_err(struct btree_trans *,
- enum btree_id, struct bkey_i *);
-
-static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
- enum btree_id btree,
- struct bkey_i *k)
-{
- kmsan_check_memory(k, bkey_bytes(&k->k));
-
- if (unlikely(!btree_type_uses_write_buffer(btree))) {
- int ret = bch2_btree_write_buffer_insert_err(trans, btree, k);
- dump_stack();
- return ret;
- }
- /*
- * Most updates skip the btree write buffer until journal replay is
- * finished because synchronization with journal replay relies on having
- * a btree node locked - if we're overwriting a key in the journal that
- * journal replay hasn't yet replayed, we have to mark it as
- * overwritten.
- *
- * But accounting updates don't overwrite, they're deltas, and they have
- * to be flushed to the btree strictly in order, so that journal replay
- * can tell which updates need to be applied:
- */
- if (k->k.type != KEY_TYPE_accounting &&
- unlikely(trans->journal_replay_not_finished))
- return bch2_btree_insert_clone_trans(trans, btree, k);
-
- struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s));
- int ret = PTR_ERR_OR_ZERO(e);
- if (ret)
- return ret;
-
- journal_entry_init(e, BCH_JSET_ENTRY_write_buffer_keys, btree, 0, k->k.u64s);
- bkey_copy(e->start, k);
- return 0;
-}
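-
-/*
- * Minimal usage sketch (position assumed; backpointers is one of the write
- * buffer btrees): initialize a key, then queue it through the write buffer
- * from within a transaction:
- *
- *	struct bkey_i_backpointer bp;
- *	bkey_backpointer_init(&bp.k_i);
- *	bp.k.p = pos;
- *	ret = bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp.k_i);
- */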
-
-void bch2_trans_commit_hook(struct btree_trans *,
- struct btree_trans_commit_hook *);
-int __bch2_trans_commit(struct btree_trans *, unsigned);
-
-int bch2_trans_log_msg(struct btree_trans *, struct printbuf *);
-__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
-__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
-
-/**
- * bch2_trans_commit - insert keys at given iterator positions
- * @trans:		btree transaction
- * @disk_res:		disk reservation to charge, may be NULL
- * @journal_seq:	if non-NULL, filled in with the commit's journal sequence number
- * @flags:		enum bch_trans_commit_flags, ORed with a bch_watermark
- *
- * This is the main entry point for btree updates.
- *
- * Return values:
- * -EROFS: filesystem read only
- * -EIO: journal or btree node IO error
- */
-static inline int bch2_trans_commit(struct btree_trans *trans,
- struct disk_reservation *disk_res,
- u64 *journal_seq,
- unsigned flags)
-{
- trans->disk_res = disk_res;
- trans->journal_seq = journal_seq;
-
- return __bch2_trans_commit(trans, flags);
-}
-
-#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \
- lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
- (_journal_seq), (_flags)))
-
-#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \
- nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
- (_journal_seq), (_flags)))
-
-#define bch2_trans_commit_do(_c, _disk_res, _journal_seq, _flags, _do) \
- bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))
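-
-/*
- * Typical pattern (illustrative; @pos assumed): commit_do() wraps @_do plus
- * the commit in lockrestart_do(), so a transaction restart reruns both, and
- * bch2_trans_commit_do() additionally creates and puts the transaction:
- *
- *	ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- *			bch2_btree_bit_mod(trans, BTREE_ID_snapshots, pos, true));
- */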
-
-#define trans_for_each_update(_trans, _i) \
- for (struct btree_insert_entry *_i = (_trans)->updates; \
- (_i) < (_trans)->updates + (_trans)->nr_updates; \
- (_i)++)
-
-static inline void bch2_trans_reset_updates(struct btree_trans *trans)
-{
- trans_for_each_update(trans, i)
- bch2_path_put(trans, i->path, true);
-
- trans->nr_updates = 0;
- trans->journal_entries_u64s = 0;
- trans->hooks = NULL;
- trans->extra_disk_res = 0;
-}
-
-static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k,
- unsigned type, unsigned min_bytes)
-{
- unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k));
- struct bkey_i *mut;
-
- if (type && k.k->type != type)
- return ERR_PTR(-ENOENT);
-
- /* extra padding for varint_decode_fast... */
- mut = bch2_trans_kmalloc_nomemzero(trans, bytes + 8);
- if (!IS_ERR(mut)) {
- bkey_reassemble(mut, k);
-
- if (unlikely(bytes > bkey_bytes(k.k))) {
- memset((void *) mut + bkey_bytes(k.k), 0,
- bytes - bkey_bytes(k.k));
- mut->k.u64s = DIV_ROUND_UP(bytes, sizeof(u64));
- }
- }
- return mut;
-}
-
-static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k)
-{
- return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0);
-}
-
-#define bch2_bkey_make_mut_noupdate_typed(_trans, _k, _type) \
- bkey_i_to_##_type(__bch2_bkey_make_mut_noupdate(_trans, _k, \
- KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
-
-static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c *k,
- enum btree_iter_update_trigger_flags flags,
- unsigned type, unsigned min_bytes)
-{
- struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes);
- int ret;
-
- if (IS_ERR(mut))
- return mut;
-
- ret = bch2_trans_update(trans, iter, mut, flags);
- if (ret)
- return ERR_PTR(ret);
-
- *k = bkey_i_to_s_c(mut);
- return mut;
-}
-
-static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans,
- struct btree_iter *iter, struct bkey_s_c *k,
- enum btree_iter_update_trigger_flags flags)
-{
- return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0);
-}
-
-#define bch2_bkey_make_mut_typed(_trans, _iter, _k, _flags, _type) \
- bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _iter, _k, _flags,\
- KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
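-
-/*
- * Illustrative use of the typed variant (assumes @iter is positioned on a
- * KEY_TYPE_inode_v3 key): on success @k points at the mutable copy, which
- * has already been queued with bch2_trans_update():
- *
- *	struct bkey_i_inode_v3 *inode =
- *		bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_v3);
- *	ret = PTR_ERR_OR_ZERO(inode);
- */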
-
-static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- enum btree_iter_update_trigger_flags flags,
- unsigned type, unsigned min_bytes)
-{
- struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter,
- btree_id, pos, flags|BTREE_ITER_intent, type);
- struct bkey_i *ret = IS_ERR(k.k)
- ? ERR_CAST(k.k)
- : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes);
- if (IS_ERR(ret))
- bch2_trans_iter_exit(trans, iter);
- return ret;
-}
-
-static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- enum btree_iter_update_trigger_flags flags)
-{
- return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0);
-}
-
-static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- enum btree_iter_update_trigger_flags flags,
- unsigned type, unsigned min_bytes)
-{
- struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter,
- btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes);
- int ret;
-
- if (IS_ERR(mut))
- return mut;
-
- ret = bch2_trans_update(trans, iter, mut, flags);
- if (ret) {
- bch2_trans_iter_exit(trans, iter);
- return ERR_PTR(ret);
- }
-
- return mut;
-}
-
-static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- enum btree_iter_update_trigger_flags flags,
- unsigned min_bytes)
-{
- return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes);
-}
-
-static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- enum btree_iter_update_trigger_flags flags)
-{
- return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0);
-}
-
-#define bch2_bkey_get_mut_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\
- bkey_i_to_##_type(__bch2_bkey_get_mut(_trans, _iter, \
- _btree_id, _pos, _flags, \
- KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
-
-static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter,
- enum btree_iter_update_trigger_flags flags,
- unsigned type, unsigned val_size)
-{
- struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size);
- int ret;
-
- if (IS_ERR(k))
- return k;
-
- bkey_init(&k->k);
- k->k.p = iter->pos;
- k->k.type = type;
- set_bkey_val_bytes(&k->k, val_size);
-
- ret = bch2_trans_update(trans, iter, k, flags);
- if (unlikely(ret))
- return ERR_PTR(ret);
- return k;
-}
-
-#define bch2_bkey_alloc(_trans, _iter, _flags, _type) \
- bkey_i_to_##_type(__bch2_bkey_alloc(_trans, _iter, _flags, \
- KEY_TYPE_##_type, sizeof(struct bch_##_type)))
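-
-/*
- * Illustrative: allocate and queue a new key at @iter's position; the _type
- * argument supplies both KEY_TYPE_##_type and the struct bch_##_type value
- * size:
- *
- *	struct bkey_i_subvolume *sv = bch2_bkey_alloc(trans, &iter, 0, subvolume);
- *	ret = PTR_ERR_OR_ZERO(sv);
- */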
-
-#endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
deleted file mode 100644
index 67f1e3202835..000000000000
--- a/fs/bcachefs/btree_update_interior.c
+++ /dev/null
@@ -1,2763 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_gc.h"
-#include "btree_journal_iter.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "buckets.h"
-#include "clock.h"
-#include "error.h"
-#include "extents.h"
-#include "io_write.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "keylist.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-members.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/random.h>
-
-static const char * const bch2_btree_update_modes[] = {
-#define x(t) #t,
- BTREE_UPDATE_MODES()
-#undef x
- NULL
-};
-
-static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
- btree_path_idx_t, struct btree *, struct keylist *);
-static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
-
-/*
- * Verify that child nodes correctly span parent node's range:
- */
-int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
-{
- struct bch_fs *c = trans->c;
- struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2
- ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
- : b->data->min_key;
- struct btree_and_journal_iter iter;
- struct bkey_s_c k;
- struct printbuf buf = PRINTBUF;
- struct bkey_buf prev;
- int ret = 0;
-
- BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
- !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
- b->data->min_key));
-
- bch2_bkey_buf_init(&prev);
- bkey_init(&prev.k->k);
- bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
-
- if (b == btree_node_root(c, b)) {
- if (!bpos_eq(b->data->min_key, POS_MIN)) {
- printbuf_reset(&buf);
- bch2_bpos_to_text(&buf, b->data->min_key);
- log_fsck_err(trans, btree_root_bad_min_key,
- "btree root with incorrect min_key: %s", buf.buf);
- goto topology_repair;
- }
-
- if (!bpos_eq(b->data->max_key, SPOS_MAX)) {
- printbuf_reset(&buf);
- bch2_bpos_to_text(&buf, b->data->max_key);
- log_fsck_err(trans, btree_root_bad_max_key,
- "btree root with incorrect max_key: %s", buf.buf);
- goto topology_repair;
- }
- }
-
- if (!b->c.level)
- goto out;
-
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- if (k.k->type != KEY_TYPE_btree_ptr_v2)
- goto out;
-
- struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
-
- struct bpos expected_min = bkey_deleted(&prev.k->k)
- ? node_min
- : bpos_successor(prev.k->k.p);
-
- if (!bpos_eq(expected_min, bp.v->min_key)) {
- bch2_topology_error(c);
-
- printbuf_reset(&buf);
- prt_str(&buf, "end of prev node doesn't match start of next node\n in ");
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_str(&buf, " node ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- prt_str(&buf, "\n prev ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
- prt_str(&buf, "\n next ");
- bch2_bkey_val_to_text(&buf, c, k);
-
- log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf);
- goto topology_repair;
- }
-
- bch2_bkey_buf_reassemble(&prev, c, k);
- bch2_btree_and_journal_iter_advance(&iter);
- }
-
- if (bkey_deleted(&prev.k->k)) {
- bch2_topology_error(c);
-
- printbuf_reset(&buf);
- prt_str(&buf, "empty interior node\n in ");
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_str(&buf, " node ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
- log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf);
- goto topology_repair;
- } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
- bch2_topology_error(c);
-
- printbuf_reset(&buf);
- prt_str(&buf, "last child node doesn't end at end of parent node\n in ");
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_str(&buf, " node ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- prt_str(&buf, "\n last key ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
-
- log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf);
- goto topology_repair;
- }
-out:
-fsck_err:
- bch2_btree_and_journal_iter_exit(&iter);
- bch2_bkey_buf_exit(&prev, c);
- printbuf_exit(&buf);
- return ret;
-topology_repair:
- ret = bch2_topology_error(c);
- goto out;
-}
-
-/* Calculate ideal packed bkey format for new btree nodes: */
-
-static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
-{
- struct bkey_packed *k;
- struct bkey uk;
-
- for_each_bset(b, t)
- bset_tree_for_each_key(b, t, k)
- if (!bkey_deleted(k)) {
- uk = bkey_unpack_key(b, k);
- bch2_bkey_format_add_key(s, &uk);
- }
-}
-
-static struct bkey_format bch2_btree_calc_format(struct btree *b)
-{
- struct bkey_format_state s;
-
- bch2_bkey_format_init(&s);
- bch2_bkey_format_add_pos(&s, b->data->min_key);
- bch2_bkey_format_add_pos(&s, b->data->max_key);
- __bch2_btree_calc_format(&s, b);
-
- return bch2_bkey_format_done(&s);
-}
-
-static size_t btree_node_u64s_with_format(struct btree_nr_keys nr,
- struct bkey_format *old_f,
- struct bkey_format *new_f)
-{
- /* stupid integer promotion rules */
- ssize_t delta =
- (((int) new_f->key_u64s - old_f->key_u64s) *
- (int) nr.packed_keys) +
- (((int) new_f->key_u64s - BKEY_U64s) *
- (int) nr.unpacked_keys);
-
- BUG_ON(delta + nr.live_u64s < 0);
-
- return nr.live_u64s + delta;
-}
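-
-/*
- * Worked example for the delta above (illustrative numbers): going from
- * key_u64s 3 to 2 with 100 packed and 10 unpacked keys:
- *
- *	delta = (2 - 3) * 100 + (2 - BKEY_U64s) * 10
- *
- * i.e. each packed key shrinks by one u64, and each formerly-unpacked key now
- * costs new_f->key_u64s instead of a full struct bkey.
- */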
-
-/**
- * bch2_btree_node_format_fits - check if we could rewrite node with a new format
- *
- * @c: filesystem handle
- * @b: btree node to rewrite
- * @nr: number of keys for new node (i.e. b->nr)
- * @new_f: bkey format to translate keys to
- *
- * Returns: true if all re-packed keys will be able to fit in a new node.
- *
- * Assumes all keys will successfully pack with the new format.
- */
-static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
- struct btree_nr_keys nr,
- struct bkey_format *new_f)
-{
- size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f);
-
- return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b);
-}
-
-/* Btree node freeing/allocation: */
-
-static void __btree_node_free(struct btree_trans *trans, struct btree *b)
-{
- struct bch_fs *c = trans->c;
-
- trace_and_count(c, btree_node_free, trans, b);
-
- BUG_ON(btree_node_write_blocked(b));
- BUG_ON(btree_node_dirty(b));
- BUG_ON(btree_node_need_write(b));
- BUG_ON(b == btree_node_root(c, b));
- BUG_ON(b->ob.nr);
- BUG_ON(!list_empty(&b->write_blocked));
- BUG_ON(b->will_make_reachable);
-
- clear_btree_node_noevict(b);
-}
-
-static void bch2_btree_node_free_inmem(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
-{
- struct bch_fs *c = trans->c;
-
- bch2_btree_node_lock_write_nofail(trans, path, &b->c);
-
- __btree_node_free(trans, b);
-
- mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
- mutex_unlock(&c->btree_cache.lock);
-
- six_unlock_write(&b->c.lock);
- mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
-
- bch2_trans_node_drop(trans, b);
-}
-
-static void bch2_btree_node_free_never_used(struct btree_update *as,
- struct btree_trans *trans,
- struct btree *b)
-{
- struct bch_fs *c = as->c;
- struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
-
- BUG_ON(!list_empty(&b->write_blocked));
- BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
-
- b->will_make_reachable = 0;
- closure_put(&as->cl);
-
- clear_btree_node_will_make_reachable(b);
- clear_btree_node_accessed(b);
- clear_btree_node_dirty_acct(c, b);
- clear_btree_node_need_write(b);
-
- mutex_lock(&c->btree_cache.lock);
- __bch2_btree_node_hash_remove(&c->btree_cache, b);
- mutex_unlock(&c->btree_cache.lock);
-
- BUG_ON(p->nr >= ARRAY_SIZE(p->b));
- p->b[p->nr++] = b;
-
- six_unlock_intent(&b->c.lock);
-
- bch2_trans_node_drop(trans, b);
-}
-
-static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
- struct disk_reservation *res,
- struct closure *cl,
- bool interior_node,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct write_point *wp;
- struct btree *b;
- BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
- struct open_buckets obs = { .nr = 0 };
- struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
- enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim
- ? BTREE_NODE_RESERVE
- : 0;
- int ret;
-
- b = bch2_btree_node_mem_alloc(trans, interior_node);
- if (IS_ERR(b))
- return b;
-
- BUG_ON(b->ob.nr);
-
- mutex_lock(&c->btree_reserve_cache_lock);
- if (c->btree_reserve_cache_nr > nr_reserve) {
- struct btree_alloc *a =
- &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
-
- obs = a->ob;
- bkey_copy(&tmp.k, &a->k);
- mutex_unlock(&c->btree_reserve_cache_lock);
- goto out;
- }
- mutex_unlock(&c->btree_reserve_cache_lock);
-retry:
- ret = bch2_alloc_sectors_start_trans(trans,
- c->opts.metadata_target ?:
- c->opts.foreground_target,
- 0,
- writepoint_ptr(&c->btree_write_point),
- &devs_have,
- res->nr_replicas,
- min(res->nr_replicas,
- c->opts.metadata_replicas_required),
- watermark, 0, cl, &wp);
- if (unlikely(ret))
- goto err;
-
- if (wp->sectors_free < btree_sectors(c)) {
- struct open_bucket *ob;
- unsigned i;
-
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- if (ob->sectors_free < btree_sectors(c))
- ob->sectors_free = 0;
-
- bch2_alloc_sectors_done(c, wp);
- goto retry;
- }
-
- bkey_btree_ptr_v2_init(&tmp.k);
- bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);
-
- bch2_open_bucket_get(c, wp, &obs);
- bch2_alloc_sectors_done(c, wp);
-out:
- bkey_copy(&b->key, &tmp.k);
- b->ob = obs;
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
-
- return b;
-err:
- bch2_btree_node_to_freelist(c, b);
- return ERR_PTR(ret);
-}
-
-static struct btree *bch2_btree_node_alloc(struct btree_update *as,
- struct btree_trans *trans,
- unsigned level)
-{
- struct bch_fs *c = as->c;
- struct btree *b;
- struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
- int ret;
-
- BUG_ON(level >= BTREE_MAX_DEPTH);
- BUG_ON(!p->nr);
-
- b = p->b[--p->nr];
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
-
- set_btree_node_accessed(b);
- set_btree_node_dirty_acct(c, b);
- set_btree_node_need_write(b);
-
- bch2_bset_init_first(b, &b->data->keys);
- b->c.level = level;
- b->c.btree_id = as->btree_id;
- b->version_ondisk = c->sb.version;
-
- memset(&b->nr, 0, sizeof(b->nr));
- b->data->magic = cpu_to_le64(bset_magic(c));
- memset(&b->data->_ptr, 0, sizeof(b->data->_ptr));
- b->data->flags = 0;
- SET_BTREE_NODE_ID(b->data, as->btree_id);
- SET_BTREE_NODE_LEVEL(b->data, level);
-
- if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
- struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
-
- bp->v.mem_ptr = 0;
- bp->v.seq = b->data->keys.seq;
- bp->v.sectors_written = 0;
- }
-
- SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
-
- bch2_btree_build_aux_trees(b);
-
- ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
- BUG_ON(ret);
-
- trace_and_count(c, btree_node_alloc, trans, b);
- bch2_increment_clock(c, btree_sectors(c), WRITE);
- return b;
-}
-
-static void btree_set_min(struct btree *b, struct bpos pos)
-{
- if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
- bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
- b->data->min_key = pos;
-}
-
-static void btree_set_max(struct btree *b, struct bpos pos)
-{
- b->key.k.p = pos;
- b->data->max_key = pos;
-}
-
-static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
- struct btree_trans *trans,
- struct btree *b)
-{
- struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level);
- struct bkey_format format = bch2_btree_calc_format(b);
-
- /*
- * The keys might expand with the new format - if they wouldn't fit in
- * the btree node anymore, use the old format for now:
- */
- if (!bch2_btree_node_format_fits(as->c, b, b->nr, &format))
- format = b->format;
-
- SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
-
- btree_set_min(n, b->data->min_key);
- btree_set_max(n, b->data->max_key);
-
- n->data->format = format;
- btree_node_set_format(n, format);
-
- bch2_btree_sort_into(as->c, n, b);
-
- btree_node_reset_sib_u64s(n);
- return n;
-}
-
-static struct btree *__btree_root_alloc(struct btree_update *as,
- struct btree_trans *trans, unsigned level)
-{
- struct btree *b = bch2_btree_node_alloc(as, trans, level);
-
- btree_set_min(b, POS_MIN);
- btree_set_max(b, SPOS_MAX);
- b->data->format = bch2_btree_calc_format(b);
-
- btree_node_set_format(b, b->data->format);
- bch2_btree_build_aux_trees(b);
-
- return b;
-}
-
-static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans)
-{
- struct bch_fs *c = as->c;
- struct prealloc_nodes *p;
-
- for (p = as->prealloc_nodes;
- p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
- p++) {
- while (p->nr) {
- struct btree *b = p->b[--p->nr];
-
- mutex_lock(&c->btree_reserve_cache_lock);
-
- if (c->btree_reserve_cache_nr <
- ARRAY_SIZE(c->btree_reserve_cache)) {
- struct btree_alloc *a =
- &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
-
- a->ob = b->ob;
- b->ob.nr = 0;
- bkey_copy(&a->k, &b->key);
- } else {
- bch2_open_buckets_put(c, &b->ob);
- }
-
- mutex_unlock(&c->btree_reserve_cache_lock);
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
- __btree_node_free(trans, b);
- bch2_btree_node_to_freelist(c, b);
- }
- }
-}
-
-static int bch2_btree_reserve_get(struct btree_trans *trans,
- struct btree_update *as,
- unsigned nr_nodes[2],
- unsigned flags,
- struct closure *cl)
-{
- struct btree *b;
- unsigned interior;
- int ret = 0;
-
- BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);
-
- /*
- * Protects against reaping from the btree node cache, and guards use
- * of the btree node open bucket reserve:
- */
- ret = bch2_btree_cache_cannibalize_lock(trans, cl);
- if (ret)
- return ret;
-
- for (interior = 0; interior < 2; interior++) {
- struct prealloc_nodes *p = as->prealloc_nodes + interior;
-
- while (p->nr < nr_nodes[interior]) {
- b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
- interior, flags);
- if (IS_ERR(b)) {
- ret = PTR_ERR(b);
- goto err;
- }
-
- p->b[p->nr++] = b;
- }
- }
-err:
- bch2_btree_cache_cannibalize_unlock(trans);
- return ret;
-}
-
-/* Asynchronous interior node update machinery */
-
-static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans)
-{
- struct bch_fs *c = as->c;
-
- if (as->took_gc_lock)
- up_read(&c->gc_lock);
- as->took_gc_lock = false;
-
- bch2_journal_pin_drop(&c->journal, &as->journal);
- bch2_journal_pin_flush(&c->journal, &as->journal);
- bch2_disk_reservation_put(c, &as->disk_res);
- bch2_btree_reserve_put(as, trans);
-
- bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
- as->start_time);
-
- mutex_lock(&c->btree_interior_update_lock);
- list_del(&as->unwritten_list);
- list_del(&as->list);
-
- closure_debug_destroy(&as->cl);
- mempool_free(as, &c->btree_interior_update_pool);
-
- /*
- * Have to do the wakeup with btree_interior_update_lock still held,
- * since being on btree_interior_update_list is our ref on @c:
- */
- closure_wake_up(&c->btree_interior_update_wait);
-
- mutex_unlock(&c->btree_interior_update_lock);
-}
-
-static void btree_update_add_key(struct btree_update *as,
- struct keylist *keys, struct btree *b)
-{
- struct bkey_i *k = &b->key;
-
- BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s >
- ARRAY_SIZE(as->_old_keys));
-
- bkey_copy(keys->top, k);
- bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1;
-
- bch2_keylist_push(keys);
-}
-
-static bool btree_update_new_nodes_marked_sb(struct btree_update *as)
-{
- for_each_keylist_key(&as->new_keys, k)
- if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k)))
- return false;
- return true;
-}
-
-static void btree_update_new_nodes_mark_sb(struct btree_update *as)
-{
- struct bch_fs *c = as->c;
-
- mutex_lock(&c->sb_lock);
- for_each_keylist_key(&as->new_keys, k)
- bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k));
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-}
-
-/*
- * The transactional part of an interior btree node update, where we journal the
- * update we did to the interior node and update alloc info:
- */
-static int btree_update_nodes_written_trans(struct btree_trans *trans,
- struct btree_update *as)
-{
- struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s);
- int ret = PTR_ERR_OR_ZERO(e);
- if (ret)
- return ret;
-
- memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64));
-
- trans->journal_pin = &as->journal;
-
- for_each_keylist_key(&as->old_keys, k) {
- unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
-
- ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k),
- BTREE_TRIGGER_transactional);
- if (ret)
- return ret;
- }
-
- for_each_keylist_key(&as->new_keys, k) {
- unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
-
- ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k),
- BTREE_TRIGGER_transactional);
- if (ret)
- return ret;
- }
-
- return 0;
-}
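-
-/*
- * Note on v.mem_ptr above: btree_update_add_key() stashed b->c.level + 1 in
- * it, i.e. the level of the interior node the pointer key lives in, which is
- * the level the triggers here must run at.
- */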
-
-/* If the node has been reused, we might be reading uninitialized memory - that's fine: */
-static noinline __no_kmsan_checks bool btree_node_seq_matches(struct btree *b, __le64 seq)
-{
- struct btree_node *b_data = READ_ONCE(b->data);
-
- return (b_data ? b_data->keys.seq : 0) == seq;
-}
-
-static void btree_update_nodes_written(struct btree_update *as)
-{
- struct bch_fs *c = as->c;
- struct btree *b;
- struct btree_trans *trans = bch2_trans_get(c);
- u64 journal_seq = 0;
- unsigned i;
- int ret;
-
- /*
- * If we're already in an error state, it might be because a btree node
- * was never written, and we might be trying to free that same btree
- * node here, but it won't have been marked as allocated and we'll see
- * spurious disk usage inconsistencies in the transactional part below
- * if we don't skip it:
- */
- ret = bch2_journal_error(&c->journal);
- if (ret)
- goto err;
-
- if (!btree_update_new_nodes_marked_sb(as))
- btree_update_new_nodes_mark_sb(as);
-
- /*
- * Wait for any in flight writes to finish before we free the old nodes
- * on disk:
- */
- for (i = 0; i < as->nr_old_nodes; i++) {
- b = as->old_nodes[i];
-
- if (btree_node_seq_matches(b, as->old_nodes_seq[i]))
- wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
- TASK_UNINTERRUPTIBLE);
- }
-
- /*
- * We did an update to a parent node where the pointers we added pointed
- * to child nodes that weren't written yet: now, the child nodes have
- * been written so we can write out the update to the interior node.
- */
-
- /*
- * We can't call into journal reclaim here: we'd block on the journal
- * reclaim lock, but we may need to release the open buckets we have
- * pinned in order for other btree updates to make forward progress, and
- * journal reclaim does btree updates when flushing bkey_cached entries,
- * which may require allocations as well.
- */
- ret = commit_do(trans, &as->disk_res, &journal_seq,
- BCH_WATERMARK_interior_updates|
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_journal_reclaim,
- btree_update_nodes_written_trans(trans, as));
- bch2_trans_unlock(trans);
-
- bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
- "%s", bch2_err_str(ret));
-err:
- /*
- * Ensure transaction is unlocked before using btree_node_lock_nopath()
- * (the use of which is always suspect, we need to work on removing this
- * in the future)
- *
- * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
- * calls bch2_path_upgrade(), before we call path_make_mut(), so we may
- * rarely end up with a locked path besides the one we have here:
- */
- bch2_trans_unlock(trans);
- bch2_trans_begin(trans);
-
- /*
- * We have to be careful because another thread might be getting ready
- * to free as->b and calling btree_update_reparent() on us - we'll
- * recheck under btree_update_lock below:
- */
- b = READ_ONCE(as->b);
- if (b) {
- /*
- * @b is the node we did the final insert into:
- *
- * On failure to get a journal reservation, we still have to
- * unblock the write and allow most of the write path to happen
- * so that shutdown works, but the i->journal_seq mechanism
- * won't work to prevent the btree write from being visible (we
- * didn't get a journal sequence number) - instead
- * __bch2_btree_node_write() doesn't do the actual write if
- * we're in journal error state:
- */
-
- btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans,
- as->btree_id, b->c.level, b->key.k.p);
- struct btree_path *path = trans->paths + path_idx;
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
- path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
- path->l[b->c.level].b = b;
-
- bch2_btree_node_lock_write_nofail(trans, path, &b->c);
-
- mutex_lock(&c->btree_interior_update_lock);
-
- list_del(&as->write_blocked_list);
- if (list_empty(&b->write_blocked))
- clear_btree_node_write_blocked(b);
-
- /*
- * Node might have been freed, recheck under
- * btree_interior_update_lock:
- */
- if (as->b == b) {
- BUG_ON(!b->c.level);
- BUG_ON(!btree_node_dirty(b));
-
- if (!ret) {
- struct bset *last = btree_bset_last(b);
-
- last->journal_seq = cpu_to_le64(
- max(journal_seq,
- le64_to_cpu(last->journal_seq)));
-
- bch2_btree_add_journal_pin(c, b, journal_seq);
- } else {
- /*
- * If we didn't get a journal sequence number we
- * can't write this btree node, because recovery
- * won't know to ignore this write:
- */
- set_btree_node_never_write(b);
- }
- }
-
- mutex_unlock(&c->btree_interior_update_lock);
-
- mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
- six_unlock_write(&b->c.lock);
-
- btree_node_write_if_need(trans, b, SIX_LOCK_intent);
- btree_node_unlock(trans, path, b->c.level);
- bch2_path_put(trans, path_idx, true);
- }
-
- bch2_journal_pin_drop(&c->journal, &as->journal);
-
- mutex_lock(&c->btree_interior_update_lock);
- for (i = 0; i < as->nr_new_nodes; i++) {
- b = as->new_nodes[i];
-
- BUG_ON(b->will_make_reachable != (unsigned long) as);
- b->will_make_reachable = 0;
- clear_btree_node_will_make_reachable(b);
- }
- mutex_unlock(&c->btree_interior_update_lock);
-
- for (i = 0; i < as->nr_new_nodes; i++) {
- b = as->new_nodes[i];
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- btree_node_write_if_need(trans, b, SIX_LOCK_read);
- six_unlock_read(&b->c.lock);
- }
-
- for (i = 0; i < as->nr_open_buckets; i++)
- bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
-
- bch2_btree_update_free(as, trans);
- bch2_trans_put(trans);
-}
-
-static void btree_interior_update_work(struct work_struct *work)
-{
- struct bch_fs *c =
- container_of(work, struct bch_fs, btree_interior_update_work);
- struct btree_update *as;
-
- while (1) {
- mutex_lock(&c->btree_interior_update_lock);
- as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
- struct btree_update, unwritten_list);
- if (as && !as->nodes_written)
- as = NULL;
- mutex_unlock(&c->btree_interior_update_lock);
-
- if (!as)
- break;
-
- btree_update_nodes_written(as);
- }
-}
-
-static CLOSURE_CALLBACK(btree_update_set_nodes_written)
-{
- closure_type(as, struct btree_update, cl);
- struct bch_fs *c = as->c;
-
- mutex_lock(&c->btree_interior_update_lock);
- as->nodes_written = true;
- mutex_unlock(&c->btree_interior_update_lock);
-
- queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
-}
-
-/*
- * We're updating @b with pointers to nodes that haven't finished writing yet:
- * block @b from being written until @as completes
- */
-static void btree_update_updated_node(struct btree_update *as, struct btree *b)
-{
- struct bch_fs *c = as->c;
-
- BUG_ON(as->mode != BTREE_UPDATE_none);
- BUG_ON(as->update_level_end < b->c.level);
- BUG_ON(!btree_node_dirty(b));
- BUG_ON(!b->c.level);
-
- mutex_lock(&c->btree_interior_update_lock);
- list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
-
- as->mode = BTREE_UPDATE_node;
- as->b = b;
- as->update_level_end = b->c.level;
-
- set_btree_node_write_blocked(b);
- list_add(&as->write_blocked_list, &b->write_blocked);
-
- mutex_unlock(&c->btree_interior_update_lock);
-}
-
-static int bch2_update_reparent_journal_pin_flush(struct journal *j,
- struct journal_entry_pin *_pin, u64 seq)
-{
- return 0;
-}
-
-static void btree_update_reparent(struct btree_update *as,
- struct btree_update *child)
-{
- struct bch_fs *c = as->c;
-
- lockdep_assert_held(&c->btree_interior_update_lock);
-
- child->b = NULL;
- child->mode = BTREE_UPDATE_update;
-
- bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
- bch2_update_reparent_journal_pin_flush);
-}
-
-static void btree_update_updated_root(struct btree_update *as, struct btree *b)
-{
- struct bkey_i *insert = &b->key;
- struct bch_fs *c = as->c;
-
- BUG_ON(as->mode != BTREE_UPDATE_none);
-
- BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
- ARRAY_SIZE(as->journal_entries));
-
- as->journal_u64s +=
- journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
- BCH_JSET_ENTRY_btree_root,
- b->c.btree_id, b->c.level,
- insert, insert->k.u64s);
-
- mutex_lock(&c->btree_interior_update_lock);
- list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
-
- as->mode = BTREE_UPDATE_root;
- mutex_unlock(&c->btree_interior_update_lock);
-}
-
-/*
- * bch2_btree_update_add_new_node:
- *
- * This causes @as to wait on @b to be written, before it gets to
- * btree_update_nodes_written()
- *
- * Additionally, it sets b->will_make_reachable to prevent any additional writes
- * to @b from happening besides the first until @b is reachable on disk
- *
- * And it adds @b to the list of @as's new nodes, so that we can update sector
- * counts in btree_update_nodes_written():
- */
-static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
-{
- struct bch_fs *c = as->c;
-
- closure_get(&as->cl);
-
- mutex_lock(&c->btree_interior_update_lock);
- BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
- BUG_ON(b->will_make_reachable);
-
- as->new_nodes[as->nr_new_nodes++] = b;
- b->will_make_reachable = 1UL|(unsigned long) as;
- set_btree_node_will_make_reachable(b);
-
- mutex_unlock(&c->btree_interior_update_lock);
-
- btree_update_add_key(as, &as->new_keys, b);
-
- if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
- unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data;
- unsigned sectors = round_up(bytes, block_bytes(c)) >> 9;
-
- bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
- cpu_to_le16(sectors);
- }
-}
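-
-/*
- * Note on the encoding used above: will_make_reachable is a tagged pointer,
- *
- *	b->will_make_reachable = 1UL | (unsigned long) as;
- *
- * where the low bit records that @b holds a ref on as->cl;
- * btree_update_drop_new_node() below recovers the pointer with (v & ~1UL) and
- * only drops the ref if (v & 1) was still set.
- */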
-
-/*
- * Called when @b is being freed without ever having been written: remove it
- * from the btree_update (if any) that was going to make it reachable
- */
-static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
-{
- struct btree_update *as;
- unsigned long v;
- unsigned i;
-
- mutex_lock(&c->btree_interior_update_lock);
- /*
- * When b->will_make_reachable != 0, it owns a ref on as->cl that's
- * dropped when it gets written by bch2_btree_complete_write - the
- * xchg() is for synchronization with bch2_btree_complete_write:
- */
- v = xchg(&b->will_make_reachable, 0);
- clear_btree_node_will_make_reachable(b);
- as = (struct btree_update *) (v & ~1UL);
-
- if (!as) {
- mutex_unlock(&c->btree_interior_update_lock);
- return;
- }
-
- for (i = 0; i < as->nr_new_nodes; i++)
- if (as->new_nodes[i] == b)
- goto found;
-
- BUG();
-found:
- array_remove_item(as->new_nodes, as->nr_new_nodes, i);
- mutex_unlock(&c->btree_interior_update_lock);
-
- if (v & 1)
- closure_put(&as->cl);
-}
-
-static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
-{
- while (b->ob.nr)
- as->open_buckets[as->nr_open_buckets++] =
- b->ob.v[--b->ob.nr];
-}
-
-static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j,
- struct journal_entry_pin *_pin, u64 seq)
-{
- return 0;
-}
-
-/*
- * @b is being split/rewritten: it may have pointers to not-yet-written btree
- * nodes and thus outstanding btree_updates - redirect @b's
- * btree_updates to point to this btree_update:
- */
-static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
- struct btree *b)
-{
- struct bch_fs *c = as->c;
- struct btree_update *p, *n;
- struct btree_write *w;
-
- set_btree_node_dying(b);
-
- if (btree_node_fake(b))
- return;
-
- mutex_lock(&c->btree_interior_update_lock);
-
- /*
- * Does this node have any btree_update operations preventing
- * it from being written?
- *
- * If so, redirect them to point to this btree_update: we can
- * write out our new nodes, but we won't make them visible until those
- * operations complete
- */
- list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
- list_del_init(&p->write_blocked_list);
- btree_update_reparent(as, p);
-
- /*
- * for flush_held_btree_writes() waiting on updates to flush or
- * nodes to be writeable:
- */
- closure_wake_up(&c->btree_interior_update_wait);
- }
-
- clear_btree_node_dirty_acct(c, b);
- clear_btree_node_need_write(b);
- clear_btree_node_write_blocked(b);
-
- /*
- * Does this node have unwritten data that has a pin on the journal?
- *
- * If so, transfer that pin to the btree_update operation -
- * note that if we're freeing multiple nodes, we only need to keep the
- * oldest pin of any of the nodes we're freeing. We'll release the pin
- * when the new nodes are persistent and reachable on disk:
- */
- w = btree_current_write(b);
- bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
- bch2_btree_update_will_free_node_journal_pin_flush);
- bch2_journal_pin_drop(&c->journal, &w->journal);
-
- w = btree_prev_write(b);
- bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
- bch2_btree_update_will_free_node_journal_pin_flush);
- bch2_journal_pin_drop(&c->journal, &w->journal);
-
- mutex_unlock(&c->btree_interior_update_lock);
-
- /*
- * Is this a node that isn't reachable on disk yet?
- *
- * Nodes that aren't reachable yet have writes blocked until they're
- * reachable - now that we've cancelled any pending writes and moved
- * things waiting on that write to wait on this update, we can drop this
- * node from the list of nodes that the other update is making
- * reachable, prior to freeing it:
- */
- btree_update_drop_new_node(c, b);
-
- btree_update_add_key(as, &as->old_keys, b);
-
- as->old_nodes[as->nr_old_nodes] = b;
- as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
- as->nr_old_nodes++;
-}
-
-static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans)
-{
- struct bch_fs *c = as->c;
- u64 start_time = as->start_time;
-
- BUG_ON(as->mode == BTREE_UPDATE_none);
-
- if (as->took_gc_lock)
- up_read(&as->c->gc_lock);
- as->took_gc_lock = false;
-
- bch2_btree_reserve_put(as, trans);
-
- continue_at(&as->cl, btree_update_set_nodes_written,
- as->c->btree_interior_update_worker);
-
- bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
- start_time);
-}
-
-static struct btree_update *
-bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
- unsigned level_start, bool split, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree_update *as;
- u64 start_time = local_clock();
- int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
- ? BCH_DISK_RESERVATION_NOFAIL : 0;
- unsigned nr_nodes[2] = { 0, 0 };
- unsigned level_end = level_start;
- enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- int ret = 0;
- u32 restart_count = trans->restart_count;
-
- BUG_ON(!path->should_be_locked);
-
- if (watermark == BCH_WATERMARK_copygc)
- watermark = BCH_WATERMARK_btree_copygc;
- if (watermark < BCH_WATERMARK_btree)
- watermark = BCH_WATERMARK_btree;
-
- flags &= ~BCH_WATERMARK_MASK;
- flags |= watermark;
-
- if (watermark < BCH_WATERMARK_reclaim &&
- test_bit(JOURNAL_space_low, &c->journal.flags)) {
- if (flags & BCH_TRANS_COMMIT_journal_reclaim)
- return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock);
-
- ret = drop_locks_do(trans,
- ({ wait_event(c->journal.wait, !test_bit(JOURNAL_space_low, &c->journal.flags)); 0; }));
- if (ret)
- return ERR_PTR(ret);
- }
-
- while (1) {
- nr_nodes[!!level_end] += 1 + split;
- level_end++;
-
- ret = bch2_btree_path_upgrade(trans, path, level_end + 1);
- if (ret)
- return ERR_PTR(ret);
-
- if (!btree_path_node(path, level_end)) {
- /* Allocating new root? */
- nr_nodes[1] += split;
- level_end = BTREE_MAX_DEPTH;
- break;
- }
-
- /*
- * Always check for space for two keys, even if we won't have to
- * split at prior level - it might have been a merge instead:
- */
- if (bch2_btree_node_insert_fits(path->l[level_end].b,
- BKEY_BTREE_PTR_U64s_MAX * 2))
- break;
-
- split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
- }
-
- if (!down_read_trylock(&c->gc_lock)) {
- ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
- if (ret) {
- up_read(&c->gc_lock);
- return ERR_PTR(ret);
- }
- }
-
- as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
- memset(as, 0, sizeof(*as));
- closure_init(&as->cl, NULL);
- as->c = c;
- as->start_time = start_time;
- as->ip_started = _RET_IP_;
- as->mode = BTREE_UPDATE_none;
- as->flags = flags;
- as->took_gc_lock = true;
- as->btree_id = path->btree_id;
- as->update_level_start = level_start;
- as->update_level_end = level_end;
- INIT_LIST_HEAD(&as->list);
- INIT_LIST_HEAD(&as->unwritten_list);
- INIT_LIST_HEAD(&as->write_blocked_list);
- bch2_keylist_init(&as->old_keys, as->_old_keys);
- bch2_keylist_init(&as->new_keys, as->_new_keys);
- bch2_keylist_init(&as->parent_keys, as->inline_keys);
-
- mutex_lock(&c->btree_interior_update_lock);
- list_add_tail(&as->list, &c->btree_interior_update_list);
- mutex_unlock(&c->btree_interior_update_lock);
-
- /*
- * We don't want to allocate if we're in an error state, that can cause
- * deadlock on emergency shutdown due to open buckets getting stuck in
- * the btree_reserve_cache after allocator shutdown has cleared it out.
- * This check needs to come after adding us to the btree_interior_update
- * list but before calling bch2_btree_reserve_get, to synchronize with
- * __bch2_fs_read_only().
- */
- ret = bch2_journal_error(&c->journal);
- if (ret)
- goto err;
-
- ret = bch2_disk_reservation_get(c, &as->disk_res,
- (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
- c->opts.metadata_replicas,
- disk_res_flags);
- if (ret)
- goto err;
-
- ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
- if (bch2_err_matches(ret, ENOSPC) ||
- bch2_err_matches(ret, ENOMEM)) {
- struct closure cl;
-
- /*
- * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
- * flag
- */
- if (bch2_err_matches(ret, ENOSPC) &&
- (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- watermark < BCH_WATERMARK_reclaim) {
- ret = -BCH_ERR_journal_reclaim_would_deadlock;
- goto err;
- }
-
- closure_init_stack(&cl);
-
- do {
- ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
-
- bch2_trans_unlock(trans);
- bch2_wait_on_allocator(c, &cl);
- } while (bch2_err_matches(ret, BCH_ERR_operation_blocked));
- }
-
- if (ret) {
- trace_and_count(c, btree_reserve_get_fail, trans->fn,
- _RET_IP_, nr_nodes[0] + nr_nodes[1], ret);
- goto err;
- }
-
- ret = bch2_trans_relock(trans);
- if (ret)
- goto err;
-
- bch2_trans_verify_not_restarted(trans, restart_count);
- return as;
-err:
- bch2_btree_update_free(as, trans);
- if (!bch2_err_matches(ret, ENOSPC) &&
- !bch2_err_matches(ret, EROFS) &&
- ret != -BCH_ERR_journal_reclaim_would_deadlock)
- bch_err_fn_ratelimited(c, ret);
- return ERR_PTR(ret);
-}
-
-/* Btree root updates: */
-
-static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
-{
- /* Root nodes cannot be reaped */
- mutex_lock(&c->btree_cache.lock);
- list_del_init(&b->list);
- mutex_unlock(&c->btree_cache.lock);
-
- mutex_lock(&c->btree_root_lock);
- bch2_btree_id_root(c, b->c.btree_id)->b = b;
- mutex_unlock(&c->btree_root_lock);
-
- bch2_recalc_btree_reserve(c);
-}
-
-static int bch2_btree_set_root(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- bool nofail)
-{
- struct bch_fs *c = as->c;
-
- trace_and_count(c, btree_node_set_root, trans, b);
-
- struct btree *old = btree_node_root(c, b);
-
- /*
- * Ensure no one is using the old root while we switch to the
- * new root:
- */
- if (nofail) {
- bch2_btree_node_lock_write_nofail(trans, path, &old->c);
- } else {
- int ret = bch2_btree_node_lock_write(trans, path, &old->c);
- if (ret)
- return ret;
- }
-
- bch2_btree_set_root_inmem(c, b);
-
- btree_update_updated_root(as, b);
-
- /*
- * Unlock old root after new root is visible:
- *
- * The new root isn't persistent, but that's ok: we still have
- * an intent lock on the new root, and any updates that would
- * depend on the new root would have to update the new root.
- */
- bch2_btree_node_unlock_write(trans, path, old);
- return 0;
-}
-
-/* Interior node updates: */
-
-static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct btree_node_iter *node_iter,
- struct bkey_i *insert)
-{
- struct bch_fs *c = as->c;
- struct bkey_packed *k;
- struct printbuf buf = PRINTBUF;
- unsigned long old, new;
-
- BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
- !btree_ptr_sectors_written(bkey_i_to_s_c(insert)));
-
- if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)))
- bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
-
- struct bkey_validate_context from = (struct bkey_validate_context) {
- .from = BKEY_VALIDATE_btree_node,
- .level = b->c.level,
- .btree = b->c.btree_id,
- .flags = BCH_VALIDATE_commit,
- };
- if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), from) ?:
- bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), from)) {
- bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__);
- dump_stack();
- }
-
- BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
- ARRAY_SIZE(as->journal_entries));
-
- as->journal_u64s +=
- journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
- BCH_JSET_ENTRY_btree_keys,
- b->c.btree_id, b->c.level,
- insert, insert->k.u64s);
-
- while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
- bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
- bch2_btree_node_iter_advance(node_iter, b);
-
- bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
- set_btree_node_dirty_acct(c, b);
-
- old = READ_ONCE(b->flags);
- do {
- new = old;
-
- new &= ~BTREE_WRITE_TYPE_MASK;
- new |= BTREE_WRITE_interior;
- new |= 1 << BTREE_NODE_need_write;
- } while (!try_cmpxchg(&b->flags, &old, new));
-
- printbuf_exit(&buf);
-}
-
-static void
-bch2_btree_insert_keys_interior(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct btree_node_iter node_iter,
- struct keylist *keys)
-{
- struct bkey_i *insert = bch2_keylist_front(keys);
- struct bkey_packed *k;
-
- BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);
-
- while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
- (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
- ;
-
- for (;
- insert != keys->top && bpos_le(insert->k.p, b->key.k.p);
- insert = bkey_next(insert))
- bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
-
- if (bch2_btree_node_check_topology(trans, b)) {
- struct printbuf buf = PRINTBUF;
-
- for (struct bkey_i *k = keys->keys;
- k != insert;
- k = bkey_next(k)) {
- bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k));
- prt_newline(&buf);
- }
-
- panic("%s(): check_topology error: inserted keys\n%s", __func__, buf.buf);
- }
-
- memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data);
- keys->top_p -= insert->_data - keys->keys_p;
-}
-
-static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos)
-{
- if (insert_keys)
- for_each_keylist_key(insert_keys, k)
- if (bkey_deleted(&k->k) && bpos_eq(k->k.p, pos))
- return true;
- return false;
-}
-
-/*
- * Distribute @b's live keys between the two new nodes: n[0] takes the lower
- * part of the key space, n[1] the higher
- */
-static void __btree_split_node(struct btree_update *as,
- struct btree_trans *trans,
- struct btree *b,
- struct btree *n[2],
- struct keylist *insert_keys)
-{
- struct bkey_packed *k;
- struct bpos n1_pos = POS_MIN;
- struct btree_node_iter iter;
- struct bset *bsets[2];
- struct bkey_format_state format[2];
- struct bkey_packed *out[2];
- struct bkey uk;
- unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5;
- struct { unsigned nr_keys, val_u64s; } nr_keys[2];
- int i;
-
- memset(&nr_keys, 0, sizeof(nr_keys));
-
- for (i = 0; i < 2; i++) {
- BUG_ON(n[i]->nsets != 1);
-
- bsets[i] = btree_bset_first(n[i]);
- out[i] = bsets[i]->start;
-
- SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1);
- bch2_bkey_format_init(&format[i]);
- }
-
- u64s = 0;
- for_each_btree_node_key(b, k, &iter) {
- if (bkey_deleted(k))
- continue;
-
- uk = bkey_unpack_key(b, k);
-
- if (b->c.level &&
- u64s < n1_u64s &&
- u64s + k->u64s >= n1_u64s &&
- (bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p) ||
- key_deleted_in_insert(insert_keys, uk.p)))
- n1_u64s += k->u64s;
-
- i = u64s >= n1_u64s;
- u64s += k->u64s;
- if (!i)
- n1_pos = uk.p;
- bch2_bkey_format_add_key(&format[i], &uk);
-
- nr_keys[i].nr_keys++;
- nr_keys[i].val_u64s += bkeyp_val_u64s(&b->format, k);
- }
-
- btree_set_min(n[0], b->data->min_key);
- btree_set_max(n[0], n1_pos);
- btree_set_min(n[1], bpos_successor(n1_pos));
- btree_set_max(n[1], b->data->max_key);
-
- for (i = 0; i < 2; i++) {
- bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key);
- bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key);
-
- n[i]->data->format = bch2_bkey_format_done(&format[i]);
-
- unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s +
- nr_keys[i].val_u64s;
- if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b))
- n[i]->data->format = b->format;
-
- btree_node_set_format(n[i], n[i]->data->format);
- }
-
- u64s = 0;
- for_each_btree_node_key(b, k, &iter) {
- if (bkey_deleted(k))
- continue;
-
- i = u64s >= n1_u64s;
- u64s += k->u64s;
-
- if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k)
- ? &b->format : &bch2_bkey_format_current, k))
- out[i]->format = KEY_FORMAT_LOCAL_BTREE;
- else
- bch2_bkey_unpack(b, (void *) out[i], k);
-
- out[i]->needs_whiteout = false;
-
- btree_keys_account_key_add(&n[i]->nr, 0, out[i]);
- out[i] = bkey_p_next(out[i]);
- }
-
- for (i = 0; i < 2; i++) {
- bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data);
-
- BUG_ON(!bsets[i]->u64s);
-
- set_btree_bset_end(n[i], n[i]->set);
-
- btree_node_reset_sib_u64s(n[i]);
-
- bch2_verify_btree_nr_keys(n[i]);
-
- BUG_ON(bch2_btree_node_check_topology(trans, n[i]));
- }
-}
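-
-/*
- * Worked example for the split point chosen above: with live_u64s == 1000,
- * n1_u64s starts at 1000 * 3 / 5 == 600, so roughly the first 600 u64s' worth
- * of keys go to n[0] and the remainder to n[1]; the loop then nudges the
- * boundary when the key that straddles it is already deleted in the journal
- * or in @insert_keys.
- */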
-
-/*
- * For updates to interior nodes, we've got to do the insert before we split
- * because the stuff we're inserting has to be inserted atomically. Post split,
- * the keys might have to go in different nodes and the split would no longer be
- * atomic.
- *
- * Worse, if the insert came from btree node coalescing and we did it after
- * the split, the pivot we picked might fall between nodes that were
- * coalesced - i.e. in the middle of a child node post coalescing:
- */
-static void btree_split_insert_keys(struct btree_update *as,
- struct btree_trans *trans,
- btree_path_idx_t path_idx,
- struct btree *b,
- struct keylist *keys)
-{
- struct btree_path *path = trans->paths + path_idx;
-
- if (!bch2_keylist_empty(keys) &&
- bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) {
- struct btree_node_iter node_iter;
-
- bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
-
- bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
- }
-}
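-
-/*
- * Illustrative ordering, as used by btree_split() below:
- *
- *	__btree_split_node(as, trans, b, n, keys);
- *	btree_split_insert_keys(as, trans, path, n1, keys);
- *	btree_split_insert_keys(as, trans, path, n2, keys);
- *	BUG_ON(!bch2_keylist_empty(keys));
- *
- * i.e. @b's keys are distributed while picking the pivot, then each pending
- * insert lands in whichever new node now owns its range - never across the
- * split.
- */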
-
-static int btree_split(struct btree_update *as, struct btree_trans *trans,
- btree_path_idx_t path, struct btree *b,
- struct keylist *keys)
-{
- struct bch_fs *c = as->c;
- struct btree *parent = btree_node_parent(trans->paths + path, b);
- struct btree *n1, *n2 = NULL, *n3 = NULL;
- btree_path_idx_t path1 = 0, path2 = 0;
- u64 start_time = local_clock();
- int ret = 0;
-
- bch2_verify_btree_nr_keys(b);
- BUG_ON(!parent && (b != btree_node_root(c, b)));
- BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));
-
- ret = bch2_btree_node_check_topology(trans, b);
- if (ret)
- return ret;
-
- if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
- struct btree *n[2];
-
- trace_and_count(c, btree_node_split, trans, b);
-
- n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
- n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
-
- __btree_split_node(as, trans, b, n, keys);
-
- if (keys) {
- btree_split_insert_keys(as, trans, path, n1, keys);
- btree_split_insert_keys(as, trans, path, n2, keys);
- BUG_ON(!bch2_keylist_empty(keys));
- }
-
- bch2_btree_build_aux_trees(n2);
- bch2_btree_build_aux_trees(n1);
-
- bch2_btree_update_add_new_node(as, n1);
- bch2_btree_update_add_new_node(as, n2);
- six_unlock_write(&n2->c.lock);
- six_unlock_write(&n1->c.lock);
-
- path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
- six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, trans->paths + path1, n1);
-
- path2 = bch2_path_get_unlocked_mut(trans, as->btree_id, n2->c.level, n2->key.k.p);
- six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, trans->paths + path2, n2);
-
- /*
-		 * Note that on recursive calls, parent_keys == keys, so we
- * can't start adding new keys to parent_keys before emptying it
- * out (which we did with btree_split_insert_keys() above)
- */
- bch2_keylist_add(&as->parent_keys, &n1->key);
- bch2_keylist_add(&as->parent_keys, &n2->key);
-
- if (!parent) {
- /* Depth increases, make a new root */
- n3 = __btree_root_alloc(as, trans, b->c.level + 1);
-
- bch2_btree_update_add_new_node(as, n3);
- six_unlock_write(&n3->c.lock);
-
- trans->paths[path2].locks_want++;
- BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level));
- six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, trans->paths + path2, n3);
-
- n3->sib_u64s[0] = U16_MAX;
- n3->sib_u64s[1] = U16_MAX;
-
- btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
- }
- } else {
- trace_and_count(c, btree_node_compact, trans, b);
-
- n1 = bch2_btree_node_alloc_replacement(as, trans, b);
-
- if (keys) {
- btree_split_insert_keys(as, trans, path, n1, keys);
- BUG_ON(!bch2_keylist_empty(keys));
- }
-
- bch2_btree_build_aux_trees(n1);
- bch2_btree_update_add_new_node(as, n1);
- six_unlock_write(&n1->c.lock);
-
- path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
- six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, trans->paths + path1, n1);
-
- if (parent)
- bch2_keylist_add(&as->parent_keys, &n1->key);
- }
-
- /* New nodes all written, now make them visible: */
-
- if (parent) {
- /* Split a non root node */
- ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
- } else if (n3) {
- ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false);
- } else {
- /* Root filled up but didn't need to be split */
- ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false);
- }
-
- if (ret)
- goto err;
-
- bch2_btree_interior_update_will_free_node(as, b);
-
- if (n3) {
- bch2_btree_update_get_open_buckets(as, n3);
- bch2_btree_node_write_trans(trans, n3, SIX_LOCK_intent, 0);
- }
- if (n2) {
- bch2_btree_update_get_open_buckets(as, n2);
- bch2_btree_node_write_trans(trans, n2, SIX_LOCK_intent, 0);
- }
- bch2_btree_update_get_open_buckets(as, n1);
- bch2_btree_node_write_trans(trans, n1, SIX_LOCK_intent, 0);
-
- /*
- * The old node must be freed (in memory) _before_ unlocking the new
- * nodes - else another thread could re-acquire a read lock on the old
- * node after another thread has locked and updated the new node, thus
- * seeing stale data:
- */
- bch2_btree_node_free_inmem(trans, trans->paths + path, b);
-
- if (n3)
- bch2_trans_node_add(trans, trans->paths + path, n3);
- if (n2)
- bch2_trans_node_add(trans, trans->paths + path2, n2);
- bch2_trans_node_add(trans, trans->paths + path1, n1);
-
- if (n3)
- six_unlock_intent(&n3->c.lock);
- if (n2)
- six_unlock_intent(&n2->c.lock);
- six_unlock_intent(&n1->c.lock);
-out:
- if (path2) {
- __bch2_btree_path_unlock(trans, trans->paths + path2);
- bch2_path_put(trans, path2, true);
- }
- if (path1) {
- __bch2_btree_path_unlock(trans, trans->paths + path1);
- bch2_path_put(trans, path1, true);
- }
-
- bch2_trans_verify_locks(trans);
-
- bch2_time_stats_update(&c->times[n2
- ? BCH_TIME_btree_node_split
- : BCH_TIME_btree_node_compact],
- start_time);
- return ret;
-err:
- if (n3)
- bch2_btree_node_free_never_used(as, trans, n3);
- if (n2)
- bch2_btree_node_free_never_used(as, trans, n2);
- bch2_btree_node_free_never_used(as, trans, n1);
- goto out;
-}
-
-/**
- * bch2_btree_insert_node - insert bkeys into a given btree node
- *
- * @as: btree_update object
- * @trans: btree_trans object
- * @path_idx: path that points to current node
- * @b: node to insert keys into
- * @keys: list of keys to insert
- *
- * Returns: 0 on success, typically transaction restart error on failure
- *
- * Inserts as many keys as it can into a given btree node, splitting it if full.
- * If a split occurred, this function will return early. This can only happen
- * for leaf nodes -- inserts into interior nodes have to be atomic.
- */
-static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
- btree_path_idx_t path_idx, struct btree *b,
- struct keylist *keys)
-{
- struct bch_fs *c = as->c;
- struct btree_path *path = trans->paths + path_idx, *linked;
- unsigned i;
- int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
- int old_live_u64s = b->nr.live_u64s;
- int live_u64s_added, u64s_added;
- int ret;
-
- lockdep_assert_held(&c->gc_lock);
- BUG_ON(!btree_node_intent_locked(path, b->c.level));
- BUG_ON(!b->c.level);
- BUG_ON(!as || as->b);
- bch2_verify_keylist_sorted(keys);
-
- ret = bch2_btree_node_lock_write(trans, path, &b->c);
- if (ret)
- return ret;
-
- bch2_btree_node_prep_for_write(trans, path, b);
-
- if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) {
- bch2_btree_node_unlock_write(trans, path, b);
- goto split;
- }
-
- ret = bch2_btree_node_check_topology(trans, b);
- if (ret) {
- bch2_btree_node_unlock_write(trans, path, b);
- return ret;
- }
-
- bch2_btree_insert_keys_interior(as, trans, path, b,
- path->l[b->c.level].iter, keys);
-
- trans_for_each_path_with_node(trans, b, linked, i)
- bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
-
- bch2_trans_verify_paths(trans);
-
- live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
- u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
-
- if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
- b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
- if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
- b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
-
- if (u64s_added > live_u64s_added &&
- bch2_maybe_compact_whiteouts(c, b))
- bch2_trans_node_reinit_iter(trans, b);
-
- btree_update_updated_node(as, b);
- bch2_btree_node_unlock_write(trans, path, b);
- return 0;
-split:
- /*
-	 * We could attempt to avoid the transaction restart by calling
- * bch2_btree_path_upgrade() and allocating more nodes:
- */
- if (b->c.level >= as->update_level_end) {
- trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
- }
-
- return btree_split(as, trans, path_idx, b, keys);
-}
-
-int bch2_btree_split_leaf(struct btree_trans *trans,
- btree_path_idx_t path,
- unsigned flags)
-{
- /* btree_split & merge may both cause paths array to be reallocated */
- struct btree *b = path_l(trans->paths + path)->b;
- struct btree_update *as;
- unsigned l;
- int ret = 0;
-
- as = bch2_btree_update_start(trans, trans->paths + path,
- trans->paths[path].level,
- true, flags);
- if (IS_ERR(as))
- return PTR_ERR(as);
-
- ret = btree_split(as, trans, path, b, NULL);
- if (ret) {
- bch2_btree_update_free(as, trans);
- return ret;
- }
-
- bch2_btree_update_done(as, trans);
-
- for (l = trans->paths[path].level + 1;
- btree_node_intent_locked(&trans->paths[path], l) && !ret;
- l++)
- ret = bch2_foreground_maybe_merge(trans, path, l, flags);
-
- return ret;
-}
-
-static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
- btree_path_idx_t path_idx)
-{
- struct bch_fs *c = as->c;
- struct btree_path *path = trans->paths + path_idx;
- struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b;
-
- BUG_ON(!btree_node_locked(path, b->c.level));
-
- n = __btree_root_alloc(as, trans, b->c.level + 1);
-
- bch2_btree_update_add_new_node(as, n);
- six_unlock_write(&n->c.lock);
-
- path->locks_want++;
- BUG_ON(btree_node_locked(path, n->c.level));
- six_lock_increment(&n->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, path, n);
-
- n->sib_u64s[0] = U16_MAX;
- n->sib_u64s[1] = U16_MAX;
-
- bch2_keylist_add(&as->parent_keys, &b->key);
- btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);
-
- int ret = bch2_btree_set_root(as, trans, path, n, true);
- BUG_ON(ret);
-
- bch2_btree_update_get_open_buckets(as, n);
- bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
- bch2_trans_node_add(trans, path, n);
- six_unlock_intent(&n->c.lock);
-
- mutex_lock(&c->btree_cache.lock);
- list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list);
- mutex_unlock(&c->btree_cache.lock);
-
- bch2_trans_verify_locks(trans);
-}
-
-int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;
-
- if (btree_node_fake(b))
- return bch2_btree_split_leaf(trans, path, flags);
-
- struct btree_update *as =
- bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags);
- if (IS_ERR(as))
- return PTR_ERR(as);
-
- __btree_increase_depth(as, trans, path);
- bch2_btree_update_done(as, trans);
- return 0;
-}
-
-int __bch2_foreground_maybe_merge(struct btree_trans *trans,
- btree_path_idx_t path,
- unsigned level,
- unsigned flags,
- enum btree_node_sibling sib)
-{
- struct bch_fs *c = trans->c;
- struct btree_update *as;
- struct bkey_format_state new_s;
- struct bkey_format new_f;
- struct bkey_i delete;
- struct btree *b, *m, *n, *prev, *next, *parent;
- struct bpos sib_pos;
- size_t sib_u64s;
- enum btree_id btree = trans->paths[path].btree_id;
- btree_path_idx_t sib_path = 0, new_path = 0;
- u64 start_time = local_clock();
- int ret = 0;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- BUG_ON(!trans->paths[path].should_be_locked);
- BUG_ON(!btree_node_locked(&trans->paths[path], level));
-
- /*
- * Work around a deadlock caused by the btree write buffer not doing
- * merges and leaving tons of merges for us to do - we really don't need
- * to be doing merges at all from the interior update path, and if the
- * interior update path is generating too many new interior updates we
- * deadlock:
- */
- if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates)
- return 0;
-
- if ((flags & BCH_WATERMARK_MASK) <= BCH_WATERMARK_reclaim) {
- flags &= ~BCH_WATERMARK_MASK;
- flags |= BCH_WATERMARK_btree;
- flags |= BCH_TRANS_COMMIT_journal_reclaim;
- }
-
- b = trans->paths[path].l[level].b;
-
- if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
- (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) {
- b->sib_u64s[sib] = U16_MAX;
- return 0;
- }
-
- sib_pos = sib == btree_prev_sib
- ? bpos_predecessor(b->data->min_key)
- : bpos_successor(b->data->max_key);
-
- sib_path = bch2_path_get(trans, btree, sib_pos,
- U8_MAX, level, BTREE_ITER_intent, _THIS_IP_);
- ret = bch2_btree_path_traverse(trans, sib_path, false);
- if (ret)
- goto err;
-
- btree_path_set_should_be_locked(trans, trans->paths + sib_path);
-
- m = trans->paths[sib_path].l[level].b;
-
- if (btree_node_parent(trans->paths + path, b) !=
- btree_node_parent(trans->paths + sib_path, m)) {
- b->sib_u64s[sib] = U16_MAX;
- goto out;
- }
-
- if (sib == btree_prev_sib) {
- prev = m;
- next = b;
- } else {
- prev = b;
- next = m;
- }
-
- if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) {
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
-
- bch2_bpos_to_text(&buf1, prev->data->max_key);
- bch2_bpos_to_text(&buf2, next->data->min_key);
- bch_err(c,
- "%s(): btree topology error:\n"
- " prev ends at %s\n"
- " next starts at %s",
- __func__, buf1.buf, buf2.buf);
- printbuf_exit(&buf1);
- printbuf_exit(&buf2);
- ret = bch2_topology_error(c);
- goto err;
- }
-
- bch2_bkey_format_init(&new_s);
- bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
- __bch2_btree_calc_format(&new_s, prev);
- __bch2_btree_calc_format(&new_s, next);
- bch2_bkey_format_add_pos(&new_s, next->data->max_key);
- new_f = bch2_bkey_format_done(&new_s);
-
- sib_u64s = btree_node_u64s_with_format(b->nr, &b->format, &new_f) +
- btree_node_u64s_with_format(m->nr, &m->format, &new_f);
-
- if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
- sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
- sib_u64s /= 2;
- sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
- }
-
- sib_u64s = min(sib_u64s, btree_max_u64s(c));
- sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
- b->sib_u64s[sib] = sib_u64s;
-
- if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
- goto out;
-
- parent = btree_node_parent(trans->paths + path, b);
- as = bch2_btree_update_start(trans, trans->paths + path, level, false,
- BCH_TRANS_COMMIT_no_enospc|flags);
- ret = PTR_ERR_OR_ZERO(as);
- if (ret)
- goto err;
-
- trace_and_count(c, btree_node_merge, trans, b);
-
- n = bch2_btree_node_alloc(as, trans, b->c.level);
-
- SET_BTREE_NODE_SEQ(n->data,
- max(BTREE_NODE_SEQ(b->data),
- BTREE_NODE_SEQ(m->data)) + 1);
-
- btree_set_min(n, prev->data->min_key);
- btree_set_max(n, next->data->max_key);
-
- n->data->format = new_f;
- btree_node_set_format(n, new_f);
-
- bch2_btree_sort_into(c, n, prev);
- bch2_btree_sort_into(c, n, next);
-
- bch2_btree_build_aux_trees(n);
- bch2_btree_update_add_new_node(as, n);
- six_unlock_write(&n->c.lock);
-
- new_path = bch2_path_get_unlocked_mut(trans, btree, n->c.level, n->key.k.p);
- six_lock_increment(&n->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, trans->paths + new_path, n);
-
- bkey_init(&delete.k);
- delete.k.p = prev->key.k.p;
- bch2_keylist_add(&as->parent_keys, &delete);
- bch2_keylist_add(&as->parent_keys, &n->key);
-
- bch2_trans_verify_paths(trans);
-
- ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
- if (ret)
- goto err_free_update;
-
- bch2_btree_interior_update_will_free_node(as, b);
- bch2_btree_interior_update_will_free_node(as, m);
-
- bch2_trans_verify_paths(trans);
-
- bch2_btree_update_get_open_buckets(as, n);
- bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
-
- bch2_btree_node_free_inmem(trans, trans->paths + path, b);
- bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m);
-
- bch2_trans_node_add(trans, trans->paths + path, n);
-
- bch2_trans_verify_paths(trans);
-
- six_unlock_intent(&n->c.lock);
-
- bch2_btree_update_done(as, trans);
-
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
-out:
-err:
- if (new_path)
- bch2_path_put(trans, new_path, true);
- bch2_path_put(trans, sib_path, true);
- bch2_trans_verify_locks(trans);
- if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
- ret = 0;
- if (!ret)
- ret = bch2_trans_relock(trans);
- return ret;
-err_free_update:
- bch2_btree_node_free_never_used(as, trans, n);
- bch2_btree_update_free(as, trans);
- goto out;
-}
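A worked example of the hysteresis arithmetic above; the threshold is an assumed value, for illustration only:

/*
 * Assume BTREE_FOREGROUND_MERGE_HYSTERESIS(c) == 2000 u64s (made-up
 * number) and the two siblings total 3000 u64s in the merged format:
 *
 *	sib_u64s = (3000 - 2000) / 2 + 2000 = 2500
 *
 * The cached value is pulled halfway back toward the hysteresis point,
 * so siblings hovering just above the merge threshold don't oscillate
 * between merging and re-splitting.
 */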
-
-static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter,
- struct btree *b)
-{
- bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p,
- BTREE_MAX_DEPTH, b->c.level,
- BTREE_ITER_intent);
- int ret = bch2_btree_iter_traverse(iter);
- if (ret)
- goto err;
-
-	if (btree_iter_path(trans, iter)->l[b->c.level].b != b) {
- /* node has been freed: */
- BUG_ON(!btree_node_dying(b));
- ret = -BCH_ERR_btree_node_dying;
- goto err;
- }
-
- BUG_ON(!btree_node_hashed(b));
- return 0;
-err:
- bch2_trans_iter_exit(trans, iter);
- return ret;
-}
-
-int bch2_btree_node_rewrite(struct btree_trans *trans,
- struct btree_iter *iter,
- struct btree *b,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree *n, *parent;
- struct btree_update *as;
- btree_path_idx_t new_path = 0;
- int ret;
-
- flags |= BCH_TRANS_COMMIT_no_enospc;
-
- struct btree_path *path = btree_iter_path(trans, iter);
- parent = btree_node_parent(path, b);
- as = bch2_btree_update_start(trans, path, b->c.level, false, flags);
- ret = PTR_ERR_OR_ZERO(as);
- if (ret)
- goto out;
-
- n = bch2_btree_node_alloc_replacement(as, trans, b);
-
- bch2_btree_build_aux_trees(n);
- bch2_btree_update_add_new_node(as, n);
- six_unlock_write(&n->c.lock);
-
- new_path = bch2_path_get_unlocked_mut(trans, iter->btree_id, n->c.level, n->key.k.p);
- six_lock_increment(&n->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, trans->paths + new_path, n);
-
- trace_and_count(c, btree_node_rewrite, trans, b);
-
- if (parent) {
- bch2_keylist_add(&as->parent_keys, &n->key);
- ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
- } else {
- ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false);
- }
-
- if (ret)
- goto err;
-
- bch2_btree_interior_update_will_free_node(as, b);
-
- bch2_btree_update_get_open_buckets(as, n);
- bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
-
- bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b);
-
- bch2_trans_node_add(trans, trans->paths + iter->path, n);
- six_unlock_intent(&n->c.lock);
-
- bch2_btree_update_done(as, trans);
-out:
- if (new_path)
- bch2_path_put(trans, new_path, true);
- bch2_trans_downgrade(trans);
- return ret;
-err:
- bch2_btree_node_free_never_used(as, trans, n);
- bch2_btree_update_free(as, trans);
- goto out;
-}
-
-static int bch2_btree_node_rewrite_key(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_i *k, unsigned flags)
-{
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter,
- btree, k->k.p,
- BTREE_MAX_DEPTH, level, 0);
- struct btree *b = bch2_btree_iter_peek_node(&iter);
- int ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- goto out;
-
- bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k);
- ret = found
- ? bch2_btree_node_rewrite(trans, &iter, b, flags)
- : -ENOENT;
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_btree_node_rewrite_pos(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bpos pos, unsigned flags)
-{
- BUG_ON(!level);
-
- /* Traverse one depth lower to get a pointer to the node itself: */
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0);
- struct btree *b = bch2_btree_iter_peek_node(&iter);
- int ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- goto err;
-
- ret = bch2_btree_node_rewrite(trans, &iter, b, flags);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans,
- struct btree *b, unsigned flags)
-{
- struct btree_iter iter;
- int ret = get_iter_to_node(trans, &iter, b);
- if (ret)
- return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
-
- ret = bch2_btree_node_rewrite(trans, &iter, b, flags);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-struct async_btree_rewrite {
- struct bch_fs *c;
- struct work_struct work;
- struct list_head list;
- enum btree_id btree_id;
- unsigned level;
- struct bkey_buf key;
-};
-
-static void async_btree_node_rewrite_work(struct work_struct *work)
-{
- struct async_btree_rewrite *a =
- container_of(work, struct async_btree_rewrite, work);
- struct bch_fs *c = a->c;
-
- int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans,
- a->btree_id, a->level, a->key.k, 0));
- if (ret != -ENOENT)
- bch_err_fn_ratelimited(c, ret);
-
- spin_lock(&c->btree_node_rewrites_lock);
- list_del(&a->list);
- spin_unlock(&c->btree_node_rewrites_lock);
-
- closure_wake_up(&c->btree_node_rewrites_wait);
-
- bch2_bkey_buf_exit(&a->key, c);
- bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
- kfree(a);
-}
-
-void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
-{
- struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS);
- if (!a)
- return;
-
- a->c = c;
- a->btree_id = b->c.btree_id;
- a->level = b->c.level;
- INIT_WORK(&a->work, async_btree_node_rewrite_work);
-
- bch2_bkey_buf_init(&a->key);
- bch2_bkey_buf_copy(&a->key, c, &b->key);
-
- bool now = false, pending = false;
-
- spin_lock(&c->btree_node_rewrites_lock);
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay &&
- bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
- list_add(&a->list, &c->btree_node_rewrites);
- now = true;
- } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
- list_add(&a->list, &c->btree_node_rewrites_pending);
- pending = true;
- }
- spin_unlock(&c->btree_node_rewrites_lock);
-
- if (now) {
- queue_work(c->btree_node_rewrite_worker, &a->work);
- } else if (pending) {
-		/* bch2_do_pending_node_rewrites() will run it later */
- } else {
- bch2_bkey_buf_exit(&a->key, c);
- kfree(a);
- }
-}
-
-void bch2_async_btree_node_rewrites_flush(struct bch_fs *c)
-{
- closure_wait_event(&c->btree_node_rewrites_wait,
- list_empty(&c->btree_node_rewrites));
-}
-
-void bch2_do_pending_node_rewrites(struct bch_fs *c)
-{
- while (1) {
- spin_lock(&c->btree_node_rewrites_lock);
- struct async_btree_rewrite *a =
- list_pop_entry(&c->btree_node_rewrites_pending,
- struct async_btree_rewrite, list);
- if (a)
- list_add(&a->list, &c->btree_node_rewrites);
- spin_unlock(&c->btree_node_rewrites_lock);
-
- if (!a)
- break;
-
- bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
- queue_work(c->btree_node_rewrite_worker, &a->work);
- }
-}
-
-void bch2_free_pending_node_rewrites(struct bch_fs *c)
-{
- while (1) {
- spin_lock(&c->btree_node_rewrites_lock);
- struct async_btree_rewrite *a =
- list_pop_entry(&c->btree_node_rewrites_pending,
- struct async_btree_rewrite, list);
- spin_unlock(&c->btree_node_rewrites_lock);
-
- if (!a)
- break;
-
- bch2_bkey_buf_exit(&a->key, c);
- kfree(a);
- }
-}
-
-static int __bch2_btree_node_update_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct btree *b, struct btree *new_hash,
- struct bkey_i *new_key,
- unsigned commit_flags,
- bool skip_triggers)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter2 = { NULL };
- struct btree *parent;
- int ret;
-
- if (!skip_triggers) {
- ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1,
- bkey_i_to_s_c(&b->key),
- BTREE_TRIGGER_transactional) ?:
- bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1,
- bkey_i_to_s(new_key),
- BTREE_TRIGGER_transactional);
- if (ret)
- return ret;
- }
-
- if (new_hash) {
- bkey_copy(&new_hash->key, new_key);
- ret = bch2_btree_node_hash_insert(&c->btree_cache,
- new_hash, b->c.level, b->c.btree_id);
- BUG_ON(ret);
- }
-
- parent = btree_node_parent(btree_iter_path(trans, iter), b);
- if (parent) {
- bch2_trans_copy_iter(&iter2, iter);
-
- iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
- iter2.flags & BTREE_ITER_intent,
- _THIS_IP_);
-
- struct btree_path *path2 = btree_iter_path(trans, &iter2);
- BUG_ON(path2->level != b->c.level);
- BUG_ON(!bpos_eq(path2->pos, new_key->k.p));
-
- btree_path_set_level_up(trans, path2);
-
- trans->paths_sorted = false;
-
- ret = bch2_btree_iter_traverse(&iter2) ?:
- bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun);
- if (ret)
- goto err;
- } else {
- BUG_ON(btree_node_root(c, b) != b);
-
- struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
- jset_u64s(new_key->k.u64s));
- ret = PTR_ERR_OR_ZERO(e);
- if (ret)
- return ret;
-
- journal_entry_set(e,
- BCH_JSET_ENTRY_btree_root,
- b->c.btree_id, b->c.level,
- new_key, new_key->k.u64s);
- }
-
- ret = bch2_trans_commit(trans, NULL, NULL, commit_flags);
- if (ret)
- goto err;
-
- bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c);
-
- if (new_hash) {
- mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
-
- __bch2_btree_node_hash_remove(&c->btree_cache, b);
-
- bkey_copy(&b->key, new_key);
- ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
- BUG_ON(ret);
- mutex_unlock(&c->btree_cache.lock);
- } else {
- bkey_copy(&b->key, new_key);
- }
-
- bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b);
-out:
- bch2_trans_iter_exit(trans, &iter2);
- return ret;
-err:
- if (new_hash) {
- mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
- mutex_unlock(&c->btree_cache.lock);
- }
- goto out;
-}
-
-int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
- struct btree *b, struct bkey_i *new_key,
- unsigned commit_flags, bool skip_triggers)
-{
- struct bch_fs *c = trans->c;
- struct btree *new_hash = NULL;
- struct btree_path *path = btree_iter_path(trans, iter);
- struct closure cl;
- int ret = 0;
-
- ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1);
- if (ret)
- return ret;
-
- closure_init_stack(&cl);
-
- /*
- * check btree_ptr_hash_val() after @b is locked by
- * btree_iter_traverse():
- */
- if (btree_ptr_hash_val(new_key) != b->hash_val) {
- ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
- if (ret) {
- ret = drop_locks_do(trans, (closure_sync(&cl), 0));
- if (ret)
- return ret;
- }
-
- new_hash = bch2_btree_node_mem_alloc(trans, false);
- ret = PTR_ERR_OR_ZERO(new_hash);
- if (ret)
- goto err;
- }
-
- path->intent_ref++;
- ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key,
- commit_flags, skip_triggers);
- --path->intent_ref;
-
- if (new_hash)
- bch2_btree_node_to_freelist(c, new_hash);
-err:
- closure_sync(&cl);
- bch2_btree_cache_cannibalize_unlock(trans);
- return ret;
-}
-
-int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
- struct btree *b, struct bkey_i *new_key,
- unsigned commit_flags, bool skip_triggers)
-{
- struct btree_iter iter;
- int ret = get_iter_to_node(trans, &iter, b);
- if (ret)
- return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
-
- bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
- !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
-
- ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
- commit_flags, skip_triggers);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/* Init code: */
-
-/*
- * Only for filesystem bringup, when first reading the btree roots or allocating
- * btree roots when initializing a new filesystem:
- */
-void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
-{
- BUG_ON(btree_node_root(c, b));
-
- bch2_btree_set_root_inmem(c, b);
-}
-
-int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id, unsigned level)
-{
- struct bch_fs *c = trans->c;
- struct closure cl;
- struct btree *b;
- int ret;
-
- closure_init_stack(&cl);
-
- do {
- ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
- closure_sync(&cl);
- } while (ret);
-
- b = bch2_btree_node_mem_alloc(trans, false);
- bch2_btree_cache_cannibalize_unlock(trans);
-
- ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- return ret;
-
- set_btree_node_fake(b);
- set_btree_node_need_rewrite(b);
- b->c.level = level;
- b->c.btree_id = id;
-
- bkey_btree_ptr_init(&b->key);
- b->key.k.p = SPOS_MAX;
- *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;
-
- bch2_bset_init_first(b, &b->data->keys);
- bch2_btree_build_aux_trees(b);
-
- b->data->flags = 0;
- btree_set_min(b, POS_MIN);
- btree_set_max(b, SPOS_MAX);
- b->data->format = bch2_btree_calc_format(b);
- btree_node_set_format(b, b->data->format);
-
- ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
- b->c.level, b->c.btree_id);
- BUG_ON(ret);
-
- bch2_btree_set_root_inmem(c, b);
-
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- return 0;
-}
-
-void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
-{
- bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level)));
-}
-
-static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
-{
- prt_printf(out, "%ps: ", (void *) as->ip_started);
- bch2_trans_commit_flags_to_text(out, as->flags);
-
- prt_str(out, " ");
- bch2_btree_id_to_text(out, as->btree_id);
- prt_printf(out, " l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
- as->update_level_start,
- as->update_level_end,
- bch2_btree_update_modes[as->mode],
- as->nodes_written,
- closure_nr_remaining(&as->cl),
- as->journal.seq);
-}
-
-void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct btree_update *as;
-
- mutex_lock(&c->btree_interior_update_lock);
- list_for_each_entry(as, &c->btree_interior_update_list, list)
- bch2_btree_update_to_text(out, as);
- mutex_unlock(&c->btree_interior_update_lock);
-}
-
-static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
-{
- bool ret;
-
- mutex_lock(&c->btree_interior_update_lock);
- ret = !list_empty(&c->btree_interior_update_list);
- mutex_unlock(&c->btree_interior_update_lock);
-
- return ret;
-}
-
-bool bch2_btree_interior_updates_flush(struct bch_fs *c)
-{
- bool ret = bch2_btree_interior_updates_pending(c);
-
- if (ret)
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_pending(c));
- return ret;
-}
-
-void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry)
-{
- struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);
-
- mutex_lock(&c->btree_root_lock);
-
- r->level = entry->level;
- r->alive = true;
- bkey_copy(&r->key, (struct bkey_i *) entry->start);
-
- mutex_unlock(&c->btree_root_lock);
-}
-
-struct jset_entry *
-bch2_btree_roots_to_journal_entries(struct bch_fs *c,
- struct jset_entry *end,
- unsigned long skip)
-{
- unsigned i;
-
- mutex_lock(&c->btree_root_lock);
-
- for (i = 0; i < btree_id_nr_alive(c); i++) {
- struct btree_root *r = bch2_btree_id_root(c, i);
-
- if (r->alive && !test_bit(i, &skip)) {
- journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
- i, r->level, &r->key, r->key.k.u64s);
- end = vstruct_next(end);
- }
- }
-
- mutex_unlock(&c->btree_root_lock);
-
- return end;
-}
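A usage note for the function above: the skip argument is an unsigned long bitmask of btree IDs, tested with test_bit(). A hedged sketch (the choice of btree to skip is arbitrary):

	/* Emit journal entries for all live roots except the extents btree's: */
	end = bch2_btree_roots_to_journal_entries(c, end, BIT(BTREE_ID_extents));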
-
-static void bch2_btree_alloc_to_text(struct printbuf *out,
- struct bch_fs *c,
- struct btree_alloc *a)
-{
- printbuf_indent_add(out, 2);
- bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&a->k));
- prt_newline(out);
-
- struct open_bucket *ob;
- unsigned i;
- open_bucket_for_each(c, &a->ob, ob, i)
- bch2_open_bucket_to_text(out, c, ob);
-
- printbuf_indent_sub(out, 2);
-}
-
-void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c)
-{
- for (unsigned i = 0; i < c->btree_reserve_cache_nr; i++)
- bch2_btree_alloc_to_text(out, c, &c->btree_reserve_cache[i]);
-}
-
-void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
-{
- WARN_ON(!list_empty(&c->btree_node_rewrites));
- WARN_ON(!list_empty(&c->btree_node_rewrites_pending));
-
- if (c->btree_node_rewrite_worker)
- destroy_workqueue(c->btree_node_rewrite_worker);
- if (c->btree_interior_update_worker)
- destroy_workqueue(c->btree_interior_update_worker);
- mempool_exit(&c->btree_interior_update_pool);
-}
-
-void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
-{
- mutex_init(&c->btree_reserve_cache_lock);
- INIT_LIST_HEAD(&c->btree_interior_update_list);
- INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
- mutex_init(&c->btree_interior_update_lock);
- INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);
-
- INIT_LIST_HEAD(&c->btree_node_rewrites);
- INIT_LIST_HEAD(&c->btree_node_rewrites_pending);
- spin_lock_init(&c->btree_node_rewrites_lock);
-}
-
-int bch2_fs_btree_interior_update_init(struct bch_fs *c)
-{
- c->btree_interior_update_worker =
- alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
- if (!c->btree_interior_update_worker)
- return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
-
- c->btree_node_rewrite_worker =
- alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
- if (!c->btree_node_rewrite_worker)
- return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
-
- if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
- sizeof(struct btree_update)))
- return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;
-
- return 0;
-}
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
deleted file mode 100644
index be71cd73b864..000000000000
--- a/fs/bcachefs/btree_update_interior.h
+++ /dev/null
@@ -1,354 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H
-#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H
-
-#include "btree_cache.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-
-#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES)
-
-#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
-
-int bch2_btree_node_check_topology(struct btree_trans *, struct btree *);
-
-#define BTREE_UPDATE_MODES() \
- x(none) \
- x(node) \
- x(root) \
- x(update)
-
-enum btree_update_mode {
-#define x(n) BTREE_UPDATE_##n,
- BTREE_UPDATE_MODES()
-#undef x
-};
-
-/*
- * Tracks an in progress split/rewrite of a btree node and the update to the
- * parent node:
- *
- * When we split/rewrite a node, we do all the updates in memory without
- * waiting for any writes to complete - we allocate the new node(s) and update
- * the parent node, possibly recursively up to the root.
- *
- * The end result is that we have one or more new nodes being written -
- * possibly several, if there were multiple splits - and then a write (updating
- * an interior node) which will make all these new nodes visible.
- *
- * Additionally, as we split/rewrite nodes we free the old nodes - but the old
- * nodes can't be freed (their space on disk can't be reclaimed) until the
- * update to the interior node that makes the new node visible completes -
- * until then, the old nodes are still reachable on disk.
- */
-struct btree_update {
- struct closure cl;
- struct bch_fs *c;
- u64 start_time;
- unsigned long ip_started;
-
- struct list_head list;
- struct list_head unwritten_list;
-
- enum btree_update_mode mode;
- enum bch_trans_commit_flags flags;
- unsigned nodes_written:1;
- unsigned took_gc_lock:1;
-
- enum btree_id btree_id;
- unsigned update_level_start;
- unsigned update_level_end;
-
- struct disk_reservation disk_res;
-
- /*
- * BTREE_UPDATE_node:
- * The update that made the new nodes visible was a regular update to an
- * existing interior node - @b. We can't write out the update to @b
- * until the new nodes we created are finished writing, so we block @b
- * from writing by putting this btree_interior update on the
- * @b->write_blocked list with @write_blocked_list:
- */
- struct btree *b;
- struct list_head write_blocked_list;
-
- /*
- * We may be freeing nodes that were dirty, and thus had journal entries
- * pinned: we need to transfer the oldest of those pins to the
- * btree_update operation, and release it when the new node(s)
- * are all persistent and reachable:
- */
- struct journal_entry_pin journal;
-
- /* Preallocated nodes we reserve when we start the update: */
- struct prealloc_nodes {
- struct btree *b[BTREE_UPDATE_NODES_MAX];
- unsigned nr;
- } prealloc_nodes[2];
-
- /* Nodes being freed: */
- struct keylist old_keys;
- u64 _old_keys[BTREE_UPDATE_NODES_MAX *
- BKEY_BTREE_PTR_U64s_MAX];
-
- /* Nodes being added: */
- struct keylist new_keys;
- u64 _new_keys[BTREE_UPDATE_NODES_MAX *
- BKEY_BTREE_PTR_U64s_MAX];
-
- /* New nodes, that will be made reachable by this update: */
- struct btree *new_nodes[BTREE_UPDATE_NODES_MAX];
- unsigned nr_new_nodes;
-
- struct btree *old_nodes[BTREE_UPDATE_NODES_MAX];
- __le64 old_nodes_seq[BTREE_UPDATE_NODES_MAX];
- unsigned nr_old_nodes;
-
- open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX *
- BCH_REPLICAS_MAX];
- open_bucket_idx_t nr_open_buckets;
-
- unsigned journal_u64s;
- u64 journal_entries[BTREE_UPDATE_JOURNAL_RES];
-
- /* Only here to reduce stack usage on recursive splits: */
- struct keylist parent_keys;
- /*
- * Enough room for btree_split's keys without realloc - btree node
-	 * pointers never have crc/compression info, so we only need to account
- * for the pointers for three keys
- */
- u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
-};
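A hedged lifecycle sketch tying this struct to the split path earlier in the diff; the ordering follows btree_split() and bch2_btree_split_leaf(), illustration only:

/*
 *  1. bch2_btree_update_start()   - allocate the btree_update and
 *				     reserve prealloc_nodes
 *  2. bch2_btree_node_alloc() x2  - n1/n2 take the old node's keys
 *  3. bch2_btree_insert_node()    - parent gets the keys for n1/n2
 *  4. new nodes written out       - tracked via new_nodes[]
 *  5. parent update made visible  - the old node is now unreachable
 *  6. bch2_btree_update_done()    - old node's space may be reclaimed
 */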
-
-struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
- struct btree_trans *,
- struct btree *,
- struct bkey_format);
-
-int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);
-
-int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned);
-
-int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
- unsigned, unsigned, enum btree_node_sibling);
-
-static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
- btree_path_idx_t path_idx,
- unsigned level, unsigned flags,
- enum btree_node_sibling sib)
-{
- struct btree_path *path = trans->paths + path_idx;
- struct btree *b;
-
- EBUG_ON(!btree_node_locked(path, level));
-
- if (bch2_btree_node_merging_disabled)
- return 0;
-
- b = path->l[level].b;
- if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
- return 0;
-
- return __bch2_foreground_maybe_merge(trans, path_idx, level, flags, sib);
-}
-
-static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
- btree_path_idx_t path,
- unsigned level,
- unsigned flags)
-{
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
- btree_prev_sib) ?:
- bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
- btree_next_sib);
-}
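The error chaining above relies on the GCC/Clang "a ?: b" extension, which yields a when a is nonzero: the next-sibling merge only runs if the prev-sibling merge returned 0. A hedged mini-example with hypothetical names:

	int ret = do_first_thing() ?: do_second_thing();
	/* do_second_thing() runs only if do_first_thing() returned 0 */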
-
-int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
- struct btree *, unsigned);
-int bch2_btree_node_rewrite_pos(struct btree_trans *,
- enum btree_id, unsigned,
- struct bpos, unsigned);
-int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *,
- struct btree *, unsigned);
-
-void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
-
-int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
- struct btree *, struct bkey_i *,
- unsigned, bool);
-int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *,
- struct bkey_i *, unsigned, bool);
-
-void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
-
-int bch2_btree_root_alloc_fake_trans(struct btree_trans *, enum btree_id, unsigned);
-void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned);
-
-static inline unsigned btree_update_reserve_required(struct bch_fs *c,
- struct btree *b)
-{
- unsigned depth = btree_node_root(c, b)->c.level + 1;
-
- /*
- * Number of nodes we might have to allocate in a worst case btree
- * split operation - we split all the way up to the root, then allocate
- * a new root, unless we're already at max depth:
- */
- if (depth < BTREE_MAX_DEPTH)
- return (depth - b->c.level) * 2 + 1;
- else
- return (depth - b->c.level) * 2 - 1;
-}
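A worked example of the reserve calculation above; BTREE_MAX_DEPTH is assumed to be 4 here, for the arithmetic only:

/*
 * Root at level 2 => depth == 3 < 4. For an insert at a leaf
 * (b->c.level == 0):
 *
 *	(3 - 0) * 2 + 1 = 7 nodes
 *
 * i.e. two nodes per level for splits at levels 0..2, plus one more in
 * case the root itself splits and the tree grows a level.
 */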
-
-static inline void btree_node_reset_sib_u64s(struct btree *b)
-{
- b->sib_u64s[0] = b->nr.live_u64s;
- b->sib_u64s[1] = b->nr.live_u64s;
-}
-
-static inline void *btree_data_end(struct btree *b)
-{
- return (void *) b->data + btree_buf_bytes(b);
-}
-
-static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b)
-{
- return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s);
-}
-
-static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b)
-{
- return btree_data_end(b);
-}
-
-static inline void *write_block(struct btree *b)
-{
- return (void *) b->data + (b->written << 9);
-}
-
-static inline bool __btree_addr_written(struct btree *b, void *p)
-{
- return p < write_block(b);
-}
-
-static inline bool bset_written(struct btree *b, struct bset *i)
-{
- return __btree_addr_written(b, i);
-}
-
-static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
-{
- return __btree_addr_written(b, k);
-}
-
-static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end)
-{
- ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
- b->whiteout_u64s;
- ssize_t total = btree_buf_bytes(b) >> 3;
-
- /* Always leave one extra u64 for bch2_varint_decode: */
- used++;
-
- return total - used;
-}
-
-static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b)
-{
- ssize_t remaining = __bch2_btree_u64s_remaining(b,
- btree_bkey_last(b, bset_tree_last(b)));
-
- BUG_ON(remaining < 0);
-
- if (bset_written(b, btree_bset_last(b)))
- return 0;
-
- return remaining;
-}
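A worked example of the space accounting above, with made-up numbers:

/*
 * Assume btree_buf_bytes(b) == 4096, so total == 512 u64s. If the last
 * bset ends 3000 bytes (375 u64s) into the buffer and
 * b->whiteout_u64s == 10:
 *
 *	used      = 375 + 10 + 1 (reserved for bch2_varint_decode) = 386
 *	remaining = 512 - 386 = 126 u64s
 */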
-
-#define BTREE_WRITE_SET_U64s_BITS 9
-
-static inline unsigned btree_write_set_buffer(struct btree *b)
-{
- /*
- * Could buffer up larger amounts of keys for btrees with larger keys,
- * pending benchmarking:
- */
- return 8 << BTREE_WRITE_SET_U64s_BITS;
-}
-
-static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b)
-{
- struct bset_tree *t = bset_tree_last(b);
- struct btree_node_entry *bne = max(write_block(b),
- (void *) btree_bkey_last(b, t));
- ssize_t remaining_space =
- __bch2_btree_u64s_remaining(b, bne->keys.start);
-
- if (unlikely(bset_written(b, bset(b, t)))) {
- if (b->written + block_sectors(c) <= btree_sectors(c))
- return bne;
- } else {
- if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&
- remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
- return bne;
- }
-
- return NULL;
-}
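Worked numbers for the unwritten case above:

/*
 *	btree_write_set_buffer(b) = 8 << 9    = 4096 bytes
 *	remaining-space threshold = 4096 >> 3 = 512 u64s
 *
 * i.e. start a new bset once the current one holds more than 4096
 * bytes of keys and more than 512 u64s of space remain for it.
 */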
-
-static inline void push_whiteout(struct btree *b, struct bpos pos)
-{
- struct bkey_packed k;
-
- BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s);
- EBUG_ON(btree_node_just_written(b));
-
- if (!bkey_pack_pos(&k, pos, b)) {
- struct bkey *u = (void *) &k;
-
- bkey_init(u);
- u->p = pos;
- }
-
- k.needs_whiteout = true;
-
- b->whiteout_u64s += k.u64s;
- bkey_p_copy(unwritten_whiteouts_start(b), &k);
-}
-
-/*
- * write lock must be held on @b (else the dirty bset that we were going to
- * insert into could be written out from under us)
- */
-static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s)
-{
- if (unlikely(btree_node_need_rewrite(b)))
- return false;
-
- return u64s <= bch2_btree_keys_u64s_remaining(b);
-}
-
-void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
-
-bool bch2_btree_interior_updates_flush(struct bch_fs *);
-
-void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
-struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
- struct jset_entry *, unsigned long);
-
-void bch2_async_btree_node_rewrites_flush(struct bch_fs *);
-void bch2_do_pending_node_rewrites(struct bch_fs *);
-void bch2_free_pending_node_rewrites(struct bch_fs *);
-
-void bch2_btree_reserve_cache_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_btree_interior_update_exit(struct bch_fs *);
-void bch2_fs_btree_interior_update_init_early(struct bch_fs *);
-int bch2_fs_btree_interior_update_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
deleted file mode 100644
index 2c09d19dd621..000000000000
--- a/fs/bcachefs/btree_write_buffer.c
+++ /dev/null
@@ -1,883 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "disk_accounting.h"
-#include "error.h"
-#include "extents.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-
-#include <linux/prefetch.h>
-#include <linux/sort.h>
-
-static int bch2_btree_write_buffer_journal_flush(struct journal *,
- struct journal_entry_pin *, u64);
-
-static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
-{
- return (cmp_int(l->hi, r->hi) ?:
- cmp_int(l->mi, r->mi) ?:
- cmp_int(l->lo, r->lo)) >= 0;
-}
-
-static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
-{
-#ifdef CONFIG_X86_64
- int cmp;
-
- asm("mov (%[l]), %%rax;"
- "sub (%[r]), %%rax;"
- "mov 8(%[l]), %%rax;"
- "sbb 8(%[r]), %%rax;"
- "mov 16(%[l]), %%rax;"
- "sbb 16(%[r]), %%rax;"
- : "=@ccae" (cmp)
- : [l] "r" (l), [r] "r" (r)
- : "rax", "cc");
-
- EBUG_ON(cmp != __wb_key_ref_cmp(l, r));
- return cmp;
-#else
- return __wb_key_ref_cmp(l, r);
-#endif
-}
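A hedged userspace illustration of why the sub/sbb chain above matches the cmp_int() chain in __wb_key_ref_cmp(): the carry flag out of a full 192-bit subtract (limbs at offsets 0/8/16, least significant first) is clear exactly when l >= r. This sketch assumes the GCC/Clang __builtin_sub_overflow() builtin; the function name is hypothetical:

#include <stdbool.h>
#include <stdint.h>

/* l[0] is the least significant limb, matching lo/mi/hi above. */
static bool u192_ge(const uint64_t l[3], const uint64_t r[3])
{
	uint64_t borrow = 0;

	for (int i = 0; i < 3; i++) {
		uint64_t d;

		/* d = l[i] - r[i] - borrow, like sub/sbb: */
		bool b1 = __builtin_sub_overflow(l[i], r[i], &d);
		bool b2 = __builtin_sub_overflow(d, borrow, &d);

		borrow = b1 | b2;
	}

	/* No borrow out of the top limb <=> l >= r (the "=@ccae" above). */
	return !borrow;
}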
-
-static int wb_key_seq_cmp(const void *_l, const void *_r)
-{
- const struct btree_write_buffered_key *l = _l;
- const struct btree_write_buffered_key *r = _r;
-
- return cmp_int(l->journal_seq, r->journal_seq);
-}
-
-/* Compare excluding idx, the low 24 bits: */
-static inline bool wb_key_eq(const void *_l, const void *_r)
-{
- const struct wb_key_ref *l = _l;
- const struct wb_key_ref *r = _r;
-
- return !((l->hi ^ r->hi)|
- (l->mi ^ r->mi)|
- ((l->lo >> 24) ^ (r->lo >> 24)));
-}
-
-static noinline void wb_sort(struct wb_key_ref *base, size_t num)
-{
- size_t n = num, a = num / 2;
-
-	if (!a)		/* num < 2 */
- return;
-
- for (;;) {
- size_t b, c, d;
-
- if (a) /* Building heap: sift down --a */
- --a;
- else if (--n) /* Sorting: Extract root to --n */
- swap(base[0], base[n]);
- else /* Sort complete */
- break;
-
- /*
- * Sift element at "a" down into heap. This is the
- * "bottom-up" variant, which significantly reduces
-		 * comparison calls: we find the sift-down path all
- * the way to the leaves (one compare per level), then
- * backtrack to find where to insert the target element.
- *
- * Because elements tend to sift down close to the leaves,
- * this uses fewer compares than doing two per level
- * on the way down. (A bit more than half as many on
- * average, 3/4 worst-case.)
- */
- for (b = a; c = 2*b + 1, (d = c + 1) < n;)
- b = wb_key_ref_cmp(base + c, base + d) ? c : d;
- if (d == n) /* Special case last leaf with no sibling */
- b = c;
-
- /* Now backtrack from "b" to the correct location for "a" */
- while (b != a && wb_key_ref_cmp(base + a, base + b))
- b = (b - 1) / 2;
- c = b; /* Where "a" belongs */
- while (b != a) { /* Shift it into place */
- b = (b - 1) / 2;
- swap(base[b], base[c]);
- }
- }
-}
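A minimal, self-contained userspace sketch of the same bottom-up heapsort, using plain u64 keys in place of struct wb_key_ref (names are made up; the control flow mirrors wb_sort() above):

#include <stddef.h>
#include <stdint.h>

static void u64_swap(uint64_t *a, uint64_t *b)
{
	uint64_t t = *a;
	*a = *b;
	*b = t;
}

static void u64_heapsort(uint64_t *base, size_t num)
{
	size_t n = num, a = num / 2;

	if (!a)			/* num < 2 */
		return;

	for (;;) {
		size_t b, c, d;

		if (a)			/* building the heap */
			--a;
		else if (--n)		/* extracting the max to the end */
			u64_swap(&base[0], &base[n]);
		else			/* sort complete */
			break;

		/* Sift down to a leaf (one compare per level)... */
		for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
			b = base[c] >= base[d] ? c : d;
		if (d == n)		/* lone leaf with no sibling */
			b = c;

		/* ...then backtrack to where base[a] belongs: */
		while (b != a && base[a] >= base[b])
			b = (b - 1) / 2;
		c = b;
		while (b != a) {
			b = (b - 1) / 2;
			u64_swap(&base[b], &base[c]);
		}
	}
}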
-
-static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
- struct btree_iter *iter,
- struct btree_write_buffered_key *wb)
-{
- struct btree_path *path = btree_iter_path(trans, iter);
-
- bch2_btree_node_unlock_write(trans, path, path->l[0].b);
-
- trans->journal_res.seq = wb->journal_seq;
-
- return bch2_trans_update(trans, iter, &wb->k,
- BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_journal_res|
- BCH_TRANS_COMMIT_journal_reclaim);
-}
-
-static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
- struct btree_write_buffered_key *wb,
- bool *write_locked,
- bool *accounting_accumulated,
- size_t *fast)
-{
- struct btree_path *path;
- int ret;
-
- EBUG_ON(!wb->journal_seq);
- EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq);
- EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
-
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- return ret;
-
- if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) {
- struct bkey u;
- struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u);
-
- if (k.k->type == KEY_TYPE_accounting)
- bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k),
- bkey_s_c_to_accounting(k));
- }
- *accounting_accumulated = true;
-
- /*
- * We can't clone a path that has write locks: unshare it now, before
- * set_pos and traverse():
- */
- if (btree_iter_path(trans, iter)->ref > 1)
- iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_);
-
- path = btree_iter_path(trans, iter);
-
- if (!*write_locked) {
- ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c);
- if (ret)
- return ret;
-
- bch2_btree_node_prep_for_write(trans, path, path->l[0].b);
- *write_locked = true;
- }
-
- if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) {
- *write_locked = false;
- return wb_flush_one_slowpath(trans, iter, wb);
- }
-
- bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
- (*fast)++;
- return 0;
-}
-
-/*
- * Update a btree with a write buffered key using the journal seq of the
- * original write buffer insert.
- *
- * It is not safe to rejournal the key once it has been inserted into the write
- * buffer because that may break recovery ordering. For example, the key may
- * have already been modified in the active write buffer in a seq that comes
- * before the current transaction. If we were to journal this key again and
- * crash, recovery would process updates in the wrong order.
- */
-static int
-btree_write_buffered_insert(struct btree_trans *trans,
- struct btree_write_buffered_key *wb)
-{
- struct btree_iter iter;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
- BTREE_ITER_cached|BTREE_ITER_intent);
-
- trans->journal_res.seq = wb->journal_seq;
-
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(trans, &iter, &wb->k,
- BTREE_UPDATE_internal_snapshot_node);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb)
-{
- struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer);
- struct journal *j = &c->journal;
-
- if (!wb->inc.keys.nr)
- return;
-
- bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin,
- bch2_btree_write_buffer_journal_flush);
-
- darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr));
- darray_resize(&wb->sorted, wb->flushing.keys.size);
-
- if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) {
- swap(wb->flushing.keys, wb->inc.keys);
- goto out;
- }
-
- size_t nr = min(darray_room(wb->flushing.keys),
- wb->sorted.size - wb->flushing.keys.nr);
- nr = min(nr, wb->inc.keys.nr);
-
- memcpy(&darray_top(wb->flushing.keys),
- wb->inc.keys.data,
- sizeof(wb->inc.keys.data[0]) * nr);
-
- memmove(wb->inc.keys.data,
- wb->inc.keys.data + nr,
- sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr));
-
- wb->flushing.keys.nr += nr;
- wb->inc.keys.nr -= nr;
-out:
- if (!wb->inc.keys.nr)
- bch2_journal_pin_drop(j, &wb->inc.pin);
- else
- bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin,
- bch2_btree_write_buffer_journal_flush);
-
- if (j->watermark) {
- spin_lock(&j->lock);
- bch2_journal_set_watermark(j);
- spin_unlock(&j->lock);
- }
-
- BUG_ON(wb->sorted.size < wb->flushing.keys.nr);
-}
-
-int bch2_btree_write_buffer_insert_err(struct btree_trans *trans,
- enum btree_id btree, struct bkey_i *k)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "attempting to do write buffer update on non wb btree=");
- bch2_btree_id_to_text(&buf, btree);
- prt_str(&buf, "\n");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
-
- bch2_fs_inconsistent(c, "%s", buf.buf);
- printbuf_exit(&buf);
- return -EROFS;
-}
-
-static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct journal *j = &c->journal;
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- struct btree_iter iter = { NULL };
- size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0;
- bool write_locked = false;
- bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
- int ret = 0;
-
- ret = bch2_journal_error(&c->journal);
- if (ret)
- return ret;
-
- bch2_trans_unlock(trans);
- bch2_trans_begin(trans);
-
- mutex_lock(&wb->inc.lock);
- move_keys_from_inc_to_flushing(wb);
- mutex_unlock(&wb->inc.lock);
-
- for (size_t i = 0; i < wb->flushing.keys.nr; i++) {
- wb->sorted.data[i].idx = i;
- wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree;
- memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos));
- }
- wb->sorted.nr = wb->flushing.keys.nr;
-
- /*
- * We first sort so that we can detect and skip redundant updates, and
- * then we attempt to flush in sorted btree order, as this is most
- * efficient.
- *
- * However, since we're not flushing in the order they appear in the
- * journal we won't be able to drop our journal pin until everything is
- * flushed - which means this could deadlock the journal if we weren't
- * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail
- * if it would block taking a journal reservation.
- *
- * If that happens, simply skip the key so we can optimistically insert
- * as many keys as possible in the fast path.
- */
- wb_sort(wb->sorted.data, wb->sorted.nr);
-
- darray_for_each(wb->sorted, i) {
- struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
-
- if (unlikely(!btree_type_uses_write_buffer(k->btree))) {
- ret = bch2_btree_write_buffer_insert_err(trans, k->btree, &k->k);
- goto err;
- }
-
- for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
- prefetch(&wb->flushing.keys.data[n->idx]);
-
- BUG_ON(!k->journal_seq);
-
- if (!accounting_replay_done &&
- k->k.k.type == KEY_TYPE_accounting) {
- slowpath++;
- continue;
- }
-
- if (i + 1 < &darray_top(wb->sorted) &&
- wb_key_eq(i, i + 1)) {
- struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
-
- if (k->k.k.type == KEY_TYPE_accounting &&
- n->k.k.type == KEY_TYPE_accounting)
- bch2_accounting_accumulate(bkey_i_to_accounting(&n->k),
- bkey_i_to_s_c_accounting(&k->k));
-
- overwritten++;
- n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
- k->journal_seq = 0;
- continue;
- }
-
- if (write_locked) {
- struct btree_path *path = btree_iter_path(trans, &iter);
-
- if (path->btree_id != i->btree ||
- bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) {
- bch2_btree_node_unlock_write(trans, path, path->l[0].b);
- write_locked = false;
-
- ret = lockrestart_do(trans,
- bch2_btree_iter_traverse(&iter) ?:
- bch2_foreground_maybe_merge(trans, iter.path, 0,
- BCH_WATERMARK_reclaim|
- BCH_TRANS_COMMIT_journal_reclaim|
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc));
- if (ret)
- goto err;
- }
- }
-
- if (!iter.path || iter.btree_id != k->btree) {
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p,
- BTREE_ITER_intent|BTREE_ITER_all_snapshots);
- }
-
- bch2_btree_iter_set_pos(&iter, k->k.k.p);
- btree_iter_path(trans, &iter)->preserve = false;
-
- bool accounting_accumulated = false;
- do {
- if (race_fault()) {
- ret = -BCH_ERR_journal_reclaim_would_deadlock;
- break;
- }
-
- ret = wb_flush_one(trans, &iter, k, &write_locked,
- &accounting_accumulated, &fast);
- if (!write_locked)
- bch2_trans_begin(trans);
- } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
-
- if (!ret) {
- k->journal_seq = 0;
- } else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
- slowpath++;
- ret = 0;
- } else
- break;
- }
-
- if (write_locked) {
- struct btree_path *path = btree_iter_path(trans, &iter);
- bch2_btree_node_unlock_write(trans, path, path->l[0].b);
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- goto err;
-
- if (slowpath) {
- /*
- * Flush in the order they were present in the journal, so that
- * we can release journal pins:
-		 *
-		 * The fast path zeroed the journal seq of keys it successfully
-		 * flushed, so we can skip those here.
- */
- trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
-
- sort(wb->flushing.keys.data,
- wb->flushing.keys.nr,
- sizeof(wb->flushing.keys.data[0]),
- wb_key_seq_cmp, NULL);
-
- darray_for_each(wb->flushing.keys, i) {
- if (!i->journal_seq)
- continue;
-
- if (!accounting_replay_done &&
- i->k.k.type == KEY_TYPE_accounting) {
- could_not_insert++;
- continue;
- }
-
- if (!could_not_insert)
- bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
- bch2_btree_write_buffer_journal_flush);
-
- bch2_trans_begin(trans);
-
- ret = commit_do(trans, NULL, NULL,
- BCH_WATERMARK_reclaim|
- BCH_TRANS_COMMIT_journal_reclaim|
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc|
-					BCH_TRANS_COMMIT_no_journal_res,
- btree_write_buffered_insert(trans, i));
- if (ret)
- goto err;
-
- i->journal_seq = 0;
- }
-
- /*
- * If journal replay hasn't finished with accounting keys we
- * can't flush accounting keys at all - condense them and leave
- * them for next time.
- *
- * Q: Can the write buffer overflow?
-		 * A: Shouldn't be any actual risk. It's just new accounting
-		 * updates that the write buffer can't flush, and those are only
-		 * going to be generated by interior btree node updates, as
-		 * journal replay has to split/rewrite nodes to make room for
-		 * its updates.
-		 *
-		 * And for those new accounting updates, updates to the same
-		 * counters get accumulated as they're flushed from the journal
-		 * to the write buffer - see the patch adding eytzinger tree
-		 * accumulation. So we could only overflow if the number of
- * distinct counters touched somehow was very large.
- */
- if (could_not_insert) {
- struct btree_write_buffered_key *dst = wb->flushing.keys.data;
-
- darray_for_each(wb->flushing.keys, i)
- if (i->journal_seq)
- *dst++ = *i;
- wb->flushing.keys.nr = dst - wb->flushing.keys.data;
- }
- }
-err:
- if (ret || !could_not_insert) {
- bch2_journal_pin_drop(j, &wb->flushing.pin);
- wb->flushing.keys.nr = 0;
- }
-
- bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
- trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0);
- return ret;
-}
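
The slowpath above re-sorts wb->flushing.keys by journal sequence so that
journal pins can be released in order as each key commits. A minimal sketch of
such a comparator, assuming it orders purely by journal_seq (the real
wb_key_seq_cmp is defined earlier in this file, outside this hunk):

/* hypothetical stand-in for the journal-seq comparator passed to sort() */
static int wb_key_seq_cmp_sketch(const void *_l, const void *_r)
{
	const struct btree_write_buffered_key *l = _l;
	const struct btree_write_buffered_key *r = _r;

	return cmp_int(l->journal_seq, r->journal_seq);
}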
-
-static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
-{
- struct journal_keys_to_wb dst;
- int ret = 0;
-
- bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
-
- for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) {
- jset_entry_for_each_key(entry, k) {
- ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k);
- if (ret)
- goto out;
- }
-
- entry->type = BCH_JSET_ENTRY_btree_keys;
- }
-out:
- ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret;
- return ret;
-}
-
-static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq)
-{
- struct journal *j = &c->journal;
- struct journal_buf *buf;
- bool blocked;
- int ret = 0;
-
- while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq, &blocked))) {
- ret = bch2_journal_keys_to_write_buffer(c, buf);
-
- if (!blocked && !ret) {
- spin_lock(&j->lock);
- buf->need_flush_to_write_buffer = false;
- spin_unlock(&j->lock);
- }
-
- mutex_unlock(&j->buf_lock);
-
- if (blocked) {
- bch2_journal_unblock(j);
- break;
- }
- }
-
- return ret;
-}
-
-static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 max_seq,
- bool *did_work)
-{
- struct bch_fs *c = trans->c;
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- int ret = 0, fetch_from_journal_err;
-
- do {
- bch2_trans_unlock(trans);
-
- fetch_from_journal_err = fetch_wb_keys_from_journal(c, max_seq);
-
- *did_work |= wb->inc.keys.nr || wb->flushing.keys.nr;
-
- /*
- * On memory allocation failure, bch2_btree_write_buffer_flush_locked()
- * is not guaranteed to empty wb->inc:
- */
- mutex_lock(&wb->flushing.lock);
- ret = bch2_btree_write_buffer_flush_locked(trans);
- mutex_unlock(&wb->flushing.lock);
- } while (!ret &&
- (fetch_from_journal_err ||
- (wb->inc.pin.seq && wb->inc.pin.seq <= max_seq) ||
- (wb->flushing.pin.seq && wb->flushing.pin.seq <= max_seq)));
-
- return ret;
-}
-
-static int bch2_btree_write_buffer_journal_flush(struct journal *j,
- struct journal_entry_pin *_pin, u64 seq)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- bool did_work = false;
-
- return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq, &did_work));
-}
-
-int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- bool did_work = false;
-
- trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_);
-
- return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal), &did_work);
-}
-
-/*
- * The write buffer requires flushing when going RO: keys in the journal for the
- * write buffer don't have a journal pin yet
- */
-bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *c)
-{
- if (bch2_journal_error(&c->journal))
- return false;
-
- bool did_work = false;
- bch2_trans_run(c, btree_write_buffer_flush_seq(trans,
- journal_cur_seq(&c->journal), &did_work));
- return did_work;
-}
-
-int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- int ret = 0;
-
- if (mutex_trylock(&wb->flushing.lock)) {
- ret = bch2_btree_write_buffer_flush_locked(trans);
- mutex_unlock(&wb->flushing.lock);
- }
-
- return ret;
-}
-
-int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
-
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer))
- return -BCH_ERR_erofs_no_writes;
-
- int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans);
- bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
- return ret;
-}
-
-/*
- * In check and repair code, when checking references to write buffer btrees,
- * we need to issue a flush before declaring a definitive error: this issues a
- * flush if this is a key we haven't yet checked.
- */
-int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans,
- struct bkey_s_c referring_k,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
- struct bkey_buf tmp;
- int ret = 0;
-
- bch2_bkey_buf_init(&tmp);
-
- if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) {
- if (trace_write_buffer_maybe_flush_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, referring_k);
- trace_write_buffer_maybe_flush(trans, _RET_IP_, buf.buf);
- printbuf_exit(&buf);
- }
-
- bch2_bkey_buf_reassemble(&tmp, c, referring_k);
-
- if (bkey_is_btree_ptr(referring_k.k)) {
- bch2_trans_unlock(trans);
- bch2_btree_interior_updates_flush(c);
- }
-
- ret = bch2_btree_write_buffer_flush_sync(trans);
- if (ret)
- goto err;
-
- bch2_bkey_buf_copy(last_flushed, c, tmp.k);
- ret = -BCH_ERR_transaction_restart_write_buffer_flush;
- }
-err:
- bch2_bkey_buf_exit(&tmp, c);
- return ret;
-}
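
A caller in fsck typically keeps one last_flushed key per check pass: on a
lookup miss it calls the helper, which either flushes and restarts the
transaction or, if this key was already flushed once, lets the caller treat
the miss as a real inconsistency. A sketch of that calling convention (the
function name and final error code here are illustrative, not an actual
check):

static int check_reference_sketch(struct btree_trans *trans,
				  struct bkey_s_c referring_k,
				  struct bkey_buf *last_flushed)
{
	/* lookup of the referenced key failed; it may still be buffered: */
	int ret = bch2_btree_write_buffer_maybe_flush(trans, referring_k,
						      last_flushed);
	if (ret)
		return ret; /* typically transaction_restart_write_buffer_flush */

	/* already flushed for this key - now it's a definitive error: */
	return -EINVAL;	/* placeholder for a real fsck error code */
}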
-
-static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work);
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- int ret;
-
- mutex_lock(&wb->flushing.lock);
- do {
- ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans));
- } while (!ret && bch2_btree_write_buffer_should_flush(c));
- mutex_unlock(&wb->flushing.lock);
-
- bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
-}
-
-static void wb_accounting_sort(struct btree_write_buffer *wb)
-{
- eytzinger0_sort(wb->accounting.data, wb->accounting.nr,
- sizeof(wb->accounting.data[0]),
- wb_key_cmp, NULL);
-}
-
-int bch2_accounting_key_to_wb_slowpath(struct bch_fs *c, enum btree_id btree,
- struct bkey_i_accounting *k)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- struct btree_write_buffered_key new = { .btree = btree };
-
- bkey_copy(&new.k, &k->k_i);
-
- int ret = darray_push(&wb->accounting, new);
- if (ret)
- return ret;
-
- wb_accounting_sort(wb);
- return 0;
-}
-
-int bch2_journal_key_to_wb_slowpath(struct bch_fs *c,
- struct journal_keys_to_wb *dst,
- enum btree_id btree, struct bkey_i *k)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- int ret;
-retry:
- ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL);
- if (!ret && dst->wb == &wb->flushing)
- ret = darray_resize(&wb->sorted, wb->flushing.keys.size);
-
- if (unlikely(ret)) {
- if (dst->wb == &c->btree_write_buffer.flushing) {
- mutex_unlock(&dst->wb->lock);
- dst->wb = &c->btree_write_buffer.inc;
- bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin,
- bch2_btree_write_buffer_journal_flush);
- goto retry;
- }
-
- return ret;
- }
-
- dst->room = darray_room(dst->wb->keys);
- if (dst->wb == &wb->flushing)
- dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
- BUG_ON(!dst->room);
- BUG_ON(!dst->seq);
-
- struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
- wb_k->journal_seq = dst->seq;
- wb_k->btree = btree;
- bkey_copy(&wb_k->k, k);
- dst->wb->keys.nr++;
- dst->room--;
- return 0;
-}
-
-void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- if (mutex_trylock(&wb->flushing.lock)) {
- mutex_lock(&wb->inc.lock);
- move_keys_from_inc_to_flushing(wb);
-
- /*
- * Attempt to skip wb->inc, and add keys directly to
- * wb->flushing, saving us a copy later:
- */
-
- if (!wb->inc.keys.nr) {
- dst->wb = &wb->flushing;
- } else {
- mutex_unlock(&wb->flushing.lock);
- dst->wb = &wb->inc;
- }
- } else {
- mutex_lock(&wb->inc.lock);
- dst->wb = &wb->inc;
- }
-
- dst->room = darray_room(dst->wb->keys);
- if (dst->wb == &wb->flushing)
- dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
- dst->seq = seq;
-
- bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin,
- bch2_btree_write_buffer_journal_flush);
-
- darray_for_each(wb->accounting, i)
- memset(&i->k.v, 0, bkey_val_bytes(&i->k.k));
-}
-
-int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- unsigned live_accounting_keys = 0;
- int ret = 0;
-
- darray_for_each(wb->accounting, i)
- if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&i->k))) {
- i->journal_seq = dst->seq;
- live_accounting_keys++;
- ret = __bch2_journal_key_to_wb(c, dst, i->btree, &i->k);
- if (ret)
- break;
- }
-
- if (live_accounting_keys * 2 < wb->accounting.nr) {
- struct btree_write_buffered_key *dst = wb->accounting.data;
-
- darray_for_each(wb->accounting, src)
- if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&src->k)))
- *dst++ = *src;
- wb->accounting.nr = dst - wb->accounting.data;
- wb_accounting_sort(wb);
- }
-
- if (!dst->wb->keys.nr)
- bch2_journal_pin_drop(&c->journal, &dst->wb->pin);
-
- if (bch2_btree_write_buffer_should_flush(c) &&
- __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) &&
- !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
-
- if (dst->wb == &wb->flushing)
- mutex_unlock(&wb->flushing.lock);
- mutex_unlock(&wb->inc.lock);
-
- return ret;
-}
-
-static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size)
-{
- if (wb->keys.size >= new_size)
- return 0;
-
- if (!mutex_trylock(&wb->lock))
- return -EINTR;
-
- int ret = darray_resize(&wb->keys, new_size);
- mutex_unlock(&wb->lock);
- return ret;
-}
-
-int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- return wb_keys_resize(&wb->flushing, new_size) ?:
- wb_keys_resize(&wb->inc, new_size);
-}
-
-void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) &&
- !bch2_journal_error(&c->journal));
-
- darray_exit(&wb->accounting);
- darray_exit(&wb->sorted);
- darray_exit(&wb->flushing.keys);
- darray_exit(&wb->inc.keys);
-}
-
-int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- mutex_init(&wb->inc.lock);
- mutex_init(&wb->flushing.lock);
- INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work);
-
- /* Will be resized by journal as needed: */
- unsigned initial_size = 1 << 16;
-
- return darray_make_room(&wb->inc.keys, initial_size) ?:
- darray_make_room(&wb->flushing.keys, initial_size) ?:
- darray_make_room(&wb->sorted, initial_size);
-}
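
Taken together, the write buffer's lifecycle as seen from the rest of the
filesystem looks roughly like the sketch below - an illustration of the call
ordering using the API from the header that follows, with error handling
elided, not code from the tree:

static int write_buffer_lifecycle_sketch(struct bch_fs *c)
{
	int ret = bch2_fs_btree_write_buffer_init(c);	/* at fs bringup */
	if (ret)
		return ret;

	/* ... journal writes stream keys in via bch2_journal_key_to_wb() ... */

	ret = bch2_trans_run(c, bch2_btree_write_buffer_tryflush(trans));

	bch2_fs_btree_write_buffer_exit(c);		/* at fs shutdown */
	return ret;
}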
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
deleted file mode 100644
index d535cea28bde..000000000000
--- a/fs/bcachefs/btree_write_buffer.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H
-#define _BCACHEFS_BTREE_WRITE_BUFFER_H
-
-#include "bkey.h"
-#include "disk_accounting.h"
-
-static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4;
-}
-
-static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4;
-}
-
-struct btree_trans;
-int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
-bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *);
-int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *);
-int bch2_btree_write_buffer_tryflush(struct btree_trans *);
-
-struct bkey_buf;
-int bch2_btree_write_buffer_maybe_flush(struct btree_trans *, struct bkey_s_c, struct bkey_buf *);
-
-struct journal_keys_to_wb {
- struct btree_write_buffer_keys *wb;
- size_t room;
- u64 seq;
-};
-
-static inline int wb_key_cmp(const void *_l, const void *_r)
-{
- const struct btree_write_buffered_key *l = _l;
- const struct btree_write_buffered_key *r = _r;
-
- return cmp_int(l->btree, r->btree) ?: bpos_cmp(l->k.k.p, r->k.k.p);
-}
-
-int bch2_accounting_key_to_wb_slowpath(struct bch_fs *,
- enum btree_id, struct bkey_i_accounting *);
-
-static inline int bch2_accounting_key_to_wb(struct bch_fs *c,
- enum btree_id btree, struct bkey_i_accounting *k)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- struct btree_write_buffered_key search;
- search.btree = btree;
- search.k.k.p = k->k.p;
-
- unsigned idx = eytzinger0_find(wb->accounting.data, wb->accounting.nr,
- sizeof(wb->accounting.data[0]),
- wb_key_cmp, &search);
-
- if (idx >= wb->accounting.nr)
- return bch2_accounting_key_to_wb_slowpath(c, btree, k);
-
- struct bkey_i_accounting *dst = bkey_i_to_accounting(&wb->accounting.data[idx].k);
- bch2_accounting_accumulate(dst, accounting_i_to_s_c(k));
- return 0;
-}
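
The accumulate step above is what lets the write buffer hold one entry per
distinct accounting counter rather than one per update. A simplified sketch
of the idea, assuming accounting values behave as arrays of s64 counters that
add element-wise (the real bch2_accounting_accumulate() lives in
disk_accounting.h):

static inline void accounting_accumulate_sketch(s64 *dst, const s64 *src,
						unsigned nr_counters)
{
	for (unsigned i = 0; i < nr_counters; i++)
		dst[i] += src[i];	/* deltas sum; they don't overwrite */
}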
-
-int bch2_journal_key_to_wb_slowpath(struct bch_fs *,
- struct journal_keys_to_wb *,
- enum btree_id, struct bkey_i *);
-
-static inline int __bch2_journal_key_to_wb(struct bch_fs *c,
- struct journal_keys_to_wb *dst,
- enum btree_id btree, struct bkey_i *k)
-{
- if (unlikely(!dst->room))
- return bch2_journal_key_to_wb_slowpath(c, dst, btree, k);
-
- struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
- wb_k->journal_seq = dst->seq;
- wb_k->btree = btree;
- bkey_copy(&wb_k->k, k);
- dst->wb->keys.nr++;
- dst->room--;
- return 0;
-}
-
-static inline int bch2_journal_key_to_wb(struct bch_fs *c,
- struct journal_keys_to_wb *dst,
- enum btree_id btree, struct bkey_i *k)
-{
- EBUG_ON(!dst->seq);
-
- return k->k.type == KEY_TYPE_accounting
- ? bch2_accounting_key_to_wb(c, btree, bkey_i_to_accounting(k))
- : __bch2_journal_key_to_wb(c, dst, btree, k);
-}
-
-void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64);
-int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *);
-
-int bch2_btree_write_buffer_resize(struct bch_fs *, size_t);
-void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
-int bch2_fs_btree_write_buffer_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */
diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h
deleted file mode 100644
index e9e76e20f43b..000000000000
--- a/fs/bcachefs/btree_write_buffer_types.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
-#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
-
-#include "darray.h"
-#include "journal_types.h"
-
-#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4
-#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
-
-struct wb_key_ref {
-union {
- struct {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- unsigned idx:24;
- u8 pos[sizeof(struct bpos)];
- enum btree_id btree:8;
-#else
- enum btree_id btree:8;
- u8 pos[sizeof(struct bpos)];
- unsigned idx:24;
-#endif
- } __packed;
- struct {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- u64 lo;
- u64 mi;
- u64 hi;
-#else
- u64 hi;
- u64 mi;
- u64 lo;
-#endif
- };
-};
-};
-
-struct btree_write_buffered_key {
- enum btree_id btree:8;
- u64 journal_seq:56;
- __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
-};
-
-struct btree_write_buffer_keys {
- DARRAY(struct btree_write_buffered_key) keys;
- struct journal_entry_pin pin;
- struct mutex lock;
-};
-
-struct btree_write_buffer {
- DARRAY(struct wb_key_ref) sorted;
- struct btree_write_buffer_keys inc;
- struct btree_write_buffer_keys flushing;
- struct work_struct flush_work;
-
- DARRAY(struct btree_write_buffered_key) accounting;
-};
-
-#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
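
The wb_key_ref union above packs (btree id, position, index) so that the
whole reference can also be viewed as three u64s, letting the sort in
btree_write_buffer.c compare refs with plain integer compares instead of a
field-by-field comparison. A sketch of that trick, under the assumption that
pos is stored in comparison order (on little-endian machines the producer
would have to byte-swap it for this to hold):

static inline int wb_key_ref_cmp_sketch(const struct wb_key_ref *l,
					const struct wb_key_ref *r)
{
	/* btree id lands in the most significant bits, idx in the least */
	return  cmp_int(l->hi, r->hi) ?:
		cmp_int(l->mi, r->mi) ?:
		cmp_int(l->lo, r->lo);
}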
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
deleted file mode 100644
index e56ef623ebc1..000000000000
--- a/fs/bcachefs/buckets.c
+++ /dev/null
@@ -1,1322 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Code for manipulating bucket marks for garbage collection.
- *
- * Copyright 2014 Datera, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "backpointers.h"
-#include "bset.h"
-#include "btree_gc.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "buckets_waiting_for_journal.h"
-#include "disk_accounting.h"
-#include "ec.h"
-#include "error.h"
-#include "inode.h"
-#include "movinggc.h"
-#include "rebalance.h"
-#include "recovery.h"
-#include "recovery_passes.h"
-#include "reflink.h"
-#include "replicas.h"
-#include "subvolume.h"
-#include "trace.h"
-
-#include <linux/preempt.h>
-
-void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
-{
- memset(usage, 0, sizeof(*usage));
- acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, dev_usage_u64s());
-}
-
-static u64 reserve_factor(u64 r)
-{
- return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
-}
-
-static struct bch_fs_usage_short
-__bch2_fs_usage_read_short(struct bch_fs *c)
-{
- struct bch_fs_usage_short ret;
- u64 data, reserved;
-
- ret.capacity = c->capacity -
- percpu_u64_get(&c->usage->hidden);
-
- data = percpu_u64_get(&c->usage->data) +
- percpu_u64_get(&c->usage->btree);
- reserved = percpu_u64_get(&c->usage->reserved) +
- percpu_u64_get(c->online_reserved);
-
- ret.used = min(ret.capacity, data + reserve_factor(reserved));
- ret.free = ret.capacity - ret.used;
-
- ret.nr_inodes = percpu_u64_get(&c->usage->nr_inodes);
-
- return ret;
-}
-
-struct bch_fs_usage_short
-bch2_fs_usage_read_short(struct bch_fs *c)
-{
- struct bch_fs_usage_short ret;
-
- percpu_down_read(&c->mark_lock);
- ret = __bch2_fs_usage_read_short(c);
- percpu_up_read(&c->mark_lock);
-
- return ret;
-}
-
-void bch2_dev_usage_to_text(struct printbuf *out,
- struct bch_dev *ca,
- struct bch_dev_usage *usage)
-{
- if (out->nr_tabstops < 5) {
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 12);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- }
-
- prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n");
-
- for (unsigned i = 0; i < BCH_DATA_NR; i++) {
- bch2_prt_data_type(out, i);
- prt_printf(out, "\t%llu\r%llu\r%llu\r\n",
- usage->d[i].buckets,
- usage->d[i].sectors,
- usage->d[i].fragmented);
- }
-
- prt_printf(out, "capacity\t%llu\r\n", ca->mi.nbuckets);
-}
-
-static int bch2_check_fix_ptr(struct btree_trans *trans,
- struct bkey_s_c k,
- struct extent_ptr_decoded p,
- const union bch_extent_entry *entry,
- bool *do_update)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
- if (!ca) {
- if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID,
- trans, ptr_to_invalid_device,
- "pointer to missing device %u\n"
- "while marking %s",
- p.ptr.dev,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- *do_update = true;
- return 0;
- }
-
- struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- if (!g) {
- if (fsck_err(trans, ptr_to_invalid_device,
- "pointer to invalid bucket on device %u\n"
- "while marking %s",
- p.ptr.dev,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- *do_update = true;
- goto out;
- }
-
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
-
- if (fsck_err_on(!g->gen_valid,
- trans, ptr_to_missing_alloc_key,
- "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- if (!p.ptr.cached) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- } else {
- *do_update = true;
- }
- }
-
- if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
- trans, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen, g->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- if (!p.ptr.cached &&
- (g->data_type != BCH_DATA_btree ||
- data_type == BCH_DATA_btree)) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- g->data_type = 0;
- g->stripe_sectors = 0;
- g->dirty_sectors = 0;
- g->cached_sectors = 0;
- } else {
- *do_update = true;
- }
- }
-
- if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
- trans, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- *do_update = true;
-
- if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
- trans, stale_dirty_ptr,
- "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen, g->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- *do_update = true;
-
- if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
- goto out;
-
- if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type),
- trans, ptr_bucket_data_type_mismatch,
- "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_type_str(g->data_type),
- bch2_data_type_str(data_type),
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- if (data_type == BCH_DATA_btree) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- g->data_type = data_type;
- g->stripe_sectors = 0;
- g->dirty_sectors = 0;
- g->cached_sectors = 0;
- } else {
- *do_update = true;
- }
- }
-
- if (p.has_ec) {
- struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
-
- if (fsck_err_on(!m || !m->alive,
- trans, ptr_to_missing_stripe,
- "pointer to nonexistent stripe %llu\n"
- "while marking %s",
- (u64) p.ec.idx,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- *do_update = true;
-
- if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p),
- trans, ptr_to_incorrect_stripe,
- "pointer does not match stripe %llu\n"
- "while marking %s",
- (u64) p.ec.idx,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- *do_update = true;
- }
-out:
-fsck_err:
- bch2_dev_put(ca);
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_check_fix_ptrs(struct btree_trans *trans,
- enum btree_id btree, unsigned level, struct bkey_s_c k,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry_c;
- struct extent_ptr_decoded p = { 0 };
- bool do_update = false;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
- ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update);
- if (ret)
- goto err;
- }
-
- if (do_update) {
- if (flags & BTREE_TRIGGER_is_root) {
- bch_err(c, "cannot update btree roots yet");
- ret = -EINVAL;
- goto err;
- }
-
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto err;
-
- rcu_read_lock();
- bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev));
- rcu_read_unlock();
-
- if (level) {
- /*
- * We don't want to drop btree node pointers - if the
- * btree node isn't there anymore, the read path will
- * sort it out:
- */
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- rcu_read_lock();
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- struct bucket *g = PTR_GC_BUCKET(ca, ptr);
-
- ptr->gen = g->gen;
- }
- rcu_read_unlock();
- } else {
- struct bkey_ptrs ptrs;
- union bch_extent_entry *entry;
-
- rcu_read_lock();
-restart_drop_ptrs:
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) {
- struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
- struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry);
-
- if ((p.ptr.cached &&
- (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) ||
- (!p.ptr.cached &&
- gen_cmp(p.ptr.gen, g->gen) < 0) ||
- gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX ||
- (g->data_type &&
- g->data_type != data_type)) {
- bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr);
- goto restart_drop_ptrs;
- }
- }
- rcu_read_unlock();
-again:
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- bkey_extent_entry_for_each(ptrs, entry) {
- if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
- struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
- entry->stripe_ptr.idx);
- union bch_extent_entry *next_ptr;
-
- bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
- if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
- goto found;
- next_ptr = NULL;
-found:
- if (!next_ptr) {
- bch_err(c, "aieee, found stripe ptr with no data ptr");
- continue;
- }
-
- if (!m || !m->alive ||
- !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
- &next_ptr->ptr,
- m->sectors)) {
- bch2_bkey_extent_entry_drop(new, entry);
- goto again;
- }
- }
- }
- }
-
- if (0) {
- printbuf_reset(&buf);
- bch2_bkey_val_to_text(&buf, c, k);
- bch_info(c, "updated %s", buf.buf);
-
- printbuf_reset(&buf);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
- bch_info(c, "new key %s", buf.buf);
- }
-
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
- BTREE_ITER_intent|BTREE_ITER_all_snapshots);
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(trans, &iter, new,
- BTREE_UPDATE_internal_snapshot_node|
- BTREE_TRIGGER_norun);
- bch2_trans_iter_exit(trans, &iter);
- if (ret)
- goto err;
-
- if (level)
- bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
- }
-err:
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
- struct bkey_s_c k,
- const struct bch_extent_ptr *ptr,
- s64 sectors, enum bch_data_type ptr_data_type,
- u8 b_gen, u8 bucket_data_type,
- u32 *bucket_sectors)
-{
- struct bch_fs *c = trans->c;
- size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
- struct printbuf buf = PRINTBUF;
- bool inserting = sectors > 0;
- int ret = 0;
-
- BUG_ON(!sectors);
-
- if (gen_after(ptr->gen, b_gen)) {
- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
- log_fsck_err(trans, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
- "while marking %s",
- ptr->dev, bucket_nr, b_gen,
- bch2_data_type_str(bucket_data_type ?: ptr_data_type),
- ptr->gen,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- if (inserting)
- goto err;
- goto out;
- }
-
- if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
- log_fsck_err(trans, ptr_too_stale,
- "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
- "while marking %s",
- ptr->dev, bucket_nr, b_gen,
- bch2_data_type_str(bucket_data_type ?: ptr_data_type),
- ptr->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- if (inserting)
- goto err;
- goto out;
- }
-
- if (b_gen != ptr->gen && ptr->cached) {
- ret = 1;
- goto out;
- }
-
- if (b_gen != ptr->gen) {
- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
- log_fsck_err(trans, stale_dirty_ptr,
- "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
- "while marking %s",
- ptr->dev, bucket_nr, b_gen,
- bucket_gen_get(ca, bucket_nr),
- bch2_data_type_str(bucket_data_type ?: ptr_data_type),
- ptr->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- if (inserting)
- goto err;
- goto out;
- }
-
- if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) {
- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
- log_fsck_err(trans, ptr_bucket_data_type_mismatch,
- "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
- "while marking %s",
- ptr->dev, bucket_nr, b_gen,
- bch2_data_type_str(bucket_data_type),
- bch2_data_type_str(ptr_data_type),
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- if (inserting)
- goto err;
- goto out;
- }
-
- if ((u64) *bucket_sectors + sectors > U32_MAX) {
- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
- log_fsck_err(trans, bucket_sector_count_overflow,
- "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
- "while marking %s",
- ptr->dev, bucket_nr, b_gen,
- bch2_data_type_str(bucket_data_type ?: ptr_data_type),
- *bucket_sectors, sectors,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- if (inserting)
- goto err;
- sectors = -*bucket_sectors;
- }
-
- *bucket_sectors += sectors;
-out:
- printbuf_exit(&buf);
- return ret;
-err:
-fsck_err:
- bch2_dump_trans_updates(trans);
- bch2_inconsistent_error(c);
- ret = -BCH_ERR_bucket_ref_update;
- goto out;
-}
-
-void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
- static int warned_disk_usage = 0;
- bool warn = false;
-
- percpu_down_read(&c->mark_lock);
- struct bch_fs_usage_base *src = &trans->fs_usage_delta;
-
- s64 added = src->btree + src->data + src->reserved;
-
- /*
- * Not allowed to reduce sectors_available except by getting a
- * reservation:
- */
- s64 should_not_have_added = added - (s64) disk_res_sectors;
- if (unlikely(should_not_have_added > 0)) {
- u64 old, new;
-
- old = atomic64_read(&c->sectors_available);
- do {
- new = max_t(s64, 0, old - should_not_have_added);
- } while (!atomic64_try_cmpxchg(&c->sectors_available,
- &old, new));
-
- added -= should_not_have_added;
- warn = true;
- }
-
- if (added > 0) {
- trans->disk_res->sectors -= added;
- this_cpu_sub(*c->online_reserved, added);
- }
-
- preempt_disable();
- struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
- acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64));
- preempt_enable();
- percpu_up_read(&c->mark_lock);
-
- if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
- bch2_trans_inconsistent(trans,
-			"disk usage increased %lli more than %llu sectors reserved",
- should_not_have_added, disk_res_sectors);
-}
-
-/* KEY_TYPE_extent: */
-
-static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca,
- struct bkey_s_c k,
- const struct extent_ptr_decoded *p,
- s64 sectors, enum bch_data_type ptr_data_type,
- struct bch_alloc_v4 *a,
- bool insert)
-{
- u32 *dst_sectors = p->has_ec ? &a->stripe_sectors :
- !p->ptr.cached ? &a->dirty_sectors :
- &a->cached_sectors;
- int ret = bch2_bucket_ref_update(trans, ca, k, &p->ptr, sectors, ptr_data_type,
- a->gen, a->data_type, dst_sectors);
-
- if (ret)
- return ret;
- if (insert)
- alloc_data_type_set(a, ptr_data_type);
- return 0;
-}
-
-static int bch2_trigger_pointer(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, struct extent_ptr_decoded p,
- const union bch_extent_entry *entry,
- s64 *sectors,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- bool insert = !(flags & BTREE_TRIGGER_overwrite);
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct bkey_i_backpointer bp;
- bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp);
-
- *sectors = insert ? bp.v.bucket_len : -(s64) bp.v.bucket_len;
-
- struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
- if (unlikely(!ca)) {
- if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID)
- ret = -BCH_ERR_trigger_pointer;
- goto err;
- }
-
- struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr);
-
- if (flags & BTREE_TRIGGER_transactional) {
- struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0);
- ret = PTR_ERR_OR_ZERO(a) ?:
- __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert);
- if (ret)
- goto err;
-
- ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert);
- if (ret)
- goto err;
- }
-
- if (flags & BTREE_TRIGGER_gc) {
- struct bucket *g = gc_bucket(ca, bucket.offset);
- if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
- p.ptr.dev,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = -BCH_ERR_trigger_pointer;
- goto err;
- }
-
- bucket_lock(g);
- struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
- ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert);
- alloc_to_bucket(g, new);
- bucket_unlock(g);
-
- if (!ret)
- ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
- }
-err:
- bch2_dev_put(ca);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
- struct bkey_s_c k,
- struct extent_ptr_decoded p,
- enum bch_data_type data_type,
- s64 sectors,
- enum btree_iter_update_trigger_flags flags)
-{
- if (flags & BTREE_TRIGGER_transactional) {
- struct btree_iter iter;
- struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_stripes, POS(0, p.ec.idx),
- BTREE_ITER_with_updates, stripe);
- int ret = PTR_ERR_OR_ZERO(s);
- if (unlikely(ret)) {
- bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
- "pointer to nonexistent stripe %llu",
- (u64) p.ec.idx);
- goto err;
- }
-
- if (!bch2_ptr_matches_stripe(&s->v, p)) {
- bch2_trans_inconsistent(trans,
- "stripe pointer doesn't match stripe %llu",
- (u64) p.ec.idx);
- ret = -BCH_ERR_trigger_stripe_pointer;
- goto err;
- }
-
- stripe_blockcount_set(&s->v, p.ec.block,
- stripe_blockcount_get(&s->v, p.ec.block) +
- sectors);
-
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_replicas,
- };
- bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
- acc.replicas.data_type = data_type;
- ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
- }
-
- if (flags & BTREE_TRIGGER_gc) {
- struct bch_fs *c = trans->c;
-
- struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL);
- if (!m) {
- bch_err(c, "error allocating memory for gc_stripes, idx %llu",
- (u64) p.ec.idx);
- return -BCH_ERR_ENOMEM_mark_stripe_ptr;
- }
-
- gc_stripe_lock(m);
-
- if (!m || !m->alive) {
- gc_stripe_unlock(m);
- struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, k);
- bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n while marking %s",
- (u64) p.ec.idx, buf.buf);
- printbuf_exit(&buf);
- bch2_inconsistent_error(c);
- return -BCH_ERR_trigger_stripe_pointer;
- }
-
- m->block_sectors[p.ec.block] += sectors;
-
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_replicas,
- };
- memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e));
- gc_stripe_unlock(m);
-
- acc.replicas.data_type = data_type;
- int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, true);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static int __trigger_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k,
- enum btree_iter_update_trigger_flags flags,
- s64 *replicas_sectors)
-{
- bool gc = flags & BTREE_TRIGGER_gc;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
- ? BCH_DATA_btree
- : BCH_DATA_user;
- int ret = 0;
-
- struct disk_accounting_pos acc_replicas_key = {
- .type = BCH_DISK_ACCOUNTING_replicas,
- .replicas.data_type = data_type,
- .replicas.nr_devs = 0,
- .replicas.nr_required = 1,
- };
-
- unsigned cur_compression_type = 0;
- u64 compression_acct[3] = { 1, 0, 0 };
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- s64 disk_sectors = 0;
- ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags);
- if (ret < 0)
- return ret;
-
- bool stale = ret > 0;
-
- if (p.ptr.cached && stale)
- continue;
-
- if (p.ptr.cached) {
- ret = bch2_mod_dev_cached_sectors(trans, p.ptr.dev, disk_sectors, gc);
- if (ret)
- return ret;
- } else if (!p.has_ec) {
- *replicas_sectors += disk_sectors;
- replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev);
- } else {
- ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
- if (ret)
- return ret;
-
- /*
- * There may be other dirty pointers in this extent, but
- * if so they're not required for mounting if we have an
- * erasure coded pointer in this extent:
- */
- acc_replicas_key.replicas.nr_required = 0;
- }
-
- if (cur_compression_type &&
- cur_compression_type != p.crc.compression_type) {
- if (flags & BTREE_TRIGGER_overwrite)
- bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
-
- ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
- compression, cur_compression_type);
- if (ret)
- return ret;
-
- compression_acct[0] = 1;
- compression_acct[1] = 0;
- compression_acct[2] = 0;
- }
-
- cur_compression_type = p.crc.compression_type;
- if (p.crc.compression_type) {
- compression_acct[1] += p.crc.uncompressed_size;
- compression_acct[2] += p.crc.compressed_size;
- }
- }
-
- if (acc_replicas_key.replicas.nr_devs) {
- ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc);
- if (ret)
- return ret;
- }
-
- if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) {
- ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, snapshot, k.k->p.snapshot);
- if (ret)
- return ret;
- }
-
- if (cur_compression_type) {
- if (flags & BTREE_TRIGGER_overwrite)
- bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
-
- ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
- compression, cur_compression_type);
- if (ret)
- return ret;
- }
-
- if (level) {
- ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, btree, btree_id);
- if (ret)
- return ret;
- } else {
- bool insert = !(flags & BTREE_TRIGGER_overwrite);
-
- s64 v[3] = {
- insert ? 1 : -1,
- insert ? k.k->size : -((s64) k.k->size),
- *replicas_sectors,
- };
- ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode);
- if (ret)
- return ret;
- }
-
- return 0;
-}
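
The compression accounting above is keyed by compression type and carries a
triple { nr_extents, uncompressed sectors, compressed sectors }, flushed
whenever the compression type changes and negated wholesale on overwrite so
that inserts and deletes net out. A worked illustration with made-up sizes:

/*
 * One lz4 extent of 128 sectors stored compressed in 40 contributes
 * compression_acct = { 1, 128, 40 } to the lz4 counters on insert;
 * bch2_u64s_neg() turns that into { -1, -128, -40 } when the same
 * extent is overwritten.
 */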
-
-int bch2_trigger_extent(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
- struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
- unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
- unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start;
-
- if (unlikely(flags & BTREE_TRIGGER_check_repair))
- return bch2_check_fix_ptrs(trans, btree, level, new.s_c, flags);
-
- /* if pointers aren't changing - nothing to do: */
- if (new_ptrs_bytes == old_ptrs_bytes &&
- !memcmp(new_ptrs.start,
- old_ptrs.start,
- new_ptrs_bytes))
- return 0;
-
- if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
- s64 old_replicas_sectors = 0, new_replicas_sectors = 0;
-
- if (old.k->type) {
- int ret = __trigger_extent(trans, btree, level, old,
- flags & ~BTREE_TRIGGER_insert,
- &old_replicas_sectors);
- if (ret)
- return ret;
- }
-
- if (new.k->type) {
- int ret = __trigger_extent(trans, btree, level, new.s_c,
- flags & ~BTREE_TRIGGER_overwrite,
- &new_replicas_sectors);
- if (ret)
- return ret;
- }
-
- int need_rebalance_delta = 0;
- s64 need_rebalance_sectors_delta[1] = { 0 };
-
- s64 s = bch2_bkey_sectors_need_rebalance(c, old);
- need_rebalance_delta -= s != 0;
- need_rebalance_sectors_delta[0] -= s;
-
- s = bch2_bkey_sectors_need_rebalance(c, new.s_c);
- need_rebalance_delta += s != 0;
- need_rebalance_sectors_delta[0] += s;
-
- if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
- int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
- new.k->p, need_rebalance_delta > 0);
- if (ret)
- return ret;
- }
-
- if (need_rebalance_sectors_delta[0]) {
- int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
- need_rebalance_sectors_delta, rebalance_work);
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-
-/* KEY_TYPE_reservation */
-
-static int __trigger_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level, struct bkey_s_c k,
- enum btree_iter_update_trigger_flags flags)
-{
- if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
- s64 sectors[1] = { k.k->size };
-
- if (flags & BTREE_TRIGGER_overwrite)
- sectors[0] = -sectors[0];
-
- return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, sectors,
- persistent_reserved, bkey_s_c_to_reservation(k).v->nr_replicas);
- }
-
- return 0;
-}
-
-int bch2_trigger_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags);
-}
-
-/* Mark superblocks: */
-
-static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
- struct bch_dev *ca, u64 b,
- enum bch_data_type type,
- unsigned sectors)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- int ret = 0;
-
- struct bkey_i_alloc_v4 *a =
- bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(ca->dev_idx, b));
- if (IS_ERR(a))
- return PTR_ERR(a);
-
- if (a->v.data_type && type && a->v.data_type != type) {
- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
- log_fsck_err(trans, bucket_metadata_type_mismatch,
- "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
- "while marking %s",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_type_str(a->v.data_type),
- bch2_data_type_str(type),
- bch2_data_type_str(type));
- ret = -BCH_ERR_metadata_bucket_inconsistency;
- goto err;
- }
-
- if (a->v.data_type != type ||
- a->v.dirty_sectors != sectors) {
- a->v.data_type = type;
- a->v.dirty_sectors = sectors;
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
- }
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca,
- u64 b, enum bch_data_type data_type, unsigned sectors,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- struct bucket *g = gc_bucket(ca, b);
- if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s",
- ca->dev_idx, bch2_data_type_str(data_type)))
- goto err;
-
- bucket_lock(g);
- struct bch_alloc_v4 old = bucket_m_to_alloc(*g);
-
- if (bch2_fs_inconsistent_on(g->data_type &&
- g->data_type != data_type, c,
- "different types of data in same bucket: %s, %s",
- bch2_data_type_str(g->data_type),
- bch2_data_type_str(data_type)))
- goto err_unlock;
-
- if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
- "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size",
- ca->dev_idx, b, g->gen,
- bch2_data_type_str(g->data_type ?: data_type),
- g->dirty_sectors, sectors))
- goto err_unlock;
-
- g->data_type = data_type;
- g->dirty_sectors += sectors;
- struct bch_alloc_v4 new = bucket_m_to_alloc(*g);
- bucket_unlock(g);
- ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
- return ret;
-err_unlock:
- bucket_unlock(g);
-err:
- return -BCH_ERR_metadata_bucket_inconsistency;
-}
-
-int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
- struct bch_dev *ca, u64 b,
- enum bch_data_type type, unsigned sectors,
- enum btree_iter_update_trigger_flags flags)
-{
- BUG_ON(type != BCH_DATA_free &&
- type != BCH_DATA_sb &&
- type != BCH_DATA_journal);
-
- /*
- * Backup superblock might be past the end of our normal usable space:
- */
- if (b >= ca->mi.nbuckets)
- return 0;
-
- if (flags & BTREE_TRIGGER_gc)
- return bch2_mark_metadata_bucket(trans, ca, b, type, sectors, flags);
- else if (flags & BTREE_TRIGGER_transactional)
- return commit_do(trans, NULL, NULL, 0,
- __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
- else
- BUG();
-}
-
-static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
- struct bch_dev *ca, u64 start, u64 end,
- enum bch_data_type type, u64 *bucket, unsigned *bucket_sectors,
- enum btree_iter_update_trigger_flags flags)
-{
- do {
- u64 b = sector_to_bucket(ca, start);
- unsigned sectors =
- min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
-
- if (b != *bucket && *bucket_sectors) {
- int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
- type, *bucket_sectors, flags);
- if (ret)
- return ret;
-
- *bucket_sectors = 0;
- }
-
- *bucket = b;
- *bucket_sectors += sectors;
- start += sectors;
- } while (start < end);
-
- return 0;
-}
-
-static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
-
- mutex_lock(&c->sb_lock);
- struct bch_sb_layout layout = ca->disk_sb.sb->layout;
- mutex_unlock(&c->sb_lock);
-
- u64 bucket = 0;
- unsigned i, bucket_sectors = 0;
- int ret;
-
- for (i = 0; i < layout.nr_superblocks; i++) {
- u64 offset = le64_to_cpu(layout.sb_offset[i]);
-
- if (offset == BCH_SB_SECTOR) {
- ret = bch2_trans_mark_metadata_sectors(trans, ca,
- 0, BCH_SB_SECTOR,
- BCH_DATA_sb, &bucket, &bucket_sectors, flags);
- if (ret)
- return ret;
- }
-
- ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
- offset + (1 << layout.sb_max_size_bits),
- BCH_DATA_sb, &bucket, &bucket_sectors, flags);
- if (ret)
- return ret;
- }
-
- if (bucket_sectors) {
- ret = bch2_trans_mark_metadata_bucket(trans, ca,
- bucket, BCH_DATA_sb, bucket_sectors, flags);
- if (ret)
- return ret;
- }
-
- for (i = 0; i < ca->journal.nr; i++) {
- ret = bch2_trans_mark_metadata_bucket(trans, ca,
- ca->journal.buckets[i],
- BCH_DATA_journal, ca->mi.bucket_size, flags);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca,
- enum btree_iter_update_trigger_flags flags)
-{
- int ret = bch2_trans_run(c,
- __bch2_trans_mark_dev_sb(trans, ca, flags));
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c,
- enum btree_iter_update_trigger_flags flags)
-{
- for_each_online_member(c, ca) {
- int ret = bch2_trans_mark_dev_sb(c, ca, flags);
- if (ret) {
- percpu_ref_put(&ca->io_ref);
- return ret;
- }
- }
-
- return 0;
-}
-
-int bch2_trans_mark_dev_sbs(struct bch_fs *c)
-{
- return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional);
-}
-
-bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b)
-{
- struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
- u64 b_offset = bucket_to_sector(ca, b);
- u64 b_end = bucket_to_sector(ca, b + 1);
- unsigned i;
-
- if (!b)
- return true;
-
- for (i = 0; i < layout->nr_superblocks; i++) {
- u64 offset = le64_to_cpu(layout->sb_offset[i]);
- u64 end = offset + (1 << layout->sb_max_size_bits);
-
- if (!(offset >= b_end || end <= b_offset))
- return true;
- }
-
- for (i = 0; i < ca->journal.nr; i++)
- if (b == ca->journal.buckets[i])
- return true;
-
- return false;
-}
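
The test on sb_offset above is the standard half-open interval overlap check:
two ranges [a_start, a_end) and [b_start, b_end) intersect iff neither lies
entirely before the other. Pulled out as an illustrative helper:

static inline bool ranges_overlap_sketch(u64 a_start, u64 a_end,
					 u64 b_start, u64 b_end)
{
	return !(b_start >= a_end || b_end <= a_start);
}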
-
-/* Disk reservations: */
-
-#define SECTORS_CACHE 1024
-
-int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
- u64 sectors, enum bch_reservation_flags flags)
-{
- struct bch_fs_pcpu *pcpu;
- u64 old, get;
- u64 sectors_available;
- int ret;
-
- percpu_down_read(&c->mark_lock);
- preempt_disable();
- pcpu = this_cpu_ptr(c->pcpu);
-
- if (sectors <= pcpu->sectors_available)
- goto out;
-
- old = atomic64_read(&c->sectors_available);
- do {
- get = min((u64) sectors + SECTORS_CACHE, old);
-
- if (get < sectors) {
- preempt_enable();
- goto recalculate;
- }
- } while (!atomic64_try_cmpxchg(&c->sectors_available,
- &old, old - get));
-
- pcpu->sectors_available += get;
-
-out:
- pcpu->sectors_available -= sectors;
- this_cpu_add(*c->online_reserved, sectors);
- res->sectors += sectors;
-
- preempt_enable();
- percpu_up_read(&c->mark_lock);
- return 0;
-
-recalculate:
- mutex_lock(&c->sectors_available_lock);
-
- percpu_u64_set(&c->pcpu->sectors_available, 0);
- sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
-
- if (sectors_available && (flags & BCH_DISK_RESERVATION_PARTIAL))
- sectors = min(sectors, sectors_available);
-
- if (sectors <= sectors_available ||
- (flags & BCH_DISK_RESERVATION_NOFAIL)) {
- atomic64_set(&c->sectors_available,
- max_t(s64, 0, sectors_available - sectors));
- this_cpu_add(*c->online_reserved, sectors);
- res->sectors += sectors;
- ret = 0;
- } else {
- atomic64_set(&c->sectors_available, sectors_available);
- ret = -BCH_ERR_ENOSPC_disk_reservation;
- }
-
- mutex_unlock(&c->sectors_available_lock);
- percpu_up_read(&c->mark_lock);
-
- return ret;
-}
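
The SECTORS_CACHE batching above is what keeps small reservations off the
shared atomic. A worked illustration with made-up numbers:

/*
 * A reservation for 8 sectors pulls min(8 + SECTORS_CACHE, available)
 * = 1032 sectors from c->sectors_available in a single cmpxchg,
 * leaving pcpu->sectors_available = 1024 after this call subtracts 8.
 * The next ~128 reservations of that size on this CPU then complete
 * entirely in the percpu fast path, never touching the shared counter.
 */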
-
-/* Startup/shutdown: */
-
-void bch2_buckets_nouse_free(struct bch_fs *c)
-{
- for_each_member_device(c, ca) {
- kvfree_rcu_mightsleep(ca->buckets_nouse);
- ca->buckets_nouse = NULL;
- }
-}
-
-int bch2_buckets_nouse_alloc(struct bch_fs *c)
-{
- for_each_member_device(c, ca) {
- BUG_ON(ca->buckets_nouse);
-
- ca->buckets_nouse = bch2_kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) *
- sizeof(unsigned long),
- GFP_KERNEL|__GFP_ZERO);
- if (!ca->buckets_nouse) {
- bch2_dev_put(ca);
- return -BCH_ERR_ENOMEM_buckets_nouse;
- }
- }
-
- return 0;
-}
-
-static void bucket_gens_free_rcu(struct rcu_head *rcu)
-{
- struct bucket_gens *buckets =
- container_of(rcu, struct bucket_gens, rcu);
-
- kvfree(buckets);
-}
-
-int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
-{
- struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
- bool resize = ca->bucket_gens != NULL;
- int ret;
-
- if (resize)
- lockdep_assert_held(&c->state_lock);
-
- if (resize && ca->buckets_nouse)
- return -BCH_ERR_no_resize_with_buckets_nouse;
-
- bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets),
- GFP_KERNEL|__GFP_ZERO);
- if (!bucket_gens) {
- ret = -BCH_ERR_ENOMEM_bucket_gens;
- goto err;
- }
-
- bucket_gens->first_bucket = ca->mi.first_bucket;
- bucket_gens->nbuckets = nbuckets;
- bucket_gens->nbuckets_minus_first =
- bucket_gens->nbuckets - bucket_gens->first_bucket;
-
- old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
-
- if (resize) {
- bucket_gens->nbuckets = min(bucket_gens->nbuckets,
- old_bucket_gens->nbuckets);
- bucket_gens->nbuckets_minus_first =
- bucket_gens->nbuckets - bucket_gens->first_bucket;
- memcpy(bucket_gens->b,
- old_bucket_gens->b,
- bucket_gens->nbuckets);
- }
-
- rcu_assign_pointer(ca->bucket_gens, bucket_gens);
- bucket_gens = old_bucket_gens;
-
- nbuckets = ca->mi.nbuckets;
-
- ret = 0;
-err:
- if (bucket_gens)
- call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
-
- return ret;
-}
-
-void bch2_dev_buckets_free(struct bch_dev *ca)
-{
- kvfree(ca->buckets_nouse);
- kvfree(rcu_dereference_protected(ca->bucket_gens, 1));
- free_percpu(ca->usage);
-}
-
-int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
-{
- ca->usage = alloc_percpu(struct bch_dev_usage);
- if (!ca->usage)
- return -BCH_ERR_ENOMEM_usage_init;
-
- return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
-}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
deleted file mode 100644
index c5363256e363..000000000000
--- a/fs/bcachefs/buckets.h
+++ /dev/null
@@ -1,368 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Code for manipulating bucket marks for garbage collection.
- *
- * Copyright 2014 Datera, Inc.
- */
-
-#ifndef _BUCKETS_H
-#define _BUCKETS_H
-
-#include "buckets_types.h"
-#include "extents.h"
-#include "sb-members.h"
-
-static inline u64 sector_to_bucket(const struct bch_dev *ca, sector_t s)
-{
- return div_u64(s, ca->mi.bucket_size);
-}
-
-static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
-{
- return ((sector_t) b) * ca->mi.bucket_size;
-}
-
-static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
-{
- u32 remainder;
-
- div_u64_rem(s, ca->mi.bucket_size, &remainder);
- return remainder;
-}
-
-static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, u32 *offset)
-{
- return div_u64_rem(s, ca->mi.bucket_size, offset);
-}
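
A worked example of the bucket arithmetic above, assuming a made-up bucket
size of 512 sectors:

static inline void bucket_math_example(const struct bch_dev *ca)
{
	u32 offset;

	/* with ca->mi.bucket_size == 512: sector 1500 = bucket 2, offset 476 */
	BUG_ON(sector_to_bucket(ca, 1500) != 2);
	BUG_ON(bucket_to_sector(ca, 2) != 1024);
	BUG_ON(sector_to_bucket_and_offset(ca, 1500, &offset) != 2);
	BUG_ON(offset != 476);
}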
-
-#define for_each_bucket(_b, _buckets) \
- for (_b = (_buckets)->b + (_buckets)->first_bucket; \
- _b < (_buckets)->b + (_buckets)->nbuckets; _b++)
-
-static inline void bucket_unlock(struct bucket *b)
-{
- BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
-
- clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock);
- wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR);
-}
-
-static inline void bucket_lock(struct bucket *b)
-{
- wait_on_bit_lock((void *) &b->lock, BUCKET_LOCK_BITNR,
- TASK_UNINTERRUPTIBLE);
-}
-
-static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
-{
- return bucket_valid(ca, b)
- ? genradix_ptr(&ca->buckets_gc, b)
- : NULL;
-}
-
-static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
-{
- return rcu_dereference_check(ca->bucket_gens,
- lockdep_is_held(&ca->fs->state_lock));
-}
-
-static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
-{
- struct bucket_gens *gens = bucket_gens(ca);
-
- if (b - gens->first_bucket >= gens->nbuckets_minus_first)
- return NULL;
- return gens->b + b;
-}
-
-static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b)
-{
- u8 *gen = bucket_gen(ca, b);
- return gen ? *gen : -1;
-}
-
-static inline int bucket_gen_get(struct bch_dev *ca, size_t b)
-{
- rcu_read_lock();
- int ret = bucket_gen_get_rcu(ca, b);
- rcu_read_unlock();
- return ret;
-}
-
-static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
-{
- return sector_to_bucket(ca, ptr->offset);
-}
-
-static inline struct bpos PTR_BUCKET_POS(const struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
-{
- return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
-}
-
-static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_dev *ca,
- const struct bch_extent_ptr *ptr,
- u32 *bucket_offset)
-{
- return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset));
-}
-
-static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
-{
- return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr));
-}
-
-static inline enum bch_data_type ptr_data_type(const struct bkey *k,
- const struct bch_extent_ptr *ptr)
-{
- if (bkey_is_btree_ptr(k))
- return BCH_DATA_btree;
-
- return ptr->cached ? BCH_DATA_cached : BCH_DATA_user;
-}
-
-static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
-{
- EBUG_ON(sectors < 0);
-
- return crc_is_compressed(p.crc)
- ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
- p.crc.uncompressed_size)
- : sectors;
-}
-
-static inline int gen_cmp(u8 a, u8 b)
-{
- return (s8) (a - b);
-}
-
-static inline int gen_after(u8 a, u8 b)
-{
- return max(0, gen_cmp(a, b));
-}
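
Bucket gens are 8 bits and wrap, so a plain '>' would misorder generations
across the wrap point; the (s8) cast in gen_cmp() makes "newer" mean "ahead
by fewer than 128". A worked illustration:

static inline void gen_cmp_example(void)
{
	/* 2 - 250 wraps to 8: gen 2 counts as 8 generations after gen 250 */
	BUG_ON(gen_cmp(2, 250) != 8);
	BUG_ON(!gen_after(2, 250));
	/* and the reverse compares as older (-8), so gen_after() is 0 */
	BUG_ON(gen_after(250, 2) != 0);
}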
-
-static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
-{
- int gen = bucket_gen_get_rcu(ca, PTR_BUCKET_NR(ca, ptr));
- return gen < 0 ? gen : gen_after(gen, ptr->gen);
-}
-
-/**
- * dev_ptr_stale() - check if a pointer points into a bucket that has been
- * invalidated.
- */
-static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
-{
- rcu_read_lock();
- int ret = dev_ptr_stale_rcu(ca, ptr);
- rcu_read_unlock();
- return ret;
-}
-
-/* Device usage: */
-
-void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *);
-static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
-{
- struct bch_dev_usage ret;
-
- bch2_dev_usage_read_fast(ca, &ret);
- return ret;
-}
-
-void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage *);
-
-static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
-{
- s64 reserved = 0;
-
- switch (watermark) {
- case BCH_WATERMARK_NR:
- BUG();
- case BCH_WATERMARK_stripe:
- reserved += ca->mi.nbuckets >> 6;
- fallthrough;
- case BCH_WATERMARK_normal:
- reserved += ca->mi.nbuckets >> 6;
- fallthrough;
- case BCH_WATERMARK_copygc:
- reserved += ca->nr_btree_reserve;
- fallthrough;
- case BCH_WATERMARK_btree:
- reserved += ca->nr_btree_reserve;
- fallthrough;
- case BCH_WATERMARK_btree_copygc:
- case BCH_WATERMARK_reclaim:
- case BCH_WATERMARK_interior_updates:
- break;
- }
-
- return reserved;
-}
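
The fallthrough chain above accumulates reserves: each less strict watermark
holds back everything the stricter ones do plus its own slice. With made-up
numbers - nbuckets = 1 << 20 (so nbuckets >> 6 = 16384) and
nr_btree_reserve = 512 - the reserve works out per watermark as:

/*
 *   BCH_WATERMARK_stripe:  16384 + 16384 + 512 + 512 = 33792 buckets
 *   BCH_WATERMARK_normal:          16384 + 512 + 512 = 17408
 *   BCH_WATERMARK_copygc:                  512 + 512 =  1024
 *   BCH_WATERMARK_btree:                         512
 *   BCH_WATERMARK_reclaim and stricter:            0
 */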
-
-static inline u64 dev_buckets_free(struct bch_dev *ca,
- struct bch_dev_usage usage,
- enum bch_watermark watermark)
-{
- return max_t(s64, 0,
- usage.d[BCH_DATA_free].buckets -
- ca->nr_open_buckets -
- bch2_dev_buckets_reserved(ca, watermark));
-}
-
-static inline u64 __dev_buckets_available(struct bch_dev *ca,
- struct bch_dev_usage usage,
- enum bch_watermark watermark)
-{
- return max_t(s64, 0,
- usage.d[BCH_DATA_free].buckets
- + usage.d[BCH_DATA_cached].buckets
- + usage.d[BCH_DATA_need_gc_gens].buckets
- + usage.d[BCH_DATA_need_discard].buckets
- - ca->nr_open_buckets
- - bch2_dev_buckets_reserved(ca, watermark));
-}
-
-static inline u64 dev_buckets_available(struct bch_dev *ca,
- enum bch_watermark watermark)
-{
- return __dev_buckets_available(ca, bch2_dev_usage_read(ca), watermark);
-}
-
-/* Filesystem usage: */
-
-static inline unsigned dev_usage_u64s(void)
-{
- return sizeof(struct bch_dev_usage) / sizeof(u64);
-}
-
-struct bch_fs_usage_short
-bch2_fs_usage_read_short(struct bch_fs *);
-
-int bch2_bucket_ref_update(struct btree_trans *, struct bch_dev *,
- struct bkey_s_c, const struct bch_extent_ptr *,
- s64, enum bch_data_type, u8, u8, u32 *);
-
-int bch2_check_fix_ptrs(struct btree_trans *,
- enum btree_id, unsigned, struct bkey_s_c,
- enum btree_iter_update_trigger_flags);
-
-int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
-({ \
- int ret = 0; \
- \
- if (_old.k->type) \
- ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_insert); \
- if (!ret && _new.k->type) \
- ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_overwrite);\
- ret; \
-})
-
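trigger_run_overwrite_then_insert() is a GNU statement expression: it calls the per-key trigger once for the overwritten key with BTREE_TRIGGER_insert masked off, then (if that succeeded) once for the inserted key with BTREE_TRIGGER_overwrite masked off, so a single-key function can implement a full old/new trigger. A hedged usage sketch — mark_one_key and example_trigger are illustrative names, not claims about any particular caller:

	/* Per-key helper: sees either the old key (with the overwrite flag
	 * still set) or the new key (with the insert flag still set). */
	static int mark_one_key(struct btree_trans *trans, enum btree_id btree,
				unsigned level, struct bkey_s_c k,
				enum btree_iter_update_trigger_flags flags);

	static int example_trigger(struct btree_trans *trans, enum btree_id btree,
				   unsigned level, struct bkey_s_c old,
				   struct bkey_s new,
				   enum btree_iter_update_trigger_flags flags)
	{
		return trigger_run_overwrite_then_insert(mark_one_key, trans,
						btree, level, old, new, flags);
	}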
-void bch2_trans_account_disk_usage_change(struct btree_trans *);
-
-int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, u64,
- enum bch_data_type, unsigned,
- enum btree_iter_update_trigger_flags);
-int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *,
- enum btree_iter_update_trigger_flags);
-int bch2_trans_mark_dev_sbs_flags(struct bch_fs *,
- enum btree_iter_update_trigger_flags);
-int bch2_trans_mark_dev_sbs(struct bch_fs *);
-
-bool bch2_is_superblock_bucket(struct bch_dev *, u64);
-
-static inline const char *bch2_data_type_str(enum bch_data_type type)
-{
- return type < BCH_DATA_NR
- ? __bch2_data_types[type]
- : "(invalid data type)";
-}
-
-/* disk reservations: */
-
-static inline void bch2_disk_reservation_put(struct bch_fs *c,
- struct disk_reservation *res)
-{
- if (res->sectors) {
- this_cpu_sub(*c->online_reserved, res->sectors);
- res->sectors = 0;
- }
-}
-
-enum bch_reservation_flags {
- BCH_DISK_RESERVATION_NOFAIL = 1 << 0,
- BCH_DISK_RESERVATION_PARTIAL = 1 << 1,
-};
-
-int __bch2_disk_reservation_add(struct bch_fs *, struct disk_reservation *,
- u64, enum bch_reservation_flags);
-
-static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
- u64 sectors, enum bch_reservation_flags flags)
-{
-#ifdef __KERNEL__
- u64 old, new;
-
- old = this_cpu_read(c->pcpu->sectors_available);
- do {
- if (sectors > old)
- return __bch2_disk_reservation_add(c, res, sectors, flags);
-
- new = old - sectors;
- } while (!this_cpu_try_cmpxchg(c->pcpu->sectors_available, &old, new));
-
- this_cpu_add(*c->online_reserved, sectors);
- res->sectors += sectors;
- return 0;
-#else
- return __bch2_disk_reservation_add(c, res, sectors, flags);
-#endif
-}
-
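The fast path here consumes from a per-CPU pool of pre-reserved sectors via this_cpu_try_cmpxchg(), so the common case never touches shared state; only when the local pool runs dry does it fall back to __bch2_disk_reservation_add(), which refills from the global counters. The same shape in a standalone userspace sketch, with a C11 atomic standing in for the per-CPU variable (names hypothetical):

	#include <stdatomic.h>
	#include <stdint.h>

	static _Atomic uint64_t local_pool;	/* stand-in for c->pcpu->sectors_available */

	/* Stand-in for __bch2_disk_reservation_add(): refill from global state. */
	static int slow_path_reserve(uint64_t sectors) { (void) sectors; return -1; }

	static int fast_reserve(uint64_t sectors)
	{
		uint64_t old = atomic_load_explicit(&local_pool, memory_order_relaxed);

		do {
			if (sectors > old)
				return slow_path_reserve(sectors);	/* pool exhausted */
		} while (!atomic_compare_exchange_weak(&local_pool, &old, old - sectors));

		return 0;	/* claimed locally, no global contention */
	}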
-static inline struct disk_reservation
-bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
-{
- return (struct disk_reservation) {
- .sectors = 0,
-#if 0
- /* not used yet: */
- .gen = c->capacity_gen,
-#endif
- .nr_replicas = nr_replicas,
- };
-}
-
-static inline int bch2_disk_reservation_get(struct bch_fs *c,
- struct disk_reservation *res,
- u64 sectors, unsigned nr_replicas,
- int flags)
-{
- *res = bch2_disk_reservation_init(c, nr_replicas);
-
- return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags);
-}
-
-#define RESERVE_FACTOR 6
-
-static inline u64 avail_factor(u64 r)
-{
- return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);
-}
-
-void bch2_buckets_nouse_free(struct bch_fs *);
-int bch2_buckets_nouse_alloc(struct bch_fs *);
-
-int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64);
-void bch2_dev_buckets_free(struct bch_dev *);
-int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *);
-
-#endif /* _BUCKETS_H */
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
deleted file mode 100644
index 900b8680c8b5..000000000000
--- a/fs/bcachefs/buckets_types.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BUCKETS_TYPES_H
-#define _BUCKETS_TYPES_H
-
-#include "bcachefs_format.h"
-#include "util.h"
-
-#define BUCKET_JOURNAL_SEQ_BITS 16
-
-/*
- * Ugly hack alert:
- *
- * We need to cram a spinlock in a single byte, because that's what we have left
- * in struct bucket, and we care about the size of these - during fsck, we need
- * in-memory state for every single bucket on every device.
- *
- * We used to do
- *	while (xchg(&b->lock, 1)) cpu_relax();
- * but, it turns out not all architectures support xchg on a single byte.
- *
- * So now we use bit_spin_lock(), with fun games since we can't burn a whole
- * ulong for this - we just need to make sure the lock bit always ends up in the
- * first byte.
- */
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define BUCKET_LOCK_BITNR 0
-#else
-#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1)
-#endif
-
-union ulong_byte_assert {
- ulong ulong;
- u8 byte;
-};
-
-struct bucket {
- u8 lock;
- u8 gen_valid:1;
- u8 data_type:7;
- u8 gen;
- u8 stripe_redundancy;
- u32 stripe;
- u32 dirty_sectors;
- u32 cached_sectors;
- u32 stripe_sectors;
-} __aligned(sizeof(long));
-
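Given that layout - struct bucket is long-aligned and `lock` is its first byte - the lock can be taken with bit_spin_lock() on the word containing the struct, with BUCKET_LOCK_BITNR picking the bit that lands in byte zero on either endianness. A minimal sketch of lock/unlock helpers under those assumptions (the helper names are not necessarily the tree's API):

	#include <linux/bit_spinlock.h>
	#include <linux/build_bug.h>

	static inline void bucket_lock(struct bucket *b)
	{
		/* Compile-time proof that the lock bit really lives in byte 0: */
		BUILD_BUG_ON(!((union ulong_byte_assert) {
				.ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
		bit_spin_lock(BUCKET_LOCK_BITNR, (unsigned long *) b);
	}

	static inline void bucket_unlock(struct bucket *b)
	{
		bit_spin_unlock(BUCKET_LOCK_BITNR, (unsigned long *) b);
	}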
-struct bucket_gens {
- struct rcu_head rcu;
- u16 first_bucket;
- size_t nbuckets;
- size_t nbuckets_minus_first;
- u8 b[] __counted_by(nbuckets);
-};
-
-struct bch_dev_usage {
- struct bch_dev_usage_type {
- u64 buckets;
- u64 sectors; /* _compressed_ sectors: */
- /*
- * XXX
- * Why do we have this? Isn't it just buckets * bucket_size -
- * sectors?
- */
- u64 fragmented;
- } d[BCH_DATA_NR];
-};
-
-struct bch_fs_usage_base {
- u64 hidden;
- u64 btree;
- u64 data;
- u64 cached;
- u64 reserved;
- u64 nr_inodes;
-};
-
-struct bch_fs_usage_short {
- u64 capacity;
- u64 used;
- u64 free;
- u64 nr_inodes;
-};
-
-/*
- * A reservation for space on disk:
- */
-struct disk_reservation {
- u64 sectors;
- u32 gen;
- unsigned nr_replicas;
-};
-
-#endif /* _BUCKETS_TYPES_H */
diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c
deleted file mode 100644
index c8a488e6b7b8..000000000000
--- a/fs/bcachefs/buckets_waiting_for_journal.c
+++ /dev/null
@@ -1,173 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "buckets_waiting_for_journal.h"
-#include <linux/hash.h>
-#include <linux/random.h>
-
-static inline struct bucket_hashed *
-bucket_hash(struct buckets_waiting_for_journal_table *t,
- unsigned hash_seed_idx, u64 dev_bucket)
-{
- return t->d + hash_64(dev_bucket ^ t->hash_seeds[hash_seed_idx], t->bits);
-}
-
-static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t bits)
-{
- unsigned i;
-
- t->bits = bits;
- for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++)
- get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i]));
- memset(t->d, 0, sizeof(t->d[0]) << t->bits);
-}
-
-u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *b,
- unsigned dev, u64 bucket)
-{
- struct buckets_waiting_for_journal_table *t;
- u64 dev_bucket = (u64) dev << 56 | bucket;
- u64 ret = 0;
-
- mutex_lock(&b->lock);
- t = b->t;
-
- for (unsigned i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
- struct bucket_hashed *h = bucket_hash(t, i, dev_bucket);
-
- if (h->dev_bucket == dev_bucket) {
- ret = h->journal_seq;
- break;
- }
- }
-
- mutex_unlock(&b->lock);
-
- return ret;
-}
-
-static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t,
- struct bucket_hashed *new,
- u64 flushed_seq)
-{
- struct bucket_hashed *last_evicted = NULL;
- unsigned tries, i;
-
- for (tries = 0; tries < 10; tries++) {
- struct bucket_hashed *old, *victim = NULL;
-
- for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
- old = bucket_hash(t, i, new->dev_bucket);
-
- if (old->dev_bucket == new->dev_bucket ||
- old->journal_seq <= flushed_seq) {
- *old = *new;
- return true;
- }
-
- if (last_evicted != old)
- victim = old;
- }
-
- /* hashed to same slot 3 times: */
- if (!victim)
- break;
-
- /* Failed to find an empty slot: */
- swap(*new, *victim);
- last_evicted = victim;
- }
-
- return false;
-}
-
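bucket_table_insert() is a small cuckoo hash: each key has one candidate slot per hash seed, slots whose journal_seq is already flushed count as free, and when every candidate is live one occupant is evicted (swapped into *new) and re-inserted, bounded at ten displacement steps. A toy illustration of the displacement loop, standalone and with just two hash functions (not the kernel code; keys must be nonzero since 0 marks an empty slot):

	#include <stdbool.h>
	#include <stdint.h>

	#define SLOTS 16
	static uint64_t table[SLOTS];	/* 0 == empty */

	static unsigned h1(uint64_t k) { return (k * 0x9E3779B97F4A7C15ULL) >> 60; }
	static unsigned h2(uint64_t k) { return (k * 0xC2B2AE3D27D4EB4FULL) >> 60; }

	static bool cuckoo_insert(uint64_t key)
	{
		for (int tries = 0; tries < 10; tries++) {
			unsigned a = h1(key), b = h2(key);

			if (!table[a]) { table[a] = key; return true; }
			if (!table[b]) { table[b] = key; return true; }

			/* Both candidates occupied: evict one, then re-insert
			 * the evicted key on the next pass. */
			uint64_t tmp = table[a];
			table[a] = key;
			key = tmp;
		}
		return false;	/* displacement chain too long: grow/rehash */
	}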
-int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
- u64 flushed_seq,
- unsigned dev, u64 bucket,
- u64 journal_seq)
-{
- struct buckets_waiting_for_journal_table *t, *n;
- struct bucket_hashed tmp, new = {
- .dev_bucket = (u64) dev << 56 | bucket,
- .journal_seq = journal_seq,
- };
- size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0, nr_rehashes_this_size = 0;
- int ret = 0;
-
- mutex_lock(&b->lock);
-
- if (likely(bucket_table_insert(b->t, &new, flushed_seq)))
- goto out;
-
- t = b->t;
- size = 1UL << t->bits;
- for (i = 0; i < size; i++)
- nr_elements += t->d[i].journal_seq > flushed_seq;
-
- new_bits = ilog2(roundup_pow_of_two(nr_elements * 3));
-realloc:
- n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL);
- if (!n) {
- ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set;
- goto out;
- }
-
-retry_rehash:
- if (nr_rehashes_this_size == 3) {
- new_bits++;
- nr_rehashes_this_size = 0;
- kvfree(n);
- goto realloc;
- }
-
- nr_rehashes++;
- nr_rehashes_this_size++;
-
- bucket_table_init(n, new_bits);
-
- tmp = new;
- BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq));
-
- for (i = 0; i < 1UL << t->bits; i++) {
- if (t->d[i].journal_seq <= flushed_seq)
- continue;
-
- tmp = t->d[i];
- if (!bucket_table_insert(n, &tmp, flushed_seq))
- goto retry_rehash;
- }
-
- b->t = n;
- kvfree(t);
-
- pr_debug("took %zu rehashes, table at %zu/%lu elements",
- nr_rehashes, nr_elements, 1UL << b->t->bits);
-out:
- mutex_unlock(&b->lock);
-
- return ret;
-}
-
-void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c)
-{
- struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
-
- kvfree(b->t);
-}
-
-#define INITIAL_TABLE_BITS 3
-
-int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c)
-{
- struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
-
- mutex_init(&b->lock);
-
- b->t = kvmalloc(sizeof(*b->t) +
- (sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL);
- if (!b->t)
- return -BCH_ERR_ENOMEM_buckets_waiting_for_journal_init;
-
- bucket_table_init(b->t, INITIAL_TABLE_BITS);
- return 0;
-}
diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h
deleted file mode 100644
index 365619ca44c8..000000000000
--- a/fs/bcachefs/buckets_waiting_for_journal.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H
-#define _BUCKETS_WAITING_FOR_JOURNAL_H
-
-#include "buckets_waiting_for_journal_types.h"
-
-u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *,
- unsigned, u64);
-int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
- u64, unsigned, u64, u64);
-
-void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *);
-int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *);
-
-#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */
diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h
deleted file mode 100644
index e593db061d81..000000000000
--- a/fs/bcachefs/buckets_waiting_for_journal_types.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
-#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
-
-#include <linux/siphash.h>
-
-struct bucket_hashed {
- u64 dev_bucket;
- u64 journal_seq;
-};
-
-struct buckets_waiting_for_journal_table {
- unsigned bits;
- u64 hash_seeds[3];
- struct bucket_hashed d[];
-};
-
-struct buckets_waiting_for_journal {
- struct mutex lock;
- struct buckets_waiting_for_journal_table *t;
-};
-
-#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
deleted file mode 100644
index 57d55b3ddc71..000000000000
--- a/fs/bcachefs/chardev.c
+++ /dev/null
@@ -1,831 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_CHARDEV
-
-#include "bcachefs.h"
-#include "bcachefs_ioctl.h"
-#include "buckets.h"
-#include "chardev.h"
-#include "disk_accounting.h"
-#include "fsck.h"
-#include "journal.h"
-#include "move.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-counters.h"
-#include "super-io.h"
-#include "thread_with_file.h"
-
-#include <linux/cdev.h>
-#include <linux/device.h>
-#include <linux/fs.h>
-#include <linux/ioctl.h>
-#include <linux/major.h>
-#include <linux/sched/task.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-
-/* returns with ref on ca->ref */
-static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
- unsigned flags)
-{
- struct bch_dev *ca;
-
- if (flags & BCH_BY_INDEX) {
- if (dev >= c->sb.nr_devices)
- return ERR_PTR(-EINVAL);
-
- ca = bch2_dev_tryget_noerror(c, dev);
- if (!ca)
- return ERR_PTR(-EINVAL);
- } else {
- char *path;
-
- path = strndup_user((const char __user *)
- (unsigned long) dev, PATH_MAX);
- if (IS_ERR(path))
- return ERR_CAST(path);
-
- ca = bch2_dev_lookup(c, path);
- kfree(path);
- }
-
- return ca;
-}
-
-#if 0
-static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
-{
- struct bch_ioctl_assemble arg;
- struct bch_fs *c;
- u64 *user_devs = NULL;
- char **devs = NULL;
- unsigned i;
- int ret = -EFAULT;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if (arg.flags || arg.pad)
- return -EINVAL;
-
- user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL);
- if (!user_devs)
- return -ENOMEM;
-
- devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
-
- if (copy_from_user(user_devs, user_arg->devs,
- sizeof(u64) * arg.nr_devs))
- goto err;
-
- for (i = 0; i < arg.nr_devs; i++) {
- devs[i] = strndup_user((const char __user *)(unsigned long)
- user_devs[i],
- PATH_MAX);
-		ret = PTR_ERR_OR_ZERO(devs[i]);
- if (ret)
- goto err;
- }
-
- c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty());
- ret = PTR_ERR_OR_ZERO(c);
- if (!ret)
- closure_put(&c->cl);
-err:
- if (devs)
- for (i = 0; i < arg.nr_devs; i++)
- kfree(devs[i]);
- kfree(devs);
- return ret;
-}
-
-static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg)
-{
- struct bch_ioctl_incremental arg;
- const char *err;
- char *path;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if (arg.flags || arg.pad)
- return -EINVAL;
-
- path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
- ret = PTR_ERR_OR_ZERO(path);
- if (ret)
- return ret;
-
- err = bch2_fs_open_incremental(path);
- kfree(path);
-
- if (err) {
- pr_err("Could not register bcachefs devices: %s", err);
- return -EINVAL;
- }
-
- return 0;
-}
-#endif
-
-static long bch2_global_ioctl(unsigned cmd, void __user *arg)
-{
- long ret;
-
- switch (cmd) {
-#if 0
- case BCH_IOCTL_ASSEMBLE:
- return bch2_ioctl_assemble(arg);
- case BCH_IOCTL_INCREMENTAL:
- return bch2_ioctl_incremental(arg);
-#endif
- case BCH_IOCTL_FSCK_OFFLINE: {
- ret = bch2_ioctl_fsck_offline(arg);
- break;
- }
- default:
- ret = -ENOTTY;
- break;
- }
-
- if (ret < 0)
- ret = bch2_err_class(ret);
- return ret;
-}
-
-static long bch2_ioctl_query_uuid(struct bch_fs *c,
- struct bch_ioctl_query_uuid __user *user_arg)
-{
- return copy_to_user_errcode(&user_arg->uuid, &c->sb.user_uuid,
- sizeof(c->sb.user_uuid));
-}
-
-#if 0
-static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
-{
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (arg.flags || arg.pad)
- return -EINVAL;
-
- return bch2_fs_start(c);
-}
-
-static long bch2_ioctl_stop(struct bch_fs *c)
-{
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- bch2_fs_stop(c);
- return 0;
-}
-#endif
-
-static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
-{
- char *path;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (arg.flags || arg.pad)
- return -EINVAL;
-
- path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
- ret = PTR_ERR_OR_ZERO(path);
- if (ret)
- return ret;
-
- ret = bch2_dev_add(c, path);
- if (!IS_ERR(path))
- kfree(path);
-
- return ret;
-}
-
-static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg)
-{
- struct bch_dev *ca;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
- BCH_FORCE_IF_METADATA_LOST|
- BCH_FORCE_IF_DEGRADED|
- BCH_BY_INDEX)) ||
- arg.pad)
- return -EINVAL;
-
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- return bch2_dev_remove(c, ca, arg.flags);
-}
-
-static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
-{
- char *path;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (arg.flags || arg.pad)
- return -EINVAL;
-
- path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
- ret = PTR_ERR_OR_ZERO(path);
- if (ret)
- return ret;
-
- ret = bch2_dev_online(c, path);
- kfree(path);
- return ret;
-}
-
-static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
-{
- struct bch_dev *ca;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
- BCH_FORCE_IF_METADATA_LOST|
- BCH_FORCE_IF_DEGRADED|
- BCH_BY_INDEX)) ||
- arg.pad)
- return -EINVAL;
-
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- ret = bch2_dev_offline(c, ca, arg.flags);
- bch2_dev_put(ca);
- return ret;
-}
-
-static long bch2_ioctl_disk_set_state(struct bch_fs *c,
- struct bch_ioctl_disk_set_state arg)
-{
- struct bch_dev *ca;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
- BCH_FORCE_IF_METADATA_LOST|
- BCH_FORCE_IF_DEGRADED|
- BCH_BY_INDEX)) ||
- arg.pad[0] || arg.pad[1] || arg.pad[2] ||
- arg.new_state >= BCH_MEMBER_STATE_NR)
- return -EINVAL;
-
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags);
- if (ret)
- bch_err(c, "Error setting device state: %s", bch2_err_str(ret));
-
- bch2_dev_put(ca);
- return ret;
-}
-
-struct bch_data_ctx {
- struct thread_with_file thr;
-
- struct bch_fs *c;
- struct bch_ioctl_data arg;
- struct bch_move_stats stats;
-};
-
-static int bch2_data_thread(void *arg)
-{
- struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr);
-
- ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
-	if (ctx->thr.ret == -BCH_ERR_device_offline) {
-		ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline;
-	} else {
- ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done;
- ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done;
- }
- return 0;
-}
-
-static int bch2_data_job_release(struct inode *inode, struct file *file)
-{
- struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
-
- bch2_thread_with_file_exit(&ctx->thr);
- kfree(ctx);
- return 0;
-}
-
-static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
- size_t len, loff_t *ppos)
-{
- struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
- struct bch_fs *c = ctx->c;
- struct bch_ioctl_data_event e = {
- .type = BCH_DATA_EVENT_PROGRESS,
- .ret = ctx->stats.ret,
- .p.data_type = ctx->stats.data_type,
- .p.btree_id = ctx->stats.pos.btree,
- .p.pos = ctx->stats.pos.pos,
- .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
- .p.sectors_error_corrected = atomic64_read(&ctx->stats.sectors_error_corrected),
- .p.sectors_error_uncorrected = atomic64_read(&ctx->stats.sectors_error_uncorrected),
- };
-
- if (ctx->arg.op == BCH_DATA_OP_scrub) {
- struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev);
- if (ca) {
- struct bch_dev_usage u;
- bch2_dev_usage_read_fast(ca, &u);
- for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++)
- if (ctx->arg.scrub.data_types & BIT(i))
- e.p.sectors_total += u.d[i].sectors;
- bch2_dev_put(ca);
- }
- } else {
- e.p.sectors_total = bch2_fs_usage_read_short(c).used;
- }
-
- if (len < sizeof(e))
- return -EINVAL;
-
- return copy_to_user_errcode(buf, &e, sizeof(e)) ?: sizeof(e);
-}
-
-static const struct file_operations bcachefs_data_ops = {
- .release = bch2_data_job_release,
- .read = bch2_data_job_read,
-};
-
-static long bch2_ioctl_data(struct bch_fs *c,
- struct bch_ioctl_data arg)
-{
- struct bch_data_ctx *ctx;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (arg.op >= BCH_DATA_OP_NR || arg.flags)
- return -EINVAL;
-
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx)
- return -ENOMEM;
-
- ctx->c = c;
- ctx->arg = arg;
-
- ret = bch2_run_thread_with_file(&ctx->thr,
- &bcachefs_data_ops,
- bch2_data_thread);
- if (ret < 0)
- kfree(ctx);
- return ret;
-}
-
-static long bch2_ioctl_fs_usage(struct bch_fs *c,
- struct bch_ioctl_fs_usage __user *user_arg)
-{
- struct bch_ioctl_fs_usage arg = {};
- darray_char replicas = {};
- u32 replica_entries_bytes;
- int ret = 0;
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return -EINVAL;
-
- if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes))
- return -EFAULT;
-
- ret = bch2_fs_replicas_usage_read(c, &replicas) ?:
- (replica_entries_bytes < replicas.nr ? -ERANGE : 0) ?:
- copy_to_user_errcode(&user_arg->replicas, replicas.data, replicas.nr);
- if (ret)
- goto err;
-
- struct bch_fs_usage_short u = bch2_fs_usage_read_short(c);
- arg.capacity = c->capacity;
- arg.used = u.used;
- arg.online_reserved = percpu_u64_get(c->online_reserved);
- arg.replica_entries_bytes = replicas.nr;
-
- for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) {
- struct disk_accounting_pos k = {
- .type = BCH_DISK_ACCOUNTING_persistent_reserved,
- .persistent_reserved.nr_replicas = i,
- };
-
- bch2_accounting_mem_read(c,
- disk_accounting_pos_to_bpos(&k),
- &arg.persistent_reserved[i], 1);
- }
-
- ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
-err:
- darray_exit(&replicas);
- return ret;
-}
-
-static long bch2_ioctl_query_accounting(struct bch_fs *c,
- struct bch_ioctl_query_accounting __user *user_arg)
-{
- struct bch_ioctl_query_accounting arg;
- darray_char accounting = {};
- int ret = 0;
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return -EINVAL;
-
- ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)) ?:
- bch2_fs_accounting_read(c, &accounting, arg.accounting_types_mask) ?:
- (arg.accounting_u64s * sizeof(u64) < accounting.nr ? -ERANGE : 0) ?:
- copy_to_user_errcode(&user_arg->accounting, accounting.data, accounting.nr);
- if (ret)
- goto err;
-
- arg.capacity = c->capacity;
- arg.used = bch2_fs_usage_read_short(c).used;
- arg.online_reserved = percpu_u64_get(c->online_reserved);
- arg.accounting_u64s = accounting.nr / sizeof(u64);
-
- ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
-err:
- darray_exit(&accounting);
- return ret;
-}
-
-/* obsolete, didn't allow for new data types: */
-static long bch2_ioctl_dev_usage(struct bch_fs *c,
- struct bch_ioctl_dev_usage __user *user_arg)
-{
- struct bch_ioctl_dev_usage arg;
- struct bch_dev_usage src;
- struct bch_dev *ca;
- unsigned i;
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return -EINVAL;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if ((arg.flags & ~BCH_BY_INDEX) ||
- arg.pad[0] ||
- arg.pad[1] ||
- arg.pad[2])
- return -EINVAL;
-
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- src = bch2_dev_usage_read(ca);
-
- arg.state = ca->mi.state;
- arg.bucket_size = ca->mi.bucket_size;
- arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
-
- for (i = 0; i < ARRAY_SIZE(arg.d); i++) {
- arg.d[i].buckets = src.d[i].buckets;
- arg.d[i].sectors = src.d[i].sectors;
- arg.d[i].fragmented = src.d[i].fragmented;
- }
-
- bch2_dev_put(ca);
-
- return copy_to_user_errcode(user_arg, &arg, sizeof(arg));
-}
-
-static long bch2_ioctl_dev_usage_v2(struct bch_fs *c,
- struct bch_ioctl_dev_usage_v2 __user *user_arg)
-{
- struct bch_ioctl_dev_usage_v2 arg;
- struct bch_dev_usage src;
- struct bch_dev *ca;
- int ret = 0;
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return -EINVAL;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if ((arg.flags & ~BCH_BY_INDEX) ||
- arg.pad[0] ||
- arg.pad[1] ||
- arg.pad[2])
- return -EINVAL;
-
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- src = bch2_dev_usage_read(ca);
-
- arg.state = ca->mi.state;
- arg.bucket_size = ca->mi.bucket_size;
- arg.nr_data_types = min(arg.nr_data_types, BCH_DATA_NR);
- arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
-
- ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
- if (ret)
- goto err;
-
- for (unsigned i = 0; i < arg.nr_data_types; i++) {
- struct bch_ioctl_dev_usage_type t = {
- .buckets = src.d[i].buckets,
- .sectors = src.d[i].sectors,
- .fragmented = src.d[i].fragmented,
- };
-
- ret = copy_to_user_errcode(&user_arg->d[i], &t, sizeof(t));
- if (ret)
- goto err;
- }
-err:
- bch2_dev_put(ca);
- return ret;
-}
-
-static long bch2_ioctl_read_super(struct bch_fs *c,
- struct bch_ioctl_read_super arg)
-{
- struct bch_dev *ca = NULL;
- struct bch_sb *sb;
- int ret = 0;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) ||
- arg.pad)
- return -EINVAL;
-
- mutex_lock(&c->sb_lock);
-
- if (arg.flags & BCH_READ_DEV) {
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- ret = PTR_ERR_OR_ZERO(ca);
- if (ret)
- goto err_unlock;
-
- sb = ca->disk_sb.sb;
- } else {
- sb = c->disk_sb.sb;
- }
-
- if (vstruct_bytes(sb) > arg.size) {
- ret = -ERANGE;
- goto err;
- }
-
- ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb,
- vstruct_bytes(sb));
-err:
- bch2_dev_put(ca);
-err_unlock:
- mutex_unlock(&c->sb_lock);
- return ret;
-}
-
-static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
- struct bch_ioctl_disk_get_idx arg)
-{
- dev_t dev = huge_decode_dev(arg.dev);
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (!dev)
- return -EINVAL;
-
- for_each_online_member(c, ca)
- if (ca->dev == dev) {
- percpu_ref_put(&ca->io_ref);
- return ca->dev_idx;
- }
-
- return -BCH_ERR_ENOENT_dev_idx_not_found;
-}
-
-static long bch2_ioctl_disk_resize(struct bch_fs *c,
- struct bch_ioctl_disk_resize arg)
-{
- struct bch_dev *ca;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if ((arg.flags & ~BCH_BY_INDEX) ||
- arg.pad)
- return -EINVAL;
-
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- ret = bch2_dev_resize(c, ca, arg.nbuckets);
-
- bch2_dev_put(ca);
- return ret;
-}
-
-static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
- struct bch_ioctl_disk_resize_journal arg)
-{
- struct bch_dev *ca;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if ((arg.flags & ~BCH_BY_INDEX) ||
- arg.pad)
- return -EINVAL;
-
- if (arg.nbuckets > U32_MAX)
- return -EINVAL;
-
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets);
-
- bch2_dev_put(ca);
- return ret;
-}
-
-#define BCH_IOCTL(_name, _argtype) \
-do { \
- _argtype i; \
- \
- if (copy_from_user(&i, arg, sizeof(i))) \
- return -EFAULT; \
- ret = bch2_ioctl_##_name(c, i); \
- goto out; \
-} while (0)
-
-long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
-{
- long ret;
-
- switch (cmd) {
- case BCH_IOCTL_QUERY_UUID:
- return bch2_ioctl_query_uuid(c, arg);
- case BCH_IOCTL_FS_USAGE:
- return bch2_ioctl_fs_usage(c, arg);
- case BCH_IOCTL_DEV_USAGE:
- return bch2_ioctl_dev_usage(c, arg);
- case BCH_IOCTL_DEV_USAGE_V2:
- return bch2_ioctl_dev_usage_v2(c, arg);
-#if 0
- case BCH_IOCTL_START:
- BCH_IOCTL(start, struct bch_ioctl_start);
- case BCH_IOCTL_STOP:
- return bch2_ioctl_stop(c);
-#endif
- case BCH_IOCTL_READ_SUPER:
- BCH_IOCTL(read_super, struct bch_ioctl_read_super);
- case BCH_IOCTL_DISK_GET_IDX:
- BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
- }
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return -EINVAL;
-
- switch (cmd) {
- case BCH_IOCTL_DISK_ADD:
- BCH_IOCTL(disk_add, struct bch_ioctl_disk);
- case BCH_IOCTL_DISK_REMOVE:
- BCH_IOCTL(disk_remove, struct bch_ioctl_disk);
- case BCH_IOCTL_DISK_ONLINE:
- BCH_IOCTL(disk_online, struct bch_ioctl_disk);
- case BCH_IOCTL_DISK_OFFLINE:
- BCH_IOCTL(disk_offline, struct bch_ioctl_disk);
- case BCH_IOCTL_DISK_SET_STATE:
- BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
- case BCH_IOCTL_DATA:
- BCH_IOCTL(data, struct bch_ioctl_data);
- case BCH_IOCTL_DISK_RESIZE:
- BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
- case BCH_IOCTL_DISK_RESIZE_JOURNAL:
- BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
- case BCH_IOCTL_FSCK_ONLINE:
- BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online);
- case BCH_IOCTL_QUERY_ACCOUNTING:
- return bch2_ioctl_query_accounting(c, arg);
- case BCH_IOCTL_QUERY_COUNTERS:
- return bch2_ioctl_query_counters(c, arg);
- default:
- return -ENOTTY;
- }
-out:
- if (ret < 0)
- ret = bch2_err_class(ret);
- return ret;
-}
-
-static DEFINE_IDR(bch_chardev_minor);
-
-static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v)
-{
- unsigned minor = iminor(file_inode(filp));
- struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL;
- void __user *arg = (void __user *) v;
-
- return c
- ? bch2_fs_ioctl(c, cmd, arg)
- : bch2_global_ioctl(cmd, arg);
-}
-
-static const struct file_operations bch_chardev_fops = {
- .owner = THIS_MODULE,
- .unlocked_ioctl = bch2_chardev_ioctl,
- .open = nonseekable_open,
-};
-
-static int bch_chardev_major;
-static const struct class bch_chardev_class = {
- .name = "bcachefs",
-};
-static struct device *bch_chardev;
-
-void bch2_fs_chardev_exit(struct bch_fs *c)
-{
- if (!IS_ERR_OR_NULL(c->chardev))
- device_unregister(c->chardev);
- if (c->minor >= 0)
- idr_remove(&bch_chardev_minor, c->minor);
-}
-
-int bch2_fs_chardev_init(struct bch_fs *c)
-{
- c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL);
- if (c->minor < 0)
- return c->minor;
-
- c->chardev = device_create(&bch_chardev_class, NULL,
- MKDEV(bch_chardev_major, c->minor), c,
- "bcachefs%u-ctl", c->minor);
- if (IS_ERR(c->chardev))
- return PTR_ERR(c->chardev);
-
- return 0;
-}
-
-void bch2_chardev_exit(void)
-{
- device_destroy(&bch_chardev_class, MKDEV(bch_chardev_major, U8_MAX));
- class_unregister(&bch_chardev_class);
- if (bch_chardev_major > 0)
-		unregister_chrdev(bch_chardev_major, "bcachefs-ctl");
-}
-
-int __init bch2_chardev_init(void)
-{
- int ret;
-
- bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops);
- if (bch_chardev_major < 0)
- return bch_chardev_major;
-
- ret = class_register(&bch_chardev_class);
- if (ret)
- goto major_out;
-
- bch_chardev = device_create(&bch_chardev_class, NULL,
- MKDEV(bch_chardev_major, U8_MAX),
- NULL, "bcachefs-ctl");
- if (IS_ERR(bch_chardev)) {
- ret = PTR_ERR(bch_chardev);
- goto class_out;
- }
-
- return 0;
-
-class_out:
- class_unregister(&bch_chardev_class);
-major_out:
- unregister_chrdev(bch_chardev_major, "bcachefs-ctl");
- return ret;
-}
-
-#endif /* NO_BCACHEFS_CHARDEV */
diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h
deleted file mode 100644
index 0f563ca53c36..000000000000
--- a/fs/bcachefs/chardev.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_CHARDEV_H
-#define _BCACHEFS_CHARDEV_H
-
-#ifndef NO_BCACHEFS_FS
-
-long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *);
-
-void bch2_fs_chardev_exit(struct bch_fs *);
-int bch2_fs_chardev_init(struct bch_fs *);
-
-void bch2_chardev_exit(void);
-int __init bch2_chardev_init(void);
-
-#else
-
-static inline long bch2_fs_ioctl(struct bch_fs *c,
-				 unsigned cmd, void __user *arg)
-{
- return -ENOTTY;
-}
-
-static inline void bch2_fs_chardev_exit(struct bch_fs *c) {}
-static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; }
-
-static inline void bch2_chardev_exit(void) {}
-static inline int __init bch2_chardev_init(void) { return 0; }
-
-#endif /* NO_BCACHEFS_FS */
-
-#endif /* _BCACHEFS_CHARDEV_H */
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
deleted file mode 100644
index 3726689093e3..000000000000
--- a/fs/bcachefs/checksum.c
+++ /dev/null
@@ -1,831 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "checksum.h"
-#include "errcode.h"
-#include "error.h"
-#include "super.h"
-#include "super-io.h"
-
-#include <linux/crc32c.h>
-#include <linux/crypto.h>
-#include <linux/xxhash.h>
-#include <linux/key.h>
-#include <linux/random.h>
-#include <linux/ratelimit.h>
-#include <linux/scatterlist.h>
-#include <crypto/algapi.h>
-#include <crypto/chacha.h>
-#include <crypto/hash.h>
-#include <crypto/poly1305.h>
-#include <crypto/skcipher.h>
-#include <keys/user-type.h>
-
-/*
- * bch2_checksum state is an abstraction of the checksum state calculated over different pages.
- * It features page merging without having the checksum algorithm lose its state.
- * For native checksum algorithms (like crc), a default seed value will do;
- * for hash-like algorithms, a state needs to be stored.
- */
-
-struct bch2_checksum_state {
- union {
- u64 seed;
- struct xxh64_state h64state;
- };
- unsigned int type;
-};
-
-static void bch2_checksum_init(struct bch2_checksum_state *state)
-{
- switch (state->type) {
- case BCH_CSUM_none:
- case BCH_CSUM_crc32c:
- case BCH_CSUM_crc64:
- state->seed = 0;
- break;
- case BCH_CSUM_crc32c_nonzero:
- state->seed = U32_MAX;
- break;
- case BCH_CSUM_crc64_nonzero:
- state->seed = U64_MAX;
- break;
- case BCH_CSUM_xxhash:
- xxh64_reset(&state->h64state, 0);
- break;
- default:
- BUG();
- }
-}
-
-static u64 bch2_checksum_final(const struct bch2_checksum_state *state)
-{
- switch (state->type) {
- case BCH_CSUM_none:
- case BCH_CSUM_crc32c:
- case BCH_CSUM_crc64:
- return state->seed;
- case BCH_CSUM_crc32c_nonzero:
- return state->seed ^ U32_MAX;
- case BCH_CSUM_crc64_nonzero:
- return state->seed ^ U64_MAX;
- case BCH_CSUM_xxhash:
- return xxh64_digest(&state->h64state);
- default:
- BUG();
- }
-}
-
-static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len)
-{
- switch (state->type) {
- case BCH_CSUM_none:
- return;
- case BCH_CSUM_crc32c_nonzero:
- case BCH_CSUM_crc32c:
- state->seed = crc32c(state->seed, data, len);
- break;
- case BCH_CSUM_crc64_nonzero:
- case BCH_CSUM_crc64:
- state->seed = crc64_be(state->seed, data, len);
- break;
- case BCH_CSUM_xxhash:
- xxh64_update(&state->h64state, data, len);
- break;
- default:
- BUG();
- }
-}
-
-static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
- struct nonce nonce,
- struct scatterlist *sg, size_t len)
-{
- SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
-
- skcipher_request_set_sync_tfm(req, tfm);
- skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
-
- int ret = crypto_skcipher_encrypt(req);
- if (ret)
- pr_err("got error %i from crypto_skcipher_encrypt()", ret);
-
- return ret;
-}
-
-static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
- struct nonce nonce,
- void *buf, size_t len)
-{
- if (!is_vmalloc_addr(buf)) {
- struct scatterlist sg = {};
-
- sg_mark_end(&sg);
- sg_set_page(&sg, virt_to_page(buf), len, offset_in_page(buf));
- return do_encrypt_sg(tfm, nonce, &sg, len);
- } else {
- DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
- size_t sgl_len = 0;
- int ret;
-
- darray_init(&sgl);
-
- while (len) {
- unsigned offset = offset_in_page(buf);
- struct scatterlist sg = {
- .page_link = (unsigned long) vmalloc_to_page(buf),
- .offset = offset,
- .length = min(len, PAGE_SIZE - offset),
- };
-
- if (darray_push(&sgl, sg)) {
- sg_mark_end(&darray_last(sgl));
- ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
- if (ret)
- goto err;
-
- nonce = nonce_add(nonce, sgl_len);
- sgl_len = 0;
- sgl.nr = 0;
- BUG_ON(darray_push(&sgl, sg));
- }
-
- buf += sg.length;
- len -= sg.length;
- sgl_len += sg.length;
- }
-
- sg_mark_end(&darray_last(sgl));
- ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
-err:
- darray_exit(&sgl);
- return ret;
- }
-}
-
-int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
- void *buf, size_t len)
-{
- struct crypto_sync_skcipher *chacha20 =
- crypto_alloc_sync_skcipher("chacha20", 0, 0);
- int ret;
-
- ret = PTR_ERR_OR_ZERO(chacha20);
- if (ret) {
- pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret));
- return ret;
- }
-
- ret = crypto_skcipher_setkey(&chacha20->base,
- (void *) key, sizeof(*key));
- if (ret) {
- pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret));
- goto err;
- }
-
- ret = do_encrypt(chacha20, nonce, buf, len);
-err:
- crypto_free_sync_skcipher(chacha20);
- return ret;
-}
-
-static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
- struct nonce nonce)
-{
- u8 key[POLY1305_KEY_SIZE];
- int ret;
-
- nonce.d[3] ^= BCH_NONCE_POLY;
-
- memset(key, 0, sizeof(key));
- ret = do_encrypt(c->chacha20, nonce, key, sizeof(key));
- if (ret)
- return ret;
-
- desc->tfm = c->poly1305;
- crypto_shash_init(desc);
- crypto_shash_update(desc, key, sizeof(key));
- return 0;
-}
-
-struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
- struct nonce nonce, const void *data, size_t len)
-{
- switch (type) {
- case BCH_CSUM_none:
- case BCH_CSUM_crc32c_nonzero:
- case BCH_CSUM_crc64_nonzero:
- case BCH_CSUM_crc32c:
- case BCH_CSUM_xxhash:
- case BCH_CSUM_crc64: {
- struct bch2_checksum_state state;
-
- state.type = type;
-
- bch2_checksum_init(&state);
- bch2_checksum_update(&state, data, len);
-
- return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
- }
-
- case BCH_CSUM_chacha20_poly1305_80:
- case BCH_CSUM_chacha20_poly1305_128: {
- SHASH_DESC_ON_STACK(desc, c->poly1305);
- u8 digest[POLY1305_DIGEST_SIZE];
- struct bch_csum ret = { 0 };
-
- gen_poly_key(c, desc, nonce);
-
- crypto_shash_update(desc, data, len);
- crypto_shash_final(desc, digest);
-
- memcpy(&ret, digest, bch_crc_bytes[type]);
- return ret;
- }
- default:
- return (struct bch_csum) {};
- }
-}
-
-int bch2_encrypt(struct bch_fs *c, unsigned type,
- struct nonce nonce, void *data, size_t len)
-{
- if (!bch2_csum_type_is_encryption(type))
- return 0;
-
- if (bch2_fs_inconsistent_on(!c->chacha20,
- c, "attempting to encrypt without encryption key"))
- return -BCH_ERR_no_encryption_key;
-
- return do_encrypt(c->chacha20, nonce, data, len);
-}
-
-static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
- struct nonce nonce, struct bio *bio,
- struct bvec_iter *iter)
-{
- struct bio_vec bv;
-
- switch (type) {
- case BCH_CSUM_none:
- return (struct bch_csum) { 0 };
- case BCH_CSUM_crc32c_nonzero:
- case BCH_CSUM_crc64_nonzero:
- case BCH_CSUM_crc32c:
- case BCH_CSUM_xxhash:
- case BCH_CSUM_crc64: {
- struct bch2_checksum_state state;
-
- state.type = type;
- bch2_checksum_init(&state);
-
-#ifdef CONFIG_HIGHMEM
- __bio_for_each_segment(bv, bio, *iter, *iter) {
- void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
-
- bch2_checksum_update(&state, p, bv.bv_len);
- kunmap_local(p);
- }
-#else
- __bio_for_each_bvec(bv, bio, *iter, *iter)
- bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset,
- bv.bv_len);
-#endif
- return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
- }
-
- case BCH_CSUM_chacha20_poly1305_80:
- case BCH_CSUM_chacha20_poly1305_128: {
- SHASH_DESC_ON_STACK(desc, c->poly1305);
- u8 digest[POLY1305_DIGEST_SIZE];
- struct bch_csum ret = { 0 };
-
- gen_poly_key(c, desc, nonce);
-
-#ifdef CONFIG_HIGHMEM
- __bio_for_each_segment(bv, bio, *iter, *iter) {
- void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
-
- crypto_shash_update(desc, p, bv.bv_len);
- kunmap_local(p);
- }
-#else
- __bio_for_each_bvec(bv, bio, *iter, *iter)
- crypto_shash_update(desc,
- page_address(bv.bv_page) + bv.bv_offset,
- bv.bv_len);
-#endif
- crypto_shash_final(desc, digest);
-
- memcpy(&ret, digest, bch_crc_bytes[type]);
- return ret;
- }
- default:
- return (struct bch_csum) {};
- }
-}
-
-struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
- struct nonce nonce, struct bio *bio)
-{
- struct bvec_iter iter = bio->bi_iter;
-
- return __bch2_checksum_bio(c, type, nonce, bio, &iter);
-}
-
-int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
- struct nonce nonce, struct bio *bio)
-{
- struct bio_vec bv;
- struct bvec_iter iter;
- DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
- size_t sgl_len = 0;
- int ret = 0;
-
- if (bch2_fs_inconsistent_on(!c->chacha20,
- c, "attempting to encrypt without encryption key"))
- return -BCH_ERR_no_encryption_key;
-
- darray_init(&sgl);
-
- bio_for_each_segment(bv, bio, iter) {
- struct scatterlist sg = {
- .page_link = (unsigned long) bv.bv_page,
- .offset = bv.bv_offset,
- .length = bv.bv_len,
- };
-
- if (darray_push(&sgl, sg)) {
- sg_mark_end(&darray_last(sgl));
- ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
- if (ret)
- goto err;
-
- nonce = nonce_add(nonce, sgl_len);
- sgl_len = 0;
- sgl.nr = 0;
-
- BUG_ON(darray_push(&sgl, sg));
- }
-
- sgl_len += sg.length;
- }
-
- sg_mark_end(&darray_last(sgl));
- ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
-err:
- darray_exit(&sgl);
- return ret;
-}
-
-struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
- struct bch_csum b, size_t b_len)
-{
- struct bch2_checksum_state state;
-
- state.type = type;
- bch2_checksum_init(&state);
- state.seed = le64_to_cpu(a.lo);
-
- BUG_ON(!bch2_checksum_mergeable(type));
-
- while (b_len) {
- unsigned page_len = min_t(unsigned, b_len, PAGE_SIZE);
-
- bch2_checksum_update(&state,
- page_address(ZERO_PAGE(0)), page_len);
- b_len -= page_len;
- }
- a.lo = cpu_to_le64(bch2_checksum_final(&state));
- a.lo ^= b.lo;
- a.hi ^= b.hi;
- return a;
-}
-
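This merge relies on crc32c/crc64 with a zero seed and no final xor being linear over GF(2): crc(A||B) = crc(A||Z) ^ crc(B), where Z is len(B) zero bytes, because leading zeros leave a zero state unchanged. The loop computes the first term by feeding ZERO_PAGE data into a state seeded with a's value, then xors in b. A standalone check of the identity using a toy bitwise crc32c (a sketch, not the kernel's implementation):

	#include <assert.h>
	#include <stdint.h>
	#include <string.h>

	/* Bitwise crc32c (reflected poly 0x82F63B78), seed in, no final xor. */
	static uint32_t crc32c(uint32_t crc, const uint8_t *p, size_t len)
	{
		while (len--) {
			crc ^= *p++;
			for (int i = 0; i < 8; i++)
				crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
		}
		return crc;
	}

	int main(void)
	{
		uint8_t ab[11], zeros[6] = {0};

		memcpy(ab, "hello", 5);
		memcpy(ab + 5, " world", 6);

		uint32_t whole  = crc32c(0, ab, 11);
		uint32_t merged = crc32c(crc32c(0, ab, 5), zeros, 6) ^
				  crc32c(0, ab + 5, 6);

		assert(whole == merged);	/* crc(A||B) == crc(A||Z) ^ crc(B) */
		return 0;
	}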
-int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
- struct bversion version,
- struct bch_extent_crc_unpacked crc_old,
- struct bch_extent_crc_unpacked *crc_a,
- struct bch_extent_crc_unpacked *crc_b,
- unsigned len_a, unsigned len_b,
- unsigned new_csum_type)
-{
- struct bvec_iter iter = bio->bi_iter;
- struct nonce nonce = extent_nonce(version, crc_old);
- struct bch_csum merged = { 0 };
- struct crc_split {
- struct bch_extent_crc_unpacked *crc;
- unsigned len;
- unsigned csum_type;
- struct bch_csum csum;
- } splits[3] = {
-		{ crc_a, len_a, new_csum_type, { 0 } },
- { crc_b, len_b, new_csum_type, { 0 } },
- { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type, { 0 } },
- }, *i;
- bool mergeable = crc_old.csum_type == new_csum_type &&
- bch2_checksum_mergeable(new_csum_type);
- unsigned crc_nonce = crc_old.nonce;
-
- BUG_ON(len_a + len_b > bio_sectors(bio));
- BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
- BUG_ON(crc_is_compressed(crc_old));
- BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
- bch2_csum_type_is_encryption(new_csum_type));
-
- for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
- iter.bi_size = i->len << 9;
- if (mergeable || i->crc)
- i->csum = __bch2_checksum_bio(c, i->csum_type,
- nonce, bio, &iter);
- else
- bio_advance_iter(bio, &iter, i->len << 9);
- nonce = nonce_add(nonce, i->len << 9);
- }
-
- if (mergeable)
- for (i = splits; i < splits + ARRAY_SIZE(splits); i++)
- merged = bch2_checksum_merge(new_csum_type, merged,
- i->csum, i->len << 9);
- else
- merged = bch2_checksum_bio(c, crc_old.csum_type,
- extent_nonce(version, crc_old), bio);
-
- if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
- struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n"
- " expected %0llx:%0llx got %0llx:%0llx (old type ",
- __func__,
- crc_old.csum.hi,
- crc_old.csum.lo,
- merged.hi,
- merged.lo);
- bch2_prt_csum_type(&buf, crc_old.csum_type);
- prt_str(&buf, " new type ");
- bch2_prt_csum_type(&buf, new_csum_type);
- prt_str(&buf, ")");
- WARN_RATELIMIT(1, "%s", buf.buf);
- printbuf_exit(&buf);
- return -BCH_ERR_recompute_checksum;
- }
-
- for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
- if (i->crc)
- *i->crc = (struct bch_extent_crc_unpacked) {
- .csum_type = i->csum_type,
- .compression_type = crc_old.compression_type,
- .compressed_size = i->len,
- .uncompressed_size = i->len,
- .offset = 0,
- .live_size = i->len,
- .nonce = crc_nonce,
- .csum = i->csum,
- };
-
- if (bch2_csum_type_is_encryption(new_csum_type))
- crc_nonce += i->len;
- }
-
- return 0;
-}
-
-/* BCH_SB_FIELD_crypt: */
-
-static int bch2_sb_crypt_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
-
- if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
- prt_printf(err, "wrong size (got %zu should be %zu)",
- vstruct_bytes(&crypt->field), sizeof(*crypt));
- return -BCH_ERR_invalid_sb_crypt;
- }
-
- if (BCH_CRYPT_KDF_TYPE(crypt)) {
- prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
- return -BCH_ERR_invalid_sb_crypt;
- }
-
- return 0;
-}
-
-static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
-
- prt_printf(out, "KFD: %llu\n", BCH_CRYPT_KDF_TYPE(crypt));
- prt_printf(out, "scrypt n: %llu\n", BCH_KDF_SCRYPT_N(crypt));
- prt_printf(out, "scrypt r: %llu\n", BCH_KDF_SCRYPT_R(crypt));
- prt_printf(out, "scrypt p: %llu\n", BCH_KDF_SCRYPT_P(crypt));
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
- .validate = bch2_sb_crypt_validate,
- .to_text = bch2_sb_crypt_to_text,
-};
-
-#ifdef __KERNEL__
-static int __bch2_request_key(char *key_description, struct bch_key *key)
-{
- struct key *keyring_key;
- const struct user_key_payload *ukp;
- int ret;
-
- keyring_key = request_key(&key_type_user, key_description, NULL);
- if (IS_ERR(keyring_key))
- return PTR_ERR(keyring_key);
-
- down_read(&keyring_key->sem);
- ukp = dereference_key_locked(keyring_key);
- if (ukp->datalen == sizeof(*key)) {
- memcpy(key, ukp->data, ukp->datalen);
- ret = 0;
- } else {
- ret = -EINVAL;
- }
- up_read(&keyring_key->sem);
- key_put(keyring_key);
-
- return ret;
-}
-#else
-#include <keyutils.h>
-
-static int __bch2_request_key(char *key_description, struct bch_key *key)
-{
- key_serial_t key_id;
-
- key_id = request_key("user", key_description, NULL,
- KEY_SPEC_SESSION_KEYRING);
- if (key_id >= 0)
- goto got_key;
-
- key_id = request_key("user", key_description, NULL,
- KEY_SPEC_USER_KEYRING);
- if (key_id >= 0)
- goto got_key;
-
- key_id = request_key("user", key_description, NULL,
- KEY_SPEC_USER_SESSION_KEYRING);
- if (key_id >= 0)
- goto got_key;
-
- return -errno;
-got_key:
-
- if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key))
- return -1;
-
- return 0;
-}
-
-#include "crypto.h"
-#endif
-
-int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
-{
- struct printbuf key_description = PRINTBUF;
- int ret;
-
- prt_printf(&key_description, "bcachefs:");
- pr_uuid(&key_description, sb->user_uuid.b);
-
- ret = __bch2_request_key(key_description.buf, key);
- printbuf_exit(&key_description);
-
-#ifndef __KERNEL__
- if (ret) {
- char *passphrase = read_passphrase("Enter passphrase: ");
- struct bch_encrypted_key sb_key;
-
- bch2_passphrase_check(sb, passphrase,
- key, &sb_key);
- ret = 0;
- }
-#endif
-
- /* stash with memfd, pass memfd fd to mount */
-
- return ret;
-}
-
-#ifndef __KERNEL__
-int bch2_revoke_key(struct bch_sb *sb)
-{
- key_serial_t key_id;
- struct printbuf key_description = PRINTBUF;
-
- prt_printf(&key_description, "bcachefs:");
- pr_uuid(&key_description, sb->user_uuid.b);
-
- key_id = request_key("user", key_description.buf, NULL, KEY_SPEC_USER_KEYRING);
- printbuf_exit(&key_description);
- if (key_id < 0)
- return errno;
-
- keyctl_revoke(key_id);
-
- return 0;
-}
-#endif
-
-int bch2_decrypt_sb_key(struct bch_fs *c,
- struct bch_sb_field_crypt *crypt,
- struct bch_key *key)
-{
- struct bch_encrypted_key sb_key = crypt->key;
- struct bch_key user_key;
- int ret = 0;
-
- /* is key encrypted? */
- if (!bch2_key_is_encrypted(&sb_key))
- goto out;
-
- ret = bch2_request_key(c->disk_sb.sb, &user_key);
- if (ret) {
- bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
- goto err;
- }
-
- /* decrypt real key: */
- ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
- &sb_key, sizeof(sb_key));
- if (ret)
- goto err;
-
- if (bch2_key_is_encrypted(&sb_key)) {
- bch_err(c, "incorrect encryption key");
- ret = -EINVAL;
- goto err;
- }
-out:
- *key = sb_key.key;
-err:
- memzero_explicit(&sb_key, sizeof(sb_key));
- memzero_explicit(&user_key, sizeof(user_key));
- return ret;
-}
-
-static int bch2_alloc_ciphers(struct bch_fs *c)
-{
- if (c->chacha20)
- return 0;
-
- struct crypto_sync_skcipher *chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
- int ret = PTR_ERR_OR_ZERO(chacha20);
- if (ret) {
- bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret));
- return ret;
- }
-
- struct crypto_shash *poly1305 = crypto_alloc_shash("poly1305", 0, 0);
- ret = PTR_ERR_OR_ZERO(poly1305);
- if (ret) {
- bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret));
- crypto_free_sync_skcipher(chacha20);
- return ret;
- }
-
- c->chacha20 = chacha20;
- c->poly1305 = poly1305;
- return 0;
-}
-
-#if 0
-
-/*
- * This seems to be duplicating code in cmd_remove_passphrase() in
- * bcachefs-tools, but we might want to switch userspace to use this - and
- * perhaps add an ioctl for calling this at runtime, so we can take the
- * passphrase off of a mounted filesystem (which has come up).
- */
-int bch2_disable_encryption(struct bch_fs *c)
-{
- struct bch_sb_field_crypt *crypt;
- struct bch_key key;
- int ret = -EINVAL;
-
- mutex_lock(&c->sb_lock);
-
- crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
- if (!crypt)
- goto out;
-
- /* is key encrypted? */
- ret = 0;
- if (bch2_key_is_encrypted(&crypt->key))
- goto out;
-
- ret = bch2_decrypt_sb_key(c, crypt, &key);
- if (ret)
- goto out;
-
- crypt->key.magic = cpu_to_le64(BCH_KEY_MAGIC);
- crypt->key.key = key;
-
- SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
- bch2_write_super(c);
-out:
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-/*
- * For enabling encryption on an existing filesystem: not hooked up yet, but it
- * should be
- */
-int bch2_enable_encryption(struct bch_fs *c, bool keyed)
-{
- struct bch_encrypted_key key;
- struct bch_key user_key;
- struct bch_sb_field_crypt *crypt;
- int ret = -EINVAL;
-
- mutex_lock(&c->sb_lock);
-
- /* Do we already have an encryption key? */
- if (bch2_sb_field_get(c->disk_sb.sb, crypt))
- goto err;
-
- ret = bch2_alloc_ciphers(c);
- if (ret)
- goto err;
-
- key.magic = cpu_to_le64(BCH_KEY_MAGIC);
- get_random_bytes(&key.key, sizeof(key.key));
-
- if (keyed) {
- ret = bch2_request_key(c->disk_sb.sb, &user_key);
- if (ret) {
- bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
- goto err;
- }
-
- ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
- &key, sizeof(key));
- if (ret)
- goto err;
- }
-
- ret = crypto_skcipher_setkey(&c->chacha20->base,
- (void *) &key.key, sizeof(key.key));
- if (ret)
- goto err;
-
- crypt = bch2_sb_field_resize(&c->disk_sb, crypt,
- sizeof(*crypt) / sizeof(u64));
- if (!crypt) {
- ret = -BCH_ERR_ENOSPC_sb_crypt;
- goto err;
- }
-
- crypt->key = key;
-
- /* write superblock */
- SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1);
- bch2_write_super(c);
-err:
- mutex_unlock(&c->sb_lock);
- memzero_explicit(&user_key, sizeof(user_key));
- memzero_explicit(&key, sizeof(key));
- return ret;
-}
-#endif
-
-void bch2_fs_encryption_exit(struct bch_fs *c)
-{
- if (c->poly1305)
- crypto_free_shash(c->poly1305);
- if (c->chacha20)
- crypto_free_sync_skcipher(c->chacha20);
-}
-
-int bch2_fs_encryption_init(struct bch_fs *c)
-{
- struct bch_sb_field_crypt *crypt;
- struct bch_key key;
- int ret = 0;
-
- crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
- if (!crypt)
- goto out;
-
- ret = bch2_alloc_ciphers(c);
- if (ret)
- goto out;
-
- ret = bch2_decrypt_sb_key(c, crypt, &key);
- if (ret)
- goto out;
-
- ret = crypto_skcipher_setkey(&c->chacha20->base,
- (void *) &key.key, sizeof(key.key));
- if (ret)
- goto out;
-out:
- memzero_explicit(&key, sizeof(key));
- return ret;
-}
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
deleted file mode 100644
index 4ac251c8fcd8..000000000000
--- a/fs/bcachefs/checksum.h
+++ /dev/null
@@ -1,239 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_CHECKSUM_H
-#define _BCACHEFS_CHECKSUM_H
-
-#include "bcachefs.h"
-#include "extents_types.h"
-#include "super-io.h"
-
-#include <linux/crc64.h>
-#include <crypto/chacha.h>
-
-static inline bool bch2_checksum_mergeable(unsigned type)
-{
-	switch (type) {
- case BCH_CSUM_none:
- case BCH_CSUM_crc32c:
- case BCH_CSUM_crc64:
- return true;
- default:
- return false;
- }
-}
-
-struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum,
- struct bch_csum, size_t);
-
-#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28)
-#define BCH_NONCE_BTREE cpu_to_le32(2 << 28)
-#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28)
-#define BCH_NONCE_PRIO cpu_to_le32(4 << 28)
-#define BCH_NONCE_POLY cpu_to_le32(1 << 31)
-
-struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
- const void *, size_t);
-
-/*
- * This is used for various on-disk data structures - bch_sb, prio_set, bset,
- * jset: the checksum is _always_ the first field of these structs.
- */
-#define csum_vstruct(_c, _type, _nonce, _i) \
-({ \
- const void *_start = ((const void *) (_i)) + sizeof((_i)->csum);\
- \
- bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\
-})
-
-static inline void bch2_csum_to_text(struct printbuf *out,
- enum bch_csum_type type,
- struct bch_csum csum)
-{
- const u8 *p = (u8 *) &csum;
- unsigned bytes = type < BCH_CSUM_NR ? bch_crc_bytes[type] : 16;
-
- for (unsigned i = 0; i < bytes; i++)
- prt_hex_byte(out, p[i]);
-}
-
-static inline void bch2_csum_err_msg(struct printbuf *out,
- enum bch_csum_type type,
- struct bch_csum expected,
- struct bch_csum got)
-{
- prt_str(out, "checksum error, type ");
- bch2_prt_csum_type(out, type);
- prt_str(out, ": got ");
- bch2_csum_to_text(out, type, got);
- prt_str(out, " should be ");
- bch2_csum_to_text(out, type, expected);
-}
-
-int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
-int bch2_request_key(struct bch_sb *, struct bch_key *);
-#ifndef __KERNEL__
-int bch2_revoke_key(struct bch_sb *);
-#endif
-
-int bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
- void *data, size_t);
-
-struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
- struct nonce, struct bio *);
-
-int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
- struct bch_extent_crc_unpacked,
- struct bch_extent_crc_unpacked *,
- struct bch_extent_crc_unpacked *,
- unsigned, unsigned, unsigned);
-
-int __bch2_encrypt_bio(struct bch_fs *, unsigned,
- struct nonce, struct bio *);
-
-static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type,
- struct nonce nonce, struct bio *bio)
-{
- return bch2_csum_type_is_encryption(type)
- ? __bch2_encrypt_bio(c, type, nonce, bio)
- : 0;
-}
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
-
-int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
- struct bch_key *);
-
-#if 0
-int bch2_disable_encryption(struct bch_fs *);
-int bch2_enable_encryption(struct bch_fs *, bool);
-#endif
-
-void bch2_fs_encryption_exit(struct bch_fs *);
-int bch2_fs_encryption_init(struct bch_fs *);
-
-static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opt type,
- bool data)
-{
- switch (type) {
- case BCH_CSUM_OPT_none:
- return BCH_CSUM_none;
- case BCH_CSUM_OPT_crc32c:
- return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero;
- case BCH_CSUM_OPT_crc64:
- return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero;
- case BCH_CSUM_OPT_xxhash:
- return BCH_CSUM_xxhash;
- default:
- BUG();
- }
-}
-
-static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
- struct bch_io_opts opts)
-{
- if (opts.nocow)
- return 0;
-
- if (c->sb.encryption_type)
- return c->opts.wide_macs
- ? BCH_CSUM_chacha20_poly1305_128
- : BCH_CSUM_chacha20_poly1305_80;
-
- return bch2_csum_opt_to_type(opts.data_checksum, true);
-}
-
-static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
-{
- if (c->sb.encryption_type)
- return BCH_CSUM_chacha20_poly1305_128;
-
- return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
-}
-
-static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
- unsigned type)
-{
- if (type >= BCH_CSUM_NR)
- return false;
-
- if (bch2_csum_type_is_encryption(type) && !c->chacha20)
- return false;
-
- return true;
-}
-
-/* returns true if not equal */
-static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
-{
- /*
- * XXX: need some way of preventing the compiler from optimizing this
- * into a form that isn't constant time..
- */
- return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0;
-}
-
-/* for skipping ahead and encrypting/decrypting at an offset: */
-static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
-{
- EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
-
- le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
- return nonce;
-}
-
-static inline struct nonce null_nonce(void)
-{
- struct nonce ret;
-
- memset(&ret, 0, sizeof(ret));
- return ret;
-}
-
-static inline struct nonce extent_nonce(struct bversion version,
- struct bch_extent_crc_unpacked crc)
-{
- unsigned compression_type = crc_is_compressed(crc)
- ? crc.compression_type
- : 0;
- unsigned size = compression_type ? crc.uncompressed_size : 0;
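-	/* Pack uncompressed size, version and compression type into the nonce words: */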
- struct nonce nonce = (struct nonce) {{
- [0] = cpu_to_le32(size << 22),
- [1] = cpu_to_le32(version.lo),
- [2] = cpu_to_le32(version.lo >> 32),
- [3] = cpu_to_le32(version.hi|
- (compression_type << 24))^BCH_NONCE_EXTENT,
- }};
-
- return nonce_add(nonce, crc.nonce << 9);
-}
-
-static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key)
-{
- return le64_to_cpu(key->magic) != BCH_KEY_MAGIC;
-}
-
-static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb)
-{
- __le64 magic = __bch2_sb_magic(sb);
-
- return (struct nonce) {{
- [0] = 0,
- [1] = 0,
- [2] = ((__le32 *) &magic)[0],
- [3] = ((__le32 *) &magic)[1],
- }};
-}
-
-static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c)
-{
- __le64 magic = bch2_sb_magic(c);
-
- return (struct nonce) {{
- [0] = 0,
- [1] = 0,
- [2] = ((__le32 *) &magic)[0],
- [3] = ((__le32 *) &magic)[1],
- }};
-}
-
-#endif /* _BCACHEFS_CHECKSUM_H */
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
deleted file mode 100644
index 1f8e035d7119..000000000000
--- a/fs/bcachefs/clock.c
+++ /dev/null
@@ -1,192 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "clock.h"
-
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/preempt.h>
-
-static inline bool io_timer_cmp(const void *l, const void *r, void __always_unused *args)
-{
- struct io_timer **_l = (struct io_timer **)l;
- struct io_timer **_r = (struct io_timer **)r;
-
- return (*_l)->expire < (*_r)->expire;
-}
-
-static const struct min_heap_callbacks callbacks = {
- .less = io_timer_cmp,
- .swp = NULL,
-};
-
-void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
-{
- spin_lock(&clock->timer_lock);
-
- if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) {
- spin_unlock(&clock->timer_lock);
- timer->fn(timer);
- return;
- }
-
- for (size_t i = 0; i < clock->timers.nr; i++)
- if (clock->timers.data[i] == timer)
- goto out;
-
- BUG_ON(!min_heap_push(&clock->timers, &timer, &callbacks, NULL));
-out:
- spin_unlock(&clock->timer_lock);
-}
-
-void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
-{
- spin_lock(&clock->timer_lock);
-
- for (size_t i = 0; i < clock->timers.nr; i++)
- if (clock->timers.data[i] == timer) {
- min_heap_del(&clock->timers, i, &callbacks, NULL);
- break;
- }
-
- spin_unlock(&clock->timer_lock);
-}
-
-struct io_clock_wait {
- struct io_timer io_timer;
- struct timer_list cpu_timer;
- struct task_struct *task;
- int expired;
-};
-
-static void io_clock_wait_fn(struct io_timer *timer)
-{
- struct io_clock_wait *wait = container_of(timer,
- struct io_clock_wait, io_timer);
-
- wait->expired = 1;
- wake_up_process(wait->task);
-}
-
-static void io_clock_cpu_timeout(struct timer_list *timer)
-{
- struct io_clock_wait *wait = container_of(timer,
- struct io_clock_wait, cpu_timer);
-
- wait->expired = 1;
- wake_up_process(wait->task);
-}
-
-void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until)
-{
- struct io_clock_wait wait = {
- .io_timer.expire = until,
- .io_timer.fn = io_clock_wait_fn,
- .io_timer.fn2 = (void *) _RET_IP_,
- .task = current,
- };
-
- bch2_io_timer_add(clock, &wait.io_timer);
- schedule();
- bch2_io_timer_del(clock, &wait.io_timer);
-}
-
-void bch2_kthread_io_clock_wait(struct io_clock *clock,
- u64 io_until, unsigned long cpu_timeout)
-{
- bool kthread = (current->flags & PF_KTHREAD) != 0;
- struct io_clock_wait wait = {
- .io_timer.expire = io_until,
- .io_timer.fn = io_clock_wait_fn,
- .io_timer.fn2 = (void *) _RET_IP_,
- .task = current,
- };
-
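-	/* Wait on the IO clock, plus a wall-clock timer if a cpu timeout was given: */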
- bch2_io_timer_add(clock, &wait.io_timer);
-
- timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
-
- if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
- mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
-
- do {
- set_current_state(TASK_INTERRUPTIBLE);
- if (kthread && kthread_should_stop())
- break;
-
- if (wait.expired)
- break;
-
- schedule();
- try_to_freeze();
- } while (0);
-
- __set_current_state(TASK_RUNNING);
- del_timer_sync(&wait.cpu_timer);
- destroy_timer_on_stack(&wait.cpu_timer);
- bch2_io_timer_del(clock, &wait.io_timer);
-}
-
-static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now)
-{
- struct io_timer *ret = NULL;
-
- if (clock->timers.nr &&
- time_after_eq64(now, clock->timers.data[0]->expire)) {
- ret = *min_heap_peek(&clock->timers);
- min_heap_pop(&clock->timers, &callbacks, NULL);
- }
-
- return ret;
-}
-
-void __bch2_increment_clock(struct io_clock *clock, u64 sectors)
-{
- struct io_timer *timer;
- u64 now = atomic64_add_return(sectors, &clock->now);
-
- spin_lock(&clock->timer_lock);
- while ((timer = get_expired_timer(clock, now)))
- timer->fn(timer);
- spin_unlock(&clock->timer_lock);
-}
-
-void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
-{
- out->atomic++;
- spin_lock(&clock->timer_lock);
- u64 now = atomic64_read(&clock->now);
-
- printbuf_tabstop_push(out, 40);
- prt_printf(out, "current time:\t%llu\n", now);
-
- for (unsigned i = 0; i < clock->timers.nr; i++)
- prt_printf(out, "%ps %ps:\t%llu\n",
- clock->timers.data[i]->fn,
- clock->timers.data[i]->fn2,
- clock->timers.data[i]->expire);
- spin_unlock(&clock->timer_lock);
- --out->atomic;
-}
-
-void bch2_io_clock_exit(struct io_clock *clock)
-{
- free_heap(&clock->timers);
- free_percpu(clock->pcpu_buf);
-}
-
-int bch2_io_clock_init(struct io_clock *clock)
-{
- atomic64_set(&clock->now, 0);
- spin_lock_init(&clock->timer_lock);
-
- clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
-
- clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf);
- if (!clock->pcpu_buf)
- return -BCH_ERR_ENOMEM_io_clock_init;
-
- if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL))
- return -BCH_ERR_ENOMEM_io_clock_init;
-
- return 0;
-}
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
deleted file mode 100644
index 82c79c8baf92..000000000000
--- a/fs/bcachefs/clock.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_CLOCK_H
-#define _BCACHEFS_CLOCK_H
-
-void bch2_io_timer_add(struct io_clock *, struct io_timer *);
-void bch2_io_timer_del(struct io_clock *, struct io_timer *);
-void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long);
-
-void __bch2_increment_clock(struct io_clock *, u64);
-
-static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors,
- int rw)
-{
- struct io_clock *clock = &c->io_clock[rw];
-
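-	/* Batch on a percpu counter: only touch the shared atomic once IO_CLOCK_PCPU_SECTORS have accumulated: */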
- if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >=
- IO_CLOCK_PCPU_SECTORS))
- __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0));
-}
-
-void bch2_io_clock_schedule_timeout(struct io_clock *, u64);
-
-void bch2_io_timers_to_text(struct printbuf *, struct io_clock *);
-
-void bch2_io_clock_exit(struct io_clock *);
-int bch2_io_clock_init(struct io_clock *);
-
-#endif /* _BCACHEFS_CLOCK_H */
diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h
deleted file mode 100644
index 37554e4514fe..000000000000
--- a/fs/bcachefs/clock_types.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_CLOCK_TYPES_H
-#define _BCACHEFS_CLOCK_TYPES_H
-
-#include "util.h"
-
-#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3)
-
-/*
- * Clocks/timers in units of sectors of IO:
- *
- * Note - they use percpu batching, so they're only approximate.
- */
-
-struct io_timer;
-typedef void (*io_timer_fn)(struct io_timer *);
-
-struct io_timer {
- io_timer_fn fn;
- void *fn2;
- u64 expire;
-};
-
-/* Amount to buffer up on a percpu counter */
-#define IO_CLOCK_PCPU_SECTORS 128
-
-typedef DEFINE_MIN_HEAP(struct io_timer *, io_timer_heap) io_timer_heap;
-
-struct io_clock {
- atomic64_t now;
- u16 __percpu *pcpu_buf;
- unsigned max_slop;
-
- spinlock_t timer_lock;
- io_timer_heap timers;
-};
-
-#endif /* _BCACHEFS_CLOCK_TYPES_H */
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
deleted file mode 100644
index 85fc90342492..000000000000
--- a/fs/bcachefs/compress.c
+++ /dev/null
@@ -1,772 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "checksum.h"
-#include "compress.h"
-#include "error.h"
-#include "extents.h"
-#include "io_write.h"
-#include "opts.h"
-#include "super-io.h"
-
-#include <linux/lz4.h>
-#include <linux/zlib.h>
-#include <linux/zstd.h>
-
-static inline enum bch_compression_opts bch2_compression_type_to_opt(enum bch_compression_type type)
-{
- switch (type) {
- case BCH_COMPRESSION_TYPE_none:
- case BCH_COMPRESSION_TYPE_incompressible:
- return BCH_COMPRESSION_OPT_none;
- case BCH_COMPRESSION_TYPE_lz4_old:
- case BCH_COMPRESSION_TYPE_lz4:
- return BCH_COMPRESSION_OPT_lz4;
- case BCH_COMPRESSION_TYPE_gzip:
- return BCH_COMPRESSION_OPT_gzip;
- case BCH_COMPRESSION_TYPE_zstd:
- return BCH_COMPRESSION_OPT_zstd;
- default:
- BUG();
- }
-}
-
-/* Bounce buffer: */
-struct bbuf {
- void *b;
- enum {
- BB_NONE,
- BB_VMAP,
- BB_KMALLOC,
- BB_MEMPOOL,
- } type;
- int rw;
-};
-
-static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
-{
- void *b;
-
- BUG_ON(size > c->opts.encoded_extent_max);
-
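-	/* Try a plain kmalloc first; fall back to the preallocated mempool: */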
- b = kmalloc(size, GFP_NOFS|__GFP_NOWARN);
- if (b)
- return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw };
-
- b = mempool_alloc(&c->compression_bounce[rw], GFP_NOFS);
- if (b)
- return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
-
- BUG();
-}
-
-static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
-{
- struct bio_vec bv;
- struct bvec_iter iter;
- void *expected_start = NULL;
-
- __bio_for_each_bvec(bv, bio, iter, start) {
- if (expected_start &&
- expected_start != page_address(bv.bv_page) + bv.bv_offset)
- return false;
-
- expected_start = page_address(bv.bv_page) +
- bv.bv_offset + bv.bv_len;
- }
-
- return true;
-}
-
-static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
- struct bvec_iter start, int rw)
-{
- struct bbuf ret;
- struct bio_vec bv;
- struct bvec_iter iter;
- unsigned nr_pages = 0;
- struct page *stack_pages[16];
- struct page **pages = NULL;
- void *data;
-
- BUG_ON(start.bi_size > c->opts.encoded_extent_max);
-
- if (!PageHighMem(bio_iter_page(bio, start)) &&
- bio_phys_contig(bio, start))
- return (struct bbuf) {
- .b = page_address(bio_iter_page(bio, start)) +
- bio_iter_offset(bio, start),
- .type = BB_NONE, .rw = rw
- };
-
- /* check if we can map the pages contiguously: */
- __bio_for_each_segment(bv, bio, iter, start) {
- if (iter.bi_size != start.bi_size &&
- bv.bv_offset)
- goto bounce;
-
- if (bv.bv_len < iter.bi_size &&
- bv.bv_offset + bv.bv_len < PAGE_SIZE)
- goto bounce;
-
- nr_pages++;
- }
-
- BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages);
-
- pages = nr_pages > ARRAY_SIZE(stack_pages)
- ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS)
- : stack_pages;
- if (!pages)
- goto bounce;
-
- nr_pages = 0;
- __bio_for_each_segment(bv, bio, iter, start)
- pages[nr_pages++] = bv.bv_page;
-
- data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
- if (pages != stack_pages)
- kfree(pages);
-
- if (data)
- return (struct bbuf) {
- .b = data + bio_iter_offset(bio, start),
- .type = BB_VMAP, .rw = rw
- };
-bounce:
- ret = __bounce_alloc(c, start.bi_size, rw);
-
- if (rw == READ)
- memcpy_from_bio(ret.b, bio, start);
-
- return ret;
-}
-
-static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw)
-{
- return __bio_map_or_bounce(c, bio, bio->bi_iter, rw);
-}
-
-static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf)
-{
- switch (buf.type) {
- case BB_NONE:
- break;
- case BB_VMAP:
- vunmap((void *) ((unsigned long) buf.b & PAGE_MASK));
- break;
- case BB_KMALLOC:
- kfree(buf.b);
- break;
- case BB_MEMPOOL:
- mempool_free(buf.b, &c->compression_bounce[buf.rw]);
- break;
- }
-}
-
-static inline void zlib_set_workspace(z_stream *strm, void *workspace)
-{
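-	/* The kernel's zlib needs an external workspace; userspace zlib allocates its own state: */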
-#ifdef __KERNEL__
- strm->workspace = workspace;
-#endif
-}
-
-static int __bio_uncompress(struct bch_fs *c, struct bio *src,
- void *dst_data, struct bch_extent_crc_unpacked crc)
-{
- struct bbuf src_data = { NULL };
- size_t src_len = src->bi_iter.bi_size;
- size_t dst_len = crc.uncompressed_size << 9;
- void *workspace;
- int ret = 0, ret2;
-
- enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type);
- mempool_t *workspace_pool = &c->compress_workspace[opt];
- if (unlikely(!mempool_initialized(workspace_pool))) {
- if (fsck_err(c, compression_type_not_marked_in_sb,
- "compression type %s set but not marked in superblock",
- __bch2_compression_types[crc.compression_type]))
- ret = bch2_check_set_has_compressed_data(c, opt);
- else
- ret = -BCH_ERR_compression_workspace_not_initialized;
- if (ret)
- goto err;
- }
-
- src_data = bio_map_or_bounce(c, src, READ);
-
- switch (crc.compression_type) {
- case BCH_COMPRESSION_TYPE_lz4_old:
- case BCH_COMPRESSION_TYPE_lz4:
- ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data,
- src_len, dst_len, dst_len);
- if (ret2 != dst_len)
- ret = -BCH_ERR_decompress_lz4;
- break;
- case BCH_COMPRESSION_TYPE_gzip: {
- z_stream strm = {
- .next_in = src_data.b,
- .avail_in = src_len,
- .next_out = dst_data,
- .avail_out = dst_len,
- };
-
- workspace = mempool_alloc(workspace_pool, GFP_NOFS);
-
- zlib_set_workspace(&strm, workspace);
- zlib_inflateInit2(&strm, -MAX_WBITS);
- ret2 = zlib_inflate(&strm, Z_FINISH);
-
- mempool_free(workspace, workspace_pool);
-
- if (ret2 != Z_STREAM_END)
- ret = -BCH_ERR_decompress_gzip;
- break;
- }
- case BCH_COMPRESSION_TYPE_zstd: {
- ZSTD_DCtx *ctx;
- size_t real_src_len = le32_to_cpup(src_data.b);
-
- if (real_src_len > src_len - 4) {
- ret = -BCH_ERR_decompress_zstd_src_len_bad;
- goto err;
- }
-
- workspace = mempool_alloc(workspace_pool, GFP_NOFS);
- ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
-
- ret2 = zstd_decompress_dctx(ctx,
- dst_data, dst_len,
- src_data.b + 4, real_src_len);
-
- mempool_free(workspace, workspace_pool);
-
- if (ret2 != dst_len)
- ret = -BCH_ERR_decompress_zstd;
- break;
- }
- default:
- BUG();
- }
-err:
-fsck_err:
- bio_unmap_or_unbounce(c, src_data);
- return ret;
-}
-
-int bch2_bio_uncompress_inplace(struct bch_write_op *op,
- struct bio *bio)
-{
- struct bch_fs *c = op->c;
- struct bch_extent_crc_unpacked *crc = &op->crc;
- struct bbuf data = { NULL };
- size_t dst_len = crc->uncompressed_size << 9;
- int ret = 0;
-
- /* bio must own its pages: */
- BUG_ON(!bio->bi_vcnt);
- BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
-
- if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max) {
- bch2_write_op_error(op, op->pos.offset,
- "extent too big to decompress (%u > %u)",
- crc->uncompressed_size << 9, c->opts.encoded_extent_max);
- return -BCH_ERR_decompress_exceeded_max_encoded_extent;
- }
-
- data = __bounce_alloc(c, dst_len, WRITE);
-
- ret = __bio_uncompress(c, bio, data.b, *crc);
-
- if (c->opts.no_data_io)
- ret = 0;
-
- if (ret) {
- bch2_write_op_error(op, op->pos.offset, "%s", bch2_err_str(ret));
- goto err;
- }
-
- /*
-	 * XXX: don't have a good way to assert that the bio was allocated with
-	 * enough space; we depend on bch2_move_extent doing the right thing
- */
- bio->bi_iter.bi_size = crc->live_size << 9;
-
- memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9));
-
- crc->csum_type = 0;
- crc->compression_type = 0;
- crc->compressed_size = crc->live_size;
- crc->uncompressed_size = crc->live_size;
- crc->offset = 0;
- crc->csum = (struct bch_csum) { 0, 0 };
-err:
- bio_unmap_or_unbounce(c, data);
- return ret;
-}
-
-int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
- struct bio *dst, struct bvec_iter dst_iter,
- struct bch_extent_crc_unpacked crc)
-{
- struct bbuf dst_data = { NULL };
- size_t dst_len = crc.uncompressed_size << 9;
- int ret;
-
- if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max ||
- crc.compressed_size << 9 > c->opts.encoded_extent_max)
- return -BCH_ERR_decompress_exceeded_max_encoded_extent;
-
- dst_data = dst_len == dst_iter.bi_size
- ? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
- : __bounce_alloc(c, dst_len, WRITE);
-
- ret = __bio_uncompress(c, src, dst_data.b, crc);
- if (ret)
- goto err;
-
- if (dst_data.type != BB_NONE &&
- dst_data.type != BB_VMAP)
- memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9));
-err:
- bio_unmap_or_unbounce(c, dst_data);
- return ret;
-}
-
-static int attempt_compress(struct bch_fs *c,
- void *workspace,
- void *dst, size_t dst_len,
- void *src, size_t src_len,
- struct bch_compression_opt compression)
-{
- enum bch_compression_type compression_type =
- __bch2_compression_opt_to_type[compression.type];
-
- switch (compression_type) {
- case BCH_COMPRESSION_TYPE_lz4:
- if (compression.level < LZ4HC_MIN_CLEVEL) {
- int len = src_len;
- int ret = LZ4_compress_destSize(
- src, dst,
- &len, dst_len,
- workspace);
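-			/* Didn't consume all input: return bytes consumed, negated, as a hint: */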
- if (len < src_len)
- return -len;
-
- return ret;
- } else {
- int ret = LZ4_compress_HC(
- src, dst,
- src_len, dst_len,
- compression.level,
- workspace);
-
- return ret ?: -1;
- }
- case BCH_COMPRESSION_TYPE_gzip: {
- z_stream strm = {
- .next_in = src,
- .avail_in = src_len,
- .next_out = dst,
- .avail_out = dst_len,
- };
-
- zlib_set_workspace(&strm, workspace);
- zlib_deflateInit2(&strm,
- compression.level
- ? clamp_t(unsigned, compression.level,
- Z_BEST_SPEED, Z_BEST_COMPRESSION)
- : Z_DEFAULT_COMPRESSION,
- Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
- Z_DEFAULT_STRATEGY);
-
- if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
- return 0;
-
- if (zlib_deflateEnd(&strm) != Z_OK)
- return 0;
-
- return strm.total_out;
- }
- case BCH_COMPRESSION_TYPE_zstd: {
- /*
- * rescale:
- * zstd max compression level is 22, our max level is 15
- */
- unsigned level = min((compression.level * 3) / 2, zstd_max_clevel());
- ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max);
- ZSTD_CCtx *ctx = zstd_init_cctx(workspace, c->zstd_workspace_size);
-
- /*
- * ZSTD requires that when we decompress we pass in the exact
- * compressed size - rounding it up to the nearest sector
- * doesn't work, so we use the first 4 bytes of the buffer for
- * that.
- *
- * Additionally, the ZSTD code seems to have a bug where it will
- * write just past the end of the buffer - so subtract a fudge
- * factor (7 bytes) from the dst buffer size to account for
- * that.
- */
- size_t len = zstd_compress_cctx(ctx,
- dst + 4, dst_len - 4 - 7,
- src, src_len,
- &params);
- if (zstd_is_error(len))
- return 0;
-
- *((__le32 *) dst) = cpu_to_le32(len);
- return len + 4;
- }
- default:
- BUG();
- }
-}
-
-static unsigned __bio_compress(struct bch_fs *c,
- struct bio *dst, size_t *dst_len,
- struct bio *src, size_t *src_len,
- struct bch_compression_opt compression)
-{
- struct bbuf src_data = { NULL }, dst_data = { NULL };
- void *workspace;
- enum bch_compression_type compression_type =
- __bch2_compression_opt_to_type[compression.type];
- unsigned pad;
- int ret = 0;
-
- /* bch2_compression_decode catches unknown compression types: */
- BUG_ON(compression.type >= BCH_COMPRESSION_OPT_NR);
-
- mempool_t *workspace_pool = &c->compress_workspace[compression.type];
- if (unlikely(!mempool_initialized(workspace_pool))) {
- if (fsck_err(c, compression_opt_not_marked_in_sb,
- "compression opt %s set but not marked in superblock",
- bch2_compression_opts[compression.type])) {
- ret = bch2_check_set_has_compressed_data(c, compression.type);
- if (ret) /* memory allocation failure, don't compress */
- return 0;
- } else {
- return 0;
- }
- }
-
- /* If it's only one block, don't bother trying to compress: */
- if (src->bi_iter.bi_size <= c->opts.block_size)
- return BCH_COMPRESSION_TYPE_incompressible;
-
- dst_data = bio_map_or_bounce(c, dst, WRITE);
- src_data = bio_map_or_bounce(c, src, READ);
-
- workspace = mempool_alloc(workspace_pool, GFP_NOFS);
-
- *src_len = src->bi_iter.bi_size;
- *dst_len = dst->bi_iter.bi_size;
-
- /*
- * XXX: this algorithm sucks when the compression code doesn't tell us
- * how much would fit, like LZ4 does:
- */
- while (1) {
- if (*src_len <= block_bytes(c)) {
- ret = -1;
- break;
- }
-
- ret = attempt_compress(c, workspace,
- dst_data.b, *dst_len,
- src_data.b, *src_len,
- compression);
- if (ret > 0) {
- *dst_len = ret;
- ret = 0;
- break;
- }
-
- /* Didn't fit: should we retry with a smaller amount? */
- if (*src_len <= *dst_len) {
- ret = -1;
- break;
- }
-
- /*
- * If ret is negative, it's a hint as to how much data would fit
- */
- BUG_ON(-ret >= *src_len);
-
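-		/* Shrink the input: to the compressor's hint if we got one, else by half the overshoot: */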
- if (ret < 0)
- *src_len = -ret;
- else
- *src_len -= (*src_len - *dst_len) / 2;
- *src_len = round_down(*src_len, block_bytes(c));
- }
-
- mempool_free(workspace, workspace_pool);
-
- if (ret)
- goto err;
-
- /* Didn't get smaller: */
- if (round_up(*dst_len, block_bytes(c)) >= *src_len)
- goto err;
-
- pad = round_up(*dst_len, block_bytes(c)) - *dst_len;
-
- memset(dst_data.b + *dst_len, 0, pad);
- *dst_len += pad;
-
- if (dst_data.type != BB_NONE &&
- dst_data.type != BB_VMAP)
- memcpy_to_bio(dst, dst->bi_iter, dst_data.b);
-
- BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
- BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
- BUG_ON(*dst_len & (block_bytes(c) - 1));
- BUG_ON(*src_len & (block_bytes(c) - 1));
- ret = compression_type;
-out:
- bio_unmap_or_unbounce(c, src_data);
- bio_unmap_or_unbounce(c, dst_data);
- return ret;
-err:
- ret = BCH_COMPRESSION_TYPE_incompressible;
- goto out;
-fsck_err:
- ret = 0;
- goto out;
-}
-
-unsigned bch2_bio_compress(struct bch_fs *c,
- struct bio *dst, size_t *dst_len,
- struct bio *src, size_t *src_len,
- unsigned compression_opt)
-{
- unsigned orig_dst = dst->bi_iter.bi_size;
- unsigned orig_src = src->bi_iter.bi_size;
- unsigned compression_type;
-
- /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
- src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
- c->opts.encoded_extent_max);
- /* Don't generate a bigger output than input: */
- dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
-
- compression_type =
- __bio_compress(c, dst, dst_len, src, src_len,
- bch2_compression_decode(compression_opt));
-
- dst->bi_iter.bi_size = orig_dst;
- src->bi_iter.bi_size = orig_src;
- return compression_type;
-}
-
-static int __bch2_fs_compress_init(struct bch_fs *, u64);
-
-#define BCH_FEATURE_none 0
-
-static const unsigned bch2_compression_opt_to_feature[] = {
-#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
- BCH_COMPRESSION_OPTS()
-#undef x
-};
-
-#undef BCH_FEATURE_none
-
-static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
-{
- int ret = 0;
-
- if ((c->sb.features & f) == f)
- return 0;
-
- mutex_lock(&c->sb_lock);
-
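-	/* Recheck under sb_lock: another thread may have already set the feature bits: */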
- if ((c->sb.features & f) == f) {
- mutex_unlock(&c->sb_lock);
- return 0;
- }
-
- ret = __bch2_fs_compress_init(c, c->sb.features|f);
- if (ret) {
- mutex_unlock(&c->sb_lock);
- return ret;
- }
-
- c->disk_sb.sb->features[0] |= cpu_to_le64(f);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return 0;
-}
-
-int bch2_check_set_has_compressed_data(struct bch_fs *c,
- unsigned compression_opt)
-{
- unsigned compression_type = bch2_compression_decode(compression_opt).type;
-
- BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));
-
- return compression_type
- ? __bch2_check_set_has_compressed_data(c,
- 1ULL << bch2_compression_opt_to_feature[compression_type])
- : 0;
-}
-
-void bch2_fs_compress_exit(struct bch_fs *c)
-{
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
- mempool_exit(&c->compress_workspace[i]);
- mempool_exit(&c->compression_bounce[WRITE]);
- mempool_exit(&c->compression_bounce[READ]);
-}
-
-static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
-{
- ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
- c->opts.encoded_extent_max);
-
- c->zstd_workspace_size = zstd_cctx_workspace_bound(&params.cParams);
-
- struct {
- unsigned feature;
- enum bch_compression_opts type;
- size_t compress_workspace;
- } compression_types[] = {
- { BCH_FEATURE_lz4, BCH_COMPRESSION_OPT_lz4,
- max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) },
- { BCH_FEATURE_gzip, BCH_COMPRESSION_OPT_gzip,
- max(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
- zlib_inflate_workspacesize()) },
- { BCH_FEATURE_zstd, BCH_COMPRESSION_OPT_zstd,
- max(c->zstd_workspace_size,
- zstd_dctx_workspace_bound()) },
- }, *i;
- bool have_compressed = false;
-
- for (i = compression_types;
- i < compression_types + ARRAY_SIZE(compression_types);
- i++)
- have_compressed |= (features & (1 << i->feature)) != 0;
-
- if (!have_compressed)
- return 0;
-
- if (!mempool_initialized(&c->compression_bounce[READ]) &&
- mempool_init_kvmalloc_pool(&c->compression_bounce[READ],
- 1, c->opts.encoded_extent_max))
- return -BCH_ERR_ENOMEM_compression_bounce_read_init;
-
- if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
- mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE],
- 1, c->opts.encoded_extent_max))
- return -BCH_ERR_ENOMEM_compression_bounce_write_init;
-
- for (i = compression_types;
- i < compression_types + ARRAY_SIZE(compression_types);
- i++) {
- if (!(features & (1 << i->feature)))
- continue;
-
- if (mempool_initialized(&c->compress_workspace[i->type]))
- continue;
-
- if (mempool_init_kvmalloc_pool(
- &c->compress_workspace[i->type],
- 1, i->compress_workspace))
- return -BCH_ERR_ENOMEM_compression_workspace_init;
- }
-
- return 0;
-}
-
-static u64 compression_opt_to_feature(unsigned v)
-{
- unsigned type = bch2_compression_decode(v).type;
-
- return BIT_ULL(bch2_compression_opt_to_feature[type]);
-}
-
-int bch2_fs_compress_init(struct bch_fs *c)
-{
- u64 f = c->sb.features;
-
- f |= compression_opt_to_feature(c->opts.compression);
- f |= compression_opt_to_feature(c->opts.background_compression);
-
- return __bch2_fs_compress_init(c, f);
-}
-
-int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res,
- struct printbuf *err)
-{
- char *val = kstrdup(_val, GFP_KERNEL);
- char *p = val, *type_str, *level_str;
- struct bch_compression_opt opt = { 0 };
- int ret;
-
- if (!val)
- return -ENOMEM;
-
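-	/* Accepted forms: "type" or "type:level", e.g. "lz4" or "zstd:7": */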
- type_str = strsep(&p, ":");
- level_str = p;
-
- ret = match_string(bch2_compression_opts, -1, type_str);
- if (ret < 0 && err)
- prt_str(err, "invalid compression type");
- if (ret < 0)
- goto err;
-
- opt.type = ret;
-
- if (level_str) {
- unsigned level;
-
- ret = kstrtouint(level_str, 10, &level);
- if (!ret && !opt.type && level)
- ret = -EINVAL;
- if (!ret && level > 15)
- ret = -EINVAL;
- if (ret < 0 && err)
- prt_str(err, "invalid compression level");
- if (ret < 0)
- goto err;
-
- opt.level = level;
- }
-
- *res = bch2_compression_encode(opt);
-err:
- kfree(val);
- return ret;
-}
-
-void bch2_compression_opt_to_text(struct printbuf *out, u64 v)
-{
- struct bch_compression_opt opt = bch2_compression_decode(v);
-
- if (opt.type < BCH_COMPRESSION_OPT_NR)
- prt_str(out, bch2_compression_opts[opt.type]);
- else
- prt_printf(out, "(unknown compression opt %u)", opt.type);
- if (opt.level)
- prt_printf(out, ":%u", opt.level);
-}
-
-void bch2_opt_compression_to_text(struct printbuf *out,
- struct bch_fs *c,
- struct bch_sb *sb,
- u64 v)
-{
- return bch2_compression_opt_to_text(out, v);
-}
-
-int bch2_opt_compression_validate(u64 v, struct printbuf *err)
-{
- if (!bch2_compression_opt_valid(v)) {
- prt_printf(err, "invalid compression opt %llu", v);
- return -BCH_ERR_invalid_sb_opt_compression;
- }
-
- return 0;
-}
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
deleted file mode 100644
index bec2f05bfd52..000000000000
--- a/fs/bcachefs/compress.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_COMPRESS_H
-#define _BCACHEFS_COMPRESS_H
-
-#include "extents_types.h"
-
-static const unsigned __bch2_compression_opt_to_type[] = {
-#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
- BCH_COMPRESSION_OPTS()
-#undef x
-};
-
-struct bch_compression_opt {
- u8 type:4,
- level:4;
-};
-
-static inline struct bch_compression_opt __bch2_compression_decode(unsigned v)
-{
- return (struct bch_compression_opt) {
- .type = v & 15,
- .level = v >> 4,
- };
-}
-
-static inline bool bch2_compression_opt_valid(unsigned v)
-{
- struct bch_compression_opt opt = __bch2_compression_decode(v);
-
- return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level);
-}
-
-static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
-{
- return bch2_compression_opt_valid(v)
- ? __bch2_compression_decode(v)
- : (struct bch_compression_opt) { 0 };
-}
-
-static inline unsigned bch2_compression_encode(struct bch_compression_opt opt)
-{
- return opt.type|(opt.level << 4);
-}
-
-static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
-{
- return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
-}
-
-struct bch_write_op;
-int bch2_bio_uncompress_inplace(struct bch_write_op *, struct bio *);
-int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
- struct bvec_iter, struct bch_extent_crc_unpacked);
-unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
- struct bio *, size_t *, unsigned);
-
-int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
-void bch2_fs_compress_exit(struct bch_fs *);
-int bch2_fs_compress_init(struct bch_fs *);
-
-void bch2_compression_opt_to_text(struct printbuf *, u64);
-
-int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
-void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
-int bch2_opt_compression_validate(u64, struct printbuf *);
-
-#define bch2_opt_compression (struct bch_opt_fn) { \
- .parse = bch2_opt_compression_parse, \
- .to_text = bch2_opt_compression_to_text, \
- .validate = bch2_opt_compression_validate, \
-}
-
-#endif /* _BCACHEFS_COMPRESS_H */
diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c
deleted file mode 100644
index e86d36d23e9e..000000000000
--- a/fs/bcachefs/darray.c
+++ /dev/null
@@ -1,38 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/log2.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include "darray.h"
-
-int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
-{
- if (new_size > d->size) {
- new_size = roundup_pow_of_two(new_size);
-
- /*
- * This is a workaround: kvmalloc() doesn't support > INT_MAX
- * allocations, but vmalloc() does.
-		 * The limit needs to be lifted from kvmalloc, and once it is,
-		 * we'll go back to just using that.
- */
- size_t bytes;
- if (unlikely(check_mul_overflow(new_size, element_size, &bytes)))
- return -ENOMEM;
-
- void *data = likely(bytes < INT_MAX)
- ? kvmalloc_noprof(bytes, gfp)
- : vmalloc_noprof(bytes);
- if (!data)
- return -ENOMEM;
-
- if (d->size)
- memcpy(data, d->data, d->size * element_size);
- if (d->data != d->preallocated)
- kvfree(d->data);
- d->data = data;
- d->size = new_size;
- }
-
- return 0;
-}
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
deleted file mode 100644
index c6151495985f..000000000000
--- a/fs/bcachefs/darray.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DARRAY_H
-#define _BCACHEFS_DARRAY_H
-
-/*
- * Dynamic arrays:
- *
- * Inspired by CCAN's darray
- */
-
-#include <linux/slab.h>
-
-#define DARRAY_PREALLOCATED(_type, _nr) \
-struct { \
- size_t nr, size; \
- _type *data; \
- _type preallocated[_nr]; \
-}
-
-#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
-
-typedef DARRAY(char) darray_char;
-typedef DARRAY(char *) darray_str;
-
-int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
-
-#define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__))
-
-#define __darray_resize(_d, _element_size, _new_size, _gfp) \
- (unlikely((_new_size) > (_d)->size) \
- ? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\
- : 0)
-
-#define darray_resize_gfp(_d, _new_size, _gfp) \
- __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)
-
-#define darray_resize(_d, _new_size) \
- darray_resize_gfp(_d, _new_size, GFP_KERNEL)
-
-#define darray_make_room_gfp(_d, _more, _gfp) \
- darray_resize_gfp((_d), (_d)->nr + (_more), _gfp)
-
-#define darray_make_room(_d, _more) \
- darray_make_room_gfp(_d, _more, GFP_KERNEL)
-
-#define darray_room(_d) ((_d).size - (_d).nr)
-
-#define darray_top(_d) ((_d).data[(_d).nr])
-
-#define darray_push_gfp(_d, _item, _gfp) \
-({ \
- int _ret = darray_make_room_gfp((_d), 1, _gfp); \
- \
- if (!_ret) \
- (_d)->data[(_d)->nr++] = (_item); \
- _ret; \
-})
-
-#define darray_push(_d, _item) darray_push_gfp(_d, _item, GFP_KERNEL)
-
-#define darray_pop(_d) ((_d)->data[--(_d)->nr])
-
-#define darray_first(_d) ((_d).data[0])
-#define darray_last(_d) ((_d).data[(_d).nr - 1])
-
-#define darray_insert_item(_d, pos, _item) \
-({ \
- size_t _pos = (pos); \
- int _ret = darray_make_room((_d), 1); \
- \
- if (!_ret) \
- array_insert_item((_d)->data, (_d)->nr, _pos, (_item)); \
- _ret; \
-})
-
-#define darray_remove_item(_d, _pos) \
- array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
-
-#define __darray_for_each(_d, _i) \
- for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++)
-
-#define darray_for_each(_d, _i) \
- for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++)
-
-#define darray_for_each_reverse(_d, _i) \
- for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i)
-
-#define darray_init(_d) \
-do { \
- (_d)->nr = 0; \
- (_d)->size = ARRAY_SIZE((_d)->preallocated); \
- (_d)->data = (_d)->size ? (_d)->preallocated : NULL; \
-} while (0)
-
-#define darray_exit(_d) \
-do { \
- if (!ARRAY_SIZE((_d)->preallocated) || \
- (_d)->data != (_d)->preallocated) \
- kvfree((_d)->data); \
- darray_init(_d); \
-} while (0)
-
-#endif /* _BCACHEFS_DARRAY_H */
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
deleted file mode 100644
index 0ec273daccb7..000000000000
--- a/fs/bcachefs/data_update.c
+++ /dev/null
@@ -1,907 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "compress.h"
-#include "data_update.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "error.h"
-#include "extents.h"
-#include "io_write.h"
-#include "keylist.h"
-#include "move.h"
-#include "nocow_locking.h"
-#include "rebalance.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "trace.h"
-
-#include <linux/ioprio.h>
-
-static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(ptrs, ptr)
- bch2_dev_put(bch2_dev_have_ref(c, ptr->dev));
-}
-
-static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(ptrs, ptr) {
- if (unlikely(!bch2_dev_tryget(c, ptr->dev))) {
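-			/* Unwind: drop the refs we already took, stopping at @ptr: */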
- bkey_for_each_ptr(ptrs, ptr2) {
- if (ptr2 == ptr)
- break;
- bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev));
- }
- return false;
- }
- }
- return true;
-}
-
-static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
- bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
- }
-}
-
-static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
- if (ctxt) {
- bool locked;
-
- move_ctxt_wait_event(ctxt,
- (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
- list_empty(&ctxt->ios));
-
- if (!locked)
- bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
- } else {
- if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) {
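-				/* Trylock failed: unlock the buckets locked so far: */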
- bkey_for_each_ptr(ptrs, ptr2) {
- if (ptr2 == ptr)
- break;
-
- ca = bch2_dev_have_ref(c, ptr2->dev);
- bucket = PTR_BUCKET_POS(ca, ptr2);
- bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
- }
- return false;
- }
- }
- }
- return true;
-}
-
-static noinline void trace_io_move_finish2(struct data_update *u,
- struct bkey_i *new,
- struct bkey_i *insert)
-{
- struct bch_fs *c = u->op.c;
- struct printbuf buf = PRINTBUF;
-
- prt_newline(&buf);
-
- bch2_data_update_to_text(&buf, u);
- prt_newline(&buf);
-
- prt_str_indented(&buf, "new replicas:\t");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
- prt_newline(&buf);
-
- prt_str_indented(&buf, "insert:\t");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
- prt_newline(&buf);
-
- trace_io_move_finish(c, buf.buf);
- printbuf_exit(&buf);
-}
-
-static void trace_io_move_fail2(struct data_update *m,
- struct bkey_s_c new,
- struct bkey_s_c wrote,
- struct bkey_i *insert,
- const char *msg)
-{
- struct bch_fs *c = m->op.c;
- struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
- struct printbuf buf = PRINTBUF;
- unsigned rewrites_found = 0;
-
- if (!trace_io_move_fail_enabled())
- return;
-
- prt_str(&buf, msg);
-
- if (insert) {
- const union bch_extent_entry *entry;
- struct bch_extent_ptr *ptr;
- struct extent_ptr_decoded p;
-
- unsigned ptr_bit = 1;
- bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
- if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
- (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
- !ptr->cached)
- rewrites_found |= ptr_bit;
- ptr_bit <<= 1;
- }
- }
-
- prt_str(&buf, "rewrites found:\t");
- bch2_prt_u64_base2(&buf, rewrites_found);
- prt_newline(&buf);
-
- bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
-
- prt_str(&buf, "\nold: ");
- bch2_bkey_val_to_text(&buf, c, old);
-
- prt_str(&buf, "\nnew: ");
- bch2_bkey_val_to_text(&buf, c, new);
-
- prt_str(&buf, "\nwrote: ");
- bch2_bkey_val_to_text(&buf, c, wrote);
-
- if (insert) {
- prt_str(&buf, "\ninsert: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
- }
-
- trace_io_move_fail(c, buf.buf);
- printbuf_exit(&buf);
-}
-
-static int __bch2_data_update_index_update(struct btree_trans *trans,
- struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct btree_iter iter;
- struct data_update *m =
- container_of(op, struct data_update, op);
- struct keylist *keys = &op->insert_keys;
- struct bkey_buf _new, _insert;
- int ret = 0;
-
- bch2_bkey_buf_init(&_new);
- bch2_bkey_buf_init(&_insert);
- bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
-
- bch2_trans_iter_init(trans, &iter, m->btree_id,
- bkey_start_pos(&bch2_keylist_front(keys)->k),
- BTREE_ITER_slots|BTREE_ITER_intent);
-
- while (1) {
- struct bkey_s_c k;
- struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
- struct bkey_i *insert = NULL;
- struct bkey_i_extent *new;
- const union bch_extent_entry *entry_c;
- union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- struct bch_extent_ptr *ptr;
- const struct bch_extent_ptr *ptr_c;
- struct bpos next_pos;
- bool should_check_enospc;
- s64 i_sectors_delta = 0, disk_sectors_delta = 0;
- unsigned rewrites_found = 0, durability, ptr_bit;
-
- bch2_trans_begin(trans);
-
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- new = bkey_i_to_extent(bch2_keylist_front(keys));
-
- if (!bch2_extents_match(k, old)) {
- trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i),
- NULL, "no match:");
- goto nowork;
- }
-
- bkey_reassemble(_insert.k, k);
- insert = _insert.k;
-
- bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
- new = bkey_i_to_extent(_new.k);
- bch2_cut_front(iter.pos, &new->k_i);
-
- bch2_cut_front(iter.pos, insert);
- bch2_cut_back(new->k.p, insert);
- bch2_cut_back(insert->k.p, &new->k_i);
-
- /*
- * @old: extent that we read from
- * @insert: key that we're going to update, initialized from
- * extent currently in btree - same as @old unless we raced with
- * other updates
- * @new: extent with new pointers that we'll be adding to @insert
- *
-		 * First, drop rewrite_ptrs from @new:
- */
- ptr_bit = 1;
- bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) {
- if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
- (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
- !ptr->cached) {
- bch2_extent_ptr_set_cached(c, &m->op.opts,
- bkey_i_to_s(insert), ptr);
- rewrites_found |= ptr_bit;
- }
- ptr_bit <<= 1;
- }
-
- if (m->data_opts.rewrite_ptrs &&
- !rewrites_found &&
- bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) {
- trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
- goto nowork;
- }
-
- /*
- * A replica that we just wrote might conflict with a replica
- * that we want to keep, due to racing with another move:
- */
-restart_drop_conflicting_replicas:
- extent_for_each_ptr(extent_i_to_s(new), ptr)
- if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) &&
- !ptr_c->cached) {
- bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr);
- goto restart_drop_conflicting_replicas;
- }
-
- if (!bkey_val_u64s(&new->k)) {
- trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
- goto nowork;
- }
-
- /* Now, drop pointers that conflict with what we just wrote: */
- extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
- if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev)))
- bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
-
- durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) +
- bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));
-
- /* Now, drop excess replicas: */
- rcu_read_lock();
-restart_drop_extra_replicas:
- bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
- unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
-
- if (!p.ptr.cached &&
- durability - ptr_durability >= m->op.opts.data_replicas) {
- durability -= ptr_durability;
-
- bch2_extent_ptr_set_cached(c, &m->op.opts,
- bkey_i_to_s(insert), &entry->ptr);
- goto restart_drop_extra_replicas;
- }
- }
- rcu_read_unlock();
-
- /* Finally, add the pointers we just wrote: */
- extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
- bch2_extent_ptr_decoded_append(insert, &p);
-
- bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
- bch2_extent_normalize_by_opts(c, &m->op.opts, bkey_i_to_s(insert));
-
- ret = bch2_sum_sector_overwrites(trans, &iter, insert,
- &should_check_enospc,
- &i_sectors_delta,
- &disk_sectors_delta);
- if (ret)
- goto err;
-
- if (disk_sectors_delta > (s64) op->res.sectors) {
- ret = bch2_disk_reservation_add(c, &op->res,
- disk_sectors_delta - op->res.sectors,
- !should_check_enospc
- ? BCH_DISK_RESERVATION_NOFAIL : 0);
- if (ret)
- goto out;
- }
-
- next_pos = insert->k.p;
-
- /*
- * Check for nonce offset inconsistency:
- * This is debug code - we've been seeing this bug rarely, and
- * it's been hard to reproduce, so this should give us some more
- * information when it does occur:
- */
- int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert),
- (struct bkey_validate_context) {
- .btree = m->btree_id,
- .flags = BCH_VALIDATE_commit,
- });
- if (invalid) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "about to insert invalid key in data update path");
- prt_printf(&buf, "\nop.nonce: %u", m->op.nonce);
- prt_str(&buf, "\nold: ");
- bch2_bkey_val_to_text(&buf, c, old);
- prt_str(&buf, "\nk: ");
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, "\nnew: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-
- bch2_fatal_error(c);
- ret = -BCH_ERR_invalid_bkey;
- goto out;
- }
-
- if (trace_data_update_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "\nold: ");
- bch2_bkey_val_to_text(&buf, c, old);
- prt_str(&buf, "\nk: ");
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, "\nnew: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-
- trace_data_update(c, buf.buf);
- printbuf_exit(&buf);
- }
-
- ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
- k.k->p, bkey_start_pos(&insert->k)) ?:
- bch2_insert_snapshot_whiteouts(trans, m->btree_id,
- k.k->p, insert->k.p) ?:
- bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?:
- bch2_trans_update(trans, &iter, insert,
- BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_trans_commit(trans, &op->res,
- NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc|
- m->data_opts.btree_insert_flags);
- if (!ret) {
- bch2_btree_iter_set_pos(&iter, next_pos);
-
- this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size);
- if (trace_io_move_finish_enabled())
- trace_io_move_finish2(m, &new->k_i, insert);
- }
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- ret = 0;
- if (ret)
- break;
-next:
- while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) {
- bch2_keylist_pop_front(keys);
- if (bch2_keylist_empty(keys))
- goto out;
- }
- continue;
-nowork:
- if (m->stats) {
- BUG_ON(k.k->p.offset <= iter.pos.offset);
- atomic64_inc(&m->stats->keys_raced);
- atomic64_add(k.k->p.offset - iter.pos.offset,
- &m->stats->sectors_raced);
- }
-
- count_event(c, io_move_fail);
-
- bch2_btree_iter_advance(&iter);
- goto next;
- }
-out:
- bch2_trans_iter_exit(trans, &iter);
- bch2_bkey_buf_exit(&_insert, c);
- bch2_bkey_buf_exit(&_new, c);
- BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
- return ret;
-}
-
-int bch2_data_update_index_update(struct bch_write_op *op)
-{
- return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op));
-}
-
-void bch2_data_update_read_done(struct data_update *m)
-{
- m->read_done = true;
-
- /* write bio must own pages: */
- BUG_ON(!m->op.wbio.bio.bi_vcnt);
-
- m->op.crc = m->rbio.pick.crc;
- m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
-
- this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size);
-
- closure_call(&m->op.cl, bch2_write, NULL, NULL);
-}
-
-void bch2_data_update_exit(struct data_update *update)
-{
- struct bch_fs *c = update->op.c;
- struct bkey_s_c k = bkey_i_to_s_c(update->k.k);
-
- bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
- kfree(update->bvecs);
- update->bvecs = NULL;
-
- if (c->opts.nocow_enabled)
- bkey_nocow_unlock(c, k);
- bkey_put_dev_refs(c, k);
- bch2_disk_reservation_put(c, &update->op.res);
- bch2_bkey_buf_exit(&update->k, c);
-}
-
-static int bch2_update_unwritten_extent(struct btree_trans *trans,
- struct data_update *update)
-{
- struct bch_fs *c = update->op.c;
- struct bkey_i_extent *e;
- struct write_point *wp;
- struct closure cl;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- closure_init_stack(&cl);
- bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys);
-
- while (bpos_lt(update->op.pos, update->k.k->k.p)) {
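-	/* Allocate and insert unwritten extents until the whole key range is covered: */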
- unsigned sectors = update->k.k->k.p.offset -
- update->op.pos.offset;
-
- bch2_trans_begin(trans);
-
- bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos,
- BTREE_ITER_slots);
- ret = lockrestart_do(trans, ({
- k = bch2_btree_iter_peek_slot(&iter);
- bkey_err(k);
- }));
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k)))
- break;
-
- e = bkey_extent_init(update->op.insert_keys.top);
- e->k.p = update->op.pos;
-
- ret = bch2_alloc_sectors_start_trans(trans,
- update->op.target,
- false,
- update->op.write_point,
- &update->op.devs_have,
- update->op.nr_replicas,
- update->op.nr_replicas,
- update->op.watermark,
- 0, &cl, &wp);
- if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) {
- bch2_trans_unlock(trans);
- closure_sync(&cl);
- continue;
- }
-
- bch_err_fn_ratelimited(c, ret);
-
- if (ret)
- break;
-
- sectors = min(sectors, wp->sectors_free);
-
- bch2_key_resize(&e->k, sectors);
-
- bch2_open_bucket_get(c, wp, &update->op.open_buckets);
- bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
- bch2_alloc_sectors_done(c, wp);
-
- update->op.pos.offset += sectors;
-
- extent_for_each_ptr(extent_i_to_s(e), ptr)
- ptr->unwritten = true;
- bch2_keylist_push(&update->op.insert_keys);
-
- ret = __bch2_data_update_index_update(trans, &update->op);
-
- bch2_open_buckets_put(c, &update->op.open_buckets);
-
- if (ret)
- break;
- }
-
- if (closure_nr_remaining(&cl) != 1) {
- bch2_trans_unlock(trans);
- closure_sync(&cl);
- }
-
- return ret;
-}
-
-void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 20);
-
- prt_str_indented(out, "rewrite ptrs:\t");
- bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
- prt_newline(out);
-
- prt_str_indented(out, "kill ptrs:\t");
- bch2_prt_u64_base2(out, data_opts->kill_ptrs);
- prt_newline(out);
-
- prt_str_indented(out, "target:\t");
- bch2_target_to_text(out, c, data_opts->target);
- prt_newline(out);
-
- prt_str_indented(out, "compression:\t");
- bch2_compression_opt_to_text(out, io_opts->background_compression);
- prt_newline(out);
-
- prt_str_indented(out, "opts.replicas:\t");
- prt_u64(out, io_opts->data_replicas);
- prt_newline(out);
-
- prt_str_indented(out, "extra replicas:\t");
- prt_u64(out, data_opts->extra_replicas);
-}
-
-void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
-{
- bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
- prt_newline(out);
-
- prt_str_indented(out, "old key:\t");
- bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
-}
-
-void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m)
-{
- bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
- prt_newline(out);
- printbuf_indent_add(out, 2);
- bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
-	prt_printf(out, "read_done:\t%u\n", m->read_done);
- bch2_write_op_to_text(out, &m->op);
- printbuf_indent_sub(out, 2);
-}
-
-int bch2_extent_drop_ptrs(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- struct bch_fs *c = trans->c;
- struct bkey_i *n;
- int ret;
-
- n = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- return ret;
-
- while (data_opts->kill_ptrs) {
- unsigned i = 0, drop = __fls(data_opts->kill_ptrs);
-
- bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop);
- data_opts->kill_ptrs ^= 1U << drop;
- }
-
- /*
- * If the new extent no longer has any pointers, bch2_extent_normalize()
- * will do the appropriate thing with it (turning it into a
- * KEY_TYPE_error key, or just a discard if it was a cached extent)
- */
- bch2_extent_normalize_by_opts(c, io_opts, bkey_i_to_s(n));
-
- /*
- * Since we're not inserting through an extent iterator
- * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
- * we aren't using the extent overwrite path to delete, we're
- * just using the normal key deletion path:
- */
- if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_is_extents))
- n->k.size = 0;
-
- return bch2_trans_relock(trans) ?:
- bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-}
-
-int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
- struct bch_io_opts *io_opts)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- /* write path might have to decompress data: */
- unsigned buf_bytes = 0;
- bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
- buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
-
- unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
-
-	m->bvecs = kmalloc_array(nr_vecs, sizeof(*m->bvecs), GFP_KERNEL);
- if (!m->bvecs)
- return -ENOMEM;
-
- bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ);
- bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0);
-
- if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) {
- kfree(m->bvecs);
- m->bvecs = NULL;
- return -ENOMEM;
- }
-
- rbio_init(&m->rbio.bio, c, *io_opts, NULL);
- m->rbio.data_update = true;
- m->rbio.bio.bi_iter.bi_size = buf_bytes;
- m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k);
- m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
- return 0;
-}
-
-static int can_write_extent(struct bch_fs *c, struct data_update *m)
-{
- if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
- unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
- return -BCH_ERR_data_update_done_would_block;
-
- unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
- ? m->op.target
- : 0;
- struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
-
- darray_for_each(m->op.devs_have, i)
- __clear_bit(*i, devs.d);
-
- rcu_read_lock();
- unsigned nr_replicas = 0, i;
- for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
- struct bch_dev *ca = bch2_dev_rcu(c, i);
-
- struct bch_dev_usage usage;
- bch2_dev_usage_read_fast(ca, &usage);
-
- if (!dev_buckets_free(ca, usage, m->op.watermark))
- continue;
-
- nr_replicas += ca->mi.durability;
- if (nr_replicas >= m->op.nr_replicas)
- break;
- }
- rcu_read_unlock();
-
- if (!nr_replicas)
- return -BCH_ERR_data_update_done_no_rw_devs;
- if (nr_replicas < m->op.nr_replicas)
- return -BCH_ERR_insufficient_devices;
- return 0;
-}
-
-int bch2_data_update_init(struct btree_trans *trans,
- struct btree_iter *iter,
- struct moving_context *ctxt,
- struct data_update *m,
- struct write_point_specifier wp,
- struct bch_io_opts *io_opts,
- struct data_update_opts data_opts,
- enum btree_id btree_id,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned reserve_sectors = k.k->size * data_opts.extra_replicas;
- int ret = 0;
-
- /*
-	 * fs is corrupt: we have a key for a snapshot node that doesn't exist,
- * and we have to check for this because we go rw before repairing the
- * snapshots table - just skip it, we can move it later.
- */
- if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot)))
- return -BCH_ERR_data_update_done_no_snapshot;
-
- bch2_bkey_buf_init(&m->k);
- bch2_bkey_buf_reassemble(&m->k, c, k);
- m->btree_id = btree_id;
- m->data_opts = data_opts;
- m->ctxt = ctxt;
- m->stats = ctxt ? ctxt->stats : NULL;
-
- bch2_write_op_init(&m->op, c, *io_opts);
- m->op.pos = bkey_start_pos(k.k);
- m->op.version = k.k->bversion;
- m->op.target = data_opts.target;
- m->op.write_point = wp;
- m->op.nr_replicas = 0;
- m->op.flags |= BCH_WRITE_pages_stable|
- BCH_WRITE_pages_owned|
- BCH_WRITE_data_encoded|
- BCH_WRITE_move|
- m->data_opts.write_flags;
- m->op.compression_opt = io_opts->background_compression;
- m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
-
- unsigned durability_have = 0, durability_removing = 0;
-
- unsigned ptr_bit = 1;
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (!p.ptr.cached) {
- rcu_read_lock();
- if (ptr_bit & m->data_opts.rewrite_ptrs) {
- if (crc_is_compressed(p.crc))
- reserve_sectors += k.k->size;
-
- m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
- durability_removing += bch2_extent_ptr_desired_durability(c, &p);
- } else if (!(ptr_bit & m->data_opts.kill_ptrs)) {
- bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
- durability_have += bch2_extent_ptr_durability(c, &p);
- }
- rcu_read_unlock();
- }
-
- /*
- * op->csum_type is normally initialized from the fs/file's
- * current options - but if an extent is encrypted, we require
- * that it stays encrypted:
- */
- if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
- m->op.nonce = p.crc.nonce + p.crc.offset;
- m->op.csum_type = p.crc.csum_type;
- }
-
- if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
- m->op.incompressible = true;
-
- ptr_bit <<= 1;
- }
-
- unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
-
- /*
- * If current extent durability is less than io_opts.data_replicas,
- * we're not trying to rereplicate the extent up to data_replicas here -
- * unless extra_replicas was specified
- *
- * Increasing replication is an explicit operation triggered by
- * rereplicate, currently, so that users don't get an unexpected -ENOSPC
- */
- m->op.nr_replicas = min(durability_removing, durability_required) +
- m->data_opts.extra_replicas;
-
- /*
- * If device(s) were set to durability=0 after data was written to them
-	 * we can end up with a durability=0 extent, and the normal algorithm
- * that tries not to increase durability doesn't work:
- */
- if (!(durability_have + durability_removing))
- m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);
-
- m->op.nr_replicas_required = m->op.nr_replicas;
-
- /*
- * It might turn out that we don't need any new replicas, if the
- * replicas or durability settings have been changed since the extent
- * was written:
- */
- if (!m->op.nr_replicas) {
- m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
- m->data_opts.rewrite_ptrs = 0;
- /* if iter == NULL, it's just a promote */
- if (iter)
- ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts);
- if (!ret)
- ret = -BCH_ERR_data_update_done_no_writes_needed;
- goto out_bkey_buf_exit;
- }
-
- /*
- * Check if the allocation will succeed, to avoid getting an error later
- * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless
- * read:
- *
- * This guards against
- * - BCH_WRITE_alloc_nowait allocations failing (promotes)
- * - Destination target full
- * - Device(s) in destination target offline
- * - Insufficient durability available in destination target
- * (i.e. trying to move a durability=2 replica to a target with a
-	 *    single durability=1 device)
- */
- ret = can_write_extent(c, m);
- if (ret)
- goto out_bkey_buf_exit;
-
- if (reserve_sectors) {
- ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
- m->data_opts.extra_replicas
- ? 0
- : BCH_DISK_RESERVATION_NOFAIL);
- if (ret)
- goto out_bkey_buf_exit;
- }
-
- if (!bkey_get_dev_refs(c, k)) {
- ret = -BCH_ERR_data_update_done_no_dev_refs;
- goto out_put_disk_res;
- }
-
- if (c->opts.nocow_enabled &&
- !bkey_nocow_lock(c, ctxt, k)) {
- ret = -BCH_ERR_nocow_lock_blocked;
- goto out_put_dev_refs;
- }
-
- if (bkey_extent_is_unwritten(k)) {
- ret = bch2_update_unwritten_extent(trans, m) ?:
- -BCH_ERR_data_update_done_unwritten;
- goto out_nocow_unlock;
- }
-
- ret = bch2_data_update_bios_init(m, c, io_opts);
- if (ret)
- goto out_nocow_unlock;
-
- return 0;
-out_nocow_unlock:
- if (c->opts.nocow_enabled)
- bkey_nocow_unlock(c, k);
-out_put_dev_refs:
- bkey_put_dev_refs(c, k);
-out_put_disk_res:
- bch2_disk_reservation_put(c, &m->op.res);
-out_bkey_buf_exit:
- bch2_bkey_buf_exit(&m->k, c);
- return ret;
-}
-
-void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned ptr_bit = 1;
-
- bkey_for_each_ptr(ptrs, ptr) {
- if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) {
- opts->kill_ptrs |= ptr_bit;
- opts->rewrite_ptrs ^= ptr_bit;
- }
-
- ptr_bit <<= 1;
- }
-}
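-
-/*
- * Hedged example of the normalization above (bitmask values assumed):
- * cached pointers are never rewritten, only dropped, so a rewrite
- * request on a cached pointer is converted into a kill:
- *
- *	// ptr 1 (bit 0b010) is cached; caller asked to rewrite ptrs 0-1
- *	opts.rewrite_ptrs = 0b011;
- *	opts.kill_ptrs    = 0;
- *	bch2_data_update_opts_normalize(k, &opts);
- *	// now opts.rewrite_ptrs == 0b001, opts.kill_ptrs == 0b010
- */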
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
deleted file mode 100644
index c194cbbf5b51..000000000000
--- a/fs/bcachefs/data_update.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _BCACHEFS_DATA_UPDATE_H
-#define _BCACHEFS_DATA_UPDATE_H
-
-#include "bkey_buf.h"
-#include "io_read.h"
-#include "io_write_types.h"
-
-struct moving_context;
-
-struct data_update_opts {
- unsigned rewrite_ptrs;
- unsigned kill_ptrs;
- u16 target;
- u8 extra_replicas;
- unsigned btree_insert_flags;
- unsigned write_flags;
-
- int read_dev;
- bool scrub;
-};
-
-void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
- struct bch_io_opts *, struct data_update_opts *);
-
-struct data_update {
- /* extent being updated: */
- bool read_done;
- enum btree_id btree_id;
- struct bkey_buf k;
- struct data_update_opts data_opts;
- struct moving_context *ctxt;
- struct bch_move_stats *stats;
-
- struct bch_read_bio rbio;
- struct bch_write_op op;
- struct bio_vec *bvecs;
-};
-
-void bch2_data_update_to_text(struct printbuf *, struct data_update *);
-void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *);
-
-int bch2_data_update_index_update(struct bch_write_op *);
-
-void bch2_data_update_read_done(struct data_update *);
-
-int bch2_extent_drop_ptrs(struct btree_trans *,
- struct btree_iter *,
- struct bkey_s_c,
- struct bch_io_opts *,
- struct data_update_opts *);
-
-int bch2_data_update_bios_init(struct data_update *, struct bch_fs *,
- struct bch_io_opts *);
-
-void bch2_data_update_exit(struct data_update *);
-int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
- struct moving_context *,
- struct data_update *,
- struct write_point_specifier,
- struct bch_io_opts *, struct data_update_opts,
- enum btree_id, struct bkey_s_c);
-void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
-
-#endif /* _BCACHEFS_DATA_UPDATE_H */
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
deleted file mode 100644
index 788af88f6979..000000000000
--- a/fs/bcachefs/debug.c
+++ /dev/null
@@ -1,980 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Assorted bcachefs debug code
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "fsck.h"
-#include "inode.h"
-#include "journal_reclaim.h"
-#include "super.h"
-
-#include <linux/console.h>
-#include <linux/debugfs.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/seq_file.h>
-
-static struct dentry *bch_debug;
-
-static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
- struct extent_ptr_decoded pick)
-{
- struct btree *v = c->verify_data;
- struct btree_node *n_ondisk = c->verify_ondisk;
- struct btree_node *n_sorted = c->verify_data->data;
- struct bset *sorted, *inmemory = &b->data->keys;
- struct bio *bio;
- bool failed = false, saw_error = false;
-
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
- if (!ca)
- return false;
-
- bio = bio_alloc_bioset(ca->disk_sb.bdev,
- buf_pages(n_sorted, btree_buf_bytes(b)),
- REQ_OP_READ|REQ_META,
- GFP_NOFS,
- &c->btree_bio);
- bio->bi_iter.bi_sector = pick.ptr.offset;
- bch2_bio_map(bio, n_sorted, btree_buf_bytes(b));
-
- submit_bio_wait(bio);
-
- bio_put(bio);
- percpu_ref_put(&ca->io_ref);
-
- memcpy(n_ondisk, n_sorted, btree_buf_bytes(b));
-
- v->written = 0;
- if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error)
- return false;
-
- n_sorted = c->verify_data->data;
- sorted = &n_sorted->keys;
-
- if (inmemory->u64s != sorted->u64s ||
- memcmp(inmemory->start,
- sorted->start,
- vstruct_end(inmemory) - (void *) inmemory->start)) {
- unsigned offset = 0, sectors;
- struct bset *i;
- unsigned j;
-
- console_lock();
-
- printk(KERN_ERR "*** in memory:\n");
- bch2_dump_bset(c, b, inmemory, 0);
-
- printk(KERN_ERR "*** read back in:\n");
- bch2_dump_bset(c, v, sorted, 0);
-
- while (offset < v->written) {
- if (!offset) {
- i = &n_ondisk->keys;
- sectors = vstruct_blocks(n_ondisk, c->block_bits) <<
- c->block_bits;
- } else {
- struct btree_node_entry *bne =
- (void *) n_ondisk + (offset << 9);
- i = &bne->keys;
-
- sectors = vstruct_blocks(bne, c->block_bits) <<
- c->block_bits;
- }
-
- printk(KERN_ERR "*** on disk block %u:\n", offset);
- bch2_dump_bset(c, b, i, offset);
-
- offset += sectors;
- }
-
- for (j = 0; j < le16_to_cpu(inmemory->u64s); j++)
- if (inmemory->_data[j] != sorted->_data[j])
- break;
-
- console_unlock();
- bch_err(c, "verify failed at key %u", j);
-
- failed = true;
- }
-
- if (v->written != b->written) {
- bch_err(c, "written wrong: expected %u, got %u",
- b->written, v->written);
- failed = true;
- }
-
- return failed;
-}
-
-void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
-{
- struct bkey_ptrs_c ptrs;
- struct extent_ptr_decoded p;
- const union bch_extent_entry *entry;
- struct btree *v;
- struct bset *inmemory = &b->data->keys;
- struct bkey_packed *k;
- bool failed = false;
-
- if (c->opts.nochanges)
- return;
-
- bch2_btree_node_io_lock(b);
- mutex_lock(&c->verify_lock);
-
- if (!c->verify_ondisk) {
- c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
- if (!c->verify_ondisk)
- goto out;
- }
-
- if (!c->verify_data) {
- c->verify_data = __bch2_btree_node_mem_alloc(c);
- if (!c->verify_data)
- goto out;
-
- list_del_init(&c->verify_data->list);
- }
-
- BUG_ON(b->nsets != 1);
-
- for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k))
- if (k->type == KEY_TYPE_btree_ptr_v2)
- ((struct bch_btree_ptr_v2 *) bkeyp_val(&b->format, k))->mem_ptr = 0;
-
- v = c->verify_data;
- bkey_copy(&v->key, &b->key);
- v->c.level = b->c.level;
- v->c.btree_id = b->c.btree_id;
- bch2_btree_keys_init(v);
-
- ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key));
- bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry)
- failed |= bch2_btree_verify_replica(c, b, p);
-
- if (failed) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- bch2_fs_fatal_error(c, ": btree node verify failed for: %s\n", buf.buf);
- printbuf_exit(&buf);
- }
-out:
- mutex_unlock(&c->verify_lock);
- bch2_btree_node_io_unlock(b);
-}
-
-void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
- const struct btree *b)
-{
- struct btree_node *n_ondisk = NULL;
- struct extent_ptr_decoded pick;
- struct bch_dev *ca;
- struct bio *bio = NULL;
- unsigned offset = 0;
- int ret;
-
- if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick, -1) <= 0) {
- prt_printf(out, "error getting device to read from: invalid device\n");
- return;
- }
-
- ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
- if (!ca) {
- prt_printf(out, "error getting device to read from: not online\n");
- return;
- }
-
- n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
- if (!n_ondisk) {
- prt_printf(out, "memory allocation failure\n");
- goto out;
- }
-
- bio = bio_alloc_bioset(ca->disk_sb.bdev,
- buf_pages(n_ondisk, btree_buf_bytes(b)),
- REQ_OP_READ|REQ_META,
- GFP_NOFS,
- &c->btree_bio);
- bio->bi_iter.bi_sector = pick.ptr.offset;
- bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b));
-
- ret = submit_bio_wait(bio);
- if (ret) {
- prt_printf(out, "IO error reading btree node: %s\n", bch2_err_str(ret));
- goto out;
- }
-
- while (offset < btree_sectors(c)) {
- struct bset *i;
- struct nonce nonce;
- struct bch_csum csum;
- struct bkey_packed *k;
- unsigned sectors;
-
- if (!offset) {
- i = &n_ondisk->keys;
-
- if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) {
- prt_printf(out, "unknown checksum type at offset %u: %llu\n",
- offset, BSET_CSUM_TYPE(i));
- goto out;
- }
-
- nonce = btree_nonce(i, offset << 9);
- csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk);
-
- if (bch2_crc_cmp(csum, n_ondisk->csum)) {
- prt_printf(out, "invalid checksum\n");
- goto out;
- }
-
- bset_encrypt(c, i, offset << 9);
-
- sectors = vstruct_sectors(n_ondisk, c->block_bits);
- } else {
- struct btree_node_entry *bne = (void *) n_ondisk + (offset << 9);
-
- i = &bne->keys;
-
- if (i->seq != n_ondisk->keys.seq)
- break;
-
- if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) {
- prt_printf(out, "unknown checksum type at offset %u: %llu\n",
- offset, BSET_CSUM_TYPE(i));
- goto out;
- }
-
- nonce = btree_nonce(i, offset << 9);
- csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
-
- if (bch2_crc_cmp(csum, bne->csum)) {
- prt_printf(out, "invalid checksum");
- goto out;
- }
-
- bset_encrypt(c, i, offset << 9);
-
- sectors = vstruct_sectors(bne, c->block_bits);
- }
-
- prt_printf(out, " offset %u version %u, journal seq %llu\n",
- offset,
- le16_to_cpu(i->version),
- le64_to_cpu(i->journal_seq));
- offset += sectors;
-
- printbuf_indent_add(out, 4);
-
- for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) {
- struct bkey u;
-
- bch2_bkey_val_to_text(out, c, bkey_disassemble(b, k, &u));
- prt_newline(out);
- }
-
- printbuf_indent_sub(out, 4);
- }
-out:
- if (bio)
- bio_put(bio);
- kvfree(n_ondisk);
- percpu_ref_put(&ca->io_ref);
-}
-
-#ifdef CONFIG_DEBUG_FS
-
-/* XXX: bch_fs refcounting */
-
-struct dump_iter {
- struct bch_fs *c;
- enum btree_id id;
- struct bpos from;
- struct bpos prev_node;
- u64 iter;
-
- struct printbuf buf;
-
- char __user *ubuf; /* destination user buffer */
- size_t size; /* size of requested read */
- ssize_t ret; /* bytes read so far */
-};
-
-static ssize_t flush_buf(struct dump_iter *i)
-{
- if (i->buf.pos) {
- size_t bytes = min_t(size_t, i->buf.pos, i->size);
- int copied = bytes - copy_to_user(i->ubuf, i->buf.buf, bytes);
-
- i->ret += copied;
- i->ubuf += copied;
- i->size -= copied;
- i->buf.pos -= copied;
- memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos);
-
- if (copied != bytes)
- return -EFAULT;
- }
-
- return i->size ? 0 : i->ret;
-}
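-
-/*
- * Usage sketch (an observation, not original source text): flush_buf()
- * returns 0 while the reader still has space, -EFAULT if copy_to_user()
- * failed, and i->ret (bytes copied so far) once i->size reaches 0 -
- * which is why the read handlers below can chain it:
- *
- *	ssize_t r = flush_buf(i);
- *	if (r)
- *		return r;	// error, or user buffer full: we're done
- */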
-
-static int bch2_dump_open(struct inode *inode, struct file *file)
-{
- struct btree_debug *bd = inode->i_private;
- struct dump_iter *i;
-
- i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
- if (!i)
- return -ENOMEM;
-
- file->private_data = i;
- i->from = POS_MIN;
- i->iter = 0;
- i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]);
- i->id = bd->id;
- i->buf = PRINTBUF;
-
- return 0;
-}
-
-static int bch2_dump_release(struct inode *inode, struct file *file)
-{
- struct dump_iter *i = file->private_data;
-
- printbuf_exit(&i->buf);
- kfree(i);
- return 0;
-}
-
-static ssize_t bch2_read_btree(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- return flush_buf(i) ?:
- bch2_trans_run(i->c,
- for_each_btree_key(trans, iter, i->id, i->from,
- BTREE_ITER_prefetch|
- BTREE_ITER_all_snapshots, k, ({
- bch2_bkey_val_to_text(&i->buf, i->c, k);
- prt_newline(&i->buf);
- bch2_trans_unlock(trans);
- i->from = bpos_successor(iter.pos);
- flush_buf(i);
- }))) ?:
- i->ret;
-}
-
-static const struct file_operations btree_debug_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_read_btree,
-};
-
-static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- ssize_t ret = flush_buf(i);
- if (ret)
- return ret;
-
- if (bpos_eq(SPOS_MAX, i->from))
- return i->ret;
-
- return bch2_trans_run(i->c,
- for_each_btree_node(trans, iter, i->id, i->from, 0, b, ({
- bch2_btree_node_to_text(&i->buf, i->c, b);
- i->from = !bpos_eq(SPOS_MAX, b->key.k.p)
- ? bpos_successor(b->key.k.p)
- : b->key.k.p;
-
- drop_locks_do(trans, flush_buf(i));
- }))) ?: i->ret;
-}
-
-static const struct file_operations btree_format_debug_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_read_btree_formats,
-};
-
-static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- return flush_buf(i) ?:
- bch2_trans_run(i->c,
- for_each_btree_key(trans, iter, i->id, i->from,
- BTREE_ITER_prefetch|
- BTREE_ITER_all_snapshots, k, ({
- struct btree_path_level *l =
- &btree_iter_path(trans, &iter)->l[0];
- struct bkey_packed *_k =
- bch2_btree_node_iter_peek(&l->iter, l->b);
-
- if (bpos_gt(l->b->key.k.p, i->prev_node)) {
- bch2_btree_node_to_text(&i->buf, i->c, l->b);
- i->prev_node = l->b->key.k.p;
- }
-
- bch2_bfloat_to_text(&i->buf, l->b, _k);
- bch2_trans_unlock(trans);
- i->from = bpos_successor(iter.pos);
- flush_buf(i);
- }))) ?:
- i->ret;
-}
-
-static const struct file_operations bfloat_failed_debug_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_read_bfloat_failed,
-};
-
-static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
- struct btree *b)
-{
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 32);
-
- prt_printf(out, "%px ", b);
- bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level);
- prt_printf(out, "\n");
-
- printbuf_indent_add(out, 2);
-
- bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
- prt_newline(out);
-
- prt_printf(out, "flags:\t");
- prt_bitflags(out, bch2_btree_node_flags, b->flags);
- prt_newline(out);
-
- prt_printf(out, "pcpu read locks:\t%u\n", b->c.lock.readers != NULL);
- prt_printf(out, "written:\t%u\n", b->written);
- prt_printf(out, "writes blocked:\t%u\n", !list_empty_careful(&b->write_blocked));
- prt_printf(out, "will make reachable:\t%lx\n", b->will_make_reachable);
-
- prt_printf(out, "journal pin %px:\t%llu\n",
- &b->writes[0].journal, b->writes[0].journal.seq);
- prt_printf(out, "journal pin %px:\t%llu\n",
- &b->writes[1].journal, b->writes[1].journal.seq);
-
- printbuf_indent_sub(out, 2);
-}
-
-static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
- bool done = false;
- ssize_t ret = 0;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- do {
- struct bucket_table *tbl;
- struct rhash_head *pos;
- struct btree *b;
-
- ret = flush_buf(i);
- if (ret)
- return ret;
-
- rcu_read_lock();
- i->buf.atomic++;
- tbl = rht_dereference_rcu(c->btree_cache.table.tbl,
- &c->btree_cache.table);
- if (i->iter < tbl->size) {
- rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
- bch2_cached_btree_node_to_text(&i->buf, c, b);
- i->iter++;
- } else {
- done = true;
- }
- --i->buf.atomic;
- rcu_read_unlock();
- } while (!done);
-
- if (i->buf.allocation_failure)
- ret = -ENOMEM;
-
- if (!ret)
- ret = flush_buf(i);
-
- return ret ?: i->ret;
-}
-
-static const struct file_operations cached_btree_nodes_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_cached_btree_nodes_read,
-};
-
-typedef int (*list_cmp_fn)(const struct list_head *l, const struct list_head *r);
-
-static void list_sort(struct list_head *head, list_cmp_fn cmp)
-{
- struct list_head *pos;
-
- list_for_each(pos, head)
- while (!list_is_last(pos, head) &&
- cmp(pos, pos->next) > 0) {
- struct list_head *pos2, *next = pos->next;
-
- list_del(next);
- list_for_each(pos2, head)
- if (cmp(next, pos2) < 0)
- goto pos_found;
- BUG();
-pos_found:
- list_add_tail(next, pos2);
- }
-}
-
-static int list_ptr_order_cmp(const struct list_head *l, const struct list_head *r)
-{
- return cmp_int(l, r);
-}
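-
-/*
- * Note on the pattern below (hedged explanation, not original text):
- * sorting the trans list by pointer makes "(ulong) trans <= i->iter" a
- * valid resume cursor - btree_trans_lock is a seqmutex that may fail to
- * relock after we drop it to flush output, in which case we restart the
- * walk and skip everything already visited:
- *
- *	if ((ulong) trans <= i->iter)
- *		continue;	// already emitted before the restart
- */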
-
-static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
- struct btree_trans *trans;
- ssize_t ret = 0;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-restart:
- seqmutex_lock(&c->btree_trans_lock);
- list_sort(&c->btree_trans_list, list_ptr_order_cmp);
-
- list_for_each_entry(trans, &c->btree_trans_list, list) {
- if ((ulong) trans <= i->iter)
- continue;
-
- i->iter = (ulong) trans;
-
- if (!closure_get_not_zero(&trans->ref))
- continue;
-
- u32 seq = seqmutex_unlock(&c->btree_trans_lock);
-
- bch2_btree_trans_to_text(&i->buf, trans);
-
- prt_printf(&i->buf, "backtrace:\n");
- printbuf_indent_add(&i->buf, 2);
- bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL);
- printbuf_indent_sub(&i->buf, 2);
- prt_newline(&i->buf);
-
- closure_put(&trans->ref);
-
- ret = flush_buf(i);
- if (ret)
- goto unlocked;
-
- if (!seqmutex_relock(&c->btree_trans_lock, seq))
- goto restart;
- }
- seqmutex_unlock(&c->btree_trans_lock);
-unlocked:
- if (i->buf.allocation_failure)
- ret = -ENOMEM;
-
- if (!ret)
- ret = flush_buf(i);
-
- return ret ?: i->ret;
-}
-
-static const struct file_operations btree_transactions_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_btree_transactions_read,
-};
-
-static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
- bool done = false;
- int err;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- while (1) {
- err = flush_buf(i);
- if (err)
- return err;
-
- if (!i->size)
- break;
-
- if (done)
- break;
-
- done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
- i->iter++;
- }
-
- if (i->buf.allocation_failure)
- return -ENOMEM;
-
- return i->ret;
-}
-
-static const struct file_operations journal_pins_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_journal_pins_read,
-};
-
-static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
- int err;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- if (!i->iter) {
- bch2_btree_updates_to_text(&i->buf, c);
- i->iter++;
- }
-
- err = flush_buf(i);
- if (err)
- return err;
-
- if (i->buf.allocation_failure)
- return -ENOMEM;
-
- return i->ret;
-}
-
-static const struct file_operations btree_updates_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_btree_updates_read,
-};
-
-static int btree_transaction_stats_open(struct inode *inode, struct file *file)
-{
- struct bch_fs *c = inode->i_private;
- struct dump_iter *i;
-
- i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
- if (!i)
- return -ENOMEM;
-
- i->iter = 1;
- i->c = c;
- i->buf = PRINTBUF;
- file->private_data = i;
-
- return 0;
-}
-
-static int btree_transaction_stats_release(struct inode *inode, struct file *file)
-{
- struct dump_iter *i = file->private_data;
-
- printbuf_exit(&i->buf);
- kfree(i);
-
- return 0;
-}
-
-static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
- int err;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- while (1) {
- struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter];
-
- err = flush_buf(i);
- if (err)
- return err;
-
- if (!i->size)
- break;
-
- if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) ||
- !bch2_btree_transaction_fns[i->iter])
- break;
-
- prt_printf(&i->buf, "%s:\n", bch2_btree_transaction_fns[i->iter]);
- printbuf_indent_add(&i->buf, 2);
-
- mutex_lock(&s->lock);
-
- prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem);
- prt_printf(&i->buf, "Transaction duration:\n");
-
- printbuf_indent_add(&i->buf, 2);
- bch2_time_stats_to_text(&i->buf, &s->duration);
- printbuf_indent_sub(&i->buf, 2);
-
- if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
- prt_printf(&i->buf, "Lock hold times:\n");
-
- printbuf_indent_add(&i->buf, 2);
- bch2_time_stats_to_text(&i->buf, &s->lock_hold_times);
- printbuf_indent_sub(&i->buf, 2);
- }
-
- if (s->max_paths_text) {
- prt_printf(&i->buf, "Maximum allocated btree paths (%u):\n", s->nr_max_paths);
-
- printbuf_indent_add(&i->buf, 2);
- prt_str_indented(&i->buf, s->max_paths_text);
- printbuf_indent_sub(&i->buf, 2);
- }
-
- mutex_unlock(&s->lock);
-
- printbuf_indent_sub(&i->buf, 2);
- prt_newline(&i->buf);
- i->iter++;
- }
-
- if (i->buf.allocation_failure)
- return -ENOMEM;
-
- return i->ret;
-}
-
-static const struct file_operations btree_transaction_stats_op = {
- .owner = THIS_MODULE,
- .open = btree_transaction_stats_open,
- .release = btree_transaction_stats_release,
- .read = btree_transaction_stats_read,
-};
-
-/* walk btree transactions until we find a deadlock and print it */
-static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct btree_trans *trans;
- ulong iter = 0;
-restart:
- seqmutex_lock(&c->btree_trans_lock);
- list_sort(&c->btree_trans_list, list_ptr_order_cmp);
-
- list_for_each_entry(trans, &c->btree_trans_list, list) {
- if ((ulong) trans <= iter)
- continue;
-
- iter = (ulong) trans;
-
- if (!closure_get_not_zero(&trans->ref))
- continue;
-
- u32 seq = seqmutex_unlock(&c->btree_trans_lock);
-
- bool found = bch2_check_for_deadlock(trans, out) != 0;
-
- closure_put(&trans->ref);
-
- if (found)
- return;
-
- if (!seqmutex_relock(&c->btree_trans_lock, seq))
- goto restart;
- }
- seqmutex_unlock(&c->btree_trans_lock);
-}
-
-typedef void (*fs_to_text_fn)(struct printbuf *, struct bch_fs *);
-
-static ssize_t bch2_simple_print(struct file *file, char __user *buf,
- size_t size, loff_t *ppos,
- fs_to_text_fn fn)
-{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
- ssize_t ret = 0;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- if (!i->iter) {
- fn(&i->buf, c);
- i->iter++;
- }
-
- if (i->buf.allocation_failure)
- ret = -ENOMEM;
-
- if (!ret)
- ret = flush_buf(i);
-
- return ret ?: i->ret;
-}
-
-static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- return bch2_simple_print(file, buf, size, ppos, btree_deadlock_to_text);
-}
-
-static const struct file_operations btree_deadlock_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_btree_deadlock_read,
-};
-
-static ssize_t bch2_write_points_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- return bch2_simple_print(file, buf, size, ppos, bch2_write_points_to_text);
-}
-
-static const struct file_operations write_points_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_write_points_read,
-};
-
-void bch2_fs_debug_exit(struct bch_fs *c)
-{
- if (!IS_ERR_OR_NULL(c->fs_debug_dir))
- debugfs_remove_recursive(c->fs_debug_dir);
-}
-
-static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd)
-{
- struct dentry *d;
-
- d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir);
-
- debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops);
-
- debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops);
-
- debugfs_create_file("bfloat-failed", 0400, d, bd,
- &bfloat_failed_debug_ops);
-}
-
-void bch2_fs_debug_init(struct bch_fs *c)
-{
- struct btree_debug *bd;
- char name[100];
-
- if (IS_ERR_OR_NULL(bch_debug))
- return;
-
- snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
- c->fs_debug_dir = debugfs_create_dir(name, bch_debug);
- if (IS_ERR_OR_NULL(c->fs_debug_dir))
- return;
-
- debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
- c->btree_debug, &cached_btree_nodes_ops);
-
- debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir,
- c->btree_debug, &btree_transactions_ops);
-
- debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
- c->btree_debug, &journal_pins_ops);
-
- debugfs_create_file("btree_updates", 0400, c->fs_debug_dir,
- c->btree_debug, &btree_updates_ops);
-
- debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
- c, &btree_transaction_stats_op);
-
- debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir,
- c->btree_debug, &btree_deadlock_ops);
-
- debugfs_create_file("write_points", 0400, c->fs_debug_dir,
- c->btree_debug, &write_points_ops);
-
- c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
- if (IS_ERR_OR_NULL(c->btree_debug_dir))
- return;
-
- for (bd = c->btree_debug;
- bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
- bd++) {
- bd->id = bd - c->btree_debug;
- bch2_fs_debug_btree_init(c, bd);
- }
-}
-
-#endif
-
-void bch2_debug_exit(void)
-{
- if (!IS_ERR_OR_NULL(bch_debug))
- debugfs_remove_recursive(bch_debug);
-}
-
-int __init bch2_debug_init(void)
-{
- bch_debug = debugfs_create_dir("bcachefs", NULL);
- return 0;
-}
diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h
deleted file mode 100644
index 2c37143b5fd1..000000000000
--- a/fs/bcachefs/debug.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DEBUG_H
-#define _BCACHEFS_DEBUG_H
-
-#include "bcachefs.h"
-
-struct bio;
-struct btree;
-struct bch_fs;
-
-void __bch2_btree_verify(struct bch_fs *, struct btree *);
-void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *,
- const struct btree *);
-
-static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
-{
- if (bch2_verify_btree_ondisk)
- __bch2_btree_verify(c, b);
-}
-
-#ifdef CONFIG_DEBUG_FS
-void bch2_fs_debug_exit(struct bch_fs *);
-void bch2_fs_debug_init(struct bch_fs *);
-#else
-static inline void bch2_fs_debug_exit(struct bch_fs *c) {}
-static inline void bch2_fs_debug_init(struct bch_fs *c) {}
-#endif
-
-void bch2_debug_exit(void);
-int bch2_debug_init(void);
-
-#endif /* _BCACHEFS_DEBUG_H */
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
deleted file mode 100644
index d7f9f79318a2..000000000000
--- a/fs/bcachefs/dirent.c
+++ /dev/null
@@ -1,782 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "bkey_methods.h"
-#include "btree_update.h"
-#include "extents.h"
-#include "dirent.h"
-#include "fs.h"
-#include "keylist.h"
-#include "str_hash.h"
-#include "subvolume.h"
-
-#include <linux/dcache.h>
-
-static int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
- const struct qstr *str, struct qstr *out_cf)
-{
- *out_cf = (struct qstr) QSTR_INIT(NULL, 0);
-
-#ifdef CONFIG_UNICODE
- unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1);
- int ret = PTR_ERR_OR_ZERO(buf);
- if (ret)
- return ret;
-
- ret = utf8_casefold(info->cf_encoding, str, buf, BCH_NAME_MAX + 1);
- if (ret <= 0)
- return ret;
-
- *out_cf = (struct qstr) QSTR_INIT(buf, ret);
- return 0;
-#else
- return -EOPNOTSUPP;
-#endif
-}
-
-static inline int bch2_maybe_casefold(struct btree_trans *trans,
- const struct bch_hash_info *info,
- const struct qstr *str, struct qstr *out_cf)
-{
- if (likely(!info->cf_encoding)) {
- *out_cf = *str;
- return 0;
- } else {
- return bch2_casefold(trans, info, str, out_cf);
- }
-}
-
-static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
-{
- if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name))
- return 0;
-
- unsigned bkey_u64s = bkey_val_u64s(d.k);
- unsigned bkey_bytes = bkey_u64s * sizeof(u64);
- u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1];
-#if CPU_BIG_ENDIAN
- unsigned trailing_nuls = last_u64 ? __builtin_ctzll(last_u64) / 8 : 64 / 8;
-#else
- unsigned trailing_nuls = last_u64 ? __builtin_clzll(last_u64) / 8 : 64 / 8;
-#endif
-
- return bkey_bytes -
- (d.v->d_casefold
- ? offsetof(struct bch_dirent, d_cf_name_block.d_names)
- : offsetof(struct bch_dirent, d_name)) -
- trailing_nuls;
-}
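-
-/*
- * Worked example for the trailing-NUL count above (assumes a
- * little-endian host, where the name's last bytes occupy the high-order
- * bits of the final u64, so NUL padding shows up as leading zero bits):
- *
- *	u64 last = 0;
- *	memcpy(&last, "ab\0\0\0\0\0\0", 8);	// last == 0x6261 on LE
- *	// __builtin_clzll(0x6261) == 49, and 49 / 8 == 6 trailing NULs
- */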
-
-struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d)
-{
- if (d.v->d_casefold) {
- unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len);
- return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[0], name_len);
- } else {
- return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
- }
-}
-
-static struct qstr bch2_dirent_get_casefold_name(struct bkey_s_c_dirent d)
-{
- if (d.v->d_casefold) {
- unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len);
- unsigned cf_name_len = le16_to_cpu(d.v->d_cf_name_block.d_cf_name_len);
- return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[name_len], cf_name_len);
- } else {
- return (struct qstr) QSTR_INIT(NULL, 0);
- }
-}
-
-static inline struct qstr bch2_dirent_get_lookup_name(struct bkey_s_c_dirent d)
-{
- return d.v->d_casefold
- ? bch2_dirent_get_casefold_name(d)
- : bch2_dirent_get_name(d);
-}
-
-static u64 bch2_dirent_hash(const struct bch_hash_info *info,
- const struct qstr *name)
-{
- struct bch_str_hash_ctx ctx;
-
- bch2_str_hash_init(&ctx, info);
- bch2_str_hash_update(&ctx, info, name->name, name->len);
-
- /* [0,2) reserved for dots */
- return max_t(u64, bch2_str_hash_end(&ctx, info), 2);
-}
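-
-/*
- * Note (hedged, not original text): the max_t() above keeps hash values
- * 0 and 1 out of use because the hash becomes the key's offset, which
- * doubles as the readdir cookie - [0,2) stays reserved for the "." and
- * ".." entries, so a name hashing to 0 or 1 is stored at offset >= 2.
- */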
-
-static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
-{
- return bch2_dirent_hash(info, key);
-}
-
-static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
-{
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- struct qstr name = bch2_dirent_get_lookup_name(d);
-
- return bch2_dirent_hash(info, &name);
-}
-
-static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
-{
- struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
- const struct qstr l_name = bch2_dirent_get_lookup_name(l);
- const struct qstr *r_name = _r;
-
- return !qstr_eq(l_name, *r_name);
-}
-
-static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
-{
- struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
- struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
- const struct qstr l_name = bch2_dirent_get_lookup_name(l);
- const struct qstr r_name = bch2_dirent_get_lookup_name(r);
-
- return !qstr_eq(l_name, r_name);
-}
-
-static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k)
-{
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-
- if (d.v->d_type == DT_SUBVOL)
- return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol;
- return true;
-}
-
-const struct bch_hash_desc bch2_dirent_hash_desc = {
- .btree_id = BTREE_ID_dirents,
- .key_type = KEY_TYPE_dirent,
- .hash_key = dirent_hash_key,
- .hash_bkey = dirent_hash_bkey,
- .cmp_key = dirent_cmp_key,
- .cmp_bkey = dirent_cmp_bkey,
- .is_visible = dirent_is_visible,
-};
-
-int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- unsigned name_block_len = bch2_dirent_name_bytes(d);
- struct qstr d_name = bch2_dirent_get_name(d);
- struct qstr d_cf_name = bch2_dirent_get_casefold_name(d);
- int ret = 0;
-
- bkey_fsck_err_on(!d_name.len,
- c, dirent_empty_name,
- "empty name");
-
- bkey_fsck_err_on(d_name.len + d_cf_name.len > name_block_len,
- c, dirent_val_too_big,
- "dirent names exceed bkey size (%d + %d > %d)",
- d_name.len, d_cf_name.len, name_block_len);
-
- /*
- * Check new keys don't exceed the max length
- * (older keys may be larger.)
- */
- bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX,
- c, dirent_name_too_long,
- "dirent name too big (%u > %u)",
- d_name.len, BCH_NAME_MAX);
-
- bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len),
- c, dirent_name_embedded_nul,
- "dirent has stray data after name's NUL");
-
- bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) ||
- (d_name.len == 2 && !memcmp(d_name.name, "..", 2)),
- c, dirent_name_dot_or_dotdot,
- "invalid name");
-
- bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len),
- c, dirent_name_has_slash,
- "name with /");
-
- bkey_fsck_err_on(d.v->d_type != DT_SUBVOL &&
- le64_to_cpu(d.v->d_inum) == d.k->p.inode,
- c, dirent_to_itself,
- "dirent points to own directory");
-
- if (d.v->d_casefold) {
- bkey_fsck_err_on(from.from == BKEY_VALIDATE_commit &&
- d_cf_name.len > BCH_NAME_MAX,
- c, dirent_cf_name_too_big,
- "dirent w/ cf name too big (%u > %u)",
- d_cf_name.len, BCH_NAME_MAX);
-
- bkey_fsck_err_on(d_cf_name.len != strnlen(d_cf_name.name, d_cf_name.len),
- c, dirent_stray_data_after_cf_name,
- "dirent has stray data after cf name's NUL");
- }
-fsck_err:
- return ret;
-}
-
-void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- struct qstr d_name = bch2_dirent_get_name(d);
-
- prt_printf(out, "%.*s -> ", d_name.len, d_name.name);
-
- if (d.v->d_type != DT_SUBVOL)
- prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum));
- else
- prt_printf(out, "%u -> %u",
- le32_to_cpu(d.v->d_parent_subvol),
- le32_to_cpu(d.v->d_child_subvol));
-
- prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type));
-}
-
-static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans,
- subvol_inum dir,
- u8 type,
- int name_len, int cf_name_len,
- u64 dst)
-{
- struct bkey_i_dirent *dirent;
- unsigned u64s = BKEY_U64s + dirent_val_u64s(name_len, cf_name_len);
-
- BUG_ON(u64s > U8_MAX);
-
- dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
- if (IS_ERR(dirent))
- return dirent;
-
- bkey_dirent_init(&dirent->k_i);
- dirent->k.u64s = u64s;
-
- if (type != DT_SUBVOL) {
- dirent->v.d_inum = cpu_to_le64(dst);
- } else {
- dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol);
- dirent->v.d_child_subvol = cpu_to_le32(dst);
- }
-
- dirent->v.d_type = type;
- dirent->v.d_unused = 0;
- dirent->v.d_casefold = cf_name_len ? 1 : 0;
-
- return dirent;
-}
-
-static void dirent_init_regular_name(struct bkey_i_dirent *dirent,
- const struct qstr *name)
-{
- EBUG_ON(dirent->v.d_casefold);
-
- memcpy(&dirent->v.d_name[0], name->name, name->len);
- memset(&dirent->v.d_name[name->len], 0,
- bkey_val_bytes(&dirent->k) -
- offsetof(struct bch_dirent, d_name) -
- name->len);
-}
-
-static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent,
- const struct qstr *name,
- const struct qstr *cf_name)
-{
- EBUG_ON(!dirent->v.d_casefold);
- EBUG_ON(!cf_name->len);
-
- dirent->v.d_cf_name_block.d_name_len = name->len;
- dirent->v.d_cf_name_block.d_cf_name_len = cf_name->len;
- memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len);
- memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len);
- memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0,
- bkey_val_bytes(&dirent->k) -
- offsetof(struct bch_dirent, d_cf_name_block.d_names) -
-	       (name->len + cf_name->len));
-
- EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_name->len);
-}
-
-static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
- subvol_inum dir,
- u8 type,
- const struct qstr *name,
- const struct qstr *cf_name,
- u64 dst)
-{
- struct bkey_i_dirent *dirent;
-
- if (name->len > BCH_NAME_MAX)
- return ERR_PTR(-ENAMETOOLONG);
-
- dirent = dirent_alloc_key(trans, dir, type, name->len, cf_name ? cf_name->len : 0, dst);
- if (IS_ERR(dirent))
- return dirent;
-
- if (cf_name)
- dirent_init_casefolded_name(dirent, name, cf_name);
- else
- dirent_init_regular_name(dirent, name);
-
- EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len);
-
- return dirent;
-}
-
-int bch2_dirent_create_snapshot(struct btree_trans *trans,
- u32 dir_subvol, u64 dir, u32 snapshot,
- const struct bch_hash_info *hash_info,
- u8 type, const struct qstr *name, u64 dst_inum,
- u64 *dir_offset,
- enum btree_iter_update_trigger_flags flags)
-{
- subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir };
- struct bkey_i_dirent *dirent;
- int ret;
-
- dirent = dirent_create_key(trans, dir_inum, type, name, NULL, dst_inum);
- ret = PTR_ERR_OR_ZERO(dirent);
- if (ret)
- return ret;
-
- dirent->k.p.inode = dir;
- dirent->k.p.snapshot = snapshot;
-
- ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
- dir_inum, snapshot, &dirent->k_i,
- flags|BTREE_UPDATE_internal_snapshot_node);
- *dir_offset = dirent->k.p.offset;
-
- return ret;
-}
-
-int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
- const struct bch_hash_info *hash_info,
- u8 type, const struct qstr *name, u64 dst_inum,
- u64 *dir_offset,
- u64 *i_size,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bkey_i_dirent *dirent;
- int ret;
-
- if (hash_info->cf_encoding) {
- struct qstr cf_name;
- ret = bch2_casefold(trans, hash_info, name, &cf_name);
- if (ret)
- return ret;
- dirent = dirent_create_key(trans, dir, type, name, &cf_name, dst_inum);
- } else {
- dirent = dirent_create_key(trans, dir, type, name, NULL, dst_inum);
- }
-
- ret = PTR_ERR_OR_ZERO(dirent);
- if (ret)
- return ret;
-
- *i_size += bkey_bytes(&dirent->k);
-
- ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
- dir, &dirent->k_i, flags);
- *dir_offset = dirent->k.p.offset;
-
- return ret;
-}
-
-int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
- struct bkey_s_c_dirent d, subvol_inum *target)
-{
- struct bch_subvolume s;
- int ret = 0;
-
- if (d.v->d_type == DT_SUBVOL &&
- le32_to_cpu(d.v->d_parent_subvol) != dir.subvol)
- return 1;
-
- if (likely(d.v->d_type != DT_SUBVOL)) {
- target->subvol = dir.subvol;
- target->inum = le64_to_cpu(d.v->d_inum);
- } else {
- target->subvol = le32_to_cpu(d.v->d_child_subvol);
-
- ret = bch2_subvolume_get(trans, target->subvol, true, &s);
-
- target->inum = le64_to_cpu(s.inode);
- }
-
- return ret;
-}
-
-int bch2_dirent_rename(struct btree_trans *trans,
- subvol_inum src_dir, struct bch_hash_info *src_hash, u64 *src_dir_i_size,
- subvol_inum dst_dir, struct bch_hash_info *dst_hash, u64 *dst_dir_i_size,
- const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
- const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
- enum bch_rename_mode mode)
-{
- struct qstr src_name_lookup, dst_name_lookup;
- struct btree_iter src_iter = { NULL };
- struct btree_iter dst_iter = { NULL };
- struct bkey_s_c old_src, old_dst = bkey_s_c_null;
- struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
- struct bpos dst_pos =
- POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
- unsigned src_update_flags = 0;
- bool delete_src, delete_dst;
- int ret = 0;
-
- memset(src_inum, 0, sizeof(*src_inum));
- memset(dst_inum, 0, sizeof(*dst_inum));
-
- /* Lookup src: */
- ret = bch2_maybe_casefold(trans, src_hash, src_name, &src_name_lookup);
- if (ret)
- goto out;
- old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
- src_hash, src_dir, &src_name_lookup,
- BTREE_ITER_intent);
- ret = bkey_err(old_src);
- if (ret)
- goto out;
-
- ret = bch2_dirent_read_target(trans, src_dir,
- bkey_s_c_to_dirent(old_src), src_inum);
- if (ret)
- goto out;
-
- /* Lookup dst: */
- ret = bch2_maybe_casefold(trans, dst_hash, dst_name, &dst_name_lookup);
- if (ret)
- goto out;
- if (mode == BCH_RENAME) {
- /*
- * Note that we're _not_ checking if the target already exists -
- * we're relying on the VFS to do that check for us for
- * correctness:
- */
- ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc,
- dst_hash, dst_dir, &dst_name_lookup);
- if (ret)
- goto out;
- } else {
- old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
- dst_hash, dst_dir, &dst_name_lookup,
- BTREE_ITER_intent);
- ret = bkey_err(old_dst);
- if (ret)
- goto out;
-
- ret = bch2_dirent_read_target(trans, dst_dir,
- bkey_s_c_to_dirent(old_dst), dst_inum);
- if (ret)
- goto out;
- }
-
- if (mode != BCH_RENAME_EXCHANGE)
- *src_offset = dst_iter.pos.offset;
-
- /* Create new dst key: */
- new_dst = dirent_create_key(trans, dst_dir, 0, dst_name,
- dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0);
- ret = PTR_ERR_OR_ZERO(new_dst);
- if (ret)
- goto out;
-
- dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
- new_dst->k.p = dst_iter.pos;
-
- /* Create new src key: */
- if (mode == BCH_RENAME_EXCHANGE) {
- new_src = dirent_create_key(trans, src_dir, 0, src_name,
- src_hash->cf_encoding ? &src_name_lookup : NULL, 0);
- ret = PTR_ERR_OR_ZERO(new_src);
- if (ret)
- goto out;
-
- dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
- new_src->k.p = src_iter.pos;
- } else {
- new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
- ret = PTR_ERR_OR_ZERO(new_src);
- if (ret)
- goto out;
-
- bkey_init(&new_src->k);
- new_src->k.p = src_iter.pos;
-
- if (bkey_le(dst_pos, src_iter.pos) &&
- bkey_lt(src_iter.pos, dst_iter.pos)) {
- /*
- * We have a hash collision for the new dst key,
- * and new_src - the key we're deleting - is between
- * new_dst's hashed slot and the slot we're going to be
- * inserting it into - oops. This will break the hash
- * table if we don't deal with it:
- */
- if (mode == BCH_RENAME) {
- /*
- * If we're not overwriting, we can just insert
- * new_dst at the src position:
- */
- new_src = new_dst;
- new_src->k.p = src_iter.pos;
- goto out_set_src;
- } else {
- /* If we're overwriting, we can't insert new_dst
- * at a different slot because it has to
- * overwrite old_dst - just make sure to use a
- * whiteout when deleting src:
- */
- new_src->k.type = KEY_TYPE_hash_whiteout;
- }
- } else {
- /* Check if we need a whiteout to delete src: */
- ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
- src_hash, &src_iter);
- if (ret < 0)
- goto out;
-
- if (ret)
- new_src->k.type = KEY_TYPE_hash_whiteout;
- }
- }
-
- if (new_dst->v.d_type == DT_SUBVOL)
- new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol);
-
- if ((mode == BCH_RENAME_EXCHANGE) &&
- new_src->v.d_type == DT_SUBVOL)
- new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
-
- if (old_dst.k)
- *dst_dir_i_size -= bkey_bytes(old_dst.k);
- *src_dir_i_size -= bkey_bytes(old_src.k);
-
- if (mode == BCH_RENAME_EXCHANGE)
- *src_dir_i_size += bkey_bytes(&new_src->k);
- *dst_dir_i_size += bkey_bytes(&new_dst->k);
-
- ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
- if (ret)
- goto out;
-out_set_src:
- /*
- * If we're deleting a subvolume we need to really delete the dirent,
- * not just emit a whiteout in the current snapshot - there can only be
-	 * a single dirent that points to a given subvolume.
- *
- * IOW, we don't maintain multiple versions in different snapshots of
- * dirents that point to subvolumes - dirents that point to subvolumes
- * are only visible in one particular subvolume so it's not necessary,
- * and it would be particularly confusing for fsck to have to deal with.
- */
- delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL &&
- new_src->k.p.snapshot != old_src.k->p.snapshot;
-
- delete_dst = old_dst.k &&
- bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL &&
- new_dst->k.p.snapshot != old_dst.k->p.snapshot;
-
- if (!delete_src || !bkey_deleted(&new_src->k)) {
- ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
- if (ret)
- goto out;
- }
-
- if (delete_src) {
- bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
- ret = bch2_btree_iter_traverse(&src_iter) ?:
- bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node);
- if (ret)
- goto out;
- }
-
- if (delete_dst) {
- bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot);
- ret = bch2_btree_iter_traverse(&dst_iter) ?:
- bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node);
- if (ret)
- goto out;
- }
-
- if (mode == BCH_RENAME_EXCHANGE)
- *src_offset = new_src->k.p.offset;
- *dst_offset = new_dst->k.p.offset;
-out:
- bch2_trans_iter_exit(trans, &src_iter);
- bch2_trans_iter_exit(trans, &dst_iter);
- return ret;
-}
-
-int bch2_dirent_lookup_trans(struct btree_trans *trans,
- struct btree_iter *iter,
- subvol_inum dir,
- const struct bch_hash_info *hash_info,
- const struct qstr *name, subvol_inum *inum,
- unsigned flags)
-{
- struct qstr lookup_name;
- int ret = bch2_maybe_casefold(trans, hash_info, name, &lookup_name);
- if (ret)
- return ret;
-
- struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
- hash_info, dir, &lookup_name, flags);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum);
- if (ret > 0)
- ret = -ENOENT;
-err:
- if (ret)
- bch2_trans_iter_exit(trans, iter);
- return ret;
-}
-
-u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
- const struct bch_hash_info *hash_info,
- const struct qstr *name, subvol_inum *inum)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
-
- int ret = lockrestart_do(trans,
- bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
-}
-
-int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents,
- SPOS(dir, 0, snapshot),
- POS(dir, U64_MAX), 0, k, ret)
- if (k.k->type == KEY_TYPE_dirent) {
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol)
- continue;
- ret = -BCH_ERR_ENOTEMPTY_dir_not_empty;
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
-{
- u32 snapshot;
-
- return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?:
- bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot);
-}
-
-static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subvol_inum target)
-{
- struct qstr name = bch2_dirent_get_name(d);
- /*
- * Although not required by the kernel code, updating ctx->pos is needed
- * for the bcachefs FUSE driver. Without this update, the FUSE
- * implementation will be stuck in an infinite loop when reading
- * directories (via the bcachefs_fuse_readdir callback).
- * In kernel space, ctx->pos is updated by the VFS code.
- */
- ctx->pos = d.k->p.offset;
- bool ret = dir_emit(ctx, name.name,
- name.len,
- target.inum,
- vfs_d_type(d.v->d_type));
- if (ret)
- ctx->pos = d.k->p.offset + 1;
- return ret;
-}
-
-int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
-{
- struct bkey_buf sk;
- bch2_bkey_buf_init(&sk);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_dirents,
- POS(inum.inum, ctx->pos),
- POS(inum.inum, U64_MAX),
- inum.subvol, 0, k, ({
- if (k.k->type != KEY_TYPE_dirent)
- continue;
-
- /* dir_emit() can fault and block: */
- bch2_bkey_buf_reassemble(&sk, c, k);
- struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);
-
- subvol_inum target;
- int ret2 = bch2_dirent_read_target(trans, inum, dirent, &target);
- if (ret2 > 0)
- continue;
-
- ret2 ?: drop_locks_do(trans, bch2_dir_emit(ctx, dirent, target));
- })));
-
- bch2_bkey_buf_exit(&sk, c);
-
- return ret < 0 ? ret : 0;
-}
-
-/* fsck */
-
-static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
- struct bch_inode_unpacked *inode)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inode_nr)
- break;
- if (!bkey_is_inode(k.k))
- continue;
- ret = bch2_inode_unpack(k, inode);
- goto found;
- }
- ret = -BCH_ERR_ENOENT_inode;
-found:
- bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bch_inode_unpacked dir_inode;
- struct bch_hash_info dir_hash_info;
- int ret;
-
- ret = lookup_first_inode(trans, pos.inode, &dir_inode);
- if (ret)
- goto err;
-
- dir_hash_info = bch2_hash_info_init(c, &dir_inode);
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
-
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- &dir_hash_info, &iter,
- BTREE_UPDATE_internal_snapshot_node);
- bch2_trans_iter_exit(trans, &iter);
-err:
- bch_err_fn(c, ret);
- return ret;
-}
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
deleted file mode 100644
index 0880772b80a9..000000000000
--- a/fs/bcachefs/dirent.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DIRENT_H
-#define _BCACHEFS_DIRENT_H
-
-#include "str_hash.h"
-
-extern const struct bch_hash_desc bch2_dirent_hash_desc;
-
-int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_dirent ((struct bkey_ops) { \
- .key_validate = bch2_dirent_validate, \
- .val_to_text = bch2_dirent_to_text, \
- .min_val_size = 16, \
-})
-
-struct qstr;
-struct file;
-struct dir_context;
-struct bch_fs;
-struct bch_hash_info;
-struct bch_inode_info;
-
-struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d);
-
-static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len)
-{
- unsigned bytes = cf_len
- ? offsetof(struct bch_dirent, d_cf_name_block.d_names) + len + cf_len
- : offsetof(struct bch_dirent, d_name) + len;
-
- return DIV_ROUND_UP(bytes, sizeof(u64));
-}
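-
-/*
- * Worked example (hedged; assumes the packed layout in dirent_format.h,
- * which puts d_name at byte offset 9: 8-byte target union + 1 type
- * byte):
- *
- *	dirent_val_u64s(7, 0) == DIV_ROUND_UP(9 + 7, 8) == 2
- *
- * i.e. a 7-byte non-casefolded name needs two u64s of value, the tail
- * NUL-padded as bch2_dirent_name_bytes() expects.
- */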
-
-int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
- struct bkey_s_c_dirent, subvol_inum *);
-
-static inline void dirent_copy_target(struct bkey_i_dirent *dst,
- struct bkey_s_c_dirent src)
-{
- dst->v.d_inum = src.v->d_inum;
- dst->v.d_type = src.v->d_type;
-}
-
-int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
- const struct bch_hash_info *, u8,
- const struct qstr *, u64, u64 *,
- enum btree_iter_update_trigger_flags);
-int bch2_dirent_create(struct btree_trans *, subvol_inum,
- const struct bch_hash_info *, u8,
- const struct qstr *, u64, u64 *, u64 *,
- enum btree_iter_update_trigger_flags);
-
-static inline unsigned vfs_d_type(unsigned type)
-{
- return type == DT_SUBVOL ? DT_DIR : type;
-}
-
-enum bch_rename_mode {
- BCH_RENAME,
- BCH_RENAME_OVERWRITE,
- BCH_RENAME_EXCHANGE,
-};
-
-int bch2_dirent_rename(struct btree_trans *,
- subvol_inum, struct bch_hash_info *, u64 *,
- subvol_inum, struct bch_hash_info *, u64 *,
- const struct qstr *, subvol_inum *, u64 *,
- const struct qstr *, subvol_inum *, u64 *,
- enum bch_rename_mode);
-
-int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
- subvol_inum, const struct bch_hash_info *,
- const struct qstr *, subvol_inum *, unsigned);
-u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
- const struct bch_hash_info *,
- const struct qstr *, subvol_inum *);
-
-int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32);
-int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
-int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
-
-int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos);
-
-#endif /* _BCACHEFS_DIRENT_H */
diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h
deleted file mode 100644
index a46dbddd21aa..000000000000
--- a/fs/bcachefs/dirent_format.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DIRENT_FORMAT_H
-#define _BCACHEFS_DIRENT_FORMAT_H
-
-/*
- * Dirents (and xattrs) have to implement string lookups; since our b-tree
- * doesn't support arbitrary length strings for the key, we instead index by a
- * 64 bit hash (currently truncated sha1) of the string, stored in the offset
- * field of the key - using linear probing to resolve hash collisions. This also
- * provides us with the readdir cookie posix requires.
- *
- * Linear probing requires us to use whiteouts for deletions, in the event of a
- * collision:
- */
-
-struct bch_dirent {
- struct bch_val v;
-
- /* Target inode number: */
- union {
- __le64 d_inum;
- struct { /* DT_SUBVOL */
- __le32 d_child_subvol;
- __le32 d_parent_subvol;
- };
- };
-
- /*
- * Copy of mode bits 12-15 from the target inode - so userspace can get
- * the filetype without having to do a stat()
- */
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u8 d_type:5,
- d_unused:2,
- d_casefold:1;
-#elif defined(__BIG_ENDIAN_BITFIELD)
- __u8 d_casefold:1,
- d_unused:2,
- d_type:5;
-#endif
-
- union {
- struct {
- __u8 d_pad;
- __le16 d_name_len;
- __le16 d_cf_name_len;
- __u8 d_names[];
- } d_cf_name_block __packed;
- __DECLARE_FLEX_ARRAY(__u8, d_name);
- } __packed;
-} __packed __aligned(8);
-
-#define DT_SUBVOL 16
-#define BCH_DT_MAX 17
-
-#define BCH_NAME_MAX 512
-
-#endif /* _BCACHEFS_DIRENT_FORMAT_H */
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
deleted file mode 100644
index b32e91ba8be8..000000000000
--- a/fs/bcachefs/disk_accounting.c
+++ /dev/null
@@ -1,1012 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bcachefs_ioctl.h"
-#include "btree_cache.h"
-#include "btree_journal_iter.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "compress.h"
-#include "disk_accounting.h"
-#include "error.h"
-#include "journal_io.h"
-#include "replicas.h"
-
-/*
- * Notes on disk accounting:
- *
- * We have two parallel sets of counters to be concerned with, and both must be
- * kept in sync.
- *
- * - Persistent/on disk accounting, stored in the accounting btree and updated
- * via btree write buffer updates that treat new accounting keys as deltas to
- * apply to existing values. But reading from a write buffer btree is
- * expensive, so we also have
- *
- * - In memory accounting, where accounting is stored as an array of percpu
- * counters, indexed by an eytzinger array of disk accounting keys/bpos (which
- * are the same thing, excepting byte swabbing on big endian).
- *
- * Cheap to read, but non persistent.
- *
- * Disk accounting updates are generated by transactional triggers; these run as
- * keys enter and leave the btree, and can compare old and new versions of keys;
- * the output of these triggers are deltas to the various counters.
- *
- * Disk accounting updates are done as btree write buffer updates, where the
- * counters in the disk accounting key are deltas that will be applied to the
- * counter in the btree when the key is flushed by the write buffer (or journal
- * replay).
- *
- * To do a disk accounting update:
- * - initialize a disk_accounting_pos, to specify which counter is being updated
- * - initialize counter deltas, as an array of 1-3 s64s
- * - call bch2_disk_accounting_mod()
- *
- * This queues up the accounting update to be done at transaction commit time.
- * Underneath, it's a normal btree write buffer update.
- *
- * The transaction commit path is responsible for propagating updates to the in
- * memory counters, with bch2_accounting_mem_mod().
- *
- * The commit path also assigns every disk accounting update a unique version
- * number, based on the journal sequence number and offset within that journal
- * buffer; this is used by journal replay to determine which updates have been
- * done.
- *
- * The transaction commit path also ensures that replicas entry accounting
- * updates are properly marked in the superblock (so that we know whether we can
- * mount without data being unavailable); it will update the superblock if
- * bch2_accounting_mem_mod() tells it to.
- */
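As a concrete illustration of the three-step sequence described above, a minimal sketch modeled on bch2_mod_dev_cached_sectors() below (the function name is made up; trans and delta are assumed to come from the caller):

static int example_acct_nr_inodes(struct btree_trans *trans, s64 delta)
{
	/* 1) pick which counter is being updated */
	struct disk_accounting_pos acc = {
		.type = BCH_DISK_ACCOUNTING_nr_inodes,
	};

	/* 2) the deltas, as an array of 1-3 s64s (nr_inodes uses one) */
	s64 d[1] = { delta };

	/* 3) queue the update; it is applied at transaction commit */
	return bch2_disk_accounting_mod(trans, &acc, d, 1, false);
}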
-
-static const char * const disk_accounting_type_strs[] = {
-#define x(t, n, ...) [n] = #t,
- BCH_DISK_ACCOUNTING_TYPES()
-#undef x
- NULL
-};
-
-static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos,
- s64 *d, unsigned nr)
-{
- struct bkey_i_accounting *acc = bkey_accounting_init(k);
-
- acc->k.p = disk_accounting_pos_to_bpos(pos);
- set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr);
-
- memcpy_u64s_small(acc->v.d, d, nr);
-}
-
-static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos);
-
-int bch2_disk_accounting_mod(struct btree_trans *trans,
- struct disk_accounting_pos *k,
- s64 *d, unsigned nr, bool gc)
-{
- /* Normalize: */
- switch (k->type) {
- case BCH_DISK_ACCOUNTING_replicas:
- bubble_sort(k->replicas.devs, k->replicas.nr_devs, u8_cmp);
- break;
- }
-
- BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS);
-
- struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
-
- accounting_key_init(&k_i.k, k, d, nr);
-
- if (unlikely(gc)) {
- int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
- if (ret == -BCH_ERR_btree_insert_need_mark_replicas)
- ret = drop_locks_do(trans,
- bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?:
- bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
- return ret;
- } else {
- return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k);
- }
-}
-
-int bch2_mod_dev_cached_sectors(struct btree_trans *trans,
- unsigned dev, s64 sectors,
- bool gc)
-{
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_replicas,
- };
-
- bch2_replicas_entry_cached(&acc.replicas, dev);
-
- return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
-}
-
-static inline bool is_zero(char *start, char *end)
-{
- BUG_ON(start > end);
-
- for (; start < end; start++)
- if (*start)
- return false;
- return true;
-}
-
-#define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member))
-
-int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, k.k->p);
- void *end = &acc_k + 1;
- int ret = 0;
-
- bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) &&
- bversion_zero(k.k->bversion),
- c, accounting_key_version_0,
- "accounting key with version=0");
-
- switch (acc_k.type) {
- case BCH_DISK_ACCOUNTING_nr_inodes:
- end = field_end(acc_k, nr_inodes);
- break;
- case BCH_DISK_ACCOUNTING_persistent_reserved:
- end = field_end(acc_k, persistent_reserved);
- break;
- case BCH_DISK_ACCOUNTING_replicas:
- bkey_fsck_err_on(!acc_k.replicas.nr_devs,
- c, accounting_key_replicas_nr_devs_0,
- "accounting key replicas entry with nr_devs=0");
-
- bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs ||
- (acc_k.replicas.nr_required > 1 &&
- acc_k.replicas.nr_required == acc_k.replicas.nr_devs),
- c, accounting_key_replicas_nr_required_bad,
- "accounting key replicas entry with bad nr_required");
-
- for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++)
- bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1],
- c, accounting_key_replicas_devs_unsorted,
- "accounting key replicas entry with unsorted devs");
-
- end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas);
- break;
- case BCH_DISK_ACCOUNTING_dev_data_type:
- end = field_end(acc_k, dev_data_type);
- break;
- case BCH_DISK_ACCOUNTING_compression:
- end = field_end(acc_k, compression);
- break;
- case BCH_DISK_ACCOUNTING_snapshot:
- end = field_end(acc_k, snapshot);
- break;
- case BCH_DISK_ACCOUNTING_btree:
- end = field_end(acc_k, btree);
- break;
- case BCH_DISK_ACCOUNTING_rebalance_work:
- end = field_end(acc_k, rebalance_work);
- break;
- }
-
- bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)),
- c, accounting_key_junk_at_end,
- "junk at end of accounting key");
-fsck_err:
- return ret;
-}
-
-void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k)
-{
- if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) {
- prt_printf(out, "unknown type %u", k->type);
- return;
- }
-
- prt_str(out, disk_accounting_type_strs[k->type]);
- prt_str(out, " ");
-
- switch (k->type) {
- case BCH_DISK_ACCOUNTING_nr_inodes:
- break;
- case BCH_DISK_ACCOUNTING_persistent_reserved:
- prt_printf(out, "replicas=%u", k->persistent_reserved.nr_replicas);
- break;
- case BCH_DISK_ACCOUNTING_replicas:
- bch2_replicas_entry_to_text(out, &k->replicas);
- break;
- case BCH_DISK_ACCOUNTING_dev_data_type:
- prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev);
- bch2_prt_data_type(out, k->dev_data_type.data_type);
- break;
- case BCH_DISK_ACCOUNTING_compression:
- bch2_prt_compression_type(out, k->compression.type);
- break;
- case BCH_DISK_ACCOUNTING_snapshot:
- prt_printf(out, "id=%u", k->snapshot.id);
- break;
- case BCH_DISK_ACCOUNTING_btree:
- prt_str(out, "btree=");
- bch2_btree_id_to_text(out, k->btree.id);
- break;
- }
-}
-
-void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_accounting acc = bkey_s_c_to_accounting(k);
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, k.k->p);
-
- bch2_accounting_key_to_text(out, &acc_k);
-
- for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++)
- prt_printf(out, " %lli", acc.v->d[i]);
-}
-
-void bch2_accounting_swab(struct bkey_s k)
-{
- for (u64 *p = (u64 *) k.v;
- p < (u64 *) bkey_val_end(k);
- p++)
- *p = swab64(*p);
-}
-
-static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r,
- struct disk_accounting_pos *acc)
-{
- unsafe_memcpy(r, &acc->replicas,
- replicas_entry_bytes(&acc->replicas),
- "variable length struct");
-}
-
-static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p)
-{
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, p);
-
- switch (acc_k.type) {
- case BCH_DISK_ACCOUNTING_replicas:
- __accounting_to_replicas(r, &acc_k);
- return true;
- default:
- return false;
- }
-}
-
-static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p)
-{
- struct bch_replicas_padded r;
- return accounting_to_replicas(&r.e, p)
- ? bch2_mark_replicas(c, &r.e)
- : 0;
-}
-
-/*
- * Ensure accounting keys being updated are present in the superblock, when
- * applicable (i.e. replicas updates)
- */
-int bch2_accounting_update_sb(struct btree_trans *trans)
-{
- for (struct jset_entry *i = trans->journal_entries;
- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
- i = vstruct_next(i))
- if (jset_entry_is_key(i) && i->start->k.type == KEY_TYPE_accounting) {
- int ret = bch2_accounting_update_sb_one(trans->c, i->start->k.p);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a)
-{
- struct bch_accounting_mem *acc = &c->accounting;
-
- /* raced with another insert, already present: */
- if (eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, &a.k->p) < acc->k.nr)
- return 0;
-
- struct accounting_mem_entry n = {
- .pos = a.k->p,
- .bversion = a.k->bversion,
- .nr_counters = bch2_accounting_counters(a.k),
- .v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
- sizeof(u64), GFP_KERNEL),
- };
-
- if (!n.v[0])
- goto err;
-
- if (acc->gc_running) {
- n.v[1] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
- sizeof(u64), GFP_KERNEL);
- if (!n.v[1])
- goto err;
- }
-
- if (darray_push(&acc->k, n))
- goto err;
-
- eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, NULL);
-
- if (trace_accounting_mem_insert_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_accounting_to_text(&buf, c, a.s_c);
- trace_accounting_mem_insert(c, buf.buf);
- printbuf_exit(&buf);
- }
- return 0;
-err:
- free_percpu(n.v[1]);
- free_percpu(n.v[0]);
- return -BCH_ERR_ENOMEM_disk_accounting;
-}
-
-int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
- enum bch_accounting_mode mode)
-{
- struct bch_replicas_padded r;
-
- if (mode != BCH_ACCOUNTING_read &&
- accounting_to_replicas(&r.e, a.k->p) &&
- !bch2_replicas_marked_locked(c, &r.e))
- return -BCH_ERR_btree_insert_need_mark_replicas;
-
-	/*
-	 * Caller holds mark_lock for read; trade it for the write lock so we
-	 * can modify the eytzinger array, then restore the read lock:
-	 */
-	percpu_up_read(&c->mark_lock);
- percpu_down_write(&c->mark_lock);
- int ret = __bch2_accounting_mem_insert(c, a);
- percpu_up_write(&c->mark_lock);
- percpu_down_read(&c->mark_lock);
- return ret;
-}
-
-static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e)
-{
- for (unsigned i = 0; i < e->nr_counters; i++)
- if (percpu_u64_get(e->v[0] + i) ||
- (e->v[1] &&
- percpu_u64_get(e->v[1] + i)))
- return false;
- return true;
-}
-
-void bch2_accounting_mem_gc(struct bch_fs *c)
-{
- struct bch_accounting_mem *acc = &c->accounting;
-
- percpu_down_write(&c->mark_lock);
- struct accounting_mem_entry *dst = acc->k.data;
-
- darray_for_each(acc->k, src) {
- if (accounting_mem_entry_is_zero(src)) {
- free_percpu(src->v[0]);
- free_percpu(src->v[1]);
- } else {
- *dst++ = *src;
- }
- }
-
- acc->k.nr = dst - acc->k.data;
- eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, NULL);
- percpu_up_write(&c->mark_lock);
-}
-
-/*
- * Read out accounting keys for replicas entries, as an array of
- * bch_replicas_usage entries.
- *
- * Note: this may be deprecated/removed at some point in the future and replaced
- * with something more general, it exists to support the ioctl used by the
- * 'bcachefs fs usage' command.
- */
-int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage)
-{
- struct bch_accounting_mem *acc = &c->accounting;
- int ret = 0;
-
- darray_init(usage);
-
- percpu_down_read(&c->mark_lock);
- darray_for_each(acc->k, i) {
- struct {
- struct bch_replicas_usage r;
- u8 pad[BCH_BKEY_PTRS_MAX];
- } u;
-
- if (!accounting_to_replicas(&u.r.r, i->pos))
- continue;
-
- u64 sectors;
- bch2_accounting_mem_read_counters(acc, i - acc->k.data, &sectors, 1, false);
- u.r.sectors = sectors;
-
- ret = darray_make_room(usage, replicas_usage_bytes(&u.r));
- if (ret)
- break;
-
- memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r));
- usage->nr += replicas_usage_bytes(&u.r);
- }
- percpu_up_read(&c->mark_lock);
-
- if (ret)
- darray_exit(usage);
- return ret;
-}
-
-int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask)
-{
- struct bch_accounting_mem *acc = &c->accounting;
- int ret = 0;
-
- darray_init(out_buf);
-
- percpu_down_read(&c->mark_lock);
- darray_for_each(acc->k, i) {
- struct disk_accounting_pos a_p;
- bpos_to_disk_accounting_pos(&a_p, i->pos);
-
- if (!(accounting_types_mask & BIT(a_p.type)))
- continue;
-
- ret = darray_make_room(out_buf, sizeof(struct bkey_i_accounting) +
- sizeof(u64) * i->nr_counters);
- if (ret)
- break;
-
- struct bkey_i_accounting *a_out =
- bkey_accounting_init((void *) &darray_top(*out_buf));
- set_bkey_val_u64s(&a_out->k, i->nr_counters);
- a_out->k.p = i->pos;
- bch2_accounting_mem_read_counters(acc, i - acc->k.data,
- a_out->v.d, i->nr_counters, false);
-
- if (!bch2_accounting_key_is_zero(accounting_i_to_s_c(a_out)))
- out_buf->nr += bkey_bytes(&a_out->k);
- }
-
- percpu_up_read(&c->mark_lock);
-
- if (ret)
- darray_exit(out_buf);
- return ret;
-}
-
-static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc)
-{
- darray_for_each(acc->k, e) {
- free_percpu(e->v[gc]);
- e->v[gc] = NULL;
- }
-}
-
-int bch2_gc_accounting_start(struct bch_fs *c)
-{
- struct bch_accounting_mem *acc = &c->accounting;
- int ret = 0;
-
- percpu_down_write(&c->mark_lock);
- darray_for_each(acc->k, e) {
- e->v[1] = __alloc_percpu_gfp(e->nr_counters * sizeof(u64),
- sizeof(u64), GFP_KERNEL);
- if (!e->v[1]) {
- bch2_accounting_free_counters(acc, true);
- ret = -BCH_ERR_ENOMEM_disk_accounting;
- break;
- }
- }
-
- acc->gc_running = !ret;
- percpu_up_write(&c->mark_lock);
-
- return ret;
-}
-
-int bch2_gc_accounting_done(struct bch_fs *c)
-{
- struct bch_accounting_mem *acc = &c->accounting;
- struct btree_trans *trans = bch2_trans_get(c);
- struct printbuf buf = PRINTBUF;
- struct bpos pos = POS_MIN;
- int ret = 0;
-
- percpu_down_write(&c->mark_lock);
- while (1) {
- unsigned idx = eytzinger0_find_ge(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, &pos);
-
- if (idx >= acc->k.nr)
- break;
-
- struct accounting_mem_entry *e = acc->k.data + idx;
- pos = bpos_successor(e->pos);
-
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, e->pos);
-
- if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
- continue;
-
- u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS];
- u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS];
-
- unsigned nr = e->nr_counters;
- bch2_accounting_mem_read_counters(acc, idx, dst_v, nr, false);
- bch2_accounting_mem_read_counters(acc, idx, src_v, nr, true);
-
- if (memcmp(dst_v, src_v, nr * sizeof(u64))) {
- printbuf_reset(&buf);
- prt_str(&buf, "accounting mismatch for ");
- bch2_accounting_key_to_text(&buf, &acc_k);
-
- prt_str(&buf, ": got");
- for (unsigned j = 0; j < nr; j++)
- prt_printf(&buf, " %llu", dst_v[j]);
-
- prt_str(&buf, " should be");
- for (unsigned j = 0; j < nr; j++)
- prt_printf(&buf, " %llu", src_v[j]);
-
- for (unsigned j = 0; j < nr; j++)
- src_v[j] -= dst_v[j];
-
- if (fsck_err(trans, accounting_mismatch, "%s", buf.buf)) {
- percpu_up_write(&c->mark_lock);
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false));
- percpu_down_write(&c->mark_lock);
- if (ret)
- goto err;
-
- if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
- memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
- struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
-
- accounting_key_init(&k_i.k, &acc_k, src_v, nr);
- bch2_accounting_mem_mod_locked(trans,
- bkey_i_to_s_c_accounting(&k_i.k),
- BCH_ACCOUNTING_normal);
-
- preempt_disable();
- struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
- struct bch_fs_usage_base *src = &trans->fs_usage_delta;
- acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64));
- preempt_enable();
- }
- }
- }
- }
-err:
-fsck_err:
- percpu_up_write(&c->mark_lock);
- printbuf_exit(&buf);
- bch2_trans_put(trans);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
-
- if (k.k->type != KEY_TYPE_accounting)
- return 0;
-
- percpu_down_read(&c->mark_lock);
- int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k),
- BCH_ACCOUNTING_read);
- percpu_up_read(&c->mark_lock);
- return ret;
-}
-
-static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
- struct disk_accounting_pos acc,
- u64 *v, unsigned nr)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0, invalid_dev = -1;
-
- switch (acc.type) {
- case BCH_DISK_ACCOUNTING_replicas: {
- struct bch_replicas_padded r;
- __accounting_to_replicas(&r.e, &acc);
-
- for (unsigned i = 0; i < r.e.nr_devs; i++)
- if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
- !bch2_dev_exists(c, r.e.devs[i])) {
- invalid_dev = r.e.devs[i];
- goto invalid_device;
- }
-
- /*
- * All replicas entry checks except for invalid device are done
- * in bch2_accounting_validate
- */
- BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf));
-
- if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
- trans, accounting_replicas_not_marked,
- "accounting not marked in superblock replicas\n %s",
- (printbuf_reset(&buf),
- bch2_accounting_key_to_text(&buf, &acc),
- buf.buf))) {
- /*
- * We're not RW yet and still single threaded, dropping
- * and retaking lock is ok:
- */
- percpu_up_write(&c->mark_lock);
- ret = bch2_mark_replicas(c, &r.e);
- if (ret)
- goto fsck_err;
- percpu_down_write(&c->mark_lock);
- }
- break;
- }
-
- case BCH_DISK_ACCOUNTING_dev_data_type:
- if (!bch2_dev_exists(c, acc.dev_data_type.dev)) {
- invalid_dev = acc.dev_data_type.dev;
- goto invalid_device;
- }
- break;
- }
-
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-invalid_device:
- if (fsck_err(trans, accounting_to_invalid_device,
- "accounting entry points to invalid device %i\n %s",
- invalid_dev,
- (printbuf_reset(&buf),
- bch2_accounting_key_to_text(&buf, &acc),
- buf.buf))) {
- for (unsigned i = 0; i < nr; i++)
- v[i] = -v[i];
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?:
- -BCH_ERR_remove_disk_accounting_entry;
- } else {
- ret = -BCH_ERR_remove_disk_accounting_entry;
- }
- goto fsck_err;
-}
-
-/*
- * At startup time, initialize the in memory accounting from the btree (and
- * journal)
- */
-int bch2_accounting_read(struct bch_fs *c)
-{
- struct bch_accounting_mem *acc = &c->accounting;
- struct btree_trans *trans = bch2_trans_get(c);
- struct printbuf buf = PRINTBUF;
-
- /*
- * We might run more than once if we rewind to start topology repair or
- * btree node scan - and those might cause us to get different results,
- * so we can't just skip if we've already run.
- *
- * Instead, zero out any accounting we have:
- */
- percpu_down_write(&c->mark_lock);
- darray_for_each(acc->k, e)
- percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
- for_each_member_device(c, ca)
- percpu_memset(ca->usage, 0, sizeof(*ca->usage));
- percpu_memset(c->usage, 0, sizeof(*c->usage));
- percpu_up_write(&c->mark_lock);
-
- struct btree_iter iter;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
- iter.flags &= ~BTREE_ITER_with_journal;
- int ret = for_each_btree_key_continue(trans, iter,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
- struct bkey u;
- struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u);
-
- if (k.k->type != KEY_TYPE_accounting)
- continue;
-
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, k.k->p);
-
- if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
- break;
-
- if (!bch2_accounting_is_mem(acc_k)) {
- struct disk_accounting_pos next = { .type = acc_k.type + 1 };
- bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
- continue;
- }
-
- accounting_read_key(trans, k);
- }));
- if (ret)
- goto err;
-
- struct journal_keys *keys = &c->journal_keys;
- struct journal_key *dst = keys->data;
- move_gap(keys, keys->nr);
-
- darray_for_each(*keys, i) {
- if (i->k->k.type == KEY_TYPE_accounting) {
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, i->k->k.p);
-
- if (!bch2_accounting_is_mem(acc_k))
- continue;
-
- struct bkey_s_c k = bkey_i_to_s_c(i->k);
- unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr,
- sizeof(acc->k.data[0]),
- accounting_pos_cmp, &k.k->p);
-
- bool applied = idx < acc->k.nr &&
- bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0;
-
- if (applied)
- continue;
-
- if (i + 1 < &darray_top(*keys) &&
- i[1].k->k.type == KEY_TYPE_accounting &&
- !journal_key_cmp(i, i + 1)) {
- WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0);
-
- i[1].journal_seq = i[0].journal_seq;
-
- bch2_accounting_accumulate(bkey_i_to_accounting(i[1].k),
- bkey_s_c_to_accounting(k));
- continue;
- }
-
- ret = accounting_read_key(trans, k);
- if (ret)
- goto err;
- }
-
- *dst++ = *i;
- }
- keys->gap = keys->nr = dst - keys->data;
-
- percpu_down_write(&c->mark_lock);
-
- darray_for_each_reverse(acc->k, i) {
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, i->pos);
-
- u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
- memset(v, 0, sizeof(v));
-
- for (unsigned j = 0; j < i->nr_counters; j++)
- v[j] = percpu_u64_get(i->v[0] + j);
-
- /*
-		 * If the entry's counters are zeroed, the entry should be
-		 * treated as nonexistent - it might point to an invalid device.
- *
- * Remove it, so that if it's re-added it gets re-marked in the
- * superblock:
- */
- ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
- ? -BCH_ERR_remove_disk_accounting_entry
- : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters);
-
- if (ret == -BCH_ERR_remove_disk_accounting_entry) {
- free_percpu(i->v[0]);
- free_percpu(i->v[1]);
- darray_remove_item(&acc->k, i);
- ret = 0;
- continue;
- }
-
- if (ret)
- goto fsck_err;
- }
-
- eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, NULL);
-
- preempt_disable();
- struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage);
-
- for (unsigned i = 0; i < acc->k.nr; i++) {
- struct disk_accounting_pos k;
- bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);
-
- u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
- bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
-
- switch (k.type) {
- case BCH_DISK_ACCOUNTING_persistent_reserved:
- usage->reserved += v[0] * k.persistent_reserved.nr_replicas;
- break;
- case BCH_DISK_ACCOUNTING_replicas:
- fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]);
- break;
- case BCH_DISK_ACCOUNTING_dev_data_type:
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev);
- if (ca) {
- struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type];
- percpu_u64_set(&d->buckets, v[0]);
- percpu_u64_set(&d->sectors, v[1]);
- percpu_u64_set(&d->fragmented, v[2]);
-
- if (k.dev_data_type.data_type == BCH_DATA_sb ||
- k.dev_data_type.data_type == BCH_DATA_journal)
- usage->hidden += v[0] * ca->mi.bucket_size;
- }
- rcu_read_unlock();
- break;
- }
- }
- preempt_enable();
-fsck_err:
- percpu_up_write(&c->mark_lock);
-err:
- printbuf_exit(&buf);
- bch2_trans_put(trans);
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev)
-{
- return bch2_trans_run(c,
- bch2_btree_write_buffer_flush_sync(trans) ?:
- for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN,
- BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({
- struct disk_accounting_pos acc;
- bpos_to_disk_accounting_pos(&acc, k.k->p);
-
- acc.type == BCH_DISK_ACCOUNTING_dev_data_type &&
- acc.dev_data_type.dev == dev
- ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0)
- : 0;
- })) ?:
- bch2_btree_write_buffer_flush_sync(trans));
-}
-
-int bch2_dev_usage_init(struct bch_dev *ca, bool gc)
-{
- struct bch_fs *c = ca->fs;
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_dev_data_type,
- .dev_data_type.dev = ca->dev_idx,
- .dev_data_type.data_type = BCH_DATA_free,
- };
- u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 };
-
- int ret = bch2_trans_do(c, ({
- bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc) ?:
- (!gc ? bch2_trans_commit(trans, NULL, NULL, 0) : 0);
- }));
- bch_err_fn(c, ret);
- return ret;
-}
-
-void bch2_verify_accounting_clean(struct bch_fs *c)
-{
- bool mismatch = false;
- struct bch_fs_usage_base base = {}, base_inmem = {};
-
- bch2_trans_run(c,
- for_each_btree_key(trans, iter,
- BTREE_ID_accounting, POS_MIN,
- BTREE_ITER_all_snapshots, k, ({
- u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
- struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k);
- unsigned nr = bch2_accounting_counters(k.k);
-
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, k.k->p);
-
- if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
- break;
-
- if (!bch2_accounting_is_mem(acc_k)) {
- struct disk_accounting_pos next = { .type = acc_k.type + 1 };
- bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
- continue;
- }
-
- bch2_accounting_mem_read(c, k.k->p, v, nr);
-
- if (memcmp(a.v->d, v, nr * sizeof(u64))) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, " !=");
- for (unsigned j = 0; j < nr; j++)
- prt_printf(&buf, " %llu", v[j]);
-
- pr_err("%s", buf.buf);
- printbuf_exit(&buf);
- mismatch = true;
- }
-
- switch (acc_k.type) {
- case BCH_DISK_ACCOUNTING_persistent_reserved:
- base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
- break;
- case BCH_DISK_ACCOUNTING_replicas:
- fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]);
- break;
- case BCH_DISK_ACCOUNTING_dev_data_type: {
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
- if (!ca) {
- rcu_read_unlock();
- continue;
- }
-
- v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets);
- v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors);
- v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented);
- rcu_read_unlock();
-
- if (memcmp(a.v->d, v, 3 * sizeof(u64))) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, " in mem");
- for (unsigned j = 0; j < nr; j++)
- prt_printf(&buf, " %llu", v[j]);
-
- pr_err("dev accounting mismatch: %s", buf.buf);
- printbuf_exit(&buf);
- mismatch = true;
- }
- }
- }
-
- 0;
- })));
-
- acc_u64s_percpu(&base_inmem.hidden, &c->usage->hidden, sizeof(base_inmem) / sizeof(u64));
-
-#define check(x) \
- if (base.x != base_inmem.x) { \
- pr_err("fs_usage_base.%s mismatch: %llu != %llu", #x, base.x, base_inmem.x); \
- mismatch = true; \
- }
-
- //check(hidden);
- check(btree);
- check(data);
- check(cached);
- check(reserved);
- check(nr_inodes);
-
- WARN_ON(mismatch);
-}
-
-void bch2_accounting_gc_free(struct bch_fs *c)
-{
- lockdep_assert_held(&c->mark_lock);
-
- struct bch_accounting_mem *acc = &c->accounting;
-
- bch2_accounting_free_counters(acc, true);
- acc->gc_running = false;
-}
-
-void bch2_fs_accounting_exit(struct bch_fs *c)
-{
- struct bch_accounting_mem *acc = &c->accounting;
-
- bch2_accounting_free_counters(acc, false);
- darray_exit(&acc->k);
-}
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
deleted file mode 100644
index f9214e2d1346..000000000000
--- a/fs/bcachefs/disk_accounting.h
+++ /dev/null
@@ -1,293 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_ACCOUNTING_H
-#define _BCACHEFS_DISK_ACCOUNTING_H
-
-#include "btree_update.h"
-#include "eytzinger.h"
-#include "sb-members.h"
-
-static inline void bch2_u64s_neg(u64 *v, unsigned nr)
-{
- for (unsigned i = 0; i < nr; i++)
- v[i] = -v[i];
-}
-
-static inline unsigned bch2_accounting_counters(const struct bkey *k)
-{
- return bkey_val_u64s(k) - offsetof(struct bch_accounting, d) / sizeof(u64);
-}
-
-static inline void bch2_accounting_neg(struct bkey_s_accounting a)
-{
- bch2_u64s_neg(a.v->d, bch2_accounting_counters(a.k));
-}
-
-static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a)
-{
- for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++)
- if (a.v->d[i])
- return false;
- return true;
-}
-
-static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
- struct bkey_s_c_accounting src)
-{
- EBUG_ON(dst->k.u64s != src.k->u64s);
-
- for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++)
- dst->v.d[i] += src.v->d[i];
- if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0)
- dst->k.bversion = src.k->bversion;
-}
-
-static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
- enum bch_data_type data_type,
- s64 sectors)
-{
- switch (data_type) {
- case BCH_DATA_btree:
- fs_usage->btree += sectors;
- break;
- case BCH_DATA_user:
- case BCH_DATA_parity:
- fs_usage->data += sectors;
- break;
- case BCH_DATA_cached:
- fs_usage->cached += sectors;
- break;
- default:
- break;
- }
-}
-
-static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p)
-{
- BUILD_BUG_ON(sizeof(*acc) != sizeof(p));
-
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- acc->_pad = p;
-#else
- memcpy_swab(acc, &p, sizeof(p));
-#endif
-}
-
-static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *acc)
-{
- struct bpos p;
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- p = acc->_pad;
-#else
- memcpy_swab(&p, acc, sizeof(p));
-#endif
- return p;
-}
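The two helpers above are exact inverses; a minimal, illustrative sanity check (the function name is hypothetical):

static inline void check_acct_pos_roundtrip(struct bpos p)
{
	struct disk_accounting_pos acc;

	/* bpos -> disk_accounting_pos -> bpos must be the identity */
	bpos_to_disk_accounting_pos(&acc, p);
	BUG_ON(!bpos_eq(disk_accounting_pos_to_bpos(&acc), p));
}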
-
-int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *,
- s64 *, unsigned, bool);
-
-#define disk_accounting_key_init(_k, _type, ...) \
-do { \
- memset(&(_k), 0, sizeof(_k)); \
- (_k).type = BCH_DISK_ACCOUNTING_##_type; \
- (_k)._type = (struct bch_acct_##_type) { __VA_ARGS__ }; \
-} while (0)
-
-#define bch2_disk_accounting_mod2_nr(_trans, _gc, _v, _nr, ...) \
-({ \
- struct disk_accounting_pos pos; \
- disk_accounting_key_init(pos, __VA_ARGS__); \
- bch2_disk_accounting_mod(trans, &pos, _v, _nr, _gc); \
-})
-
-#define bch2_disk_accounting_mod2(_trans, _gc, _v, ...) \
- bch2_disk_accounting_mod2_nr(_trans, _gc, _v, ARRAY_SIZE(_v), __VA_ARGS__)
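A usage sketch for the two macros above (the function name is made up; note that the mod2 wrappers expect the caller's btree_trans to be named trans, which the sketch satisfies):

static int example_mod_persistent_reserved(struct btree_trans *trans,
					   unsigned nr_replicas, s64 sectors)
{
	s64 v[1] = { sectors };

	/* expands to disk_accounting_key_init() + bch2_disk_accounting_mod() */
	return bch2_disk_accounting_mod2(trans, false, v,
					 persistent_reserved,
					 .nr_replicas = nr_replicas);
}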
-
-int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool);
-
-int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *);
-void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-void bch2_accounting_swab(struct bkey_s);
-
-#define bch2_bkey_ops_accounting ((struct bkey_ops) { \
- .key_validate = bch2_accounting_validate, \
- .val_to_text = bch2_accounting_to_text, \
- .swab = bch2_accounting_swab, \
- .min_val_size = 8, \
-})
-
-int bch2_accounting_update_sb(struct btree_trans *);
-
-static inline int accounting_pos_cmp(const void *_l, const void *_r)
-{
- const struct bpos *l = _l, *r = _r;
-
- return bpos_cmp(*l, *r);
-}
-
-enum bch_accounting_mode {
- BCH_ACCOUNTING_normal,
- BCH_ACCOUNTING_gc,
- BCH_ACCOUNTING_read,
-};
-
-int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
-void bch2_accounting_mem_gc(struct bch_fs *);
-
-static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc)
-{
- return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR &&
- acc.type != BCH_DISK_ACCOUNTING_inum;
-}
-
-/*
- * Update in memory counters so they match the btree update we're doing; called
- * from transaction commit path
- */
-static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
- struct bkey_s_c_accounting a,
- enum bch_accounting_mode mode)
-{
- struct bch_fs *c = trans->c;
- struct bch_accounting_mem *acc = &c->accounting;
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, a.k->p);
- bool gc = mode == BCH_ACCOUNTING_gc;
-
- if (gc && !acc->gc_running)
- return 0;
-
- if (!bch2_accounting_is_mem(acc_k))
- return 0;
-
- if (mode == BCH_ACCOUNTING_normal) {
- switch (acc_k.type) {
- case BCH_DISK_ACCOUNTING_persistent_reserved:
- trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
- break;
- case BCH_DISK_ACCOUNTING_replicas:
- fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]);
- break;
- case BCH_DISK_ACCOUNTING_dev_data_type:
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
- if (ca) {
- this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]);
- this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]);
- this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]);
- }
- rcu_read_unlock();
- break;
- }
- }
-
- unsigned idx;
-
- while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
- int ret = bch2_accounting_mem_insert(c, a, mode);
- if (ret)
- return ret;
- }
-
- struct accounting_mem_entry *e = &acc->k.data[idx];
-
- EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters);
-
- for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++)
- this_cpu_add(e->v[gc][i], a.v->d[i]);
- return 0;
-}
-
-static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
-{
- percpu_down_read(&trans->c->mark_lock);
- int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal);
- percpu_up_read(&trans->c->mark_lock);
- return ret;
-}
-
-static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *acc,
- unsigned idx, u64 *v, unsigned nr, bool gc)
-{
- memset(v, 0, sizeof(*v) * nr);
-
- if (unlikely(idx >= acc->k.nr))
- return;
-
- struct accounting_mem_entry *e = &acc->k.data[idx];
-
- nr = min_t(unsigned, nr, e->nr_counters);
-
- for (unsigned i = 0; i < nr; i++)
- v[i] = percpu_u64_get(e->v[gc] + i);
-}
-
-static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p,
- u64 *v, unsigned nr)
-{
- percpu_down_read(&c->mark_lock);
- struct bch_accounting_mem *acc = &c->accounting;
- unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, &p);
-
- bch2_accounting_mem_read_counters(acc, idx, v, nr, false);
- percpu_up_read(&c->mark_lock);
-}
-
-static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset)
-{
- EBUG_ON(!res->ref);
-
- return (struct bversion) {
- .hi = res->seq >> 32,
- .lo = (res->seq << 32) | (res->offset + offset),
- };
-}
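Spelled out with illustrative numbers, the packing above forms a 96-bit version that orders by (journal sequence, offset within the buffer):

	/*
	 * e.g. res->seq = 5, res->offset + offset = 12:
	 *	.hi = 5 >> 32        = 0
	 *	.lo = (5 << 32) | 12 = 0x000000050000000c
	 * i.e. the version is seq * 2^32 + offset, strictly increasing.
	 */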
-
-static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans,
- struct bkey_i_accounting *a,
- unsigned commit_flags)
-{
- a->k.bversion = journal_pos_to_bversion(&trans->journal_res,
- (u64 *) a - (u64 *) trans->journal_entries);
-
- EBUG_ON(bversion_zero(a->k.bversion));
-
- return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))
- ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal)
- : 0;
-}
-
-static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans,
- struct bkey_i_accounting *a_i,
- unsigned commit_flags)
-{
- if (likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
- struct bkey_s_accounting a = accounting_i_to_s(a_i);
-
- bch2_accounting_neg(a);
- bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal);
- bch2_accounting_neg(a);
- }
-}
-
-int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *);
-int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned);
-
-int bch2_gc_accounting_start(struct bch_fs *);
-int bch2_gc_accounting_done(struct bch_fs *);
-
-int bch2_accounting_read(struct bch_fs *);
-
-int bch2_dev_usage_remove(struct bch_fs *, unsigned);
-int bch2_dev_usage_init(struct bch_dev *, bool);
-
-void bch2_verify_accounting_clean(struct bch_fs *c);
-
-void bch2_accounting_gc_free(struct bch_fs *);
-void bch2_fs_accounting_exit(struct bch_fs *);
-
-#endif /* _BCACHEFS_DISK_ACCOUNTING_H */
diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h
deleted file mode 100644
index 15190196485f..000000000000
--- a/fs/bcachefs/disk_accounting_format.h
+++ /dev/null
@@ -1,167 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
-#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
-
-#include "replicas_format.h"
-
-/*
- * Disk accounting - KEY_TYPE_accounting - on disk format:
- *
- * Here, the key has considerably more structure than a typical key (bpos); an
- * accounting key is 'struct disk_accounting_pos', which is a union of bpos.
- *
- * More specifically: a key is just a multiword integer (where word endianness
- * matches native byte order), so we're treating bpos as an opaque 20 byte
- * integer and mapping bch_accounting_key to that.
- *
- * This is a type-tagged union of all our various subtypes; a disk accounting
- * key can be device counters, replicas counters, et cetera - it's extensible.
- *
- * The value is a list of u64s or s64s; the number of counters is specific to a
- * given accounting type.
- *
- * Unlike with other key types, updates are _deltas_, and the deltas are not
- * resolved until the update to the underlying btree, done by btree write buffer
- * flush or journal replay.
- *
- * Journal replay in particular requires special handling. The journal tracks a
- * range of entries which may not yet have been applied to the btree - it does
- * not know definitively whether individual entries are dirty and still need to
- * be applied.
- *
- * To handle this, we use the version field of struct bkey, and give every
- * accounting update a unique version number - a total ordering in time; the
- * version number is derived from the key's position in the journal. Then
- * journal replay can compare the version number of the key from the journal
- * with the version number of the key in the btree to determine if a key needs
- * to be replayed.
- *
- * For this to work, we must maintain this strict time ordering of updates as
- * they are flushed to the btree, both via write buffer flush and via journal
- * replay. This has complications for the write buffer code while journal replay
- * is still in progress; the write buffer cannot flush any accounting keys to
- * the btree until journal replay has finished replaying its accounting keys, or
- * the (newer) version number of the keys from the write buffer will cause
- * updates from journal replay to be lost.
- */
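Sketched in code, the replay decision described above looks roughly like this (illustrative only; the real checks live in journal replay and bch2_accounting_read()):

	/*
	 *	if (bversion_cmp(btree_key_version, journal_key_version) >= 0)
	 *		skip;	// the update was already applied
	 *	else
	 *		apply;	// the journal entry is newer, replay the delta
	 */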
-
-struct bch_accounting {
- struct bch_val v;
- __u64 d[];
-};
-
-#define BCH_ACCOUNTING_MAX_COUNTERS 3
-
-#define BCH_DATA_TYPES() \
- x(free, 0) \
- x(sb, 1) \
- x(journal, 2) \
- x(btree, 3) \
- x(user, 4) \
- x(cached, 5) \
- x(parity, 6) \
- x(stripe, 7) \
- x(need_gc_gens, 8) \
- x(need_discard, 9) \
- x(unstriped, 10)
-
-enum bch_data_type {
-#define x(t, n) BCH_DATA_##t,
- BCH_DATA_TYPES()
-#undef x
- BCH_DATA_NR
-};
-
-static inline bool data_type_is_empty(enum bch_data_type type)
-{
- switch (type) {
- case BCH_DATA_free:
- case BCH_DATA_need_gc_gens:
- case BCH_DATA_need_discard:
- return true;
- default:
- return false;
- }
-}
-
-static inline bool data_type_is_hidden(enum bch_data_type type)
-{
- switch (type) {
- case BCH_DATA_sb:
- case BCH_DATA_journal:
- return true;
- default:
- return false;
- }
-}
-
-#define BCH_DISK_ACCOUNTING_TYPES() \
- x(nr_inodes, 0) \
- x(persistent_reserved, 1) \
- x(replicas, 2) \
- x(dev_data_type, 3) \
- x(compression, 4) \
- x(snapshot, 5) \
- x(btree, 6) \
- x(rebalance_work, 7) \
- x(inum, 8)
-
-enum disk_accounting_type {
-#define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr,
- BCH_DISK_ACCOUNTING_TYPES()
-#undef x
- BCH_DISK_ACCOUNTING_TYPE_NR,
-};
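For reference, one concrete expansion of the x-macro above: inside the enum, x(replicas, 2) becomes BCH_DISK_ACCOUNTING_replicas = 2, so the enum cannot drift out of sync with the BCH_DISK_ACCOUNTING_TYPES() list.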
-
-struct bch_acct_nr_inodes {
-};
-
-struct bch_acct_persistent_reserved {
- __u8 nr_replicas;
-};
-
-struct bch_acct_dev_data_type {
- __u8 dev;
- __u8 data_type;
-};
-
-struct bch_acct_compression {
- __u8 type;
-};
-
-struct bch_acct_snapshot {
- __u32 id;
-} __packed;
-
-struct bch_acct_btree {
- __u32 id;
-} __packed;
-
-struct bch_acct_inum {
- __u64 inum;
-} __packed;
-
-struct bch_acct_rebalance_work {
-};
-
-struct disk_accounting_pos {
- union {
- struct {
- __u8 type;
- union {
- struct bch_acct_nr_inodes nr_inodes;
- struct bch_acct_persistent_reserved persistent_reserved;
- struct bch_replicas_entry_v1 replicas;
- struct bch_acct_dev_data_type dev_data_type;
- struct bch_acct_compression compression;
- struct bch_acct_snapshot snapshot;
- struct bch_acct_btree btree;
- struct bch_acct_rebalance_work rebalance_work;
- struct bch_acct_inum inum;
- } __packed;
- } __packed;
- struct bpos _pad;
- };
-};
-
-#endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */
diff --git a/fs/bcachefs/disk_accounting_types.h b/fs/bcachefs/disk_accounting_types.h
deleted file mode 100644
index b1982131b206..000000000000
--- a/fs/bcachefs/disk_accounting_types.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_ACCOUNTING_TYPES_H
-#define _BCACHEFS_DISK_ACCOUNTING_TYPES_H
-
-#include "darray.h"
-
-struct accounting_mem_entry {
- struct bpos pos;
- struct bversion bversion;
- unsigned nr_counters;
- u64 __percpu *v[2];
-};
-
-struct bch_accounting_mem {
- DARRAY(struct accounting_mem_entry) k;
- bool gc_running;
-};
-
-#endif /* _BCACHEFS_DISK_ACCOUNTING_TYPES_H */
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
deleted file mode 100644
index 5df8de0b8c02..000000000000
--- a/fs/bcachefs/disk_groups.c
+++ /dev/null
@@ -1,616 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "disk_groups.h"
-#include "sb-members.h"
-#include "super-io.h"
-
-#include <linux/sort.h>
-
-static int group_cmp(const void *_l, const void *_r)
-{
- const struct bch_disk_group *l = _l;
- const struct bch_disk_group *r = _r;
-
- return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) -
- (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?:
- ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) -
- (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?:
- strncmp(l->label, r->label, sizeof(l->label));
-}
-
-static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_disk_groups *groups =
- field_to_type(f, disk_groups);
- struct bch_disk_group *g, *sorted = NULL;
- unsigned nr_groups = disk_groups_nr(groups);
- unsigned i, len;
- int ret = 0;
-
- for (i = 0; i < sb->nr_devices; i++) {
- struct bch_member m = bch2_sb_member_get(sb, i);
- unsigned group_id;
-
- if (!BCH_MEMBER_GROUP(&m))
- continue;
-
- group_id = BCH_MEMBER_GROUP(&m) - 1;
-
- if (group_id >= nr_groups) {
- prt_printf(err, "disk %u has invalid label %u (have %u)",
- i, group_id, nr_groups);
- return -BCH_ERR_invalid_sb_disk_groups;
- }
-
- if (BCH_GROUP_DELETED(&groups->entries[group_id])) {
- prt_printf(err, "disk %u has deleted label %u", i, group_id);
- return -BCH_ERR_invalid_sb_disk_groups;
- }
- }
-
- if (!nr_groups)
- return 0;
-
- for (i = 0; i < nr_groups; i++) {
- g = groups->entries + i;
-
- if (BCH_GROUP_DELETED(g))
- continue;
-
- len = strnlen(g->label, sizeof(g->label));
- if (!len) {
- prt_printf(err, "label %u empty", i);
- return -BCH_ERR_invalid_sb_disk_groups;
- }
- }
-
- sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
- if (!sorted)
- return -BCH_ERR_ENOMEM_disk_groups_validate;
-
- memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
- sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);
-
- for (g = sorted; g + 1 < sorted + nr_groups; g++)
- if (!BCH_GROUP_DELETED(g) &&
- !group_cmp(&g[0], &g[1])) {
- prt_printf(err, "duplicate label %llu.%.*s",
- BCH_GROUP_PARENT(g),
- (int) sizeof(g->label), g->label);
- ret = -BCH_ERR_invalid_sb_disk_groups;
- goto err;
- }
-err:
- kfree(sorted);
- return ret;
-}
-
-void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
-{
- out->atomic++;
- rcu_read_lock();
-
- struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
- if (!g)
- goto out;
-
- for (unsigned i = 0; i < g->nr; i++) {
- if (i)
- prt_printf(out, " ");
-
- if (g->entries[i].deleted) {
- prt_printf(out, "[deleted]");
- continue;
- }
-
- prt_printf(out, "[parent %d devs", g->entries[i].parent);
- for_each_member_device_rcu(c, ca, &g->entries[i].devs)
- prt_printf(out, " %s", ca->name);
- prt_printf(out, "]");
- }
-
-out:
- rcu_read_unlock();
- out->atomic--;
-}
-
-static void bch2_sb_disk_groups_to_text(struct printbuf *out,
- struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_disk_groups *groups =
- field_to_type(f, disk_groups);
- struct bch_disk_group *g;
- unsigned nr_groups = disk_groups_nr(groups);
-
- for (g = groups->entries;
- g < groups->entries + nr_groups;
- g++) {
- if (g != groups->entries)
- prt_printf(out, " ");
-
- if (BCH_GROUP_DELETED(g))
- prt_printf(out, "[deleted]");
- else
- prt_printf(out, "[parent %llu name %s]",
- BCH_GROUP_PARENT(g), g->label);
- }
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = {
- .validate = bch2_sb_disk_groups_validate,
- .to_text = bch2_sb_disk_groups_to_text
-};
-
-int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
-{
- struct bch_sb_field_disk_groups *groups;
- struct bch_disk_groups_cpu *cpu_g, *old_g;
- unsigned i, g, nr_groups;
-
- lockdep_assert_held(&c->sb_lock);
-
- groups = bch2_sb_field_get(c->disk_sb.sb, disk_groups);
- nr_groups = disk_groups_nr(groups);
-
- if (!groups)
- return 0;
-
- cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL);
- if (!cpu_g)
- return -BCH_ERR_ENOMEM_disk_groups_to_cpu;
-
- cpu_g->nr = nr_groups;
-
- for (i = 0; i < nr_groups; i++) {
- struct bch_disk_group *src = &groups->entries[i];
- struct bch_disk_group_cpu *dst = &cpu_g->entries[i];
-
- dst->deleted = BCH_GROUP_DELETED(src);
- dst->parent = BCH_GROUP_PARENT(src);
- memcpy(dst->label, src->label, sizeof(dst->label));
- }
-
- for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
- struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i);
- struct bch_disk_group_cpu *dst;
-
- if (!bch2_member_alive(&m))
- continue;
-
- g = BCH_MEMBER_GROUP(&m);
- while (g) {
- dst = &cpu_g->entries[g - 1];
- __set_bit(i, dst->devs.d);
- g = dst->parent;
- }
- }
-
- old_g = rcu_dereference_protected(c->disk_groups,
- lockdep_is_held(&c->sb_lock));
- rcu_assign_pointer(c->disk_groups, cpu_g);
- if (old_g)
- kfree_rcu(old_g, rcu);
-
- return 0;
-}
-
-const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
-{
- struct target t = target_decode(target);
- struct bch_devs_mask *devs;
-
- rcu_read_lock();
-
- switch (t.type) {
- case TARGET_NULL:
- devs = NULL;
- break;
- case TARGET_DEV: {
- struct bch_dev *ca = t.dev < c->sb.nr_devices
- ? rcu_dereference(c->devs[t.dev])
- : NULL;
- devs = ca ? &ca->self : NULL;
- break;
- }
- case TARGET_GROUP: {
- struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
-
- devs = g && t.group < g->nr && !g->entries[t.group].deleted
- ? &g->entries[t.group].devs
- : NULL;
- break;
- }
- default:
- BUG();
- }
-
- rcu_read_unlock();
-
- return devs;
-}
-
-bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
-{
- struct target t = target_decode(target);
-
- switch (t.type) {
- case TARGET_NULL:
- return false;
- case TARGET_DEV:
- return dev == t.dev;
- case TARGET_GROUP: {
- struct bch_disk_groups_cpu *g;
- const struct bch_devs_mask *m;
- bool ret;
-
- rcu_read_lock();
- g = rcu_dereference(c->disk_groups);
- m = g && t.group < g->nr && !g->entries[t.group].deleted
- ? &g->entries[t.group].devs
- : NULL;
-
- ret = m ? test_bit(dev, m->d) : false;
- rcu_read_unlock();
-
- return ret;
- }
- default:
- BUG();
- }
-}
-
-static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
- unsigned parent,
- const char *name, unsigned namelen)
-{
- unsigned i, nr_groups = disk_groups_nr(groups);
-
- if (!namelen || namelen > BCH_SB_LABEL_SIZE)
- return -EINVAL;
-
- for (i = 0; i < nr_groups; i++) {
- struct bch_disk_group *g = groups->entries + i;
-
- if (BCH_GROUP_DELETED(g))
- continue;
-
-		if (BCH_GROUP_PARENT(g) == parent &&
-		    strnlen(g->label, sizeof(g->label)) == namelen &&
-		    !memcmp(name, g->label, namelen))
- return i;
- }
-
- return -1;
-}
-
-static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
- const char *name, unsigned namelen)
-{
- struct bch_sb_field_disk_groups *groups =
- bch2_sb_field_get(sb->sb, disk_groups);
- unsigned i, nr_groups = disk_groups_nr(groups);
- struct bch_disk_group *g;
-
- if (!namelen || namelen > BCH_SB_LABEL_SIZE)
- return -EINVAL;
-
- for (i = 0;
- i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
- i++)
- ;
-
- if (i == nr_groups) {
- unsigned u64s =
- (sizeof(struct bch_sb_field_disk_groups) +
- sizeof(struct bch_disk_group) * (nr_groups + 1)) /
- sizeof(u64);
-
- groups = bch2_sb_field_resize(sb, disk_groups, u64s);
- if (!groups)
- return -BCH_ERR_ENOSPC_disk_label_add;
-
- nr_groups = disk_groups_nr(groups);
- }
-
- BUG_ON(i >= nr_groups);
-
- g = &groups->entries[i];
-
- memcpy(g->label, name, namelen);
- if (namelen < sizeof(g->label))
- g->label[namelen] = '\0';
- SET_BCH_GROUP_DELETED(g, 0);
- SET_BCH_GROUP_PARENT(g, parent);
- SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
-
- return i;
-}
-
-int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name)
-{
- struct bch_sb_field_disk_groups *groups =
- bch2_sb_field_get(sb->sb, disk_groups);
- int v = -1;
-
- do {
- const char *next = strchrnul(name, '.');
- unsigned len = next - name;
-
- if (*next == '.')
- next++;
-
- v = __bch2_disk_group_find(groups, v + 1, name, len);
- name = next;
- } while (*name && v >= 0);
-
- return v;
-}
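A worked example of the dotted-path walk above, with hypothetical labels: bch2_disk_path_find(&sb, "ssd.fast") first resolves "ssd" against parent 0 (the root level), then "fast" with parent set to the index of "ssd" plus one, and returns the index of "fast", or a negative value if either component is missing.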
-
-int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
-{
- struct bch_sb_field_disk_groups *groups;
- unsigned parent = 0;
- int v = -1;
-
- do {
- const char *next = strchrnul(name, '.');
- unsigned len = next - name;
-
- if (*next == '.')
- next++;
-
- groups = bch2_sb_field_get(sb->sb, disk_groups);
-
- v = __bch2_disk_group_find(groups, parent, name, len);
- if (v < 0)
- v = __bch2_disk_group_add(sb, parent, name, len);
- if (v < 0)
- return v;
-
- parent = v + 1;
- name = next;
- } while (*name && v >= 0);
-
- return v;
-}
-
-void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
-{
- struct bch_disk_groups_cpu *groups;
- struct bch_disk_group_cpu *g;
- unsigned nr = 0;
- u16 path[32];
-
- out->atomic++;
- rcu_read_lock();
- groups = rcu_dereference(c->disk_groups);
- if (!groups)
- goto invalid;
-
- while (1) {
- if (nr == ARRAY_SIZE(path))
- goto invalid;
-
- if (v >= groups->nr)
- goto invalid;
-
- g = groups->entries + v;
-
- if (g->deleted)
- goto invalid;
-
- path[nr++] = v;
-
- if (!g->parent)
- break;
-
- v = g->parent - 1;
- }
-
- while (nr) {
- v = path[--nr];
- g = groups->entries + v;
-
- prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
- if (nr)
- prt_printf(out, ".");
- }
-out:
- rcu_read_unlock();
- out->atomic--;
- return;
-invalid:
- prt_printf(out, "invalid label %u", v);
- goto out;
-}
-
-void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
-{
- struct bch_sb_field_disk_groups *groups =
- bch2_sb_field_get(sb, disk_groups);
- struct bch_disk_group *g;
- unsigned nr = 0;
- u16 path[32];
-
- while (1) {
- if (nr == ARRAY_SIZE(path))
- goto inval;
-
- if (v >= disk_groups_nr(groups))
- goto inval;
-
- g = groups->entries + v;
-
- if (BCH_GROUP_DELETED(g))
- goto inval;
-
- path[nr++] = v;
-
- if (!BCH_GROUP_PARENT(g))
- break;
-
- v = BCH_GROUP_PARENT(g) - 1;
- }
-
- while (nr) {
- v = path[--nr];
- g = groups->entries + v;
-
- prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
- if (nr)
- prt_printf(out, ".");
- }
- return;
-inval:
- prt_printf(out, "invalid label %u", v);
-}
-
-int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
-{
- struct bch_member *mi;
- int ret, v = -1;
-
- if (!strlen(name) || !strcmp(name, "none"))
- return 0;
-
- v = bch2_disk_path_find_or_create(&c->disk_sb, name);
- if (v < 0)
- return v;
-
- ret = bch2_sb_disk_groups_to_cpu(c);
- if (ret)
- return ret;
-
- mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- SET_BCH_MEMBER_GROUP(mi, v + 1);
- return 0;
-}
-
-int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
-{
- int ret;
-
- mutex_lock(&c->sb_lock);
- ret = __bch2_dev_group_set(c, ca, name) ?:
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,
- struct printbuf *err)
-{
- struct bch_dev *ca;
- int g;
-
- if (!val)
- return -EINVAL;
-
- if (!c)
- return -BCH_ERR_option_needs_open_fs;
-
- if (!strlen(val) || !strcmp(val, "none")) {
- *res = 0;
- return 0;
- }
-
- /* Is it a device? */
- ca = bch2_dev_lookup(c, val);
- if (!IS_ERR(ca)) {
- *res = dev_to_target(ca->dev_idx);
- bch2_dev_put(ca);
- return 0;
- }
-
- mutex_lock(&c->sb_lock);
- g = bch2_disk_path_find(&c->disk_sb, val);
- mutex_unlock(&c->sb_lock);
-
- if (g >= 0) {
- *res = group_to_target(g);
- return 0;
- }
-
- return -EINVAL;
-}
-
-void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
-{
- struct target t = target_decode(v);
-
- switch (t.type) {
- case TARGET_NULL:
- prt_printf(out, "none");
- break;
- case TARGET_DEV: {
- struct bch_dev *ca;
-
- out->atomic++;
- rcu_read_lock();
- ca = t.dev < c->sb.nr_devices
- ? rcu_dereference(c->devs[t.dev])
- : NULL;
-
- if (ca && percpu_ref_tryget(&ca->io_ref)) {
- prt_printf(out, "/dev/%s", ca->name);
- percpu_ref_put(&ca->io_ref);
- } else if (ca) {
- prt_printf(out, "offline device %u", t.dev);
- } else {
- prt_printf(out, "invalid device %u", t.dev);
- }
-
- rcu_read_unlock();
- out->atomic--;
- break;
- }
- case TARGET_GROUP:
- bch2_disk_path_to_text(out, c, t.group);
- break;
- default:
- BUG();
- }
-}
-
-static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
-{
- struct target t = target_decode(v);
-
- switch (t.type) {
- case TARGET_NULL:
- prt_printf(out, "none");
- break;
- case TARGET_DEV: {
- struct bch_member m = bch2_sb_member_get(sb, t.dev);
-
- if (bch2_member_exists(sb, t.dev)) {
- prt_printf(out, "Device ");
- pr_uuid(out, m.uuid.b);
- prt_printf(out, " (%u)", t.dev);
- } else {
- prt_printf(out, "Bad device %u", t.dev);
- }
- break;
- }
- case TARGET_GROUP:
- bch2_disk_path_to_text_sb(out, sb, t.group);
- break;
- default:
- BUG();
- }
-}
-
-void bch2_opt_target_to_text(struct printbuf *out,
- struct bch_fs *c,
- struct bch_sb *sb,
- u64 v)
-{
- if (c)
- bch2_target_to_text(out, c, v);
- else
- bch2_target_to_text_sb(out, sb, v);
-}
diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h
deleted file mode 100644
index 441826fff224..000000000000
--- a/fs/bcachefs/disk_groups.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_GROUPS_H
-#define _BCACHEFS_DISK_GROUPS_H
-
-#include "disk_groups_types.h"
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
-
-static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
-{
- return groups
- ? (vstruct_end(&groups->field) -
- (void *) &groups->entries[0]) / sizeof(struct bch_disk_group)
- : 0;
-}
-
-struct target {
- enum {
- TARGET_NULL,
- TARGET_DEV,
- TARGET_GROUP,
- } type;
- union {
- unsigned dev;
- unsigned group;
- };
-};
-
-#define TARGET_DEV_START 1
-#define TARGET_GROUP_START (256 + TARGET_DEV_START)
-
-static inline u16 dev_to_target(unsigned dev)
-{
- return TARGET_DEV_START + dev;
-}
-
-static inline u16 group_to_target(unsigned group)
-{
- return TARGET_GROUP_START + group;
-}
-
-static inline struct target target_decode(unsigned target)
-{
- if (target >= TARGET_GROUP_START)
- return (struct target) {
- .type = TARGET_GROUP,
- .group = target - TARGET_GROUP_START
- };
-
- if (target >= TARGET_DEV_START)
- return (struct target) {
- .type = TARGET_DEV,
- .dev = target - TARGET_DEV_START
- };
-
- return (struct target) { .type = TARGET_NULL };
-}
-
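-/*
- * A minimal sketch of the target encoding above (illustrative, not part of
- * the original header): member devices occupy values [1, 256] and disk
- * groups start at 257, so encode/decode round-trips as:
- *
- *   struct target t;
- *
- *   t = target_decode(dev_to_target(3));
- *   // t.type == TARGET_DEV, t.dev == 3
- *
- *   t = target_decode(group_to_target(0));
- *   // t.type == TARGET_GROUP, t.group == 0 (the first label)
- *
- *   t = target_decode(0);
- *   // t.type == TARGET_NULL: no target configured
- */
-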
-const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
-
-static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c,
- enum bch_data_type data_type,
- u16 target)
-{
- struct bch_devs_mask devs = c->rw_devs[data_type];
- const struct bch_devs_mask *t = bch2_target_to_mask(c, target);
-
- if (t)
- bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
- return devs;
-}
-
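-/*
- * Usage sketch (illustrative): target_rw_devs() narrows the filesystem's rw
- * device mask for a data type down to a target. For target == 0 (decoded as
- * TARGET_NULL) bch2_target_to_mask() is expected to return NULL, in which
- * case the full rw mask is returned unchanged:
- *
- *   struct bch_devs_mask devs =
- *           target_rw_devs(c, BCH_DATA_user, 0);   // all rw user devices
- */
-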
-static inline bool bch2_target_accepts_data(struct bch_fs *c,
- enum bch_data_type data_type,
- u16 target)
-{
- struct bch_devs_mask rw_devs = target_rw_devs(c, data_type, target);
- return !bitmap_empty(rw_devs.d, BCH_SB_MEMBERS_MAX);
-}
-
-bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
-
-int bch2_disk_path_find(struct bch_sb_handle *, const char *);
-
-/* Exported for userspace bcachefs-tools: */
-int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
-
-void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned);
-void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned);
-
-void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned);
-
-int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
-void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
-
-#define bch2_opt_target (struct bch_opt_fn) { \
- .parse = bch2_opt_target_parse, \
- .to_text = bch2_opt_target_to_text, \
-}
-
-int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
-
-int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
-int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
-
-const char *bch2_sb_validate_disk_groups(struct bch_sb *,
- struct bch_sb_field *);
-
-void bch2_disk_groups_to_text(struct printbuf *, struct bch_fs *);
-
-#endif /* _BCACHEFS_DISK_GROUPS_H */
diff --git a/fs/bcachefs/disk_groups_format.h b/fs/bcachefs/disk_groups_format.h
deleted file mode 100644
index 698990bbf1d2..000000000000
--- a/fs/bcachefs/disk_groups_format.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_GROUPS_FORMAT_H
-#define _BCACHEFS_DISK_GROUPS_FORMAT_H
-
-#define BCH_SB_LABEL_SIZE 32
-
-struct bch_disk_group {
- __u8 label[BCH_SB_LABEL_SIZE];
- __le64 flags[2];
-} __packed __aligned(8);
-
-LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
-LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
-LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
-
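-/*
- * Field layout sketch for flags[0] above (illustrative; LE64_BITMASK(name,
- * type, field, start, end) covers bits [start, end)). Open-coded accessors
- * would read:
- *
- *   deleted      = (le64_to_cpu(g->flags[0]) >> 0) & 0x1;     // bit  0
- *   data_allowed = (le64_to_cpu(g->flags[0]) >> 1) & 0x1f;    // bits 1-5
- *   parent       = (le64_to_cpu(g->flags[0]) >> 6) & 0x3ffff; // bits 6-23
- *
- * The macros generate equivalent getters/setters, e.g. BCH_GROUP_PARENT()
- * and SET_BCH_GROUP_PARENT().
- */
-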
-struct bch_sb_field_disk_groups {
- struct bch_sb_field field;
- struct bch_disk_group entries[];
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_DISK_GROUPS_FORMAT_H */
diff --git a/fs/bcachefs/disk_groups_types.h b/fs/bcachefs/disk_groups_types.h
deleted file mode 100644
index a54ef085b13d..000000000000
--- a/fs/bcachefs/disk_groups_types.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_GROUPS_TYPES_H
-#define _BCACHEFS_DISK_GROUPS_TYPES_H
-
-struct bch_disk_group_cpu {
- bool deleted;
- u16 parent;
- u8 label[BCH_SB_LABEL_SIZE];
- struct bch_devs_mask devs;
-};
-
-struct bch_disk_groups_cpu {
- struct rcu_head rcu;
- unsigned nr;
- struct bch_disk_group_cpu entries[] __counted_by(nr);
-};
-
-#endif /* _BCACHEFS_DISK_GROUPS_TYPES_H */
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
deleted file mode 100644
index f2b9225fe0bc..000000000000
--- a/fs/bcachefs/ec.c
+++ /dev/null
@@ -1,2347 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/* erasure coding */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "bset.h"
-#include "btree_gc.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "disk_accounting.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "error.h"
-#include "io_read.h"
-#include "io_write.h"
-#include "keylist.h"
-#include "lru.h"
-#include "recovery.h"
-#include "replicas.h"
-#include "super-io.h"
-#include "util.h"
-
-#include <linux/sort.h>
-#include <linux/string_choices.h>
-
-#ifdef __KERNEL__
-
-#include <linux/raid/pq.h>
-#include <linux/raid/xor.h>
-
-static void raid5_recov(unsigned disks, unsigned failed_idx,
- size_t size, void **data)
-{
- unsigned i = 2, nr;
-
- BUG_ON(failed_idx >= disks);
-
- swap(data[0], data[failed_idx]);
- memcpy(data[0], data[1], size);
-
- while (i < disks) {
- nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
- xor_blocks(nr, size, data[0], data + i);
- i += nr;
- }
-
- swap(data[0], data[failed_idx]);
-}
-
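-/*
- * How the reconstruction above works (illustrative): RAID5 parity is the
- * XOR of all data blocks, so any single lost block is the XOR of the
- * survivors. E.g. with P = D0 ^ D1 ^ D2, a lost D1 is recovered as:
- *
- *   D1 = P ^ D0 ^ D2
- *
- * raid5_recov() computes exactly this, temporarily swapping the failed
- * block's buffer into slot 0 as the accumulator and folding in up to
- * MAX_XOR_BLOCKS source buffers per xor_blocks() call.
- */
-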
-static void raid_gen(int nd, int np, size_t size, void **v)
-{
- if (np >= 1)
- raid5_recov(nd + np, nd, size, v);
- if (np >= 2)
- raid6_call.gen_syndrome(nd + np, size, v);
- BUG_ON(np > 2);
-}
-
-static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
-{
- switch (nr) {
- case 0:
- break;
- case 1:
- if (ir[0] < nd + 1)
- raid5_recov(nd + 1, ir[0], size, v);
- else
- raid6_call.gen_syndrome(nd + np, size, v);
- break;
- case 2:
- if (ir[1] < nd) {
- /* data+data failure. */
- raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
- } else if (ir[0] < nd) {
- /* data + p/q failure */
-
- if (ir[1] == nd) /* data + p failure */
- raid6_datap_recov(nd + np, size, ir[0], v);
- else { /* data + q failure */
- raid5_recov(nd + 1, ir[0], size, v);
- raid6_call.gen_syndrome(nd + np, size, v);
- }
- } else {
- raid_gen(nd, np, size, v);
- }
- break;
- default:
- BUG();
- }
-}
-
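-/*
- * Worked example for the dispatch above (illustrative): with nd = 4 data
- * blocks and np = 2 redundancy blocks the layout is D0 D1 D2 D3 P Q
- * (indices 0..5), and sorted failure lists ir[] are handled as:
- *
- *   ir = { 1 }    -> raid5_recov(): rebuild D1 from P and survivors
- *   ir = { 5 }    -> gen_syndrome(): regenerate Q
- *   ir = { 1, 3 } -> raid6_2data_recov(): two data blocks lost
- *   ir = { 1, 4 } -> raid6_datap_recov(): data block + P lost
- *   ir = { 1, 5 } -> rebuild D1 via RAID5, then regenerate Q
- *   ir = { 4, 5 } -> only redundancy lost: recompute via raid_gen()
- */
-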
-#else
-
-#include <raid/raid.h>
-
-#endif
-
-struct ec_bio {
- struct bch_dev *ca;
- struct ec_stripe_buf *buf;
- size_t idx;
- u64 submit_time;
- struct bio bio;
-};
-
-/* Stripes btree keys: */
-
-int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- int ret = 0;
-
- bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
- bpos_gt(k.k->p, POS(0, U32_MAX)),
- c, stripe_pos_bad,
- "stripe at bad pos");
-
- bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s),
- c, stripe_val_size_bad,
- "incorrect value size (%zu < %u)",
- bkey_val_u64s(k.k), stripe_val_u64s(s));
-
- bkey_fsck_err_on(s->csum_granularity_bits >= 64,
- c, stripe_csum_granularity_bad,
- "invalid csum granularity (%u >= 64)",
- s->csum_granularity_bits);
-
- ret = bch2_bkey_ptrs_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v;
- struct bch_stripe s = {};
-
- memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k)));
-
- unsigned nr_data = s.nr_blocks - s.nr_redundant;
-
- prt_printf(out, "algo %u sectors %u blocks %u:%u csum ",
- s.algorithm,
- le16_to_cpu(s.sectors),
- nr_data,
- s.nr_redundant);
- bch2_prt_csum_type(out, s.csum_type);
- prt_str(out, " gran ");
- if (s.csum_granularity_bits < 64)
- prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits);
- else
- prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits);
-
- if (s.disk_label) {
- prt_str(out, " label");
- bch2_disk_path_to_text(out, c, s.disk_label - 1);
- }
-
- for (unsigned i = 0; i < s.nr_blocks; i++) {
- const struct bch_extent_ptr *ptr = sp->ptrs + i;
-
- if ((void *) ptr >= bkey_val_end(k))
- break;
-
- prt_char(out, ' ');
- bch2_extent_ptr_to_text(out, c, ptr);
-
- if (s.csum_type < BCH_CSUM_NR &&
- i < nr_data &&
- stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k))
- prt_printf(out, "#%u", stripe_blockcount_get(sp, i));
- }
-}
-
-/* Triggers: */
-
-static int __mark_stripe_bucket(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bkey_s_c_stripe s,
- unsigned ptr_idx, bool deleting,
- struct bpos bucket,
- struct bch_alloc_v4 *a,
- enum btree_iter_update_trigger_flags flags)
-{
- const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
- unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant;
- bool parity = ptr_idx >= nr_data;
- enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
- s64 sectors = parity ? le16_to_cpu(s.v->sectors) : 0;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct bch_fs *c = trans->c;
- if (deleting)
- sectors = -sectors;
-
- if (!deleting) {
- if (bch2_trans_inconsistent_on(a->stripe ||
- a->stripe_redundancy, trans,
- "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s",
- bucket.inode, bucket.offset, a->gen,
- bch2_data_type_str(a->data_type),
- a->dirty_sectors,
- a->stripe, s.k->p.offset,
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
- goto err;
- }
-
- if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans,
- "bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s",
- bucket.inode, bucket.offset, a->gen,
- bch2_data_type_str(a->data_type),
- a->dirty_sectors,
- a->cached_sectors,
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
- goto err;
- }
- } else {
- if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset ||
- a->stripe_redundancy != s.v->nr_redundant, trans,
- "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s",
- bucket.inode, bucket.offset, a->gen,
- a->stripe,
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
- goto err;
- }
-
- if (bch2_trans_inconsistent_on(a->data_type != data_type, trans,
- "bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s",
- bucket.inode, bucket.offset, a->gen,
- bch2_data_type_str(a->data_type),
- bch2_data_type_str(data_type),
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
- goto err;
- }
-
- if (bch2_trans_inconsistent_on(parity &&
- (a->dirty_sectors != -sectors ||
- a->cached_sectors), trans,
- "bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s",
- bucket.inode, bucket.offset, a->gen,
- a->dirty_sectors,
- a->cached_sectors,
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
- goto err;
- }
- }
-
- if (sectors) {
- ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type,
- a->gen, a->data_type, &a->dirty_sectors);
- if (ret)
- goto err;
- }
-
- if (!deleting) {
- a->stripe = s.k->p.offset;
- a->stripe_redundancy = s.v->nr_redundant;
- alloc_data_type_set(a, data_type);
- } else {
- a->stripe = 0;
- a->stripe_redundancy = 0;
- alloc_data_type_set(a, BCH_DATA_user);
- }
-err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static int mark_stripe_bucket(struct btree_trans *trans,
- struct bkey_s_c_stripe s,
- unsigned ptr_idx, bool deleting,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
- if (unlikely(!ca)) {
- if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite))
- ret = -BCH_ERR_mark_stripe;
- goto err;
- }
-
- struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
- if (flags & BTREE_TRIGGER_transactional) {
- struct extent_ptr_decoded p = {
- .ptr = *ptr,
- .crc = bch2_extent_crc_unpack(s.k, NULL),
- };
- struct bkey_i_backpointer bp;
- bch2_extent_ptr_to_bp(c, BTREE_ID_stripes, 0, s.s_c, p,
- (const union bch_extent_entry *) ptr, &bp);
-
- struct bkey_i_alloc_v4 *a =
- bch2_trans_start_alloc_update(trans, bucket, 0);
- ret = PTR_ERR_OR_ZERO(a) ?:
- __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?:
- bch2_bucket_backpointer_mod(trans, s.s_c, &bp,
- !(flags & BTREE_TRIGGER_overwrite));
- if (ret)
- goto err;
- }
-
- if (flags & BTREE_TRIGGER_gc) {
- struct bucket *g = gc_bucket(ca, bucket.offset);
- if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
- ptr->dev,
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
- goto err;
- }
-
- bucket_lock(g);
- struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
- ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
- alloc_to_bucket(g, new);
- bucket_unlock(g);
-
- if (!ret)
- ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
- }
-err:
- bch2_dev_put(ca);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int mark_stripe_buckets(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- enum btree_iter_update_trigger_flags flags)
-{
- const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(old).v : NULL;
- const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(new).v : NULL;
-
- BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks);
-
- unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
-
- for (unsigned i = 0; i < nr_blocks; i++) {
- if (new_s && old_s &&
- !memcmp(&new_s->ptrs[i],
- &old_s->ptrs[i],
- sizeof(new_s->ptrs[i])))
- continue;
-
- if (new_s) {
- int ret = mark_stripe_bucket(trans,
- bkey_s_c_to_stripe(new), i, false, flags);
- if (ret)
- return ret;
- }
-
- if (old_s) {
- int ret = mark_stripe_bucket(trans,
- bkey_s_c_to_stripe(old), i, true, flags);
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-
-int bch2_trigger_stripe(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s _new,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bkey_s_c new = _new.s_c;
- struct bch_fs *c = trans->c;
- u64 idx = new.k->p.offset;
- const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(old).v : NULL;
- const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(new).v : NULL;
-
- if (unlikely(flags & BTREE_TRIGGER_check_repair))
- return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags);
-
- BUG_ON(new_s && old_s &&
- (new_s->nr_blocks != old_s->nr_blocks ||
- new_s->nr_redundant != old_s->nr_redundant));
-
- if (flags & BTREE_TRIGGER_transactional) {
- int ret = bch2_lru_change(trans,
- BCH_LRU_STRIPE_FRAGMENTATION,
- idx,
- stripe_lru_pos(old_s),
- stripe_lru_pos(new_s));
- if (ret)
- return ret;
- }
-
- if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
- /*
- * If the pointers aren't changing, we don't need to do anything:
- */
- if (new_s && old_s &&
- new_s->nr_blocks == old_s->nr_blocks &&
- new_s->nr_redundant == old_s->nr_redundant &&
- !memcmp(old_s->ptrs, new_s->ptrs,
- new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
- return 0;
-
- struct gc_stripe *gc = NULL;
- if (flags & BTREE_TRIGGER_gc) {
- gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
- if (!gc) {
- bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx);
- return -BCH_ERR_ENOMEM_mark_stripe;
- }
-
- /*
- * This will be wrong when we bring back runtime gc: we should
- * be unmarking the old key and then marking the new key
- *
- * Also: when we bring back runtime gc, this will need locking
- */
- gc->alive = true;
- gc->sectors = le16_to_cpu(new_s->sectors);
- gc->nr_blocks = new_s->nr_blocks;
- gc->nr_redundant = new_s->nr_redundant;
-
- for (unsigned i = 0; i < new_s->nr_blocks; i++)
- gc->ptrs[i] = new_s->ptrs[i];
-
- /*
- * gc recalculates this field from stripe ptr
- * references:
- */
- memset(gc->block_sectors, 0, sizeof(gc->block_sectors));
- }
-
- if (new_s) {
- s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant;
-
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_replicas,
- };
- bch2_bkey_to_replicas(&acc.replicas, new);
- int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
- if (ret)
- return ret;
-
- if (gc)
- memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas));
- }
-
- if (old_s) {
- s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant;
-
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_replicas,
- };
- bch2_bkey_to_replicas(&acc.replicas, old);
- int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
- if (ret)
- return ret;
- }
-
- int ret = mark_stripe_buckets(trans, old, new, flags);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/* returns the matching ptr, setting *block to the blocknr in the stripe that we matched: */
-static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
- struct bkey_s_c k, unsigned *block)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
-
- bkey_for_each_ptr(ptrs, ptr)
- for (i = 0; i < nr_data; i++)
- if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
- le16_to_cpu(s->sectors))) {
- *block = i;
- return ptr;
- }
-
- return NULL;
-}
-
-static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
-{
- switch (k.k->type) {
- case KEY_TYPE_extent: {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const union bch_extent_entry *entry;
-
- extent_for_each_entry(e, entry)
- if (extent_entry_type(entry) ==
- BCH_EXTENT_ENTRY_stripe_ptr &&
- entry->stripe_ptr.idx == idx)
- return true;
-
- break;
- }
- }
-
- return false;
-}
-
-/* Stripe bufs: */
-
-static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
-{
- if (buf->key.k.type == KEY_TYPE_stripe) {
- struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key);
- unsigned i;
-
- for (i = 0; i < s->v.nr_blocks; i++) {
- kvfree(buf->data[i]);
- buf->data[i] = NULL;
- }
- }
-}
-
-/* XXX: this is a non-mempoolified memory allocation: */
-static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
- unsigned offset, unsigned size)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned csum_granularity = 1U << v->csum_granularity_bits;
- unsigned end = offset + size;
- unsigned i;
-
- BUG_ON(end > le16_to_cpu(v->sectors));
-
- offset = round_down(offset, csum_granularity);
- end = min_t(unsigned, le16_to_cpu(v->sectors),
- round_up(end, csum_granularity));
-
- buf->offset = offset;
- buf->size = end - offset;
-
- memset(buf->valid, 0xFF, sizeof(buf->valid));
-
- for (i = 0; i < v->nr_blocks; i++) {
- buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
- if (!buf->data[i])
- goto err;
- }
-
- return 0;
-err:
- ec_stripe_buf_exit(buf);
- return -BCH_ERR_ENOMEM_stripe_buf;
-}
-
-/* Checksumming: */
-
-static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
- unsigned block, unsigned offset)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned csum_granularity = 1 << v->csum_granularity_bits;
- unsigned end = buf->offset + buf->size;
- unsigned len = min(csum_granularity, end - offset);
-
- BUG_ON(offset >= end);
- BUG_ON(offset < buf->offset);
- BUG_ON(offset & (csum_granularity - 1));
- BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
- (len & (csum_granularity - 1)));
-
- return bch2_checksum(NULL, v->csum_type,
- null_nonce(),
- buf->data[block] + ((offset - buf->offset) << 9),
- len << 9);
-}
-
-static void ec_generate_checksums(struct ec_stripe_buf *buf)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned i, j, csums_per_device = stripe_csums_per_device(v);
-
- if (!v->csum_type)
- return;
-
- BUG_ON(buf->offset);
- BUG_ON(buf->size != le16_to_cpu(v->sectors));
-
- for (i = 0; i < v->nr_blocks; i++)
- for (j = 0; j < csums_per_device; j++)
- stripe_csum_set(v, i, j,
- ec_block_checksum(buf, i, j << v->csum_granularity_bits));
-}
-
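-/*
- * Sizing sketch (illustrative, assuming stripe_csums_per_device() is
- * roughly sectors / csum granularity): a stripe with sectors = 128 and
- * csum_granularity_bits = 3 carries one checksum per 8-sector chunk, i.e.
- * 16 checksums per block, which the double loop above fills in.
- */
-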
-static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned csum_granularity = 1 << v->csum_granularity_bits;
- unsigned i;
-
- if (!v->csum_type)
- return;
-
- for (i = 0; i < v->nr_blocks; i++) {
- unsigned offset = buf->offset;
- unsigned end = buf->offset + buf->size;
-
- if (!test_bit(i, buf->valid))
- continue;
-
- while (offset < end) {
- unsigned j = offset >> v->csum_granularity_bits;
- unsigned len = min(csum_granularity, end - offset);
- struct bch_csum want = stripe_csum_get(v, i, j);
- struct bch_csum got = ec_block_checksum(buf, i, offset);
-
- if (bch2_crc_cmp(want, got)) {
- struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev);
- if (ca) {
- struct printbuf err = PRINTBUF;
-
- prt_str(&err, "stripe ");
- bch2_csum_err_msg(&err, v->csum_type, want, got);
- prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
- bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
- bch_err_ratelimited(ca, "%s", err.buf);
- printbuf_exit(&err);
-
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
- }
-
- clear_bit(i, buf->valid);
- break;
- }
-
- offset += len;
- }
- }
-}
-
-/* Erasure coding: */
-
-static void ec_generate_ec(struct ec_stripe_buf *buf)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned nr_data = v->nr_blocks - v->nr_redundant;
- unsigned bytes = le16_to_cpu(v->sectors) << 9;
-
- raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
-}
-
-static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
-
- return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks);
-}
-
-static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
- unsigned nr_data = v->nr_blocks - v->nr_redundant;
- unsigned bytes = buf->size << 9;
-
- if (ec_nr_failed(buf) > v->nr_redundant) {
- bch_err_ratelimited(c,
- "error doing reconstruct read: unable to read enough blocks");
- return -1;
- }
-
- for (i = 0; i < nr_data; i++)
- if (!test_bit(i, buf->valid))
- failed[nr_failed++] = i;
-
- raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
- return 0;
-}
-
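-/*
- * Example (illustrative): for a 4+2 stripe whose reads of blocks 1 and 3
- * failed, bits 1 and 3 of buf->valid are clear, so the loop above builds
- * failed[] = { 1, 3 } with nr_failed = 2 <= nr_redundant, and raid_rec()
- * reconstructs both data blocks in place in buf->data[].
- */
-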
-/* IO: */
-
-static void ec_block_endio(struct bio *bio)
-{
- struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
- struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v;
- struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
- struct bch_dev *ca = ec_bio->ca;
- struct closure *cl = bio->bi_private;
-
- bch2_account_io_completion(ca, bio_data_dir(bio),
- ec_bio->submit_time, !bio->bi_status);
-
- if (bio->bi_status) {
- bch_err_dev_ratelimited(ca, "erasure coding %s error: %s",
- str_write_read(bio_data_dir(bio)),
- bch2_blk_status_to_str(bio->bi_status));
- clear_bit(ec_bio->idx, ec_bio->buf->valid);
- }
-
- int stale = dev_ptr_stale(ca, ptr);
- if (stale) {
- bch_err_ratelimited(ca->fs,
- "error %s stripe: stale/invalid pointer (%i) after io",
- bio_data_dir(bio) == READ ? "reading from" : "writing to",
- stale);
- clear_bit(ec_bio->idx, ec_bio->buf->valid);
- }
-
- bio_put(&ec_bio->bio);
- percpu_ref_put(&ca->io_ref);
- closure_put(cl);
-}
-
-static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
- blk_opf_t opf, unsigned idx, struct closure *cl)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned offset = 0, bytes = buf->size << 9;
- struct bch_extent_ptr *ptr = &v->ptrs[idx];
- enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
- ? BCH_DATA_user
- : BCH_DATA_parity;
- int rw = op_is_write(opf);
-
- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw);
- if (!ca) {
- clear_bit(idx, buf->valid);
- return;
- }
-
- int stale = dev_ptr_stale(ca, ptr);
- if (stale) {
- bch_err_ratelimited(c,
- "error %s stripe: stale pointer (%i)",
- rw == READ ? "reading from" : "writing to",
- stale);
- clear_bit(idx, buf->valid);
- return;
- }
-
- this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
-
- while (offset < bytes) {
- unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
- DIV_ROUND_UP(bytes, PAGE_SIZE));
- unsigned b = min_t(size_t, bytes - offset,
- nr_iovecs << PAGE_SHIFT);
- struct ec_bio *ec_bio;
-
- ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
- nr_iovecs,
- opf,
- GFP_KERNEL,
- &c->ec_bioset),
- struct ec_bio, bio);
-
- ec_bio->ca = ca;
- ec_bio->buf = buf;
- ec_bio->idx = idx;
- ec_bio->submit_time = local_clock();
-
- ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
- ec_bio->bio.bi_end_io = ec_block_endio;
- ec_bio->bio.bi_private = cl;
-
- bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
-
- closure_get(cl);
- percpu_ref_get(&ca->io_ref);
-
- submit_bio(&ec_bio->bio);
-
- offset += b;
- }
-
- percpu_ref_put(&ca->io_ref);
-}
-
-static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
- struct ec_stripe_buf *stripe)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
- POS(0, idx), BTREE_ITER_slots);
- ret = bkey_err(k);
- if (ret)
- goto err;
- if (k.k->type != KEY_TYPE_stripe) {
- ret = -ENOENT;
- goto err;
- }
- bkey_reassemble(&stripe->key, k);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/* recovery read path: */
-int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
- struct bkey_s_c orig_k)
-{
- struct bch_fs *c = trans->c;
- struct ec_stripe_buf *buf = NULL;
- struct closure cl;
- struct bch_stripe *v;
- unsigned i, offset;
- const char *msg = NULL;
- struct printbuf msgbuf = PRINTBUF;
- int ret = 0;
-
- closure_init_stack(&cl);
-
- BUG_ON(!rbio->pick.has_ec);
-
- buf = kzalloc(sizeof(*buf), GFP_NOFS);
- if (!buf)
- return -BCH_ERR_ENOMEM_ec_read_extent;
-
- ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
- if (ret) {
- msg = "stripe not found";
- goto err;
- }
-
- v = &bkey_i_to_stripe(&buf->key)->v;
-
- if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
- msg = "pointer doesn't match stripe";
- goto err;
- }
-
- offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
- if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
- msg = "read is bigger than stripe";
- goto err;
- }
-
- ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
- if (ret) {
- msg = "-ENOMEM";
- goto err;
- }
-
- for (i = 0; i < v->nr_blocks; i++)
- ec_block_io(c, buf, REQ_OP_READ, i, &cl);
-
- closure_sync(&cl);
-
- if (ec_nr_failed(buf) > v->nr_redundant) {
- msg = "unable to read enough blocks";
- goto err;
- }
-
- ec_validate_checksums(c, buf);
-
- ret = ec_do_recov(c, buf);
- if (ret)
- goto err;
-
- memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
- buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
-out:
- ec_stripe_buf_exit(buf);
- kfree(buf);
- return ret;
-err:
- bch2_bkey_val_to_text(&msgbuf, c, orig_k);
- bch_err_ratelimited(c,
- "error doing reconstruct read: %s\n %s", msg, msgbuf.buf);
- printbuf_exit(&msgbuf);
- ret = -BCH_ERR_stripe_reconstruct;
- goto out;
-}
-
-/* stripe bucket accounting: */
-
-static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
-{
- if (c->gc_pos.phase != GC_PHASE_not_running &&
- !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
- return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
-
- return 0;
-}
-
-static int ec_stripe_mem_alloc(struct btree_trans *trans,
- struct btree_iter *iter)
-{
- return allocate_dropping_locks_errcode(trans,
- __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
-}
-
-/*
- * Hash table of open stripes:
- * Stripes that are being created or modified are kept in a hash table, so that
- * stripe deletion can skip them.
- */
-
-static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
-{
- unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
- struct ec_stripe_new *s;
-
- hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash)
- if (s->idx == idx)
- return true;
- return false;
-}
-
-static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx)
-{
- bool ret = false;
-
- spin_lock(&c->ec_stripes_new_lock);
- ret = __bch2_stripe_is_open(c, idx);
- spin_unlock(&c->ec_stripes_new_lock);
-
- return ret;
-}
-
-static bool bch2_try_open_stripe(struct bch_fs *c,
- struct ec_stripe_new *s,
- u64 idx)
-{
- bool ret;
-
- spin_lock(&c->ec_stripes_new_lock);
- ret = !__bch2_stripe_is_open(c, idx);
- if (ret) {
- unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
-
- s->idx = idx;
- hlist_add_head(&s->hash, &c->ec_stripes_new[hash]);
- }
- spin_unlock(&c->ec_stripes_new_lock);
-
- return ret;
-}
-
-static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
-{
- BUG_ON(!s->idx);
-
- spin_lock(&c->ec_stripes_new_lock);
- hlist_del_init(&s->hash);
- spin_unlock(&c->ec_stripes_new_lock);
-
- s->idx = 0;
-}
-
-/* stripe deletion */
-
-static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
- BTREE_ID_stripes, POS(0, idx),
- BTREE_ITER_intent);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- /*
- * We expect write buffer races here
- * Important: check stripe_is_open with stripe key locked:
- */
- if (k.k->type == KEY_TYPE_stripe &&
- !bch2_stripe_is_open(trans->c, idx) &&
- stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1)
- ret = bch2_btree_delete_at(trans, &iter, 0);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/*
- * XXX
- * can we kill this and delete stripes from the trigger?
- */
-static void ec_stripe_delete_work(struct work_struct *work)
-{
- struct bch_fs *c =
- container_of(work, struct bch_fs, ec_stripe_delete_work);
-
- bch2_trans_run(c,
- bch2_btree_write_buffer_tryflush(trans) ?:
- for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru,
- lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0),
- lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX),
- 0, lru_k,
- NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc, ({
- ec_stripe_delete(trans, lru_k.k->p.offset);
- })));
- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
-}
-
-void bch2_do_stripe_deletes(struct bch_fs *c)
-{
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
- !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
-}
-
-/* stripe creation: */
-
-static int ec_stripe_key_update(struct btree_trans *trans,
- struct bkey_i_stripe *old,
- struct bkey_i_stripe *new)
-{
- struct bch_fs *c = trans->c;
- bool create = !old;
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
- new->k.p, BTREE_ITER_intent);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe),
- c, "error %s stripe: got existing key type %s",
- create ? "creating" : "updating",
- bch2_bkey_types[k.k->type])) {
- ret = -EINVAL;
- goto err;
- }
-
- if (k.k->type == KEY_TYPE_stripe) {
- const struct bch_stripe *v = bkey_s_c_to_stripe(k).v;
-
- BUG_ON(old->v.nr_blocks != new->v.nr_blocks);
- BUG_ON(old->v.nr_blocks != v->nr_blocks);
-
- for (unsigned i = 0; i < new->v.nr_blocks; i++) {
- unsigned sectors = stripe_blockcount_get(v, i);
-
- if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) {
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "stripe changed nonempty block %u", i);
- prt_str(&buf, "\nold: ");
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, "\nnew: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i));
- bch2_fs_inconsistent(c, "%s", buf.buf);
- printbuf_exit(&buf);
- ret = -EINVAL;
- goto err;
- }
-
- /*
- * If the stripe ptr changed underneath us, it must have
- * been dev_remove_stripes() -> invalidate_stripe_to_dev()
- */
- if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) {
- BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID);
-
- if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]))
- new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID;
- }
-
- stripe_blockcount_set(&new->v, i, sectors);
- }
- }
-
- ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int ec_stripe_update_extent(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bpos bucket, u8 gen,
- struct ec_stripe_buf *s,
- struct bkey_s_c_backpointer bp,
- struct bkey_buf *last_flushed)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- const struct bch_extent_ptr *ptr_c;
- struct bch_extent_ptr *ec_ptr = NULL;
- struct bch_extent_stripe_ptr stripe_ptr;
- struct bkey_i *n;
- int ret, dev, block;
-
- if (bp.v->level) {
- struct printbuf buf = PRINTBUF;
- struct btree_iter node_iter;
- struct btree *b;
-
- b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed);
- bch2_trans_iter_exit(trans, &node_iter);
-
- if (!b)
- return 0;
-
- prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
- bch2_bkey_val_to_text(&buf, c, bp.s_c);
-
- bch2_fs_inconsistent(c, "%s", buf.buf);
- printbuf_exit(&buf);
- return -BCH_ERR_erasure_coding_found_btree_node;
- }
-
- k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed);
- ret = bkey_err(k);
- if (ret)
- return ret;
- if (!k.k) {
- /*
- * extent no longer exists - we could flush the btree
- * write buffer and retry to verify, but no need:
- */
- return 0;
- }
-
- if (extent_has_stripe_ptr(k, s->key.k.p.offset))
- goto out;
-
- ptr_c = bkey_matches_stripe(v, k, &block);
- /*
- * It doesn't generally make sense to erasure code cached ptrs:
- * XXX: should we be incrementing a counter?
- */
- if (!ptr_c || ptr_c->cached)
- goto out;
-
- dev = v->ptrs[block].dev;
-
- n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto out;
-
- bkey_reassemble(n, k);
-
- bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev);
- ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
- BUG_ON(!ec_ptr);
-
- stripe_ptr = (struct bch_extent_stripe_ptr) {
- .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
- .block = block,
- .redundancy = v->nr_redundant,
- .idx = s->key.k.p.offset,
- };
-
- __extent_entry_insert(n,
- (union bch_extent_entry *) ec_ptr,
- (union bch_extent_entry *) &stripe_ptr);
-
- ret = bch2_trans_update(trans, &iter, n, 0);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
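-/*
- * Resulting key layout (illustrative): after the update above, an extent
- * that carried a plain pointer into the stripe's block now carries that
- * pointer plus a stripe_ptr entry, roughly:
- *
- *   before: ptr: dev 2 offset ...
- *   after:  ptr: dev 2 offset ...
- *           stripe_ptr: idx <stripe> block <n> redundancy <nr_redundant>
- *
- * Pointers to other devices are dropped first, since the stripe now
- * provides the redundancy.
- */
-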
-static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
- unsigned block)
-{
- struct bch_fs *c = trans->c;
- struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
- struct bch_extent_ptr ptr = v->ptrs[block];
- int ret = 0;
-
- struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
- if (!ca)
- return -BCH_ERR_ENOENT_dev_not_found;
-
- struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
-
- struct bkey_buf last_flushed;
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers,
- bucket_pos_to_bp_start(ca, bucket_pos),
- bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k,
- NULL, NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc, ({
- if (bkey_ge(bp_k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket_pos), 0)))
- break;
-
- if (bp_k.k->type != KEY_TYPE_backpointer)
- continue;
-
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
- if (bp.v->btree_id == BTREE_ID_stripes)
- continue;
-
- ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s,
- bp, &last_flushed);
- }));
-
- bch2_bkey_buf_exit(&last_flushed, c);
- bch2_dev_put(ca);
- return ret;
-}
-
-static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
- unsigned nr_data = v->nr_blocks - v->nr_redundant;
-
- int ret = bch2_btree_write_buffer_flush_sync(trans);
- if (ret)
- goto err;
-
- for (unsigned i = 0; i < nr_data; i++) {
- ret = ec_stripe_update_bucket(trans, s, i);
- if (ret)
- break;
- }
-err:
- bch2_trans_put(trans);
- return ret;
-}
-
-static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
- struct ec_stripe_new *s,
- unsigned block,
- struct open_bucket *ob)
-{
- struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE);
- if (!ca) {
- s->err = -BCH_ERR_erofs_no_writes;
- return;
- }
-
- unsigned offset = ca->mi.bucket_size - ob->sectors_free;
- memset(s->new_stripe.data[block] + (offset << 9),
- 0,
- ob->sectors_free << 9);
-
- int ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
- ob->bucket * ca->mi.bucket_size + offset,
- ob->sectors_free,
- GFP_KERNEL, 0);
-
- percpu_ref_put(&ca->io_ref);
-
- if (ret)
- s->err = ret;
-}
-
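-/*
- * Worked example (illustrative): with a 1024-sector bucket and
- * ob->sectors_free = 256, writes covered sectors [0, 768), so the tail is
- * zeroed both in the in-memory stripe buffer (from byte offset 768 << 9)
- * and on disk via blkdev_issue_zeroout() starting at bucket start + 768.
- */
-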
-void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
-{
- if (s->idx)
- bch2_stripe_close(c, s);
- kfree(s);
-}
-
-/*
- * data buckets of new stripe all written: create the stripe
- */
-static void ec_stripe_create(struct ec_stripe_new *s)
-{
- struct bch_fs *c = s->c;
- struct open_bucket *ob;
- struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
- unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
- int ret;
-
- BUG_ON(s->h->s == s);
-
- closure_sync(&s->iodone);
-
- if (!s->err) {
- for (i = 0; i < nr_data; i++)
- if (s->blocks[i]) {
- ob = c->open_buckets + s->blocks[i];
-
- if (ob->sectors_free)
- zero_out_rest_of_ec_bucket(c, s, i, ob);
- }
- }
-
- if (s->err) {
- if (!bch2_err_matches(s->err, EROFS))
- bch_err(c, "error creating stripe: error writing data buckets");
- ret = s->err;
- goto err;
- }
-
- if (s->have_existing_stripe) {
- ec_validate_checksums(c, &s->existing_stripe);
-
- if (ec_do_recov(c, &s->existing_stripe)) {
- bch_err(c, "error creating stripe: error reading existing stripe");
- ret = -BCH_ERR_ec_block_read;
- goto err;
- }
-
- for (i = 0; i < nr_data; i++)
- if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i))
- swap(s->new_stripe.data[i],
- s->existing_stripe.data[i]);
-
- ec_stripe_buf_exit(&s->existing_stripe);
- }
-
- BUG_ON(!s->allocated);
- BUG_ON(!s->idx);
-
- ec_generate_ec(&s->new_stripe);
-
- ec_generate_checksums(&s->new_stripe);
-
- /* write p/q: */
- for (i = nr_data; i < v->nr_blocks; i++)
- ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
- closure_sync(&s->iodone);
-
- if (ec_nr_failed(&s->new_stripe)) {
- bch_err(c, "error creating stripe: error writing redundancy buckets");
- ret = -BCH_ERR_ec_block_write;
- goto err;
- }
-
- ret = bch2_trans_commit_do(c, &s->res, NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc,
- ec_stripe_key_update(trans,
- s->have_existing_stripe
- ? bkey_i_to_stripe(&s->existing_stripe.key)
- : NULL,
- bkey_i_to_stripe(&s->new_stripe.key)));
- bch_err_msg(c, ret, "creating stripe key");
- if (ret)
- goto err;
-
- ret = ec_stripe_update_extents(c, &s->new_stripe);
- bch_err_msg(c, ret, "error updating extents");
- if (ret)
- goto err;
-err:
- trace_stripe_create(c, s->idx, ret);
-
- bch2_disk_reservation_put(c, &s->res);
-
- for (i = 0; i < v->nr_blocks; i++)
- if (s->blocks[i]) {
- ob = c->open_buckets + s->blocks[i];
-
- if (i < nr_data) {
- ob->ec = NULL;
- __bch2_open_bucket_put(c, ob);
- } else {
- bch2_open_bucket_put(c, ob);
- }
- }
-
- mutex_lock(&c->ec_stripe_new_lock);
- list_del(&s->list);
- mutex_unlock(&c->ec_stripe_new_lock);
- wake_up(&c->ec_stripe_new_wait);
-
- ec_stripe_buf_exit(&s->existing_stripe);
- ec_stripe_buf_exit(&s->new_stripe);
- closure_debug_destroy(&s->iodone);
-
- ec_stripe_new_put(c, s, STRIPE_REF_stripe);
-}
-
-static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
-{
- struct ec_stripe_new *s;
-
- mutex_lock(&c->ec_stripe_new_lock);
- list_for_each_entry(s, &c->ec_stripe_new_list, list)
- if (!atomic_read(&s->ref[STRIPE_REF_io]))
- goto out;
- s = NULL;
-out:
- mutex_unlock(&c->ec_stripe_new_lock);
-
- return s;
-}
-
-static void ec_stripe_create_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work,
- struct bch_fs, ec_stripe_create_work);
- struct ec_stripe_new *s;
-
- while ((s = get_pending_stripe(c)))
- ec_stripe_create(s);
-
- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
-}
-
-void bch2_ec_do_stripe_creates(struct bch_fs *c)
-{
- bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create);
-
- if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
-}
-
-static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
-{
- struct ec_stripe_new *s = h->s;
-
- lockdep_assert_held(&h->lock);
-
- BUG_ON(!s->allocated && !s->err);
-
- h->s = NULL;
- s->pending = true;
-
- mutex_lock(&c->ec_stripe_new_lock);
- list_add(&s->list, &c->ec_stripe_new_list);
- mutex_unlock(&c->ec_stripe_new_lock);
-
- ec_stripe_new_put(c, s, STRIPE_REF_io);
-}
-
-static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err)
-{
- h->s->err = err;
- ec_stripe_new_set_pending(c, h);
-}
-
-void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err)
-{
- struct ec_stripe_new *s = ob->ec;
-
- s->err = err;
-}
-
-void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
-{
- struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
- if (!ob)
- return NULL;
-
- BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);
-
- struct bch_dev *ca = ob_dev(c, ob);
- unsigned offset = ca->mi.bucket_size - ob->sectors_free;
-
- return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
-}
-
-static int unsigned_cmp(const void *_l, const void *_r)
-{
- unsigned l = *((const unsigned *) _l);
- unsigned r = *((const unsigned *) _r);
-
- return cmp_int(l, r);
-}
-
-/* pick most common bucket size: */
-static unsigned pick_blocksize(struct bch_fs *c,
- struct bch_devs_mask *devs)
-{
- unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX];
- struct {
- unsigned nr, size;
- } cur = { 0, 0 }, best = { 0, 0 };
-
- for_each_member_device_rcu(c, ca, devs)
- sizes[nr++] = ca->mi.bucket_size;
-
- sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
-
- for (unsigned i = 0; i < nr; i++) {
- if (sizes[i] != cur.size) {
- if (cur.nr > best.nr)
- best = cur;
-
- cur.nr = 0;
- cur.size = sizes[i];
- }
-
- cur.nr++;
- }
-
- if (cur.nr > best.nr)
- best = cur;
-
- return best.size;
-}
-
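-/*
- * Example (illustrative): given member bucket sizes { 128, 256, 128, 512 },
- * the sorted run-length scan above yields best = { .nr = 2, .size = 128 },
- * so the stripe head uses a 128-sector blocksize; devices with other bucket
- * sizes are later excluded from nr_active_devs.
- */
-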
-static bool may_create_new_stripe(struct bch_fs *c)
-{
- return false;
-}
-
-static void ec_stripe_key_init(struct bch_fs *c,
- struct bkey_i *k,
- unsigned nr_data,
- unsigned nr_parity,
- unsigned stripe_size,
- unsigned disk_label)
-{
- struct bkey_i_stripe *s = bkey_stripe_init(k);
- unsigned u64s;
-
- s->v.sectors = cpu_to_le16(stripe_size);
- s->v.algorithm = 0;
- s->v.nr_blocks = nr_data + nr_parity;
- s->v.nr_redundant = nr_parity;
- s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9);
- s->v.csum_type = BCH_CSUM_crc32c;
- s->v.disk_label = disk_label;
-
- while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
- BUG_ON(1 << s->v.csum_granularity_bits >=
- le16_to_cpu(s->v.sectors) ||
- s->v.csum_granularity_bits == U8_MAX);
- s->v.csum_granularity_bits++;
- }
-
- set_bkey_val_u64s(&s->k, u64s);
-}
-
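-/*
- * Worked example (illustrative): with opts.encoded_extent_max = 64k, the
- * initial granularity is ilog2(65536 >> 9) = 7, i.e. one checksum per 128
- * sectors. If the stripe value still exceeds BKEY_VAL_U64s_MAX (many blocks
- * and/or small granularity), the loop above doubles the granularity until
- * the checksums fit in the key.
- */
-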
-static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
-{
- struct ec_stripe_new *s;
-
- lockdep_assert_held(&h->lock);
-
- s = kzalloc(sizeof(*s), GFP_KERNEL);
- if (!s)
- return NULL;
-
- mutex_init(&s->lock);
- closure_init(&s->iodone, NULL);
- atomic_set(&s->ref[STRIPE_REF_stripe], 1);
- atomic_set(&s->ref[STRIPE_REF_io], 1);
- s->c = c;
- s->h = h;
- s->nr_data = min_t(unsigned, h->nr_active_devs,
- BCH_BKEY_PTRS_MAX) - h->redundancy;
- s->nr_parity = h->redundancy;
-
- ec_stripe_key_init(c, &s->new_stripe.key,
- s->nr_data, s->nr_parity,
- h->blocksize, h->disk_label);
- return s;
-}
-
-static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h)
-{
- struct bch_devs_mask devs = h->devs;
-
- rcu_read_lock();
- h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
- ? group_to_target(h->disk_label - 1)
- : 0);
- unsigned nr_devs = dev_mask_nr(&h->devs);
-
- for_each_member_device_rcu(c, ca, &h->devs)
- if (!ca->mi.durability)
- __clear_bit(ca->dev_idx, h->devs.d);
- unsigned nr_devs_with_durability = dev_mask_nr(&h->devs);
-
- h->blocksize = pick_blocksize(c, &h->devs);
-
- h->nr_active_devs = 0;
- for_each_member_device_rcu(c, ca, &h->devs)
- if (ca->mi.bucket_size == h->blocksize)
- h->nr_active_devs++;
-
- rcu_read_unlock();
-
- /*
- * If we only have redundancy + 1 devices, we're better off with just
- * replication:
- */
- h->insufficient_devs = h->nr_active_devs < h->redundancy + 2;
-
- if (h->insufficient_devs) {
- const char *err;
-
- if (nr_devs < h->redundancy + 2)
- err = NULL;
- else if (nr_devs_with_durability < h->redundancy + 2)
- err = "cannot use durability=0 devices";
- else
- err = "mismatched bucket sizes";
-
- if (err)
- bch_err(c, "insufficient devices available to create stripe (have %u, need %u): %s",
- h->nr_active_devs, h->redundancy + 2, err);
- }
-
- struct bch_devs_mask devs_leaving;
- bitmap_andnot(devs_leaving.d, devs.d, h->devs.d, BCH_SB_MEMBERS_MAX);
-
- if (h->s && !h->s->allocated && dev_mask_nr(&devs_leaving))
- ec_stripe_new_cancel(c, h, -EINTR);
-
- h->rw_devs_change_count = c->rw_devs_change_count;
-}
-
-static struct ec_stripe_head *
-ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label,
- unsigned algo, unsigned redundancy,
- enum bch_watermark watermark)
-{
- struct ec_stripe_head *h;
-
- h = kzalloc(sizeof(*h), GFP_KERNEL);
- if (!h)
- return NULL;
-
- mutex_init(&h->lock);
- BUG_ON(!mutex_trylock(&h->lock));
-
- h->disk_label = disk_label;
- h->algo = algo;
- h->redundancy = redundancy;
- h->watermark = watermark;
-
- list_add(&h->list, &c->ec_stripe_head_list);
- return h;
-}
-
-void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
-{
- if (h->s &&
- h->s->allocated &&
- bitmap_weight(h->s->blocks_allocated,
- h->s->nr_data) == h->s->nr_data)
- ec_stripe_new_set_pending(c, h);
-
- mutex_unlock(&h->lock);
-}
-
-static struct ec_stripe_head *
-__bch2_ec_stripe_head_get(struct btree_trans *trans,
- unsigned disk_label,
- unsigned algo,
- unsigned redundancy,
- enum bch_watermark watermark)
-{
- struct bch_fs *c = trans->c;
- struct ec_stripe_head *h;
- int ret;
-
- if (!redundancy)
- return NULL;
-
- ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock);
- if (ret)
- return ERR_PTR(ret);
-
- if (test_bit(BCH_FS_going_ro, &c->flags)) {
- h = ERR_PTR(-BCH_ERR_erofs_no_writes);
- goto err;
- }
-
- list_for_each_entry(h, &c->ec_stripe_head_list, list)
- if (h->disk_label == disk_label &&
- h->algo == algo &&
- h->redundancy == redundancy &&
- h->watermark == watermark) {
- ret = bch2_trans_mutex_lock(trans, &h->lock);
- if (ret) {
- h = ERR_PTR(ret);
- goto err;
- }
- goto found;
- }
-
- h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark);
- if (!h) {
- h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc);
- goto err;
- }
-found:
- if (h->rw_devs_change_count != c->rw_devs_change_count)
- ec_stripe_head_devs_update(c, h);
-
- if (h->insufficient_devs) {
- mutex_unlock(&h->lock);
- h = NULL;
- }
-err:
- mutex_unlock(&c->ec_stripe_head_lock);
- return h;
-}
-
-static int new_stripe_alloc_buckets(struct btree_trans *trans,
- struct ec_stripe_head *h, struct ec_stripe_new *s,
- enum bch_watermark watermark, struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- struct bch_devs_mask devs = h->devs;
- struct open_bucket *ob;
- struct open_buckets buckets;
- struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
- unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
- bool have_cache = true;
- int ret = 0;
-
- BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity);
- BUG_ON(v->nr_redundant != s->nr_parity);
-
- /* We bypass the sector allocator which normally does this: */
- bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
-
- for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) {
- /*
- * Note: we don't yet repair invalid blocks (failed/removed
- * devices) when reusing stripes - we still need a codepath to
- * walk backpointers and update all extents that point to that
- * block when updating the stripe
- */
- if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID)
- __clear_bit(v->ptrs[i].dev, devs.d);
-
- if (i < s->nr_data)
- nr_have_data++;
- else
- nr_have_parity++;
- }
-
- BUG_ON(nr_have_data > s->nr_data);
- BUG_ON(nr_have_parity > s->nr_parity);
-
- buckets.nr = 0;
- if (nr_have_parity < s->nr_parity) {
- ret = bch2_bucket_alloc_set_trans(trans, &buckets,
- &h->parity_stripe,
- &devs,
- s->nr_parity,
- &nr_have_parity,
- &have_cache, 0,
- BCH_DATA_parity,
- watermark,
- cl);
-
- open_bucket_for_each(c, &buckets, ob, i) {
- j = find_next_zero_bit(s->blocks_gotten,
- s->nr_data + s->nr_parity,
- s->nr_data);
- BUG_ON(j >= s->nr_data + s->nr_parity);
-
- s->blocks[j] = buckets.v[i];
- v->ptrs[j] = bch2_ob_ptr(c, ob);
- __set_bit(j, s->blocks_gotten);
- }
-
- if (ret)
- return ret;
- }
-
- buckets.nr = 0;
- if (nr_have_data < s->nr_data) {
- ret = bch2_bucket_alloc_set_trans(trans, &buckets,
- &h->block_stripe,
- &devs,
- s->nr_data,
- &nr_have_data,
- &have_cache, 0,
- BCH_DATA_user,
- watermark,
- cl);
-
- open_bucket_for_each(c, &buckets, ob, i) {
- j = find_next_zero_bit(s->blocks_gotten,
- s->nr_data, 0);
- BUG_ON(j >= s->nr_data);
-
- s->blocks[j] = buckets.v[i];
- v->ptrs[j] = bch2_ob_ptr(c, ob);
- __set_bit(j, s->blocks_gotten);
- }
-
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static int __get_existing_stripe(struct btree_trans *trans,
- struct ec_stripe_head *head,
- struct ec_stripe_buf *stripe,
- u64 idx)
-{
- struct bch_fs *c = trans->c;
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
- BTREE_ID_stripes, POS(0, idx), 0);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- /* We expect write buffer races here */
- if (k.k->type != KEY_TYPE_stripe)
- goto out;
-
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
- if (stripe_lru_pos(s.v) <= 1)
- goto out;
-
- if (s.v->disk_label == head->disk_label &&
- s.v->algorithm == head->algo &&
- s.v->nr_redundant == head->redundancy &&
- le16_to_cpu(s.v->sectors) == head->blocksize &&
- bch2_try_open_stripe(c, head->s, idx)) {
- bkey_reassemble(&stripe->key, k);
- ret = 1;
- }
-out:
- bch2_set_btree_iter_dontneed(&iter);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s)
-{
- struct bch_stripe *new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
- struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v;
- unsigned i;
-
- BUG_ON(existing_v->nr_redundant != s->nr_parity);
- s->nr_data = existing_v->nr_blocks -
- existing_v->nr_redundant;
-
- int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors));
- if (ret) {
- bch2_stripe_close(c, s);
- return ret;
- }
-
- BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
-
- /*
- * Free buckets we initially allocated - they might conflict with
- * blocks from the stripe we're reusing:
- */
- for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) {
- bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]);
- s->blocks[i] = 0;
- }
- memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten));
- memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated));
-
- for (unsigned i = 0; i < existing_v->nr_blocks; i++) {
- if (stripe_blockcount_get(existing_v, i)) {
- __set_bit(i, s->blocks_gotten);
- __set_bit(i, s->blocks_allocated);
- }
-
- ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone);
- }
-
- bkey_copy(&s->new_stripe.key, &s->existing_stripe.key);
- s->have_existing_stripe = true;
-
- return 0;
-}
-
-static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h,
- struct ec_stripe_new *s)
-{
- struct bch_fs *c = trans->c;
-
- /*
- * If we can't allocate a new stripe, and there's no stripes with empty
- * blocks for us to reuse, that means we have to wait on copygc:
- */
- if (may_create_new_stripe(c))
- return -1;
-
- struct btree_iter lru_iter;
- struct bkey_s_c lru_k;
- int ret = 0;
-
- for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru,
- lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0),
- lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX),
- 0, lru_k, ret) {
- ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &lru_iter);
- if (!ret)
- ret = -BCH_ERR_stripe_alloc_blocked;
- if (ret == 1)
- ret = 0;
- if (ret)
- return ret;
-
- return init_new_stripe_from_existing(c, s);
-}
-
-static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h,
- struct ec_stripe_new *s)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bpos min_pos = POS(0, 1);
- struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
- int ret;
-
- if (!s->res.sectors) {
- ret = bch2_disk_reservation_get(c, &s->res,
- h->blocksize,
- s->nr_parity,
- BCH_DISK_RESERVATION_NOFAIL);
- if (ret)
- return ret;
- }
-
- /*
- * Allocate stripe slot
- * XXX: we're going to need a bitrange btree of free stripes
- */
- for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
- BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
- if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
- if (start_pos.offset) {
- start_pos = min_pos;
- bch2_btree_iter_set_pos(&iter, start_pos);
- continue;
- }
-
- ret = -BCH_ERR_ENOSPC_stripe_create;
- break;
- }
-
- if (bkey_deleted(k.k) &&
- bch2_try_open_stripe(c, s, k.k->p.offset))
- break;
- }
-
- c->ec_stripe_hint = iter.pos.offset;
-
- if (ret)
- goto err;
-
- ret = ec_stripe_mem_alloc(trans, &iter);
- if (ret) {
- bch2_stripe_close(c, s);
- goto err;
- }
-
- s->new_stripe.key.k.p = iter.pos;
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-err:
- bch2_disk_reservation_put(c, &s->res);
- goto out;
-}
-
-struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
- unsigned target,
- unsigned algo,
- unsigned redundancy,
- enum bch_watermark watermark,
- struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- struct ec_stripe_head *h;
- bool waiting = false;
- unsigned disk_label = 0;
- struct target t = target_decode(target);
- int ret;
-
- if (t.type == TARGET_GROUP) {
- if (t.group > U8_MAX) {
- bch_err(c, "cannot create a stripe when disk_label > U8_MAX");
- return NULL;
- }
- disk_label = t.group + 1; /* 0 == no label */
- }
-
- h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark);
- if (IS_ERR_OR_NULL(h))
- return h;
-
- if (!h->s) {
- h->s = ec_new_stripe_alloc(c, h);
- if (!h->s) {
- ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
- bch_err(c, "failed to allocate new stripe");
- goto err;
- }
-
- h->nr_created++;
- }
-
- struct ec_stripe_new *s = h->s;
-
- if (s->allocated)
- goto allocated;
-
- if (s->have_existing_stripe)
- goto alloc_existing;
-
- /* First, try to allocate a full stripe: */
- ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?:
- __bch2_ec_stripe_head_reserve(trans, h, s);
- if (!ret)
- goto allocate_buf;
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
- bch2_err_matches(ret, ENOMEM))
- goto err;
-
- /*
- * Not enough buckets available for a full stripe: we must reuse an
- * existing stripe:
- */
- while (1) {
- ret = __bch2_ec_stripe_head_reuse(trans, h, s);
- if (!ret)
- break;
- if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
- goto err;
-
- if (watermark == BCH_WATERMARK_copygc) {
- ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?:
- __bch2_ec_stripe_head_reserve(trans, h, s);
- if (ret)
- goto err;
- goto allocate_buf;
- }
-
- /* XXX freelist_wait? */
- closure_wait(&c->freelist_wait, cl);
- waiting = true;
- }
-
- if (waiting)
- closure_wake_up(&c->freelist_wait);
-alloc_existing:
- /*
- * Retry allocating buckets, with the watermark for this
- * particular write:
- */
- ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl);
- if (ret)
- goto err;
-
-allocate_buf:
- ret = ec_stripe_buf_init(&s->new_stripe, 0, h->blocksize);
- if (ret)
- goto err;
-
- s->allocated = true;
-allocated:
- BUG_ON(!s->idx);
- BUG_ON(!s->new_stripe.data[0]);
- BUG_ON(trans->restarted);
- return h;
-err:
- bch2_ec_stripe_head_put(c, h);
- return ERR_PTR(ret);
-}
-
-/* device removal */
-
-static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a)
-{
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
-
- if (!a->stripe)
- return 0;
-
- if (a->stripe_sectors) {
- bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data");
- return -BCH_ERR_invalidate_stripe_to_dev;
- }
-
- struct btree_iter iter;
- struct bkey_i_stripe *s =
- bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
- BTREE_ITER_slots, stripe);
- int ret = PTR_ERR_OR_ZERO(s);
- if (ret)
- return ret;
-
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_replicas,
- };
-
- s64 sectors = 0;
- for (unsigned i = 0; i < s->v.nr_blocks; i++)
- sectors -= stripe_blockcount_get(&s->v, i);
-
- bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
- acc.replicas.data_type = BCH_DATA_user;
- ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
- if (ret)
- goto err;
-
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i));
- bkey_for_each_ptr(ptrs, ptr)
- if (ptr->dev == k_a.k->p.inode)
- ptr->dev = BCH_SB_MEMBER_INVALID;
-
- sectors = -sectors;
-
- bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
- acc.replicas.data_type = BCH_DATA_user;
- ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
- if (ret)
- goto err;
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx)
-{
- return bch2_trans_run(c,
- for_each_btree_key_max_commit(trans, iter,
- BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX),
- BTREE_ITER_intent, k,
- NULL, NULL, 0, ({
- bch2_invalidate_stripe_to_dev(trans, k);
- })));
-}
-
-/* startup/shutdown */
-
-static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
-{
- struct ec_stripe_head *h;
- struct open_bucket *ob;
- unsigned i;
-
- mutex_lock(&c->ec_stripe_head_lock);
- list_for_each_entry(h, &c->ec_stripe_head_list, list) {
- mutex_lock(&h->lock);
- if (!h->s)
- goto unlock;
-
- if (!ca)
- goto found;
-
- for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) {
- if (!h->s->blocks[i])
- continue;
-
- ob = c->open_buckets + h->s->blocks[i];
- if (ob->dev == ca->dev_idx)
- goto found;
- }
- goto unlock;
-found:
- ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes);
-unlock:
- mutex_unlock(&h->lock);
- }
- mutex_unlock(&c->ec_stripe_head_lock);
-}
-
-void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
-{
- __bch2_ec_stop(c, ca);
-}
-
-void bch2_fs_ec_stop(struct bch_fs *c)
-{
- __bch2_ec_stop(c, NULL);
-}
-
-static bool bch2_fs_ec_flush_done(struct bch_fs *c)
-{
- bool ret;
-
- mutex_lock(&c->ec_stripe_new_lock);
- ret = list_empty(&c->ec_stripe_new_list);
- mutex_unlock(&c->ec_stripe_new_lock);
-
- return ret;
-}
-
-void bch2_fs_ec_flush(struct bch_fs *c)
-{
- wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
-}
-
-int bch2_stripes_read(struct bch_fs *c)
-{
- return 0;
-}
-
-static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
- struct ec_stripe_new *s)
-{
- prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs",
- s->idx, s->nr_data, s->nr_parity,
- bitmap_weight(s->blocks_allocated, s->nr_data),
- atomic_read(&s->ref[STRIPE_REF_io]),
- atomic_read(&s->ref[STRIPE_REF_stripe]),
- bch2_watermarks[s->h->watermark]);
-
- struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
- unsigned i;
- for_each_set_bit(i, s->blocks_gotten, v->nr_blocks)
- prt_printf(out, " %u", s->blocks[i]);
- prt_newline(out);
- bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key));
- prt_newline(out);
-}
-
-void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct ec_stripe_head *h;
- struct ec_stripe_new *s;
-
- mutex_lock(&c->ec_stripe_head_lock);
- list_for_each_entry(h, &c->ec_stripe_head_list, list) {
- prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n",
- h->disk_label, h->algo, h->redundancy,
- bch2_watermarks[h->watermark],
- h->nr_created);
-
- if (h->s)
- bch2_new_stripe_to_text(out, c, h->s);
- }
- mutex_unlock(&c->ec_stripe_head_lock);
-
- prt_printf(out, "in flight:\n");
-
- mutex_lock(&c->ec_stripe_new_lock);
- list_for_each_entry(s, &c->ec_stripe_new_list, list)
- bch2_new_stripe_to_text(out, c, s);
- mutex_unlock(&c->ec_stripe_new_lock);
-}
-
-void bch2_fs_ec_exit(struct bch_fs *c)
-{
- struct ec_stripe_head *h;
- unsigned i;
-
- while (1) {
- mutex_lock(&c->ec_stripe_head_lock);
- h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list);
- mutex_unlock(&c->ec_stripe_head_lock);
-
- if (!h)
- break;
-
- if (h->s) {
- for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++)
- BUG_ON(h->s->blocks[i]);
-
- kfree(h->s);
- }
- kfree(h);
- }
-
- BUG_ON(!list_empty(&c->ec_stripe_new_list));
-
- bioset_exit(&c->ec_bioset);
-}
-
-void bch2_fs_ec_init_early(struct bch_fs *c)
-{
- spin_lock_init(&c->ec_stripes_new_lock);
-
- INIT_LIST_HEAD(&c->ec_stripe_head_list);
- mutex_init(&c->ec_stripe_head_lock);
-
- INIT_LIST_HEAD(&c->ec_stripe_new_list);
- mutex_init(&c->ec_stripe_new_lock);
- init_waitqueue_head(&c->ec_stripe_new_wait);
-
- INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
- INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
-}
-
-int bch2_fs_ec_init(struct bch_fs *c)
-{
- return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
- BIOSET_NEED_BVECS);
-}
-
-static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans,
- struct bkey_s_c k,
- struct bkey_buf *last_flushed)
-{
- if (k.k->type != KEY_TYPE_stripe)
- return 0;
-
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
- u64 lru_idx = stripe_lru_pos(s.v);
- if (lru_idx) {
- int ret = bch2_lru_check_set(trans, BCH_LRU_STRIPE_FRAGMENTATION,
- k.k->p.offset, lru_idx, k, last_flushed);
- if (ret)
- return ret;
- }
- return 0;
-}
-
-int bch2_check_stripe_to_lru_refs(struct bch_fs *c)
-{
- struct bkey_buf last_flushed;
-
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_stripes,
- POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_stripe_to_lru_ref(trans, k, &last_flushed)));
-
- bch2_bkey_buf_exit(&last_flushed, c);
- bch_err_fn(c, ret);
- return ret;
-}
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
deleted file mode 100644
index 62d27e04d763..000000000000
--- a/fs/bcachefs/ec.h
+++ /dev/null
@@ -1,305 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EC_H
-#define _BCACHEFS_EC_H
-
-#include "ec_types.h"
-#include "buckets_types.h"
-#include "extents_types.h"
-
-int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
-int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_stripe ((struct bkey_ops) { \
- .key_validate = bch2_stripe_validate, \
- .val_to_text = bch2_stripe_to_text, \
- .swab = bch2_ptr_swab, \
- .trigger = bch2_trigger_stripe, \
- .min_val_size = 8, \
-})
-
-static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
-{
- return DIV_ROUND_UP(le16_to_cpu(s->sectors),
- 1 << s->csum_granularity_bits);
-}
-
-static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
- unsigned dev, unsigned csum_idx)
-{
- EBUG_ON(s->csum_type >= BCH_CSUM_NR);
-
- unsigned csum_bytes = bch_crc_bytes[s->csum_type];
-
- return sizeof(struct bch_stripe) +
- sizeof(struct bch_extent_ptr) * s->nr_blocks +
- (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
-}
-
-static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s,
- unsigned idx)
-{
- return stripe_csum_offset(s, s->nr_blocks, 0) +
- sizeof(u16) * idx;
-}
-
-static inline unsigned stripe_blockcount_get(const struct bch_stripe *s,
- unsigned idx)
-{
- return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx));
-}
-
-static inline void stripe_blockcount_set(struct bch_stripe *s,
- unsigned idx, unsigned v)
-{
- __le16 *p = (void *) s + stripe_blockcount_offset(s, idx);
-
- *p = cpu_to_le16(v);
-}
-
-static inline unsigned stripe_val_u64s(const struct bch_stripe *s)
-{
- return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks),
- sizeof(u64));
-}
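-
-/*
- * Worked example (editorial, not part of the original source; assumes
- * sizeof(struct bch_stripe) == 8 and 8-byte bch_extent_ptrs): a stripe with
- * nr_blocks = 6, sectors = 128, csum_granularity_bits = 6 (64-sector csum
- * blocks) and a 4-byte checksum type has stripe_csums_per_device() == 2,
- * checksums starting at byte 8 + 6*8 = 56, and the per-block u16 sector
- * counts starting at byte 56 + 6*2*4 = 104.
- */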
-
-static inline void *stripe_csum(struct bch_stripe *s,
- unsigned block, unsigned csum_idx)
-{
- EBUG_ON(block >= s->nr_blocks);
- EBUG_ON(csum_idx >= stripe_csums_per_device(s));
-
- return (void *) s + stripe_csum_offset(s, block, csum_idx);
-}
-
-static inline struct bch_csum stripe_csum_get(struct bch_stripe *s,
- unsigned block, unsigned csum_idx)
-{
- struct bch_csum csum = { 0 };
-
- memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]);
- return csum;
-}
-
-static inline void stripe_csum_set(struct bch_stripe *s,
- unsigned block, unsigned csum_idx,
- struct bch_csum csum)
-{
- memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]);
-}
-
-#define STRIPE_LRU_POS_EMPTY 1
-
-static inline u64 stripe_lru_pos(const struct bch_stripe *s)
-{
- if (!s)
- return 0;
-
- unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0;
-
- for (unsigned i = 0; i < nr_data; i++)
- blocks_empty += !stripe_blockcount_get(s, i);
-
- /* Will be picked up by the stripe_delete worker */
- if (blocks_empty == nr_data)
- return STRIPE_LRU_POS_EMPTY;
-
- if (!blocks_empty)
- return 0;
-
- /* invert: more blocks empty = reuse first */
- return LRU_TIME_MAX - blocks_empty;
-}
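-
-/*
- * Worked example (editorial, not part of the original source): with
- * nr_data = 4, a stripe with two empty data blocks maps to LRU_TIME_MAX - 2
- * and one with three empty blocks to LRU_TIME_MAX - 3, so the emptier stripe
- * sorts earlier in the fragmentation LRU and is reused first.
- */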
-
-static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr,
- const struct bch_extent_ptr *data_ptr,
- unsigned sectors)
-{
- return (data_ptr->dev == stripe_ptr->dev ||
- data_ptr->dev == BCH_SB_MEMBER_INVALID ||
- stripe_ptr->dev == BCH_SB_MEMBER_INVALID) &&
- data_ptr->gen == stripe_ptr->gen &&
- data_ptr->offset >= stripe_ptr->offset &&
- data_ptr->offset < stripe_ptr->offset + sectors;
-}
-
-static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s,
- struct extent_ptr_decoded p)
-{
- unsigned nr_data = s->nr_blocks - s->nr_redundant;
-
- BUG_ON(!p.has_ec);
-
- if (p.ec.block >= nr_data)
- return false;
-
- return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr,
- le16_to_cpu(s->sectors));
-}
-
-static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m,
- struct extent_ptr_decoded p)
-{
- unsigned nr_data = m->nr_blocks - m->nr_redundant;
-
- BUG_ON(!p.has_ec);
-
- if (p.ec.block >= nr_data)
- return false;
-
- return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr,
- m->sectors);
-}
-
-static inline void gc_stripe_unlock(struct gc_stripe *s)
-{
- BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
-
- clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &s->lock);
- wake_up_bit((void *) &s->lock, BUCKET_LOCK_BITNR);
-}
-
-static inline void gc_stripe_lock(struct gc_stripe *s)
-{
- wait_on_bit_lock((void *) &s->lock, BUCKET_LOCK_BITNR,
- TASK_UNINTERRUPTIBLE);
-}
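-
-/*
- * Editorial note: these two helpers implement a bit spinlock on the one-byte
- * gc_stripe->lock field using bit BUCKET_LOCK_BITNR; the BUILD_BUG_ON in
- * gc_stripe_unlock() verifies at compile time that the bit actually lands
- * within the low byte.
- */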
-
-struct bch_read_bio;
-
-struct ec_stripe_buf {
- /* might not be buffering the entire stripe: */
- unsigned offset;
- unsigned size;
- unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
-
- void *data[BCH_BKEY_PTRS_MAX];
-
- __BKEY_PADDED(key, 255);
-};
-
-struct ec_stripe_head;
-
-enum ec_stripe_ref {
- STRIPE_REF_io,
- STRIPE_REF_stripe,
- STRIPE_REF_NR
-};
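-
-/*
- * Editorial note: a stripe being created carries two refcounts. STRIPE_REF_io
- * pins it while block writes are outstanding (dropping the last io ref kicks
- * off stripe creation), while STRIPE_REF_stripe is the lifetime ref that
- * frees the structure when it reaches zero; see ec_stripe_new_put() below.
- */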
-
-struct ec_stripe_new {
- struct bch_fs *c;
- struct ec_stripe_head *h;
- struct mutex lock;
- struct list_head list;
-
- struct hlist_node hash;
- u64 idx;
-
- struct closure iodone;
-
- atomic_t ref[STRIPE_REF_NR];
-
- int err;
-
- u8 nr_data;
- u8 nr_parity;
- bool allocated;
- bool pending;
- bool have_existing_stripe;
-
- unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
- unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
- open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX];
- struct disk_reservation res;
-
- struct ec_stripe_buf new_stripe;
- struct ec_stripe_buf existing_stripe;
-};
-
-struct ec_stripe_head {
- struct list_head list;
- struct mutex lock;
-
- unsigned disk_label;
- unsigned algo;
- unsigned redundancy;
- enum bch_watermark watermark;
- bool insufficient_devs;
-
- unsigned long rw_devs_change_count;
-
- u64 nr_created;
-
- struct bch_devs_mask devs;
- unsigned nr_active_devs;
-
- unsigned blocksize;
-
- struct dev_stripe_state block_stripe;
- struct dev_stripe_state parity_stripe;
-
- struct ec_stripe_new *s;
-};
-
-int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c);
-
-void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
-
-void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int);
-
-int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
-
-void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
-struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
- unsigned, unsigned, unsigned,
- enum bch_watermark, struct closure *);
-
-void bch2_do_stripe_deletes(struct bch_fs *);
-void bch2_ec_do_stripe_creates(struct bch_fs *);
-void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *);
-
-static inline void ec_stripe_new_get(struct ec_stripe_new *s,
- enum ec_stripe_ref ref)
-{
- atomic_inc(&s->ref[ref]);
-}
-
-static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
- enum ec_stripe_ref ref)
-{
- BUG_ON(atomic_read(&s->ref[ref]) <= 0);
-
- if (atomic_dec_and_test(&s->ref[ref]))
- switch (ref) {
- case STRIPE_REF_stripe:
- bch2_ec_stripe_new_free(c, s);
- break;
- case STRIPE_REF_io:
- bch2_ec_do_stripe_creates(c);
- break;
- default:
- BUG();
- }
-}
-
-int bch2_dev_remove_stripes(struct bch_fs *, unsigned);
-
-void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
-void bch2_fs_ec_stop(struct bch_fs *);
-void bch2_fs_ec_flush(struct bch_fs *);
-
-int bch2_stripes_read(struct bch_fs *);
-
-void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_ec_exit(struct bch_fs *);
-void bch2_fs_ec_init_early(struct bch_fs *);
-int bch2_fs_ec_init(struct bch_fs *);
-
-int bch2_check_stripe_to_lru_refs(struct bch_fs *);
-
-#endif /* _BCACHEFS_EC_H */
diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h
deleted file mode 100644
index b9770f24f213..000000000000
--- a/fs/bcachefs/ec_format.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EC_FORMAT_H
-#define _BCACHEFS_EC_FORMAT_H
-
-struct bch_stripe {
- struct bch_val v;
- __le16 sectors;
- __u8 algorithm;
- __u8 nr_blocks;
- __u8 nr_redundant;
-
- __u8 csum_granularity_bits;
- __u8 csum_type;
-
- /*
- * XXX: targets should be 16 bits - fix this if we ever do a stripe_v2
- *
- * we can manage with this because this only needs to point to a
- * disk label, not a target:
- */
- __u8 disk_label;
-
- /*
- * Variable length sections:
- * - Pointers
- * - Checksums
- * 2D array of [stripe block/device][csum block], with checksum block
- * size given by csum_granularity_bits
- * - Block sector counts: per-block array of u16s
- *
- * XXX:
- * Either checksums should have come last, or we should have included a
- * checksum_size field (the size in bytes of the checksum itself, not
- * the blocksize the checksum covers).
- *
- * Currently we aren't able to access the block sector counts if the
- * checksum type is unknown.
- */
-
- struct bch_extent_ptr ptrs[];
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_EC_FORMAT_H */
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
deleted file mode 100644
index 06144bfd9c19..000000000000
--- a/fs/bcachefs/ec_types.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EC_TYPES_H
-#define _BCACHEFS_EC_TYPES_H
-
-#include "bcachefs_format.h"
-
-struct bch_replicas_padded {
- struct bch_replicas_entry_v1 e;
- u8 pad[BCH_BKEY_PTRS_MAX];
-};
-
-struct stripe {
- size_t heap_idx;
- u16 sectors;
- u8 algorithm;
- u8 nr_blocks;
- u8 nr_redundant;
- u8 blocks_nonempty;
- u8 disk_label;
-};
-
-struct gc_stripe {
- u8 lock;
- unsigned alive:1; /* does a corresponding key exist in stripes btree? */
- u16 sectors;
- u8 nr_blocks;
- u8 nr_redundant;
- u16 block_sectors[BCH_BKEY_PTRS_MAX];
- struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX];
-
- struct bch_replicas_padded r;
-};
-
-#endif /* _BCACHEFS_EC_TYPES_H */
diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c
deleted file mode 100644
index 43557bebd0f8..000000000000
--- a/fs/bcachefs/errcode.c
+++ /dev/null
@@ -1,71 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "errcode.h"
-#include "trace.h"
-
-#include <linux/errname.h>
-
-static const char * const bch2_errcode_strs[] = {
-#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err,
- BCH_ERRCODES()
-#undef x
- NULL
-};
-
-static unsigned bch2_errcode_parents[] = {
-#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
- BCH_ERRCODES()
-#undef x
-};
-
-const char *bch2_err_str(int err)
-{
- const char *errstr;
-
- err = abs(err);
-
- BUG_ON(err >= BCH_ERR_MAX);
-
- if (err >= BCH_ERR_START)
- errstr = bch2_errcode_strs[err - BCH_ERR_START];
- else if (err)
- errstr = errname(err);
- else
- errstr = "(No error)";
- return errstr ?: "(Invalid error)";
-}
-
-bool __bch2_err_matches(int err, int class)
-{
- err = abs(err);
- class = abs(class);
-
- BUG_ON(err >= BCH_ERR_MAX);
- BUG_ON(class >= BCH_ERR_MAX);
-
- while (err >= BCH_ERR_START && err != class)
- err = bch2_errcode_parents[err - BCH_ERR_START];
-
- return err == class;
-}
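-
-/*
- * Example (editorial, not part of the original source):
- * bch2_err_matches(-BCH_ERR_ENOSPC_stripe_create, ENOSPC) walks the parent
- * table ENOSPC_stripe_create -> ENOSPC and returns true; matching the same
- * code against EIO walks down to ENOSPC, fails the comparison at the
- * standard-errno level, and returns false.
- */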
-
-int __bch2_err_class(int bch_err)
-{
- int std_err = -bch_err;
- BUG_ON((unsigned) std_err >= BCH_ERR_MAX);
-
- while (std_err >= BCH_ERR_START && bch2_errcode_parents[std_err - BCH_ERR_START])
- std_err = bch2_errcode_parents[std_err - BCH_ERR_START];
-
- trace_error_downcast(bch_err, std_err, _RET_IP_);
-
- return -std_err;
-}
-
-const char *bch2_blk_status_to_str(blk_status_t status)
-{
- if (status == BLK_STS_REMOVED)
- return "device removed";
- return blk_status_to_str(status);
-}
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
deleted file mode 100644
index 101806d7ebe1..000000000000
--- a/fs/bcachefs/errcode.h
+++ /dev/null
@@ -1,378 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ERRCODE_H
-#define _BCACHEFS_ERRCODE_H
-
-#define BCH_ERRCODES() \
- x(ERANGE, ERANGE_option_too_small) \
- x(ERANGE, ERANGE_option_too_big) \
- x(EINVAL, mount_option) \
- x(BCH_ERR_mount_option, option_name) \
- x(BCH_ERR_mount_option, option_value) \
- x(BCH_ERR_mount_option, option_not_bool) \
- x(ENOMEM, ENOMEM_stripe_buf) \
- x(ENOMEM, ENOMEM_replicas_table) \
- x(ENOMEM, ENOMEM_cpu_replicas) \
- x(ENOMEM, ENOMEM_replicas_gc) \
- x(ENOMEM, ENOMEM_disk_groups_validate) \
- x(ENOMEM, ENOMEM_disk_groups_to_cpu) \
- x(ENOMEM, ENOMEM_mark_snapshot) \
- x(ENOMEM, ENOMEM_mark_stripe) \
- x(ENOMEM, ENOMEM_mark_stripe_ptr) \
- x(ENOMEM, ENOMEM_btree_key_cache_create) \
- x(ENOMEM, ENOMEM_btree_key_cache_fill) \
- x(ENOMEM, ENOMEM_btree_key_cache_insert) \
- x(ENOMEM, ENOMEM_trans_kmalloc) \
- x(ENOMEM, ENOMEM_trans_log_msg) \
- x(ENOMEM, ENOMEM_do_encrypt) \
- x(ENOMEM, ENOMEM_ec_read_extent) \
- x(ENOMEM, ENOMEM_ec_stripe_mem_alloc) \
- x(ENOMEM, ENOMEM_ec_new_stripe_alloc) \
- x(ENOMEM, ENOMEM_fs_btree_cache_init) \
- x(ENOMEM, ENOMEM_fs_btree_key_cache_init) \
- x(ENOMEM, ENOMEM_fs_counters_init) \
- x(ENOMEM, ENOMEM_fs_btree_write_buffer_init) \
- x(ENOMEM, ENOMEM_io_clock_init) \
- x(ENOMEM, ENOMEM_blacklist_table_init) \
- x(ENOMEM, ENOMEM_sb_realloc_injected) \
- x(ENOMEM, ENOMEM_sb_bio_realloc) \
- x(ENOMEM, ENOMEM_sb_buf_realloc) \
- x(ENOMEM, ENOMEM_sb_journal_validate) \
- x(ENOMEM, ENOMEM_sb_journal_v2_validate) \
- x(ENOMEM, ENOMEM_journal_entry_add) \
- x(ENOMEM, ENOMEM_journal_read_buf_realloc) \
- x(ENOMEM, ENOMEM_btree_interior_update_worker_init)\
- x(ENOMEM, ENOMEM_btree_interior_update_pool_init) \
- x(ENOMEM, ENOMEM_bio_read_init) \
- x(ENOMEM, ENOMEM_bio_read_split_init) \
- x(ENOMEM, ENOMEM_bio_write_init) \
- x(ENOMEM, ENOMEM_bio_bounce_pages_init) \
- x(ENOMEM, ENOMEM_writepage_bioset_init) \
- x(ENOMEM, ENOMEM_dio_read_bioset_init) \
- x(ENOMEM, ENOMEM_dio_write_bioset_init) \
- x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \
- x(ENOMEM, ENOMEM_promote_table_init) \
- x(ENOMEM, ENOMEM_compression_bounce_read_init) \
- x(ENOMEM, ENOMEM_compression_bounce_write_init) \
- x(ENOMEM, ENOMEM_compression_workspace_init) \
- x(ENOMEM, ENOMEM_backpointer_mismatches_bitmap) \
- x(EIO, compression_workspace_not_initialized) \
- x(ENOMEM, ENOMEM_bucket_gens) \
- x(ENOMEM, ENOMEM_buckets_nouse) \
- x(ENOMEM, ENOMEM_usage_init) \
- x(ENOMEM, ENOMEM_btree_node_read_all_replicas) \
- x(ENOMEM, ENOMEM_btree_node_reclaim) \
- x(ENOMEM, ENOMEM_btree_node_mem_alloc) \
- x(ENOMEM, ENOMEM_btree_cache_cannibalize_lock) \
- x(ENOMEM, ENOMEM_buckets_waiting_for_journal_init)\
- x(ENOMEM, ENOMEM_buckets_waiting_for_journal_set) \
- x(ENOMEM, ENOMEM_set_nr_journal_buckets) \
- x(ENOMEM, ENOMEM_dev_journal_init) \
- x(ENOMEM, ENOMEM_journal_pin_fifo) \
- x(ENOMEM, ENOMEM_journal_buf) \
- x(ENOMEM, ENOMEM_gc_start) \
- x(ENOMEM, ENOMEM_gc_alloc_start) \
- x(ENOMEM, ENOMEM_gc_reflink_start) \
- x(ENOMEM, ENOMEM_gc_gens) \
- x(ENOMEM, ENOMEM_gc_repair_key) \
- x(ENOMEM, ENOMEM_fsck_extent_ends_at) \
- x(ENOMEM, ENOMEM_fsck_add_nlink) \
- x(ENOMEM, ENOMEM_journal_key_insert) \
- x(ENOMEM, ENOMEM_journal_keys_sort) \
- x(ENOMEM, ENOMEM_read_superblock_clean) \
- x(ENOMEM, ENOMEM_fs_alloc) \
- x(ENOMEM, ENOMEM_fs_name_alloc) \
- x(ENOMEM, ENOMEM_fs_other_alloc) \
- x(ENOMEM, ENOMEM_dev_alloc) \
- x(ENOMEM, ENOMEM_disk_accounting) \
- x(ENOMEM, ENOMEM_stripe_head_alloc) \
- x(ENOMEM, ENOMEM_journal_read_bucket) \
- x(ENOSPC, ENOSPC_disk_reservation) \
- x(ENOSPC, ENOSPC_bucket_alloc) \
- x(ENOSPC, ENOSPC_disk_label_add) \
- x(ENOSPC, ENOSPC_stripe_create) \
- x(ENOSPC, ENOSPC_inode_create) \
- x(ENOSPC, ENOSPC_str_hash_create) \
- x(ENOSPC, ENOSPC_snapshot_create) \
- x(ENOSPC, ENOSPC_subvolume_create) \
- x(ENOSPC, ENOSPC_sb) \
- x(ENOSPC, ENOSPC_sb_journal) \
- x(ENOSPC, ENOSPC_sb_journal_seq_blacklist) \
- x(ENOSPC, ENOSPC_sb_quota) \
- x(ENOSPC, ENOSPC_sb_replicas) \
- x(ENOSPC, ENOSPC_sb_members) \
- x(ENOSPC, ENOSPC_sb_members_v2) \
- x(ENOSPC, ENOSPC_sb_crypt) \
- x(ENOSPC, ENOSPC_sb_downgrade) \
- x(ENOSPC, ENOSPC_btree_slot) \
- x(ENOSPC, ENOSPC_snapshot_tree) \
- x(ENOENT, ENOENT_bkey_type_mismatch) \
- x(ENOENT, ENOENT_str_hash_lookup) \
- x(ENOENT, ENOENT_str_hash_set_must_replace) \
- x(ENOENT, ENOENT_inode) \
- x(ENOENT, ENOENT_not_subvol) \
- x(ENOENT, ENOENT_not_directory) \
- x(ENOENT, ENOENT_directory_dead) \
- x(ENOENT, ENOENT_subvolume) \
- x(ENOENT, ENOENT_snapshot_tree) \
- x(ENOENT, ENOENT_dirent_doesnt_match_inode) \
- x(ENOENT, ENOENT_dev_not_found) \
- x(ENOENT, ENOENT_dev_bucket_not_found) \
- x(ENOENT, ENOENT_dev_idx_not_found) \
- x(ENOENT, ENOENT_inode_no_backpointer) \
- x(ENOENT, ENOENT_no_snapshot_tree_subvol) \
- x(ENOENT, btree_node_dying) \
- x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \
- x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \
- x(EEXIST, EEXIST_str_hash_set) \
- x(EEXIST, EEXIST_discard_in_flight_add) \
- x(EEXIST, EEXIST_subvolume_create) \
- x(ENOSPC, open_buckets_empty) \
- x(ENOSPC, freelist_empty) \
- x(BCH_ERR_freelist_empty, no_buckets_found) \
- x(0, transaction_restart) \
- x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \
- x(BCH_ERR_transaction_restart, transaction_restart_relock) \
- x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \
- x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \
- x(BCH_ERR_transaction_restart, transaction_restart_relock_after_fill) \
- x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \
- x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \
- x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \
- x(BCH_ERR_transaction_restart, transaction_restart_fill_mem_alloc_fail)\
- x(BCH_ERR_transaction_restart, transaction_restart_mem_realloced) \
- x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \
- x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \
- x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\
- x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\
- x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \
- x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \
- x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \
- x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \
- x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\
- x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \
- x(BCH_ERR_transaction_restart, transaction_restart_split_race) \
- x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \
- x(BCH_ERR_transaction_restart, transaction_restart_nested) \
- x(BCH_ERR_transaction_restart, transaction_restart_commit) \
- x(0, no_btree_node) \
- x(BCH_ERR_no_btree_node, no_btree_node_relock) \
- x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \
- x(BCH_ERR_no_btree_node, no_btree_node_drop) \
- x(BCH_ERR_no_btree_node, no_btree_node_lock_root) \
- x(BCH_ERR_no_btree_node, no_btree_node_up) \
- x(BCH_ERR_no_btree_node, no_btree_node_down) \
- x(BCH_ERR_no_btree_node, no_btree_node_init) \
- x(BCH_ERR_no_btree_node, no_btree_node_cached) \
- x(BCH_ERR_no_btree_node, no_btree_node_srcu_reset) \
- x(0, btree_insert_fail) \
- x(BCH_ERR_btree_insert_fail, btree_insert_btree_node_full) \
- x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \
- x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \
- x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \
- x(0, backpointer_to_overwritten_btree_node) \
- x(0, journal_reclaim_would_deadlock) \
- x(EINVAL, fsck) \
- x(BCH_ERR_fsck, fsck_fix) \
- x(BCH_ERR_fsck, fsck_delete_bkey) \
- x(BCH_ERR_fsck, fsck_ignore) \
- x(BCH_ERR_fsck, fsck_errors_not_fixed) \
- x(BCH_ERR_fsck, fsck_repair_unimplemented) \
- x(BCH_ERR_fsck, fsck_repair_impossible) \
- x(EINVAL, restart_recovery) \
- x(EINVAL, not_in_recovery) \
- x(EINVAL, cannot_rewind_recovery) \
- x(0, data_update_done) \
- x(BCH_ERR_data_update_done, data_update_done_would_block) \
- x(BCH_ERR_data_update_done, data_update_done_unwritten) \
- x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \
- x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \
- x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \
- x(BCH_ERR_data_update_done, data_update_done_no_rw_devs) \
- x(EINVAL, device_state_not_allowed) \
- x(EINVAL, member_info_missing) \
- x(EINVAL, mismatched_block_size) \
- x(EINVAL, block_size_too_small) \
- x(EINVAL, bucket_size_too_small) \
- x(EINVAL, device_size_too_small) \
- x(EINVAL, device_size_too_big) \
- x(EINVAL, device_not_a_member_of_filesystem) \
- x(EINVAL, device_has_been_removed) \
- x(EINVAL, device_splitbrain) \
- x(EINVAL, device_already_online) \
- x(EINVAL, insufficient_devices_to_start) \
- x(EINVAL, invalid) \
- x(EINVAL, internal_fsck_err) \
- x(EINVAL, opt_parse_error) \
- x(EINVAL, remove_with_metadata_missing_unimplemented)\
- x(EINVAL, remove_would_lose_data) \
- x(EINVAL, no_resize_with_buckets_nouse) \
- x(EINVAL, inode_unpack_error) \
- x(EINVAL, varint_decode_error) \
- x(EINVAL, erasure_coding_found_btree_node) \
- x(EOPNOTSUPP, may_not_use_incompat_feature) \
- x(EROFS, erofs_trans_commit) \
- x(EROFS, erofs_no_writes) \
- x(EROFS, erofs_journal_err) \
- x(EROFS, erofs_sb_err) \
- x(EROFS, erofs_unfixed_errors) \
- x(EROFS, erofs_norecovery) \
- x(EROFS, erofs_nochanges) \
- x(EROFS, insufficient_devices) \
- x(0, operation_blocked) \
- x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \
- x(BCH_ERR_operation_blocked, journal_res_blocked) \
- x(BCH_ERR_journal_res_blocked, journal_blocked) \
- x(BCH_ERR_journal_res_blocked, journal_max_in_flight) \
- x(BCH_ERR_journal_res_blocked, journal_max_open) \
- x(BCH_ERR_journal_res_blocked, journal_full) \
- x(BCH_ERR_journal_res_blocked, journal_pin_full) \
- x(BCH_ERR_journal_res_blocked, journal_buf_enomem) \
- x(BCH_ERR_journal_res_blocked, journal_stuck) \
- x(BCH_ERR_journal_res_blocked, journal_retry_open) \
- x(BCH_ERR_journal_res_blocked, journal_preres_get_blocked) \
- x(BCH_ERR_journal_res_blocked, bucket_alloc_blocked) \
- x(BCH_ERR_journal_res_blocked, stripe_alloc_blocked) \
- x(BCH_ERR_invalid, invalid_sb) \
- x(BCH_ERR_invalid_sb, invalid_sb_magic) \
- x(BCH_ERR_invalid_sb, invalid_sb_version) \
- x(BCH_ERR_invalid_sb, invalid_sb_features) \
- x(BCH_ERR_invalid_sb, invalid_sb_too_big) \
- x(BCH_ERR_invalid_sb, invalid_sb_csum_type) \
- x(BCH_ERR_invalid_sb, invalid_sb_csum) \
- x(BCH_ERR_invalid_sb, invalid_sb_block_size) \
- x(BCH_ERR_invalid_sb, invalid_sb_uuid) \
- x(BCH_ERR_invalid_sb, invalid_sb_offset) \
- x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \
- x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \
- x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \
- x(BCH_ERR_invalid_sb, invalid_sb_field_size) \
- x(BCH_ERR_invalid_sb, invalid_sb_layout) \
- x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \
- x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \
- x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \
- x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_sb_max_size_bits) \
- x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \
- x(BCH_ERR_invalid_sb, invalid_sb_members) \
- x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \
- x(BCH_ERR_invalid_sb, invalid_sb_replicas) \
- x(BCH_ERR_invalid_sb, invalid_replicas_entry) \
- x(BCH_ERR_invalid_sb, invalid_sb_journal) \
- x(BCH_ERR_invalid_sb, invalid_sb_journal_seq_blacklist) \
- x(BCH_ERR_invalid_sb, invalid_sb_crypt) \
- x(BCH_ERR_invalid_sb, invalid_sb_clean) \
- x(BCH_ERR_invalid_sb, invalid_sb_quota) \
- x(BCH_ERR_invalid_sb, invalid_sb_errors) \
- x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \
- x(BCH_ERR_invalid_sb, invalid_sb_ext) \
- x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \
- x(BCH_ERR_invalid, invalid_bkey) \
- x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
- x(EIO, journal_shutdown) \
- x(EIO, journal_flush_err) \
- x(EIO, journal_write_err) \
- x(EIO, btree_node_read_err) \
- x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \
- x(EIO, sb_not_downgraded) \
- x(EIO, btree_node_write_all_failed) \
- x(EIO, btree_node_read_error) \
- x(EIO, btree_node_read_validate_error) \
- x(EIO, btree_need_topology_repair) \
- x(EIO, bucket_ref_update) \
- x(EIO, trigger_alloc) \
- x(EIO, trigger_pointer) \
- x(EIO, trigger_stripe_pointer) \
- x(EIO, metadata_bucket_inconsistency) \
- x(EIO, mark_stripe) \
- x(EIO, stripe_reconstruct) \
- x(EIO, key_type_error) \
- x(EIO, extent_poisened) \
- x(EIO, missing_indirect_extent) \
- x(EIO, invalidate_stripe_to_dev) \
- x(EIO, no_encryption_key) \
- x(EIO, insufficient_journal_devices) \
- x(EIO, device_offline) \
- x(EIO, EIO_fault_injected) \
- x(EIO, ec_block_read) \
- x(EIO, ec_block_write) \
- x(EIO, recompute_checksum) \
- x(EIO, decompress) \
- x(BCH_ERR_decompress, decompress_exceeded_max_encoded_extent) \
- x(BCH_ERR_decompress, decompress_lz4) \
- x(BCH_ERR_decompress, decompress_gzip) \
- x(BCH_ERR_decompress, decompress_zstd_src_len_bad) \
- x(BCH_ERR_decompress, decompress_zstd) \
- x(EIO, data_write) \
- x(BCH_ERR_data_write, data_write_io) \
- x(BCH_ERR_data_write, data_write_csum) \
- x(BCH_ERR_data_write, data_write_invalid_ptr) \
- x(BCH_ERR_data_write, data_write_misaligned) \
- x(EIO, data_read) \
- x(BCH_ERR_data_read, no_device_to_read_from) \
- x(BCH_ERR_data_read, data_read_io_err) \
- x(BCH_ERR_data_read, data_read_csum_err) \
- x(BCH_ERR_data_read, data_read_retry) \
- x(BCH_ERR_data_read_retry, data_read_retry_avoid) \
- x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \
- x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err) \
- x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err) \
- x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err) \
- x(BCH_ERR_data_read_retry, data_read_retry_csum_err_maybe_userspace)\
- x(BCH_ERR_data_read, data_read_decompress_err) \
- x(BCH_ERR_data_read, data_read_decrypt_err) \
- x(BCH_ERR_data_read, data_read_ptr_stale_race) \
- x(BCH_ERR_data_read_retry, data_read_ptr_stale_retry) \
- x(BCH_ERR_data_read, data_read_no_encryption_key) \
- x(BCH_ERR_data_read, data_read_buffer_too_small) \
- x(BCH_ERR_data_read, data_read_key_overwritten) \
- x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
- x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
- x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
- x(BCH_ERR_btree_node_read_err, btree_node_read_err_bad_node) \
- x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible) \
- x(0, nopromote) \
- x(BCH_ERR_nopromote, nopromote_may_not) \
- x(BCH_ERR_nopromote, nopromote_already_promoted) \
- x(BCH_ERR_nopromote, nopromote_unwritten) \
- x(BCH_ERR_nopromote, nopromote_congested) \
- x(BCH_ERR_nopromote, nopromote_in_flight) \
- x(BCH_ERR_nopromote, nopromote_no_writes) \
- x(BCH_ERR_nopromote, nopromote_enomem) \
- x(0, invalid_snapshot_node) \
- x(0, option_needs_open_fs) \
- x(0, remove_disk_accounting_entry)
-
-enum bch_errcode {
- BCH_ERR_START = 2048,
-#define x(class, err) BCH_ERR_##err,
- BCH_ERRCODES()
-#undef x
- BCH_ERR_MAX
-};
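-
-/*
- * Editorial note: BCH_ERRCODES() is an x-macro. A single x(class, err) entry
- * expands here to the BCH_ERR_##err enumerator (numbered upward from
- * BCH_ERR_START) and, in errcode.c, to the matching entries in the name and
- * parent-class tables, so one list drives all three.
- */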
-
-const char *bch2_err_str(int);
-bool __bch2_err_matches(int, int);
-
-static inline bool _bch2_err_matches(int err, int class)
-{
- return err < 0 && __bch2_err_matches(err, class);
-}
-
-#define bch2_err_matches(_err, _class) \
-({ \
- BUILD_BUG_ON(!__builtin_constant_p(_class)); \
- unlikely(_bch2_err_matches(_err, _class)); \
-})
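-
-/*
- * Editorial note: the BUILD_BUG_ON restricts _class to compile-time
- * constants; matching only makes sense against a fixed error class, and this
- * keeps callers from accidentally passing a second runtime error code.
- */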
-
-int __bch2_err_class(int);
-
-static inline long bch2_err_class(long err)
-{
- return err < 0 ? __bch2_err_class(err) : err;
-}
-
-#define BLK_STS_REMOVED ((__force blk_status_t)128)
-
-#include <linux/blk_types.h>
-const char *bch2_blk_status_to_str(blk_status_t);
-
-#endif /* _BCACHEFS_ERRCODE_H */
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
deleted file mode 100644
index 207f35d3cce2..000000000000
--- a/fs/bcachefs/error.c
+++ /dev/null
@@ -1,604 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "error.h"
-#include "journal.h"
-#include "namei.h"
-#include "recovery_passes.h"
-#include "super.h"
-#include "thread_with_file.h"
-
-#define FSCK_ERR_RATELIMIT_NR 10
-
-bool bch2_inconsistent_error(struct bch_fs *c)
-{
- set_bit(BCH_FS_error, &c->flags);
-
- switch (c->opts.errors) {
- case BCH_ON_ERROR_continue:
- return false;
- case BCH_ON_ERROR_fix_safe:
- case BCH_ON_ERROR_ro:
- if (bch2_fs_emergency_read_only(c))
- bch_err(c, "inconsistency detected - emergency read only at journal seq %llu",
- journal_cur_seq(&c->journal));
- return true;
- case BCH_ON_ERROR_panic:
- panic(bch2_fmt(c, "panic after error"));
- return true;
- default:
- BUG();
- }
-}
-
-int bch2_topology_error(struct bch_fs *c)
-{
- set_bit(BCH_FS_topology_error, &c->flags);
- if (!test_bit(BCH_FS_recovery_running, &c->flags)) {
- bch2_inconsistent_error(c);
- return -BCH_ERR_btree_need_topology_repair;
- } else {
- return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?:
- -BCH_ERR_btree_node_read_validate_error;
- }
-}
-
-void bch2_fatal_error(struct bch_fs *c)
-{
- if (bch2_fs_emergency_read_only(c))
- bch_err(c, "fatal error - emergency read only");
-}
-
-void bch2_io_error_work(struct work_struct *work)
-{
- struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
- struct bch_fs *c = ca->fs;
-
- /* XXX: if it's reads or checksums that are failing, set it to failed */
-
- down_write(&c->state_lock);
- unsigned long write_errors_start = READ_ONCE(ca->write_errors_start);
-
- if (write_errors_start &&
- time_after(jiffies,
- write_errors_start + c->opts.write_error_timeout * HZ)) {
- if (ca->mi.state >= BCH_MEMBER_STATE_ro)
- goto out;
-
- bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
- BCH_FORCE_IF_DEGRADED);
-
- bch_err(ca,
- "writes erroring for %u seconds, setting %s ro",
- c->opts.write_error_timeout,
- dev ? "device" : "filesystem");
- if (!dev)
- bch2_fs_emergency_read_only(c);
-
- }
-out:
- up_write(&c->state_lock);
-}
-
-void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
-{
- atomic64_inc(&ca->errors[type]);
-
- if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start)
- ca->write_errors_start = jiffies;
-
- queue_work(system_long_wq, &ca->io_error_work);
-}
-
-enum ask_yn {
- YN_NO,
- YN_YES,
- YN_ALLNO,
- YN_ALLYES,
-};
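-
-/*
- * Editorial note: the enumerator order above is load-bearing. YN_YES (1) and
- * YN_ALLYES (3) are the odd values, which the "ret & 1" test in
- * __bch2_fsck_err() relies on, and answers >= YN_ALLNO are remembered for all
- * future errors of the same type.
- */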
-
-static enum ask_yn parse_yn_response(char *buf)
-{
- buf = strim(buf);
-
- if (strlen(buf) == 1)
- switch (buf[0]) {
- case 'n':
- return YN_NO;
- case 'y':
- return YN_YES;
- case 'N':
- return YN_ALLNO;
- case 'Y':
- return YN_ALLYES;
- }
- return -1;
-}
-
-#ifdef __KERNEL__
-static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans)
-{
- struct stdio_redirect *stdio = c->stdio;
-
- if (c->stdio_filter && c->stdio_filter != current)
- stdio = NULL;
-
- if (!stdio)
- return YN_NO;
-
- if (trans)
- bch2_trans_unlock(trans);
-
- unsigned long unlock_long_at = trans ? jiffies + HZ * 2 : 0;
- darray_char line = {};
- int ret;
-
- do {
- unsigned long t;
- bch2_print(c, " (y,n, or Y,N for all errors of this type) ");
-rewait:
- t = unlock_long_at
- ? max_t(long, unlock_long_at - jiffies, 0)
- : MAX_SCHEDULE_TIMEOUT;
-
- int r = bch2_stdio_redirect_readline_timeout(stdio, &line, t);
- if (r == -ETIME) {
- bch2_trans_unlock_long(trans);
- unlock_long_at = 0;
- goto rewait;
- }
-
- if (r < 0) {
- ret = YN_NO;
- break;
- }
-
- darray_last(line) = '\0';
- } while ((ret = parse_yn_response(line.data)) < 0);
-
- darray_exit(&line);
- return ret;
-}
-#else
-
-#include "tools-util.h"
-
-static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans)
-{
- char *buf = NULL;
- size_t buflen = 0;
- int ret;
-
- do {
- fputs(" (y,n, or Y,N for all errors of this type) ", stdout);
- fflush(stdout);
-
- if (getline(&buf, &buflen, stdin) < 0)
- die("error reading from standard input");
- } while ((ret = parse_yn_response(buf)) < 0);
-
- free(buf);
- return ret;
-}
-
-#endif
-
-static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
-{
- struct fsck_err_state *s;
-
- if (!test_bit(BCH_FS_fsck_running, &c->flags))
- return NULL;
-
- list_for_each_entry(s, &c->fsck_error_msgs, list)
- if (s->fmt == fmt) {
- /*
- * move it to the head of the list: repeated fsck errors
- * are common
- */
- list_move(&s->list, &c->fsck_error_msgs);
- return s;
- }
-
- s = kzalloc(sizeof(*s), GFP_NOFS);
- if (!s) {
- if (!c->fsck_alloc_msgs_err)
- bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
- c->fsck_alloc_msgs_err = true;
- return NULL;
- }
-
- INIT_LIST_HEAD(&s->list);
- s->fmt = fmt;
- list_add(&s->list, &c->fsck_error_msgs);
- return s;
-}
-
-/* s/fix?/fixing/ s/recreate?/recreating/ */
-static void prt_actioning(struct printbuf *out, const char *action)
-{
- unsigned len = strlen(action);
-
- BUG_ON(action[len - 1] != '?');
- --len;
-
- if (action[len - 1] == 'e')
- --len;
-
- prt_bytes(out, action, len);
- prt_str(out, "ing");
-}
-
-static const u8 fsck_flags_extra[] = {
-#define x(t, n, flags) [BCH_FSCK_ERR_##t] = flags,
- BCH_SB_ERRS()
-#undef x
-};
-
-static int do_fsck_ask_yn(struct bch_fs *c,
- struct btree_trans *trans,
- struct printbuf *question,
- const char *action)
-{
- prt_str(question, ", ");
- prt_str(question, action);
-
- if (bch2_fs_stdio_redirect(c))
- bch2_print(c, "%s", question->buf);
- else
- bch2_print_string_as_lines(KERN_ERR, question->buf);
-
- int ask = bch2_fsck_ask_yn(c, trans);
-
- if (trans) {
- int ret = bch2_trans_relock(trans);
- if (ret)
- return ret;
- }
-
- return ask;
-}
-
-int __bch2_fsck_err(struct bch_fs *c,
- struct btree_trans *trans,
- enum bch_fsck_flags flags,
- enum bch_sb_error_id err,
- const char *fmt, ...)
-{
- struct fsck_err_state *s = NULL;
- va_list args;
- bool print = true, suppressing = false, inconsistent = false, exiting = false;
- struct printbuf buf = PRINTBUF, *out = &buf;
- int ret = -BCH_ERR_fsck_ignore;
- const char *action_orig = "fix?", *action = action_orig;
-
- might_sleep();
-
- if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
- flags |= fsck_flags_extra[err];
-
- if (!c)
- c = trans->c;
-
- /*
- * Ugly: if there's a transaction in the current task it has to be
- * passed in to unlock if we prompt for user input.
- *
- * But, plumbing a transaction and transaction restarts into
- * bkey_validate() is problematic.
- *
- * So:
- * - make all bkey errors AUTOFIX, they're simple anyways (we just
- * delete the key)
- * - and we don't need to warn if we're not prompting
- */
- WARN_ON((flags & FSCK_CAN_FIX) &&
- !(flags & FSCK_AUTOFIX) &&
- !trans &&
- bch2_current_has_btree_trans(c));
-
- if (test_bit(err, c->sb.errors_silent))
- return flags & FSCK_CAN_FIX
- ? -BCH_ERR_fsck_fix
- : -BCH_ERR_fsck_ignore;
-
- bch2_sb_error_count(c, err);
-
- va_start(args, fmt);
- prt_vprintf(out, fmt, args);
- va_end(args);
-
- /* Custom fix/continue/recreate/etc.? */
- if (out->buf[out->pos - 1] == '?') {
- const char *p = strrchr(out->buf, ',');
- if (p) {
- out->pos = p - out->buf;
- action = kstrdup(p + 2, GFP_KERNEL);
- if (!action) {
- ret = -ENOMEM;
- goto err;
- }
- }
- }
-
- mutex_lock(&c->fsck_error_msgs_lock);
- s = fsck_err_get(c, fmt);
- if (s) {
- /*
- * We may be called multiple times for the same error on
- * transaction restart - this memoizes instead of asking the user
- * multiple times for the same error:
- */
- if (s->last_msg && !strcmp(buf.buf, s->last_msg)) {
- ret = s->ret;
- goto err_unlock;
- }
-
- kfree(s->last_msg);
- s->last_msg = kstrdup(buf.buf, GFP_KERNEL);
- if (!s->last_msg) {
- ret = -ENOMEM;
- goto err_unlock;
- }
-
- if (c->opts.ratelimit_errors &&
- !(flags & FSCK_NO_RATELIMIT) &&
- s->nr >= FSCK_ERR_RATELIMIT_NR) {
- if (s->nr == FSCK_ERR_RATELIMIT_NR)
- suppressing = true;
- else
- print = false;
- }
-
- s->nr++;
- }
-
-#ifdef BCACHEFS_LOG_PREFIX
- if (!strncmp(fmt, "bcachefs:", 9))
- prt_printf(out, bch2_log_msg(c, ""));
-#endif
-
- if ((flags & FSCK_AUTOFIX) &&
- (c->opts.errors == BCH_ON_ERROR_continue ||
- c->opts.errors == BCH_ON_ERROR_fix_safe)) {
- prt_str(out, ", ");
- if (flags & FSCK_CAN_FIX) {
- prt_actioning(out, action);
- ret = -BCH_ERR_fsck_fix;
- } else {
- prt_str(out, ", continuing");
- ret = -BCH_ERR_fsck_ignore;
- }
-
- goto print;
- } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
- if (c->opts.errors != BCH_ON_ERROR_continue ||
- !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
- prt_str(out, ", shutting down");
- inconsistent = true;
- ret = -BCH_ERR_fsck_errors_not_fixed;
- } else if (flags & FSCK_CAN_FIX) {
- prt_str(out, ", ");
- prt_actioning(out, action);
- ret = -BCH_ERR_fsck_fix;
- } else {
- prt_str(out, ", continuing");
- ret = -BCH_ERR_fsck_ignore;
- }
- } else if (c->opts.fix_errors == FSCK_FIX_exit) {
- prt_str(out, ", exiting");
- ret = -BCH_ERR_fsck_errors_not_fixed;
- } else if (flags & FSCK_CAN_FIX) {
- int fix = s && s->fix
- ? s->fix
- : c->opts.fix_errors;
-
- if (fix == FSCK_FIX_ask) {
- print = false;
-
- ret = do_fsck_ask_yn(c, trans, out, action);
- if (ret < 0)
- goto err_unlock;
-
- if (ret >= YN_ALLNO && s)
- s->fix = ret == YN_ALLNO
- ? FSCK_FIX_no
- : FSCK_FIX_yes;
-
- ret = ret & 1
- ? -BCH_ERR_fsck_fix
- : -BCH_ERR_fsck_ignore;
- } else if (fix == FSCK_FIX_yes ||
- (c->opts.nochanges &&
- !(flags & FSCK_CAN_IGNORE))) {
- prt_str(out, ", ");
- prt_actioning(out, action);
- ret = -BCH_ERR_fsck_fix;
- } else {
- prt_str(out, ", not ");
- prt_actioning(out, action);
- }
- } else if (!(flags & FSCK_CAN_IGNORE)) {
- prt_str(out, " (repair unimplemented)");
- }
-
- if (ret == -BCH_ERR_fsck_ignore &&
- (c->opts.fix_errors == FSCK_FIX_exit ||
- !(flags & FSCK_CAN_IGNORE)))
- ret = -BCH_ERR_fsck_errors_not_fixed;
-
- if (test_bit(BCH_FS_fsck_running, &c->flags) &&
- (ret != -BCH_ERR_fsck_fix &&
- ret != -BCH_ERR_fsck_ignore)) {
- exiting = true;
- print = true;
- }
-print:
- if (print) {
- if (bch2_fs_stdio_redirect(c))
- bch2_print(c, "%s\n", out->buf);
- else
- bch2_print_string_as_lines(KERN_ERR, out->buf);
- }
-
- if (exiting)
- bch_err(c, "Unable to continue, halting");
- else if (suppressing)
- bch_err(c, "Ratelimiting new instances of previous error");
-
- if (s)
- s->ret = ret;
-
- if (inconsistent)
- bch2_inconsistent_error(c);
-
- /*
- * We don't yet track whether the filesystem currently has errors, for
- * log_fsck_err()s: that would require us to track for every error type
- * which recovery pass corrects it, to get the fsck exit status correct:
- */
- if (flags & FSCK_CAN_FIX) {
- if (ret == -BCH_ERR_fsck_fix) {
- set_bit(BCH_FS_errors_fixed, &c->flags);
- } else {
- set_bit(BCH_FS_errors_not_fixed, &c->flags);
- set_bit(BCH_FS_error, &c->flags);
- }
- }
-err_unlock:
- mutex_unlock(&c->fsck_error_msgs_lock);
-err:
- if (action != action_orig)
- kfree(action);
- printbuf_exit(&buf);
- return ret;
-}
-
-static const char * const bch2_bkey_validate_contexts[] = {
-#define x(n) #n,
- BKEY_VALIDATE_CONTEXTS()
-#undef x
- NULL
-};
-
-int __bch2_bkey_fsck_err(struct bch_fs *c,
- struct bkey_s_c k,
- struct bkey_validate_context from,
- enum bch_sb_error_id err,
- const char *fmt, ...)
-{
- if (from.flags & BCH_VALIDATE_silent)
- return -BCH_ERR_fsck_delete_bkey;
-
- unsigned fsck_flags = 0;
- if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) {
- if (test_bit(err, c->sb.errors_silent))
- return -BCH_ERR_fsck_delete_bkey;
-
- fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX;
- }
- if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
- fsck_flags |= fsck_flags_extra[err];
-
- struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "invalid bkey in %s",
- bch2_bkey_validate_contexts[from.from]);
-
- if (from.from == BKEY_VALIDATE_journal)
- prt_printf(&buf, " journal seq=%llu offset=%u",
- from.journal_seq, from.journal_offset);
-
- prt_str(&buf, " btree=");
- bch2_btree_id_to_text(&buf, from.btree);
- prt_printf(&buf, " level=%u: ", from.level);
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, "\n ");
-
- va_list args;
- va_start(args, fmt);
- prt_vprintf(&buf, fmt, args);
- va_end(args);
-
- prt_str(&buf, ": delete?");
-
- int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s", buf.buf);
- printbuf_exit(&buf);
- return ret;
-}
-
-void bch2_flush_fsck_errs(struct bch_fs *c)
-{
- struct fsck_err_state *s, *n;
-
- mutex_lock(&c->fsck_error_msgs_lock);
-
- list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) {
- if (s->ratelimited && s->last_msg)
- bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg);
-
- list_del(&s->list);
- kfree(s->last_msg);
- kfree(s);
- }
-
- mutex_unlock(&c->fsck_error_msgs_lock);
-}
-
-int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
- subvol_inum inum, u64 offset)
-{
- u32 restart_count = trans->restart_count;
- int ret = 0;
-
- if (inum.subvol) {
- ret = bch2_inum_to_path(trans, inum, out);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ret;
- }
- if (!inum.subvol || ret)
- prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum);
- prt_printf(out, " offset %llu: ", offset);
-
- return trans_was_restarted(trans, restart_count);
-}
-
-void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out,
- subvol_inum inum, u64 offset)
-{
- bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset));
-}
-
-int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
- struct bpos pos)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- if (!bch2_snapshot_is_leaf(c, pos.snapshot))
- prt_str(out, "(multiple snapshots) ");
-
- subvol_inum inum = {
- .subvol = bch2_snapshot_tree_oldest_subvol(c, pos.snapshot),
- .inum = pos.inode,
- };
-
- if (inum.subvol) {
- ret = bch2_inum_to_path(trans, inum, out);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ret;
- }
-
- if (!inum.subvol || ret)
- prt_printf(out, "inum %llu:%u", pos.inode, pos.snapshot);
-
- prt_printf(out, " offset %llu: ", pos.offset << 8);
- return 0;
-}
-
-void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out,
- struct bpos pos)
-{
- bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos));
-}
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
deleted file mode 100644
index 7d3f0e2a5fd6..000000000000
--- a/fs/bcachefs/error.h
+++ /dev/null
@@ -1,258 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ERROR_H
-#define _BCACHEFS_ERROR_H
-
-#include <linux/list.h>
-#include <linux/printk.h>
-#include "bkey_types.h"
-#include "sb-errors.h"
-
-struct bch_dev;
-struct bch_fs;
-struct work_struct;
-
-/*
- * XXX: separate out errors that indicate on disk data is inconsistent, and flag
- * superblock as such
- */
-
-/* Error messages: */
-
-/*
- * Inconsistency errors: The on disk data is inconsistent. If these occur during
- * initial recovery, they don't indicate a bug in the running code - we walk all
- * the metadata before modifying anything. If they occur at runtime, they
- * indicate either a bug in the running code or (less likely) data is being
- * silently corrupted under us.
- *
- * XXX: audit all inconsistent errors and make sure they're all recoverable, in
- * BCH_ON_ERROR_CONTINUE mode
- */
-
-bool bch2_inconsistent_error(struct bch_fs *);
-
-int bch2_topology_error(struct bch_fs *);
-
-#define bch2_fs_topology_error(c, ...) \
-({ \
- bch_err(c, "btree topology error: " __VA_ARGS__); \
- bch2_topology_error(c); \
-})
-
-#define bch2_fs_inconsistent(c, ...) \
-({ \
- bch_err(c, __VA_ARGS__); \
- bch2_inconsistent_error(c); \
-})
-
-#define bch2_fs_inconsistent_on(cond, ...) \
-({ \
- bool _ret = unlikely(!!(cond)); \
- if (_ret) \
- bch2_fs_inconsistent(__VA_ARGS__); \
- _ret; \
-})
-
-/*
- * When a transaction update discovers or is causing a fs inconsistency, it's
- * helpful to also dump the pending updates:
- */
-#define bch2_trans_inconsistent(trans, ...) \
-({ \
- bch_err(trans->c, __VA_ARGS__); \
- bch2_dump_trans_updates(trans); \
- bch2_inconsistent_error(trans->c); \
-})
-
-#define bch2_trans_inconsistent_on(cond, trans, ...) \
-({ \
- bool _ret = unlikely(!!(cond)); \
- \
- if (_ret) \
- bch2_trans_inconsistent(trans, __VA_ARGS__); \
- _ret; \
-})
-
-/*
- * Fsck errors: inconsistency errors we detect at mount time, and should ideally
- * be able to repair:
- */
-
-struct fsck_err_state {
- struct list_head list;
- const char *fmt;
- u64 nr;
- bool ratelimited;
- int ret;
- int fix;
- char *last_msg;
-};
-
-#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)
-
-__printf(5, 6) __cold
-int __bch2_fsck_err(struct bch_fs *, struct btree_trans *,
- enum bch_fsck_flags,
- enum bch_sb_error_id,
- const char *, ...);
-#define bch2_fsck_err(c, _flags, _err_type, ...) \
- __bch2_fsck_err(type_is(c, struct bch_fs *) ? (struct bch_fs *) c : NULL,\
- type_is(c, struct btree_trans *) ? (struct btree_trans *) c : NULL,\
- _flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__)
-
-void bch2_flush_fsck_errs(struct bch_fs *);
-
-#define fsck_err_wrap(_do) \
-({ \
- int _ret = _do; \
- if (_ret != -BCH_ERR_fsck_fix && \
- _ret != -BCH_ERR_fsck_ignore) { \
- ret = _ret; \
- goto fsck_err; \
- } \
- \
- _ret == -BCH_ERR_fsck_fix; \
-})
-
-#define __fsck_err(...) fsck_err_wrap(bch2_fsck_err(__VA_ARGS__))
-
-/* These macros return true if error should be fixed: */
-
-/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
-
-#define __fsck_err_on(cond, c, _flags, _err_type, ...) \
-({ \
- might_sleep(); \
- \
- if (type_is(c, struct bch_fs *)) \
- WARN_ON(bch2_current_has_btree_trans((struct bch_fs *) c));\
- \
- (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\
-})
-
-#define mustfix_fsck_err(c, _err_type, ...) \
- __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
-
-#define mustfix_fsck_err_on(cond, c, _err_type, ...) \
- __fsck_err_on(cond, c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
-
-#define fsck_err(c, _err_type, ...) \
- __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-
-#define fsck_err_on(cond, c, _err_type, ...) \
- __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-
-#define log_fsck_err(c, _err_type, ...) \
- __fsck_err(c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-
-#define log_fsck_err_on(cond, ...) \
-({ \
- bool _ret = unlikely(!!(cond)); \
- if (_ret) \
- log_fsck_err(__VA_ARGS__); \
- _ret; \
-})
-
-enum bch_validate_flags;
-__printf(5, 6)
-int __bch2_bkey_fsck_err(struct bch_fs *,
- struct bkey_s_c,
- struct bkey_validate_context from,
- enum bch_sb_error_id,
- const char *, ...);
-
-/*
- * for now, bkey fsck errors are always handled by deleting the entire key -
- * this will change at some point
- */
-#define bkey_fsck_err(c, _err_type, _err_msg, ...) \
-do { \
- int _ret = __bch2_bkey_fsck_err(c, k, from, \
- BCH_FSCK_ERR_##_err_type, \
- _err_msg, ##__VA_ARGS__); \
- if (_ret != -BCH_ERR_fsck_fix && \
- _ret != -BCH_ERR_fsck_ignore) \
- ret = _ret; \
- else \
- ret = -BCH_ERR_fsck_delete_bkey; \
- goto fsck_err; \
-} while (0)
-
-#define bkey_fsck_err_on(cond, ...) \
-do { \
- if (unlikely(cond)) \
- bkey_fsck_err(__VA_ARGS__); \
-} while (0)
-
-/*
- * Fatal errors: these don't indicate a bug, but we can't continue running in RW
- * mode - pretty much just due to metadata IO errors:
- */
-
-void bch2_fatal_error(struct bch_fs *);
-
-#define bch2_fs_fatal_error(c, _msg, ...) \
-do { \
- bch_err(c, "%s(): fatal error " _msg, __func__, ##__VA_ARGS__); \
- bch2_fatal_error(c); \
-} while (0)
-
-#define bch2_fs_fatal_err_on(cond, c, ...) \
-({ \
- bool _ret = unlikely(!!(cond)); \
- \
- if (_ret) \
- bch2_fs_fatal_error(c, __VA_ARGS__); \
- _ret; \
-})
-
-/*
- * IO errors: either recoverable metadata IO (because we have replicas), or data
- * IO - we need to log it and print out a message, but we don't (necessarily)
- * want to shut down the fs:
- */
-
-void bch2_io_error_work(struct work_struct *);
-
-/* Does the error handling without logging a message */
-void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-void bch2_latency_acct(struct bch_dev *, u64, int);
-#else
-static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
-#endif
-
-static inline void bch2_account_io_success_fail(struct bch_dev *ca,
- enum bch_member_error_type type,
- bool success)
-{
- if (likely(success)) {
- if (type == BCH_MEMBER_ERROR_write &&
- ca->write_errors_start)
- ca->write_errors_start = 0;
- } else {
- bch2_io_error(ca, type);
- }
-}
-
-static inline void bch2_account_io_completion(struct bch_dev *ca,
- enum bch_member_error_type type,
- u64 submit_time, bool success)
-{
- if (unlikely(!ca))
- return;
-
- if (type != BCH_MEMBER_ERROR_checksum)
- bch2_latency_acct(ca, submit_time, type);
-
- bch2_account_io_success_fail(ca, type, success);
-}
-
-int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64);
-
-void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64);
-
-int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos);
-void bch2_inum_snap_offset_err_msg(struct bch_fs *, struct printbuf *, struct bpos);
-
-#endif /* _BCACHEFS_ERROR_H */
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
deleted file mode 100644
index 6aac579a692a..000000000000
--- a/fs/bcachefs/extent_update.c
+++ /dev/null
@@ -1,173 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "debug.h"
-#include "extents.h"
-#include "extent_update.h"
-
-/*
- * This counts the number of iterators to the alloc & ec btrees we'll need
- * when inserting/removing this extent:
- */
-static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- unsigned ret = 0, lru = 0;
-
- bkey_extent_entry_for_each(ptrs, entry) {
- switch (__extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- /* Might also be updating LRU btree */
- if (entry->ptr.cached)
- lru++;
-
- fallthrough;
- case BCH_EXTENT_ENTRY_stripe_ptr:
- ret++;
- }
- }
-
- /*
- * Updating keys in the alloc btree may also update keys in the
- * freespace or discard btrees:
- */
- return lru + ret * 2;
-}
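-
-/*
- * Worked example (editorial, not part of the original source): an extent
- * with two dirty pointers, one cached pointer and one stripe pointer gives
- * lru = 1 and ret = 4 (the cached pointer falls through and is counted too),
- * so this returns 1 + 4 * 2 = 9 potential iterators.
- */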
-
-static int count_iters_for_insert(struct btree_trans *trans,
- struct bkey_s_c k,
- unsigned offset,
- struct bpos *end,
- unsigned *nr_iters,
- unsigned max_iters)
-{
- int ret = 0, ret2 = 0;
-
- if (*nr_iters >= max_iters) {
- *end = bpos_min(*end, k.k->p);
- ret = 1;
- }
-
- switch (k.k->type) {
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- *nr_iters += bch2_bkey_nr_alloc_ptrs(k);
-
- if (*nr_iters >= max_iters) {
- *end = bpos_min(*end, k.k->p);
- ret = 1;
- }
-
- break;
- case KEY_TYPE_reflink_p: {
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- u64 idx = REFLINK_P_IDX(p.v);
- unsigned sectors = bpos_min(*end, p.k->p).offset -
- bkey_start_offset(p.k);
- struct btree_iter iter;
- struct bkey_s_c r_k;
-
- for_each_btree_key_norestart(trans, iter,
- BTREE_ID_reflink, POS(0, idx + offset),
- BTREE_ITER_slots, r_k, ret2) {
- if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors)))
- break;
-
- /* extent_update_to_keys(), for the reflink_v update */
- *nr_iters += 1;
-
- *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k);
-
- if (*nr_iters >= max_iters) {
- struct bpos pos = bkey_start_pos(k.k);
- pos.offset += min_t(u64, k.k->size,
- r_k.k->p.offset - idx);
-
- *end = bpos_min(*end, pos);
- ret = 1;
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- break;
- }
- }
-
- return ret2 ?: ret;
-}
-
-#define EXTENT_ITERS_MAX (BTREE_ITER_INITIAL / 3)
-
-int bch2_extent_atomic_end(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *insert,
- struct bpos *end)
-{
- struct btree_iter copy;
- struct bkey_s_c k;
- unsigned nr_iters = 0;
- int ret;
-
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- return ret;
-
- *end = insert->k.p;
-
- /* extent_update_to_keys(): */
- nr_iters += 1;
-
- ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end,
- &nr_iters, EXTENT_ITERS_MAX / 2);
- if (ret < 0)
- return ret;
-
- bch2_trans_copy_iter(&copy, iter);
-
- for_each_btree_key_max_continue_norestart(copy, insert->k.p, 0, k, ret) {
- unsigned offset = 0;
-
- if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k)))
- offset = bkey_start_offset(&insert->k) -
- bkey_start_offset(k.k);
-
- /* extent_handle_overwrites(): */
- switch (bch2_extent_overlap(&insert->k, k.k)) {
- case BCH_EXTENT_OVERLAP_ALL:
- case BCH_EXTENT_OVERLAP_FRONT:
- nr_iters += 1;
- break;
- case BCH_EXTENT_OVERLAP_BACK:
- case BCH_EXTENT_OVERLAP_MIDDLE:
- nr_iters += 2;
- break;
- }
-
- ret = count_iters_for_insert(trans, k, offset, end,
- &nr_iters, EXTENT_ITERS_MAX);
- if (ret)
- break;
- }
-
- bch2_trans_iter_exit(trans, &copy);
- return ret < 0 ? ret : 0;
-}
-
-int bch2_extent_trim_atomic(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *k)
-{
- struct bpos end;
- int ret;
-
- ret = bch2_extent_atomic_end(trans, iter, k, &end);
- if (ret)
- return ret;
-
- bch2_cut_back(end, k);
- return 0;
-}
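-
-/*
- * Net effect (a sketch of the mechanism): an insert that would touch more
- * alloc/reflink keys than EXTENT_ITERS_MAX allows is cut back at the point
- * where the budget runs out, so the transaction stays within its iterator
- * limits and callers retry the remainder in a following transaction.
- */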
diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h
deleted file mode 100644
index 6f5cf449361a..000000000000
--- a/fs/bcachefs/extent_update.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EXTENT_UPDATE_H
-#define _BCACHEFS_EXTENT_UPDATE_H
-
-#include "bcachefs.h"
-
-int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, struct bpos *);
-int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *,
- struct bkey_i *);
-
-#endif /* _BCACHEFS_EXTENT_UPDATE_H */
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
deleted file mode 100644
index ae1a1d917805..000000000000
--- a/fs/bcachefs/extents.c
+++ /dev/null
@@ -1,1662 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
- *
- * Code for managing the extent btree and dynamically updating the writeback
- * dirty sector count.
- */
-
-#include "bcachefs.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_gc.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "compress.h"
-#include "debug.h"
-#include "disk_groups.h"
-#include "error.h"
-#include "extents.h"
-#include "inode.h"
-#include "journal.h"
-#include "rebalance.h"
-#include "replicas.h"
-#include "super.h"
-#include "super-io.h"
-#include "trace.h"
-#include "util.h"
-
-static const char * const bch2_extent_flags_strs[] = {
-#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n,
- BCH_EXTENT_FLAGS()
-#undef x
- NULL,
-};
-
-static unsigned bch2_crc_field_size_max[] = {
- [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
- [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
- [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
-};
-
-static void bch2_extent_crc_pack(union bch_extent_crc *,
- struct bch_extent_crc_unpacked,
- enum bch_extent_entry_type);
-
-struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
- unsigned dev)
-{
- struct bch_dev_io_failures *i;
-
- for (i = f->devs; i < f->devs + f->nr; i++)
- if (i->dev == dev)
- return i;
-
- return NULL;
-}
-
-void bch2_mark_io_failure(struct bch_io_failures *failed,
- struct extent_ptr_decoded *p,
- bool csum_error)
-{
- struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);
-
- if (!f) {
- BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
-
- f = &failed->devs[failed->nr++];
- memset(f, 0, sizeof(*f));
- f->dev = p->ptr.dev;
- }
-
- if (p->do_ec_reconstruct)
- f->failed_ec = true;
- else if (!csum_error)
- f->failed_io = true;
- else
- f->failed_csum_nr++;
-}
-
-static inline u64 dev_latency(struct bch_dev *ca)
-{
- return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
-}
-
-static inline int dev_failed(struct bch_dev *ca)
-{
- return !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
-}
-
-/*
- * returns true if p1 is better than p2:
- */
-static inline bool ptr_better(struct bch_fs *c,
- const struct extent_ptr_decoded p1,
- u64 p1_latency,
- struct bch_dev *ca1,
- const struct extent_ptr_decoded p2,
- u64 p2_latency)
-{
- struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
-
- int failed_delta = dev_failed(ca1) - dev_failed(ca2);
- if (unlikely(failed_delta))
- return failed_delta < 0;
-
- if (unlikely(bch2_force_reconstruct_read))
- return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
-
- if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
- return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
-
- int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr;
- if (unlikely(crc_retry_delta))
- return crc_retry_delta < 0;
-
- /* Pick at random, biased in favor of the faster device: */
-
- return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency;
-}
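-
-/*
- * Worked example for the random pick above (illustrative numbers): callers
- * pass latencies that have already been squared, so raw latencies of 1ms vs
- * 3ms arrive as weights 1 and 9, and the slower device is still chosen a
- * tenth of the time - enough to keep its latency numbers fresh - instead of
- * the quarter it would get unsquared.
- */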
-
-/*
- * This picks the best non-stale pointer to read from. If @dev is >= 0, only
- * pointers on that device are considered. Returns 1 if a pick was made, 0 if
- * the extent is a hole or unwritten (read as zeroes), or a negative error if
- * no usable pointer was found.
- */
-int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
- struct bch_io_failures *failed,
- struct extent_ptr_decoded *pick,
- int dev)
-{
- bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false;
- bool have_dirty_ptrs = false, have_pick = false;
-
- if (k.k->type == KEY_TYPE_error)
- return -BCH_ERR_key_type_error;
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
- return -BCH_ERR_extent_poisened;
-
- rcu_read_lock();
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- u64 pick_latency;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- have_dirty_ptrs |= !p.ptr.cached;
-
- /*
- * Unwritten extent: no need to actually read, treat it as a
- * hole and return 0s:
- */
- if (p.ptr.unwritten) {
- rcu_read_unlock();
- return 0;
- }
-
- /* Are we being asked to read from a specific device? */
- if (dev >= 0 && p.ptr.dev != dev)
- continue;
-
- struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
-
- if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
- continue;
-
- struct bch_dev_io_failures *f =
- unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
- if (unlikely(f)) {
- p.crc_retry_nr = f->failed_csum_nr;
- p.has_ec &= ~f->failed_ec;
-
- if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
- have_io_errors |= f->failed_io;
- have_io_errors |= f->failed_ec;
- }
- have_csum_errors |= !!f->failed_csum_nr;
-
- if (p.has_ec && (f->failed_io || f->failed_csum_nr))
- p.do_ec_reconstruct = true;
- else if (f->failed_io ||
- f->failed_csum_nr > c->opts.checksum_err_retry_nr)
- continue;
- }
-
- have_missing_devs |= ca && !bch2_dev_is_online(ca);
-
- if (!ca || !bch2_dev_is_online(ca)) {
- if (!p.has_ec)
- continue;
- p.do_ec_reconstruct = true;
- }
-
- if (bch2_force_reconstruct_read && p.has_ec)
- p.do_ec_reconstruct = true;
-
- u64 p_latency = dev_latency(ca);
- /*
- * Square the latencies, to bias more in favor of the faster
- * device - we never want to stop issuing reads to the slower
- * device altogether, so that we can update our latency numbers:
- */
- p_latency *= p_latency;
-
- if (!have_pick ||
- ptr_better(c,
- p, p_latency, ca,
- *pick, pick_latency)) {
- *pick = p;
- pick_latency = p_latency;
- have_pick = true;
- }
- }
- rcu_read_unlock();
-
- if (have_pick)
- return 1;
- if (!have_dirty_ptrs)
- return 0;
- if (have_missing_devs)
- return -BCH_ERR_no_device_to_read_from;
- if (have_csum_errors)
- return -BCH_ERR_data_read_csum_err;
- if (have_io_errors)
- return -BCH_ERR_data_read_io_err;
-
- WARN_ONCE(1, "unhandled error case in %s\n", __func__);
- return -EINVAL;
-}
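-
-/*
- * Failure ordering above, from most to least specific: a dirty pointer on an
- * offline device beats checksum errors, which beat plain IO errors, so the
- * caller sees the error that best explains why every retry path was
- * exhausted.
- */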
-
-/* KEY_TYPE_btree_ptr: */
-
-int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX,
- c, btree_ptr_val_too_big,
- "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
-
- ret = bch2_bkey_ptrs_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- bch2_bkey_ptrs_to_text(out, c, k);
-}
-
-int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
- int ret = 0;
-
- bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX,
- c, btree_ptr_v2_val_too_big,
- "value too big (%zu > %zu)",
- bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
-
- bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p),
- c, btree_ptr_v2_min_key_bad,
- "min_key > key");
-
- if ((from.flags & BCH_VALIDATE_write) &&
- c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written)
- bkey_fsck_err_on(!bp.v->sectors_written,
- c, btree_ptr_v2_written_0,
- "sectors_written == 0");
-
- ret = bch2_bkey_ptrs_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
-
- prt_printf(out, "seq %llx written %u min_key %s",
- le64_to_cpu(bp.v->seq),
- le16_to_cpu(bp.v->sectors_written),
- BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");
-
- bch2_bpos_to_text(out, bp.v->min_key);
- prt_printf(out, " ");
- bch2_bkey_ptrs_to_text(out, c, k);
-}
-
-void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
- unsigned big_endian, int write,
- struct bkey_s k)
-{
- struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k);
-
- compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);
-
- if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id_is_extents(btree_id) &&
- !bkey_eq(bp.v->min_key, POS_MIN))
- bp.v->min_key = write
- ? bpos_nosnap_predecessor(bp.v->min_key)
- : bpos_nosnap_successor(bp.v->min_key);
-}
-
-/* KEY_TYPE_extent: */
-
-bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
-{
- struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l);
- struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r);
- union bch_extent_entry *en_l;
- const union bch_extent_entry *en_r;
- struct extent_ptr_decoded lp, rp;
- bool use_right_ptr;
-
- en_l = l_ptrs.start;
- en_r = r_ptrs.start;
- while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
- if (extent_entry_type(en_l) != extent_entry_type(en_r))
- return false;
-
- en_l = extent_entry_next(en_l);
- en_r = extent_entry_next(en_r);
- }
-
- if (en_l < l_ptrs.end || en_r < r_ptrs.end)
- return false;
-
- en_l = l_ptrs.start;
- en_r = r_ptrs.start;
- lp.crc = bch2_extent_crc_unpack(l.k, NULL);
- rp.crc = bch2_extent_crc_unpack(r.k, NULL);
-
- while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) &&
- __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) {
- if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size !=
- rp.ptr.offset + rp.crc.offset ||
- lp.ptr.dev != rp.ptr.dev ||
- lp.ptr.gen != rp.ptr.gen ||
- lp.ptr.unwritten != rp.ptr.unwritten ||
- lp.has_ec != rp.has_ec)
- return false;
-
- /* Extents may not straddle buckets: */
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev);
- bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr);
- rcu_read_unlock();
-
- if (!same_bucket)
- return false;
-
- if (lp.has_ec != rp.has_ec ||
- (lp.has_ec &&
- (lp.ec.block != rp.ec.block ||
- lp.ec.redundancy != rp.ec.redundancy ||
- lp.ec.idx != rp.ec.idx)))
- return false;
-
- if (lp.crc.compression_type != rp.crc.compression_type ||
- lp.crc.nonce != rp.crc.nonce)
- return false;
-
- if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
- lp.crc.uncompressed_size) {
- /* can use left extent's crc entry */
- } else if (lp.crc.live_size <= rp.crc.offset) {
- /* can use right extent's crc entry */
- } else {
- /* check if checksums can be merged: */
- if (lp.crc.csum_type != rp.crc.csum_type ||
- lp.crc.nonce != rp.crc.nonce ||
- crc_is_compressed(lp.crc) ||
- !bch2_checksum_mergeable(lp.crc.csum_type))
- return false;
-
- if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size ||
- rp.crc.offset)
- return false;
-
- if (lp.crc.csum_type &&
- lp.crc.uncompressed_size +
- rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
- return false;
- }
-
- en_l = extent_entry_next(en_l);
- en_r = extent_entry_next(en_r);
- }
-
- en_l = l_ptrs.start;
- en_r = r_ptrs.start;
- while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
- if (extent_entry_is_crc(en_l)) {
- struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
- struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
-
- if (crc_l.uncompressed_size + crc_r.uncompressed_size >
- bch2_crc_field_size_max[extent_entry_type(en_l)])
- return false;
- }
-
- en_l = extent_entry_next(en_l);
- en_r = extent_entry_next(en_r);
- }
-
- use_right_ptr = false;
- en_l = l_ptrs.start;
- en_r = r_ptrs.start;
- while (en_l < l_ptrs.end) {
- if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr &&
- use_right_ptr)
- en_l->ptr = en_r->ptr;
-
- if (extent_entry_is_crc(en_l)) {
- struct bch_extent_crc_unpacked crc_l =
- bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
- struct bch_extent_crc_unpacked crc_r =
- bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
-
- use_right_ptr = false;
-
- if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
- crc_l.uncompressed_size) {
- /* can use left extent's crc entry */
- } else if (crc_l.live_size <= crc_r.offset) {
- /* can use right extent's crc entry */
- crc_r.offset -= crc_l.live_size;
- bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
- extent_entry_type(en_l));
- use_right_ptr = true;
- } else {
- crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
- crc_l.csum,
- crc_r.csum,
- crc_r.uncompressed_size << 9);
-
- crc_l.uncompressed_size += crc_r.uncompressed_size;
- crc_l.compressed_size += crc_r.compressed_size;
- bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
- extent_entry_type(en_l));
- }
- }
-
- en_l = extent_entry_next(en_l);
- en_r = extent_entry_next(en_r);
- }
-
- bch2_key_resize(l.k, l.k->size + r.k->size);
- return true;
-}
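-
-/*
- * Merge example (illustrative numbers, uncompressed and unchecksummed): two
- * adjacent 8 sector extents whose single pointers share dev and gen, with
- * offsets 100 and 108, satisfy lp.ptr.offset + lp.crc.offset +
- * lp.crc.live_size == rp.ptr.offset + rp.crc.offset (100 + 0 + 8 == 108 + 0);
- * assuming both ranges fall in one bucket, they merge into a single 16
- * sector extent.
- */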
-
-/* KEY_TYPE_reservation: */
-
-int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
- int ret = 0;
-
- bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX,
- c, reservation_key_nr_replicas_invalid,
- "invalid nr_replicas (%u)", r.v->nr_replicas);
-fsck_err:
- return ret;
-}
-
-void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
-
- prt_printf(out, "generation %u replicas %u",
- le32_to_cpu(r.v->generation),
- r.v->nr_replicas);
-}
-
-bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
-{
- struct bkey_s_reservation l = bkey_s_to_reservation(_l);
- struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r);
-
- if (l.v->generation != r.v->generation ||
- l.v->nr_replicas != r.v->nr_replicas)
- return false;
-
- bch2_key_resize(l.k, l.k->size + r.k->size);
- return true;
-}
-
-/* Extent checksum entries: */
-
-/* returns true if not equal */
-static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
- struct bch_extent_crc_unpacked r)
-{
- return (l.csum_type != r.csum_type ||
- l.compression_type != r.compression_type ||
- l.compressed_size != r.compressed_size ||
- l.uncompressed_size != r.uncompressed_size ||
- l.offset != r.offset ||
- l.live_size != r.live_size ||
- l.nonce != r.nonce ||
- bch2_crc_cmp(l.csum, r.csum));
-}
-
-static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
- struct bch_extent_crc_unpacked n)
-{
- return !crc_is_compressed(u) &&
- u.csum_type &&
- u.uncompressed_size > u.live_size &&
- bch2_csum_type_is_encryption(u.csum_type) ==
- bch2_csum_type_is_encryption(n.csum_type);
-}
-
-bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
- struct bch_extent_crc_unpacked n)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_extent_crc_unpacked crc;
- const union bch_extent_entry *i;
-
- if (!n.csum_type)
- return false;
-
- bkey_for_each_crc(k.k, ptrs, crc, i)
- if (can_narrow_crc(crc, n))
- return true;
-
- return false;
-}
-
-/*
- * We're writing another replica for this extent, so while we've got the data in
- * memory we'll be computing a new checksum for the currently live data.
- *
- * If there are other replicas we aren't moving, and they are checksummed but
- * not compressed, we can modify them to point to only the data that is
- * currently live (so that readers won't have to bounce) while we've got the
- * checksum we need:
- */
-bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
- struct bch_extent_crc_unpacked u;
- struct extent_ptr_decoded p;
- union bch_extent_entry *i;
- bool ret = false;
-
- /* Find a checksum entry that covers only live data: */
- if (!n.csum_type) {
- bkey_for_each_crc(&k->k, ptrs, u, i)
- if (!crc_is_compressed(u) &&
- u.csum_type &&
- u.live_size == u.uncompressed_size) {
- n = u;
- goto found;
- }
- return false;
- }
-found:
- BUG_ON(crc_is_compressed(n));
- BUG_ON(n.offset);
- BUG_ON(n.live_size != k->k.size);
-
-restart_narrow_pointers:
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
-
- bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
- if (can_narrow_crc(p.crc, n)) {
- bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
- p.ptr.offset += p.crc.offset;
- p.crc = n;
- bch2_extent_ptr_decoded_append(k, &p);
- ret = true;
- goto restart_narrow_pointers;
- }
-
- return ret;
-}
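-
-/*
- * Narrowing example (illustrative numbers): a 128 sector checksummed extent
- * trimmed down to 8 live sectors forces readers to checksum - and bounce -
- * all 128 sectors. Once a replica write has computed a checksum over just
- * the 8 live sectors, the entry can be rewritten to cover only those, with
- * ptr.offset advanced by the old crc.offset.
- */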
-
-static void bch2_extent_crc_pack(union bch_extent_crc *dst,
- struct bch_extent_crc_unpacked src,
- enum bch_extent_entry_type type)
-{
-#define common_fields(_src) \
- .type = BIT(type), \
- .csum_type = _src.csum_type, \
- .compression_type = _src.compression_type, \
- ._compressed_size = _src.compressed_size - 1, \
- ._uncompressed_size = _src.uncompressed_size - 1, \
- .offset = _src.offset
-
- switch (type) {
- case BCH_EXTENT_ENTRY_crc32:
- dst->crc32 = (struct bch_extent_crc32) {
- common_fields(src),
- .csum = (u32 __force) *((__le32 *) &src.csum.lo),
- };
- break;
- case BCH_EXTENT_ENTRY_crc64:
- dst->crc64 = (struct bch_extent_crc64) {
- common_fields(src),
- .nonce = src.nonce,
- .csum_lo = (u64 __force) src.csum.lo,
- .csum_hi = (u64 __force) *((__le16 *) &src.csum.hi),
- };
- break;
- case BCH_EXTENT_ENTRY_crc128:
- dst->crc128 = (struct bch_extent_crc128) {
- common_fields(src),
- .nonce = src.nonce,
- .csum = src.csum,
- };
- break;
- default:
- BUG();
- }
-#undef common_fields
-}
-
-void bch2_extent_crc_append(struct bkey_i *k,
- struct bch_extent_crc_unpacked new)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
- union bch_extent_crc *crc = (void *) ptrs.end;
- enum bch_extent_entry_type type;
-
- if (bch_crc_bytes[new.csum_type] <= 4 &&
- new.uncompressed_size <= CRC32_SIZE_MAX &&
- new.nonce <= CRC32_NONCE_MAX)
- type = BCH_EXTENT_ENTRY_crc32;
- else if (bch_crc_bytes[new.csum_type] <= 10 &&
- new.uncompressed_size <= CRC64_SIZE_MAX &&
- new.nonce <= CRC64_NONCE_MAX)
- type = BCH_EXTENT_ENTRY_crc64;
- else if (bch_crc_bytes[new.csum_type] <= 16 &&
- new.uncompressed_size <= CRC128_SIZE_MAX &&
- new.nonce <= CRC128_NONCE_MAX)
- type = BCH_EXTENT_ENTRY_crc128;
- else
- BUG();
-
- bch2_extent_crc_pack(crc, new, type);
-
- k->k.u64s += extent_entry_u64s(ptrs.end);
-
- EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
-}
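-
-/*
- * Entry size selection above, smallest first: e.g. a 4 byte crc32c checksum
- * on a small extent with no nonce packs into a crc32 entry, an 8 byte
- * checksum or a larger extent falls through to crc64, and only 16 byte
- * (encryption grade) checksums or very large sizes/nonces need the full
- * crc128 entry.
- */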
-
-/* Generic code for keys with pointers: */
-
-unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
-{
- return bch2_bkey_devs(k).nr;
-}
-
-unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
-{
- return k.k->type == KEY_TYPE_reservation
- ? bkey_s_c_to_reservation(k).v->nr_replicas
- : bch2_bkey_dirty_devs(k).nr;
-}
-
-unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
-{
- unsigned ret = 0;
-
- if (k.k->type == KEY_TYPE_reservation) {
- ret = bkey_s_c_to_reservation(k).v->nr_replicas;
- } else {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- ret += !p.ptr.cached && !crc_is_compressed(p.crc);
- }
-
- return ret;
-}
-
-unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned ret = 0;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (!p.ptr.cached && crc_is_compressed(p.crc))
- ret += p.crc.compressed_size;
-
- return ret;
-}
-
-bool bch2_bkey_is_incompressible(struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct bch_extent_crc_unpacked crc;
-
- bkey_for_each_crc(k.k, ptrs, crc, entry)
- if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
- return true;
- return false;
-}
-
-unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p = { 0 };
- unsigned replicas = 0;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.ptr.cached)
- continue;
-
- if (p.has_ec)
- replicas += p.ec.redundancy;
-
- replicas++;
-
- }
-
- return replicas;
-}
-
-static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p)
-{
- if (p->ptr.cached)
- return 0;
-
- return p->has_ec
- ? p->ec.redundancy + 1
- : ca->mi.durability;
-}
-
-unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
-{
- struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
-
- return ca ? __extent_ptr_durability(ca, p) : 0;
-}
-
-unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
-{
- struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
-
- if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
- return 0;
-
- return __extent_ptr_durability(ca, p);
-}
-
-unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned durability = 0;
-
- rcu_read_lock();
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- durability += bch2_extent_ptr_durability(c, &p);
- rcu_read_unlock();
-
- return durability;
-}
-
-static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned durability = 0;
-
- rcu_read_lock();
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
- durability += bch2_extent_ptr_durability(c, &p);
- rcu_read_unlock();
-
- return durability;
-}
-
-void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
-{
- union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
- union bch_extent_entry *next = extent_entry_next(entry);
-
- memmove_u64s(entry, next, (u64 *) end - (u64 *) next);
- k->k.u64s -= extent_entry_u64s(entry);
-}
-
-void bch2_extent_ptr_decoded_append(struct bkey_i *k,
- struct extent_ptr_decoded *p)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
- struct bch_extent_crc_unpacked crc =
- bch2_extent_crc_unpack(&k->k, NULL);
- union bch_extent_entry *pos;
-
- if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
- pos = ptrs.start;
- goto found;
- }
-
- bkey_for_each_crc(&k->k, ptrs, crc, pos)
- if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
- pos = extent_entry_next(pos);
- goto found;
- }
-
- bch2_extent_crc_append(k, p->crc);
- pos = bkey_val_end(bkey_i_to_s(k));
-found:
- p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
- __extent_entry_insert(k, pos, to_entry(&p->ptr));
-
- if (p->has_ec) {
- p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
- __extent_entry_insert(k, pos, to_entry(&p->ec));
- }
-}
-
-static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
- union bch_extent_entry *entry)
-{
- union bch_extent_entry *i = ptrs.start;
-
- if (i == entry)
- return NULL;
-
- while (extent_entry_next(i) != entry)
- i = extent_entry_next(i);
- return i;
-}
-
-/*
- * Drop a pointer from a key, also dropping any crc/stripe entries that no
- * other pointer still needs:
- */
-void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
- union bch_extent_entry *entry = to_entry(ptr), *next;
- bool drop_crc = true;
-
- if (k.k->type == KEY_TYPE_stripe) {
- ptr->dev = BCH_SB_MEMBER_INVALID;
- return;
- }
-
- EBUG_ON(ptr < &ptrs.start->ptr ||
- ptr >= &ptrs.end->ptr);
- EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
-
- for (next = extent_entry_next(entry);
- next != ptrs.end;
- next = extent_entry_next(next)) {
- if (extent_entry_is_crc(next)) {
- break;
- } else if (extent_entry_is_ptr(next)) {
- drop_crc = false;
- break;
- }
- }
-
- extent_entry_drop(k, entry);
-
- while ((entry = extent_entry_prev(ptrs, entry))) {
- if (extent_entry_is_ptr(entry))
- break;
-
- if ((extent_entry_is_crc(entry) && drop_crc) ||
- extent_entry_is_stripe_ptr(entry))
- extent_entry_drop(k, entry);
- }
-}
-
-void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr)
-{
- if (k.k->type != KEY_TYPE_stripe) {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (p.ptr.dev == ptr->dev && p.has_ec) {
- ptr->dev = BCH_SB_MEMBER_INVALID;
- return;
- }
- }
-
- bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
-
- bch2_bkey_drop_ptr_noerror(k, ptr);
-
- /*
- * If we deleted all the dirty pointers and there's still cached
- * pointers, we could set the cached pointers to dirty if they're not
- * stale - but to do that correctly we'd need to grab an open_bucket
- * reference so that we don't race with bucket reuse:
- */
- if (have_dirty &&
- !bch2_bkey_dirty_devs(k.s_c).nr) {
- k.k->type = KEY_TYPE_error;
- set_bkey_val_u64s(k.k, 0);
- } else if (!bch2_bkey_nr_ptrs(k.s_c)) {
- k.k->type = KEY_TYPE_deleted;
- set_bkey_val_u64s(k.k, 0);
- }
-}
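-
-/*
- * Note the two terminal states above: dropping the last dirty pointer while
- * cached pointers remain turns the key into KEY_TYPE_error (the data is
- * gone, reads must fail), while dropping the last pointer of any kind turns
- * it into KEY_TYPE_deleted (a hole).
- */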
-
-void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
-{
- bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
-}
-
-void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
-{
- bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev);
-}
-
-const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(ptrs, ptr)
- if (ptr->dev == dev)
- return ptr;
-
- return NULL;
-}
-
-bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_dev *ca;
- bool ret = false;
-
- rcu_read_lock();
- bkey_for_each_ptr(ptrs, ptr)
- if (bch2_dev_in_target(c, ptr->dev, target) &&
- (ca = bch2_dev_rcu(c, ptr->dev)) &&
- (!ptr->cached ||
- !dev_ptr_stale_rcu(ca, ptr))) {
- ret = true;
- break;
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
- struct bch_extent_ptr m, u64 offset)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (p.ptr.dev == m.dev &&
- p.ptr.gen == m.gen &&
- (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
- (s64) m.offset - offset)
- return true;
-
- return false;
-}
-
-/*
- * Returns true if two extents refer to the same data:
- */
-bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
-{
- if (k1.k->type != k2.k->type)
- return false;
-
- if (bkey_extent_is_direct_data(k1.k)) {
- struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
- struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
- const union bch_extent_entry *entry1, *entry2;
- struct extent_ptr_decoded p1, p2;
-
- if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2))
- return false;
-
- bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
- bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
- if (p1.ptr.dev == p2.ptr.dev &&
- p1.ptr.gen == p2.ptr.gen &&
-
- /*
- * This checks that the two pointers point
- * to the same region on disk - adjusting
- * for the difference in where the extents
- * start, since one may have been trimmed:
- */
- (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
- (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&
-
- /*
- * This additionally checks that the
- * extents overlap on disk, since the
- * previous check may trigger spuriously
- * when one extent is immediately partially
- * overwritten with another extent (so that
- * on disk they are adjacent) and
- * compression is in use:
- */
- ((p1.ptr.offset >= p2.ptr.offset &&
- p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) ||
- (p2.ptr.offset >= p1.ptr.offset &&
- p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size)))
- return true;
-
- return false;
- } else {
- /* KEY_TYPE_deleted, etc. */
- return true;
- }
-}
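-
-/*
- * Matching example (illustrative numbers): k1 covers sectors [0, 16) with
- * ptr.offset 100; k2 is the same extent trimmed to [4, 16), its pointer
- * advanced to 104. Both map logical sector s to disk sector 100 + s
- * (100 + 0 - 0 == 104 + 0 - 4) and their on-disk ranges overlap, so they
- * refer to the same data.
- */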
-
-struct bch_extent_ptr *
-bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)
-{
- struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2);
- union bch_extent_entry *entry2;
- struct extent_ptr_decoded p2;
-
- bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
- if (p1.ptr.dev == p2.ptr.dev &&
- p1.ptr.gen == p2.ptr.gen &&
- (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
- (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
- return &entry2->ptr;
-
- return NULL;
-}
-
-static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
- struct bch_extent_ptr *ptr)
-{
- if (!opts->promote_target ||
- !bch2_dev_in_target(c, ptr->dev, opts->promote_target))
- return false;
-
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
-
- return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr);
-}
-
-void bch2_extent_ptr_set_cached(struct bch_fs *c,
- struct bch_io_opts *opts,
- struct bkey_s k,
- struct bch_extent_ptr *ptr)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
- union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- rcu_read_lock();
- if (!want_cached_ptr(c, opts, ptr)) {
- bch2_bkey_drop_ptr_noerror(k, ptr);
- goto out;
- }
-
- /*
-	 * Stripes can't contain cached data, for various reasons.
- *
- * Possibly something we can fix in the future?
- */
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (&entry->ptr == ptr) {
- if (p.has_ec)
- bch2_bkey_drop_ptr_noerror(k, ptr);
- else
- ptr->cached = true;
- goto out;
- }
-
- BUG();
-out:
- rcu_read_unlock();
-}
-
-/*
- * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
- *
- * Returns true if @k should be dropped entirely
- *
- * For existing keys, only called when btree nodes are being rewritten, not when
- * they're merely being compacted/resorted in memory.
- */
-bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
-{
- struct bch_dev *ca;
-
- rcu_read_lock();
- bch2_bkey_drop_ptrs(k, ptr,
- ptr->cached &&
- (!(ca = bch2_dev_rcu(c, ptr->dev)) ||
- dev_ptr_stale_rcu(ca, ptr) > 0));
- rcu_read_unlock();
-
- return bkey_deleted(k.k);
-}
-
-/*
- * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc.
- *
- * Like bch2_extent_normalize(), but also only keeps a single cached pointer on
- * the promote target.
- */
-bool bch2_extent_normalize_by_opts(struct bch_fs *c,
- struct bch_io_opts *opts,
- struct bkey_s k)
-{
- struct bkey_ptrs ptrs;
- bool have_cached_ptr;
-
- rcu_read_lock();
-restart_drop_ptrs:
- ptrs = bch2_bkey_ptrs(k);
- have_cached_ptr = false;
-
- bkey_for_each_ptr(ptrs, ptr)
- if (ptr->cached) {
- if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) {
- bch2_bkey_drop_ptr(k, ptr);
- goto restart_drop_ptrs;
- }
- have_cached_ptr = true;
- }
- rcu_read_unlock();
-
- return bkey_deleted(k.k);
-}
-
-void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
-{
- out->atomic++;
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
- if (!ca) {
- prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
- (u64) ptr->offset, ptr->gen,
- ptr->cached ? " cached" : "");
- } else {
- u32 offset;
- u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
-
- prt_printf(out, "ptr: %u:%llu:%u gen %u",
- ptr->dev, b, offset, ptr->gen);
- if (ca->mi.durability != 1)
- prt_printf(out, " d=%u", ca->mi.durability);
- if (ptr->cached)
- prt_str(out, " cached");
- if (ptr->unwritten)
- prt_str(out, " unwritten");
- int stale = dev_ptr_stale_rcu(ca, ptr);
- if (stale > 0)
- prt_printf(out, " stale");
- else if (stale)
- prt_printf(out, " invalid");
- }
- rcu_read_unlock();
- --out->atomic;
-}
-
-void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc)
-{
- prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
- crc->compressed_size,
- crc->uncompressed_size,
- crc->offset, crc->nonce);
- bch2_prt_csum_type(out, crc->csum_type);
- prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo);
- prt_str(out, " compress ");
- bch2_prt_compression_type(out, crc->compression_type);
-}
-
-static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
- const struct bch_extent_rebalance *r)
-{
- prt_str(out, "rebalance:");
-
- prt_printf(out, " replicas=%u", r->data_replicas);
- if (r->data_replicas_from_inode)
- prt_str(out, " (inode)");
-
- prt_str(out, " checksum=");
- bch2_prt_csum_opt(out, r->data_checksum);
- if (r->data_checksum_from_inode)
- prt_str(out, " (inode)");
-
- if (r->background_compression || r->background_compression_from_inode) {
- prt_str(out, " background_compression=");
- bch2_compression_opt_to_text(out, r->background_compression);
-
- if (r->background_compression_from_inode)
- prt_str(out, " (inode)");
- }
-
- if (r->background_target || r->background_target_from_inode) {
- prt_str(out, " background_target=");
- if (c)
- bch2_target_to_text(out, c, r->background_target);
- else
- prt_printf(out, "%u", r->background_target);
-
- if (r->background_target_from_inode)
- prt_str(out, " (inode)");
- }
-
- if (r->promote_target || r->promote_target_from_inode) {
- prt_str(out, " promote_target=");
- if (c)
- bch2_target_to_text(out, c, r->promote_target);
- else
- prt_printf(out, "%u", r->promote_target);
-
- if (r->promote_target_from_inode)
- prt_str(out, " (inode)");
- }
-
- if (r->erasure_code || r->erasure_code_from_inode) {
- prt_printf(out, " ec=%u", r->erasure_code);
- if (r->erasure_code_from_inode)
- prt_str(out, " (inode)");
- }
-}
-
-void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- bool first = true;
-
- if (c)
- prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));
-
- bkey_extent_entry_for_each(ptrs, entry) {
- if (!first)
- prt_printf(out, " ");
-
- switch (__extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry));
- break;
-
- case BCH_EXTENT_ENTRY_crc32:
- case BCH_EXTENT_ENTRY_crc64:
- case BCH_EXTENT_ENTRY_crc128: {
- struct bch_extent_crc_unpacked crc =
- bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
-
- bch2_extent_crc_unpacked_to_text(out, &crc);
- break;
- }
- case BCH_EXTENT_ENTRY_stripe_ptr: {
- const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;
-
- prt_printf(out, "ec: idx %llu block %u",
- (u64) ec->idx, ec->block);
- break;
- }
- case BCH_EXTENT_ENTRY_rebalance:
- bch2_extent_rebalance_to_text(out, c, &entry->rebalance);
- break;
-
- case BCH_EXTENT_ENTRY_flags:
- prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags);
- break;
-
- default:
- prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
- return;
- }
-
- first = false;
- }
-}
-
-static int extent_ptr_validate(struct bch_fs *c,
- struct bkey_s_c k,
- struct bkey_validate_context from,
- const struct bch_extent_ptr *ptr,
- unsigned size_ondisk,
- bool metadata)
-{
- int ret = 0;
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- bkey_for_each_ptr(ptrs, ptr2)
- bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev,
- c, ptr_to_duplicate_device,
- "multiple pointers to same device (%u)", ptr->dev);
-
- /* bad pointers are repaired by check_fix_ptrs(): */
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
- if (!ca) {
- rcu_read_unlock();
- return 0;
- }
- u32 bucket_offset;
- u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
- unsigned first_bucket = ca->mi.first_bucket;
- u64 nbuckets = ca->mi.nbuckets;
- unsigned bucket_size = ca->mi.bucket_size;
- rcu_read_unlock();
-
- bkey_fsck_err_on(bucket >= nbuckets,
- c, ptr_after_last_bucket,
- "pointer past last bucket (%llu > %llu)", bucket, nbuckets);
- bkey_fsck_err_on(bucket < first_bucket,
- c, ptr_before_first_bucket,
- "pointer before first bucket (%llu < %u)", bucket, first_bucket);
- bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size,
- c, ptr_spans_multiple_buckets,
- "pointer spans multiple buckets (%u + %u > %u)",
- bucket_offset, size_ondisk, bucket_size);
-fsck_err:
- return ret;
-}
-
-int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct bch_extent_crc_unpacked crc;
- unsigned size_ondisk = k.k->size;
- unsigned nonce = UINT_MAX;
- unsigned nr_ptrs = 0;
- bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
- int ret = 0;
-
- if (bkey_is_btree_ptr(k.k))
- size_ondisk = btree_sectors(c);
-
- bkey_extent_entry_for_each(ptrs, entry) {
- bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX,
- c, extent_ptrs_invalid_entry,
- "invalid extent entry type (got %u, max %u)",
- __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
-
- bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
- !extent_entry_is_ptr(entry),
- c, btree_ptr_has_non_ptr,
- "has non ptr field");
-
- switch (extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false);
- if (ret)
- return ret;
-
- bkey_fsck_err_on(entry->ptr.cached && have_ec,
- c, ptr_cached_and_erasure_coded,
- "cached, erasure coded ptr");
-
- if (!entry->ptr.unwritten)
- have_written = true;
- else
- have_unwritten = true;
-
- have_ec = false;
- crc_since_last_ptr = false;
- nr_ptrs++;
- break;
- case BCH_EXTENT_ENTRY_crc32:
- case BCH_EXTENT_ENTRY_crc64:
- case BCH_EXTENT_ENTRY_crc128:
- crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
-
- bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type),
- c, ptr_crc_csum_type_unknown,
- "invalid checksum type");
- bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR,
- c, ptr_crc_compression_type_unknown,
- "invalid compression type");
-
- bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size,
- c, ptr_crc_uncompressed_size_too_small,
- "checksum offset + key size > uncompressed size");
- bkey_fsck_err_on(crc_is_encoded(crc) &&
- (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
- (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)),
- c, ptr_crc_uncompressed_size_too_big,
- "too large encoded extent");
- bkey_fsck_err_on(!crc_is_compressed(crc) &&
- crc.compressed_size != crc.uncompressed_size,
- c, ptr_crc_uncompressed_size_mismatch,
- "not compressed but compressed != uncompressed size");
-
- if (bch2_csum_type_is_encryption(crc.csum_type)) {
- if (nonce == UINT_MAX)
- nonce = crc.offset + crc.nonce;
- else if (nonce != crc.offset + crc.nonce)
- bkey_fsck_err(c, ptr_crc_nonce_mismatch,
- "incorrect nonce");
- }
-
- bkey_fsck_err_on(crc_since_last_ptr,
- c, ptr_crc_redundant,
- "redundant crc entry");
- crc_since_last_ptr = true;
-
- size_ondisk = crc.compressed_size;
- break;
- case BCH_EXTENT_ENTRY_stripe_ptr:
- bkey_fsck_err_on(have_ec,
- c, ptr_stripe_redundant,
- "redundant stripe entry");
- have_ec = true;
- break;
- case BCH_EXTENT_ENTRY_rebalance: {
- /*
- * this shouldn't be a fsck error, for forward
- * compatibility; the rebalance code should just refetch
- * the compression opt if it's unknown
- */
-#if 0
- const struct bch_extent_rebalance *r = &entry->rebalance;
-
- if (!bch2_compression_opt_valid(r->compression)) {
- struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
- prt_printf(err, "invalid compression opt %u:%u",
- opt.type, opt.level);
- return -BCH_ERR_invalid_bkey;
- }
-#endif
- break;
- }
- case BCH_EXTENT_ENTRY_flags:
- bkey_fsck_err_on(entry != ptrs.start,
- c, extent_flags_not_at_start,
- "extent flags entry not at start");
- break;
- }
- }
-
- bkey_fsck_err_on(!nr_ptrs,
- c, extent_ptrs_no_ptrs,
- "no ptrs");
- bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX,
- c, extent_ptrs_too_many_ptrs,
- "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
- bkey_fsck_err_on(have_written && have_unwritten,
- c, extent_ptrs_written_and_unwritten,
- "extent with unwritten and written ptrs");
- bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten,
- c, extent_ptrs_unwritten,
- "has unwritten ptrs");
- bkey_fsck_err_on(crc_since_last_ptr,
- c, extent_ptrs_redundant_crc,
- "redundant crc entry");
- bkey_fsck_err_on(have_ec,
- c, extent_ptrs_redundant_stripe,
- "redundant stripe entry");
-fsck_err:
- return ret;
-}
-
-void bch2_ptr_swab(struct bkey_s k)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
- union bch_extent_entry *entry;
- u64 *d;
-
- for (d = (u64 *) ptrs.start;
- d != (u64 *) ptrs.end;
- d++)
- *d = swab64(*d);
-
- for (entry = ptrs.start;
- entry < ptrs.end;
- entry = extent_entry_next(entry)) {
- switch (__extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- break;
- case BCH_EXTENT_ENTRY_crc32:
- entry->crc32.csum = swab32(entry->crc32.csum);
- break;
- case BCH_EXTENT_ENTRY_crc64:
- entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
- entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
- break;
- case BCH_EXTENT_ENTRY_crc128:
- entry->crc128.csum.hi = (__force __le64)
- swab64((__force u64) entry->crc128.csum.hi);
- entry->crc128.csum.lo = (__force __le64)
- swab64((__force u64) entry->crc128.csum.lo);
- break;
- case BCH_EXTENT_ENTRY_stripe_ptr:
- break;
- case BCH_EXTENT_ENTRY_rebalance:
- break;
- default:
- /* Bad entry type: will be caught by validate() */
- return;
- }
- }
-}
-
-int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags)
-{
- int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags);
- if (ret)
- return ret;
-
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
-
- if (ptrs.start != ptrs.end &&
- extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) {
- ptrs.start->flags.flags = flags;
- } else {
- struct bch_extent_flags f = {
- .type = BIT(BCH_EXTENT_ENTRY_flags),
- .flags = flags,
- };
- __extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f);
- }
-
- return 0;
-}
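-
-/*
- * The flags entry always lives at ptrs.start - which is exactly what the
- * extent_flags_not_at_start check in bch2_bkey_ptrs_validate() enforces.
- */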
-
-/* Generic extent code: */
-
-int bch2_cut_front_s(struct bpos where, struct bkey_s k)
-{
- unsigned new_val_u64s = bkey_val_u64s(k.k);
- int val_u64s_delta;
- u64 sub;
-
- if (bkey_le(where, bkey_start_pos(k.k)))
- return 0;
-
- EBUG_ON(bkey_gt(where, k.k->p));
-
- sub = where.offset - bkey_start_offset(k.k);
-
- k.k->size -= sub;
-
- if (!k.k->size) {
- k.k->type = KEY_TYPE_deleted;
- new_val_u64s = 0;
- }
-
- switch (k.k->type) {
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v: {
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
- union bch_extent_entry *entry;
- bool seen_crc = false;
-
- bkey_extent_entry_for_each(ptrs, entry) {
- switch (extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- if (!seen_crc)
- entry->ptr.offset += sub;
- break;
- case BCH_EXTENT_ENTRY_crc32:
- entry->crc32.offset += sub;
- break;
- case BCH_EXTENT_ENTRY_crc64:
- entry->crc64.offset += sub;
- break;
- case BCH_EXTENT_ENTRY_crc128:
- entry->crc128.offset += sub;
- break;
- case BCH_EXTENT_ENTRY_stripe_ptr:
- case BCH_EXTENT_ENTRY_rebalance:
- case BCH_EXTENT_ENTRY_flags:
- break;
- }
-
- if (extent_entry_is_crc(entry))
- seen_crc = true;
- }
-
- break;
- }
- case KEY_TYPE_reflink_p: {
- struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
-
- SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub);
- break;
- }
- case KEY_TYPE_inline_data:
- case KEY_TYPE_indirect_inline_data: {
- void *p = bkey_inline_data_p(k);
- unsigned bytes = bkey_inline_data_bytes(k.k);
-
- sub = min_t(u64, sub << 9, bytes);
-
- memmove(p, p + sub, bytes - sub);
-
- new_val_u64s -= sub >> 3;
- break;
- }
- }
-
- val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
- BUG_ON(val_u64s_delta < 0);
-
- set_bkey_val_u64s(k.k, new_val_u64s);
- memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
- return -val_u64s_delta;
-}
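-
-/*
- * Front-cut example (illustrative numbers): cutting [0, 16) at offset 4
- * shrinks the key to 12 sectors. A bare pointer has ptr.offset advanced by
- * 4; a checksummed pointer instead has crc.offset advanced, since the
- * checksum still covers the original 16 sectors on disk.
- */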
-
-int bch2_cut_back_s(struct bpos where, struct bkey_s k)
-{
- unsigned new_val_u64s = bkey_val_u64s(k.k);
- int val_u64s_delta;
- u64 len = 0;
-
- if (bkey_ge(where, k.k->p))
- return 0;
-
- EBUG_ON(bkey_lt(where, bkey_start_pos(k.k)));
-
- len = where.offset - bkey_start_offset(k.k);
-
- k.k->p.offset = where.offset;
- k.k->size = len;
-
- if (!len) {
- k.k->type = KEY_TYPE_deleted;
- new_val_u64s = 0;
- }
-
- switch (k.k->type) {
- case KEY_TYPE_inline_data:
- case KEY_TYPE_indirect_inline_data:
- new_val_u64s = (bkey_inline_data_offset(k.k) +
- min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3;
- break;
- }
-
- val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
- BUG_ON(val_u64s_delta < 0);
-
- set_bkey_val_u64s(k.k, new_val_u64s);
- memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
- return -val_u64s_delta;
-}
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
deleted file mode 100644
index e78a39e7e18f..000000000000
--- a/fs/bcachefs/extents.h
+++ /dev/null
@@ -1,772 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EXTENTS_H
-#define _BCACHEFS_EXTENTS_H
-
-#include "bcachefs.h"
-#include "bkey.h"
-#include "extents_types.h"
-
-struct bch_fs;
-struct btree_trans;
-
-/* extent entries: */
-
-#define extent_entry_last(_e) \
- ((typeof(&(_e).v->start[0])) bkey_val_end(_e))
-
-#define entry_to_ptr(_entry) \
-({ \
- EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \
- \
- __builtin_choose_expr( \
- type_is_exact(_entry, const union bch_extent_entry *), \
- (const struct bch_extent_ptr *) (_entry), \
- (struct bch_extent_ptr *) (_entry)); \
-})
-
-/* downcast, preserves const */
-#define to_entry(_entry) \
-({ \
- BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \
- !type_is(_entry, struct bch_extent_ptr *) && \
- !type_is(_entry, struct bch_extent_stripe_ptr *)); \
- \
- __builtin_choose_expr( \
- (type_is_exact(_entry, const union bch_extent_crc *) || \
- type_is_exact(_entry, const struct bch_extent_ptr *) ||\
- type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\
- (const union bch_extent_entry *) (_entry), \
- (union bch_extent_entry *) (_entry)); \
-})
-
-#define extent_entry_next(_entry) \
- ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
-
-#define extent_entry_next_safe(_entry, _end) \
- (likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX) \
- ? extent_entry_next(_entry) \
- : _end)
-
-static inline unsigned
-__extent_entry_type(const union bch_extent_entry *e)
-{
- return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX;
-}
-
-static inline enum bch_extent_entry_type
-extent_entry_type(const union bch_extent_entry *e)
-{
- int ret = __ffs(e->type);
-
- EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX);
-
- return ret;
-}
-
-static inline size_t extent_entry_bytes(const union bch_extent_entry *entry)
-{
- switch (extent_entry_type(entry)) {
-#define x(f, n) \
- case BCH_EXTENT_ENTRY_##f: \
- return sizeof(struct bch_extent_##f);
- BCH_EXTENT_ENTRY_TYPES()
-#undef x
- default:
- BUG();
- }
-}
-
-static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
-{
- return extent_entry_bytes(entry) / sizeof(u64);
-}
-
-static inline void __extent_entry_insert(struct bkey_i *k,
- union bch_extent_entry *dst,
- union bch_extent_entry *new)
-{
- union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
-
- memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new),
- dst, (u64 *) end - (u64 *) dst);
- k->k.u64s += extent_entry_u64s(new);
- memcpy_u64s_small(dst, new, extent_entry_u64s(new));
-}
-
-static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
-{
- union bch_extent_entry *next = extent_entry_next(entry);
-
- /* stripes have ptrs, but their layout doesn't work with this code */
- BUG_ON(k.k->type == KEY_TYPE_stripe);
-
- memmove_u64s_down(entry, next,
- (u64 *) bkey_val_end(k) - (u64 *) next);
- k.k->u64s -= (u64 *) next - (u64 *) entry;
-}
-
-static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
-{
- return __extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
-}
-
-static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e)
-{
- return __extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
-}
-
-static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
-{
- switch (__extent_entry_type(e)) {
- case BCH_EXTENT_ENTRY_crc32:
- case BCH_EXTENT_ENTRY_crc64:
- case BCH_EXTENT_ENTRY_crc128:
- return true;
- default:
- return false;
- }
-}
-
-union bch_extent_crc {
- u8 type;
- struct bch_extent_crc32 crc32;
- struct bch_extent_crc64 crc64;
- struct bch_extent_crc128 crc128;
-};
-
-#define __entry_to_crc(_entry) \
- __builtin_choose_expr( \
- type_is_exact(_entry, const union bch_extent_entry *), \
- (const union bch_extent_crc *) (_entry), \
- (union bch_extent_crc *) (_entry))
-
-#define entry_to_crc(_entry) \
-({ \
- EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \
- \
- __entry_to_crc(_entry); \
-})
-
-static inline struct bch_extent_crc_unpacked
-bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
-{
-#define common_fields(_crc) \
- .csum_type = _crc.csum_type, \
- .compression_type = _crc.compression_type, \
- .compressed_size = _crc._compressed_size + 1, \
- .uncompressed_size = _crc._uncompressed_size + 1, \
- .offset = _crc.offset, \
- .live_size = k->size
-
- if (!crc)
- return (struct bch_extent_crc_unpacked) {
- .compressed_size = k->size,
- .uncompressed_size = k->size,
- .live_size = k->size,
- };
-
- switch (extent_entry_type(to_entry(crc))) {
- case BCH_EXTENT_ENTRY_crc32: {
- struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
- common_fields(crc->crc32),
- };
-
- *((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum;
- return ret;
- }
- case BCH_EXTENT_ENTRY_crc64: {
- struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
- common_fields(crc->crc64),
- .nonce = crc->crc64.nonce,
- .csum.lo = (__force __le64) crc->crc64.csum_lo,
- };
-
- *((__le16 *) &ret.csum.hi) = (__le16 __force) crc->crc64.csum_hi;
-
- return ret;
- }
- case BCH_EXTENT_ENTRY_crc128: {
- struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
- common_fields(crc->crc128),
- .nonce = crc->crc128.nonce,
- .csum = crc->crc128.csum,
- };
-
- return ret;
- }
- default:
- BUG();
- }
-#undef common_fields
-}
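-
-/*
- * Packing note (from the field definitions above): _compressed_size and
- * _uncompressed_size are stored minus one, so a stored 0 decodes to one
- * sector and each crc variant can describe up to its CRC*_SIZE_MAX sectors
- * in a narrow bitfield.
- */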
-
-static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc)
-{
- return (crc.compression_type != BCH_COMPRESSION_TYPE_none &&
- crc.compression_type != BCH_COMPRESSION_TYPE_incompressible);
-}
-
-static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc)
-{
- return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc);
-}
-
-void bch2_extent_crc_unpacked_to_text(struct printbuf *, struct bch_extent_crc_unpacked *);
-
-/* bkey_ptrs: generically over any key type that has ptrs */
-
-struct bkey_ptrs_c {
- const union bch_extent_entry *start;
- const union bch_extent_entry *end;
-};
-
-struct bkey_ptrs {
- union bch_extent_entry *start;
- union bch_extent_entry *end;
-};
-
-static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr: {
- struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k);
-
- return (struct bkey_ptrs_c) {
- to_entry(&e.v->start[0]),
- to_entry(extent_entry_last(e))
- };
- }
- case KEY_TYPE_extent: {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-
- return (struct bkey_ptrs_c) {
- e.v->start,
- extent_entry_last(e)
- };
- }
- case KEY_TYPE_stripe: {
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
- return (struct bkey_ptrs_c) {
- to_entry(&s.v->ptrs[0]),
- to_entry(&s.v->ptrs[s.v->nr_blocks]),
- };
- }
- case KEY_TYPE_reflink_v: {
- struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
-
- return (struct bkey_ptrs_c) {
- r.v->start,
- bkey_val_end(r),
- };
- }
- case KEY_TYPE_btree_ptr_v2: {
- struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k);
-
- return (struct bkey_ptrs_c) {
- to_entry(&e.v->start[0]),
- to_entry(extent_entry_last(e))
- };
- }
- default:
- return (struct bkey_ptrs_c) { NULL, NULL };
- }
-}
-
-static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
-{
- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c);
-
- return (struct bkey_ptrs) {
- (void *) p.start,
- (void *) p.end
- };
-}
-
-#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \
- for ((_entry) = (_start); \
- (_entry) < (_end); \
- (_entry) = extent_entry_next_safe(_entry, _end))
-
-#define __bkey_ptr_next(_ptr, _end) \
-({ \
- typeof(_end) _entry; \
- \
- __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \
- if (extent_entry_is_ptr(_entry)) \
- break; \
- \
- _entry < (_end) ? entry_to_ptr(_entry) : NULL; \
-})
-
-#define bkey_extent_entry_for_each_from(_p, _entry, _start) \
- __bkey_extent_entry_for_each_from(_start, (_p).end, _entry)
-
-#define bkey_extent_entry_for_each(_p, _entry) \
- bkey_extent_entry_for_each_from(_p, _entry, _p.start)
-
-#define __bkey_for_each_ptr(_start, _end, _ptr) \
- for (typeof(_start) (_ptr) = (_start); \
- ((_ptr) = __bkey_ptr_next(_ptr, _end)); \
- (_ptr)++)
-
-#define bkey_ptr_next(_p, _ptr) \
- __bkey_ptr_next(_ptr, (_p).end)
-
-#define bkey_for_each_ptr(_p, _ptr) \
- __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr)
-
-#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \
-({ \
- __label__ out; \
- \
- (_ptr).has_ec = false; \
- (_ptr).do_ec_reconstruct = false; \
- (_ptr).crc_retry_nr = 0; \
- \
- __bkey_extent_entry_for_each_from(_entry, _end, _entry) \
- switch (__extent_entry_type(_entry)) { \
- case BCH_EXTENT_ENTRY_ptr: \
- (_ptr).ptr = _entry->ptr; \
- goto out; \
- case BCH_EXTENT_ENTRY_crc32: \
- case BCH_EXTENT_ENTRY_crc64: \
- case BCH_EXTENT_ENTRY_crc128: \
- (_ptr).crc = bch2_extent_crc_unpack(_k, \
- entry_to_crc(_entry)); \
- break; \
- case BCH_EXTENT_ENTRY_stripe_ptr: \
- (_ptr).ec = _entry->stripe_ptr; \
- (_ptr).has_ec = true; \
- break; \
- default: \
- /* nothing */ \
- break; \
- } \
-out: \
- _entry < (_end); \
-})
-
-#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \
- for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \
- (_entry) = _start; \
- __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \
- (_entry) = extent_entry_next_safe(_entry, _end))
-
-#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \
- __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \
- _ptr, _entry)
-
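A decode-iteration sketch (hypothetical helper): bkey_for_each_ptr_decode()
yields each pointer together with the unpacked crc that governs it, so
per-pointer checksum/compression state can be inspected directly.

static inline bool example_any_ptr_compressed(struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		if (p.crc.compression_type)
			return true;
	return false;
}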
-#define bkey_crc_next(_k, _end, _crc, _iter) \
-({ \
- __bkey_extent_entry_for_each_from(_iter, _end, _iter) \
- if (extent_entry_is_crc(_iter)) { \
- (_crc) = bch2_extent_crc_unpack(_k, \
- entry_to_crc(_iter)); \
- break; \
- } \
- \
- (_iter) < (_end); \
-})
-
-#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \
- for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \
- (_iter) = (_start); \
- bkey_crc_next(_k, _end, _crc, _iter); \
- (_iter) = extent_entry_next(_iter))
-
-#define bkey_for_each_crc(_k, _p, _crc, _iter) \
- __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter)
-
-/* Iterate over pointers in KEY_TYPE_extent: */
-
-#define extent_for_each_entry_from(_e, _entry, _start) \
- __bkey_extent_entry_for_each_from(_start, \
- extent_entry_last(_e), _entry)
-
-#define extent_for_each_entry(_e, _entry) \
- extent_for_each_entry_from(_e, _entry, (_e).v->start)
-
-#define extent_ptr_next(_e, _ptr) \
- __bkey_ptr_next(_ptr, extent_entry_last(_e))
-
-#define extent_for_each_ptr(_e, _ptr) \
- __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr)
-
-#define extent_for_each_ptr_decode(_e, _ptr, _entry) \
- __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \
- extent_entry_last(_e), _ptr, _entry)
-
-/* utility code common to all keys with pointers: */
-
-struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *,
- unsigned);
-void bch2_mark_io_failure(struct bch_io_failures *,
- struct extent_ptr_decoded *, bool);
-int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
- struct bch_io_failures *,
- struct extent_ptr_decoded *, int);
-
-/* KEY_TYPE_btree_ptr: */
-
-int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
-
-int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
- int, struct bkey_s);
-
-#define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \
- .key_validate = bch2_btree_ptr_validate, \
- .val_to_text = bch2_btree_ptr_to_text, \
- .swab = bch2_ptr_swab, \
- .trigger = bch2_trigger_extent, \
-})
-
-#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \
- .key_validate = bch2_btree_ptr_v2_validate, \
- .val_to_text = bch2_btree_ptr_v2_to_text, \
- .swab = bch2_ptr_swab, \
- .compat = bch2_btree_ptr_v2_compat, \
- .trigger = bch2_trigger_extent, \
- .min_val_size = 40, \
-})
-
-/* KEY_TYPE_extent: */
-
-bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-
-#define bch2_bkey_ops_extent ((struct bkey_ops) { \
- .key_validate = bch2_bkey_ptrs_validate, \
- .val_to_text = bch2_bkey_ptrs_to_text, \
- .swab = bch2_ptr_swab, \
- .key_normalize = bch2_extent_normalize, \
- .key_merge = bch2_extent_merge, \
- .trigger = bch2_trigger_extent, \
-})
-
-/* KEY_TYPE_reservation: */
-
-int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-
-#define bch2_bkey_ops_reservation ((struct bkey_ops) { \
- .key_validate = bch2_reservation_validate, \
- .val_to_text = bch2_reservation_to_text, \
- .key_merge = bch2_reservation_merge, \
- .trigger = bch2_trigger_reservation, \
- .min_val_size = 8, \
-})
-
-/* Extent checksum entries: */
-
-bool bch2_can_narrow_extent_crcs(struct bkey_s_c,
- struct bch_extent_crc_unpacked);
-bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked);
-void bch2_extent_crc_append(struct bkey_i *,
- struct bch_extent_crc_unpacked);
-
-/* Generic code for keys with pointers: */
-
-static inline bool bkey_is_btree_ptr(const struct bkey *k)
-{
- switch (k->type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- return true;
- default:
- return false;
- }
-}
-
-static inline bool bkey_extent_is_direct_data(const struct bkey *k)
-{
- switch (k->type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- return true;
- default:
- return false;
- }
-}
-
-static inline bool bkey_extent_is_inline_data(const struct bkey *k)
-{
- return k->type == KEY_TYPE_inline_data ||
- k->type == KEY_TYPE_indirect_inline_data;
-}
-
-static inline unsigned bkey_inline_data_offset(const struct bkey *k)
-{
- switch (k->type) {
- case KEY_TYPE_inline_data:
- return sizeof(struct bch_inline_data);
- case KEY_TYPE_indirect_inline_data:
- return sizeof(struct bch_indirect_inline_data);
- default:
- BUG();
- }
-}
-
-static inline unsigned bkey_inline_data_bytes(const struct bkey *k)
-{
- return bkey_val_bytes(k) - bkey_inline_data_offset(k);
-}
-
-#define bkey_inline_data_p(_k) (((void *) (_k).v) + bkey_inline_data_offset((_k).k))
-
-static inline bool bkey_extent_is_data(const struct bkey *k)
-{
- return bkey_extent_is_direct_data(k) ||
- bkey_extent_is_inline_data(k) ||
- k->type == KEY_TYPE_reflink_p;
-}
-
-/*
- * Should extent be counted under inode->i_sectors?
- */
-static inline bool bkey_extent_is_allocation(const struct bkey *k)
-{
- switch (k->type) {
- case KEY_TYPE_extent:
- case KEY_TYPE_reservation:
- case KEY_TYPE_reflink_p:
- case KEY_TYPE_reflink_v:
- case KEY_TYPE_inline_data:
- case KEY_TYPE_indirect_inline_data:
- case KEY_TYPE_error:
- return true;
- default:
- return false;
- }
-}
-
-static inline bool bkey_extent_is_unwritten(struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(ptrs, ptr)
- if (ptr->unwritten)
- return true;
- return false;
-}
-
-static inline bool bkey_extent_is_reservation(struct bkey_s_c k)
-{
- return k.k->type == KEY_TYPE_reservation ||
- bkey_extent_is_unwritten(k);
-}
-
-static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
-{
- struct bch_devs_list ret = (struct bch_devs_list) { 0 };
- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(p, ptr)
- ret.data[ret.nr++] = ptr->dev;
-
- return ret;
-}
-
-static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
-{
- struct bch_devs_list ret = (struct bch_devs_list) { 0 };
- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(p, ptr)
- if (!ptr->cached)
- ret.data[ret.nr++] = ptr->dev;
-
- return ret;
-}
-
-static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
-{
- struct bch_devs_list ret = (struct bch_devs_list) { 0 };
- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(p, ptr)
- if (ptr->cached)
- ret.data[ret.nr++] = ptr->dev;
-
- return ret;
-}
-
-unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
-unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
-unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
-bool bch2_bkey_is_incompressible(struct bkey_s_c);
-unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
-
-unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
-unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_decoded *);
-unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *);
-unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
-
-const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned);
-
-static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev)
-{
- return (void *) bch2_bkey_has_device_c(k.s_c, dev);
-}
-
-bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
-
-void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
-
-static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr)
-{
- struct bch_extent_ptr *dest;
-
- EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev));
-
- switch (k->k.type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- case KEY_TYPE_extent:
- EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
-
- ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
- dest = (struct bch_extent_ptr *)((void *) &k->v + bkey_val_bytes(&k->k));
- *dest = ptr;
- k->k.u64s++;
- break;
- default:
- BUG();
- }
-}
-
-void bch2_extent_ptr_decoded_append(struct bkey_i *,
- struct extent_ptr_decoded *);
-void bch2_bkey_drop_ptr_noerror(struct bkey_s, struct bch_extent_ptr *);
-void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *);
-
-void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
-void bch2_bkey_drop_device(struct bkey_s, unsigned);
-
-#define bch2_bkey_drop_ptrs_noerror(_k, _ptr, _cond) \
-do { \
- __label__ _again; \
- struct bkey_ptrs _ptrs; \
-_again: \
- _ptrs = bch2_bkey_ptrs(_k); \
- \
- bkey_for_each_ptr(_ptrs, _ptr) \
- if (_cond) { \
- bch2_bkey_drop_ptr_noerror(_k, _ptr); \
- goto _again; \
- } \
-} while (0)
-
-#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \
-do { \
- __label__ _again; \
- struct bkey_ptrs _ptrs; \
-_again: \
- _ptrs = bch2_bkey_ptrs(_k); \
- \
- bkey_for_each_ptr(_ptrs, _ptr) \
- if (_cond) { \
- bch2_bkey_drop_ptr(_k, _ptr); \
- goto _again; \
- } \
-} while (0)
-
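Note the restart label in the macros above: dropping a pointer shifts the
rest of the value, invalidating the iterator, hence the rescan after every
drop. An illustrative call, dropping all cached pointers from a mutable key:

	bch2_bkey_drop_ptrs(k, ptr, ptr->cached);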
-bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
- struct bch_extent_ptr, u64);
-bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
-struct bch_extent_ptr *
-bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
-
-void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_io_opts *,
- struct bkey_s, struct bch_extent_ptr *);
-
-bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_io_opts *, struct bkey_s);
-bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
-
-void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
-void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
-int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-
-static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1,
- struct bch_extent_ptr ptr2)
-{
- return (ptr1.cached == ptr2.cached &&
- ptr1.unwritten == ptr2.unwritten &&
- ptr1.offset == ptr2.offset &&
- ptr1.dev == ptr2.dev &&
- ptr1.gen == ptr2.gen);
-}
-
-void bch2_ptr_swab(struct bkey_s);
-
-/* Generic extent code: */
-
-enum bch_extent_overlap {
- BCH_EXTENT_OVERLAP_ALL = 0,
- BCH_EXTENT_OVERLAP_BACK = 1,
- BCH_EXTENT_OVERLAP_FRONT = 2,
- BCH_EXTENT_OVERLAP_MIDDLE = 3,
-};
-
-/* Returns how k overlaps with m */
-static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
- const struct bkey *m)
-{
- int cmp1 = bkey_lt(k->p, m->p);
- int cmp2 = bkey_gt(bkey_start_pos(k), bkey_start_pos(m));
-
- return (cmp1 << 1) + cmp2;
-}
-
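Worked example of the two-bit encoding above: cmp1 is set when k ends before
m does, cmp2 when k starts after m does, so:

	cmp1 cmp2  (cmp1 << 1) + cmp2
	 0    0    BCH_EXTENT_OVERLAP_ALL    - k covers all of m
	 0    1    BCH_EXTENT_OVERLAP_BACK   - k covers the back of m
	 1    0    BCH_EXTENT_OVERLAP_FRONT  - k covers the front of m
	 1    1    BCH_EXTENT_OVERLAP_MIDDLE - k covers the middle of m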
-int bch2_cut_front_s(struct bpos, struct bkey_s);
-int bch2_cut_back_s(struct bpos, struct bkey_s);
-
-static inline void bch2_cut_front(struct bpos where, struct bkey_i *k)
-{
- bch2_cut_front_s(where, bkey_i_to_s(k));
-}
-
-static inline void bch2_cut_back(struct bpos where, struct bkey_i *k)
-{
- bch2_cut_back_s(where, bkey_i_to_s(k));
-}
-
-/**
- * bch2_key_resize - adjust size of @k
- *
- * bkey_start_offset(k) is preserved; only where the extent ends is modified
- */
-static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
-{
- k->p.offset -= k->size;
- k->p.offset += new_size;
- k->size = new_size;
-}
-
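For illustration: an extent spanning [32, 40) (start offset 32, size 8)
passed to bch2_key_resize(k, 4) ends up spanning [32, 36) - p.offset drops
from 40 to 36 and size becomes 4, leaving bkey_start_offset(k) untouched.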
-static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c ptrs)
-{
- if (ptrs.start != ptrs.end &&
- extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags)
- return ptrs.start->flags.flags;
- return 0;
-}
-
-static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k)
-{
- return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k));
-}
-
-int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64);
-
-#endif /* _BCACHEFS_EXTENTS_H */
diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h
deleted file mode 100644
index 74c0252cbd98..000000000000
--- a/fs/bcachefs/extents_format.h
+++ /dev/null
@@ -1,304 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EXTENTS_FORMAT_H
-#define _BCACHEFS_EXTENTS_FORMAT_H
-
-/*
- * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
- * preceded by checksum/compression information (bch_extent_crc32 or
- * bch_extent_crc64).
- *
- * One major determining factor in the format of extents is how we handle and
- * represent extents that have been partially overwritten and thus trimmed:
- *
- * If an extent is not checksummed or compressed, when the extent is trimmed we
- * don't have to remember the extent we originally allocated and wrote: we can
- * merely adjust ptr->offset to point to the start of the data that is currently
- * live. The size field in struct bkey records the current (live) size of the
- * extent, and is also used to mean "size of region on disk that we point to" in
- * this case.
- *
- * Thus an extent that is not checksummed or compressed will consist only of a
- * list of bch_extent_ptrs, with none of the fields in
- * bch_extent_crc32/bch_extent_crc64.
- *
- * When an extent is checksummed or compressed, it's not possible to read only
- * the data that is currently live: we have to read the entire extent that was
- * originally written, and then return only the part of the extent that is
- * currently live.
- *
- * Thus, in addition to the current size of the extent in struct bkey, we need
- * to store the size of the originally allocated space - this is the
- * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
- * when the extent is trimmed, instead of modifying the offset field of the
- * pointer, we keep a second smaller offset field - "offset into the original
- * extent of the currently live region".
- *
- * The other major determining factor is replication and data migration:
- *
- * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
- * write, we will initially write all the replicas in the same format, with the
- * same checksum type and compression format - however, when copygc runs later (or
- * tiering/cache promotion, anything that moves data), it is not in general
- * going to rewrite all the pointers at once - one of the replicas may be in a
- * bucket on one device that has very little fragmentation while another lives
- * in a bucket that has become heavily fragmented, and thus is being rewritten
- * sooner than the rest.
- *
- * Thus it will only move a subset of the pointers (or in the case of
- * tiering/cache promotion perhaps add a single pointer without dropping any
- * current pointers), and if the extent has been partially overwritten it must
- * write only the currently live portion (or copygc would not be able to reduce
- * fragmentation!) - which necessitates a different bch_extent_crc format for
- * the new pointer.
- *
- * But in the interests of space efficiency, we don't want to store one
- * bch_extent_crc for each pointer if we don't have to.
- *
- * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
- * bch_extent_ptrs appended arbitrarily one after the other. We determine the
- * type of a given entry with a scheme similar to utf8 (except we're encoding a
- * type, not a size), encoding the type in the position of the first set bit:
- *
- * bch_extent_crc32 - 0b1
- * bch_extent_ptr - 0b10
- * bch_extent_crc64 - 0b100
- *
- * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
- * bch_extent_crc64 is the least constrained).
- *
- * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
- * until the next bch_extent_crc32/64.
- *
- * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
- * is neither checksummed nor compressed.
- */
-
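A decoding sketch of the first-set-bit scheme described above (illustrative
helper, assuming the union bch_extent_entry declared further down in this
file): the entry type is simply the index of the lowest set bit of the type
word.

static inline unsigned example_extent_entry_type(const union bch_extent_entry *e)
{
	/* position of the first set bit encodes the entry type */
	return __ffs(e->type);
}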
-#define BCH_EXTENT_ENTRY_TYPES() \
- x(ptr, 0) \
- x(crc32, 1) \
- x(crc64, 2) \
- x(crc128, 3) \
- x(stripe_ptr, 4) \
- x(rebalance, 5) \
- x(flags, 6)
-#define BCH_EXTENT_ENTRY_MAX 7
-
-enum bch_extent_entry_type {
-#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
- BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
-
-/* Compressed/uncompressed sizes are stored biased by 1: */
-struct bch_extent_crc32 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u32 type:2,
- _compressed_size:7,
- _uncompressed_size:7,
- offset:7,
- _unused:1,
- csum_type:4,
- compression_type:4;
- __u32 csum;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u32 csum;
- __u32 compression_type:4,
- csum_type:4,
- _unused:1,
- offset:7,
- _uncompressed_size:7,
- _compressed_size:7,
- type:2;
-#endif
-} __packed __aligned(8);
-
-#define CRC32_SIZE_MAX (1U << 7)
-#define CRC32_NONCE_MAX 0
-
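Unpacking the biased sizes (illustrative): a 7-bit _compressed_size of 0
means 1 sector and 127 means 128 sectors (CRC32_SIZE_MAX), so a decoder
recovers the real values with a +1:

	compressed_size   = crc._compressed_size   + 1;
	uncompressed_size = crc._uncompressed_size + 1;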
-struct bch_extent_crc64 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:3,
- _compressed_size:9,
- _uncompressed_size:9,
- offset:9,
- nonce:10,
- csum_type:4,
- compression_type:4,
- csum_hi:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 csum_hi:16,
- compression_type:4,
- csum_type:4,
- nonce:10,
- offset:9,
- _uncompressed_size:9,
- _compressed_size:9,
- type:3;
-#endif
- __u64 csum_lo;
-} __packed __aligned(8);
-
-#define CRC64_SIZE_MAX (1U << 9)
-#define CRC64_NONCE_MAX ((1U << 10) - 1)
-
-struct bch_extent_crc128 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:4,
- _compressed_size:13,
- _uncompressed_size:13,
- offset:13,
- nonce:13,
- csum_type:4,
- compression_type:4;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 compression_type:4,
- csum_type:4,
- nonce:13,
- offset:13,
- _uncompressed_size:13,
- _compressed_size:13,
- type:4;
-#endif
- struct bch_csum csum;
-} __packed __aligned(8);
-
-#define CRC128_SIZE_MAX (1U << 13)
-#define CRC128_NONCE_MAX ((1U << 13) - 1)
-
-/*
- * @unwritten - pointer hasn't been written to, just reserved
- */
-struct bch_extent_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:1,
- cached:1,
- unused:1,
- unwritten:1,
- offset:44, /* 8 petabytes */
- dev:8,
- gen:8;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 gen:8,
- dev:8,
- offset:44,
- unwritten:1,
- unused:1,
- cached:1,
- type:1;
-#endif
-} __packed __aligned(8);
-
-struct bch_extent_stripe_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:5,
- block:8,
- redundancy:4,
- idx:47;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 idx:47,
- redundancy:4,
- block:8,
- type:5;
-#endif
-};
-
-#define BCH_EXTENT_FLAGS() \
- x(poisoned, 0)
-
-enum bch_extent_flags_e {
-#define x(n, v) BCH_EXTENT_FLAG_##n = v,
- BCH_EXTENT_FLAGS()
-#undef x
-};
-
-struct bch_extent_flags {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:7,
- flags:57;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 flags:57,
- type:7;
-#endif
-};
-
-/* bch_extent_rebalance: */
-#include "rebalance_format.h"
-
-union bch_extent_entry {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
- unsigned long type;
-#elif __BITS_PER_LONG == 32
- struct {
- unsigned long pad;
- unsigned long type;
- };
-#else
-#error edit for your odd byteorder.
-#endif
-
-#define x(f, n) struct bch_extent_##f f;
- BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
-
-struct bch_btree_ptr {
- struct bch_val v;
-
- __u64 _data[0];
- struct bch_extent_ptr start[];
-} __packed __aligned(8);
-
-struct bch_btree_ptr_v2 {
- struct bch_val v;
-
- __u64 mem_ptr;
- __le64 seq;
- __le16 sectors_written;
- __le16 flags;
- struct bpos min_key;
- __u64 _data[0];
- struct bch_extent_ptr start[];
-} __packed __aligned(8);
-
-LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
-
-struct bch_extent {
- struct bch_val v;
-
- __u64 _data[0];
- union bch_extent_entry start[];
-} __packed __aligned(8);
-
-/* Maximum size (in u64s) a single pointer could be: */
-#define BKEY_EXTENT_PTR_U64s_MAX\
- ((sizeof(struct bch_extent_crc128) + \
- sizeof(struct bch_extent_ptr)) / sizeof(__u64))
-
-/* Maximum possible size of an entire extent value: */
-#define BKEY_EXTENT_VAL_U64s_MAX \
- (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
-
-/* Maximum possible size of an entire extent, key + value: */
-#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
-
-/* Btree pointers don't carry around checksums: */
-#define BKEY_BTREE_PTR_VAL_U64s_MAX \
- ((sizeof(struct bch_btree_ptr_v2) + \
- sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
-#define BKEY_BTREE_PTR_U64s_MAX \
- (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
-
-struct bch_reservation {
- struct bch_val v;
-
- __le32 generation;
- __u8 nr_replicas;
- __u8 pad[3];
-} __packed __aligned(8);
-
-struct bch_inline_data {
- struct bch_val v;
- u8 data[];
-};
-
-#endif /* _BCACHEFS_EXTENTS_FORMAT_H */
diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h
deleted file mode 100644
index e51529dca4c2..000000000000
--- a/fs/bcachefs/extents_types.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EXTENTS_TYPES_H
-#define _BCACHEFS_EXTENTS_TYPES_H
-
-#include "bcachefs_format.h"
-
-struct bch_extent_crc_unpacked {
- u32 compressed_size;
- u32 uncompressed_size;
- u32 live_size;
-
- u8 csum_type;
- u8 compression_type;
-
- u16 offset;
-
- u16 nonce;
-
- struct bch_csum csum;
-};
-
-struct extent_ptr_decoded {
- bool has_ec;
- bool do_ec_reconstruct;
- u8 crc_retry_nr;
- struct bch_extent_crc_unpacked crc;
- struct bch_extent_ptr ptr;
- struct bch_extent_stripe_ptr ec;
-};
-
-struct bch_io_failures {
- u8 nr;
- struct bch_dev_io_failures {
- u8 dev;
- unsigned failed_csum_nr:6,
- failed_io:1,
- failed_ec:1;
- } devs[BCH_REPLICAS_MAX + 1];
-};
-
-#endif /* _BCACHEFS_EXTENTS_TYPES_H */
diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c
deleted file mode 100644
index 0e742555cb0a..000000000000
--- a/fs/bcachefs/eytzinger.c
+++ /dev/null
@@ -1,315 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "eytzinger.h"
-
-/**
- * is_aligned - is this pointer & size okay for word-wide copying?
- * @base: pointer to data
- * @size: size of each element
- * @align: required alignment (typically 4 or 8)
- *
- * Returns true if elements can be copied using word loads and stores.
- * The size must be a multiple of the alignment, and the base address must
- * be aligned as well if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
- *
- * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
- * to "if ((a | b) & mask)", so we do that by hand.
- */
-__attribute_const__ __always_inline
-static bool is_aligned(const void *base, size_t size, unsigned char align)
-{
- unsigned char lsbits = (unsigned char)size;
-
- (void)base;
-#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
- lsbits |= (unsigned char)(uintptr_t)base;
-#endif
- return (lsbits & (align - 1)) == 0;
-}
-
-/**
- * swap_words_32 - swap two elements in 32-bit chunks
- * @a: pointer to the first element to swap
- * @b: pointer to the second element to swap
- * @n: element size (must be a multiple of 4)
- *
- * Exchange the two objects in memory. This exploits base+index addressing,
- * which basically all CPUs have, to minimize loop overhead computations.
- *
- * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
- * bottom of the loop, even though the zero flag is still valid from the
- * subtract (since the intervening mov instructions don't alter the flags).
- * Gcc 8.1.0 doesn't have that problem.
- */
-static void swap_words_32(void *a, void *b, size_t n)
-{
- do {
- u32 t = *(u32 *)(a + (n -= 4));
- *(u32 *)(a + n) = *(u32 *)(b + n);
- *(u32 *)(b + n) = t;
- } while (n);
-}
-
-/**
- * swap_words_64 - swap two elements in 64-bit chunks
- * @a: pointer to the first element to swap
- * @b: pointer to the second element to swap
- * @n: element size (must be a multiple of 8)
- *
- * Exchange the two objects in memory. This exploits base+index
- * addressing, which basically all CPUs have, to minimize loop overhead
- * computations.
- *
- * We'd like to use 64-bit loads if possible. If they're not, emulating
- * one requires base+index+4 addressing which x86 has but most other
- * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads,
- * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
- * x32 ABI). Are there any cases the kernel needs to worry about?
- */
-static void swap_words_64(void *a, void *b, size_t n)
-{
- do {
-#ifdef CONFIG_64BIT
- u64 t = *(u64 *)(a + (n -= 8));
- *(u64 *)(a + n) = *(u64 *)(b + n);
- *(u64 *)(b + n) = t;
-#else
- /* Use two 32-bit transfers to avoid base+index+4 addressing */
- u32 t = *(u32 *)(a + (n -= 4));
- *(u32 *)(a + n) = *(u32 *)(b + n);
- *(u32 *)(b + n) = t;
-
- t = *(u32 *)(a + (n -= 4));
- *(u32 *)(a + n) = *(u32 *)(b + n);
- *(u32 *)(b + n) = t;
-#endif
- } while (n);
-}
-
-/**
- * swap_bytes - swap two elements a byte at a time
- * @a: pointer to the first element to swap
- * @b: pointer to the second element to swap
- * @n: element size
- *
- * This is the fallback if alignment doesn't allow using larger chunks.
- */
-static void swap_bytes(void *a, void *b, size_t n)
-{
- do {
- char t = ((char *)a)[--n];
- ((char *)a)[n] = ((char *)b)[n];
- ((char *)b)[n] = t;
- } while (n);
-}
-
-/*
- * The values are arbitrary as long as they can't be confused with
- * a pointer, but small integers make for the smallest compare
- * instructions.
- */
-#define SWAP_WORDS_64 (swap_r_func_t)0
-#define SWAP_WORDS_32 (swap_r_func_t)1
-#define SWAP_BYTES (swap_r_func_t)2
-#define SWAP_WRAPPER (swap_r_func_t)3
-
-struct wrapper {
- cmp_func_t cmp;
- swap_func_t swap_func;
-};
-
-/*
- * The function pointer is last to make tail calls most efficient if the
- * compiler decides not to inline this function.
- */
-static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
-{
- if (swap_func == SWAP_WRAPPER) {
- ((const struct wrapper *)priv)->swap_func(a, b, (int)size);
- return;
- }
-
- if (swap_func == SWAP_WORDS_64)
- swap_words_64(a, b, size);
- else if (swap_func == SWAP_WORDS_32)
- swap_words_32(a, b, size);
- else if (swap_func == SWAP_BYTES)
- swap_bytes(a, b, size);
- else
- swap_func(a, b, (int)size, priv);
-}
-
-#define _CMP_WRAPPER ((cmp_r_func_t)0L)
-
-static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
-{
- if (cmp == _CMP_WRAPPER)
- return ((const struct wrapper *)priv)->cmp(a, b);
- return cmp(a, b, priv);
-}
-
-static inline int eytzinger1_do_cmp(void *base1, size_t n, size_t size,
- cmp_r_func_t cmp_func, const void *priv,
- size_t l, size_t r)
-{
- return do_cmp(base1 + inorder_to_eytzinger1(l, n) * size,
- base1 + inorder_to_eytzinger1(r, n) * size,
- cmp_func, priv);
-}
-
-static inline void eytzinger1_do_swap(void *base1, size_t n, size_t size,
- swap_r_func_t swap_func, const void *priv,
- size_t l, size_t r)
-{
- do_swap(base1 + inorder_to_eytzinger1(l, n) * size,
- base1 + inorder_to_eytzinger1(r, n) * size,
- size, swap_func, priv);
-}
-
-static void eytzinger1_sort_r(void *base1, size_t n, size_t size,
- cmp_r_func_t cmp_func,
- swap_r_func_t swap_func,
- const void *priv)
-{
- unsigned i, j, k;
-
- /* called from 'sort' without swap function, let's pick the default */
- if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func)
- swap_func = NULL;
-
- if (!swap_func) {
- if (is_aligned(base1, size, 8))
- swap_func = SWAP_WORDS_64;
- else if (is_aligned(base1, size, 4))
- swap_func = SWAP_WORDS_32;
- else
- swap_func = SWAP_BYTES;
- }
-
- /* heapify */
- for (i = n / 2; i >= 1; --i) {
- /* Find the sift-down path all the way to the leaves. */
- for (j = i; k = j * 2, k < n;)
- j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
-
- /* Special case for the last leaf with no sibling. */
- if (j * 2 == n)
- j *= 2;
-
- /* Backtrack to the correct location. */
- while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i, j) >= 0)
- j /= 2;
-
- /* Shift the element into its correct place. */
- for (k = j; j != i;) {
- j /= 2;
- eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k);
- }
- }
-
- /* sort */
- for (i = n; i > 1; --i) {
- eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i);
-
- /* Find the sift-down path all the way to the leaves. */
- for (j = 1; k = j * 2, k + 1 < i;)
- j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
-
- /* Special case for the last leaf with no sibling. */
- if (j * 2 + 1 == i)
- j *= 2;
-
- /* Backtrack to the correct location. */
- while (j >= 1 && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j) >= 0)
- j /= 2;
-
- /* Shift the element into its correct place. */
- for (k = j; j > 1;) {
- j /= 2;
- eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k);
- }
- }
-}
-
-void eytzinger0_sort_r(void *base, size_t n, size_t size,
- cmp_r_func_t cmp_func,
- swap_r_func_t swap_func,
- const void *priv)
-{
- void *base1 = base - size;
-
- return eytzinger1_sort_r(base1, n, size, cmp_func, swap_func, priv);
-}
-
-void eytzinger0_sort(void *base, size_t n, size_t size,
- cmp_func_t cmp_func,
- swap_func_t swap_func)
-{
- struct wrapper w = {
- .cmp = cmp_func,
- .swap_func = swap_func,
- };
-
- return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
-}
-
-#if 0
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/ktime.h>
-
-static u64 cmp_count;
-
-static int mycmp(const void *a, const void *b)
-{
- u32 _a = *(u32 *)a;
- u32 _b = *(u32 *)b;
-
- cmp_count++;
- if (_a < _b)
- return -1;
- else if (_a > _b)
- return 1;
- else
- return 0;
-}
-
-static int test(void)
-{
- size_t N, i;
- ktime_t start, end;
- s64 delta;
- u32 *arr;
-
- for (N = 10000; N <= 100000; N += 10000) {
- arr = kmalloc_array(N, sizeof(u32), GFP_KERNEL);
- cmp_count = 0;
-
- for (i = 0; i < N; i++)
- arr[i] = get_random_u32();
-
- start = ktime_get();
- eytzinger0_sort(arr, N, sizeof(u32), mycmp, NULL);
- end = ktime_get();
-
- delta = ktime_us_delta(end, start);
- printk(KERN_INFO "time: %lld\n", delta);
- printk(KERN_INFO "comparisons: %lld\n", cmp_count);
-
- u32 prev = 0;
-
- eytzinger0_for_each(i, N) {
- if (prev > arr[i])
- goto err;
- prev = arr[i];
- }
-
- kfree(arr);
- }
- return 0;
-
-err:
- kfree(arr);
- return -1;
-}
-#endif
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
deleted file mode 100644
index 643c1f716061..000000000000
--- a/fs/bcachefs/eytzinger.h
+++ /dev/null
@@ -1,300 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _EYTZINGER_H
-#define _EYTZINGER_H
-
-#include <linux/bitops.h>
-#include <linux/log2.h>
-
-#ifdef EYTZINGER_DEBUG
-#include <linux/bug.h>
-#define EYTZINGER_BUG_ON(cond) BUG_ON(cond)
-#else
-#define EYTZINGER_BUG_ON(cond)
-#endif
-
-/*
- * Traversal for trees in eytzinger layout - a full binary tree laid out in an
- * array.
- *
- * Consider using an eytzinger tree any time you would otherwise be doing binary
- * search over an array. Binary search is a worst case scenario for branch
- * prediction and prefetching, but in an eytzinger tree every node's children
- * are adjacent in memory, thus we can prefetch children before knowing the
- * result of the comparison, assuming multiple nodes fit on a cacheline.
- *
- * Two variants are provided, for one based indexing and zero based indexing.
- *
- * Zero based indexing is more convenient, but one based indexing has better
- * alignment and thus better performance because each new level of the tree
- * starts at a power of two, and thus if element 0 was cacheline aligned, each
- * new level will be as well.
- */
-
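A search sketch over a one-based eytzinger array of u32s (illustrative, not
part of the original file; assumes linux/prefetch.h for prefetch()). Both
children of node i live adjacently at 2i and 2i + 1, so one prefetch covers
them before the comparison resolves; the final shift strips the descent
steps taken past the bottom of the tree, as in the find helpers further
down in this file.

static inline unsigned example_eytzinger1_find_ge(const u32 *base,
						  unsigned size, u32 search)
{
	unsigned i = 1;

	while (i <= size) {
		prefetch(&base[2 * i]);	/* both children, one cacheline fetch */
		i = (i << 1) | (base[i] < search);
	}
	i >>= __ffs(i + 1) + 1;
	return i;	/* one-based index of smallest element >= search, 0 if none */
}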
-static inline unsigned eytzinger1_child(unsigned i, unsigned child)
-{
- EYTZINGER_BUG_ON(child > 1);
-
- return (i << 1) + child;
-}
-
-static inline unsigned eytzinger1_left_child(unsigned i)
-{
- return eytzinger1_child(i, 0);
-}
-
-static inline unsigned eytzinger1_right_child(unsigned i)
-{
- return eytzinger1_child(i, 1);
-}
-
-static inline unsigned eytzinger1_first(unsigned size)
-{
- return size ? rounddown_pow_of_two(size) : 0;
-}
-
-static inline unsigned eytzinger1_last(unsigned size)
-{
- return rounddown_pow_of_two(size + 1) - 1;
-}
-
-static inline unsigned eytzinger1_next(unsigned i, unsigned size)
-{
- EYTZINGER_BUG_ON(i == 0 || i > size);
-
- if (eytzinger1_right_child(i) <= size) {
- i = eytzinger1_right_child(i);
-
- i <<= __fls(size) - __fls(i);
- i >>= i > size;
- } else {
- i >>= ffz(i) + 1;
- }
-
- return i;
-}
-
-static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
-{
- EYTZINGER_BUG_ON(i == 0 || i > size);
-
- if (eytzinger1_left_child(i) <= size) {
- i = eytzinger1_left_child(i) + 1;
-
- i <<= __fls(size) - __fls(i);
- i -= 1;
- i >>= i > size;
- } else {
- i >>= __ffs(i) + 1;
- }
-
- return i;
-}
-
-static inline unsigned eytzinger1_extra(unsigned size)
-{
- return size
- ? (size + 1 - rounddown_pow_of_two(size)) << 1
- : 0;
-}
-
-static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
- unsigned extra)
-{
- unsigned b = __fls(i);
- unsigned shift = __fls(size) - b;
- int s;
-
- EYTZINGER_BUG_ON(!i || i > size);
-
- i ^= 1U << b;
- i <<= 1;
- i |= 1;
- i <<= shift;
-
- /*
- * sign bit trick:
- *
- * if (i > extra)
- * i -= (i - extra) >> 1;
- */
- s = extra - i;
- i += (s >> 1) & (s >> 31);
-
- return i;
-}
-
-static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
- unsigned extra)
-{
- unsigned shift;
- int s;
-
- EYTZINGER_BUG_ON(!i || i > size);
-
- /*
- * sign bit trick:
- *
- * if (i > extra)
- * i += i - extra;
- */
- s = extra - i;
- i -= s & (s >> 31);
-
- shift = __ffs(i);
-
- i >>= shift + 1;
- i |= 1U << (__fls(size) - shift);
-
- return i;
-}
-
-static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size)
-{
- return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size));
-}
-
-static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
-{
- return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size));
-}
-
-#define eytzinger1_for_each(_i, _size) \
- for (unsigned (_i) = eytzinger1_first((_size)); \
- (_i) != 0; \
- (_i) = eytzinger1_next((_i), (_size)))
-
-/* Zero based indexing version: */
-
-static inline unsigned eytzinger0_child(unsigned i, unsigned child)
-{
- EYTZINGER_BUG_ON(child > 1);
-
- return (i << 1) + 1 + child;
-}
-
-static inline unsigned eytzinger0_left_child(unsigned i)
-{
- return eytzinger0_child(i, 0);
-}
-
-static inline unsigned eytzinger0_right_child(unsigned i)
-{
- return eytzinger0_child(i, 1);
-}
-
-static inline unsigned eytzinger0_first(unsigned size)
-{
- return eytzinger1_first(size) - 1;
-}
-
-static inline unsigned eytzinger0_last(unsigned size)
-{
- return eytzinger1_last(size) - 1;
-}
-
-static inline unsigned eytzinger0_next(unsigned i, unsigned size)
-{
- return eytzinger1_next(i + 1, size) - 1;
-}
-
-static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
-{
- return eytzinger1_prev(i + 1, size) - 1;
-}
-
-static inline unsigned eytzinger0_extra(unsigned size)
-{
- return eytzinger1_extra(size);
-}
-
-static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
- unsigned extra)
-{
- return __eytzinger1_to_inorder(i + 1, size, extra) - 1;
-}
-
-static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size,
- unsigned extra)
-{
- return __inorder_to_eytzinger1(i + 1, size, extra) - 1;
-}
-
-static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size)
-{
- return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size));
-}
-
-static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
-{
- return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size));
-}
-
-#define eytzinger0_for_each(_i, _size) \
- for (unsigned (_i) = eytzinger0_first((_size)); \
- (_i) != -1; \
- (_i) = eytzinger0_next((_i), (_size)))
-
-#define eytzinger0_for_each_prev(_i, _size) \
- for (unsigned (_i) = eytzinger0_last((_size)); \
- (_i) != -1; \
- (_i) = eytzinger0_prev((_i), (_size)))
-
-/* return greatest node <= @search, or -1 if not found */
-static inline int eytzinger0_find_le(void *base, size_t nr, size_t size,
- cmp_func_t cmp, const void *search)
-{
- void *base1 = base - size;
- unsigned n = 1;
-
- while (n <= nr)
- n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0);
- n >>= __ffs(n) + 1;
- return n - 1;
-}
-
-/* return smallest node > @search, or -1 if not found */
-static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size,
- cmp_func_t cmp, const void *search)
-{
- void *base1 = base - size;
- unsigned n = 1;
-
- while (n <= nr)
- n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0);
- n >>= __ffs(n + 1) + 1;
- return n - 1;
-}
-
-/* return smallest node >= @search, or -1 if not found */
-static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size,
- cmp_func_t cmp, const void *search)
-{
- void *base1 = base - size;
- unsigned n = 1;
-
- while (n <= nr)
- n = eytzinger1_child(n, cmp(base1 + n * size, search) < 0);
- n >>= __ffs(n + 1) + 1;
- return n - 1;
-}
-
-#define eytzinger0_find(base, nr, size, _cmp, search) \
-({ \
- size_t _size = (size); \
- void *_base1 = (void *)(base) - _size; \
- const void *_search = (search); \
- size_t _nr = (nr); \
- size_t _i = 1; \
- int _res; \
- \
- while (_i <= _nr && \
- (_res = _cmp(_search, _base1 + _i * _size))) \
- _i = eytzinger1_child(_i, _res > 0); \
- _i - 1; \
-})
-
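A usage sketch (hypothetical comparison callback): looking up an exact match
in an array previously sorted with eytzinger0_sort(); the macro evaluates to
an index >= nr when nothing compares equal.

static int example_u32_cmp(const void *a, const void *b)
{
	u32 x = *(const u32 *)a, y = *(const u32 *)b;

	return x < y ? -1 : x > y;
}

Called as, for instance:

	size_t idx = eytzinger0_find(arr, nr, sizeof(arr[0]),
				     example_u32_cmp, &needle);
	if (idx < nr)
		/* needle found at arr[idx] */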
-void eytzinger0_sort_r(void *, size_t, size_t,
- cmp_r_func_t, swap_r_func_t, const void *);
-void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t);
-
-#endif /* _EYTZINGER_H */
diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h
deleted file mode 100644
index d8153fe27037..000000000000
--- a/fs/bcachefs/fifo.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FIFO_H
-#define _BCACHEFS_FIFO_H
-
-#include "util.h"
-
-#define FIFO(type) \
-struct { \
- size_t front, back, size, mask; \
- type *data; \
-}
-
-#define DECLARE_FIFO(type, name) FIFO(type) name
-
-#define fifo_buf_size(fifo) \
- ((fifo)->size \
- ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \
- : 0)
-
-#define init_fifo(fifo, _size, _gfp) \
-({ \
- (fifo)->front = (fifo)->back = 0; \
- (fifo)->size = (_size); \
- (fifo)->mask = (fifo)->size \
- ? roundup_pow_of_two((fifo)->size) - 1 \
- : 0; \
- (fifo)->data = kvmalloc(fifo_buf_size(fifo), (_gfp)); \
-})
-
-#define free_fifo(fifo) \
-do { \
- kvfree((fifo)->data); \
- (fifo)->data = NULL; \
-} while (0)
-
-#define fifo_swap(l, r) \
-do { \
- swap((l)->front, (r)->front); \
- swap((l)->back, (r)->back); \
- swap((l)->size, (r)->size); \
- swap((l)->mask, (r)->mask); \
- swap((l)->data, (r)->data); \
-} while (0)
-
-#define fifo_move(dest, src) \
-do { \
- typeof(*((dest)->data)) _t; \
- while (!fifo_full(dest) && \
- fifo_pop(src, _t)) \
- fifo_push(dest, _t); \
-} while (0)
-
-#define fifo_used(fifo) (((fifo)->back - (fifo)->front))
-#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo))
-
-#define fifo_empty(fifo) ((fifo)->front == (fifo)->back)
-#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size)
-
-#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask])
-#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
-
-#define fifo_entry_idx_abs(fifo, p) \
- ((((p) >= &fifo_peek_front(fifo) \
- ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) + \
- (((p) - (fifo)->data)))
-
-#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask)
-#define fifo_idx_entry(fifo, i) ((fifo)->data[((fifo)->front + (i)) & (fifo)->mask])
-
-#define fifo_push_back_ref(f) \
- (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask])
-
-#define fifo_push_front_ref(f) \
- (fifo_full((f)) ? NULL : &(f)->data[--(f)->front & (f)->mask])
-
-#define fifo_push_back(fifo, new) \
-({ \
- typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \
- if (_r) \
- *_r = (new); \
- _r != NULL; \
-})
-
-#define fifo_push_front(fifo, new) \
-({ \
- typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \
- if (_r) \
- *_r = (new); \
- _r != NULL; \
-})
-
-#define fifo_pop_front(fifo, i) \
-({ \
- bool _r = !fifo_empty((fifo)); \
- if (_r) \
- (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \
- _r; \
-})
-
-#define fifo_pop_back(fifo, i) \
-({ \
- bool _r = !fifo_empty((fifo)); \
- if (_r) \
- (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \
- _r; \
-})
-
-#define fifo_push_ref(fifo) fifo_push_back_ref(fifo)
-#define fifo_push(fifo, i) fifo_push_back(fifo, (i))
-#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i))
-#define fifo_peek(fifo) fifo_peek_front(fifo)
-
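A usage sketch of the fifo helpers (illustrative, hypothetical values):
init_fifo() evaluates to the allocated buffer, so a NULL check doubles as
the error check; the buffer is rounded up to a power of two internally.

	FIFO(u64) fifo;
	u64 v;

	if (!init_fifo(&fifo, 8, GFP_KERNEL))
		return -ENOMEM;

	fifo_push(&fifo, 1);
	fifo_push(&fifo, 2);
	while (fifo_pop(&fifo, v))
		pr_info("popped %llu\n", v);

	free_fifo(&fifo);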
-#define fifo_for_each_entry(_entry, _fifo, _iter) \
- for (typecheck(typeof((_fifo)->front), _iter), \
- (_iter) = (_fifo)->front; \
- ((_iter != (_fifo)->back) && \
- (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \
- (_iter)++)
-
-#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \
- for (typecheck(typeof((_fifo)->front), _iter), \
- (_iter) = (_fifo)->front; \
- ((_iter != (_fifo)->back) && \
- (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \
- (_iter)++)
-
-#endif /* _BCACHEFS_FIFO_H */
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
deleted file mode 100644
index 5ab1c73c8d4c..000000000000
--- a/fs/bcachefs/fs-io-buffered.c
+++ /dev/null
@@ -1,1102 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "fs-io.h"
-#include "fs-io-buffered.h"
-#include "fs-io-direct.h"
-#include "fs-io-pagecache.h"
-#include "io_read.h"
-#include "io_write.h"
-
-#include <linux/backing-dev.h>
-#include <linux/pagemap.h>
-#include <linux/writeback.h>
-
-static inline bool bio_full(struct bio *bio, unsigned len)
-{
- if (bio->bi_vcnt >= bio->bi_max_vecs)
- return true;
- if (bio->bi_iter.bi_size > UINT_MAX - len)
- return true;
- return false;
-}
-
-/* readpage(s): */
-
-static void bch2_readpages_end_io(struct bio *bio)
-{
- struct folio_iter fi;
-
- bio_for_each_folio_all(fi, bio)
- folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK);
-
- bio_put(bio);
-}
-
-struct readpages_iter {
- struct address_space *mapping;
- unsigned idx;
- folios folios;
-};
-
-static int readpages_iter_init(struct readpages_iter *iter,
- struct readahead_control *ractl)
-{
- struct folio *folio;
-
- *iter = (struct readpages_iter) { ractl->mapping };
-
- while ((folio = __readahead_folio(ractl))) {
- if (!bch2_folio_create(folio, GFP_KERNEL) ||
- darray_push(&iter->folios, folio)) {
- bch2_folio_release(folio);
- ractl->_nr_pages += folio_nr_pages(folio);
- ractl->_index -= folio_nr_pages(folio);
- return iter->folios.nr ? 0 : -ENOMEM;
- }
-
- folio_put(folio);
- }
-
- return 0;
-}
-
-static inline struct folio *readpage_iter_peek(struct readpages_iter *iter)
-{
- if (iter->idx >= iter->folios.nr)
- return NULL;
- return iter->folios.data[iter->idx];
-}
-
-static inline void readpage_iter_advance(struct readpages_iter *iter)
-{
- iter->idx++;
-}
-
-static bool extent_partial_reads_expensive(struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_extent_crc_unpacked crc;
- const union bch_extent_entry *i;
-
- bkey_for_each_crc(k.k, ptrs, crc, i)
- if (crc.csum_type || crc.compression_type)
- return true;
- return false;
-}
-
-static int readpage_bio_extend(struct btree_trans *trans,
- struct readpages_iter *iter,
- struct bio *bio,
- unsigned sectors_this_extent,
- bool get_more)
-{
- /* Don't hold btree locks while allocating memory: */
- bch2_trans_unlock(trans);
-
- while (bio_sectors(bio) < sectors_this_extent &&
- bio->bi_vcnt < bio->bi_max_vecs) {
- struct folio *folio = readpage_iter_peek(iter);
- int ret;
-
- if (folio) {
- readpage_iter_advance(iter);
- } else {
- pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;
-
- if (!get_more)
- break;
-
- unsigned sectors_remaining = sectors_this_extent - bio_sectors(bio);
-
- if (sectors_remaining < PAGE_SECTORS << mapping_min_folio_order(iter->mapping))
- break;
-
- unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS);
-
- /* ensure proper alignment */
- order = min(order, __ffs(folio_offset|BIT(31)));
-
- folio = xa_load(&iter->mapping->i_pages, folio_offset);
- if (folio && !xa_is_value(folio))
- break;
-
- folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), order);
- if (!folio)
- break;
-
- if (!__bch2_folio_create(folio, GFP_KERNEL)) {
- folio_put(folio);
- break;
- }
-
- ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL);
- if (ret) {
- __bch2_folio_release(folio);
- folio_put(folio);
- break;
- }
-
- folio_put(folio);
- }
-
- BUG_ON(folio_sector(folio) != bio_end_sector(bio));
-
- BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0));
- }
-
- return bch2_trans_relock(trans);
-}
-
-static void bchfs_read(struct btree_trans *trans,
- struct bch_read_bio *rbio,
- subvol_inum inum,
- struct readpages_iter *readpages_iter)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_buf sk;
- int flags = BCH_READ_retry_if_stale|
- BCH_READ_may_promote;
- int ret = 0;
-
- rbio->subvol = inum.subvol;
-
- bch2_bkey_buf_init(&sk);
- bch2_trans_begin(trans);
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- POS(inum.inum, rbio->bio.bi_iter.bi_sector),
- BTREE_ITER_slots);
- while (1) {
- struct bkey_s_c k;
- unsigned bytes, sectors;
- s64 offset_into_extent;
- enum btree_id data_btree = BTREE_ID_extents;
-
- bch2_trans_begin(trans);
-
- u32 snapshot;
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- bch2_btree_iter_set_snapshot(&iter, snapshot);
-
- bch2_btree_iter_set_pos(&iter,
- POS(inum.inum, rbio->bio.bi_iter.bi_sector));
-
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- offset_into_extent = iter.pos.offset -
- bkey_start_offset(k.k);
- sectors = k.k->size - offset_into_extent;
-
- bch2_bkey_buf_reassemble(&sk, c, k);
-
- ret = bch2_read_indirect_extent(trans, &data_btree,
- &offset_into_extent, &sk);
- if (ret)
- goto err;
-
- k = bkey_i_to_s_c(sk.k);
-
- sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
-
- if (readpages_iter) {
- ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
- extent_partial_reads_expensive(k));
- if (ret)
- goto err;
- }
-
- bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
- swap(rbio->bio.bi_iter.bi_size, bytes);
-
- if (rbio->bio.bi_iter.bi_size == bytes)
- flags |= BCH_READ_last_fragment;
-
- bch2_bio_page_state_set(&rbio->bio, k);
-
- bch2_read_extent(trans, rbio, iter.pos,
- data_btree, k, offset_into_extent, flags);
-
- if (flags & BCH_READ_last_fragment)
- break;
-
- swap(rbio->bio.bi_iter.bi_size, bytes);
- bio_advance(&rbio->bio, bytes);
-err:
- if (ret &&
- !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret) {
- struct printbuf buf = PRINTBUF;
- lockrestart_do(trans,
- bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9));
- prt_printf(&buf, "read error %i from btree lookup", ret);
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
-
- rbio->bio.bi_status = BLK_STS_IOERR;
- bio_endio(&rbio->bio);
- }
-
- bch2_bkey_buf_exit(&sk, c);
-}
-
-void bch2_readahead(struct readahead_control *ractl)
-{
- struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_io_opts opts;
- struct folio *folio;
- struct readpages_iter readpages_iter;
- struct blk_plug plug;
-
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
- int ret = readpages_iter_init(&readpages_iter, ractl);
- if (ret)
- return;
-
- /*
- * Besides being a general performance optimization, plugging helps with
- * avoiding btree transaction srcu warnings - submitting a bio can
- * block, and we don't want to do that with the transaction locked.
- *
- * However, plugged bios are submitted when we schedule; we ideally
- * would have our own scheduler hook to call unlock_long() before
- * scheduling.
- */
- blk_start_plug(&plug);
- bch2_pagecache_add_get(inode);
-
- struct btree_trans *trans = bch2_trans_get(c);
- while ((folio = readpage_iter_peek(&readpages_iter))) {
- unsigned n = min_t(unsigned,
- readpages_iter.folios.nr -
- readpages_iter.idx,
- BIO_MAX_VECS);
- struct bch_read_bio *rbio =
- rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
- GFP_KERNEL, &c->bio_read),
- c,
- opts,
- bch2_readpages_end_io);
-
- readpage_iter_advance(&readpages_iter);
-
- rbio->bio.bi_iter.bi_sector = folio_sector(folio);
- BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
- bchfs_read(trans, rbio, inode_inum(inode),
- &readpages_iter);
- bch2_trans_unlock(trans);
- }
- bch2_trans_put(trans);
-
- bch2_pagecache_add_put(inode);
- blk_finish_plug(&plug);
- darray_exit(&readpages_iter.folios);
-}
-
-static void bch2_read_single_folio_end_io(struct bio *bio)
-{
- complete(bio->bi_private);
-}
-
-int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
-{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_read_bio *rbio;
- struct bch_io_opts opts;
- struct blk_plug plug;
- int ret;
- DECLARE_COMPLETION_ONSTACK(done);
-
- BUG_ON(folio_test_uptodate(folio));
- BUG_ON(folio_test_dirty(folio));
-
- if (!bch2_folio_create(folio, GFP_KERNEL))
- return -ENOMEM;
-
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
- rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
- c,
- opts,
- bch2_read_single_folio_end_io);
- rbio->bio.bi_private = &done;
- rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
- rbio->bio.bi_iter.bi_sector = folio_sector(folio);
- BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
- blk_start_plug(&plug);
- bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0));
- blk_finish_plug(&plug);
- wait_for_completion(&done);
-
- ret = blk_status_to_errno(rbio->bio.bi_status);
- bio_put(&rbio->bio);
-
- if (ret < 0)
- return ret;
-
- folio_mark_uptodate(folio);
- return 0;
-}
-
-int bch2_read_folio(struct file *file, struct folio *folio)
-{
- int ret;
-
- ret = bch2_read_single_folio(folio, folio->mapping);
- folio_unlock(folio);
- return bch2_err_class(ret);
-}
-
-/* writepages: */
-
-struct bch_writepage_io {
- struct bch_inode_info *inode;
-
- /* must be last: */
- struct bch_write_op op;
-};
-
-struct bch_writepage_state {
- struct bch_writepage_io *io;
- struct bch_io_opts opts;
- struct bch_folio_sector *tmp;
- unsigned tmp_sectors;
-};
-
-static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
- struct bch_inode_info *inode)
-{
- struct bch_writepage_state ret = { 0 };
-
- bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
- return ret;
-}
-
-/*
- * Determine when a writepage io is full. We have to limit writepage bios to a
- * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to
- * what the bounce path in bch2_write_extent() can handle. In theory we could
- * loosen this restriction for non-bounce I/O, but we don't have that context
- * here. Ideally, we can up this limit and make it configurable in the future
- * when the bounce path can be enhanced to accommodate larger source bios.
- */
-static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len)
-{
- struct bio *bio = &io->op.wbio.bio;
- return bio_full(bio, len) ||
- (bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE);
-}
-
-static void bch2_writepage_io_done(struct bch_write_op *op)
-{
- struct bch_writepage_io *io =
- container_of(op, struct bch_writepage_io, op);
- struct bch_fs *c = io->op.c;
- struct bio *bio = &io->op.wbio.bio;
- struct folio_iter fi;
- unsigned i;
-
- if (io->op.error) {
- set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
-
- bio_for_each_folio_all(fi, bio) {
- struct bch_folio *s;
-
- mapping_set_error(fi.folio->mapping, -EIO);
-
- s = __bch2_folio(fi.folio);
- spin_lock(&s->lock);
- for (i = 0; i < folio_sectors(fi.folio); i++)
- s->s[i].nr_replicas = 0;
- spin_unlock(&s->lock);
- }
- }
-
- if (io->op.flags & BCH_WRITE_wrote_data_inline) {
- bio_for_each_folio_all(fi, bio) {
- struct bch_folio *s;
-
- s = __bch2_folio(fi.folio);
- spin_lock(&s->lock);
- for (i = 0; i < folio_sectors(fi.folio); i++)
- s->s[i].nr_replicas = 0;
- spin_unlock(&s->lock);
- }
- }
-
- /*
- * racing with fallocate can cause us to add fewer sectors than
- * expected - but we shouldn't add more sectors than expected:
- */
- WARN_ON_ONCE(io->op.i_sectors_delta > 0);
-
- /*
- * (error (due to going RO) halfway through a page can screw that up
- * slightly)
- * XXX wtf?
- BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
- */
-
- /*
- * The writeback flag is effectively our ref on the inode -
- * fixup i_blocks before calling folio_end_writeback:
- */
- bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
-
- bio_for_each_folio_all(fi, bio) {
- struct bch_folio *s = __bch2_folio(fi.folio);
-
- if (atomic_dec_and_test(&s->write_count))
- folio_end_writeback(fi.folio);
- }
-
- bio_put(&io->op.wbio.bio);
-}
-
-static void bch2_writepage_do_io(struct bch_writepage_state *w)
-{
- struct bch_writepage_io *io = w->io;
-
- w->io = NULL;
- closure_call(&io->op.cl, bch2_write, NULL, NULL);
-}
-
-/*
- * Get a bch_writepage_io and add @page to it - appending to an existing one if
- * possible, else allocating a new one:
- */
-static void bch2_writepage_io_alloc(struct bch_fs *c,
- struct writeback_control *wbc,
- struct bch_writepage_state *w,
- struct bch_inode_info *inode,
- u64 sector,
- unsigned nr_replicas)
-{
- struct bch_write_op *op;
-
- w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
- REQ_OP_WRITE,
- GFP_KERNEL,
- &c->writepage_bioset),
- struct bch_writepage_io, op.wbio.bio);
-
- w->io->inode = inode;
- op = &w->io->op;
- bch2_write_op_init(op, c, w->opts);
- op->target = w->opts.foreground_target;
- op->nr_replicas = nr_replicas;
- op->res.nr_replicas = nr_replicas;
- op->write_point = writepoint_hashed(inode->ei_last_dirtied);
- op->subvol = inode->ei_inum.subvol;
- op->pos = POS(inode->v.i_ino, sector);
- op->end_io = bch2_writepage_io_done;
- op->devs_need_flush = &inode->ei_devs_need_flush;
- op->wbio.bio.bi_iter.bi_sector = sector;
- op->wbio.bio.bi_opf = wbc_to_write_flags(wbc);
-}
-
-static int __bch2_writepage(struct folio *folio,
- struct writeback_control *wbc,
- void *data)
-{
- struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_writepage_state *w = data;
- struct bch_folio *s;
- unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX;
- loff_t i_size = i_size_read(&inode->v);
- int ret;
-
- EBUG_ON(!folio_test_uptodate(folio));
-
- /* Is the folio fully inside i_size? */
- if (folio_end_pos(folio) <= i_size)
- goto do_io;
-
- /* Is the folio fully outside i_size? (truncate in progress) */
- if (folio_pos(folio) >= i_size) {
- folio_unlock(folio);
- return 0;
- }
-
- /*
- * The folio straddles i_size. It must be zeroed out on each and every
- * writepage invocation because it may be mmapped. "A file is mapped
- * in multiples of the folio size. For a file that is not a multiple of
- * the folio size, the remaining memory is zeroed when mapped, and
- * writes to that region are not written out to the file."
- */
- folio_zero_segment(folio,
- i_size - folio_pos(folio),
- folio_size(folio));
-do_io:
- f_sectors = folio_sectors(folio);
- s = bch2_folio(folio);
-
- if (f_sectors > w->tmp_sectors) {
- kfree(w->tmp);
- w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL);
- w->tmp_sectors = f_sectors;
- }
-
- /*
- * Things get really hairy with errors during writeback:
- */
- ret = bch2_get_folio_disk_reservation(c, inode, folio, false);
- BUG_ON(ret);
-
- /* Before unlocking the page, get copy of reservations: */
- spin_lock(&s->lock);
- memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors);
-
- for (i = 0; i < f_sectors; i++) {
- if (s->s[i].state < SECTOR_dirty)
- continue;
-
- nr_replicas_this_write =
- min_t(unsigned, nr_replicas_this_write,
- s->s[i].nr_replicas +
- s->s[i].replicas_reserved);
- }
-
- for (i = 0; i < f_sectors; i++) {
- if (s->s[i].state < SECTOR_dirty)
- continue;
-
- s->s[i].nr_replicas = w->opts.compression
- ? 0 : nr_replicas_this_write;
-
- s->s[i].replicas_reserved = 0;
- bch2_folio_sector_set(folio, s, i, SECTOR_allocated);
- }
- spin_unlock(&s->lock);
-
- BUG_ON(atomic_read(&s->write_count));
- atomic_set(&s->write_count, 1);
-
- BUG_ON(folio_test_writeback(folio));
- folio_start_writeback(folio);
-
- folio_unlock(folio);
-
- offset = 0;
- while (1) {
- unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
- u64 sector;
-
- while (offset < f_sectors &&
- w->tmp[offset].state < SECTOR_dirty)
- offset++;
-
- if (offset == f_sectors)
- break;
-
- while (offset + sectors < f_sectors &&
- w->tmp[offset + sectors].state >= SECTOR_dirty) {
- reserved_sectors += w->tmp[offset + sectors].replicas_reserved;
- dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty;
- sectors++;
- }
- BUG_ON(!sectors);
-
- sector = folio_sector(folio) + offset;
-
- if (w->io &&
- (w->io->op.res.nr_replicas != nr_replicas_this_write ||
- bch_io_full(w->io, sectors << 9) ||
- bio_end_sector(&w->io->op.wbio.bio) != sector))
- bch2_writepage_do_io(w);
-
- if (!w->io)
- bch2_writepage_io_alloc(c, wbc, w, inode, sector,
- nr_replicas_this_write);
-
- atomic_inc(&s->write_count);
-
- BUG_ON(inode != w->io->inode);
- BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
- sectors << 9, offset << 9));
-
- w->io->op.res.sectors += reserved_sectors;
- w->io->op.i_sectors_delta -= dirty_sectors;
- w->io->op.new_i_size = i_size;
-
- offset += sectors;
- }
-
- if (atomic_dec_and_test(&s->write_count))
- folio_end_writeback(folio);
-
- return 0;
-}
-
-int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
-{
- struct bch_fs *c = mapping->host->i_sb->s_fs_info;
- struct bch_writepage_state w =
- bch_writepage_state_init(c, to_bch_ei(mapping->host));
- struct blk_plug plug;
- int ret;
-
- blk_start_plug(&plug);
- ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
- if (w.io)
- bch2_writepage_do_io(&w);
- blk_finish_plug(&plug);
- kfree(w.tmp);
- return bch2_err_class(ret);
-}
-
-/* buffered writes: */
-
-int bch2_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
-{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch2_folio_reservation *res;
- struct folio *folio;
- unsigned offset;
- int ret = -ENOMEM;
-
- res = kmalloc(sizeof(*res), GFP_KERNEL);
- if (!res)
- return -ENOMEM;
-
- bch2_folio_reservation_init(c, inode, res);
- *fsdata = res;
-
- bch2_pagecache_add_get(inode);
-
- folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
- FGP_WRITEBEGIN | fgf_set_order(len),
- mapping_gfp_mask(mapping));
- if (IS_ERR(folio))
- goto err_unlock;
-
- offset = pos - folio_pos(folio);
- len = min_t(size_t, len, folio_end_pos(folio) - pos);
-
- if (folio_test_uptodate(folio))
- goto out;
-
-	/* If we're writing the entire folio, we don't need to read it in first: */
- if (!offset && len == folio_size(folio))
- goto out;
-
- if (!offset && pos + len >= inode->v.i_size) {
- folio_zero_segment(folio, len, folio_size(folio));
- flush_dcache_folio(folio);
- goto out;
- }
-
- if (folio_pos(folio) >= inode->v.i_size) {
- folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio));
- flush_dcache_folio(folio);
- goto out;
- }
-readpage:
- ret = bch2_read_single_folio(folio, mapping);
- if (ret)
- goto err;
-out:
- ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
- if (ret)
- goto err;
-
- ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len);
- if (ret) {
- if (!folio_test_uptodate(folio)) {
- /*
- * If the folio hasn't been read in, we won't know if we
-			 * actually need a reservation - we don't need to read
-			 * the data here, we just need to check whether the
-			 * folio is fully backed by uncompressed data:
- */
- goto readpage;
- }
-
- goto err;
- }
-
- *foliop = folio;
- return 0;
-err:
- folio_unlock(folio);
- folio_put(folio);
-err_unlock:
- bch2_pagecache_add_put(inode);
- kfree(res);
- *fsdata = NULL;
- return bch2_err_class(ret);
-}
-
-int bch2_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
-{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch2_folio_reservation *res = fsdata;
- unsigned offset = pos - folio_pos(folio);
-
- lockdep_assert_held(&inode->v.i_rwsem);
- BUG_ON(offset + copied > folio_size(folio));
-
- if (unlikely(copied < len && !folio_test_uptodate(folio))) {
- /*
- * The folio needs to be read in, but that would destroy
- * our partial write - simplest thing is to just force
- * userspace to redo the write:
- */
- folio_zero_range(folio, 0, folio_size(folio));
- flush_dcache_folio(folio);
- copied = 0;
- }
-
- spin_lock(&inode->v.i_lock);
- if (pos + copied > inode->v.i_size)
- i_size_write(&inode->v, pos + copied);
- spin_unlock(&inode->v.i_lock);
-
- if (copied) {
- if (!folio_test_uptodate(folio))
- folio_mark_uptodate(folio);
-
- bch2_set_folio_dirty(c, inode, folio, res, offset, copied);
-
- inode->ei_last_dirtied = (unsigned long) current;
- }
-
- folio_unlock(folio);
- folio_put(folio);
- bch2_pagecache_add_put(inode);
-
- bch2_folio_reservation_put(c, inode, res);
- kfree(res);
-
- return copied;
-}
-
-static noinline void folios_trunc(folios *fs, struct folio **fi)
-{
- while (fs->data + fs->nr > fi) {
- struct folio *f = darray_pop(fs);
-
- folio_unlock(f);
- folio_put(f);
- }
-}
-
-static int __bch2_buffered_write(struct bch_inode_info *inode,
- struct address_space *mapping,
- struct iov_iter *iter,
- loff_t pos, unsigned len)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch2_folio_reservation res;
- folios fs;
- struct folio *f;
- unsigned copied = 0, f_offset, f_copied;
- u64 end = pos + len, f_pos, f_len;
- loff_t last_folio_pos = inode->v.i_size;
- int ret = 0;
-
- BUG_ON(!len);
-
- bch2_folio_reservation_init(c, inode, &res);
- darray_init(&fs);
-
- ret = bch2_filemap_get_contig_folios_d(mapping, pos, end,
- FGP_WRITEBEGIN | fgf_set_order(len),
- mapping_gfp_mask(mapping), &fs);
- if (ret)
- goto out;
-
- BUG_ON(!fs.nr);
-
- f = darray_first(fs);
- if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
- ret = bch2_read_single_folio(f, mapping);
- if (ret)
- goto out;
- }
-
- f = darray_last(fs);
- end = min(end, folio_end_pos(f));
- last_folio_pos = folio_pos(f);
- if (end != folio_end_pos(f) && !folio_test_uptodate(f)) {
- if (end >= inode->v.i_size) {
- folio_zero_range(f, 0, folio_size(f));
- } else {
- ret = bch2_read_single_folio(f, mapping);
- if (ret)
- goto out;
- }
- }
-
- ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr);
- if (ret)
- goto out;
-
- f_pos = pos;
- f_offset = pos - folio_pos(darray_first(fs));
- darray_for_each(fs, fi) {
- ssize_t f_reserved;
-
- f = *fi;
- f_len = min(end, folio_end_pos(f)) - f_pos;
- f_reserved = bch2_folio_reservation_get_partial(c, inode, f, &res, f_offset, f_len);
-
- if (unlikely(f_reserved != f_len)) {
- if (f_reserved < 0) {
- if (f == darray_first(fs)) {
- ret = f_reserved;
- goto out;
- }
-
- folios_trunc(&fs, fi);
- end = min(end, folio_end_pos(darray_last(fs)));
- } else {
- if (!folio_test_uptodate(f)) {
- ret = bch2_read_single_folio(f, mapping);
- if (ret)
- goto out;
- }
-
- folios_trunc(&fs, fi + 1);
- end = f_pos + f_reserved;
- }
-
- break;
- }
-
- f_pos = folio_end_pos(f);
- f_offset = 0;
- }
-
- if (mapping_writably_mapped(mapping))
- darray_for_each(fs, fi)
- flush_dcache_folio(*fi);
-
- f_pos = pos;
- f_offset = pos - folio_pos(darray_first(fs));
- darray_for_each(fs, fi) {
- f = *fi;
- f_len = min(end, folio_end_pos(f)) - f_pos;
- f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter);
- if (!f_copied) {
- folios_trunc(&fs, fi);
- break;
- }
-
- if (!folio_test_uptodate(f) &&
- f_copied != folio_size(f) &&
- pos + copied + f_copied < inode->v.i_size) {
- iov_iter_revert(iter, f_copied);
- folio_zero_range(f, 0, folio_size(f));
- folios_trunc(&fs, fi);
- break;
- }
-
- flush_dcache_folio(f);
- copied += f_copied;
-
- if (f_copied != f_len) {
- folios_trunc(&fs, fi + 1);
- break;
- }
-
- f_pos = folio_end_pos(f);
- f_offset = 0;
- }
-
- if (!copied)
- goto out;
-
- end = pos + copied;
-
- spin_lock(&inode->v.i_lock);
- if (end > inode->v.i_size)
- i_size_write(&inode->v, end);
- spin_unlock(&inode->v.i_lock);
-
- f_pos = pos;
- f_offset = pos - folio_pos(darray_first(fs));
- darray_for_each(fs, fi) {
- f = *fi;
- f_len = min(end, folio_end_pos(f)) - f_pos;
-
- if (!folio_test_uptodate(f))
- folio_mark_uptodate(f);
-
- bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len);
-
- f_pos = folio_end_pos(f);
- f_offset = 0;
- }
-
- inode->ei_last_dirtied = (unsigned long) current;
-out:
- darray_for_each(fs, fi) {
- folio_unlock(*fi);
- folio_put(*fi);
- }
-
- /*
- * If the last folio added to the mapping starts beyond current EOF, we
- * performed a short write but left around at least one post-EOF folio.
- * Clean up the mapping before we return.
- */
- if (last_folio_pos >= inode->v.i_size)
- truncate_pagecache(&inode->v, inode->v.i_size);
-
- darray_exit(&fs);
- bch2_folio_reservation_put(c, inode, &res);
-
- return copied ?: ret;
-}
-
-static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct bch_inode_info *inode = file_bch_inode(file);
- loff_t pos = iocb->ki_pos;
- ssize_t written = 0;
- int ret = 0;
-
- bch2_pagecache_add_get(inode);
-
- do {
- unsigned offset = pos & (PAGE_SIZE - 1);
- unsigned bytes = iov_iter_count(iter);
-again:
- /*
- * Bring in the user page that we will copy from _first_.
- * Otherwise there's a nasty deadlock on copying from the
- * same page as we're writing to, without it being marked
- * up-to-date.
- *
- * Not only is this an optimisation, but it is also required
- * to check that the address is actually valid, when atomic
- * usercopies are used, below.
- */
- if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
- bytes = min_t(unsigned long, iov_iter_count(iter),
- PAGE_SIZE - offset);
-
- if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
- ret = -EFAULT;
- break;
- }
- }
-
- if (unlikely(fatal_signal_pending(current))) {
- ret = -EINTR;
- break;
- }
-
- ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
- if (unlikely(ret < 0))
- break;
-
- cond_resched();
-
- if (unlikely(ret == 0)) {
- /*
- * If we were unable to copy any data at all, we must
- * fall back to a single segment length write.
- *
-			 * If we didn't fall back here, we could livelock
- * because not all segments in the iov can be copied at
- * once without a pagefault.
- */
- bytes = min_t(unsigned long, PAGE_SIZE - offset,
- iov_iter_single_seg_count(iter));
- goto again;
- }
- pos += ret;
- written += ret;
- ret = 0;
-
- balance_dirty_pages_ratelimited(mapping);
- } while (iov_iter_count(iter));
-
- bch2_pagecache_add_put(inode);
-
- return written ? written : ret;
-}
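
The copy loop above has to guarantee forward progress even when only part of
the iov_iter can be copied at once, shrinking to a single-segment copy when
nothing was copied at all. Userspace vectored I/O has the same shape of
problem: writev(2) may write fewer bytes than the iovec array describes, and
the caller must advance across segments and retry. A minimal POSIX sketch of
that bookkeeping (an analogy, not bcachefs code):

#include <sys/uio.h>
#include <unistd.h>

/* Write everything described by iov[0..iovcnt), retrying on short writes. */
static ssize_t writev_all(int fd, struct iovec *iov, int iovcnt)
{
        ssize_t total = 0;

        while (iovcnt) {
                ssize_t n = writev(fd, iov, iovcnt);
                if (n < 0)
                        return -1;
                total += n;

                /* Skip segments that were written in full: */
                while (iovcnt && (size_t) n >= iov->iov_len) {
                        n -= iov->iov_len;
                        iov++;
                        iovcnt--;
                }
                /* Adjust the partially written segment in place: */
                if (iovcnt) {
                        iov->iov_base = (char *) iov->iov_base + n;
                        iov->iov_len -= n;
                }
        }
        return total;
}

int main(void)
{
        struct iovec iov[2] = {
                { .iov_base = "hello ", .iov_len = 6 },
                { .iov_base = "world\n", .iov_len = 6 },
        };
        return writev_all(STDOUT_FILENO, iov, 2) == 12 ? 0 : 1;
}

The inner loop is the same bookkeeping iov_iter performs for the kernel:
fully consumed segments are skipped, the partially consumed one is adjusted.
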
-
-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
- struct file *file = iocb->ki_filp;
- struct bch_inode_info *inode = file_bch_inode(file);
- ssize_t ret;
-
- if (iocb->ki_flags & IOCB_DIRECT) {
- ret = bch2_direct_write(iocb, from);
- goto out;
- }
-
- inode_lock(&inode->v);
-
- ret = generic_write_checks(iocb, from);
- if (ret <= 0)
- goto unlock;
-
- ret = file_remove_privs(file);
- if (ret)
- goto unlock;
-
- ret = file_update_time(file);
- if (ret)
- goto unlock;
-
- ret = bch2_buffered_write(iocb, from);
- if (likely(ret > 0))
- iocb->ki_pos += ret;
-unlock:
- inode_unlock(&inode->v);
-
- if (ret > 0)
- ret = generic_write_sync(iocb, ret);
-out:
- return bch2_err_class(ret);
-}
-
-void bch2_fs_fs_io_buffered_exit(struct bch_fs *c)
-{
- bioset_exit(&c->writepage_bioset);
-}
-
-int bch2_fs_fs_io_buffered_init(struct bch_fs *c)
-{
- if (bioset_init(&c->writepage_bioset,
- 4, offsetof(struct bch_writepage_io, op.wbio.bio),
- BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_writepage_bioset_init;
-
- return 0;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h
deleted file mode 100644
index 3207ebbb4ab4..000000000000
--- a/fs/bcachefs/fs-io-buffered.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IO_BUFFERED_H
-#define _BCACHEFS_FS_IO_BUFFERED_H
-
-#ifndef NO_BCACHEFS_FS
-
-int bch2_read_single_folio(struct folio *, struct address_space *);
-int bch2_read_folio(struct file *, struct folio *);
-
-int bch2_writepages(struct address_space *, struct writeback_control *);
-void bch2_readahead(struct readahead_control *);
-
-int bch2_write_begin(struct file *, struct address_space *, loff_t pos,
- unsigned len, struct folio **, void **);
-int bch2_write_end(struct file *, struct address_space *, loff_t,
- unsigned len, unsigned copied, struct folio *, void *);
-
-ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
-
-void bch2_fs_fs_io_buffered_exit(struct bch_fs *);
-int bch2_fs_fs_io_buffered_init(struct bch_fs *);
-#else
-static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {}
-static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; }
-#endif
-
-#endif /* _BCACHEFS_FS_IO_BUFFERED_H */
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
deleted file mode 100644
index 535bc5fcbcc0..000000000000
--- a/fs/bcachefs/fs-io-direct.c
+++ /dev/null
@@ -1,703 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "fs.h"
-#include "fs-io.h"
-#include "fs-io-direct.h"
-#include "fs-io-pagecache.h"
-#include "io_read.h"
-#include "io_write.h"
-
-#include <linux/kthread.h>
-#include <linux/pagemap.h>
-#include <linux/prefetch.h>
-#include <linux/task_io_accounting_ops.h>
-
-/* O_DIRECT reads */
-
-struct dio_read {
- struct closure cl;
- struct kiocb *req;
- long ret;
- bool should_dirty;
- struct bch_read_bio rbio;
-};
-
-static void bio_check_or_release(struct bio *bio, bool check_dirty)
-{
- if (check_dirty) {
- bio_check_pages_dirty(bio);
- } else {
- bio_release_pages(bio, false);
- bio_put(bio);
- }
-}
-
-static CLOSURE_CALLBACK(bch2_dio_read_complete)
-{
- closure_type(dio, struct dio_read, cl);
-
- dio->req->ki_complete(dio->req, dio->ret);
- bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
-}
-
-static void bch2_direct_IO_read_endio(struct bio *bio)
-{
- struct dio_read *dio = bio->bi_private;
-
- if (bio->bi_status)
- dio->ret = blk_status_to_errno(bio->bi_status);
-
- closure_put(&dio->cl);
-}
-
-static void bch2_direct_IO_read_split_endio(struct bio *bio)
-{
- struct dio_read *dio = bio->bi_private;
- bool should_dirty = dio->should_dirty;
-
- bch2_direct_IO_read_endio(bio);
- bio_check_or_release(bio, should_dirty);
-}
-
-static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
-{
- struct file *file = req->ki_filp;
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_io_opts opts;
- struct dio_read *dio;
- struct bio *bio;
- struct blk_plug plug;
- loff_t offset = req->ki_pos;
- bool sync = is_sync_kiocb(req);
- bool split = false;
- size_t shorten;
- ssize_t ret;
-
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
-	/* bios must be 512-byte aligned: */
- if ((offset|iter->count) & (SECTOR_SIZE - 1))
- return -EINVAL;
-
- ret = min_t(loff_t, iter->count,
- max_t(loff_t, 0, i_size_read(&inode->v) - offset));
-
- if (!ret)
- return ret;
-
- shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
- if (shorten >= iter->count)
- shorten = 0;
- iter->count -= shorten;
-
- bio = bio_alloc_bioset(NULL,
- bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
- REQ_OP_READ,
- GFP_KERNEL,
- &c->dio_read_bioset);
-
- dio = container_of(bio, struct dio_read, rbio.bio);
- closure_init(&dio->cl, NULL);
-
- /*
- * this is a _really_ horrible hack just to avoid an atomic sub at the
- * end:
- */
- if (!sync) {
- set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
- atomic_set(&dio->cl.remaining,
- CLOSURE_REMAINING_INITIALIZER -
- CLOSURE_RUNNING +
- CLOSURE_DESTRUCTOR);
- } else {
- atomic_set(&dio->cl.remaining,
- CLOSURE_REMAINING_INITIALIZER + 1);
- dio->cl.closure_get_happened = true;
- }
-
- dio->req = req;
- dio->ret = ret;
- /*
- * This is one of the sketchier things I've encountered: we have to skip
-	 * the dirtying of requests that are internal to the kernel (i.e. from
- * loopback), because we'll deadlock on page_lock.
- */
- dio->should_dirty = iter_is_iovec(iter);
-
- blk_start_plug(&plug);
-
- goto start;
- while (iter->count) {
- split = true;
-
- bio = bio_alloc_bioset(NULL,
- bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
- REQ_OP_READ,
- GFP_KERNEL,
- &c->bio_read);
-start:
- bio->bi_opf = REQ_OP_READ|REQ_SYNC;
- bio->bi_iter.bi_sector = offset >> 9;
- bio->bi_private = dio;
-
- ret = bio_iov_iter_get_pages(bio, iter);
- if (ret < 0) {
- /* XXX: fault inject this path */
- bio->bi_status = BLK_STS_RESOURCE;
- bio_endio(bio);
- break;
- }
-
- offset += bio->bi_iter.bi_size;
-
- if (dio->should_dirty)
- bio_set_pages_dirty(bio);
-
- if (iter->count)
- closure_get(&dio->cl);
-
- struct bch_read_bio *rbio =
- rbio_init(bio,
- c,
- opts,
- split
- ? bch2_direct_IO_read_split_endio
- : bch2_direct_IO_read_endio);
-
- bch2_read(c, rbio, inode_inum(inode));
- }
-
- blk_finish_plug(&plug);
-
- iter->count += shorten;
-
- if (sync) {
- closure_sync(&dio->cl);
- closure_debug_destroy(&dio->cl);
- ret = dio->ret;
- bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
- return ret;
- } else {
- return -EIOCBQUEUED;
- }
-}
-
-ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- struct bch_inode_info *inode = file_bch_inode(file);
- struct address_space *mapping = file->f_mapping;
- size_t count = iov_iter_count(iter);
- ssize_t ret = 0;
-
- if (!count)
- return 0; /* skip atime */
-
- if (iocb->ki_flags & IOCB_DIRECT) {
- struct blk_plug plug;
-
- if (unlikely(mapping->nrpages)) {
- ret = filemap_write_and_wait_range(mapping,
- iocb->ki_pos,
- iocb->ki_pos + count - 1);
- if (ret < 0)
- goto out;
- }
-
- file_accessed(file);
-
- blk_start_plug(&plug);
- ret = bch2_direct_IO_read(iocb, iter);
- blk_finish_plug(&plug);
-
- if (ret >= 0)
- iocb->ki_pos += ret;
- } else {
- bch2_pagecache_add_get(inode);
- ret = filemap_read(iocb, iter, ret);
- bch2_pagecache_add_put(inode);
- }
-out:
- return bch2_err_class(ret);
-}
-
-/* O_DIRECT writes */
-
-struct dio_write {
- struct kiocb *req;
- struct address_space *mapping;
- struct bch_inode_info *inode;
- struct mm_struct *mm;
- const struct iovec *iov;
- unsigned loop:1,
- extending:1,
- sync:1,
- flush:1;
- struct quota_res quota_res;
- u64 written;
-
- struct iov_iter iter;
- struct iovec inline_vecs[2];
-
- /* must be last: */
- struct bch_write_op op;
-};
-
-static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
- u64 offset, u64 size,
- unsigned nr_replicas, bool compressed)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 end = offset + size;
- u32 snapshot;
- bool ret = true;
- int err;
-retry:
- bch2_trans_begin(trans);
-
- err = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (err)
- goto err;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
- SPOS(inum.inum, offset, snapshot),
- BTREE_ITER_slots, k, err) {
- if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
- break;
-
- if (k.k->p.snapshot != snapshot ||
- nr_replicas > bch2_bkey_replicas(c, k) ||
- (!compressed && bch2_bkey_sectors_compressed(k))) {
- ret = false;
- break;
- }
- }
-
- offset = iter.pos.offset;
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(err, BCH_ERR_transaction_restart))
- goto retry;
- bch2_trans_put(trans);
-
- return err ? false : ret;
-}
-
-static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
-{
- struct bch_fs *c = dio->op.c;
- struct bch_inode_info *inode = dio->inode;
- struct bio *bio = &dio->op.wbio.bio;
-
- return bch2_check_range_allocated(c, inode_inum(inode),
- dio->op.pos.offset, bio_sectors(bio),
- dio->op.opts.data_replicas,
- dio->op.opts.compression != 0);
-}
-
-static void bch2_dio_write_loop_async(struct bch_write_op *);
-static __always_inline long bch2_dio_write_done(struct dio_write *dio);
-
-/*
- * We're going to return -EIOCBQUEUED, but we haven't finished consuming the
- * iov_iter yet, so we need to stash a copy of the iovec: it might be on the
- * caller's stack, we're not guaranteed that it will live for the duration of
- * the IO:
- */
-static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
-{
- struct iovec *iov = dio->inline_vecs;
-
- /*
- * iov_iter has a single embedded iovec - nothing to do:
- */
- if (iter_is_ubuf(&dio->iter))
- return 0;
-
- /*
- * We don't currently handle non-iovec iov_iters here - return an error,
- * and we'll fall back to doing the IO synchronously:
- */
- if (!iter_is_iovec(&dio->iter))
- return -1;
-
- if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
- dio->iov = iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
- GFP_KERNEL);
- if (unlikely(!iov))
- return -ENOMEM;
- }
-
- memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov));
- dio->iter.__iov = iov;
- return 0;
-}
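
The same lifetime problem appears in any userspace code that queues an iovec
array to a worker thread and returns before the I/O completes: the caller's
array may live on its stack. A hypothetical sketch of the stashing step
(stash_iov is an illustrative name, not a bcachefs function):

#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

/*
 * Deep-copy an iovec array so it outlives the caller's stack frame,
 * analogous to what bch2_dio_write_copy_iov() does with dio->inline_vecs
 * and kmalloc_array(). The caller frees the result.
 */
static struct iovec *stash_iov(const struct iovec *iov, int iovcnt)
{
        struct iovec *copy = malloc(sizeof(*copy) * iovcnt);

        if (copy)
                memcpy(copy, iov, sizeof(*copy) * iovcnt);
        return copy;
}

int main(void)
{
        struct iovec on_stack = { .iov_base = "x", .iov_len = 1 };
        struct iovec *stashed = stash_iov(&on_stack, 1);
        int ok = stashed != NULL;

        /* stashed stays valid after on_stack's frame is gone */
        free(stashed);
        return !ok;
}
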
-
-static CLOSURE_CALLBACK(bch2_dio_write_flush_done)
-{
- closure_type(dio, struct dio_write, op.cl);
- struct bch_fs *c = dio->op.c;
-
- closure_debug_destroy(cl);
-
- dio->op.error = bch2_journal_error(&c->journal);
-
- bch2_dio_write_done(dio);
-}
-
-static noinline void bch2_dio_write_flush(struct dio_write *dio)
-{
- struct bch_fs *c = dio->op.c;
- struct bch_inode_unpacked inode;
- int ret;
-
- dio->flush = 0;
-
- closure_init(&dio->op.cl, NULL);
-
- if (!dio->op.error) {
- ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
- if (ret) {
- dio->op.error = ret;
- } else {
- bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq,
- &dio->op.cl);
- bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl);
- }
- }
-
- if (dio->sync) {
- closure_sync(&dio->op.cl);
- closure_debug_destroy(&dio->op.cl);
- } else {
- continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
- }
-}
-
-static __always_inline long bch2_dio_write_done(struct dio_write *dio)
-{
- struct bch_fs *c = dio->op.c;
- struct kiocb *req = dio->req;
- struct bch_inode_info *inode = dio->inode;
- bool sync = dio->sync;
- long ret;
-
- if (unlikely(dio->flush)) {
- bch2_dio_write_flush(dio);
- if (!sync)
- return -EIOCBQUEUED;
- }
-
- bch2_pagecache_block_put(inode);
-
- kfree(dio->iov);
-
- ret = dio->op.error ?: ((long) dio->written << 9);
- bio_put(&dio->op.wbio.bio);
-
- bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
-
- /* inode->i_dio_count is our ref on inode and thus bch_fs */
- inode_dio_end(&inode->v);
-
- if (ret < 0)
- ret = bch2_err_class(ret);
-
- if (!sync) {
- req->ki_complete(req, ret);
- ret = -EIOCBQUEUED;
- }
- return ret;
-}
-
-static __always_inline void bch2_dio_write_end(struct dio_write *dio)
-{
- struct bch_fs *c = dio->op.c;
- struct kiocb *req = dio->req;
- struct bch_inode_info *inode = dio->inode;
- struct bio *bio = &dio->op.wbio.bio;
-
- req->ki_pos += (u64) dio->op.written << 9;
- dio->written += dio->op.written;
-
- if (dio->extending) {
- spin_lock(&inode->v.i_lock);
- if (req->ki_pos > inode->v.i_size)
- i_size_write(&inode->v, req->ki_pos);
- spin_unlock(&inode->v.i_lock);
- }
-
- if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
- mutex_lock(&inode->ei_quota_lock);
- __bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
- __bch2_quota_reservation_put(c, inode, &dio->quota_res);
- mutex_unlock(&inode->ei_quota_lock);
- }
-
- bio_release_pages(bio, false);
-
- if (unlikely(dio->op.error))
- set_bit(EI_INODE_ERROR, &inode->ei_flags);
-}
-
-static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
-{
- struct bch_fs *c = dio->op.c;
- struct kiocb *req = dio->req;
- struct address_space *mapping = dio->mapping;
- struct bch_inode_info *inode = dio->inode;
- struct bch_io_opts opts;
- struct bio *bio = &dio->op.wbio.bio;
- unsigned unaligned, iter_count;
- bool sync = dio->sync, dropped_locks;
- long ret;
-
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
- while (1) {
- iter_count = dio->iter.count;
-
- EBUG_ON(current->faults_disabled_mapping);
- current->faults_disabled_mapping = mapping;
-
- ret = bio_iov_iter_get_pages(bio, &dio->iter);
-
- dropped_locks = fdm_dropped_locks();
-
- current->faults_disabled_mapping = NULL;
-
- /*
- * If the fault handler returned an error but also signalled
- * that it dropped & retook ei_pagecache_lock, we just need to
- * re-shoot down the page cache and retry:
- */
- if (dropped_locks && ret)
- ret = 0;
-
- if (unlikely(ret < 0))
- goto err;
-
- if (unlikely(dropped_locks)) {
- ret = bch2_write_invalidate_inode_pages_range(mapping,
- req->ki_pos,
- req->ki_pos + iter_count - 1);
- if (unlikely(ret))
- goto err;
-
- if (!bio->bi_iter.bi_size)
- continue;
- }
-
- unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
- bio->bi_iter.bi_size -= unaligned;
- iov_iter_revert(&dio->iter, unaligned);
-
- if (!bio->bi_iter.bi_size) {
- /*
- * bio_iov_iter_get_pages was only able to get <
- * blocksize worth of pages:
- */
- ret = -EFAULT;
- goto err;
- }
-
- bch2_write_op_init(&dio->op, c, opts);
- dio->op.end_io = sync
- ? NULL
- : bch2_dio_write_loop_async;
- dio->op.target = dio->op.opts.foreground_target;
- dio->op.write_point = writepoint_hashed((unsigned long) current);
- dio->op.nr_replicas = dio->op.opts.data_replicas;
- dio->op.subvol = inode->ei_inum.subvol;
- dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
- dio->op.devs_need_flush = &inode->ei_devs_need_flush;
-
- if (sync)
- dio->op.flags |= BCH_WRITE_sync;
- dio->op.flags |= BCH_WRITE_check_enospc;
-
- ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
- bio_sectors(bio), true);
- if (unlikely(ret))
- goto err;
-
- ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
- dio->op.opts.data_replicas, 0);
- if (unlikely(ret) &&
- !bch2_dio_write_check_allocated(dio))
- goto err;
-
- task_io_account_write(bio->bi_iter.bi_size);
-
- if (unlikely(dio->iter.count) &&
- !dio->sync &&
- !dio->loop &&
- bch2_dio_write_copy_iov(dio))
- dio->sync = sync = true;
-
- dio->loop = true;
- closure_call(&dio->op.cl, bch2_write, NULL, NULL);
-
- if (!sync)
- return -EIOCBQUEUED;
-
- bch2_dio_write_end(dio);
-
- if (likely(!dio->iter.count) || dio->op.error)
- break;
-
- bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
- }
-out:
- return bch2_dio_write_done(dio);
-err:
- dio->op.error = ret;
-
- bio_release_pages(bio, false);
-
- bch2_quota_reservation_put(c, inode, &dio->quota_res);
- goto out;
-}
-
-static noinline __cold void bch2_dio_write_continue(struct dio_write *dio)
-{
- struct mm_struct *mm = dio->mm;
-
- bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);
-
- if (mm)
- kthread_use_mm(mm);
- bch2_dio_write_loop(dio);
- if (mm)
- kthread_unuse_mm(mm);
-}
-
-static void bch2_dio_write_loop_async(struct bch_write_op *op)
-{
- struct dio_write *dio = container_of(op, struct dio_write, op);
-
- bch2_dio_write_end(dio);
-
- if (likely(!dio->iter.count) || dio->op.error)
- bch2_dio_write_done(dio);
- else
- bch2_dio_write_continue(dio);
-}
-
-ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
-{
- struct file *file = req->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct dio_write *dio;
- struct bio *bio;
- bool locked = true, extending;
- ssize_t ret;
-
- prefetch(&c->opts);
- prefetch((void *) &c->opts + 64);
- prefetch(&inode->ei_inode);
- prefetch((void *) &inode->ei_inode + 64);
-
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write))
- return -EROFS;
-
- inode_lock(&inode->v);
-
- ret = generic_write_checks(req, iter);
- if (unlikely(ret <= 0))
- goto err_put_write_ref;
-
- ret = file_remove_privs(file);
- if (unlikely(ret))
- goto err_put_write_ref;
-
- ret = file_update_time(file);
- if (unlikely(ret))
- goto err_put_write_ref;
-
- if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) {
- ret = -EINVAL;
- goto err_put_write_ref;
- }
-
- inode_dio_begin(&inode->v);
- bch2_pagecache_block_get(inode);
-
- extending = req->ki_pos + iter->count > inode->v.i_size;
- if (!extending) {
- inode_unlock(&inode->v);
- locked = false;
- }
-
- bio = bio_alloc_bioset(NULL,
- bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
- REQ_OP_WRITE | REQ_SYNC | REQ_IDLE,
- GFP_KERNEL,
- &c->dio_write_bioset);
- dio = container_of(bio, struct dio_write, op.wbio.bio);
- dio->req = req;
- dio->mapping = mapping;
- dio->inode = inode;
- dio->mm = current->mm;
- dio->iov = NULL;
- dio->loop = false;
- dio->extending = extending;
- dio->sync = is_sync_kiocb(req) || extending;
- dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
- dio->quota_res.sectors = 0;
- dio->written = 0;
- dio->iter = *iter;
- dio->op.c = c;
-
- if (unlikely(mapping->nrpages)) {
- ret = bch2_write_invalidate_inode_pages_range(mapping,
- req->ki_pos,
- req->ki_pos + iter->count - 1);
- if (unlikely(ret))
- goto err_put_bio;
- }
-
- ret = bch2_dio_write_loop(dio);
-out:
- if (locked)
- inode_unlock(&inode->v);
- return ret;
-err_put_bio:
- bch2_pagecache_block_put(inode);
- bio_put(bio);
- inode_dio_end(&inode->v);
-err_put_write_ref:
- bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
- goto out;
-}
-
-void bch2_fs_fs_io_direct_exit(struct bch_fs *c)
-{
- bioset_exit(&c->dio_write_bioset);
- bioset_exit(&c->dio_read_bioset);
-}
-
-int bch2_fs_fs_io_direct_init(struct bch_fs *c)
-{
- if (bioset_init(&c->dio_read_bioset,
- 4, offsetof(struct dio_read, rbio.bio),
- BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_dio_read_bioset_init;
-
- if (bioset_init(&c->dio_write_bioset,
- 4, offsetof(struct dio_write, op.wbio.bio),
- BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_dio_write_bioset_init;
-
- return 0;
-}
-
-#endif /* NO_BCACHEFS_FS */
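
Seen from userspace, the alignment checks in the deleted read and write paths
are the usual O_DIRECT contract: buffer address, file offset and length must
all be suitably aligned. A minimal sketch; the 4096-byte alignment is a
conservative assumption here, not something bcachefs specifies:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        void *buf;
        ssize_t n;
        int fd;

        if (argc != 2)
                return 1;

        fd = open(argv[1], O_RDONLY | O_DIRECT);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* Buffer, offset and length all aligned; offset 0 trivially so: */
        if (posix_memalign(&buf, 4096, 4096)) {
                close(fd);
                return 1;
        }

        n = pread(fd, buf, 4096, 0);
        if (n < 0)
                perror("pread");

        free(buf);
        close(fd);
        return n < 0;
}
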
diff --git a/fs/bcachefs/fs-io-direct.h b/fs/bcachefs/fs-io-direct.h
deleted file mode 100644
index 814621ec7f81..000000000000
--- a/fs/bcachefs/fs-io-direct.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IO_DIRECT_H
-#define _BCACHEFS_FS_IO_DIRECT_H
-
-#ifndef NO_BCACHEFS_FS
-ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *);
-ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
-
-void bch2_fs_fs_io_direct_exit(struct bch_fs *);
-int bch2_fs_fs_io_direct_init(struct bch_fs *);
-#else
-static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {}
-static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; }
-#endif
-
-#endif /* _BCACHEFS_FS_IO_DIRECT_H */
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
deleted file mode 100644
index e072900e6a5b..000000000000
--- a/fs/bcachefs/fs-io-pagecache.c
+++ /dev/null
@@ -1,823 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "btree_iter.h"
-#include "extents.h"
-#include "fs-io.h"
-#include "fs-io-pagecache.h"
-#include "subvolume.h"
-
-#include <linux/pagevec.h>
-#include <linux/writeback.h>
-
-int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
- loff_t start, u64 end,
- fgf_t fgp_flags, gfp_t gfp,
- folios *fs)
-{
- struct folio *f;
- u64 pos = start;
- int ret = 0;
-
- while (pos < end) {
- if ((u64) pos >= (u64) start + (1ULL << 20))
- fgp_flags &= ~FGP_CREAT;
-
- ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL);
- if (ret)
- break;
-
- f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
- if (IS_ERR(f))
- break;
-
- BUG_ON(fs->nr && folio_pos(f) != pos);
-
- pos = folio_end_pos(f);
- darray_push(fs, f);
- }
-
- if (!fs->nr && !ret && (fgp_flags & FGP_CREAT))
- ret = -ENOMEM;
-
- return fs->nr ? 0 : ret;
-}
-
-/* pagecache_block must be held */
-int bch2_write_invalidate_inode_pages_range(struct address_space *mapping,
- loff_t start, loff_t end)
-{
- int ret;
-
- /*
- * XXX: the way this is currently implemented, we can spin if a process
- * is continually redirtying a specific page
- */
- do {
- if (!mapping->nrpages)
- return 0;
-
- ret = filemap_write_and_wait_range(mapping, start, end);
- if (ret)
- break;
-
- if (!mapping->nrpages)
- return 0;
-
- ret = invalidate_inode_pages2_range(mapping,
- start >> PAGE_SHIFT,
- end >> PAGE_SHIFT);
- } while (ret == -EBUSY);
-
- return ret;
-}
-
-#if 0
-/* Useful for debug tracing: */
-static const char * const bch2_folio_sector_states[] = {
-#define x(n) #n,
- BCH_FOLIO_SECTOR_STATE()
-#undef x
- NULL
-};
-#endif
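
This disabled block is one half of an x-macro: BCH_FOLIO_SECTOR_STATE()
(defined in fs-io-pagecache.h further down) is expanded once to produce the
SECTOR_* enum constants and once to produce the matching name strings, so the
two lists can never drift apart. A standalone sketch of the technique:

#include <stdio.h>

#define SECTOR_STATES()         \
        x(unallocated)          \
        x(reserved)             \
        x(dirty)                \
        x(dirty_reserved)       \
        x(allocated)

enum sector_state {
#define x(n) SECTOR_##n,
        SECTOR_STATES()
#undef x
};

static const char * const sector_state_names[] = {
#define x(n) #n,
        SECTOR_STATES()
#undef x
        NULL
};

int main(void)
{
        /* The two expansions stay in sync by construction: */
        printf("%d -> %s\n", SECTOR_dirty, sector_state_names[SECTOR_dirty]);
        return 0;
}
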
-
-static inline enum bch_folio_sector_state
-folio_sector_dirty(enum bch_folio_sector_state state)
-{
- switch (state) {
- case SECTOR_unallocated:
- return SECTOR_dirty;
- case SECTOR_reserved:
- return SECTOR_dirty_reserved;
- default:
- return state;
- }
-}
-
-static inline enum bch_folio_sector_state
-folio_sector_undirty(enum bch_folio_sector_state state)
-{
- switch (state) {
- case SECTOR_dirty:
- return SECTOR_unallocated;
- case SECTOR_dirty_reserved:
- return SECTOR_reserved;
- default:
- return state;
- }
-}
-
-static inline enum bch_folio_sector_state
-folio_sector_reserve(enum bch_folio_sector_state state)
-{
- switch (state) {
- case SECTOR_unallocated:
- return SECTOR_reserved;
- case SECTOR_dirty:
- return SECTOR_dirty_reserved;
- default:
- return state;
- }
-}
-
-/* for newly allocated folios: */
-struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
-{
- struct bch_folio *s;
-
- s = kzalloc(sizeof(*s) +
- sizeof(struct bch_folio_sector) *
- folio_sectors(folio), gfp);
- if (!s)
- return NULL;
-
- spin_lock_init(&s->lock);
- folio_attach_private(folio, s);
- return s;
-}
-
-struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp)
-{
- return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp);
-}
-
-static unsigned bkey_to_sector_state(struct bkey_s_c k)
-{
- if (bkey_extent_is_reservation(k))
- return SECTOR_reserved;
- if (bkey_extent_is_allocation(k.k))
- return SECTOR_allocated;
- return SECTOR_unallocated;
-}
-
-static void __bch2_folio_set(struct folio *folio,
- unsigned pg_offset, unsigned pg_len,
- unsigned nr_ptrs, unsigned state)
-{
- struct bch_folio *s = bch2_folio(folio);
- unsigned i, sectors = folio_sectors(folio);
-
- BUG_ON(pg_offset >= sectors);
- BUG_ON(pg_offset + pg_len > sectors);
-
- spin_lock(&s->lock);
-
- for (i = pg_offset; i < pg_offset + pg_len; i++) {
- s->s[i].nr_replicas = nr_ptrs;
- bch2_folio_sector_set(folio, s, i, state);
- }
-
- if (i == sectors)
- s->uptodate = true;
-
- spin_unlock(&s->lock);
-}
-
-/*
- * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the
- * extents btree:
- */
-int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
- struct folio **fs, unsigned nr_folios)
-{
- u64 offset = folio_sector(fs[0]);
- bool need_set = false;
-
- for (unsigned folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
- struct bch_folio *s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
- if (!s)
- return -ENOMEM;
-
- need_set |= !s->uptodate;
- }
-
- if (!need_set)
- return 0;
-
- unsigned folio_idx = 0;
-
- return bch2_trans_run(c,
- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
- POS(inum.inum, offset),
- POS(inum.inum, U64_MAX),
- inum.subvol, BTREE_ITER_slots, k, ({
- unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
- unsigned state = bkey_to_sector_state(k);
-
- while (folio_idx < nr_folios) {
- struct folio *folio = fs[folio_idx];
- u64 folio_start = folio_sector(folio);
- u64 folio_end = folio_end_sector(folio);
- unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
- folio_start;
- unsigned folio_len = min(k.k->p.offset, folio_end) -
- folio_offset - folio_start;
-
- BUG_ON(k.k->p.offset < folio_start);
- BUG_ON(bkey_start_offset(k.k) > folio_end);
-
- if (!bch2_folio(folio)->uptodate)
- __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
-
- if (k.k->p.offset < folio_end)
- break;
- folio_idx++;
- }
-
- if (folio_idx == nr_folios)
- break;
- 0;
- })));
-}
-
-void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
-{
- struct bvec_iter iter;
- struct folio_vec fv;
- unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
- ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
- unsigned state = bkey_to_sector_state(k);
-
- bio_for_each_folio(fv, bio, iter)
- __bch2_folio_set(fv.fv_folio,
- fv.fv_offset >> 9,
- fv.fv_len >> 9,
- nr_ptrs, state);
-}
-
-void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
- u64 start, u64 end)
-{
- pgoff_t index = start >> PAGE_SECTORS_SHIFT;
- pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
- struct folio_batch fbatch;
- unsigned i, j;
-
- if (end <= start)
- return;
-
- folio_batch_init(&fbatch);
-
- while (filemap_get_folios(inode->v.i_mapping,
- &index, end_index, &fbatch)) {
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
- struct folio *folio = fbatch.folios[i];
- u64 folio_start = folio_sector(folio);
- u64 folio_end = folio_end_sector(folio);
- unsigned folio_offset = max(start, folio_start) - folio_start;
- unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
- struct bch_folio *s;
-
- BUG_ON(end <= folio_start);
-
- folio_lock(folio);
- s = bch2_folio(folio);
-
- if (s) {
- spin_lock(&s->lock);
- for (j = folio_offset; j < folio_offset + folio_len; j++)
- s->s[j].nr_replicas = 0;
- spin_unlock(&s->lock);
- }
-
- folio_unlock(folio);
- }
- folio_batch_release(&fbatch);
- cond_resched();
- }
-}
-
-int bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
- u64 *start, u64 end,
- bool nonblocking)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- pgoff_t index = *start >> PAGE_SECTORS_SHIFT;
- pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
- struct folio_batch fbatch;
- s64 i_sectors_delta = 0;
- int ret = 0;
-
- if (end <= *start)
- return 0;
-
- folio_batch_init(&fbatch);
-
- while (filemap_get_folios(inode->v.i_mapping,
- &index, end_index, &fbatch)) {
- for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) {
- struct folio *folio = fbatch.folios[i];
-
- if (!nonblocking)
- folio_lock(folio);
- else if (!folio_trylock(folio)) {
- folio_batch_release(&fbatch);
- ret = -EAGAIN;
- break;
- }
-
- u64 folio_start = folio_sector(folio);
- u64 folio_end = folio_end_sector(folio);
-
- BUG_ON(end <= folio_start);
-
- *start = min(end, folio_end);
-
- struct bch_folio *s = bch2_folio(folio);
- if (s) {
- unsigned folio_offset = max(*start, folio_start) - folio_start;
- unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
-
- spin_lock(&s->lock);
- for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) {
- i_sectors_delta -= s->s[j].state == SECTOR_dirty;
- bch2_folio_sector_set(folio, s, j,
- folio_sector_reserve(s->s[j].state));
- }
- spin_unlock(&s->lock);
- }
-
- folio_unlock(folio);
- }
- folio_batch_release(&fbatch);
- cond_resched();
- }
-
- bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
- return ret;
-}
-
-static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
- unsigned nr_replicas)
-{
- return max(0, (int) nr_replicas -
- s->nr_replicas -
- s->replicas_reserved);
-}
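
sectors_to_reserve() is a clamped subtraction: the target replica count minus
what is already allocated or reserved, never negative. A tiny standalone
re-statement with worked cases (types simplified from the kernel ones):

#include <assert.h>

struct sector { unsigned nr_replicas, replicas_reserved; };

/* How many more replicas must be reserved to reach the target? */
static unsigned to_reserve(struct sector s, unsigned target)
{
        int need = (int) target - (int) s.nr_replicas - (int) s.replicas_reserved;

        return need > 0 ? need : 0;
}

int main(void)
{
        /* One replica on disk, none reserved, target of two -> reserve one: */
        assert(to_reserve((struct sector){ .nr_replicas = 1 }, 2) == 1);
        /* Already fully reserved -> nothing more to reserve: */
        assert(to_reserve((struct sector){ .replicas_reserved = 2 }, 2) == 0);
        return 0;
}
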
-
-int bch2_get_folio_disk_reservation(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct folio *folio, bool check_enospc)
-{
- struct bch_folio *s = bch2_folio_create(folio, 0);
- unsigned nr_replicas = inode_nr_replicas(c, inode);
- struct disk_reservation disk_res = { 0 };
- unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
- int ret;
-
- if (!s)
- return -ENOMEM;
-
- for (i = 0; i < sectors; i++)
- disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
-
- if (!disk_res_sectors)
- return 0;
-
- ret = bch2_disk_reservation_get(c, &disk_res,
- disk_res_sectors, 1,
- !check_enospc
- ? BCH_DISK_RESERVATION_NOFAIL
- : 0);
- if (unlikely(ret))
- return ret;
-
- for (i = 0; i < sectors; i++)
- s->s[i].replicas_reserved +=
- sectors_to_reserve(&s->s[i], nr_replicas);
-
- return 0;
-}
-
-void bch2_folio_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct bch2_folio_reservation *res)
-{
- bch2_disk_reservation_put(c, &res->disk);
- bch2_quota_reservation_put(c, inode, &res->quota);
-}
-
-static int __bch2_folio_reservation_get(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct folio *folio,
- struct bch2_folio_reservation *res,
- size_t offset, size_t len,
- bool partial)
-{
- struct bch_folio *s = bch2_folio_create(folio, 0);
- unsigned i, disk_sectors = 0, quota_sectors = 0;
- struct disk_reservation disk_res = {};
- size_t reserved = len;
- int ret;
-
- if (!s)
- return -ENOMEM;
-
- BUG_ON(!s->uptodate);
-
- for (i = round_down(offset, block_bytes(c)) >> 9;
- i < round_up(offset + len, block_bytes(c)) >> 9;
- i++) {
- disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas);
- quota_sectors += s->s[i].state == SECTOR_unallocated;
- }
-
- if (disk_sectors) {
- ret = bch2_disk_reservation_add(c, &disk_res, disk_sectors,
- partial ? BCH_DISK_RESERVATION_PARTIAL : 0);
- if (unlikely(ret))
- return ret;
-
- if (unlikely(disk_res.sectors != disk_sectors)) {
- disk_sectors = quota_sectors = 0;
-
- for (i = round_down(offset, block_bytes(c)) >> 9;
- i < round_up(offset + len, block_bytes(c)) >> 9;
- i++) {
- disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas);
- if (disk_sectors > disk_res.sectors) {
- /*
- * Make sure to get a reservation that's
- * aligned to the filesystem blocksize:
- */
- unsigned reserved_offset = round_down(i << 9, block_bytes(c));
- reserved = clamp(reserved_offset, offset, offset + len) - offset;
-
- if (!reserved) {
- bch2_disk_reservation_put(c, &disk_res);
- return -BCH_ERR_ENOSPC_disk_reservation;
- }
- break;
- }
- quota_sectors += s->s[i].state == SECTOR_unallocated;
- }
- }
- }
-
- if (quota_sectors) {
- ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true);
- if (unlikely(ret)) {
- bch2_disk_reservation_put(c, &disk_res);
- return ret;
- }
- }
-
- res->disk.sectors += disk_res.sectors;
- return partial ? reserved : 0;
-}
-
-int bch2_folio_reservation_get(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct folio *folio,
- struct bch2_folio_reservation *res,
- size_t offset, size_t len)
-{
- return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, false);
-}
-
-ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct folio *folio,
- struct bch2_folio_reservation *res,
- size_t offset, size_t len)
-{
- return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, true);
-}
-
-static void bch2_clear_folio_bits(struct folio *folio)
-{
- struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_folio *s = bch2_folio(folio);
- struct disk_reservation disk_res = { 0 };
- int i, sectors = folio_sectors(folio), dirty_sectors = 0;
-
- if (!s)
- return;
-
- EBUG_ON(!folio_test_locked(folio));
- EBUG_ON(folio_test_writeback(folio));
-
- for (i = 0; i < sectors; i++) {
- disk_res.sectors += s->s[i].replicas_reserved;
- s->s[i].replicas_reserved = 0;
-
- dirty_sectors -= s->s[i].state == SECTOR_dirty;
- bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state));
- }
-
- bch2_disk_reservation_put(c, &disk_res);
-
- bch2_i_sectors_acct(c, inode, NULL, dirty_sectors);
-
- bch2_folio_release(folio);
-}
-
-void bch2_set_folio_dirty(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct folio *folio,
- struct bch2_folio_reservation *res,
- unsigned offset, unsigned len)
-{
- struct bch_folio *s = bch2_folio(folio);
- unsigned i, dirty_sectors = 0;
-
- WARN_ON((u64) folio_pos(folio) + offset + len >
- round_up((u64) i_size_read(&inode->v), block_bytes(c)));
-
- BUG_ON(!s->uptodate);
-
- spin_lock(&s->lock);
-
- for (i = round_down(offset, block_bytes(c)) >> 9;
- i < round_up(offset + len, block_bytes(c)) >> 9;
- i++) {
- unsigned sectors = sectors_to_reserve(&s->s[i],
- res->disk.nr_replicas);
-
- /*
- * This can happen if we race with the error path in
- * bch2_writepage_io_done():
- */
- sectors = min_t(unsigned, sectors, res->disk.sectors);
-
- s->s[i].replicas_reserved += sectors;
- res->disk.sectors -= sectors;
-
- dirty_sectors += s->s[i].state == SECTOR_unallocated;
-
- bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
- }
-
- spin_unlock(&s->lock);
-
- bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors);
-
- if (!folio_test_dirty(folio))
- filemap_dirty_folio(inode->v.i_mapping, folio);
-}
-
-vm_fault_t bch2_page_fault(struct vm_fault *vmf)
-{
- struct file *file = vmf->vma->vm_file;
- struct address_space *mapping = file->f_mapping;
- struct address_space *fdm = faults_disabled_mapping();
- struct bch_inode_info *inode = file_bch_inode(file);
- vm_fault_t ret;
-
- if (fdm == mapping)
- return VM_FAULT_SIGBUS;
-
- /* Lock ordering: */
- if (fdm > mapping) {
- struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
-
- if (bch2_pagecache_add_tryget(inode))
- goto got_lock;
-
- bch2_pagecache_block_put(fdm_host);
-
- bch2_pagecache_add_get(inode);
- bch2_pagecache_add_put(inode);
-
- bch2_pagecache_block_get(fdm_host);
-
- /* Signal that lock has been dropped: */
- set_fdm_dropped_locks();
- return VM_FAULT_SIGBUS;
- }
-
- bch2_pagecache_add_get(inode);
-got_lock:
- ret = filemap_fault(vmf);
- bch2_pagecache_add_put(inode);
-
- return ret;
-}
-
-vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
-{
- struct folio *folio = page_folio(vmf->page);
- struct file *file = vmf->vma->vm_file;
- struct bch_inode_info *inode = file_bch_inode(file);
- struct address_space *mapping = file->f_mapping;
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch2_folio_reservation res;
- unsigned len;
- loff_t isize;
- vm_fault_t ret;
-
- bch2_folio_reservation_init(c, inode, &res);
-
- sb_start_pagefault(inode->v.i_sb);
- file_update_time(file);
-
- /*
- * Not strictly necessary, but helps avoid dio writes livelocking in
- * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get
- * a bch2_write_invalidate_inode_pages_range() that works without dropping
-	 * the page lock before invalidating the page.
- */
- bch2_pagecache_add_get(inode);
-
- folio_lock(folio);
- isize = i_size_read(&inode->v);
-
- if (folio->mapping != mapping || folio_pos(folio) >= isize) {
- folio_unlock(folio);
- ret = VM_FAULT_NOPAGE;
- goto out;
- }
-
- len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));
-
- if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
- bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
- folio_unlock(folio);
- ret = VM_FAULT_SIGBUS;
- goto out;
- }
-
- bch2_set_folio_dirty(c, inode, folio, &res, 0, len);
- bch2_folio_reservation_put(c, inode, &res);
-
- folio_wait_stable(folio);
- ret = VM_FAULT_LOCKED;
-out:
- bch2_pagecache_add_put(inode);
- sb_end_pagefault(inode->v.i_sb);
-
- return ret;
-}
-
-void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
-{
- if (offset || length < folio_size(folio))
- return;
-
- bch2_clear_folio_bits(folio);
-}
-
-bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
-{
- if (folio_test_dirty(folio) || folio_test_writeback(folio))
- return false;
-
- bch2_clear_folio_bits(folio);
- return true;
-}
-
-/* fseek: */
-
-static int folio_data_offset(struct folio *folio, loff_t pos,
- unsigned min_replicas)
-{
- struct bch_folio *s = bch2_folio(folio);
- unsigned i, sectors = folio_sectors(folio);
-
- if (s)
- for (i = folio_pos_to_s(folio, pos); i < sectors; i++)
- if (s->s[i].state >= SECTOR_dirty &&
- s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas)
- return i << SECTOR_SHIFT;
-
- return -1;
-}
-
-loff_t bch2_seek_pagecache_data(struct inode *vinode,
- loff_t start_offset,
- loff_t end_offset,
- unsigned min_replicas,
- bool nonblock)
-{
- struct folio_batch fbatch;
- pgoff_t start_index = start_offset >> PAGE_SHIFT;
- pgoff_t end_index = end_offset >> PAGE_SHIFT;
- pgoff_t index = start_index;
- unsigned i;
- loff_t ret;
- int offset;
-
- folio_batch_init(&fbatch);
-
- while (filemap_get_folios(vinode->i_mapping,
- &index, end_index, &fbatch)) {
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
- struct folio *folio = fbatch.folios[i];
-
- if (!nonblock) {
- folio_lock(folio);
- } else if (!folio_trylock(folio)) {
- folio_batch_release(&fbatch);
- return -EAGAIN;
- }
-
- offset = folio_data_offset(folio,
- max(folio_pos(folio), start_offset),
- min_replicas);
- if (offset >= 0) {
- ret = clamp(folio_pos(folio) + offset,
- start_offset, end_offset);
- folio_unlock(folio);
- folio_batch_release(&fbatch);
- return ret;
- }
- folio_unlock(folio);
- }
- folio_batch_release(&fbatch);
- cond_resched();
- }
-
- return end_offset;
-}
-
-/*
- * Search for a hole in a folio.
- *
- * The filemap layer returns -ENOENT if no folio exists, so reuse the same error
- * code to indicate a pagecache hole exists at the returned offset. Otherwise
- * return 0 if the folio is filled with data, or an error code. This function
- * can return -EAGAIN if nonblock is specified.
- */
-static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
- unsigned min_replicas, bool nonblock)
-{
- struct folio *folio;
- struct bch_folio *s;
- unsigned i, sectors;
- int ret = -ENOENT;
-
- folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
- FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0);
- if (IS_ERR(folio))
- return PTR_ERR(folio);
-
- s = bch2_folio(folio);
- if (!s)
- goto unlock;
-
- sectors = folio_sectors(folio);
- for (i = folio_pos_to_s(folio, *offset); i < sectors; i++)
- if (s->s[i].state < SECTOR_dirty ||
- s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) {
- *offset = max(*offset,
- folio_pos(folio) + (i << SECTOR_SHIFT));
- goto unlock;
- }
-
- *offset = folio_end_pos(folio);
- ret = 0;
-unlock:
- folio_unlock(folio);
- folio_put(folio);
- return ret;
-}
-
-loff_t bch2_seek_pagecache_hole(struct inode *vinode,
- loff_t start_offset,
- loff_t end_offset,
- unsigned min_replicas,
- bool nonblock)
-{
- struct address_space *mapping = vinode->i_mapping;
- loff_t offset = start_offset;
- loff_t ret = 0;
-
- while (!ret && offset < end_offset)
- ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock);
-
- if (ret && ret != -ENOENT)
- return ret;
- return min(offset, end_offset);
-}
-
-int bch2_clamp_data_hole(struct inode *inode,
- u64 *hole_start,
- u64 *hole_end,
- unsigned min_replicas,
- bool nonblock)
-{
- loff_t ret;
-
- ret = bch2_seek_pagecache_hole(inode,
- *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
- if (ret < 0)
- return ret;
-
- *hole_start = ret;
-
- if (*hole_start == *hole_end)
- return 0;
-
- ret = bch2_seek_pagecache_data(inode,
- *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
- if (ret < 0)
- return ret;
-
- *hole_end = ret;
- return 0;
-}
-
-#endif /* NO_BCACHEFS_FS */
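
The bch2_seek_pagecache_data()/bch2_seek_pagecache_hole() helpers above exist
so that SEEK_DATA/SEEK_HOLE can also see dirty data that so far lives only in
the page cache. The userspace side of the interface is plain lseek(2); a short
sketch that walks a file's data extents (standard Linux semantics, where ENXIO
means no further data):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        off_t off = 0, data, hole;
        int fd;

        if (argc != 2)
                return 1;
        fd = open(argv[1], O_RDONLY);
        if (fd < 0)
                return 1;

        /* Alternate between the next data offset and the hole after it: */
        while ((data = lseek(fd, off, SEEK_DATA)) >= 0) {
                hole = lseek(fd, data, SEEK_HOLE);
                if (hole < 0)
                        break;
                printf("data: [%lld, %lld)\n", (long long) data, (long long) hole);
                off = hole;
        }
        if (data < 0 && errno == ENXIO)
                puts("no more data");

        close(fd);
        return 0;
}
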
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
deleted file mode 100644
index fad911cf5068..000000000000
--- a/fs/bcachefs/fs-io-pagecache.h
+++ /dev/null
@@ -1,176 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IO_PAGECACHE_H
-#define _BCACHEFS_FS_IO_PAGECACHE_H
-
-#include <linux/pagemap.h>
-
-typedef DARRAY(struct folio *) folios;
-
-int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t,
- u64, fgf_t, gfp_t, folios *);
-int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t);
-
-/*
- * Use u64 for the end pos and sector helpers because if the folio covers the
- * max supported range of the mapping, the start offset of the next folio
- * overflows loff_t. This breaks much of the range-based processing in the
- * buffered write path.
- */
-static inline u64 folio_end_pos(struct folio *folio)
-{
- return folio_pos(folio) + folio_size(folio);
-}
-
-static inline size_t folio_sectors(struct folio *folio)
-{
- return PAGE_SECTORS << folio_order(folio);
-}
-
-static inline loff_t folio_sector(struct folio *folio)
-{
- return folio_pos(folio) >> 9;
-}
-
-static inline u64 folio_end_sector(struct folio *folio)
-{
- return folio_end_pos(folio) >> 9;
-}
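
The comment above these helpers is easy to sanity-check: with loff_t being
signed 64-bit, a folio whose byte range ends at the maximum positive offset
has no representable end position, while the same sum stays monotonic in u64.
A standalone check (the 4 KiB folio size here is illustrative):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* loff_t is signed 64-bit; imagine a 4 KiB folio ending at its max: */
        int64_t pos = INT64_MAX - 4095;
        uint64_t end = (uint64_t) pos + 4096;   /* folio_end_pos() in u64 */

        printf("end = %llu, fits in loff_t? %s\n",
               (unsigned long long) end,
               end <= (uint64_t) INT64_MAX ? "yes" : "no");
        return 0;
}
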
-
-#define BCH_FOLIO_SECTOR_STATE() \
- x(unallocated) \
- x(reserved) \
- x(dirty) \
- x(dirty_reserved) \
- x(allocated)
-
-enum bch_folio_sector_state {
-#define x(n) SECTOR_##n,
- BCH_FOLIO_SECTOR_STATE()
-#undef x
-};
-
-struct bch_folio_sector {
- /* Uncompressed, fully allocated replicas (or on disk reservation): */
- u8 nr_replicas:4,
-	/* Owns a PAGE_SECTORS * replicas_reserved sized in-memory reservation: */
- replicas_reserved:4;
- u8 state;
-};
-
-struct bch_folio {
- spinlock_t lock;
- atomic_t write_count;
- /*
- * Is the sector state up to date with the btree?
- * (Not the data itself)
- */
- bool uptodate;
- struct bch_folio_sector s[];
-};
-
-/* Helper for when we need to add debug instrumentation: */
-static inline void bch2_folio_sector_set(struct folio *folio,
- struct bch_folio *s,
- unsigned i, unsigned n)
-{
- s->s[i].state = n;
-}
-
-/* file offset (to folio offset) to bch_folio_sector index */
-static inline int folio_pos_to_s(struct folio *folio, loff_t pos)
-{
- u64 f_offset = pos - folio_pos(folio);
-
- BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio));
- return f_offset >> SECTOR_SHIFT;
-}
-
-/* for newly allocated folios: */
-static inline void __bch2_folio_release(struct folio *folio)
-{
- kfree(folio_detach_private(folio));
-}
-
-static inline void bch2_folio_release(struct folio *folio)
-{
- EBUG_ON(!folio_test_locked(folio));
- __bch2_folio_release(folio);
-}
-
-static inline struct bch_folio *__bch2_folio(struct folio *folio)
-{
- return folio_get_private(folio);
-}
-
-static inline struct bch_folio *bch2_folio(struct folio *folio)
-{
- EBUG_ON(!folio_test_locked(folio));
-
- return __bch2_folio(folio);
-}
-
-struct bch_folio *__bch2_folio_create(struct folio *, gfp_t);
-struct bch_folio *bch2_folio_create(struct folio *, gfp_t);
-
-struct bch2_folio_reservation {
- struct disk_reservation disk;
- struct quota_res quota;
-};
-
-static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
-{
- /* XXX: this should not be open coded */
- return inode->ei_inode.bi_data_replicas
- ? inode->ei_inode.bi_data_replicas - 1
- : c->opts.data_replicas;
-}
-
-static inline void bch2_folio_reservation_init(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct bch2_folio_reservation *res)
-{
- memset(res, 0, sizeof(*res));
-
- res->disk.nr_replicas = inode_nr_replicas(c, inode);
-}
-
-int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned);
-void bch2_bio_page_state_set(struct bio *, struct bkey_s_c);
-
-void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64);
-int bch2_mark_pagecache_reserved(struct bch_inode_info *, u64 *, u64, bool);
-
-int bch2_get_folio_disk_reservation(struct bch_fs *,
- struct bch_inode_info *,
- struct folio *, bool);
-
-void bch2_folio_reservation_put(struct bch_fs *,
- struct bch_inode_info *,
- struct bch2_folio_reservation *);
-int bch2_folio_reservation_get(struct bch_fs *,
- struct bch_inode_info *,
- struct folio *,
- struct bch2_folio_reservation *,
- size_t, size_t);
-ssize_t bch2_folio_reservation_get_partial(struct bch_fs *,
- struct bch_inode_info *,
- struct folio *,
- struct bch2_folio_reservation *,
- size_t, size_t);
-
-void bch2_set_folio_dirty(struct bch_fs *,
- struct bch_inode_info *,
- struct folio *,
- struct bch2_folio_reservation *,
- unsigned, unsigned);
-
-vm_fault_t bch2_page_fault(struct vm_fault *);
-vm_fault_t bch2_page_mkwrite(struct vm_fault *);
-void bch2_invalidate_folio(struct folio *, size_t, size_t);
-bool bch2_release_folio(struct folio *, gfp_t);
-
-loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool);
-loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool);
-int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool);
-
-#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
deleted file mode 100644
index 717e7b94c66f..000000000000
--- a/fs/bcachefs/fs-io.c
+++ /dev/null
@@ -1,1065 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "clock.h"
-#include "error.h"
-#include "extents.h"
-#include "extent_update.h"
-#include "fs.h"
-#include "fs-io.h"
-#include "fs-io-buffered.h"
-#include "fs-io-pagecache.h"
-#include "fsck.h"
-#include "inode.h"
-#include "journal.h"
-#include "io_misc.h"
-#include "keylist.h"
-#include "quota.h"
-#include "reflink.h"
-#include "trace.h"
-
-#include <linux/aio.h>
-#include <linux/backing-dev.h>
-#include <linux/falloc.h>
-#include <linux/migrate.h>
-#include <linux/mmu_context.h>
-#include <linux/pagevec.h>
-#include <linux/rmap.h>
-#include <linux/sched/signal.h>
-#include <linux/task_io_accounting_ops.h>
-#include <linux/uio.h>
-
-#include <trace/events/writeback.h>
-
-struct nocow_flush {
- struct closure *cl;
- struct bch_dev *ca;
- struct bio bio;
-};
-
-static void nocow_flush_endio(struct bio *_bio)
-{
- struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);
-
- closure_put(bio->cl);
- percpu_ref_put(&bio->ca->io_ref);
- bio_put(&bio->bio);
-}
-
-void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct closure *cl)
-{
- struct nocow_flush *bio;
- struct bch_dev *ca;
- struct bch_devs_mask devs;
- unsigned dev;
-
- dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX);
- if (dev == BCH_SB_MEMBERS_MAX)
- return;
-
- devs = inode->ei_devs_need_flush;
- memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
-
- for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
- rcu_read_lock();
- ca = rcu_dereference(c->devs[dev]);
- if (ca && !percpu_ref_tryget(&ca->io_ref))
- ca = NULL;
- rcu_read_unlock();
-
- if (!ca)
- continue;
-
- bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
- REQ_OP_WRITE|REQ_PREFLUSH,
- GFP_KERNEL,
- &c->nocow_flush_bioset),
- struct nocow_flush, bio);
- bio->cl = cl;
- bio->ca = ca;
- bio->bio.bi_end_io = nocow_flush_endio;
- closure_bio_submit(&bio->bio, cl);
- }
-}
-
-static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
- struct bch_inode_info *inode)
-{
- struct closure cl;
-
- closure_init_stack(&cl);
- bch2_inode_flush_nocow_writes_async(c, inode, &cl);
- closure_sync(&cl);
-
- return 0;
-}
-
-/* i_size updates: */
-
-struct inode_new_size {
- loff_t new_size;
- u64 now;
- unsigned fields;
-};
-
-static int inode_set_size(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct inode_new_size *s = p;
-
- bi->bi_size = s->new_size;
- if (s->fields & ATTR_ATIME)
- bi->bi_atime = s->now;
- if (s->fields & ATTR_MTIME)
- bi->bi_mtime = s->now;
- if (s->fields & ATTR_CTIME)
- bi->bi_ctime = s->now;
-
- return 0;
-}
-
-int __must_check bch2_write_inode_size(struct bch_fs *c,
- struct bch_inode_info *inode,
- loff_t new_size, unsigned fields)
-{
- struct inode_new_size s = {
- .new_size = new_size,
- .now = bch2_current_time(c),
- .fields = fields,
- };
-
- return bch2_write_inode(c, inode, inode_set_size, &s, fields);
-}
-
-void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
- struct quota_res *quota_res, s64 sectors)
-{
- bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
- "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
- inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
- inode->ei_inode.bi_sectors);
- inode->v.i_blocks += sectors;
-
-#ifdef CONFIG_BCACHEFS_QUOTA
- if (quota_res &&
- !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) &&
- sectors > 0) {
- BUG_ON(sectors > quota_res->sectors);
- BUG_ON(sectors > inode->ei_quota_reserved);
-
- quota_res->sectors -= sectors;
- inode->ei_quota_reserved -= sectors;
- } else {
- bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
- }
-#endif
-}
-
-/* fsync: */
-
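-/*
- * Look up the journal sequence number that must be flushed to make an inode
- * durable, clamping it to the current journal sequence and repairing inodes
- * whose bi_journal_seq is (incorrectly) in the future:
- */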
-static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum,
- u64 *seq)
-{
- struct printbuf buf = PRINTBUF;
- struct bch_inode_unpacked u;
- struct btree_iter iter;
- int ret = bch2_inode_peek(trans, &iter, &u, inum, 0);
- if (ret)
- return ret;
-
- u64 cur_seq = journal_cur_seq(&trans->c->journal);
- *seq = min(cur_seq, u.bi_journal_seq);
-
- if (fsck_err_on(u.bi_journal_seq > cur_seq,
- trans, inode_journal_seq_in_future,
- "inode journal seq in future (currently at %llu)\n%s",
- cur_seq,
- (bch2_inode_unpacked_to_text(&buf, &u),
- buf.buf))) {
- u.bi_journal_seq = cur_seq;
- ret = bch2_inode_write(trans, &iter, &u);
- }
-fsck_err:
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-/*
- * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
- * insert trigger: look up the btree inode instead
- */
-static int bch2_flush_inode(struct bch_fs *c,
- struct bch_inode_info *inode)
-{
- if (c->opts.journal_flush_disabled)
- return 0;
-
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync))
- return -EROFS;
-
- u64 seq;
- int ret = bch2_trans_commit_do(c, NULL, NULL, 0,
- bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?:
- bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?:
- bch2_inode_flush_nocow_writes(c, inode);
- bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
- return ret;
-}
-
-int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
-{
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret, err;
-
- trace_bch2_fsync(file, datasync);
-
- ret = file_write_and_wait_range(file, start, end);
- if (ret)
- goto out;
- ret = sync_inode_metadata(&inode->v, 1);
- if (ret)
- goto out;
- ret = bch2_flush_inode(c, inode);
-out:
- ret = bch2_err_class(ret);
- if (ret == -EROFS)
- ret = -EIO;
-
- err = file_check_and_advance_wb_err(file);
- if (!ret)
- ret = err;
-
- return ret;
-}
-
-/* truncate: */
-
-static inline int range_has_data(struct bch_fs *c, u32 subvol,
- struct bpos start,
- struct bpos end)
-{
- return bch2_trans_run(c,
- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end,
- subvol, 0, k, ({
- bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k);
- })));
-}
-
-static int __bch2_truncate_folio(struct bch_inode_info *inode,
- pgoff_t index, loff_t start, loff_t end)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct address_space *mapping = inode->v.i_mapping;
- struct bch_folio *s;
- unsigned start_offset;
- unsigned end_offset;
- unsigned i;
- struct folio *folio;
- s64 i_sectors_delta = 0;
- int ret = 0;
- u64 end_pos;
-
- folio = filemap_lock_folio(mapping, index);
- if (IS_ERR_OR_NULL(folio)) {
- /*
- * XXX: we're doing two index lookups when we end up reading the
- * folio
- */
- ret = range_has_data(c, inode->ei_inum.subvol,
- POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
- POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
- if (ret <= 0)
- return ret;
-
- folio = __filemap_get_folio(mapping, index,
- FGP_LOCK|FGP_CREAT, GFP_KERNEL);
- if (IS_ERR(folio)) {
- ret = -ENOMEM;
- goto out;
- }
- }
-
- BUG_ON(start >= folio_end_pos(folio));
- BUG_ON(end <= folio_pos(folio));
-
- start_offset = max(start, folio_pos(folio)) - folio_pos(folio);
- end_offset = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio);
-
- /* Folio boundary? Nothing to do */
- if (start_offset == 0 &&
- end_offset == folio_size(folio)) {
- ret = 0;
- goto unlock;
- }
-
- s = bch2_folio_create(folio, 0);
- if (!s) {
- ret = -ENOMEM;
- goto unlock;
- }
-
- if (!folio_test_uptodate(folio)) {
- ret = bch2_read_single_folio(folio, mapping);
- if (ret)
- goto unlock;
- }
-
- ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
- if (ret)
- goto unlock;
-
- for (i = round_up(start_offset, block_bytes(c)) >> 9;
- i < round_down(end_offset, block_bytes(c)) >> 9;
- i++) {
- s->s[i].nr_replicas = 0;
-
- i_sectors_delta -= s->s[i].state == SECTOR_dirty;
- bch2_folio_sector_set(folio, s, i, SECTOR_unallocated);
- }
-
- bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-
- /*
- * Caller needs to know whether this folio will be written out by
- * writeback - doing an i_size update if necessary - or whether it will
- * be responsible for the i_size update.
- *
- * Note that we shouldn't ever see a folio beyond EOF, but check and
-	 * warn if so: this has been observed when we fail to clean up folios
-	 * after a short write, and there's still a chance reclaim will fix
- * things up.
- */
- WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size);
- end_pos = folio_end_pos(folio);
- if (inode->v.i_size > folio_pos(folio))
- end_pos = min_t(u64, inode->v.i_size, end_pos);
- ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty;
-
- folio_zero_segment(folio, start_offset, end_offset);
-
- /*
- * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
- *
- * XXX: because we aren't currently tracking whether the folio has actual
-	 * data in it (vs. just 0s, or only partially written) this is wrong. ick.
- */
- BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false));
-
- /*
- * This removes any writeable userspace mappings; we need to force
- * .page_mkwrite to be called again before any mmapped writes, to
- * redirty the full page:
- */
- folio_mkclean(folio);
- filemap_dirty_folio(mapping, folio);
-unlock:
- folio_unlock(folio);
- folio_put(folio);
-out:
- return ret;
-}
-
-static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from)
-{
- return __bch2_truncate_folio(inode, from >> PAGE_SHIFT,
- from, ANYSINT_MAX(loff_t));
-}
-
-static int bch2_truncate_folios(struct bch_inode_info *inode,
- loff_t start, loff_t end)
-{
- int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT,
- start, end);
-
- if (ret >= 0 &&
- start >> PAGE_SHIFT != end >> PAGE_SHIFT)
- ret = __bch2_truncate_folio(inode,
- (end - 1) >> PAGE_SHIFT,
- start, end);
- return ret;
-}
-
-static int bch2_extend(struct mnt_idmap *idmap,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *inode_u,
- struct iattr *iattr)
-{
- struct address_space *mapping = inode->v.i_mapping;
- int ret;
-
- /*
- * sync appends:
- *
- * this has to be done _before_ extending i_size:
- */
- ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX);
- if (ret)
- return ret;
-
- truncate_setsize(&inode->v, iattr->ia_size);
-
- return bch2_setattr_nonsize(idmap, inode, iattr);
-}
-
-int bchfs_truncate(struct mnt_idmap *idmap,
- struct bch_inode_info *inode, struct iattr *iattr)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct address_space *mapping = inode->v.i_mapping;
- struct bch_inode_unpacked inode_u;
- s64 i_sectors_delta = 0;
- int ret = 0;
-
- /*
-	 * If the truncate call will change the size of the file, the
- * cmtimes should be updated. If the size will not change, we
- * do not need to update the cmtimes.
- */
- if (iattr->ia_size != inode->v.i_size) {
- if (!(iattr->ia_valid & ATTR_MTIME))
- ktime_get_coarse_real_ts64(&iattr->ia_mtime);
- if (!(iattr->ia_valid & ATTR_CTIME))
- ktime_get_coarse_real_ts64(&iattr->ia_ctime);
- iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME;
- }
-
- inode_dio_wait(&inode->v);
- bch2_pagecache_block_get(inode);
-
- ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
- if (ret)
- goto err;
-
- /*
- * check this before next assertion; on filesystem error our normal
-	 * Check this before the next assertion; on filesystem error our normal
- * before the inode).
- */
- ret = bch2_journal_error(&c->journal);
- if (ret)
- goto err;
-
- WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
- inode->v.i_size < inode_u.bi_size,
- "truncate spotted in mem i_size < btree i_size: %llu < %llu\n",
- (u64) inode->v.i_size, inode_u.bi_size);
-
- if (iattr->ia_size > inode->v.i_size) {
- ret = bch2_extend(idmap, inode, &inode_u, iattr);
- goto err;
- }
-
- iattr->ia_valid &= ~ATTR_SIZE;
-
- ret = bch2_truncate_folio(inode, iattr->ia_size);
- if (unlikely(ret < 0))
- goto err;
- ret = 0;
-
- truncate_setsize(&inode->v, iattr->ia_size);
-
-	/*
-	 * When the new size is above the on-disk i_size, we're going to write
-	 * the new i_size to disk immediately, so we need to flush anything
-	 * above the current on-disk i_size first.
-	 *
-	 * We also need to flush the page the new i_size straddles - if it's
-	 * mapped to userspace, we need to ensure that userspace has to
-	 * redirty it and call .mkwrite -> set_page_dirty again to allocate
-	 * the part of the page that was extended.
-	 */
- if (iattr->ia_size > inode_u.bi_size)
- ret = filemap_write_and_wait_range(mapping,
- inode_u.bi_size,
- iattr->ia_size - 1);
- else if (iattr->ia_size & (PAGE_SIZE - 1))
- ret = filemap_write_and_wait_range(mapping,
- round_down(iattr->ia_size, PAGE_SIZE),
- iattr->ia_size - 1);
- if (ret)
- goto err;
-
- ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta);
- bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-
- if (unlikely(ret)) {
- /*
- * If we error here, VFS caches are now inconsistent with btree
- */
- set_bit(EI_INODE_ERROR, &inode->ei_flags);
- goto err;
- }
-
- bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
- !bch2_journal_error(&c->journal), c,
- "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
- inode->v.i_ino, (u64) inode->v.i_blocks,
- inode->ei_inode.bi_sectors);
-
- ret = bch2_setattr_nonsize(idmap, inode, iattr);
-err:
- bch2_pagecache_block_put(inode);
- return bch2_err_class(ret);
-}
-
-/* fallocate: */
-
-static int inode_update_times_fn(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi, void *p)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
- bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
- return 0;
-}
-
-static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- u64 end = offset + len;
- u64 block_start = round_up(offset, block_bytes(c));
- u64 block_end = round_down(end, block_bytes(c));
- bool truncated_last_page;
- int ret = 0;
-
- ret = bch2_truncate_folios(inode, offset, end);
- if (unlikely(ret < 0))
- goto err;
-
- truncated_last_page = ret;
-
- truncate_pagecache_range(&inode->v, offset, end - 1);
-
- if (block_start < block_end) {
- s64 i_sectors_delta = 0;
-
- ret = bch2_fpunch(c, inode_inum(inode),
- block_start >> 9, block_end >> 9,
- &i_sectors_delta);
- bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
- }
-
- mutex_lock(&inode->ei_update_lock);
- if (end >= inode->v.i_size && !truncated_last_page) {
- ret = bch2_write_inode_size(c, inode, inode->v.i_size,
- ATTR_MTIME|ATTR_CTIME);
- } else {
- ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
- ATTR_MTIME|ATTR_CTIME);
- }
- mutex_unlock(&inode->ei_update_lock);
-err:
- return ret;
-}
-
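-/*
- * Implements both FALLOC_FL_COLLAPSE_RANGE and FALLOC_FL_INSERT_RANGE:
- * @insert selects between removing the given range and inserting a hole:
- */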
-static noinline long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
- loff_t offset, loff_t len,
- bool insert)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct address_space *mapping = inode->v.i_mapping;
- s64 i_sectors_delta = 0;
- int ret = 0;
-
- if ((offset | len) & (block_bytes(c) - 1))
- return -EINVAL;
-
- if (insert) {
- if (offset >= inode->v.i_size)
- return -EINVAL;
- } else {
- if (offset + len >= inode->v.i_size)
- return -EINVAL;
- }
-
- ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
- if (ret)
- return ret;
-
- if (insert)
- i_size_write(&inode->v, inode->v.i_size + len);
-
- ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9,
- insert, &i_sectors_delta);
- if (!ret && !insert)
- i_size_write(&inode->v, inode->v.i_size - len);
- bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-
- return ret;
-}
-
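-/*
- * Walk the extents in [start_sector, end_sector), reserving or allocating
- * space for each hole (and, in ZERO_RANGE mode, over existing data):
- */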
-static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
- u64 start_sector, u64 end_sector)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bpos end_pos = POS(inode->v.i_ino, end_sector);
- struct bch_io_opts opts;
- int ret = 0;
-
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- POS(inode->v.i_ino, start_sector),
- BTREE_ITER_slots|BTREE_ITER_intent);
-
- while (!ret) {
- s64 i_sectors_delta = 0;
- struct quota_res quota_res = { 0 };
- struct bkey_s_c k;
- unsigned sectors;
- bool is_allocation;
- u64 hole_start, hole_end;
- u32 snapshot;
-
- bch2_trans_begin(trans);
-
- if (bkey_ge(iter.pos, end_pos))
- break;
-
- ret = bch2_subvolume_get_snapshot(trans,
- inode->ei_inum.subvol, &snapshot);
- if (ret)
- goto bkey_err;
-
- bch2_btree_iter_set_snapshot(&iter, snapshot);
-
- k = bch2_btree_iter_peek_slot(&iter);
- if ((ret = bkey_err(k)))
- goto bkey_err;
-
- hole_start = iter.pos.offset;
- hole_end = bpos_min(k.k->p, end_pos).offset;
- is_allocation = bkey_extent_is_allocation(k.k);
-
- /* already reserved */
- if (bkey_extent_is_reservation(k) &&
- bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) {
- bch2_btree_iter_advance(&iter);
- continue;
- }
-
- if (bkey_extent_is_data(k.k) &&
- !(mode & FALLOC_FL_ZERO_RANGE)) {
- bch2_btree_iter_advance(&iter);
- continue;
- }
-
- if (!(mode & FALLOC_FL_ZERO_RANGE)) {
- /*
- * Lock ordering - can't be holding btree locks while
- * blocking on a folio lock:
- */
- if (bch2_clamp_data_hole(&inode->v,
- &hole_start,
- &hole_end,
- opts.data_replicas, true)) {
- ret = drop_locks_do(trans,
- (bch2_clamp_data_hole(&inode->v,
- &hole_start,
- &hole_end,
- opts.data_replicas, false), 0));
- if (ret)
- goto bkey_err;
- }
- bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));
-
- if (ret)
- goto bkey_err;
-
- if (hole_start == hole_end)
- continue;
- }
-
- sectors = hole_end - hole_start;
-
- if (!is_allocation) {
- ret = bch2_quota_reservation_add(c, inode,
- &quota_res, sectors, true);
- if (unlikely(ret))
- goto bkey_err;
- }
-
- ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter,
- sectors, opts, &i_sectors_delta,
- writepoint_hashed((unsigned long) current));
- if (ret)
- goto bkey_err;
-
- bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
-
- if (bch2_mark_pagecache_reserved(inode, &hole_start,
- iter.pos.offset, true)) {
- ret = drop_locks_do(trans,
- bch2_mark_pagecache_reserved(inode, &hole_start,
- iter.pos.offset, false));
- if (ret)
- goto bkey_err;
- }
-bkey_err:
- bch2_quota_reservation_put(c, inode, &quota_res);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- ret = 0;
- }
-
- if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) {
- struct quota_res quota_res = { 0 };
- s64 i_sectors_delta = 0;
-
- bch2_fpunch_at(trans, &iter, inode_inum(inode),
- end_sector, &i_sectors_delta);
- bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
- bch2_quota_reservation_put(c, inode, &quota_res);
- }
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
-}
-
-static noinline long bchfs_fallocate(struct bch_inode_info *inode, int mode,
- loff_t offset, loff_t len)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- u64 end = offset + len;
- u64 block_start = round_down(offset, block_bytes(c));
- u64 block_end = round_up(end, block_bytes(c));
- bool truncated_last_page = false;
- int ret, ret2 = 0;
-
- if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
- ret = inode_newsize_ok(&inode->v, end);
- if (ret)
- return ret;
- }
-
- if (mode & FALLOC_FL_ZERO_RANGE) {
- ret = bch2_truncate_folios(inode, offset, end);
- if (unlikely(ret < 0))
- return ret;
-
- truncated_last_page = ret;
-
- truncate_pagecache_range(&inode->v, offset, end - 1);
-
- block_start = round_up(offset, block_bytes(c));
- block_end = round_down(end, block_bytes(c));
- }
-
- ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);
-
- /*
- * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update,
- * so that the VFS cache i_size is consistent with the btree i_size:
- */
- if (ret &&
- !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)))
- return ret;
-
- if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
- end = inode->v.i_size;
-
- if (end >= inode->v.i_size &&
- (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) ||
- !(mode & FALLOC_FL_KEEP_SIZE))) {
- spin_lock(&inode->v.i_lock);
- i_size_write(&inode->v, end);
- spin_unlock(&inode->v.i_lock);
-
- mutex_lock(&inode->ei_update_lock);
- ret2 = bch2_write_inode_size(c, inode, end, 0);
- mutex_unlock(&inode->ei_update_lock);
- }
-
- return ret ?: ret2;
-}
-
-long bch2_fallocate_dispatch(struct file *file, int mode,
- loff_t offset, loff_t len)
-{
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- long ret;
-
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate))
- return -EROFS;
-
- inode_lock(&inode->v);
- inode_dio_wait(&inode->v);
- bch2_pagecache_block_get(inode);
-
- ret = file_modified(file);
- if (ret)
- goto err;
-
- if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
- ret = bchfs_fallocate(inode, mode, offset, len);
- else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
- ret = bchfs_fpunch(inode, offset, len);
- else if (mode == FALLOC_FL_INSERT_RANGE)
- ret = bchfs_fcollapse_finsert(inode, offset, len, true);
- else if (mode == FALLOC_FL_COLLAPSE_RANGE)
- ret = bchfs_fcollapse_finsert(inode, offset, len, false);
- else
- ret = -EOPNOTSUPP;
-err:
- bch2_pagecache_block_put(inode);
- inode_unlock(&inode->v);
- bch2_write_ref_put(c, BCH_WRITE_REF_fallocate);
-
- return bch2_err_class(ret);
-}
-
-/*
- * Take a quota reservation for unallocated blocks in a given file range.
- * Does not check the pagecache.
- */
-static int quota_reserve_range(struct bch_inode_info *inode,
- struct quota_res *res,
- u64 start, u64 end)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- u64 sectors = end - start;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_in_subvolume_max(trans, iter,
- BTREE_ID_extents,
- POS(inode->v.i_ino, start),
- POS(inode->v.i_ino, end - 1),
- inode->ei_inum.subvol, 0, k, ({
- if (bkey_extent_is_allocation(k.k)) {
- u64 s = min(end, k.k->p.offset) -
- max(start, bkey_start_offset(k.k));
- BUG_ON(s > sectors);
- sectors -= s;
- }
-
- 0;
- })));
-
- return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
-}
-
-loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
- struct file *file_dst, loff_t pos_dst,
- loff_t len, unsigned remap_flags)
-{
- struct bch_inode_info *src = file_bch_inode(file_src);
- struct bch_inode_info *dst = file_bch_inode(file_dst);
- struct bch_fs *c = src->v.i_sb->s_fs_info;
- struct quota_res quota_res = { 0 };
- s64 i_sectors_delta = 0;
- u64 aligned_len;
- loff_t ret = 0;
-
- if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
- return -EINVAL;
-
- if ((pos_src & (block_bytes(c) - 1)) ||
- (pos_dst & (block_bytes(c) - 1)))
- return -EINVAL;
-
- if (src == dst &&
- abs(pos_src - pos_dst) < len)
- return -EINVAL;
-
- lock_two_nondirectories(&src->v, &dst->v);
- bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
-
- inode_dio_wait(&src->v);
- inode_dio_wait(&dst->v);
-
- ret = generic_remap_file_range_prep(file_src, pos_src,
- file_dst, pos_dst,
- &len, remap_flags);
- if (ret < 0 || len == 0)
- goto err;
-
- aligned_len = round_up((u64) len, block_bytes(c));
-
- ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping,
- pos_dst, pos_dst + len - 1);
- if (ret)
- goto err;
-
- ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9,
- (pos_dst + aligned_len) >> 9);
- if (ret)
- goto err;
-
- if (!(remap_flags & REMAP_FILE_DEDUP))
- file_update_time(file_dst);
-
- bch2_mark_pagecache_unallocated(src, pos_src >> 9,
- (pos_src + aligned_len) >> 9);
-
- /*
- * XXX: we'd like to be telling bch2_remap_range() if we have
- * permission to write to the source file, and thus if io path option
- * changes should be propagated through the copy, but we need mnt_idmap
- * from the pathwalk, awkward
- */
- ret = bch2_remap_range(c,
- inode_inum(dst), pos_dst >> 9,
- inode_inum(src), pos_src >> 9,
- aligned_len >> 9,
- pos_dst + len, &i_sectors_delta,
- false);
- if (ret < 0)
- goto err;
-
- /*
-	 * due to alignment, we might have remapped slightly more than requested
- */
- ret = min((u64) ret << 9, (u64) len);
-
- bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
-
- spin_lock(&dst->v.i_lock);
- if (pos_dst + ret > dst->v.i_size)
- i_size_write(&dst->v, pos_dst + ret);
- spin_unlock(&dst->v.i_lock);
-
- if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
- IS_SYNC(file_inode(file_dst)))
- ret = bch2_flush_inode(c, dst);
-err:
- bch2_quota_reservation_put(c, dst, &quota_res);
- bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
- unlock_two_nondirectories(&src->v, &dst->v);
-
- return bch2_err_class(ret);
-}
-
-/* fseek: */
-
-static loff_t bch2_seek_data(struct file *file, u64 offset)
-{
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- subvol_inum inum = inode_inum(inode);
- u64 isize, next_data = MAX_LFS_FILESIZE;
-
- isize = i_size_read(&inode->v);
- if (offset >= isize)
- return -ENXIO;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
- POS(inode->v.i_ino, offset >> 9),
- POS(inode->v.i_ino, U64_MAX),
- inum.subvol, 0, k, ({
- if (bkey_extent_is_data(k.k)) {
- next_data = max(offset, bkey_start_offset(k.k) << 9);
- break;
- } else if (k.k->p.offset >> 9 > isize)
- break;
- 0;
- })));
- if (ret)
- return ret;
-
- if (next_data > offset)
- next_data = bch2_seek_pagecache_data(&inode->v,
- offset, next_data, 0, false);
-
- if (next_data >= isize)
- return -ENXIO;
-
- return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
-}
-
-static loff_t bch2_seek_hole(struct file *file, u64 offset)
-{
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- subvol_inum inum = inode_inum(inode);
- u64 isize, next_hole = MAX_LFS_FILESIZE;
-
- isize = i_size_read(&inode->v);
- if (offset >= isize)
- return -ENXIO;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
- POS(inode->v.i_ino, offset >> 9),
- POS(inode->v.i_ino, U64_MAX),
- inum.subvol, BTREE_ITER_slots, k, ({
- if (k.k->p.inode != inode->v.i_ino) {
- next_hole = bch2_seek_pagecache_hole(&inode->v,
- offset, MAX_LFS_FILESIZE, 0, false);
- break;
- } else if (!bkey_extent_is_data(k.k)) {
- next_hole = bch2_seek_pagecache_hole(&inode->v,
- max(offset, bkey_start_offset(k.k) << 9),
- k.k->p.offset << 9, 0, false);
-
- if (next_hole < k.k->p.offset << 9)
- break;
- } else {
- offset = max(offset, bkey_start_offset(k.k) << 9);
- }
- 0;
- })));
- if (ret)
- return ret;
-
- if (next_hole > isize)
- next_hole = isize;
-
- return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE);
-}
-
-loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
-{
- loff_t ret;
-
- switch (whence) {
- case SEEK_SET:
- case SEEK_CUR:
- case SEEK_END:
- ret = generic_file_llseek(file, offset, whence);
- break;
- case SEEK_DATA:
- ret = bch2_seek_data(file, offset);
- break;
- case SEEK_HOLE:
- ret = bch2_seek_hole(file, offset);
- break;
- default:
- ret = -EINVAL;
- break;
- }
-
- return bch2_err_class(ret);
-}
-
-void bch2_fs_fsio_exit(struct bch_fs *c)
-{
- bioset_exit(&c->nocow_flush_bioset);
-}
-
-int bch2_fs_fsio_init(struct bch_fs *c)
-{
- if (bioset_init(&c->nocow_flush_bioset,
- 1, offsetof(struct nocow_flush, bio), 0))
- return -BCH_ERR_ENOMEM_nocow_flush_bioset_init;
-
- return 0;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
deleted file mode 100644
index ca70346e68dc..000000000000
--- a/fs/bcachefs/fs-io.h
+++ /dev/null
@@ -1,184 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IO_H
-#define _BCACHEFS_FS_IO_H
-
-#ifndef NO_BCACHEFS_FS
-
-#include "buckets.h"
-#include "fs.h"
-#include "io_write_types.h"
-#include "quota.h"
-
-#include <linux/uio.h>
-
-struct folio_vec {
- struct folio *fv_folio;
- size_t fv_offset;
- size_t fv_len;
-};
-
-static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv)
-{
-	struct folio *folio = page_folio(bv.bv_page);
- size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) +
- bv.bv_offset;
- size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len);
-
- return (struct folio_vec) {
- .fv_folio = folio,
- .fv_offset = offset,
- .fv_len = len,
- };
-}
-
-static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio,
- struct bvec_iter iter)
-{
- return biovec_to_foliovec(bio_iter_iovec(bio, iter));
-}
-
-#define __bio_for_each_folio(bvl, bio, iter, start) \
- for (iter = (start); \
- (iter).bi_size && \
- ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \
- bio_advance_iter_single((bio), &(iter), (bvl).fv_len))
-
-/**
- * bio_for_each_folio - iterate over folios within a bio
- *
- * Like other non-_all versions, this iterates over what bio->bi_iter currently
- * points to. This version is for drivers, where the bio may have previously
- * been split or cloned.
- */
-#define bio_for_each_folio(bvl, bio, iter) \
- __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter)
-
-struct quota_res {
- u64 sectors;
-};
-
-#ifdef CONFIG_BCACHEFS_QUOTA
-
-static inline void __bch2_quota_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res)
-{
- BUG_ON(res->sectors > inode->ei_quota_reserved);
-
- bch2_quota_acct(c, inode->ei_qid, Q_SPC,
- -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
- inode->ei_quota_reserved -= res->sectors;
- res->sectors = 0;
-}
-
-static inline void bch2_quota_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res)
-{
- if (res->sectors) {
- mutex_lock(&inode->ei_quota_lock);
- __bch2_quota_reservation_put(c, inode, res);
- mutex_unlock(&inode->ei_quota_lock);
- }
-}
-
-static inline int bch2_quota_reservation_add(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res,
- u64 sectors,
- bool check_enospc)
-{
- int ret;
-
- if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
- return 0;
-
- mutex_lock(&inode->ei_quota_lock);
- ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
- check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
- if (likely(!ret)) {
- inode->ei_quota_reserved += sectors;
- res->sectors += sectors;
- }
- mutex_unlock(&inode->ei_quota_lock);
-
- return ret;
-}
-
-#else
-
-static inline void __bch2_quota_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res) {}
-
-static inline void bch2_quota_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res) {}
-
-static inline int bch2_quota_reservation_add(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res,
- unsigned sectors,
- bool check_enospc)
-{
- return 0;
-}
-
-#endif
-
-void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *,
- struct quota_res *, s64);
-
-static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
- struct quota_res *quota_res, s64 sectors)
-{
- if (sectors) {
- mutex_lock(&inode->ei_quota_lock);
- __bch2_i_sectors_acct(c, inode, quota_res, sectors);
- mutex_unlock(&inode->ei_quota_lock);
- }
-}
-
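-/*
- * current->faults_disabled_mapping stores an address_space pointer with the
- * low bit borrowed as a flag indicating that locks were dropped in the
- * page fault path:
- */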
-static inline struct address_space *faults_disabled_mapping(void)
-{
- return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
-}
-
-static inline void set_fdm_dropped_locks(void)
-{
- current->faults_disabled_mapping =
- (void *) (((unsigned long) current->faults_disabled_mapping)|1);
-}
-
-static inline bool fdm_dropped_locks(void)
-{
- return ((unsigned long) current->faults_disabled_mapping) & 1;
-}
-
-void bch2_inode_flush_nocow_writes_async(struct bch_fs *,
- struct bch_inode_info *, struct closure *);
-
-int __must_check bch2_write_inode_size(struct bch_fs *,
- struct bch_inode_info *,
- loff_t, unsigned);
-
-int bch2_fsync(struct file *, loff_t, loff_t, int);
-
-int bchfs_truncate(struct mnt_idmap *,
- struct bch_inode_info *, struct iattr *);
-long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
-
-loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
- loff_t, loff_t, unsigned);
-
-loff_t bch2_llseek(struct file *, loff_t, int);
-
-void bch2_fs_fsio_exit(struct bch_fs *);
-int bch2_fs_fsio_init(struct bch_fs *);
-#else
-static inline void bch2_fs_fsio_exit(struct bch_fs *c) {}
-static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; }
-#endif
-
-#endif /* _BCACHEFS_FS_IO_H */
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
deleted file mode 100644
index f45054cee746..000000000000
--- a/fs/bcachefs/fs-ioctl.c
+++ /dev/null
@@ -1,651 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "chardev.h"
-#include "dirent.h"
-#include "fs.h"
-#include "fs-ioctl.h"
-#include "namei.h"
-#include "quota.h"
-
-#include <linux/compat.h>
-#include <linux/fsnotify.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/security.h>
-#include <linux/writeback.h>
-
-#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32)
-#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
-#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
-#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
-
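-/* Arguments passed to the inode flag/attribute update functions below: */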
-struct flags_set {
- unsigned mask;
- unsigned flags;
-
- unsigned projid;
-
- bool set_projinherit;
- bool projinherit;
-};
-
-static int bch2_inode_flags_set(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- /*
- * We're relying on btree locking here for exclusion with other ioctl
- * calls - use the flags in the btree (@bi), not inode->i_flags:
- */
- struct flags_set *s = p;
- unsigned newflags = s->flags;
- unsigned oldflags = bi->bi_flags & s->mask;
-
- if (((newflags ^ oldflags) & (BCH_INODE_append|BCH_INODE_immutable)) &&
- !capable(CAP_LINUX_IMMUTABLE))
- return -EPERM;
-
- if (!S_ISREG(bi->bi_mode) &&
- !S_ISDIR(bi->bi_mode) &&
- (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags)
- return -EINVAL;
-
- if ((newflags ^ oldflags) & BCH_INODE_casefolded) {
-#ifdef CONFIG_UNICODE
- int ret = 0;
- /* Not supported on individual files. */
- if (!S_ISDIR(bi->bi_mode))
- return -EOPNOTSUPP;
-
- /*
- * Make sure the dir is empty, as otherwise we'd need to
- * rehash everything and update the dirent keys.
- */
- ret = bch2_empty_dir_trans(trans, inode_inum(inode));
- if (ret < 0)
- return ret;
-
-		ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding);
- if (ret)
- return ret;
-
- bch2_check_set_feature(c, BCH_FEATURE_casefolding);
-#else
- printk(KERN_ERR "Cannot use casefolding on a kernel without CONFIG_UNICODE\n");
- return -EOPNOTSUPP;
-#endif
- }
-
- if (s->set_projinherit) {
- bi->bi_fields_set &= ~(1 << Inode_opt_project);
- bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project);
- }
-
- bi->bi_flags &= ~s->mask;
- bi->bi_flags |= newflags;
-
- bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));
- return 0;
-}
-
-static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
-{
- unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);
-
- return put_user(flags, arg);
-}
-
-static int bch2_ioc_setflags(struct bch_fs *c,
- struct file *file,
- struct bch_inode_info *inode,
- void __user *arg)
-{
- struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) };
- unsigned uflags;
- int ret;
-
- if (get_user(uflags, (int __user *) arg))
- return -EFAULT;
-
- s.flags = map_flags_rev(bch_flags_to_uflags, uflags);
- if (uflags)
- return -EOPNOTSUPP;
-
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- inode_lock(&inode->v);
- if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
- ret = -EACCES;
- goto setflags_out;
- }
-
- mutex_lock(&inode->ei_update_lock);
- ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
- bch2_write_inode(c, inode, bch2_inode_flags_set, &s,
- ATTR_CTIME);
- mutex_unlock(&inode->ei_update_lock);
-
-setflags_out:
- inode_unlock(&inode->v);
- mnt_drop_write_file(file);
- return ret;
-}
-
-static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
- struct fsxattr __user *arg)
-{
- struct fsxattr fa = { 0 };
-
- fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
-
- if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
- fa.fsx_xflags |= FS_XFLAG_PROJINHERIT;
-
- fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
-
- if (copy_to_user(arg, &fa, sizeof(fa)))
- return -EFAULT;
-
- return 0;
-}
-
-static int fssetxattr_inode_update_fn(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct flags_set *s = p;
-
- if (s->projid != bi->bi_project) {
- bi->bi_fields_set |= 1U << Inode_opt_project;
- bi->bi_project = s->projid;
- }
-
- return bch2_inode_flags_set(trans, inode, bi, p);
-}
-
-static int bch2_ioc_fssetxattr(struct bch_fs *c,
- struct file *file,
- struct bch_inode_info *inode,
- struct fsxattr __user *arg)
-{
- struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) };
- struct fsxattr fa;
- int ret;
-
- if (copy_from_user(&fa, arg, sizeof(fa)))
- return -EFAULT;
-
- s.set_projinherit = true;
- s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0;
- fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
-
- s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags);
- if (fa.fsx_xflags)
- return -EOPNOTSUPP;
-
- if (fa.fsx_projid >= U32_MAX)
- return -EINVAL;
-
- /*
- * inode fields accessible via the xattr interface are stored with a +1
- * bias, so that 0 means unset:
- */
- s.projid = fa.fsx_projid + 1;
-
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- inode_lock(&inode->v);
- if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
- ret = -EACCES;
- goto err;
- }
-
- mutex_lock(&inode->ei_update_lock);
- ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
- bch2_set_projid(c, inode, fa.fsx_projid) ?:
- bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
- ATTR_CTIME);
- mutex_unlock(&inode->ei_update_lock);
-err:
- inode_unlock(&inode->v);
- mnt_drop_write_file(file);
- return ret;
-}
-
-static int bch2_reinherit_attrs_fn(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct bch_inode_info *dir = p;
-
- return !bch2_reinherit_attrs(bi, &dir->ei_inode);
-}
-
-static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
- struct file *file,
- struct bch_inode_info *src,
- const char __user *name)
-{
- struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode);
- struct bch_inode_info *dst;
- struct inode *vinode = NULL;
- char *kname = NULL;
- struct qstr qstr;
- int ret = 0;
- subvol_inum inum;
-
- kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL);
- if (!kname)
- return -ENOMEM;
-
- ret = strncpy_from_user(kname, name, BCH_NAME_MAX);
- if (unlikely(ret < 0))
- goto err1;
-
- qstr.len = ret;
- qstr.name = kname;
-
- ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum);
- if (ret)
- goto err1;
-
- vinode = bch2_vfs_inode_get(c, inum);
- ret = PTR_ERR_OR_ZERO(vinode);
- if (ret)
- goto err1;
-
- dst = to_bch_ei(vinode);
-
- ret = mnt_want_write_file(file);
- if (ret)
- goto err2;
-
- bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst);
-
- if (inode_attr_changing(src, dst, Inode_opt_project)) {
- ret = bch2_fs_quota_transfer(c, dst,
- src->ei_qid,
- 1 << QTYP_PRJ,
- KEY_TYPE_QUOTA_PREALLOC);
- if (ret)
- goto err3;
- }
-
- ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0);
-err3:
- bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst);
-
- /* return true if we did work */
- if (ret >= 0)
- ret = !ret;
-
- mnt_drop_write_file(file);
-err2:
- iput(vinode);
-err1:
- kfree(kname);
-
- return ret;
-}
-
-static int bch2_ioc_getversion(struct bch_inode_info *inode, u32 __user *arg)
-{
- return put_user(inode->v.i_generation, arg);
-}
-
-static int bch2_ioc_getlabel(struct bch_fs *c, char __user *user_label)
-{
- int ret;
- size_t len;
- char label[BCH_SB_LABEL_SIZE];
-
- BUILD_BUG_ON(BCH_SB_LABEL_SIZE >= FSLABEL_MAX);
-
- mutex_lock(&c->sb_lock);
- memcpy(label, c->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
- mutex_unlock(&c->sb_lock);
-
- len = strnlen(label, BCH_SB_LABEL_SIZE);
- if (len == BCH_SB_LABEL_SIZE) {
- bch_warn(c,
-			 "label is too long, returning the first %zu bytes",
- --len);
- }
-
- ret = copy_to_user(user_label, label, len);
-
- return ret ? -EFAULT : 0;
-}
-
-static int bch2_ioc_setlabel(struct bch_fs *c,
- struct file *file,
- struct bch_inode_info *inode,
- const char __user *user_label)
-{
- int ret;
- char label[BCH_SB_LABEL_SIZE];
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (copy_from_user(label, user_label, sizeof(label)))
- return -EFAULT;
-
- if (strnlen(label, BCH_SB_LABEL_SIZE) == BCH_SB_LABEL_SIZE) {
- bch_err(c,
- "unable to set label with more than %d bytes",
- BCH_SB_LABEL_SIZE - 1);
- return -EINVAL;
- }
-
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- mutex_lock(&c->sb_lock);
- strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE);
- ret = bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- mnt_drop_write_file(file);
- return ret;
-}
-
-static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
-{
- u32 flags;
- int ret = 0;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (get_user(flags, arg))
- return -EFAULT;
-
- bch_notice(c, "shutdown by ioctl type %u", flags);
-
- switch (flags) {
- case FSOP_GOING_FLAGS_DEFAULT:
- ret = bdev_freeze(c->vfs_sb->s_bdev);
- if (ret)
- break;
- bch2_journal_flush(&c->journal);
- bch2_fs_emergency_read_only(c);
- bdev_thaw(c->vfs_sb->s_bdev);
- break;
- case FSOP_GOING_FLAGS_LOGFLUSH:
- bch2_journal_flush(&c->journal);
- fallthrough;
- case FSOP_GOING_FLAGS_NOLOGFLUSH:
- bch2_fs_emergency_read_only(c);
- break;
- default:
- ret = -EINVAL;
- break;
- }
-
- return ret;
-}
-
-static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
- struct bch_ioctl_subvolume arg)
-{
- struct inode *dir;
- struct bch_inode_info *inode;
- struct user_namespace *s_user_ns;
- struct dentry *dst_dentry;
- struct path src_path, dst_path;
- int how = LOOKUP_FOLLOW;
- int error;
- subvol_inum snapshot_src = { 0 };
- unsigned lookup_flags = 0;
- unsigned create_flags = BCH_CREATE_SUBVOL;
-
- if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE|
- BCH_SUBVOL_SNAPSHOT_RO))
- return -EINVAL;
-
- if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
- (arg.src_ptr ||
- (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)))
- return -EINVAL;
-
- if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
- create_flags |= BCH_CREATE_SNAPSHOT;
-
- if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
- create_flags |= BCH_CREATE_SNAPSHOT_RO;
-
- if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) {
-		/* sync_inodes_sb() requires that s_umount be held */
- down_read(&c->vfs_sb->s_umount);
- sync_inodes_sb(c->vfs_sb);
- up_read(&c->vfs_sb->s_umount);
- }
-
- if (arg.src_ptr) {
- error = user_path_at(arg.dirfd,
- (const char __user *)(unsigned long)arg.src_ptr,
- how, &src_path);
- if (error)
- goto err1;
-
- if (src_path.dentry->d_sb->s_fs_info != c) {
- path_put(&src_path);
- error = -EXDEV;
- goto err1;
- }
-
- snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode));
- }
-
- dst_dentry = user_path_create(arg.dirfd,
- (const char __user *)(unsigned long)arg.dst_ptr,
- &dst_path, lookup_flags);
- error = PTR_ERR_OR_ZERO(dst_dentry);
- if (error)
- goto err2;
-
- if (dst_dentry->d_sb->s_fs_info != c) {
- error = -EXDEV;
- goto err3;
- }
-
- if (dst_dentry->d_inode) {
- error = -BCH_ERR_EEXIST_subvolume_create;
- goto err3;
- }
-
- dir = dst_path.dentry->d_inode;
- if (IS_DEADDIR(dir)) {
- error = -BCH_ERR_ENOENT_directory_dead;
- goto err3;
- }
-
- s_user_ns = dir->i_sb->s_user_ns;
- if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
- !kgid_has_mapping(s_user_ns, current_fsgid())) {
- error = -EOVERFLOW;
- goto err3;
- }
-
- error = inode_permission(file_mnt_idmap(filp),
- dir, MAY_WRITE | MAY_EXEC);
- if (error)
- goto err3;
-
- if (!IS_POSIXACL(dir))
- arg.mode &= ~current_umask();
-
- error = security_path_mkdir(&dst_path, dst_dentry, arg.mode);
- if (error)
- goto err3;
-
- if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
- !arg.src_ptr)
- snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol;
-
- down_write(&c->snapshot_create_lock);
- inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir),
- dst_dentry, arg.mode|S_IFDIR,
- 0, snapshot_src, create_flags);
- up_write(&c->snapshot_create_lock);
-
- error = PTR_ERR_OR_ZERO(inode);
- if (error)
- goto err3;
-
- d_instantiate(dst_dentry, &inode->v);
- fsnotify_mkdir(dir, dst_dentry);
-err3:
- done_path_create(&dst_path, dst_dentry);
-err2:
- if (arg.src_ptr)
- path_put(&src_path);
-err1:
- return error;
-}
-
-static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
- struct bch_ioctl_subvolume arg)
-{
- const char __user *name = (void __user *)(unsigned long)arg.dst_ptr;
- struct path path;
- struct inode *dir;
- struct dentry *victim;
- int ret = 0;
-
- if (arg.flags)
- return -EINVAL;
-
- victim = user_path_locked_at(arg.dirfd, name, &path);
- if (IS_ERR(victim))
- return PTR_ERR(victim);
-
- dir = d_inode(path.dentry);
- if (victim->d_sb->s_fs_info != c) {
- ret = -EXDEV;
- goto err;
- }
- ret = __bch2_unlink(dir, victim, true);
- if (!ret) {
- fsnotify_rmdir(dir, victim);
- d_delete(victim);
- }
-err:
- inode_unlock(dir);
- dput(victim);
- path_put(&path);
- return ret;
-}
-
-long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
-{
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- long ret;
-
- switch (cmd) {
- case FS_IOC_GETFLAGS:
- ret = bch2_ioc_getflags(inode, (int __user *) arg);
- break;
-
- case FS_IOC_SETFLAGS:
- ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg);
- break;
-
- case FS_IOC_FSGETXATTR:
- ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg);
- break;
-
- case FS_IOC_FSSETXATTR:
- ret = bch2_ioc_fssetxattr(c, file, inode,
- (void __user *) arg);
- break;
-
- case BCHFS_IOC_REINHERIT_ATTRS:
- ret = bch2_ioc_reinherit_attrs(c, file, inode,
- (void __user *) arg);
- break;
-
- case FS_IOC_GETVERSION:
- ret = bch2_ioc_getversion(inode, (u32 __user *) arg);
- break;
-
- case FS_IOC_SETVERSION:
- ret = -ENOTTY;
- break;
-
- case FS_IOC_GETFSLABEL:
- ret = bch2_ioc_getlabel(c, (void __user *) arg);
- break;
-
- case FS_IOC_SETFSLABEL:
- ret = bch2_ioc_setlabel(c, file, inode, (const void __user *) arg);
- break;
-
- case FS_IOC_GOINGDOWN:
- ret = bch2_ioc_goingdown(c, (u32 __user *) arg);
- break;
-
- case BCH_IOCTL_SUBVOLUME_CREATE: {
- struct bch_ioctl_subvolume i;
-
- ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
- ? -EFAULT
- : bch2_ioctl_subvolume_create(c, file, i);
- break;
- }
-
- case BCH_IOCTL_SUBVOLUME_DESTROY: {
- struct bch_ioctl_subvolume i;
-
- ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
- ? -EFAULT
- : bch2_ioctl_subvolume_destroy(c, file, i);
- break;
- }
-
- default:
- ret = bch2_fs_ioctl(c, cmd, (void __user *) arg);
- break;
- }
-
- return bch2_err_class(ret);
-}
-
-#ifdef CONFIG_COMPAT
-long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
-{
-	/* These are just misnamed; they actually get/put an int from/to userspace */
- switch (cmd) {
- case FS_IOC32_GETFLAGS:
- cmd = FS_IOC_GETFLAGS;
- break;
- case FS_IOC32_SETFLAGS:
- cmd = FS_IOC_SETFLAGS;
- break;
- case FS_IOC32_GETVERSION:
- cmd = FS_IOC_GETVERSION;
- break;
- case FS_IOC_GETFSLABEL:
- case FS_IOC_SETFSLABEL:
- break;
- default:
- return -ENOIOCTLCMD;
- }
- return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
-}
-#endif
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h
deleted file mode 100644
index ecd3bfdcde21..000000000000
--- a/fs/bcachefs/fs-ioctl.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IOCTL_H
-#define _BCACHEFS_FS_IOCTL_H
-
-/* Inode flags: */
-
-/* bcachefs inode flags -> vfs inode flags: */
-static const __maybe_unused unsigned bch_flags_to_vfs[] = {
- [__BCH_INODE_sync] = S_SYNC,
- [__BCH_INODE_immutable] = S_IMMUTABLE,
- [__BCH_INODE_append] = S_APPEND,
- [__BCH_INODE_noatime] = S_NOATIME,
- [__BCH_INODE_casefolded] = S_CASEFOLD,
-};
-
-/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
-static const __maybe_unused unsigned bch_flags_to_uflags[] = {
- [__BCH_INODE_sync] = FS_SYNC_FL,
- [__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
- [__BCH_INODE_append] = FS_APPEND_FL,
- [__BCH_INODE_nodump] = FS_NODUMP_FL,
- [__BCH_INODE_noatime] = FS_NOATIME_FL,
- [__BCH_INODE_casefolded] = FS_CASEFOLD_FL,
-};
-
-/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
-static const __maybe_unused unsigned bch_flags_to_xflags[] = {
- [__BCH_INODE_sync] = FS_XFLAG_SYNC,
- [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE,
- [__BCH_INODE_append] = FS_XFLAG_APPEND,
- [__BCH_INODE_nodump] = FS_XFLAG_NODUMP,
- [__BCH_INODE_noatime] = FS_XFLAG_NOATIME,
- //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
-};
-
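-/*
- * Helpers for translating between the bcachefs inode flag bits and the
- * VFS/ioctl flag encodings in the tables above:
- */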
-#define set_flags(_map, _in, _out) \
-do { \
- unsigned _i; \
- \
- for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
- if ((_in) & (1 << _i)) \
- (_out) |= _map[_i]; \
- else \
- (_out) &= ~_map[_i]; \
-} while (0)
-
-#define map_flags(_map, _in) \
-({ \
- unsigned _out = 0; \
- \
- set_flags(_map, _in, _out); \
- _out; \
-})
-
-#define map_flags_rev(_map, _in) \
-({ \
- unsigned _i, _out = 0; \
- \
- for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
- if ((_in) & _map[_i]) { \
- (_out) |= 1 << _i; \
- (_in) &= ~_map[_i]; \
- } \
- (_out); \
-})
-
-#define map_defined(_map) \
-({ \
- unsigned _in = ~0; \
- \
- map_flags_rev(_map, _in); \
-})
-
-/* Set VFS inode flags from bcachefs inode: */
-static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
-{
- set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
-}
-
-long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
-long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);
-
-#endif /* _BCACHEFS_FS_IOCTL_H */
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
deleted file mode 100644
index c88c149d5de5..000000000000
--- a/fs/bcachefs/fs.c
+++ /dev/null
@@ -1,2450 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "acl.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "chardev.h"
-#include "dirent.h"
-#include "errcode.h"
-#include "extents.h"
-#include "fs.h"
-#include "fs-io.h"
-#include "fs-ioctl.h"
-#include "fs-io-buffered.h"
-#include "fs-io-direct.h"
-#include "fs-io-pagecache.h"
-#include "fsck.h"
-#include "inode.h"
-#include "io_read.h"
-#include "journal.h"
-#include "keylist.h"
-#include "namei.h"
-#include "quota.h"
-#include "rebalance.h"
-#include "snapshot.h"
-#include "super.h"
-#include "xattr.h"
-#include "trace.h"
-
-#include <linux/aio.h>
-#include <linux/backing-dev.h>
-#include <linux/exportfs.h>
-#include <linux/fiemap.h>
-#include <linux/fs_context.h>
-#include <linux/module.h>
-#include <linux/pagemap.h>
-#include <linux/posix_acl.h>
-#include <linux/random.h>
-#include <linux/seq_file.h>
-#include <linux/siphash.h>
-#include <linux/statfs.h>
-#include <linux/string.h>
-#include <linux/xattr.h>
-
-static struct kmem_cache *bch2_inode_cache;
-
-static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
- struct bch_inode_info *,
- struct bch_inode_unpacked *,
- struct bch_subvolume *);
-
-void bch2_inode_update_after_write(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- unsigned fields)
-{
- struct bch_fs *c = trans->c;
-
- BUG_ON(bi->bi_inum != inode->v.i_ino);
-
- bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum));
-
- set_nlink(&inode->v, bch2_inode_nlink_get(bi));
- i_uid_write(&inode->v, bi->bi_uid);
- i_gid_write(&inode->v, bi->bi_gid);
- inode->v.i_mode = bi->bi_mode;
-
- if (fields & ATTR_SIZE)
- i_size_write(&inode->v, bi->bi_size);
-
- if (fields & ATTR_ATIME)
- inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
- if (fields & ATTR_MTIME)
- inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
- if (fields & ATTR_CTIME)
- inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));
-
- inode->ei_inode = *bi;
-
- bch2_inode_flags_to_vfs(inode);
-}
-
-int __must_check bch2_write_inode(struct bch_fs *c,
- struct bch_inode_info *inode,
- inode_set_fn set,
- void *p, unsigned fields)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
- struct bch_inode_unpacked inode_u;
- int ret;
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent);
- if (ret)
- goto err;
-
- struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u);
-
- ret = (set ? set(trans, inode, &inode_u, p) : 0);
- if (ret)
- goto err;
-
- struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u);
-
- if (memcmp(&old_r, &new_r, sizeof(new_r))) {
- ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum);
- if (ret)
- goto err;
- }
-
- ret = bch2_inode_write(trans, &iter, &inode_u) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-
- /*
- * the btree node lock protects inode->ei_inode, not ei_update_lock;
- * this is important for inode updates via bchfs_write_index_update
- */
- if (!ret)
- bch2_inode_update_after_write(trans, inode, &inode_u, fields);
-err:
- bch2_trans_iter_exit(trans, &iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
- "%s: inode %llu:%llu not found when updating",
- bch2_err_str(ret),
- inode_inum(inode).subvol,
- inode_inum(inode).inum);
-
- bch2_trans_put(trans);
- return ret < 0 ? ret : 0;
-}
-
-int bch2_fs_quota_transfer(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct bch_qid new_qid,
- unsigned qtypes,
- enum quota_acct_mode mode)
-{
- unsigned i;
- int ret;
-
- qtypes &= enabled_qtypes(c);
-
- for (i = 0; i < QTYP_NR; i++)
- if (new_qid.q[i] == inode->ei_qid.q[i])
- qtypes &= ~(1U << i);
-
- if (!qtypes)
- return 0;
-
- mutex_lock(&inode->ei_quota_lock);
-
- ret = bch2_quota_transfer(c, qtypes, new_qid,
- inode->ei_qid,
- inode->v.i_blocks +
- inode->ei_quota_reserved,
- mode);
- if (!ret)
- for (i = 0; i < QTYP_NR; i++)
- if (qtypes & (1 << i))
- inode->ei_qid.q[i] = new_qid.q[i];
-
- mutex_unlock(&inode->ei_quota_lock);
-
- return ret;
-}
-
-static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
-{
- return a.subvol == b.subvol && a.inum == b.inum;
-}
-
-static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
-{
- const subvol_inum *inum = data;
- siphash_key_t k = { .key[0] = seed };
-
- return siphash_2u64(inum->subvol, inum->inum, &k);
-}
-
-static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed)
-{
- const struct bch_inode_info *inode = data;
-
- return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed);
-}
-
-static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
- const void *obj)
-{
- const struct bch_inode_info *inode = obj;
- const subvol_inum *v = arg->key;
-
- return !subvol_inum_eq(inode->ei_inum, *v);
-}
-
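-/* Primary inode hash table, keyed by (subvolume, inode number): */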
-static const struct rhashtable_params bch2_vfs_inodes_params = {
- .head_offset = offsetof(struct bch_inode_info, hash),
- .key_offset = offsetof(struct bch_inode_info, ei_inum),
- .key_len = sizeof(subvol_inum),
- .hashfn = bch2_vfs_inode_hash_fn,
- .obj_hashfn = bch2_vfs_inode_obj_hash_fn,
- .obj_cmpfn = bch2_vfs_inode_cmp_fn,
- .automatic_shrinking = true,
-};
-
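-/*
- * Secondary hash table keyed by inode number only, used to check whether an
- * inode is open in any subvolume or snapshot:
- */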
-static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = {
- .head_offset = offsetof(struct bch_inode_info, by_inum_hash),
- .key_offset = offsetof(struct bch_inode_info, ei_inum.inum),
- .key_len = sizeof(u64),
- .automatic_shrinking = true,
-};
-
-int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
-{
- struct bch_fs *c = trans->c;
- struct rhltable *ht = &c->vfs_inodes_by_inum_table;
- u64 inum = p.offset;
- DARRAY(u32) subvols;
- int ret = 0;
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return false;
-
- darray_init(&subvols);
-restart_from_top:
-
- /*
- * Tweaked version of __rhashtable_lookup(); we need to get a list of
- * subvolumes in which the given inode number is open.
- *
- * For this to work, we don't include the subvolume ID in the key that
- * we hash - all inodes with the same inode number regardless of
- * subvolume will hash to the same slot.
- *
- * This will be less than ideal if the same file is ever open
- * simultaneously in many different snapshots:
- */
- rcu_read_lock();
- struct rhash_lock_head __rcu *const *bkt;
- struct rhash_head *he;
- unsigned int hash;
- struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht);
-restart:
- hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params);
- bkt = rht_bucket(tbl, hash);
- do {
- struct bch_inode_info *inode;
-
- rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
- if (inode->ei_inum.inum == inum) {
- ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
- GFP_NOWAIT|__GFP_NOWARN);
- if (ret) {
- rcu_read_unlock();
- ret = darray_make_room(&subvols, 1);
- if (ret)
- goto err;
- subvols.nr = 0;
- goto restart_from_top;
- }
- }
- }
- /* An object might have been moved to a different hash chain,
- * while we walk along it - better check and retry.
- */
- } while (he != RHT_NULLS_MARKER(bkt));
-
- /* Ensure we see any new tables. */
- smp_rmb();
-
- tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht);
- if (unlikely(tbl))
- goto restart;
- rcu_read_unlock();
-
- darray_for_each(subvols, i) {
- u32 snap;
- ret = bch2_subvolume_get_snapshot(trans, *i, &snap);
- if (ret)
- goto err;
-
- ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot);
- if (ret)
- break;
- }
-err:
- darray_exit(&subvols);
- return ret;
-}
-
-static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
-{
- return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
-}
-
-static void __wait_on_freeing_inode(struct bch_fs *c,
- struct bch_inode_info *inode,
- subvol_inum inum)
-{
- wait_queue_head_t *wq;
- struct wait_bit_queue_entry wait;
-
- wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW);
- prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- spin_unlock(&inode->v.i_lock);
-
- if (__bch2_inode_hash_find(c, inum) == inode)
- schedule_timeout(HZ * 10);
- finish_wait(wq, &wait.wq_entry);
-}
-
-static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans,
- subvol_inum inum)
-{
- struct bch_inode_info *inode;
-repeat:
- inode = __bch2_inode_hash_find(c, inum);
- if (inode) {
- spin_lock(&inode->v.i_lock);
- if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
- spin_unlock(&inode->v.i_lock);
- return NULL;
- }
- if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
- if (!trans) {
- __wait_on_freeing_inode(c, inode, inum);
- } else {
- bch2_trans_unlock(trans);
- __wait_on_freeing_inode(c, inode, inum);
- int ret = bch2_trans_relock(trans);
- if (ret)
- return ERR_PTR(ret);
- }
- goto repeat;
- }
- __iget(&inode->v);
- spin_unlock(&inode->v.i_lock);
- }
-
- return inode;
-}
-
-static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
-{
- spin_lock(&inode->v.i_lock);
- bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
- spin_unlock(&inode->v.i_lock);
-
- if (remove) {
- int ret = rhltable_remove(&c->vfs_inodes_by_inum_table,
- &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params);
- BUG_ON(ret);
-
- ret = rhashtable_remove_fast(&c->vfs_inodes_table,
- &inode->hash, bch2_vfs_inodes_params);
- BUG_ON(ret);
- inode->v.i_hash.pprev = NULL;
- /*
- * This pairs with the bch2_inode_hash_find() ->
- * __wait_on_freeing_inode() path
- */
- inode_wake_up_bit(&inode->v, __I_NEW);
- }
-}
-
-static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
- struct btree_trans *trans,
- struct bch_inode_info *inode)
-{
- struct bch_inode_info *old = inode;
-
- set_bit(EI_INODE_HASHED, &inode->ei_flags);
-retry:
- if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table,
- &inode->ei_inum,
- &inode->hash,
- bch2_vfs_inodes_params))) {
- old = bch2_inode_hash_find(c, trans, inode->ei_inum);
- if (!old)
- goto retry;
-
- clear_bit(EI_INODE_HASHED, &inode->ei_flags);
-
- /*
- * bcachefs doesn't use I_NEW; we have no use for it since we
- * only insert fully created inodes in the inode hash table. But
- * discard_new_inode() expects it to be set...
- */
- inode->v.i_state |= I_NEW;
- /*
-		 * We don't want bch2_evict_inode() to delete the inode on disk;
-		 * we just raced and had another inode in cache. Normally new
- * inodes don't have nlink == 0 - except tmpfiles do...
- */
- set_nlink(&inode->v, 1);
- discard_new_inode(&inode->v);
- return old;
- } else {
- int ret = rhltable_insert(&c->vfs_inodes_by_inum_table,
- &inode->by_inum_hash,
- bch2_vfs_inodes_by_inum_params);
- BUG_ON(ret);
-
- inode_fake_hash(&inode->v);
-
- inode_sb_list_add(&inode->v);
-
- mutex_lock(&c->vfs_inodes_lock);
- list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
- mutex_unlock(&c->vfs_inodes_lock);
- return inode;
- }
-}
-
-#define memalloc_flags_do(_flags, _do) \
-({ \
- unsigned _saved_flags = memalloc_flags_save(_flags); \
- typeof(_do) _ret = _do; \
- memalloc_noreclaim_restore(_saved_flags); \
- _ret; \
-})
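-
-/*
- * Minimal usage sketch (illustrative; not a caller from this file) -
- * evaluate an expression with PF_MEMALLOC_NOFS set, restoring the task's
- * previous allocation flags afterwards:
- *
- *	ret = memalloc_flags_do(PF_MEMALLOC_NOFS, some_allocating_fn(c));
- */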
-
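-/*
- * bcachefs doesn't allocate inodes through the VFS .alloc_inode hook -
- * they're allocated by __bch2_new_inode() so that allocation can retry with
- * btree locks dropped. The BUG() presumably exists to catch any VFS path
- * that would otherwise fall back to default inode allocation:
- */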
-static struct inode *bch2_alloc_inode(struct super_block *sb)
-{
- BUG();
-}
-
-static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp)
-{
- struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb,
- bch2_inode_cache, gfp);
- if (!inode)
- return NULL;
-
- inode_init_once(&inode->v);
- mutex_init(&inode->ei_update_lock);
- two_state_lock_init(&inode->ei_pagecache_lock);
- INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
- inode->ei_flags = 0;
- mutex_init(&inode->ei_quota_lock);
- memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
-
- if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) {
- kmem_cache_free(bch2_inode_cache, inode);
- return NULL;
- }
-
- return inode;
-}
-
-/*
- * Allocate a new inode, dropping/retaking btree locks if necessary:
- */
-static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
-{
- struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT);
-
- if (unlikely(!inode)) {
- int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM);
- if (ret && inode) {
- __destroy_inode(&inode->v);
- kmem_cache_free(bch2_inode_cache, inode);
- }
- if (ret)
- return ERR_PTR(ret);
- }
-
- return inode;
-}
-
-static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans,
- subvol_inum inum,
- struct bch_inode_unpacked *bi,
- struct bch_subvolume *subvol)
-{
- struct bch_inode_info *inode = bch2_new_inode(trans);
- if (IS_ERR(inode))
- return inode;
-
- bch2_vfs_inode_init(trans, inum, inode, bi, subvol);
-
-	return bch2_inode_hash_insert(trans->c, trans, inode);
-}
-
-struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
-{
- struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum);
- if (inode)
- return &inode->v;
-
- struct btree_trans *trans = bch2_trans_get(c);
-
- struct bch_inode_unpacked inode_u;
- struct bch_subvolume subvol;
- int ret = lockrestart_do(trans,
- bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
- bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
- PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
- bch2_trans_put(trans);
-
- return ret ? ERR_PTR(ret) : &inode->v;
-}
-
-struct bch_inode_info *
-__bch2_create(struct mnt_idmap *idmap,
- struct bch_inode_info *dir, struct dentry *dentry,
- umode_t mode, dev_t rdev, subvol_inum snapshot_src,
- unsigned flags)
-{
- struct bch_fs *c = dir->v.i_sb->s_fs_info;
- struct btree_trans *trans;
- struct bch_inode_unpacked dir_u;
- struct bch_inode_info *inode;
- struct bch_inode_unpacked inode_u;
- struct posix_acl *default_acl = NULL, *acl = NULL;
- subvol_inum inum;
- struct bch_subvolume subvol;
- u64 journal_seq = 0;
- kuid_t kuid;
- kgid_t kgid;
- int ret;
-
- /*
- * preallocate acls + vfs inode before btree transaction, so that
- * nothing can fail after the transaction succeeds:
- */
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
- if (ret)
- return ERR_PTR(ret);
-#endif
- inode = __bch2_new_inode(c, GFP_NOFS);
- if (unlikely(!inode)) {
- inode = ERR_PTR(-ENOMEM);
- goto err;
- }
-
- bch2_inode_init_early(c, &inode_u);
-
- if (!(flags & BCH_CREATE_TMPFILE))
- mutex_lock(&dir->ei_update_lock);
-
- trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- kuid = mapped_fsuid(idmap, i_user_ns(&dir->v));
- kgid = mapped_fsgid(idmap, i_user_ns(&dir->v));
- ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
- bch2_create_trans(trans,
- inode_inum(dir), &dir_u, &inode_u,
- !(flags & BCH_CREATE_TMPFILE)
- ? &dentry->d_name : NULL,
- from_kuid(i_user_ns(&dir->v), kuid),
- from_kgid(i_user_ns(&dir->v), kgid),
- mode, rdev,
- default_acl, acl, snapshot_src, flags) ?:
- bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
- KEY_TYPE_QUOTA_PREALLOC);
- if (unlikely(ret))
- goto err_before_quota;
-
- inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol;
- inum.inum = inode_u.bi_inum;
-
- ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
- bch2_trans_commit(trans, NULL, &journal_seq, 0);
- if (unlikely(ret)) {
- bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
- KEY_TYPE_QUOTA_WARN);
-err_before_quota:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- goto err_trans;
- }
-
- if (!(flags & BCH_CREATE_TMPFILE)) {
- bch2_inode_update_after_write(trans, dir, &dir_u,
- ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
- mutex_unlock(&dir->ei_update_lock);
- }
-
- bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
-
- set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
- set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
-
- /*
- * we must insert the new inode into the inode cache before calling
- * bch2_trans_exit() and dropping locks, else we could race with another
- * thread pulling the inode in and modifying it:
- *
- * also, calling bch2_inode_hash_insert() without passing in the
- * transaction object is sketchy - if we could ever end up in
- * __wait_on_freeing_inode(), we'd risk deadlock.
- *
- * But that shouldn't be possible, since we still have the inode locked
- * that we just created, and we _really_ can't take a transaction
- * restart here.
- */
- inode = bch2_inode_hash_insert(c, NULL, inode);
- bch2_trans_put(trans);
-err:
- posix_acl_release(default_acl);
- posix_acl_release(acl);
- return inode;
-err_trans:
- if (!(flags & BCH_CREATE_TMPFILE))
- mutex_unlock(&dir->ei_update_lock);
-
- bch2_trans_put(trans);
- make_bad_inode(&inode->v);
- iput(&inode->v);
- inode = ERR_PTR(ret);
- goto err;
-}
-
-/* methods */
-
-static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
- subvol_inum dir, struct bch_hash_info *dir_hash_info,
- const struct qstr *name)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter dirent_iter = {};
- subvol_inum inum = {};
- struct printbuf buf = PRINTBUF;
-
- struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
- dir_hash_info, dir, name, 0);
- int ret = bkey_err(k);
- if (ret)
- return ERR_PTR(ret);
-
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-
- ret = bch2_dirent_read_target(trans, dir, d, &inum);
- if (ret > 0)
- ret = -ENOENT;
- if (ret)
- goto err;
-
- struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum);
- if (inode)
- goto out;
-
- /*
-	 * Note: if check/repair needs it, we commit before
-	 * bch2_inode_hash_init_insert(), because after that point we can't
-	 * take a transaction restart - we're not in the top level loop with
-	 * a commit_do(), like we usually are:
- */
-
- struct bch_subvolume subvol;
- struct bch_inode_unpacked inode_u;
- ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
- bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
- bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
-
- /*
- * don't remove it: check_inodes might find another inode that points
- * back to this dirent
- */
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
- c, "dirent to missing inode:\n %s",
- (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf));
- if (ret)
- goto err;
-out:
- bch2_trans_iter_exit(trans, &dirent_iter);
- printbuf_exit(&buf);
- return inode;
-err:
- inode = ERR_PTR(ret);
- goto out;
-}
-
-static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
- unsigned int flags)
-{
- struct bch_fs *c = vdir->i_sb->s_fs_info;
- struct bch_inode_info *dir = to_bch_ei(vdir);
- struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
-
- struct bch_inode_info *inode;
- bch2_trans_do(c,
- PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
- &hash, &dentry->d_name)));
- if (IS_ERR(inode))
- inode = NULL;
-
-#ifdef CONFIG_UNICODE
- if (!inode && IS_CASEFOLDED(vdir)) {
- /*
- * Do not cache a negative dentry in casefolded directories
- * as it would need to be invalidated in the following situation:
- * - Lookup file "blAH" in a casefolded directory
- * - Creation of file "BLAH" in a casefolded directory
- * - Lookup file "blAH" in a casefolded directory
- * which would fail if we had a negative dentry.
- *
- * We should come back to this when VFS has a method to handle
- * this edgecase.
- */
- return NULL;
- }
-#endif
-
- return d_splice_alias(&inode->v, dentry);
-}
-
-static int bch2_mknod(struct mnt_idmap *idmap,
- struct inode *vdir, struct dentry *dentry,
- umode_t mode, dev_t rdev)
-{
- struct bch_inode_info *inode =
- __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
- (subvol_inum) { 0 }, 0);
-
- if (IS_ERR(inode))
- return bch2_err_class(PTR_ERR(inode));
-
- d_instantiate(dentry, &inode->v);
- return 0;
-}
-
-static int bch2_create(struct mnt_idmap *idmap,
- struct inode *vdir, struct dentry *dentry,
- umode_t mode, bool excl)
-{
- return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
-}
-
-static int __bch2_link(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct bch_inode_info *dir,
- struct dentry *dentry)
-{
- struct bch_inode_unpacked dir_u, inode_u;
- int ret;
-
- mutex_lock(&inode->ei_update_lock);
- struct btree_trans *trans = bch2_trans_get(c);
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_link_trans(trans,
- inode_inum(dir), &dir_u,
- inode_inum(inode), &inode_u,
- &dentry->d_name));
-
- if (likely(!ret)) {
- bch2_inode_update_after_write(trans, dir, &dir_u,
- ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
- bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
- }
-
- bch2_trans_put(trans);
- mutex_unlock(&inode->ei_update_lock);
- return ret;
-}
-
-static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
- struct dentry *dentry)
-{
- struct bch_fs *c = vdir->i_sb->s_fs_info;
- struct bch_inode_info *dir = to_bch_ei(vdir);
- struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
- int ret;
-
- lockdep_assert_held(&inode->v.i_rwsem);
-
- ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
- bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
- __bch2_link(c, inode, dir, dentry);
- if (unlikely(ret))
- return bch2_err_class(ret);
-
- ihold(&inode->v);
- d_instantiate(dentry, &inode->v);
- return 0;
-}
-
-int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
- bool deleting_snapshot)
-{
- struct bch_fs *c = vdir->i_sb->s_fs_info;
- struct bch_inode_info *dir = to_bch_ei(vdir);
- struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
- struct bch_inode_unpacked dir_u, inode_u;
- int ret;
-
- bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
-
- struct btree_trans *trans = bch2_trans_get(c);
-
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- bch2_unlink_trans(trans,
- inode_inum(dir), &dir_u,
- &inode_u, &dentry->d_name,
- deleting_snapshot));
- if (unlikely(ret))
- goto err;
-
- bch2_inode_update_after_write(trans, dir, &dir_u,
- ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
- bch2_inode_update_after_write(trans, inode, &inode_u,
- ATTR_MTIME);
-
- if (inode_u.bi_subvol) {
- /*
- * Subvolume deletion is asynchronous, but we still want to tell
- * the VFS that it's been deleted here:
- */
- set_nlink(&inode->v, 0);
- }
-err:
- bch2_trans_put(trans);
- bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
-
- return ret;
-}
-
-static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
-{
-	struct bch_inode_info *dir = to_bch_ei(vdir);
- struct bch_fs *c = dir->v.i_sb->s_fs_info;
-
- int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
- __bch2_unlink(vdir, dentry, false);
- return bch2_err_class(ret);
-}
-
-static int bch2_symlink(struct mnt_idmap *idmap,
- struct inode *vdir, struct dentry *dentry,
- const char *symname)
-{
- struct bch_fs *c = vdir->i_sb->s_fs_info;
- struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
- int ret;
-
- inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
- (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
- if (IS_ERR(inode))
- return bch2_err_class(PTR_ERR(inode));
-
- inode_lock(&inode->v);
- ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
- inode_unlock(&inode->v);
-
- if (unlikely(ret))
- goto err;
-
- ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
- if (unlikely(ret))
- goto err;
-
- ret = __bch2_link(c, inode, dir, dentry);
- if (unlikely(ret))
- goto err;
-
- d_instantiate(dentry, &inode->v);
- return 0;
-err:
- iput(&inode->v);
- return bch2_err_class(ret);
-}
-
-static struct dentry *bch2_mkdir(struct mnt_idmap *idmap,
- struct inode *vdir, struct dentry *dentry, umode_t mode)
-{
- return ERR_PTR(bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0));
-}
-
-static int bch2_rename2(struct mnt_idmap *idmap,
- struct inode *src_vdir, struct dentry *src_dentry,
- struct inode *dst_vdir, struct dentry *dst_dentry,
- unsigned flags)
-{
- struct bch_fs *c = src_vdir->i_sb->s_fs_info;
- struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
- struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
- struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
- struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
- struct bch_inode_unpacked dst_dir_u, src_dir_u;
- struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u;
- struct btree_trans *trans;
- enum bch_rename_mode mode = flags & RENAME_EXCHANGE
- ? BCH_RENAME_EXCHANGE
- : dst_dentry->d_inode
- ? BCH_RENAME_OVERWRITE : BCH_RENAME;
- bool whiteout = !!(flags & RENAME_WHITEOUT);
- int ret;
-
- if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT))
- return -EINVAL;
-
- if (mode == BCH_RENAME_OVERWRITE) {
- ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
- 0, LLONG_MAX);
- if (ret)
- return ret;
- }
-
- bch2_lock_inodes(INODE_UPDATE_LOCK,
- src_dir,
- dst_dir,
- src_inode,
- dst_inode);
-
- trans = bch2_trans_get(c);
-
- ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
- bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
- if (ret)
- goto err_tx_restart;
-
- if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
- ret = bch2_fs_quota_transfer(c, src_inode,
- dst_dir->ei_qid,
- 1 << QTYP_PRJ,
- KEY_TYPE_QUOTA_PREALLOC);
- if (ret)
- goto err;
- }
-
- if (mode == BCH_RENAME_EXCHANGE &&
- inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
- ret = bch2_fs_quota_transfer(c, dst_inode,
- src_dir->ei_qid,
- 1 << QTYP_PRJ,
- KEY_TYPE_QUOTA_PREALLOC);
- if (ret)
- goto err;
- }
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_rename_trans(trans,
- inode_inum(src_dir), &src_dir_u,
- inode_inum(dst_dir), &dst_dir_u,
- &src_inode_u,
- &dst_inode_u,
- &src_dentry->d_name,
- &dst_dentry->d_name,
- mode);
- if (unlikely(ret))
- goto err_tx_restart;
-
- if (whiteout) {
- whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u));
- ret = PTR_ERR_OR_ZERO(whiteout_inode_u);
- if (unlikely(ret))
- goto err_tx_restart;
- bch2_inode_init_early(c, whiteout_inode_u);
-
- ret = bch2_create_trans(trans,
- inode_inum(src_dir), &src_dir_u,
- whiteout_inode_u,
- &src_dentry->d_name,
- from_kuid(i_user_ns(&src_dir->v), current_fsuid()),
- from_kgid(i_user_ns(&src_dir->v), current_fsgid()),
- S_IFCHR|WHITEOUT_MODE, 0,
- NULL, NULL, (subvol_inum) { 0 }, 0) ?:
- bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1,
- KEY_TYPE_QUOTA_PREALLOC);
- if (unlikely(ret))
- goto err_tx_restart;
- }
-
- ret = bch2_trans_commit(trans, NULL, NULL, 0);
- if (unlikely(ret)) {
-err_tx_restart:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- goto err;
- }
-
- BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
- BUG_ON(dst_inode &&
- dst_inode->v.i_ino != dst_inode_u.bi_inum);
-
- bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
- ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
-
- if (src_dir != dst_dir)
- bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
- ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
-
- bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
- ATTR_CTIME);
-
- if (dst_inode)
- bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
- ATTR_CTIME);
-err:
- bch2_trans_put(trans);
-
- bch2_fs_quota_transfer(c, src_inode,
- bch_qid(&src_inode->ei_inode),
- 1 << QTYP_PRJ,
- KEY_TYPE_QUOTA_NOCHECK);
- if (dst_inode)
- bch2_fs_quota_transfer(c, dst_inode,
- bch_qid(&dst_inode->ei_inode),
- 1 << QTYP_PRJ,
- KEY_TYPE_QUOTA_NOCHECK);
-
- bch2_unlock_inodes(INODE_UPDATE_LOCK,
- src_dir,
- dst_dir,
- src_inode,
- dst_inode);
-
- return bch2_err_class(ret);
-}
-
-static void bch2_setattr_copy(struct mnt_idmap *idmap,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- struct iattr *attr)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- unsigned int ia_valid = attr->ia_valid;
- kuid_t kuid;
- kgid_t kgid;
-
- if (ia_valid & ATTR_UID) {
- kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
- bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid);
- }
- if (ia_valid & ATTR_GID) {
- kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
- bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid);
- }
-
- if (ia_valid & ATTR_SIZE)
- bi->bi_size = attr->ia_size;
-
- if (ia_valid & ATTR_ATIME)
- bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
- if (ia_valid & ATTR_MTIME)
- bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
- if (ia_valid & ATTR_CTIME)
- bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
-
- if (ia_valid & ATTR_MODE) {
- umode_t mode = attr->ia_mode;
- kgid_t gid = ia_valid & ATTR_GID
- ? kgid
- : inode->v.i_gid;
-
- if (!in_group_or_capable(idmap, &inode->v,
- make_vfsgid(idmap, i_user_ns(&inode->v), gid)))
- mode &= ~S_ISGID;
- bi->bi_mode = mode;
- }
-}
-
-int bch2_setattr_nonsize(struct mnt_idmap *idmap,
- struct bch_inode_info *inode,
- struct iattr *attr)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_qid qid;
- struct btree_trans *trans;
- struct btree_iter inode_iter = { NULL };
- struct bch_inode_unpacked inode_u;
- struct posix_acl *acl = NULL;
- kuid_t kuid;
- kgid_t kgid;
- int ret;
-
- mutex_lock(&inode->ei_update_lock);
-
- qid = inode->ei_qid;
-
- if (attr->ia_valid & ATTR_UID) {
- kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
- qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid);
- }
-
- if (attr->ia_valid & ATTR_GID) {
- kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
- qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid);
- }
-
- ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
- KEY_TYPE_QUOTA_PREALLOC);
- if (ret)
- goto err;
-
- trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
- kfree(acl);
- acl = NULL;
-
- ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
- BTREE_ITER_intent);
- if (ret)
- goto btree_err;
-
- bch2_setattr_copy(idmap, inode, &inode_u, attr);
-
- if (attr->ia_valid & ATTR_MODE) {
- ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
- inode_u.bi_mode, &acl);
- if (ret)
- goto btree_err;
- }
-
- ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
-btree_err:
- bch2_trans_iter_exit(trans, &inode_iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- if (unlikely(ret))
- goto err_trans;
-
- bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);
-
- if (acl)
- set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
-err_trans:
- bch2_trans_put(trans);
-err:
- mutex_unlock(&inode->ei_update_lock);
-
- return bch2_err_class(ret);
-}
-
-static int bch2_getattr(struct mnt_idmap *idmap,
- const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned query_flags)
-{
- struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v);
- vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v);
-
- stat->dev = inode->v.i_sb->s_dev;
- stat->ino = inode->v.i_ino;
- stat->mode = inode->v.i_mode;
- stat->nlink = inode->v.i_nlink;
- stat->uid = vfsuid_into_kuid(vfsuid);
- stat->gid = vfsgid_into_kgid(vfsgid);
- stat->rdev = inode->v.i_rdev;
- stat->size = i_size_read(&inode->v);
- stat->atime = inode_get_atime(&inode->v);
- stat->mtime = inode_get_mtime(&inode->v);
- stat->ctime = inode_get_ctime(&inode->v);
- stat->blksize = block_bytes(c);
- stat->blocks = inode->v.i_blocks;
-
- stat->subvol = inode->ei_inum.subvol;
- stat->result_mask |= STATX_SUBVOL;
-
- if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) {
- stat->result_mask |= STATX_DIOALIGN;
- /*
-		 * this is incorrect; we should be tracking this in the
-		 * superblock, and checking the alignment of open devices
- */
- stat->dio_mem_align = SECTOR_SIZE;
- stat->dio_offset_align = block_bytes(c);
- }
-
- if (request_mask & STATX_BTIME) {
- stat->result_mask |= STATX_BTIME;
- stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
- }
-
- if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
- stat->attributes |= STATX_ATTR_IMMUTABLE;
- stat->attributes_mask |= STATX_ATTR_IMMUTABLE;
-
- if (inode->ei_inode.bi_flags & BCH_INODE_append)
- stat->attributes |= STATX_ATTR_APPEND;
- stat->attributes_mask |= STATX_ATTR_APPEND;
-
- if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
- stat->attributes |= STATX_ATTR_NODUMP;
- stat->attributes_mask |= STATX_ATTR_NODUMP;
-
- return 0;
-}
-
-static int bch2_setattr(struct mnt_idmap *idmap,
- struct dentry *dentry, struct iattr *iattr)
-{
- struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret;
-
- lockdep_assert_held(&inode->v.i_rwsem);
-
- ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
- setattr_prepare(idmap, dentry, iattr);
- if (ret)
- return ret;
-
- return iattr->ia_valid & ATTR_SIZE
- ? bchfs_truncate(idmap, inode, iattr)
- : bch2_setattr_nonsize(idmap, inode, iattr);
-}
-
-static int bch2_tmpfile(struct mnt_idmap *idmap,
- struct inode *vdir, struct file *file, umode_t mode)
-{
- struct bch_inode_info *inode =
- __bch2_create(idmap, to_bch_ei(vdir),
- file->f_path.dentry, mode, 0,
- (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
-
- if (IS_ERR(inode))
- return bch2_err_class(PTR_ERR(inode));
-
- d_mark_tmpfile(file, &inode->v);
- d_instantiate(file->f_path.dentry, &inode->v);
- return finish_open_simple(file, 0);
-}
-
-static int bch2_fill_extent(struct bch_fs *c,
- struct fiemap_extent_info *info,
- struct bkey_s_c k, unsigned flags)
-{
- if (bkey_extent_is_direct_data(k.k)) {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- int ret;
-
- if (k.k->type == KEY_TYPE_reflink_v)
- flags |= FIEMAP_EXTENT_SHARED;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- int flags2 = 0;
- u64 offset = p.ptr.offset;
-
- if (p.ptr.unwritten)
- flags2 |= FIEMAP_EXTENT_UNWRITTEN;
-
- if (p.crc.compression_type)
- flags2 |= FIEMAP_EXTENT_ENCODED;
- else
- offset += p.crc.offset;
-
- if ((offset & (block_sectors(c) - 1)) ||
- (k.k->size & (block_sectors(c) - 1)))
- flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
-
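-			/*
-			 * fiemap speaks bytes, while extents are in 512-byte
-			 * sectors - hence the << 9 conversions below:
-			 */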
- ret = fiemap_fill_next_extent(info,
- bkey_start_offset(k.k) << 9,
- offset << 9,
- k.k->size << 9, flags|flags2);
- if (ret)
- return ret;
- }
-
- return 0;
- } else if (bkey_extent_is_inline_data(k.k)) {
- return fiemap_fill_next_extent(info,
- bkey_start_offset(k.k) << 9,
- 0, k.k->size << 9,
- flags|
- FIEMAP_EXTENT_DATA_INLINE);
- } else if (k.k->type == KEY_TYPE_reservation) {
- return fiemap_fill_next_extent(info,
- bkey_start_offset(k.k) << 9,
- 0, k.k->size << 9,
- flags|
- FIEMAP_EXTENT_DELALLOC|
- FIEMAP_EXTENT_UNWRITTEN);
- } else {
- BUG();
- }
-}
-
-static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
- u64 start, u64 len)
-{
- struct bch_fs *c = vinode->i_sb->s_fs_info;
- struct bch_inode_info *ei = to_bch_ei(vinode);
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_buf cur, prev;
- bool have_extent = false;
- int ret = 0;
-
- ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
- if (ret)
- return ret;
-
- struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
- if (start + len < start)
- return -EINVAL;
-
- start >>= 9;
-
- bch2_bkey_buf_init(&cur);
- bch2_bkey_buf_init(&prev);
- trans = bch2_trans_get(c);
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- POS(ei->v.i_ino, start), 0);
-
- while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- enum btree_id data_btree = BTREE_ID_extents;
-
- bch2_trans_begin(trans);
-
- u32 snapshot;
- ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
- if (ret)
- continue;
-
- bch2_btree_iter_set_snapshot(&iter, snapshot);
-
- k = bch2_btree_iter_peek_max(&iter, end);
- ret = bkey_err(k);
- if (ret)
- continue;
-
- if (!k.k)
- break;
-
- if (!bkey_extent_is_data(k.k) &&
- k.k->type != KEY_TYPE_reservation) {
- bch2_btree_iter_advance(&iter);
- continue;
- }
-
- s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
- unsigned sectors = k.k->size - offset_into_extent;
-
- bch2_bkey_buf_reassemble(&cur, c, k);
-
- ret = bch2_read_indirect_extent(trans, &data_btree,
- &offset_into_extent, &cur);
- if (ret)
- continue;
-
- k = bkey_i_to_s_c(cur.k);
- bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
-
- sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
-
- bch2_cut_front(POS(k.k->p.inode,
- bkey_start_offset(k.k) +
- offset_into_extent),
- cur.k);
- bch2_key_resize(&cur.k->k, sectors);
- cur.k->k.p = iter.pos;
- cur.k->k.p.offset += cur.k->k.size;
-
- if (have_extent) {
- bch2_trans_unlock(trans);
- ret = bch2_fill_extent(c, info,
- bkey_i_to_s_c(prev.k), 0);
- if (ret)
- break;
- }
-
- bkey_copy(prev.k, cur.k);
- have_extent = true;
-
- bch2_btree_iter_set_pos(&iter,
- POS(iter.pos.inode, iter.pos.offset + sectors));
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (!ret && have_extent) {
- bch2_trans_unlock(trans);
- ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
- FIEMAP_EXTENT_LAST);
- }
-
- bch2_trans_put(trans);
- bch2_bkey_buf_exit(&cur, c);
- bch2_bkey_buf_exit(&prev, c);
- return ret < 0 ? ret : 0;
-}
-
-static const struct vm_operations_struct bch_vm_ops = {
- .fault = bch2_page_fault,
- .map_pages = filemap_map_pages,
- .page_mkwrite = bch2_page_mkwrite,
-};
-
-static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
-{
- file_accessed(file);
-
- vma->vm_ops = &bch_vm_ops;
- return 0;
-}
-
-/* Directories: */
-
-static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
-{
- return generic_file_llseek_size(file, offset, whence,
- S64_MAX, S64_MAX);
-}
-
-static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
-{
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
- if (!dir_emit_dots(file, ctx))
- return 0;
-
- int ret = bch2_readdir(c, inode_inum(inode), ctx);
-
- bch_err_fn(c, ret);
- return bch2_err_class(ret);
-}
-
-static int bch2_open(struct inode *vinode, struct file *file)
-{
- if (file->f_flags & (O_WRONLY|O_RDWR)) {
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
- int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol);
- if (ret)
- return ret;
- }
-
- file->f_mode |= FMODE_CAN_ODIRECT;
-
- return generic_file_open(vinode, file);
-}
-
-static const struct file_operations bch_file_operations = {
- .open = bch2_open,
- .llseek = bch2_llseek,
- .read_iter = bch2_read_iter,
- .write_iter = bch2_write_iter,
- .mmap = bch2_mmap,
- .get_unmapped_area = thp_get_unmapped_area,
- .fsync = bch2_fsync,
- .splice_read = filemap_splice_read,
- .splice_write = iter_file_splice_write,
- .fallocate = bch2_fallocate_dispatch,
- .unlocked_ioctl = bch2_fs_file_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = bch2_compat_fs_ioctl,
-#endif
- .remap_file_range = bch2_remap_file_range,
-};
-
-static const struct inode_operations bch_file_inode_operations = {
- .getattr = bch2_getattr,
- .setattr = bch2_setattr,
- .fiemap = bch2_fiemap,
- .listxattr = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_inode_acl = bch2_get_acl,
- .set_acl = bch2_set_acl,
-#endif
-};
-
-static const struct inode_operations bch_dir_inode_operations = {
- .lookup = bch2_lookup,
- .create = bch2_create,
- .link = bch2_link,
- .unlink = bch2_unlink,
- .symlink = bch2_symlink,
- .mkdir = bch2_mkdir,
- .rmdir = bch2_unlink,
- .mknod = bch2_mknod,
- .rename = bch2_rename2,
- .getattr = bch2_getattr,
- .setattr = bch2_setattr,
- .tmpfile = bch2_tmpfile,
- .listxattr = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_inode_acl = bch2_get_acl,
- .set_acl = bch2_set_acl,
-#endif
-};
-
-static const struct file_operations bch_dir_file_operations = {
- .llseek = bch2_dir_llseek,
- .read = generic_read_dir,
- .iterate_shared = bch2_vfs_readdir,
- .fsync = bch2_fsync,
- .unlocked_ioctl = bch2_fs_file_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = bch2_compat_fs_ioctl,
-#endif
-};
-
-static const struct inode_operations bch_symlink_inode_operations = {
- .get_link = page_get_link,
- .getattr = bch2_getattr,
- .setattr = bch2_setattr,
- .listxattr = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_inode_acl = bch2_get_acl,
- .set_acl = bch2_set_acl,
-#endif
-};
-
-static const struct inode_operations bch_special_inode_operations = {
- .getattr = bch2_getattr,
- .setattr = bch2_setattr,
- .listxattr = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_inode_acl = bch2_get_acl,
- .set_acl = bch2_set_acl,
-#endif
-};
-
-static const struct address_space_operations bch_address_space_operations = {
- .read_folio = bch2_read_folio,
- .writepages = bch2_writepages,
- .readahead = bch2_readahead,
- .dirty_folio = filemap_dirty_folio,
- .write_begin = bch2_write_begin,
- .write_end = bch2_write_end,
- .invalidate_folio = bch2_invalidate_folio,
- .release_folio = bch2_release_folio,
-#ifdef CONFIG_MIGRATION
- .migrate_folio = filemap_migrate_folio,
-#endif
- .error_remove_folio = generic_error_remove_folio,
-};
-
-struct bcachefs_fid {
- u64 inum;
- u32 subvol;
- u32 gen;
-} __packed;
-
-struct bcachefs_fid_with_parent {
- struct bcachefs_fid fid;
- struct bcachefs_fid dir;
-} __packed;
-
-static int bcachefs_fid_valid(int fh_len, int fh_type)
-{
- switch (fh_type) {
- case FILEID_BCACHEFS_WITHOUT_PARENT:
- return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
- case FILEID_BCACHEFS_WITH_PARENT:
- return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
- default:
- return false;
- }
-}
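-
-/*
- * Worked size check (the structs are __packed, so no padding): a bare
- * struct bcachefs_fid is 8 + 4 + 4 = 16 bytes, i.e. fh_len == 4 32-bit
- * words; struct bcachefs_fid_with_parent doubles that, fh_len == 8.
- */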
-
-static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
-{
- return (struct bcachefs_fid) {
- .inum = inode->ei_inum.inum,
- .subvol = inode->ei_inum.subvol,
- .gen = inode->ei_inode.bi_generation,
- };
-}
-
-static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
- struct inode *vdir)
-{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_inode_info *dir = to_bch_ei(vdir);
- int min_len;
-
- if (!S_ISDIR(inode->v.i_mode) && dir) {
- struct bcachefs_fid_with_parent *fid = (void *) fh;
-
- min_len = sizeof(*fid) / sizeof(u32);
- if (*len < min_len) {
- *len = min_len;
- return FILEID_INVALID;
- }
-
- fid->fid = bch2_inode_to_fid(inode);
- fid->dir = bch2_inode_to_fid(dir);
-
- *len = min_len;
- return FILEID_BCACHEFS_WITH_PARENT;
- } else {
- struct bcachefs_fid *fid = (void *) fh;
-
- min_len = sizeof(*fid) / sizeof(u32);
- if (*len < min_len) {
- *len = min_len;
- return FILEID_INVALID;
- }
- *fid = bch2_inode_to_fid(inode);
-
- *len = min_len;
- return FILEID_BCACHEFS_WITHOUT_PARENT;
- }
-}
-
-static struct inode *bch2_nfs_get_inode(struct super_block *sb,
- struct bcachefs_fid fid)
-{
- struct bch_fs *c = sb->s_fs_info;
- struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
- .subvol = fid.subvol,
- .inum = fid.inum,
- });
- if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
- iput(vinode);
- vinode = ERR_PTR(-ESTALE);
- }
- return vinode;
-}
-
-static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
- int fh_len, int fh_type)
-{
- struct bcachefs_fid *fid = (void *) _fid;
-
- if (!bcachefs_fid_valid(fh_len, fh_type))
- return NULL;
-
- return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
-}
-
-static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
- int fh_len, int fh_type)
-{
- struct bcachefs_fid_with_parent *fid = (void *) _fid;
-
- if (!bcachefs_fid_valid(fh_len, fh_type) ||
- fh_type != FILEID_BCACHEFS_WITH_PARENT)
- return NULL;
-
- return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
-}
-
-static struct dentry *bch2_get_parent(struct dentry *child)
-{
- struct bch_inode_info *inode = to_bch_ei(child->d_inode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- subvol_inum parent_inum = {
- .subvol = inode->ei_inode.bi_parent_subvol ?:
- inode->ei_inum.subvol,
- .inum = inode->ei_inode.bi_dir,
- };
-
- return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
-}
-
-static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
-{
- struct bch_inode_info *inode = to_bch_ei(child->d_inode);
- struct bch_inode_info *dir = to_bch_ei(parent->d_inode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans *trans;
- struct btree_iter iter1;
- struct btree_iter iter2;
- struct bkey_s_c k;
- struct bkey_s_c_dirent d;
- struct bch_inode_unpacked inode_u;
- subvol_inum target;
- u32 snapshot;
- struct qstr dirent_name;
- unsigned name_len = 0;
- int ret;
-
- if (!S_ISDIR(dir->v.i_mode))
- return -EINVAL;
-
- trans = bch2_trans_get(c);
-
- bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
- POS(dir->ei_inode.bi_inum, 0), 0);
- bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
- POS(dir->ei_inode.bi_inum, 0), 0);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- bch2_btree_iter_set_snapshot(&iter1, snapshot);
- bch2_btree_iter_set_snapshot(&iter2, snapshot);
-
- ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
- if (ret)
- goto err;
-
- if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
- bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
-
- k = bch2_btree_iter_peek_slot(&iter1);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (k.k->type != KEY_TYPE_dirent) {
- ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
- goto err;
- }
-
- d = bkey_s_c_to_dirent(k);
- ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
- if (ret > 0)
- ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
- if (ret)
- goto err;
-
- if (subvol_inum_eq(target, inode->ei_inum))
- goto found;
- } else {
- /*
-		 * The file has multiple hardlinks and our backref points to
-		 * the wrong directory - fall back to a linear search:
- */
- for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
- if (k.k->p.inode > dir->ei_inode.bi_inum)
- break;
-
- if (k.k->type != KEY_TYPE_dirent)
- continue;
-
- d = bkey_s_c_to_dirent(k);
- ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
- if (ret < 0)
- break;
- if (ret)
- continue;
-
- if (subvol_inum_eq(target, inode->ei_inum))
- goto found;
- }
- }
-
- ret = -ENOENT;
- goto err;
-found:
- dirent_name = bch2_dirent_get_name(d);
-
- name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
- memcpy(name, dirent_name.name, name_len);
- name[name_len] = '\0';
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_iter_exit(trans, &iter1);
- bch2_trans_iter_exit(trans, &iter2);
- bch2_trans_put(trans);
-
- return ret;
-}
-
-static const struct export_operations bch_export_ops = {
- .encode_fh = bch2_encode_fh,
- .fh_to_dentry = bch2_fh_to_dentry,
- .fh_to_parent = bch2_fh_to_parent,
- .get_parent = bch2_get_parent,
- .get_name = bch2_get_name,
-};
-
-static void bch2_vfs_inode_init(struct btree_trans *trans,
- subvol_inum inum,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- struct bch_subvolume *subvol)
-{
- inode->v.i_ino = inum.inum;
- inode->ei_inum = inum;
- inode->ei_inode.bi_inum = inum.inum;
- bch2_inode_update_after_write(trans, inode, bi, ~0);
-
- inode->v.i_blocks = bi->bi_sectors;
- inode->v.i_rdev = bi->bi_dev;
- inode->v.i_generation = bi->bi_generation;
- inode->v.i_size = bi->bi_size;
-
- inode->ei_flags = 0;
- inode->ei_quota_reserved = 0;
- inode->ei_qid = bch_qid(bi);
-
- if (BCH_SUBVOLUME_SNAP(subvol))
- set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
-
- inode->v.i_mapping->a_ops = &bch_address_space_operations;
-
- switch (inode->v.i_mode & S_IFMT) {
- case S_IFREG:
- inode->v.i_op = &bch_file_inode_operations;
- inode->v.i_fop = &bch_file_operations;
- break;
- case S_IFDIR:
- inode->v.i_op = &bch_dir_inode_operations;
- inode->v.i_fop = &bch_dir_file_operations;
- break;
- case S_IFLNK:
- inode_nohighmem(&inode->v);
- inode->v.i_op = &bch_symlink_inode_operations;
- break;
- default:
- init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
- inode->v.i_op = &bch_special_inode_operations;
- break;
- }
-
- mapping_set_folio_min_order(inode->v.i_mapping,
- get_order(trans->c->opts.block_size));
-}
-
-static void bch2_free_inode(struct inode *vinode)
-{
- kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode));
-}
-
-static int inode_update_times_fn(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
- bi->bi_atime = timespec_to_bch2_time(c, inode_get_atime(&inode->v));
- bi->bi_mtime = timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
- bi->bi_ctime = timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
-
- return 0;
-}
-
-static int bch2_vfs_write_inode(struct inode *vinode,
- struct writeback_control *wbc)
-{
- struct bch_fs *c = vinode->i_sb->s_fs_info;
- struct bch_inode_info *inode = to_bch_ei(vinode);
- int ret;
-
- mutex_lock(&inode->ei_update_lock);
- ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
- ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
- mutex_unlock(&inode->ei_update_lock);
-
- return bch2_err_class(ret);
-}
-
-static void bch2_evict_inode(struct inode *vinode)
-{
- struct bch_fs *c = vinode->i_sb->s_fs_info;
- struct bch_inode_info *inode = to_bch_ei(vinode);
- bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v);
-
- /*
-	 * evict() has waited for outstanding writeback and we'll do no more
-	 * IO through this inode: it's safe to remove it from the VFS inode
-	 * hash table here.
-	 *
-	 * Do that now so that other threads aren't blocked from pulling it
-	 * back in; there's no reason for them to be:
- */
- if (!delete)
- bch2_inode_hash_remove(c, inode);
-
- truncate_inode_pages_final(&inode->v.i_data);
-
- clear_inode(&inode->v);
-
- BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
-
- if (delete) {
- bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
- KEY_TYPE_QUOTA_WARN);
- bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
- KEY_TYPE_QUOTA_WARN);
- bch2_inode_rm(c, inode_inum(inode));
-
- /*
-		 * When deleting, the inode had to stay in the vfs hash table
-		 * until now, so that fsck could check whether unlinked inodes
-		 * are still open; remove it only once deletion is done:
- */
- bch2_inode_hash_remove(c, inode);
- }
-
- mutex_lock(&c->vfs_inodes_lock);
- list_del_init(&inode->ei_vfs_inode_list);
- mutex_unlock(&c->vfs_inodes_lock);
-}
-
-void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
-{
- struct bch_inode_info *inode;
- DARRAY(struct bch_inode_info *) grabbed;
- bool clean_pass = false, this_pass_clean;
-
- /*
- * Initially, we scan for inodes without I_DONTCACHE, then mark them to
- * be pruned with d_mark_dontcache().
- *
- * Once we've had a clean pass where we didn't find any inodes without
- * I_DONTCACHE, we wait for them to be freed:
- */
-
- darray_init(&grabbed);
- darray_make_room(&grabbed, 1024);
-again:
- cond_resched();
- this_pass_clean = true;
-
- mutex_lock(&c->vfs_inodes_lock);
- list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
- if (!snapshot_list_has_id(s, inode->ei_inum.subvol))
- continue;
-
- if (!(inode->v.i_state & I_DONTCACHE) &&
- !(inode->v.i_state & I_FREEING) &&
- igrab(&inode->v)) {
- this_pass_clean = false;
-
- if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
- iput(&inode->v);
- break;
- }
- } else if (clean_pass && this_pass_clean) {
- struct wait_bit_queue_entry wqe;
- struct wait_queue_head *wq_head;
-
- wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW);
- prepare_to_wait_event(wq_head, &wqe.wq_entry,
- TASK_UNINTERRUPTIBLE);
- mutex_unlock(&c->vfs_inodes_lock);
-
- schedule();
- finish_wait(wq_head, &wqe.wq_entry);
- goto again;
- }
- }
- mutex_unlock(&c->vfs_inodes_lock);
-
- darray_for_each(grabbed, i) {
- inode = *i;
- d_mark_dontcache(&inode->v);
- d_prune_aliases(&inode->v);
- iput(&inode->v);
- }
- grabbed.nr = 0;
-
- if (!clean_pass || !this_pass_clean) {
- clean_pass = this_pass_clean;
- goto again;
- }
-
- darray_exit(&grabbed);
-}
-
-static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
- struct super_block *sb = dentry->d_sb;
- struct bch_fs *c = sb->s_fs_info;
- struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
- unsigned shift = sb->s_blocksize_bits - 9;
- /*
-	 * this assumes inodes take up 64 bytes, which is a decent average
-	 * number: capacity and used are in 512-byte sectors, so << 3 gives
-	 * 8 inodes per free sector, i.e. 64 bytes per inode:
- */
- u64 avail_inodes = ((usage.capacity - usage.used) << 3);
-
- buf->f_type = BCACHEFS_STATFS_MAGIC;
- buf->f_bsize = sb->s_blocksize;
- buf->f_blocks = usage.capacity >> shift;
- buf->f_bfree = usage.free >> shift;
- buf->f_bavail = avail_factor(usage.free) >> shift;
-
- buf->f_files = usage.nr_inodes + avail_inodes;
- buf->f_ffree = avail_inodes;
-
- buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b);
- buf->f_namelen = BCH_NAME_MAX;
-
- return 0;
-}
-
-static int bch2_sync_fs(struct super_block *sb, int wait)
-{
- struct bch_fs *c = sb->s_fs_info;
- int ret;
-
- trace_bch2_sync_fs(sb, wait);
-
- if (c->opts.journal_flush_disabled)
- return 0;
-
- if (!wait) {
- bch2_journal_flush_async(&c->journal, NULL);
- return 0;
- }
-
- ret = bch2_journal_flush(&c->journal);
- return bch2_err_class(ret);
-}
-
-static struct bch_fs *bch2_path_to_fs(const char *path)
-{
- struct bch_fs *c;
- dev_t dev;
- int ret;
-
- ret = lookup_bdev(path, &dev);
- if (ret)
- return ERR_PTR(ret);
-
- c = bch2_dev_to_fs(dev);
- if (c)
- closure_put(&c->cl);
- return c ?: ERR_PTR(-ENOENT);
-}
-
-static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
-{
- struct bch_fs *c = root->d_sb->s_fs_info;
- bool first = true;
-
- for_each_online_member(c, ca) {
- if (!first)
- seq_putc(seq, ':');
- first = false;
- seq_puts(seq, ca->disk_sb.sb_name);
- }
-
- return 0;
-}
-
-static int bch2_show_options(struct seq_file *seq, struct dentry *root)
-{
- struct bch_fs *c = root->d_sb->s_fs_info;
- struct printbuf buf = PRINTBUF;
-
- bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
- OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
- printbuf_nul_terminate(&buf);
- seq_printf(seq, ",%s", buf.buf);
-
- int ret = buf.allocation_failure ? -ENOMEM : 0;
- printbuf_exit(&buf);
- return ret;
-}
-
-static void bch2_put_super(struct super_block *sb)
-{
- struct bch_fs *c = sb->s_fs_info;
-
- __bch2_fs_stop(c);
-}
-
-/*
- * bcachefs doesn't currently integrate with intwrite freeze protection, but
- * the internal write references serve the same purpose. Therefore reuse the
- * read-only transition code to perform the quiesce. The caveat is that we don't
- * currently have the ability to block tasks that want a write reference while
- * the superblock is frozen. This is fine for now, but we should either add
- * blocking support or find a way to integrate sb_start_intwrite() and friends.
- */
-static int bch2_freeze(struct super_block *sb)
-{
- struct bch_fs *c = sb->s_fs_info;
-
- down_write(&c->state_lock);
- bch2_fs_read_only(c);
- up_write(&c->state_lock);
- return 0;
-}
-
-static int bch2_unfreeze(struct super_block *sb)
-{
- struct bch_fs *c = sb->s_fs_info;
- int ret;
-
- if (test_bit(BCH_FS_emergency_ro, &c->flags))
- return 0;
-
- down_write(&c->state_lock);
- ret = bch2_fs_read_write(c);
- up_write(&c->state_lock);
- return ret;
-}
-
-static const struct super_operations bch_super_operations = {
- .alloc_inode = bch2_alloc_inode,
- .free_inode = bch2_free_inode,
- .write_inode = bch2_vfs_write_inode,
- .evict_inode = bch2_evict_inode,
- .sync_fs = bch2_sync_fs,
- .statfs = bch2_statfs,
- .show_devname = bch2_show_devname,
- .show_options = bch2_show_options,
- .put_super = bch2_put_super,
- .freeze_fs = bch2_freeze,
- .unfreeze_fs = bch2_unfreeze,
-};
-
-static int bch2_set_super(struct super_block *s, void *data)
-{
- s->s_fs_info = data;
- return 0;
-}
-
-static int bch2_noset_super(struct super_block *s, void *data)
-{
- return -EBUSY;
-}
-
-typedef DARRAY(struct bch_fs *) darray_fs;
-
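-/*
- * Match an existing superblock only if every device being mounted belongs
- * to the same bch_fs as that superblock - mounting devices that are already
- * open reuses it; anything else falls through to bch2_fs_open():
- */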
-static int bch2_test_super(struct super_block *s, void *data)
-{
- struct bch_fs *c = s->s_fs_info;
- darray_fs *d = data;
-
- if (!c)
- return false;
-
- darray_for_each(*d, i)
- if (c != *i)
- return false;
- return true;
-}
-
-static int bch2_fs_get_tree(struct fs_context *fc)
-{
- struct bch_fs *c;
- struct super_block *sb;
- struct inode *vinode;
- struct bch2_opts_parse *opts_parse = fc->fs_private;
- struct bch_opts opts = opts_parse->opts;
- darray_str devs;
- darray_fs devs_to_fs = {};
- int ret;
-
- opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
- opt_set(opts, nostart, true);
-
- if (!fc->source || strlen(fc->source) == 0)
- return -EINVAL;
-
- ret = bch2_split_devs(fc->source, &devs);
- if (ret)
- return ret;
-
- darray_for_each(devs, i) {
- ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
- if (ret)
- goto err;
- }
-
- sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs);
- if (!IS_ERR(sb))
- goto got_sb;
-
- c = bch2_fs_open(devs.data, devs.nr, opts);
- ret = PTR_ERR_OR_ZERO(c);
- if (ret)
- goto err;
-
- if (opt_defined(opts, discard))
- set_bit(BCH_FS_discard_mount_opt_set, &c->flags);
-
- /* Some options can't be parsed until after the fs is started: */
- opts = bch2_opts_empty();
- ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf);
- if (ret)
- goto err_stop_fs;
-
- bch2_opts_apply(&c->opts, opts);
-
- /*
- * need to initialise sb and set c->vfs_sb _before_ starting fs,
- * for blk_holder_ops
- */
-
- sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
- ret = PTR_ERR_OR_ZERO(sb);
- if (ret)
- goto err_stop_fs;
-got_sb:
- c = sb->s_fs_info;
-
- if (sb->s_root) {
- if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) {
- ret = -EBUSY;
- goto err_put_super;
- }
- goto out;
- }
-
- sb->s_blocksize = block_bytes(c);
- sb->s_blocksize_bits = ilog2(block_bytes(c));
- sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_op = &bch_super_operations;
- sb->s_export_op = &bch_export_ops;
-#ifdef CONFIG_BCACHEFS_QUOTA
- sb->s_qcop = &bch2_quotactl_operations;
- sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
-#endif
- sb->s_xattr = bch2_xattr_handlers;
- sb->s_magic = BCACHEFS_STATFS_MAGIC;
- sb->s_time_gran = c->sb.nsec_per_time_unit;
- sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
- sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
- super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid));
- super_set_sysfs_name_uuid(sb);
- sb->s_shrink->seeks = 0;
- c->vfs_sb = sb;
- strscpy(sb->s_id, c->name, sizeof(sb->s_id));
-
- ret = super_setup_bdi(sb);
- if (ret)
- goto err_put_super;
-
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
-
- for_each_online_member(c, ca) {
- struct block_device *bdev = ca->disk_sb.bdev;
-
- /* XXX: create an anonymous device for multi device filesystems */
- sb->s_bdev = bdev;
- sb->s_dev = bdev->bd_dev;
- percpu_ref_put(&ca->io_ref);
- break;
- }
-
- c->dev = sb->s_dev;
-
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- if (c->opts.acl)
- sb->s_flags |= SB_POSIXACL;
-#endif
-
- ret = bch2_fs_start(c);
- if (ret)
- goto err_put_super;
-
- vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
- ret = PTR_ERR_OR_ZERO(vinode);
- bch_err_msg(c, ret, "mounting: error getting root inode");
- if (ret)
- goto err_put_super;
-
- sb->s_root = d_make_root(vinode);
- if (!sb->s_root) {
- bch_err(c, "error mounting: error allocating root dentry");
- ret = -ENOMEM;
- goto err_put_super;
- }
-
- sb->s_flags |= SB_ACTIVE;
-out:
- fc->root = dget(sb->s_root);
-err:
- darray_exit(&devs_to_fs);
- bch2_darray_str_exit(&devs);
- if (ret)
- pr_err("error: %s", bch2_err_str(ret));
- /*
- * On an inconsistency error in recovery we might see an -EROFS derived
-	 * error code (from the journal), but we don't want to return that to
- * userspace as that causes util-linux to retry the mount RO - which is
- * confusing:
- */
- if (bch2_err_matches(ret, EROFS) && ret != -EROFS)
- ret = -EIO;
- return bch2_err_class(ret);
-
-err_stop_fs:
- bch2_fs_stop(c);
- goto err;
-
-err_put_super:
- __bch2_fs_stop(c);
- deactivate_locked_super(sb);
- goto err;
-}
-
-static void bch2_kill_sb(struct super_block *sb)
-{
- struct bch_fs *c = sb->s_fs_info;
-
- generic_shutdown_super(sb);
- bch2_fs_free(c);
-}
-
-static void bch2_fs_context_free(struct fs_context *fc)
-{
- struct bch2_opts_parse *opts = fc->fs_private;
-
- if (opts) {
- printbuf_exit(&opts->parse_later);
- kfree(opts);
- }
-}
-
-static int bch2_fs_parse_param(struct fs_context *fc,
- struct fs_parameter *param)
-{
- /*
- * the "source" param, i.e., the name of the device(s) to mount,
- * is handled by the VFS layer.
- */
- if (!strcmp(param->key, "source"))
- return -ENOPARAM;
-
- struct bch2_opts_parse *opts = fc->fs_private;
- struct bch_fs *c = NULL;
-
- /* for reconfigure, we already have a struct bch_fs */
- if (fc->root)
- c = fc->root->d_sb->s_fs_info;
-
- int ret = bch2_parse_one_mount_opt(c, &opts->opts,
- &opts->parse_later, param->key,
- param->string);
-
- return bch2_err_class(ret);
-}
-
-static int bch2_fs_reconfigure(struct fs_context *fc)
-{
- struct super_block *sb = fc->root->d_sb;
- struct bch2_opts_parse *opts = fc->fs_private;
- struct bch_fs *c = sb->s_fs_info;
- int ret = 0;
-
- opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
-
- if (opts->opts.read_only != c->opts.read_only) {
- down_write(&c->state_lock);
-
- if (opts->opts.read_only) {
- bch2_fs_read_only(c);
-
- sb->s_flags |= SB_RDONLY;
- } else {
- ret = bch2_fs_read_write(c);
- if (ret) {
- bch_err(c, "error going rw: %i", ret);
- up_write(&c->state_lock);
- ret = -EINVAL;
- goto err;
- }
-
- sb->s_flags &= ~SB_RDONLY;
- }
-
- c->opts.read_only = opts->opts.read_only;
-
- up_write(&c->state_lock);
- }
-
- if (opt_defined(opts->opts, errors))
- c->opts.errors = opts->opts.errors;
-err:
- return bch2_err_class(ret);
-}
-
-static const struct fs_context_operations bch2_context_ops = {
- .free = bch2_fs_context_free,
- .parse_param = bch2_fs_parse_param,
- .get_tree = bch2_fs_get_tree,
- .reconfigure = bch2_fs_reconfigure,
-};
-
-static int bch2_init_fs_context(struct fs_context *fc)
-{
- struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL);
-
- if (!opts)
- return -ENOMEM;
-
- opts->parse_later = PRINTBUF;
-
- fc->ops = &bch2_context_ops;
- fc->fs_private = opts;
-
- return 0;
-}
-
-void bch2_fs_vfs_exit(struct bch_fs *c)
-{
- if (c->vfs_inodes_by_inum_table.ht.tbl)
- rhltable_destroy(&c->vfs_inodes_by_inum_table);
- if (c->vfs_inodes_table.tbl)
- rhashtable_destroy(&c->vfs_inodes_table);
-}
-
-int bch2_fs_vfs_init(struct bch_fs *c)
-{
- return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?:
- rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params);
-}
-
-static struct file_system_type bcache_fs_type = {
- .owner = THIS_MODULE,
- .name = "bcachefs",
- .init_fs_context = bch2_init_fs_context,
- .kill_sb = bch2_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_LBS,
-};
-
-MODULE_ALIAS_FS("bcachefs");
-
-void bch2_vfs_exit(void)
-{
- unregister_filesystem(&bcache_fs_type);
- kmem_cache_destroy(bch2_inode_cache);
-}
-
-int __init bch2_vfs_init(void)
-{
- int ret = -ENOMEM;
-
- bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT |
- SLAB_ACCOUNT);
- if (!bch2_inode_cache)
- goto err;
-
- ret = register_filesystem(&bcache_fs_type);
- if (ret)
- goto err;
-
- return 0;
-err:
- bch2_vfs_exit();
- return ret;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
deleted file mode 100644
index dd2198541455..000000000000
--- a/fs/bcachefs/fs.h
+++ /dev/null
@@ -1,215 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_H
-#define _BCACHEFS_FS_H
-
-#include "inode.h"
-#include "opts.h"
-#include "str_hash.h"
-#include "quota_types.h"
-#include "two_state_shared_lock.h"
-
-#include <linux/seqlock.h>
-#include <linux/stat.h>
-
-struct bch_inode_info {
- struct inode v;
- struct rhash_head hash;
- struct rhlist_head by_inum_hash;
- subvol_inum ei_inum;
-
- struct list_head ei_vfs_inode_list;
- unsigned long ei_flags;
-
- struct mutex ei_update_lock;
- u64 ei_quota_reserved;
- unsigned long ei_last_dirtied;
- two_state_lock_t ei_pagecache_lock;
-
- struct mutex ei_quota_lock;
- struct bch_qid ei_qid;
-
- /*
- * When we've been doing nocow writes we'll need to issue flushes to the
- * underlying block devices
- *
- * XXX: a device may have had a flush issued by some other codepath. It
- * would be better to keep for each device a sequence number that's
-	 * incremented when we issue a cache flush, and track here the sequence
- * number that needs flushing.
- */
- struct bch_devs_mask ei_devs_need_flush;
-
- /* copy of inode in btree: */
- struct bch_inode_unpacked ei_inode;
-};
-
-#define bch2_pagecache_add_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 0)
-#define bch2_pagecache_add_tryget(i) bch2_two_state_trylock(&i->ei_pagecache_lock, 0)
-#define bch2_pagecache_add_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 0)
-
-#define bch2_pagecache_block_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 1)
-#define bch2_pagecache_block_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 1)
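-
-/*
- * ei_pagecache_lock is a two-state shared lock: any number of holders on
- * the "add" side, or any number on the "block" side, never both. An assumed
- * usage sketch (the callers live outside this header): paths inserting
- * pages into the page cache take the add side, e.g.
- *
- *	bch2_pagecache_add_get(inode);
- *	...add pages, copy data...
- *	bch2_pagecache_add_put(inode);
- *
- * while paths that need the page cache held off while they update the btree
- * directly take the block side.
- */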
-
-static inline subvol_inum inode_inum(struct bch_inode_info *inode)
-{
- return inode->ei_inum;
-}
-
-/*
- * Set if we've gotten a btree error for this inode, and thus the vfs inode and
- * btree inode may be inconsistent:
- */
-#define EI_INODE_ERROR 0
-
-/*
- * Set if the inode is in a snapshot subvolume - we don't do quota accounting in
- * those:
- */
-#define EI_INODE_SNAPSHOT 1
-#define EI_INODE_HASHED 2
-
-#define to_bch_ei(_inode) \
- container_of_or_null(_inode, struct bch_inode_info, v)
-
-static inline int ptrcmp(void *l, void *r)
-{
- return cmp_int(l, r);
-}
-
-enum bch_inode_lock_op {
- INODE_PAGECACHE_BLOCK = (1U << 0),
- INODE_UPDATE_LOCK = (1U << 1),
-};
-
-#define bch2_lock_inodes(_locks, ...) \
-do { \
- struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \
- unsigned i; \
- \
- bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \
- \
- for (i = 1; i < ARRAY_SIZE(a); i++) \
- if (a[i] != a[i - 1]) { \
- if ((_locks) & INODE_PAGECACHE_BLOCK) \
- bch2_pagecache_block_get(a[i]);\
- if ((_locks) & INODE_UPDATE_LOCK) \
- mutex_lock_nested(&a[i]->ei_update_lock, i);\
- } \
-} while (0)
-
-#define bch2_unlock_inodes(_locks, ...) \
-do { \
- struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \
- unsigned i; \
- \
- bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \
- \
- for (i = 1; i < ARRAY_SIZE(a); i++) \
- if (a[i] != a[i - 1]) { \
- if ((_locks) & INODE_PAGECACHE_BLOCK) \
- bch2_pagecache_block_put(a[i]);\
- if ((_locks) & INODE_UPDATE_LOCK) \
- mutex_unlock(&a[i]->ei_update_lock); \
- } \
-} while (0)
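-
-/*
- * Usage sketch with hypothetical locals: because both macros sort their
- * arguments by pointer, all callers acquire ei_update_lock in one global
- * order, so this is deadlock-free even if two threads pass the same inodes
- * in opposite argument order:
- *
- *	bch2_lock_inodes(INODE_UPDATE_LOCK, src_dir, dst_dir);
- *	...update both directories...
- *	bch2_unlock_inodes(INODE_UPDATE_LOCK, src_dir, dst_dir);
- */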
-
-static inline struct bch_inode_info *file_bch_inode(struct file *file)
-{
- return to_bch_ei(file_inode(file));
-}
-
-static inline bool inode_attr_changing(struct bch_inode_info *dir,
- struct bch_inode_info *inode,
- enum inode_opt_id id)
-{
- return !(inode->ei_inode.bi_fields_set & (1 << id)) &&
- bch2_inode_opt_get(&dir->ei_inode, id) !=
- bch2_inode_opt_get(&inode->ei_inode, id);
-}
-
-static inline bool inode_attrs_changing(struct bch_inode_info *dir,
- struct bch_inode_info *inode)
-{
- unsigned id;
-
- for (id = 0; id < Inode_opt_nr; id++)
- if (inode_attr_changing(dir, inode, id))
- return true;
-
- return false;
-}
-
-struct bch_inode_unpacked;
-
-#ifndef NO_BCACHEFS_FS
-
-struct bch_inode_info *
-__bch2_create(struct mnt_idmap *, struct bch_inode_info *,
- struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
-
-int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p);
-
-int bch2_fs_quota_transfer(struct bch_fs *,
- struct bch_inode_info *,
- struct bch_qid,
- unsigned,
- enum quota_acct_mode);
-
-static inline int bch2_set_projid(struct bch_fs *c,
- struct bch_inode_info *inode,
- u32 projid)
-{
- struct bch_qid qid = inode->ei_qid;
-
- qid.q[QTYP_PRJ] = projid;
-
- return bch2_fs_quota_transfer(c, inode, qid,
- 1 << QTYP_PRJ,
- KEY_TYPE_QUOTA_PREALLOC);
-}
-
-struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum);
-
-/* returns 0 if we want to do the update, or the error is passed up */
-typedef int (*inode_set_fn)(struct btree_trans *,
- struct bch_inode_info *,
- struct bch_inode_unpacked *, void *);
-
-void bch2_inode_update_after_write(struct btree_trans *,
- struct bch_inode_info *,
- struct bch_inode_unpacked *,
- unsigned);
-int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
- inode_set_fn, void *, unsigned);
-
-int bch2_setattr_nonsize(struct mnt_idmap *,
- struct bch_inode_info *,
- struct iattr *);
-int __bch2_unlink(struct inode *, struct dentry *, bool);
-
-void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *);
-
-void bch2_fs_vfs_exit(struct bch_fs *);
-int bch2_fs_vfs_init(struct bch_fs *);
-
-void bch2_vfs_exit(void);
-int bch2_vfs_init(void);
-
-#else
-
-#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); })
-
-static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; }
-
-static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
- snapshot_id_list *s) {}
-
-static inline void bch2_fs_vfs_exit(struct bch_fs *c) {}
-static inline int bch2_fs_vfs_init(struct bch_fs *c) { return 0; }
-
-static inline void bch2_vfs_exit(void) {}
-static inline int bch2_vfs_init(void) { return 0; }
-
-#endif /* NO_BCACHEFS_FS */
-
-#endif /* _BCACHEFS_FS_H */
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
deleted file mode 100644
index 091057023fc5..000000000000
--- a/fs/bcachefs/fsck.c
+++ /dev/null
@@ -1,3152 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bcachefs_ioctl.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "darray.h"
-#include "dirent.h"
-#include "error.h"
-#include "fs.h"
-#include "fsck.h"
-#include "inode.h"
-#include "keylist.h"
-#include "namei.h"
-#include "recovery_passes.h"
-#include "snapshot.h"
-#include "super.h"
-#include "thread_with_file.h"
-#include "xattr.h"
-
-#include <linux/bsearch.h>
-#include <linux/dcache.h> /* struct qstr */
-
-static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *inode)
-{
- if (d.v->d_type == DT_SUBVOL
- ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
- : le64_to_cpu(d.v->d_inum) == inode->bi_inum)
- return 0;
- return -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
-}
-
-static void dirent_inode_mismatch_msg(struct printbuf *out,
- struct bch_fs *c,
- struct bkey_s_c_dirent dirent,
- struct bch_inode_unpacked *inode)
-{
- prt_str(out, "inode points to dirent that does not point back:");
- prt_newline(out);
- bch2_bkey_val_to_text(out, c, dirent.s_c);
- prt_newline(out);
- bch2_inode_unpacked_to_text(out, inode);
-}
-
-static int dirent_points_to_inode(struct bch_fs *c,
- struct bkey_s_c_dirent dirent,
- struct bch_inode_unpacked *inode)
-{
- int ret = dirent_points_to_inode_nowarn(dirent, inode);
- if (ret) {
- struct printbuf buf = PRINTBUF;
- dirent_inode_mismatch_msg(&buf, c, dirent, inode);
- bch_warn(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
- return ret;
-}
-
-/*
- * XXX: this is handling transaction restarts without returning
- * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
- */
-static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
- u32 snapshot)
-{
- u64 sectors = 0;
-
- int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
- SPOS(inum, 0, snapshot),
- POS(inum, U64_MAX),
- 0, k, ({
- if (bkey_extent_is_allocation(k.k))
- sectors += k.k->size;
- 0;
- }));
-
- return ret ?: sectors;
-}
-
-static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
- u32 snapshot)
-{
- u64 subdirs = 0;
-
- int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents,
- SPOS(inum, 0, snapshot),
- POS(inum, U64_MAX),
- 0, k, ({
- if (k.k->type == KEY_TYPE_dirent &&
- bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
- subdirs++;
- 0;
- }));
-
- return ret ?: subdirs;
-}
-
-static int subvol_lookup(struct btree_trans *trans, u32 subvol,
- u32 *snapshot, u64 *inum)
-{
- struct bch_subvolume s;
- int ret = bch2_subvolume_get(trans, subvol, false, &s);
-
- *snapshot = le32_to_cpu(s.snapshot);
- *inum = le64_to_cpu(s.inode);
- return ret;
-}
-
-static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot,
- struct bch_inode_unpacked *inode)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, inode_nr, snapshot), 0);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- ret = bkey_is_inode(k.k)
- ? bch2_inode_unpack(k, inode)
- : -BCH_ERR_ENOENT_inode;
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int lookup_dirent_in_snapshot(struct btree_trans *trans,
- struct bch_hash_info hash_info,
- subvol_inum dir, struct qstr *name,
- u64 *target, unsigned *type, u32 snapshot)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
- &hash_info, dir, name, 0, snapshot);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- *target = le64_to_cpu(d.v->d_inum);
- *type = d.v->d_type;
- bch2_trans_iter_exit(trans, &iter);
- return 0;
-}
-
-/*
- * Find any subvolume associated with a tree of snapshots
- * We can't rely on master_subvol - it might have been deleted.
- */
-static int find_snapshot_tree_subvol(struct btree_trans *trans,
- u32 tree_id, u32 *subvol)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) {
- if (k.k->type != KEY_TYPE_snapshot)
- continue;
-
- struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
- if (le32_to_cpu(s.v->tree) != tree_id)
- continue;
-
- if (s.v->subvol) {
- *subvol = le32_to_cpu(s.v->subvol);
- goto found;
- }
- }
- ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol;
-found:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/* Get lost+found, create if it doesn't exist: */
-static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
- struct bch_inode_unpacked *lostfound,
- u64 reattaching_inum)
-{
- struct bch_fs *c = trans->c;
- struct qstr lostfound_str = QSTR("lost+found");
- struct btree_iter lostfound_iter = { NULL };
- u64 inum = 0;
- unsigned d_type = 0;
- int ret;
-
- struct bch_snapshot_tree st;
- ret = bch2_snapshot_tree_lookup(trans,
- bch2_snapshot_tree(c, snapshot), &st);
- if (ret)
- return ret;
-
- u32 subvolid;
- ret = find_snapshot_tree_subvol(trans,
- bch2_snapshot_tree(c, snapshot), &subvolid);
- bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u",
- bch2_snapshot_tree(c, snapshot));
- if (ret)
- return ret;
-
- struct bch_subvolume subvol;
- ret = bch2_subvolume_get(trans, subvolid, false, &subvol);
- bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot);
- if (ret)
- return ret;
-
- if (!subvol.inode) {
- struct btree_iter iter;
- struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_subvolumes, POS(0, subvolid),
- 0, subvolume);
- ret = PTR_ERR_OR_ZERO(subvol);
- if (ret)
- return ret;
-
- subvol->v.inode = cpu_to_le64(reattaching_inum);
- bch2_trans_iter_exit(trans, &iter);
- }
-
- subvol_inum root_inum = {
- .subvol = subvolid,
- .inum = le64_to_cpu(subvol.inode)
- };
-
- struct bch_inode_unpacked root_inode;
- struct bch_hash_info root_hash_info;
- ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode);
- bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
- root_inum.inum, subvolid);
- if (ret)
- return ret;
-
- root_hash_info = bch2_hash_info_init(c, &root_inode);
-
- ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
- &lostfound_str, &inum, &d_type, snapshot);
- if (bch2_err_matches(ret, ENOENT))
- goto create_lostfound;
-
- bch_err_fn(c, ret);
- if (ret)
- return ret;
-
- if (d_type != DT_DIR) {
- bch_err(c, "error looking up lost+found: not a directory");
- return -BCH_ERR_ENOENT_not_directory;
- }
-
- /*
-	 * The bch2_check_dirents pass has already run; dangling dirents
- * shouldn't exist here:
- */
- ret = lookup_inode(trans, inum, snapshot, lostfound);
- bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
- inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
- return ret;
-
-create_lostfound:
- /*
- * we always create lost+found in the root snapshot; we don't want
- * different branches of the snapshot tree to have different lost+found
- */
- snapshot = le32_to_cpu(st.root_snapshot);
- /*
- * XXX: we could have a nicer log message here if we had a nice way to
- * walk backpointers to print a path
- */
- struct printbuf path = PRINTBUF;
- ret = bch2_inum_to_path(trans, root_inum, &path);
- if (ret)
- goto err;
-
- bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u",
- path.buf, root_inum.subvol, snapshot);
- printbuf_exit(&path);
-
- u64 now = bch2_current_time(c);
- u64 cpu = raw_smp_processor_id();
-
- bch2_inode_init_early(c, lostfound);
- bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
- lostfound->bi_dir = root_inode.bi_inum;
- lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);
-
- root_inode.bi_nlink++;
-
- ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
- if (ret)
- goto err;
-
- bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
- ret = bch2_btree_iter_traverse(&lostfound_iter);
- if (ret)
- goto err;
-
- ret = bch2_dirent_create_snapshot(trans,
- 0, root_inode.bi_inum, snapshot, &root_hash_info,
- mode_to_type(lostfound->bi_mode),
- &lostfound_str,
- lostfound->bi_inum,
- &lostfound->bi_dir_offset,
- STR_HASH_must_create) ?:
- bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
- BTREE_UPDATE_internal_snapshot_node);
-err:
- bch_err_msg(c, ret, "creating lost+found");
- bch2_trans_iter_exit(trans, &lostfound_iter);
- return ret;
-}
-
-static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
-{
- if (inode->bi_inum == BCACHEFS_ROOT_INO &&
- inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
- return false;
-
- return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
-}
-
-static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
- SPOS(d_pos.inode, d_pos.offset, snapshot),
- BTREE_ITER_intent|
- BTREE_ITER_with_updates);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (bpos_eq(k.k->p, d_pos)) {
- /*
-		 * bch2_btree_delete_at() doesn't work because the update path doesn't
- * internally use BTREE_ITER_with_updates yet
- */
- struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
- ret = PTR_ERR_OR_ZERO(k);
- if (ret)
- goto err;
-
- bkey_init(&k->k);
- k->k.type = KEY_TYPE_whiteout;
- k->k.p = iter.pos;
- ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
-{
- struct bch_fs *c = trans->c;
- struct bch_inode_unpacked lostfound;
- char name_buf[20];
- int ret;
-
- u32 dirent_snapshot = inode->bi_snapshot;
- if (inode->bi_subvol) {
- inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
-
- u64 root_inum;
- ret = subvol_lookup(trans, inode->bi_parent_subvol,
- &dirent_snapshot, &root_inum);
- if (ret)
- return ret;
-
- snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
- } else {
- snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
- }
-
- ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
- if (ret)
- return ret;
-
- lostfound.bi_nlink += S_ISDIR(inode->bi_mode);
-
- /* ensure lost+found inode is also present in inode snapshot */
- if (!inode->bi_subvol) {
- BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot));
- lostfound.bi_snapshot = inode->bi_snapshot;
- }
-
- ret = __bch2_fsck_write_inode(trans, &lostfound);
- if (ret)
- return ret;
-
- struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
- struct qstr name = QSTR(name_buf);
-
- inode->bi_dir = lostfound.bi_inum;
-
- ret = bch2_dirent_create_snapshot(trans,
- inode->bi_parent_subvol, lostfound.bi_inum,
- dirent_snapshot,
- &dir_hash,
- inode_d_type(inode),
- &name,
- inode->bi_subvol ?: inode->bi_inum,
- &inode->bi_dir_offset,
- STR_HASH_must_create);
- if (ret) {
- bch_err_msg(c, ret, "error creating dirent");
- return ret;
- }
-
- ret = __bch2_fsck_write_inode(trans, inode);
- if (ret)
- return ret;
-
- /*
-	 * Fix up inodes in child snapshots: if they should also be reattached,
-	 * update the backpointer field; if they should not be, we need to emit
-	 * whiteouts for the dirent we just created.
- */
- if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
- snapshot_id_list whiteouts_done;
- struct btree_iter iter;
- struct bkey_s_c k;
-
- darray_init(&whiteouts_done);
-
- for_each_btree_key_reverse_norestart(trans, iter,
- BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
- BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
- if (k.k->p.offset != inode->bi_inum)
- break;
-
- if (!bkey_is_inode(k.k) ||
- !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
- snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
- continue;
-
- struct bch_inode_unpacked child_inode;
- ret = bch2_inode_unpack(k, &child_inode);
- if (ret)
- break;
-
- if (!inode_should_reattach(&child_inode)) {
- ret = maybe_delete_dirent(trans,
- SPOS(lostfound.bi_inum, inode->bi_dir_offset,
- dirent_snapshot),
- k.k->p.snapshot);
- if (ret)
- break;
-
- ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
- if (ret)
- break;
- } else {
- iter.snapshot = k.k->p.snapshot;
- child_inode.bi_dir = inode->bi_dir;
- child_inode.bi_dir_offset = inode->bi_dir_offset;
-
- ret = bch2_inode_write_flags(trans, &iter, &child_inode,
- BTREE_UPDATE_internal_snapshot_node);
- if (ret)
- break;
- }
- }
- darray_exit(&whiteouts_done);
- bch2_trans_iter_exit(trans, &iter);
- }
-
- return ret;
-}
-
-static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos pos)
-{
- return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
-}
-
-static int remove_backpointer(struct btree_trans *trans,
- struct bch_inode_unpacked *inode)
-{
- if (!inode->bi_dir)
- return 0;
-
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter,
- SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot));
- int ret = bkey_err(d) ?:
- dirent_points_to_inode(c, d, inode) ?:
- bch2_fsck_remove_dirent(trans, d.k->p);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
-{
- struct bch_fs *c = trans->c;
-
- struct bch_inode_unpacked inode;
- int ret = bch2_inode_find_by_inum_trans(trans,
- (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
- &inode);
- if (ret)
- return ret;
-
- ret = remove_backpointer(trans, &inode);
- if (!bch2_err_matches(ret, ENOENT))
- bch_err_msg(c, ret, "removing dirent");
- if (ret)
- return ret;
-
- ret = reattach_inode(trans, &inode);
- bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
- return ret;
-}
-
-static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
-{
- struct bch_fs *c = trans->c;
-
- if (!bch2_snapshot_is_leaf(c, snapshotid)) {
- bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
- return -BCH_ERR_fsck_repair_unimplemented;
- }
-
- /*
- * If inum isn't set, that means we're being called from check_dirents,
- * not check_inodes - the root of this subvolume doesn't exist or we
- * would have found it there:
- */
- if (!inum) {
- struct btree_iter inode_iter = {};
- struct bch_inode_unpacked new_inode;
- u64 cpu = raw_smp_processor_id();
-
- bch2_inode_init_early(c, &new_inode);
- bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
-
- new_inode.bi_subvol = subvolid;
-
- int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
- bch2_btree_iter_traverse(&inode_iter) ?:
- bch2_inode_write(trans, &inode_iter, &new_inode);
- bch2_trans_iter_exit(trans, &inode_iter);
- if (ret)
- return ret;
-
- inum = new_inode.bi_inum;
- }
-
- bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
-
- struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
- int ret = PTR_ERR_OR_ZERO(new_subvol);
- if (ret)
- return ret;
-
- bkey_subvolume_init(&new_subvol->k_i);
- new_subvol->k.p.offset = subvolid;
- new_subvol->v.snapshot = cpu_to_le32(snapshotid);
- new_subvol->v.inode = cpu_to_le64(inum);
- ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
- if (ret)
- return ret;
-
- struct btree_iter iter;
- struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_snapshots, POS(0, snapshotid),
- 0, snapshot);
- ret = PTR_ERR_OR_ZERO(s);
- bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
- if (ret)
- return ret;
-
- u32 snapshot_tree = le32_to_cpu(s->v.tree);
-
- s->v.subvol = cpu_to_le32(subvolid);
- SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
- bch2_trans_iter_exit(trans, &iter);
-
- struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
- 0, snapshot_tree);
- ret = PTR_ERR_OR_ZERO(st);
- bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
- if (ret)
- return ret;
-
- if (!st->v.master_subvol)
- st->v.master_subvol = cpu_to_le32(subvolid);
-
- bch2_trans_iter_exit(trans, &iter);
- return 0;
-}
-
-static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
-{
- struct bch_fs *c = trans->c;
- unsigned i_mode = S_IFREG;
- u64 i_size = 0;
-
- switch (btree) {
- case BTREE_ID_extents: {
- struct btree_iter iter = {};
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
- struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0));
- bch2_trans_iter_exit(trans, &iter);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- i_size = k.k->p.offset << 9;
- break;
- }
- case BTREE_ID_dirents:
- i_mode = S_IFDIR;
- break;
- case BTREE_ID_xattrs:
- break;
- default:
- BUG();
- }
-
- struct bch_inode_unpacked new_inode;
- bch2_inode_init_early(c, &new_inode);
- bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
- new_inode.bi_size = i_size;
- new_inode.bi_inum = inum;
- new_inode.bi_snapshot = snapshot;
-
- return __bch2_fsck_write_inode(trans, &new_inode);
-}
-
-struct snapshots_seen {
- struct bpos pos;
- snapshot_id_list ids;
-};
-
-static inline void snapshots_seen_exit(struct snapshots_seen *s)
-{
- darray_exit(&s->ids);
-}
-
-static inline void snapshots_seen_init(struct snapshots_seen *s)
-{
- memset(s, 0, sizeof(*s));
-}
-
-static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
-{
- u32 *i;
- __darray_for_each(s->ids, i) {
- if (*i == id)
- return 0;
- if (*i > id)
- break;
- }
-
- int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
- if (ret)
- bch_err(c, "error reallocating snapshots_seen table (size %zu)",
- s->ids.size);
- return ret;
-}
-
-static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
- enum btree_id btree_id, struct bpos pos)
-{
- if (!bkey_eq(s->pos, pos))
- s->ids.nr = 0;
- s->pos = pos;
-
- return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
-}
-
-/**
- * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
- * and @ancestor hasn't been overwritten in @seen
- *
- * @c: filesystem handle
- * @seen: list of snapshot ids already seen at current position
- * @id: descendent snapshot id
- * @ancestor: ancestor snapshot id
- *
- * Returns: whether key in @ancestor snapshot is visible in @id snapshot
- */
-static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
- u32 id, u32 ancestor)
-{
- ssize_t i;
-
- EBUG_ON(id > ancestor);
-
- /* @ancestor should be the snapshot most recently added to @seen */
- EBUG_ON(ancestor != seen->pos.snapshot);
- EBUG_ON(ancestor != darray_last(seen->ids));
-
- if (id == ancestor)
- return true;
-
- if (!bch2_snapshot_is_ancestor(c, id, ancestor))
- return false;
-
- /*
- * We know that @id is a descendant of @ancestor, we're checking if
- * we've seen a key that overwrote @ancestor - i.e. also a descendent of
-	 * @ancestor and with @id as a descendent.
- *
- * But we already know that we're scanning IDs between @id and @ancestor
- * numerically, since snapshot ID lists are kept sorted, so if we find
- * an id that's an ancestor of @id we're done:
- */
-
- for (i = seen->ids.nr - 2;
- i >= 0 && seen->ids.data[i] >= id;
- --i)
- if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
- return false;
-
- return true;
-}
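-
-/*
- * Worked example with assumed snapshot IDs (per the EBUG_ON above,
- * ancestors have larger IDs than their descendents): given a snapshot node
- * 4 with children 2 and 3, and seen->ids == { 3, 4 } at the current
- * position:
- *
- *	key_visible_in_snapshot(c, seen, 2, 4) -> true
- *		(3 is scanned, but 3 is not an ancestor of 2)
- *	key_visible_in_snapshot(c, seen, 3, 4) -> false
- *		(we saw a key in 3 itself, overwriting the key in 4)
- */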
-
-/**
- * ref_visible - given a key with snapshot id @src that points to a key with
- * snapshot id @dst, test whether there is some snapshot in which @dst is
- * visible.
- *
- * @c: filesystem handle
- * @s: list of snapshot IDs already seen at @src
- * @src: snapshot ID of src key
- * @dst: snapshot ID of dst key
- * Returns: true if there is some snapshot in which @dst is visible
- *
- * Assumes we're visiting @src keys in natural key order
- */
-static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
- u32 src, u32 dst)
-{
- return dst <= src
- ? key_visible_in_snapshot(c, s, dst, src)
- : bch2_snapshot_is_ancestor(c, src, dst);
-}
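-
-/*
- * E.g., with assumed IDs as above: a key in snapshot 3 that points at a key
- * in ancestor snapshot 5 is visible iff 3 actually descends from 5; a key
- * in snapshot 5 pointing at a key in descendant snapshot 3 is visible iff
- * the snapshot-5 key is itself visible in 3, i.e. it wasn't overwritten
- * between 5 and 3 (key_visible_in_snapshot()).
- */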
-
-static int ref_visible2(struct bch_fs *c,
- u32 src, struct snapshots_seen *src_seen,
- u32 dst, struct snapshots_seen *dst_seen)
-{
- if (dst > src) {
- swap(dst, src);
- swap(dst_seen, src_seen);
- }
- return key_visible_in_snapshot(c, src_seen, dst, src);
-}
-
-#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \
- for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \
- (_i)->snapshot <= (_snapshot); _i++) \
- if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
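-
-/*
- * Usage sketch (hypothetical variables): visit each inode version in @_w in
- * which a key at snapshot @_snapshot is visible, e.g. bumping per-inode
- * counts while walking dirents:
- *
- *	struct inode_walker_entry *i;
- *
- *	for_each_visible_inode(c, s, w, k.k->p.snapshot, i)
- *		i->count++;
- */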
-
-struct inode_walker_entry {
- struct bch_inode_unpacked inode;
- u32 snapshot;
- u64 count;
- u64 i_size;
-};
-
-struct inode_walker {
- bool first_this_inode;
- bool have_inodes;
- bool recalculate_sums;
- struct bpos last_pos;
-
- DARRAY(struct inode_walker_entry) inodes;
- snapshot_id_list deletes;
-};
-
-static void inode_walker_exit(struct inode_walker *w)
-{
- darray_exit(&w->inodes);
- darray_exit(&w->deletes);
-}
-
-static struct inode_walker inode_walker_init(void)
-{
- return (struct inode_walker) { 0, };
-}
-
-static int add_inode(struct bch_fs *c, struct inode_walker *w,
- struct bkey_s_c inode)
-{
- struct bch_inode_unpacked u;
-
- return bch2_inode_unpack(inode, &u) ?:
- darray_push(&w->inodes, ((struct inode_walker_entry) {
- .inode = u,
- .snapshot = inode.k->p.snapshot,
- }));
-}
-
-static int get_inodes_all_snapshots(struct btree_trans *trans,
- struct inode_walker *w, u64 inum)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- /*
- * We no longer have inodes for w->last_pos; clear this to avoid
- * screwing up check_i_sectors/check_subdir_count if we take a
- * transaction restart here:
- */
- w->have_inodes = false;
- w->recalculate_sums = false;
- w->inodes.nr = 0;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inum)
- break;
-
- if (bkey_is_inode(k.k))
- add_inode(c, w, k);
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- return ret;
-
- w->first_this_inode = true;
- w->have_inodes = true;
- return 0;
-}
-
-static struct inode_walker_entry *
-lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
-{
- bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
-
- struct inode_walker_entry *i;
- __darray_for_each(w->inodes, i)
- if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
- goto found;
-
- return NULL;
-found:
- BUG_ON(k.k->p.snapshot > i->snapshot);
-
- if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
- struct inode_walker_entry new = *i;
-
- new.snapshot = k.k->p.snapshot;
- new.count = 0;
- new.i_size = 0;
-
- struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, k);
-
- bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
- "unexpected because we should always update the inode when we update a key in that inode\n"
- "%s",
- w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
- printbuf_exit(&buf);
-
- while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
- --i;
-
- size_t pos = i - w->inodes.data;
- int ret = darray_insert_item(&w->inodes, pos, new);
- if (ret)
- return ERR_PTR(ret);
-
- i = w->inodes.data + pos;
- }
-
- return i;
-}
-
-static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
- struct inode_walker *w,
- struct bkey_s_c k)
-{
- if (w->last_pos.inode != k.k->p.inode) {
- int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
- if (ret)
- return ERR_PTR(ret);
- }
-
- w->last_pos = k.k->p;
-
- return lookup_inode_for_snapshot(trans->c, w, k);
-}
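-
-/*
- * A note on the walker contract: walk_inode() is called once per key while
- * scanning a btree in key order; it refills w->inodes only when the key
- * crosses into a new inode number, then matches the key's snapshot against
- * the cached inode versions. A NULL return means "no inode for this key",
- * which callers pass to check_key_has_inode() to repair or delete the key.
- */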
-
-static int get_visible_inodes(struct btree_trans *trans,
- struct inode_walker *w,
- struct snapshots_seen *s,
- u64 inum)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- w->inodes.nr = 0;
- w->deletes.nr = 0;
-
- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inum)
- break;
-
- if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
- continue;
-
- if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot))
- continue;
-
- ret = bkey_is_inode(k.k)
- ? add_inode(c, w, k)
- : snapshot_list_add(c, &w->deletes, k.k->p.snapshot);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-/*
- * Prefer to delete the first one, since that will be the one at the wrong
- * offset:
- * return value: 0 -> delete k1, 1 -> delete k2
- */
-int bch2_fsck_update_backpointers(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct bkey_i *new)
-{
- if (new->k.type != KEY_TYPE_dirent)
- return 0;
-
- struct bkey_i_dirent *d = bkey_i_to_dirent(new);
- struct inode_walker target = inode_walker_init();
- int ret = 0;
-
- if (d->v.d_type == DT_SUBVOL) {
- BUG();
- } else {
- ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
- if (ret)
- goto err;
-
- darray_for_each(target.inodes, i) {
- i->inode.bi_dir_offset = d->k.p.offset;
- ret = __bch2_fsck_write_inode(trans, &i->inode);
- if (ret)
- goto err;
- }
- }
-err:
- inode_walker_exit(&target);
- return ret;
-}
-
-static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode,
- u32 *snapshot)
-{
- if (inode->bi_subvol) {
- u64 inum;
- int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
- if (ret)
- return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
- }
-
- return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
-}
-
-static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
- int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int check_inode_dirent_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- bool *write_inode)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
-
- u32 inode_snapshot = inode->bi_snapshot;
- struct btree_iter dirent_iter = {};
- struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
- int ret = bkey_err(d);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if (fsck_err_on(ret,
- trans, inode_points_to_missing_dirent,
- "inode points to missing dirent\n%s",
- (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
- fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode),
- trans, inode_points_to_wrong_dirent,
- "%s",
- (printbuf_reset(&buf),
- dirent_inode_mismatch_msg(&buf, c, d, inode),
- buf.buf))) {
- /*
- * We just clear the backpointer fields for now. If we find a
- * dirent that points to this inode in check_dirents(), we'll
- * update it then; then when we get to check_path() if the
- * backpointer is still 0 we'll reattach it.
- */
- inode->bi_dir = 0;
- inode->bi_dir_offset = 0;
- *write_inode = true;
- }
-
- ret = 0;
-fsck_err:
- bch2_trans_iter_exit(trans, &dirent_iter);
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int get_snapshot_root_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *root,
- u64 inum)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
- SPOS(0, inum, U32_MAX),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inum)
- break;
- if (bkey_is_inode(k.k))
- goto found_root;
- }
- if (ret)
- goto err;
- BUG();
-found_root:
- ret = bch2_inode_unpack(k, root);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int check_inode(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- struct bch_inode_unpacked *snapshot_root,
- struct snapshots_seen *s)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- struct bch_inode_unpacked u;
- bool do_update = false;
- int ret;
-
- ret = bch2_check_key_has_snapshot(trans, iter, k);
- if (ret < 0)
- goto err;
- if (ret)
- return 0;
-
- ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
- if (ret)
- goto err;
-
- if (!bkey_is_inode(k.k))
- return 0;
-
- ret = bch2_inode_unpack(k, &u);
- if (ret)
- goto err;
-
- if (snapshot_root->bi_inum != u.bi_inum) {
- ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum);
- if (ret)
- goto err;
- }
-
- if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed ||
- INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root),
- trans, inode_snapshot_mismatch,
- "inode hash info in different snapshots don't match")) {
- u.bi_hash_seed = snapshot_root->bi_hash_seed;
- SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root));
- do_update = true;
- }
-
- if (u.bi_dir || u.bi_dir_offset) {
- ret = check_inode_dirent_inode(trans, &u, &do_update);
- if (ret)
- goto err;
- }
-
- if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked),
- trans, inode_unlinked_but_has_dirent,
- "inode unlinked but has dirent\n%s",
- (printbuf_reset(&buf),
- bch2_inode_unpacked_to_text(&buf, &u),
- buf.buf))) {
- u.bi_flags &= ~BCH_INODE_unlinked;
- do_update = true;
- }
-
- if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) {
- /* Check for this early so that check_unreachable_inode() will reattach it */
-
- ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot);
- if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty)
- goto err;
-
- fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty,
- "dir unlinked but not empty\n%s",
- (printbuf_reset(&buf),
- bch2_inode_unpacked_to_text(&buf, &u),
- buf.buf));
- u.bi_flags &= ~BCH_INODE_unlinked;
- do_update = true;
- ret = 0;
- }
-
- ret = bch2_inode_has_child_snapshots(trans, k.k->p);
- if (ret < 0)
- goto err;
-
- if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot),
- trans, inode_has_child_snapshots_wrong,
- "inode has_child_snapshots flag wrong (should be %u)\n%s",
- ret,
- (printbuf_reset(&buf),
- bch2_inode_unpacked_to_text(&buf, &u),
- buf.buf))) {
- if (ret)
- u.bi_flags |= BCH_INODE_has_child_snapshot;
- else
- u.bi_flags &= ~BCH_INODE_has_child_snapshot;
- do_update = true;
- }
- ret = 0;
-
- if ((u.bi_flags & BCH_INODE_unlinked) &&
- !(u.bi_flags & BCH_INODE_has_child_snapshot)) {
- if (!test_bit(BCH_FS_started, &c->flags)) {
- /*
- * If we're not in online fsck, don't delete unlinked
- * inodes, just make sure they're on the deleted list.
- *
- * They might be referred to by a logged operation -
- * i.e. we might have crashed in the middle of a
- * truncate on an unlinked but open file - so we want to
- * let the delete_dead_inodes kill it after resuming
- * logged ops.
- */
- ret = check_inode_deleted_list(trans, k.k->p);
- if (ret < 0)
- goto err_noprint;
-
- fsck_err_on(!ret,
- trans, unlinked_inode_not_on_deleted_list,
- "inode %llu:%u unlinked, but not on deleted list",
- u.bi_inum, k.k->p.snapshot);
-
- ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1);
- if (ret)
- goto err;
- } else {
- ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
- if (ret < 0)
- goto err;
-
- if (fsck_err_on(!ret,
- trans, inode_unlinked_and_not_open,
- "inode %llu:%u unlinked and not open",
- u.bi_inum, u.bi_snapshot)) {
- ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
- bch_err_msg(c, ret, "in fsck deleting inode");
- goto err_noprint;
- }
- ret = 0;
- }
- }
-
- if (fsck_err_on(u.bi_parent_subvol &&
- (u.bi_subvol == 0 ||
- u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
- trans, inode_bi_parent_nonzero,
- "inode %llu:%u has subvol %u but nonzero parent subvol %u",
- u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
- u.bi_parent_subvol = 0;
- do_update = true;
- }
-
- if (u.bi_subvol) {
- struct bch_subvolume s;
-
- ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
- ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
- goto do_update;
- }
-
- if (fsck_err_on(ret,
- trans, inode_bi_subvol_missing,
- "inode %llu:%u bi_subvol points to missing subvolume %u",
- u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
- fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
- !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
- k.k->p.snapshot),
- trans, inode_bi_subvol_wrong,
- "inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
- u.bi_inum, k.k->p.snapshot, u.bi_subvol,
- le64_to_cpu(s.inode),
- le32_to_cpu(s.snapshot))) {
- u.bi_subvol = 0;
- u.bi_parent_subvol = 0;
- do_update = true;
- }
- }
-
- if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal),
- trans, inode_journal_seq_in_future,
- "inode journal seq in future (currently at %llu)\n%s",
- journal_cur_seq(&c->journal),
- (printbuf_reset(&buf),
- bch2_inode_unpacked_to_text(&buf, &u),
- buf.buf))) {
- u.bi_journal_seq = journal_cur_seq(&c->journal);
- do_update = true;
- }
-do_update:
- if (do_update) {
- ret = __bch2_fsck_write_inode(trans, &u);
- bch_err_msg(c, ret, "in fsck updating inode");
- if (ret)
- goto err_noprint;
- }
-err:
-fsck_err:
- bch_err_fn(c, ret);
-err_noprint:
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_check_inodes(struct bch_fs *c)
-{
- struct bch_inode_unpacked snapshot_root = {};
- struct snapshots_seen s;
-
- snapshots_seen_init(&s);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
- POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_inode(trans, &iter, k, &snapshot_root, &s)));
-
- snapshots_seen_exit(&s);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
- struct bch_inode_unpacked *inode)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- /*
- * We look for inodes to reattach in natural key order, leaves first,
- * but we should do the reattach at the oldest version that needs to be
- * reattached:
- */
- for_each_btree_key_norestart(trans, iter,
- BTREE_ID_inodes,
- SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inode->bi_inum)
- break;
-
- if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
- continue;
-
- if (!bkey_is_inode(k.k))
- break;
-
- struct bch_inode_unpacked parent_inode;
- ret = bch2_inode_unpack(k, &parent_inode);
- if (ret)
- break;
-
- if (!inode_should_reattach(&parent_inode))
- break;
-
- *inode = parent_inode;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-static int check_unreachable_inode(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- if (!bkey_is_inode(k.k))
- return 0;
-
- struct bch_inode_unpacked inode;
- ret = bch2_inode_unpack(k, &inode);
- if (ret)
- return ret;
-
- if (!inode_should_reattach(&inode))
- return 0;
-
- ret = find_oldest_inode_needs_reattach(trans, &inode);
- if (ret)
- return ret;
-
- if (fsck_err(trans, inode_unreachable,
- "unreachable inode:\n%s",
- (bch2_inode_unpacked_to_text(&buf, &inode),
- buf.buf)))
- ret = reattach_inode(trans, &inode);
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-/*
- * Reattach unreachable (but not unlinked) inodes
- *
- * Run after check_inodes() and check_dirents(), so we know that inode
- * backpointer fields point to valid dirents, and every inode that has a dirent
- * that points to it has its backpointer field set - so we're just looking for
- * non-unlinked inodes without backpointers:
- *
- * XXX: this is racy w.r.t. hardlink removal in online fsck
- */
-int bch2_check_unreachable_inodes(struct bch_fs *c)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
- POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_unreachable_inode(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
-{
- switch (btree) {
- case BTREE_ID_extents:
- return S_ISREG(mode) || S_ISLNK(mode);
- case BTREE_ID_dirents:
- return S_ISDIR(mode);
- case BTREE_ID_xattrs:
- return true;
- default:
- BUG();
- }
-}
-
-static int check_key_has_inode(struct btree_trans *trans,
- struct btree_iter *iter,
- struct inode_walker *inode,
- struct inode_walker_entry *i,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = PTR_ERR_OR_ZERO(i);
- if (ret)
- return ret;
-
- if (k.k->type == KEY_TYPE_whiteout)
- goto out;
-
- if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
- ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto err;
-
- inode->last_pos.inode--;
- ret = -BCH_ERR_transaction_restart_nested;
- goto err;
- }
-
- if (fsck_err_on(!i,
- trans, key_in_missing_inode,
- "key in missing inode:\n %s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- goto delete;
-
- if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
- trans, key_in_wrong_inode_type,
- "key for wrong inode mode %o:\n %s",
- i->inode.bi_mode,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- goto delete;
-out:
-err:
-fsck_err:
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-delete:
- ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
- goto out;
-}
-
-static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
- s64 count2;
-
- darray_for_each(w->inodes, i) {
- if (i->inode.bi_sectors == i->count)
- continue;
-
- count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
-
- if (w->recalculate_sums)
- i->count = count2;
-
- if (i->count != count2) {
- bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
- w->last_pos.inode, i->snapshot, i->count, count2);
- i->count = count2;
- }
-
- if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
- trans, inode_i_sectors_wrong,
- "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
- w->last_pos.inode, i->snapshot,
- i->inode.bi_sectors, i->count)) {
- i->inode.bi_sectors = i->count;
- ret = bch2_fsck_write_inode(trans, &i->inode);
- if (ret)
- break;
- }
- }
-fsck_err:
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
-{
- u32 restart_count = trans->restart_count;
- return check_i_sectors_notnested(trans, w) ?:
- trans_was_restarted(trans, restart_count);
-}
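-
-/*
- * A note on the idiom above: check_i_sectors_notnested() may commit and
- * restart the transaction internally; trans_was_restarted() converts a
- * changed restart_count into -BCH_ERR_transaction_restart_nested, so the
- * outer loop re-walks the current position with fresh iterators instead of
- * continuing on stale state.
- */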
-
-struct extent_end {
- u32 snapshot;
- u64 offset;
- struct snapshots_seen seen;
-};
-
-struct extent_ends {
- struct bpos last_pos;
- DARRAY(struct extent_end) e;
-};
-
-static void extent_ends_reset(struct extent_ends *extent_ends)
-{
- darray_for_each(extent_ends->e, i)
- snapshots_seen_exit(&i->seen);
- extent_ends->e.nr = 0;
-}
-
-static void extent_ends_exit(struct extent_ends *extent_ends)
-{
- extent_ends_reset(extent_ends);
- darray_exit(&extent_ends->e);
-}
-
-static void extent_ends_init(struct extent_ends *extent_ends)
-{
- memset(extent_ends, 0, sizeof(*extent_ends));
-}
-
-static int extent_ends_at(struct bch_fs *c,
- struct extent_ends *extent_ends,
- struct snapshots_seen *seen,
- struct bkey_s_c k)
-{
- struct extent_end *i, n = (struct extent_end) {
- .offset = k.k->p.offset,
- .snapshot = k.k->p.snapshot,
- .seen = *seen,
- };
-
- n.seen.ids.data = kmemdup(seen->ids.data,
- sizeof(seen->ids.data[0]) * seen->ids.size,
- GFP_KERNEL);
- if (!n.seen.ids.data)
- return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
-
- __darray_for_each(extent_ends->e, i) {
- if (i->snapshot == k.k->p.snapshot) {
- snapshots_seen_exit(&i->seen);
- *i = n;
- return 0;
- }
-
- if (i->snapshot >= k.k->p.snapshot)
- break;
- }
-
- return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
-}
-
-static int overlapping_extents_found(struct btree_trans *trans,
- enum btree_id btree,
- struct bpos pos1, struct snapshots_seen *pos1_seen,
- struct bkey pos2,
- bool *fixed,
- struct extent_end *extent_end)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- struct btree_iter iter1, iter2 = { NULL };
- struct bkey_s_c k1, k2;
- int ret;
-
- BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
-
- bch2_trans_iter_init(trans, &iter1, btree, pos1,
- BTREE_ITER_all_snapshots|
- BTREE_ITER_not_extents);
- k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX));
- ret = bkey_err(k1);
- if (ret)
- goto err;
-
- prt_str(&buf, "\n ");
- bch2_bkey_val_to_text(&buf, c, k1);
-
- if (!bpos_eq(pos1, k1.k->p)) {
- prt_str(&buf, "\n wanted\n ");
- bch2_bpos_to_text(&buf, pos1);
- prt_str(&buf, "\n ");
- bch2_bkey_to_text(&buf, &pos2);
-
- bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
- __func__, buf.buf);
- ret = -BCH_ERR_internal_fsck_err;
- goto err;
- }
-
- bch2_trans_copy_iter(&iter2, &iter1);
-
- while (1) {
- bch2_btree_iter_advance(&iter2);
-
- k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX));
- ret = bkey_err(k2);
- if (ret)
- goto err;
-
- if (bpos_ge(k2.k->p, pos2.p))
- break;
- }
-
- prt_str(&buf, "\n ");
- bch2_bkey_val_to_text(&buf, c, k2);
-
- if (bpos_gt(k2.k->p, pos2.p) ||
- pos2.size != k2.k->size) {
-		bch_err(c, "%s: error finding second overlapping extent when repairing%s",
- __func__, buf.buf);
- ret = -BCH_ERR_internal_fsck_err;
- goto err;
- }
-
- prt_printf(&buf, "\n overwriting %s extent",
- pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
-
- if (fsck_err(trans, extent_overlapping,
- "overlapping extents%s", buf.buf)) {
- struct btree_iter *old_iter = &iter1;
- struct disk_reservation res = { 0 };
-
- if (pos1.snapshot < pos2.p.snapshot) {
- old_iter = &iter2;
- swap(k1, k2);
- }
-
- trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
-
- ret = bch2_trans_update_extent_overwrite(trans, old_iter,
- BTREE_UPDATE_internal_snapshot_node,
- k1, k2) ?:
- bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
- bch2_disk_reservation_put(c, &res);
-
- if (ret)
- goto err;
-
- *fixed = true;
-
- if (pos1.snapshot == pos2.p.snapshot) {
- /*
- * We overwrote the first extent, and did the overwrite
- * in the same snapshot:
- */
- extent_end->offset = bkey_start_offset(&pos2);
- } else if (pos1.snapshot > pos2.p.snapshot) {
- /*
- * We overwrote the first extent in pos2's snapshot:
- */
- ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
- } else {
- /*
- * We overwrote the second extent - restart
- * check_extent() from the top:
- */
- ret = -BCH_ERR_transaction_restart_nested;
- }
- }
-fsck_err:
-err:
- bch2_trans_iter_exit(trans, &iter2);
- bch2_trans_iter_exit(trans, &iter1);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int check_overlapping_extents(struct btree_trans *trans,
- struct snapshots_seen *seen,
- struct extent_ends *extent_ends,
- struct bkey_s_c k,
- struct btree_iter *iter,
- bool *fixed)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- /* transaction restart, running again */
- if (bpos_eq(extent_ends->last_pos, k.k->p))
- return 0;
-
- if (extent_ends->last_pos.inode != k.k->p.inode)
- extent_ends_reset(extent_ends);
-
- darray_for_each(extent_ends->e, i) {
- if (i->offset <= bkey_start_offset(k.k))
- continue;
-
- if (!ref_visible2(c,
- k.k->p.snapshot, seen,
- i->snapshot, &i->seen))
- continue;
-
- ret = overlapping_extents_found(trans, iter->btree_id,
- SPOS(iter->pos.inode,
- i->offset,
- i->snapshot),
- &i->seen,
- *k.k, fixed, i);
- if (ret)
- goto err;
- }
-
- extent_ends->last_pos = k.k->p;
-err:
- return ret;
-}
-
-static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_extent_crc_unpacked crc;
- const union bch_extent_entry *i;
- unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
-
- bkey_for_each_crc(k.k, ptrs, crc, i)
- if (crc_is_encoded(crc) &&
- crc.uncompressed_size > encoded_extent_max_sectors) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf);
- printbuf_exit(&buf);
- }
-
- return 0;
-}
-
-static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k,
- struct inode_walker *inode,
- struct snapshots_seen *s,
- struct extent_ends *extent_ends,
- struct disk_reservation *res)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- ret = bch2_check_key_has_snapshot(trans, iter, k);
- if (ret) {
- ret = ret < 0 ? ret : 0;
- goto out;
- }
-
- if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) {
- ret = check_i_sectors(trans, inode);
- if (ret)
- goto err;
- }
-
- ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
- if (ret)
- goto err;
-
- struct inode_walker_entry *extent_i = walk_inode(trans, inode, k);
- ret = PTR_ERR_OR_ZERO(extent_i);
- if (ret)
- goto err;
-
- ret = check_key_has_inode(trans, iter, inode, extent_i, k);
- if (ret)
- goto err;
-
- if (k.k->type != KEY_TYPE_whiteout) {
- ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
- &inode->recalculate_sums);
- if (ret)
- goto err;
-
- /*
- * Check inodes in reverse order, from oldest snapshots to
- * newest, starting from the inode that matches this extent's
- * snapshot. If we didn't have one, iterate over all inodes:
- */
- for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
- inode->inodes.data && i >= inode->inodes.data;
- --i) {
- if (i->snapshot > k.k->p.snapshot ||
- !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
- continue;
-
- if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
- !bkey_extent_is_reservation(k),
- trans, extent_past_end_of_inode,
- "extent type past end of inode %llu:%u, i_size %llu\n %s",
- i->inode.bi_inum, i->snapshot, i->inode.bi_size,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- struct btree_iter iter2;
-
- bch2_trans_copy_iter(&iter2, iter);
- bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
- ret = bch2_btree_iter_traverse(&iter2) ?:
- bch2_btree_delete_at(trans, &iter2,
- BTREE_UPDATE_internal_snapshot_node);
- bch2_trans_iter_exit(trans, &iter2);
- if (ret)
- goto err;
-
- iter->k.type = KEY_TYPE_whiteout;
- break;
- }
- }
- }
-
- ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto err;
-
- if (bkey_extent_is_allocation(k.k)) {
- for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
- inode->inodes.data && i >= inode->inodes.data;
- --i) {
- if (i->snapshot > k.k->p.snapshot ||
- !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
- continue;
-
- i->count += k.k->size;
- }
- }
-
- if (k.k->type != KEY_TYPE_whiteout) {
- ret = extent_ends_at(c, extent_ends, s, k);
- if (ret)
- goto err;
- }
-out:
-err:
-fsck_err:
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-/*
- * Walk extents: verify that extents have a corresponding S_ISREG inode, and
- * that i_size and i_sectors are consistent
- */
-int bch2_check_extents(struct bch_fs *c)
-{
- struct inode_walker w = inode_walker_init();
- struct snapshots_seen s;
- struct extent_ends extent_ends;
- struct disk_reservation res = { 0 };
-
- snapshots_seen_init(&s);
- extent_ends_init(&extent_ends);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_extents,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
- bch2_disk_reservation_put(c, &res);
- check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?:
- check_extent_overbig(trans, &iter, k);
- })) ?:
- check_i_sectors_notnested(trans, &w));
-
- bch2_disk_reservation_put(c, &res);
- extent_ends_exit(&extent_ends);
- inode_walker_exit(&w);
- snapshots_seen_exit(&s);
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_check_indirect_extents(struct bch_fs *c)
-{
- struct disk_reservation res = { 0 };
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
- POS_MIN,
- BTREE_ITER_prefetch, k,
- &res, NULL,
- BCH_TRANS_COMMIT_no_enospc, ({
- bch2_disk_reservation_put(c, &res);
- check_extent_overbig(trans, &iter, k);
- })));
-
- bch2_disk_reservation_put(c, &res);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
- s64 count2;
-
- darray_for_each(w->inodes, i) {
- if (i->inode.bi_nlink == i->count)
- continue;
-
- count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
- if (count2 < 0)
- return count2;
-
- if (i->count != count2) {
- bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
- w->last_pos.inode, i->snapshot, i->count, count2);
- i->count = count2;
- if (i->inode.bi_nlink == i->count)
- continue;
- }
-
- if (fsck_err_on(i->inode.bi_nlink != i->count,
- trans, inode_dir_wrong_nlink,
- "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
- w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
- i->inode.bi_nlink = i->count;
- ret = bch2_fsck_write_inode(trans, &i->inode);
- if (ret)
- break;
- }
- }
-fsck_err:
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w)
-{
- u32 restart_count = trans->restart_count;
- return check_subdir_count_notnested(trans, w) ?:
- trans_was_restarted(trans, restart_count);
-}
-
-/* find a subvolume that's a descendent of @snapshot: */
-static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
- if (k.k->type != KEY_TYPE_subvolume)
- continue;
-
- struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
- if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
- bch2_trans_iter_exit(trans, &iter);
- *subvolid = k.k->p.offset;
- goto found;
- }
- }
- if (!ret)
- ret = -ENOENT;
-found:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-noinline_for_stack
-static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c_dirent d)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter subvol_iter = {};
- struct bch_inode_unpacked subvol_root;
- u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
- u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
- u32 parent_snapshot;
- u32 new_parent_subvol = 0;
- u64 parent_inum;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
-	if (ret ||
-	    !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot)) {
-		int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
-		if (ret2 && !bch2_err_matches(ret2, ENOENT))
- return ret2;
- }
-
- if (ret &&
- !new_parent_subvol &&
- (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
- /*
- * Couldn't find a subvol for dirent's snapshot - but we lost
- * subvols, so we need to reconstruct:
- */
- ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
- if (ret)
- return ret;
-
- parent_snapshot = d.k->p.snapshot;
- }
-
- if (fsck_err_on(ret,
- trans, dirent_to_missing_parent_subvol,
- "dirent parent_subvol points to missing subvolume\n%s",
- (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
- fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
- trans, dirent_not_visible_in_parent_subvol,
- "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
- parent_snapshot,
- (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
- if (!new_parent_subvol) {
- bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
- return -BCH_ERR_fsck_repair_unimplemented;
- }
-
- struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
- ret = PTR_ERR_OR_ZERO(new_dirent);
- if (ret)
- goto err;
-
- new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
- }
-
- struct bkey_s_c_subvolume s =
- bch2_bkey_get_iter_typed(trans, &subvol_iter,
- BTREE_ID_subvolumes, POS(0, target_subvol),
- 0, subvolume);
- ret = bkey_err(s.s_c);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if (ret) {
- if (fsck_err(trans, dirent_to_missing_subvol,
- "dirent points to missing subvolume\n%s",
- (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
- return bch2_fsck_remove_dirent(trans, d.k->p);
- ret = 0;
- goto out;
- }
-
- if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
- trans, subvol_fs_path_parent_wrong,
-			"subvol with wrong fs_path_parent, should be %u\n%s",
- parent_subvol,
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- struct bkey_i_subvolume *n =
- bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto err;
-
- n->v.fs_path_parent = cpu_to_le32(parent_subvol);
- }
-
- u64 target_inum = le64_to_cpu(s.v->inode);
- u32 target_snapshot = le32_to_cpu(s.v->snapshot);
-
- ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (ret) {
- bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
- ret = -BCH_ERR_fsck_repair_unimplemented;
- goto err;
- }
-
-	if (fsck_err_on(parent_subvol != subvol_root.bi_parent_subvol,
- trans, inode_bi_parent_wrong,
- "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
- target_inum,
- subvol_root.bi_parent_subvol, parent_subvol)) {
- subvol_root.bi_parent_subvol = parent_subvol;
- subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot);
- ret = __bch2_fsck_write_inode(trans, &subvol_root);
- if (ret)
- goto err;
- }
-
- ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true);
- if (ret)
- goto err;
-out:
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &subvol_iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k,
- struct bch_hash_info *hash_info,
- struct inode_walker *dir,
- struct inode_walker *target,
- struct snapshots_seen *s)
-{
- struct bch_fs *c = trans->c;
- struct inode_walker_entry *i;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- ret = bch2_check_key_has_snapshot(trans, iter, k);
- if (ret) {
- ret = ret < 0 ? ret : 0;
- goto out;
- }
-
- ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
- if (ret)
- goto err;
-
- if (k.k->type == KEY_TYPE_whiteout)
- goto out;
-
- if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
- ret = check_subdir_dirents_count(trans, dir);
- if (ret)
- goto err;
- }
-
- i = walk_inode(trans, dir, k);
- ret = PTR_ERR_OR_ZERO(i);
- if (ret < 0)
- goto err;
-
- ret = check_key_has_inode(trans, iter, dir, i, k);
- if (ret)
- goto err;
-
- if (!i)
- goto out;
-
- if (dir->first_this_inode)
- *hash_info = bch2_hash_info_init(c, &i->inode);
- dir->first_this_inode = false;
-
- ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k);
- if (ret < 0)
- goto err;
- if (ret) {
- /* dirent has been deleted */
- ret = 0;
- goto out;
- }
-
- if (k.k->type != KEY_TYPE_dirent)
- goto out;
-
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-
- if (d.v->d_type == DT_SUBVOL) {
- ret = check_dirent_to_subvol(trans, iter, d);
- if (ret)
- goto err;
- } else {
- ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
- if (ret)
- goto err;
-
- if (fsck_err_on(!target->inodes.nr,
- trans, dirent_to_missing_inode,
- "dirent points to missing inode:\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k),
- buf.buf))) {
- ret = bch2_fsck_remove_dirent(trans, d.k->p);
- if (ret)
- goto err;
- }
-
- darray_for_each(target->inodes, i) {
- ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true);
- if (ret)
- goto err;
- }
-
- darray_for_each(target->deletes, i)
- if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i),
- trans, dirent_to_overwritten_inode,
- "dirent points to inode overwritten in snapshot %u:\n%s",
- *i,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k),
- buf.buf))) {
- struct btree_iter delete_iter;
- bch2_trans_iter_init(trans, &delete_iter,
- BTREE_ID_dirents,
- SPOS(k.k->p.inode, k.k->p.offset, *i),
- BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(&delete_iter) ?:
- bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- hash_info,
- &delete_iter,
- BTREE_UPDATE_internal_snapshot_node);
- bch2_trans_iter_exit(trans, &delete_iter);
- if (ret)
- goto err;
- }
- }
-
- ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto err;
-
- for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) {
- if (d.v->d_type == DT_DIR)
- i->count++;
- i->i_size += bkey_bytes(d.k);
- }
-out:
-err:
-fsck_err:
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-/*
- * Walk dirents: verify that each one points to a valid inode, and that its
- * d_type matches the mode of the inode it points to
- */
-int bch2_check_dirents(struct bch_fs *c)
-{
- struct inode_walker dir = inode_walker_init();
- struct inode_walker target = inode_walker_init();
- struct snapshots_seen s;
- struct bch_hash_info hash_info;
-
- snapshots_seen_init(&s);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_dirents,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
- check_subdir_count_notnested(trans, &dir));
-
- snapshots_seen_exit(&s);
- inode_walker_exit(&dir);
- inode_walker_exit(&target);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k,
- struct bch_hash_info *hash_info,
- struct inode_walker *inode)
-{
- struct bch_fs *c = trans->c;
- struct inode_walker_entry *i;
- int ret;
-
- ret = bch2_check_key_has_snapshot(trans, iter, k);
- if (ret < 0)
- return ret;
- if (ret)
- return 0;
-
- i = walk_inode(trans, inode, k);
- ret = PTR_ERR_OR_ZERO(i);
- if (ret)
- return ret;
-
- ret = check_key_has_inode(trans, iter, inode, i, k);
- if (ret)
- return ret;
-
- if (!i)
- return 0;
-
- if (inode->first_this_inode)
- *hash_info = bch2_hash_info_init(c, &i->inode);
- inode->first_this_inode = false;
-
- ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k);
- bch_err_fn(c, ret);
- return ret;
-}
-
-/*
- * Walk xattrs: verify that they all have a corresponding inode
- */
-int bch2_check_xattrs(struct bch_fs *c)
-{
- struct inode_walker inode = inode_walker_init();
- struct bch_hash_info hash_info;
- int ret = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
- k,
- NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- check_xattr(trans, &iter, k, &hash_info, &inode)));
-
- inode_walker_exit(&inode);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_root_trans(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct bch_inode_unpacked root_inode;
- u32 snapshot;
- u64 inum;
- int ret;
-
- ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
- "root subvol missing")) {
- struct bkey_i_subvolume *root_subvol =
- bch2_trans_kmalloc(trans, sizeof(*root_subvol));
- ret = PTR_ERR_OR_ZERO(root_subvol);
- if (ret)
- goto err;
-
- snapshot = U32_MAX;
- inum = BCACHEFS_ROOT_INO;
-
- bkey_subvolume_init(&root_subvol->k_i);
- root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
- root_subvol->v.flags = 0;
- root_subvol->v.snapshot = cpu_to_le32(snapshot);
- root_subvol->v.inode = cpu_to_le64(inum);
- ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
- bch_err_msg(c, ret, "writing root subvol");
- if (ret)
- goto err;
- }
-
- ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if (mustfix_fsck_err_on(ret,
- trans, root_dir_missing,
- "root directory missing") ||
- mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
- trans, root_inode_not_dir,
- "root inode not a directory")) {
- bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
- 0, NULL);
- root_inode.bi_inum = inum;
- root_inode.bi_snapshot = snapshot;
-
- ret = __bch2_fsck_write_inode(trans, &root_inode);
- bch_err_msg(c, ret, "writing root inode");
- }
-err:
-fsck_err:
- return ret;
-}
-
-/* Get root directory, create if it doesn't exist: */
-int bch2_check_root(struct bch_fs *c)
-{
- int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_root_trans(trans));
- bch_err_fn(c, ret);
- return ret;
-}
-
-typedef DARRAY(u32) darray_u32;
-
-static bool darray_u32_has(darray_u32 *d, u32 v)
-{
- darray_for_each(*d, i)
- if (*i == v)
- return true;
- return false;
-}
-
-static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter parent_iter = {};
- darray_u32 subvol_path = {};
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- if (k.k->type != KEY_TYPE_subvolume)
- return 0;
-
- while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
- ret = darray_push(&subvol_path, k.k->p.offset);
- if (ret)
- goto err;
-
- struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
-
- struct bch_inode_unpacked subvol_root;
- ret = bch2_inode_find_by_inum_trans(trans,
- (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
- &subvol_root);
- if (ret)
- break;
-
- u32 parent = le32_to_cpu(s.v->fs_path_parent);
-
- if (darray_u32_has(&subvol_path, parent)) {
-			if (fsck_err(trans, subvol_loop, "subvolume loop"))
- ret = reattach_subvol(trans, s);
- break;
- }
-
- bch2_trans_iter_exit(trans, &parent_iter);
- bch2_trans_iter_init(trans, &parent_iter,
- BTREE_ID_subvolumes, POS(0, parent), 0);
- k = bch2_btree_iter_peek_slot(&parent_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
- trans, subvol_unreachable,
- "unreachable subvolume %s",
- (bch2_bkey_val_to_text(&buf, c, s.s_c),
- buf.buf))) {
- ret = reattach_subvol(trans, s);
- break;
- }
- }
-fsck_err:
-err:
- printbuf_exit(&buf);
- darray_exit(&subvol_path);
- bch2_trans_iter_exit(trans, &parent_iter);
- return ret;
-}
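-
-/*
- * The walk above climbs fs_path_parent links toward the root subvolume,
- * recording every subvolume id it visits in subvol_path; seeing an id a
- * second time means the parent links form a cycle.  A standalone sketch of
- * the same idea (the parent[] table and fixed bound are hypothetical):
- */
-static bool sketch_subvol_has_loop(const u32 *parent, u32 id, u32 root)
-{
-	u32 visited[32];
-	unsigned nr = 0;
-
-	while (id != root && nr < ARRAY_SIZE(visited)) {
-		for (unsigned i = 0; i < nr; i++)
-			if (visited[i] == id)
-				return true;	/* already seen: loop */
-		visited[nr++] = id;
-		id = parent[id];
-	}
-	return false;
-}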
-
-int bch2_check_subvolume_structure(struct bch_fs *c)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_subvol_path(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-struct pathbuf_entry {
- u64 inum;
- u32 snapshot;
-};
-
-typedef DARRAY(struct pathbuf_entry) pathbuf;
-
-static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p,
- u32 new_depth)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, p->inum, p->snapshot), 0);
-
- struct bch_inode_unpacked inode;
- int ret = bkey_err(k) ?:
- !bkey_is_inode(k.k) ? -BCH_ERR_ENOENT_inode
- : bch2_inode_unpack(k, &inode);
- if (ret)
- goto err;
-
- if (inode.bi_depth != new_depth) {
- inode.bi_depth = new_depth;
- ret = __bch2_fsck_write_inode(trans, &inode) ?:
- bch2_trans_commit(trans, NULL, NULL, 0);
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth)
-{
- u32 restart_count = trans->restart_count;
- int ret = 0;
-
- darray_for_each_reverse(*path, i) {
- ret = nested_lockrestart_do(trans,
- bch2_bi_depth_renumber_one(trans, i, new_bi_depth));
- bch_err_fn(trans->c, ret);
- if (ret)
- break;
-
- new_bi_depth++;
- }
-
- return ret ?: trans_was_restarted(trans, restart_count);
-}
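-
-/*
- * Worked example for the renumber above: path entries are pushed leaf
- * first, so darray_for_each_reverse() starts at the entry nearest the
- * root.  With new_bi_depth == 3 and a path of [c, b, a], a closest to the
- * root, a is assigned depth 3, b depth 4 and c depth 5: bi_depth grows by
- * one per level away from the root.
- */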
-
-static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
-{
- darray_for_each(*p, i)
- if (i->inum == inum &&
- i->snapshot == snapshot)
- return true;
- return false;
-}
-
-static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter inode_iter = {};
- pathbuf path = {};
- struct printbuf buf = PRINTBUF;
- u32 snapshot = inode_k.k->p.snapshot;
- bool redo_bi_depth = false;
- u32 min_bi_depth = U32_MAX;
- int ret = 0;
-
- struct bch_inode_unpacked inode;
- ret = bch2_inode_unpack(inode_k, &inode);
- if (ret)
- return ret;
-
- while (!inode.bi_subvol) {
- struct btree_iter dirent_iter;
- struct bkey_s_c_dirent d;
- u32 parent_snapshot = snapshot;
-
- d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
- ret = bkey_err(d.s_c);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto out;
-
- if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
- bch2_trans_iter_exit(trans, &dirent_iter);
-
- if (bch2_err_matches(ret, ENOENT)) {
- printbuf_reset(&buf);
- bch2_bkey_val_to_text(&buf, c, inode_k);
- bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
- bch2_err_str(ret), buf.buf);
- goto out;
- }
-
- bch2_trans_iter_exit(trans, &dirent_iter);
-
- ret = darray_push(&path, ((struct pathbuf_entry) {
- .inum = inode.bi_inum,
- .snapshot = snapshot,
- }));
- if (ret)
- return ret;
-
- snapshot = parent_snapshot;
-
- bch2_trans_iter_exit(trans, &inode_iter);
- inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
- SPOS(0, inode.bi_dir, snapshot), 0);
-
- struct bch_inode_unpacked parent_inode;
- ret = bkey_err(inode_k) ?:
- !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
- : bch2_inode_unpack(inode_k, &parent_inode);
- if (ret) {
- /* Should have been caught in dirents pass */
- bch_err_msg(c, ret, "error looking up parent directory");
- goto out;
- }
-
- min_bi_depth = parent_inode.bi_depth;
-
- if (parent_inode.bi_depth < inode.bi_depth &&
- min_bi_depth < U16_MAX)
- break;
-
- inode = parent_inode;
- snapshot = inode_k.k->p.snapshot;
- redo_bi_depth = true;
-
- if (path_is_dup(&path, inode.bi_inum, snapshot)) {
- /* XXX print path */
- bch_err(c, "directory structure loop");
-
- darray_for_each(path, i)
- pr_err("%llu:%u", i->inum, i->snapshot);
- pr_err("%llu:%u", inode.bi_inum, snapshot);
-
- if (fsck_err(trans, dir_loop, "directory structure loop")) {
- ret = remove_backpointer(trans, &inode);
- bch_err_msg(c, ret, "removing dirent");
- if (ret)
- break;
-
- ret = reattach_inode(trans, &inode);
- bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
- }
-
- goto out;
- }
- }
-
- if (inode.bi_subvol)
- min_bi_depth = 0;
-
- if (redo_bi_depth)
- ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth);
-out:
-fsck_err:
- bch2_trans_iter_exit(trans, &inode_iter);
- darray_exit(&path);
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-/*
- * Check for loops in the directory structure: all other connectivity issues
- * have been fixed by prior passes
- */
-int bch2_check_directory_structure(struct bch_fs *c)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_intent|
- BTREE_ITER_prefetch|
- BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- if (!S_ISDIR(bkey_inode_mode(k)))
- continue;
-
- if (bch2_inode_flags(k) & BCH_INODE_unlinked)
- continue;
-
- check_path_loop(trans, k);
- })));
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-struct nlink_table {
- size_t nr;
- size_t size;
-
- struct nlink {
- u64 inum;
- u32 snapshot;
- u32 count;
- } *d;
-};
-
-static int add_nlink(struct bch_fs *c, struct nlink_table *t,
- u64 inum, u32 snapshot)
-{
- if (t->nr == t->size) {
- size_t new_size = max_t(size_t, 128UL, t->size * 2);
- void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
-
- if (!d) {
- bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
- new_size);
- return -BCH_ERR_ENOMEM_fsck_add_nlink;
- }
-
- if (t->d)
- memcpy(d, t->d, t->size * sizeof(t->d[0]));
- kvfree(t->d);
-
- t->d = d;
- t->size = new_size;
- }
-
- t->d[t->nr++] = (struct nlink) {
- .inum = inum,
- .snapshot = snapshot,
- };
-
- return 0;
-}
-
-static int nlink_cmp(const void *_l, const void *_r)
-{
- const struct nlink *l = _l;
- const struct nlink *r = _r;
-
- return cmp_int(l->inum, r->inum);
-}
-
-static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
- struct nlink_table *links,
- u64 range_start, u64 range_end, u64 inum, u32 snapshot)
-{
- struct nlink *link, key = {
- .inum = inum, .snapshot = U32_MAX,
- };
-
- if (inum < range_start || inum >= range_end)
- return;
-
- link = __inline_bsearch(&key, links->d, links->nr,
- sizeof(links->d[0]), nlink_cmp);
- if (!link)
- return;
-
- while (link > links->d && link[0].inum == link[-1].inum)
- --link;
-
- for (; link < links->d + links->nr && link->inum == inum; link++)
- if (ref_visible(c, s, snapshot, link->snapshot)) {
- link->count++;
- if (link->snapshot >= snapshot)
- break;
- }
-}
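-
-/*
- * Note the rewind loop in inc_link(): __inline_bsearch() may land on any
- * entry with a matching inum when several snapshots share one, so the
- * cursor is walked back to the first such entry before counting.  A
- * standalone sketch of the idiom over a plain sorted array:
- */
-static const int *sketch_first_equal(const int *found, const int *base)
-{
-	while (found > base && found[0] == found[-1])
-		--found;
-	return found;
-}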
-
-noinline_for_stack
-static int check_nlinks_find_hardlinks(struct bch_fs *c,
- struct nlink_table *t,
- u64 start, u64 *end)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_inodes,
- POS(0, start),
- BTREE_ITER_intent|
- BTREE_ITER_prefetch|
- BTREE_ITER_all_snapshots, k, ({
- if (!bkey_is_inode(k.k))
- continue;
-
-			/* Should never fail, checked by bch2_inode_validate(): */
- struct bch_inode_unpacked u;
- _ret3 = bch2_inode_unpack(k, &u);
- if (_ret3)
- break;
-
- /*
- * Backpointer and directory structure checks are sufficient for
- * directories, since they can't have hardlinks:
- */
- if (S_ISDIR(u.bi_mode))
- continue;
-
- /*
- * Previous passes ensured that bi_nlink is nonzero if
- * it had multiple hardlinks:
- */
- if (!u.bi_nlink)
- continue;
-
- ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
- if (ret) {
- *end = k.k->p.offset;
- ret = 0;
- break;
- }
- 0;
- })));
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-noinline_for_stack
-static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
- u64 range_start, u64 range_end)
-{
- struct snapshots_seen s;
-
- snapshots_seen_init(&s);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
- BTREE_ITER_intent|
- BTREE_ITER_prefetch|
- BTREE_ITER_all_snapshots, k, ({
- ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
- if (ret)
- break;
-
- if (k.k->type == KEY_TYPE_dirent) {
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-
- if (d.v->d_type != DT_DIR &&
- d.v->d_type != DT_SUBVOL)
- inc_link(c, &s, links, range_start, range_end,
- le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
- }
- 0;
- })));
-
- snapshots_seen_exit(&s);
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k,
- struct nlink_table *links,
- size_t *idx, u64 range_end)
-{
- struct bch_inode_unpacked u;
- struct nlink *link = &links->d[*idx];
- int ret = 0;
-
- if (k.k->p.offset >= range_end)
- return 1;
-
- if (!bkey_is_inode(k.k))
- return 0;
-
- ret = bch2_inode_unpack(k, &u);
- if (ret)
- return ret;
-
- if (S_ISDIR(u.bi_mode))
- return 0;
-
- if (!u.bi_nlink)
- return 0;
-
- while ((cmp_int(link->inum, k.k->p.offset) ?:
- cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
- BUG_ON(*idx == links->nr);
- link = &links->d[++*idx];
- }
-
- if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
- trans, inode_wrong_nlink,
- "inode %llu type %s has wrong i_nlink (%u, should be %u)",
- u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
- bch2_inode_nlink_get(&u), link->count)) {
- bch2_inode_nlink_set(&u, link->count);
- ret = __bch2_fsck_write_inode(trans, &u);
- }
-fsck_err:
- return ret;
-}
-
-noinline_for_stack
-static int check_nlinks_update_hardlinks(struct bch_fs *c,
- struct nlink_table *links,
- u64 range_start, u64 range_end)
-{
- size_t idx = 0;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
- POS(0, range_start),
- BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
- if (ret < 0) {
- bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
- return ret;
- }
-
- return 0;
-}
-
-int bch2_check_nlinks(struct bch_fs *c)
-{
- struct nlink_table links = { 0 };
- u64 this_iter_range_start, next_iter_range_start = 0;
- int ret = 0;
-
- do {
- this_iter_range_start = next_iter_range_start;
- next_iter_range_start = U64_MAX;
-
- ret = check_nlinks_find_hardlinks(c, &links,
- this_iter_range_start,
-						  &next_iter_range_start);
-		if (ret)
-			break;
-
- ret = check_nlinks_walk_dirents(c, &links,
- this_iter_range_start,
- next_iter_range_start);
- if (ret)
- break;
-
- ret = check_nlinks_update_hardlinks(c, &links,
- this_iter_range_start,
- next_iter_range_start);
- if (ret)
- break;
-
- links.nr = 0;
- } while (next_iter_range_start != U64_MAX);
-
- kvfree(links.d);
- bch_err_fn(c, ret);
- return ret;
-}
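-
-/*
- * The loop above bounds fsck's memory use: when add_nlink() fails to grow
- * the table, check_nlinks_find_hardlinks() records how far it got in
- * next_iter_range_start, so each pass only fixes inodes in
- * [this_iter_range_start, next_iter_range_start) and the table is reset
- * and refilled until the whole inode space has been covered.
- */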
-
-static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bkey_s_c_reflink_p p;
- struct bkey_i_reflink_p *u;
-
- if (k.k->type != KEY_TYPE_reflink_p)
- return 0;
-
- p = bkey_s_c_to_reflink_p(k);
-
- if (!p.v->front_pad && !p.v->back_pad)
- return 0;
-
- u = bch2_trans_kmalloc(trans, sizeof(*u));
- int ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- return ret;
-
- bkey_reassemble(&u->k_i, k);
- u->v.front_pad = 0;
- u->v.back_pad = 0;
-
- return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
-}
-
-int bch2_fix_reflink_p(struct bch_fs *c)
-{
- if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
- return 0;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_extents, POS_MIN,
- BTREE_ITER_intent|BTREE_ITER_prefetch|
- BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- fix_reflink_p_key(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-#ifndef NO_BCACHEFS_CHARDEV
-
-struct fsck_thread {
- struct thread_with_stdio thr;
- struct bch_fs *c;
- struct bch_opts opts;
-};
-
-static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
-{
- struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
- kfree(thr);
-}
-
-static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
-{
- struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
- struct bch_fs *c = thr->c;
-
- int ret = PTR_ERR_OR_ZERO(c);
- if (ret)
- return ret;
-
- ret = bch2_fs_start(thr->c);
- if (ret)
- goto err;
-
- if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
- bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
- ret |= 1;
- }
- if (test_bit(BCH_FS_error, &c->flags)) {
- bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
- ret |= 4;
- }
-err:
- bch2_fs_stop(c);
- return ret;
-}
-
-static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
- .exit = bch2_fsck_thread_exit,
- .fn = bch2_fsck_offline_thread_fn,
-};
-
-long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
-{
- struct bch_ioctl_fsck_offline arg;
- struct fsck_thread *thr = NULL;
- darray_str(devs) = {};
- long ret = 0;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if (arg.flags)
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- for (size_t i = 0; i < arg.nr_devs; i++) {
- u64 dev_u64;
- ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
- if (ret)
- goto err;
-
- char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
- ret = PTR_ERR_OR_ZERO(dev_str);
- if (ret)
- goto err;
-
- ret = darray_push(&devs, dev_str);
- if (ret) {
- kfree(dev_str);
- goto err;
- }
- }
-
- thr = kzalloc(sizeof(*thr), GFP_KERNEL);
- if (!thr) {
- ret = -ENOMEM;
- goto err;
- }
-
- thr->opts = bch2_opts_empty();
-
- if (arg.opts) {
- char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
- ret = PTR_ERR_OR_ZERO(optstr) ?:
- bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr);
- if (!IS_ERR(optstr))
- kfree(optstr);
-
- if (ret)
- goto err;
- }
-
- opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
- opt_set(thr->opts, read_only, 1);
- opt_set(thr->opts, ratelimit_errors, 0);
-
- /* We need request_key() to be called before we punt to kthread: */
- opt_set(thr->opts, nostart, true);
-
- bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);
-
- thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);
-
- if (!IS_ERR(thr->c) &&
- thr->c->opts.errors == BCH_ON_ERROR_panic)
- thr->c->opts.errors = BCH_ON_ERROR_ro;
-
- ret = __bch2_run_thread_with_stdio(&thr->thr);
-out:
- darray_for_each(devs, i)
- kfree(*i);
- darray_exit(&devs);
- return ret;
-err:
- if (thr)
- bch2_fsck_thread_exit(&thr->thr);
- pr_err("ret %s", bch2_err_str(ret));
- goto out;
-}
-
-static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
-{
- struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
- struct bch_fs *c = thr->c;
-
- c->stdio_filter = current;
- c->stdio = &thr->thr.stdio;
-
- /*
- * XXX: can we figure out a way to do this without mucking with c->opts?
- */
- unsigned old_fix_errors = c->opts.fix_errors;
- if (opt_defined(thr->opts, fix_errors))
- c->opts.fix_errors = thr->opts.fix_errors;
- else
- c->opts.fix_errors = FSCK_FIX_ask;
-
- c->opts.fsck = true;
- set_bit(BCH_FS_fsck_running, &c->flags);
-
- c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
- int ret = bch2_run_online_recovery_passes(c);
-
- clear_bit(BCH_FS_fsck_running, &c->flags);
- bch_err_fn(c, ret);
-
- c->stdio = NULL;
- c->stdio_filter = NULL;
- c->opts.fix_errors = old_fix_errors;
-
- up(&c->online_fsck_mutex);
- bch2_ro_ref_put(c);
- return ret;
-}
-
-static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
- .exit = bch2_fsck_thread_exit,
- .fn = bch2_fsck_online_thread_fn,
-};
-
-long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
-{
- struct fsck_thread *thr = NULL;
- long ret = 0;
-
- if (arg.flags)
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (!bch2_ro_ref_tryget(c))
- return -EROFS;
-
- if (down_trylock(&c->online_fsck_mutex)) {
- bch2_ro_ref_put(c);
- return -EAGAIN;
- }
-
- thr = kzalloc(sizeof(*thr), GFP_KERNEL);
- if (!thr) {
- ret = -ENOMEM;
- goto err;
- }
-
- thr->c = c;
- thr->opts = bch2_opts_empty();
-
- if (arg.opts) {
- char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
-
- ret = PTR_ERR_OR_ZERO(optstr) ?:
- bch2_parse_mount_opts(c, &thr->opts, NULL, optstr);
- if (!IS_ERR(optstr))
- kfree(optstr);
-
- if (ret)
- goto err;
- }
-
- ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
-err:
- if (ret < 0) {
- bch_err_fn(c, ret);
- if (thr)
- bch2_fsck_thread_exit(&thr->thr);
- up(&c->online_fsck_mutex);
- bch2_ro_ref_put(c);
- }
- return ret;
-}
-
-#endif /* NO_BCACHEFS_CHARDEV */
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
deleted file mode 100644
index 574948278cd4..000000000000
--- a/fs/bcachefs/fsck.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FSCK_H
-#define _BCACHEFS_FSCK_H
-
-#include "str_hash.h"
-
-int bch2_fsck_update_backpointers(struct btree_trans *,
- struct snapshots_seen *,
- const struct bch_hash_desc,
- struct bch_hash_info *,
- struct bkey_i *);
-
-int bch2_check_inodes(struct bch_fs *);
-int bch2_check_extents(struct bch_fs *);
-int bch2_check_indirect_extents(struct bch_fs *);
-int bch2_check_dirents(struct bch_fs *);
-int bch2_check_xattrs(struct bch_fs *);
-int bch2_check_root(struct bch_fs *);
-int bch2_check_subvolume_structure(struct bch_fs *);
-int bch2_check_unreachable_inodes(struct bch_fs *);
-int bch2_check_directory_structure(struct bch_fs *);
-int bch2_check_nlinks(struct bch_fs *);
-int bch2_fix_reflink_p(struct bch_fs *);
-
-long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *);
-long bch2_ioctl_fsck_online(struct bch_fs *, struct bch_ioctl_fsck_online);
-
-#endif /* _BCACHEFS_FSCK_H */
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
deleted file mode 100644
index 80051073f613..000000000000
--- a/fs/bcachefs/inode.c
+++ /dev/null
@@ -1,1451 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_key_cache.h"
-#include "btree_write_buffer.h"
-#include "bkey_methods.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "compress.h"
-#include "dirent.h"
-#include "disk_accounting.h"
-#include "error.h"
-#include "extents.h"
-#include "extent_update.h"
-#include "fs.h"
-#include "inode.h"
-#include "opts.h"
-#include "str_hash.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "varint.h"
-
-#include <linux/random.h>
-
-#include <linux/unaligned.h>
-
-#define x(name, ...) #name,
-const char * const bch2_inode_opts[] = {
- BCH_INODE_OPTS()
- NULL,
-};
-
-static const char * const bch2_inode_flag_strs[] = {
- BCH_INODE_FLAGS()
- NULL
-};
-#undef x
-
-static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos);
-
-static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
-
-static int inode_decode_field(const u8 *in, const u8 *end,
- u64 out[2], unsigned *out_bits)
-{
- __be64 be[2] = { 0, 0 };
- unsigned bytes, shift;
- u8 *p;
-
- if (in >= end)
- return -BCH_ERR_inode_unpack_error;
-
- if (!*in)
- return -BCH_ERR_inode_unpack_error;
-
-	/*
-	 * The position of the highest set bit in the first byte indicates the
-	 * number of bytes in the field; shift is the number of bits to strip
-	 * from the high byte:
-	 */
- shift = 8 - __fls(*in); /* 1 <= shift <= 8 */
- bytes = byte_table[shift - 1];
-
- if (in + bytes > end)
- return -BCH_ERR_inode_unpack_error;
-
- p = (u8 *) be + 16 - bytes;
- memcpy(p, in, bytes);
- *p ^= (1 << 8) >> shift;
-
- out[0] = be64_to_cpu(be[0]);
- out[1] = be64_to_cpu(be[1]);
- *out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]);
-
- return bytes;
-}
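-
-/*
- * Standalone illustration of the length decode above: the 1-based position
- * of the highest set bit in the first byte selects a byte_table entry
- * (0x80..0xff is a one byte field, 0x01 means 13 bytes).  This open-codes
- * __fls() portably; it is a sketch, not used by the real decoder, and like
- * the real decoder it requires a nonzero first byte:
- */
-static unsigned sketch_field_bytes(unsigned char first)
-{
-	unsigned fls = 0;
-
-	for (unsigned v = first; v; v >>= 1)
-		fls++;			/* 1-based position of highest set bit */
-
-	return byte_table[8 - fls];	/* == byte_table[shift - 1] above */
-}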
-
-static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed,
- const struct bch_inode_unpacked *inode)
-{
- struct bkey_i_inode_v3 *k = &packed->inode;
- u8 *out = k->v.fields;
- u8 *end = (void *) &packed[1];
- u8 *last_nonzero_field = out;
- unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
- unsigned bytes;
- int ret;
-
- bkey_inode_v3_init(&packed->inode.k_i);
- packed->inode.k.p.offset = inode->bi_inum;
- packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq);
- packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
- packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags);
- packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors);
- packed->inode.v.bi_size = cpu_to_le64(inode->bi_size);
- packed->inode.v.bi_version = cpu_to_le64(inode->bi_version);
- SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode);
- SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR);
-
-#define x(_name, _bits) \
- nr_fields++; \
- \
- if (inode->_name) { \
- ret = bch2_varint_encode_fast(out, inode->_name); \
- out += ret; \
- \
- if (_bits > 64) \
- *out++ = 0; \
- \
- last_nonzero_field = out; \
- last_nonzero_fieldnr = nr_fields; \
- } else { \
- *out++ = 0; \
- \
- if (_bits > 64) \
- *out++ = 0; \
- }
-
- BCH_INODE_FIELDS_v3()
-#undef x
- BUG_ON(out > end);
-
- out = last_nonzero_field;
- nr_fields = last_nonzero_fieldnr;
-
- bytes = out - (u8 *) &packed->inode.v;
- set_bkey_val_bytes(&packed->inode.k, bytes);
- memset_u64s_tail(&packed->inode.v, 0, bytes);
-
- SET_INODEv3_NR_FIELDS(&k->v, nr_fields);
-
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
- struct bch_inode_unpacked unpacked;
-
- ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked);
- BUG_ON(ret);
- BUG_ON(unpacked.bi_inum != inode->bi_inum);
- BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed);
- BUG_ON(unpacked.bi_sectors != inode->bi_sectors);
- BUG_ON(unpacked.bi_size != inode->bi_size);
- BUG_ON(unpacked.bi_version != inode->bi_version);
- BUG_ON(unpacked.bi_mode != inode->bi_mode);
-
-#define x(_name, _bits) if (unpacked._name != inode->_name) \
- panic("unpacked %llu should be %llu", \
- (u64) unpacked._name, (u64) inode->_name);
- BCH_INODE_FIELDS_v3()
-#undef x
- }
-}
-
-void bch2_inode_pack(struct bkey_inode_buf *packed,
- const struct bch_inode_unpacked *inode)
-{
- bch2_inode_pack_inlined(packed, inode);
-}
-
-static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
- struct bch_inode_unpacked *unpacked)
-{
- const u8 *in = inode.v->fields;
- const u8 *end = bkey_val_end(inode);
- u64 field[2];
- unsigned fieldnr = 0, field_bits;
- int ret;
-
-#define x(_name, _bits) \
- if (fieldnr++ == INODEv1_NR_FIELDS(inode.v)) { \
- unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
- memset((void *) unpacked + offset, 0, \
- sizeof(*unpacked) - offset); \
- return 0; \
- } \
- \
- ret = inode_decode_field(in, end, field, &field_bits); \
- if (ret < 0) \
- return ret; \
- \
- if (field_bits > sizeof(unpacked->_name) * 8) \
- return -BCH_ERR_inode_unpack_error; \
- \
- unpacked->_name = field[1]; \
- in += ret;
-
- BCH_INODE_FIELDS_v2()
-#undef x
-
- /* XXX: signal if there were more fields than expected? */
- return 0;
-}
-
-static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
- const u8 *in, const u8 *end,
- unsigned nr_fields)
-{
- unsigned fieldnr = 0;
- int ret;
- u64 v[2];
-
-#define x(_name, _bits) \
- if (fieldnr < nr_fields) { \
- ret = bch2_varint_decode_fast(in, end, &v[0]); \
- if (ret < 0) \
- return ret; \
- in += ret; \
- \
- if (_bits > 64) { \
- ret = bch2_varint_decode_fast(in, end, &v[1]); \
- if (ret < 0) \
- return ret; \
- in += ret; \
- } else { \
- v[1] = 0; \
- } \
- } else { \
- v[0] = v[1] = 0; \
- } \
- \
- unpacked->_name = v[0]; \
- if (v[1] || v[0] != unpacked->_name) \
- return -BCH_ERR_inode_unpack_error; \
- fieldnr++;
-
- BCH_INODE_FIELDS_v2()
-#undef x
-
- /* XXX: signal if there were more fields than expected? */
- return 0;
-}
-
-static int bch2_inode_unpack_v3(struct bkey_s_c k,
- struct bch_inode_unpacked *unpacked)
-{
- struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
- const u8 *in = inode.v->fields;
- const u8 *end = bkey_val_end(inode);
- unsigned nr_fields = INODEv3_NR_FIELDS(inode.v);
- unsigned fieldnr = 0;
- int ret;
- u64 v[2];
-
- unpacked->bi_inum = inode.k->p.offset;
-	unpacked->bi_journal_seq = le64_to_cpu(inode.v->bi_journal_seq);
- unpacked->bi_hash_seed = inode.v->bi_hash_seed;
- unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
- unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors);
- unpacked->bi_size = le64_to_cpu(inode.v->bi_size);
- unpacked->bi_version = le64_to_cpu(inode.v->bi_version);
- unpacked->bi_mode = INODEv3_MODE(inode.v);
-
-#define x(_name, _bits) \
- if (fieldnr < nr_fields) { \
- ret = bch2_varint_decode_fast(in, end, &v[0]); \
- if (ret < 0) \
- return ret; \
- in += ret; \
- \
- if (_bits > 64) { \
- ret = bch2_varint_decode_fast(in, end, &v[1]); \
- if (ret < 0) \
- return ret; \
- in += ret; \
- } else { \
- v[1] = 0; \
- } \
- } else { \
- v[0] = v[1] = 0; \
- } \
- \
- unpacked->_name = v[0]; \
- if (v[1] || v[0] != unpacked->_name) \
- return -BCH_ERR_inode_unpack_error; \
- fieldnr++;
-
- BCH_INODE_FIELDS_v3()
-#undef x
-
- /* XXX: signal if there were more fields than expected? */
- return 0;
-}
-
-static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
- struct bch_inode_unpacked *unpacked)
-{
- memset(unpacked, 0, sizeof(*unpacked));
-
- unpacked->bi_snapshot = k.k->p.snapshot;
-
- switch (k.k->type) {
- case KEY_TYPE_inode: {
- struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
-
- unpacked->bi_inum = inode.k->p.offset;
-		unpacked->bi_journal_seq = 0;
- unpacked->bi_hash_seed = inode.v->bi_hash_seed;
- unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
- unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
-
- if (INODEv1_NEW_VARINT(inode.v)) {
- return bch2_inode_unpack_v2(unpacked, inode.v->fields,
- bkey_val_end(inode),
- INODEv1_NR_FIELDS(inode.v));
- } else {
- return bch2_inode_unpack_v1(inode, unpacked);
- }
- break;
- }
- case KEY_TYPE_inode_v2: {
- struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
-
- unpacked->bi_inum = inode.k->p.offset;
-		unpacked->bi_journal_seq = le64_to_cpu(inode.v->bi_journal_seq);
- unpacked->bi_hash_seed = inode.v->bi_hash_seed;
- unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
- unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
-
- return bch2_inode_unpack_v2(unpacked, inode.v->fields,
- bkey_val_end(inode),
- INODEv2_NR_FIELDS(inode.v));
- }
- default:
- BUG();
- }
-}
-
-int bch2_inode_unpack(struct bkey_s_c k,
- struct bch_inode_unpacked *unpacked)
-{
- unpacked->bi_snapshot = k.k->p.snapshot;
-
- return likely(k.k->type == KEY_TYPE_inode_v3)
- ? bch2_inode_unpack_v3(k, unpacked)
- : bch2_inode_unpack_slowpath(k, unpacked);
-}
-
-int __bch2_inode_peek(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode,
- subvol_inum inum, unsigned flags,
- bool warn)
-{
- u32 snapshot;
- int ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn);
- if (ret)
- return ret;
-
- struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
- SPOS(0, inum.inum, snapshot),
- flags|BTREE_ITER_cached);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
- if (ret)
- goto err;
-
- ret = bch2_inode_unpack(k, inode);
- if (ret)
- goto err;
-
- return 0;
-err:
- if (warn)
- bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum);
- bch2_trans_iter_exit(trans, iter);
- return ret;
-}
-
-int bch2_inode_write_flags(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bkey_inode_buf *inode_p;
-
- inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
- if (IS_ERR(inode_p))
- return PTR_ERR(inode_p);
-
- bch2_inode_pack_inlined(inode_p, inode);
- inode_p->inode.k.p.snapshot = iter->snapshot;
- return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
-}
-
-int __bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
-{
- struct bkey_inode_buf *inode_p =
- bch2_trans_kmalloc(trans, sizeof(*inode_p));
-
- if (IS_ERR(inode_p))
- return PTR_ERR(inode_p);
-
- bch2_inode_pack(inode_p, inode);
- inode_p->inode.k.p.snapshot = inode->bi_snapshot;
-
- return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
- &inode_p->inode.k_i,
- BTREE_UPDATE_internal_snapshot_node);
-}
-
-int bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
-{
- int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_fsck_write_inode(trans, inode));
- bch_err_fn(trans->c, ret);
- return ret;
-}
-
-struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
-{
- struct bch_inode_unpacked u;
- struct bkey_inode_buf *inode_p;
- int ret;
-
- if (!bkey_is_inode(&k->k))
- return ERR_PTR(-ENOENT);
-
- inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
- if (IS_ERR(inode_p))
- return ERR_CAST(inode_p);
-
- ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u);
- if (ret)
- return ERR_PTR(ret);
-
- bch2_inode_pack(inode_p, &u);
- return &inode_p->inode.k_i;
-}
-
-static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bch_inode_unpacked unpacked;
- int ret = 0;
-
- bkey_fsck_err_on(k.k->p.inode,
- c, inode_pos_inode_nonzero,
- "nonzero k.p.inode");
-
- bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX,
- c, inode_pos_blockdev_range,
- "fs inode in blockdev range");
-
- bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked),
- c, inode_unpack_error,
- "invalid variable length fields");
-
- bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1,
- c, inode_checksum_type_invalid,
-			 "invalid data checksum type (%u >= %u)",
- unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
-
- bkey_fsck_err_on(unpacked.bi_compression &&
- !bch2_compression_opt_valid(unpacked.bi_compression - 1),
- c, inode_compression_type_invalid,
- "invalid compression opt %u", unpacked.bi_compression - 1);
-
- bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) &&
- unpacked.bi_nlink != 0,
- c, inode_unlinked_but_nlink_nonzero,
- "flagged as unlinked but bi_nlink != 0");
-
- bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode),
- c, inode_subvol_root_but_not_dir,
- "subvolume root but not a directory");
-fsck_err:
- return ret;
-}
-
-int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
- int ret = 0;
-
- bkey_fsck_err_on(INODEv1_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
- c, inode_str_hash_invalid,
- "invalid str hash type (%llu >= %u)",
- INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR);
-
- ret = __bch2_inode_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
- int ret = 0;
-
- bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
- c, inode_str_hash_invalid,
- "invalid str hash type (%llu >= %u)",
- INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
-
- ret = __bch2_inode_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
- int ret = 0;
-
- bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
- INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k),
- c, inode_v3_fields_start_bad,
- "invalid fields_start (got %llu, min %u max %zu)",
- INODEv3_FIELDS_START(inode.v),
- INODEv3_FIELDS_START_INITIAL,
- bkey_val_u64s(inode.k));
-
- bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
- c, inode_str_hash_invalid,
- "invalid str hash type (%llu >= %u)",
- INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
-
- ret = __bch2_inode_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-static void __bch2_inode_unpacked_to_text(struct printbuf *out,
- struct bch_inode_unpacked *inode)
-{
- prt_printf(out, "\n");
- printbuf_indent_add(out, 2);
- prt_printf(out, "mode=%o\n", inode->bi_mode);
-
- prt_str(out, "flags=");
- prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
- prt_printf(out, "(%x)\n", inode->bi_flags);
-
- prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq);
- prt_printf(out, "hash_seed=%llx\n", inode->bi_hash_seed);
- prt_printf(out, "hash_type=");
- bch2_prt_str_hash_type(out, INODE_STR_HASH(inode));
- prt_newline(out);
- prt_printf(out, "bi_size=%llu\n", inode->bi_size);
- prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors);
- prt_printf(out, "bi_version=%llu\n", inode->bi_version);
-
-#define x(_name, _bits) \
- prt_printf(out, #_name "=%llu\n", (u64) inode->_name);
- BCH_INODE_FIELDS_v3()
-#undef x
-
- bch2_printbuf_strip_trailing_newline(out);
- printbuf_indent_sub(out, 2);
-}
-
-void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
-{
- prt_printf(out, "inum: %llu:%u ", inode->bi_inum, inode->bi_snapshot);
- __bch2_inode_unpacked_to_text(out, inode);
-}
-
-void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bch_inode_unpacked inode;
-
- if (bch2_inode_unpack(k, &inode)) {
- prt_printf(out, "(unpack error)");
- return;
- }
-
- __bch2_inode_unpacked_to_text(out, &inode);
-}
-
-static inline u64 bkey_inode_flags(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_inode:
- return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
- case KEY_TYPE_inode_v2:
- return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
- case KEY_TYPE_inode_v3:
- return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
- default:
- return 0;
- }
-}
-
-static inline void bkey_inode_flags_set(struct bkey_s k, u64 f)
-{
- switch (k.k->type) {
- case KEY_TYPE_inode:
- bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f);
- return;
- case KEY_TYPE_inode_v2:
- bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f);
- return;
- case KEY_TYPE_inode_v3:
- bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f);
- return;
- default:
- BUG();
- }
-}
-
-static inline bool bkey_is_unlinked_inode(struct bkey_s_c k)
-{
-	u64 f = bkey_inode_flags(k);
-
- return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot);
-}
-
-static struct bkey_s_c
-bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
- enum btree_id btree, struct bpos pos,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c k;
- int ret = 0;
-
- for_each_btree_key_max_norestart(trans, *iter, btree,
- bpos_successor(pos),
- SPOS(pos.inode, pos.offset, U32_MAX),
- flags|BTREE_ITER_all_snapshots, k, ret)
- if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot))
- return k;
-
- bch2_trans_iter_exit(trans, iter);
- return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
-}
-
-static struct bkey_s_c
-bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos pos, unsigned flags)
-{
- struct bkey_s_c k;
-again:
- k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags);
- if (!k.k ||
- bkey_err(k) ||
- bkey_is_inode(k.k))
- return k;
-
- bch2_trans_iter_exit(trans, iter);
- pos = k.k->p;
- goto again;
-}
-
-int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- for_each_btree_key_max_norestart(trans, iter,
- BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos),
- BTREE_ITER_all_snapshots|
- BTREE_ITER_with_updates, k, ret)
- if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) &&
- bkey_is_inode(k.k)) {
- ret = 1;
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int update_inode_has_children(struct btree_trans *trans,
- struct bkey_s k,
- bool have_child)
-{
- if (!have_child) {
- int ret = bch2_inode_has_child_snapshots(trans, k.k->p);
- if (ret)
- return ret < 0 ? ret : 0;
- }
-
- u64 f = bkey_inode_flags(k.s_c);
- if (have_child != !!(f & BCH_INODE_has_child_snapshot))
- bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot);
-
- return 0;
-}
-
-static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos,
- bool have_child)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans,
- &iter, pos, BTREE_ITER_with_updates);
- int ret = bkey_err(k);
- if (ret)
- return ret;
- if (!k.k)
- return 0;
-
- if (!have_child) {
- ret = bch2_inode_has_child_snapshots(trans, k.k->p);
- if (ret) {
- ret = ret < 0 ? ret : 0;
- goto err;
- }
- }
-
- u64 f = bkey_inode_flags(k);
- if (have_child != !!(f & BCH_INODE_has_child_snapshot)) {
- struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k,
- BTREE_UPDATE_internal_snapshot_node);
- ret = PTR_ERR_OR_ZERO(update);
- if (ret)
- goto err;
-
- bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot);
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_trigger_inode(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
-
- if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
- BUG_ON(!trans->journal_res.seq);
- bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
- }
-
- s64 nr[1] = { bkey_is_inode(new.k) - bkey_is_inode(old.k) };
- if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr[0]) {
- int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, nr, nr_inodes);
- if (ret)
- return ret;
- }
-
- if (flags & BTREE_TRIGGER_transactional) {
- int unlinked_delta = (int) bkey_is_unlinked_inode(new.s_c) -
- (int) bkey_is_unlinked_inode(old);
- if (unlinked_delta) {
- int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
- new.k->p, unlinked_delta > 0);
- if (ret)
- return ret;
- }
-
- /*
- * If we're creating or deleting an inode at this snapshot ID,
- * and there might be an inode in a parent snapshot ID, we might
- * need to set or clear the has_child_snapshot flag on the
- * parent.
- */
- int deleted_delta = (int) bkey_is_inode(new.k) -
- (int) bkey_is_inode(old.k);
- if (deleted_delta &&
- bch2_snapshot_parent(c, new.k->p.snapshot)) {
- int ret = update_parent_inode_has_children(trans, new.k->p,
- deleted_delta > 0);
- if (ret)
- return ret;
- }
-
- /*
- * When an inode is first updated in a new snapshot, we may need
- * to clear has_child_snapshot
- */
- if (deleted_delta > 0) {
- int ret = update_inode_has_children(trans, new, false);
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-
-int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(k.k->p.inode,
- c, inode_pos_inode_nonzero,
- "nonzero k.p.inode");
-fsck_err:
- return ret;
-}
-
-void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k);
-
- prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
-}
-
-int bch2_inode_alloc_cursor_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(k.k->p.inode != LOGGED_OPS_INUM_inode_cursors,
- c, inode_alloc_cursor_inode_bad,
- "k.p.inode bad");
-fsck_err:
- return ret;
-}
-
-void bch2_inode_alloc_cursor_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_inode_alloc_cursor i = bkey_s_c_to_inode_alloc_cursor(k);
-
- prt_printf(out, "idx %llu generation %llu",
- le64_to_cpu(i.v->idx),
- le64_to_cpu(i.v->gen));
-}
-
-void bch2_inode_init_early(struct bch_fs *c,
- struct bch_inode_unpacked *inode_u)
-{
- enum bch_str_hash_type str_hash =
- bch2_str_hash_opt_to_type(c, c->opts.str_hash);
-
- memset(inode_u, 0, sizeof(*inode_u));
-
- SET_INODE_STR_HASH(inode_u, str_hash);
- get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed));
-}
-
-void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
- uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
- struct bch_inode_unpacked *parent)
-{
- inode_u->bi_mode = mode;
- inode_u->bi_uid = uid;
- inode_u->bi_gid = gid;
- inode_u->bi_dev = rdev;
- inode_u->bi_atime = now;
- inode_u->bi_mtime = now;
- inode_u->bi_ctime = now;
- inode_u->bi_otime = now;
-
- if (parent && parent->bi_mode & S_ISGID) {
- inode_u->bi_gid = parent->bi_gid;
- if (S_ISDIR(mode))
- inode_u->bi_mode |= S_ISGID;
- }
-
- if (parent) {
-#define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name;
- BCH_INODE_OPTS()
-#undef x
- }
-}
-
-void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
- uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
- struct bch_inode_unpacked *parent)
-{
- bch2_inode_init_early(c, inode_u);
- bch2_inode_init_late(inode_u, bch2_current_time(c),
- uid, gid, mode, rdev, parent);
-}
-
-static struct bkey_i_inode_alloc_cursor *
-bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max)
-{
- struct bch_fs *c = trans->c;
-
- u64 cursor_idx = c->opts.inodes_32bit ? 0 : cpu + 1;
-
- cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits);
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
- BTREE_ID_logged_ops,
- POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx),
- BTREE_ITER_cached);
- int ret = bkey_err(k);
- if (ret)
- return ERR_PTR(ret);
-
- struct bkey_i_inode_alloc_cursor *cursor =
- k.k->type == KEY_TYPE_inode_alloc_cursor
- ? bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor)
- : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor);
- ret = PTR_ERR_OR_ZERO(cursor);
- if (ret)
- goto err;
-
- if (c->opts.inodes_32bit) {
- *min = BLOCKDEV_INODE_MAX;
- *max = INT_MAX;
- } else {
- cursor->v.bits = c->opts.shard_inode_numbers_bits;
-
- unsigned bits = 63 - c->opts.shard_inode_numbers_bits;
-
- *min = max(cpu << bits, (u64) INT_MAX + 1);
- *max = (cpu << bits) | ~(ULLONG_MAX << bits);
- }
-
- if (le64_to_cpu(cursor->v.idx) < *min)
- cursor->v.idx = cpu_to_le64(*min);
-
- if (le64_to_cpu(cursor->v.idx) >= *max) {
- cursor->v.idx = cpu_to_le64(*min);
- le32_add_cpu(&cursor->v.gen, 1);
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret ? ERR_PTR(ret) : cursor;
-}
-
-/*
- * This just finds an empty slot:
- */
-int bch2_inode_create(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode_u,
- u32 snapshot, u64 cpu)
-{
- u64 min, max;
- struct bkey_i_inode_alloc_cursor *cursor =
- bch2_inode_alloc_cursor_get(trans, cpu, &min, &max);
- int ret = PTR_ERR_OR_ZERO(cursor);
- if (ret)
- return ret;
-
- u64 start = le64_to_cpu(cursor->v.idx);
- u64 pos = start;
-
- bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
- BTREE_ITER_all_snapshots|
- BTREE_ITER_intent);
- struct bkey_s_c k;
-again:
- while ((k = bch2_btree_iter_peek(iter)).k &&
- !(ret = bkey_err(k)) &&
- bkey_lt(k.k->p, POS(0, max))) {
- if (pos < iter->pos.offset)
- goto found_slot;
-
- /*
- * We don't need to iterate over keys in every snapshot once
- * we've found just one:
- */
- pos = iter->pos.offset + 1;
- bch2_btree_iter_set_pos(iter, POS(0, pos));
- }
-
- if (!ret && pos < max)
- goto found_slot;
-
- if (!ret && start == min)
- ret = -BCH_ERR_ENOSPC_inode_create;
-
- if (ret) {
- bch2_trans_iter_exit(trans, iter);
- return ret;
- }
-
- /* Retry from start */
- pos = start = min;
- bch2_btree_iter_set_pos(iter, POS(0, pos));
- le32_add_cpu(&cursor->v.gen, 1);
- goto again;
-found_slot:
- bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
- k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
- if (ret) {
- bch2_trans_iter_exit(trans, iter);
- return ret;
- }
-
- inode_u->bi_inum = k.k->p.offset;
- inode_u->bi_generation = le64_to_cpu(cursor->v.gen);
- cursor->v.idx = cpu_to_le64(k.k->p.offset + 1);
- return 0;
-}
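-
-/*
- * The allocation pattern above, in brief: scan forward from the cursor for
- * a gap in the inode btree; on hitting the top of this shard's range, wrap
- * back to min and bump the generation so reuse of an inode number remains
- * distinguishable.  A sketch of the clamp/wrap rule over plain integers
- * (min and max stand in for the shard bounds):
- */
-static u64 sketch_cursor_clamp(u64 idx, u64 min, u64 max, u32 *gen)
-{
-	if (idx < min)
-		idx = min;
-	if (idx >= max) {		/* wrapped: restart the shard */
-		idx = min;
-		(*gen)++;		/* a new generation for reused inums */
-	}
-	return idx;
-}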
-
-static int bch2_inode_delete_keys(struct btree_trans *trans,
- subvol_inum inum, enum btree_id id)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_i delete;
- struct bpos end = POS(inum.inum, U64_MAX);
- u32 snapshot;
- int ret = 0;
-
- /*
- * We're never going to be deleting partial extents, no need to use an
- * extent iterator:
- */
- bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
- BTREE_ITER_intent);
-
- while (1) {
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- bch2_btree_iter_set_snapshot(&iter, snapshot);
-
- k = bch2_btree_iter_peek_max(&iter, end);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (!k.k)
- break;
-
- bkey_init(&delete.k);
- delete.k.p = iter.pos;
-
- if (iter.flags & BTREE_ITER_is_extents)
- bch2_key_resize(&delete.k,
- bpos_min(end, k.k->p).offset -
- iter.pos.offset);
-
- ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
-err:
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- break;
- }
-
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
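
The loop above is the standard bcachefs commit pattern: begin the transaction, do one unit of work, commit, and treat a transaction-restart error as "retry" while anything else is fatal. Abstracted into a sketch (illustrative names, not bcachefs API):

typedef int (*trans_fn)(void *);

static int commit_loop(void *trans, trans_fn begin, trans_fn body,
		       int restart_err)
{
	int ret;

	do {
		(void) begin(trans);	/* drop locks, reset to a clean slate */
		ret = body(trans);	/* lookup + update + commit */
	} while (ret == restart_err);	/* restarts retry; real errors break out */

	return ret;
}
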
-
-int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
- struct bkey_s_c k;
- u32 snapshot;
- int ret;
-
- /*
- * If this was a directory, there shouldn't be any real dirents left -
- * but there could be whiteouts (from hash collisions) that we should
- * delete:
- *
-	 * XXX: the dirent code ideally would delete whiteouts when they're no
- * longer needed
- */
- ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
- bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
- bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
- if (ret)
- goto err;
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, inum.inum, snapshot),
- BTREE_ITER_intent|BTREE_ITER_cached);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (!bkey_is_inode(k.k)) {
- bch2_fs_inconsistent(c,
- "inode %llu:%u not found when deleting",
- inum.inum, snapshot);
- ret = -BCH_ERR_ENOENT_inode;
- goto err;
- }
-
- ret = bch2_btree_delete_at(trans, &iter, 0) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
-err:
- bch2_trans_iter_exit(trans, &iter);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- if (ret)
- goto err2;
-
- ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot));
-err2:
- bch2_trans_put(trans);
- return ret;
-}
-
-int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
- subvol_inum inum,
- struct bch_inode_unpacked *inode)
-{
- struct btree_iter iter;
- int ret;
-
- ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
- if (!ret)
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
- subvol_inum inum,
- struct bch_inode_unpacked *inode)
-{
- struct btree_iter iter;
- int ret;
-
- ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
- if (!ret)
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
- struct bch_inode_unpacked *inode)
-{
- return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode));
-}
-
-int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
-{
- if (bi->bi_flags & BCH_INODE_unlinked)
- bi->bi_flags &= ~BCH_INODE_unlinked;
- else {
- if (bi->bi_nlink == U32_MAX)
- return -EINVAL;
-
- bi->bi_nlink++;
- }
-
- return 0;
-}
-
-void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
-{
- if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) {
- bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
- bi->bi_inum);
- return;
- }
-
- if (bi->bi_flags & BCH_INODE_unlinked) {
- bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
- return;
- }
-
- if (bi->bi_nlink)
- bi->bi_nlink--;
- else
- bi->bi_flags |= BCH_INODE_unlinked;
-}
-
-struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
-{
- struct bch_opts ret = { 0 };
-#define x(_name, _bits) \
- if (inode->bi_##_name) \
- opt_set(ret, _name, inode->bi_##_name - 1);
- BCH_INODE_OPTS()
-#undef x
- return ret;
-}
-
-void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
- struct bch_inode_unpacked *inode)
-{
-#define x(_name, _bits) \
- if ((inode)->bi_##_name) { \
- opts->_name = inode->bi_##_name - 1; \
- opts->_name##_from_inode = true; \
- } else { \
- opts->_name = c->opts._name; \
- opts->_name##_from_inode = false; \
- }
- BCH_INODE_OPTS()
-#undef x
-
- bch2_io_opts_fixups(opts);
-}
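
Both helpers rely on the same encoding: a per-inode option field of 0 means "unset, inherit the filesystem-wide value", and an explicit value v is stored as v + 1, which is what the `- 1` above undoes. A standalone model of that bias (opt_get/opt_set are hypothetical, not bcachefs functions):

#include <stdint.h>

static uint64_t opt_get(uint64_t inode_field, uint64_t fs_default)
{
	/* 0 means "inherit"; anything else is the value biased by one */
	return inode_field ? inode_field - 1 : fs_default;
}

static void opt_set(uint64_t *inode_field, uint64_t v)
{
	*inode_field = v + 1;
}
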
-
-int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
-{
- struct bch_inode_unpacked inode;
- int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));
-
- if (ret)
- return ret;
-
- bch2_inode_opts_get(opts, trans->c, &inode);
- return 0;
-}
-
-static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter = { NULL };
- struct bkey_i_inode_generation delete;
- struct bch_inode_unpacked inode_u;
- struct bkey_s_c k;
- int ret;
-
- do {
- ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
- SPOS(inum, 0, snapshot),
- SPOS(inum, U64_MAX, snapshot),
- 0, NULL) ?:
- bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
- SPOS(inum, 0, snapshot),
- SPOS(inum, U64_MAX, snapshot),
- 0, NULL) ?:
- bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
- SPOS(inum, 0, snapshot),
- SPOS(inum, U64_MAX, snapshot),
- 0, NULL);
- } while (ret == -BCH_ERR_transaction_restart_nested);
- if (ret)
- goto err;
-retry:
- bch2_trans_begin(trans);
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, inum, snapshot), BTREE_ITER_intent);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (!bkey_is_inode(k.k)) {
- bch2_fs_inconsistent(c,
- "inode %llu:%u not found when deleting",
- inum, snapshot);
- ret = -BCH_ERR_ENOENT_inode;
- goto err;
- }
-
- bch2_inode_unpack(k, &inode_u);
-
- /* Subvolume root? */
- if (inode_u.bi_subvol)
- bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum);
-
- bkey_inode_generation_init(&delete.k_i);
- delete.k.p = iter.pos;
- delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
-
- ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
-err:
- bch2_trans_iter_exit(trans, &iter);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- return ret ?: -BCH_ERR_transaction_restart_nested;
-}
-
-/*
- * After deleting an inode, there may be versions in older snapshots that should
- * also be deleted - if they're not referenced by sibling snapshots and not open
- * in other subvolumes:
- */
-static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-next_parent:
- ret = lockrestart_do(trans,
- bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0)));
- if (ret || !k.k)
- return ret;
-
- bool unlinked = bkey_is_unlinked_inode(k);
- pos = k.k->p;
- bch2_trans_iter_exit(trans, &iter);
-
- if (!unlinked)
- return 0;
-
- ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos));
- if (ret)
- return ret < 0 ? ret : 0;
-
- ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot);
- if (ret)
- return ret;
- goto next_parent;
-}
-
-int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
-{
- return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?:
- delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot));
-}
-
-static int may_delete_deleted_inode(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos pos,
- bool *need_another_pass)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter inode_iter;
- struct bkey_s_c k;
- struct bch_inode_unpacked inode;
- struct printbuf buf = PRINTBUF;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
- if (fsck_err_on(!bkey_is_inode(k.k),
- trans, deleted_inode_missing,
- "nonexistent inode %llu:%u in deleted_inodes btree",
- pos.offset, pos.snapshot))
- goto delete;
-
- ret = bch2_inode_unpack(k, &inode);
- if (ret)
- goto out;
-
- if (S_ISDIR(inode.bi_mode)) {
- ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
- if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY),
- trans, deleted_inode_is_dir,
- "non empty directory %llu:%u in deleted_inodes btree",
- pos.offset, pos.snapshot))
- goto delete;
- if (ret)
- goto out;
- }
-
- if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked),
- trans, deleted_inode_not_unlinked,
- "non-deleted inode %llu:%u in deleted_inodes btree",
- pos.offset, pos.snapshot))
- goto delete;
-
- if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot,
- trans, deleted_inode_has_child_snapshots,
- "inode with child snapshots %llu:%u in deleted_inodes btree",
- pos.offset, pos.snapshot))
- goto delete;
-
- ret = bch2_inode_has_child_snapshots(trans, k.k->p);
- if (ret < 0)
- goto out;
-
- if (ret) {
- if (fsck_err(trans, inode_has_child_snapshots_wrong,
- "inode has_child_snapshots flag wrong (should be set)\n%s",
- (printbuf_reset(&buf),
- bch2_inode_unpacked_to_text(&buf, &inode),
- buf.buf))) {
- inode.bi_flags |= BCH_INODE_has_child_snapshot;
- ret = __bch2_fsck_write_inode(trans, &inode);
- if (ret)
- goto out;
- }
- goto delete;
-	}
-
- if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
- !fsck_err(trans, deleted_inode_but_clean,
- "filesystem marked as clean but have deleted inode %llu:%u",
- pos.offset, pos.snapshot)) {
- ret = 0;
- goto out;
- }
-
- ret = 1;
-out:
-fsck_err:
- bch2_trans_iter_exit(trans, &inode_iter);
- printbuf_exit(&buf);
- return ret;
-delete:
- ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
- goto out;
-}
-
-int bch2_delete_dead_inodes(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- bool need_another_pass;
- int ret;
-again:
- /*
- * if we ran check_inodes() unlinked inodes will have already been
- * cleaned up but the write buffer will be out of sync; therefore we
-	 * always need a write buffer flush
- */
- ret = bch2_btree_write_buffer_flush_sync(trans);
- if (ret)
- goto err;
-
- need_another_pass = false;
-
- /*
- * Weird transaction restart handling here because on successful delete,
- * bch2_inode_rm_snapshot() will return a nested transaction restart,
- * but we can't retry because the btree write buffer won't have been
- * flushed and we'd spin:
- */
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass);
- if (ret > 0) {
- bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u",
- k.k->p.offset, k.k->p.snapshot);
-
- ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
- /*
- * We don't want to loop here: a transaction restart
- * error here means we handled a transaction restart and
- * we're actually done, but if we loop we'll retry the
- * same key because the write buffer hasn't been flushed
- * yet
- */
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- ret = 0;
- continue;
- }
- }
-
- ret;
- }));
-
- if (!ret && need_another_pass)
- goto again;
-err:
- bch2_trans_put(trans);
- return ret;
-}
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
deleted file mode 100644
index f82cfbf460d0..000000000000
--- a/fs/bcachefs/inode.h
+++ /dev/null
@@ -1,302 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_INODE_H
-#define _BCACHEFS_INODE_H
-
-#include "bkey.h"
-#include "bkey_methods.h"
-#include "opts.h"
-#include "snapshot.h"
-
-extern const char * const bch2_inode_opts[];
-
-int bch2_inode_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_inode_v2_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos);
-
-static inline int bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
-{
- return bch2_snapshot_is_leaf(trans->c, pos.snapshot) <= 0
- ? __bch2_inode_has_child_snapshots(trans, pos)
- : 0;
-}
-
-int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_inode ((struct bkey_ops) { \
- .key_validate = bch2_inode_validate, \
- .val_to_text = bch2_inode_to_text, \
- .trigger = bch2_trigger_inode, \
- .min_val_size = 16, \
-})
-
-#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \
- .key_validate = bch2_inode_v2_validate, \
- .val_to_text = bch2_inode_to_text, \
- .trigger = bch2_trigger_inode, \
- .min_val_size = 32, \
-})
-
-#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \
- .key_validate = bch2_inode_v3_validate, \
- .val_to_text = bch2_inode_to_text, \
- .trigger = bch2_trigger_inode, \
- .min_val_size = 48, \
-})
-
-static inline bool bkey_is_inode(const struct bkey *k)
-{
- return k->type == KEY_TYPE_inode ||
- k->type == KEY_TYPE_inode_v2 ||
- k->type == KEY_TYPE_inode_v3;
-}
-
-int bch2_inode_generation_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \
- .key_validate = bch2_inode_generation_validate, \
- .val_to_text = bch2_inode_generation_to_text, \
- .min_val_size = 8, \
-})
-
-int bch2_inode_alloc_cursor_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_inode_alloc_cursor_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_inode_alloc_cursor ((struct bkey_ops) { \
- .key_validate = bch2_inode_alloc_cursor_validate, \
- .val_to_text = bch2_inode_alloc_cursor_to_text, \
- .min_val_size = 16, \
-})
-
-#if 0
-typedef struct {
- u64 lo;
- u32 hi;
-} __packed __aligned(4) u96;
-#endif
-typedef u64 u96;
-
-struct bch_inode_unpacked {
- u64 bi_inum;
- u32 bi_snapshot;
- u64 bi_journal_seq;
- __le64 bi_hash_seed;
- u64 bi_size;
- u64 bi_sectors;
- u64 bi_version;
- u32 bi_flags;
- u16 bi_mode;
-
-#define x(_name, _bits) u##_bits _name;
- BCH_INODE_FIELDS_v3()
-#undef x
-};
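
The `#define x(...)` / `#undef x` idiom here is the x-macro pattern: a single field list generates the struct members above and, elsewhere, the pack/unpack and option-handling code, so adding a field is a one-line change. A self-contained illustration with a toy field list (TOY_FIELDS is invented for the example):

#include <stdio.h>

#define TOY_FIELDS()	\
	x(atime, 64)	\
	x(uid,   32)	\
	x(gid,   32)

/* expansion 1: struct members */
struct toy_unpacked {
#define x(_name, _bits)	unsigned _name;
	TOY_FIELDS()
#undef x
};

/* expansion 2: code iterating the same list */
static void toy_print(struct toy_unpacked *u)
{
#define x(_name, _bits)	printf(#_name "=%u (%d bits)\n", u->_name, _bits);
	TOY_FIELDS()
#undef x
}
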
-BITMASK(INODE_STR_HASH, struct bch_inode_unpacked, bi_flags, 20, 24);
-
-struct bkey_inode_buf {
- struct bkey_i_inode_v3 inode;
-
-#define x(_name, _bits) + 8 + _bits / 8
- u8 _pad[0 + BCH_INODE_FIELDS_v3()];
-#undef x
-};
-
-void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
-int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *);
-struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *);
-
-void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
-
-int __bch2_inode_peek(struct btree_trans *, struct btree_iter *,
- struct bch_inode_unpacked *, subvol_inum, unsigned, bool);
-
-static inline int bch2_inode_peek_nowarn(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode,
- subvol_inum inum, unsigned flags)
-{
- return __bch2_inode_peek(trans, iter, inode, inum, flags, false);
-}
-
-static inline int bch2_inode_peek(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode,
- subvol_inum inum, unsigned flags)
-{
- return __bch2_inode_peek(trans, iter, inode, inum, flags, true);
-	return __bch2_inode_peek(trans, iter, inode, inum, flags, true);
-}
-
-int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *,
- struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags);
-
-static inline int bch2_inode_write(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode)
-{
- return bch2_inode_write_flags(trans, iter, inode, 0);
-}
-
-int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *);
-int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *);
-
-void bch2_inode_init_early(struct bch_fs *,
- struct bch_inode_unpacked *);
-void bch2_inode_init_late(struct bch_inode_unpacked *, u64,
- uid_t, gid_t, umode_t, dev_t,
- struct bch_inode_unpacked *);
-void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
- uid_t, gid_t, umode_t, dev_t,
- struct bch_inode_unpacked *);
-
-int bch2_inode_create(struct btree_trans *, struct btree_iter *,
- struct bch_inode_unpacked *, u32, u64);
-
-int bch2_inode_rm(struct bch_fs *, subvol_inum);
-
-int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *,
- subvol_inum,
- struct bch_inode_unpacked *);
-int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *);
-int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
- struct bch_inode_unpacked *);
-
-#define inode_opt_get(_c, _inode, _name) \
- ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name)
-
-static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
- enum inode_opt_id id, u64 v)
-{
- switch (id) {
-#define x(_name, ...) \
- case Inode_opt_##_name: \
- inode->bi_##_name = v; \
- break;
- BCH_INODE_OPTS()
-#undef x
- default:
- BUG();
- }
-}
-
-static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode,
- enum inode_opt_id id)
-{
- switch (id) {
-#define x(_name, ...) \
- case Inode_opt_##_name: \
- return inode->bi_##_name;
- BCH_INODE_OPTS()
-#undef x
- default:
- BUG();
- }
-}
-
-static inline u8 mode_to_type(umode_t mode)
-{
- return (mode >> 12) & 15;
-}
-
-static inline u8 inode_d_type(struct bch_inode_unpacked *inode)
-{
- return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode);
-}
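
mode_to_type() extracts the file-type nibble of the POSIX mode word, which by construction equals the VFS DT_* constants; a quick standalone check:

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	printf("reg=%u dir=%u lnk=%u\n",
	       (S_IFREG >> 12) & 15,	/* 8  == DT_REG */
	       (S_IFDIR >> 12) & 15,	/* 4  == DT_DIR */
	       (S_IFLNK >> 12) & 15);	/* 10 == DT_LNK */
	return 0;
}
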
-
-static inline u32 bch2_inode_flags(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_inode:
- return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
- case KEY_TYPE_inode_v2:
- return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
- case KEY_TYPE_inode_v3:
- return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
- default:
- return 0;
- }
-}
-
-static inline unsigned bkey_inode_mode(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_inode:
- return le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode);
- case KEY_TYPE_inode_v2:
- return le16_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_mode);
- case KEY_TYPE_inode_v3:
- return INODEv3_MODE(bkey_s_c_to_inode_v3(k).v);
- default:
- return 0;
- }
-}
-
-/* i_nlink: */
-
-static inline unsigned nlink_bias(umode_t mode)
-{
- return S_ISDIR(mode) ? 2 : 1;
-}
-
-static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi)
-{
- return bi->bi_flags & BCH_INODE_unlinked
- ? 0
- : bi->bi_nlink + nlink_bias(bi->bi_mode);
-}
-
-static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
- unsigned nlink)
-{
- if (nlink) {
- bi->bi_nlink = nlink - nlink_bias(bi->bi_mode);
- bi->bi_flags &= ~BCH_INODE_unlinked;
- } else {
- bi->bi_nlink = 0;
- bi->bi_flags |= BCH_INODE_unlinked;
- }
-}
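
So on disk, bi_nlink stores the link count minus the usual VFS bias (2 for directories, which always have "." and a parent entry; 1 otherwise), and a count of zero is expressed by the unlinked flag rather than by bi_nlink itself. A round-trip check of that encoding with a toy struct (names invented for the example):

#include <assert.h>
#include <stdbool.h>

struct toy_bi {
	unsigned	nlink;
	bool		unlinked;
	bool		is_dir;
};

static unsigned bias(const struct toy_bi *bi)
{
	return bi->is_dir ? 2 : 1;
}

static unsigned toy_nlink_get(const struct toy_bi *bi)
{
	return bi->unlinked ? 0 : bi->nlink + bias(bi);
}

static void toy_nlink_set(struct toy_bi *bi, unsigned nlink)
{
	if (nlink) {
		bi->nlink = nlink - bias(bi);
		bi->unlinked = false;
	} else {
		bi->nlink = 0;
		bi->unlinked = true;
	}
}

int main(void)
{
	struct toy_bi d = { .is_dir = true };

	toy_nlink_set(&d, 2);	/* empty dir: "." plus the parent's entry */
	assert(d.nlink == 0 && toy_nlink_get(&d) == 2);
	toy_nlink_set(&d, 0);	/* unlinked is a flag, not a count of zero */
	assert(d.unlinked && toy_nlink_get(&d) == 0);
	return 0;
}
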
-
-int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
-void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
-
-static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *inode)
-{
- bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset;
-
- return S_ISDIR(inode->bi_mode) ||
- inode->bi_subvol ||
- (!inode->bi_nlink && inode_has_bp);
-}
-
-struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
-void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
- struct bch_inode_unpacked *);
-int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *);
-
-#include "rebalance.h"
-
-static inline struct bch_extent_rebalance
-bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode)
-{
- struct bch_io_opts io_opts;
- bch2_inode_opts_get(&io_opts, c, inode);
- return io_opts_to_rebalance_opts(c, &io_opts);
-}
-
-int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
-int bch2_delete_dead_inodes(struct bch_fs *);
-
-#endif /* _BCACHEFS_INODE_H */
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
deleted file mode 100644
index 117110af1e3f..000000000000
--- a/fs/bcachefs/inode_format.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_INODE_FORMAT_H
-#define _BCACHEFS_INODE_FORMAT_H
-
-#define BLOCKDEV_INODE_MAX 4096
-#define BCACHEFS_ROOT_INO 4096
-
-struct bch_inode {
- struct bch_val v;
-
- __le64 bi_hash_seed;
- __le32 bi_flags;
- __le16 bi_mode;
- __u8 fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v2 {
- struct bch_val v;
-
- __le64 bi_journal_seq;
- __le64 bi_hash_seed;
- __le64 bi_flags;
- __le16 bi_mode;
- __u8 fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v3 {
- struct bch_val v;
-
- __le64 bi_journal_seq;
- __le64 bi_hash_seed;
- __le64 bi_flags;
- __le64 bi_sectors;
- __le64 bi_size;
- __le64 bi_version;
- __u8 fields[];
-} __packed __aligned(8);
-
-#define INODEv3_FIELDS_START_INITIAL 6
-#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
-
-struct bch_inode_generation {
- struct bch_val v;
-
- __le32 bi_generation;
- __le32 pad;
-} __packed __aligned(8);
-
-/*
- * bi_subvol and bi_parent_subvol are only set for subvolume roots:
- */
-
-#define BCH_INODE_FIELDS_v2() \
- x(bi_atime, 96) \
- x(bi_ctime, 96) \
- x(bi_mtime, 96) \
- x(bi_otime, 96) \
- x(bi_size, 64) \
- x(bi_sectors, 64) \
- x(bi_uid, 32) \
- x(bi_gid, 32) \
- x(bi_nlink, 32) \
- x(bi_generation, 32) \
- x(bi_dev, 32) \
- x(bi_data_checksum, 8) \
- x(bi_compression, 8) \
- x(bi_project, 32) \
- x(bi_background_compression, 8) \
- x(bi_data_replicas, 8) \
- x(bi_promote_target, 16) \
- x(bi_foreground_target, 16) \
- x(bi_background_target, 16) \
- x(bi_erasure_code, 16) \
- x(bi_fields_set, 16) \
- x(bi_dir, 64) \
- x(bi_dir_offset, 64) \
- x(bi_subvol, 32) \
- x(bi_parent_subvol, 32)
-
-#define BCH_INODE_FIELDS_v3() \
- x(bi_atime, 96) \
- x(bi_ctime, 96) \
- x(bi_mtime, 96) \
- x(bi_otime, 96) \
- x(bi_uid, 32) \
- x(bi_gid, 32) \
- x(bi_nlink, 32) \
- x(bi_generation, 32) \
- x(bi_dev, 32) \
- x(bi_data_checksum, 8) \
- x(bi_compression, 8) \
- x(bi_project, 32) \
- x(bi_background_compression, 8) \
- x(bi_data_replicas, 8) \
- x(bi_promote_target, 16) \
- x(bi_foreground_target, 16) \
- x(bi_background_target, 16) \
- x(bi_erasure_code, 16) \
- x(bi_fields_set, 16) \
- x(bi_dir, 64) \
- x(bi_dir_offset, 64) \
- x(bi_subvol, 32) \
- x(bi_parent_subvol, 32) \
- x(bi_nocow, 8) \
- x(bi_depth, 32) \
- x(bi_inodes_32bit, 8)
-
-/* subset of BCH_INODE_FIELDS */
-#define BCH_INODE_OPTS() \
- x(data_checksum, 8) \
- x(compression, 8) \
- x(project, 32) \
- x(background_compression, 8) \
- x(data_replicas, 8) \
- x(promote_target, 16) \
- x(foreground_target, 16) \
- x(background_target, 16) \
- x(erasure_code, 16) \
- x(nocow, 8) \
- x(inodes_32bit, 8)
-
-enum inode_opt_id {
-#define x(name, ...) \
- Inode_opt_##name,
- BCH_INODE_OPTS()
-#undef x
- Inode_opt_nr,
-};
-
-#define BCH_INODE_FLAGS() \
- x(sync, 0) \
- x(immutable, 1) \
- x(append, 2) \
- x(nodump, 3) \
- x(noatime, 4) \
- x(i_size_dirty, 5) \
- x(i_sectors_dirty, 6) \
- x(unlinked, 7) \
- x(backptr_untrusted, 8) \
- x(has_child_snapshot, 9) \
- x(casefolded, 10)
-
-/* bits 20+ reserved for packed fields below: */
-
-enum bch_inode_flags {
-#define x(t, n) BCH_INODE_##t = 1U << n,
- BCH_INODE_FLAGS()
-#undef x
-};
-
-enum __bch_inode_flags {
-#define x(t, n) __BCH_INODE_##t = n,
- BCH_INODE_FLAGS()
-#undef x
-};
-
-LE32_BITMASK(INODEv1_STR_HASH, struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODEv1_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
-LE32_BITMASK(INODEv1_NEW_VARINT,struct bch_inode, bi_flags, 31, 32);
-
-LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
-LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
-LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_FIELDS_START,
- struct bch_inode_v3, bi_flags, 31, 36);
-LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
-
-struct bch_inode_alloc_cursor {
- struct bch_val v;
- __u8 bits;
- __u8 pad;
- __le32 gen;
- __le64 idx;
-};
-
-#endif /* _BCACHEFS_INODE_FORMAT_H */
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
deleted file mode 100644
index 6b842c8d21be..000000000000
--- a/fs/bcachefs/io_misc.c
+++ /dev/null
@@ -1,543 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * io_misc.c - fallocate, fpunch, truncate:
- */
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "clock.h"
-#include "error.h"
-#include "extents.h"
-#include "extent_update.h"
-#include "inode.h"
-#include "io_misc.h"
-#include "io_write.h"
-#include "logged_ops.h"
-#include "rebalance.h"
-#include "subvolume.h"
-
-/* Overwrites whatever was present with zeroes: */
-int bch2_extent_fallocate(struct btree_trans *trans,
- subvol_inum inum,
- struct btree_iter *iter,
- u64 sectors,
- struct bch_io_opts opts,
- s64 *i_sectors_delta,
- struct write_point_specifier write_point)
-{
- struct bch_fs *c = trans->c;
- struct disk_reservation disk_res = { 0 };
- struct closure cl;
- struct open_buckets open_buckets = { 0 };
- struct bkey_s_c k;
- struct bkey_buf old, new;
- unsigned sectors_allocated = 0, new_replicas;
- bool unwritten = opts.nocow &&
- c->sb.version >= bcachefs_metadata_version_unwritten_extents;
- int ret;
-
- bch2_bkey_buf_init(&old);
- bch2_bkey_buf_init(&new);
- closure_init_stack(&cl);
-
- k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
- new_replicas = max(0, (int) opts.data_replicas -
- (int) bch2_bkey_nr_ptrs_fully_allocated(k));
-
- /*
- * Get a disk reservation before (in the nocow case) calling
- * into the allocator:
- */
- ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
- if (unlikely(ret))
- goto err_noprint;
-
- bch2_bkey_buf_reassemble(&old, c, k);
-
- if (!unwritten) {
- struct bkey_i_reservation *reservation;
-
- bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
- reservation = bkey_reservation_init(new.k);
- reservation->k.p = iter->pos;
- bch2_key_resize(&reservation->k, sectors);
- reservation->v.nr_replicas = opts.data_replicas;
- } else {
- struct bkey_i_extent *e;
- struct bch_devs_list devs_have;
- struct write_point *wp;
-
- devs_have.nr = 0;
-
- bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);
-
- e = bkey_extent_init(new.k);
- e->k.p = iter->pos;
-
- ret = bch2_alloc_sectors_start_trans(trans,
- opts.foreground_target,
- false,
- write_point,
- &devs_have,
- opts.data_replicas,
- opts.data_replicas,
- BCH_WATERMARK_normal, 0, &cl, &wp);
- if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
- ret = -BCH_ERR_transaction_restart_nested;
- if (ret)
- goto err;
-
- sectors = min_t(u64, sectors, wp->sectors_free);
- sectors_allocated = sectors;
-
- bch2_key_resize(&e->k, sectors);
-
- bch2_open_bucket_get(c, wp, &open_buckets);
- bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
- bch2_alloc_sectors_done(c, wp);
-
- extent_for_each_ptr(extent_i_to_s(e), ptr)
- ptr->unwritten = true;
- }
-
- ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
- 0, i_sectors_delta, true);
-err:
- if (!ret && sectors_allocated)
- bch2_increment_clock(c, sectors_allocated, WRITE);
- if (should_print_err(ret)) {
- struct printbuf buf = PRINTBUF;
- lockrestart_do(trans,
- bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9));
- prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-err_noprint:
- bch2_open_buckets_put(c, &open_buckets);
- bch2_disk_reservation_put(c, &disk_res);
- bch2_bkey_buf_exit(&new, c);
- bch2_bkey_buf_exit(&old, c);
-
- if (closure_nr_remaining(&cl) != 1) {
- bch2_trans_unlock_long(trans);
- bch2_wait_on_allocator(c, &cl);
- }
-
- return ret;
-}
-
-/*
- * Returns -BCH_ERR_transaction_restart if we had to drop locks:
- */
-int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
- subvol_inum inum, u64 end,
- s64 *i_sectors_delta)
-{
- struct bch_fs *c = trans->c;
- unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
- struct bpos end_pos = POS(inum.inum, end);
- struct bkey_s_c k;
- int ret = 0, ret2 = 0;
- u32 snapshot;
-
- while (!ret ||
- bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(c, 0);
- struct bkey_i delete;
-
- if (ret)
- ret2 = ret;
-
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- continue;
-
- bch2_btree_iter_set_snapshot(iter, snapshot);
-
- /*
- * peek_max() doesn't have ideal semantics for extents:
- */
- k = bch2_btree_iter_peek_max(iter, end_pos);
- if (!k.k)
- break;
-
- ret = bkey_err(k);
- if (ret)
- continue;
-
- bkey_init(&delete.k);
- delete.k.p = iter->pos;
-
- /* create the biggest key we can */
- bch2_key_resize(&delete.k, max_sectors);
- bch2_cut_back(end_pos, &delete);
-
- ret = bch2_extent_update(trans, inum, iter, &delete,
- &disk_res, 0, i_sectors_delta, false);
- bch2_disk_reservation_put(c, &disk_res);
- }
-
- return ret ?: ret2;
-}
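
The punch loop deletes in the largest chunks a single key can describe, masking the key-size limit down to a whole number of blocks before cutting the key back to the requested end. The masking step in isolation (TOY_KEY_SIZE_MAX is a stand-in, not the real KEY_SIZE_MAX):

#include <stdint.h>

#define TOY_KEY_SIZE_MAX ((1U << 31) - 1)	/* stand-in for KEY_SIZE_MAX */

static uint32_t toy_max_sectors(unsigned block_bits)
{
	/* largest key size that is still a whole number of blocks */
	return TOY_KEY_SIZE_MAX & (~0U << block_bits);
}
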
-
-int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
- s64 *i_sectors_delta)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- POS(inum.inum, start),
- BTREE_ITER_intent);
-
- ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta);
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- ret = 0;
-
- return ret;
-}
-
-/* truncate: */
-
-void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k);
-
- prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
- prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
- prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size));
-}
-
-static int truncate_set_isize(struct btree_trans *trans,
- subvol_inum inum,
- u64 new_i_size,
- bool warn)
-{
- struct btree_iter iter = { NULL };
- struct bch_inode_unpacked inode_u;
- int ret;
-
- ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn) ?:
- (inode_u.bi_size = new_i_size, 0) ?:
- bch2_inode_write(trans, &iter, &inode_u);
-
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
- struct bkey_i *op_k,
- u64 *i_sectors_delta)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter fpunch_iter;
- struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k);
- subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
- u64 new_i_size = le64_to_cpu(op->v.new_i_size);
- bool warn_errors = i_sectors_delta != NULL;
- int ret;
-
- ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- truncate_set_isize(trans, inum, new_i_size, i_sectors_delta != NULL));
- if (ret)
- goto err;
-
- bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents,
- POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9),
- BTREE_ITER_intent);
- ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta);
- bch2_trans_iter_exit(trans, &fpunch_iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- ret = 0;
-err:
- if (warn_errors)
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k)
-{
- return __bch2_resume_logged_op_truncate(trans, op_k, NULL);
-}
-
-int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta)
-{
- struct bkey_i_logged_op_truncate op;
-
- bkey_logged_op_truncate_init(&op.k_i);
- op.v.subvol = cpu_to_le32(inum.subvol);
- op.v.inum = cpu_to_le64(inum.inum);
- op.v.new_i_size = cpu_to_le64(new_i_size);
-
- /*
- * Logged ops aren't atomic w.r.t. snapshot creation: creating a
- * snapshot while they're in progress, then crashing, will result in the
- * resume only proceeding in one of the snapshots
- */
- down_read(&c->snapshot_create_lock);
- struct btree_trans *trans = bch2_trans_get(c);
- int ret = bch2_logged_op_start(trans, &op.k_i);
- if (ret)
- goto out;
- ret = __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta);
- ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret;
-out:
- bch2_trans_put(trans);
- up_read(&c->snapshot_create_lock);
-
- return ret;
-}
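
The logged-op machinery makes truncate crash-safe by journaling the operation itself: persist an intent key, perform the (idempotent) work, then delete the intent, so recovery can resume any half-finished truncate. The shape of that pattern as a sketch (illustrative callbacks, not the bcachefs logged-op API):

static int toy_logged_op(void *op,
			 int (*start)(void *),
			 int (*work)(void *),
			 int (*finish)(void *))
{
	int ret = start(op);		/* 1: journal the intent key */
	if (ret)
		return ret;

	ret = work(op);			/* 2: the idempotent operation */
	int ret2 = finish(op);		/* 3: delete the intent key */

	return ret ?: ret2;
}

finish runs even when the work failed, matching the `bch2_logged_op_finish(trans, &op.k_i) ?: ret` above; only a failure to log the intent skips both steps.
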
-
-/* finsert/fcollapse: */
-
-void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k);
-
- prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
- prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
- prt_printf(out, " dst_offset=%lli", le64_to_cpu(op.v->dst_offset));
- prt_printf(out, " src_offset=%llu", le64_to_cpu(op.v->src_offset));
-}
-
-static int adjust_i_size(struct btree_trans *trans, subvol_inum inum,
- u64 offset, s64 len, bool warn)
-{
- struct btree_iter iter;
- struct bch_inode_unpacked inode_u;
- int ret;
-
- offset <<= 9;
- len <<= 9;
-
- ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn);
- if (ret)
- return ret;
-
- if (len > 0) {
- if (MAX_LFS_FILESIZE - inode_u.bi_size < len) {
- ret = -EFBIG;
- goto err;
- }
-
- if (offset >= inode_u.bi_size) {
- ret = -EINVAL;
- goto err;
- }
- }
-
- inode_u.bi_size += len;
- inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c);
-
- ret = bch2_inode_write(trans, &iter, &inode_u);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
- struct bkey_i *op_k,
- u64 *i_sectors_delta)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
- subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
- struct bch_io_opts opts;
- u64 dst_offset = le64_to_cpu(op->v.dst_offset);
- u64 src_offset = le64_to_cpu(op->v.src_offset);
- s64 shift = dst_offset - src_offset;
- u64 len = abs(shift);
- u64 pos = le64_to_cpu(op->v.pos);
- bool insert = shift > 0;
- u32 snapshot;
- bool warn_errors = i_sectors_delta != NULL;
- int ret = 0;
-
- ret = bch2_inum_opts_get(trans, inum, &opts);
- if (ret)
- return ret;
-
- /*
- * check for missing subvolume before fpunch, as in resume we don't want
- * it to be a fatal error
- */
- ret = lockrestart_do(trans, __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors));
- if (ret)
- return ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- POS(inum.inum, 0),
- BTREE_ITER_intent);
-
- switch (op->v.state) {
-case LOGGED_OP_FINSERT_start:
- op->v.state = LOGGED_OP_FINSERT_shift_extents;
-
- if (insert) {
- ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- adjust_i_size(trans, inum, src_offset, len, warn_errors) ?:
- bch2_logged_op_update(trans, &op->k_i));
- if (ret)
- goto err;
- } else {
- bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset));
-
- ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta);
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto err;
-
- ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_logged_op_update(trans, &op->k_i));
- }
-
- fallthrough;
-case LOGGED_OP_FINSERT_shift_extents:
- while (1) {
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(c, 0);
- struct bkey_i delete, *copy;
- struct bkey_s_c k;
- struct bpos src_pos = POS(inum.inum, src_offset);
-
- bch2_trans_begin(trans);
-
- ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot,
- warn_errors);
- if (ret)
- goto btree_err;
-
- bch2_btree_iter_set_snapshot(&iter, snapshot);
- bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot));
-
- k = insert
- ? bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0))
- : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX));
- if ((ret = bkey_err(k)))
- goto btree_err;
-
- if (!k.k ||
- k.k->p.inode != inum.inum ||
- bkey_le(k.k->p, POS(inum.inum, src_offset)))
- break;
-
- copy = bch2_bkey_make_mut_noupdate(trans, k);
- if ((ret = PTR_ERR_OR_ZERO(copy)))
- goto btree_err;
-
- if (insert &&
- bkey_lt(bkey_start_pos(k.k), src_pos)) {
- bch2_cut_front(src_pos, copy);
-
- /* Splitting compressed extent? */
- bch2_disk_reservation_add(c, &disk_res,
- copy->k.size *
- bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)),
- BCH_DISK_RESERVATION_NOFAIL);
- }
-
- bkey_init(&delete.k);
- delete.k.p = copy->k.p;
- delete.k.p.snapshot = snapshot;
- delete.k.size = copy->k.size;
-
- copy->k.p.offset += shift;
- copy->k.p.snapshot = snapshot;
-
- op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
-
- ret = bch2_bkey_set_needs_rebalance(c, &opts, copy) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
- bch2_logged_op_update(trans, &op->k_i) ?:
- bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc);
-btree_err:
- bch2_disk_reservation_put(c, &disk_res);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- goto err;
-
- pos = le64_to_cpu(op->v.pos);
- }
-
- op->v.state = LOGGED_OP_FINSERT_finish;
-
- if (!insert) {
- ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- adjust_i_size(trans, inum, src_offset, shift, warn_errors) ?:
- bch2_logged_op_update(trans, &op->k_i));
- } else {
- /* We need an inode update to update bi_journal_seq for fsync: */
- ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- adjust_i_size(trans, inum, 0, 0, warn_errors) ?:
- bch2_logged_op_update(trans, &op->k_i));
- }
-
- break;
-case LOGGED_OP_FINSERT_finish:
- break;
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- if (warn_errors)
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k)
-{
- return __bch2_resume_logged_op_finsert(trans, op_k, NULL);
-}
-
-int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum,
- u64 offset, u64 len, bool insert,
- s64 *i_sectors_delta)
-{
- struct bkey_i_logged_op_finsert op;
- s64 shift = insert ? len : -len;
-
- bkey_logged_op_finsert_init(&op.k_i);
- op.v.subvol = cpu_to_le32(inum.subvol);
- op.v.inum = cpu_to_le64(inum.inum);
- op.v.dst_offset = cpu_to_le64(offset + shift);
- op.v.src_offset = cpu_to_le64(offset);
- op.v.pos = cpu_to_le64(insert ? U64_MAX : offset);
-
- /*
- * Logged ops aren't atomic w.r.t. snapshot creation: creating a
- * snapshot while they're in progress, then crashing, will result in the
- * resume only proceeding in one of the snapshots
- */
- down_read(&c->snapshot_create_lock);
- struct btree_trans *trans = bch2_trans_get(c);
- int ret = bch2_logged_op_start(trans, &op.k_i);
- if (ret)
- goto out;
- ret = __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta);
- ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret;
-out:
- bch2_trans_put(trans);
- up_read(&c->snapshot_create_lock);
-
- return ret;
-}
diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h
deleted file mode 100644
index 9cb44a7c43c1..000000000000
--- a/fs/bcachefs/io_misc.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_MISC_H
-#define _BCACHEFS_IO_MISC_H
-
-int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
- u64, struct bch_io_opts, s64 *,
- struct write_point_specifier);
-int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
- subvol_inum, u64, s64 *);
-int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
-
-void bch2_logged_op_truncate_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_logged_op_truncate ((struct bkey_ops) { \
- .val_to_text = bch2_logged_op_truncate_to_text, \
- .min_val_size = 24, \
-})
-
-int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *);
-
-int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *);
-
-void bch2_logged_op_finsert_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_logged_op_finsert ((struct bkey_ops) { \
- .val_to_text = bch2_logged_op_finsert_to_text, \
- .min_val_size = 24, \
-})
-
-int bch2_resume_logged_op_finsert(struct btree_trans *, struct bkey_i *);
-
-int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *);
-
-#endif /* _BCACHEFS_IO_MISC_H */
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
deleted file mode 100644
index f1503df57dc7..000000000000
--- a/fs/bcachefs/io_read.c
+++ /dev/null
@@ -1,1387 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Some low level IO code, and hacks for various block layer limitations
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "clock.h"
-#include "compress.h"
-#include "data_update.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "error.h"
-#include "io_read.h"
-#include "io_misc.h"
-#include "io_write.h"
-#include "reflink.h"
-#include "subvolume.h"
-#include "trace.h"
-
-#include <linux/random.h>
-#include <linux/sched/mm.h>
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-static unsigned bch2_read_corrupt_ratio;
-module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
-MODULE_PARM_DESC(read_corrupt_ratio, "");
-#endif
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-
-static bool bch2_target_congested(struct bch_fs *c, u16 target)
-{
- const struct bch_devs_mask *devs;
- unsigned d, nr = 0, total = 0;
- u64 now = local_clock(), last;
- s64 congested;
- struct bch_dev *ca;
-
- if (!target)
- return false;
-
- rcu_read_lock();
- devs = bch2_target_to_mask(c, target) ?:
- &c->rw_devs[BCH_DATA_user];
-
- for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
- ca = rcu_dereference(c->devs[d]);
- if (!ca)
- continue;
-
- congested = atomic_read(&ca->congested);
- last = READ_ONCE(ca->congested_last);
- if (time_after64(now, last))
- congested -= (now - last) >> 12;
-
- total += max(congested, 0LL);
- nr++;
- }
- rcu_read_unlock();
-
- return get_random_u32_below(nr * CONGESTED_MAX) < total;
-}
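
Each device keeps a congestion score that decays linearly with time, one unit per 4096 ns (the `>> 12`), and the target counts as congested with probability total / (nr * CONGESTED_MAX), so mildly busy devices are only occasionally avoided. A standalone model of that decision (toy names; rand() stands in for get_random_u32_below(), ignoring modulo bias for the sketch):

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

#define TOY_CONGESTED_MAX 1024		/* illustrative per-device ceiling */

static bool toy_congested(const int64_t *score, const uint64_t *last,
			  unsigned nr, uint64_t now_ns)
{
	int64_t total = 0;

	for (unsigned i = 0; i < nr; i++) {
		int64_t c = score[i];

		/* decay: one unit per 4096ns since the last update */
		if (now_ns > last[i])
			c -= (int64_t) (now_ns - last[i]) >> 12;
		if (c > 0)
			total += c;
	}

	/* congested with probability total / (nr * TOY_CONGESTED_MAX) */
	return nr && (uint64_t) (rand() % (nr * TOY_CONGESTED_MAX)) < (uint64_t) total;
}
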
-
-#else
-
-static bool bch2_target_congested(struct bch_fs *c, u16 target)
-{
- return false;
-}
-
-#endif
-
-/* Cache promotion on read */
-
-struct promote_op {
- struct rcu_head rcu;
- u64 start_time;
-
- struct rhash_head hash;
- struct bpos pos;
-
- struct work_struct work;
- struct data_update write;
- struct bio_vec bi_inline_vecs[]; /* must be last */
-};
-
-static const struct rhashtable_params bch_promote_params = {
- .head_offset = offsetof(struct promote_op, hash),
- .key_offset = offsetof(struct promote_op, pos),
- .key_len = sizeof(struct bpos),
- .automatic_shrinking = true,
-};
-
-static inline bool have_io_error(struct bch_io_failures *failed)
-{
- return failed && failed->nr;
-}
-
-static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
-{
- EBUG_ON(rbio->split);
-
- return rbio->data_update
- ? container_of(rbio, struct data_update, rbio)
- : NULL;
-}
-
-static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
-{
- struct data_update *u = rbio_data_update(orig);
- if (!u)
- return false;
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
- unsigned i = 0;
- bkey_for_each_ptr(ptrs, ptr) {
- if (ptr->dev == dev &&
- u->data_opts.rewrite_ptrs & BIT(i))
- return true;
- i++;
- }
-
- return false;
-}
-
-static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
- struct bpos pos,
- struct bch_io_opts opts,
- unsigned flags,
- struct bch_io_failures *failed)
-{
- if (!have_io_error(failed)) {
- BUG_ON(!opts.promote_target);
-
- if (!(flags & BCH_READ_may_promote))
- return -BCH_ERR_nopromote_may_not;
-
- if (bch2_bkey_has_target(c, k, opts.promote_target))
- return -BCH_ERR_nopromote_already_promoted;
-
- if (bkey_extent_is_unwritten(k))
- return -BCH_ERR_nopromote_unwritten;
-
- if (bch2_target_congested(c, opts.promote_target))
- return -BCH_ERR_nopromote_congested;
- }
-
- if (rhashtable_lookup_fast(&c->promote_table, &pos,
- bch_promote_params))
- return -BCH_ERR_nopromote_in_flight;
-
- return 0;
-}
-
-static noinline void promote_free(struct bch_read_bio *rbio)
-{
- struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
- struct bch_fs *c = rbio->c;
-
- int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
- bch_promote_params);
- BUG_ON(ret);
-
- bch2_data_update_exit(&op->write);
-
- bch2_write_ref_put(c, BCH_WRITE_REF_promote);
- kfree_rcu(op, rcu);
-}
-
-static void promote_done(struct bch_write_op *wop)
-{
- struct promote_op *op = container_of(wop, struct promote_op, write.op);
- struct bch_fs *c = op->write.rbio.c;
-
- bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
- promote_free(&op->write.rbio);
-}
-
-static void promote_start_work(struct work_struct *work)
-{
- struct promote_op *op = container_of(work, struct promote_op, work);
-
- bch2_data_update_read_done(&op->write);
-}
-
-static noinline void promote_start(struct bch_read_bio *rbio)
-{
- struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
-
- trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);
-
- INIT_WORK(&op->work, promote_start_work);
- queue_work(rbio->c->write_ref_wq, &op->work);
-}
-
-static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
- enum btree_id btree_id,
- struct bkey_s_c k,
- struct bpos pos,
- struct extent_ptr_decoded *pick,
- unsigned sectors,
- struct bch_read_bio *orig,
- struct bch_io_failures *failed)
-{
- struct bch_fs *c = trans->c;
- int ret;
-
- struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };
-
- if (!have_io_error(failed)) {
- update_opts.target = orig->opts.promote_target;
- update_opts.extra_replicas = 1;
- update_opts.write_flags |= BCH_WRITE_cached;
- update_opts.write_flags |= BCH_WRITE_only_specified_devs;
- } else {
- update_opts.target = orig->opts.foreground_target;
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned ptr_bit = 1;
- bkey_for_each_ptr(ptrs, ptr) {
- if (bch2_dev_io_failures(failed, ptr->dev) &&
- !ptr_being_rewritten(orig, ptr->dev))
- update_opts.rewrite_ptrs |= ptr_bit;
- ptr_bit <<= 1;
- }
-
- if (!update_opts.rewrite_ptrs)
- return NULL;
- }
-
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
- return ERR_PTR(-BCH_ERR_nopromote_no_writes);
-
- struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
- if (!op) {
- ret = -BCH_ERR_nopromote_enomem;
- goto err_put;
- }
-
- op->start_time = local_clock();
- op->pos = pos;
-
- if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
- bch_promote_params)) {
- ret = -BCH_ERR_nopromote_in_flight;
- goto err;
- }
-
- ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
- writepoint_hashed((unsigned long) current),
- &orig->opts,
- update_opts,
- btree_id, k);
- /*
- * possible errors: -BCH_ERR_nocow_lock_blocked,
- * -BCH_ERR_ENOSPC_disk_reservation:
- */
- if (ret)
- goto err_remove_hash;
-
- rbio_init_fragment(&op->write.rbio.bio, orig);
- op->write.rbio.bounce = true;
- op->write.rbio.promote = true;
- op->write.op.end_io = promote_done;
-
- return &op->write.rbio;
-err_remove_hash:
- BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
- bch_promote_params));
-err:
- bio_free_pages(&op->write.op.wbio.bio);
- /* We may have added to the rhashtable and thus need rcu freeing: */
- kfree_rcu(op, rcu);
-err_put:
- bch2_write_ref_put(c, BCH_WRITE_REF_promote);
- return ERR_PTR(ret);
-}
-
-noinline
-static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
- struct bvec_iter iter,
- struct bkey_s_c k,
- struct extent_ptr_decoded *pick,
- unsigned flags,
- struct bch_read_bio *orig,
- bool *bounce,
- bool *read_full,
- struct bch_io_failures *failed)
-{
- struct bch_fs *c = trans->c;
- /*
- * if failed != NULL we're not actually doing a promote, we're
- * recovering from an io/checksum error
- */
- bool promote_full = (have_io_error(failed) ||
- *read_full ||
- READ_ONCE(c->opts.promote_whole_extents));
- /* data might have to be decompressed in the write path: */
- unsigned sectors = promote_full
- ? max(pick->crc.compressed_size, pick->crc.live_size)
- : bvec_iter_sectors(iter);
- struct bpos pos = promote_full
- ? bkey_start_pos(k.k)
- : POS(k.k->p.inode, iter.bi_sector);
- int ret;
-
- ret = should_promote(c, k, pos, orig->opts, flags, failed);
- if (ret)
- goto nopromote;
-
- struct bch_read_bio *promote =
- __promote_alloc(trans,
- k.k->type == KEY_TYPE_reflink_v
- ? BTREE_ID_reflink
- : BTREE_ID_extents,
- k, pos, pick, sectors, orig, failed);
- if (!promote)
- return NULL;
-
- ret = PTR_ERR_OR_ZERO(promote);
- if (ret)
- goto nopromote;
-
- *bounce = true;
- *read_full = promote_full;
- return promote;
-nopromote:
- trace_io_read_nopromote(c, ret);
- return NULL;
-}
-
-/* Read */
-
-static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
- struct bch_read_bio *rbio, struct bpos read_pos)
-{
- int ret = lockrestart_do(trans,
- bch2_inum_offset_err_msg_trans(trans, out,
- (subvol_inum) { rbio->subvol, read_pos.inode },
- read_pos.offset << 9));
- if (ret)
- return ret;
-
- if (rbio->data_update)
- prt_str(out, "(internal move) ");
-
- return 0;
-}
-
-static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
- struct bch_read_bio *rbio, struct bpos read_pos)
-{
- bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
-}
-
-enum rbio_context {
- RBIO_CONTEXT_NULL,
- RBIO_CONTEXT_HIGHPRI,
- RBIO_CONTEXT_UNBOUND,
-};
-
-static inline struct bch_read_bio *
-bch2_rbio_parent(struct bch_read_bio *rbio)
-{
- return rbio->split ? rbio->parent : rbio;
-}
-
-__always_inline
-static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
- enum rbio_context context,
- struct workqueue_struct *wq)
-{
- if (context <= rbio->context) {
- fn(&rbio->work);
- } else {
- rbio->work.func = fn;
- rbio->context = context;
- queue_work(wq, &rbio->work);
- }
-}
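
bch2_rbio_punt() runs the completion inline when the current execution context is already at least as permissive as required, and otherwise re-queues the work on a workqueue providing that context; the enum above is ordered so `<=` expresses "permissive enough". The pattern in isolation (invented names):

enum toy_ctx { TOY_CTX_IRQ, TOY_CTX_HIGHPRI, TOY_CTX_UNBOUND };

struct toy_work {
	void		(*fn)(struct toy_work *);
	enum toy_ctx	ctx;
};

static void toy_punt(struct toy_work *w, void (*fn)(struct toy_work *),
		     enum toy_ctx need, void (*queue)(struct toy_work *))
{
	if (need <= w->ctx) {
		fn(w);			/* current context is good enough */
	} else {
		w->fn = fn;		/* record what to run, escalate */
		w->ctx = need;
		queue(w);		/* run later from the needed context */
	}
}
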
-
-static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
-{
- BUG_ON(rbio->bounce && !rbio->split);
-
- if (rbio->have_ioref) {
- struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
- percpu_ref_put(&ca->io_ref);
- }
-
- if (rbio->split) {
- struct bch_read_bio *parent = rbio->parent;
-
- if (unlikely(rbio->promote)) {
- if (!rbio->bio.bi_status)
- promote_start(rbio);
- else
- promote_free(rbio);
- } else {
- if (rbio->bounce)
- bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
-
- bio_put(&rbio->bio);
- }
-
- rbio = parent;
- }
-
- return rbio;
-}
-
-/*
- * Only called on a top level bch_read_bio to complete an entire read request,
- * not a split:
- */
-static void bch2_rbio_done(struct bch_read_bio *rbio)
-{
- if (rbio->start_time)
- bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
- rbio->start_time);
- bio_endio(&rbio->bio);
-}
-
-static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
- struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter,
- struct bch_io_failures *failed,
- unsigned flags)
-{
- struct data_update *u = container_of(rbio, struct data_update, rbio);
-retry:
- bch2_trans_begin(trans);
-
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = lockrestart_do(trans,
- bkey_err(k = bch2_bkey_get_iter(trans, &iter,
- u->btree_id, bkey_start_pos(&u->k.k->k),
- 0)));
- if (ret)
- goto err;
-
- if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
- /* extent we wanted to read no longer exists: */
- rbio->ret = -BCH_ERR_data_read_key_overwritten;
- goto err;
- }
-
- ret = __bch2_read_extent(trans, rbio, bvec_iter,
- bkey_start_pos(&u->k.k->k),
- u->btree_id,
- bkey_i_to_s_c(u->k.k),
- 0, failed, flags, -1);
-err:
- bch2_trans_iter_exit(trans, &iter);
-
- if (bch2_err_matches(ret, BCH_ERR_data_read_retry))
- goto retry;
-
- if (ret) {
- rbio->bio.bi_status = BLK_STS_IOERR;
- rbio->ret = ret;
- }
-
- BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
- return ret;
-}
-
-static void bch2_rbio_retry(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct bvec_iter iter = rbio->bvec_iter;
- unsigned flags = rbio->flags;
- subvol_inum inum = {
- .subvol = rbio->subvol,
- .inum = rbio->read_pos.inode,
- };
- struct bch_io_failures failed = { .nr = 0 };
- struct btree_trans *trans = bch2_trans_get(c);
-
- trace_io_read_retry(&rbio->bio);
- this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
- bvec_iter_sectors(rbio->bvec_iter));
-
- if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
- bch2_mark_io_failure(&failed, &rbio->pick,
- rbio->ret == -BCH_ERR_data_read_retry_csum_err);
-
- if (!rbio->split) {
- rbio->bio.bi_status = 0;
- rbio->ret = 0;
- }
-
- unsigned subvol = rbio->subvol;
- struct bpos read_pos = rbio->read_pos;
-
- rbio = bch2_rbio_free(rbio);
-
- flags |= BCH_READ_in_retry;
- flags &= ~BCH_READ_may_promote;
- flags &= ~BCH_READ_last_fragment;
- flags |= BCH_READ_must_clone;
-
- int ret = rbio->data_update
- ? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
- : __bch2_read(trans, rbio, iter, inum, &failed, flags);
-
- if (ret) {
- rbio->ret = ret;
- rbio->bio.bi_status = BLK_STS_IOERR;
- } else {
- struct printbuf buf = PRINTBUF;
-
- lockrestart_do(trans,
- bch2_inum_offset_err_msg_trans(trans, &buf,
- (subvol_inum) { subvol, read_pos.inode },
- read_pos.offset << 9));
- if (rbio->data_update)
- prt_str(&buf, "(internal move) ");
- prt_str(&buf, "successful retry");
-
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- bch2_rbio_done(rbio);
- bch2_trans_put(trans);
-}
-
-static void bch2_rbio_error(struct bch_read_bio *rbio,
- int ret, blk_status_t blk_error)
-{
- BUG_ON(ret >= 0);
-
- rbio->ret = ret;
- rbio->bio.bi_status = blk_error;
-
- bch2_rbio_parent(rbio)->saw_error = true;
-
- if (rbio->flags & BCH_READ_in_retry)
- return;
-
- if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
- bch2_rbio_punt(rbio, bch2_rbio_retry,
- RBIO_CONTEXT_UNBOUND, system_unbound_wq);
- } else {
- rbio = bch2_rbio_free(rbio);
-
- rbio->ret = ret;
- rbio->bio.bi_status = blk_error;
-
- bch2_rbio_done(rbio);
- }
-}
-
-static void bch2_read_io_err(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bio *bio = &rbio->bio;
- struct bch_fs *c = rbio->c;
- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- struct printbuf buf = PRINTBUF;
-
- bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
- prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));
-
- if (ca)
- bch_err_ratelimited(ca, "%s", buf.buf);
- else
- bch_err_ratelimited(c, "%s", buf.buf);
-
- printbuf_exit(&buf);
- bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
-}
-
-static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
- struct bch_read_bio *rbio)
-{
- struct bch_fs *c = rbio->c;
- u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
- struct bch_extent_crc_unpacked new_crc;
- struct btree_iter iter;
- struct bkey_i *new;
- struct bkey_s_c k;
- int ret = 0;
-
- if (crc_is_compressed(rbio->pick.crc))
- return 0;
-
- k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
- BTREE_ITER_slots|BTREE_ITER_intent);
- if ((ret = bkey_err(k)))
- goto out;
-
- if (bversion_cmp(k.k->bversion, rbio->version) ||
- !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
- goto out;
-
- /* Extent was merged? */
- if (bkey_start_offset(k.k) < data_offset ||
- k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
- goto out;
-
- if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
- rbio->pick.crc, NULL, &new_crc,
- bkey_start_offset(k.k) - data_offset, k.k->size,
- rbio->pick.crc.csum_type)) {
- bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
- ret = 0;
- goto out;
- }
-
- /*
- * going to be temporarily appending another checksum entry:
- */
- new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
- sizeof(struct bch_extent_crc128));
- if ((ret = PTR_ERR_OR_ZERO(new)))
- goto out;
-
- bkey_reassemble(new, k);
-
- if (!bch2_bkey_narrow_crcs(new, new_crc))
- goto out;
-
- ret = bch2_trans_update(trans, &iter, new,
- BTREE_UPDATE_internal_snapshot_node);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
-{
- bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_rbio_narrow_crcs(trans, rbio));
-}
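-
-/*
- * A self-contained sketch of what narrowing buys us (the struct and
- * helper below are illustrative only, not part of this file): an
- * extent's checksum initially covers everything that was written, but
- * once only a window of it is still live, rechecksumming just that
- * window means future reads don't have to read and checksum the whole
- * extent:
- */
-struct crc_window_sketch {
-	unsigned	offset;		/* start of live data, in sectors */
-	unsigned	live_size;	/* live sectors */
-	unsigned	covered_size;	/* sectors the checksum covers */
-};
-
-static inline struct crc_window_sketch
-crc_narrow_sketch(struct crc_window_sketch crc)
-{
-	/* after narrowing, the checksum covers exactly the live window: */
-	crc.covered_size	= crc.live_size;
-	crc.offset		= 0;
-	return crc;
-}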
-
-static void bch2_read_csum_err(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct bio *src = &rbio->bio;
- struct bch_extent_crc_unpacked crc = rbio->pick.crc;
- struct nonce nonce = extent_nonce(rbio->version, crc);
- struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
- struct printbuf buf = PRINTBUF;
-
- bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
- prt_str(&buf, "data ");
- bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
-
- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- if (ca)
- bch_err_ratelimited(ca, "%s", buf.buf);
- else
- bch_err_ratelimited(c, "%s", buf.buf);
-
- bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
- printbuf_exit(&buf);
-}
-
-static void bch2_read_decompress_err(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct printbuf buf = PRINTBUF;
-
- bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
- prt_str(&buf, "decompression error");
-
- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- if (ca)
- bch_err_ratelimited(ca, "%s", buf.buf);
- else
- bch_err_ratelimited(c, "%s", buf.buf);
-
- bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
- printbuf_exit(&buf);
-}
-
-static void bch2_read_decrypt_err(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct printbuf buf = PRINTBUF;
-
- bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
- prt_str(&buf, "decrypt error");
-
- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- if (ca)
- bch_err_ratelimited(ca, "%s", buf.buf);
- else
- bch_err_ratelimited(c, "%s", buf.buf);
-
- bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
- printbuf_exit(&buf);
-}
-
-/* Inner part that may run in process context */
-static void __bch2_read_endio(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- struct bch_read_bio *parent = bch2_rbio_parent(rbio);
- struct bio *src = &rbio->bio;
- struct bio *dst = &parent->bio;
- struct bvec_iter dst_iter = rbio->bvec_iter;
- struct bch_extent_crc_unpacked crc = rbio->pick.crc;
- struct nonce nonce = extent_nonce(rbio->version, crc);
- unsigned nofs_flags;
- struct bch_csum csum;
- int ret;
-
- nofs_flags = memalloc_nofs_save();
-
- /* Reset iterator for checksumming and copying bounced data: */
- if (rbio->bounce) {
- src->bi_iter.bi_size = crc.compressed_size << 9;
- src->bi_iter.bi_idx = 0;
- src->bi_iter.bi_bvec_done = 0;
- } else {
- src->bi_iter = rbio->bvec_iter;
- }
-
- bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);
-
- csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
- bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;
-
- /*
- * Checksum error: if the bio wasn't bounced, we may have been
- * reading into buffers owned by userspace (that userspace can
- * scribble over) - retry the read, bouncing it this time:
- */
- if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
- rbio->flags |= BCH_READ_must_bounce;
- bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
- BLK_STS_IOERR);
- goto out;
- }
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
-
- if (!csum_good)
- goto csum_err;
-
- /*
- * XXX
- * We need to rework the narrow_crcs path to deliver the read completion
- * first, and then punt to a different workqueue, otherwise we're
- * holding up reads while doing btree updates which is bad for memory
- * reclaim.
- */
- if (unlikely(rbio->narrow_crcs))
- bch2_rbio_narrow_crcs(rbio);
-
- if (likely(!parent->data_update)) {
- /* Adjust crc to point to subset of data we want: */
- crc.offset += rbio->offset_into_extent;
- crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
-
- if (crc_is_compressed(crc)) {
- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
- if (ret)
- goto decrypt_err;
-
- if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
- !c->opts.no_data_io)
- goto decompression_err;
- } else {
- /* don't need to decrypt the entire bio: */
- nonce = nonce_add(nonce, crc.offset << 9);
- bio_advance(src, crc.offset << 9);
-
- BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
- src->bi_iter.bi_size = dst_iter.bi_size;
-
- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
- if (ret)
- goto decrypt_err;
-
- if (rbio->bounce) {
- struct bvec_iter src_iter = src->bi_iter;
-
- bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
- }
- }
- } else {
- if (rbio->split)
- rbio->parent->pick = rbio->pick;
-
- if (rbio->bounce) {
- struct bvec_iter src_iter = src->bi_iter;
-
- bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
- }
- }
-
- if (rbio->promote) {
- /*
-		 * Re-encrypt data we decrypted, so it's consistent with
- * rbio->crc:
- */
- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
- if (ret)
- goto decrypt_err;
- }
-
- if (likely(!(rbio->flags & BCH_READ_in_retry))) {
- rbio = bch2_rbio_free(rbio);
- bch2_rbio_done(rbio);
- }
-out:
- memalloc_nofs_restore(nofs_flags);
- return;
-csum_err:
- bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
- goto out;
-decompression_err:
- bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
- goto out;
-decrypt_err:
- bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
- goto out;
-}
-
-static void bch2_read_endio(struct bio *bio)
-{
- struct bch_read_bio *rbio =
- container_of(bio, struct bch_read_bio, bio);
- struct bch_fs *c = rbio->c;
- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- struct workqueue_struct *wq = NULL;
- enum rbio_context context = RBIO_CONTEXT_NULL;
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
- rbio->submit_time, !bio->bi_status);
-
- if (!rbio->split)
- rbio->bio.bi_end_io = rbio->end_io;
-
- if (unlikely(bio->bi_status)) {
- bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
- return;
- }
-
- if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
- (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
- trace_and_count(c, io_read_reuse_race, &rbio->bio);
-
- if (rbio->flags & BCH_READ_retry_if_stale)
- bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
- else
- bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
- return;
- }
-
- if (rbio->narrow_crcs ||
- rbio->promote ||
- crc_is_compressed(rbio->pick.crc) ||
- bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
- context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
- else if (rbio->pick.crc.csum_type)
- context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
-
- bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
-}
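-
-/*
- * The routing decision above, restated as a standalone helper
- * (illustrative, not part of this file): anything that may take btree
- * locks or allocate (narrowing, promotes), or that must decompress or
- * decrypt, goes to the unbound workqueue; plain checksum verification
- * goes to the highpri workqueue; raw reads complete inline:
- */
-static inline int read_completion_context_sketch(bool narrow_crcs,
-						 bool promote,
-						 bool compressed,
-						 bool encrypted,
-						 bool csummed)
-{
-	if (narrow_crcs || promote || compressed || encrypted)
-		return 2;	/* RBIO_CONTEXT_UNBOUND, system_unbound_wq */
-	if (csummed)
-		return 1;	/* RBIO_CONTEXT_HIGHPRI, system_highpri_wq */
-	return 0;		/* RBIO_CONTEXT_NULL: complete in the endio path */
-}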
-
-static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bkey_s_c k,
- struct bch_extent_ptr ptr)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct printbuf buf = PRINTBUF;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- PTR_BUCKET_POS(ca, &ptr),
- BTREE_ITER_cached);
-
- int gen = bucket_gen_get(ca, iter.pos.offset);
- if (gen >= 0) {
- prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
- printbuf_indent_add(&buf, 2);
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
-
- prt_printf(&buf, "memory gen: %u", gen);
-
- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
- if (!ret) {
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, k);
- }
- } else {
- prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
- iter.pos.inode, iter.pos.offset);
- printbuf_indent_add(&buf, 2);
-
- prt_printf(&buf, "first bucket %u nbuckets %llu\n",
- ca->mi.first_bucket, ca->mi.nbuckets);
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
- }
-
- bch2_fs_inconsistent(c, "%s", buf.buf);
-
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
-}
-
-int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
- struct bvec_iter iter, struct bpos read_pos,
- enum btree_id data_btree, struct bkey_s_c k,
- unsigned offset_into_extent,
- struct bch_io_failures *failed, unsigned flags, int dev)
-{
- struct bch_fs *c = trans->c;
- struct extent_ptr_decoded pick;
- struct bch_read_bio *rbio = NULL;
- bool bounce = false, read_full = false, narrow_crcs = false;
- struct bpos data_pos = bkey_start_pos(k.k);
- struct data_update *u = rbio_data_update(orig);
- int ret = 0;
-
- if (bkey_extent_is_inline_data(k.k)) {
- unsigned bytes = min_t(unsigned, iter.bi_size,
- bkey_inline_data_bytes(k.k));
-
- swap(iter.bi_size, bytes);
- memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
- swap(iter.bi_size, bytes);
- bio_advance_iter(&orig->bio, &iter, bytes);
- zero_fill_bio_iter(&orig->bio, iter);
- this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
- bvec_iter_sectors(iter));
- goto out_read_done;
- }
-retry_pick:
- ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);
-
- /* hole or reservation - just zero fill: */
- if (!ret)
- goto hole;
-
- if (unlikely(ret < 0)) {
- struct printbuf buf = PRINTBUF;
- bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
- prt_printf(&buf, "%s\n ", bch2_err_str(ret));
- bch2_bkey_val_to_text(&buf, c, k);
-
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- goto err;
- }
-
- if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) {
- struct printbuf buf = PRINTBUF;
- bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
- prt_printf(&buf, "attempting to read encrypted data without encryption key\n ");
- bch2_bkey_val_to_text(&buf, c, k);
-
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- ret = -BCH_ERR_data_read_no_encryption_key;
- goto err;
- }
-
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
-
- /*
- * Stale dirty pointers are treated as IO errors, but @failed isn't
- * allocated unless we're in the retry path - so if we're not in the
- * retry path, don't check here, it'll be caught in bch2_read_endio()
- * and we'll end up in the retry path:
- */
- if ((flags & BCH_READ_in_retry) &&
- !pick.ptr.cached &&
- ca &&
- unlikely(dev_ptr_stale(ca, &pick.ptr))) {
- read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
- bch2_mark_io_failure(failed, &pick, false);
- percpu_ref_put(&ca->io_ref);
- goto retry_pick;
- }
-
- if (likely(!u)) {
- if (!(flags & BCH_READ_last_fragment) ||
- bio_flagged(&orig->bio, BIO_CHAIN))
- flags |= BCH_READ_must_clone;
-
- narrow_crcs = !(flags & BCH_READ_in_retry) &&
- bch2_can_narrow_extent_crcs(k, pick.crc);
-
- if (narrow_crcs && (flags & BCH_READ_user_mapped))
- flags |= BCH_READ_must_bounce;
-
- EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
-
- if (crc_is_compressed(pick.crc) ||
- (pick.crc.csum_type != BCH_CSUM_none &&
- (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
- (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
- (flags & BCH_READ_user_mapped)) ||
- (flags & BCH_READ_must_bounce)))) {
- read_full = true;
- bounce = true;
- }
- } else {
- /*
- * can happen if we retry, and the extent we were going to read
- * has been merged in the meantime:
- */
- if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
- if (ca)
- percpu_ref_put(&ca->io_ref);
-			orig->ret = -BCH_ERR_data_read_buffer_too_small;
- goto out_read_done;
- }
-
- iter.bi_size = pick.crc.compressed_size << 9;
- read_full = true;
- }
-
- if (orig->opts.promote_target || have_io_error(failed))
- rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
- &bounce, &read_full, failed);
-
- if (!read_full) {
- EBUG_ON(crc_is_compressed(pick.crc));
- EBUG_ON(pick.crc.csum_type &&
- (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
- bvec_iter_sectors(iter) != pick.crc.live_size ||
- pick.crc.offset ||
- offset_into_extent));
-
- data_pos.offset += offset_into_extent;
- pick.ptr.offset += pick.crc.offset +
- offset_into_extent;
- offset_into_extent = 0;
- pick.crc.compressed_size = bvec_iter_sectors(iter);
- pick.crc.uncompressed_size = bvec_iter_sectors(iter);
- pick.crc.offset = 0;
- pick.crc.live_size = bvec_iter_sectors(iter);
- }
-
- if (rbio) {
- /*
- * promote already allocated bounce rbio:
- * promote needs to allocate a bio big enough for uncompressing
- * data in the write path, but we're not going to use it all
- * here:
- */
- EBUG_ON(rbio->bio.bi_iter.bi_size <
- pick.crc.compressed_size << 9);
- rbio->bio.bi_iter.bi_size =
- pick.crc.compressed_size << 9;
- } else if (bounce) {
- unsigned sectors = pick.crc.compressed_size;
-
- rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
- DIV_ROUND_UP(sectors, PAGE_SECTORS),
- 0,
- GFP_NOFS,
- &c->bio_read_split),
- orig);
-
- bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
- rbio->bounce = true;
- } else if (flags & BCH_READ_must_clone) {
- /*
-		 * Have to clone if there were any splits, due to error
-		 * reporting issues: if a split errored and retrying didn't
-		 * work, then when it reports the error to its parent (us) we
-		 * don't know whether the error was from our bio (and we should
-		 * retry) or from the whole bio, in which case we don't want to
-		 * retry and lose the error.
- */
- rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
- &c->bio_read_split),
- orig);
- rbio->bio.bi_iter = iter;
- } else {
- rbio = orig;
- rbio->bio.bi_iter = iter;
- EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
- }
-
- EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
-
- rbio->submit_time = local_clock();
- if (!rbio->split)
- rbio->end_io = orig->bio.bi_end_io;
- rbio->bvec_iter = iter;
-	rbio->offset_into_extent = offset_into_extent;
- rbio->flags = flags;
- rbio->have_ioref = ca != NULL;
- rbio->narrow_crcs = narrow_crcs;
- rbio->ret = 0;
- rbio->context = 0;
- rbio->pick = pick;
- rbio->subvol = orig->subvol;
- rbio->read_pos = read_pos;
- rbio->data_btree = data_btree;
- rbio->data_pos = data_pos;
- rbio->version = k.k->bversion;
- INIT_WORK(&rbio->work, NULL);
-
- rbio->bio.bi_opf = orig->bio.bi_opf;
- rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
- rbio->bio.bi_end_io = bch2_read_endio;
-
- if (rbio->bounce)
- trace_and_count(c, io_read_bounce, &rbio->bio);
-
- if (!u)
- this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
- else
- this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
- bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
-
- /*
- * If it's being moved internally, we don't want to flag it as a cache
- * hit:
- */
- if (ca && pick.ptr.cached && !u)
- bch2_bucket_io_time_reset(trans, pick.ptr.dev,
- PTR_BUCKET_NR(ca, &pick.ptr), READ);
-
- if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
- bio_inc_remaining(&orig->bio);
- trace_and_count(c, io_read_split, &orig->bio);
- }
-
- /*
- * Unlock the iterator while the btree node's lock is still in
- * cache, before doing the IO:
- */
- if (!(flags & BCH_READ_in_retry))
- bch2_trans_unlock(trans);
- else
- bch2_trans_unlock_long(trans);
-
- if (likely(!rbio->pick.do_ec_reconstruct)) {
- if (unlikely(!rbio->have_ioref)) {
- struct printbuf buf = PRINTBUF;
- bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
- prt_printf(&buf, "no device to read from:\n ");
- bch2_bkey_val_to_text(&buf, c, k);
-
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
-
- bch2_rbio_error(rbio,
- -BCH_ERR_data_read_retry_device_offline,
- BLK_STS_IOERR);
- goto out;
- }
-
- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
- bio_sectors(&rbio->bio));
- bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
-
- if (unlikely(c->opts.no_data_io)) {
- if (likely(!(flags & BCH_READ_in_retry)))
- bio_endio(&rbio->bio);
- } else {
- if (likely(!(flags & BCH_READ_in_retry)))
- submit_bio(&rbio->bio);
- else
- submit_bio_wait(&rbio->bio);
- }
-
- /*
- * We just submitted IO which may block, we expect relock fail
- * events and shouldn't count them:
- */
- trans->notrace_relock_fail = true;
- } else {
- /* Attempting reconstruct read: */
- if (bch2_ec_read_extent(trans, rbio, k)) {
- bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
- BLK_STS_IOERR);
- goto out;
- }
-
- if (likely(!(flags & BCH_READ_in_retry)))
- bio_endio(&rbio->bio);
- }
-out:
- if (likely(!(flags & BCH_READ_in_retry))) {
- return 0;
- } else {
- bch2_trans_unlock(trans);
-
- int ret;
-
- rbio->context = RBIO_CONTEXT_UNBOUND;
- bch2_read_endio(&rbio->bio);
-
- ret = rbio->ret;
- rbio = bch2_rbio_free(rbio);
-
- if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
- bch2_mark_io_failure(failed, &pick,
- ret == -BCH_ERR_data_read_retry_csum_err);
-
- return ret;
- }
-
-err:
- if (flags & BCH_READ_in_retry)
- return ret;
-
- orig->bio.bi_status = BLK_STS_IOERR;
- orig->ret = ret;
- goto out_read_done;
-
-hole:
- this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
- bvec_iter_sectors(iter));
- /*
- * won't normally happen in the data update (bch2_move_extent()) path,
- * but if we retry and the extent we wanted to read no longer exists we
- * have to signal that:
- */
- if (u)
- orig->ret = -BCH_ERR_data_read_key_overwritten;
-
- zero_fill_bio_iter(&orig->bio, iter);
-out_read_done:
- if ((flags & BCH_READ_last_fragment) &&
- !(flags & BCH_READ_in_retry))
- bch2_rbio_done(orig);
- return 0;
-}
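-
-/*
- * The bi_size swap idiom used above (for inline data, and again for
- * bounding each fragment in __bch2_read()) deserves spelling out; a
- * sketch, with do_something() standing in for the bounded operation:
- *
- *	swap(iter.bi_size, bytes);		// iter now covers @bytes only
- *	do_something(bio, iter);
- *	swap(iter.bi_size, bytes);		// restore; @bytes is unchanged
- *	bio_advance_iter(bio, &iter, bytes);	// step past what we consumed
- */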
-
-int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, subvol_inum inum,
- struct bch_io_failures *failed, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_buf sk;
- struct bkey_s_c k;
- int ret;
-
- EBUG_ON(rbio->data_update);
-
- bch2_bkey_buf_init(&sk);
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- POS(inum.inum, bvec_iter.bi_sector),
- BTREE_ITER_slots);
-
- while (1) {
- enum btree_id data_btree = BTREE_ID_extents;
-
- bch2_trans_begin(trans);
-
- u32 snapshot;
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- bch2_btree_iter_set_snapshot(&iter, snapshot);
-
- bch2_btree_iter_set_pos(&iter,
- POS(inum.inum, bvec_iter.bi_sector));
-
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- s64 offset_into_extent = iter.pos.offset -
- bkey_start_offset(k.k);
- unsigned sectors = k.k->size - offset_into_extent;
-
- bch2_bkey_buf_reassemble(&sk, c, k);
-
- ret = bch2_read_indirect_extent(trans, &data_btree,
- &offset_into_extent, &sk);
- if (ret)
- goto err;
-
- k = bkey_i_to_s_c(sk.k);
-
- /*
- * With indirect extents, the amount of data to read is the min
- * of the original extent and the indirect extent:
- */
- sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
-
- unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
- swap(bvec_iter.bi_size, bytes);
-
- if (bvec_iter.bi_size == bytes)
- flags |= BCH_READ_last_fragment;
-
- ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
- data_btree, k,
- offset_into_extent, failed, flags, -1);
- if (ret)
- goto err;
-
- if (flags & BCH_READ_last_fragment)
- break;
-
- swap(bvec_iter.bi_size, bytes);
- bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
-err:
- if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
- flags |= BCH_READ_must_bounce;
-
- if (ret &&
- !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
- !bch2_err_matches(ret, BCH_ERR_data_read_retry))
- break;
- }
-
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret) {
- struct printbuf buf = PRINTBUF;
- lockrestart_do(trans,
- bch2_inum_offset_err_msg_trans(trans, &buf, inum,
- bvec_iter.bi_sector << 9));
- prt_printf(&buf, "read error: %s", bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
-
- rbio->bio.bi_status = BLK_STS_IOERR;
- rbio->ret = ret;
-
- if (!(flags & BCH_READ_in_retry))
- bch2_rbio_done(rbio);
- }
-
- bch2_bkey_buf_exit(&sk, c);
- return ret;
-}
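-
-/*
- * Worked example for the loop above (hypothetical numbers): a read
- * starting at sector 100 lands in a 32-sector extent spanning [96, 128),
- * so
- *
- *	offset_into_extent = 100 - 96 = 4
- *	sectors = 32 - 4 = 28
- *
- * A request for 16 more sectors fits entirely within this extent, so
- * it's the last fragment; a request for 40 consumes the remaining 28
- * sectors here, then advances to sector 128 to look up the next extent.
- */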
-
-void bch2_fs_io_read_exit(struct bch_fs *c)
-{
- if (c->promote_table.tbl)
- rhashtable_destroy(&c->promote_table);
- bioset_exit(&c->bio_read_split);
- bioset_exit(&c->bio_read);
-}
-
-int bch2_fs_io_read_init(struct bch_fs *c)
-{
- if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
- BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_bio_read_init;
-
- if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
- BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_bio_read_split_init;
-
- if (rhashtable_init(&c->promote_table, &bch_promote_params))
- return -BCH_ERR_ENOMEM_promote_table_init;
-
- return 0;
-}
diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h
deleted file mode 100644
index cd21950417f6..000000000000
--- a/fs/bcachefs/io_read.h
+++ /dev/null
@@ -1,196 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_READ_H
-#define _BCACHEFS_IO_READ_H
-
-#include "bkey_buf.h"
-#include "btree_iter.h"
-#include "reflink.h"
-
-struct bch_read_bio {
- struct bch_fs *c;
- u64 start_time;
- u64 submit_time;
-
- /*
- * Reads will often have to be split, and if the extent being read from
- * was checksummed or compressed we'll also have to allocate bounce
- * buffers and copy the data back into the original bio.
- *
- * If we didn't have to split, we have to save and restore the original
- * bi_end_io - @split below indicates which:
- */
- union {
- struct bch_read_bio *parent;
- bio_end_io_t *end_io;
- };
-
- /*
- * Saved copy of bio->bi_iter, from submission time - allows us to
- * resubmit on IO error, and also to copy data back to the original bio
- * when we're bouncing:
- */
- struct bvec_iter bvec_iter;
-
- unsigned offset_into_extent;
-
- u16 flags;
- union {
- struct {
- u16 data_update:1,
- promote:1,
- bounce:1,
- split:1,
- have_ioref:1,
- narrow_crcs:1,
- saw_error:1,
- context:2;
- };
- u16 _state;
- };
- s16 ret;
-
- struct extent_ptr_decoded pick;
-
- /*
- * pos we read from - different from data_pos for indirect extents:
- */
- u32 subvol;
- struct bpos read_pos;
-
- /*
- * start pos of data we read (may not be pos of data we want) - for
- * promote, narrow extents paths:
- */
- enum btree_id data_btree;
- struct bpos data_pos;
- struct bversion version;
-
- struct bch_io_opts opts;
-
- struct work_struct work;
-
- struct bio bio;
-};
-
-#define to_rbio(_bio) container_of((_bio), struct bch_read_bio, bio)
-
-struct bch_devs_mask;
-struct cache_promote_op;
-struct extent_ptr_decoded;
-
-static inline int bch2_read_indirect_extent(struct btree_trans *trans,
- enum btree_id *data_btree,
- s64 *offset_into_extent,
- struct bkey_buf *extent)
-{
- if (extent->k->k.type != KEY_TYPE_reflink_p)
- return 0;
-
- *data_btree = BTREE_ID_reflink;
- struct btree_iter iter;
- struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter,
- offset_into_extent,
- bkey_i_to_s_c_reflink_p(extent->k),
- true, 0);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (bkey_deleted(k.k)) {
- bch2_trans_iter_exit(trans, &iter);
- return -BCH_ERR_missing_indirect_extent;
- }
-
- bch2_bkey_buf_reassemble(extent, trans->c, k);
- bch2_trans_iter_exit(trans, &iter);
- return 0;
-}
-
-#define BCH_READ_FLAGS() \
- x(retry_if_stale) \
- x(may_promote) \
- x(user_mapped) \
- x(last_fragment) \
- x(must_bounce) \
- x(must_clone) \
- x(in_retry)
-
-enum __bch_read_flags {
-#define x(n) __BCH_READ_##n,
- BCH_READ_FLAGS()
-#undef x
-};
-
-enum bch_read_flags {
-#define x(n) BCH_READ_##n = BIT(__BCH_READ_##n),
- BCH_READ_FLAGS()
-#undef x
-};
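-
-/*
- * For reference, the x-macros above expand to (abbreviated):
- *
- *	enum __bch_read_flags {
- *		__BCH_READ_retry_if_stale,
- *		__BCH_READ_may_promote,
- *		...
- *	};
- *
- *	enum bch_read_flags {
- *		BCH_READ_retry_if_stale	= BIT(__BCH_READ_retry_if_stale),
- *		BCH_READ_may_promote	= BIT(__BCH_READ_may_promote),
- *		...
- *	};
- *
- * i.e. each flag gets the next bit, and all seven fit comfortably in
- * the u16 rbio->flags.
- */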
-
-int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
- struct bvec_iter, struct bpos, enum btree_id,
- struct bkey_s_c, unsigned,
- struct bch_io_failures *, unsigned, int);
-
-static inline void bch2_read_extent(struct btree_trans *trans,
- struct bch_read_bio *rbio, struct bpos read_pos,
- enum btree_id data_btree, struct bkey_s_c k,
- unsigned offset_into_extent, unsigned flags)
-{
- __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
- data_btree, k, offset_into_extent, NULL, flags, -1);
-}
-
-int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter,
- subvol_inum, struct bch_io_failures *, unsigned flags);
-
-static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
- subvol_inum inum)
-{
- BUG_ON(rbio->_state);
-
- rbio->subvol = inum.subvol;
-
- bch2_trans_run(c,
- __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL,
- BCH_READ_retry_if_stale|
- BCH_READ_may_promote|
- BCH_READ_user_mapped));
-}
-
-static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
- struct bch_read_bio *orig)
-{
- struct bch_read_bio *rbio = to_rbio(bio);
-
- rbio->c = orig->c;
- rbio->_state = 0;
- rbio->flags = 0;
- rbio->ret = 0;
- rbio->split = true;
- rbio->parent = orig;
- rbio->opts = orig->opts;
- return rbio;
-}
-
-static inline struct bch_read_bio *rbio_init(struct bio *bio,
- struct bch_fs *c,
- struct bch_io_opts opts,
- bio_end_io_t end_io)
-{
- struct bch_read_bio *rbio = to_rbio(bio);
-
- rbio->start_time = local_clock();
- rbio->c = c;
- rbio->_state = 0;
- rbio->flags = 0;
- rbio->ret = 0;
- rbio->opts = opts;
- rbio->bio.bi_end_io = end_io;
- return rbio;
-}
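-
-/*
- * Typical usage, sketched (my_end_io is a caller-supplied completion;
- * bio setup and error handling omitted):
- *
- *	struct bch_read_bio *rbio = rbio_init(bio, c, opts, my_end_io);
- *	bch2_read(c, rbio, inum);
- *
- * rbio_init() is for the root rbio owned by the caller; splits and
- * bounces use rbio_init_fragment(), which forwards completion to @orig
- * instead of taking an end_io of its own.
- */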
-
-void bch2_fs_io_read_exit(struct bch_fs *);
-int bch2_fs_io_read_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_IO_READ_H */
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
deleted file mode 100644
index 29671075e3f1..000000000000
--- a/fs/bcachefs/io_write.c
+++ /dev/null
@@ -1,1727 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "bset.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "clock.h"
-#include "compress.h"
-#include "debug.h"
-#include "ec.h"
-#include "error.h"
-#include "extent_update.h"
-#include "inode.h"
-#include "io_write.h"
-#include "journal.h"
-#include "keylist.h"
-#include "move.h"
-#include "nocow_locking.h"
-#include "rebalance.h"
-#include "subvolume.h"
-#include "super.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/blkdev.h>
-#include <linux/prefetch.h>
-#include <linux/random.h>
-#include <linux/sched/mm.h>
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-static unsigned bch2_write_corrupt_ratio;
-module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644);
-MODULE_PARM_DESC(write_corrupt_ratio, "");
-#endif
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-
-static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
- u64 now, int rw)
-{
- u64 latency_capable =
- ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
- /* ideally we'd be taking into account the device's variance here: */
- u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
- s64 latency_over = io_latency - latency_threshold;
-
- if (latency_threshold && latency_over > 0) {
- /*
- * bump up congested by approximately latency_over * 4 /
- * latency_threshold - we don't need much accuracy here so don't
- * bother with the divide:
- */
- if (atomic_read(&ca->congested) < CONGESTED_MAX)
- atomic_add(latency_over >>
- max_t(int, ilog2(latency_threshold) - 2, 0),
- &ca->congested);
-
- ca->congested_last = now;
- } else if (atomic_read(&ca->congested) > 0) {
- atomic_dec(&ca->congested);
- }
-}
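-
-/*
- * Worked example of the shift approximation above (hypothetical
- * numbers): with latency_threshold = 1024 and latency_over = 512
- * (arbitrary time units),
- *
- *	ilog2(1024) - 2 = 8, so we add 512 >> 8 = 2 to congested;
- *	the exact computation would be 512 * 4 / 1024 = 2
- *
- * The shift only differs from the divide when the threshold isn't a
- * power of two, and then by less than a factor of two - fine for a
- * congestion heuristic.
- */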
-
-void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
-{
- atomic64_t *latency = &ca->cur_latency[rw];
- u64 now = local_clock();
- u64 io_latency = time_after64(now, submit_time)
- ? now - submit_time
- : 0;
- u64 old, new;
-
- old = atomic64_read(latency);
- do {
- /*
- * If the io latency was reasonably close to the current
- * latency, skip doing the update and atomic operation - most of
- * the time:
- */
- if (abs((int) (old - io_latency)) < (old >> 1) &&
- now & ~(~0U << 5))
- break;
-
- new = ewma_add(old, io_latency, 5);
- } while (!atomic64_try_cmpxchg(latency, &old, new));
-
- bch2_congested_acct(ca, io_latency, now, rw);
-
- __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
-}
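-
-/*
- * A minimal sketch of the smoothing above, assuming ewma_add() is the
- * usual shift-based exponentially weighted moving average (the real
- * helper lives elsewhere in the tree).  With weight = 5, each sample
- * moves the average by roughly 1/32 of the difference:
- */
-static inline u64 ewma_add_sketch(u64 ewma, u64 val, unsigned weight)
-{
-	/* new = old - old/2^weight + val/2^weight */
-	return ewma - (ewma >> weight) + (val >> weight);
-}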
-
-#endif
-
-/* Allocate, free from mempool: */
-
-void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
-{
- struct bvec_iter_all iter;
- struct bio_vec *bv;
-
- bio_for_each_segment_all(bv, bio, iter)
- if (bv->bv_page != ZERO_PAGE(0))
- mempool_free(bv->bv_page, &c->bio_bounce_pages);
- bio->bi_vcnt = 0;
-}
-
-static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
-{
- struct page *page;
-
- if (likely(!*using_mempool)) {
- page = alloc_page(GFP_NOFS);
- if (unlikely(!page)) {
- mutex_lock(&c->bio_bounce_pages_lock);
- *using_mempool = true;
- goto pool_alloc;
-		}
- } else {
-pool_alloc:
- page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
- }
-
- return page;
-}
-
-void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
- size_t size)
-{
- bool using_mempool = false;
-
- while (size) {
- struct page *page = __bio_alloc_page_pool(c, &using_mempool);
- unsigned len = min_t(size_t, PAGE_SIZE, size);
-
- BUG_ON(!bio_add_page(bio, page, len, 0));
- size -= len;
- }
-
- if (using_mempool)
- mutex_unlock(&c->bio_bounce_pages_lock);
-}
-
-/* Extent update path: */
-
-int bch2_sum_sector_overwrites(struct btree_trans *trans,
- struct btree_iter *extent_iter,
- struct bkey_i *new,
- bool *usage_increasing,
- s64 *i_sectors_delta,
- s64 *disk_sectors_delta)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c old;
- unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
- bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
- int ret = 0;
-
- *usage_increasing = false;
- *i_sectors_delta = 0;
- *disk_sectors_delta = 0;
-
- bch2_trans_copy_iter(&iter, extent_iter);
-
- for_each_btree_key_max_continue_norestart(iter,
- new->k.p, BTREE_ITER_slots, old, ret) {
- s64 sectors = min(new->k.p.offset, old.k->p.offset) -
- max(bkey_start_offset(&new->k),
- bkey_start_offset(old.k));
-
- *i_sectors_delta += sectors *
- (bkey_extent_is_allocation(&new->k) -
- bkey_extent_is_allocation(old.k));
-
- *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
- *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
- ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
- : 0;
-
- if (!*usage_increasing &&
- (new->k.p.snapshot != old.k->p.snapshot ||
- new_replicas > bch2_bkey_replicas(c, old) ||
- (!new_compressed && bch2_bkey_sectors_compressed(old))))
- *usage_increasing = true;
-
- if (bkey_ge(old.k->p, new->k.p))
- break;
- }
-
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
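-
-/*
- * The per-key overlap computation above, in isolation (illustrative
- * helper; extents are half-open sector ranges [start, p.offset), and
- * the loop only visits keys that actually overlap @new):
- */
-static inline s64 extent_overlap_sectors_sketch(u64 new_start, u64 new_end,
-						u64 old_start, u64 old_end)
-{
-	/* e.g. new [8, 24) vs old [16, 32) overlap in [16, 24) = 8 sectors */
-	return (s64) (min(new_end, old_end) - max(new_start, old_start));
-}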
-
-static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
- struct btree_iter *extent_iter,
- u64 new_i_size,
- s64 i_sectors_delta)
-{
- /*
- * Crazy performance optimization:
- * Every extent update needs to also update the inode: the inode trigger
- * will set bi->journal_seq to the journal sequence number of this
- * transaction - for fsync.
- *
- * But if that's the only reason we're updating the inode (we're not
- * updating bi_size or bi_sectors), then we don't need the inode update
- * to be journalled - if we crash, the bi_journal_seq update will be
- * lost, but that's fine.
- */
- unsigned inode_update_flags = BTREE_UPDATE_nojournal;
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0,
- extent_iter->pos.inode,
- extent_iter->snapshot),
- BTREE_ITER_intent|
- BTREE_ITER_cached);
- int ret = bkey_err(k);
- if (unlikely(ret))
- return ret;
-
- /*
- * varint_decode_fast(), in the inode .invalid method, reads up to 7
- * bytes past the end of the buffer:
- */
- struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8);
- ret = PTR_ERR_OR_ZERO(k_mut);
- if (unlikely(ret))
- goto err;
-
- bkey_reassemble(k_mut, k);
-
- if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) {
- k_mut = bch2_inode_to_v3(trans, k_mut);
- ret = PTR_ERR_OR_ZERO(k_mut);
- if (unlikely(ret))
- goto err;
- }
-
- struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut);
-
- if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
- new_i_size > le64_to_cpu(inode->v.bi_size)) {
- inode->v.bi_size = cpu_to_le64(new_i_size);
- inode_update_flags = 0;
- }
-
- if (i_sectors_delta) {
- le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
- inode_update_flags = 0;
- }
-
- if (inode->k.p.snapshot != iter.snapshot) {
- inode->k.p.snapshot = iter.snapshot;
- inode_update_flags = 0;
- }
-
- ret = bch2_trans_update(trans, &iter, &inode->k_i,
- BTREE_UPDATE_internal_snapshot_node|
- inode_update_flags);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_extent_update(struct btree_trans *trans,
- subvol_inum inum,
- struct btree_iter *iter,
- struct bkey_i *k,
- struct disk_reservation *disk_res,
- u64 new_i_size,
- s64 *i_sectors_delta_total,
- bool check_enospc)
-{
- struct bpos next_pos;
- bool usage_increasing;
- s64 i_sectors_delta = 0, disk_sectors_delta = 0;
- int ret;
-
- /*
-	 * This traverses the iterator without changing iter->path->pos to
- * search_key() (which is pos + 1 for extents): we want there to be a
- * path already traversed at iter->pos because
- * bch2_trans_extent_update() will use it to attempt extent merging
- */
- ret = __bch2_btree_iter_traverse(iter);
- if (ret)
- return ret;
-
- ret = bch2_extent_trim_atomic(trans, iter, k);
- if (ret)
- return ret;
-
- next_pos = k->k.p;
-
- ret = bch2_sum_sector_overwrites(trans, iter, k,
- &usage_increasing,
- &i_sectors_delta,
- &disk_sectors_delta);
- if (ret)
- return ret;
-
- if (disk_res &&
- disk_sectors_delta > (s64) disk_res->sectors) {
- ret = bch2_disk_reservation_add(trans->c, disk_res,
- disk_sectors_delta - disk_res->sectors,
- !check_enospc || !usage_increasing
- ? BCH_DISK_RESERVATION_NOFAIL : 0);
- if (ret)
- return ret;
- }
-
- /*
- * Note:
- * We always have to do an inode update - even when i_size/i_sectors
- * aren't changing - for fsync to work properly; fsync relies on
- * inode->bi_journal_seq which is updated by the trigger code:
- */
- ret = bch2_extent_update_i_size_sectors(trans, iter,
- min(k->k.p.offset << 9, new_i_size),
- i_sectors_delta) ?:
- bch2_trans_update(trans, iter, k, 0) ?:
- bch2_trans_commit(trans, disk_res, NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc);
- if (unlikely(ret))
- return ret;
-
- if (i_sectors_delta_total)
- *i_sectors_delta_total += i_sectors_delta;
- bch2_btree_iter_set_pos(iter, next_pos);
- return 0;
-}
-
-static int bch2_write_index_default(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct bkey_buf sk;
- struct keylist *keys = &op->insert_keys;
- struct bkey_i *k = bch2_keylist_front(keys);
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- subvol_inum inum = {
- .subvol = op->subvol,
- .inum = k->k.p.inode,
- };
- int ret;
-
- BUG_ON(!inum.subvol);
-
- bch2_bkey_buf_init(&sk);
-
- do {
- bch2_trans_begin(trans);
-
- k = bch2_keylist_front(keys);
- bch2_bkey_buf_copy(&sk, c, k);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol,
- &sk.k->k.p.snapshot);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- bkey_start_pos(&sk.k->k),
- BTREE_ITER_slots|BTREE_ITER_intent);
-
- ret = bch2_bkey_set_needs_rebalance(c, &op->opts, sk.k) ?:
- bch2_extent_update(trans, inum, &iter, sk.k,
- &op->res,
- op->new_i_size, &op->i_sectors_delta,
- op->flags & BCH_WRITE_check_enospc);
- bch2_trans_iter_exit(trans, &iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-
- if (bkey_ge(iter.pos, k->k.p))
- bch2_keylist_pop_front(&op->insert_keys);
- else
- bch2_cut_front(iter.pos, k);
- } while (!bch2_keylist_empty(keys));
-
- bch2_trans_put(trans);
- bch2_bkey_buf_exit(&sk, c);
-
- return ret;
-}
-
-/* Writes */
-
-void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...)
-{
- struct printbuf buf = PRINTBUF;
-
- if (op->subvol) {
- bch2_inum_offset_err_msg(op->c, &buf,
- (subvol_inum) { op->subvol, op->pos.inode, },
- offset << 9);
- } else {
- struct bpos pos = op->pos;
- pos.offset = offset;
- bch2_inum_snap_offset_err_msg(op->c, &buf, pos);
- }
-
- prt_str(&buf, "write error: ");
-
- va_list args;
- va_start(args, fmt);
- prt_vprintf(&buf, fmt, args);
- va_end(args);
-
- if (op->flags & BCH_WRITE_move) {
- struct data_update *u = container_of(op, struct data_update, op);
-
- prt_printf(&buf, "\n from internal move ");
- bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k));
- }
-
- bch_err_ratelimited(op->c, "%s", buf.buf);
- printbuf_exit(&buf);
-}
-
-static void bch2_write_csum_err_msg(struct bch_write_op *op)
-{
- bch2_write_op_error(op, op->pos.offset,
- "error verifying existing checksum while rewriting existing data (memory corruption?)");
-}
-
-void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
- enum bch_data_type type,
- const struct bkey_i *k,
- bool nocow)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
- struct bch_write_bio *n;
-
- BUG_ON(c->opts.nochanges);
-
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = nocow
- ? bch2_dev_have_ref(c, ptr->dev)
- : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE);
-
- if (to_entry(ptr + 1) < ptrs.end) {
- n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set));
-
- n->bio.bi_end_io = wbio->bio.bi_end_io;
- n->bio.bi_private = wbio->bio.bi_private;
- n->parent = wbio;
- n->split = true;
- n->bounce = false;
- n->put_bio = true;
- n->bio.bi_opf = wbio->bio.bi_opf;
- bio_inc_remaining(&wbio->bio);
- } else {
- n = wbio;
- n->split = false;
- }
-
- n->c = c;
- n->dev = ptr->dev;
- n->have_ioref = ca != NULL;
- n->nocow = nocow;
- n->submit_time = local_clock();
- n->inode_offset = bkey_start_offset(&k->k);
- if (nocow)
- n->nocow_bucket = PTR_BUCKET_NR(ca, ptr);
- n->bio.bi_iter.bi_sector = ptr->offset;
-
- if (likely(n->have_ioref)) {
- this_cpu_add(ca->io_done->sectors[WRITE][type],
- bio_sectors(&n->bio));
-
- bio_set_dev(&n->bio, ca->disk_sb.bdev);
-
- if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) {
- bio_endio(&n->bio);
- continue;
- }
-
- submit_bio(&n->bio);
- } else {
- n->bio.bi_status = BLK_STS_REMOVED;
- bio_endio(&n->bio);
- }
- }
-}
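-
-/*
- * The fan-out/completion pattern above, in miniature: every replica but
- * the last gets a clone whose completion is forwarded to the parent,
- * and bio_inc_remaining() keeps the parent's completion pending until
- * all the clones have finished:
- *
- *	for each ptr except the last:
- *		n = clone(wbio); n->parent = wbio; n->split = true;
- *		bio_inc_remaining(&wbio->bio);
- *		submit(n);
- *	submit(wbio);		// the last replica reuses the parent bio
- */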
-
-static void __bch2_write(struct bch_write_op *);
-
-static void bch2_write_done(struct closure *cl)
-{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bch_fs *c = op->c;
-
- EBUG_ON(op->open_buckets.nr);
-
- bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
- bch2_disk_reservation_put(c, &op->res);
-
- if (!(op->flags & BCH_WRITE_move))
- bch2_write_ref_put(c, BCH_WRITE_REF_write);
- bch2_keylist_free(&op->insert_keys, op->inline_keys);
-
- EBUG_ON(cl->parent);
- closure_debug_destroy(cl);
- if (op->end_io)
- op->end_io(op);
-}
-
-static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
-{
- struct keylist *keys = &op->insert_keys;
- struct bkey_i *src, *dst = keys->keys, *n;
-
- for (src = keys->keys; src != keys->top; src = n) {
- n = bkey_next(src);
-
- if (bkey_extent_is_direct_data(&src->k)) {
- bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
- test_bit(ptr->dev, op->failed.d));
-
- if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
- return -BCH_ERR_data_write_io;
- }
-
- if (dst != src)
- memmove_u64s_down(dst, src, src->k.u64s);
- dst = bkey_next(dst);
- }
-
- keys->top = dst;
- return 0;
-}
-
-/**
- * __bch2_write_index - after a write, update index to point to new data
- * @op: bch_write_op to process
- */
-static void __bch2_write_index(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct keylist *keys = &op->insert_keys;
- unsigned dev;
- int ret = 0;
-
- if (unlikely(op->flags & BCH_WRITE_io_error)) {
- ret = bch2_write_drop_io_error_ptrs(op);
- if (ret)
- goto err;
- }
-
- if (!bch2_keylist_empty(keys)) {
- u64 sectors_start = keylist_sectors(keys);
-
- ret = !(op->flags & BCH_WRITE_move)
- ? bch2_write_index_default(op)
- : bch2_data_update_index_update(op);
-
- BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
- BUG_ON(keylist_sectors(keys) && !ret);
-
- op->written += sectors_start - keylist_sectors(keys);
-
- if (unlikely(ret && !bch2_err_matches(ret, EROFS))) {
- struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
-
- bch2_write_op_error(op, bkey_start_offset(&insert->k),
- "btree update error: %s", bch2_err_str(ret));
- }
-
- if (ret)
- goto err;
- }
-out:
-	/* If a bucket wasn't written, we can't erasure code it: */
- for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
- bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io);
-
- bch2_open_buckets_put(c, &op->open_buckets);
- return;
-err:
- keys->top = keys->keys;
- op->error = ret;
- op->flags |= BCH_WRITE_submitted;
- goto out;
-}
-
-static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
-{
- if (state != wp->state) {
- struct task_struct *p = current;
- u64 now = ktime_get_ns();
- u64 runtime = p->se.sum_exec_runtime +
- (now - p->se.exec_start);
-
- if (state == WRITE_POINT_runnable)
- wp->last_runtime = runtime;
- else if (wp->state == WRITE_POINT_runnable)
- wp->time[WRITE_POINT_running] += runtime - wp->last_runtime;
-
- if (wp->last_state_change &&
- time_after64(now, wp->last_state_change))
- wp->time[wp->state] += now - wp->last_state_change;
- wp->state = state;
- wp->last_state_change = now;
- }
-}
-
-static inline void wp_update_state(struct write_point *wp, bool running)
-{
- enum write_point_state state;
-
- state = running ? WRITE_POINT_runnable:
- !list_empty(&wp->writes) ? WRITE_POINT_waiting_io
- : WRITE_POINT_stopped;
-
- __wp_update_state(wp, state);
-}
-
-static CLOSURE_CALLBACK(bch2_write_index)
-{
- closure_type(op, struct bch_write_op, cl);
- struct write_point *wp = op->wp;
- struct workqueue_struct *wq = index_update_wq(op);
- unsigned long flags;
-
- if ((op->flags & BCH_WRITE_submitted) &&
- (op->flags & BCH_WRITE_move))
- bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
-
- spin_lock_irqsave(&wp->writes_lock, flags);
- if (wp->state == WRITE_POINT_waiting_io)
- __wp_update_state(wp, WRITE_POINT_waiting_work);
- list_add_tail(&op->wp_list, &wp->writes);
-	spin_unlock_irqrestore(&wp->writes_lock, flags);
-
- queue_work(wq, &wp->index_update_work);
-}
-
-static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
-{
- op->wp = wp;
-
- if (wp->state == WRITE_POINT_stopped) {
- spin_lock_irq(&wp->writes_lock);
- __wp_update_state(wp, WRITE_POINT_waiting_io);
- spin_unlock_irq(&wp->writes_lock);
- }
-}
-
-void bch2_write_point_do_index_updates(struct work_struct *work)
-{
- struct write_point *wp =
- container_of(work, struct write_point, index_update_work);
- struct bch_write_op *op;
-
- while (1) {
- spin_lock_irq(&wp->writes_lock);
- op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list);
- wp_update_state(wp, op != NULL);
- spin_unlock_irq(&wp->writes_lock);
-
- if (!op)
- break;
-
- op->flags |= BCH_WRITE_in_worker;
-
- __bch2_write_index(op);
-
- if (!(op->flags & BCH_WRITE_submitted))
- __bch2_write(op);
- else
- bch2_write_done(&op->cl);
- }
-}
-
-static void bch2_write_endio(struct bio *bio)
-{
- struct closure *cl = bio->bi_private;
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bch_write_bio *wbio = to_wbio(bio);
- struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
- struct bch_fs *c = wbio->c;
- struct bch_dev *ca = wbio->have_ioref
- ? bch2_dev_have_ref(c, wbio->dev)
- : NULL;
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
- wbio->submit_time, !bio->bi_status);
-
- if (bio->bi_status) {
- bch_err_inum_offset_ratelimited(ca,
- op->pos.inode,
- wbio->inode_offset << 9,
- "data write error: %s",
- bch2_blk_status_to_str(bio->bi_status));
- set_bit(wbio->dev, op->failed.d);
- op->flags |= BCH_WRITE_io_error;
- }
-
- if (wbio->nocow) {
- bch2_bucket_nocow_unlock(&c->nocow_locks,
- POS(ca->dev_idx, wbio->nocow_bucket),
- BUCKET_NOCOW_LOCK_UPDATE);
- set_bit(wbio->dev, op->devs_need_flush->d);
- }
-
- if (wbio->have_ioref)
- percpu_ref_put(&ca->io_ref);
-
- if (wbio->bounce)
- bch2_bio_free_pages_pool(c, bio);
-
- if (wbio->put_bio)
- bio_put(bio);
-
- if (parent)
- bio_endio(&parent->bio);
- else
- closure_put(cl);
-}
-
-static void init_append_extent(struct bch_write_op *op,
- struct write_point *wp,
- struct bversion version,
- struct bch_extent_crc_unpacked crc)
-{
- struct bkey_i_extent *e;
-
- op->pos.offset += crc.uncompressed_size;
-
- e = bkey_extent_init(op->insert_keys.top);
- e->k.p = op->pos;
- e->k.size = crc.uncompressed_size;
- e->k.bversion = version;
-
- if (crc.csum_type ||
- crc.compression_type ||
- crc.nonce)
- bch2_extent_crc_append(&e->k_i, crc);
-
- bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
- op->flags & BCH_WRITE_cached);
-
- bch2_keylist_push(&op->insert_keys);
-}
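-
-/*
- * Note the ordering above: op->pos is advanced by the uncompressed size
- * first and then used as the new key's position, because a bkey's .p is
- * the *end* of the extent - extents span [p.offset - size, p.offset).
- */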
-
-static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
- struct write_point *wp,
- struct bio *src,
- bool *page_alloc_failed,
- void *buf)
-{
- struct bch_write_bio *wbio;
- struct bio *bio;
- unsigned output_available =
- min(wp->sectors_free << 9, src->bi_iter.bi_size);
- unsigned pages = DIV_ROUND_UP(output_available +
- (buf
- ? ((unsigned long) buf & (PAGE_SIZE - 1))
- : 0), PAGE_SIZE);
-
- pages = min(pages, BIO_MAX_VECS);
-
- bio = bio_alloc_bioset(NULL, pages, 0,
- GFP_NOFS, &c->bio_write);
- wbio = wbio_init(bio);
- wbio->put_bio = true;
- /* copy WRITE_SYNC flag */
- wbio->bio.bi_opf = src->bi_opf;
-
- if (buf) {
- bch2_bio_map(bio, buf, output_available);
- return bio;
- }
-
- wbio->bounce = true;
-
- /*
-	 * We can't use the mempool for more than c->opts.encoded_extent_max
- * worth of pages, but we'd like to allocate more if we can:
- */
- bch2_bio_alloc_pages_pool(c, bio,
- min_t(unsigned, output_available,
- c->opts.encoded_extent_max));
-
- if (bio->bi_iter.bi_size < output_available)
- *page_alloc_failed =
- bch2_bio_alloc_pages(bio,
- output_available -
- bio->bi_iter.bi_size,
- GFP_NOFS) != 0;
-
- return bio;
-}
-
-static int bch2_write_rechecksum(struct bch_fs *c,
- struct bch_write_op *op,
- unsigned new_csum_type)
-{
- struct bio *bio = &op->wbio.bio;
- struct bch_extent_crc_unpacked new_crc;
-
- /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
-
- if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
- bch2_csum_type_is_encryption(new_csum_type))
- new_csum_type = op->crc.csum_type;
-
- int ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
- NULL, &new_crc,
- op->crc.offset, op->crc.live_size,
- new_csum_type);
- if (ret)
- return ret;
-
- bio_advance(bio, op->crc.offset << 9);
- bio->bi_iter.bi_size = op->crc.live_size << 9;
- op->crc = new_crc;
- return 0;
-}
-
-static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
-{
- struct bch_fs *c = op->c;
- struct bio *bio = &op->wbio.bio;
- struct nonce nonce = extent_nonce(op->version, op->crc);
- int ret = 0;
-
- BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
-
- /* Can we just write the entire extent as is? */
- if (op->crc.uncompressed_size == op->crc.live_size &&
- op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 &&
- op->crc.compressed_size <= wp->sectors_free &&
- (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
- op->incompressible)) {
- if (!crc_is_compressed(op->crc) &&
- op->csum_type != op->crc.csum_type) {
- ret = bch2_write_rechecksum(c, op, op->csum_type);
- if (ret)
- return ret;
- }
-
- return 1;
- }
-
- /*
- * If the data is compressed and we couldn't write the entire extent as
- * is, we have to decompress it:
- */
- if (crc_is_compressed(op->crc)) {
- /* Last point we can still verify checksum: */
- struct bch_csum csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
- if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
- goto csum_err;
-
- if (bch2_csum_type_is_encryption(op->crc.csum_type)) {
- ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
- if (ret)
- return ret;
-
- op->crc.csum_type = 0;
- op->crc.csum = (struct bch_csum) { 0, 0 };
- }
-
- ret = bch2_bio_uncompress_inplace(op, bio);
- if (ret)
- return ret;
- }
-
- /*
- * No longer have compressed data after this point - data might be
- * encrypted:
- */
-
- /*
- * If the data is checksummed and we're only writing a subset,
- * rechecksum and adjust bio to point to currently live data:
- */
- if (op->crc.live_size != op->crc.uncompressed_size ||
- op->crc.csum_type != op->csum_type) {
- ret = bch2_write_rechecksum(c, op, op->csum_type);
- if (ret)
- return ret;
- }
-
- /*
- * If we want to compress the data, it has to be decrypted:
- */
- if (bch2_csum_type_is_encryption(op->crc.csum_type) &&
- (op->compression_opt || op->crc.csum_type != op->csum_type)) {
- struct bch_csum csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
- if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
- goto csum_err;
-
- ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
- if (ret)
- return ret;
-
- op->crc.csum_type = 0;
- op->crc.csum = (struct bch_csum) { 0, 0 };
- }
-
- return 0;
-csum_err:
- bch2_write_csum_err_msg(op);
- return -BCH_ERR_data_write_csum;
-}
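-
-/*
- * Return convention for bch2_write_prep_encoded_data(), for the caller
- * below: 1 means the extent can be written out exactly as it came in,
- * 0 means it still needs the normal checksum/compress path, and
- * negative is an error.
- */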
-
-static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
- struct bio **_dst)
-{
- struct bch_fs *c = op->c;
- struct bio *src = &op->wbio.bio, *dst = src;
- struct bvec_iter saved_iter;
- void *ec_buf;
- unsigned total_output = 0, total_input = 0;
- bool bounce = false;
- bool page_alloc_failed = false;
- int ret, more = 0;
-
- BUG_ON(!bio_sectors(src));
-
- ec_buf = bch2_writepoint_ec_buf(c, wp);
-
- if (unlikely(op->flags & BCH_WRITE_data_encoded)) {
- ret = bch2_write_prep_encoded_data(op, wp);
- if (ret < 0)
- goto err;
- if (ret) {
- if (ec_buf) {
- dst = bch2_write_bio_alloc(c, wp, src,
- &page_alloc_failed,
- ec_buf);
- bio_copy_data(dst, src);
- bounce = true;
- }
- init_append_extent(op, wp, op->version, op->crc);
- goto do_write;
- }
- }
-
- if (ec_buf ||
- op->compression_opt ||
- (op->csum_type &&
- !(op->flags & BCH_WRITE_pages_stable)) ||
- (bch2_csum_type_is_encryption(op->csum_type) &&
- !(op->flags & BCH_WRITE_pages_owned))) {
- dst = bch2_write_bio_alloc(c, wp, src,
- &page_alloc_failed,
- ec_buf);
- bounce = true;
- }
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio);
- if (!bounce && write_corrupt_ratio) {
- dst = bch2_write_bio_alloc(c, wp, src,
- &page_alloc_failed,
- ec_buf);
- bounce = true;
- }
-#endif
- saved_iter = dst->bi_iter;
-
- do {
- struct bch_extent_crc_unpacked crc = { 0 };
- struct bversion version = op->version;
- size_t dst_len = 0, src_len = 0;
-
- if (page_alloc_failed &&
- dst->bi_iter.bi_size < (wp->sectors_free << 9) &&
- dst->bi_iter.bi_size < c->opts.encoded_extent_max)
- break;
-
- BUG_ON(op->compression_opt &&
- (op->flags & BCH_WRITE_data_encoded) &&
- bch2_csum_type_is_encryption(op->crc.csum_type));
- BUG_ON(op->compression_opt && !bounce);
-
- crc.compression_type = op->incompressible
- ? BCH_COMPRESSION_TYPE_incompressible
- : op->compression_opt
- ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
- op->compression_opt)
- : 0;
- if (!crc_is_compressed(crc)) {
- dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
- dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
-
- if (op->csum_type)
- dst_len = min_t(unsigned, dst_len,
- c->opts.encoded_extent_max);
-
- if (bounce) {
- swap(dst->bi_iter.bi_size, dst_len);
- bio_copy_data(dst, src);
- swap(dst->bi_iter.bi_size, dst_len);
- }
-
- src_len = dst_len;
- }
-
- BUG_ON(!src_len || !dst_len);
-
- if (bch2_csum_type_is_encryption(op->csum_type)) {
- if (bversion_zero(version)) {
- version.lo = atomic64_inc_return(&c->key_version);
- } else {
- crc.nonce = op->nonce;
- op->nonce += src_len >> 9;
- }
- }
-
- if ((op->flags & BCH_WRITE_data_encoded) &&
- !crc_is_compressed(crc) &&
- bch2_csum_type_is_encryption(op->crc.csum_type) ==
- bch2_csum_type_is_encryption(op->csum_type)) {
- u8 compression_type = crc.compression_type;
- u16 nonce = crc.nonce;
- /*
- * Note: when we're using rechecksum(), we need to be
- * checksumming @src because it has all the data our
- * existing checksum covers - if we bounced (because we
- * were trying to compress), @dst will only have the
- * part of the data the new checksum will cover.
- *
- * But normally we want to be checksumming post bounce,
- * because part of the reason for bouncing is so the
- * data can't be modified (by userspace) while it's in
- * flight.
- */
- if (bch2_rechecksum_bio(c, src, version, op->crc,
- &crc, &op->crc,
- src_len >> 9,
- bio_sectors(src) - (src_len >> 9),
- op->csum_type))
- goto csum_err;
- /*
-			 * bch2_rechecksum_bio() sets compression_type on crc
-			 * from op->crc; this isn't always correct, as sometimes
-			 * we're changing an extent from uncompressed to
-			 * incompressible.
- */
- crc.compression_type = compression_type;
- crc.nonce = nonce;
- } else {
- if ((op->flags & BCH_WRITE_data_encoded) &&
- bch2_rechecksum_bio(c, src, version, op->crc,
- NULL, &op->crc,
- src_len >> 9,
- bio_sectors(src) - (src_len >> 9),
- op->crc.csum_type))
- goto csum_err;
-
- crc.compressed_size = dst_len >> 9;
- crc.uncompressed_size = src_len >> 9;
- crc.live_size = src_len >> 9;
-
- swap(dst->bi_iter.bi_size, dst_len);
- ret = bch2_encrypt_bio(c, op->csum_type,
- extent_nonce(version, crc), dst);
- if (ret)
- goto err;
-
- crc.csum = bch2_checksum_bio(c, op->csum_type,
- extent_nonce(version, crc), dst);
- crc.csum_type = op->csum_type;
- swap(dst->bi_iter.bi_size, dst_len);
- }
-
- init_append_extent(op, wp, version, crc);
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- if (write_corrupt_ratio) {
- swap(dst->bi_iter.bi_size, dst_len);
- bch2_maybe_corrupt_bio(dst, write_corrupt_ratio);
- swap(dst->bi_iter.bi_size, dst_len);
- }
-#endif
-
- if (dst != src)
- bio_advance(dst, dst_len);
- bio_advance(src, src_len);
- total_output += dst_len;
- total_input += src_len;
- } while (dst->bi_iter.bi_size &&
- src->bi_iter.bi_size &&
- wp->sectors_free &&
- !bch2_keylist_realloc(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- BKEY_EXTENT_U64s_MAX));
-
- more = src->bi_iter.bi_size != 0;
-
- dst->bi_iter = saved_iter;
-
- if (dst == src && more) {
- BUG_ON(total_output != total_input);
-
- dst = bio_split(src, total_input >> 9,
- GFP_NOFS, &c->bio_write);
- wbio_init(dst)->put_bio = true;
- /* copy WRITE_SYNC flag */
- dst->bi_opf = src->bi_opf;
- }
-
- dst->bi_iter.bi_size = total_output;
-do_write:
- *_dst = dst;
- return more;
-csum_err:
- bch2_write_csum_err_msg(op);
- ret = -BCH_ERR_data_write_csum;
-err:
- if (to_wbio(dst)->bounce)
- bch2_bio_free_pages_pool(c, dst);
- if (to_wbio(dst)->put_bio)
- bio_put(dst);
-
- return ret;
-}
-
-static bool bch2_extent_is_writeable(struct bch_write_op *op,
- struct bkey_s_c k)
-{
- struct bch_fs *c = op->c;
- struct bkey_s_c_extent e;
- struct extent_ptr_decoded p;
- const union bch_extent_entry *entry;
- unsigned replicas = 0;
-
- if (k.k->type != KEY_TYPE_extent)
- return false;
-
- e = bkey_s_c_to_extent(k);
-
- rcu_read_lock();
- extent_for_each_ptr_decode(e, p, entry) {
- if (crc_is_encoded(p.crc) || p.has_ec) {
- rcu_read_unlock();
- return false;
- }
-
- replicas += bch2_extent_ptr_durability(c, &p);
- }
- rcu_read_unlock();
-
- return replicas >= op->opts.data_replicas;
-}
-
-static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *orig,
- struct bkey_s_c k,
- u64 new_i_size)
-{
- if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
- /* trace this */
- return 0;
- }
-
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
- int ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- return ret;
-
- bch2_cut_front(bkey_start_pos(&orig->k), new);
- bch2_cut_back(orig->k.p, new);
-
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- bkey_for_each_ptr(ptrs, ptr)
- ptr->unwritten = 0;
-
- /*
- * Note that we're not calling bch2_subvol_get_snapshot() in this path -
- * that was done when we kicked off the write, and here it's important
- * that we update the extent that we wrote to - even if a snapshot has
- * since been created. The write is still outstanding, so we're ok
- * w.r.t. snapshot atomicity:
- */
- return bch2_extent_update_i_size_sectors(trans, iter,
- min(new->k.p.offset << 9, new_i_size), 0) ?:
- bch2_trans_update(trans, iter, new,
- BTREE_UPDATE_internal_snapshot_node);
-}
-
-static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct btree_trans *trans = bch2_trans_get(c);
- int ret = 0;
-
- for_each_keylist_key(&op->insert_keys, orig) {
- ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
- bkey_start_pos(&orig->k), orig->k.p,
- BTREE_ITER_intent, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
- }));
- if (ret)
- break;
- }
-
- bch2_trans_put(trans);
-
- if (ret && !bch2_err_matches(ret, EROFS)) {
- struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
- bch2_write_op_error(op, bkey_start_offset(&insert->k),
- "btree update error: %s", bch2_err_str(ret));
- }
-
- if (ret)
- op->error = ret;
-}
-
-static void __bch2_nocow_write_done(struct bch_write_op *op)
-{
- if (unlikely(op->flags & BCH_WRITE_io_error)) {
- op->error = -BCH_ERR_data_write_io;
- } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten))
- bch2_nocow_write_convert_unwritten(op);
-}
-
-static CLOSURE_CALLBACK(bch2_nocow_write_done)
-{
- closure_type(op, struct bch_write_op, cl);
-
- __bch2_nocow_write_done(op);
- bch2_write_done(cl);
-}
-
-struct bucket_to_lock {
- struct bpos b;
- unsigned gen;
- struct nocow_lock_bucket *l;
-};
-
-static void bch2_nocow_write(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
- u32 snapshot;
- struct bucket_to_lock *stale_at;
- int stale, ret;
-
- if (op->flags & BCH_WRITE_move)
- return;
-
- darray_init(&buckets);
- trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot);
- if (unlikely(ret))
- goto err;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(op->pos.inode, op->pos.offset, snapshot),
- BTREE_ITER_slots);
- while (1) {
- struct bio *bio = &op->wbio.bio;
-
- buckets.nr = 0;
-
- ret = bch2_trans_relock(trans);
- if (ret)
- break;
-
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- break;
-
- /* fall back to normal cow write path? */
- if (unlikely(k.k->p.snapshot != snapshot ||
- !bch2_extent_is_writeable(op, k)))
- break;
-
- if (bch2_keylist_realloc(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- k.k->u64s))
- break;
-
- /* Get iorefs before dropping btree locks: */
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
- if (unlikely(!ca))
- goto err_get_ioref;
-
- struct bpos b = PTR_BUCKET_POS(ca, ptr);
- struct nocow_lock_bucket *l =
- bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b));
- prefetch(l);
-
- /* XXX allocating memory with btree locks held - rare */
- darray_push_gfp(&buckets, ((struct bucket_to_lock) {
- .b = b, .gen = ptr->gen, .l = l,
- }), GFP_KERNEL|__GFP_NOFAIL);
-
- if (ptr->unwritten)
- op->flags |= BCH_WRITE_convert_unwritten;
- }
-
- /* Unlock before taking nocow locks, doing IO: */
- bkey_reassemble(op->insert_keys.top, k);
- bch2_trans_unlock(trans);
-
- bch2_cut_front(op->pos, op->insert_keys.top);
- if (op->flags & BCH_WRITE_convert_unwritten)
- bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
-
- darray_for_each(buckets, i) {
- struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode);
-
- __bch2_bucket_nocow_lock(&c->nocow_locks, i->l,
- bucket_to_u64(i->b),
- BUCKET_NOCOW_LOCK_UPDATE);
-
- int gen = bucket_gen_get(ca, i->b.offset);
- stale = gen < 0 ? gen : gen_after(gen, i->gen);
- if (unlikely(stale)) {
- stale_at = i;
- goto err_bucket_stale;
- }
- }
-
- bio = &op->wbio.bio;
- if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {
- bio = bio_split(bio, k.k->p.offset - op->pos.offset,
- GFP_KERNEL, &c->bio_write);
- wbio_init(bio)->put_bio = true;
- bio->bi_opf = op->wbio.bio.bi_opf;
- } else {
- op->flags |= BCH_WRITE_submitted;
- }
-
- op->pos.offset += bio_sectors(bio);
- op->written += bio_sectors(bio);
-
- bio->bi_end_io = bch2_write_endio;
- bio->bi_private = &op->cl;
- bio->bi_opf |= REQ_OP_WRITE;
- closure_get(&op->cl);
-
- bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
- op->insert_keys.top, true);
-
- bch2_keylist_push(&op->insert_keys);
- if (op->flags & BCH_WRITE_submitted)
- break;
- bch2_btree_iter_advance(&iter);
- }
-out:
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
- darray_exit(&buckets);
-
- if (ret) {
- bch2_write_op_error(op, op->pos.offset,
- "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
- op->error = ret;
- op->flags |= BCH_WRITE_submitted;
- }
-
- /* fallback to cow write path? */
- if (!(op->flags & BCH_WRITE_submitted)) {
- closure_sync(&op->cl);
- __bch2_nocow_write_done(op);
- op->insert_keys.top = op->insert_keys.keys;
- } else if (op->flags & BCH_WRITE_sync) {
- closure_sync(&op->cl);
- bch2_nocow_write_done(&op->cl.work);
- } else {
- /*
- * XXX
- * needs to run out of process context because ei_quota_lock is
- * a mutex
- */
- continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
- }
- return;
-err_get_ioref:
- darray_for_each(buckets, i)
- percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref);
-
- /* Fall back to COW path: */
- goto out;
-err_bucket_stale:
- darray_for_each(buckets, i) {
- bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE);
- if (i == stale_at)
- break;
- }
-
- struct printbuf buf = PRINTBUF;
- if (bch2_fs_inconsistent_on(stale < 0, c,
- "pointer to invalid bucket in nocow path on device %llu\n %s",
- stale_at->b.inode,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = -BCH_ERR_data_write_invalid_ptr;
- } else {
- /* We can retry this: */
- ret = -BCH_ERR_transaction_restart;
- }
- printbuf_exit(&buf);
-
- goto err_get_ioref;
-}
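-
-/*
- * Recap of the nocow path above: look up the existing extent and check that
- * it can be overwritten in place, take device iorefs and per-bucket nocow
- * locks, then recheck bucket gens - a stale gen means the bucket was since
- * reused, so we unlock and either retry or fall back to the COW path. Only
- * then is the bio submitted over the existing pointers; unwritten extents
- * are flipped to written afterwards via bch2_nocow_write_convert_unwritten().
- */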
-
-static void __bch2_write(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct write_point *wp = NULL;
- struct bio *bio = NULL;
- unsigned nofs_flags;
- int ret;
-
- nofs_flags = memalloc_nofs_save();
-
- if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
- bch2_nocow_write(op);
- if (op->flags & BCH_WRITE_submitted)
- goto out_nofs_restore;
- }
-again:
- memset(&op->failed, 0, sizeof(op->failed));
-
- do {
- struct bkey_i *key_to_write;
- unsigned key_to_write_offset = op->insert_keys.top_p -
- op->insert_keys.keys_p;
-
- /* +1 for possible cache device: */
- if (op->open_buckets.nr + op->nr_replicas + 1 >
- ARRAY_SIZE(op->open_buckets.v))
- break;
-
- if (bch2_keylist_realloc(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- BKEY_EXTENT_U64s_MAX))
- break;
-
- /*
- * The copygc thread is now global, which means it's no longer
- * freeing up space on specific disks, which means that
- * allocations for specific disks may hang arbitrarily long:
- */
- ret = bch2_trans_run(c, lockrestart_do(trans,
- bch2_alloc_sectors_start_trans(trans,
- op->target,
- op->opts.erasure_code && !(op->flags & BCH_WRITE_cached),
- op->write_point,
- &op->devs_have,
- op->nr_replicas,
- op->nr_replicas_required,
- op->watermark,
- op->flags,
- &op->cl, &wp)));
- if (unlikely(ret)) {
- if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
- break;
-
- goto err;
- }
-
- EBUG_ON(!wp);
-
- bch2_open_bucket_get(c, wp, &op->open_buckets);
- ret = bch2_write_extent(op, wp, &bio);
-
- bch2_alloc_sectors_done_inlined(c, wp);
-err:
- if (ret <= 0) {
- op->flags |= BCH_WRITE_submitted;
-
- if (unlikely(ret < 0)) {
- if (!(op->flags & BCH_WRITE_alloc_nowait))
- bch2_write_op_error(op, op->pos.offset,
- "%s(): %s", __func__, bch2_err_str(ret));
- op->error = ret;
- break;
- }
- }
-
- bio->bi_end_io = bch2_write_endio;
- bio->bi_private = &op->cl;
- bio->bi_opf |= REQ_OP_WRITE;
-
- closure_get(bio->bi_private);
-
- key_to_write = (void *) (op->insert_keys.keys_p +
- key_to_write_offset);
-
- bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
- key_to_write, false);
- } while (ret);
-
- /*
- * Sync or no?
- *
-	 * If we're running asynchronously, we may still want to block
- * synchronously here if we weren't able to submit all of the IO at
- * once, as that signals backpressure to the caller.
- */
- if ((op->flags & BCH_WRITE_sync) ||
- (!(op->flags & BCH_WRITE_submitted) &&
- !(op->flags & BCH_WRITE_in_worker))) {
- bch2_wait_on_allocator(c, &op->cl);
-
- __bch2_write_index(op);
-
- if (!(op->flags & BCH_WRITE_submitted))
- goto again;
- bch2_write_done(&op->cl);
- } else {
- bch2_write_queue(op, wp);
- continue_at(&op->cl, bch2_write_index, NULL);
- }
-out_nofs_restore:
- memalloc_nofs_restore(nofs_flags);
-}
-
-static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
-{
- struct bio *bio = &op->wbio.bio;
- struct bvec_iter iter;
- struct bkey_i_inline_data *id;
- unsigned sectors;
- int ret;
-
- memset(&op->failed, 0, sizeof(op->failed));
-
- op->flags |= BCH_WRITE_wrote_data_inline;
- op->flags |= BCH_WRITE_submitted;
-
- bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
-
- ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- BKEY_U64s + DIV_ROUND_UP(data_len, 8));
- if (ret) {
- op->error = ret;
- goto err;
- }
-
- sectors = bio_sectors(bio);
- op->pos.offset += sectors;
-
- id = bkey_inline_data_init(op->insert_keys.top);
- id->k.p = op->pos;
- id->k.bversion = op->version;
- id->k.size = sectors;
-
- iter = bio->bi_iter;
- iter.bi_size = data_len;
- memcpy_from_bio(id->v.data, bio, iter);
-
- while (data_len & 7)
- id->v.data[data_len++] = '\0';
- set_bkey_val_bytes(&id->k, data_len);
- bch2_keylist_push(&op->insert_keys);
-
- __bch2_write_index(op);
-err:
- bch2_write_done(&op->cl);
-}
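-
-/*
- * Note the zero padding above: bkey values are sized in u64s, so the inline
- * data is padded out to an 8 byte boundary before set_bkey_val_bytes() -
- * matching the BKEY_U64s + DIV_ROUND_UP(data_len, 8) reservation made when
- * the keylist was reallocated.
- */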
-
-/**
- * bch2_write() - handle a write to a cache device or flash only volume
- * @cl: &bch_write_op->cl
- *
- * This is the starting point for any data to end up in a cache device; it could
- * be from a normal write, or a writeback write, or a write to a flash only
- * volume - it's also used by the moving garbage collector to compact data in
- * mostly empty buckets.
- *
- * It first writes the data to the cache, creating a list of keys to be inserted
- * (if the data won't fit in a single open bucket, there will be multiple keys);
- * after the data is written it calls bch_journal, and after the keys have been
- * added to the next journal write they're inserted into the btree.
- *
- * If op->discard is true, instead of inserting the data it invalidates the
- * region of the cache represented by op->bio and op->inode.
- */
-CLOSURE_CALLBACK(bch2_write)
-{
- closure_type(op, struct bch_write_op, cl);
- struct bio *bio = &op->wbio.bio;
- struct bch_fs *c = op->c;
- unsigned data_len;
-
- EBUG_ON(op->cl.parent);
- BUG_ON(!op->nr_replicas);
- BUG_ON(!op->write_point.v);
- BUG_ON(bkey_eq(op->pos, POS_MAX));
-
- if (op->flags & BCH_WRITE_only_specified_devs)
- op->flags |= BCH_WRITE_alloc_nowait;
-
- op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
- op->start_time = local_clock();
- bch2_keylist_init(&op->insert_keys, op->inline_keys);
- wbio_init(bio)->put_bio = false;
-
- if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) {
- bch2_write_op_error(op, op->pos.offset, "misaligned write");
- op->error = -BCH_ERR_data_write_misaligned;
- goto err;
- }
-
- if (c->opts.nochanges) {
- op->error = -BCH_ERR_erofs_no_writes;
- goto err;
- }
-
- if (!(op->flags & BCH_WRITE_move) &&
- !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
- op->error = -BCH_ERR_erofs_no_writes;
- goto err;
- }
-
- if (!(op->flags & BCH_WRITE_move))
- this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
- bch2_increment_clock(c, bio_sectors(bio), WRITE);
-
- data_len = min_t(u64, bio->bi_iter.bi_size,
- op->new_i_size - (op->pos.offset << 9));
-
- if (c->opts.inline_data &&
- data_len <= min(block_bytes(c) / 2, 1024U)) {
- bch2_write_data_inline(op, data_len);
- return;
- }
-
- __bch2_write(op);
- return;
-err:
- bch2_disk_reservation_put(c, &op->res);
-
- closure_debug_destroy(&op->cl);
- if (op->end_io)
- op->end_io(op);
-}
-
-static const char * const bch2_write_flags[] = {
-#define x(f) #f,
- BCH_WRITE_FLAGS()
-#undef x
- NULL
-};
-
-void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
-{
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 32);
-
- prt_printf(out, "pos:\t");
- bch2_bpos_to_text(out, op->pos);
- prt_newline(out);
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "started:\t");
- bch2_pr_time_units(out, local_clock() - op->start_time);
- prt_newline(out);
-
- prt_printf(out, "flags:\t");
- prt_bitflags(out, bch2_write_flags, op->flags);
- prt_newline(out);
-
- prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas);
- prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required);
-
- prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl));
-
- printbuf_indent_sub(out, 2);
-}
-
-void bch2_fs_io_write_exit(struct bch_fs *c)
-{
- mempool_exit(&c->bio_bounce_pages);
- bioset_exit(&c->replica_set);
- bioset_exit(&c->bio_write);
-}
-
-int bch2_fs_io_write_init(struct bch_fs *c)
-{
- if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) ||
- bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0))
- return -BCH_ERR_ENOMEM_bio_write_init;
-
- if (mempool_init_page_pool(&c->bio_bounce_pages,
- max_t(unsigned,
- c->opts.btree_node_size,
- c->opts.encoded_extent_max) /
- PAGE_SIZE, 0))
- return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
-
- return 0;
-}
diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h
deleted file mode 100644
index b8ab19a1e1da..000000000000
--- a/fs/bcachefs/io_write.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_WRITE_H
-#define _BCACHEFS_IO_WRITE_H
-
-#include "checksum.h"
-#include "io_write_types.h"
-
-#define to_wbio(_bio) \
- container_of((_bio), struct bch_write_bio, bio)
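-
-/*
- * to_wbio() recovers the enclosing bch_write_bio from its embedded struct
- * bio, e.g. in a completion callback that is only handed the struct bio
- * (an illustrative sketch - the real completion path is bch2_write_endio()):
- *
- *	static void example_endio(struct bio *bio)
- *	{
- *		struct bch_write_bio *wbio = to_wbio(bio);
- *		...
- *	}
- */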
-
-void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
-void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
-
-void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
- enum bch_data_type, const struct bkey_i *, bool);
-
-__printf(3, 4)
-void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...);
-
-#define BCH_WRITE_FLAGS() \
- x(alloc_nowait) \
- x(cached) \
- x(data_encoded) \
- x(pages_stable) \
- x(pages_owned) \
- x(only_specified_devs) \
- x(wrote_data_inline) \
- x(check_enospc) \
- x(sync) \
- x(move) \
- x(in_worker) \
- x(submitted) \
- x(io_error) \
- x(convert_unwritten)
-
-enum __bch_write_flags {
-#define x(f) __BCH_WRITE_##f,
- BCH_WRITE_FLAGS()
-#undef x
-};
-
-enum bch_write_flags {
-#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f),
- BCH_WRITE_FLAGS()
-#undef x
-};
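-
-/*
- * How the x-macro above expands (illustrative, only the first two flags
- * shown):
- *
- *	enum __bch_write_flags { __BCH_WRITE_alloc_nowait, __BCH_WRITE_cached, ... };
- *
- *	enum bch_write_flags {
- *		BCH_WRITE_alloc_nowait	= BIT(0),
- *		BCH_WRITE_cached	= BIT(1),
- *		...
- *	};
- *
- * so op->flags is an ordinary bitmask, tested with e.g.
- * (op->flags & BCH_WRITE_sync).
- */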
-
-static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
-{
- return op->watermark == BCH_WATERMARK_copygc
- ? op->c->copygc_wq
- : op->c->btree_update_wq;
-}
-
-int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, bool *, s64 *, s64 *);
-int bch2_extent_update(struct btree_trans *, subvol_inum,
- struct btree_iter *, struct bkey_i *,
- struct disk_reservation *, u64, s64 *, bool);
-
-static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
- struct bch_io_opts opts)
-{
- op->c = c;
- op->end_io = NULL;
- op->flags = 0;
- op->written = 0;
- op->error = 0;
- op->csum_type = bch2_data_checksum_type(c, opts);
- op->compression_opt = opts.compression;
- op->nr_replicas = 0;
- op->nr_replicas_required = c->opts.data_replicas_required;
- op->watermark = BCH_WATERMARK_normal;
- op->incompressible = 0;
- op->open_buckets.nr = 0;
- op->devs_have.nr = 0;
- op->target = 0;
- op->opts = opts;
- op->subvol = 0;
- op->pos = POS_MAX;
- op->version = ZERO_VERSION;
- op->write_point = (struct write_point_specifier) { 0 };
- op->res = (struct disk_reservation) { 0 };
- op->new_i_size = U64_MAX;
- op->i_sectors_delta = 0;
- op->devs_need_flush = NULL;
-}
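-
-/*
- * Typical caller pattern (an illustrative sketch, not copied from a real
- * call site): initialize the op, fill in the fields bch2_write() asserts on
- * (pos, nr_replicas, write_point), then kick it off via its closure:
- *
- *	bch2_write_op_init(op, c, io_opts);
- *	op->pos		= POS(inum.inum, sector);
- *	op->subvol	= inum.subvol;
- *	op->nr_replicas	= io_opts.data_replicas;
- *	op->write_point	= writepoint_hashed((unsigned long) current);
- *	closure_call(&op->cl, bch2_write, NULL, NULL);
- */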
-
-CLOSURE_CALLBACK(bch2_write);
-void bch2_write_point_do_index_updates(struct work_struct *);
-
-static inline struct bch_write_bio *wbio_init(struct bio *bio)
-{
- struct bch_write_bio *wbio = to_wbio(bio);
-
- memset(&wbio->wbio, 0, sizeof(wbio->wbio));
- return wbio;
-}
-
-void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *);
-
-void bch2_fs_io_write_exit(struct bch_fs *);
-int bch2_fs_io_write_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_IO_WRITE_H */
diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h
deleted file mode 100644
index 3ef6df9145ef..000000000000
--- a/fs/bcachefs/io_write_types.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_WRITE_TYPES_H
-#define _BCACHEFS_IO_WRITE_TYPES_H
-
-#include "alloc_types.h"
-#include "btree_types.h"
-#include "buckets_types.h"
-#include "extents_types.h"
-#include "keylist_types.h"
-#include "opts.h"
-#include "super_types.h"
-
-#include <linux/llist.h>
-#include <linux/workqueue.h>
-
-struct bch_write_bio {
- struct_group(wbio,
- struct bch_fs *c;
- struct bch_write_bio *parent;
-
- u64 submit_time;
- u64 inode_offset;
- u64 nocow_bucket;
-
- struct bch_devs_list failed;
- u8 dev;
-
- unsigned split:1,
- bounce:1,
- put_bio:1,
- have_ioref:1,
- nocow:1,
- used_mempool:1,
- first_btree_write:1;
- );
-
- struct bio bio;
-};
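-
-/*
- * struct_group(wbio, ...) wraps the fields above in a named sub-struct that
- * can be addressed as a unit - it's what lets wbio_init() (io_write.h) zero
- * everything except the embedded bio in a single memset:
- *
- *	memset(&wbio->wbio, 0, sizeof(wbio->wbio));
- */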
-
-struct bch_write_op {
- struct closure cl;
- struct bch_fs *c;
- void (*end_io)(struct bch_write_op *);
- u64 start_time;
-
- unsigned written; /* sectors */
- u16 flags;
- s16 error; /* dio write path expects it to hold -ERESTARTSYS... */
-
- unsigned compression_opt:8;
- unsigned csum_type:4;
- unsigned nr_replicas:4;
- unsigned nr_replicas_required:4;
- unsigned watermark:3;
- unsigned incompressible:1;
- unsigned stripe_waited:1;
-
- struct bch_devs_list devs_have;
- u16 target;
- u16 nonce;
- struct bch_io_opts opts;
-
- u32 subvol;
- struct bpos pos;
- struct bversion version;
-
- /* For BCH_WRITE_data_encoded: */
- struct bch_extent_crc_unpacked crc;
-
- struct write_point_specifier write_point;
-
- struct write_point *wp;
- struct list_head wp_list;
-
- struct disk_reservation res;
-
- struct open_buckets open_buckets;
-
- u64 new_i_size;
- s64 i_sectors_delta;
-
- struct bch_devs_mask failed;
-
- struct keylist insert_keys;
- u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2];
-
- /*
- * Bitmask of devices that have had nocow writes issued to them since
- * last flush:
- */
- struct bch_devs_mask *devs_need_flush;
-
- /* Must be last: */
- struct bch_write_bio wbio;
-};
-
-#endif /* _BCACHEFS_IO_WRITE_TYPES_H */
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
deleted file mode 100644
index bfdaea6569ae..000000000000
--- a/fs/bcachefs/journal.c
+++ /dev/null
@@ -1,1717 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * bcachefs journalling code, for btree insertions
- *
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_methods.h"
-#include "btree_gc.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-#include "journal_sb.h"
-#include "journal_seq_blacklist.h"
-#include "trace.h"
-
-static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
-{
- return seq > j->seq_ondisk;
-}
-
-static bool __journal_entry_is_open(union journal_res_state state)
-{
- return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
-}
-
-static inline unsigned nr_unwritten_journal_entries(struct journal *j)
-{
- return atomic64_read(&j->seq) - j->seq_ondisk;
-}
-
-static bool journal_entry_is_open(struct journal *j)
-{
- return __journal_entry_is_open(j->reservations);
-}
-
-static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
-{
- union journal_res_state s = READ_ONCE(j->reservations);
- unsigned i = seq & JOURNAL_BUF_MASK;
- struct journal_buf *buf = j->buf + i;
-
- prt_printf(out, "seq:\t%llu\n", seq);
- printbuf_indent_add(out, 2);
-
- if (!buf->write_started)
- prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i & JOURNAL_STATE_BUF_MASK));
-
- struct closure *cl = &buf->io;
- int r = atomic_read(&cl->remaining);
- prt_printf(out, "io:\t%pS r %i\n", cl->fn, r & CLOSURE_REMAINING_MASK);
-
- if (buf->data) {
- prt_printf(out, "size:\t");
- prt_human_readable_u64(out, vstruct_bytes(buf->data));
- prt_newline(out);
- }
-
- prt_printf(out, "expires:\t");
- prt_printf(out, "%li jiffies\n", buf->expires - jiffies);
-
- prt_printf(out, "flags:\t");
- if (buf->noflush)
- prt_str(out, "noflush ");
- if (buf->must_flush)
- prt_str(out, "must_flush ");
- if (buf->separate_flush)
- prt_str(out, "separate_flush ");
- if (buf->need_flush_to_write_buffer)
- prt_str(out, "need_flush_to_write_buffer ");
- if (buf->write_started)
- prt_str(out, "write_started ");
- if (buf->write_allocated)
- prt_str(out, "write_allocated ");
- if (buf->write_done)
- prt_str(out, "write_done");
- prt_newline(out);
-
- printbuf_indent_sub(out, 2);
-}
-
-static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
-{
- lockdep_assert_held(&j->lock);
- out->atomic++;
-
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 24);
-
- for (u64 seq = journal_last_unwritten_seq(j);
- seq <= journal_cur_seq(j);
- seq++)
- bch2_journal_buf_to_text(out, j, seq);
- prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed");
-
- --out->atomic;
-}
-
-static inline struct journal_buf *
-journal_seq_to_buf(struct journal *j, u64 seq)
-{
- struct journal_buf *buf = NULL;
-
- EBUG_ON(seq > journal_cur_seq(j));
-
- if (journal_seq_unwritten(j, seq))
- buf = j->buf + (seq & JOURNAL_BUF_MASK);
- return buf;
-}
-
-static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
-{
- for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++)
- INIT_LIST_HEAD(&p->unflushed[i]);
- for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++)
- INIT_LIST_HEAD(&p->flushed[i]);
- atomic_set(&p->count, count);
- p->devs.nr = 0;
-}
-
-/*
- * Detect stuck journal conditions and trigger shutdown. Technically the journal
- * can end up stuck for a variety of reasons, such as a blocked I/O, journal
- * reservation lockup, etc. Since this is a fatal error with potentially
- * unpredictable characteristics, we want to be fairly conservative before we
- * decide to shut things down.
- *
- * Consider the journal stuck when it appears full with no ability to commit
- * btree transactions, discard journal buckets, or acquire a priority
- * (reserved watermark) reservation.
- */
-static inline bool
-journal_error_check_stuck(struct journal *j, int error, unsigned flags)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- bool stuck = false;
- struct printbuf buf = PRINTBUF;
-
- if (!(error == -BCH_ERR_journal_full ||
- error == -BCH_ERR_journal_pin_full) ||
- nr_unwritten_journal_entries(j) ||
- (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim)
- return stuck;
-
- spin_lock(&j->lock);
-
- if (j->can_discard) {
- spin_unlock(&j->lock);
- return stuck;
- }
-
- stuck = true;
-
- /*
- * The journal shutdown path will set ->err_seq, but do it here first to
- * serialize against concurrent failures and avoid duplicate error
- * reports.
- */
- if (j->err_seq) {
- spin_unlock(&j->lock);
- return stuck;
- }
- j->err_seq = journal_cur_seq(j);
- spin_unlock(&j->lock);
-
-	bch_err(c, "Journal stuck! Have a pre-reservation but journal full (error %s)",
- bch2_err_str(error));
- bch2_journal_debug_to_text(&buf, j);
- bch_err(c, "%s", buf.buf);
-
- printbuf_reset(&buf);
- bch2_journal_pins_to_text(&buf, j);
- bch_err(c, "Journal pins:\n%s", buf.buf);
- printbuf_exit(&buf);
-
- bch2_fatal_error(c);
- dump_stack();
-
- return stuck;
-}
-
-void bch2_journal_do_writes(struct journal *j)
-{
- for (u64 seq = journal_last_unwritten_seq(j);
- seq <= journal_cur_seq(j);
- seq++) {
- unsigned idx = seq & JOURNAL_BUF_MASK;
- struct journal_buf *w = j->buf + idx;
-
- if (w->write_started && !w->write_allocated)
- break;
- if (w->write_started)
- continue;
-
- if (!journal_state_seq_count(j, j->reservations, seq)) {
- j->seq_write_started = seq;
- w->write_started = true;
- closure_call(&w->io, bch2_journal_write, j->wq, NULL);
- }
-
- break;
- }
-}
-
-/*
- * Final processing when the last reference of a journal buffer has been
- * dropped. Drop the pin list reference acquired at journal entry open and write
- * the buffer, if requested.
- */
-void bch2_journal_buf_put_final(struct journal *j, u64 seq)
-{
- lockdep_assert_held(&j->lock);
-
- if (__bch2_journal_pin_put(j, seq))
- bch2_journal_reclaim_fast(j);
- bch2_journal_do_writes(j);
-
- /*
- * for __bch2_next_write_buffer_flush_journal_buf(), when quiescing an
- * open journal entry
- */
- wake_up(&j->wait);
-}
-
-/*
- * Close the currently open journal entry, if there is one:
- *
- * We don't close a journal_buf until the next journal_buf is finished writing,
- * and can be opened again - this also initializes the next journal_buf:
- */
-static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *buf = journal_cur_buf(j);
- union journal_res_state old, new;
- unsigned sectors;
-
- BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL &&
- closed_val != JOURNAL_ENTRY_ERROR_VAL);
-
- lockdep_assert_held(&j->lock);
-
- old.v = atomic64_read(&j->reservations.counter);
- do {
- new.v = old.v;
- new.cur_entry_offset = closed_val;
-
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ||
- old.cur_entry_offset == new.cur_entry_offset)
- return;
- } while (!atomic64_try_cmpxchg(&j->reservations.counter,
- &old.v, new.v));
-
- if (!__journal_entry_is_open(old))
- return;
-
- if (old.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL)
- old.cur_entry_offset = j->cur_entry_offset_if_blocked;
-
- /* Close out old buffer: */
- buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
-
- if (trace_journal_entry_close_enabled() && trace) {
- struct printbuf pbuf = PRINTBUF;
- pbuf.atomic++;
-
- prt_str(&pbuf, "entry size: ");
- prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data));
- prt_newline(&pbuf);
- bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT);
- trace_journal_entry_close(c, pbuf.buf);
- printbuf_exit(&pbuf);
- }
-
- sectors = vstruct_blocks_plus(buf->data, c->block_bits,
- buf->u64s_reserved) << c->block_bits;
- BUG_ON(sectors > buf->sectors);
- buf->sectors = sectors;
-
- /*
- * We have to set last_seq here, _before_ opening a new journal entry:
- *
-	 * A thread may replace an old pin with a new pin on its current
- * journal reservation - the expectation being that the journal will
- * contain either what the old pin protected or what the new pin
- * protects.
- *
- * After the old pin is dropped journal_last_seq() won't include the old
- * pin, so we can only write the updated last_seq on the entry that
- * contains whatever the new pin protects.
- *
- * Restated, we can _not_ update last_seq for a given entry if there
- * could be a newer entry open with reservations/pins that have been
- * taken against it.
- *
-	 * Hence, we want to update/set last_seq on the current journal entry right
- * before we open a new one:
- */
- buf->last_seq = journal_last_seq(j);
- buf->data->last_seq = cpu_to_le64(buf->last_seq);
- BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq));
-
- cancel_delayed_work(&j->write_work);
-
- bch2_journal_space_available(j);
-
- __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq));
-}
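-
-/*
- * __journal_entry_close() and journal_entry_open() both update the packed
- * reservation state with the same lockless idiom: snapshot the counter,
- * compute the new value, and retry atomic64_try_cmpxchg() until no other
- * CPU raced with us. A minimal sketch:
- *
- *	old.v = atomic64_read(&j->reservations.counter);
- *	do {
- *		new.v = old.v;
- *		new.cur_entry_offset = ...;
- *	} while (!atomic64_try_cmpxchg(&j->reservations.counter,
- *				       &old.v, new.v));
- *
- * atomic64_try_cmpxchg() reloads old.v on failure, so each retry starts from
- * the state that beat us.
- */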
-
-void bch2_journal_halt(struct journal *j)
-{
- spin_lock(&j->lock);
- __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true);
- if (!j->err_seq)
- j->err_seq = journal_cur_seq(j);
- journal_wake(j);
- spin_unlock(&j->lock);
-}
-
-void bch2_journal_halt_locked(struct journal *j)
-{
- lockdep_assert_held(&j->lock);
-
- __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true);
- if (!j->err_seq)
- j->err_seq = journal_cur_seq(j);
- journal_wake(j);
-}
-
-static bool journal_entry_want_write(struct journal *j)
-{
- bool ret = !journal_entry_is_open(j) ||
- journal_cur_seq(j) == journal_last_unwritten_seq(j);
-
- /* Don't close it yet if we already have a write in flight: */
- if (ret)
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
- else if (nr_unwritten_journal_entries(j)) {
- struct journal_buf *buf = journal_cur_buf(j);
-
- if (!buf->flush_time) {
- buf->flush_time = local_clock() ?: 1;
- buf->expires = jiffies;
- }
- }
-
- return ret;
-}
-
-bool bch2_journal_entry_close(struct journal *j)
-{
- bool ret;
-
- spin_lock(&j->lock);
- ret = journal_entry_want_write(j);
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-/*
- * Should _only_ be called from journal_res_get() - when we actually want a
- * journal reservation - an open journal entry means the journal is dirty:
- */
-static int journal_entry_open(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *buf = j->buf +
- ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK);
- union journal_res_state old, new;
- int u64s;
-
- lockdep_assert_held(&j->lock);
- BUG_ON(journal_entry_is_open(j));
- BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
-
- if (j->blocked)
- return -BCH_ERR_journal_blocked;
-
- if (j->cur_entry_error)
- return j->cur_entry_error;
-
- int ret = bch2_journal_error(j);
- if (unlikely(ret))
- return ret;
-
- if (!fifo_free(&j->pin))
- return -BCH_ERR_journal_pin_full;
-
- if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
- return -BCH_ERR_journal_max_in_flight;
-
- if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR)
- return -BCH_ERR_journal_max_open;
-
- if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) {
- bch_err(c, "cannot start: journal seq overflow");
- if (bch2_fs_emergency_read_only_locked(c))
- bch_err(c, "fatal error - emergency read only");
- return -BCH_ERR_journal_shutdown;
- }
-
- if (!j->free_buf && !buf->data)
- return -BCH_ERR_journal_buf_enomem; /* will retry after write completion frees up a buf */
-
- BUG_ON(!j->cur_entry_sectors);
-
- if (!buf->data) {
- swap(buf->data, j->free_buf);
- swap(buf->buf_size, j->free_buf_size);
- }
-
- buf->expires =
- (journal_cur_seq(j) == j->flushed_seq_ondisk
- ? jiffies
- : j->last_flush_write) +
- msecs_to_jiffies(c->opts.journal_flush_delay);
-
- buf->u64s_reserved = j->entry_u64s_reserved;
- buf->disk_sectors = j->cur_entry_sectors;
- buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9);
-
- u64s = (int) (buf->sectors << 9) / sizeof(u64) -
- journal_entry_overhead(j);
- u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
-
- if (u64s <= (ssize_t) j->early_journal_entries.nr)
- return -BCH_ERR_journal_full;
-
- if (fifo_empty(&j->pin) && j->reclaim_thread)
- wake_up_process(j->reclaim_thread);
-
- /*
- * The fifo_push() needs to happen at the same time as j->seq is
- * incremented for journal_last_seq() to be calculated correctly
- */
- atomic64_inc(&j->seq);
- journal_pin_list_init(fifo_push_ref(&j->pin), 1);
-
- BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
-
- BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
-
- bkey_extent_init(&buf->key);
- buf->noflush = false;
- buf->must_flush = false;
- buf->separate_flush = false;
- buf->flush_time = 0;
- buf->need_flush_to_write_buffer = true;
- buf->write_started = false;
- buf->write_allocated = false;
- buf->write_done = false;
-
- memset(buf->data, 0, sizeof(*buf->data));
- buf->data->seq = cpu_to_le64(journal_cur_seq(j));
- buf->data->u64s = 0;
-
- if (j->early_journal_entries.nr) {
- memcpy(buf->data->_data, j->early_journal_entries.data,
- j->early_journal_entries.nr * sizeof(u64));
- le32_add_cpu(&buf->data->u64s, j->early_journal_entries.nr);
- }
-
- /*
- * Must be set before marking the journal entry as open:
- */
- j->cur_entry_u64s = u64s;
-
- old.v = atomic64_read(&j->reservations.counter);
- do {
- new.v = old.v;
-
- BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL);
-
- new.idx++;
- BUG_ON(journal_state_count(new, new.idx));
- BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_STATE_BUF_MASK));
-
- journal_state_inc(&new);
-
- /* Handle any already added entries */
- new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
- } while (!atomic64_try_cmpxchg(&j->reservations.counter,
- &old.v, new.v));
-
- if (nr_unwritten_journal_entries(j) == 1)
- mod_delayed_work(j->wq,
- &j->write_work,
- msecs_to_jiffies(c->opts.journal_flush_delay));
- journal_wake(j);
-
- if (j->early_journal_entries.nr)
- darray_exit(&j->early_journal_entries);
- return 0;
-}
-
-static bool journal_quiesced(struct journal *j)
-{
- bool ret = atomic64_read(&j->seq) == j->seq_ondisk;
-
- if (!ret)
- bch2_journal_entry_close(j);
- return ret;
-}
-
-static void journal_quiesce(struct journal *j)
-{
- wait_event(j->wait, journal_quiesced(j));
-}
-
-static void journal_write_work(struct work_struct *work)
-{
- struct journal *j = container_of(work, struct journal, write_work.work);
-
- spin_lock(&j->lock);
- if (__journal_entry_is_open(j->reservations)) {
- long delta = journal_cur_buf(j)->expires - jiffies;
-
- if (delta > 0)
- mod_delayed_work(j->wq, &j->write_work, delta);
- else
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
- }
- spin_unlock(&j->lock);
-}
-
-static void journal_buf_prealloc(struct journal *j)
-{
- if (j->free_buf &&
- j->free_buf_size >= j->buf_size_want)
- return;
-
- unsigned buf_size = j->buf_size_want;
-
- spin_unlock(&j->lock);
- void *buf = kvmalloc(buf_size, GFP_NOFS);
- spin_lock(&j->lock);
-
- if (buf &&
- (!j->free_buf ||
- buf_size > j->free_buf_size)) {
- swap(buf, j->free_buf);
- swap(buf_size, j->free_buf_size);
- }
-
- if (unlikely(buf)) {
- spin_unlock(&j->lock);
- /* kvfree can sleep */
- kvfree(buf);
- spin_lock(&j->lock);
- }
-}
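-
-/*
- * journal_buf_prealloc() drops j->lock around kvmalloc()/kvfree() because
- * both can sleep; after retaking the lock it only installs the new buffer if
- * it's still an improvement over j->free_buf, and frees whichever buffer
- * lost the race.
- */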
-
-static int __journal_res_get(struct journal *j, struct journal_res *res,
- unsigned flags)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *buf;
- bool can_discard;
- int ret;
-retry:
- if (journal_res_get_fast(j, res, flags))
- return 0;
-
- ret = bch2_journal_error(j);
- if (unlikely(ret))
- return ret;
-
- if (j->blocked)
- return -BCH_ERR_journal_blocked;
-
- if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
- ret = -BCH_ERR_journal_full;
- can_discard = j->can_discard;
- goto out;
- }
-
- if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
- ret = -BCH_ERR_journal_max_in_flight;
- goto out;
- }
-
- spin_lock(&j->lock);
-
- journal_buf_prealloc(j);
-
-	/*
-	 * Recheck after taking the lock, so we don't race with another thread
-	 * that just did journal_entry_open() and end up calling
-	 * bch2_journal_entry_close() unnecessarily:
-	 */
- if (journal_res_get_fast(j, res, flags)) {
- ret = 0;
- goto unlock;
- }
-
- /*
- * If we couldn't get a reservation because the current buf filled up,
- * and we had room for a bigger entry on disk, signal that we want to
- * realloc the journal bufs:
- */
- buf = journal_cur_buf(j);
- if (journal_entry_is_open(j) &&
- buf->buf_size >> 9 < buf->disk_sectors &&
- buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
- j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
-
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
- ret = journal_entry_open(j) ?: -BCH_ERR_journal_retry_open;
-unlock:
- can_discard = j->can_discard;
- spin_unlock(&j->lock);
-out:
- if (likely(!ret))
- return 0;
- if (ret == -BCH_ERR_journal_retry_open)
- goto retry;
-
- if (journal_error_check_stuck(j, ret, flags))
- ret = -BCH_ERR_journal_stuck;
-
- if (ret == -BCH_ERR_journal_max_in_flight &&
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) &&
- trace_journal_entry_full_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_printbuf_make_room(&buf, 4096);
-
- spin_lock(&j->lock);
- prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
- bch2_journal_bufs_to_text(&buf, j);
- spin_unlock(&j->lock);
-
- trace_journal_entry_full(c, buf.buf);
- printbuf_exit(&buf);
- count_event(c, journal_entry_full);
- }
-
- if (ret == -BCH_ERR_journal_max_open &&
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) &&
- trace_journal_entry_full_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_printbuf_make_room(&buf, 4096);
-
- spin_lock(&j->lock);
- prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
- bch2_journal_bufs_to_text(&buf, j);
- spin_unlock(&j->lock);
-
- trace_journal_entry_full(c, buf.buf);
- printbuf_exit(&buf);
- count_event(c, journal_entry_full);
- }
-
- /*
- * Journal is full - can't rely on reclaim from work item due to
- * freezing:
- */
- if ((ret == -BCH_ERR_journal_full ||
- ret == -BCH_ERR_journal_pin_full) &&
- !(flags & JOURNAL_RES_GET_NONBLOCK)) {
- if (can_discard) {
- bch2_journal_do_discards(j);
- goto retry;
- }
-
- if (mutex_trylock(&j->reclaim_lock)) {
- bch2_journal_reclaim(j);
- mutex_unlock(&j->reclaim_lock);
- }
- }
-
- return ret;
-}
-
-static unsigned max_dev_latency(struct bch_fs *c)
-{
- u64 nsecs = 0;
-
- for_each_rw_member(c, ca)
- nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration);
-
- return nsecs_to_jiffies(nsecs);
-}
-
-/*
- * Essentially the entry function to the journaling code. When bcachefs is doing
- * a btree insert, it calls this function to get the current journal write.
- * Journal write is the structure used set up journal writes. The calling
- * function will then add its keys to the structure, queuing them for the next
- * write.
- *
- * To ensure forward progress, the current task must not be holding any
- * btree node write locks.
- */
-int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
- unsigned flags,
- struct btree_trans *trans)
-{
- int ret;
-
- if (closure_wait_event_timeout(&j->async_wait,
- !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
- (flags & JOURNAL_RES_GET_NONBLOCK),
- HZ))
- return ret;
-
- if (trans)
- bch2_trans_unlock_long(trans);
-
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- int remaining_wait = max(max_dev_latency(c) * 2, HZ * 10);
-
- remaining_wait = max(0, remaining_wait - HZ);
-
- if (closure_wait_event_timeout(&j->async_wait,
- !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
- (flags & JOURNAL_RES_GET_NONBLOCK),
- remaining_wait))
- return ret;
-
- struct printbuf buf = PRINTBUF;
- bch2_journal_debug_to_text(&buf, j);
- bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s",
- buf.buf);
- printbuf_exit(&buf);
-
- closure_wait_event(&j->async_wait,
- !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
- (flags & JOURNAL_RES_GET_NONBLOCK));
- return ret;
-}
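-
-/*
- * The slowpath above escalates in three stages: wait up to a second with
- * btree locks still held, then drop them (bch2_trans_unlock_long()) and wait
- * out the rest of max(2 * worst observed device write latency, 10s), and
- * finally dump the journal state and wait indefinitely.
- */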
-
-/* journal_entry_res: */
-
-void bch2_journal_entry_res_resize(struct journal *j,
- struct journal_entry_res *res,
- unsigned new_u64s)
-{
- union journal_res_state state;
- int d = new_u64s - res->u64s;
-
- spin_lock(&j->lock);
-
- j->entry_u64s_reserved += d;
- if (d <= 0)
- goto out;
-
- j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d);
- state = READ_ONCE(j->reservations);
-
- if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL &&
- state.cur_entry_offset > j->cur_entry_u64s) {
- j->cur_entry_u64s += d;
- /*
- * Not enough room in current journal entry, have to flush it:
- */
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
- } else {
- journal_cur_buf(j)->u64s_reserved += d;
- }
-out:
- spin_unlock(&j->lock);
- res->u64s += d;
-}
-
-/* journal flushing: */
-
-/**
- * bch2_journal_flush_seq_async - wait for a journal entry to be written
- * @j: journal object
- * @seq: seq to flush
- * @parent: closure object to wait with
- * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed,
- * -BCH_ERR_journal_flush_err if @seq will never be flushed
- *
- * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if
- * necessary
- */
-int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
- struct closure *parent)
-{
- struct journal_buf *buf;
- int ret = 0;
-
- if (seq <= j->flushed_seq_ondisk)
- return 1;
-
- spin_lock(&j->lock);
-
- if (WARN_ONCE(seq > journal_cur_seq(j),
- "requested to flush journal seq %llu, but currently at %llu",
- seq, journal_cur_seq(j)))
- goto out;
-
- /* Recheck under lock: */
- if (j->err_seq && seq >= j->err_seq) {
- ret = -BCH_ERR_journal_flush_err;
- goto out;
- }
-
- if (seq <= j->flushed_seq_ondisk) {
- ret = 1;
- goto out;
- }
-
- /* if seq was written, but not flushed - flush a newer one instead */
- seq = max(seq, journal_last_unwritten_seq(j));
-
-recheck_need_open:
- if (seq > journal_cur_seq(j)) {
- struct journal_res res = { 0 };
-
- if (journal_entry_is_open(j))
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
-
- spin_unlock(&j->lock);
-
- /*
- * We're called from bch2_journal_flush_seq() -> wait_event();
- * but this might block. We won't usually block, so we won't
- * livelock:
- */
- sched_annotate_sleep();
- ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL);
- if (ret)
- return ret;
-
- seq = res.seq;
- buf = journal_seq_to_buf(j, seq);
- buf->must_flush = true;
-
- if (!buf->flush_time) {
- buf->flush_time = local_clock() ?: 1;
- buf->expires = jiffies;
- }
-
- if (parent && !closure_wait(&buf->wait, parent))
- BUG();
-
- bch2_journal_res_put(j, &res);
-
- spin_lock(&j->lock);
- goto want_write;
- }
-
- /*
- * if write was kicked off without a flush, or if we promised it
- * wouldn't be a flush, flush the next sequence number instead
- */
- buf = journal_seq_to_buf(j, seq);
- if (buf->noflush) {
- seq++;
- goto recheck_need_open;
- }
-
- buf->must_flush = true;
- j->flushing_seq = max(j->flushing_seq, seq);
-
- if (parent && !closure_wait(&buf->wait, parent))
- BUG();
-want_write:
- if (seq == journal_cur_seq(j))
- journal_entry_want_write(j);
-out:
- spin_unlock(&j->lock);
- return ret;
-}
-
-int bch2_journal_flush_seq(struct journal *j, u64 seq, unsigned task_state)
-{
- u64 start_time = local_clock();
- int ret, ret2;
-
- /*
- * Don't update time_stats when @seq is already flushed:
- */
- if (seq <= j->flushed_seq_ondisk)
- return 0;
-
- ret = wait_event_state(j->wait,
- (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)),
- task_state);
-
- if (!ret)
- bch2_time_stats_update(j->flush_seq_time, start_time);
-
- return ret ?: ret2 < 0 ? ret2 : 0;
-}
-
-/*
- * bch2_journal_flush_async - if there is an open journal entry, or a journal
- * still being written, write it and wait for the write to complete
- */
-void bch2_journal_flush_async(struct journal *j, struct closure *parent)
-{
- bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent);
-}
-
-int bch2_journal_flush(struct journal *j)
-{
- return bch2_journal_flush_seq(j, atomic64_read(&j->seq), TASK_UNINTERRUPTIBLE);
-}
-
-/*
- * bch2_journal_noflush_seq - ask the journal not to issue any flushes in the
- * range [start, end)
- */
-bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- u64 unwritten_seq;
- bool ret = false;
-
- if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush)))
- return false;
-
- if (c->journal.flushed_seq_ondisk >= start)
- return false;
-
- spin_lock(&j->lock);
- if (c->journal.flushed_seq_ondisk >= start)
- goto out;
-
- for (unwritten_seq = journal_last_unwritten_seq(j);
- unwritten_seq < end;
- unwritten_seq++) {
- struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
-
-		/* journal flush already in flight, or flush requested */
- if (buf->must_flush)
- goto out;
-
- buf->noflush = true;
- }
-
- ret = true;
-out:
- spin_unlock(&j->lock);
- return ret;
-}
-
-static int __bch2_journal_meta(struct journal *j)
-{
- struct journal_res res = {};
- int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL);
- if (ret)
- return ret;
-
- struct journal_buf *buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
- buf->must_flush = true;
-
- if (!buf->flush_time) {
- buf->flush_time = local_clock() ?: 1;
- buf->expires = jiffies;
- }
-
- bch2_journal_res_put(j, &res);
-
- return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE);
-}
-
-int bch2_journal_meta(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal))
- return -BCH_ERR_erofs_no_writes;
-
- int ret = __bch2_journal_meta(j);
- bch2_write_ref_put(c, BCH_WRITE_REF_journal);
- return ret;
-}
-
-/* block/unblock the journal: */
-
-void bch2_journal_unblock(struct journal *j)
-{
- spin_lock(&j->lock);
- if (!--j->blocked &&
- j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL &&
- j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) {
- union journal_res_state old, new;
-
- old.v = atomic64_read(&j->reservations.counter);
- do {
- new.v = old.v;
- new.cur_entry_offset = j->cur_entry_offset_if_blocked;
- } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v));
- }
- spin_unlock(&j->lock);
-
- journal_wake(j);
-}
-
-static void __bch2_journal_block(struct journal *j)
-{
- if (!j->blocked++) {
- union journal_res_state old, new;
-
- old.v = atomic64_read(&j->reservations.counter);
- do {
- j->cur_entry_offset_if_blocked = old.cur_entry_offset;
-
- if (j->cur_entry_offset_if_blocked >= JOURNAL_ENTRY_CLOSED_VAL)
- break;
-
- new.v = old.v;
- new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL;
- } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v));
-
- if (old.cur_entry_offset < JOURNAL_ENTRY_BLOCKED_VAL)
- journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset);
- }
-}
-
-void bch2_journal_block(struct journal *j)
-{
- spin_lock(&j->lock);
- __bch2_journal_block(j);
- spin_unlock(&j->lock);
-
- journal_quiesce(j);
-}
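-
-/*
- * j->blocked is a nesting count: the first bch2_journal_block() stashes the
- * live entry offset in cur_entry_offset_if_blocked and replaces it with
- * JOURNAL_ENTRY_BLOCKED_VAL, so new reservations fail fast; the matching
- * bch2_journal_unblock() restores the saved offset.
- */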
-
-static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j,
- u64 max_seq, bool *blocked)
-{
- struct journal_buf *ret = NULL;
-
-	/* We're inside wait_event(), but using mutex_lock(): */
- sched_annotate_sleep();
- mutex_lock(&j->buf_lock);
- spin_lock(&j->lock);
- max_seq = min(max_seq, journal_cur_seq(j));
-
- for (u64 seq = journal_last_unwritten_seq(j);
- seq <= max_seq;
- seq++) {
- unsigned idx = seq & JOURNAL_BUF_MASK;
- struct journal_buf *buf = j->buf + idx;
-
- if (buf->need_flush_to_write_buffer) {
- union journal_res_state s;
- s.v = atomic64_read_acquire(&j->reservations.counter);
-
- unsigned open = seq == journal_cur_seq(j) && __journal_entry_is_open(s);
-
- if (open && !*blocked) {
- __bch2_journal_block(j);
- *blocked = true;
- }
-
- ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open
- ? ERR_PTR(-EAGAIN)
- : buf;
- break;
- }
- }
-
- spin_unlock(&j->lock);
- if (IS_ERR_OR_NULL(ret))
- mutex_unlock(&j->buf_lock);
- return ret;
-}
-
-struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j,
- u64 max_seq, bool *blocked)
-{
- struct journal_buf *ret;
- *blocked = false;
-
- wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j,
- max_seq, blocked)) != ERR_PTR(-EAGAIN));
- if (IS_ERR_OR_NULL(ret) && *blocked)
- bch2_journal_unblock(j);
-
- return ret;
-}
-
-/* allocate journal on a device: */
-
-static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr,
- bool new_fs, struct closure *cl)
-{
- struct bch_fs *c = ca->fs;
- struct journal_device *ja = &ca->journal;
- u64 *new_bucket_seq = NULL, *new_buckets = NULL;
- struct open_bucket **ob = NULL;
- long *bu = NULL;
- unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr;
- int ret = 0;
-
- BUG_ON(nr <= ja->nr);
-
- bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL);
- ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL);
- new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL);
- new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL);
- if (!bu || !ob || !new_buckets || !new_bucket_seq) {
- ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
- goto err_free;
- }
-
- for (nr_got = 0; nr_got < nr_want; nr_got++) {
- enum bch_watermark watermark = new_fs
- ? BCH_WATERMARK_btree
- : BCH_WATERMARK_normal;
-
- ob[nr_got] = bch2_bucket_alloc(c, ca, watermark,
- BCH_DATA_journal, cl);
- ret = PTR_ERR_OR_ZERO(ob[nr_got]);
- if (ret)
- break;
-
- if (!new_fs) {
- ret = bch2_trans_run(c,
- bch2_trans_mark_metadata_bucket(trans, ca,
- ob[nr_got]->bucket, BCH_DATA_journal,
- ca->mi.bucket_size, BTREE_TRIGGER_transactional));
- if (ret) {
- bch2_open_bucket_put(c, ob[nr_got]);
- bch_err_msg(c, ret, "marking new journal buckets");
- break;
- }
- }
-
- bu[nr_got] = ob[nr_got]->bucket;
- }
-
- if (!nr_got)
- goto err_free;
-
- /* Don't return an error if we successfully allocated some buckets: */
- ret = 0;
-
- if (c) {
- bch2_journal_flush_all_pins(&c->journal);
- bch2_journal_block(&c->journal);
- mutex_lock(&c->sb_lock);
- }
-
- memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
- memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
-
- BUG_ON(ja->discard_idx > ja->nr);
-
- pos = ja->discard_idx ?: ja->nr;
-
- memmove(new_buckets + pos + nr_got,
- new_buckets + pos,
- sizeof(new_buckets[0]) * (ja->nr - pos));
- memmove(new_bucket_seq + pos + nr_got,
- new_bucket_seq + pos,
- sizeof(new_bucket_seq[0]) * (ja->nr - pos));
-
- for (i = 0; i < nr_got; i++) {
- new_buckets[pos + i] = bu[i];
- new_bucket_seq[pos + i] = 0;
- }
-
- nr = ja->nr + nr_got;
-
- ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr);
- if (ret)
- goto err_unblock;
-
- bch2_write_super(c);
-
- /* Commit: */
- if (c)
- spin_lock(&c->journal.lock);
-
- swap(new_buckets, ja->buckets);
- swap(new_bucket_seq, ja->bucket_seq);
- ja->nr = nr;
-
- if (pos <= ja->discard_idx)
- ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr;
- if (pos <= ja->dirty_idx_ondisk)
- ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr;
- if (pos <= ja->dirty_idx)
- ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr;
- if (pos <= ja->cur_idx)
- ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr;
-
- if (c)
- spin_unlock(&c->journal.lock);
-err_unblock:
- if (c) {
- bch2_journal_unblock(&c->journal);
- mutex_unlock(&c->sb_lock);
- }
-
- if (ret && !new_fs)
- for (i = 0; i < nr_got; i++)
- bch2_trans_run(c,
- bch2_trans_mark_metadata_bucket(trans, ca,
- bu[i], BCH_DATA_free, 0,
- BTREE_TRIGGER_transactional));
-err_free:
- for (i = 0; i < nr_got; i++)
- bch2_open_bucket_put(c, ob[i]);
-
- kfree(new_bucket_seq);
- kfree(new_buckets);
- kfree(ob);
- kfree(bu);
- return ret;
-}
-
-static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca,
- unsigned nr, bool new_fs)
-{
- struct journal_device *ja = &ca->journal;
- int ret = 0;
-
- struct closure cl;
- closure_init_stack(&cl);
-
- /* don't handle reducing nr of buckets yet: */
- if (nr < ja->nr)
- return 0;
-
- while (!ret && ja->nr < nr) {
- struct disk_reservation disk_res = { 0, 0, 0 };
-
- /*
- * note: journal buckets aren't really counted as _sectors_ used yet, so
- * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
- * when space used goes up without a reservation - but we do need the
- * reservation to ensure we'll actually be able to allocate:
- *
- * XXX: that's not right, disk reservations only ensure a
- * filesystem-wide allocation will succeed, this is a device
- * specific allocation - we can hang here:
- */
- if (!new_fs) {
- ret = bch2_disk_reservation_get(c, &disk_res,
- bucket_to_sector(ca, nr - ja->nr), 1, 0);
- if (ret)
- break;
- }
-
- ret = bch2_set_nr_journal_buckets_iter(ca, nr, new_fs, &cl);
-
- if (ret == -BCH_ERR_bucket_alloc_blocked ||
- ret == -BCH_ERR_open_buckets_empty)
- ret = 0; /* wait and retry */
-
- bch2_disk_reservation_put(c, &disk_res);
- closure_sync(&cl);
- }
-
- return ret;
-}
-
-/*
- * Allocate more journal space at runtime - not currently making use of it, but
- * the code works:
- */
-int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
- unsigned nr)
-{
- down_write(&c->state_lock);
- int ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, false);
- up_write(&c->state_lock);
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
-{
- unsigned nr;
- int ret;
-
- if (dynamic_fault("bcachefs:add:journal_alloc")) {
- ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
- goto err;
- }
-
- /* 1/128th of the device by default: */
- nr = ca->mi.nbuckets >> 7;
-
- /*
- * clamp journal size to 8192 buckets or 8GB (in sectors), whichever
- * is smaller:
- */
- nr = clamp_t(unsigned, nr,
- BCH_JOURNAL_BUCKETS_MIN,
- min(1 << 13,
- (1 << 24) / ca->mi.bucket_size));
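-	/*
-	 * e.g. with 256KiB buckets (512 sectors), (1 << 24) / 512 = 32768, so
-	 * the 8192 bucket cap wins and the journal tops out at 2GiB; with 4MiB
-	 * buckets (8192 sectors) the size cap wins at 2048 buckets, i.e. 8GiB.
-	 */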
-
- ret = bch2_set_nr_journal_buckets_loop(ca->fs, ca, nr, new_fs);
-err:
- bch_err_fn(ca, ret);
- return ret;
-}
-
-int bch2_fs_journal_alloc(struct bch_fs *c)
-{
- for_each_online_member(c, ca) {
- if (ca->journal.nr)
- continue;
-
- int ret = bch2_dev_journal_alloc(ca, true);
- if (ret) {
- percpu_ref_put(&ca->io_ref);
- return ret;
- }
- }
-
- return 0;
-}
-
-/* startup/shutdown: */
-
-static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
-{
- bool ret = false;
- u64 seq;
-
- spin_lock(&j->lock);
- for (seq = journal_last_unwritten_seq(j);
- seq <= journal_cur_seq(j) && !ret;
- seq++) {
- struct journal_buf *buf = journal_seq_to_buf(j, seq);
-
- if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx))
- ret = true;
- }
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
-{
- wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx));
-}
-
-void bch2_fs_journal_stop(struct journal *j)
-{
- if (!test_bit(JOURNAL_running, &j->flags))
- return;
-
- bch2_journal_reclaim_stop(j);
- bch2_journal_flush_all_pins(j);
-
- wait_event(j->wait, bch2_journal_entry_close(j));
-
- /*
- * Always write a new journal entry, to make sure the clock hands are up
- * to date (and match the superblock)
- */
- __bch2_journal_meta(j);
-
- journal_quiesce(j);
- cancel_delayed_work_sync(&j->write_work);
-
- WARN(!bch2_journal_error(j) &&
- test_bit(JOURNAL_replay_done, &j->flags) &&
- j->last_empty_seq != journal_cur_seq(j),
- "journal shutdown error: cur seq %llu but last empty seq %llu",
- journal_cur_seq(j), j->last_empty_seq);
-
- if (!bch2_journal_error(j))
- clear_bit(JOURNAL_running, &j->flags);
-}
-
-int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_entry_pin_list *p;
- struct journal_replay *i, **_i;
- struct genradix_iter iter;
- bool had_entries = false;
- u64 last_seq = cur_seq, nr, seq;
-
- if (cur_seq >= JOURNAL_SEQ_MAX) {
- bch_err(c, "cannot start: journal seq overflow");
- return -EINVAL;
- }
-
- genradix_for_each_reverse(&c->journal_entries, iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- last_seq = le64_to_cpu(i->j.last_seq);
- break;
- }
-
- nr = cur_seq - last_seq;
-
- if (nr + 1 > j->pin.size) {
- free_fifo(&j->pin);
- init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL);
- if (!j->pin.data) {
- bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
- return -BCH_ERR_ENOMEM_journal_pin_fifo;
- }
- }
-
- j->replay_journal_seq = last_seq;
- j->replay_journal_seq_end = cur_seq;
- j->last_seq_ondisk = last_seq;
- j->flushed_seq_ondisk = cur_seq - 1;
- j->seq_write_started = cur_seq - 1;
- j->seq_ondisk = cur_seq - 1;
- j->pin.front = last_seq;
- j->pin.back = cur_seq;
- atomic64_set(&j->seq, cur_seq - 1);
-
- fifo_for_each_entry_ptr(p, &j->pin, seq)
- journal_pin_list_init(p, 1);
-
- genradix_for_each(&c->journal_entries, iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- seq = le64_to_cpu(i->j.seq);
- BUG_ON(seq >= cur_seq);
-
- if (seq < last_seq)
- continue;
-
- if (journal_entry_empty(&i->j))
- j->last_empty_seq = le64_to_cpu(i->j.seq);
-
- p = journal_seq_pin(j, seq);
-
- p->devs.nr = 0;
- darray_for_each(i->ptrs, ptr)
- bch2_dev_list_add_dev(&p->devs, ptr->dev);
-
- had_entries = true;
- }
-
- if (!had_entries)
- j->last_empty_seq = cur_seq - 1; /* to match j->seq */
-
- spin_lock(&j->lock);
-
- set_bit(JOURNAL_running, &j->flags);
- j->last_flush_write = jiffies;
-
- j->reservations.idx = journal_cur_seq(j);
-
- c->last_bucket_seq_cleanup = journal_cur_seq(j);
-
- bch2_journal_space_available(j);
- spin_unlock(&j->lock);
-
- return bch2_journal_reclaim_start(j);
-}
-
-/* init/exit: */
-
-void bch2_dev_journal_exit(struct bch_dev *ca)
-{
- struct journal_device *ja = &ca->journal;
-
- for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
- kfree(ja->bio[i]);
- ja->bio[i] = NULL;
- }
-
- kfree(ja->buckets);
- kfree(ja->bucket_seq);
- ja->buckets = NULL;
- ja->bucket_seq = NULL;
-}
-
-int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
-{
- struct journal_device *ja = &ca->journal;
- struct bch_sb_field_journal *journal_buckets =
- bch2_sb_field_get(sb, journal);
- struct bch_sb_field_journal_v2 *journal_buckets_v2 =
- bch2_sb_field_get(sb, journal_v2);
-
- ja->nr = 0;
-
- if (journal_buckets_v2) {
- unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
-
- for (unsigned i = 0; i < nr; i++)
- ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
- } else if (journal_buckets) {
- ja->nr = bch2_nr_journal_buckets(journal_buckets);
- }
-
- ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
- if (!ja->bucket_seq)
- return -BCH_ERR_ENOMEM_dev_journal_init;
-
- unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
-
- for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
- ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
- nr_bvecs), GFP_KERNEL);
- if (!ja->bio[i])
- return -BCH_ERR_ENOMEM_dev_journal_init;
-
- ja->bio[i]->ca = ca;
- ja->bio[i]->buf_idx = i;
- bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0);
- }
-
- ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
- if (!ja->buckets)
- return -BCH_ERR_ENOMEM_dev_journal_init;
-
- if (journal_buckets_v2) {
- unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
- unsigned dst = 0;
-
- for (unsigned i = 0; i < nr; i++)
- for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
- ja->buckets[dst++] =
- le64_to_cpu(journal_buckets_v2->d[i].start) + j;
- } else if (journal_buckets) {
- for (unsigned i = 0; i < ja->nr; i++)
- ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
- }
-
- return 0;
-}
-
-void bch2_fs_journal_exit(struct journal *j)
-{
- if (j->wq)
- destroy_workqueue(j->wq);
-
- darray_exit(&j->early_journal_entries);
-
- for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
- kvfree(j->buf[i].data);
- kvfree(j->free_buf);
- free_fifo(&j->pin);
-}
-
-int bch2_fs_journal_init(struct journal *j)
-{
- static struct lock_class_key res_key;
-
- mutex_init(&j->buf_lock);
- spin_lock_init(&j->lock);
- spin_lock_init(&j->err_lock);
- init_waitqueue_head(&j->wait);
- INIT_DELAYED_WORK(&j->write_work, journal_write_work);
- init_waitqueue_head(&j->reclaim_wait);
- init_waitqueue_head(&j->pin_flush_wait);
- mutex_init(&j->reclaim_lock);
- mutex_init(&j->discard_lock);
-
- lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
-
- atomic64_set(&j->reservations.counter,
- ((union journal_res_state)
- { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
-
- if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)))
- return -BCH_ERR_ENOMEM_journal_pin_fifo;
-
- j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN;
- j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL);
- if (!j->free_buf)
- return -BCH_ERR_ENOMEM_journal_buf;
-
- for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
- j->buf[i].idx = i;
-
- j->pin.front = j->pin.back = 1;
-
- j->wq = alloc_workqueue("bcachefs_journal",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512);
- if (!j->wq)
- return -BCH_ERR_ENOMEM_fs_other_alloc;
- return 0;
-}
-
-/* debug: */
-
-static const char * const bch2_journal_flags_strs[] = {
-#define x(n) #n,
- JOURNAL_FLAGS()
-#undef x
- NULL
-};
-
-void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- union journal_res_state s;
- unsigned long now = jiffies;
- u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes;
-
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 28);
- out->atomic++;
-
- rcu_read_lock();
- s = READ_ONCE(j->reservations);
-
- prt_printf(out, "flags:\t");
- prt_bitflags(out, bch2_journal_flags_strs, j->flags);
- prt_newline(out);
- prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size);
- prt_printf(out, "seq:\t%llu\n", journal_cur_seq(j));
- prt_printf(out, "seq_ondisk:\t%llu\n", j->seq_ondisk);
- prt_printf(out, "last_seq:\t%llu\n", journal_last_seq(j));
- prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
- prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
- prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]);
- prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
- prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
- prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
- prt_printf(out, "average write size:\t");
- prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0);
- prt_newline(out);
- prt_printf(out, "free buf:\t%u\n", j->free_buf ? j->free_buf_size : 0);
- prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
- prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
- prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked);
- prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
- ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
- prt_printf(out, "blocked:\t%u\n", j->blocked);
- prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
- prt_printf(out, "current entry error:\t%s\n", bch2_err_str(j->cur_entry_error));
- prt_printf(out, "current entry:\t");
-
- switch (s.cur_entry_offset) {
- case JOURNAL_ENTRY_ERROR_VAL:
- prt_printf(out, "error\n");
- break;
- case JOURNAL_ENTRY_CLOSED_VAL:
- prt_printf(out, "closed\n");
- break;
- case JOURNAL_ENTRY_BLOCKED_VAL:
- prt_printf(out, "blocked\n");
- break;
- default:
- prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s);
- break;
- }
-
- prt_printf(out, "unwritten entries:\n");
- bch2_journal_bufs_to_text(out, j);
-
- prt_printf(out, "space:\n");
- printbuf_indent_add(out, 2);
- prt_printf(out, "discarded\t%u:%u\n",
- j->space[journal_space_discarded].next_entry,
- j->space[journal_space_discarded].total);
- prt_printf(out, "clean ondisk\t%u:%u\n",
- j->space[journal_space_clean_ondisk].next_entry,
- j->space[journal_space_clean_ondisk].total);
- prt_printf(out, "clean\t%u:%u\n",
- j->space[journal_space_clean].next_entry,
- j->space[journal_space_clean].total);
- prt_printf(out, "total\t%u:%u\n",
- j->space[journal_space_total].next_entry,
- j->space[journal_space_total].total);
- printbuf_indent_sub(out, 2);
-
- for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
- if (!ca->mi.durability)
- continue;
-
- struct journal_device *ja = &ca->journal;
-
- if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d))
- continue;
-
- if (!ja->nr)
- continue;
-
- prt_printf(out, "dev %u:\n", ca->dev_idx);
- prt_printf(out, "durability %u:\n", ca->mi.durability);
- printbuf_indent_add(out, 2);
- prt_printf(out, "nr\t%u\n", ja->nr);
- prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size);
- prt_printf(out, "available\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
- prt_printf(out, "discard_idx\t%u\n", ja->discard_idx);
- prt_printf(out, "dirty_ondisk\t%u (seq %llu)\n",ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]);
- prt_printf(out, "dirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]);
- prt_printf(out, "cur_idx\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
- printbuf_indent_sub(out, 2);
- }
-
- prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required);
-
- rcu_read_unlock();
-
- --out->atomic;
-}
-
-void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
-{
- spin_lock(&j->lock);
- __bch2_journal_debug_to_text(out, j);
- spin_unlock(&j->lock);
-}
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
deleted file mode 100644
index 47828771f9c2..000000000000
--- a/fs/bcachefs/journal.h
+++ /dev/null
@@ -1,468 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_H
-#define _BCACHEFS_JOURNAL_H
-
-/*
- * THE JOURNAL:
- *
- * The primary purpose of the journal is to log updates (insertions) to the
- * b-tree, to avoid having to do synchronous updates to the b-tree on disk.
- *
- * Without the journal, the b-tree is always internally consistent on
- * disk - and in fact, in the earliest incarnations bcache didn't have a journal
- * but did handle unclean shutdowns by doing all index updates synchronously
- * (with coalescing).
- *
- * Updates to interior nodes still happen synchronously and without the journal
- * (for simplicity) - this may change eventually but updates to interior nodes
- * are rare enough it's not a huge priority.
- *
- * This means the journal is relatively separate from the b-tree; it consists of
- * just a list of keys and journal replay consists of just redoing those
- * insertions in the same order that they appear in the journal.
- *
- * PERSISTENCE:
- *
- * For synchronous updates (where we're waiting on the index update to hit
- * disk), the journal entry will be written out immediately (or as soon as
- * possible, if the write for the previous journal entry was still in flight).
- *
- * Synchronous updates are specified by passing a closure (@flush_cl) to
- * bch2_btree_insert() or bch2_btree_insert_node(), which then pass that parameter
- * down to the journalling code. That closure will wait on the journal write to
- * complete (via closure_wait()).
- *
- * If the index update wasn't synchronous, the journal entry will be
- * written out after 10 ms have elapsed, by default (the delay_ms field
- * in struct journal).
- *
- * JOURNAL ENTRIES:
- *
- * A journal entry is variable size (struct jset), it's got a fixed length
- * header and then a variable number of struct jset_entry entries.
- *
- * Journal entries are identified by monotonically increasing 64 bit sequence
- * numbers - jset->seq; other places in the code refer to this sequence number.
- *
- * A jset_entry entry contains one or more bkeys (which is what gets inserted
- * into the b-tree). We need a container to indicate which b-tree the key is
- * for; also, the roots of the various b-trees are stored in jset_entry entries
- * (one for each b-tree) - this lets us add new b-tree types without changing
- * the on disk format.
- *
- * We also keep some things in the journal header that are logically part of the
- * superblock - all the things that are frequently updated. This is for future
- * bcache on raw flash support; the superblock (which will become another
- * journal) can't be moved or wear leveled, so it contains just enough
- * information to find the main journal, and the superblock only has to be
- * rewritten when we want to move/wear level the main journal.
- *
- * JOURNAL LAYOUT ON DISK:
- *
- * The journal is written to a ringbuffer of buckets (which is kept in the
- * superblock); the individual buckets are not necessarily contiguous on disk
- * which means that journal entries are not allowed to span buckets, but also
- * that we can resize the journal at runtime if desired (unimplemented).
- *
- * The journal buckets exist in the same pool as all the other buckets that are
- * managed by the allocator and garbage collection - garbage collection marks
- * the journal buckets as metadata buckets.
- *
- * OPEN/DIRTY JOURNAL ENTRIES:
- *
- * Open/dirty journal entries are journal entries that contain b-tree updates
- * that have not yet been written out to the b-tree on disk. We have to track
- * which journal entries are dirty, and we also have to avoid wrapping around
- * the journal and overwriting old but still dirty journal entries with new
- * journal entries.
- *
- * On disk, this is represented with the "last_seq" field of struct jset;
- * last_seq is the first sequence number that journal replay has to replay.
- *
- * To avoid overwriting dirty journal entries on disk, we keep a mapping (in
- * journal_device->bucket_seq) of, for each journal bucket, the highest
- * sequence number of any journal entry it contains. Then, by comparing that
- * against last_seq we can determine whether that journal bucket contains
- * dirty journal entries or not.
- *
- * To track which journal entries are dirty, we maintain a fifo of refcounts
- * (where each entry corresponds to a specific sequence number) - when a ref
- * goes to 0, that journal entry is no longer dirty.
- *
- * Journalling of index updates is done at the same time as the b-tree itself is
- * being modified (see btree_insert_key()); when we add the key to the journal
- * the pending b-tree write takes a ref on the journal entry the key was added
- * to. If a pending b-tree write would need to take refs on multiple dirty
- * journal entries, it only keeps the ref on the oldest one (since a newer
- * journal entry will still be replayed if an older entry was dirty).
- *
- * JOURNAL FILLING UP:
- *
- * There are two ways the journal could fill up; either we could run out of
- * space to write to, or we could have too many open journal entries and run out
- * of room in the fifo of refcounts. Since those refcounts are decremented
- * without any locking we can't safely resize that fifo, so we handle it the
- * same way.
- *
- * If the journal fills up, we start flushing dirty btree nodes until we can
- * allocate space for a journal write again - preferentially flushing btree
- * nodes that are pinning the oldest journal entries first.
- */
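
As a rough model of the dirty-entry tracking described above -- toy types only, not the real journal_entry_pin machinery, and with locking elided:

/*
 * Toy model: each open sequence number carries a refcount; last_seq is
 * the oldest sequence whose count hasn't yet dropped to zero.
 */
struct toy_journal {
	atomic_t	pin_count[16];	/* one refcount per open seq */
	u64		front;		/* == last_seq, oldest dirty entry */
	u64		back;		/* one past the newest entry */
};

static void toy_pin(struct toy_journal *j, u64 seq)
{
	atomic_inc(&j->pin_count[seq % 16]);
}

static void toy_unpin(struct toy_journal *j, u64 seq)
{
	if (atomic_dec_and_test(&j->pin_count[seq % 16]))
		while (j->front < j->back &&
		       !atomic_read(&j->pin_count[j->front % 16]))
			j->front++;	/* advance last_seq past clean entries */
}
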
-
-#include <linux/hash.h>
-
-#include "journal_types.h"
-
-struct bch_fs;
-
-static inline void journal_wake(struct journal *j)
-{
- wake_up(&j->wait);
- closure_wake_up(&j->async_wait);
-}
-
-/* Sequence number of oldest dirty journal entry */
-
-static inline u64 journal_last_seq(struct journal *j)
-{
- return j->pin.front;
-}
-
-static inline u64 journal_cur_seq(struct journal *j)
-{
- return atomic64_read(&j->seq);
-}
-
-static inline u64 journal_last_unwritten_seq(struct journal *j)
-{
- return j->seq_ondisk + 1;
-}
-
-static inline struct journal_buf *journal_cur_buf(struct journal *j)
-{
- unsigned idx = (journal_cur_seq(j) &
- JOURNAL_BUF_MASK &
- ~JOURNAL_STATE_BUF_MASK) + j->reservations.idx;
-
- return j->buf + idx;
-}
-
-static inline int journal_state_count(union journal_res_state s, int idx)
-{
- switch (idx) {
- case 0: return s.buf0_count;
- case 1: return s.buf1_count;
- case 2: return s.buf2_count;
- case 3: return s.buf3_count;
- }
- BUG();
-}
-
-static inline int journal_state_seq_count(struct journal *j,
- union journal_res_state s, u64 seq)
-{
- if (journal_cur_seq(j) - seq < JOURNAL_STATE_BUF_NR)
- return journal_state_count(s, seq & JOURNAL_STATE_BUF_MASK);
- else
- return 0;
-}
-
-static inline void journal_state_inc(union journal_res_state *s)
-{
- s->buf0_count += s->idx == 0;
- s->buf1_count += s->idx == 1;
- s->buf2_count += s->idx == 2;
- s->buf3_count += s->idx == 3;
-}
-
-/*
- * Amount of space that will be taken up by some keys in the journal (i.e.
- * including the jset_entry header)
- */
-static inline unsigned jset_u64s(unsigned u64s)
-{
- return u64s + sizeof(struct jset_entry) / sizeof(u64);
-}
-
-static inline int journal_entry_overhead(struct journal *j)
-{
- return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved;
-}
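
A worked example of the accounting above (toy_res_u64s is a hypothetical helper, not part of the API): each key pays one jset_entry header, while the jset header plus any reserved space is paid once per journal entry:

static unsigned toy_res_u64s(struct journal *j,
			     unsigned nr_keys, unsigned key_u64s)
{
	/* e.g. 3 keys of 3 u64s each: 3 * jset_u64s(3) == 12 u64s, plus overhead */
	return nr_keys * jset_u64s(key_u64s) + journal_entry_overhead(j);
}
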
-
-static inline struct jset_entry *
-bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
-{
- struct jset *jset = buf->data;
- struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s));
-
- memset(entry, 0, sizeof(*entry));
- entry->u64s = cpu_to_le16(u64s);
-
- le32_add_cpu(&jset->u64s, jset_u64s(u64s));
-
- return entry;
-}
-
-static inline struct jset_entry *
-journal_res_entry(struct journal *j, struct journal_res *res)
-{
- return vstruct_idx(j->buf[res->seq & JOURNAL_BUF_MASK].data, res->offset);
-}
-
-static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type,
- enum btree_id id, unsigned level,
- unsigned u64s)
-{
- entry->u64s = cpu_to_le16(u64s);
- entry->btree_id = id;
- entry->level = level;
- entry->type = type;
- entry->pad[0] = 0;
- entry->pad[1] = 0;
- entry->pad[2] = 0;
- return jset_u64s(u64s);
-}
-
-static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
- enum btree_id id, unsigned level,
- const void *data, unsigned u64s)
-{
- unsigned ret = journal_entry_init(entry, type, id, level, u64s);
-
- memcpy_u64s_small(entry->_data, data, u64s);
- return ret;
-}
-
-static inline struct jset_entry *
-bch2_journal_add_entry(struct journal *j, struct journal_res *res,
- unsigned type, enum btree_id id,
- unsigned level, unsigned u64s)
-{
- struct jset_entry *entry = journal_res_entry(j, res);
- unsigned actual = journal_entry_init(entry, type, id, level, u64s);
-
- EBUG_ON(!res->ref);
- EBUG_ON(actual > res->u64s);
-
- res->offset += actual;
- res->u64s -= actual;
- return entry;
-}
-
-static inline bool journal_entry_empty(struct jset *j)
-{
- if (j->seq != j->last_seq)
- return false;
-
- vstruct_for_each(j, i)
- if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s)
- return false;
- return true;
-}
-
-/*
- * Drop a reference on a buffer index and return the resulting reservation
- * state; the caller checks journal_state_count() to see if it hit zero.
- */
-static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx)
-{
- union journal_res_state s;
-
- s.v = atomic64_sub_return(((union journal_res_state) {
- .buf0_count = idx == 0,
- .buf1_count = idx == 1,
- .buf2_count = idx == 2,
- .buf3_count = idx == 3,
- }).v, &j->reservations.counter);
- return s;
-}
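
A sketch of the bit-field trick above (toy_state is illustrative, not the real journal_res_state layout): because each counter lives in its own field of a single u64, one atomic subtraction decrements exactly one counter with no cmpxchg loop, provided the counter is known nonzero so nothing borrows into a neighbouring field:

union toy_state {
	struct { u64 a:10, b:10, c:10, d:10; };
	u64	v;
};

static u64 toy_put(atomic64_t *counter, unsigned idx)
{
	union toy_state delta = { .v = 0 };

	switch (idx) {
	case 0: delta.a = 1; break;
	case 1: delta.b = 1; break;
	case 2: delta.c = 1; break;
	case 3: delta.d = 1; break;
	}
	/* one 64-bit sub decrements exactly one packed counter */
	return atomic64_sub_return(delta.v, counter);
}
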
-
-bool bch2_journal_entry_close(struct journal *);
-void bch2_journal_do_writes(struct journal *);
-void bch2_journal_buf_put_final(struct journal *, u64);
-
-static inline void __bch2_journal_buf_put(struct journal *j, u64 seq)
-{
- unsigned idx = seq & JOURNAL_STATE_BUF_MASK;
- union journal_res_state s;
-
- s = journal_state_buf_put(j, idx);
- if (!journal_state_count(s, idx))
- bch2_journal_buf_put_final(j, seq);
-}
-
-static inline void bch2_journal_buf_put(struct journal *j, u64 seq)
-{
- unsigned idx = seq & JOURNAL_STATE_BUF_MASK;
- union journal_res_state s;
-
- s = journal_state_buf_put(j, idx);
- if (!journal_state_count(s, idx)) {
- spin_lock(&j->lock);
- bch2_journal_buf_put_final(j, seq);
- spin_unlock(&j->lock);
- } else if (unlikely(s.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL))
- wake_up(&j->wait);
-}
-
-/*
- * This function releases a journal reservation: any unused space is padded
- * out with empty entries, and the buffer ref is dropped so the journal
- * entry can eventually be closed and written.
- */
-static inline void bch2_journal_res_put(struct journal *j,
- struct journal_res *res)
-{
- if (!res->ref)
- return;
-
- lock_release(&j->res_map, _THIS_IP_);
-
- while (res->u64s)
- bch2_journal_add_entry(j, res,
- BCH_JSET_ENTRY_btree_keys,
- 0, 0, 0);
-
- bch2_journal_buf_put(j, res->seq);
-
- res->ref = 0;
-}
-
-int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
- unsigned, struct btree_trans *);
-
-/* First bits for BCH_WATERMARK: */
-enum journal_res_flags {
- __JOURNAL_RES_GET_NONBLOCK = BCH_WATERMARK_BITS,
- __JOURNAL_RES_GET_CHECK,
-};
-
-#define JOURNAL_RES_GET_NONBLOCK (1 << __JOURNAL_RES_GET_NONBLOCK)
-#define JOURNAL_RES_GET_CHECK (1 << __JOURNAL_RES_GET_CHECK)
-
-static inline int journal_res_get_fast(struct journal *j,
- struct journal_res *res,
- unsigned flags)
-{
- union journal_res_state old, new;
-
- old.v = atomic64_read(&j->reservations.counter);
- do {
- new.v = old.v;
-
- /*
- * Check if there is still room in the current journal
- * entry; smp_rmb() guarantees that the read of reservations.counter
- * occurs before we access cur_entry_u64s:
- */
- smp_rmb();
- if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
- return 0;
-
- EBUG_ON(!journal_state_count(new, new.idx));
-
- if ((flags & BCH_WATERMARK_MASK) < j->watermark)
- return 0;
-
- new.cur_entry_offset += res->u64s;
- journal_state_inc(&new);
-
- /*
- * If the refcount would overflow, we have to wait:
- * XXX - tracepoint this:
- */
- if (!journal_state_count(new, new.idx))
- return 0;
-
- if (flags & JOURNAL_RES_GET_CHECK)
- return 1;
- } while (!atomic64_try_cmpxchg(&j->reservations.counter,
- &old.v, new.v));
-
- res->ref = true;
- res->offset = old.cur_entry_offset;
- res->seq = journal_cur_seq(j);
- res->seq -= (res->seq - old.idx) & JOURNAL_STATE_BUF_MASK;
- return 1;
-}
-
-static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
- unsigned u64s, unsigned flags,
- struct btree_trans *trans)
-{
- int ret;
-
- EBUG_ON(res->ref);
- EBUG_ON(!test_bit(JOURNAL_running, &j->flags));
-
- res->u64s = u64s;
-
- if (journal_res_get_fast(j, res, flags))
- goto out;
-
- ret = bch2_journal_res_get_slowpath(j, res, flags, trans);
- if (ret)
- return ret;
-out:
- if (!(flags & JOURNAL_RES_GET_CHECK)) {
- lock_acquire_shared(&j->res_map, 0,
- (flags & JOURNAL_RES_GET_NONBLOCK) != 0,
- NULL, _THIS_IP_);
- EBUG_ON(!res->ref);
- BUG_ON(!res->seq);
- }
- return 0;
-}
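
A hypothetical end-to-end sketch of the reservation lifecycle (toy_journal_one_key is not a real helper; watermark selection and error paths are simplified): reserve exactly enough space for one key, emit a btree-keys entry, then release the reservation:

static int toy_journal_one_key(struct journal *j, struct btree_trans *trans,
			       enum btree_id id, struct bkey_i *k)
{
	struct journal_res res = {};
	int ret = bch2_journal_res_get(j, &res, jset_u64s(k->k.u64s),
				       0, trans);
	if (ret)
		return ret;

	struct jset_entry *entry =
		bch2_journal_add_entry(j, &res, BCH_JSET_ENTRY_btree_keys,
				       id, 0, k->k.u64s);
	memcpy_u64s_small(entry->_data, k, k->k.u64s);

	bch2_journal_res_put(j, &res);	/* entry can now be closed/written */
	return 0;
}
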
-
-/* journal_entry_res: */
-
-void bch2_journal_entry_res_resize(struct journal *,
- struct journal_entry_res *,
- unsigned);
-
-int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
-void bch2_journal_flush_async(struct journal *, struct closure *);
-
-int bch2_journal_flush_seq(struct journal *, u64, unsigned);
-int bch2_journal_flush(struct journal *);
-bool bch2_journal_noflush_seq(struct journal *, u64, u64);
-int bch2_journal_meta(struct journal *);
-
-void bch2_journal_halt(struct journal *);
-void bch2_journal_halt_locked(struct journal *);
-
-static inline int bch2_journal_error(struct journal *j)
-{
- return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL
- ? -BCH_ERR_journal_shutdown : 0;
-}
-
-struct bch_dev;
-
-static inline void bch2_journal_set_replay_done(struct journal *j)
-{
- BUG_ON(!test_bit(JOURNAL_running, &j->flags));
- set_bit(JOURNAL_replay_done, &j->flags);
-}
-
-void bch2_journal_unblock(struct journal *);
-void bch2_journal_block(struct journal *);
-struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *);
-
-void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
-void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
-
-int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
- unsigned nr);
-int bch2_dev_journal_alloc(struct bch_dev *, bool);
-int bch2_fs_journal_alloc(struct bch_fs *);
-
-void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
-
-void bch2_fs_journal_stop(struct journal *);
-int bch2_fs_journal_start(struct journal *, u64);
-
-void bch2_dev_journal_exit(struct bch_dev *);
-int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
-void bch2_fs_journal_exit(struct journal *);
-int bch2_fs_journal_init(struct journal *);
-
-#endif /* _BCACHEFS_JOURNAL_H */
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
deleted file mode 100644
index 4ed6137f0439..000000000000
--- a/fs/bcachefs/journal_io.c
+++ /dev/null
@@ -1,2137 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "btree_io.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "disk_groups.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-#include "journal_seq_blacklist.h"
-#include "replicas.h"
-#include "sb-clean.h"
-#include "trace.h"
-
-#include <linux/ioprio.h>
-#include <linux/string_choices.h>
-
-void bch2_journal_pos_from_member_info_set(struct bch_fs *c)
-{
- lockdep_assert_held(&c->sb_lock);
-
- for_each_member_device(c, ca) {
- struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
-
- m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx);
- m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free);
- }
-}
-
-void bch2_journal_pos_from_member_info_resume(struct bch_fs *c)
-{
- mutex_lock(&c->sb_lock);
- for_each_member_device(c, ca) {
- struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);
-
- unsigned idx = le32_to_cpu(m.last_journal_bucket);
- if (idx < ca->journal.nr)
- ca->journal.cur_idx = idx;
- unsigned offset = le32_to_cpu(m.last_journal_bucket_offset);
- if (offset <= ca->mi.bucket_size)
- ca->journal.sectors_free = ca->mi.bucket_size - offset;
- }
- mutex_unlock(&c->sb_lock);
-}
-
-void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
- struct journal_replay *j)
-{
- darray_for_each(j->ptrs, i) {
- if (i != j->ptrs.data)
- prt_printf(out, " ");
- prt_printf(out, "%u:%u:%u (sector %llu)",
- i->dev, i->bucket, i->bucket_offset, i->sector);
- }
-}
-
-static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
- struct journal_replay *j)
-{
- prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
-
- bch2_journal_ptrs_to_text(out, c, j);
-
- for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
- struct jset_entry_datetime *datetime =
- container_of(entry, struct jset_entry_datetime, entry);
- bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
- break;
- }
-}
-
-static struct nonce journal_nonce(const struct jset *jset)
-{
- return (struct nonce) {{
- [0] = 0,
- [1] = ((__le32 *) &jset->seq)[0],
- [2] = ((__le32 *) &jset->seq)[1],
- [3] = BCH_NONCE_JOURNAL,
- }};
-}
-
-static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum)
-{
- if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) {
- *csum = (struct bch_csum) {};
- return false;
- }
-
- *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
- return !bch2_crc_cmp(j->csum, *csum);
-}
-
-static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
-{
- return (seq - c->journal_entries_base_seq) & (~0U >> 1);
-}
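
An illustration of the windowing used above (hypothetical values): any sequence number within roughly +/- 2 billion of the base maps to a distinct 31-bit radix index:

static u32 toy_radix_idx(u64 base_seq, u64 seq)
{
	/* e.g. base_seq 1000000, seq 1000042 -> index 42 */
	return (seq - base_seq) & (~0U >> 1);
}
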
-
-static void __journal_replay_free(struct bch_fs *c,
- struct journal_replay *i)
-{
- struct journal_replay **p =
- genradix_ptr(&c->journal_entries,
- journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));
-
- BUG_ON(*p != i);
- *p = NULL;
- kvfree(i);
-}
-
-static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted)
-{
- if (blacklisted)
- i->ignore_blacklisted = true;
- else
- i->ignore_not_dirty = true;
-
- if (!c->opts.read_entire_journal)
- __journal_replay_free(c, i);
-}
-
-struct journal_list {
- struct closure cl;
- u64 last_seq;
- struct mutex lock;
- int ret;
-};
-
-#define JOURNAL_ENTRY_ADD_OK 0
-#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5
-
-/*
- * Given a journal entry we just read, add it to the list of journal entries to
- * be replayed:
- */
-static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
- struct journal_ptr entry_ptr,
- struct journal_list *jlist, struct jset *j)
-{
- struct genradix_iter iter;
- struct journal_replay **_i, *i, *dup;
- size_t bytes = vstruct_bytes(j);
- u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
- struct printbuf buf = PRINTBUF;
- int ret = JOURNAL_ENTRY_ADD_OK;
-
- if (!c->journal.oldest_seq_found_ondisk ||
- le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk)
- c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq);
-
- /* Is this entry older than the range we need? */
- if (!c->opts.read_entire_journal &&
- le64_to_cpu(j->seq) < jlist->last_seq)
- return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
-
- /*
- * genradixes are indexed by a ulong, not a u64, so we can't index them
- * by sequence number directly: assume instead that they will all fall
- * within the range of +/- 2 billion of the first one we find.
- */
- if (!c->journal_entries_base_seq)
- c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);
-
- /* Drop entries we don't need anymore */
- if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
- genradix_for_each_from(&c->journal_entries, iter, _i,
- journal_entry_radix_idx(c, jlist->last_seq)) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- if (le64_to_cpu(i->j.seq) >= last_seq)
- break;
-
- journal_replay_free(c, i, false);
- }
- }
-
- jlist->last_seq = max(jlist->last_seq, last_seq);
-
- _i = genradix_ptr_alloc(&c->journal_entries,
- journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
- GFP_KERNEL);
- if (!_i)
- return -BCH_ERR_ENOMEM_journal_entry_add;
-
- /*
- * Duplicate journal entries? If so we want the one that didn't have a
- * checksum error:
- */
- dup = *_i;
- if (dup) {
- bool identical = bytes == vstruct_bytes(&dup->j) &&
- !memcmp(j, &dup->j, bytes);
- bool not_identical = !identical &&
- entry_ptr.csum_good &&
- dup->csum_good;
-
- bool same_device = false;
- darray_for_each(dup->ptrs, ptr)
- if (ptr->dev == ca->dev_idx)
- same_device = true;
-
- ret = darray_push(&dup->ptrs, entry_ptr);
- if (ret)
- goto out;
-
- bch2_journal_replay_to_text(&buf, c, dup);
-
- fsck_err_on(same_device,
- c, journal_entry_dup_same_device,
- "duplicate journal entry on same device\n %s",
- buf.buf);
-
- fsck_err_on(not_identical,
- c, journal_entry_replicas_data_mismatch,
- "found duplicate but non identical journal entries\n %s",
- buf.buf);
-
- if (entry_ptr.csum_good && !identical)
- goto replace;
-
- goto out;
- }
-replace:
- i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
- if (!i)
- return -BCH_ERR_ENOMEM_journal_entry_add;
-
- darray_init(&i->ptrs);
- i->csum_good = entry_ptr.csum_good;
- i->ignore_blacklisted = false;
- i->ignore_not_dirty = false;
- unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
-
- if (dup) {
- /* The first ptr should represent the jset we kept: */
- darray_for_each(dup->ptrs, ptr)
- darray_push(&i->ptrs, *ptr);
- __journal_replay_free(c, dup);
- } else {
- darray_push(&i->ptrs, entry_ptr);
- }
-
- *_i = i;
-out:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
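
A descriptive summary of the dedup outcomes implemented above:

/*
 * journal_entry_add() dedup outcomes:
 *
 *	no duplicate			allocate and insert a new entry
 *	duplicate, byte-identical	keep the stored jset, add our ptr
 *	duplicate, differing, our
 *	  copy csum-good		our jset replaces the stored one
 *					(all ptrs are carried over)
 *	duplicate, differing, both
 *	  copies csum-good		fsck error: non-identical replicas
 */
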
-
-/* this fills in a range with empty jset_entries: */
-static void journal_entry_null_range(void *start, void *end)
-{
- struct jset_entry *entry;
-
- for (entry = start; entry != end; entry = vstruct_next(entry))
- memset(entry, 0, sizeof(*entry));
-}
-
-#define JOURNAL_ENTRY_REREAD 5
-#define JOURNAL_ENTRY_NONE 6
-#define JOURNAL_ENTRY_BAD 7
-
-static void journal_entry_err_msg(struct printbuf *out,
- u32 version,
- struct jset *jset,
- struct jset_entry *entry)
-{
- prt_str(out, "invalid journal entry, version=");
- bch2_version_to_text(out, version);
-
- if (entry) {
- prt_str(out, " type=");
- bch2_prt_jset_entry_type(out, entry->type);
- }
-
- if (!jset) {
- prt_printf(out, " in superblock");
- } else {
-
- prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq));
-
- if (entry)
- prt_printf(out, " offset=%zi/%u",
- (u64 *) entry - jset->_data,
- le32_to_cpu(jset->u64s));
- }
-
- prt_str(out, ": ");
-}
-
-#define journal_entry_err(c, version, jset, entry, _err, msg, ...) \
-({ \
- struct printbuf _buf = PRINTBUF; \
- \
- journal_entry_err_msg(&_buf, version, jset, entry); \
- prt_printf(&_buf, msg, ##__VA_ARGS__); \
- \
- switch (from.flags & BCH_VALIDATE_write) { \
- case READ: \
- mustfix_fsck_err(c, _err, "%s", _buf.buf); \
- break; \
- case WRITE: \
- bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \
- bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\
- if (bch2_fs_inconsistent(c)) { \
- ret = -BCH_ERR_fsck_errors_not_fixed; \
- goto fsck_err; \
- } \
- break; \
- } \
- \
- printbuf_exit(&_buf); \
- true; \
-})
-
-#define journal_entry_err_on(cond, ...) \
- ((cond) ? journal_entry_err(__VA_ARGS__) : false)
-
-#define FSCK_DELETED_KEY 5
-
-static int journal_validate_key(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- struct bkey_i *k,
- struct bkey_validate_context from,
- unsigned version, int big_endian)
-{
- enum bch_validate_flags flags = from.flags;
- int write = flags & BCH_VALIDATE_write;
- void *next = vstruct_next(entry);
- int ret = 0;
-
- if (journal_entry_err_on(!k->k.u64s,
- c, version, jset, entry,
- journal_entry_bkey_u64s_0,
- "k->u64s 0")) {
- entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
- journal_entry_null_range(vstruct_next(entry), next);
- return FSCK_DELETED_KEY;
- }
-
- if (journal_entry_err_on((void *) bkey_next(k) >
- (void *) vstruct_next(entry),
- c, version, jset, entry,
- journal_entry_bkey_past_end,
- "extends past end of journal entry")) {
- entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
- journal_entry_null_range(vstruct_next(entry), next);
- return FSCK_DELETED_KEY;
- }
-
- if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
- c, version, jset, entry,
- journal_entry_bkey_bad_format,
- "bad format %u", k->k.format)) {
- le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
- memmove(k, bkey_next(k), next - (void *) bkey_next(k));
- journal_entry_null_range(vstruct_next(entry), next);
- return FSCK_DELETED_KEY;
- }
-
- if (!write)
- bch2_bkey_compat(from.level, from.btree, version, big_endian,
- write, NULL, bkey_to_packed(k));
-
- ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from);
- if (ret == -BCH_ERR_fsck_delete_bkey) {
- le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
- memmove(k, bkey_next(k), next - (void *) bkey_next(k));
- journal_entry_null_range(vstruct_next(entry), next);
- return FSCK_DELETED_KEY;
- }
- if (ret)
- goto fsck_err;
-
- if (write)
- bch2_bkey_compat(from.level, from.btree, version, big_endian,
- write, NULL, bkey_to_packed(k));
-fsck_err:
- return ret;
-}
-
-static int journal_entry_btree_keys_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- struct bkey_i *k = entry->start;
-
- from.level = entry->level;
- from.btree = entry->btree_id;
-
- while (k != vstruct_last(entry)) {
- int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
- if (ret == FSCK_DELETED_KEY)
- continue;
- else if (ret)
- return ret;
-
- k = bkey_next(k);
- }
-
- return 0;
-}
-
-static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- bool first = true;
-
- jset_entry_for_each_key(entry, k) {
- if (!first) {
- prt_newline(out);
- bch2_prt_jset_entry_type(out, entry->type);
- prt_str(out, ": ");
- }
- bch2_btree_id_level_to_text(out, entry->btree_id, entry->level);
- prt_char(out, ' ');
- bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
- first = false;
- }
-}
-
-static int journal_entry_btree_root_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- struct bkey_i *k = entry->start;
- int ret = 0;
-
- from.root = true;
- from.level = entry->level + 1;
- from.btree = entry->btree_id;
-
- if (journal_entry_err_on(!entry->u64s ||
- le16_to_cpu(entry->u64s) != k->k.u64s,
- c, version, jset, entry,
- journal_entry_btree_root_bad_size,
- "invalid btree root journal entry: wrong number of keys")) {
- void *next = vstruct_next(entry);
- /*
- * we don't want to null out this jset_entry,
- * just the contents, so that later we can tell
- * we were _supposed_ to have a btree root
- */
- entry->u64s = 0;
- journal_entry_null_range(vstruct_next(entry), next);
- return 0;
- }
-
- ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
- if (ret == FSCK_DELETED_KEY)
- ret = 0;
-fsck_err:
- return ret;
-}
-
-static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- journal_entry_btree_keys_to_text(out, c, entry);
-}
-
-static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- /* obsolete, don't care: */
- return 0;
-}
-
-static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
-}
-
-static int journal_entry_blacklist_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
- c, version, jset, entry,
- journal_entry_blacklist_bad_size,
- "invalid journal seq blacklist entry: bad size")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- }
-fsck_err:
- return ret;
-}
-
-static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_blacklist *bl =
- container_of(entry, struct jset_entry_blacklist, entry);
-
- prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
-}
-
-static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- struct jset_entry_blacklist_v2 *bl_entry;
- int ret = 0;
-
- if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
- c, version, jset, entry,
- journal_entry_blacklist_v2_bad_size,
- "invalid journal seq blacklist entry: bad size")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- goto out;
- }
-
- bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
-
- if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
- le64_to_cpu(bl_entry->end),
- c, version, jset, entry,
- journal_entry_blacklist_v2_start_past_end,
- "invalid journal seq blacklist entry: start > end")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- }
-out:
-fsck_err:
- return ret;
-}
-
-static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_blacklist_v2 *bl =
- container_of(entry, struct jset_entry_blacklist_v2, entry);
-
- prt_printf(out, "start=%llu end=%llu",
- le64_to_cpu(bl->start),
- le64_to_cpu(bl->end));
-}
-
-static int journal_entry_usage_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
- unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
- int ret = 0;
-
- if (journal_entry_err_on(bytes < sizeof(*u),
- c, version, jset, entry,
- journal_entry_usage_bad_size,
- "invalid journal entry usage: bad size")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- return ret;
- }
-
-fsck_err:
- return ret;
-}
-
-static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
-
- prt_str(out, "type=");
- bch2_prt_fs_usage_type(out, u->entry.btree_id);
- prt_printf(out, " v=%llu", le64_to_cpu(u->v));
-}
-
-static int journal_entry_data_usage_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- struct jset_entry_data_usage *u =
- container_of(entry, struct jset_entry_data_usage, entry);
- unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
- struct printbuf err = PRINTBUF;
- int ret = 0;
-
- if (journal_entry_err_on(bytes < sizeof(*u) ||
- bytes < sizeof(*u) + u->r.nr_devs,
- c, version, jset, entry,
- journal_entry_data_usage_bad_size,
- "invalid journal entry usage: bad size")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- goto out;
- }
-
- if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err),
- c, version, jset, entry,
- journal_entry_data_usage_bad_size,
- "invalid journal entry usage: %s", err.buf)) {
- journal_entry_null_range(entry, vstruct_next(entry));
- goto out;
- }
-out:
-fsck_err:
- printbuf_exit(&err);
- return ret;
-}
-
-static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_data_usage *u =
- container_of(entry, struct jset_entry_data_usage, entry);
-
- bch2_replicas_entry_to_text(out, &u->r);
- prt_printf(out, "=%llu", le64_to_cpu(u->v));
-}
-
-static int journal_entry_clock_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- struct jset_entry_clock *clock =
- container_of(entry, struct jset_entry_clock, entry);
- unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
- int ret = 0;
-
- if (journal_entry_err_on(bytes != sizeof(*clock),
- c, version, jset, entry,
- journal_entry_clock_bad_size,
- "bad size")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- return ret;
- }
-
- if (journal_entry_err_on(clock->rw > 1,
- c, version, jset, entry,
- journal_entry_clock_bad_rw,
- "bad rw")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- return ret;
- }
-
-fsck_err:
- return ret;
-}
-
-static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_clock *clock =
- container_of(entry, struct jset_entry_clock, entry);
-
- prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time));
-}
-
-static int journal_entry_dev_usage_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- struct jset_entry_dev_usage *u =
- container_of(entry, struct jset_entry_dev_usage, entry);
- unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
- unsigned expected = sizeof(*u);
- int ret = 0;
-
- if (journal_entry_err_on(bytes < expected,
- c, version, jset, entry,
- journal_entry_dev_usage_bad_size,
- "bad size (%u < %u)",
- bytes, expected)) {
- journal_entry_null_range(entry, vstruct_next(entry));
- return ret;
- }
-
- if (journal_entry_err_on(u->pad,
- c, version, jset, entry,
- journal_entry_dev_usage_bad_pad,
- "bad pad")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- return ret;
- }
-
-fsck_err:
- return ret;
-}
-
-static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_dev_usage *u =
- container_of(entry, struct jset_entry_dev_usage, entry);
- unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
-
- if (vstruct_bytes(entry) < sizeof(*u))
- return;
-
- prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
-
- printbuf_indent_add(out, 2);
- for (i = 0; i < nr_types; i++) {
- prt_newline(out);
- bch2_prt_data_type(out, i);
- prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
- le64_to_cpu(u->d[i].buckets),
- le64_to_cpu(u->d[i].sectors),
- le64_to_cpu(u->d[i].fragmented));
- }
- printbuf_indent_sub(out, 2);
-}
-
-static int journal_entry_log_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- return 0;
-}
-
-static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
-
- prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d);
-}
-
-static int journal_entry_overwrite_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- from.flags = 0;
- return journal_entry_btree_keys_validate(c, jset, entry,
- version, big_endian, from);
-}
-
-static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- journal_entry_btree_keys_to_text(out, c, entry);
-}
-
-static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- return journal_entry_btree_keys_validate(c, jset, entry,
- version, big_endian, from);
-}
-
-static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- journal_entry_btree_keys_to_text(out, c, entry);
-}
-
-static int journal_entry_datetime_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- unsigned bytes = vstruct_bytes(entry);
- unsigned expected = 16;
- int ret = 0;
-
- if (journal_entry_err_on(bytes < expected,
- c, version, jset, entry,
- journal_entry_dev_usage_bad_size,
- "bad size (%u < %u)",
- bytes, expected)) {
- journal_entry_null_range(entry, vstruct_next(entry));
- return ret;
- }
-fsck_err:
- return ret;
-}
-
-static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_datetime *datetime =
- container_of(entry, struct jset_entry_datetime, entry);
-
- bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
-}
-
-struct jset_entry_ops {
- int (*validate)(struct bch_fs *, struct jset *,
- struct jset_entry *, unsigned, int,
- struct bkey_validate_context);
- void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
-};
-
-static const struct jset_entry_ops bch2_jset_entry_ops[] = {
-#define x(f, nr) \
- [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \
- .validate = journal_entry_##f##_validate, \
- .to_text = journal_entry_##f##_to_text, \
- },
- BCH_JSET_ENTRY_TYPES()
-#undef x
-};
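
For reference, an illustrative expansion of the x-macro table above for a single entry type, matching the validate/to_text pairs defined earlier:

/*
 *	[BCH_JSET_ENTRY_blacklist] = (struct jset_entry_ops) {
 *		.validate	= journal_entry_blacklist_validate,
 *		.to_text	= journal_entry_blacklist_to_text,
 *	},
 *
 * so bch2_journal_entry_validate() below is just a bounds-checked table
 * dispatch on entry->type.
 */
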
-
-int bch2_journal_entry_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- return entry->type < BCH_JSET_ENTRY_NR
- ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
- version, big_endian, from)
- : 0;
-}
-
-void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- bch2_prt_jset_entry_type(out, entry->type);
-
- if (entry->type < BCH_JSET_ENTRY_NR) {
- prt_str(out, ": ");
- bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
- }
-}
-
-static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
- enum bch_validate_flags flags)
-{
- struct bkey_validate_context from = {
- .flags = flags,
- .from = BKEY_VALIDATE_journal,
- .journal_seq = le64_to_cpu(jset->seq),
- };
-
- unsigned version = le32_to_cpu(jset->version);
- int ret = 0;
-
- vstruct_for_each(jset, entry) {
- from.journal_offset = (u64 *) entry - jset->_data;
-
- if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
- c, version, jset, entry,
- journal_entry_past_jset_end,
- "journal entry extends past end of jset")) {
- jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
- break;
- }
-
- ret = bch2_journal_entry_validate(c, jset, entry, version,
- JSET_BIG_ENDIAN(jset), from);
- if (ret)
- break;
- }
-fsck_err:
- return ret;
-}
-
-static int jset_validate(struct bch_fs *c,
- struct bch_dev *ca,
- struct jset *jset, u64 sector,
- enum bch_validate_flags flags)
-{
- struct bkey_validate_context from = {
- .flags = flags,
- .from = BKEY_VALIDATE_journal,
- .journal_seq = le64_to_cpu(jset->seq),
- };
- int ret = 0;
-
- if (le64_to_cpu(jset->magic) != jset_magic(c))
- return JOURNAL_ENTRY_NONE;
-
- unsigned version = le32_to_cpu(jset->version);
- if (journal_entry_err_on(!bch2_version_compatible(version),
- c, version, jset, NULL,
- jset_unsupported_version,
- "%s sector %llu seq %llu: incompatible journal entry version %u.%u",
- ca ? ca->name : c->name,
- sector, le64_to_cpu(jset->seq),
- BCH_VERSION_MAJOR(version),
- BCH_VERSION_MINOR(version))) {
- /* don't try to continue: */
- return -EINVAL;
- }
-
- if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
- c, version, jset, NULL,
- jset_unknown_csum,
- "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
- ca ? ca->name : c->name,
- sector, le64_to_cpu(jset->seq),
- JSET_CSUM_TYPE(jset)))
- ret = JOURNAL_ENTRY_BAD;
-
- /* last_seq is ignored when JSET_NO_FLUSH is true */
- if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
- le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
- c, version, jset, NULL,
- jset_last_seq_newer_than_seq,
- "invalid journal entry: last_seq > seq (%llu > %llu)",
- le64_to_cpu(jset->last_seq),
- le64_to_cpu(jset->seq))) {
- jset->last_seq = jset->seq;
- return JOURNAL_ENTRY_BAD;
- }
-
- ret = jset_validate_entries(c, jset, flags);
-fsck_err:
- return ret;
-}
-
-static int jset_validate_early(struct bch_fs *c,
- struct bch_dev *ca,
- struct jset *jset, u64 sector,
- unsigned bucket_sectors_left,
- unsigned sectors_read)
-{
- struct bkey_validate_context from = {
- .from = BKEY_VALIDATE_journal,
- .journal_seq = le64_to_cpu(jset->seq),
- };
- int ret = 0;
-
- if (le64_to_cpu(jset->magic) != jset_magic(c))
- return JOURNAL_ENTRY_NONE;
-
- unsigned version = le32_to_cpu(jset->version);
- if (journal_entry_err_on(!bch2_version_compatible(version),
- c, version, jset, NULL,
- jset_unsupported_version,
- "%s sector %llu seq %llu: unknown journal entry version %u.%u",
- ca ? ca->name : c->name,
- sector, le64_to_cpu(jset->seq),
- BCH_VERSION_MAJOR(version),
- BCH_VERSION_MINOR(version))) {
- /* don't try to continue: */
- return -EINVAL;
- }
-
- size_t bytes = vstruct_bytes(jset);
- if (bytes > (sectors_read << 9) &&
- sectors_read < bucket_sectors_left)
- return JOURNAL_ENTRY_REREAD;
-
- if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
- c, version, jset, NULL,
- jset_past_bucket_end,
- "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
- ca ? ca->name : c->name,
- sector, le64_to_cpu(jset->seq), bytes))
- le32_add_cpu(&jset->u64s,
- -((bytes - (bucket_sectors_left << 9)) / 8));
-fsck_err:
- return ret;
-}
-
-struct journal_read_buf {
- void *data;
- size_t size;
-};
-
-static int journal_read_buf_realloc(struct journal_read_buf *b,
- size_t new_size)
-{
- void *n;
-
- /* the bios are sized for this many pages, max: */
- if (new_size > JOURNAL_ENTRY_SIZE_MAX)
- return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
-
- new_size = roundup_pow_of_two(new_size);
- n = kvmalloc(new_size, GFP_KERNEL);
- if (!n)
- return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
-
- kvfree(b->data);
- b->data = n;
- b->size = new_size;
- return 0;
-}
-
-static int journal_read_bucket(struct bch_dev *ca,
- struct journal_read_buf *buf,
- struct journal_list *jlist,
- unsigned bucket)
-{
- struct bch_fs *c = ca->fs;
- struct journal_device *ja = &ca->journal;
- struct jset *j = NULL;
- unsigned sectors, sectors_read = 0;
- u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
- end = offset + ca->mi.bucket_size;
- bool saw_bad = false, csum_good;
- struct printbuf err = PRINTBUF;
- int ret = 0;
-
- pr_debug("reading %u", bucket);
-
- while (offset < end) {
- if (!sectors_read) {
- struct bio *bio;
- unsigned nr_bvecs;
-reread:
- sectors_read = min_t(unsigned,
- end - offset, buf->size >> 9);
- nr_bvecs = buf_pages(buf->data, sectors_read << 9);
-
- bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
- if (!bio)
- return -BCH_ERR_ENOMEM_journal_read_bucket;
- bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);
-
- bio->bi_iter.bi_sector = offset;
- bch2_bio_map(bio, buf->data, sectors_read << 9);
-
- u64 submit_time = local_clock();
- ret = submit_bio_wait(bio);
- kfree(bio);
-
- if (!ret && bch2_meta_read_fault("journal"))
- ret = -BCH_ERR_EIO_fault_injected;
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
- submit_time, !ret);
-
- if (ret) {
- bch_err_dev_ratelimited(ca,
- "journal read error: sector %llu", offset);
- /*
- * We don't error out of the recovery process
- * here, since the relevant journal entry may be
- * found on a different device, and missing or
- * no journal entries will be handled later
- */
- goto out;
- }
-
- j = buf->data;
- }
-
- ret = jset_validate_early(c, ca, j, offset,
- end - offset, sectors_read);
- switch (ret) {
- case 0:
- sectors = vstruct_sectors(j, c->block_bits);
- break;
- case JOURNAL_ENTRY_REREAD:
- if (vstruct_bytes(j) > buf->size) {
- ret = journal_read_buf_realloc(buf,
- vstruct_bytes(j));
- if (ret)
- goto err;
- }
- goto reread;
- case JOURNAL_ENTRY_NONE:
- if (!saw_bad)
- goto out;
- /*
- * On checksum error we don't really trust the size
- * field of the journal entry we read, so try reading
- * again at next block boundary:
- */
- sectors = block_sectors(c);
- goto next_block;
- default:
- goto err;
- }
-
- if (le64_to_cpu(j->seq) > ja->highest_seq_found) {
- ja->highest_seq_found = le64_to_cpu(j->seq);
- ja->cur_idx = bucket;
- ja->sectors_free = ca->mi.bucket_size -
- bucket_remainder(ca, offset) - sectors;
- }
-
- /*
- * This happens sometimes if we don't have discards on -
- * when we've partially overwritten a bucket with new
- * journal entries. We don't need the rest of the
- * bucket:
- */
- if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
- goto out;
-
- ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
-
- enum bch_csum_type csum_type = JSET_CSUM_TYPE(j);
- struct bch_csum csum;
- csum_good = jset_csum_good(c, j, &csum);
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
-
- if (!csum_good) {
- bch_err_dev_ratelimited(ca, "%s",
- (printbuf_reset(&err),
- prt_str(&err, "journal "),
- bch2_csum_err_msg(&err, csum_type, j->csum, csum),
- err.buf));
- saw_bad = true;
- }
-
- ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
- j->encrypted_start,
- vstruct_end(j) - (void *) j->encrypted_start);
- bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret));
-
- mutex_lock(&jlist->lock);
- ret = journal_entry_add(c, ca, (struct journal_ptr) {
- .csum_good = csum_good,
- .dev = ca->dev_idx,
- .bucket = bucket,
- .bucket_offset = offset -
- bucket_to_sector(ca, ja->buckets[bucket]),
- .sector = offset,
- }, jlist, j);
- mutex_unlock(&jlist->lock);
-
- switch (ret) {
- case JOURNAL_ENTRY_ADD_OK:
- break;
- case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
- break;
- default:
- goto err;
- }
-next_block:
- pr_debug("next");
- offset += sectors;
- sectors_read -= sectors;
- j = ((void *) j) + (sectors << 9);
- }
-
-out:
- ret = 0;
-err:
- printbuf_exit(&err);
- return ret;
-}
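
A descriptive note on the per-bucket read loop above, which is effectively a small state machine:

/*
 * journal_read_bucket() outcomes per jset:
 *
 *	JOURNAL_ENTRY_REREAD	entry is bigger than the read buffer:
 *				grow the buffer, reread the same offset
 *	JOURNAL_ENTRY_NONE	no entry here; stop, unless an earlier
 *				checksum failure means size fields can't
 *				be trusted (then scan block by block)
 *	checksum failure	entry is still added (flagged as a bad
 *				copy) and saw_bad is set
 *	seq < bucket_seq	older than entries already seen in this
 *				bucket: the tail was never rewritten
 */
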
-
-static CLOSURE_CALLBACK(bch2_journal_read_device)
-{
- closure_type(ja, struct journal_device, read);
- struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
- struct bch_fs *c = ca->fs;
- struct journal_list *jlist =
- container_of(cl->parent, struct journal_list, cl);
- struct journal_read_buf buf = { NULL, 0 };
- unsigned i;
- int ret = 0;
-
- if (!ja->nr)
- goto out;
-
- ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
- if (ret)
- goto err;
-
- pr_debug("%u journal buckets", ja->nr);
-
- for (i = 0; i < ja->nr; i++) {
- ret = journal_read_bucket(ca, &buf, jlist, i);
- if (ret)
- goto err;
- }
-
- /*
- * Set dirty_idx to indicate the entire journal is full and needs to be
- * reclaimed - journal reclaim will immediately reclaim whatever isn't
- * pinned when it first runs:
- */
- ja->discard_idx = ja->dirty_idx_ondisk =
- ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
-out:
- bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
- kvfree(buf.data);
- percpu_ref_put(&ca->io_ref);
- closure_return(cl);
- return;
-err:
- mutex_lock(&jlist->lock);
- jlist->ret = ret;
- mutex_unlock(&jlist->lock);
- goto out;
-}
-
-int bch2_journal_read(struct bch_fs *c,
- u64 *last_seq,
- u64 *blacklist_seq,
- u64 *start_seq)
-{
- struct journal_list jlist;
- struct journal_replay *i, **_i, *prev = NULL;
- struct genradix_iter radix_iter;
- struct printbuf buf = PRINTBUF;
- bool degraded = false, last_write_torn = false;
- u64 seq;
- int ret = 0;
-
- closure_init_stack(&jlist.cl);
- mutex_init(&jlist.lock);
- jlist.last_seq = 0;
- jlist.ret = 0;
-
- for_each_member_device(c, ca) {
- if (!c->opts.fsck &&
- !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
- continue;
-
- if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
- ca->mi.state == BCH_MEMBER_STATE_ro) &&
- percpu_ref_tryget(&ca->io_ref))
- closure_call(&ca->journal.read,
- bch2_journal_read_device,
- system_unbound_wq,
- &jlist.cl);
- else
- degraded = true;
- }
-
- closure_sync(&jlist.cl);
-
- if (jlist.ret)
- return jlist.ret;
-
- *last_seq = 0;
- *start_seq = 0;
- *blacklist_seq = 0;
-
- /*
- * Find the most recent flush entry, and ignore newer non-flush entries -
- * those entries will be blacklisted:
- */
- genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- if (!*start_seq)
- *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
-
- if (JSET_NO_FLUSH(&i->j)) {
- i->ignore_blacklisted = true;
- continue;
- }
-
- if (!last_write_torn && !i->csum_good) {
- last_write_torn = true;
- i->ignore_blacklisted = true;
- continue;
- }
-
- struct bkey_validate_context from = {
- .from = BKEY_VALIDATE_journal,
- .journal_seq = le64_to_cpu(i->j.seq),
- };
- if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
- c, le32_to_cpu(i->j.version), &i->j, NULL,
- jset_last_seq_newer_than_seq,
- "invalid journal entry: last_seq > seq (%llu > %llu)",
- le64_to_cpu(i->j.last_seq),
- le64_to_cpu(i->j.seq)))
- i->j.last_seq = i->j.seq;
-
- *last_seq = le64_to_cpu(i->j.last_seq);
- *blacklist_seq = le64_to_cpu(i->j.seq) + 1;
- break;
- }
-
- if (!*start_seq) {
- bch_info(c, "journal read done, but no entries found");
- return 0;
- }
-
- if (!*last_seq) {
- fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes,
- "journal read done, but no entries found after dropping non-flushes");
- return 0;
- }
-
- bch_info(c, "journal read done, replaying entries %llu-%llu",
- *last_seq, *blacklist_seq - 1);
-
- if (*start_seq != *blacklist_seq)
- bch_info(c, "dropped unflushed entries %llu-%llu",
- *blacklist_seq, *start_seq - 1);
-
- /* Drop blacklisted entries and entries older than last_seq: */
- genradix_for_each(&c->journal_entries, radix_iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- seq = le64_to_cpu(i->j.seq);
- if (seq < *last_seq) {
- journal_replay_free(c, i, false);
- continue;
- }
-
- if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
- fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
- jset_seq_blacklisted,
- "found blacklisted journal entry %llu", seq);
- i->ignore_blacklisted = true;
- }
- }
-
- /* Check for missing entries: */
- seq = *last_seq;
- genradix_for_each(&c->journal_entries, radix_iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- BUG_ON(seq > le64_to_cpu(i->j.seq));
-
- while (seq < le64_to_cpu(i->j.seq)) {
- u64 missing_start, missing_end;
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
-
- while (seq < le64_to_cpu(i->j.seq) &&
- bch2_journal_seq_is_blacklisted(c, seq, false))
- seq++;
-
- if (seq == le64_to_cpu(i->j.seq))
- break;
-
- missing_start = seq;
-
- while (seq < le64_to_cpu(i->j.seq) &&
- !bch2_journal_seq_is_blacklisted(c, seq, false))
- seq++;
-
- if (prev) {
- bch2_journal_ptrs_to_text(&buf1, c, prev);
- prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
- } else
- prt_printf(&buf1, "(none)");
- bch2_journal_ptrs_to_text(&buf2, c, i);
-
- missing_end = seq - 1;
- fsck_err(c, journal_entries_missing,
- "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
- " prev at %s\n"
- " next at %s, continue?",
- missing_start, missing_end,
- *last_seq, *blacklist_seq - 1,
- buf1.buf, buf2.buf);
-
- printbuf_exit(&buf1);
- printbuf_exit(&buf2);
- }
-
- prev = i;
- seq++;
- }
-
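-	/*
-	 * Validate the entries we'll replay, and mark the superblock as
-	 * containing replicas for them:
-	 */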
- genradix_for_each(&c->journal_entries, radix_iter, _i) {
- struct bch_replicas_padded replicas = {
- .e.data_type = BCH_DATA_journal,
- .e.nr_devs = 0,
- .e.nr_required = 1,
- };
-
- i = *_i;
- if (journal_replay_ignore(i))
- continue;
-
- darray_for_each(i->ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
-
- if (!ptr->csum_good)
- bch_err_dev_offset(ca, ptr->sector,
- "invalid journal checksum, seq %llu%s",
- le64_to_cpu(i->j.seq),
- i->csum_good ? " (had good copy on another device)" : "");
- }
-
- ret = jset_validate(c,
- bch2_dev_have_ref(c, i->ptrs.data[0].dev),
- &i->j,
- i->ptrs.data[0].sector,
- READ);
- if (ret)
- goto err;
-
- darray_for_each(i->ptrs, ptr)
- replicas_entry_add_dev(&replicas.e, ptr->dev);
-
- bch2_replicas_entry_sort(&replicas.e);
-
- printbuf_reset(&buf);
- bch2_replicas_entry_to_text(&buf, &replicas.e);
-
- if (!degraded &&
- !bch2_replicas_marked(c, &replicas.e) &&
- (le64_to_cpu(i->j.seq) == *last_seq ||
- fsck_err(c, journal_entry_replicas_not_marked,
- "superblock not marked as containing replicas for journal entry %llu\n %s",
- le64_to_cpu(i->j.seq), buf.buf))) {
- ret = bch2_mark_replicas(c, &replicas.e);
- if (ret)
- goto err;
- }
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-/* journal write: */
-
-static void journal_advance_devs_to_next_bucket(struct journal *j,
- struct dev_alloc_list *devs,
- unsigned sectors, u64 seq)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
- darray_for_each(*devs, i) {
- struct bch_dev *ca = rcu_dereference(c->devs[*i]);
- if (!ca)
- continue;
-
- struct journal_device *ja = &ca->journal;
-
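-		/*
-		 * If this entry won't fit in the device's current bucket, but
-		 * would fit in a fresh one and a discarded bucket is
-		 * available, advance to it:
-		 */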
- if (sectors > ja->sectors_free &&
- sectors <= ca->mi.bucket_size &&
- bch2_journal_dev_buckets_available(j, ja,
- journal_space_discarded)) {
- ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
- ja->sectors_free = ca->mi.bucket_size;
-
- /*
- * ja->bucket_seq[ja->cur_idx] must always have
- * something sensible:
- */
- ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq);
- }
- }
-}
-
-static void __journal_write_alloc(struct journal *j,
- struct journal_buf *w,
- struct dev_alloc_list *devs,
- unsigned sectors,
- unsigned *replicas,
- unsigned replicas_want)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
- darray_for_each(*devs, i) {
- struct bch_dev *ca = rcu_dereference(c->devs[*i]);
- if (!ca)
- continue;
-
- struct journal_device *ja = &ca->journal;
-
- /*
- * Check that we can use this device, and aren't already using
- * it:
- */
- if (!ca->mi.durability ||
- ca->mi.state != BCH_MEMBER_STATE_rw ||
- !ja->nr ||
- bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
- sectors > ja->sectors_free)
- continue;
-
- bch2_dev_stripe_increment(ca, &j->wp.stripe);
-
- bch2_bkey_append_ptr(&w->key,
- (struct bch_extent_ptr) {
- .offset = bucket_to_sector(ca,
- ja->buckets[ja->cur_idx]) +
- ca->mi.bucket_size -
- ja->sectors_free,
- .dev = ca->dev_idx,
- });
-
- ja->sectors_free -= sectors;
- ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
-
- *replicas += ca->mi.durability;
-
- if (*replicas >= replicas_want)
- break;
- }
-}
-
-/**
- * journal_write_alloc - decide where to write next journal entry
- *
- * @j: journal object
- * @w: journal buf (entry to be written)
- *
- * Returns: 0 on success, or -BCH_ERR_insufficient_journal_devices on failure
- */
-static int journal_write_alloc(struct journal *j, struct journal_buf *w)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_devs_mask devs;
- struct dev_alloc_list devs_sorted;
- unsigned sectors = vstruct_sectors(w->data, c->block_bits);
- unsigned target = c->opts.metadata_target ?:
- c->opts.foreground_target;
- unsigned replicas = 0, replicas_want =
- READ_ONCE(c->opts.metadata_replicas);
- unsigned replicas_need = min_t(unsigned, replicas_want,
- READ_ONCE(c->opts.metadata_replicas_required));
- bool advance_done = false;
-
- rcu_read_lock();
-
- /* We might run more than once if we have to stop and do discards: */
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key));
- bkey_for_each_ptr(ptrs, p) {
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev);
- if (ca)
- replicas += ca->mi.durability;
- }
-
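-	/*
-	 * Allocation strategy: try devices in the configured metadata target
-	 * first; if that doesn't get us enough replicas, advance devices to
-	 * fresh buckets and retry, then fall back to all rw devices:
-	 */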
-retry_target:
- devs = target_rw_devs(c, BCH_DATA_journal, target);
- devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
-retry_alloc:
- __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want);
-
- if (likely(replicas >= replicas_want))
- goto done;
-
- if (!advance_done) {
- journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq);
- advance_done = true;
- goto retry_alloc;
- }
-
- if (replicas < replicas_want && target) {
- /* Retry from all devices: */
- target = 0;
- advance_done = false;
- goto retry_target;
- }
-done:
- rcu_read_unlock();
-
- BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
-
- return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
-}
-
-static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
- /* we aren't holding j->lock: */
- unsigned new_size = READ_ONCE(j->buf_size_want);
- void *new_buf;
-
- if (buf->buf_size >= new_size)
- return;
-
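-	/*
-	 * Keep the btree write buffer sized proportionally to the journal
-	 * buffer, since journal entries are flushed into it:
-	 */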
- size_t btree_write_buffer_size = new_size / 64;
-
- if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
- return;
-
- new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
- if (!new_buf)
- return;
-
- memcpy(new_buf, buf->data, buf->buf_size);
-
- spin_lock(&j->lock);
- swap(buf->data, new_buf);
- swap(buf->buf_size, new_size);
- spin_unlock(&j->lock);
-
- kvfree(new_buf);
-}
-
-static CLOSURE_CALLBACK(journal_write_done)
-{
- closure_type(w, struct journal_buf, io);
- struct journal *j = container_of(w, struct journal, buf[w->idx]);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_replicas_padded replicas;
- u64 seq = le64_to_cpu(w->data->seq);
- int err = 0;
-
- bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
- ? j->flush_write_time
- : j->noflush_write_time, j->write_start_time);
-
- if (!w->devs_written.nr) {
- bch_err(c, "unable to write journal to sufficient devices");
- err = -BCH_ERR_journal_write_err;
- } else {
- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
- w->devs_written);
- err = bch2_mark_replicas(c, &replicas.e);
- }
-
- if (err)
- bch2_fatal_error(c);
-
- closure_debug_destroy(cl);
-
- spin_lock(&j->lock);
- if (seq >= j->pin.front)
- journal_seq_pin(j, seq)->devs = w->devs_written;
- if (err && (!j->err_seq || seq < j->err_seq))
- j->err_seq = seq;
- w->write_done = true;
-
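-	/* Stash the buffer for reuse, if it's bigger than the current free_buf: */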
- if (!j->free_buf || j->free_buf_size < w->buf_size) {
- swap(j->free_buf, w->data);
- swap(j->free_buf_size, w->buf_size);
- }
-
- if (w->data) {
- void *buf = w->data;
- w->data = NULL;
- w->buf_size = 0;
-
- spin_unlock(&j->lock);
- kvfree(buf);
- spin_lock(&j->lock);
- }
-
- bool completed = false;
- bool do_discards = false;
-
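-	/*
-	 * Writes may complete out of order; only advance seq_ondisk over the
-	 * consecutive run of completed writes, starting from the oldest
-	 * unwritten entry:
-	 */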
- for (seq = journal_last_unwritten_seq(j);
- seq <= journal_cur_seq(j);
- seq++) {
- w = j->buf + (seq & JOURNAL_BUF_MASK);
- if (!w->write_done)
- break;
-
- if (!j->err_seq && !w->noflush) {
- j->flushed_seq_ondisk = seq;
- j->last_seq_ondisk = w->last_seq;
-
- closure_wake_up(&c->freelist_wait);
- bch2_reset_alloc_cursors(c);
- }
-
- j->seq_ondisk = seq;
-
- /*
- * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
- * more buckets:
- *
- * Must come before signaling write completion, for
- * bch2_fs_journal_stop():
- */
- if (j->watermark != BCH_WATERMARK_stripe)
- journal_reclaim_kick(&c->journal);
-
- closure_wake_up(&w->wait);
- completed = true;
- }
-
- if (completed) {
- bch2_journal_reclaim_fast(j);
- bch2_journal_space_available(j);
-
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
-
- journal_wake(j);
- }
-
- if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
- j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
- struct journal_buf *buf = journal_cur_buf(j);
- long delta = buf->expires - jiffies;
-
- /*
- * We don't close a journal entry to write it while there's
- * previous entries still in flight - the current journal entry
- * might want to be written now:
- */
- mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
- }
-
- /*
-	 * We don't typically trigger journal writes from here - the next journal
- * write will be triggered immediately after the previous one is
- * allocated, in bch2_journal_write() - but the journal write error path
- * is special:
- */
- bch2_journal_do_writes(j);
- spin_unlock(&j->lock);
-
- if (do_discards)
- bch2_do_discards(c);
-}
-
-static void journal_write_endio(struct bio *bio)
-{
- struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
- struct bch_dev *ca = jbio->ca;
- struct journal *j = &ca->fs->journal;
- struct journal_buf *w = j->buf + jbio->buf_idx;
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
- jbio->submit_time, !bio->bi_status);
-
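-	/*
-	 * On error, drop this device from the list of devices the entry was
-	 * successfully written to:
-	 */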
- if (bio->bi_status) {
- bch_err_dev_ratelimited(ca,
- "error writing journal entry %llu: %s",
- le64_to_cpu(w->data->seq),
- bch2_blk_status_to_str(bio->bi_status));
-
- unsigned long flags;
- spin_lock_irqsave(&j->err_lock, flags);
- bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
- spin_unlock_irqrestore(&j->err_lock, flags);
- }
-
- closure_put(&w->io);
- percpu_ref_put(&ca->io_ref);
-}
-
-static CLOSURE_CALLBACK(journal_write_submit)
-{
- closure_type(w, struct journal_buf, io);
- struct journal *j = container_of(w, struct journal, buf[w->idx]);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- unsigned sectors = vstruct_sectors(w->data, c->block_bits);
-
- extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
- if (!ca) {
- /* XXX: fix this */
-			bch_err(c, "missing device for journal write");
- continue;
- }
-
- this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
- sectors);
-
- struct journal_device *ja = &ca->journal;
- struct journal_bio *jbio = ja->bio[w->idx];
- struct bio *bio = &jbio->bio;
-
- jbio->submit_time = local_clock();
-
- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
- bio->bi_iter.bi_sector = ptr->offset;
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0);
-
- BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
- ca->prev_journal_sector = bio->bi_iter.bi_sector;
-
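-		/*
-		 * Flush writes need FUA; issue the preflush here too, unless
-		 * separate flushes were already sent by journal_write_preflush():
-		 */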
- if (!JSET_NO_FLUSH(w->data))
- bio->bi_opf |= REQ_FUA;
- if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
- bio->bi_opf |= REQ_PREFLUSH;
-
- bch2_bio_map(bio, w->data, sectors << 9);
-
- trace_and_count(c, journal_write, bio);
- closure_bio_submit(bio, cl);
-
- ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
- }
-
- continue_at(cl, journal_write_done, j->wq);
-}
-
-static CLOSURE_CALLBACK(journal_write_preflush)
-{
- closure_type(w, struct journal_buf, io);
- struct journal *j = container_of(w, struct journal, buf[w->idx]);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
- /*
-	 * Wait for previous journal writes to complete; they won't necessarily
- * be flushed if they're still in flight
- */
- if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
- spin_lock(&j->lock);
- if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
- closure_wait(&j->async_wait, cl);
- spin_unlock(&j->lock);
- continue_at(cl, journal_write_preflush, j->wq);
- return;
- }
- spin_unlock(&j->lock);
- }
-
- if (w->separate_flush) {
- for_each_rw_member(c, ca) {
- percpu_ref_get(&ca->io_ref);
-
- struct journal_device *ja = &ca->journal;
- struct bio *bio = &ja->bio[w->idx]->bio;
- bio_reset(bio, ca->disk_sb.bdev,
- REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- closure_bio_submit(bio, cl);
- }
-
- continue_at(cl, journal_write_submit, j->wq);
- } else {
- /*
- * no need to punt to another work item if we're not waiting on
- * preflushes
- */
- journal_write_submit(&cl->work);
- }
-}
-
-static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct jset_entry *start, *end;
- struct jset *jset = w->data;
- struct journal_keys_to_wb wb = { NULL };
- unsigned sectors, bytes, u64s;
- unsigned long btree_roots_have = 0;
- bool validate_before_checksum = false;
- u64 seq = le64_to_cpu(jset->seq);
- int ret;
-
- /*
- * Simple compaction, dropping empty jset_entries (from journal
- * reservations that weren't fully used) and merging jset_entries that
- * can be.
- *
- * If we wanted to be really fancy here, we could sort all the keys in
- * the jset and drop keys that were overwritten - probably not worth it:
- */
- vstruct_for_each(jset, i) {
- unsigned u64s = le16_to_cpu(i->u64s);
-
- /* Empty entry: */
- if (!u64s)
- continue;
-
- /*
- * New btree roots are set by journalling them; when the journal
- * entry gets written we have to propagate them to
- * c->btree_roots
- *
- * But, every journal entry we write has to contain all the
- * btree roots (at least for now); so after we copy btree roots
- * to c->btree_roots we have to get any missing btree roots and
- * add them to this journal entry:
- */
- switch (i->type) {
- case BCH_JSET_ENTRY_btree_root:
- bch2_journal_entry_to_btree_root(c, i);
- __set_bit(i->btree_id, &btree_roots_have);
- break;
- case BCH_JSET_ENTRY_write_buffer_keys:
- EBUG_ON(!w->need_flush_to_write_buffer);
-
- if (!wb.wb)
- bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
-
- jset_entry_for_each_key(i, k) {
- ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
- if (ret) {
- bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s",
- bch2_err_str(ret));
- bch2_journal_keys_to_write_buffer_end(c, &wb);
- return ret;
- }
- }
- i->type = BCH_JSET_ENTRY_btree_keys;
- break;
- }
- }
-
- if (wb.wb) {
- ret = bch2_journal_keys_to_write_buffer_end(c, &wb);
- if (ret) {
- bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s",
- bch2_err_str(ret));
- return ret;
- }
- }
-
- spin_lock(&c->journal.lock);
- w->need_flush_to_write_buffer = false;
- spin_unlock(&c->journal.lock);
-
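-	/*
-	 * Append the entries every journal write must contain: btree roots, a
-	 * datetime entry, and the common superblock entries:
-	 */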
- start = end = vstruct_last(jset);
-
- end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
-
- struct jset_entry_datetime *d =
- container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry);
- d->entry.type = BCH_JSET_ENTRY_datetime;
- d->seconds = cpu_to_le64(ktime_get_real_seconds());
-
- bch2_journal_super_entries_add_common(c, &end, seq);
- u64s = (u64 *) end - (u64 *) start;
-
- WARN_ON(u64s > j->entry_u64s_reserved);
-
- le32_add_cpu(&jset->u64s, u64s);
-
- sectors = vstruct_sectors(jset, c->block_bits);
- bytes = vstruct_bytes(jset);
-
- if (sectors > w->sectors) {
- bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
- vstruct_bytes(jset), w->sectors << 9,
- u64s, w->u64s_reserved, j->entry_u64s_reserved);
- return -EINVAL;
- }
-
- jset->magic = cpu_to_le64(jset_magic(c));
- jset->version = cpu_to_le32(c->sb.version);
-
- SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
- SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
-
- if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
- j->last_empty_seq = seq;
-
- if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
- validate_before_checksum = true;
-
- if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
- validate_before_checksum = true;
-
- if (validate_before_checksum &&
- (ret = jset_validate(c, NULL, jset, 0, WRITE)))
- return ret;
-
- ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
- jset->encrypted_start,
- vstruct_end(jset) - (void *) jset->encrypted_start);
-	if (bch2_fs_fatal_err_on(ret, c, "encrypting journal entry: %s", bch2_err_str(ret)))
- return ret;
-
- jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
- journal_nonce(jset), jset);
-
- if (!validate_before_checksum &&
- (ret = jset_validate(c, NULL, jset, 0, WRITE)))
- return ret;
-
- memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
- return 0;
-}
-
-static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- int error = bch2_journal_error(j);
-
- /*
- * If the journal is in an error state - we did an emergency shutdown -
- * we prefer to continue doing journal writes. We just mark them as
-	 * noflush so they'll never be used, but they'll still be visible to the
- * list_journal tool - this helps in debugging.
- *
- * There's a caveat: the first journal write after marking the
- * superblock dirty must always be a flush write, because on startup
- * from a clean shutdown we didn't necessarily read the journal and the
- * new journal write might overwrite whatever was in the journal
- * previously - we can't leave the journal without any flush writes in
- * it.
- *
- * So if we're in an error state, and we're still starting up, we don't
- * write anything at all.
- */
- if (error && test_bit(JOURNAL_need_flush_write, &j->flags))
- return error;
-
- if (error ||
- w->noflush ||
- (!w->must_flush &&
- time_before(jiffies, j->last_flush_write +
- msecs_to_jiffies(c->opts.journal_flush_delay)) &&
- test_bit(JOURNAL_may_skip_flush, &j->flags))) {
- w->noflush = true;
- SET_JSET_NO_FLUSH(w->data, true);
- w->data->last_seq = 0;
- w->last_seq = 0;
-
- j->nr_noflush_writes++;
- } else {
- w->must_flush = true;
- j->last_flush_write = jiffies;
- j->nr_flush_writes++;
- clear_bit(JOURNAL_need_flush_write, &j->flags);
- }
-
- return 0;
-}
-
-CLOSURE_CALLBACK(bch2_journal_write)
-{
- closure_type(w, struct journal_buf, io);
- struct journal *j = container_of(w, struct journal, buf[w->idx]);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_replicas_padded replicas;
- unsigned nr_rw_members = 0;
- int ret;
-
- for_each_rw_member(c, ca)
- nr_rw_members++;
-
- BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
- BUG_ON(!w->write_started);
- BUG_ON(w->write_allocated);
- BUG_ON(w->write_done);
-
- j->write_start_time = local_clock();
-
- spin_lock(&j->lock);
- if (nr_rw_members > 1)
- w->separate_flush = true;
-
- ret = bch2_journal_write_pick_flush(j, w);
- spin_unlock(&j->lock);
- if (ret)
- goto err;
-
- mutex_lock(&j->buf_lock);
- journal_buf_realloc(j, w);
-
- ret = bch2_journal_write_prep(j, w);
- mutex_unlock(&j->buf_lock);
- if (ret)
- goto err;
-
- j->entry_bytes_written += vstruct_bytes(w->data);
-
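-	/*
-	 * If allocation fails, issuing discards may free up journal buckets;
-	 * retry until it succeeds or there's nothing left to discard:
-	 */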
- while (1) {
- spin_lock(&j->lock);
- ret = journal_write_alloc(j, w);
- if (!ret || !j->can_discard)
- break;
-
- spin_unlock(&j->lock);
- bch2_journal_do_discards(j);
- }
-
- if (ret && !bch2_journal_error(j)) {
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
-
- prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"),
- le64_to_cpu(w->data->seq),
- vstruct_sectors(w->data, c->block_bits),
- bch2_err_str(ret));
- __bch2_journal_debug_to_text(&buf, j);
- spin_unlock(&j->lock);
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- }
- if (ret)
- goto err;
-
- /*
- * write is allocated, no longer need to account for it in
- * bch2_journal_space_available():
- */
- w->sectors = 0;
- w->write_allocated = true;
-
- /*
- * journal entry has been compacted and allocated, recalculate space
- * available:
- */
- bch2_journal_space_available(j);
- bch2_journal_do_writes(j);
- spin_unlock(&j->lock);
-
- w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
-
- if (c->opts.nochanges)
- goto no_io;
-
- /*
- * Mark journal replicas before we submit the write to guarantee
- * recovery will find the journal entries after a crash.
- */
- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
- w->devs_written);
- ret = bch2_mark_replicas(c, &replicas.e);
- if (ret)
- goto err;
-
- if (!JSET_NO_FLUSH(w->data))
- continue_at(cl, journal_write_preflush, j->wq);
- else
- continue_at(cl, journal_write_submit, j->wq);
- return;
-no_io:
- continue_at(cl, journal_write_done, j->wq);
- return;
-err:
- bch2_fatal_error(c);
- continue_at(cl, journal_write_done, j->wq);
-}
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
deleted file mode 100644
index 12b39fcb4424..000000000000
--- a/fs/bcachefs/journal_io.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_IO_H
-#define _BCACHEFS_JOURNAL_IO_H
-
-#include "darray.h"
-
-void bch2_journal_pos_from_member_info_set(struct bch_fs *);
-void bch2_journal_pos_from_member_info_resume(struct bch_fs *);
-
-struct journal_ptr {
- bool csum_good;
- u8 dev;
- u32 bucket;
- u32 bucket_offset;
- u64 sector;
-};
-
-/*
- * Only used for holding the journal entries we read in btree_journal_read()
- * during cache_registration
- * during cache registration
-struct journal_replay {
- DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
-
- bool csum_good;
- bool ignore_blacklisted;
- bool ignore_not_dirty;
- /* must be last: */
- struct jset j;
-};
-
-static inline bool journal_replay_ignore(struct journal_replay *i)
-{
- return !i || i->ignore_blacklisted || i->ignore_not_dirty;
-}
-
-static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
- struct jset_entry *entry, unsigned type)
-{
- while (entry < vstruct_last(jset)) {
- if (entry->type == type)
- return entry;
-
- entry = vstruct_next(entry);
- }
-
- return NULL;
-}
-
-#define for_each_jset_entry_type(entry, jset, type) \
- for (struct jset_entry *entry = (jset)->start; \
- (entry = __jset_entry_type_next(jset, entry, type)); \
- entry = vstruct_next(entry))
-
-#define jset_entry_for_each_key(_e, _k) \
- for (struct bkey_i *_k = (_e)->start; \
- _k < vstruct_last(_e); \
- _k = bkey_next(_k))
-
-#define for_each_jset_key(k, entry, jset) \
- for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)\
- jset_entry_for_each_key(entry, k)
-
-int bch2_journal_entry_validate(struct bch_fs *, struct jset *,
- struct jset_entry *, unsigned, int,
- struct bkey_validate_context);
-void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
- struct jset_entry *);
-
-void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
- struct journal_replay *);
-
-int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
-
-CLOSURE_CALLBACK(bch2_journal_write);
-
-static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
-{
- struct jset_entry *entry = *end;
- unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
-
- memset(entry, 0, u64s * sizeof(u64));
- /*
- * The u64s field counts from the start of data, ignoring the shared
- * fields.
- */
- entry->u64s = cpu_to_le16(u64s - 1);
-
- *end = vstruct_next(*end);
- return entry;
-}
-
-#endif /* _BCACHEFS_JOURNAL_IO_H */
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
deleted file mode 100644
index 5d1547aa118a..000000000000
--- a/fs/bcachefs/journal_reclaim.c
+++ /dev/null
@@ -1,1025 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-#include "replicas.h"
-#include "sb-members.h"
-#include "trace.h"
-
-#include <linux/kthread.h>
-#include <linux/sched/mm.h>
-
-/* Free space calculations: */
-
-static unsigned journal_space_from(struct journal_device *ja,
- enum journal_space_from from)
-{
- switch (from) {
- case journal_space_discarded:
- return ja->discard_idx;
- case journal_space_clean_ondisk:
- return ja->dirty_idx_ondisk;
- case journal_space_clean:
- return ja->dirty_idx;
- default:
- BUG();
- }
-}
-
-unsigned bch2_journal_dev_buckets_available(struct journal *j,
- struct journal_device *ja,
- enum journal_space_from from)
-{
- if (!ja->nr)
- return 0;
-
- unsigned available = (journal_space_from(ja, from) -
- ja->cur_idx - 1 + ja->nr) % ja->nr;
-
- /*
- * Don't use the last bucket unless writing the new last_seq
- * will make another bucket available:
- */
- if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
- --available;
-
- return available;
-}
-
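-/*
- * If we're low on journal space, pin fifo slots, or btree write buffer space,
- * raise the watermark so that only reclaim-critical journal reservations can
- * proceed:
- */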
-void bch2_journal_set_watermark(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- bool low_on_space = j->space[journal_space_clean].total * 4 <=
- j->space[journal_space_total].total;
- bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4;
- bool low_on_wb = bch2_btree_write_buffer_must_wait(c);
- unsigned watermark = low_on_space || low_on_pin || low_on_wb
- ? BCH_WATERMARK_reclaim
- : BCH_WATERMARK_stripe;
-
- if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) ||
- track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) ||
- track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
- trace_and_count(c, journal_full, c);
-
- mod_bit(JOURNAL_space_low, &j->flags, low_on_space || low_on_pin);
-
- swap(watermark, j->watermark);
- if (watermark > j->watermark)
- journal_wake(j);
-}
-
-static struct journal_space
-journal_dev_space_available(struct journal *j, struct bch_dev *ca,
- enum journal_space_from from)
-{
- struct journal_device *ja = &ca->journal;
- unsigned sectors, buckets, unwritten;
- u64 seq;
-
- if (from == journal_space_total)
- return (struct journal_space) {
- .next_entry = ca->mi.bucket_size,
- .total = ca->mi.bucket_size * ja->nr,
- };
-
- buckets = bch2_journal_dev_buckets_available(j, ja, from);
- sectors = ja->sectors_free;
-
- /*
-	 * Note that we don't allocate the space for a journal entry
- * until we write it out - thus, account for it here:
- */
- for (seq = journal_last_unwritten_seq(j);
- seq <= journal_cur_seq(j);
- seq++) {
- unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors;
-
- if (!unwritten)
- continue;
-
- /* entry won't fit on this device, skip: */
- if (unwritten > ca->mi.bucket_size)
- continue;
-
- if (unwritten >= sectors) {
- if (!buckets) {
- sectors = 0;
- break;
- }
-
- buckets--;
- sectors = ca->mi.bucket_size;
- }
-
- sectors -= unwritten;
- }
-
- if (sectors < ca->mi.bucket_size && buckets) {
- buckets--;
- sectors = ca->mi.bucket_size;
- }
-
- return (struct journal_space) {
- .next_entry = sectors,
- .total = sectors + buckets * ca->mi.bucket_size,
- };
-}
-
-static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want,
- enum journal_space_from from)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- unsigned pos, nr_devs = 0;
- struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];
- unsigned min_bucket_size = U32_MAX;
-
- BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
-
- rcu_read_lock();
- for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
- if (!ca->journal.nr ||
- !ca->mi.durability)
- continue;
-
- min_bucket_size = min(min_bucket_size, ca->mi.bucket_size);
-
- space = journal_dev_space_available(j, ca, from);
- if (!space.next_entry)
- continue;
-
- for (pos = 0; pos < nr_devs; pos++)
- if (space.total > dev_space[pos].total)
- break;
-
- array_insert_item(dev_space, nr_devs, pos, space);
- }
- rcu_read_unlock();
-
- if (nr_devs < nr_devs_want)
- return (struct journal_space) { 0, 0 };
-
- /*
- * We sorted largest to smallest, and we want the smallest out of the
- * @nr_devs_want largest devices:
- */
- space = dev_space[nr_devs_want - 1];
- space.next_entry = min(space.next_entry, min_bucket_size);
- return space;
-}
-
-void bch2_journal_space_available(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- unsigned clean, clean_ondisk, total;
- unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
- j->buf[1].buf_size >> 9);
- unsigned nr_online = 0, nr_devs_want;
- bool can_discard = false;
- int ret = 0;
-
- lockdep_assert_held(&j->lock);
-
- rcu_read_lock();
- for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
- struct journal_device *ja = &ca->journal;
-
- if (!ja->nr)
- continue;
-
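-		/*
-		 * Advance the dirty and ondisk-dirty indexes past buckets that
-		 * no longer contain live journal entries:
-		 */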
- while (ja->dirty_idx != ja->cur_idx &&
- ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
- ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
-
- while (ja->dirty_idx_ondisk != ja->dirty_idx &&
- ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
- ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
-
- if (ja->discard_idx != ja->dirty_idx_ondisk)
- can_discard = true;
-
- max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
- nr_online++;
- }
- rcu_read_unlock();
-
- j->can_discard = can_discard;
-
- if (nr_online < metadata_replicas_required(c)) {
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
- prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
- "rw journal devs:", nr_online, metadata_replicas_required(c));
-
- rcu_read_lock();
- for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
- prt_printf(&buf, " %s", ca->name);
- rcu_read_unlock();
-
- bch_err(c, "%s", buf.buf);
- printbuf_exit(&buf);
- ret = -BCH_ERR_insufficient_journal_devices;
- goto out;
- }
-
- nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
-
- for (unsigned i = 0; i < journal_space_nr; i++)
- j->space[i] = __journal_space_available(j, nr_devs_want, i);
-
- clean_ondisk = j->space[journal_space_clean_ondisk].total;
- clean = j->space[journal_space_clean].total;
- total = j->space[journal_space_total].total;
-
- if (!j->space[journal_space_discarded].next_entry)
- ret = -BCH_ERR_journal_full;
-
- if ((j->space[journal_space_clean_ondisk].next_entry <
- j->space[journal_space_clean_ondisk].total) &&
- (clean - clean_ondisk <= total / 8) &&
- (clean_ondisk * 2 > clean))
- set_bit(JOURNAL_may_skip_flush, &j->flags);
- else
- clear_bit(JOURNAL_may_skip_flush, &j->flags);
-
- bch2_journal_set_watermark(j);
-out:
- j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
- j->cur_entry_error = ret;
-
- if (!ret)
- journal_wake(j);
-}
-
-/* Discards - last part of journal reclaim: */
-
-static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
-{
- bool ret;
-
- spin_lock(&j->lock);
- ret = ja->discard_idx != ja->dirty_idx_ondisk;
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-/*
- * Advance ja->discard_idx as long as it points to buckets that are no longer
- * dirty, issuing discards if necessary:
- */
-void bch2_journal_do_discards(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
- mutex_lock(&j->discard_lock);
-
- for_each_rw_member(c, ca) {
- struct journal_device *ja = &ca->journal;
-
- while (should_discard_bucket(j, ja)) {
- if (!c->opts.nochanges &&
- ca->mi.discard &&
- bdev_max_discard_sectors(ca->disk_sb.bdev))
- blkdev_issue_discard(ca->disk_sb.bdev,
- bucket_to_sector(ca,
- ja->buckets[ja->discard_idx]),
- ca->mi.bucket_size, GFP_NOFS);
-
- spin_lock(&j->lock);
- ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
-
- bch2_journal_space_available(j);
- spin_unlock(&j->lock);
- }
- }
-
- mutex_unlock(&j->discard_lock);
-}
-
-/*
- * Journal entry pinning - machinery for holding a reference on a given journal
- * entry, holding it open to ensure it gets replayed during recovery:
- */
-
-void bch2_journal_reclaim_fast(struct journal *j)
-{
- bool popped = false;
-
- lockdep_assert_held(&j->lock);
-
- /*
- * Unpin journal entries whose reference counts reached zero, meaning
- * all btree nodes got written out
- */
- while (!fifo_empty(&j->pin) &&
- j->pin.front <= j->seq_ondisk &&
- !atomic_read(&fifo_peek_front(&j->pin).count)) {
- j->pin.front++;
- popped = true;
- }
-
- if (popped) {
- bch2_journal_space_available(j);
- __closure_wake_up(&j->reclaim_flush_wait);
- }
-}
-
-bool __bch2_journal_pin_put(struct journal *j, u64 seq)
-{
- struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
-
- return atomic_dec_and_test(&pin_list->count);
-}
-
-void bch2_journal_pin_put(struct journal *j, u64 seq)
-{
- if (__bch2_journal_pin_put(j, seq)) {
- spin_lock(&j->lock);
- bch2_journal_reclaim_fast(j);
- spin_unlock(&j->lock);
- }
-}
-
-static inline bool __journal_pin_drop(struct journal *j,
- struct journal_entry_pin *pin)
-{
- struct journal_entry_pin_list *pin_list;
-
- if (!journal_pin_active(pin))
- return false;
-
- if (j->flush_in_progress == pin)
- j->flush_in_progress_dropped = true;
-
- pin_list = journal_seq_pin(j, pin->seq);
- pin->seq = 0;
- list_del_init(&pin->list);
-
- if (j->reclaim_flush_wait.list.first)
- __closure_wake_up(&j->reclaim_flush_wait);
-
- /*
- * Unpinning a journal entry may make journal_next_bucket() succeed, if
- * writing a new last_seq will now make another bucket available:
- */
- return atomic_dec_and_test(&pin_list->count) &&
- pin_list == &fifo_peek_front(&j->pin);
-}
-
-void bch2_journal_pin_drop(struct journal *j,
- struct journal_entry_pin *pin)
-{
- spin_lock(&j->lock);
- if (__journal_pin_drop(j, pin))
- bch2_journal_reclaim_fast(j);
- spin_unlock(&j->lock);
-}
-
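-/*
- * Map a pin's flush callback back to its pin type: btree node pins are
- * bucketed by level, key cache pins get their own type, everything else is
- * "other":
- */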
-static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin,
- journal_pin_flush_fn fn)
-{
- if (fn == bch2_btree_node_flush0 ||
- fn == bch2_btree_node_flush1) {
- unsigned idx = fn == bch2_btree_node_flush1;
- struct btree *b = container_of(pin, struct btree, writes[idx].journal);
-
- return JOURNAL_PIN_TYPE_btree0 - b->c.level;
- } else if (fn == bch2_btree_key_cache_journal_flush)
- return JOURNAL_PIN_TYPE_key_cache;
- else
- return JOURNAL_PIN_TYPE_other;
-}
-
-static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn,
- enum journal_pin_type type)
-{
- struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
-
- /*
- * flush_fn is how we identify journal pins in debugfs, so must always
- * exist, even if it doesn't do anything:
- */
- BUG_ON(!flush_fn);
-
- atomic_inc(&pin_list->count);
- pin->seq = seq;
- pin->flush = flush_fn;
-
- if (list_empty(&pin_list->unflushed[type]) &&
- j->reclaim_flush_wait.list.first)
- __closure_wake_up(&j->reclaim_flush_wait);
-
- list_add(&pin->list, &pin_list->unflushed[type]);
-}
-
-void bch2_journal_pin_copy(struct journal *j,
- struct journal_entry_pin *dst,
- struct journal_entry_pin *src,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock(&j->lock);
-
- u64 seq = READ_ONCE(src->seq);
-
- if (seq < journal_last_seq(j)) {
- /*
- * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
- * the src pin - with the pin dropped, the entry to pin might no
-		 * longer exist, but that means there's no longer anything to
- * copy and we can bail out here:
- */
- spin_unlock(&j->lock);
- return;
- }
-
- bool reclaim = __journal_pin_drop(j, dst);
-
- bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn));
-
- if (reclaim)
- bch2_journal_reclaim_fast(j);
-
- /*
- * If the journal is currently full, we might want to call flush_fn
- * immediately:
- */
- if (seq == journal_last_seq(j))
- journal_wake(j);
- spin_unlock(&j->lock);
-}
-
-void bch2_journal_pin_set(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock(&j->lock);
-
- BUG_ON(seq < journal_last_seq(j));
-
- bool reclaim = __journal_pin_drop(j, pin);
-
- bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn));
-
- if (reclaim)
- bch2_journal_reclaim_fast(j);
- /*
- * If the journal is currently full, we might want to call flush_fn
- * immediately:
- */
- if (seq == journal_last_seq(j))
- journal_wake(j);
-
- spin_unlock(&j->lock);
-}
-
-/**
- * bch2_journal_pin_flush: ensure journal pin callback is no longer running
- * @j: journal object
- * @pin: pin to flush
- */
-void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
-{
- BUG_ON(journal_pin_active(pin));
-
- wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
-}
-
-/*
- * Journal reclaim: flush references to open journal entries to reclaim space in
- * the journal
- *
- * May be done by the journal code in the background as needed to free up space
- * for more journal entries, or as part of doing a clean shutdown, or to migrate
- * data off of a specific device:
- */
-
-static struct journal_entry_pin *
-journal_get_next_pin(struct journal *j,
- u64 seq_to_flush,
- unsigned allowed_below_seq,
- unsigned allowed_above_seq,
- u64 *seq)
-{
- struct journal_entry_pin_list *pin_list;
- struct journal_entry_pin *ret = NULL;
-
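-	/*
-	 * allowed_below_seq/allowed_above_seq are bitmasks of the pin types
-	 * we're permitted to flush below/above seq_to_flush:
-	 */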
- fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
- if (*seq > seq_to_flush && !allowed_above_seq)
- break;
-
- for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
- if (((BIT(i) & allowed_below_seq) && *seq <= seq_to_flush) ||
- (BIT(i) & allowed_above_seq)) {
- ret = list_first_entry_or_null(&pin_list->unflushed[i],
- struct journal_entry_pin, list);
- if (ret)
- return ret;
- }
- }
-
- return NULL;
-}
-
-/* returns true if we did work */
-static size_t journal_flush_pins(struct journal *j,
- u64 seq_to_flush,
- unsigned allowed_below_seq,
- unsigned allowed_above_seq,
- unsigned min_any,
- unsigned min_key_cache)
-{
- struct journal_entry_pin *pin;
- size_t nr_flushed = 0;
- journal_pin_flush_fn flush_fn;
- u64 seq;
- int err;
-
- lockdep_assert_held(&j->reclaim_lock);
-
- while (1) {
- unsigned allowed_above = allowed_above_seq;
- unsigned allowed_below = allowed_below_seq;
-
- if (min_any) {
- allowed_above |= ~0;
- allowed_below |= ~0;
- }
-
- if (min_key_cache) {
- allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache);
- allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache);
- }
-
- cond_resched();
-
- j->last_flushed = jiffies;
-
- spin_lock(&j->lock);
- pin = journal_get_next_pin(j, seq_to_flush,
- allowed_below,
- allowed_above, &seq);
- if (pin) {
- BUG_ON(j->flush_in_progress);
- j->flush_in_progress = pin;
- j->flush_in_progress_dropped = false;
- flush_fn = pin->flush;
- }
- spin_unlock(&j->lock);
-
- if (!pin)
- break;
-
- if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush)
- min_key_cache--;
-
- if (min_any)
- min_any--;
-
- err = flush_fn(j, pin, seq);
-
- spin_lock(&j->lock);
- /* Pin might have been dropped or rearmed: */
- if (likely(!err && !j->flush_in_progress_dropped))
- list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]);
- j->flush_in_progress = NULL;
- j->flush_in_progress_dropped = false;
- spin_unlock(&j->lock);
-
- wake_up(&j->pin_flush_wait);
-
- if (err)
- break;
-
- nr_flushed++;
- }
-
- return nr_flushed;
-}
-
-static u64 journal_seq_to_flush(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- u64 seq_to_flush = 0;
-
- spin_lock(&j->lock);
-
- for_each_rw_member(c, ca) {
- struct journal_device *ja = &ca->journal;
- unsigned nr_buckets, bucket_to_flush;
-
- if (!ja->nr)
- continue;
-
- /* Try to keep the journal at most half full: */
- nr_buckets = ja->nr / 2;
-
- nr_buckets = min(nr_buckets, ja->nr);
-
- bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
- seq_to_flush = max(seq_to_flush,
- ja->bucket_seq[bucket_to_flush]);
- }
-
- /* Also flush if the pin fifo is more than half full */
- seq_to_flush = max_t(s64, seq_to_flush,
- (s64) journal_cur_seq(j) -
- (j->pin.size >> 1));
- spin_unlock(&j->lock);
-
- return seq_to_flush;
-}
-
-/**
- * __bch2_journal_reclaim - free up journal buckets
- * @j: journal object
- * @direct: direct or background reclaim?
- * @kicked: requested to run since we last ran?
- *
- * Background journal reclaim writes out btree nodes. It should be run
- * early enough so that we never completely run out of journal buckets.
- *
- * High watermarks for triggering background reclaim:
- * - FIFO has fewer than 512 entries left
- * - fewer than 25% journal buckets free
- *
- * Background reclaim runs until low watermarks are reached:
- * - FIFO has more than 1024 entries left
- * - more than 50% journal buckets free
- *
- * As long as a reclaim can complete in the time it takes to fill up
- * 512 journal entries or 25% of all journal buckets, then
- * journal_next_bucket() should not stall.
- */
-static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct btree_cache *bc = &c->btree_cache;
- bool kthread = (current->flags & PF_KTHREAD) != 0;
- u64 seq_to_flush;
- size_t min_nr, min_key_cache, nr_flushed;
- unsigned flags;
- int ret = 0;
-
- /*
- * We can't invoke memory reclaim while holding the reclaim_lock -
- * journal reclaim is required to make progress for memory reclaim
- * (cleaning the caches), so we can't get stuck in memory reclaim while
- * we're holding the reclaim lock:
- */
- lockdep_assert_held(&j->reclaim_lock);
- flags = memalloc_noreclaim_save();
-
- do {
- if (kthread && kthread_should_stop())
- break;
-
- ret = bch2_journal_error(j);
- if (ret)
- break;
-
- bch2_journal_do_discards(j);
-
- seq_to_flush = journal_seq_to_flush(j);
- min_nr = 0;
-
- /*
- * If it's been longer than j->reclaim_delay_ms since we last flushed,
- * make sure to flush at least one journal pin:
- */
- if (time_after(jiffies, j->last_flushed +
- msecs_to_jiffies(c->opts.journal_reclaim_delay)))
- min_nr = 1;
-
- if (j->watermark != BCH_WATERMARK_stripe)
- min_nr = 1;
-
- size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr;
- if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live)
- min_nr = 1;
-
- min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
-
- trace_and_count(c, journal_reclaim_start, c,
- direct, kicked,
- min_nr, min_key_cache,
- atomic_long_read(&bc->nr_dirty), btree_cache_live,
- atomic_long_read(&c->btree_key_cache.nr_dirty),
- atomic_long_read(&c->btree_key_cache.nr_keys));
-
- nr_flushed = journal_flush_pins(j, seq_to_flush,
- ~0, 0,
- min_nr, min_key_cache);
-
- if (direct)
- j->nr_direct_reclaim += nr_flushed;
- else
- j->nr_background_reclaim += nr_flushed;
- trace_and_count(c, journal_reclaim_finish, c, nr_flushed);
-
- if (nr_flushed)
- wake_up(&j->reclaim_wait);
- } while ((min_nr || min_key_cache) && nr_flushed && !direct);
-
- memalloc_noreclaim_restore(flags);
-
- return ret;
-}
-
-int bch2_journal_reclaim(struct journal *j)
-{
- return __bch2_journal_reclaim(j, true, true);
-}
-
-static int bch2_journal_reclaim_thread(void *arg)
-{
- struct journal *j = arg;
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- unsigned long delay, now;
- bool journal_empty;
- int ret = 0;
-
- set_freezable();
-
- j->last_flushed = jiffies;
-
- while (!ret && !kthread_should_stop()) {
- bool kicked = j->reclaim_kicked;
-
- j->reclaim_kicked = false;
-
- mutex_lock(&j->reclaim_lock);
- ret = __bch2_journal_reclaim(j, false, kicked);
- mutex_unlock(&j->reclaim_lock);
-
- now = jiffies;
- delay = msecs_to_jiffies(c->opts.journal_reclaim_delay);
- j->next_reclaim = j->last_flushed + delay;
-
- if (!time_in_range(j->next_reclaim, now, now + delay))
- j->next_reclaim = now + delay;
-
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
- if (kthread_should_stop())
- break;
- if (j->reclaim_kicked)
- break;
-
- spin_lock(&j->lock);
- journal_empty = fifo_empty(&j->pin);
- spin_unlock(&j->lock);
-
- long timeout = j->next_reclaim - jiffies;
-
- if (journal_empty)
- schedule();
- else if (timeout > 0)
- schedule_timeout(timeout);
- else
- break;
- }
- __set_current_state(TASK_RUNNING);
- }
-
- return 0;
-}
-
-void bch2_journal_reclaim_stop(struct journal *j)
-{
- struct task_struct *p = j->reclaim_thread;
-
- j->reclaim_thread = NULL;
-
- if (p) {
- kthread_stop(p);
- put_task_struct(p);
- }
-}
-
-int bch2_journal_reclaim_start(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct task_struct *p;
- int ret;
-
- if (j->reclaim_thread)
- return 0;
-
- p = kthread_create(bch2_journal_reclaim_thread, j,
- "bch-reclaim/%s", c->name);
- ret = PTR_ERR_OR_ZERO(p);
- bch_err_msg(c, ret, "creating journal reclaim thread");
- if (ret)
- return ret;
-
- get_task_struct(p);
- j->reclaim_thread = p;
- wake_up_process(p);
- return 0;
-}
-
-static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush,
- unsigned types)
-{
- struct journal_entry_pin_list *pin_list;
- u64 seq;
-
- spin_lock(&j->lock);
- fifo_for_each_entry_ptr(pin_list, &j->pin, seq) {
- if (seq > seq_to_flush)
- break;
-
- for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
- if ((BIT(i) & types) &&
- (!list_empty(&pin_list->unflushed[i]) ||
- !list_empty(&pin_list->flushed[i]))) {
- spin_unlock(&j->lock);
- return true;
- }
- }
- spin_unlock(&j->lock);
-
- return false;
-}
-
-static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush,
- unsigned types)
-{
- return journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) ||
- journal_pins_still_flushing(j, seq_to_flush, types);
-}
-
-static int journal_flush_done(struct journal *j, u64 seq_to_flush,
- bool *did_work)
-{
- int ret = 0;
-
- ret = bch2_journal_error(j);
- if (ret)
- return ret;
-
- mutex_lock(&j->reclaim_lock);
-
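-	/*
-	 * Flush pins one type at a time; if anything was flushed, or is still
-	 * being flushed, report did_work so the caller polls again:
-	 */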
- for (int type = JOURNAL_PIN_TYPE_NR - 1;
- type >= 0;
- --type)
- if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) {
- *did_work = true;
- goto unlock;
- }
-
- if (seq_to_flush > journal_cur_seq(j))
- bch2_journal_entry_close(j);
-
- spin_lock(&j->lock);
- /*
- * If journal replay hasn't completed, the unreplayed journal entries
- * hold refs on their corresponding sequence numbers
- */
- ret = !test_bit(JOURNAL_replay_done, &j->flags) ||
- journal_last_seq(j) > seq_to_flush ||
- !fifo_used(&j->pin);
-
- spin_unlock(&j->lock);
-unlock:
- mutex_unlock(&j->reclaim_lock);
-
- return ret;
-}
-
-bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
-{
- /* time_stats this */
- bool did_work = false;
-
- if (!test_bit(JOURNAL_running, &j->flags))
- return false;
-
- closure_wait_event(&j->reclaim_flush_wait,
- journal_flush_done(j, seq_to_flush, &did_work));
-
- return did_work;
-}
-
-int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_entry_pin_list *p;
- u64 iter, seq = 0;
- int ret = 0;
-
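-	/*
-	 * Find the newest sequence number that still references this device
-	 * (or, for dev_idx < 0, that has too few replicas), and flush up to it:
-	 */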
- spin_lock(&j->lock);
- fifo_for_each_entry_ptr(p, &j->pin, iter)
- if (dev_idx >= 0
- ? bch2_dev_list_has_dev(p->devs, dev_idx)
- : p->devs.nr < c->opts.metadata_replicas)
- seq = iter;
- spin_unlock(&j->lock);
-
- bch2_journal_flush_pins(j, seq);
-
- ret = bch2_journal_error(j);
- if (ret)
- return ret;
-
- mutex_lock(&c->replicas_gc_lock);
- bch2_replicas_gc_start(c, 1 << BCH_DATA_journal);
-
- /*
- * Now that we've populated replicas_gc, write to the journal to mark
- * active journal devices. This handles the case where the journal might
- * be empty. Otherwise we could clear all journal replicas and
- * temporarily put the fs into an unrecoverable state. Journal recovery
- * expects to find devices marked for journal data on unclean mount.
- */
- ret = bch2_journal_meta(&c->journal);
- if (ret)
- goto err;
-
- seq = 0;
- spin_lock(&j->lock);
- while (!ret) {
- struct bch_replicas_padded replicas;
-
- seq = max(seq, journal_last_seq(j));
- if (seq >= j->pin.back)
- break;
- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
- journal_seq_pin(j, seq)->devs);
- seq++;
-
- if (replicas.e.nr_devs) {
- spin_unlock(&j->lock);
- ret = bch2_mark_replicas(c, &replicas.e);
- spin_lock(&j->lock);
- }
- }
- spin_unlock(&j->lock);
-err:
- ret = bch2_replicas_gc_end(c, ret);
- mutex_unlock(&c->replicas_gc_lock);
-
- return ret;
-}
-
-bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
-{
- struct journal_entry_pin_list *pin_list;
- struct journal_entry_pin *pin;
-
- spin_lock(&j->lock);
- if (!test_bit(JOURNAL_running, &j->flags)) {
- spin_unlock(&j->lock);
- return true;
- }
-
- *seq = max(*seq, j->pin.front);
-
- if (*seq >= j->pin.back) {
- spin_unlock(&j->lock);
- return true;
- }
-
- out->atomic++;
-
- pin_list = journal_seq_pin(j, *seq);
-
- prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "unflushed:\n");
- for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++)
- list_for_each_entry(pin, &pin_list->unflushed[i], list)
- prt_printf(out, "\t%px %ps\n", pin, pin->flush);
-
- prt_printf(out, "flushed:\n");
- for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++)
- list_for_each_entry(pin, &pin_list->flushed[i], list)
- prt_printf(out, "\t%px %ps\n", pin, pin->flush);
-
- printbuf_indent_sub(out, 2);
-
- --out->atomic;
- spin_unlock(&j->lock);
-
- return false;
-}
-
-void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
-{
- u64 seq = 0;
-
- while (!bch2_journal_seq_pins_to_text(out, j, &seq))
- seq++;
-}
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
deleted file mode 100644
index 0a73d7134e1c..000000000000
--- a/fs/bcachefs/journal_reclaim.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_RECLAIM_H
-#define _BCACHEFS_JOURNAL_RECLAIM_H
-
-#define JOURNAL_PIN (32 * 1024)
-
-static inline void journal_reclaim_kick(struct journal *j)
-{
- struct task_struct *p = READ_ONCE(j->reclaim_thread);
-
- j->reclaim_kicked = true;
- if (p)
- wake_up_process(p);
-}
-
-unsigned bch2_journal_dev_buckets_available(struct journal *,
- struct journal_device *,
- enum journal_space_from);
-void bch2_journal_set_watermark(struct journal *);
-void bch2_journal_space_available(struct journal *);
-
-static inline bool journal_pin_active(struct journal_entry_pin *pin)
-{
- return pin->seq != 0;
-}
-
-static inline struct journal_entry_pin_list *
-journal_seq_pin(struct journal *j, u64 seq)
-{
- EBUG_ON(seq < j->pin.front || seq >= j->pin.back);
-
- return &j->pin.data[seq & j->pin.mask];
-}
-
-void bch2_journal_reclaim_fast(struct journal *);
-bool __bch2_journal_pin_put(struct journal *, u64);
-void bch2_journal_pin_put(struct journal *, u64);
-void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
-
-void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *,
- journal_pin_flush_fn);
-
-static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- if (unlikely(!journal_pin_active(pin) || pin->seq > seq))
- bch2_journal_pin_set(j, seq, pin, flush_fn);
-}
-
-void bch2_journal_pin_copy(struct journal *,
- struct journal_entry_pin *,
- struct journal_entry_pin *,
- journal_pin_flush_fn);
-
-static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- if (unlikely(!journal_pin_active(pin) || pin->seq < seq))
- bch2_journal_pin_set(j, seq, pin, flush_fn);
-}
-
-void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
-
-void bch2_journal_do_discards(struct journal *);
-int bch2_journal_reclaim(struct journal *);
-
-void bch2_journal_reclaim_stop(struct journal *);
-int bch2_journal_reclaim_start(struct journal *);
-
-bool bch2_journal_flush_pins(struct journal *, u64);
-
-static inline bool bch2_journal_flush_all_pins(struct journal *j)
-{
- return bch2_journal_flush_pins(j, U64_MAX);
-}
-
-int bch2_journal_flush_device_pins(struct journal *, int);
-
-void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
-bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
-
-#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c
deleted file mode 100644
index 62b910f2fb27..000000000000
--- a/fs/bcachefs/journal_sb.c
+++ /dev/null
@@ -1,232 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "journal_sb.h"
-#include "darray.h"
-
-#include <linux/sort.h>
-
-/* BCH_SB_FIELD_journal: */
-
-static int u64_cmp(const void *_l, const void *_r)
-{
- const u64 *l = _l;
- const u64 *r = _r;
-
- return cmp_int(*l, *r);
-}
-
-static int bch2_sb_journal_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_journal *journal = field_to_type(f, journal);
- struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
- int ret = -BCH_ERR_invalid_sb_journal;
- unsigned nr;
- unsigned i;
- u64 *b;
-
- nr = bch2_nr_journal_buckets(journal);
- if (!nr)
- return 0;
-
- b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL);
- if (!b)
- return -BCH_ERR_ENOMEM_sb_journal_validate;
-
- for (i = 0; i < nr; i++)
- b[i] = le64_to_cpu(journal->buckets[i]);
-
- sort(b, nr, sizeof(u64), u64_cmp, NULL);
-
- if (!b[0]) {
- prt_printf(err, "journal bucket at sector 0");
- goto err;
- }
-
- if (b[0] < le16_to_cpu(m.first_bucket)) {
- prt_printf(err, "journal bucket %llu before first bucket %u",
- b[0], le16_to_cpu(m.first_bucket));
- goto err;
- }
-
- if (b[nr - 1] >= le64_to_cpu(m.nbuckets)) {
- prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
- b[nr - 1], le64_to_cpu(m.nbuckets));
- goto err;
- }
-
- for (i = 0; i + 1 < nr; i++)
- if (b[i] == b[i + 1]) {
- prt_printf(err, "duplicate journal buckets %llu", b[i]);
- goto err;
- }
-
- ret = 0;
-err:
- kfree(b);
- return ret;
-}
-
-static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_journal *journal = field_to_type(f, journal);
- unsigned i, nr = bch2_nr_journal_buckets(journal);
-
- prt_printf(out, "Buckets: ");
- for (i = 0; i < nr; i++)
- prt_printf(out, " %llu", le64_to_cpu(journal->buckets[i]));
- prt_newline(out);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_journal = {
- .validate = bch2_sb_journal_validate,
- .to_text = bch2_sb_journal_to_text,
-};
-
-struct u64_range {
- u64 start;
- u64 end;
-};
-
-static int u64_range_cmp(const void *_l, const void *_r)
-{
- const struct u64_range *l = _l;
- const struct u64_range *r = _r;
-
- return cmp_int(l->start, r->start);
-}
-
-static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
- struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
- int ret = -BCH_ERR_invalid_sb_journal;
- u64 sum = 0;
- unsigned nr;
- unsigned i;
- struct u64_range *b;
-
- nr = bch2_sb_field_journal_v2_nr_entries(journal);
- if (!nr)
- return 0;
-
- b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL);
- if (!b)
- return -BCH_ERR_ENOMEM_sb_journal_v2_validate;
-
- for (i = 0; i < nr; i++) {
- b[i].start = le64_to_cpu(journal->d[i].start);
- b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
-
- if (b[i].end <= b[i].start) {
- prt_printf(err, "journal buckets entry with bad nr: %llu+%llu",
- le64_to_cpu(journal->d[i].start),
- le64_to_cpu(journal->d[i].nr));
- goto err;
- }
-
- sum += le64_to_cpu(journal->d[i].nr);
- }
-
- sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
-
- if (!b[0].start) {
- prt_printf(err, "journal bucket at sector 0");
- goto err;
- }
-
- if (b[0].start < le16_to_cpu(m.first_bucket)) {
- prt_printf(err, "journal bucket %llu before first bucket %u",
- b[0].start, le16_to_cpu(m.first_bucket));
- goto err;
- }
-
- if (b[nr - 1].end > le64_to_cpu(m.nbuckets)) {
- prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
- b[nr - 1].end - 1, le64_to_cpu(m.nbuckets));
- goto err;
- }
-
- for (i = 0; i + 1 < nr; i++) {
- if (b[i].end > b[i + 1].start) {
- prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu",
- b[i].start, b[i].end, b[i + 1].start, b[i + 1].end);
- goto err;
- }
- }
-
- if (sum > UINT_MAX) {
- prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX);
- goto err;
- }
-
- ret = 0;
-err:
- kfree(b);
- return ret;
-}
-
-static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
- unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal);
-
- prt_printf(out, "Buckets: ");
- for (i = 0; i < nr; i++)
- prt_printf(out, " %llu-%llu",
- le64_to_cpu(journal->d[i].start),
- le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr));
- prt_newline(out);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = {
- .validate = bch2_sb_journal_v2_validate,
- .to_text = bch2_sb_journal_v2_to_text,
-};
-
-int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca,
- u64 *buckets, unsigned nr)
-{
- struct bch_sb_field_journal_v2 *j;
- unsigned i, dst = 0, nr_compacted = 1;
-
- if (c)
- lockdep_assert_held(&c->sb_lock);
-
- if (!nr) {
- bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
- bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2);
- return 0;
- }
-
- for (i = 0; i + 1 < nr; i++)
- if (buckets[i] + 1 != buckets[i + 1])
- nr_compacted++;
-
- j = bch2_sb_field_resize(&ca->disk_sb, journal_v2,
- (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64));
- if (!j)
- return -BCH_ERR_ENOSPC_sb_journal;
-
- bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
-
- j->d[dst].start = cpu_to_le64(buckets[0]);
- j->d[dst].nr = cpu_to_le64(1);
-
- for (i = 1; i < nr; i++) {
- if (buckets[i] == buckets[i - 1] + 1) {
- le64_add_cpu(&j->d[dst].nr, 1);
- } else {
- dst++;
- j->d[dst].start = cpu_to_le64(buckets[i]);
- j->d[dst].nr = cpu_to_le64(1);
- }
- }
-
- BUG_ON(dst + 1 != nr_compacted);
- return 0;
-}
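
For reference, the compaction in bch2_journal_buckets_to_sb() above is plain
run-length encoding: a sorted list of bucket numbers becomes (start, nr)
extents. A minimal userspace sketch of the same idea, with hypothetical names
(struct range, compact_buckets) rather than the bcachefs types:

#include <stdio.h>
#include <stdint.h>

struct range { uint64_t start, nr; };

/* Compact a sorted array of bucket numbers into contiguous extents;
 * returns the number of extents written to 'out' ('out' needs room for
 * 'nr' entries in the worst case of no adjacent buckets). */
static unsigned compact_buckets(const uint64_t *b, unsigned nr, struct range *out)
{
        unsigned dst = 0;

        if (!nr)
                return 0;

        out[0] = (struct range) { .start = b[0], .nr = 1 };

        for (unsigned i = 1; i < nr; i++) {
                if (b[i] == b[i - 1] + 1)
                        out[dst].nr++;          /* extend the current run */
                else
                        out[++dst] = (struct range) { .start = b[i], .nr = 1 };
        }
        return dst + 1;
}

int main(void)
{
        uint64_t b[] = { 10, 11, 12, 40, 41, 100 };
        struct range out[6];
        unsigned n = compact_buckets(b, 6, out);

        for (unsigned i = 0; i < n; i++)        /* prints: 10+3 40+2 100+1 */
                printf("%llu+%llu ", (unsigned long long) out[i].start,
                       (unsigned long long) out[i].nr);
        printf("\n");
        return 0;
}

The BUG_ON(dst + 1 != nr_compacted) in the original expresses the same
invariant: the pre-pass that counts runs and the encoding pass must agree.
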
diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h
deleted file mode 100644
index ba40a7e8d90a..000000000000
--- a/fs/bcachefs/journal_sb.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#include "super-io.h"
-#include "vstructs.h"
-
-static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
-{
- return j
- ? (__le64 *) vstruct_end(&j->field) - j->buckets
- : 0;
-}
-
-static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j)
-{
- if (!j)
- return 0;
-
- return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0];
-}
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_journal;
-extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2;
-
-int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned);
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
deleted file mode 100644
index e463d2d95359..000000000000
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ /dev/null
@@ -1,254 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "eytzinger.h"
-#include "journal.h"
-#include "journal_seq_blacklist.h"
-#include "super-io.h"
-
-/*
- * journal_seq_blacklist machinery:
- *
- * To guarantee order of btree updates after a crash, we need to detect when a
- * btree node entry (bset) is newer than the newest journal entry that was
- * successfully written, and ignore it - effectively ignoring any btree updates
- * that didn't make it into the journal.
- *
- * If we didn't do this, we might have two btree nodes, a and b, both with
- * updates that weren't written to the journal yet: if b was updated after a,
- * but b was flushed and not a - oops; on recovery we'll find that the updates
- * to b happened, but not the updates to a that happened before it.
- *
- * Ignoring bsets that are newer than the newest journal entry is always safe,
- * because everything they contain will also have been journalled - and must
- * still be present in the journal on disk until a journal entry has been
- * written _after_ that bset was written.
- *
- * To accomplish this, bsets record the newest journal sequence number they
- * contain updates for; then, on startup, the btree code queries the journal
- * code to ask "Is this sequence number newer than the newest journal entry? If
- * so, ignore it."
- *
- * When this happens, we must blacklist that journal sequence number: the
- * journal must not write any entries with that sequence number, and it must
- * record that it was blacklisted so that a) on recovery we don't think we have
- * missing journal entries and b) so that the btree code continues to ignore
- * that bset, until that btree node is rewritten.
- */
-
-static unsigned sb_blacklist_u64s(unsigned nr)
-{
- struct bch_sb_field_journal_seq_blacklist *bl;
-
- return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64);
-}
-
-int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
-{
- struct bch_sb_field_journal_seq_blacklist *bl;
- unsigned i = 0, nr;
- int ret = 0;
-
- mutex_lock(&c->sb_lock);
- bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
- nr = blacklist_nr_entries(bl);
-
- while (i < nr) {
- struct journal_seq_blacklist_entry *e =
- bl->start + i;
-
- if (end < le64_to_cpu(e->start))
- break;
-
- if (start > le64_to_cpu(e->end)) {
- i++;
- continue;
- }
-
- /*
-		 * Entry is contiguous with or overlaps the new entry:
-		 * merge it into the new entry, then delete it:
- */
-
- start = min(start, le64_to_cpu(e->start));
- end = max(end, le64_to_cpu(e->end));
- array_remove_item(bl->start, nr, i);
- }
-
- bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
- sb_blacklist_u64s(nr + 1));
- if (!bl) {
- ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist;
- goto out;
- }
-
- array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) {
- .start = cpu_to_le64(start),
- .end = cpu_to_le64(end),
- }));
- c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
-
- ret = bch2_write_super(c);
-out:
- mutex_unlock(&c->sb_lock);
-
- return ret ?: bch2_blacklist_table_initialize(c);
-}
-
-static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
-{
- const struct journal_seq_blacklist_table_entry *l = _l;
- const struct journal_seq_blacklist_table_entry *r = _r;
-
- return cmp_int(l->start, r->start);
-}
-
-bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq,
- bool dirty)
-{
- struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
- struct journal_seq_blacklist_table_entry search = { .start = seq };
- int idx;
-
- if (!t)
- return false;
-
- idx = eytzinger0_find_le(t->entries, t->nr,
- sizeof(t->entries[0]),
- journal_seq_blacklist_table_cmp,
- &search);
- if (idx < 0)
- return false;
-
- BUG_ON(t->entries[idx].start > seq);
-
- if (seq >= t->entries[idx].end)
- return false;
-
- if (dirty)
- t->entries[idx].dirty = true;
- return true;
-}
-
-int bch2_blacklist_table_initialize(struct bch_fs *c)
-{
- struct bch_sb_field_journal_seq_blacklist *bl =
- bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
- struct journal_seq_blacklist_table *t;
- unsigned i, nr = blacklist_nr_entries(bl);
-
- if (!bl)
- return 0;
-
- t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL);
- if (!t)
- return -BCH_ERR_ENOMEM_blacklist_table_init;
-
- t->nr = nr;
-
- for (i = 0; i < nr; i++) {
- t->entries[i].start = le64_to_cpu(bl->start[i].start);
- t->entries[i].end = le64_to_cpu(bl->start[i].end);
- }
-
- eytzinger0_sort(t->entries,
- t->nr,
- sizeof(t->entries[0]),
- journal_seq_blacklist_table_cmp,
- NULL);
-
- kfree(c->journal_seq_blacklist_table);
- c->journal_seq_blacklist_table = t;
- return 0;
-}
-
-static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_journal_seq_blacklist *bl =
- field_to_type(f, journal_seq_blacklist);
- unsigned i, nr = blacklist_nr_entries(bl);
-
- for (i = 0; i < nr; i++) {
- struct journal_seq_blacklist_entry *e = bl->start + i;
-
- if (le64_to_cpu(e->start) >=
- le64_to_cpu(e->end)) {
- prt_printf(err, "entry %u start >= end (%llu >= %llu)",
- i, le64_to_cpu(e->start), le64_to_cpu(e->end));
- return -BCH_ERR_invalid_sb_journal_seq_blacklist;
- }
-
- if (i + 1 < nr &&
- le64_to_cpu(e[0].end) >
- le64_to_cpu(e[1].start)) {
- prt_printf(err, "entry %u out of order with next entry (%llu > %llu)",
- i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start));
- return -BCH_ERR_invalid_sb_journal_seq_blacklist;
- }
- }
-
- return 0;
-}
-
-static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out,
- struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_journal_seq_blacklist *bl =
- field_to_type(f, journal_seq_blacklist);
- struct journal_seq_blacklist_entry *i;
- unsigned nr = blacklist_nr_entries(bl);
-
- for (i = bl->start; i < bl->start + nr; i++) {
- if (i != bl->start)
- prt_printf(out, " ");
-
- prt_printf(out, "%llu-%llu",
- le64_to_cpu(i->start),
- le64_to_cpu(i->end));
- }
- prt_newline(out);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = {
- .validate = bch2_sb_journal_seq_blacklist_validate,
- .to_text = bch2_sb_journal_seq_blacklist_to_text
-};
-
-bool bch2_blacklist_entries_gc(struct bch_fs *c)
-{
- struct journal_seq_blacklist_entry *src, *dst;
-
- struct bch_sb_field_journal_seq_blacklist *bl =
- bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
- if (!bl)
- return false;
-
- unsigned nr = blacklist_nr_entries(bl);
- dst = bl->start;
-
- struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
- BUG_ON(nr != t->nr);
-
- src = bl->start;
- eytzinger0_for_each(i, nr) {
- BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
- BUG_ON(t->entries[i].end != le64_to_cpu(src->end));
-
- if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk)
- *dst++ = *src;
- src++;
- }
-
- unsigned new_nr = dst - bl->start;
- if (new_nr == nr)
- return false;
-
- bch_verbose(c, "nr blacklist entries was %u, now %u", nr, new_nr);
-
- bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
- new_nr ? sb_blacklist_u64s(new_nr) : 0);
- BUG_ON(new_nr && !bl);
- return true;
-}
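
The merge loop in bch2_journal_seq_blacklist_add() above is worth reading in
isolation: entries are kept sorted and non-overlapping, anything the new
[start, end] range touches (overlapping or contiguous, ends inclusive) is
absorbed into it and deleted, then the widened range is inserted. A
standalone sketch under those assumptions; blacklist_add() and struct
bl_entry are hypothetical names, not the superblock field types:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct bl_entry { uint64_t start, end; };       /* inclusive ends */

/* Merge [start, end] into a sorted, non-overlapping list in place;
 * 'bl' must have room for one more entry, '*nr' is updated. */
static void blacklist_add(struct bl_entry *bl, unsigned *nr,
                          uint64_t start, uint64_t end)
{
        unsigned i = 0;

        while (i < *nr) {
                if (end < bl[i].start)
                        break;                  /* new entry goes before this one */
                if (start > bl[i].end) {
                        i++;                    /* strictly after: keep scanning */
                        continue;
                }

                /* overlapping or contiguous: absorb it, then delete it */
                if (bl[i].start < start) start = bl[i].start;
                if (bl[i].end > end)     end   = bl[i].end;
                memmove(&bl[i], &bl[i + 1], (*nr - i - 1) * sizeof(*bl));
                (*nr)--;
        }

        memmove(&bl[i + 1], &bl[i], (*nr - i) * sizeof(*bl));
        bl[i] = (struct bl_entry) { start, end };
        (*nr)++;
}

int main(void)
{
        struct bl_entry bl[8] = { { 1, 3 }, { 10, 12 } };
        unsigned nr = 2;

        blacklist_add(bl, &nr, 2, 11);          /* absorbs both: prints 1-12 */
        for (unsigned i = 0; i < nr; i++)
                printf("%llu-%llu\n", (unsigned long long) bl[i].start,
                       (unsigned long long) bl[i].end);
        return 0;
}
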
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
deleted file mode 100644
index d47636f96fdc..000000000000
--- a/fs/bcachefs/journal_seq_blacklist.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
-#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
-
-static inline unsigned
-blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl)
-{
- return bl
- ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) /
- sizeof(struct journal_seq_blacklist_entry))
- : 0;
-}
-
-bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool);
-int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64);
-int bch2_blacklist_table_initialize(struct bch_fs *);
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist;
-
-bool bch2_blacklist_entries_gc(struct bch_fs *);
-
-#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
diff --git a/fs/bcachefs/journal_seq_blacklist_format.h b/fs/bcachefs/journal_seq_blacklist_format.h
deleted file mode 100644
index 2566b12dbc04..000000000000
--- a/fs/bcachefs/journal_seq_blacklist_format.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H
-#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H
-
-struct journal_seq_blacklist_entry {
- __le64 start;
- __le64 end;
-};
-
-struct bch_sb_field_journal_seq_blacklist {
- struct bch_sb_field field;
- struct journal_seq_blacklist_entry start[];
-};
-
-#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H */
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
deleted file mode 100644
index 8e0eba776b9d..000000000000
--- a/fs/bcachefs/journal_types.h
+++ /dev/null
@@ -1,344 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_TYPES_H
-#define _BCACHEFS_JOURNAL_TYPES_H
-
-#include <linux/cache.h>
-#include <linux/workqueue.h>
-
-#include "alloc_types.h"
-#include "super_types.h"
-#include "fifo.h"
-
-/* btree write buffer steals 8 bits for its own purposes: */
-#define JOURNAL_SEQ_MAX ((1ULL << 56) - 1)
-
-#define JOURNAL_STATE_BUF_BITS 2
-#define JOURNAL_STATE_BUF_NR (1U << JOURNAL_STATE_BUF_BITS)
-#define JOURNAL_STATE_BUF_MASK (JOURNAL_STATE_BUF_NR - 1)
-
-#define JOURNAL_BUF_BITS 4
-#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS)
-#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)
-
-/*
- * We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to
- * the journal that are being staged or in flight.
- */
-struct journal_buf {
- struct closure io;
- struct jset *data;
-
- __BKEY_PADDED(key, BCH_REPLICAS_MAX);
- struct bch_devs_list devs_written;
-
- struct closure_waitlist wait;
- u64 last_seq; /* copy of data->last_seq */
- long expires;
- u64 flush_time;
-
- unsigned buf_size; /* size in bytes of @data */
- unsigned sectors; /* maximum size for current entry */
- unsigned disk_sectors; /* maximum size entry could have been, if
- buf_size was bigger */
- unsigned u64s_reserved;
- bool noflush:1; /* write has already been kicked off, and was noflush */
- bool must_flush:1; /* something wants a flush */
- bool separate_flush:1;
- bool need_flush_to_write_buffer:1;
- bool write_started:1;
- bool write_allocated:1;
- bool write_done:1;
- u8 idx;
-};
-
-/*
- * Something that makes a journal entry dirty - i.e. a btree node that has to be
- * flushed:
- */
-
-enum journal_pin_type {
- JOURNAL_PIN_TYPE_btree3,
- JOURNAL_PIN_TYPE_btree2,
- JOURNAL_PIN_TYPE_btree1,
- JOURNAL_PIN_TYPE_btree0,
- JOURNAL_PIN_TYPE_key_cache,
- JOURNAL_PIN_TYPE_other,
- JOURNAL_PIN_TYPE_NR,
-};
-
-struct journal_entry_pin_list {
- struct list_head unflushed[JOURNAL_PIN_TYPE_NR];
- struct list_head flushed[JOURNAL_PIN_TYPE_NR];
- atomic_t count;
- struct bch_devs_list devs;
-};
-
-struct journal;
-struct journal_entry_pin;
-typedef int (*journal_pin_flush_fn)(struct journal *j,
- struct journal_entry_pin *, u64);
-
-struct journal_entry_pin {
- struct list_head list;
- journal_pin_flush_fn flush;
- u64 seq;
-};
-
-struct journal_res {
- bool ref;
- u16 u64s;
- u32 offset;
- u64 seq;
-};
-
-union journal_res_state {
- struct {
- atomic64_t counter;
- };
-
- struct {
- u64 v;
- };
-
- struct {
- u64 cur_entry_offset:22,
- idx:2,
- buf0_count:10,
- buf1_count:10,
- buf2_count:10,
- buf3_count:10;
- };
-};
-
-/* bytes: */
-#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
-#define JOURNAL_ENTRY_SIZE_MAX (4U << 22) /* 16M */
-
-/*
- * We stash some journal state as sentinel values in cur_entry_offset:
- * note - cur_entry_offset is in units of u64s
- */
-#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 22) - 1)
-
-#define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2)
-#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
-#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX)
-
-struct journal_space {
-	/* Units of 512-byte sectors: */
- unsigned next_entry; /* How big the next journal entry can be */
- unsigned total;
-};
-
-enum journal_space_from {
- journal_space_discarded,
- journal_space_clean_ondisk,
- journal_space_clean,
- journal_space_total,
- journal_space_nr,
-};
-
-#define JOURNAL_FLAGS() \
- x(replay_done) \
- x(running) \
- x(may_skip_flush) \
- x(need_flush_write) \
- x(space_low)
-
-enum journal_flags {
-#define x(n) JOURNAL_##n,
- JOURNAL_FLAGS()
-#undef x
-};
-
-typedef DARRAY(u64) darray_u64;
-
-struct journal_bio {
- struct bch_dev *ca;
- unsigned buf_idx;
- u64 submit_time;
-
- struct bio bio;
-};
-
-/* Embedded in struct bch_fs */
-struct journal {
- /* Fastpath stuff up front: */
- struct {
-
- union journal_res_state reservations;
- enum bch_watermark watermark;
-
- } __aligned(SMP_CACHE_BYTES);
-
- unsigned long flags;
-
- /* Max size of current journal entry */
- unsigned cur_entry_u64s;
- unsigned cur_entry_sectors;
-
- /* Reserved space in journal entry to be used just prior to write */
- unsigned entry_u64s_reserved;
-
-
- /*
- * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
- * insufficient devices:
- */
- int cur_entry_error;
- unsigned cur_entry_offset_if_blocked;
-
- unsigned buf_size_want;
- /*
- * We may queue up some things to be journalled (log messages) before
- * the journal has actually started - stash them here:
- */
- darray_u64 early_journal_entries;
-
- /*
-	 * Protects journal_buf->data, when accessing without a journal
- * reservation: for synchronization between the btree write buffer code
- * and the journal write path:
- */
- struct mutex buf_lock;
- /*
-	 * Journal entry buffers: one is currently open for new entries, the
-	 * others are possibly being written out.
- */
- struct journal_buf buf[JOURNAL_BUF_NR];
- void *free_buf;
- unsigned free_buf_size;
-
- spinlock_t lock;
-
- /* if nonzero, we may not open a new journal entry: */
- unsigned blocked;
-
- /* Used when waiting because the journal was full */
- wait_queue_head_t wait;
- struct closure_waitlist async_wait;
- struct closure_waitlist reclaim_flush_wait;
-
- struct delayed_work write_work;
- struct workqueue_struct *wq;
-
- /* Sequence number of most recent journal entry (last entry in @pin) */
- atomic64_t seq;
-
- u64 seq_write_started;
- /* seq, last_seq from the most recent journal entry successfully written */
- u64 seq_ondisk;
- u64 flushed_seq_ondisk;
- u64 flushing_seq;
- u64 last_seq_ondisk;
- u64 err_seq;
- u64 last_empty_seq;
- u64 oldest_seq_found_ondisk;
-
- /*
- * FIFO of journal entries whose btree updates have not yet been
- * written out.
- *
- * Each entry is a reference count. The position in the FIFO is the
- * entry's sequence number relative to @seq.
- *
- * The journal entry itself holds a reference count, put when the
- * journal entry is written out. Each btree node modified by the journal
- * entry also holds a reference count, put when the btree node is
- * written.
- *
- * When a reference count reaches zero, the journal entry is no longer
- * needed. When all journal entries in the oldest journal bucket are no
- * longer needed, the bucket can be discarded and reused.
- */
- struct {
- u64 front, back, size, mask;
- struct journal_entry_pin_list *data;
- } pin;
-
- struct journal_space space[journal_space_nr];
-
- u64 replay_journal_seq;
- u64 replay_journal_seq_end;
-
- struct write_point wp;
- spinlock_t err_lock;
-
- struct mutex reclaim_lock;
- /*
- * Used for waiting until journal reclaim has freed up space in the
- * journal:
- */
- wait_queue_head_t reclaim_wait;
- struct task_struct *reclaim_thread;
- bool reclaim_kicked;
- unsigned long next_reclaim;
- u64 nr_direct_reclaim;
- u64 nr_background_reclaim;
-
- unsigned long last_flushed;
- struct journal_entry_pin *flush_in_progress;
- bool flush_in_progress_dropped;
- wait_queue_head_t pin_flush_wait;
-
- /* protects advancing ja->discard_idx: */
- struct mutex discard_lock;
- bool can_discard;
-
- unsigned long last_flush_write;
-
- u64 write_start_time;
-
- u64 nr_flush_writes;
- u64 nr_noflush_writes;
- u64 entry_bytes_written;
-
- struct bch2_time_stats *flush_write_time;
- struct bch2_time_stats *noflush_write_time;
- struct bch2_time_stats *flush_seq_time;
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map res_map;
-#endif
-} __aligned(SMP_CACHE_BYTES);
-
-/*
- * Embedded in struct bch_dev. First three fields refer to the array of journal
- * buckets, in bch_sb.
- */
-struct journal_device {
- /*
- * For each journal bucket, contains the max sequence number of the
- * journal writes it contains - so we know when a bucket can be reused.
- */
- u64 *bucket_seq;
-
- unsigned sectors_free;
-
- /*
- * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx:
- */
- unsigned discard_idx; /* Next bucket to discard */
- unsigned dirty_idx_ondisk;
- unsigned dirty_idx;
- unsigned cur_idx; /* Journal bucket we're currently writing to */
- unsigned nr;
-
- u64 *buckets;
-
- /* Bio for journal reads/writes to this device */
- struct journal_bio *bio[JOURNAL_BUF_NR];
-
- /* for bch_journal_read_device */
- struct closure read;
- u64 highest_seq_found;
-};
-
-/*
- * journal_entry_res - reserve space in every journal entry:
- */
-struct journal_entry_res {
- unsigned u64s;
-};
-
-#endif /* _BCACHEFS_JOURNAL_TYPES_H */
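
union journal_res_state above is the fast-path trick in this file: the open
entry's write offset (in u64s), the index of the open buffer, and four
per-buffer reference counts all share one 64-bit word, so a journal
reservation can be taken or released with a single atomic operation on
.counter; the JOURNAL_ENTRY_*_VAL sentinels ride in cur_entry_offset, which
is why they are defined relative to JOURNAL_ENTRY_OFFSET_MAX. A rough
userspace model of the packing; C bitfield layout is implementation-defined,
so this is illustrative only, not an ABI:

#include <stdio.h>
#include <stdint.h>

/* Standalone model of union journal_res_state: 22 + 2 + 4*10 = 64 bits. */
union res_state {
        uint64_t v;                             /* the word cmpxchg operates on */
        struct {
                uint64_t cur_entry_offset:22;   /* write offset, in u64s */
                uint64_t idx:2;                 /* which buf is open */
                uint64_t buf0_count:10;         /* outstanding refs per buf */
                uint64_t buf1_count:10;
                uint64_t buf2_count:10;
                uint64_t buf3_count:10;
        };
};

int main(void)
{
        union res_state old = { .v = 0 }, new;

        old.idx = 2;
        old.cur_entry_offset = 100;

        /* Taking a reservation for 16 u64s: build the new state, then
         * (in the kernel) cmpxchg the whole word from old.v to new.v. */
        new = old;
        new.cur_entry_offset += 16;
        new.buf2_count++;

        printf("old=%#llx new=%#llx\n",
               (unsigned long long) old.v, (unsigned long long) new.v);
        return 0;
}
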
diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c
deleted file mode 100644
index 1b828bddd11b..000000000000
--- a/fs/bcachefs/keylist.c
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey.h"
-#include "keylist.h"
-
-int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
- size_t nr_inline_u64s, size_t new_u64s)
-{
- size_t oldsize = bch2_keylist_u64s(l);
- size_t newsize = oldsize + new_u64s;
- u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p;
- u64 *new_keys;
-
- newsize = roundup_pow_of_two(newsize);
-
- if (newsize <= nr_inline_u64s ||
- (old_buf && roundup_pow_of_two(oldsize) == newsize))
- return 0;
-
- new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOFS);
- if (!new_keys)
- return -ENOMEM;
-
- if (!old_buf)
- memcpy_u64s(new_keys, inline_u64s, oldsize);
-
- l->keys_p = new_keys;
- l->top_p = new_keys + oldsize;
-
- return 0;
-}
-
-void bch2_keylist_pop_front(struct keylist *l)
-{
- l->top_p -= bch2_keylist_front(l)->k.u64s;
-
- memmove_u64s_down(l->keys,
- bkey_next(l->keys),
- bch2_keylist_u64s(l));
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_verify_keylist_sorted(struct keylist *l)
-{
- for_each_keylist_key(l, k)
- BUG_ON(bkey_next(k) != l->top &&
- bpos_ge(k->k.p, bkey_next(k)->k.p));
-}
-#endif
diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h
deleted file mode 100644
index e687e0e9aede..000000000000
--- a/fs/bcachefs/keylist.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_KEYLIST_H
-#define _BCACHEFS_KEYLIST_H
-
-#include "keylist_types.h"
-
-int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
-void bch2_keylist_pop_front(struct keylist *);
-
-static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys)
-{
- l->top_p = l->keys_p = inline_keys;
-}
-
-static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys)
-{
- if (l->keys_p != inline_keys)
- kfree(l->keys_p);
-}
-
-static inline void bch2_keylist_push(struct keylist *l)
-{
- l->top = bkey_next(l->top);
-}
-
-static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k)
-{
- bkey_copy(l->top, k);
- bch2_keylist_push(l);
-}
-
-static inline bool bch2_keylist_empty(struct keylist *l)
-{
- return l->top == l->keys;
-}
-
-static inline size_t bch2_keylist_u64s(struct keylist *l)
-{
- return l->top_p - l->keys_p;
-}
-
-static inline size_t bch2_keylist_bytes(struct keylist *l)
-{
- return bch2_keylist_u64s(l) * sizeof(u64);
-}
-
-static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
-{
- return l->keys;
-}
-
-#define for_each_keylist_key(_keylist, _k) \
- for (struct bkey_i *_k = (_keylist)->keys; \
- _k != (_keylist)->top; \
- _k = bkey_next(_k))
-
-static inline u64 keylist_sectors(struct keylist *keys)
-{
- u64 ret = 0;
-
- for_each_keylist_key(keys, k)
- ret += k->k.size;
- return ret;
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_verify_keylist_sorted(struct keylist *);
-#else
-static inline void bch2_verify_keylist_sorted(struct keylist *l) {}
-#endif
-
-#endif /* _BCACHEFS_KEYLIST_H */
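
The keylist above is a small growable buffer that starts in caller-provided
inline storage and spills to the heap only on overflow; the subtlety in
bch2_keylist_realloc() is that the inline buffer cannot be krealloc()'d, so
the first spill must allocate and copy instead. A simplified standalone
sketch of that pattern with fixed-size u64 elements instead of variable-length
bkeys; struct list and list_push() are hypothetical names:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

struct list {
        uint64_t *buf;          /* current storage */
        size_t used, cap;
        uint64_t *inline_buf;   /* so we know what not to free/realloc */
};

static int list_push(struct list *l, uint64_t v)
{
        if (l->used == l->cap) {
                size_t new_cap = l->cap * 2;
                uint64_t *n;

                /* can't realloc() the inline buffer: allocate and copy */
                n = l->buf == l->inline_buf
                        ? malloc(new_cap * sizeof(*n))
                        : realloc(l->buf, new_cap * sizeof(*n));
                if (!n)
                        return -1;
                if (l->buf == l->inline_buf)
                        memcpy(n, l->buf, l->used * sizeof(*n));
                l->buf = n;
                l->cap = new_cap;
        }
        l->buf[l->used++] = v;
        return 0;
}

int main(void)
{
        uint64_t inline_keys[4];
        struct list l = { inline_keys, 0, 4, inline_keys };

        for (uint64_t i = 0; i < 10; i++)
                list_push(&l, i);
        printf("used=%zu cap=%zu spilled=%d\n", l.used, l.cap,
               l.buf != l.inline_buf);          /* used=10 cap=16 spilled=1 */
        if (l.buf != l.inline_buf)
                free(l.buf);
        return 0;
}
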
diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h
deleted file mode 100644
index 4b3ff7d8a875..000000000000
--- a/fs/bcachefs/keylist_types.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_KEYLIST_TYPES_H
-#define _BCACHEFS_KEYLIST_TYPES_H
-
-struct keylist {
- union {
- struct bkey_i *keys;
- u64 *keys_p;
- };
- union {
- struct bkey_i *top;
- u64 *top_p;
- };
-};
-
-#endif /* _BCACHEFS_KEYLIST_TYPES_H */
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
deleted file mode 100644
index 75f27ec26f85..000000000000
--- a/fs/bcachefs/logged_ops.c
+++ /dev/null
@@ -1,119 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "error.h"
-#include "io_misc.h"
-#include "logged_ops.h"
-#include "super.h"
-
-struct bch_logged_op_fn {
- u8 type;
- int (*resume)(struct btree_trans *, struct bkey_i *);
-};
-
-static const struct bch_logged_op_fn logged_op_fns[] = {
-#define x(n) { \
- .type = KEY_TYPE_logged_op_##n, \
- .resume = bch2_resume_logged_op_##n, \
-},
- BCH_LOGGED_OPS()
-#undef x
-};
-
-static const struct bch_logged_op_fn *logged_op_fn(enum bch_bkey_type type)
-{
- for (unsigned i = 0; i < ARRAY_SIZE(logged_op_fns); i++)
- if (logged_op_fns[i].type == type)
- return logged_op_fns + i;
- return NULL;
-}
-
-static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- u32 restart_count = trans->restart_count;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- fsck_err_on(test_bit(BCH_FS_clean_recovery, &c->flags),
- trans, logged_op_but_clean,
- "filesystem marked as clean but have logged op\n%s",
- (bch2_bkey_val_to_text(&buf, c, k),
- buf.buf));
-
- struct bkey_buf sk;
- bch2_bkey_buf_init(&sk);
- bch2_bkey_buf_reassemble(&sk, c, k);
-
- const struct bch_logged_op_fn *fn = logged_op_fn(sk.k->k.type);
- if (fn)
- fn->resume(trans, sk.k);
-
- ret = bch2_logged_op_finish(trans, sk.k);
-
- bch2_bkey_buf_exit(&sk, c);
-fsck_err:
- printbuf_exit(&buf);
- return ret ?: trans_was_restarted(trans, restart_count);
-}
-
-int bch2_resume_logged_ops(struct bch_fs *c)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter,
- BTREE_ID_logged_ops,
- POS(LOGGED_OPS_INUM_logged_ops, 0),
- POS(LOGGED_OPS_INUM_logged_ops, U64_MAX),
- BTREE_ITER_prefetch, k,
- resume_logged_op(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
-{
- struct btree_iter iter;
- int ret = bch2_bkey_get_empty_slot(trans, &iter,
- BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM_logged_ops, U64_MAX));
- if (ret)
- return ret;
-
- k->k.p = iter.pos;
-
- ret = bch2_trans_update(trans, &iter, k, 0);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
-{
- return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_logged_op_start(trans, k));
-}
-
-int bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
-{
- int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0));
- /*
- * This needs to be a fatal error because we've left an unfinished
- * operation in the logged ops btree.
- *
- * We should only ever see an error here if the filesystem has already
- * been shut down, but make sure of that here:
- */
- if (ret) {
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
- bch2_fs_fatal_error(c, "deleting logged operation %s: %s",
- buf.buf, bch2_err_str(ret));
- printbuf_exit(&buf);
- }
-
- return ret;
-}
diff --git a/fs/bcachefs/logged_ops.h b/fs/bcachefs/logged_ops.h
deleted file mode 100644
index 30ae9ef737dd..000000000000
--- a/fs/bcachefs/logged_ops.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_LOGGED_OPS_H
-#define _BCACHEFS_LOGGED_OPS_H
-
-#include "bkey.h"
-
-#define BCH_LOGGED_OPS() \
- x(truncate) \
- x(finsert)
-
-static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op)
-{
- return bch2_btree_insert_nonextent(trans, BTREE_ID_logged_ops, op, 0);
-}
-
-int bch2_resume_logged_ops(struct bch_fs *);
-int bch2_logged_op_start(struct btree_trans *, struct bkey_i *);
-int bch2_logged_op_finish(struct btree_trans *, struct bkey_i *);
-
-#endif /* _BCACHEFS_LOGGED_OPS_H */
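
BCH_LOGGED_OPS() above is an x-macro list: logged_ops.c expands it once to
build the logged_op_fns dispatch table from the KEY_TYPE_logged_op_* handlers,
so the set of logged operations and the table that resumes them cannot drift
apart. A self-contained sketch of the same pattern; OPS(), OP_* and
resume_*() are made-up stand-ins:

#include <stdio.h>

#define OPS()           \
        x(truncate)     \
        x(finsert)

enum op_type {
#define x(n) OP_##n,
        OPS()
#undef x
};

static int resume_truncate(void) { printf("resume truncate\n"); return 0; }
static int resume_finsert(void)  { printf("resume finsert\n");  return 0; }

/* One expansion of the list builds the whole dispatch table: */
static const struct { enum op_type type; int (*resume)(void); } op_fns[] = {
#define x(n) { .type = OP_##n, .resume = resume_##n },
        OPS()
#undef x
};

int main(void)
{
        enum op_type found = OP_finsert;        /* e.g. read back at recovery */

        for (unsigned i = 0; i < sizeof(op_fns) / sizeof(op_fns[0]); i++)
                if (op_fns[i].type == found)
                        return op_fns[i].resume();
        return 1;
}

Adding an operation is then one line in OPS() plus its handler; forgetting the
handler is a compile error rather than a silent hole in the table.
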
diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h
deleted file mode 100644
index cfb67c95d4c8..000000000000
--- a/fs/bcachefs/logged_ops_format.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H
-#define _BCACHEFS_LOGGED_OPS_FORMAT_H
-
-enum logged_ops_inums {
- LOGGED_OPS_INUM_logged_ops,
- LOGGED_OPS_INUM_inode_cursors,
-};
-
-struct bch_logged_op_truncate {
- struct bch_val v;
- __le32 subvol;
- __le32 pad;
- __le64 inum;
- __le64 new_i_size;
-};
-
-enum logged_op_finsert_state {
- LOGGED_OP_FINSERT_start,
- LOGGED_OP_FINSERT_shift_extents,
- LOGGED_OP_FINSERT_finish,
-};
-
-struct bch_logged_op_finsert {
- struct bch_val v;
- __u8 state;
- __u8 pad[3];
- __le32 subvol;
- __le64 inum;
- __le64 dst_offset;
- __le64 src_offset;
- __le64 pos;
-};
-
-#endif /* _BCACHEFS_LOGGED_OPS_FORMAT_H */
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
deleted file mode 100644
index a299d9ec8ee4..000000000000
--- a/fs/bcachefs/lru.c
+++ /dev/null
@@ -1,226 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "bkey_buf.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "ec.h"
-#include "error.h"
-#include "lru.h"
-#include "recovery.h"
-
-/* KEY_TYPE_lru is obsolete: */
-int bch2_lru_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(!lru_pos_time(k.k->p),
- c, lru_entry_at_time_0,
- "lru entry at time=0");
-fsck_err:
- return ret;
-}
-
-void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- const struct bch_lru *lru = bkey_s_c_to_lru(k).v;
-
- prt_printf(out, "idx %llu", le64_to_cpu(lru->idx));
-}
-
-void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru)
-{
- prt_printf(out, "%llu:%llu -> %llu:%llu",
- lru_pos_id(lru),
- lru_pos_time(lru),
- u64_to_bucket(lru.offset).inode,
- u64_to_bucket(lru.offset).offset);
-}
-
-static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
- u64 dev_bucket, u64 time, bool set)
-{
- return time
- ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru,
- lru_pos(lru_id, dev_bucket, time), set)
- : 0;
-}
-
-int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
-{
- return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted);
-}
-
-int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
-{
- return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set);
-}
-
-int __bch2_lru_change(struct btree_trans *trans,
- u16 lru_id, u64 dev_bucket,
- u64 old_time, u64 new_time)
-{
- if (old_time == new_time)
- return 0;
-
- return bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?:
- bch2_lru_set(trans, lru_id, dev_bucket, new_time);
-}
-
-static const char * const bch2_lru_types[] = {
-#define x(n) #n,
- BCH_LRU_TYPES()
-#undef x
- NULL
-};
-
-int bch2_lru_check_set(struct btree_trans *trans,
- u16 lru_id,
- u64 dev_bucket,
- u64 time,
- struct bkey_s_c referring_k,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- struct btree_iter lru_iter;
- struct bkey_s_c lru_k =
- bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
- lru_pos(lru_id, dev_bucket, time), 0);
- int ret = bkey_err(lru_k);
- if (ret)
- return ret;
-
- if (lru_k.k->type != KEY_TYPE_set) {
- ret = bch2_btree_write_buffer_maybe_flush(trans, referring_k, last_flushed);
- if (ret)
- goto err;
-
- if (fsck_err(trans, alloc_key_to_missing_lru_entry,
- "missing %s lru entry\n"
- " %s",
- bch2_lru_types[lru_type(lru_k)],
- (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) {
- ret = bch2_lru_set(trans, lru_id, dev_bucket, time);
- if (ret)
- goto err;
- }
- }
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &lru_iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k)
-{
- enum bch_lru_type type = lru_type(lru_k);
-
- switch (type) {
- case BCH_LRU_read:
- case BCH_LRU_fragmentation:
- return BBPOS(BTREE_ID_alloc, u64_to_bucket(lru_k.k->p.offset));
- case BCH_LRU_stripes:
- return BBPOS(BTREE_ID_stripes, POS(0, lru_k.k->p.offset));
- default:
- BUG();
- }
-}
-
-static u64 bkey_lru_type_idx(struct bch_fs *c,
- enum bch_lru_type type,
- struct bkey_s_c k)
-{
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
-
- switch (type) {
- case BCH_LRU_read:
- a = bch2_alloc_to_v4(k, &a_convert);
- return alloc_lru_idx_read(*a);
- case BCH_LRU_fragmentation: {
- a = bch2_alloc_to_v4(k, &a_convert);
-
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode);
- u64 idx = ca
- ? alloc_lru_idx_fragmentation(*a, ca)
- : 0;
- rcu_read_unlock();
- return idx;
- }
- case BCH_LRU_stripes:
- return k.k->type == KEY_TYPE_stripe
- ? stripe_lru_pos(bkey_s_c_to_stripe(k).v)
- : 0;
- default:
- BUG();
- }
-}
-
-static int bch2_check_lru_key(struct btree_trans *trans,
- struct btree_iter *lru_iter,
- struct bkey_s_c lru_k,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
-
- struct bbpos bp = lru_pos_to_bp(lru_k);
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, bp.btree, bp.pos, 0);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- enum bch_lru_type type = lru_type(lru_k);
- u64 idx = bkey_lru_type_idx(c, type, k);
-
- if (lru_pos_time(lru_k.k->p) != idx) {
- ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed);
- if (ret)
- goto err;
-
- if (fsck_err(trans, lru_entry_bad,
- "incorrect lru entry: lru %s time %llu\n"
- " %s\n"
- " for %s",
- bch2_lru_types[type],
- lru_pos_time(lru_k.k->p),
- (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
- (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf)))
- ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
- }
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- return ret;
-}
-
-int bch2_check_lrus(struct bch_fs *c)
-{
- struct bkey_buf last_flushed;
-
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_lru_key(trans, &iter, k, &last_flushed)));
-
- bch2_bkey_buf_exit(&last_flushed, c);
- bch_err_fn(c, ret);
- return ret;
-
-}
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
deleted file mode 100644
index 8abd0aa2083a..000000000000
--- a/fs/bcachefs/lru.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_LRU_H
-#define _BCACHEFS_LRU_H
-
-static inline u64 lru_pos_id(struct bpos pos)
-{
- return pos.inode >> LRU_TIME_BITS;
-}
-
-static inline u64 lru_pos_time(struct bpos pos)
-{
- return pos.inode & ~(~0ULL << LRU_TIME_BITS);
-}
-
-static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time)
-{
- struct bpos pos = POS(((u64) lru_id << LRU_TIME_BITS)|time, dev_bucket);
-
- EBUG_ON(time > LRU_TIME_MAX);
- EBUG_ON(lru_pos_id(pos) != lru_id);
- EBUG_ON(lru_pos_time(pos) != time);
- EBUG_ON(pos.offset != dev_bucket);
-
- return pos;
-}
-
-static inline enum bch_lru_type lru_type(struct bkey_s_c l)
-{
- u16 lru_id = l.k->p.inode >> 48;
-
- switch (lru_id) {
- case BCH_LRU_BUCKET_FRAGMENTATION:
- return BCH_LRU_fragmentation;
- case BCH_LRU_STRIPE_FRAGMENTATION:
- return BCH_LRU_stripes;
- default:
- return BCH_LRU_read;
- }
-}
-
-int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context);
-void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-void bch2_lru_pos_to_text(struct printbuf *, struct bpos);
-
-#define bch2_bkey_ops_lru ((struct bkey_ops) { \
- .key_validate = bch2_lru_validate, \
- .val_to_text = bch2_lru_to_text, \
- .min_val_size = 8, \
-})
-
-int bch2_lru_del(struct btree_trans *, u16, u64, u64);
-int bch2_lru_set(struct btree_trans *, u16, u64, u64);
-int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
-
-static inline int bch2_lru_change(struct btree_trans *trans,
- u16 lru_id, u64 dev_bucket,
- u64 old_time, u64 new_time)
-{
- return old_time != new_time
- ? __bch2_lru_change(trans, lru_id, dev_bucket, old_time, new_time)
- : 0;
-}
-
-struct bkey_buf;
-int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *);
-
-int bch2_check_lrus(struct bch_fs *);
-
-#endif /* _BCACHEFS_LRU_H */
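
lru_pos() above packs a 16-bit LRU id and a 48-bit time into the 64-bit inode
field of the key's position, so LRU entries sort first by id and then by time
within an id; the EBUG_ONs assert that the packing round-trips. A standalone
model of the packing using a plain u64 in place of struct bpos:

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define LRU_TIME_BITS   48

static uint64_t lru_inode(uint16_t lru_id, uint64_t time)
{
        assert(!(time >> LRU_TIME_BITS));       /* time must fit in 48 bits */
        return ((uint64_t) lru_id << LRU_TIME_BITS) | time;
}

static unsigned lru_pos_id(uint64_t inode)   { return inode >> LRU_TIME_BITS; }
static uint64_t lru_pos_time(uint64_t inode) { return inode & ((1ULL << LRU_TIME_BITS) - 1); }

int main(void)
{
        uint64_t v = lru_inode(7, 123456);

        printf("id=%u time=%llu\n", lru_pos_id(v),
               (unsigned long long) lru_pos_time(v));   /* id=7 time=123456 */
        return 0;
}

This is also why BCH_LRU_BUCKET_FRAGMENTATION and BCH_LRU_STRIPE_FRAGMENTATION
are reserved values at the top of the 16-bit id space: lru_type() can classify
an entry from the id bits alone.
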
diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h
deleted file mode 100644
index b7392ad8e41f..000000000000
--- a/fs/bcachefs/lru_format.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_LRU_FORMAT_H
-#define _BCACHEFS_LRU_FORMAT_H
-
-struct bch_lru {
- struct bch_val v;
- __le64 idx;
-} __packed __aligned(8);
-
-#define BCH_LRU_TYPES() \
- x(read) \
- x(fragmentation) \
- x(stripes)
-
-enum bch_lru_type {
-#define x(n) BCH_LRU_##n,
- BCH_LRU_TYPES()
-#undef x
-};
-
-#define BCH_LRU_BUCKET_FRAGMENTATION ((1U << 16) - 1)
-#define BCH_LRU_STRIPE_FRAGMENTATION ((1U << 16) - 2)
-
-#define LRU_TIME_BITS 48
-#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
-
-#endif /* _BCACHEFS_LRU_FORMAT_H */
diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c
deleted file mode 100644
index 0ea9f30803a2..000000000000
--- a/fs/bcachefs/mean_and_variance.c
+++ /dev/null
@@ -1,173 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Functions for incremental mean and variance.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * Copyright © 2022 Daniel B. Hill
- *
- * Author: Daniel B. Hill <daniel@gluo.nz>
- *
- * Description:
- *
- * This includes some incremental algorithms for mean and variance calculation.
- *
- * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf
- *
- * Create a struct; for the weighted variant, pass the same ewma weight
- * (weight = 2^k) to every call.
- *
- * Use mean_and_variance[_weighted]_update() on the struct to update its state.
- *
- * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean
- * and variance; some computation is deferred to these functions for
- * performance reasons.
- *
- * See lib/math/mean_and_variance_test.c for examples of usage.
- *
- * DO NOT access the mean and variance fields of the weighted variants directly.
- * DO NOT change the weight after calling update.
- */
-
-#include <linux/bug.h>
-#include <linux/compiler.h>
-#include <linux/export.h>
-#include <linux/limits.h>
-#include <linux/math.h>
-#include <linux/math64.h>
-#include <linux/module.h>
-
-#include "mean_and_variance.h"
-
-u128_u u128_div(u128_u n, u64 d)
-{
- u128_u r;
- u64 rem;
- u64 hi = u128_hi(n);
- u64 lo = u128_lo(n);
- u64 h = hi & ((u64) U32_MAX << 32);
- u64 l = (hi & (u64) U32_MAX) << 32;
-
- r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64);
- r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32));
- r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem)));
- return r;
-}
-EXPORT_SYMBOL_GPL(u128_div);
-
-/**
- * mean_and_variance_get_mean() - get mean from @s
- * @s: mean and variance number of samples and their sums
- */
-s64 mean_and_variance_get_mean(struct mean_and_variance s)
-{
- return s.n ? div64_u64(s.sum, s.n) : 0;
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_get_mean);
-
-/**
- * mean_and_variance_get_variance() - get variance from @s1
- * @s1: mean and variance number of samples and sums
- *
- * see linked pdf equation 12.
- */
-u64 mean_and_variance_get_variance(struct mean_and_variance s1)
-{
- if (s1.n) {
- u128_u s2 = u128_div(s1.sum_squares, s1.n);
- u64 s3 = abs(mean_and_variance_get_mean(s1));
-
- return u128_lo(u128_sub(s2, u128_square(s3)));
- } else {
- return 0;
- }
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_get_variance);
-
-/**
- * mean_and_variance_get_stddev() - get standard deviation from @s
- * @s: mean and variance number of samples and their sums
- */
-u32 mean_and_variance_get_stddev(struct mean_and_variance s)
-{
- return int_sqrt64(mean_and_variance_get_variance(s));
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
-
-/**
- * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update()
- * @s: mean and variance number of samples and their sums
- * @x: new value to include in the &mean_and_variance_weighted
- * @initted: caller must track whether this is the first use or not
- * @weight: ewma weight
- *
- * see linked pdf: function derived from equations 140-143 where alpha = 2^w.
- * values are stored bitshifted for performance and added precision.
- */
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
- s64 x, bool initted, u8 weight)
-{
- // previous weighted variance.
- u8 w = weight;
- u64 var_w0 = s->variance;
- // new value weighted.
- s64 x_w = x << w;
- s64 diff_w = x_w - s->mean;
- s64 diff = fast_divpow2(diff_w, w);
- // new mean weighted.
- s64 u_w1 = s->mean + diff;
-
- if (!initted) {
- s->mean = x_w;
- s->variance = 0;
- } else {
- s->mean = u_w1;
- s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
- }
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
-
-/**
- * mean_and_variance_weighted_get_mean() - get mean from @s
- * @s: mean and variance number of samples and their sums
- * @weight: ewma weight
- */
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
- u8 weight)
-{
- return fast_divpow2(s.mean, weight);
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
-
-/**
- * mean_and_variance_weighted_get_variance() -- get variance from @s
- * @s: mean and variance number of samples and their sums
- * @weight: ewma weight
- */
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
- u8 weight)
-{
-	// always positive; don't need fast_divpow2
- return s.variance >> weight;
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
-
-/**
- * mean_and_variance_weighted_get_stddev() - get standard deviation from @s
- * @s: mean and variance number of samples and their sums
- * @weight: ewma weight
- */
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
- u8 weight)
-{
- return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight));
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev);
-
-MODULE_AUTHOR("Daniel B. Hill");
-MODULE_LICENSE("GPL");
diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h
deleted file mode 100644
index 47e4a3c3d26e..000000000000
--- a/fs/bcachefs/mean_and_variance.h
+++ /dev/null
@@ -1,203 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef MEAN_AND_VARIANCE_H_
-#define MEAN_AND_VARIANCE_H_
-
-#include <linux/types.h>
-#include <linux/limits.h>
-#include <linux/math.h>
-#include <linux/math64.h>
-
-#define SQRT_U64_MAX 4294967295ULL
-
-/*
- * u128_u: u128 user mode, because not all architectures support a real int128
- * type
- *
- * We don't use this version in userspace, because in userspace we link with
- * Rust and rustc has issues with u128.
- */
-
-#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) && !defined(CONFIG_PARISC)
-
-typedef struct {
- unsigned __int128 v;
-} __aligned(16) u128_u;
-
-static inline u128_u u64_to_u128(u64 a)
-{
- return (u128_u) { .v = a };
-}
-
-static inline u64 u128_lo(u128_u a)
-{
- return a.v;
-}
-
-static inline u64 u128_hi(u128_u a)
-{
- return a.v >> 64;
-}
-
-static inline u128_u u128_add(u128_u a, u128_u b)
-{
- a.v += b.v;
- return a;
-}
-
-static inline u128_u u128_sub(u128_u a, u128_u b)
-{
- a.v -= b.v;
- return a;
-}
-
-static inline u128_u u128_shl(u128_u a, s8 shift)
-{
- a.v <<= shift;
- return a;
-}
-
-static inline u128_u u128_square(u64 a)
-{
- u128_u b = u64_to_u128(a);
-
- b.v *= b.v;
- return b;
-}
-
-#else
-
-typedef struct {
- u64 hi, lo;
-} __aligned(16) u128_u;
-
-/* conversions */
-
-static inline u128_u u64_to_u128(u64 a)
-{
- return (u128_u) { .lo = a };
-}
-
-static inline u64 u128_lo(u128_u a)
-{
- return a.lo;
-}
-
-static inline u64 u128_hi(u128_u a)
-{
- return a.hi;
-}
-
-/* arithmetic */
-
-static inline u128_u u128_add(u128_u a, u128_u b)
-{
- u128_u c;
-
- c.lo = a.lo + b.lo;
- c.hi = a.hi + b.hi + (c.lo < a.lo);
- return c;
-}
-
-static inline u128_u u128_sub(u128_u a, u128_u b)
-{
- u128_u c;
-
- c.lo = a.lo - b.lo;
- c.hi = a.hi - b.hi - (c.lo > a.lo);
- return c;
-}
-
-static inline u128_u u128_shl(u128_u i, s8 shift)
-{
- u128_u r;
-
- r.lo = i.lo << (shift & 63);
- if (shift < 64)
- r.hi = (i.hi << (shift & 63)) | (i.lo >> (-shift & 63));
- else {
- r.hi = i.lo << (-shift & 63);
- r.lo = 0;
- }
- return r;
-}
-
-static inline u128_u u128_square(u64 i)
-{
- u128_u r;
- u64 h = i >> 32, l = i & U32_MAX;
-
- r = u128_shl(u64_to_u128(h*h), 64);
- r = u128_add(r, u128_shl(u64_to_u128(h*l), 32));
- r = u128_add(r, u128_shl(u64_to_u128(l*h), 32));
- r = u128_add(r, u64_to_u128(l*l));
- return r;
-}
-
-#endif
-
-static inline u128_u u64s_to_u128(u64 hi, u64 lo)
-{
- u128_u c = u64_to_u128(hi);
-
- c = u128_shl(c, 64);
- c = u128_add(c, u64_to_u128(lo));
- return c;
-}
-
-u128_u u128_div(u128_u n, u64 d);
-
-struct mean_and_variance {
- s64 n;
- s64 sum;
- u128_u sum_squares;
-};
-
-/* exponentially weighted variant */
-struct mean_and_variance_weighted {
- s64 mean;
- u64 variance;
-};
-
-/**
- * fast_divpow2() - fast approximation for n / (1 << d)
- * @n: numerator
- * @d: the power of 2 denominator.
- *
- * note: this rounds towards 0.
- */
-static inline s64 fast_divpow2(s64 n, u8 d)
-{
- return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
-}
-
-/**
- * mean_and_variance_update() - update a mean_and_variance struct @s with a new
- * sample @v
- * @s: the mean_and_variance to update.
- * @v: the new sample.
- *
- * see linked pdf equation 12.
- */
-static inline void
-mean_and_variance_update(struct mean_and_variance *s, s64 v)
-{
- s->n++;
- s->sum += v;
- s->sum_squares = u128_add(s->sum_squares, u128_square(abs(v)));
-}
-
-s64 mean_and_variance_get_mean(struct mean_and_variance s);
-u64 mean_and_variance_get_variance(struct mean_and_variance s1);
-u32 mean_and_variance_get_stddev(struct mean_and_variance s);
-
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
- s64 v, bool initted, u8 weight);
-
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
- u8 weight);
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
- u8 weight);
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
- u8 weight);
-
-#endif // MEAN_AND_VARIANCE_H_
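
fast_divpow2() above needs its bias term because an arithmetic right shift of
a negative number rounds toward negative infinity, while C integer division
rounds toward zero; adding (1 << d) - 1 before shifting makes the two agree.
A quick demonstration (this assumes arithmetic right shift of signed values,
which the kernel requires of its compilers):

#include <stdio.h>
#include <stdint.h>

static int64_t fast_divpow2(int64_t n, uint8_t d)
{
        return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
}

int main(void)
{
        printf("%lld %lld %lld\n",
               (long long) (-7 >> 2),           /* -2: rounds toward -inf */
               (long long) fast_divpow2(-7, 2), /* -1: rounds toward 0 */
               (long long) (-7 / 4));           /* -1: matches C division */
        return 0;
}
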
diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c
deleted file mode 100644
index e9d9c0212e44..000000000000
--- a/fs/bcachefs/mean_and_variance_test.c
+++ /dev/null
@@ -1,221 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <kunit/test.h>
-
-#include "mean_and_variance.h"
-
-#define MAX_SQR (SQRT_U64_MAX*SQRT_U64_MAX)
-
-static void mean_and_variance_basic_test(struct kunit *test)
-{
- struct mean_and_variance s = {};
-
- mean_and_variance_update(&s, 2);
- mean_and_variance_update(&s, 2);
-
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 2);
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 0);
- KUNIT_EXPECT_EQ(test, s.n, 2);
-
- mean_and_variance_update(&s, 4);
- mean_and_variance_update(&s, 4);
-
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 3);
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 1);
- KUNIT_EXPECT_EQ(test, s.n, 4);
-}
-
-/*
- * Test values computed using a spreadsheet from the pseudocode at the bottom:
- * https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf
- */
-
-static void mean_and_variance_weighted_test(struct kunit *test)
-{
- struct mean_and_variance_weighted s = { };
-
- mean_and_variance_weighted_update(&s, 10, false, 2);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
-
- mean_and_variance_weighted_update(&s, 20, true, 2);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 12);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
-
- mean_and_variance_weighted_update(&s, 30, true, 2);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 16);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
-
- s = (struct mean_and_variance_weighted) { };
-
- mean_and_variance_weighted_update(&s, -10, false, 2);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
-
- mean_and_variance_weighted_update(&s, -20, true, 2);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -12);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
-
- mean_and_variance_weighted_update(&s, -30, true, 2);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -16);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
-}
-
-static void mean_and_variance_weighted_advanced_test(struct kunit *test)
-{
- struct mean_and_variance_weighted s = { };
- bool initted = false;
- s64 i;
-
- for (i = 10; i <= 100; i += 10) {
- mean_and_variance_weighted_update(&s, i, initted, 8);
- initted = true;
- }
-
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), 11);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107);
-
- s = (struct mean_and_variance_weighted) { };
- initted = false;
-
- for (i = -10; i >= -100; i -= 10) {
- mean_and_variance_weighted_update(&s, i, initted, 8);
- initted = true;
- }
-
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), -11);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107);
-}
-
-static void do_mean_and_variance_test(struct kunit *test,
- s64 initial_value,
- s64 initial_n,
- s64 n,
- unsigned weight,
- s64 *data,
- s64 *mean,
- s64 *stddev,
- s64 *weighted_mean,
- s64 *weighted_stddev)
-{
- struct mean_and_variance mv = {};
- struct mean_and_variance_weighted vw = { };
-
- for (unsigned i = 0; i < initial_n; i++) {
- mean_and_variance_update(&mv, initial_value);
- mean_and_variance_weighted_update(&vw, initial_value, false, weight);
-
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value);
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), initial_value);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),0);
- }
-
- for (unsigned i = 0; i < n; i++) {
- mean_and_variance_update(&mv, data[i]);
- mean_and_variance_weighted_update(&vw, data[i], true, weight);
-
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]);
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), weighted_mean[i]);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),weighted_stddev[i]);
- }
-
- KUNIT_EXPECT_EQ(test, mv.n, initial_n + n);
-}
-
-/* Test behaviour with a single outlier, then back to steady state: */
-static void mean_and_variance_test_1(struct kunit *test)
-{
- s64 d[] = { 100, 10, 10, 10, 10, 10, 10 };
- s64 mean[] = { 22, 21, 20, 19, 18, 17, 16 };
- s64 stddev[] = { 32, 29, 28, 27, 26, 25, 24 };
- s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 };
- s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 };
-
- do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
- d, mean, stddev, weighted_mean, weighted_stddev);
-}
-
-/* Test behaviour where we switch from one steady state to another: */
-static void mean_and_variance_test_2(struct kunit *test)
-{
- s64 d[] = { 100, 100, 100, 100, 100 };
- s64 mean[] = { 22, 32, 40, 46, 50 };
- s64 stddev[] = { 32, 39, 42, 44, 45 };
- s64 weighted_mean[] = { 32, 49, 61, 71, 78 };
- s64 weighted_stddev[] = { 38, 44, 44, 41, 38 };
-
- do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
- d, mean, stddev, weighted_mean, weighted_stddev);
-}
-
-static void mean_and_variance_fast_divpow2(struct kunit *test)
-{
- s64 i;
- u8 d;
-
- for (i = 0; i < 100; i++) {
- d = 0;
- KUNIT_EXPECT_EQ(test, fast_divpow2(i, d), div_u64(i, 1LLU << d));
- KUNIT_EXPECT_EQ(test, abs(fast_divpow2(-i, d)), div_u64(i, 1LLU << d));
- for (d = 1; d < 32; d++) {
- KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(i, d)),
- div_u64(i, 1 << d), "%lld %u", i, d);
- KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(-i, d)),
- div_u64(i, 1 << d), "%lld %u", -i, d);
- }
- }
-}
-
-static void mean_and_variance_u128_basic_test(struct kunit *test)
-{
- u128_u a = u64s_to_u128(0, U64_MAX);
- u128_u a1 = u64s_to_u128(0, 1);
- u128_u b = u64s_to_u128(1, 0);
- u128_u c = u64s_to_u128(0, 1LLU << 63);
- u128_u c2 = u64s_to_u128(U64_MAX, U64_MAX);
-
- KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a, a1)), 1);
- KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a, a1)), 0);
- KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a1, a)), 1);
- KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a1, a)), 0);
-
- KUNIT_EXPECT_EQ(test, u128_lo(u128_sub(b, a1)), U64_MAX);
- KUNIT_EXPECT_EQ(test, u128_hi(u128_sub(b, a1)), 0);
-
- KUNIT_EXPECT_EQ(test, u128_hi(u128_shl(c, 1)), 1);
- KUNIT_EXPECT_EQ(test, u128_lo(u128_shl(c, 1)), 0);
-
- KUNIT_EXPECT_EQ(test, u128_hi(u128_square(U64_MAX)), U64_MAX - 1);
- KUNIT_EXPECT_EQ(test, u128_lo(u128_square(U64_MAX)), 1);
-
- KUNIT_EXPECT_EQ(test, u128_lo(u128_div(b, 2)), 1LLU << 63);
-
- KUNIT_EXPECT_EQ(test, u128_hi(u128_div(c2, 2)), U64_MAX >> 1);
- KUNIT_EXPECT_EQ(test, u128_lo(u128_div(c2, 2)), U64_MAX);
-
- KUNIT_EXPECT_EQ(test, u128_hi(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U32_MAX >> 1);
- KUNIT_EXPECT_EQ(test, u128_lo(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U64_MAX << 31);
-}
-
-static struct kunit_case mean_and_variance_test_cases[] = {
- KUNIT_CASE(mean_and_variance_fast_divpow2),
- KUNIT_CASE(mean_and_variance_u128_basic_test),
- KUNIT_CASE(mean_and_variance_basic_test),
- KUNIT_CASE(mean_and_variance_weighted_test),
- KUNIT_CASE(mean_and_variance_weighted_advanced_test),
- KUNIT_CASE(mean_and_variance_test_1),
- KUNIT_CASE(mean_and_variance_test_2),
- {}
-};
-
-static struct kunit_suite mean_and_variance_test_suite = {
- .name = "mean and variance tests",
- .test_cases = mean_and_variance_test_cases
-};
-
-kunit_test_suite(mean_and_variance_test_suite);
-
-MODULE_AUTHOR("Daniel B. Hill");
-MODULE_DESCRIPTION("bcachefs filesystem mean and variance unit tests");
-MODULE_LICENSE("GPL");
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
deleted file mode 100644
index 57ad662871ba..000000000000
--- a/fs/bcachefs/migrate.c
+++ /dev/null
@@ -1,188 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Code for moving data off a device.
- */
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "errcode.h"
-#include "extents.h"
-#include "io_write.h"
-#include "journal.h"
-#include "keylist.h"
-#include "migrate.h"
-#include "move.h"
-#include "progress.h"
-#include "replicas.h"
-#include "super-io.h"
-
-static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
- unsigned dev_idx, int flags, bool metadata)
-{
- unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
- unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
- unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
- unsigned nr_good;
-
- bch2_bkey_drop_device(k, dev_idx);
-
- nr_good = bch2_bkey_durability(c, k.s_c);
- if ((!nr_good && !(flags & lost)) ||
- (nr_good < replicas && !(flags & degraded)))
- return -BCH_ERR_remove_would_lose_data;
-
- return 0;
-}
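-
-/*
- * E.g., assuming data_replicas=2 and each pointer having durability 1:
- * dropping one of two pointers leaves nr_good == 1 < 2, which requires
- * BCH_FORCE_IF_DATA_DEGRADED; dropping the only pointer leaves
- * nr_good == 0, which additionally requires BCH_FORCE_IF_DATA_LOST.
- */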
-
-static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- unsigned dev_idx,
- int flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_i *n;
- int ret;
-
- if (!bch2_bkey_has_device_c(k, dev_idx))
- return 0;
-
- n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_internal_snapshot_node);
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- return ret;
-
- ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false);
- if (ret)
- return ret;
-
- /*
- * If the new extent no longer has any pointers, bch2_extent_normalize()
- * will do the appropriate thing with it (turning it into a
- * KEY_TYPE_error key, or just a discard if it was a cached extent)
- */
- bch2_extent_normalize(c, bkey_i_to_s(n));
-
- /*
- * Since we're not inserting through an extent iterator
- * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
- * we aren't using the extent overwrite path to delete, we're
- * just using the normal key deletion path:
- */
- if (bkey_deleted(&n->k))
- n->k.size = 0;
- return 0;
-}
-
-static int bch2_dev_usrdata_drop(struct bch_fs *c,
- struct progress_indicator_state *progress,
- unsigned dev_idx, int flags)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- enum btree_id id;
- int ret = 0;
-
- for (id = 0; id < BTREE_ID_NR; id++) {
- if (!btree_type_has_ptrs(id))
- continue;
-
- ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- bch2_progress_update_iter(trans, progress, &iter, "dropping user data");
- bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags);
- }));
- if (ret)
- break;
- }
-
- bch2_trans_put(trans);
-
- return ret;
-}
-
-static int bch2_dev_metadata_drop(struct bch_fs *c,
- struct progress_indicator_state *progress,
- unsigned dev_idx, int flags)
-{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct closure cl;
- struct btree *b;
- struct bkey_buf k;
- unsigned id;
- int ret;
-
- /* don't handle this yet: */
- if (flags & BCH_FORCE_IF_METADATA_LOST)
- return -BCH_ERR_remove_with_metadata_missing_unimplemented;
-
- trans = bch2_trans_get(c);
- bch2_bkey_buf_init(&k);
- closure_init_stack(&cl);
-
- for (id = 0; id < BTREE_ID_NR; id++) {
- bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
- BTREE_ITER_prefetch);
-retry:
- ret = 0;
- while (bch2_trans_begin(trans),
- (b = bch2_btree_iter_peek_node(&iter)) &&
- !(ret = PTR_ERR_OR_ZERO(b))) {
- bch2_progress_update_iter(trans, progress, &iter, "dropping metadata");
-
- if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
- goto next;
-
- bch2_bkey_buf_copy(&k, c, &b->key);
-
- ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
- dev_idx, flags, true);
- if (ret)
- break;
-
- ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- ret = 0;
- continue;
- }
-
- bch_err_msg(c, ret, "updating btree node key");
- if (ret)
- break;
-next:
- bch2_btree_iter_next_node(&iter);
- }
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- goto err;
- }
-
- bch2_btree_interior_updates_flush(c);
- ret = 0;
-err:
- bch2_bkey_buf_exit(&k, c);
- bch2_trans_put(trans);
-
- BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
-
- return ret;
-}
-
-int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
-{
- struct progress_indicator_state progress;
- bch2_progress_init(&progress, c,
- BIT_ULL(BTREE_ID_extents)|
- BIT_ULL(BTREE_ID_reflink));
-
- return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags) ?:
- bch2_dev_metadata_drop(c, &progress, dev_idx, flags);
-}
diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h
deleted file mode 100644
index 027efaa0d575..000000000000
--- a/fs/bcachefs/migrate.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_MIGRATE_H
-#define _BCACHEFS_MIGRATE_H
-
-int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
-
-#endif /* _BCACHEFS_MIGRATE_H */
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
deleted file mode 100644
index 8fcdc6984f6e..000000000000
--- a/fs/bcachefs/move.c
+++ /dev/null
@@ -1,1329 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "btree_gc.h"
-#include "btree_io.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "compress.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "errcode.h"
-#include "error.h"
-#include "inode.h"
-#include "io_read.h"
-#include "io_write.h"
-#include "journal_reclaim.h"
-#include "keylist.h"
-#include "move.h"
-#include "rebalance.h"
-#include "reflink.h"
-#include "replicas.h"
-#include "snapshot.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/ioprio.h>
-#include <linux/kthread.h>
-
-const char * const bch2_data_ops_strs[] = {
-#define x(t, n, ...) [n] = #t,
- BCH_DATA_OPS()
-#undef x
- NULL
-};
-
-static void trace_io_move2(struct bch_fs *c, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- if (trace_io_move_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
- bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
- trace_io_move(c, buf.buf);
- printbuf_exit(&buf);
- }
-}
-
-static void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k)
-{
- if (trace_io_move_read_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- trace_io_move_read(c, buf.buf);
- printbuf_exit(&buf);
- }
-}
-
-struct moving_io {
- struct list_head read_list;
- struct list_head io_list;
- struct move_bucket_in_flight *b;
- struct closure cl;
- bool read_completed;
-
- unsigned read_sectors;
- unsigned write_sectors;
-
- struct data_update write;
-};
-
-static void move_free(struct moving_io *io)
-{
- struct moving_context *ctxt = io->write.ctxt;
-
- if (io->b)
- atomic_dec(&io->b->count);
-
- mutex_lock(&ctxt->lock);
- list_del(&io->io_list);
- wake_up(&ctxt->wait);
- mutex_unlock(&ctxt->lock);
-
- if (!io->write.data_opts.scrub) {
- bch2_data_update_exit(&io->write);
- } else {
- bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio);
- kfree(io->write.bvecs);
- }
- kfree(io);
-}
-
-static void move_write_done(struct bch_write_op *op)
-{
- struct moving_io *io = container_of(op, struct moving_io, write.op);
- struct bch_fs *c = op->c;
- struct moving_context *ctxt = io->write.ctxt;
-
- if (op->error) {
- if (trace_io_move_write_fail_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_write_op_to_text(&buf, op);
- prt_printf(&buf, "ret\t%s\n", bch2_err_str(op->error));
- trace_io_move_write_fail(c, buf.buf);
- printbuf_exit(&buf);
- }
- this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]);
-
- ctxt->write_error = true;
- }
-
- atomic_sub(io->write_sectors, &ctxt->write_sectors);
- atomic_dec(&ctxt->write_ios);
- move_free(io);
- closure_put(&ctxt->cl);
-}
-
-static void move_write(struct moving_io *io)
-{
- struct moving_context *ctxt = io->write.ctxt;
-
- if (ctxt->stats) {
- if (io->write.rbio.bio.bi_status)
- atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
- &ctxt->stats->sectors_error_uncorrected);
- else if (io->write.rbio.saw_error)
- atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
- &ctxt->stats->sectors_error_corrected);
- }
-
- if (unlikely(io->write.rbio.ret ||
- io->write.rbio.bio.bi_status ||
- io->write.data_opts.scrub)) {
- move_free(io);
- return;
- }
-
- if (trace_io_move_write_enabled()) {
- struct bch_fs *c = io->write.op.c;
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
- trace_io_move_write(c, buf.buf);
- printbuf_exit(&buf);
- }
-
- closure_get(&io->write.ctxt->cl);
- atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
- atomic_inc(&io->write.ctxt->write_ios);
-
- bch2_data_update_read_done(&io->write);
-}
-
-struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
-{
- struct moving_io *io =
- list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);
-
- return io && io->read_completed ? io : NULL;
-}
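-
-/*
- * Note: reads may complete out of order, but writes are issued in read
- * submission order - we only pop the head of ctxt->reads once its read
- * has completed, so a fast read behind a slow one waits its turn.
- */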
-
-static void move_read_endio(struct bio *bio)
-{
- struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio);
- struct moving_context *ctxt = io->write.ctxt;
-
- atomic_sub(io->read_sectors, &ctxt->read_sectors);
- atomic_dec(&ctxt->read_ios);
- io->read_completed = true;
-
- wake_up(&ctxt->wait);
- closure_put(&ctxt->cl);
-}
-
-void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
-{
- struct moving_io *io;
-
- while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
- bch2_trans_unlock_long(ctxt->trans);
- list_del(&io->read_list);
- move_write(io);
- }
-}
-
-void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
-{
- unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
-
- move_ctxt_wait_event(ctxt,
- !atomic_read(&ctxt->write_sectors) ||
- atomic_read(&ctxt->write_sectors) != sectors_pending);
-}
-
-void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
-{
- move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
- bch2_trans_unlock_long(ctxt->trans);
- closure_sync(&ctxt->cl);
-}
-
-void bch2_moving_ctxt_exit(struct moving_context *ctxt)
-{
- struct bch_fs *c = ctxt->trans->c;
-
- bch2_moving_ctxt_flush_all(ctxt);
-
- EBUG_ON(atomic_read(&ctxt->write_sectors));
- EBUG_ON(atomic_read(&ctxt->write_ios));
- EBUG_ON(atomic_read(&ctxt->read_sectors));
- EBUG_ON(atomic_read(&ctxt->read_ios));
-
- mutex_lock(&c->moving_context_lock);
- list_del(&ctxt->list);
- mutex_unlock(&c->moving_context_lock);
-
-	/*
-	 * Generally, releasing a transaction while a restart is pending indicates
-	 * an unhandled transaction restart; but this can happen legitimately
-	 * within the move code, e.g. when bch2_move_ratelimit() tells us to
-	 * exit before we've retried.
-	 */
- bch2_trans_begin(ctxt->trans);
- bch2_trans_put(ctxt->trans);
- memset(ctxt, 0, sizeof(*ctxt));
-}
-
-void bch2_moving_ctxt_init(struct moving_context *ctxt,
- struct bch_fs *c,
- struct bch_ratelimit *rate,
- struct bch_move_stats *stats,
- struct write_point_specifier wp,
- bool wait_on_copygc)
-{
- memset(ctxt, 0, sizeof(*ctxt));
-
- ctxt->trans = bch2_trans_get(c);
- ctxt->fn = (void *) _RET_IP_;
- ctxt->rate = rate;
- ctxt->stats = stats;
- ctxt->wp = wp;
- ctxt->wait_on_copygc = wait_on_copygc;
-
- closure_init_stack(&ctxt->cl);
-
- mutex_init(&ctxt->lock);
- INIT_LIST_HEAD(&ctxt->reads);
- INIT_LIST_HEAD(&ctxt->ios);
- init_waitqueue_head(&ctxt->wait);
-
- mutex_lock(&c->moving_context_lock);
- list_add(&ctxt->list, &c->moving_context_list);
- mutex_unlock(&c->moving_context_lock);
-}
-
-void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
-{
- trace_move_data(c, stats);
-}
-
-void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
-{
- memset(stats, 0, sizeof(*stats));
- stats->data_type = BCH_DATA_user;
- scnprintf(stats->name, sizeof(stats->name), "%s", name);
-}
-
-int bch2_move_extent(struct moving_context *ctxt,
- struct move_bucket_in_flight *bucket_in_flight,
- struct btree_iter *iter,
- struct bkey_s_c k,
- struct bch_io_opts io_opts,
- struct data_update_opts data_opts)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- int ret = -ENOMEM;
-
- trace_io_move2(c, k, &io_opts, &data_opts);
- this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
-
- if (ctxt->stats)
- ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
-
- bch2_data_update_opts_normalize(k, &data_opts);
-
- if (!data_opts.rewrite_ptrs &&
- !data_opts.extra_replicas &&
- !data_opts.scrub) {
- if (data_opts.kill_ptrs)
- return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
- return 0;
- }
-
-	/*
-	 * Unlock before memory allocations & taking nocow locks in
-	 * bch2_data_update_init():
-	 */
- bch2_trans_unlock(trans);
-
- struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL);
- if (!io)
- goto err;
-
- INIT_LIST_HEAD(&io->io_list);
- io->write.ctxt = ctxt;
- io->read_sectors = k.k->size;
- io->write_sectors = k.k->size;
-
- if (!data_opts.scrub) {
- ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
- &io_opts, data_opts, iter->btree_id, k);
- if (ret)
- goto err_free;
-
- io->write.op.end_io = move_write_done;
- } else {
- bch2_bkey_buf_init(&io->write.k);
- bch2_bkey_buf_reassemble(&io->write.k, c, k);
-
- io->write.op.c = c;
- io->write.data_opts = data_opts;
-
- ret = bch2_data_update_bios_init(&io->write, c, &io_opts);
- if (ret)
- goto err_free;
- }
-
- io->write.rbio.bio.bi_end_io = move_read_endio;
- io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
-
- if (ctxt->rate)
- bch2_ratelimit_increment(ctxt->rate, k.k->size);
-
- if (ctxt->stats) {
- atomic64_inc(&ctxt->stats->keys_moved);
- atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
- }
-
- if (bucket_in_flight) {
- io->b = bucket_in_flight;
- atomic_inc(&io->b->count);
- }
-
- trace_io_move_read2(c, k);
-
- mutex_lock(&ctxt->lock);
- atomic_add(io->read_sectors, &ctxt->read_sectors);
- atomic_inc(&ctxt->read_ios);
-
- list_add_tail(&io->read_list, &ctxt->reads);
- list_add_tail(&io->io_list, &ctxt->ios);
- mutex_unlock(&ctxt->lock);
-
-	/*
-	 * dropped by move_read_endio() - guards against use-after-free of
-	 * ctxt when doing the wakeup
-	 */
- closure_get(&ctxt->cl);
- __bch2_read_extent(trans, &io->write.rbio,
- io->write.rbio.bio.bi_iter,
- bkey_start_pos(k.k),
- iter->btree_id, k, 0,
- NULL,
- BCH_READ_last_fragment,
- data_opts.scrub ? data_opts.read_dev : -1);
- return 0;
-err_free:
- kfree(io);
-err:
- if (bch2_err_matches(ret, BCH_ERR_data_update_done))
- return 0;
-
- if (bch2_err_matches(ret, EROFS) ||
- bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ret;
-
- count_event(c, io_move_start_fail);
-
- if (trace_io_move_start_fail_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, ": ");
- prt_str(&buf, bch2_err_str(ret));
- trace_io_move_start_fail(c, buf.buf);
- printbuf_exit(&buf);
- }
- return ret;
-}
-
-static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
- struct per_snapshot_io_opts *io_opts,
- struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */
- struct btree_iter *extent_iter,
- struct bkey_s_c extent_k)
-{
- struct bch_fs *c = trans->c;
- u32 restart_count = trans->restart_count;
- struct bch_io_opts *opts_ret = &io_opts->fs_io_opts;
- int ret = 0;
-
- if (extent_k.k->type == KEY_TYPE_reflink_v)
- goto out;
-
- if (io_opts->cur_inum != extent_pos.inode) {
- io_opts->d.nr = 0;
-
- ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode),
- BTREE_ITER_all_snapshots, k, ({
- if (k.k->p.offset != extent_pos.inode)
- break;
-
- if (!bkey_is_inode(k.k))
- continue;
-
- struct bch_inode_unpacked inode;
- _ret3 = bch2_inode_unpack(k, &inode);
- if (_ret3)
- break;
-
- struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
- bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
-
- darray_push(&io_opts->d, e);
- }));
- io_opts->cur_inum = extent_pos.inode;
- }
-
- ret = ret ?: trans_was_restarted(trans, restart_count);
- if (ret)
- return ERR_PTR(ret);
-
- if (extent_k.k->p.snapshot)
- darray_for_each(io_opts->d, i)
- if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) {
- opts_ret = &i->io_opts;
- break;
- }
-out:
- ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k);
- if (ret)
- return ERR_PTR(ret);
- return opts_ret;
-}
-
-int bch2_move_get_io_opts_one(struct btree_trans *trans,
- struct bch_io_opts *io_opts,
- struct btree_iter *extent_iter,
- struct bkey_s_c extent_k)
-{
- struct bch_fs *c = trans->c;
-
- *io_opts = bch2_opts_to_inode_opts(c->opts);
-
- /* reflink btree? */
- if (!extent_k.k->p.inode)
- goto out;
-
- struct btree_iter inode_iter;
- struct bkey_s_c inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
- SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
- BTREE_ITER_cached);
- int ret = bkey_err(inode_k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ret;
-
- if (!ret && bkey_is_inode(inode_k.k)) {
- struct bch_inode_unpacked inode;
- bch2_inode_unpack(inode_k, &inode);
- bch2_inode_opts_get(io_opts, c, &inode);
- }
- bch2_trans_iter_exit(trans, &inode_iter);
-out:
- return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k);
-}
-
-int bch2_move_ratelimit(struct moving_context *ctxt)
-{
- struct bch_fs *c = ctxt->trans->c;
- bool is_kthread = current->flags & PF_KTHREAD;
- u64 delay;
-
- if (ctxt->wait_on_copygc && c->copygc_running) {
- bch2_moving_ctxt_flush_all(ctxt);
- wait_event_killable(c->copygc_running_wq,
- !c->copygc_running ||
- (is_kthread && kthread_should_stop()));
- }
-
- do {
- delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
-
- if (is_kthread && kthread_should_stop())
- return 1;
-
- if (delay)
- move_ctxt_wait_event_timeout(ctxt,
- freezing(current) ||
- (is_kthread && kthread_should_stop()),
- delay);
-
- if (unlikely(freezing(current))) {
- bch2_moving_ctxt_flush_all(ctxt);
- try_to_freeze();
- }
- } while (delay);
-
-	/*
-	 * XXX: these limits really ought to be per device; SSDs and hard drives
-	 * will want different limits
-	 */
- move_ctxt_wait_event(ctxt,
- atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
- atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
- atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
- atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);
-
- return 0;
-}
-
-static int bch2_move_data_btree(struct moving_context *ctxt,
- struct bpos start,
- struct bpos end,
- move_pred_fn pred, void *arg,
- enum btree_id btree_id)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- struct per_snapshot_io_opts snapshot_io_opts;
- struct bch_io_opts *io_opts;
- struct bkey_buf sk;
- struct btree_iter iter, reflink_iter = {};
- struct bkey_s_c k;
- struct data_update_opts data_opts;
- /*
- * If we're moving a single file, also process reflinked data it points
- * to (this includes propagating changed io_opts from the inode to the
- * extent):
- */
- bool walk_indirect = start.inode == end.inode;
- int ret = 0, ret2;
-
- per_snapshot_io_opts_init(&snapshot_io_opts, c);
- bch2_bkey_buf_init(&sk);
-
- if (ctxt->stats) {
- ctxt->stats->data_type = BCH_DATA_user;
- ctxt->stats->pos = BBPOS(btree_id, start);
- }
-
- bch2_trans_begin(trans);
- bch2_trans_iter_init(trans, &iter, btree_id, start,
- BTREE_ITER_prefetch|
- BTREE_ITER_not_extents|
- BTREE_ITER_all_snapshots);
-
- if (ctxt->rate)
- bch2_ratelimit_reset(ctxt->rate);
-
- while (!bch2_move_ratelimit(ctxt)) {
- struct btree_iter *extent_iter = &iter;
-
- bch2_trans_begin(trans);
-
- k = bch2_btree_iter_peek(&iter);
- if (!k.k)
- break;
-
- ret = bkey_err(k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-
- if (bkey_ge(bkey_start_pos(k.k), end))
- break;
-
- if (ctxt->stats)
- ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
-
- if (walk_indirect &&
- k.k->type == KEY_TYPE_reflink_p &&
- REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) {
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- s64 offset_into_extent = 0;
-
- bch2_trans_iter_exit(trans, &reflink_iter);
- k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0);
- ret = bkey_err(k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-
- if (bkey_deleted(k.k))
- goto next_nondata;
-
-			/*
-			 * XXX: reflink pointers may point to multiple indirect
-			 * extents, so don't advance past the entire reflink
-			 * pointer - we need to fix up iter->k
-			 */
- extent_iter = &reflink_iter;
- offset_into_extent = 0;
- }
-
- if (!bkey_extent_is_direct_data(k.k))
- goto next_nondata;
-
- io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts,
- iter.pos, extent_iter, k);
- ret = PTR_ERR_OR_ZERO(io_opts);
- if (ret)
- continue;
-
- memset(&data_opts, 0, sizeof(data_opts));
- if (!pred(c, arg, k, io_opts, &data_opts))
- goto next;
-
-		/*
-		 * The iterator gets unlocked by __bch2_read_extent, so we need
-		 * to save a copy of @k elsewhere:
-		 */
- bch2_bkey_buf_reassemble(&sk, c, k);
- k = bkey_i_to_s_c(sk.k);
-
- ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts);
- if (ret2) {
- if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
- continue;
-
- if (bch2_err_matches(ret2, ENOMEM)) {
- /* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt);
- continue;
- }
-
- /* XXX signal failure */
- goto next;
- }
-next:
- if (ctxt->stats)
- atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
-next_nondata:
- bch2_btree_iter_advance(&iter);
- }
-
- bch2_trans_iter_exit(trans, &reflink_iter);
- bch2_trans_iter_exit(trans, &iter);
- bch2_bkey_buf_exit(&sk, c);
- per_snapshot_io_opts_exit(&snapshot_io_opts);
-
- return ret;
-}
-
-int __bch2_move_data(struct moving_context *ctxt,
- struct bbpos start,
- struct bbpos end,
- move_pred_fn pred, void *arg)
-{
- struct bch_fs *c = ctxt->trans->c;
- enum btree_id id;
- int ret = 0;
-
- for (id = start.btree;
- id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
- id++) {
- ctxt->stats->pos = BBPOS(id, POS_MIN);
-
- if (!btree_type_has_ptrs(id) ||
- !bch2_btree_id_root(c, id)->b)
- continue;
-
- ret = bch2_move_data_btree(ctxt,
- id == start.btree ? start.pos : POS_MIN,
- id == end.btree ? end.pos : POS_MAX,
- pred, arg, id);
- if (ret)
- break;
- }
-
- return ret;
-}
-
-int bch2_move_data(struct bch_fs *c,
- struct bbpos start,
- struct bbpos end,
- struct bch_ratelimit *rate,
- struct bch_move_stats *stats,
- struct write_point_specifier wp,
- bool wait_on_copygc,
- move_pred_fn pred, void *arg)
-{
- struct moving_context ctxt;
-
- bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
- int ret = __bch2_move_data(&ctxt, start, end, pred, arg);
- bch2_moving_ctxt_exit(&ctxt);
-
- return ret;
-}
-
-static int __bch2_move_data_phys(struct moving_context *ctxt,
- struct move_bucket_in_flight *bucket_in_flight,
- unsigned dev,
- u64 bucket_start,
- u64 bucket_end,
- unsigned data_types,
- move_pred_fn pred, void *arg)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- bool is_kthread = current->flags & PF_KTHREAD;
- struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
- struct btree_iter iter = {}, bp_iter = {};
- struct bkey_buf sk;
- struct bkey_s_c k;
- struct bkey_buf last_flushed;
- int ret = 0;
-
- struct bch_dev *ca = bch2_dev_tryget(c, dev);
- if (!ca)
- return 0;
-
- bucket_end = min(bucket_end, ca->mi.nbuckets);
-
- struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start));
- struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end));
- bch2_dev_put(ca);
- ca = NULL;
-
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
- bch2_bkey_buf_init(&sk);
-
-	/*
-	 * We're not running in a context that handles transaction restarts:
-	 */
- bch2_trans_begin(trans);
-
- bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0);
-
- bch_err_msg(c, ret, "looking up alloc key");
- if (ret)
- goto err;
-
- ret = bch2_btree_write_buffer_tryflush(trans);
- bch_err_msg(c, ret, "flushing btree write buffer");
- if (ret)
- goto err;
-
- while (!(ret = bch2_move_ratelimit(ctxt))) {
- if (is_kthread && kthread_should_stop())
- break;
-
- bch2_trans_begin(trans);
-
- k = bch2_btree_iter_peek(&bp_iter);
- ret = bkey_err(k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- goto err;
-
- if (!k.k || bkey_gt(k.k->p, bp_end))
- break;
-
- if (k.k->type != KEY_TYPE_backpointer)
- goto next;
-
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
-
- if (ctxt->stats)
- ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
-
- if (!(data_types & BIT(bp.v->data_type)))
- goto next;
-
- if (!bp.v->level && bp.v->btree_id == BTREE_ID_stripes)
- goto next;
-
- k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
- ret = bkey_err(k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- goto err;
- if (!k.k)
- goto next;
-
- if (!bp.v->level) {
- ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k);
- if (ret) {
- bch2_trans_iter_exit(trans, &iter);
- continue;
- }
- }
-
- struct data_update_opts data_opts = {};
- if (!pred(c, arg, k, &io_opts, &data_opts)) {
- bch2_trans_iter_exit(trans, &iter);
- goto next;
- }
-
- if (data_opts.scrub &&
- !bch2_dev_idx_is_online(c, data_opts.read_dev)) {
- bch2_trans_iter_exit(trans, &iter);
- ret = -BCH_ERR_device_offline;
- break;
- }
-
- bch2_bkey_buf_reassemble(&sk, c, k);
- k = bkey_i_to_s_c(sk.k);
-
- /* move_extent will drop locks */
- unsigned sectors = bp.v->bucket_len;
-
- if (!bp.v->level)
- ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts);
- else if (!data_opts.scrub)
- ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0);
- else
- ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev);
-
- bch2_trans_iter_exit(trans, &iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret == -ENOMEM) {
- /* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt);
- continue;
- }
- if (ret)
- goto err;
-
- if (ctxt->stats)
- atomic64_add(sectors, &ctxt->stats->sectors_seen);
-next:
- bch2_btree_iter_advance(&bp_iter);
- }
-err:
- bch2_trans_iter_exit(trans, &bp_iter);
- bch2_bkey_buf_exit(&sk, c);
- bch2_bkey_buf_exit(&last_flushed, c);
- return ret;
-}
-
-static int bch2_move_data_phys(struct bch_fs *c,
- unsigned dev,
- u64 start,
- u64 end,
- unsigned data_types,
- struct bch_ratelimit *rate,
- struct bch_move_stats *stats,
- struct write_point_specifier wp,
- bool wait_on_copygc,
- move_pred_fn pred, void *arg)
-{
- struct moving_context ctxt;
-
- bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans));
-
- bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
- ctxt.stats->phys = true;
- ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys;
-
- int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, data_types, pred, arg);
- bch2_moving_ctxt_exit(&ctxt);
-
- return ret;
-}
-
-struct evacuate_bucket_arg {
- struct bpos bucket;
- int gen;
- struct data_update_opts data_opts;
-};
-
-static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- struct evacuate_bucket_arg *arg = _arg;
-
- *data_opts = arg->data_opts;
-
- unsigned i = 0;
- bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
- if (ptr->dev == arg->bucket.inode &&
- (arg->gen < 0 || arg->gen == ptr->gen) &&
- !ptr->cached)
- data_opts->rewrite_ptrs |= BIT(i);
- i++;
- }
-
- return data_opts->rewrite_ptrs != 0;
-}
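-
-/*
- * E.g. evacuating bucket (2, ...): for an extent with pointers on
- * devices 0 and 2, the loop sees the dev 0 pointer at i == 0 (wrong
- * device) and the dev 2 pointer at i == 1 (matching gen, not cached),
- * so rewrite_ptrs == BIT(1) and the predicate returns true.
- */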
-
-int bch2_evacuate_bucket(struct moving_context *ctxt,
- struct move_bucket_in_flight *bucket_in_flight,
- struct bpos bucket, int gen,
- struct data_update_opts data_opts)
-{
- struct evacuate_bucket_arg arg = { bucket, gen, data_opts, };
-
- return __bch2_move_data_phys(ctxt, bucket_in_flight,
- bucket.inode,
- bucket.offset,
- bucket.offset + 1,
- ~0,
- evacuate_bucket_pred, &arg);
-}
-
-typedef bool (*move_btree_pred)(struct bch_fs *, void *,
- struct btree *, struct bch_io_opts *,
- struct data_update_opts *);
-
-static int bch2_move_btree(struct bch_fs *c,
- struct bbpos start,
- struct bbpos end,
- move_btree_pred pred, void *arg,
- struct bch_move_stats *stats)
-{
- bool kthread = (current->flags & PF_KTHREAD) != 0;
- struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
- struct moving_context ctxt;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct btree *b;
- enum btree_id btree;
- struct data_update_opts data_opts;
- int ret = 0;
-
- bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
- writepoint_ptr(&c->btree_write_point),
- true);
- trans = ctxt.trans;
-
- stats->data_type = BCH_DATA_btree;
-
- for (btree = start.btree;
- btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
-	     btree++) {
- stats->pos = BBPOS(btree, POS_MIN);
-
- if (!bch2_btree_id_root(c, btree)->b)
- continue;
-
- bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
- BTREE_ITER_prefetch);
-retry:
- ret = 0;
- while (bch2_trans_begin(trans),
- (b = bch2_btree_iter_peek_node(&iter)) &&
- !(ret = PTR_ERR_OR_ZERO(b))) {
- if (kthread && kthread_should_stop())
- break;
-
- if ((cmp_int(btree, end.btree) ?:
- bpos_cmp(b->key.k.p, end.pos)) > 0)
- break;
-
- stats->pos = BBPOS(iter.btree_id, iter.pos);
-
- if (!pred(c, arg, b, &io_opts, &data_opts))
- goto next;
-
- ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-next:
- bch2_btree_iter_next_node(&iter);
- }
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_iter_exit(trans, &iter);
-
- if (kthread && kthread_should_stop())
- break;
- }
-
- bch_err_fn(c, ret);
- bch2_moving_ctxt_exit(&ctxt);
- bch2_btree_interior_updates_flush(c);
-
- return ret;
-}
-
-static bool rereplicate_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- unsigned nr_good = bch2_bkey_durability(c, k);
- unsigned replicas = bkey_is_btree_ptr(k.k)
- ? c->opts.metadata_replicas
- : io_opts->data_replicas;
-
- rcu_read_lock();
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned i = 0;
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ptr->cached &&
- (!ca || !ca->mi.durability))
- data_opts->kill_ptrs |= BIT(i);
- i++;
- }
- rcu_read_unlock();
-
- if (!data_opts->kill_ptrs &&
- (!nr_good || nr_good >= replicas))
- return false;
-
- data_opts->target = 0;
- data_opts->extra_replicas = replicas - nr_good;
- data_opts->btree_insert_flags = 0;
- return true;
-}
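-
-/*
- * E.g. with data_replicas=2 and a single healthy pointer of durability
- * 1: nr_good == 1 < 2, so we rewrite with extra_replicas = 1; an extent
- * that already has nr_good >= 2 (and no dead pointers to kill) returns
- * false and is left alone.
- */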
-
-static bool migrate_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_ioctl_data *op = arg;
- unsigned i = 0;
-
- data_opts->rewrite_ptrs = 0;
- data_opts->target = 0;
- data_opts->extra_replicas = 0;
- data_opts->btree_insert_flags = 0;
-
- bkey_for_each_ptr(ptrs, ptr) {
- if (ptr->dev == op->migrate.dev)
- data_opts->rewrite_ptrs |= 1U << i;
- i++;
- }
-
- return data_opts->rewrite_ptrs != 0;
-}
-
-static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
-}
-
-/*
- * Ancient versions of bcachefs produced packed formats that could encode
- * keys the in-memory format cannot represent; this checks for those
- * formats so we can get rid of them.
- */
-static bool bformat_needs_redo(struct bkey_format *f)
-{
- for (unsigned i = 0; i < f->nr_fields; i++)
- if (bch2_bkey_format_field_overflows(f, i))
- return true;
-
- return false;
-}
-
-static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- if (b->version_ondisk != c->sb.version ||
- btree_node_need_rewrite(b) ||
- bformat_needs_redo(&b->format)) {
- data_opts->target = 0;
- data_opts->extra_replicas = 0;
- data_opts->btree_insert_flags = 0;
- return true;
- }
-
- return false;
-}
-
-int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
-{
- int ret;
-
- ret = bch2_move_btree(c,
- BBPOS_MIN,
- BBPOS_MAX,
- rewrite_old_nodes_pred, c, stats);
- if (!ret) {
- mutex_lock(&c->sb_lock);
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
- c->disk_sb.sb->version_min = c->disk_sb.sb->version;
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- unsigned durability = bch2_bkey_durability(c, k);
- unsigned replicas = bkey_is_btree_ptr(k.k)
- ? c->opts.metadata_replicas
- : io_opts->data_replicas;
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned i = 0;
-
- rcu_read_lock();
- bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
- unsigned d = bch2_extent_ptr_durability(c, &p);
-
- if (d && durability - d >= replicas) {
- data_opts->kill_ptrs |= BIT(i);
- durability -= d;
- }
-
- i++;
- }
- rcu_read_unlock();
-
- return data_opts->kill_ptrs != 0;
-}
-
-static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
-}
-
-static bool scrub_pred(struct bch_fs *c, void *_arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- struct bch_ioctl_data *arg = _arg;
-
- if (k.k->type != KEY_TYPE_btree_ptr_v2) {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (p.ptr.dev == arg->migrate.dev) {
- if (!p.crc.csum_type)
- return false;
- break;
- }
- }
-
- data_opts->scrub = true;
- data_opts->read_dev = arg->migrate.dev;
- return true;
-}
-
-int bch2_data_job(struct bch_fs *c,
- struct bch_move_stats *stats,
- struct bch_ioctl_data op)
-{
- struct bbpos start = BBPOS(op.start_btree, op.start_pos);
- struct bbpos end = BBPOS(op.end_btree, op.end_pos);
- int ret = 0;
-
- if (op.op >= BCH_DATA_OP_NR)
- return -EINVAL;
-
- bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);
-
- switch (op.op) {
- case BCH_DATA_OP_scrub:
-		/*
-		 * To prevent tests from failing spuriously, make sure we see
-		 * all btree nodes that need to be repaired
-		 */
- bch2_btree_interior_updates_flush(c);
-
- ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX,
- op.scrub.data_types,
- NULL,
- stats,
- writepoint_hashed((unsigned long) current),
- false,
- scrub_pred, &op) ?: ret;
- break;
-
- case BCH_DATA_OP_rereplicate:
- stats->data_type = BCH_DATA_journal;
- ret = bch2_journal_flush_device_pins(&c->journal, -1);
- ret = bch2_move_btree(c, start, end,
- rereplicate_btree_pred, c, stats) ?: ret;
- ret = bch2_move_data(c, start, end,
- NULL,
- stats,
- writepoint_hashed((unsigned long) current),
- true,
- rereplicate_pred, c) ?: ret;
- ret = bch2_replicas_gc2(c) ?: ret;
- break;
- case BCH_DATA_OP_migrate:
- if (op.migrate.dev >= c->sb.nr_devices)
- return -EINVAL;
-
- stats->data_type = BCH_DATA_journal;
- ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
- ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX,
- ~0,
- NULL,
- stats,
- writepoint_hashed((unsigned long) current),
- true,
- migrate_pred, &op) ?: ret;
- bch2_btree_interior_updates_flush(c);
- ret = bch2_replicas_gc2(c) ?: ret;
- break;
- case BCH_DATA_OP_rewrite_old_nodes:
- ret = bch2_scan_old_btree_nodes(c, stats);
- break;
- case BCH_DATA_OP_drop_extra_replicas:
- ret = bch2_move_btree(c, start, end,
- drop_extra_replicas_btree_pred, c, stats) ?: ret;
- ret = bch2_move_data(c, start, end, NULL, stats,
- writepoint_hashed((unsigned long) current),
- true,
- drop_extra_replicas_pred, c) ?: ret;
- ret = bch2_replicas_gc2(c) ?: ret;
- break;
- default:
- ret = -EINVAL;
- }
-
- bch2_move_stats_exit(stats, c);
- return ret;
-}
-
-void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
-{
-	prt_printf(out, "%s: data type=", stats->name);
- bch2_prt_data_type(out, stats->data_type);
- prt_str(out, " pos=");
- bch2_bbpos_to_text(out, stats->pos);
- prt_newline(out);
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved));
- prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced));
- prt_printf(out, "bytes seen:\t");
- prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
- prt_newline(out);
-
- prt_printf(out, "bytes moved:\t");
- prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
- prt_newline(out);
-
- prt_printf(out, "bytes raced:\t");
- prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
- prt_newline(out);
-
- printbuf_indent_sub(out, 2);
-}
-
-static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
-{
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 32);
-
- bch2_move_stats_to_text(out, ctxt->stats);
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "reads: ios %u/%u sectors %u/%u\n",
- atomic_read(&ctxt->read_ios),
- c->opts.move_ios_in_flight,
- atomic_read(&ctxt->read_sectors),
- c->opts.move_bytes_in_flight >> 9);
-
- prt_printf(out, "writes: ios %u/%u sectors %u/%u\n",
- atomic_read(&ctxt->write_ios),
- c->opts.move_ios_in_flight,
- atomic_read(&ctxt->write_sectors),
- c->opts.move_bytes_in_flight >> 9);
-
- printbuf_indent_add(out, 2);
-
- mutex_lock(&ctxt->lock);
- struct moving_io *io;
- list_for_each_entry(io, &ctxt->ios, io_list)
- bch2_data_update_inflight_to_text(out, &io->write);
- mutex_unlock(&ctxt->lock);
-
- printbuf_indent_sub(out, 4);
-}
-
-void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct moving_context *ctxt;
-
- mutex_lock(&c->moving_context_lock);
- list_for_each_entry(ctxt, &c->moving_context_list, list)
- bch2_moving_ctxt_to_text(out, c, ctxt);
- mutex_unlock(&c->moving_context_lock);
-}
-
-void bch2_fs_move_init(struct bch_fs *c)
-{
- INIT_LIST_HEAD(&c->moving_context_list);
- mutex_init(&c->moving_context_lock);
-}
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
deleted file mode 100644
index 51e0505a8156..000000000000
--- a/fs/bcachefs/move.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_MOVE_H
-#define _BCACHEFS_MOVE_H
-
-#include "bbpos.h"
-#include "bcachefs_ioctl.h"
-#include "btree_iter.h"
-#include "buckets.h"
-#include "data_update.h"
-#include "move_types.h"
-
-struct bch_read_bio;
-
-struct moving_context {
- struct btree_trans *trans;
- struct list_head list;
- void *fn;
-
- struct bch_ratelimit *rate;
- struct bch_move_stats *stats;
- struct write_point_specifier wp;
- bool wait_on_copygc;
- bool write_error;
-
- /* For waiting on outstanding reads and writes: */
- struct closure cl;
-
- struct mutex lock;
- struct list_head reads;
- struct list_head ios;
-
- /* in flight sectors: */
- atomic_t read_sectors;
- atomic_t write_sectors;
- atomic_t read_ios;
- atomic_t write_ios;
-
- wait_queue_head_t wait;
-};
-
-#define move_ctxt_wait_event_timeout(_ctxt, _cond, _timeout) \
-({ \
- int _ret = 0; \
- while (true) { \
- bool cond_finished = false; \
- bch2_moving_ctxt_do_pending_writes(_ctxt); \
- \
- if (_cond) \
- break; \
- bch2_trans_unlock_long((_ctxt)->trans); \
- _ret = __wait_event_timeout((_ctxt)->wait, \
- bch2_moving_ctxt_next_pending_write(_ctxt) || \
- (cond_finished = (_cond)), _timeout); \
-		if (_ret || (cond_finished))				\
- break; \
- } \
- _ret; \
-})
-
-#define move_ctxt_wait_event(_ctxt, _cond) \
-do { \
- bool cond_finished = false; \
- bch2_moving_ctxt_do_pending_writes(_ctxt); \
- \
- if (_cond) \
- break; \
- bch2_trans_unlock_long((_ctxt)->trans); \
- __wait_event((_ctxt)->wait, \
- bch2_moving_ctxt_next_pending_write(_ctxt) || \
- (cond_finished = (_cond))); \
- if (cond_finished) \
- break; \
-} while (1)
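-
-/*
- * Minimal usage sketch, mirroring bch2_move_ratelimit(): throttle by
- * waiting until in-flight read sectors drop below some limit (the name
- * 'limit' here is illustrative):
- *
- *	move_ctxt_wait_event(ctxt,
- *		atomic_read(&ctxt->read_sectors) < limit);
- *
- * Pending writes are drained on every iteration, so waiting here also
- * makes forward progress on completed reads.
- */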
-
-typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
- struct bch_io_opts *, struct data_update_opts *);
-
-extern const char * const bch2_data_ops_strs[];
-
-void bch2_moving_ctxt_exit(struct moving_context *);
-void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
- struct bch_ratelimit *, struct bch_move_stats *,
- struct write_point_specifier, bool);
-struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
-void bch2_moving_ctxt_do_pending_writes(struct moving_context *);
-void bch2_moving_ctxt_flush_all(struct moving_context *);
-void bch2_move_ctxt_wait_for_io(struct moving_context *);
-int bch2_move_ratelimit(struct moving_context *);
-
-/* Inodes in different snapshots may have different IO options: */
-struct snapshot_io_opts_entry {
- u32 snapshot;
- struct bch_io_opts io_opts;
-};
-
-struct per_snapshot_io_opts {
- u64 cur_inum;
- struct bch_io_opts fs_io_opts;
- DARRAY(struct snapshot_io_opts_entry) d;
-};
-
-static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c)
-{
- memset(io_opts, 0, sizeof(*io_opts));
- io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts);
-}
-
-static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts)
-{
- darray_exit(&io_opts->d);
-}
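-
-/*
- * E.g. if an inode's background_compression was changed after a snapshot
- * was taken, extents in the old snapshot keep the old options: we cache
- * one bch_io_opts per inode version for the inum being processed, and
- * pick the entry for the snapshot each extent is visible from.
- */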
-
-int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *,
- struct btree_iter *, struct bkey_s_c);
-
-int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
-
-int bch2_move_extent(struct moving_context *,
- struct move_bucket_in_flight *,
- struct btree_iter *,
- struct bkey_s_c,
- struct bch_io_opts,
- struct data_update_opts);
-
-int __bch2_move_data(struct moving_context *,
- struct bbpos,
- struct bbpos,
- move_pred_fn, void *);
-int bch2_move_data(struct bch_fs *,
- struct bbpos start,
- struct bbpos end,
- struct bch_ratelimit *,
- struct bch_move_stats *,
- struct write_point_specifier,
- bool,
- move_pred_fn, void *);
-
-int bch2_evacuate_bucket(struct moving_context *,
- struct move_bucket_in_flight *,
- struct bpos, int,
- struct data_update_opts);
-int bch2_data_job(struct bch_fs *,
- struct bch_move_stats *,
- struct bch_ioctl_data);
-
-void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
-void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
-void bch2_move_stats_init(struct bch_move_stats *, const char *);
-
-void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_move_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_MOVE_H */
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
deleted file mode 100644
index 807f779f6f76..000000000000
--- a/fs/bcachefs/move_types.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_MOVE_TYPES_H
-#define _BCACHEFS_MOVE_TYPES_H
-
-#include "bbpos_types.h"
-#include "bcachefs_ioctl.h"
-
-struct bch_move_stats {
- char name[32];
- bool phys;
- enum bch_ioctl_data_event_ret ret;
-
- union {
- struct {
- enum bch_data_type data_type;
- struct bbpos pos;
- };
- struct {
- unsigned dev;
- u64 offset;
- };
- };
-
- atomic64_t keys_moved;
- atomic64_t keys_raced;
- atomic64_t sectors_seen;
- atomic64_t sectors_moved;
- atomic64_t sectors_raced;
- atomic64_t sectors_error_corrected;
- atomic64_t sectors_error_uncorrected;
-};
-
-struct move_bucket_key {
- struct bpos bucket;
- unsigned gen;
-};
-
-struct move_bucket {
- struct move_bucket_key k;
- unsigned sectors;
-};
-
-struct move_bucket_in_flight {
- struct move_bucket_in_flight *next;
- struct rhash_head hash;
- struct move_bucket bucket;
- atomic_t count;
-};
-
-#endif /* _BCACHEFS_MOVE_TYPES_H */
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
deleted file mode 100644
index 5126c870ce5b..000000000000
--- a/fs/bcachefs/movinggc.c
+++ /dev/null
@@ -1,462 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Moving/copying garbage collector
- *
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "clock.h"
-#include "errcode.h"
-#include "error.h"
-#include "lru.h"
-#include "move.h"
-#include "movinggc.h"
-#include "trace.h"
-
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/math64.h>
-#include <linux/sched/task.h>
-#include <linux/wait.h>
-
-struct buckets_in_flight {
- struct rhashtable table;
- struct move_bucket_in_flight *first;
- struct move_bucket_in_flight *last;
- size_t nr;
- size_t sectors;
-};
-
-static const struct rhashtable_params bch_move_bucket_params = {
- .head_offset = offsetof(struct move_bucket_in_flight, hash),
- .key_offset = offsetof(struct move_bucket_in_flight, bucket.k),
- .key_len = sizeof(struct move_bucket_key),
- .automatic_shrinking = true,
-};
-
-static struct move_bucket_in_flight *
-move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b)
-{
- struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL);
- int ret;
-
- if (!new)
- return ERR_PTR(-ENOMEM);
-
- new->bucket = b;
-
- ret = rhashtable_lookup_insert_fast(&list->table, &new->hash,
- bch_move_bucket_params);
- if (ret) {
- kfree(new);
- return ERR_PTR(ret);
- }
-
- if (!list->first)
- list->first = new;
- else
- list->last->next = new;
-
- list->last = new;
- list->nr++;
- list->sectors += b.sectors;
- return new;
-}
-
-static int bch2_bucket_is_movable(struct btree_trans *trans,
- struct move_bucket *b, u64 time)
-{
- struct bch_fs *c = trans->c;
-
- if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset))
- return 0;
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
- b->k.bucket, BTREE_ITER_cached);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- struct bch_dev *ca = bch2_dev_tryget(c, k.k->p.inode);
- if (!ca)
- goto out;
-
- if (ca->mi.state != BCH_MEMBER_STATE_rw ||
- !bch2_dev_is_online(ca))
- goto out_put;
-
- struct bch_alloc_v4 _a;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
- b->k.gen = a->gen;
- b->sectors = bch2_bucket_sectors_dirty(*a);
- u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
-
- ret = lru_idx && lru_idx <= time;
-out_put:
- bch2_dev_put(ca);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static void move_buckets_wait(struct moving_context *ctxt,
- struct buckets_in_flight *list,
- bool flush)
-{
- struct move_bucket_in_flight *i;
- int ret;
-
- while ((i = list->first)) {
- if (flush)
- move_ctxt_wait_event(ctxt, !atomic_read(&i->count));
-
- if (atomic_read(&i->count))
- break;
-
- list->first = i->next;
- if (!list->first)
- list->last = NULL;
-
- list->nr--;
- list->sectors -= i->bucket.sectors;
-
- ret = rhashtable_remove_fast(&list->table, &i->hash,
- bch_move_bucket_params);
- BUG_ON(ret);
- kfree(i);
- }
-
- bch2_trans_unlock_long(ctxt->trans);
-}
-
-static bool bucket_in_flight(struct buckets_in_flight *list,
- struct move_bucket_key k)
-{
- return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params);
-}
-
-typedef DARRAY(struct move_bucket) move_buckets;
-
-static int bch2_copygc_get_buckets(struct moving_context *ctxt,
- struct buckets_in_flight *buckets_in_flight,
- move_buckets *buckets)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
- size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
- int ret;
-
- move_buckets_wait(ctxt, buckets_in_flight, false);
-
- ret = bch2_btree_write_buffer_tryflush(trans);
- if (bch2_err_matches(ret, EROFS))
- return ret;
-
- if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
- return ret;
-
- bch2_trans_begin(trans);
-
- ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
- lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0),
- lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX),
- 0, k, ({
- struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
- int ret2 = 0;
-
- saw++;
-
- ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p));
- if (ret2 < 0)
- goto err;
-
- if (!ret2)
- not_movable++;
- else if (bucket_in_flight(buckets_in_flight, b.k))
- in_flight++;
- else {
- ret2 = darray_push(buckets, b);
- if (ret2)
- goto err;
- sectors += b.sectors;
- }
-
- ret2 = buckets->nr >= nr_to_get;
-err:
- ret2;
- }));
-
- pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
- buckets_in_flight->nr, buckets_in_flight->sectors,
- saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret);
-
- return ret < 0 ? ret : 0;
-}
-
-noinline
-static int bch2_copygc(struct moving_context *ctxt,
- struct buckets_in_flight *buckets_in_flight,
- bool *did_work)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- struct data_update_opts data_opts = {
- .btree_insert_flags = BCH_WATERMARK_copygc,
- };
- move_buckets buckets = { 0 };
- struct move_bucket_in_flight *f;
- u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen);
- u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved);
- int ret = 0;
-
- ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets);
- if (ret)
- goto err;
-
- darray_for_each(buckets, i) {
- if (kthread_should_stop() || freezing(current))
- break;
-
- f = move_bucket_in_flight_add(buckets_in_flight, *i);
- ret = PTR_ERR_OR_ZERO(f);
-		if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned the same bucket more than once */
- ret = 0;
- continue;
- }
- if (ret == -ENOMEM) { /* flush IO, continue later */
- ret = 0;
- break;
- }
-
- ret = bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
- f->bucket.k.gen, data_opts);
- if (ret)
- goto err;
-
- *did_work = true;
- }
-err:
-
- /* no entries in LRU btree found, or got to end: */
- if (bch2_err_matches(ret, ENOENT))
- ret = 0;
-
- if (ret < 0 && !bch2_err_matches(ret, EROFS))
- bch_err_msg(c, ret, "from bch2_move_data()");
-
- sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen;
- sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved;
- trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved);
-
- darray_exit(&buckets);
- return ret;
-}
-
-/*
- * Copygc runs when the amount of fragmented data is above some arbitrary
- * threshold:
- *
- * The threshold at the limit - when the device is full - is the amount of space
- * we reserved in bch2_recalc_capacity; if more disk space than that were
- * stranded due to fragmentation, we could no longer store everything we have
- * promised to store.
- *
- * But we don't want to be running copygc unnecessarily when the device still
- * has plenty of free space - rather, we want copygc to smoothly run every so
- * often and continually reduce the amount of fragmented space as the device
- * fills up. So, we increase the threshold by half the current free space.
- */
-unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
-{
- s64 wait = S64_MAX, fragmented_allowed, fragmented;
-
- for_each_rw_member(c, ca) {
- struct bch_dev_usage usage = bch2_dev_usage_read(ca);
-
- fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) *
- ca->mi.bucket_size) >> 1);
- fragmented = 0;
-
- for (unsigned i = 0; i < BCH_DATA_NR; i++)
- if (data_type_movable(i))
- fragmented += usage.d[i].fragmented;
-
- wait = min(wait, max(0LL, fragmented_allowed - fragmented));
- }
-
- return wait;
-}
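-
-/*
- * Rough worked example, with made-up numbers: a single rw device with
- * 2^20 buckets of 1024 sectors available at BCH_WATERMARK_stripe gives
- * fragmented_allowed = (2^20 * 1024) / 2 = 2^29 sectors; with 2^27
- * sectors of movable fragmented data, the returned wait is
- * 2^29 - 2^27 = 3 * 2^27 sectors.
- */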
-
-void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
-{
- printbuf_tabstop_push(out, 32);
- prt_printf(out, "running:\t%u\n", c->copygc_running);
- prt_printf(out, "copygc_wait:\t%llu\n", c->copygc_wait);
- prt_printf(out, "copygc_wait_at:\t%llu\n", c->copygc_wait_at);
-
- prt_printf(out, "Currently waiting for:\t");
- prt_human_readable_u64(out, max(0LL, c->copygc_wait -
- atomic64_read(&c->io_clock[WRITE].now)) << 9);
- prt_newline(out);
-
- prt_printf(out, "Currently waiting since:\t");
- prt_human_readable_u64(out, max(0LL,
- atomic64_read(&c->io_clock[WRITE].now) -
- c->copygc_wait_at) << 9);
- prt_newline(out);
-
- prt_printf(out, "Currently calculated wait:\t");
- prt_human_readable_u64(out, bch2_copygc_wait_amount(c));
- prt_newline(out);
-
- rcu_read_lock();
- struct task_struct *t = rcu_dereference(c->copygc_thread);
- if (t)
- get_task_struct(t);
- rcu_read_unlock();
-
- if (t) {
- bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
- put_task_struct(t);
- }
-}
-
-static int bch2_copygc_thread(void *arg)
-{
- struct bch_fs *c = arg;
- struct moving_context ctxt;
- struct bch_move_stats move_stats;
- struct io_clock *clock = &c->io_clock[WRITE];
- struct buckets_in_flight *buckets;
- u64 last, wait;
- int ret = 0;
-
- buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL);
- if (!buckets)
- return -ENOMEM;
- ret = rhashtable_init(&buckets->table, &bch_move_bucket_params);
- bch_err_msg(c, ret, "allocating copygc buckets in flight");
- if (ret) {
- kfree(buckets);
- return ret;
- }
-
- set_freezable();
-
- bch2_move_stats_init(&move_stats, "copygc");
- bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
- writepoint_ptr(&c->copygc_write_point),
- false);
-
- while (!ret && !kthread_should_stop()) {
- bool did_work = false;
-
- bch2_trans_unlock_long(ctxt.trans);
- cond_resched();
-
- if (!c->opts.copygc_enabled) {
- move_buckets_wait(&ctxt, buckets, true);
- kthread_wait_freezable(c->opts.copygc_enabled ||
- kthread_should_stop());
- }
-
- if (unlikely(freezing(current))) {
- move_buckets_wait(&ctxt, buckets, true);
- __refrigerator(false);
- continue;
- }
-
- last = atomic64_read(&clock->now);
- wait = bch2_copygc_wait_amount(c);
-
- if (wait > clock->max_slop) {
- c->copygc_wait_at = last;
- c->copygc_wait = last + wait;
- move_buckets_wait(&ctxt, buckets, true);
- trace_and_count(c, copygc_wait, c, wait, last + wait);
- bch2_kthread_io_clock_wait(clock, last + wait,
- MAX_SCHEDULE_TIMEOUT);
- continue;
- }
-
- c->copygc_wait = 0;
-
- c->copygc_running = true;
- ret = bch2_copygc(&ctxt, buckets, &did_work);
- c->copygc_running = false;
-
- wake_up(&c->copygc_running_wq);
-
- if (!wait && !did_work) {
- u64 min_member_capacity = bch2_min_rw_member_capacity(c);
-
- if (min_member_capacity == U64_MAX)
- min_member_capacity = 128 * 2048;
-
- move_buckets_wait(&ctxt, buckets, true);
- bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
- MAX_SCHEDULE_TIMEOUT);
- }
- }
-
- move_buckets_wait(&ctxt, buckets, true);
-
- rhashtable_destroy(&buckets->table);
- kfree(buckets);
- bch2_moving_ctxt_exit(&ctxt);
- bch2_move_stats_exit(&move_stats, c);
-
- return 0;
-}
-
-void bch2_copygc_stop(struct bch_fs *c)
-{
- if (c->copygc_thread) {
- kthread_stop(c->copygc_thread);
- put_task_struct(c->copygc_thread);
- }
- c->copygc_thread = NULL;
-}
-
-int bch2_copygc_start(struct bch_fs *c)
-{
- struct task_struct *t;
- int ret;
-
- if (c->copygc_thread)
- return 0;
-
- if (c->opts.nochanges)
- return 0;
-
- if (bch2_fs_init_fault("copygc_start"))
- return -ENOMEM;
-
- t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
- ret = PTR_ERR_OR_ZERO(t);
- bch_err_msg(c, ret, "creating copygc thread");
- if (ret)
- return ret;
-
- get_task_struct(t);
-
- c->copygc_thread = t;
- wake_up_process(c->copygc_thread);
-
- return 0;
-}
-
-void bch2_fs_copygc_init(struct bch_fs *c)
-{
- init_waitqueue_head(&c->copygc_running_wq);
- c->copygc_running = false;
-}
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h
deleted file mode 100644
index ea181fef5bc9..000000000000
--- a/fs/bcachefs/movinggc.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_MOVINGGC_H
-#define _BCACHEFS_MOVINGGC_H
-
-unsigned long bch2_copygc_wait_amount(struct bch_fs *);
-void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_copygc_stop(struct bch_fs *);
-int bch2_copygc_start(struct bch_fs *);
-void bch2_fs_copygc_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_MOVINGGC_H */
diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c
deleted file mode 100644
index 93246ad31541..000000000000
--- a/fs/bcachefs/namei.c
+++ /dev/null
@@ -1,834 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "acl.h"
-#include "btree_update.h"
-#include "dirent.h"
-#include "inode.h"
-#include "namei.h"
-#include "subvolume.h"
-#include "xattr.h"
-
-#include <linux/posix_acl.h>
-
-static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode)
-{
- return S_ISDIR(inode->bi_mode) && !inode->bi_subvol;
-}
-
-int bch2_create_trans(struct btree_trans *trans,
- subvol_inum dir,
- struct bch_inode_unpacked *dir_u,
- struct bch_inode_unpacked *new_inode,
- const struct qstr *name,
- uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
- struct posix_acl *default_acl,
- struct posix_acl *acl,
- subvol_inum snapshot_src,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter dir_iter = { NULL };
- struct btree_iter inode_iter = { NULL };
- subvol_inum new_inum = dir;
- u64 now = bch2_current_time(c);
- u64 cpu = raw_smp_processor_id();
- u64 dir_target;
- u32 snapshot;
- unsigned dir_type = mode_to_type(mode);
- int ret;
-
- ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
- if (ret)
- goto err;
-
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir,
- BTREE_ITER_intent|BTREE_ITER_with_updates);
- if (ret)
- goto err;
-
- /* Inherit casefold state from parent. */
- if (S_ISDIR(mode))
- new_inode->bi_flags |= dir_u->bi_flags & BCH_INODE_casefolded;
-
- if (!(flags & BCH_CREATE_SNAPSHOT)) {
- /* Normal create path - allocate a new inode: */
- bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
-
- if (flags & BCH_CREATE_TMPFILE)
- new_inode->bi_flags |= BCH_INODE_unlinked;
-
- ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
- if (ret)
- goto err;
-
- snapshot_src = (subvol_inum) { 0 };
- } else {
- /*
- * Creating a snapshot - we're not allocating a new inode, but
- * we do have to lookup the root inode of the subvolume we're
- * snapshotting and update it (in the new snapshot):
- */
-
- if (!snapshot_src.inum) {
- /* Inode wasn't specified, just snapshot: */
- struct bch_subvolume s;
- ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, &s);
- if (ret)
- goto err;
-
- snapshot_src.inum = le64_to_cpu(s.inode);
- }
-
- ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src,
- BTREE_ITER_intent);
- if (ret)
- goto err;
-
- if (new_inode->bi_subvol != snapshot_src.subvol) {
- /* Not a subvolume root: */
- ret = -EINVAL;
- goto err;
- }
-
- /*
- * If we're not root, we have to own the subvolume being
- * snapshotted:
- */
- if (uid && new_inode->bi_uid != uid) {
- ret = -EPERM;
- goto err;
- }
-
- flags |= BCH_CREATE_SUBVOL;
- }
-
- new_inum.inum = new_inode->bi_inum;
- dir_target = new_inode->bi_inum;
-
- if (flags & BCH_CREATE_SUBVOL) {
- u32 new_subvol, dir_snapshot;
-
- ret = bch2_subvolume_create(trans, new_inode->bi_inum,
- dir.subvol,
- snapshot_src.subvol,
- &new_subvol, &snapshot,
- (flags & BCH_CREATE_SNAPSHOT_RO) != 0);
- if (ret)
- goto err;
-
- new_inode->bi_parent_subvol = dir.subvol;
- new_inode->bi_subvol = new_subvol;
- new_inum.subvol = new_subvol;
- dir_target = new_subvol;
- dir_type = DT_SUBVOL;
-
- ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot);
- if (ret)
- goto err;
-
- bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot);
- ret = bch2_btree_iter_traverse(&dir_iter);
- if (ret)
- goto err;
- }
-
- if (!(flags & BCH_CREATE_SNAPSHOT)) {
- if (default_acl) {
- ret = bch2_set_acl_trans(trans, new_inum, new_inode,
- default_acl, ACL_TYPE_DEFAULT);
- if (ret)
- goto err;
- }
-
- if (acl) {
- ret = bch2_set_acl_trans(trans, new_inum, new_inode,
- acl, ACL_TYPE_ACCESS);
- if (ret)
- goto err;
- }
- }
-
- if (!(flags & BCH_CREATE_TMPFILE)) {
- struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u);
- u64 dir_offset;
-
- if (is_subdir_for_nlink(new_inode))
- dir_u->bi_nlink++;
- dir_u->bi_mtime = dir_u->bi_ctime = now;
-
- ret = bch2_dirent_create(trans, dir, &dir_hash,
- dir_type,
- name,
- dir_target,
- &dir_offset,
- &dir_u->bi_size,
- STR_HASH_must_create|BTREE_ITER_with_updates) ?:
- bch2_inode_write(trans, &dir_iter, dir_u);
- if (ret)
- goto err;
-
- new_inode->bi_dir = dir_u->bi_inum;
- new_inode->bi_dir_offset = dir_offset;
- }
-
- if (S_ISDIR(mode) &&
- !new_inode->bi_subvol)
- new_inode->bi_depth = dir_u->bi_depth + 1;
-
- inode_iter.flags &= ~BTREE_ITER_all_snapshots;
- bch2_btree_iter_set_snapshot(&inode_iter, snapshot);
-
- ret = bch2_btree_iter_traverse(&inode_iter) ?:
- bch2_inode_write(trans, &inode_iter, new_inode);
-err:
- bch2_trans_iter_exit(trans, &inode_iter);
- bch2_trans_iter_exit(trans, &dir_iter);
- return ret;
-}
-
-int bch2_link_trans(struct btree_trans *trans,
- subvol_inum dir, struct bch_inode_unpacked *dir_u,
- subvol_inum inum, struct bch_inode_unpacked *inode_u,
- const struct qstr *name)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter dir_iter = { NULL };
- struct btree_iter inode_iter = { NULL };
- struct bch_hash_info dir_hash;
- u64 now = bch2_current_time(c);
- u64 dir_offset = 0;
- int ret;
-
- if (dir.subvol != inum.subvol)
- return -EXDEV;
-
- ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent);
- if (ret)
- return ret;
-
- inode_u->bi_ctime = now;
- ret = bch2_inode_nlink_inc(inode_u);
- if (ret)
- goto err;
-
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
- if (ret)
- goto err;
-
- if (bch2_reinherit_attrs(inode_u, dir_u)) {
- ret = -EXDEV;
- goto err;
- }
-
- dir_u->bi_mtime = dir_u->bi_ctime = now;
-
- dir_hash = bch2_hash_info_init(c, dir_u);
-
- ret = bch2_dirent_create(trans, dir, &dir_hash,
- mode_to_type(inode_u->bi_mode),
- name, inum.inum,
- &dir_offset,
- &dir_u->bi_size,
- STR_HASH_must_create);
- if (ret)
- goto err;
-
- inode_u->bi_dir = dir.inum;
- inode_u->bi_dir_offset = dir_offset;
-
- ret = bch2_inode_write(trans, &dir_iter, dir_u) ?:
- bch2_inode_write(trans, &inode_iter, inode_u);
-err:
- bch2_trans_iter_exit(trans, &dir_iter);
- bch2_trans_iter_exit(trans, &inode_iter);
- return ret;
-}
-
-int bch2_unlink_trans(struct btree_trans *trans,
- subvol_inum dir,
- struct bch_inode_unpacked *dir_u,
- struct bch_inode_unpacked *inode_u,
- const struct qstr *name,
- bool deleting_subvol)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter dir_iter = { NULL };
- struct btree_iter dirent_iter = { NULL };
- struct btree_iter inode_iter = { NULL };
- struct bch_hash_info dir_hash;
- subvol_inum inum;
- u64 now = bch2_current_time(c);
- struct bkey_s_c k;
- int ret;
-
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
- if (ret)
- goto err;
-
- dir_hash = bch2_hash_info_init(c, dir_u);
-
- ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
- name, &inum, BTREE_ITER_intent);
- if (ret)
- goto err;
-
- ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum,
- BTREE_ITER_intent);
- if (ret)
- goto err;
-
- if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) {
- ret = bch2_empty_dir_trans(trans, inum);
- if (ret)
- goto err;
- }
-
- if (deleting_subvol && !inode_u->bi_subvol) {
- ret = -BCH_ERR_ENOENT_not_subvol;
- goto err;
- }
-
- if (inode_u->bi_subvol) {
- /* Recursive subvolume destroy not allowed (yet?) */
- ret = bch2_subvol_has_children(trans, inode_u->bi_subvol);
- if (ret)
- goto err;
- }
-
- if (deleting_subvol || inode_u->bi_subvol) {
- ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol);
- if (ret)
- goto err;
-
- k = bch2_btree_iter_peek_slot(&dirent_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- /*
- * If we're deleting a subvolume, we need to really delete the
- * dirent, not just emit a whiteout in the current snapshot:
- */
- bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot);
- ret = bch2_btree_iter_traverse(&dirent_iter);
- if (ret)
- goto err;
- } else {
- bch2_inode_nlink_dec(trans, inode_u);
- }
-
- if (inode_u->bi_dir == dirent_iter.pos.inode &&
- inode_u->bi_dir_offset == dirent_iter.pos.offset) {
- inode_u->bi_dir = 0;
- inode_u->bi_dir_offset = 0;
- }
-
- dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
- dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
-
- ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- &dir_hash, &dirent_iter,
- BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_inode_write(trans, &dir_iter, dir_u) ?:
- bch2_inode_write(trans, &inode_iter, inode_u);
-err:
- bch2_trans_iter_exit(trans, &inode_iter);
- bch2_trans_iter_exit(trans, &dirent_iter);
- bch2_trans_iter_exit(trans, &dir_iter);
- return ret;
-}
-
-bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
- struct bch_inode_unpacked *src_u)
-{
- u64 src, dst;
- unsigned id;
- bool ret = false;
-
- for (id = 0; id < Inode_opt_nr; id++) {
- /* Skip attributes that were explicitly set on this inode */
- if (dst_u->bi_fields_set & (1 << id))
- continue;
-
- src = bch2_inode_opt_get(src_u, id);
- dst = bch2_inode_opt_get(dst_u, id);
-
- if (src == dst)
- continue;
-
- bch2_inode_opt_set(dst_u, id, src);
- ret = true;
- }
-
- return ret;
-}
-
-static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent)
-{
- struct btree_iter iter;
- struct bkey_i_subvolume *s =
- bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_subvolumes, POS(0, subvol),
- BTREE_ITER_cached, subvolume);
- int ret = PTR_ERR_OR_ZERO(s);
- if (ret)
- return ret;
-
- s->v.fs_path_parent = cpu_to_le32(new_parent);
- bch2_trans_iter_exit(trans, &iter);
- return 0;
-}
-
-int bch2_rename_trans(struct btree_trans *trans,
- subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u,
- subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u,
- struct bch_inode_unpacked *src_inode_u,
- struct bch_inode_unpacked *dst_inode_u,
- const struct qstr *src_name,
- const struct qstr *dst_name,
- enum bch_rename_mode mode)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter src_dir_iter = { NULL };
- struct btree_iter dst_dir_iter = { NULL };
- struct btree_iter src_inode_iter = { NULL };
- struct btree_iter dst_inode_iter = { NULL };
- struct bch_hash_info src_hash, dst_hash;
- subvol_inum src_inum, dst_inum;
- u64 src_offset, dst_offset;
- u64 now = bch2_current_time(c);
- int ret;
-
- ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir,
- BTREE_ITER_intent);
- if (ret)
- goto err;
-
- src_hash = bch2_hash_info_init(c, src_dir_u);
-
- if (dst_dir.inum != src_dir.inum ||
- dst_dir.subvol != src_dir.subvol) {
- ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
- BTREE_ITER_intent);
- if (ret)
- goto err;
-
- dst_hash = bch2_hash_info_init(c, dst_dir_u);
- } else {
- dst_dir_u = src_dir_u;
- dst_hash = src_hash;
- }
-
- ret = bch2_dirent_rename(trans,
- src_dir, &src_hash, &src_dir_u->bi_size,
- dst_dir, &dst_hash, &dst_dir_u->bi_size,
- src_name, &src_inum, &src_offset,
- dst_name, &dst_inum, &dst_offset,
- mode);
- if (ret)
- goto err;
-
- ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum,
- BTREE_ITER_intent);
- if (ret)
- goto err;
-
- if (dst_inum.inum) {
- ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum,
- BTREE_ITER_intent);
- if (ret)
- goto err;
- }
-
- if (src_inode_u->bi_subvol &&
- dst_dir.subvol != src_inode_u->bi_parent_subvol) {
- ret = subvol_update_parent(trans, src_inode_u->bi_subvol, dst_dir.subvol);
- if (ret)
- goto err;
- }
-
- if (mode == BCH_RENAME_EXCHANGE &&
- dst_inode_u->bi_subvol &&
- src_dir.subvol != dst_inode_u->bi_parent_subvol) {
- ret = subvol_update_parent(trans, dst_inode_u->bi_subvol, src_dir.subvol);
- if (ret)
- goto err;
- }
-
- /* Can't move across subvolumes, unless it's a subvolume root: */
- if (src_dir.subvol != dst_dir.subvol &&
- (!src_inode_u->bi_subvol ||
- (dst_inum.inum && !dst_inode_u->bi_subvol))) {
- ret = -EXDEV;
- goto err;
- }
-
- if (src_inode_u->bi_parent_subvol)
- src_inode_u->bi_parent_subvol = dst_dir.subvol;
-
- if ((mode == BCH_RENAME_EXCHANGE) &&
- dst_inode_u->bi_parent_subvol)
- dst_inode_u->bi_parent_subvol = src_dir.subvol;
-
- src_inode_u->bi_dir = dst_dir_u->bi_inum;
- src_inode_u->bi_dir_offset = dst_offset;
-
- if (mode == BCH_RENAME_EXCHANGE) {
- dst_inode_u->bi_dir = src_dir_u->bi_inum;
- dst_inode_u->bi_dir_offset = src_offset;
- }
-
- if (mode == BCH_RENAME_OVERWRITE &&
- dst_inode_u->bi_dir == dst_dir_u->bi_inum &&
- dst_inode_u->bi_dir_offset == src_offset) {
- dst_inode_u->bi_dir = 0;
- dst_inode_u->bi_dir_offset = 0;
- }
-
- if (mode == BCH_RENAME_OVERWRITE) {
- if (S_ISDIR(src_inode_u->bi_mode) !=
- S_ISDIR(dst_inode_u->bi_mode)) {
- ret = -ENOTDIR;
- goto err;
- }
-
- if (S_ISDIR(dst_inode_u->bi_mode)) {
- ret = bch2_empty_dir_trans(trans, dst_inum);
- if (ret)
- goto err;
- }
- }
-
- if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) &&
- S_ISDIR(src_inode_u->bi_mode)) {
- ret = -EXDEV;
- goto err;
- }
-
- if (mode == BCH_RENAME_EXCHANGE &&
- bch2_reinherit_attrs(dst_inode_u, src_dir_u) &&
- S_ISDIR(dst_inode_u->bi_mode)) {
- ret = -EXDEV;
- goto err;
- }
-
- if (is_subdir_for_nlink(src_inode_u)) {
- src_dir_u->bi_nlink--;
- dst_dir_u->bi_nlink++;
- }
-
- if (S_ISDIR(src_inode_u->bi_mode) &&
- !src_inode_u->bi_subvol)
- src_inode_u->bi_depth = dst_dir_u->bi_depth + 1;
-
- if (mode == BCH_RENAME_EXCHANGE &&
- S_ISDIR(dst_inode_u->bi_mode) &&
- !dst_inode_u->bi_subvol)
- dst_inode_u->bi_depth = src_dir_u->bi_depth + 1;
-
- if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
- dst_dir_u->bi_nlink--;
- src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
- }
-
- if (mode == BCH_RENAME_OVERWRITE)
- bch2_inode_nlink_dec(trans, dst_inode_u);
-
- src_dir_u->bi_mtime = now;
- src_dir_u->bi_ctime = now;
-
- if (src_dir.inum != dst_dir.inum) {
- dst_dir_u->bi_mtime = now;
- dst_dir_u->bi_ctime = now;
- }
-
- src_inode_u->bi_ctime = now;
-
- if (dst_inum.inum)
- dst_inode_u->bi_ctime = now;
-
- ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?:
- (src_dir.inum != dst_dir.inum
- ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u)
- : 0) ?:
- bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?:
- (dst_inum.inum
- ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u)
- : 0);
-err:
- bch2_trans_iter_exit(trans, &dst_inode_iter);
- bch2_trans_iter_exit(trans, &src_inode_iter);
- bch2_trans_iter_exit(trans, &dst_dir_iter);
- bch2_trans_iter_exit(trans, &src_dir_iter);
- return ret;
-}
-
-/* inum_to_path */
-
-static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n)
-{
- bch2_printbuf_make_room(out, n);
-
- unsigned can_print = min(n, printbuf_remaining(out));
-
- b += n;
-
- for (unsigned i = 0; i < can_print; i++)
- out->buf[out->pos++] = *((char *) --b);
-
- printbuf_nul_terminate(out);
-}
-
-static inline void prt_str_reversed(struct printbuf *out, const char *s)
-{
- prt_bytes_reversed(out, s, strlen(s));
-}
-
-static inline void reverse_bytes(void *b, size_t n)
-{
- char *e = b + n, *s = b;
-
- while (s < e) {
- --e;
- swap(*s, *e);
- s++;
- }
-}
-
-/* XXX: we don't yet attempt to print paths when we don't know the subvol */
-int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printbuf *path)
-{
- unsigned orig_pos = path->pos;
- int ret = 0;
-
- while (!(inum.subvol == BCACHEFS_ROOT_SUBVOL &&
- inum.inum == BCACHEFS_ROOT_INO)) {
- struct bch_inode_unpacked inode;
- ret = bch2_inode_find_by_inum_trans(trans, inum, &inode);
- if (ret)
- goto disconnected;
-
- if (!inode.bi_dir && !inode.bi_dir_offset) {
- ret = -BCH_ERR_ENOENT_inode_no_backpointer;
- goto disconnected;
- }
-
- inum.subvol = inode.bi_parent_subvol ?: inum.subvol;
- inum.inum = inode.bi_dir;
-
- u32 snapshot;
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto disconnected;
-
- struct btree_iter d_iter;
- struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter,
- BTREE_ID_dirents, SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot),
- 0, dirent);
- ret = bkey_err(d.s_c);
- if (ret)
- goto disconnected;
-
- struct qstr dirent_name = bch2_dirent_get_name(d);
- prt_bytes_reversed(path, dirent_name.name, dirent_name.len);
-
- prt_char(path, '/');
-
- bch2_trans_iter_exit(trans, &d_iter);
- }
-
- if (orig_pos == path->pos)
- prt_char(path, '/');
-out:
- ret = path->allocation_failure ? -ENOMEM : 0;
- if (ret)
- goto err;
-
- reverse_bytes(path->buf + orig_pos, path->pos - orig_pos);
- return 0;
-err:
- return ret;
-disconnected:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto err;
-
- prt_str_reversed(path, "(disconnected)");
- goto out;
-}
-
-/* fsck */
-
-static int bch2_check_dirent_inode_dirent(struct btree_trans *trans,
- struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *target,
- bool in_fsck)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- struct btree_iter bp_iter = { NULL };
- int ret = 0;
-
- if (inode_points_to_dirent(target, d))
- return 0;
-
- if (!target->bi_dir &&
- !target->bi_dir_offset) {
- fsck_err_on(S_ISDIR(target->bi_mode),
- trans, inode_dir_missing_backpointer,
- "directory with missing backpointer\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, d.s_c),
- prt_printf(&buf, "\n"),
- bch2_inode_unpacked_to_text(&buf, target),
- buf.buf));
-
- fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
- trans, inode_unlinked_but_has_dirent,
- "inode unlinked but has dirent\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, d.s_c),
- prt_printf(&buf, "\n"),
- bch2_inode_unpacked_to_text(&buf, target),
- buf.buf));
-
- target->bi_flags &= ~BCH_INODE_unlinked;
- target->bi_dir = d.k->p.inode;
- target->bi_dir_offset = d.k->p.offset;
- return __bch2_fsck_write_inode(trans, target);
- }
-
- if (bch2_inode_should_have_single_bp(target) &&
- !fsck_err(trans, inode_wrong_backpointer,
- "dirent points to inode that does not point back:\n %s",
- (bch2_bkey_val_to_text(&buf, c, d.s_c),
- prt_printf(&buf, "\n "),
- bch2_inode_unpacked_to_text(&buf, target),
- buf.buf)))
- goto err;
-
- struct bkey_s_c_dirent bp_dirent =
- bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents,
- SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot),
- 0, dirent);
- ret = bkey_err(bp_dirent);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- bool backpointer_exists = !ret;
- ret = 0;
-
- if (!backpointer_exists) {
- if (fsck_err(trans, inode_wrong_backpointer,
- "inode %llu:%u has wrong backpointer:\n"
- "got %llu:%llu\n"
- "should be %llu:%llu",
- target->bi_inum, target->bi_snapshot,
- target->bi_dir,
- target->bi_dir_offset,
- d.k->p.inode,
- d.k->p.offset)) {
- target->bi_dir = d.k->p.inode;
- target->bi_dir_offset = d.k->p.offset;
- ret = __bch2_fsck_write_inode(trans, target);
- }
- } else {
- bch2_bkey_val_to_text(&buf, c, d.s_c);
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
-
- if (S_ISDIR(target->bi_mode) || target->bi_subvol) {
- /*
- * XXX: verify connectivity of the other dirent
- * up to the root before removing this one
- *
- * Additionally, bch2_lookup would need to cope with the
- * dirent it found being removed - or should we remove
- * the other one, even though the inode points to it?
- */
- if (in_fsck) {
- if (fsck_err(trans, inode_dir_multiple_links,
- "%s %llu:%u with multiple links\n%s",
- S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
- target->bi_inum, target->bi_snapshot, buf.buf))
- ret = bch2_fsck_remove_dirent(trans, d.k->p);
- } else {
- bch2_fs_inconsistent(c,
- "%s %llu:%u with multiple links\n%s",
- S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
- target->bi_inum, target->bi_snapshot, buf.buf);
- }
-
- goto out;
- } else {
- /*
- * hardlinked file with nlink 0:
-			 * We're just adjusting nlink here so that check_nlinks() will
-			 * pick it up; it ignores inodes with nlink 0
- */
- if (fsck_err_on(!target->bi_nlink,
- trans, inode_multiple_links_but_nlink_0,
- "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
- target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
- target->bi_nlink++;
- target->bi_flags &= ~BCH_INODE_unlinked;
- ret = __bch2_fsck_write_inode(trans, target);
- if (ret)
- goto err;
- }
- }
- }
-out:
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &bp_iter);
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-int __bch2_check_dirent_target(struct btree_trans *trans,
- struct btree_iter *dirent_iter,
- struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *target,
- bool in_fsck)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- ret = bch2_check_dirent_inode_dirent(trans, d, target, in_fsck);
- if (ret)
- goto err;
-
- if (fsck_err_on(d.v->d_type != inode_d_type(target),
- trans, dirent_d_type_wrong,
- "incorrect d_type: got %s, should be %s:\n%s",
- bch2_d_type_str(d.v->d_type),
- bch2_d_type_str(inode_d_type(target)),
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
- struct bkey_i_dirent *n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto err;
-
- bkey_reassemble(&n->k_i, d.s_c);
- n->v.d_type = inode_d_type(target);
- if (n->v.d_type == DT_SUBVOL) {
- n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
- n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
- } else {
- n->v.d_inum = cpu_to_le64(target->bi_inum);
- }
-
- ret = bch2_trans_update(trans, dirent_iter, &n->k_i, 0);
- if (ret)
- goto err;
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
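
One detail worth calling out from this file: bch2_inum_to_path() discovers path
components leaf-to-root by following each inode's bi_dir backpointer, so rather
than shifting the buffer on every step it appends each name byte-reversed and
reverses the whole buffer once at the end. A self-contained userspace sketch of
that trick (component names made up):

#include <stdio.h>
#include <string.h>

static void append_reversed(char *buf, size_t *pos, const char *s)
{
	/* mirrors prt_str_reversed(): copy the string back-to-front */
	for (size_t n = strlen(s); n--;)
		buf[(*pos)++] = s[n];
}

static void reverse_bytes(char *b, size_t n)
{
	for (char *e = b + n; b < --e; b++) {
		char tmp = *b;
		*b = *e;
		*e = tmp;
	}
}

int main(void)
{
	/* walk "file" -> "dir" -> "home", as if following bi_dir upwards */
	const char *components[] = { "file", "dir", "home" };
	char buf[64];
	size_t pos = 0;

	for (int i = 0; i < 3; i++) {
		append_reversed(buf, &pos, components[i]);
		buf[pos++] = '/';
	}

	reverse_bytes(buf, pos);
	buf[pos] = '\0';
	printf("%s\n", buf); /* prints /home/dir/file */
	return 0;
}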
diff --git a/fs/bcachefs/namei.h b/fs/bcachefs/namei.h
deleted file mode 100644
index 2e6f6364767f..000000000000
--- a/fs/bcachefs/namei.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_NAMEI_H
-#define _BCACHEFS_NAMEI_H
-
-#include "dirent.h"
-
-struct posix_acl;
-
-#define BCH_CREATE_TMPFILE (1U << 0)
-#define BCH_CREATE_SUBVOL (1U << 1)
-#define BCH_CREATE_SNAPSHOT (1U << 2)
-#define BCH_CREATE_SNAPSHOT_RO (1U << 3)
-
-int bch2_create_trans(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *,
- struct bch_inode_unpacked *,
- const struct qstr *,
- uid_t, gid_t, umode_t, dev_t,
- struct posix_acl *,
- struct posix_acl *,
- subvol_inum, unsigned);
-
-int bch2_link_trans(struct btree_trans *,
- subvol_inum, struct bch_inode_unpacked *,
- subvol_inum, struct bch_inode_unpacked *,
- const struct qstr *);
-
-int bch2_unlink_trans(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *,
- struct bch_inode_unpacked *,
- const struct qstr *, bool);
-
-int bch2_rename_trans(struct btree_trans *,
- subvol_inum, struct bch_inode_unpacked *,
- subvol_inum, struct bch_inode_unpacked *,
- struct bch_inode_unpacked *,
- struct bch_inode_unpacked *,
- const struct qstr *,
- const struct qstr *,
- enum bch_rename_mode);
-
-bool bch2_reinherit_attrs(struct bch_inode_unpacked *,
- struct bch_inode_unpacked *);
-
-int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *);
-
-int __bch2_check_dirent_target(struct btree_trans *,
- struct btree_iter *,
- struct bkey_s_c_dirent,
- struct bch_inode_unpacked *, bool);
-
-static inline bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
- struct bkey_s_c_dirent d)
-{
- return inode->bi_dir == d.k->p.inode &&
- inode->bi_dir_offset == d.k->p.offset;
-}
-
-static inline int bch2_check_dirent_target(struct btree_trans *trans,
- struct btree_iter *dirent_iter,
- struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *target,
- bool in_fsck)
-{
- if (likely(inode_points_to_dirent(target, d) &&
- d.v->d_type == inode_d_type(target)))
- return 0;
-
- return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck);
-}
-
-#endif /* _BCACHEFS_NAMEI_H */
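
The header splits bch2_check_dirent_target() into an inline fast path and an
out-of-line slow path: the common case (backpointer and d_type already
consistent) costs two compares, and only a mismatch calls into the repair code
in namei.c. A toy userspace illustration of the same split, with deliberately
simplified, hypothetical types:

#include <stdbool.h>
#include <stdio.h>

struct demo_inode  { unsigned long dir, dir_offset; };
struct demo_dirent { unsigned long inode, offset; };

static bool inode_points_to_dirent(const struct demo_inode *i,
				   const struct demo_dirent *d)
{
	return i->dir == d->inode && i->dir_offset == d->offset;
}

/* out-of-line repair path; stands in for __bch2_check_dirent_target() */
static int check_dirent_target_slowpath(struct demo_inode *i,
					const struct demo_dirent *d)
{
	printf("repairing backpointer %lu:%lu -> %lu:%lu\n",
	       i->dir, i->dir_offset, d->inode, d->offset);
	i->dir = d->inode;
	i->dir_offset = d->offset;
	return 0;
}

static inline int check_dirent_target(struct demo_inode *i,
				      const struct demo_dirent *d)
{
	if (inode_points_to_dirent(i, d)) /* the likely() fast path */
		return 0;
	return check_dirent_target_slowpath(i, d);
}

int main(void)
{
	struct demo_inode  i = { .dir = 1, .dir_offset = 2 };
	struct demo_dirent d = { .inode = 1, .offset = 3 };

	return check_dirent_target(&i, &d);
}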
diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c
deleted file mode 100644
index 3c21981a4a1c..000000000000
--- a/fs/bcachefs/nocow_locking.c
+++ /dev/null
@@ -1,144 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_methods.h"
-#include "nocow_locking.h"
-#include "util.h"
-
-#include <linux/closure.h>
-
-bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket)
-{
- u64 dev_bucket = bucket_to_u64(bucket);
- struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(l->b); i++)
- if (l->b[i] == dev_bucket && atomic_read(&l->l[i]))
- return true;
- return false;
-}
-
-#define sign(v) (v < 0 ? -1 : v > 0 ? 1 : 0)
-
-void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags)
-{
- u64 dev_bucket = bucket_to_u64(bucket);
- struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
- int lock_val = flags ? 1 : -1;
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(l->b); i++)
- if (l->b[i] == dev_bucket) {
- int v = atomic_sub_return(lock_val, &l->l[i]);
-
- BUG_ON(v && sign(v) != lock_val);
- if (!v)
- closure_wake_up(&l->wait);
- return;
- }
-
- BUG();
-}
-
-bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l,
- u64 dev_bucket, int flags)
-{
- int v, lock_val = flags ? 1 : -1;
- unsigned i;
-
- spin_lock(&l->lock);
-
- for (i = 0; i < ARRAY_SIZE(l->b); i++)
- if (l->b[i] == dev_bucket)
- goto got_entry;
-
- for (i = 0; i < ARRAY_SIZE(l->b); i++)
- if (!atomic_read(&l->l[i])) {
- l->b[i] = dev_bucket;
- goto take_lock;
- }
-fail:
- spin_unlock(&l->lock);
- return false;
-got_entry:
- v = atomic_read(&l->l[i]);
- if (lock_val > 0 ? v < 0 : v > 0)
- goto fail;
-take_lock:
- v = atomic_read(&l->l[i]);
- /* Overflow? */
- if (v && sign(v + lock_val) != sign(v))
- goto fail;
-
- atomic_add(lock_val, &l->l[i]);
- spin_unlock(&l->lock);
- return true;
-}
-
-void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
- struct nocow_lock_bucket *l,
- u64 dev_bucket, int flags)
-{
- if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) {
- struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks);
- u64 start_time = local_clock();
-
- __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags));
- bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
- }
-}
-
-void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t)
-{
- unsigned i, nr_zero = 0;
- struct nocow_lock_bucket *l;
-
- for (l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) {
- unsigned v = 0;
-
- for (i = 0; i < ARRAY_SIZE(l->l); i++)
- v |= atomic_read(&l->l[i]);
-
- if (!v) {
- nr_zero++;
- continue;
- }
-
- if (nr_zero)
- prt_printf(out, "(%u empty entries)\n", nr_zero);
- nr_zero = 0;
-
- for (i = 0; i < ARRAY_SIZE(l->l); i++) {
- int v = atomic_read(&l->l[i]);
- if (v) {
- bch2_bpos_to_text(out, u64_to_bucket(l->b[i]));
- prt_printf(out, ": %s %u ", v < 0 ? "copy" : "update", abs(v));
- }
- }
- prt_newline(out);
- }
-
- if (nr_zero)
- prt_printf(out, "(%u empty entries)\n", nr_zero);
-}
-
-void bch2_fs_nocow_locking_exit(struct bch_fs *c)
-{
- struct bucket_nocow_lock_table *t = &c->nocow_locks;
-
- for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
- for (unsigned j = 0; j < ARRAY_SIZE(l->l); j++)
- BUG_ON(atomic_read(&l->l[j]));
-}
-
-int bch2_fs_nocow_locking_init(struct bch_fs *c)
-{
- struct bucket_nocow_lock_table *t = &c->nocow_locks;
-
- for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
- spin_lock_init(&l->lock);
-
- return 0;
-}
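
The counters above pack a two-class shared lock into a single atomic int:
positive values count holders locking the bucket for update, negative values
count holders locking it for copying, so the two classes exclude each other
while each stays shared within itself. A userspace sketch of the trylock
decision; note the kernel version runs it under l->lock, so this lock-free
rendition is illustrative only and not race-free:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define sign(v) ((v) < 0 ? -1 : (v) > 0 ? 1 : 0)

static bool nocow_trylock(atomic_int *l, bool update)
{
	int lock_val = update ? 1 : -1;
	int v = atomic_load(l);

	/* held by the other class? */
	if (lock_val > 0 ? v < 0 : v > 0)
		return false;

	/* would the counter overflow past the sign boundary? */
	if (v && sign(v + lock_val) != sign(v))
		return false;

	atomic_fetch_add(l, lock_val);
	return true;
}

int main(void)
{
	atomic_int l = 0;

	printf("copy lock:   %d\n", nocow_trylock(&l, false)); /* 1: granted */
	printf("copy lock:   %d\n", nocow_trylock(&l, false)); /* 1: shared */
	printf("update lock: %d\n", nocow_trylock(&l, true));  /* 0: excluded */
	return 0;
}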
diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h
deleted file mode 100644
index f9d6a426a960..000000000000
--- a/fs/bcachefs/nocow_locking.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_NOCOW_LOCKING_H
-#define _BCACHEFS_NOCOW_LOCKING_H
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "nocow_locking_types.h"
-
-#include <linux/hash.h>
-
-static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lock_table *t,
- u64 dev_bucket)
-{
- unsigned h = hash_64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS);
-
- return t->l + (h & (BUCKET_NOCOW_LOCKS - 1));
-}
-
-#define BUCKET_NOCOW_LOCK_UPDATE (1 << 0)
-
-bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos);
-void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int);
-bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int);
-void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *,
- struct nocow_lock_bucket *, u64, int);
-
-static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
- struct bpos bucket, int flags)
-{
- u64 dev_bucket = bucket_to_u64(bucket);
- struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
-
- __bch2_bucket_nocow_lock(t, l, dev_bucket, flags);
-}
-
-static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t,
- struct bpos bucket, int flags)
-{
- u64 dev_bucket = bucket_to_u64(bucket);
- struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
-
- return __bch2_bucket_nocow_trylock(l, dev_bucket, flags);
-}
-
-void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *);
-
-void bch2_fs_nocow_locking_exit(struct bch_fs *);
-int bch2_fs_nocow_locking_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_NOCOW_LOCKING_H */
diff --git a/fs/bcachefs/nocow_locking_types.h b/fs/bcachefs/nocow_locking_types.h
deleted file mode 100644
index bd12bf677924..000000000000
--- a/fs/bcachefs/nocow_locking_types.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_NOCOW_LOCKING_TYPES_H
-#define _BCACHEFS_NOCOW_LOCKING_TYPES_H
-
-#define BUCKET_NOCOW_LOCKS_BITS 10
-#define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS)
-
-struct nocow_lock_bucket {
- struct closure_waitlist wait;
- spinlock_t lock;
- u64 b[4];
- atomic_t l[4];
-} __aligned(SMP_CACHE_BYTES);
-
-struct bucket_nocow_lock_table {
- struct nocow_lock_bucket l[BUCKET_NOCOW_LOCKS];
-};
-
-#endif /* _BCACHEFS_NOCOW_LOCKING_TYPES_H */
-
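
Lookups map a bucket onto one of the BUCKET_NOCOW_LOCKS slots above by hashing
the u64 that bucket_to_u64() packs the device index and bucket number into,
then masking down to the table size; each slot then acts as a small 4-way
set-associative entry. A userspace sketch of the index computation, where
hash_u64() mimics the kernel's hash_64() (multiply by the 64-bit golden ratio,
keep the top bits) and the key packing is made up:

#include <stdint.h>
#include <stdio.h>

#define BUCKET_NOCOW_LOCKS_BITS 10
#define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS)

static uint64_t hash_u64(uint64_t val, unsigned bits)
{
	return (val * 0x61C8864680B583EBULL) >> (64 - bits);
}

int main(void)
{
	uint64_t dev_bucket = ((uint64_t)3 << 32) | 12345; /* made-up packing */
	unsigned slot = hash_u64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS)
			& (BUCKET_NOCOW_LOCKS - 1);

	printf("dev_bucket %#llx -> slot %u of %u\n",
	       (unsigned long long)dev_bucket, slot, BUCKET_NOCOW_LOCKS);
	return 0;
}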
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
deleted file mode 100644
index 81fd6b7977d3..000000000000
--- a/fs/bcachefs/opts.c
+++ /dev/null
@@ -1,737 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/kernel.h>
-#include <linux/fs_parser.h>
-
-#include "bcachefs.h"
-#include "compress.h"
-#include "disk_groups.h"
-#include "error.h"
-#include "opts.h"
-#include "recovery_passes.h"
-#include "super-io.h"
-#include "util.h"
-
-#define x(t, n, ...) [n] = #t,
-
-const char * const bch2_error_actions[] = {
- BCH_ERROR_ACTIONS()
- NULL
-};
-
-const char * const bch2_fsck_fix_opts[] = {
- BCH_FIX_ERRORS_OPTS()
- NULL
-};
-
-const char * const bch2_version_upgrade_opts[] = {
- BCH_VERSION_UPGRADE_OPTS()
- NULL
-};
-
-const char * const bch2_sb_features[] = {
- BCH_SB_FEATURES()
- NULL
-};
-
-const char * const bch2_sb_compat[] = {
- BCH_SB_COMPAT()
- NULL
-};
-
-const char * const __bch2_btree_ids[] = {
- BCH_BTREE_IDS()
- NULL
-};
-
-static const char * const __bch2_csum_types[] = {
- BCH_CSUM_TYPES()
- NULL
-};
-
-const char * const __bch2_csum_opts[] = {
- BCH_CSUM_OPTS()
- NULL
-};
-
-const char * const __bch2_compression_types[] = {
- BCH_COMPRESSION_TYPES()
- NULL
-};
-
-const char * const bch2_compression_opts[] = {
- BCH_COMPRESSION_OPTS()
- NULL
-};
-
-const char * const __bch2_str_hash_types[] = {
- BCH_STR_HASH_TYPES()
- NULL
-};
-
-const char * const bch2_str_hash_opts[] = {
- BCH_STR_HASH_OPTS()
- NULL
-};
-
-const char * const __bch2_data_types[] = {
- BCH_DATA_TYPES()
- NULL
-};
-
-const char * const bch2_member_states[] = {
- BCH_MEMBER_STATES()
- NULL
-};
-
-static const char * const __bch2_jset_entry_types[] = {
- BCH_JSET_ENTRY_TYPES()
- NULL
-};
-
-static const char * const __bch2_fs_usage_types[] = {
- BCH_FS_USAGE_TYPES()
- NULL
-};
-
-#undef x
-
-static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[],
- unsigned nr, const char *type, unsigned idx)
-{
- if (idx < nr)
- prt_str(out, opts[idx]);
- else
- prt_printf(out, "(unknown %s %u)", type, idx);
-}
-
-#define PRT_STR_OPT_BOUNDSCHECKED(name, type) \
-void bch2_prt_##name(struct printbuf *out, type t) \
-{ \
- prt_str_opt_boundscheck(out, __bch2_##name##s, ARRAY_SIZE(__bch2_##name##s) - 1, #name, t);\
-}
-
-PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type);
-PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type);
-PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type);
-PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt);
-PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type);
-PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type);
-PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type);
-
-static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
- struct printbuf *err)
-{
- if (!val) {
- *res = FSCK_FIX_yes;
- } else {
- int ret = match_string(bch2_fsck_fix_opts, -1, val);
-
- if (ret < 0 && err)
- prt_str(err, "fix_errors: invalid selection");
- if (ret < 0)
- return ret;
- *res = ret;
- }
-
- return 0;
-}
-
-static void bch2_opt_fix_errors_to_text(struct printbuf *out,
- struct bch_fs *c,
- struct bch_sb *sb,
- u64 v)
-{
- prt_str(out, bch2_fsck_fix_opts[v]);
-}
-
-#define bch2_opt_fix_errors (struct bch_opt_fn) { \
- .parse = bch2_opt_fix_errors_parse, \
- .to_text = bch2_opt_fix_errors_to_text, \
-}
-
-const char * const bch2_d_types[BCH_DT_MAX] = {
- [DT_UNKNOWN] = "unknown",
- [DT_FIFO] = "fifo",
- [DT_CHR] = "chr",
- [DT_DIR] = "dir",
- [DT_BLK] = "blk",
- [DT_REG] = "reg",
- [DT_LNK] = "lnk",
- [DT_SOCK] = "sock",
- [DT_WHT] = "whiteout",
- [DT_SUBVOL] = "subvol",
-};
-
-void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
-{
-#define x(_name, ...) \
- if (opt_defined(src, _name)) \
- opt_set(*dst, _name, src._name);
-
- BCH_OPTS()
-#undef x
-}
-
-bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
-{
- switch (id) {
-#define x(_name, ...) \
- case Opt_##_name: \
- return opt_defined(*opts, _name);
- BCH_OPTS()
-#undef x
- default:
- BUG();
- }
-}
-
-u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
-{
- switch (id) {
-#define x(_name, ...) \
- case Opt_##_name: \
- return opts->_name;
- BCH_OPTS()
-#undef x
- default:
- BUG();
- }
-}
-
-void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
-{
- switch (id) {
-#define x(_name, ...) \
- case Opt_##_name: \
- opt_set(*opts, _name, v); \
- break;
- BCH_OPTS()
-#undef x
- default:
- BUG();
- }
-}
-
-/* dummy option, for options that aren't stored in the superblock */
-typedef u64 (*sb_opt_get_fn)(const struct bch_sb *);
-typedef void (*sb_opt_set_fn)(struct bch_sb *, u64);
-typedef u64 (*member_opt_get_fn)(const struct bch_member *);
-typedef void (*member_opt_set_fn)(struct bch_member *, u64);
-
-__maybe_unused static const sb_opt_get_fn BCH2_NO_SB_OPT = NULL;
-__maybe_unused static const sb_opt_set_fn SET_BCH2_NO_SB_OPT = NULL;
-__maybe_unused static const member_opt_get_fn BCH2_NO_MEMBER_OPT = NULL;
-__maybe_unused static const member_opt_set_fn SET_BCH2_NO_MEMBER_OPT = NULL;
-
-#define type_compatible_or_null(_p, _type) \
- __builtin_choose_expr( \
- __builtin_types_compatible_p(typeof(_p), typeof(_type)), _p, NULL)
-
-const struct bch_option bch2_opt_table[] = {
-#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2
-#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \
- .min = _min, .max = _max
-#define OPT_STR(_choices) .type = BCH_OPT_STR, \
- .min = 0, .max = ARRAY_SIZE(_choices) - 1, \
- .choices = _choices
-#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \
- .min = 0, .max = U64_MAX, \
- .choices = _choices
-#define OPT_BITFIELD(_choices) .type = BCH_OPT_BITFIELD, \
- .choices = _choices
-#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn
-
-#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \
- [Opt_##_name] = { \
- .attr.name = #_name, \
- .attr.mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \
- .flags = _flags, \
- .hint = _hint, \
- .help = _help, \
- .get_sb = type_compatible_or_null(_sb_opt, *BCH2_NO_SB_OPT), \
- .set_sb = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_SB_OPT), \
- .get_member = type_compatible_or_null(_sb_opt, *BCH2_NO_MEMBER_OPT), \
- .set_member = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_MEMBER_OPT),\
- _type \
- },
-
- BCH_OPTS()
-#undef x
-};
-
-int bch2_opt_lookup(const char *name)
-{
- const struct bch_option *i;
-
- for (i = bch2_opt_table;
- i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table);
- i++)
- if (!strcmp(name, i->attr.name))
- return i - bch2_opt_table;
-
- return -1;
-}
-
-struct synonym {
- const char *s1, *s2;
-};
-
-static const struct synonym bch_opt_synonyms[] = {
- { "quota", "usrquota" },
-};
-
-static int bch2_mount_opt_lookup(const char *name)
-{
- const struct synonym *i;
-
- for (i = bch_opt_synonyms;
- i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms);
- i++)
- if (!strcmp(name, i->s1))
- name = i->s2;
-
- return bch2_opt_lookup(name);
-}
-
-int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
-{
- if (v < opt->min) {
- if (err)
- prt_printf(err, "%s: too small (min %llu)",
- opt->attr.name, opt->min);
- return -BCH_ERR_ERANGE_option_too_small;
- }
-
- if (opt->max && v >= opt->max) {
- if (err)
- prt_printf(err, "%s: too big (max %llu)",
- opt->attr.name, opt->max);
- return -BCH_ERR_ERANGE_option_too_big;
- }
-
- if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) {
- if (err)
- prt_printf(err, "%s: not a multiple of 512",
- opt->attr.name);
- return -BCH_ERR_opt_parse_error;
- }
-
- if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) {
- if (err)
- prt_printf(err, "%s: must be a power of two",
- opt->attr.name);
- return -BCH_ERR_opt_parse_error;
- }
-
- if (opt->fn.validate)
- return opt->fn.validate(v, err);
-
- return 0;
-}
-
-int bch2_opt_parse(struct bch_fs *c,
- const struct bch_option *opt,
- const char *val, u64 *res,
- struct printbuf *err)
-{
- ssize_t ret;
-
- switch (opt->type) {
- case BCH_OPT_BOOL:
- if (val) {
- ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool);
- if (ret != -BCH_ERR_option_not_bool) {
- *res = ret;
- } else {
- if (err)
- prt_printf(err, "%s: must be bool", opt->attr.name);
- return ret;
- }
- } else {
- *res = 1;
- }
-
- break;
- case BCH_OPT_UINT:
- if (!val) {
- prt_printf(err, "%s: required value",
- opt->attr.name);
- return -EINVAL;
- }
-
- ret = opt->flags & OPT_HUMAN_READABLE
- ? bch2_strtou64_h(val, res)
- : kstrtou64(val, 10, res);
- if (ret < 0) {
- if (err)
- prt_printf(err, "%s: must be a number",
- opt->attr.name);
- return ret;
- }
- break;
- case BCH_OPT_STR:
- if (!val) {
- prt_printf(err, "%s: required value",
- opt->attr.name);
- return -EINVAL;
- }
-
- ret = match_string(opt->choices, -1, val);
- if (ret < 0) {
- if (err)
- prt_printf(err, "%s: invalid selection",
- opt->attr.name);
- return ret;
- }
-
- *res = ret;
- break;
- case BCH_OPT_BITFIELD: {
- s64 v = bch2_read_flag_list(val, opt->choices);
- if (v < 0)
- return v;
- *res = v;
- break;
- }
- case BCH_OPT_FN:
- ret = opt->fn.parse(c, val, res, err);
-
- if (ret == -BCH_ERR_option_needs_open_fs)
- return ret;
-
- if (ret < 0) {
- if (err)
- prt_printf(err, "%s: parse error",
- opt->attr.name);
- return ret;
- }
- }
-
- return bch2_opt_validate(opt, *res, err);
-}
-
-void bch2_opt_to_text(struct printbuf *out,
- struct bch_fs *c, struct bch_sb *sb,
- const struct bch_option *opt, u64 v,
- unsigned flags)
-{
- if (flags & OPT_SHOW_MOUNT_STYLE) {
- if (opt->type == BCH_OPT_BOOL) {
- prt_printf(out, "%s%s",
- v ? "" : "no",
- opt->attr.name);
- return;
- }
-
- prt_printf(out, "%s=", opt->attr.name);
- }
-
- switch (opt->type) {
- case BCH_OPT_BOOL:
- case BCH_OPT_UINT:
- if (opt->flags & OPT_HUMAN_READABLE)
- prt_human_readable_u64(out, v);
- else
- prt_printf(out, "%lli", v);
- break;
- case BCH_OPT_STR:
- if (v < opt->min || v >= opt->max)
- prt_printf(out, "(invalid option %lli)", v);
- else if (flags & OPT_SHOW_FULL_LIST)
- prt_string_option(out, opt->choices, v);
- else
- prt_str(out, opt->choices[v]);
- break;
- case BCH_OPT_BITFIELD:
- prt_bitflags(out, opt->choices, v);
- break;
- case BCH_OPT_FN:
- opt->fn.to_text(out, c, sb, v);
- break;
- default:
- BUG();
- }
-}
-
-void bch2_opts_to_text(struct printbuf *out,
- struct bch_opts opts,
- struct bch_fs *c, struct bch_sb *sb,
- unsigned show_mask, unsigned hide_mask,
- unsigned flags)
-{
- bool first = true;
-
- for (enum bch_opt_id i = 0; i < bch2_opts_nr; i++) {
- const struct bch_option *opt = &bch2_opt_table[i];
-
- if ((opt->flags & hide_mask) || !(opt->flags & show_mask))
- continue;
-
- u64 v = bch2_opt_get_by_id(&opts, i);
- if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
- continue;
-
- if (!first)
- prt_char(out, ',');
- first = false;
-
- bch2_opt_to_text(out, c, sb, opt, v, flags);
- }
-}
-
-int bch2_opt_check_may_set(struct bch_fs *c, struct bch_dev *ca, int id, u64 v)
-{
- lockdep_assert_held(&c->state_lock);
-
- int ret = 0;
-
- switch (id) {
- case Opt_state:
- if (ca)
- return __bch2_dev_set_state(c, ca, v, BCH_FORCE_IF_DEGRADED);
- break;
-
- case Opt_compression:
- case Opt_background_compression:
- ret = bch2_check_set_has_compressed_data(c, v);
- break;
- case Opt_erasure_code:
- if (v)
- bch2_check_set_feature(c, BCH_FEATURE_ec);
- break;
- }
-
- return ret;
-}
-
-int bch2_opts_check_may_set(struct bch_fs *c)
-{
- for (unsigned i = 0; i < bch2_opts_nr; i++) {
- int ret = bch2_opt_check_may_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i));
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
- struct printbuf *parse_later,
- const char *name, const char *val)
-{
- struct printbuf err = PRINTBUF;
- u64 v;
- int ret, id;
-
- id = bch2_mount_opt_lookup(name);
-
- /* Check for the form "noopt", negation of a boolean opt: */
- if (id < 0 &&
- !val &&
- !strncmp("no", name, 2)) {
- id = bch2_mount_opt_lookup(name + 2);
- val = "0";
- }
-
- /* Unknown options are ignored: */
- if (id < 0)
- return 0;
-
- if (!(bch2_opt_table[id].flags & OPT_MOUNT))
- goto bad_opt;
-
- if (id == Opt_acl &&
- !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
- goto bad_opt;
-
- if ((id == Opt_usrquota ||
- id == Opt_grpquota) &&
- !IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
- goto bad_opt;
-
- ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
- if (ret == -BCH_ERR_option_needs_open_fs && parse_later) {
- prt_printf(parse_later, "%s=%s,", name, val);
- if (parse_later->allocation_failure) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = 0;
- goto out;
- }
-
- if (ret < 0)
- goto bad_val;
-
- if (opts)
- bch2_opt_set_by_id(opts, id, v);
-
- ret = 0;
- goto out;
-
-bad_opt:
- pr_err("Bad mount option %s", name);
- ret = -BCH_ERR_option_name;
- goto out;
-
-bad_val:
- pr_err("Invalid mount option %s", err.buf);
- ret = -BCH_ERR_option_value;
-
-out:
- printbuf_exit(&err);
- return ret;
-}
-
-int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
- struct printbuf *parse_later, char *options)
-{
- char *copied_opts, *copied_opts_start;
- char *opt, *name, *val;
- int ret;
-
- if (!options)
- return 0;
-
- /*
- * sys_fsconfig() is now occasionally providing us with option lists
- * starting with a comma - weird.
- */
- if (*options == ',')
- options++;
-
- copied_opts = kstrdup(options, GFP_KERNEL);
- if (!copied_opts)
- return -ENOMEM;
- copied_opts_start = copied_opts;
-
- while ((opt = strsep(&copied_opts, ",")) != NULL) {
- if (!*opt)
- continue;
-
- name = strsep(&opt, "=");
- val = opt;
-
- ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val);
- if (ret < 0)
- goto out;
- }
-
-	ret = 0;
-
-out:
- kfree(copied_opts_start);
- return ret;
-}
-
-u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id, int dev_idx)
-{
- const struct bch_option *opt = bch2_opt_table + id;
- u64 v;
-
- if (dev_idx < 0) {
- v = opt->get_sb(sb);
- } else {
- if (WARN(!bch2_member_exists(sb, dev_idx),
- "tried to set device option %s on nonexistent device %i",
- opt->attr.name, dev_idx))
- return 0;
-
- struct bch_member m = bch2_sb_member_get(sb, dev_idx);
- v = opt->get_member(&m);
- }
-
- if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
- --v;
-
- if (opt->flags & OPT_SB_FIELD_ILOG2)
- v = 1ULL << v;
-
- if (opt->flags & OPT_SB_FIELD_SECTORS)
- v <<= 9;
-
- return v;
-}
-
-/*
- * Initial options from superblock - here we don't want any options undefined;
- * any options the superblock doesn't specify are set to 0:
- */
-int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
-{
- for (unsigned id = 0; id < bch2_opts_nr; id++) {
- const struct bch_option *opt = bch2_opt_table + id;
-
- if (opt->get_sb)
- bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id, -1));
- }
-
- return 0;
-}
-
-void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
- const struct bch_option *opt, u64 v)
-{
- if (opt->flags & OPT_SB_FIELD_SECTORS)
- v >>= 9;
-
- if (opt->flags & OPT_SB_FIELD_ILOG2)
- v = ilog2(v);
-
- if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
- v++;
-
- if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0)
- opt->set_sb(sb, v);
-
- if ((opt->flags & OPT_DEVICE) && opt->set_member && dev_idx >= 0) {
- if (WARN(!bch2_member_exists(sb, dev_idx),
- "tried to set device option %s on nonexistent device %i",
- opt->attr.name, dev_idx))
- return;
-
- opt->set_member(bch2_members_v2_get_mut(sb, dev_idx), v);
- }
-}
-
-void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca,
- const struct bch_option *opt, u64 v)
-{
- mutex_lock(&c->sb_lock);
- __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-}
-
-/* io opts: */
-
-struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
-{
- struct bch_io_opts opts = {
-#define x(_name, _bits) ._name = src._name,
- BCH_INODE_OPTS()
-#undef x
- };
-
- bch2_io_opts_fixups(&opts);
- return opts;
-}
-
-bool bch2_opt_is_inode_opt(enum bch_opt_id id)
-{
- static const enum bch_opt_id inode_opt_list[] = {
-#define x(_name, _bits) Opt_##_name,
- BCH_INODE_OPTS()
-#undef x
- };
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++)
- if (inode_opt_list[i] == id)
- return true;
-
- return false;
-}
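
Almost everything in this file is generated from a single list macro: each x()
entry in BCH_OPTS() expands into an enum constant, a bch2_opt_table[] slot, and
a switch case, so the tables and accessors cannot drift out of sync with the
option list. A minimal userspace sketch of the same x-macro pattern, using a
made-up option list:

#include <stdio.h>
#include <string.h>

#define DEMO_OPTS()	\
	x(verbose)	\
	x(readonly)	\
	x(compress)

enum demo_opt_id {
#define x(_name) Opt_##_name,
	DEMO_OPTS()
#undef x
	demo_opts_nr,
};

static const char * const demo_opt_names[] = {
#define x(_name) #_name,
	DEMO_OPTS()
#undef x
};

/* mirrors bch2_opt_lookup(): name -> id via the generated table */
static int demo_opt_lookup(const char *name)
{
	for (int i = 0; i < demo_opts_nr; i++)
		if (!strcmp(name, demo_opt_names[i]))
			return i;
	return -1;
}

int main(void)
{
	printf("readonly -> id %d\n", demo_opt_lookup("readonly"));
	printf("bogus    -> id %d\n", demo_opt_lookup("bogus"));
	return 0;
}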
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
deleted file mode 100644
index bb621804d45a..000000000000
--- a/fs/bcachefs/opts.h
+++ /dev/null
@@ -1,667 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_OPTS_H
-#define _BCACHEFS_OPTS_H
-
-#include <linux/bug.h>
-#include <linux/log2.h>
-#include <linux/string.h>
-#include <linux/sysfs.h>
-#include "bcachefs_format.h"
-
-struct bch_fs;
-
-extern const char * const bch2_error_actions[];
-extern const char * const bch2_fsck_fix_opts[];
-extern const char * const bch2_version_upgrade_opts[];
-extern const char * const bch2_sb_features[];
-extern const char * const bch2_sb_compat[];
-extern const char * const __bch2_btree_ids[];
-extern const char * const __bch2_csum_opts[];
-extern const char * const __bch2_compression_types[];
-extern const char * const bch2_compression_opts[];
-extern const char * const __bch2_str_hash_types[];
-extern const char * const bch2_str_hash_opts[];
-extern const char * const __bch2_data_types[];
-extern const char * const bch2_member_states[];
-extern const char * const bch2_d_types[];
-
-void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type);
-void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type);
-void bch2_prt_data_type(struct printbuf *, enum bch_data_type);
-void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt);
-void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type);
-void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type);
-void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type);
-
-static inline const char *bch2_d_type_str(unsigned d_type)
-{
- return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)";
-}
-
-/*
- * Mount options; we also store defaults in the superblock.
- *
- * Also exposed via sysfs: if an option is writeable, and it's also stored in
- * the superblock, changing it via sysfs (currently? might change this) also
- * updates the superblock.
- *
- * We store options as signed integers, where -1 means undefined. This means we
- * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only
- * apply the options from that struct that are defined.
- */
-
-/* When can be set: */
-enum opt_flags {
- OPT_FS = BIT(0), /* Filesystem option */
- OPT_DEVICE = BIT(1), /* Device option */
- OPT_INODE = BIT(2), /* Inode option */
- OPT_FORMAT = BIT(3), /* May be specified at format time */
- OPT_MOUNT = BIT(4), /* May be specified at mount time */
- OPT_RUNTIME = BIT(5), /* May be specified at runtime */
- OPT_HUMAN_READABLE = BIT(6),
- OPT_MUST_BE_POW_2 = BIT(7), /* Must be power of 2 */
- OPT_SB_FIELD_SECTORS = BIT(8), /* Superblock field is >> 9 of actual value */
- OPT_SB_FIELD_ILOG2 = BIT(9), /* Superblock field is ilog2 of actual value */
- OPT_SB_FIELD_ONE_BIAS = BIT(10), /* 0 means default value */
- OPT_HIDDEN = BIT(11),
-};
-
-enum opt_type {
- BCH_OPT_BOOL,
- BCH_OPT_UINT,
- BCH_OPT_STR,
- BCH_OPT_BITFIELD,
- BCH_OPT_FN,
-};
-
-struct bch_opt_fn {
- int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *);
- void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
- int (*validate)(u64, struct printbuf *);
-};
-
-/**
- * x(name, shortopt, type, in mem type, mode, sb_opt)
- *
- * @name - name of mount option, sysfs attribute, and struct bch_opts
- * member
- *
- * @mode - when opt may be set
- *
- * @sb_option - name of corresponding superblock option
- *
- * @type - one of OPT_BOOL, OPT_UINT, OPT_STR
- */
-
-/*
- * XXX: add fields for
- * - default value
- * - helptext
- */
-
-#ifdef __KERNEL__
-#define RATELIMIT_ERRORS_DEFAULT true
-#else
-#define RATELIMIT_ERRORS_DEFAULT false
-#endif
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define BCACHEFS_VERBOSE_DEFAULT true
-#else
-#define BCACHEFS_VERBOSE_DEFAULT false
-#endif
-
-#define BCH_FIX_ERRORS_OPTS() \
- x(exit, 0) \
- x(yes, 1) \
- x(no, 2) \
- x(ask, 3)
-
-enum fsck_err_opts {
-#define x(t, n) FSCK_FIX_##t,
- BCH_FIX_ERRORS_OPTS()
-#undef x
-};
-
-#define BCH_OPTS() \
- x(block_size, u16, \
- OPT_FS|OPT_FORMAT| \
- OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
- OPT_UINT(512, 1U << 16), \
- BCH_SB_BLOCK_SIZE, 4 << 10, \
- "size", NULL) \
- x(btree_node_size, u32, \
- OPT_FS|OPT_FORMAT| \
- OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
- OPT_UINT(512, 1U << 20), \
- BCH_SB_BTREE_NODE_SIZE, 256 << 10, \
- "size", "Btree node size, default 256k") \
- x(errors, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_STR(bch2_error_actions), \
- BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \
- NULL, "Action to take on filesystem error") \
- x(write_error_timeout, u16, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(1, 300), \
- BCH_SB_WRITE_ERROR_TIMEOUT, 30, \
- NULL, "Number of consecutive write errors allowed before kicking out a device")\
- x(metadata_replicas, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(1, BCH_REPLICAS_MAX), \
- BCH_SB_META_REPLICAS_WANT, 1, \
- "#", "Number of metadata replicas") \
- x(data_replicas, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(1, BCH_REPLICAS_MAX), \
- BCH_SB_DATA_REPLICAS_WANT, 1, \
- "#", "Number of data replicas") \
- x(metadata_replicas_required, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_UINT(1, BCH_REPLICAS_MAX), \
- BCH_SB_META_REPLICAS_REQ, 1, \
- "#", NULL) \
- x(data_replicas_required, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_UINT(1, BCH_REPLICAS_MAX), \
- BCH_SB_DATA_REPLICAS_REQ, 1, \
- "#", NULL) \
- x(encoded_extent_max, u32, \
- OPT_FS|OPT_FORMAT| \
- OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\
- OPT_UINT(4096, 2U << 20), \
- BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \
- "size", "Maximum size of checksummed/compressed extents")\
- x(metadata_checksum, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_STR(__bch2_csum_opts), \
- BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
- NULL, NULL) \
- x(data_checksum, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_STR(__bch2_csum_opts), \
- BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
- NULL, NULL) \
- x(checksum_err_retry_nr, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(0, 32), \
- BCH_SB_CSUM_ERR_RETRY_NR, 3, \
- NULL, NULL) \
- x(compression, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_FN(bch2_opt_compression), \
- BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \
- NULL, NULL) \
- x(background_compression, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_FN(bch2_opt_compression), \
- BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \
- NULL, NULL) \
- x(str_hash, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_STR(bch2_str_hash_opts), \
- BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \
- NULL, "Hash function for directory entries and xattrs")\
- x(metadata_target, u16, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_FN(bch2_opt_target), \
- BCH_SB_METADATA_TARGET, 0, \
- "(target)", "Device or label for metadata writes") \
- x(foreground_target, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_FN(bch2_opt_target), \
- BCH_SB_FOREGROUND_TARGET, 0, \
- "(target)", "Device or label for foreground writes") \
- x(background_target, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_FN(bch2_opt_target), \
- BCH_SB_BACKGROUND_TARGET, 0, \
- "(target)", "Device or label to move data to in the background")\
- x(promote_target, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_FN(bch2_opt_target), \
- BCH_SB_PROMOTE_TARGET, 0, \
- "(target)", "Device or label to promote data to on read") \
- x(erasure_code, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_ERASURE_CODE, false, \
- NULL, "Enable erasure coding (DO NOT USE YET)") \
- x(inodes_32bit, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_INODE_32BIT, true, \
- NULL, "Constrain inode numbers to 32 bits") \
- x(shard_inode_numbers_bits, u8, \
- OPT_FS|OPT_FORMAT, \
- OPT_UINT(0, 8), \
- BCH_SB_SHARD_INUMS_NBITS, 0, \
- NULL, "Shard new inode numbers by CPU id") \
- x(inodes_use_key_cache, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH_SB_INODES_USE_KEY_CACHE, true, \
- NULL, "Use the btree key cache for the inodes btree") \
- x(btree_node_mem_ptr_optimization, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Stash pointer to in memory btree node in btree ptr")\
- x(gc_reserve_percent, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(5, 21), \
- BCH_SB_GC_RESERVE, 8, \
- "%", "Percentage of disk space to reserve for copygc")\
- x(gc_reserve_bytes, u64, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \
- OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \
- OPT_UINT(0, U64_MAX), \
- BCH_SB_GC_RESERVE_BYTES, 0, \
- "%", "Amount of disk space to reserve for copygc\n" \
- "Takes precedence over gc_reserve_percent if set")\
- x(root_reserve_percent, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_UINT(0, 100), \
- BCH_SB_ROOT_RESERVE, 0, \
- "%", "Percentage of disk space to reserve for superuser")\
- x(wide_macs, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_128_BIT_MACS, false, \
- NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\
- x(inline_data, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Enable inline data extents") \
- x(promote_whole_extents, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_PROMOTE_WHOLE_EXTENTS, true, \
- NULL, "Promote whole extents, instead of just part being read")\
- x(acl, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH_SB_POSIX_ACL, true, \
- NULL, "Enable POSIX acls") \
- x(usrquota, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH_SB_USRQUOTA, false, \
- NULL, "Enable user quotas") \
- x(grpquota, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH_SB_GRPQUOTA, false, \
- NULL, "Enable group quotas") \
- x(prjquota, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH_SB_PRJQUOTA, false, \
- NULL, "Enable project quotas") \
- x(degraded, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Allow mounting in degraded mode") \
- x(very_degraded, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
-	  NULL,		"Allow mounting when data will be missing")	\
- x(no_splitbrain_check, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Don't kick drives out when splitbrain detected")\
- x(verbose, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, BCACHEFS_VERBOSE_DEFAULT, \
- NULL, "Extra debugging information during mount/recovery")\
- x(journal_flush_delay, u32, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(1, U32_MAX), \
- BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \
- NULL, "Delay in milliseconds before automatic journal commits")\
- x(journal_flush_disabled, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_JOURNAL_FLUSH_DISABLED,false, \
- NULL, "Disable journal flush on sync/fsync\n" \
- "If enabled, writes can be lost, but only since the\n"\
- "last journal write (default 1 second)") \
- x(journal_reclaim_delay, u32, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(0, U32_MAX), \
- BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \
- NULL, "Delay in milliseconds before automatic journal reclaim")\
- x(move_bytes_in_flight, u32, \
- OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(1024, U32_MAX), \
- BCH2_NO_SB_OPT, 1U << 20, \
-	  NULL,		"Maximum amount of IO to keep in flight by the move path")\
- x(move_ios_in_flight, u32, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(1, 1024), \
- BCH2_NO_SB_OPT, 32, \
- NULL, "Maximum number of IOs to keep in flight by the move path")\
- x(fsck, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Run fsck on mount") \
- x(fsck_memory_usage_percent, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_UINT(20, 70), \
- BCH2_NO_SB_OPT, 50, \
- NULL, "Maximum percentage of system ram fsck is allowed to pin")\
- x(fix_errors, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_FN(bch2_opt_fix_errors), \
- BCH2_NO_SB_OPT, FSCK_FIX_exit, \
- NULL, "Fix errors during fsck without asking") \
- x(ratelimit_errors, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \
- NULL, "Ratelimit error messages during fsck") \
- x(nochanges, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Super read only mode - no writes at all will be issued,\n"\
- "even if we have to replay the journal") \
- x(norecovery, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Exit recovery immediately prior to journal replay")\
- x(recovery_passes, u64, \
- OPT_FS|OPT_MOUNT, \
- OPT_BITFIELD(bch2_recovery_passes), \
- BCH2_NO_SB_OPT, 0, \
- NULL, "Recovery passes to run explicitly") \
- x(recovery_passes_exclude, u64, \
- OPT_FS|OPT_MOUNT, \
- OPT_BITFIELD(bch2_recovery_passes), \
- BCH2_NO_SB_OPT, 0, \
- NULL, "Recovery passes to exclude") \
- x(recovery_pass_last, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_STR_NOLIMIT(bch2_recovery_passes), \
- BCH2_NO_SB_OPT, 0, \
- NULL, "Exit recovery after specified pass") \
- x(retain_recovery_info, u8, \
- 0, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Don't free journal entries/keys, scanned btree nodes after startup")\
- x(read_entire_journal, u8, \
- 0, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Read all journal entries, not just dirty ones")\
- x(read_journal_only, u8, \
- 0, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Only read the journal, skip the rest of recovery")\
- x(journal_transaction_names, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \
- NULL, "Log transaction function names in journal") \
- x(allocator_stuck_timeout, u16, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(0, U16_MAX), \
- BCH_SB_ALLOCATOR_STUCK_TIMEOUT, 30, \
- NULL, "Default timeout in seconds for stuck allocator messages")\
- x(noexcl, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Don't open device in exclusive mode") \
- x(direct_io, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Use O_DIRECT (userspace only)") \
- x(sb, u64, \
- OPT_MOUNT, \
- OPT_UINT(0, S64_MAX), \
- BCH2_NO_SB_OPT, BCH_SB_SECTOR, \
- "offset", "Sector offset of superblock") \
- x(read_only, u8, \
- OPT_FS|OPT_MOUNT|OPT_HIDDEN, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, NULL) \
- x(nostart, u8, \
- 0, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
-	  NULL,		"Don't start filesystem, only open devices")	\
- x(reconstruct_alloc, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Reconstruct alloc btree") \
- x(version_upgrade, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_STR(bch2_version_upgrade_opts), \
- BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \
- NULL, "Set superblock to latest version,\n" \
- "allowing any new features to be used") \
- x(stdio, u64, \
- 0, \
- OPT_UINT(0, S64_MAX), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Pointer to a struct stdio_redirect") \
- x(project, u8, \
- OPT_INODE, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, NULL) \
- x(nocow, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
- OPT_BOOL(), \
- BCH_SB_NOCOW, false, \
- NULL, "Nocow mode: Writes will be done in place when possible.\n"\
-		"Snapshots and reflink will still cause writes to be COW\n"\
- "Implicitly disables data checksumming, compression and encryption")\
- x(nocow_enabled, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Enable nocow mode: enables runtime locking in\n"\
-		"the data move path, needed if nocow will ever be in use\n")\
- x(copygc_enabled, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Enable copygc: disable for debugging, or to\n"\
- "quiet the system when doing performance testing\n")\
- x(rebalance_enabled, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Enable rebalance: disable for debugging, or to\n"\
- "quiet the system when doing performance testing\n")\
- x(no_data_io, u8, \
- OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Skip submit_bio() for data reads and writes, " \
- "for performance testing purposes") \
- x(state, u64, \
- OPT_DEVICE|OPT_RUNTIME, \
- OPT_STR(bch2_member_states), \
- BCH_MEMBER_STATE, BCH_MEMBER_STATE_rw, \
- "state", "rw,ro,failed,spare") \
- x(bucket_size, u32, \
- OPT_DEVICE|OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \
- OPT_UINT(0, S64_MAX), \
- BCH_MEMBER_BUCKET_SIZE, 0, \
- "size", "Specifies the bucket size; must be greater than the btree node size")\
- x(durability, u8, \
- OPT_DEVICE|OPT_RUNTIME|OPT_SB_FIELD_ONE_BIAS, \
- OPT_UINT(0, BCH_REPLICAS_MAX), \
- BCH_MEMBER_DURABILITY, 1, \
- "n", "Data written to this device will be considered\n"\
- "to have already been replicated n times") \
- x(data_allowed, u8, \
- OPT_DEVICE, \
- OPT_BITFIELD(__bch2_data_types), \
- BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\
- "types", "Allowed data types for this device: journal, btree, and/or user")\
- x(discard, u8, \
- OPT_MOUNT|OPT_DEVICE|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_MEMBER_DISCARD, true, \
- NULL, "Enable discard/TRIM support") \
- x(btree_node_prefetch, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
-	  NULL,		"BTREE_ITER_prefetch causes btree nodes to be\n"\
- " prefetched sequentially")
-
-struct bch_opts {
-#define x(_name, _bits, ...) unsigned _name##_defined:1;
- BCH_OPTS()
-#undef x
-
-#define x(_name, _bits, ...) _bits _name;
- BCH_OPTS()
-#undef x
-};
-
-struct bch2_opts_parse {
- struct bch_opts opts;
-
- /* to save opts that can't be parsed before the FS is opened: */
- struct printbuf parse_later;
-};
-
-static const __maybe_unused struct bch_opts bch2_opts_default = {
-#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \
- ._name##_defined = true, \
- ._name = _default, \
-
- BCH_OPTS()
-#undef x
-};
-
-#define opt_defined(_opts, _name) ((_opts)._name##_defined)
-
-#define opt_get(_opts, _name) \
- (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name)
-
-#define opt_set(_opts, _name, _v) \
-do { \
- (_opts)._name##_defined = true; \
- (_opts)._name = _v; \
-} while (0)
-
-static inline struct bch_opts bch2_opts_empty(void)
-{
- return (struct bch_opts) { 0 };
-}
-
-void bch2_opts_apply(struct bch_opts *, struct bch_opts);
-
-enum bch_opt_id {
-#define x(_name, ...) Opt_##_name,
- BCH_OPTS()
-#undef x
- bch2_opts_nr
-};
-
-struct bch_fs;
-struct printbuf;
-
-struct bch_option {
- struct attribute attr;
- enum opt_type type;
- enum opt_flags flags;
- u64 min, max;
-
- const char * const *choices;
-
- struct bch_opt_fn fn;
-
- const char *hint;
- const char *help;
-
- u64 (*get_sb)(const struct bch_sb *);
- void (*set_sb)(struct bch_sb *, u64);
-
- u64 (*get_member)(const struct bch_member *);
- void (*set_member)(struct bch_member *, u64);
-
-};
-
-extern const struct bch_option bch2_opt_table[];
-
-bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
-u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
-void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
-
-u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int);
-int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
-void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64);
-
-struct bch_dev;
-void bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64);
-
-int bch2_opt_lookup(const char *);
-int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *);
-int bch2_opt_parse(struct bch_fs *, const struct bch_option *,
- const char *, u64 *, struct printbuf *);
-
-#define OPT_SHOW_FULL_LIST (1 << 0)
-#define OPT_SHOW_MOUNT_STYLE (1 << 1)
-
-void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *,
- const struct bch_option *, u64, unsigned);
-void bch2_opts_to_text(struct printbuf *,
- struct bch_opts,
- struct bch_fs *, struct bch_sb *,
- unsigned, unsigned, unsigned);
-
-int bch2_opt_check_may_set(struct bch_fs *, struct bch_dev *, int, u64);
-int bch2_opts_check_may_set(struct bch_fs *);
-int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *,
- struct printbuf *, const char *, const char *);
-int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *,
- char *);
-
-/* inode opts: */
-
-struct bch_io_opts {
-#define x(_name, _bits) u##_bits _name;
- BCH_INODE_OPTS()
-#undef x
-#define x(_name, _bits) u64 _name##_from_inode:1;
- BCH_INODE_OPTS()
-#undef x
-};
-
-static inline void bch2_io_opts_fixups(struct bch_io_opts *opts)
-{
- if (!opts->background_target)
- opts->background_target = opts->foreground_target;
- if (!opts->background_compression)
- opts->background_compression = opts->compression;
- if (opts->nocow) {
- opts->compression = opts->background_compression = 0;
- opts->data_checksum = 0;
- opts->erasure_code = 0;
- }
-}
-
-struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
-bool bch2_opt_is_inode_opt(enum bch_opt_id);
-
-#endif /* _BCACHEFS_OPTS_H */
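The BCH_OPTS() list deleted above is an x-macro: every option is written once and then expanded several times, into per-option struct fields (plus a _defined bit), a defaults initializer, and the Opt_* enum. A minimal standalone sketch of the same pattern, using hypothetical option names rather than the real bcachefs ones:

#include <stdbool.h>
#include <stdio.h>

/* One table, expanded three ways below; the option names are made up. */
#define MY_OPTS()				\
	x(verbose,	bool,	false)		\
	x(reserve_pct,	int,	8)

/* 1) struct fields, like struct bch_opts */
struct my_opts {
#define x(_name, _type, _default)	_type _name;
	MY_OPTS()
#undef x
};

/* 2) compile-time defaults, like bch2_opts_default */
static const struct my_opts my_opts_default = {
#define x(_name, _type, _default)	._name = _default,
	MY_OPTS()
#undef x
};

/* 3) an enum of option ids, like enum bch_opt_id */
enum my_opt_id {
#define x(_name, ...)			Opt_##_name,
	MY_OPTS()
#undef x
	my_opts_nr
};

int main(void)
{
	struct my_opts o = my_opts_default;

	printf("verbose=%d reserve_pct=%d nr=%d\n",
	       o.verbose, o.reserve_pct, (int) my_opts_nr);
	return 0;
}

The _name##_defined bitfield expansion is what lets opt_get() fall back to bch2_opts_default for options a caller never set.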
diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c
deleted file mode 100644
index 4cf5a2af1e6f..000000000000
--- a/fs/bcachefs/printbuf.c
+++ /dev/null
@@ -1,509 +0,0 @@
-// SPDX-License-Identifier: LGPL-2.1+
-/* Copyright (C) 2022 Kent Overstreet */
-
-#include <linux/bitmap.h>
-#include <linux/err.h>
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/string_helpers.h>
-
-#include "printbuf.h"
-
-static inline unsigned __printbuf_linelen(struct printbuf *buf, unsigned pos)
-{
- return pos - buf->last_newline;
-}
-
-static inline unsigned printbuf_linelen(struct printbuf *buf)
-{
- return __printbuf_linelen(buf, buf->pos);
-}
-
-/*
- * Returns spaces from start of line, if set, or 0 if unset:
- */
-static inline unsigned cur_tabstop(struct printbuf *buf)
-{
- return buf->cur_tabstop < buf->nr_tabstops
- ? buf->_tabstops[buf->cur_tabstop]
- : 0;
-}
-
-int bch2_printbuf_make_room(struct printbuf *out, unsigned extra)
-{
- /* Reserved space for terminating nul: */
- extra += 1;
-
- if (out->pos + extra <= out->size)
- return 0;
-
- if (!out->heap_allocated) {
- out->overflow = true;
- return 0;
- }
-
- unsigned new_size = roundup_pow_of_two(out->size + extra);
-
- /* Sanity check... */
- if (new_size > PAGE_SIZE << MAX_PAGE_ORDER) {
- out->allocation_failure = true;
- out->overflow = true;
- return -ENOMEM;
- }
-
- /*
- * Note: output buffer must be freeable with kfree(), it's not required
- * that the user use printbuf_exit().
- */
- char *buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT);
-
- if (!buf) {
- out->allocation_failure = true;
- out->overflow = true;
- return -ENOMEM;
- }
-
- out->buf = buf;
- out->size = new_size;
- return 0;
-}
-
-static void printbuf_advance_pos(struct printbuf *out, unsigned len)
-{
- out->pos += min(len, printbuf_remaining(out));
-}
-
-static void printbuf_insert_spaces(struct printbuf *out, unsigned pos, unsigned nr)
-{
- unsigned move = out->pos - pos;
-
- bch2_printbuf_make_room(out, nr);
-
- if (pos + nr < out->size)
- memmove(out->buf + pos + nr,
- out->buf + pos,
- min(move, out->size - 1 - pos - nr));
-
- if (pos < out->size)
- memset(out->buf + pos, ' ', min(nr, out->size - pos));
-
- printbuf_advance_pos(out, nr);
- printbuf_nul_terminate_reserved(out);
-}
-
-static void __printbuf_do_indent(struct printbuf *out, unsigned pos)
-{
- while (true) {
- int pad;
- unsigned len = out->pos - pos;
- char *p = out->buf + pos;
- char *n = memscan(p, '\n', len);
- if (cur_tabstop(out)) {
- n = min(n, (char *) memscan(p, '\r', len));
- n = min(n, (char *) memscan(p, '\t', len));
- }
-
- pos = n - out->buf;
- if (pos == out->pos)
- break;
-
- switch (*n) {
- case '\n':
- pos++;
- out->last_newline = pos;
-
- printbuf_insert_spaces(out, pos, out->indent);
-
- pos = min(pos + out->indent, out->pos);
- out->last_field = pos;
- out->cur_tabstop = 0;
- break;
- case '\r':
- memmove(n, n + 1, out->pos - pos);
- --out->pos;
- pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos);
- if (pad > 0) {
- printbuf_insert_spaces(out, out->last_field, pad);
- pos += pad;
- }
-
- out->last_field = pos;
- out->cur_tabstop++;
- break;
- case '\t':
- pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos) - 1;
- if (pad > 0) {
- *n = ' ';
- printbuf_insert_spaces(out, pos, pad - 1);
- pos += pad;
- } else {
- memmove(n, n + 1, out->pos - pos);
- --out->pos;
- }
-
- out->last_field = pos;
- out->cur_tabstop++;
- break;
- }
- }
-}
-
-static inline void printbuf_do_indent(struct printbuf *out, unsigned pos)
-{
- if (out->has_indent_or_tabstops && !out->suppress_indent_tabstop_handling)
- __printbuf_do_indent(out, pos);
-}
-
-void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
-{
- int len;
-
- do {
- va_list args2;
-
- va_copy(args2, args);
- len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args2);
- va_end(args2);
- } while (len > printbuf_remaining(out) &&
- !bch2_printbuf_make_room(out, len));
-
- unsigned indent_pos = out->pos;
- printbuf_advance_pos(out, len);
- printbuf_do_indent(out, indent_pos);
-}
-
-void bch2_prt_printf(struct printbuf *out, const char *fmt, ...)
-{
- va_list args;
- int len;
-
- do {
- va_start(args, fmt);
- len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args);
- va_end(args);
- } while (len > printbuf_remaining(out) &&
- !bch2_printbuf_make_room(out, len));
-
- unsigned indent_pos = out->pos;
- printbuf_advance_pos(out, len);
- printbuf_do_indent(out, indent_pos);
-}
-
-/**
- * bch2_printbuf_str() - returns printbuf's buf as a C string, guaranteed to be
- * null terminated
- * @buf: printbuf to terminate
- * Returns: Printbuf contents, as a nul terminated C string
- */
-const char *bch2_printbuf_str(const struct printbuf *buf)
-{
- /*
- * If we've written to a printbuf then it's guaranteed to be a null
- * terminated string - but if we haven't, then we might not have
- * allocated a buffer at all:
- */
- return buf->pos
- ? buf->buf
- : "";
-}
-
-/**
- * bch2_printbuf_exit() - exit a printbuf, freeing memory it owns and poisoning it
- * against accidental use.
- * @buf: printbuf to exit
- */
-void bch2_printbuf_exit(struct printbuf *buf)
-{
- if (buf->heap_allocated) {
- kfree(buf->buf);
- buf->buf = ERR_PTR(-EINTR); /* poison value */
- }
-}
-
-void bch2_printbuf_tabstops_reset(struct printbuf *buf)
-{
- buf->nr_tabstops = 0;
-}
-
-void bch2_printbuf_tabstop_pop(struct printbuf *buf)
-{
- if (buf->nr_tabstops)
- --buf->nr_tabstops;
-}
-
-/*
- * bch2_printbuf_tabstop_set() - add a tabstop, n spaces from the previous tabstop
- *
- * @buf: printbuf to control
- * @spaces: number of spaces from previous tabstop
- *
- * In the future this function may allocate memory if setting more than
- * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start
- * of line.
- */
-int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces)
-{
- unsigned prev_tabstop = buf->nr_tabstops
- ? buf->_tabstops[buf->nr_tabstops - 1]
- : 0;
-
- if (WARN_ON(buf->nr_tabstops >= ARRAY_SIZE(buf->_tabstops)))
- return -EINVAL;
-
- buf->_tabstops[buf->nr_tabstops++] = prev_tabstop + spaces;
- buf->has_indent_or_tabstops = true;
- return 0;
-}
-
-/**
- * bch2_printbuf_indent_add() - add to the current indent level
- *
- * @buf: printbuf to control
- * @spaces: number of spaces to add to the current indent level
- *
- * Subsequent lines, and the current line if the output position is at the start
- * of the current line, will be indented by @spaces more spaces.
- */
-void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces)
-{
- if (WARN_ON_ONCE(buf->indent + spaces < buf->indent))
- spaces = 0;
-
- buf->indent += spaces;
- prt_chars(buf, ' ', spaces);
-
- buf->has_indent_or_tabstops = true;
-}
-
-/**
- * bch2_printbuf_indent_sub() - subtract from the current indent level
- *
- * @buf: printbuf to control
- * @spaces: number of spaces to subtract from the current indent level
- *
- * Subsequent lines, and the current line if the output position is at the start
- * of the current line, will be indented by @spaces less spaces.
- */
-void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces)
-{
- if (WARN_ON_ONCE(spaces > buf->indent))
- spaces = buf->indent;
-
- if (buf->last_newline + buf->indent == buf->pos) {
- buf->pos -= spaces;
- printbuf_nul_terminate(buf);
- }
- buf->indent -= spaces;
-
- if (!buf->indent && !buf->nr_tabstops)
- buf->has_indent_or_tabstops = false;
-}
-
-void bch2_prt_newline(struct printbuf *buf)
-{
- bch2_printbuf_make_room(buf, 1 + buf->indent);
-
- __prt_char_reserved(buf, '\n');
-
- buf->last_newline = buf->pos;
-
- __prt_chars_reserved(buf, ' ', buf->indent);
-
- printbuf_nul_terminate_reserved(buf);
-
- buf->last_field = buf->pos;
- buf->cur_tabstop = 0;
-}
-
-void bch2_printbuf_strip_trailing_newline(struct printbuf *out)
-{
- for (int p = out->pos - 1; p >= 0; --p) {
- if (out->buf[p] == '\n') {
- out->pos = p;
- break;
- }
- if (out->buf[p] != ' ')
- break;
- }
-
- printbuf_nul_terminate_reserved(out);
-}
-
-static void __prt_tab(struct printbuf *out)
-{
- int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out));
-
- prt_chars(out, ' ', spaces);
-
- out->last_field = out->pos;
- out->cur_tabstop++;
-}
-
-/**
- * bch2_prt_tab() - Advance printbuf to the next tabstop
- * @out: printbuf to control
- *
- * Advance output to the next tabstop by printing spaces.
- */
-void bch2_prt_tab(struct printbuf *out)
-{
- if (WARN_ON(!cur_tabstop(out)))
- return;
-
- __prt_tab(out);
-}
-
-static void __prt_tab_rjust(struct printbuf *buf)
-{
- int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf);
- if (pad > 0)
- printbuf_insert_spaces(buf, buf->last_field, pad);
-
- buf->last_field = buf->pos;
- buf->cur_tabstop++;
-}
-
-/**
- * bch2_prt_tab_rjust - Advance printbuf to the next tabstop, right justifying
- * previous output
- *
- * @buf: printbuf to control
- *
- * Advance output to the next tabstop by inserting spaces immediately after the
- * previous tabstop, right justifying previously outputted text.
- */
-void bch2_prt_tab_rjust(struct printbuf *buf)
-{
- if (WARN_ON(!cur_tabstop(buf)))
- return;
-
- __prt_tab_rjust(buf);
-}
-
-/**
- * bch2_prt_bytes_indented() - Print an array of chars, handling embedded control characters
- *
- * @out: output printbuf
- * @str: string to print
- * @count: number of bytes to print
- *
- * The following control characters are handled as follows:
- * \n: prt_newline newline that obeys current indent level
- * \t: prt_tab advance to next tabstop
- * \r: prt_tab_rjust advance to next tabstop, with right justification
- */
-void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count)
-{
- unsigned indent_pos = out->pos;
- prt_bytes(out, str, count);
- printbuf_do_indent(out, indent_pos);
-}
-
-/**
- * bch2_prt_human_readable_u64() - Print out a u64 in human readable units
- * @out: output printbuf
- * @v: integer to print
- *
- * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
- */
-void bch2_prt_human_readable_u64(struct printbuf *out, u64 v)
-{
- bch2_printbuf_make_room(out, 10);
- unsigned len = string_get_size(v, 1, !out->si_units,
- out->buf + out->pos,
- printbuf_remaining_size(out));
- printbuf_advance_pos(out, len);
-}
-
-/**
- * bch2_prt_human_readable_s64() - Print out a s64 in human readable units
- * @out: output printbuf
- * @v: integer to print
- *
- * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
- */
-void bch2_prt_human_readable_s64(struct printbuf *out, s64 v)
-{
- if (v < 0)
- prt_char(out, '-');
- bch2_prt_human_readable_u64(out, abs(v));
-}
-
-/**
- * bch2_prt_units_u64() - Print out a u64 according to printbuf unit options
- * @out: output printbuf
- * @v: integer to print
- *
- * Units are either raw (default), or human readable units (controlled via
- * @buf->human_readable_units)
- */
-void bch2_prt_units_u64(struct printbuf *out, u64 v)
-{
- if (out->human_readable_units)
- bch2_prt_human_readable_u64(out, v);
- else
- bch2_prt_printf(out, "%llu", v);
-}
-
-/**
- * bch2_prt_units_s64() - Print out a s64 according to printbuf unit options
- * @out: output printbuf
- * @v: integer to print
- *
- * Units are either raw (default), or human readable units (controlled via
- * @buf->human_readable_units)
- */
-void bch2_prt_units_s64(struct printbuf *out, s64 v)
-{
- if (v < 0)
- prt_char(out, '-');
- bch2_prt_units_u64(out, abs(v));
-}
-
-void bch2_prt_string_option(struct printbuf *out,
- const char * const list[],
- size_t selected)
-{
- for (size_t i = 0; list[i]; i++)
- bch2_prt_printf(out, i == selected ? "[%s] " : "%s ", list[i]);
-}
-
-void bch2_prt_bitflags(struct printbuf *out,
- const char * const list[], u64 flags)
-{
- unsigned bit, nr = 0;
- bool first = true;
-
- while (list[nr])
- nr++;
-
- while (flags && (bit = __ffs64(flags)) < nr) {
- if (!first)
- bch2_prt_printf(out, ",");
- first = false;
- bch2_prt_printf(out, "%s", list[bit]);
- flags ^= BIT_ULL(bit);
- }
-}
-
-void bch2_prt_bitflags_vector(struct printbuf *out,
- const char * const list[],
- unsigned long *v, unsigned nr)
-{
- bool first = true;
- unsigned i;
-
- for (i = 0; i < nr; i++)
- if (!list[i]) {
-			nr = i;
- break;
- }
-
- for_each_set_bit(i, v, nr) {
- if (!first)
- bch2_prt_printf(out, ",");
- first = false;
- bch2_prt_printf(out, "%s", list[i]);
- }
-}
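bch2_prt_printf() above formats into whatever room is left, and only grows the buffer and retries when vsnprintf() reports that the output did not fit. A userspace sketch of that grow-and-retry idiom; the struct and helper names are made up, and it grows exactly to fit rather than rounding up to a power of two as bch2_printbuf_make_room() does:

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

struct sbuf { char *s; size_t size, pos; };

static void sbuf_printf(struct sbuf *b, const char *fmt, ...)
{
	va_list args;
	int len;

	/* First attempt: format into whatever room is left. */
	va_start(args, fmt);
	len = vsnprintf(b->s ? b->s + b->pos : NULL,
			b->size - b->pos, fmt, args);
	va_end(args);

	if (len < 0)
		return;

	if ((size_t) len + 1 > b->size - b->pos) {
		/* Didn't fit: grow, then format the same arguments again. */
		char *n = realloc(b->s, b->pos + len + 1);
		if (!n)
			return;	/* best effort, as in the original */

		b->s = n;
		b->size = b->pos + len + 1;

		va_start(args, fmt);
		vsnprintf(b->s + b->pos, b->size - b->pos, fmt, args);
		va_end(args);
	}

	b->pos += len;
}

int main(void)
{
	struct sbuf b = { 0 };

	sbuf_printf(&b, "hello %s, %d", "world", 42);
	printf("%s\n", b.s);
	free(b.s);
	return 0;
}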
diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h
deleted file mode 100644
index d0dd398baa2b..000000000000
--- a/fs/bcachefs/printbuf.h
+++ /dev/null
@@ -1,289 +0,0 @@
-/* SPDX-License-Identifier: LGPL-2.1+ */
-/* Copyright (C) 2022 Kent Overstreet */
-
-#ifndef _BCACHEFS_PRINTBUF_H
-#define _BCACHEFS_PRINTBUF_H
-
-/*
- * Printbufs: Simple strings for printing to, with optional heap allocation
- *
- * This code has provisions for use in userspace, to aid in making other code
- * portable between kernelspace and userspace.
- *
- * Basic example:
- * struct printbuf buf = PRINTBUF;
- *
- * prt_printf(&buf, "foo=");
- * foo_to_text(&buf, foo);
- * printk("%s", buf.buf);
- * printbuf_exit(&buf);
- *
- * Or
- * struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size)
- *
- * We can now write pretty printers instead of writing code that dumps
- * everything to the kernel log buffer, and then those pretty-printers can be
- * used by other code that outputs to kernel log, sysfs, debugfs, etc.
- *
- * Memory allocation: Outputting to a printbuf may allocate memory. This
- * allocation is done with GFP_KERNEL, by default: use the newer
- * memalloc_*_(save|restore) functions as needed.
- *
- * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations
- * will be done with GFP_NOWAIT if printbuf->atomic is nonzero.
- *
- * It's allowed to grab the output buffer and free it later with kfree() instead
- * of using printbuf_exit(), if the user just needs a heap allocated string at
- * the end.
- *
- * Memory allocation failures: We don't return errors directly, because on
- * memory allocation failure we usually don't want to bail out and unwind - we
- * want to print what we've got, on a best-effort basis. But code that does want
- * to return -ENOMEM may check printbuf.allocation_failure.
- *
- * Indenting, tabstops:
- *
- * To aid is writing multi-line pretty printers spread across multiple
- * functions, printbufs track the current indent level.
- *
- * printbuf_indent_add() and printbuf_indent_sub() increase and decrease the
- * current indent level, respectively.
- *
- * To use tabstops, set printbuf->tabstops[]; they are in units of spaces, from
- * start of line. Once set, prt_tab() will output spaces up to the next tabstop.
- * prt_tab_rjust() will also advance the current line of text up to the next
- * tabstop, but it does so by shifting text since the previous tabstop up to the
- * next tabstop - right justifying it.
- *
- * Make sure you use prt_newline() instead of \n in the format string for indent
- * level and tabstops to work correctly.
- *
- * Output units: printbuf->units exists to tell pretty-printers how to output
- * numbers: a raw value (e.g. directly from a superblock field), as bytes, or as
- * human readable bytes. prt_units() obeys it.
- */
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-
-enum printbuf_si {
- PRINTBUF_UNITS_2, /* use binary powers of 2^10 */
- PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */
-};
-
-#define PRINTBUF_INLINE_TABSTOPS 6
-
-struct printbuf {
- char *buf;
- unsigned size;
- unsigned pos;
- unsigned last_newline;
- unsigned last_field;
- unsigned indent;
- /*
-	 * If nonzero, allocations will be done with GFP_NOWAIT:
- */
- u8 atomic;
- bool allocation_failure:1;
- bool heap_allocated:1;
- bool overflow:1;
- enum printbuf_si si_units:1;
- bool human_readable_units:1;
- bool has_indent_or_tabstops:1;
- bool suppress_indent_tabstop_handling:1;
- u8 nr_tabstops;
-
- /*
-	 * Do not modify directly: use printbuf_tabstop_push(),
-	 * printbuf_tabstop_pop()
- */
- u8 cur_tabstop;
- u8 _tabstops[PRINTBUF_INLINE_TABSTOPS];
-};
-
-int bch2_printbuf_make_room(struct printbuf *, unsigned);
-__printf(2, 3) void bch2_prt_printf(struct printbuf *out, const char *fmt, ...);
-__printf(2, 0) void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list);
-const char *bch2_printbuf_str(const struct printbuf *);
-void bch2_printbuf_exit(struct printbuf *);
-
-void bch2_printbuf_tabstops_reset(struct printbuf *);
-void bch2_printbuf_tabstop_pop(struct printbuf *);
-int bch2_printbuf_tabstop_push(struct printbuf *, unsigned);
-
-void bch2_printbuf_indent_add(struct printbuf *, unsigned);
-void bch2_printbuf_indent_sub(struct printbuf *, unsigned);
-
-void bch2_prt_newline(struct printbuf *);
-void bch2_printbuf_strip_trailing_newline(struct printbuf *);
-void bch2_prt_tab(struct printbuf *);
-void bch2_prt_tab_rjust(struct printbuf *);
-
-void bch2_prt_bytes_indented(struct printbuf *, const char *, unsigned);
-void bch2_prt_human_readable_u64(struct printbuf *, u64);
-void bch2_prt_human_readable_s64(struct printbuf *, s64);
-void bch2_prt_units_u64(struct printbuf *, u64);
-void bch2_prt_units_s64(struct printbuf *, s64);
-void bch2_prt_string_option(struct printbuf *, const char * const[], size_t);
-void bch2_prt_bitflags(struct printbuf *, const char * const[], u64);
-void bch2_prt_bitflags_vector(struct printbuf *, const char * const[],
- unsigned long *, unsigned);
-
-/* Initializer for a heap allocated printbuf: */
-#define PRINTBUF ((struct printbuf) { .heap_allocated = true })
-
-/* Initializer for a printbuf that points to an external buffer: */
-#define PRINTBUF_EXTERN(_buf, _size) \
-((struct printbuf) { \
- .buf = _buf, \
- .size = _size, \
-})
-
-/*
- * Returns size remaining of output buffer:
- */
-static inline unsigned printbuf_remaining_size(struct printbuf *out)
-{
- if (WARN_ON(out->size && out->pos >= out->size))
- out->pos = out->size - 1;
- return out->size - out->pos;
-}
-
-/*
- * Returns number of characters we can print to the output buffer - i.e.
- * excluding the terminating nul:
- */
-static inline unsigned printbuf_remaining(struct printbuf *out)
-{
- return out->size ? printbuf_remaining_size(out) - 1 : 0;
-}
-
-static inline unsigned printbuf_written(struct printbuf *out)
-{
- return out->size ? min(out->pos, out->size - 1) : 0;
-}
-
-static inline void printbuf_nul_terminate_reserved(struct printbuf *out)
-{
- if (WARN_ON(out->size && out->pos >= out->size))
- out->pos = out->size - 1;
- if (out->size)
- out->buf[out->pos] = 0;
-}
-
-static inline void printbuf_nul_terminate(struct printbuf *out)
-{
- bch2_printbuf_make_room(out, 1);
- printbuf_nul_terminate_reserved(out);
-}
-
-/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */
-static inline void __prt_char_reserved(struct printbuf *out, char c)
-{
- if (printbuf_remaining(out))
- out->buf[out->pos++] = c;
-}
-
-/* Doesn't nul terminate: */
-static inline void __prt_char(struct printbuf *out, char c)
-{
- bch2_printbuf_make_room(out, 1);
- __prt_char_reserved(out, c);
-}
-
-static inline void prt_char(struct printbuf *out, char c)
-{
- bch2_printbuf_make_room(out, 2);
- __prt_char_reserved(out, c);
- printbuf_nul_terminate_reserved(out);
-}
-
-static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n)
-{
- unsigned can_print = min(n, printbuf_remaining(out));
-
- for (unsigned i = 0; i < can_print; i++)
- out->buf[out->pos++] = c;
-}
-
-static inline void prt_chars(struct printbuf *out, char c, unsigned n)
-{
- bch2_printbuf_make_room(out, n);
- __prt_chars_reserved(out, c, n);
- printbuf_nul_terminate_reserved(out);
-}
-
-static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n)
-{
- bch2_printbuf_make_room(out, n);
-
- unsigned can_print = min(n, printbuf_remaining(out));
-
- for (unsigned i = 0; i < can_print; i++)
- out->buf[out->pos++] = ((char *) b)[i];
-
- printbuf_nul_terminate(out);
-}
-
-static inline void prt_str(struct printbuf *out, const char *str)
-{
- prt_bytes(out, str, strlen(str));
-}
-
-static inline void prt_str_indented(struct printbuf *out, const char *str)
-{
- bch2_prt_bytes_indented(out, str, strlen(str));
-}
-
-static inline void prt_hex_byte(struct printbuf *out, u8 byte)
-{
- bch2_printbuf_make_room(out, 3);
- __prt_char_reserved(out, hex_asc_hi(byte));
- __prt_char_reserved(out, hex_asc_lo(byte));
- printbuf_nul_terminate_reserved(out);
-}
-
-static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte)
-{
- bch2_printbuf_make_room(out, 3);
- __prt_char_reserved(out, hex_asc_upper_hi(byte));
- __prt_char_reserved(out, hex_asc_upper_lo(byte));
- printbuf_nul_terminate_reserved(out);
-}
-
-static inline void printbuf_reset_keep_tabstops(struct printbuf *buf)
-{
- buf->pos = 0;
- buf->allocation_failure = 0;
- buf->last_newline = 0;
- buf->last_field = 0;
- buf->indent = 0;
- buf->cur_tabstop = 0;
-}
-
-/**
- * printbuf_reset - re-use a printbuf without freeing and re-initializing it:
- */
-static inline void printbuf_reset(struct printbuf *buf)
-{
- printbuf_reset_keep_tabstops(buf);
- buf->nr_tabstops = 0;
-}
-
-/**
- * printbuf_atomic_inc - mark as entering an atomic section
- */
-static inline void printbuf_atomic_inc(struct printbuf *buf)
-{
- buf->atomic++;
-}
-
-/**
- * printbuf_atomic_dec - mark as leaving an atomic section
- */
-static inline void printbuf_atomic_dec(struct printbuf *buf)
-{
- buf->atomic--;
-}
-
-#endif /* _BCACHEFS_PRINTBUF_H */
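The tabstop and indent machinery declared above is easiest to see in a small fragment. A sketch of two-column output using only symbols from this header; illustrative, and it assumes a kernel context where those symbols are in scope:

/* Print "name    value" rows, right justifying the values. */
static void example_columns(struct printbuf *out)
{
	bch2_printbuf_tabstop_push(out, 16);	/* first column: 16 spaces */
	bch2_printbuf_tabstop_push(out, 8);	/* second column: 8 more */

	prt_str(out, "buckets");
	bch2_prt_tab(out);		/* pad out to the first tabstop */
	bch2_prt_printf(out, "%u", 128U);
	bch2_prt_tab_rjust(out);	/* right justify what was just printed */
	bch2_prt_newline(out);		/* obeys the current indent level */
}

A caller would typically declare the buffer with PRINTBUF, print buf.buf when done, and release it with bch2_printbuf_exit().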
diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c
deleted file mode 100644
index bafd1c91a802..000000000000
--- a/fs/bcachefs/progress.c
+++ /dev/null
@@ -1,63 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "bbpos.h"
-#include "disk_accounting.h"
-#include "progress.h"
-
-void bch2_progress_init(struct progress_indicator_state *s,
- struct bch_fs *c,
- u64 btree_id_mask)
-{
- memset(s, 0, sizeof(*s));
-
- s->next_print = jiffies + HZ * 10;
-
- for (unsigned i = 0; i < BTREE_ID_NR; i++) {
- if (!(btree_id_mask & BIT_ULL(i)))
- continue;
-
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_btree,
- .btree.id = i,
- };
-
- u64 v;
- bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
- s->nodes_total += div64_ul(v, btree_sectors(c));
- }
-}
-
-static inline bool progress_update_p(struct progress_indicator_state *s)
-{
- bool ret = time_after_eq(jiffies, s->next_print);
-
- if (ret)
- s->next_print = jiffies + HZ * 10;
- return ret;
-}
-
-void bch2_progress_update_iter(struct btree_trans *trans,
- struct progress_indicator_state *s,
- struct btree_iter *iter,
- const char *msg)
-{
- struct bch_fs *c = trans->c;
- struct btree *b = path_l(btree_iter_path(trans, iter))->b;
-
- s->nodes_seen += b != s->last_node;
- s->last_node = b;
-
- if (progress_update_p(s)) {
- struct printbuf buf = PRINTBUF;
- unsigned percent = s->nodes_total
- ? div64_u64(s->nodes_seen * 100, s->nodes_total)
- : 0;
-
- prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
- msg, percent, s->nodes_seen, s->nodes_total);
- bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
-
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-}
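progress_update_p() above is a deadline-based ratelimit: print at most once every ten seconds, with the deadline stored in jiffies. The same shape in portable C, as a sketch that assumes one-second granularity is acceptable:

#include <stdbool.h>
#include <time.h>

/* Return true at most once per 'interval' seconds per deadline. */
static bool ratelimit_ok(time_t *deadline, time_t interval)
{
	time_t now = time(NULL);

	if (now < *deadline)
		return false;

	*deadline = now + interval;
	return true;
}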
diff --git a/fs/bcachefs/progress.h b/fs/bcachefs/progress.h
deleted file mode 100644
index 23fb1811f943..000000000000
--- a/fs/bcachefs/progress.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_PROGRESS_H
-#define _BCACHEFS_PROGRESS_H
-
-/*
- * Lame progress indicators
- *
- * We don't like to use these because they print to the dmesg console, which is
- * spammy - we much prefer to be wired up to a userspace program (e.g. via
- * thread_with_file) and have it print the progress indicator.
- *
- * But some code is old and doesn't support that, or runs in a context where
- * that's not yet practical (mount).
- */
-
-struct progress_indicator_state {
- unsigned long next_print;
- u64 nodes_seen;
- u64 nodes_total;
- struct btree *last_node;
-};
-
-void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64);
-void bch2_progress_update_iter(struct btree_trans *,
- struct progress_indicator_state *,
- struct btree_iter *,
- const char *);
-
-#endif /* _BCACHEFS_PROGRESS_H */
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
deleted file mode 100644
index 8b857fc33244..000000000000
--- a/fs/bcachefs/quota.c
+++ /dev/null
@@ -1,892 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "btree_update.h"
-#include "errcode.h"
-#include "error.h"
-#include "inode.h"
-#include "quota.h"
-#include "snapshot.h"
-#include "super-io.h"
-
-static const char * const bch2_quota_types[] = {
- "user",
- "group",
- "project",
-};
-
-static const char * const bch2_quota_counters[] = {
- "space",
- "inodes",
-};
-
-static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_quota *q = field_to_type(f, quota);
-
- if (vstruct_bytes(&q->field) < sizeof(*q)) {
- prt_printf(err, "wrong size (got %zu should be %zu)",
- vstruct_bytes(&q->field), sizeof(*q));
- return -BCH_ERR_invalid_sb_quota;
- }
-
- return 0;
-}
-
-static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_quota *q = field_to_type(f, quota);
- unsigned qtyp, counter;
-
- for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) {
- prt_printf(out, "%s: flags %llx",
- bch2_quota_types[qtyp],
- le64_to_cpu(q->q[qtyp].flags));
-
- for (counter = 0; counter < Q_COUNTERS; counter++)
- prt_printf(out, " %s timelimit %u warnlimit %u",
- bch2_quota_counters[counter],
- le32_to_cpu(q->q[qtyp].c[counter].timelimit),
- le32_to_cpu(q->q[qtyp].c[counter].warnlimit));
-
- prt_newline(out);
- }
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_quota = {
- .validate = bch2_sb_quota_validate,
- .to_text = bch2_sb_quota_to_text,
-};
-
-int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(k.k->p.inode >= QTYP_NR,
- c, quota_type_invalid,
- "invalid quota type (%llu >= %u)",
- k.k->p.inode, QTYP_NR);
-fsck_err:
- return ret;
-}
-
-void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_quota dq = bkey_s_c_to_quota(k);
- unsigned i;
-
- for (i = 0; i < Q_COUNTERS; i++)
- prt_printf(out, "%s hardlimit %llu softlimit %llu",
- bch2_quota_counters[i],
- le64_to_cpu(dq.v->c[i].hardlimit),
- le64_to_cpu(dq.v->c[i].softlimit));
-}
-
-#ifdef CONFIG_BCACHEFS_QUOTA
-
-#include <linux/cred.h>
-#include <linux/fs.h>
-#include <linux/quota.h>
-
-static void qc_info_to_text(struct printbuf *out, struct qc_info *i)
-{
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 20);
-
- prt_printf(out, "i_fieldmask\t%x\n", i->i_fieldmask);
- prt_printf(out, "i_flags\t%u\n", i->i_flags);
- prt_printf(out, "i_spc_timelimit\t%u\n", i->i_spc_timelimit);
- prt_printf(out, "i_ino_timelimit\t%u\n", i->i_ino_timelimit);
- prt_printf(out, "i_rt_spc_timelimit\t%u\n", i->i_rt_spc_timelimit);
- prt_printf(out, "i_spc_warnlimit\t%u\n", i->i_spc_warnlimit);
- prt_printf(out, "i_ino_warnlimit\t%u\n", i->i_ino_warnlimit);
- prt_printf(out, "i_rt_spc_warnlimit\t%u\n", i->i_rt_spc_warnlimit);
-}
-
-static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q)
-{
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 20);
-
- prt_printf(out, "d_fieldmask\t%x\n", q->d_fieldmask);
- prt_printf(out, "d_spc_hardlimit\t%llu\n", q->d_spc_hardlimit);
- prt_printf(out, "d_spc_softlimit\t%llu\n", q->d_spc_softlimit);
-	prt_printf(out, "d_ino_hardlimit\t%llu\n", q->d_ino_hardlimit);
- prt_printf(out, "d_ino_softlimit\t%llu\n", q->d_ino_softlimit);
- prt_printf(out, "d_space\t%llu\n", q->d_space);
- prt_printf(out, "d_ino_count\t%llu\n", q->d_ino_count);
- prt_printf(out, "d_ino_timer\t%llu\n", q->d_ino_timer);
- prt_printf(out, "d_spc_timer\t%llu\n", q->d_spc_timer);
- prt_printf(out, "d_ino_warns\t%i\n", q->d_ino_warns);
- prt_printf(out, "d_spc_warns\t%i\n", q->d_spc_warns);
-}
-
-static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
-{
- qtypes >>= i;
- return qtypes ? i + __ffs(qtypes) : QTYP_NR;
-}
-
-#define for_each_set_qtype(_c, _i, _q, _qtypes) \
- for (_i = 0; \
- (_i = __next_qtype(_i, _qtypes), \
- _q = &(_c)->quotas[_i], \
- _i < QTYP_NR); \
- _i++)
-
-static bool ignore_hardlimit(struct bch_memquota_type *q)
-{
- if (capable(CAP_SYS_RESOURCE))
- return true;
-#if 0
- struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
-
- return capable(CAP_SYS_RESOURCE) &&
- (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
- !(info->dqi_flags & DQF_ROOT_SQUASH));
-#endif
- return false;
-}
-
-enum quota_msg {
- SOFTWARN, /* Softlimit reached */
- SOFTLONGWARN, /* Grace time expired */
- HARDWARN, /* Hardlimit reached */
-
- HARDBELOW, /* Usage got below inode hardlimit */
- SOFTBELOW, /* Usage got below inode softlimit */
-};
-
-static int quota_nl[][Q_COUNTERS] = {
- [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN,
- [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN,
- [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN,
- [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW,
- [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW,
-
- [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN,
- [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN,
- [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN,
- [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW,
- [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW,
-};
-
-struct quota_msgs {
- u8 nr;
- struct {
- u8 qtype;
- u8 msg;
- } m[QTYP_NR * Q_COUNTERS];
-};
-
-static void prepare_msg(unsigned qtype,
- enum quota_counters counter,
- struct quota_msgs *msgs,
- enum quota_msg msg_type)
-{
- BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m));
-
- msgs->m[msgs->nr].qtype = qtype;
- msgs->m[msgs->nr].msg = quota_nl[msg_type][counter];
- msgs->nr++;
-}
-
-static void prepare_warning(struct memquota_counter *qc,
- unsigned qtype,
- enum quota_counters counter,
- struct quota_msgs *msgs,
- enum quota_msg msg_type)
-{
- if (qc->warning_issued & (1 << msg_type))
- return;
-
- prepare_msg(qtype, counter, msgs, msg_type);
-}
-
-static void flush_warnings(struct bch_qid qid,
- struct super_block *sb,
- struct quota_msgs *msgs)
-{
- unsigned i;
-
- for (i = 0; i < msgs->nr; i++)
- quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]),
- sb->s_dev, msgs->m[i].msg);
-}
-
-static int bch2_quota_check_limit(struct bch_fs *c,
- unsigned qtype,
- struct bch_memquota *mq,
- struct quota_msgs *msgs,
- enum quota_counters counter,
- s64 v,
- enum quota_acct_mode mode)
-{
- struct bch_memquota_type *q = &c->quotas[qtype];
- struct memquota_counter *qc = &mq->c[counter];
- u64 n = qc->v + v;
-
- BUG_ON((s64) n < 0);
-
- if (mode == KEY_TYPE_QUOTA_NOCHECK)
- return 0;
-
- if (v <= 0) {
- if (n < qc->hardlimit &&
- (qc->warning_issued & (1 << HARDWARN))) {
- qc->warning_issued &= ~(1 << HARDWARN);
- prepare_msg(qtype, counter, msgs, HARDBELOW);
- }
-
- if (n < qc->softlimit &&
- (qc->warning_issued & (1 << SOFTWARN))) {
- qc->warning_issued &= ~(1 << SOFTWARN);
- prepare_msg(qtype, counter, msgs, SOFTBELOW);
- }
-
- qc->warning_issued = 0;
- return 0;
- }
-
- if (qc->hardlimit &&
- qc->hardlimit < n &&
- !ignore_hardlimit(q)) {
- prepare_warning(qc, qtype, counter, msgs, HARDWARN);
- return -EDQUOT;
- }
-
- if (qc->softlimit &&
- qc->softlimit < n) {
- if (qc->timer == 0) {
- qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit;
- prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
- } else if (ktime_get_real_seconds() >= qc->timer &&
- !ignore_hardlimit(q)) {
- prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
- return -EDQUOT;
- }
- }
-
- return 0;
-}
-
-int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
- enum quota_counters counter, s64 v,
- enum quota_acct_mode mode)
-{
- unsigned qtypes = enabled_qtypes(c);
- struct bch_memquota_type *q;
- struct bch_memquota *mq[QTYP_NR];
- struct quota_msgs msgs;
- unsigned i;
- int ret = 0;
-
- memset(&msgs, 0, sizeof(msgs));
-
- for_each_set_qtype(c, i, q, qtypes) {
- mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_KERNEL);
- if (!mq[i])
- return -ENOMEM;
- }
-
- for_each_set_qtype(c, i, q, qtypes)
- mutex_lock_nested(&q->lock, i);
-
- for_each_set_qtype(c, i, q, qtypes) {
- ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode);
- if (ret)
- goto err;
- }
-
- for_each_set_qtype(c, i, q, qtypes)
- mq[i]->c[counter].v += v;
-err:
- for_each_set_qtype(c, i, q, qtypes)
- mutex_unlock(&q->lock);
-
- flush_warnings(qid, c->vfs_sb, &msgs);
-
- return ret;
-}
-
-static void __bch2_quota_transfer(struct bch_memquota *src_q,
- struct bch_memquota *dst_q,
- enum quota_counters counter, s64 v)
-{
- BUG_ON(v > src_q->c[counter].v);
- BUG_ON(v + dst_q->c[counter].v < v);
-
- src_q->c[counter].v -= v;
- dst_q->c[counter].v += v;
-}
-
-int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
- struct bch_qid dst,
- struct bch_qid src, u64 space,
- enum quota_acct_mode mode)
-{
- struct bch_memquota_type *q;
- struct bch_memquota *src_q[3], *dst_q[3];
- struct quota_msgs msgs;
- unsigned i;
- int ret = 0;
-
- qtypes &= enabled_qtypes(c);
-
- memset(&msgs, 0, sizeof(msgs));
-
- for_each_set_qtype(c, i, q, qtypes) {
- src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_KERNEL);
- dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_KERNEL);
- if (!src_q[i] || !dst_q[i])
- return -ENOMEM;
- }
-
- for_each_set_qtype(c, i, q, qtypes)
- mutex_lock_nested(&q->lock, i);
-
- for_each_set_qtype(c, i, q, qtypes) {
- ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC,
- dst_q[i]->c[Q_SPC].v + space,
- mode);
- if (ret)
- goto err;
-
- ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO,
- dst_q[i]->c[Q_INO].v + 1,
- mode);
- if (ret)
- goto err;
- }
-
- for_each_set_qtype(c, i, q, qtypes) {
- __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space);
- __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1);
- }
-
-err:
- for_each_set_qtype(c, i, q, qtypes)
- mutex_unlock(&q->lock);
-
- flush_warnings(dst, c->vfs_sb, &msgs);
-
- return ret;
-}
-
-static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k,
- struct qc_dqblk *qdq)
-{
- struct bkey_s_c_quota dq;
- struct bch_memquota_type *q;
- struct bch_memquota *mq;
- unsigned i;
-
- BUG_ON(k.k->p.inode >= QTYP_NR);
-
- if (!((1U << k.k->p.inode) & enabled_qtypes(c)))
- return 0;
-
- switch (k.k->type) {
- case KEY_TYPE_quota:
- dq = bkey_s_c_to_quota(k);
- q = &c->quotas[k.k->p.inode];
-
- mutex_lock(&q->lock);
- mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL);
- if (!mq) {
- mutex_unlock(&q->lock);
- return -ENOMEM;
- }
-
- for (i = 0; i < Q_COUNTERS; i++) {
- mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit);
- mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit);
- }
-
- if (qdq && qdq->d_fieldmask & QC_SPC_TIMER)
- mq->c[Q_SPC].timer = qdq->d_spc_timer;
- if (qdq && qdq->d_fieldmask & QC_SPC_WARNS)
- mq->c[Q_SPC].warns = qdq->d_spc_warns;
- if (qdq && qdq->d_fieldmask & QC_INO_TIMER)
- mq->c[Q_INO].timer = qdq->d_ino_timer;
- if (qdq && qdq->d_fieldmask & QC_INO_WARNS)
- mq->c[Q_INO].warns = qdq->d_ino_warns;
-
- mutex_unlock(&q->lock);
- }
-
- return 0;
-}
-
-void bch2_fs_quota_exit(struct bch_fs *c)
-{
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
- genradix_free(&c->quotas[i].table);
-}
-
-void bch2_fs_quota_init(struct bch_fs *c)
-{
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
- mutex_init(&c->quotas[i].lock);
-}
-
-static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb)
-{
- struct bch_sb_field_quota *sb_quota = bch2_sb_field_get(sb->sb, quota);
-
- if (sb_quota)
- return sb_quota;
-
- sb_quota = bch2_sb_field_resize(sb, quota, sizeof(*sb_quota) / sizeof(u64));
- if (sb_quota) {
- unsigned qtype, qc;
-
- for (qtype = 0; qtype < QTYP_NR; qtype++)
- for (qc = 0; qc < Q_COUNTERS; qc++)
- sb_quota->q[qtype].c[qc].timelimit =
- cpu_to_le32(7 * 24 * 60 * 60);
- }
-
- return sb_quota;
-}
-
-static void bch2_sb_quota_read(struct bch_fs *c)
-{
- struct bch_sb_field_quota *sb_quota;
- unsigned i, j;
-
- sb_quota = bch2_sb_field_get(c->disk_sb.sb, quota);
- if (!sb_quota)
- return;
-
- for (i = 0; i < QTYP_NR; i++) {
- struct bch_memquota_type *q = &c->quotas[i];
-
- for (j = 0; j < Q_COUNTERS; j++) {
- q->limits[j].timelimit =
- le32_to_cpu(sb_quota->q[i].c[j].timelimit);
- q->limits[j].warnlimit =
- le32_to_cpu(sb_quota->q[i].c[j].warnlimit);
- }
- }
-}
-
-static int bch2_fs_quota_read_inode(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bch_inode_unpacked u;
- struct bch_snapshot_tree s_t;
- u32 tree = bch2_snapshot_tree(c, k.k->p.snapshot);
-
- int ret = bch2_snapshot_tree_lookup(trans, tree, &s_t);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "%s: snapshot tree %u not found", __func__, tree);
- if (ret)
- return ret;
-
- if (!s_t.master_subvol)
- goto advance;
-
- ret = bch2_inode_find_by_inum_nowarn_trans(trans,
- (subvol_inum) {
- le32_to_cpu(s_t.master_subvol),
- k.k->p.offset,
- }, &u);
- /*
- * Inode might be deleted in this snapshot - the easiest way to handle
- * that is to just skip it here:
- */
- if (bch2_err_matches(ret, ENOENT))
- goto advance;
-
- if (ret)
- return ret;
-
- bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
- KEY_TYPE_QUOTA_NOCHECK);
- bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
- KEY_TYPE_QUOTA_NOCHECK);
-advance:
- bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
- return 0;
-}
-
-int bch2_fs_quota_read(struct bch_fs *c)
-{
-
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
- if (!sb_quota) {
- mutex_unlock(&c->sb_lock);
- return -BCH_ERR_ENOSPC_sb_quota;
- }
-
- bch2_sb_quota_read(c);
- mutex_unlock(&c->sb_lock);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN,
- BTREE_ITER_prefetch, k,
- __bch2_quota_set(c, k, NULL)) ?:
- for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- bch2_fs_quota_read_inode(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-/* Enable/disable/delete quotas for an entire filesystem: */
-
-static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
-{
- struct bch_fs *c = sb->s_fs_info;
- struct bch_sb_field_quota *sb_quota;
- int ret = 0;
-
- if (sb->s_flags & SB_RDONLY)
- return -EROFS;
-
- /* Accounting must be enabled at mount time: */
- if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT))
- return -EINVAL;
-
- /* Can't enable enforcement without accounting: */
- if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota)
- return -EINVAL;
-
- if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota)
- return -EINVAL;
-
- if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota)
- return -EINVAL;
-
- mutex_lock(&c->sb_lock);
- sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
- if (!sb_quota) {
- ret = -BCH_ERR_ENOSPC_sb_quota;
- goto unlock;
- }
-
- if (uflags & FS_QUOTA_UDQ_ENFD)
- SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
-
- if (uflags & FS_QUOTA_GDQ_ENFD)
- SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true);
-
- if (uflags & FS_QUOTA_PDQ_ENFD)
- SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
-
- bch2_write_super(c);
-unlock:
- mutex_unlock(&c->sb_lock);
-
- return bch2_err_class(ret);
-}
-
-static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
-{
- struct bch_fs *c = sb->s_fs_info;
-
- if (sb->s_flags & SB_RDONLY)
- return -EROFS;
-
- mutex_lock(&c->sb_lock);
- if (uflags & FS_QUOTA_UDQ_ENFD)
- SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false);
-
- if (uflags & FS_QUOTA_GDQ_ENFD)
- SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false);
-
- if (uflags & FS_QUOTA_PDQ_ENFD)
- SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false);
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return 0;
-}
-
-static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
-{
- struct bch_fs *c = sb->s_fs_info;
- int ret;
-
- if (sb->s_flags & SB_RDONLY)
- return -EROFS;
-
- if (uflags & FS_USER_QUOTA) {
- if (c->opts.usrquota)
- return -EINVAL;
-
- ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
- POS(QTYP_USR, 0),
- POS(QTYP_USR, U64_MAX),
- 0, NULL);
- if (ret)
- return ret;
- }
-
- if (uflags & FS_GROUP_QUOTA) {
- if (c->opts.grpquota)
- return -EINVAL;
-
- ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
- POS(QTYP_GRP, 0),
- POS(QTYP_GRP, U64_MAX),
- 0, NULL);
- if (ret)
- return ret;
- }
-
- if (uflags & FS_PROJ_QUOTA) {
- if (c->opts.prjquota)
- return -EINVAL;
-
- ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
- POS(QTYP_PRJ, 0),
- POS(QTYP_PRJ, U64_MAX),
- 0, NULL);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/*
- * Return quota status information, such as enforcements, quota file inode
- * numbers etc.
- */
-static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state)
-{
- struct bch_fs *c = sb->s_fs_info;
- unsigned qtypes = enabled_qtypes(c);
- unsigned i;
-
- memset(state, 0, sizeof(*state));
-
- for (i = 0; i < QTYP_NR; i++) {
- state->s_state[i].flags |= QCI_SYSFILE;
-
- if (!(qtypes & (1 << i)))
- continue;
-
- state->s_state[i].flags |= QCI_ACCT_ENABLED;
-
- state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit;
- state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit;
-
- state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit;
- state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit;
- }
-
- return 0;
-}
-
-/*
- * Adjust quota timers & warnings
- */
-static int bch2_quota_set_info(struct super_block *sb, int type,
- struct qc_info *info)
-{
- struct bch_fs *c = sb->s_fs_info;
- struct bch_sb_field_quota *sb_quota;
- int ret = 0;
-
- if (0) {
- struct printbuf buf = PRINTBUF;
-
- qc_info_to_text(&buf, info);
- pr_info("setting:\n%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- if (sb->s_flags & SB_RDONLY)
- return -EROFS;
-
- if (type >= QTYP_NR)
- return -EINVAL;
-
- if (!((1 << type) & enabled_qtypes(c)))
- return -ESRCH;
-
- if (info->i_fieldmask &
- ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS))
- return -EINVAL;
-
- mutex_lock(&c->sb_lock);
- sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
- if (!sb_quota) {
- ret = -BCH_ERR_ENOSPC_sb_quota;
- goto unlock;
- }
-
- if (info->i_fieldmask & QC_SPC_TIMER)
- sb_quota->q[type].c[Q_SPC].timelimit =
- cpu_to_le32(info->i_spc_timelimit);
-
- if (info->i_fieldmask & QC_SPC_WARNS)
- sb_quota->q[type].c[Q_SPC].warnlimit =
- cpu_to_le32(info->i_spc_warnlimit);
-
- if (info->i_fieldmask & QC_INO_TIMER)
- sb_quota->q[type].c[Q_INO].timelimit =
- cpu_to_le32(info->i_ino_timelimit);
-
- if (info->i_fieldmask & QC_INO_WARNS)
- sb_quota->q[type].c[Q_INO].warnlimit =
- cpu_to_le32(info->i_ino_warnlimit);
-
- bch2_sb_quota_read(c);
-
- bch2_write_super(c);
-unlock:
- mutex_unlock(&c->sb_lock);
-
- return bch2_err_class(ret);
-}
-
-/* Get/set individual quotas: */
-
-static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src)
-{
- dst->d_space = src->c[Q_SPC].v << 9;
- dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9;
- dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9;
- dst->d_spc_timer = src->c[Q_SPC].timer;
- dst->d_spc_warns = src->c[Q_SPC].warns;
-
- dst->d_ino_count = src->c[Q_INO].v;
- dst->d_ino_hardlimit = src->c[Q_INO].hardlimit;
- dst->d_ino_softlimit = src->c[Q_INO].softlimit;
- dst->d_ino_timer = src->c[Q_INO].timer;
- dst->d_ino_warns = src->c[Q_INO].warns;
-}
-
-static int bch2_get_quota(struct super_block *sb, struct kqid kqid,
- struct qc_dqblk *qdq)
-{
- struct bch_fs *c = sb->s_fs_info;
- struct bch_memquota_type *q = &c->quotas[kqid.type];
- qid_t qid = from_kqid(&init_user_ns, kqid);
- struct bch_memquota *mq;
-
- memset(qdq, 0, sizeof(*qdq));
-
- mutex_lock(&q->lock);
- mq = genradix_ptr(&q->table, qid);
- if (mq)
- __bch2_quota_get(qdq, mq);
- mutex_unlock(&q->lock);
-
- return 0;
-}
-
-static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid,
- struct qc_dqblk *qdq)
-{
- struct bch_fs *c = sb->s_fs_info;
- struct bch_memquota_type *q = &c->quotas[kqid->type];
- qid_t qid = from_kqid(&init_user_ns, *kqid);
- struct genradix_iter iter;
- struct bch_memquota *mq;
- int ret = 0;
-
- mutex_lock(&q->lock);
-
- genradix_for_each_from(&q->table, iter, mq, qid)
- if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) {
- __bch2_quota_get(qdq, mq);
- *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos);
- goto found;
- }
-
- ret = -ENOENT;
-found:
- mutex_unlock(&q->lock);
- return bch2_err_class(ret);
-}
-
-static int bch2_set_quota_trans(struct btree_trans *trans,
- struct bkey_i_quota *new_quota,
- struct qc_dqblk *qdq)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p,
- BTREE_ITER_slots|BTREE_ITER_intent);
- ret = bkey_err(k);
- if (unlikely(ret))
- return ret;
-
- if (k.k->type == KEY_TYPE_quota)
- new_quota->v = *bkey_s_c_to_quota(k).v;
-
- if (qdq->d_fieldmask & QC_SPC_SOFT)
- new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9);
- if (qdq->d_fieldmask & QC_SPC_HARD)
- new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9);
-
- if (qdq->d_fieldmask & QC_INO_SOFT)
- new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit);
- if (qdq->d_fieldmask & QC_INO_HARD)
- new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
-
- ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_set_quota(struct super_block *sb, struct kqid qid,
- struct qc_dqblk *qdq)
-{
- struct bch_fs *c = sb->s_fs_info;
- struct bkey_i_quota new_quota;
- int ret;
-
- if (0) {
- struct printbuf buf = PRINTBUF;
-
- qc_dqblk_to_text(&buf, qdq);
- pr_info("setting:\n%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- if (sb->s_flags & SB_RDONLY)
- return -EROFS;
-
- bkey_quota_init(&new_quota.k_i);
- new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
-
- ret = bch2_trans_commit_do(c, NULL, NULL, 0,
- bch2_set_quota_trans(trans, &new_quota, qdq)) ?:
- __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq);
-
- return bch2_err_class(ret);
-}
-
-const struct quotactl_ops bch2_quotactl_operations = {
- .quota_enable = bch2_quota_enable,
- .quota_disable = bch2_quota_disable,
- .rm_xquota = bch2_quota_remove,
-
- .get_state = bch2_quota_get_state,
- .set_info = bch2_quota_set_info,
-
- .get_dqblk = bch2_get_quota,
- .get_nextdqblk = bch2_get_next_quota,
- .set_dqblk = bch2_set_quota,
-};
-
-#endif /* CONFIG_BCACHEFS_QUOTA */
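
For context on the "<< 9" / ">> 9" shifts in __bch2_quota_get() and bch2_set_quota_trans() above: the VFS qc_dqblk interface deals in bytes, while bcachefs accounts space in 512-byte sectors. A minimal sketch of the conversion, with illustrative helper names:

#include <stdint.h>

static inline uint64_t sectors_to_bytes(uint64_t sectors)
{
        return sectors << 9;            /* 512 == 1 << 9 */
}

static inline uint64_t bytes_to_sectors(uint64_t bytes)
{
        return bytes >> 9;              /* truncates any partial sector */
}
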
diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h
deleted file mode 100644
index 1551800ff44c..000000000000
--- a/fs/bcachefs/quota.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_QUOTA_H
-#define _BCACHEFS_QUOTA_H
-
-#include "inode.h"
-#include "quota_types.h"
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
-
-int bch2_quota_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_quota ((struct bkey_ops) { \
- .key_validate = bch2_quota_validate, \
- .val_to_text = bch2_quota_to_text, \
- .min_val_size = 32, \
-})
-
-static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u)
-{
- return (struct bch_qid) {
- .q[QTYP_USR] = u->bi_uid,
- .q[QTYP_GRP] = u->bi_gid,
- .q[QTYP_PRJ] = u->bi_project ? u->bi_project - 1 : 0,
- };
-}
-
-static inline unsigned enabled_qtypes(struct bch_fs *c)
-{
- return ((c->opts.usrquota << QTYP_USR)|
- (c->opts.grpquota << QTYP_GRP)|
- (c->opts.prjquota << QTYP_PRJ));
-}
-
-#ifdef CONFIG_BCACHEFS_QUOTA
-
-int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters,
- s64, enum quota_acct_mode);
-
-int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid,
- struct bch_qid, u64, enum quota_acct_mode);
-
-void bch2_fs_quota_exit(struct bch_fs *);
-void bch2_fs_quota_init(struct bch_fs *);
-int bch2_fs_quota_read(struct bch_fs *);
-
-extern const struct quotactl_ops bch2_quotactl_operations;
-
-#else
-
-static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
- enum quota_counters counter, s64 v,
- enum quota_acct_mode mode)
-{
- return 0;
-}
-
-static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
- struct bch_qid dst,
- struct bch_qid src, u64 space,
- enum quota_acct_mode mode)
-{
- return 0;
-}
-
-static inline void bch2_fs_quota_exit(struct bch_fs *c) {}
-static inline void bch2_fs_quota_init(struct bch_fs *c) {}
-static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; }
-
-#endif
-
-#endif /* _BCACHEFS_QUOTA_H */
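
The enabled_qtypes() helper above packs the usrquota/grpquota/prjquota mount options into a bitmask indexed by enum quota_types; callers test membership with a shift, as bch2_quota_set_info() does with "(1 << type) & enabled_qtypes(c)". A toy version of the same pattern (struct opts here is a stand-in, not the real bch_opts):

#include <stdbool.h>

enum { QTYP_USR, QTYP_GRP, QTYP_PRJ };

struct opts { bool usrquota, grpquota, prjquota; };

static unsigned int enabled_qtypes(const struct opts *o)
{
        return (o->usrquota << QTYP_USR) |
               (o->grpquota << QTYP_GRP) |
               (o->prjquota << QTYP_PRJ);
}

static bool qtype_enabled(const struct opts *o, unsigned int type)
{
        return enabled_qtypes(o) & (1U << type);
}
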
diff --git a/fs/bcachefs/quota_format.h b/fs/bcachefs/quota_format.h
deleted file mode 100644
index dc34347ef6c7..000000000000
--- a/fs/bcachefs/quota_format.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_QUOTA_FORMAT_H
-#define _BCACHEFS_QUOTA_FORMAT_H
-
-/* KEY_TYPE_quota: */
-
-enum quota_types {
- QTYP_USR = 0,
- QTYP_GRP = 1,
- QTYP_PRJ = 2,
- QTYP_NR = 3,
-};
-
-enum quota_counters {
- Q_SPC = 0,
- Q_INO = 1,
- Q_COUNTERS = 2,
-};
-
-struct bch_quota_counter {
- __le64 hardlimit;
- __le64 softlimit;
-};
-
-struct bch_quota {
- struct bch_val v;
- struct bch_quota_counter c[Q_COUNTERS];
-} __packed __aligned(8);
-
-/* BCH_SB_FIELD_quota: */
-
-struct bch_sb_quota_counter {
- __le32 timelimit;
- __le32 warnlimit;
-};
-
-struct bch_sb_quota_type {
- __le64 flags;
- struct bch_sb_quota_counter c[Q_COUNTERS];
-};
-
-struct bch_sb_field_quota {
- struct bch_sb_field field;
- struct bch_sb_quota_type q[QTYP_NR];
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_QUOTA_FORMAT_H */
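
The structs above define the on-disk layout, so their sizes are part of the format: struct bch_quota_counter is 16 bytes and struct bch_quota 32 (assuming struct bch_val is the usual zero-size marker), which is where the .min_val_size = 32 in bch2_bkey_ops_quota earlier comes from. A compile-time guard one might add alongside these definitions:

_Static_assert(sizeof(struct bch_quota_counter) == 16,
               "two __le64 fields");
_Static_assert(sizeof(struct bch_quota) == 32,
               "must match .min_val_size in bch2_bkey_ops_quota");
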
diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h
deleted file mode 100644
index 6a136083d389..000000000000
--- a/fs/bcachefs/quota_types.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_QUOTA_TYPES_H
-#define _BCACHEFS_QUOTA_TYPES_H
-
-#include <linux/generic-radix-tree.h>
-
-struct bch_qid {
- u32 q[QTYP_NR];
-};
-
-enum quota_acct_mode {
- KEY_TYPE_QUOTA_PREALLOC,
- KEY_TYPE_QUOTA_WARN,
- KEY_TYPE_QUOTA_NOCHECK,
-};
-
-struct memquota_counter {
- u64 v;
- u64 hardlimit;
- u64 softlimit;
- s64 timer;
- int warns;
- int warning_issued;
-};
-
-struct bch_memquota {
- struct memquota_counter c[Q_COUNTERS];
-};
-
-typedef GENRADIX(struct bch_memquota) bch_memquota_table;
-
-struct quota_limit {
- u32 timelimit;
- u32 warnlimit;
-};
-
-struct bch_memquota_type {
- struct quota_limit limits[Q_COUNTERS];
- bch_memquota_table table;
- struct mutex lock;
-};
-
-#endif /* _BCACHEFS_QUOTA_TYPES_H */
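
The in-memory table above is a generic radix tree keyed by qid, so it stays sparse: a slot is only allocated once a given id is actually accounted. A minimal sketch of the lookup pattern used by bch2_get_quota() in quota.c, copying out under the lock (memquota_peek is a hypothetical helper):

static bool memquota_peek(struct bch_memquota_type *q, u32 qid,
                          struct bch_memquota *out)
{
        struct bch_memquota *mq;

        mutex_lock(&q->lock);
        mq = genradix_ptr(&q->table, qid);      /* NULL if never allocated */
        if (mq)
                *out = *mq;                     /* copy out under the lock */
        mutex_unlock(&q->lock);

        return mq != NULL;
}
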
diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c
deleted file mode 100644
index bef2aa1b8bcd..000000000000
--- a/fs/bcachefs/rcu_pending.c
+++ /dev/null
@@ -1,666 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#define pr_fmt(fmt) "%s() " fmt "\n", __func__
-
-#include <linux/generic-radix-tree.h>
-#include <linux/mm.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <linux/srcu.h>
-#include <linux/vmalloc.h>
-
-#include "rcu_pending.h"
-#include "darray.h"
-#include "util.h"
-
-#define static_array_for_each(_a, _i) \
- for (typeof(&(_a)[0]) _i = _a; \
- _i < (_a) + ARRAY_SIZE(_a); \
- _i++)
-
-enum rcu_pending_special {
- RCU_PENDING_KVFREE = 1,
- RCU_PENDING_CALL_RCU = 2,
-};
-
-#define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE)
-#define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU)
-
-#ifdef __KERNEL__
-typedef unsigned long rcu_gp_poll_state_t;
-
-static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r)
-{
- return l == r;
-}
-#else
-typedef struct urcu_gp_poll_state rcu_gp_poll_state_t;
-
-static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r)
-{
- return l.grace_period_id == r.grace_period_id;
-}
-#endif
-
-static inline rcu_gp_poll_state_t __get_state_synchronize_rcu(struct srcu_struct *ssp)
-{
- return ssp
- ? get_state_synchronize_srcu(ssp)
- : get_state_synchronize_rcu();
-}
-
-static inline rcu_gp_poll_state_t __start_poll_synchronize_rcu(struct srcu_struct *ssp)
-{
- return ssp
- ? start_poll_synchronize_srcu(ssp)
- : start_poll_synchronize_rcu();
-}
-
-static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, rcu_gp_poll_state_t cookie)
-{
- return ssp
- ? poll_state_synchronize_srcu(ssp, cookie)
- : poll_state_synchronize_rcu(cookie);
-}
-
-static inline void __rcu_barrier(struct srcu_struct *ssp)
-{
- return ssp
- ? srcu_barrier(ssp)
- : rcu_barrier();
-}
-
-static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp,
- rcu_callback_t func)
-{
- if (ssp)
- call_srcu(ssp, rhp, func);
- else
- call_rcu(rhp, func);
-}
-
-struct rcu_pending_seq {
- /*
- * We're using a radix tree like a vector - we're just pushing elements
- * onto the end; we're using a radix tree instead of an actual vector to
- * avoid reallocation overhead
- */
- GENRADIX(struct rcu_head *) objs;
- size_t nr;
- struct rcu_head **cursor;
- rcu_gp_poll_state_t seq;
-};
-
-struct rcu_pending_list {
- struct rcu_head *head;
- struct rcu_head *tail;
- rcu_gp_poll_state_t seq;
-};
-
-struct rcu_pending_pcpu {
- struct rcu_pending *parent;
- spinlock_t lock;
- int cpu;
-
- /*
- * We can't bound the number of unprocessed gp sequence numbers, and we
- * can't efficiently merge radix trees for expired grace periods, so we
- * need darray/vector:
- */
- DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs;
-
- /* Third entry is for expired objects: */
- struct rcu_pending_list lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1];
-
- struct rcu_head cb;
- bool cb_armed;
- struct work_struct work;
-};
-
-static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p)
-{
- if (p->objs.nr)
- return true;
-
- static_array_for_each(p->lists, i)
- if (i->head)
- return true;
-
- return false;
-}
-
-static void rcu_pending_list_merge(struct rcu_pending_list *l1,
- struct rcu_pending_list *l2)
-{
-#ifdef __KERNEL__
- if (!l1->head)
- l1->head = l2->head;
- else
- l1->tail->next = l2->head;
-#else
- if (!l1->head)
- l1->head = l2->head;
- else
- l1->tail->next.next = (void *) l2->head;
-#endif
-
- l1->tail = l2->tail;
- l2->head = l2->tail = NULL;
-}
-
-static void rcu_pending_list_add(struct rcu_pending_list *l,
- struct rcu_head *n)
-{
-#ifdef __KERNEL__
- if (!l->head)
- l->head = n;
- else
- l->tail->next = n;
- l->tail = n;
- n->next = NULL;
-#else
- if (!l->head)
- l->head = n;
- else
- l->tail->next.next = (void *) n;
- l->tail = n;
- n->next.next = NULL;
-#endif
-}
-
-static void merge_expired_lists(struct rcu_pending_pcpu *p)
-{
- struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
-
- for (struct rcu_pending_list *i = p->lists; i < expired; i++)
- if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq))
- rcu_pending_list_merge(expired, i);
-}
-
-#ifndef __KERNEL__
-static inline void kfree_bulk(size_t nr, void **p)
-{
- while (nr--)
- kfree(*p++);
-}
-
-#define local_irq_save(flags) \
-do { \
- flags = 0; \
-} while (0)
-#endif
-
-static noinline void __process_finished_items(struct rcu_pending *pending,
- struct rcu_pending_pcpu *p,
- unsigned long flags)
-{
- struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
- struct rcu_pending_seq objs = {};
- struct rcu_head *list = NULL;
-
- if (p->objs.nr &&
- __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) {
- objs = p->objs.data[0];
- darray_remove_item(&p->objs, p->objs.data);
- }
-
- merge_expired_lists(p);
-
- list = expired->head;
- expired->head = expired->tail = NULL;
-
- spin_unlock_irqrestore(&p->lock, flags);
-
- switch ((ulong) pending->process) {
- case RCU_PENDING_KVFREE:
- for (size_t i = 0; i < objs.nr; ) {
- size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i);
-
- kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i));
- i += nr_this_node;
- }
- genradix_free(&objs.objs);
-
- while (list) {
- struct rcu_head *obj = list;
-#ifdef __KERNEL__
- list = obj->next;
-#else
- list = (void *) obj->next.next;
-#endif
-
- /*
- * low bit of pointer indicates whether rcu_head needs
- * to be freed - kvfree_rcu_mightsleep()
- */
- BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);
-
- void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
- bool free_head = ((unsigned long) obj->func) & 1UL;
-
- kvfree(ptr);
- if (free_head)
- kfree(obj);
- }
-
- break;
-
- case RCU_PENDING_CALL_RCU:
- for (size_t i = 0; i < objs.nr; i++) {
- struct rcu_head *obj = *genradix_ptr(&objs.objs, i);
- obj->func(obj);
- }
- genradix_free(&objs.objs);
-
- while (list) {
- struct rcu_head *obj = list;
-#ifdef __KERNEL__
- list = obj->next;
-#else
- list = (void *) obj->next.next;
-#endif
- obj->func(obj);
- }
- break;
-
- default:
- for (size_t i = 0; i < objs.nr; i++)
- pending->process(pending, *genradix_ptr(&objs.objs, i));
- genradix_free(&objs.objs);
-
- while (list) {
- struct rcu_head *obj = list;
-#ifdef __KERNEL__
- list = obj->next;
-#else
- list = (void *) obj->next.next;
-#endif
- pending->process(pending, obj);
- }
- break;
- }
-}
-
-static bool process_finished_items(struct rcu_pending *pending,
- struct rcu_pending_pcpu *p,
- unsigned long flags)
-{
- /*
- * XXX: we should grab the gp seq once and avoid multiple function
- * calls, this is called from __rcu_pending_enqueue() fastpath in
- * calls; this is called from the __rcu_pending_enqueue() fastpath in
- */
- if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) ||
- (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) ||
- (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) ||
- p->lists[2].head) {
- __process_finished_items(pending, p, flags);
- return true;
- }
-
- return false;
-}
-
-static void rcu_pending_work(struct work_struct *work)
-{
- struct rcu_pending_pcpu *p =
- container_of(work, struct rcu_pending_pcpu, work);
- struct rcu_pending *pending = p->parent;
- unsigned long flags;
-
- do {
- spin_lock_irqsave(&p->lock, flags);
- } while (process_finished_items(pending, p, flags));
-
- spin_unlock_irqrestore(&p->lock, flags);
-}
-
-static void rcu_pending_rcu_cb(struct rcu_head *rcu)
-{
- struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb);
-
- schedule_work_on(p->cpu, &p->work);
-
- unsigned long flags;
- spin_lock_irqsave(&p->lock, flags);
- if (__rcu_pending_has_pending(p)) {
- spin_unlock_irqrestore(&p->lock, flags);
- __call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb);
- } else {
- p->cb_armed = false;
- spin_unlock_irqrestore(&p->lock, flags);
- }
-}
-
-static __always_inline struct rcu_pending_seq *
-get_object_radix(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq)
-{
- darray_for_each_reverse(p->objs, objs)
- if (rcu_gp_poll_cookie_eq(objs->seq, seq))
- return objs;
-
- if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC))
- return NULL;
-
- return &darray_last(p->objs);
-}
-
-static noinline bool
-rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq,
- struct rcu_head *head, void *ptr,
- unsigned long *flags)
-{
- if (ptr) {
- if (!head) {
- /*
- * kvfree_rcu_mightsleep(): we weren't passed an
- * rcu_head, but we need one: use the low bit of the
- * pointer to free to flag that the head needs to be
- * freed as well:
- */
- ptr = (void *)(((unsigned long) ptr)|1UL);
- head = kmalloc(sizeof(*head), __GFP_NOWARN);
- if (!head) {
- spin_unlock_irqrestore(&p->lock, *flags);
- head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL);
- /*
- * dropped lock, did GFP_KERNEL allocation,
- * check for gp expiration
- */
- if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) {
- kvfree(--ptr);
- kfree(head);
- spin_lock_irqsave(&p->lock, *flags);
- return false;
- }
- }
- }
-
- head->func = ptr;
- }
-again:
- for (struct rcu_pending_list *i = p->lists;
- i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
- if (rcu_gp_poll_cookie_eq(i->seq, seq)) {
- rcu_pending_list_add(i, head);
- return false;
- }
- }
-
- for (struct rcu_pending_list *i = p->lists;
- i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
- if (!i->head) {
- i->seq = seq;
- rcu_pending_list_add(i, head);
- return true;
- }
- }
-
- merge_expired_lists(p);
- goto again;
-}
-
-/*
- * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via
- * pending->process) once a grace period elapses.
- *
- * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall
- * back to a linked list.
- *
- * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a
- * process callback
- *
- * - If @ptr and @head are both not NULL, we're kvfree_rcu()
- *
- * - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep()
- *
- * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process
- * expired items.
- */
-static __always_inline void
-__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
- void *ptr, bool may_sleep)
-{
- struct rcu_pending_pcpu *p;
- struct rcu_pending_seq *objs;
- struct genradix_node *new_node = NULL;
- unsigned long flags;
- bool start_gp = false;
-
- BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));
-
- local_irq_save(flags);
- p = this_cpu_ptr(pending->p);
- spin_lock(&p->lock);
- rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu);
-restart:
- if (may_sleep &&
- unlikely(process_finished_items(pending, p, flags)))
- goto check_expired;
-
- /*
- * In kvfree_rcu() mode, the radix tree is only for slab pointers so
- * that we can do kfree_bulk() - vmalloc pointers always use the linked
- * list:
- */
- if (ptr && unlikely(is_vmalloc_addr(ptr)))
- goto list_add;
-
- objs = get_object_radix(p, seq);
- if (unlikely(!objs))
- goto list_add;
-
- if (unlikely(!objs->cursor)) {
- /*
- * New radix tree nodes must be added under @p->lock because the
- * tree root is in a darray that can be resized (typically,
- * genradix supports concurrent unlocked allocation of new
- * nodes) - hence preallocation and the retry loop:
- */
- objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs,
- objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN);
- if (unlikely(!objs->cursor)) {
- if (may_sleep) {
- spin_unlock_irqrestore(&p->lock, flags);
-
- gfp_t gfp = GFP_KERNEL;
- if (!head)
- gfp |= __GFP_NOFAIL;
-
- new_node = genradix_alloc_node(gfp);
- if (!new_node)
- may_sleep = false;
- goto check_expired;
- }
-list_add:
- start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags);
- goto start_gp;
- }
- }
-
- *objs->cursor++ = ptr ?: head;
- /* zero cursor if we hit the end of a radix tree node: */
- if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1)))
- objs->cursor = NULL;
- start_gp = !objs->nr;
- objs->nr++;
-start_gp:
- if (unlikely(start_gp)) {
- /*
- * We only have one callback (ideally, we would have one for
- * every outstanding grace period) - so if our callback is
- * already in flight, we may still have to start a grace period
- * (since we used get_state() above, not start_poll())
- */
- if (!p->cb_armed) {
- p->cb_armed = true;
- __call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb);
- } else {
- __start_poll_synchronize_rcu(pending->srcu);
- }
- }
- spin_unlock_irqrestore(&p->lock, flags);
-free_node:
- if (new_node)
- genradix_free_node(new_node);
- return;
-check_expired:
- if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) {
- switch ((ulong) pending->process) {
- case RCU_PENDING_KVFREE:
- kvfree(ptr);
- break;
- case RCU_PENDING_CALL_RCU:
- head->func(head);
- break;
- default:
- pending->process(pending, head);
- break;
- }
- goto free_node;
- }
-
- local_irq_save(flags);
- p = this_cpu_ptr(pending->p);
- spin_lock(&p->lock);
- goto restart;
-}
-
-void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj)
-{
- __rcu_pending_enqueue(pending, obj, NULL, true);
-}
-
-static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p)
-{
- struct rcu_head *ret = NULL;
-
- spin_lock_irq(&p->lock);
- darray_for_each(p->objs, objs)
- if (objs->nr) {
- ret = *genradix_ptr(&objs->objs, --objs->nr);
- objs->cursor = NULL;
- if (!objs->nr)
- genradix_free(&objs->objs);
- goto out;
- }
-
- static_array_for_each(p->lists, i)
- if (i->head) {
- ret = i->head;
-#ifdef __KERNEL__
- i->head = ret->next;
-#else
- i->head = (void *) ret->next.next;
-#endif
- if (!i->head)
- i->tail = NULL;
- goto out;
- }
-out:
- spin_unlock_irq(&p->lock);
-
- return ret;
-}
-
-struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending)
-{
- return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p));
-}
-
-struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending)
-{
- struct rcu_head *ret = rcu_pending_dequeue(pending);
-
- if (ret)
- return ret;
-
- int cpu;
- for_each_possible_cpu(cpu) {
- ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
- if (ret)
- break;
- }
- return ret;
-}
-
-static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending)
-{
- int cpu;
- for_each_possible_cpu(cpu) {
- struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
- spin_lock_irq(&p->lock);
- if (__rcu_pending_has_pending(p) || p->cb_armed) {
- spin_unlock_irq(&p->lock);
- return true;
- }
- spin_unlock_irq(&p->lock);
- }
-
- return false;
-}
-
-void rcu_pending_exit(struct rcu_pending *pending)
-{
- int cpu;
-
- if (!pending->p)
- return;
-
- while (rcu_pending_has_pending_or_armed(pending)) {
- __rcu_barrier(pending->srcu);
-
- for_each_possible_cpu(cpu) {
- struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
- flush_work(&p->work);
- }
- }
-
- for_each_possible_cpu(cpu) {
- struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
- flush_work(&p->work);
- }
-
- for_each_possible_cpu(cpu) {
- struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
-
- static_array_for_each(p->lists, i)
- WARN_ON(i->head);
- WARN_ON(p->objs.nr);
- darray_exit(&p->objs);
- }
- free_percpu(pending->p);
-}
-
-/**
- * rcu_pending_init: - initialize a rcu_pending
- *
- * @pending: Object to init
- * @srcu: May optionally be used with an srcu_struct; if NULL, uses normal
- * RCU flavor
- * @process: Callback function invoked on objects once their RCU barriers
- * have completed; if NULL, kvfree() is used.
- */
-int rcu_pending_init(struct rcu_pending *pending,
- struct srcu_struct *srcu,
- rcu_pending_process_fn process)
-{
- pending->p = alloc_percpu(struct rcu_pending_pcpu);
- if (!pending->p)
- return -ENOMEM;
-
- int cpu;
- for_each_possible_cpu(cpu) {
- struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
- p->parent = pending;
- p->cpu = cpu;
- spin_lock_init(&p->lock);
- darray_init(&p->objs);
- INIT_WORK(&p->work, rcu_pending_work);
- }
-
- pending->srcu = srcu;
- pending->process = process;
-
- return 0;
-}
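
Both rcu_pending_enqueue_list() and __process_finished_items() above rely on low-bit pointer tagging: slab allocations are at least ARCH_SLAB_MINALIGN aligned, so bit 0 of a pointer is free to carry the "this rcu_head must itself be freed" flag. The trick in isolation, with illustrative names:

#include <stdbool.h>
#include <stdint.h>

static inline void *ptr_tag(void *p, bool flag)
{
        return (void *)((uintptr_t)p | (uintptr_t)flag);
}

static inline void *ptr_untag(void *p)
{
        return (void *)((uintptr_t)p & ~(uintptr_t)1);
}

static inline bool ptr_tagged(void *p)
{
        return (uintptr_t)p & 1;
}
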
diff --git a/fs/bcachefs/rcu_pending.h b/fs/bcachefs/rcu_pending.h
deleted file mode 100644
index 71a2f4ddaade..000000000000
--- a/fs/bcachefs/rcu_pending.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_RCU_PENDING_H
-#define _LINUX_RCU_PENDING_H
-
-#include <linux/rcupdate.h>
-
-struct rcu_pending;
-typedef void (*rcu_pending_process_fn)(struct rcu_pending *, struct rcu_head *);
-
-struct rcu_pending_pcpu;
-
-struct rcu_pending {
- struct rcu_pending_pcpu __percpu *p;
- struct srcu_struct *srcu;
- rcu_pending_process_fn process;
-};
-
-void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj);
-struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending);
-struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending);
-
-void rcu_pending_exit(struct rcu_pending *pending);
-int rcu_pending_init(struct rcu_pending *pending,
- struct srcu_struct *srcu,
- rcu_pending_process_fn process);
-
-#endif /* _LINUX_RCU_PENDING_H */
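
Putting the API above together: callers embed an rcu_head in the object to be deferred and hand it to rcu_pending_enqueue(); the process callback then runs once the grace period has expired. A hedged usage sketch (the my_obj type and free_my_obj callback are illustrative, not taken from the kernel):

struct my_obj {
        struct rcu_head         rcu;
        /* ... payload ... */
};

static void free_my_obj(struct rcu_pending *pending, struct rcu_head *rcu)
{
        kfree(container_of(rcu, struct my_obj, rcu));
}

static struct rcu_pending my_pending;

static int my_subsys_init(void)
{
        /* NULL srcu_struct: use the normal RCU flavor */
        return rcu_pending_init(&my_pending, NULL, free_my_obj);
}

static void my_obj_retire(struct my_obj *obj)
{
        rcu_pending_enqueue(&my_pending, &obj->rcu);
}

static void my_subsys_exit(void)
{
        rcu_pending_exit(&my_pending); /* waits for and flushes everything */
}
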
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
deleted file mode 100644
index 29a569384146..000000000000
--- a/fs/bcachefs/rebalance.c
+++ /dev/null
@@ -1,700 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "clock.h"
-#include "compress.h"
-#include "disk_groups.h"
-#include "errcode.h"
-#include "error.h"
-#include "inode.h"
-#include "io_write.h"
-#include "move.h"
-#include "rebalance.h"
-#include "subvolume.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/sched/cputime.h>
-
-/* bch_extent_rebalance: */
-
-static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs)
-{
- const union bch_extent_entry *entry;
-
- bkey_extent_entry_for_each(ptrs, entry)
- if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
- return &entry->rebalance;
-
- return NULL;
-}
-
-static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
-{
- return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
-}
-
-static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
- struct bch_io_opts *opts,
- struct bkey_s_c k,
- struct bkey_ptrs_c ptrs)
-{
- if (!opts->background_compression)
- return 0;
-
- unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned ptr_bit = 1;
- unsigned rewrite_ptrs = 0;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
- p.ptr.unwritten)
- return 0;
-
- if (!p.ptr.cached && p.crc.compression_type != compression_type)
- rewrite_ptrs |= ptr_bit;
- ptr_bit <<= 1;
- }
-
- return rewrite_ptrs;
-}
-
-static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
- struct bch_io_opts *opts,
- struct bkey_ptrs_c ptrs)
-{
- if (!opts->background_target ||
- !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target))
- return 0;
-
- unsigned ptr_bit = 1;
- unsigned rewrite_ptrs = 0;
-
- bkey_for_each_ptr(ptrs, ptr) {
- if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
- rewrite_ptrs |= ptr_bit;
- ptr_bit <<= 1;
- }
-
- return rewrite_ptrs;
-}
-
-static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
- struct bch_io_opts *opts,
- struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) |
- bch2_bkey_ptrs_need_move(c, opts, ptrs);
-}
-
-u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
- if (!opts)
- return 0;
-
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- u64 sectors = 0;
-
- if (opts->background_compression) {
- unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
- p.ptr.unwritten) {
- sectors = 0;
- goto incompressible;
- }
-
- if (!p.ptr.cached && p.crc.compression_type != compression_type)
- sectors += p.crc.compressed_size;
- }
- }
-incompressible:
- if (opts->background_target)
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
- sectors += p.crc.compressed_size;
-
- return sectors;
-}
-
-static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts,
- struct bkey_s_c k)
-{
- if (!bkey_extent_is_direct_data(k.k))
- return 0;
-
- const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);
-
- if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) {
- struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts);
- return old == NULL || memcmp(old, &new, sizeof(new));
- } else {
- return old != NULL;
- }
-}
-
-int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts,
- struct bkey_i *_k)
-{
- if (!bkey_extent_is_direct_data(&_k->k))
- return 0;
-
- struct bkey_s k = bkey_i_to_s(_k);
- struct bch_extent_rebalance *old =
- (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
-
- if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) {
- if (!old) {
- old = bkey_val_end(k);
- k.k->u64s += sizeof(*old) / sizeof(u64);
- }
-
- *old = io_opts_to_rebalance_opts(c, opts);
- } else {
- if (old)
- extent_entry_drop(k, (union bch_extent_entry *) old);
- }
-
- return 0;
-}
-
-int bch2_get_update_rebalance_opts(struct btree_trans *trans,
- struct bch_io_opts *io_opts,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- BUG_ON(iter->flags & BTREE_ITER_is_extents);
- BUG_ON(iter->flags & BTREE_ITER_filter_snapshots);
-
- const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v
- ? bch2_bkey_rebalance_opts(k) : NULL;
- if (r) {
-#define x(_name) \
- if (r->_name##_from_inode) { \
- io_opts->_name = r->_name; \
- io_opts->_name##_from_inode = true; \
- }
- BCH_REBALANCE_OPTS()
-#undef x
- }
-
- if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k))
- return 0;
-
- struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8);
- int ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- return ret;
-
- bkey_reassemble(n, k);
-
- /* On successful transaction commit, @k was invalidated: */
-
- return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?:
- bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_trans_commit(trans, NULL, NULL, 0) ?:
- -BCH_ERR_transaction_restart_nested;
-}
-
-#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)
-
-static const char * const bch2_rebalance_state_strs[] = {
-#define x(t) #t,
- BCH_REBALANCE_STATES()
- NULL
-#undef x
-};
-
-int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_i_cookie *cookie;
- u64 v;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
- SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
- BTREE_ITER_intent);
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- v = k.k->type == KEY_TYPE_cookie
- ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
- : 0;
-
- cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
- ret = PTR_ERR_OR_ZERO(cookie);
- if (ret)
- goto err;
-
- bkey_cookie_init(&cookie->k_i);
- cookie->k.p = iter.pos;
- cookie->v.cookie = cpu_to_le64(v + 1);
-
- ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
-{
- int ret = bch2_trans_commit_do(c, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- bch2_set_rebalance_needs_scan_trans(trans, inum));
- rebalance_wakeup(c);
- return ret;
-}
-
-int bch2_set_fs_needs_rebalance(struct bch_fs *c)
-{
- return bch2_set_rebalance_needs_scan(c, 0);
-}
-
-static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 v;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
- SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
- BTREE_ITER_intent);
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- v = k.k->type == KEY_TYPE_cookie
- ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
- : 0;
-
- if (v == cookie)
- ret = bch2_btree_delete_at(trans, &iter, 0);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
- struct btree_iter *work_iter)
-{
- return !kthread_should_stop()
- ? bch2_btree_iter_peek(work_iter)
- : bkey_s_c_null;
-}
-
-static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- if (!bch2_bkey_rebalance_opts(k))
- return 0;
-
- struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
- int ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- return ret;
-
- extent_entry_drop(bkey_i_to_s(n),
- (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
- return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-}
-
-static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
- struct bpos work_pos,
- struct btree_iter *extent_iter,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- struct bch_fs *c = trans->c;
-
- bch2_trans_iter_exit(trans, extent_iter);
- bch2_trans_iter_init(trans, extent_iter,
- work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
- work_pos,
- BTREE_ITER_all_snapshots);
- struct bkey_s_c k = bch2_btree_iter_peek_slot(extent_iter);
- if (bkey_err(k))
- return k;
-
- int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k);
- if (ret)
- return bkey_s_c_err(ret);
-
- memset(data_opts, 0, sizeof(*data_opts));
- data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
- data_opts->target = io_opts->background_target;
- data_opts->write_flags |= BCH_WRITE_only_specified_devs;
-
- if (!data_opts->rewrite_ptrs) {
- /*
- * device we would want to write to offline? devices in target
- * changed?
- *
- * We'll now need a full scan before this extent is picked up
- * again:
- */
- int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
- if (ret)
- return bkey_s_c_err(ret);
- return bkey_s_c_null;
- }
-
- if (trace_rebalance_extent_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs);
- if (p) {
- prt_str(&buf, "compression=");
- bch2_compression_opt_to_text(&buf, io_opts->background_compression);
- prt_str(&buf, " ");
- bch2_prt_u64_base2(&buf, p);
- prt_newline(&buf);
- }
-
- p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs);
- if (p) {
- prt_str(&buf, "move=");
- bch2_target_to_text(&buf, c, io_opts->background_target);
- prt_str(&buf, " ");
- bch2_prt_u64_base2(&buf, p);
- prt_newline(&buf);
- }
-
- trace_rebalance_extent(c, buf.buf);
- printbuf_exit(&buf);
- }
-
- return k;
-}
-
-noinline_for_stack
-static int do_rebalance_extent(struct moving_context *ctxt,
- struct bpos work_pos,
- struct btree_iter *extent_iter)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- struct bch_fs_rebalance *r = &trans->c->rebalance;
- struct data_update_opts data_opts;
- struct bch_io_opts io_opts;
- struct bkey_s_c k;
- struct bkey_buf sk;
- int ret;
-
- ctxt->stats = &r->work_stats;
- r->state = BCH_REBALANCE_working;
-
- bch2_bkey_buf_init(&sk);
-
- ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
- extent_iter, &io_opts, &data_opts));
- if (ret || !k.k)
- goto out;
-
- atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
-
- /*
- * The iterator gets unlocked by __bch2_read_extent - need to
- * save a copy of @k elsewhere:
- */
- bch2_bkey_buf_reassemble(&sk, c, k);
- k = bkey_i_to_s_c(sk.k);
-
- ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
- if (ret) {
- if (bch2_err_matches(ret, ENOMEM)) {
- /* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt);
- ret = -BCH_ERR_transaction_restart_nested;
- }
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto out;
-
- /* skip it and continue, XXX signal failure */
- ret = 0;
- }
-out:
- bch2_bkey_buf_exit(&sk, c);
- return ret;
-}
-
-static bool rebalance_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
- data_opts->target = io_opts->background_target;
- data_opts->write_flags |= BCH_WRITE_only_specified_devs;
- return data_opts->rewrite_ptrs != 0;
-}
-
-static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs_rebalance *r = &trans->c->rebalance;
- int ret;
-
- bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
- ctxt->stats = &r->scan_stats;
-
- if (!inum) {
- r->scan_start = BBPOS_MIN;
- r->scan_end = BBPOS_MAX;
- } else {
- r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0));
- r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
- }
-
- r->state = BCH_REBALANCE_scanning;
-
- ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
- commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_clear_rebalance_needs_scan(trans, inum, cookie));
-
- bch2_move_stats_exit(&r->scan_stats, trans->c);
- return ret;
-}
-
-static void rebalance_wait(struct bch_fs *c)
-{
- struct bch_fs_rebalance *r = &c->rebalance;
- struct io_clock *clock = &c->io_clock[WRITE];
- u64 now = atomic64_read(&clock->now);
- u64 min_member_capacity = bch2_min_rw_member_capacity(c);
-
- if (min_member_capacity == U64_MAX)
- min_member_capacity = 128 * 2048;
-
- r->wait_iotime_end = now + (min_member_capacity >> 6);
-
- if (r->state != BCH_REBALANCE_waiting) {
- r->wait_iotime_start = now;
- r->wait_wallclock_start = ktime_get_real_ns();
- r->state = BCH_REBALANCE_waiting;
- }
-
- bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
-}
-
-static int do_rebalance(struct moving_context *ctxt)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- struct bch_fs_rebalance *r = &c->rebalance;
- struct btree_iter rebalance_work_iter, extent_iter = { NULL };
- struct bkey_s_c k;
- int ret = 0;
-
- bch2_trans_begin(trans);
-
- bch2_move_stats_init(&r->work_stats, "rebalance_work");
- bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
-
- bch2_trans_iter_init(trans, &rebalance_work_iter,
- BTREE_ID_rebalance_work, POS_MIN,
- BTREE_ITER_all_snapshots);
-
- while (!bch2_move_ratelimit(ctxt)) {
- if (!c->opts.rebalance_enabled) {
- bch2_moving_ctxt_flush_all(ctxt);
- kthread_wait_freezable(c->opts.rebalance_enabled ||
- kthread_should_stop());
- }
-
- if (kthread_should_stop())
- break;
-
- bch2_trans_begin(trans);
-
- ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret || !k.k)
- break;
-
- ret = k.k->type == KEY_TYPE_cookie
- ? do_rebalance_scan(ctxt, k.k->p.inode,
- le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
- : do_rebalance_extent(ctxt, k.k->p, &extent_iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-
- bch2_btree_iter_advance(&rebalance_work_iter);
- }
-
- bch2_trans_iter_exit(trans, &extent_iter);
- bch2_trans_iter_exit(trans, &rebalance_work_iter);
- bch2_move_stats_exit(&r->scan_stats, c);
-
- if (!ret &&
- !kthread_should_stop() &&
- !atomic64_read(&r->work_stats.sectors_seen) &&
- !atomic64_read(&r->scan_stats.sectors_seen)) {
- bch2_moving_ctxt_flush_all(ctxt);
- bch2_trans_unlock_long(trans);
- rebalance_wait(c);
- }
-
- if (!bch2_err_matches(ret, EROFS))
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_rebalance_thread(void *arg)
-{
- struct bch_fs *c = arg;
- struct bch_fs_rebalance *r = &c->rebalance;
- struct moving_context ctxt;
-
- set_freezable();
-
- bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
- writepoint_ptr(&c->rebalance_write_point),
- true);
-
- while (!kthread_should_stop() && !do_rebalance(&ctxt))
- ;
-
- bch2_moving_ctxt_exit(&ctxt);
-
- return 0;
-}
-
-void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
-{
- printbuf_tabstop_push(out, 32);
-
- struct bch_fs_rebalance *r = &c->rebalance;
-
- /* print pending work */
- struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_rebalance_work, };
- u64 v;
- bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
-
- prt_printf(out, "pending work:\t");
- prt_human_readable_u64(out, v);
- prt_printf(out, "\n\n");
-
- prt_str(out, bch2_rebalance_state_strs[r->state]);
- prt_newline(out);
- printbuf_indent_add(out, 2);
-
- switch (r->state) {
- case BCH_REBALANCE_waiting: {
- u64 now = atomic64_read(&c->io_clock[WRITE].now);
-
- prt_printf(out, "io wait duration:\t");
- bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
- prt_newline(out);
-
- prt_printf(out, "io wait remaining:\t");
- bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
- prt_newline(out);
-
- prt_printf(out, "duration waited:\t");
- bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
- prt_newline(out);
- break;
- }
- case BCH_REBALANCE_working:
- bch2_move_stats_to_text(out, &r->work_stats);
- break;
- case BCH_REBALANCE_scanning:
- bch2_move_stats_to_text(out, &r->scan_stats);
- break;
- }
- prt_newline(out);
-
- rcu_read_lock();
- struct task_struct *t = rcu_dereference(c->rebalance.thread);
- if (t)
- get_task_struct(t);
- rcu_read_unlock();
-
- if (t) {
- bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
- put_task_struct(t);
- }
-
- printbuf_indent_sub(out, 2);
-}
-
-void bch2_rebalance_stop(struct bch_fs *c)
-{
- struct task_struct *p;
-
- c->rebalance.pd.rate.rate = UINT_MAX;
- bch2_ratelimit_reset(&c->rebalance.pd.rate);
-
- p = rcu_dereference_protected(c->rebalance.thread, 1);
- c->rebalance.thread = NULL;
-
- if (p) {
- /* for synchronizing with rebalance_wakeup() */
- synchronize_rcu();
-
- kthread_stop(p);
- put_task_struct(p);
- }
-}
-
-int bch2_rebalance_start(struct bch_fs *c)
-{
- struct task_struct *p;
- int ret;
-
- if (c->rebalance.thread)
- return 0;
-
- if (c->opts.nochanges)
- return 0;
-
- p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
- ret = PTR_ERR_OR_ZERO(p);
- bch_err_msg(c, ret, "creating rebalance thread");
- if (ret)
- return ret;
-
- get_task_struct(p);
- rcu_assign_pointer(c->rebalance.thread, p);
- wake_up_process(p);
- return 0;
-}
-
-void bch2_fs_rebalance_init(struct bch_fs *c)
-{
- bch2_pd_controller_init(&c->rebalance.pd);
-}
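
bch2_rebalance_start() and bch2_rebalance_stop() above manage the kthread through an RCU-protected pointer so that rebalance_wakeup() (in rebalance.h, next) can wake it without taking a lock. The publish/poke/retire shape, reduced to its essentials (the worker_* names are illustrative):

static struct task_struct __rcu *worker;

static void worker_publish(struct task_struct *p)
{
        get_task_struct(p);     /* ref held for the lifetime of the pointer */
        rcu_assign_pointer(worker, p);
}

static void worker_poke(void)
{
        struct task_struct *p;

        rcu_read_lock();
        p = rcu_dereference(worker);
        if (p)
                wake_up_process(p);
        rcu_read_unlock();
}

static void worker_retire(void)
{
        struct task_struct *p = rcu_dereference_protected(worker, 1);

        RCU_INIT_POINTER(worker, NULL);
        synchronize_rcu();      /* wait out concurrent worker_poke() calls */

        if (p) {
                kthread_stop(p);
                put_task_struct(p);
        }
}
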
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
deleted file mode 100644
index 62a3859d3823..000000000000
--- a/fs/bcachefs/rebalance.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REBALANCE_H
-#define _BCACHEFS_REBALANCE_H
-
-#include "compress.h"
-#include "disk_groups.h"
-#include "opts.h"
-#include "rebalance_types.h"
-
-static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c,
- struct bch_io_opts *opts)
-{
- struct bch_extent_rebalance r = {
- .type = BIT(BCH_EXTENT_ENTRY_rebalance),
-#define x(_name) \
- ._name = opts->_name, \
- ._name##_from_inode = opts->_name##_from_inode,
- BCH_REBALANCE_OPTS()
-#undef x
- };
-
- if (r.background_target &&
- !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target))
- r.background_target = 0;
-
- return r;
-}
-
-u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
-int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *);
-int bch2_get_update_rebalance_opts(struct btree_trans *,
- struct bch_io_opts *,
- struct btree_iter *,
- struct bkey_s_c);
-
-int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64);
-int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
-int bch2_set_fs_needs_rebalance(struct bch_fs *);
-
-static inline void rebalance_wakeup(struct bch_fs *c)
-{
- struct task_struct *p;
-
- rcu_read_lock();
- p = rcu_dereference(c->rebalance.thread);
- if (p)
- wake_up_process(p);
- rcu_read_unlock();
-}
-
-void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_rebalance_stop(struct bch_fs *);
-int bch2_rebalance_start(struct bch_fs *);
-void bch2_fs_rebalance_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_REBALANCE_H */
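
io_opts_to_rebalance_opts() above (and bch2_get_update_rebalance_opts() in rebalance.c) both expand the BCH_REBALANCE_OPTS() x-macro, redefining x() at each site to generate one statement per option. The pattern in miniature:

#define COLORS() x(red) x(green) x(blue)

struct palette {
#define x(n) int n;
        COLORS()                /* int red; int green; int blue; */
#undef x
};

static void palette_copy(struct palette *dst, const struct palette *src)
{
#define x(n) dst->n = src->n;
        COLORS()                /* one assignment per color */
#undef x
}
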
diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h
deleted file mode 100644
index ff9a1342a22b..000000000000
--- a/fs/bcachefs/rebalance_format.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REBALANCE_FORMAT_H
-#define _BCACHEFS_REBALANCE_FORMAT_H
-
-struct bch_extent_rebalance {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:6,
- unused:3,
-
- promote_target_from_inode:1,
- erasure_code_from_inode:1,
- data_checksum_from_inode:1,
- background_compression_from_inode:1,
- data_replicas_from_inode:1,
- background_target_from_inode:1,
-
- promote_target:16,
- erasure_code:1,
- data_checksum:4,
- data_replicas:4,
- background_compression:8, /* enum bch_compression_opt */
- background_target:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 background_target:16,
- background_compression:8,
- data_replicas:4,
- data_checksum:4,
- erasure_code:1,
- promote_target:16,
-
- background_target_from_inode:1,
- data_replicas_from_inode:1,
- background_compression_from_inode:1,
- data_checksum_from_inode:1,
- erasure_code_from_inode:1,
- promote_target_from_inode:1,
-
- unused:3,
- type:6;
-#endif
-};
-
-/* subset of BCH_INODE_OPTS */
-#define BCH_REBALANCE_OPTS() \
- x(data_checksum) \
- x(background_compression) \
- x(data_replicas) \
- x(promote_target) \
- x(background_target) \
- x(erasure_code)
-
-#endif /* _BCACHEFS_REBALANCE_FORMAT_H */
-
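
The mirrored __LITTLE_ENDIAN_BITFIELD / __BIG_ENDIAN_BITFIELD declarations above exist because C leaves bitfield allocation order to the ABI: big-endian targets fill from the most significant bit, so the field list is declared in reverse there to keep every field at the same on-disk bit position. The same pattern in an 8-bit toy (kernel context assumed: __u8 and the endian macros come from linux/types.h and asm/byteorder.h):

struct toy_flags {
#if defined(__LITTLE_ENDIAN_BITFIELD)
        __u8    type:2,
                unused:3,
                dirty:3;
#elif defined(__BIG_ENDIAN_BITFIELD)
        __u8    dirty:3,
                unused:3,
                type:2;
#endif
};
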
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
deleted file mode 100644
index fe5098c17dfc..000000000000
--- a/fs/bcachefs/rebalance_types.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REBALANCE_TYPES_H
-#define _BCACHEFS_REBALANCE_TYPES_H
-
-#include "bbpos_types.h"
-#include "move_types.h"
-
-#define BCH_REBALANCE_STATES() \
- x(waiting) \
- x(working) \
- x(scanning)
-
-enum bch_rebalance_states {
-#define x(t) BCH_REBALANCE_##t,
- BCH_REBALANCE_STATES()
-#undef x
-};
-
-struct bch_fs_rebalance {
- struct task_struct __rcu *thread;
- struct bch_pd_controller pd;
-
- enum bch_rebalance_states state;
- u64 wait_iotime_start;
- u64 wait_iotime_end;
- u64 wait_wallclock_start;
-
- struct bch_move_stats work_stats;
-
- struct bbpos scan_start;
- struct bbpos scan_end;
- struct bch_move_stats scan_stats;
-};
-
-#endif /* _BCACHEFS_REBALANCE_TYPES_H */
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
deleted file mode 100644
index 266c5770c824..000000000000
--- a/fs/bcachefs/recovery.c
+++ /dev/null
@@ -1,1217 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "bkey_buf.h"
-#include "btree_journal_iter.h"
-#include "btree_node_scan.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
-#include "buckets.h"
-#include "dirent.h"
-#include "disk_accounting.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-#include "journal_seq_blacklist.h"
-#include "logged_ops.h"
-#include "move.h"
-#include "namei.h"
-#include "quota.h"
-#include "rebalance.h"
-#include "recovery.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-clean.h"
-#include "sb-downgrade.h"
-#include "snapshot.h"
-#include "super-io.h"
-
-#include <linux/sort.h>
-#include <linux/stat.h>
-
-int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
-{
- u64 b = BIT_ULL(btree);
- int ret = 0;
-
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
- if (!(c->sb.btrees_lost_data & b)) {
- struct printbuf buf = PRINTBUF;
- bch2_btree_id_to_text(&buf, btree);
- bch_err(c, "flagging btree %s lost data", buf.buf);
- printbuf_exit(&buf);
- ext->btrees_lost_data |= cpu_to_le64(b);
- }
-
- /* Once we have runtime self healing for topology errors we won't need this: */
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret;
-
- /* Btree node accounting will be off: */
- __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret;
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- /*
- * These are much more minor, and don't need to be corrected right away,
- * but in debug mode we want the next fsck run to be clean:
- */
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_lrus) ?: ret;
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret;
-#endif
-
- switch (btree) {
- case BTREE_ID_alloc:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
-
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
- goto out;
- case BTREE_ID_backpointers:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret;
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret;
- goto out;
- case BTREE_ID_need_discard:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
- goto out;
- case BTREE_ID_freespace:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
- goto out;
- case BTREE_ID_bucket_gens:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
- goto out;
- case BTREE_ID_lru:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
- goto out;
- case BTREE_ID_accounting:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret;
- goto out;
- default:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret;
- goto out;
- }
-out:
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-static void kill_btree(struct bch_fs *c, enum btree_id btree)
-{
- bch2_btree_id_root(c, btree)->alive = false;
- bch2_shoot_down_journal_keys(c, btree, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
-}
-
-/* for -o reconstruct_alloc: */
-static void bch2_reconstruct_alloc(struct bch_fs *c)
-{
- bch2_journal_log_msg(c, "dropping alloc info");
- bch_info(c, "dropping and reconstructing all alloc info");
-
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
- __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
- __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required);
- __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required);
- __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required);
- __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required);
-
- __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent);
-
- __set_bit_le64(BCH_FSCK_ERR_dev_usage_buckets_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_dev_usage_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_dev_usage_fragmented_wrong, ext->errors_silent);
-
- __set_bit_le64(BCH_FSCK_ERR_fs_usage_btree_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_fs_usage_cached_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent);
-
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_to_missing_lru_entry, ext->errors_silent);
-
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
-
- c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- for (unsigned i = 0; i < btree_id_nr_alive(c); i++)
- if (btree_id_is_alloc(i))
- kill_btree(c, i);
-}
-
-/*
- * Btree node pointers have a field to stash a pointer to the in-memory btree
- * node; we need to zero out this field when reading in btree nodes, or when
- * reading in keys from the journal:
- */
-static void zero_out_btree_mem_ptr(struct journal_keys *keys)
-{
- darray_for_each(*keys, i)
- if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
- bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
-}
-
-/* journal replay: */
-
-static void replay_now_at(struct journal *j, u64 seq)
-{
- BUG_ON(seq < j->replay_journal_seq);
-
- seq = min(seq, j->replay_journal_seq_end);
-
- while (j->replay_journal_seq < seq)
- bch2_journal_pin_put(j, j->replay_journal_seq++);
-}
-
-static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
- struct journal_key *k)
-{
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
- BTREE_MAX_DEPTH, k->level,
- BTREE_ITER_intent);
- int ret = bch2_btree_iter_traverse(&iter);
- if (ret)
- goto out;
-
- struct bkey u;
- struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u);
-
- /* Has this delta already been applied to the btree? */
- if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) {
- ret = 0;
- goto out;
- }
-
- struct bkey_i *new = k->k;
- if (old.k->type == KEY_TYPE_accounting) {
- new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k));
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto out;
-
- bch2_accounting_accumulate(bkey_i_to_accounting(new),
- bkey_s_c_to_accounting(old));
- }
-
- trans->journal_res.seq = k->journal_seq;
-
- ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_journal_replay_key(struct btree_trans *trans,
- struct journal_key *k)
-{
- struct btree_iter iter;
- unsigned iter_flags =
- BTREE_ITER_intent|
- BTREE_ITER_not_extents;
- unsigned update_flags = BTREE_TRIGGER_norun;
- int ret;
-
- if (k->overwritten)
- return 0;
-
- trans->journal_res.seq = k->journal_seq;
-
- /*
- * BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to
- * keep the key cache coherent with the underlying btree. Nothing
- * besides the allocator is doing updates yet so we don't need key cache
- * coherency for non-alloc btrees, and key cache fills for snapshots
- * btrees use BTREE_ITER_filter_snapshots, which isn't available until
- * the snapshots recovery pass runs.
- */
- if (!k->level && k->btree_id == BTREE_ID_alloc)
- iter_flags |= BTREE_ITER_cached;
- else
- update_flags |= BTREE_UPDATE_key_cache_reclaim;
-
- bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
- BTREE_MAX_DEPTH, k->level,
- iter_flags);
- ret = bch2_btree_iter_traverse(&iter);
- if (ret)
- goto out;
-
- struct btree_path *path = btree_iter_path(trans, &iter);
- if (unlikely(!btree_path_node(path, k->level))) {
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
- BTREE_MAX_DEPTH, 0, iter_flags);
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_btree_increase_depth(trans, iter.path, 0) ?:
- -BCH_ERR_transaction_restart_nested;
- goto out;
- }
-
- /* Must be checked with btree locked: */
- if (k->overwritten)
- goto out;
-
- if (k->k->k.type == KEY_TYPE_accounting) {
- ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k);
- goto out;
- }
-
- ret = bch2_trans_update(trans, &iter, k->k, update_flags);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int journal_sort_seq_cmp(const void *_l, const void *_r)
-{
- const struct journal_key *l = *((const struct journal_key **)_l);
- const struct journal_key *r = *((const struct journal_key **)_r);
-
- /*
- * Map 0 to U64_MAX, so that keys with journal_seq == 0 come last
- *
- * journal_seq == 0 means that the key comes from early repair, and
- * should be inserted last so as to avoid overflowing the journal
- */
- return cmp_int(l->journal_seq - 1, r->journal_seq - 1);
-}
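
The "journal_seq - 1" in the comparator above is doing real work: journal_seq is unsigned, so subtracting one maps 0 to U64_MAX and pushes early-repair keys (which carry seq 0) past every genuine sequence number. The mapping in isolation (sort_key is a hypothetical helper):

#include <stdint.h>

static inline uint64_t sort_key(uint64_t journal_seq)
{
        /* unsigned wraparound: 0 -> UINT64_MAX, 1 -> 0, 2 -> 1, ... */
        return journal_seq - 1;
}
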
-
-int bch2_journal_replay(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
- DARRAY(struct journal_key *) keys_sorted = { 0 };
- struct journal *j = &c->journal;
- u64 start_seq = c->journal_replay_seq_start;
- u64 end_seq = c->journal_replay_seq_end;
- struct btree_trans *trans = NULL;
- bool immediate_flush = false;
- int ret = 0;
-
- if (keys->nr) {
- ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
- keys->nr, start_seq, end_seq);
- if (ret)
- goto err;
- }
-
- BUG_ON(!atomic_read(&keys->ref));
-
- move_gap(keys, keys->nr);
- trans = bch2_trans_get(c);
-
- /*
- * Replay accounting keys first: we can't allow the write buffer to
- * flush accounting keys until we're done
- */
- darray_for_each(*keys, k) {
- if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated))
- continue;
-
- cond_resched();
-
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_journal_reclaim|
- BCH_TRANS_COMMIT_skip_accounting_apply|
- BCH_TRANS_COMMIT_no_journal_res|
- BCH_WATERMARK_reclaim,
- bch2_journal_replay_accounting_key(trans, k));
- if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret)))
- goto err;
-
- k->overwritten = true;
- }
-
- set_bit(BCH_FS_accounting_replay_done, &c->flags);
-
- /*
- * First, attempt to replay keys in sorted order. This is more
- * efficient - better locality of btree access - but some might fail if
- * that would cause a journal deadlock.
- */
- darray_for_each(*keys, k) {
- cond_resched();
-
- /*
- * k->allocated means the key wasn't read in from the journal,
- * rather it was from early repair code
- */
- if (k->allocated)
- immediate_flush = true;
-
- /* Skip fastpath if we're low on space in the journal */
- ret = c->journal.watermark ? -1 :
- commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_journal_reclaim|
- BCH_TRANS_COMMIT_skip_accounting_apply|
- (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
- bch2_journal_replay_key(trans, k));
- BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting);
- if (ret) {
- ret = darray_push(&keys_sorted, k);
- if (ret)
- goto err;
- }
- }
-
- bch2_trans_unlock_long(trans);
- /*
- * Now, replay any remaining keys in the order in which they appear in
- * the journal, unpinning those journal entries as we go:
- */
- sort(keys_sorted.data, keys_sorted.nr,
- sizeof(keys_sorted.data[0]),
- journal_sort_seq_cmp, NULL);
-
- darray_for_each(keys_sorted, kp) {
- cond_resched();
-
- struct journal_key *k = *kp;
-
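-		/* keys from early repair have journal_seq 0; pin them at the end of replay */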
- if (k->journal_seq)
- replay_now_at(j, k->journal_seq);
- else
- replay_now_at(j, j->replay_journal_seq_end);
-
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_skip_accounting_apply|
- (!k->allocated
- ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
- : 0),
- bch2_journal_replay_key(trans, k));
- if (ret) {
- struct printbuf buf = PRINTBUF;
- bch2_btree_id_level_to_text(&buf, k->btree_id, k->level);
- bch_err_msg(c, ret, "while replaying key at %s:", buf.buf);
- printbuf_exit(&buf);
- goto err;
- }
-
- BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten);
- }
-
- /*
- * We need to put our btree_trans before calling flush_all_pins(), since
- * that will use a btree_trans internally
- */
- bch2_trans_put(trans);
- trans = NULL;
-
- if (!c->opts.retain_recovery_info &&
- c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay)
- bch2_journal_keys_put_initial(c);
-
- replay_now_at(j, j->replay_journal_seq_end);
- j->replay_journal_seq = 0;
-
- bch2_journal_set_replay_done(j);
-
- /* if we did any repair, flush it immediately */
- if (immediate_flush) {
- bch2_journal_flush_all_pins(&c->journal);
- ret = bch2_journal_meta(&c->journal);
- }
-
- if (keys->nr)
- bch2_journal_log_msg(c, "journal replay finished");
-err:
- if (trans)
- bch2_trans_put(trans);
- darray_exit(&keys_sorted);
- bch_err_fn(c, ret);
- return ret;
-}
-
-/* journal replay early: */
-
-static int journal_replay_entry_early(struct bch_fs *c,
- struct jset_entry *entry)
-{
- int ret = 0;
-
- switch (entry->type) {
- case BCH_JSET_ENTRY_btree_root: {
- if (unlikely(!entry->u64s))
- return 0;
-
- if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX,
- c, invalid_btree_id,
- "invalid btree id %u (max %u)",
- entry->btree_id, BTREE_ID_NR_MAX))
- return 0;
-
- while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) {
- ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL });
- if (ret)
- return ret;
- }
-
- struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);
-
- r->level = entry->level;
- bkey_copy(&r->key, (struct bkey_i *) entry->start);
- r->error = 0;
- r->alive = true;
- break;
- }
- case BCH_JSET_ENTRY_usage: {
- struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
-
- switch (entry->btree_id) {
- case BCH_FS_USAGE_key_version:
- atomic64_set(&c->key_version, le64_to_cpu(u->v));
- break;
- }
- break;
- }
- case BCH_JSET_ENTRY_blacklist: {
- struct jset_entry_blacklist *bl_entry =
- container_of(entry, struct jset_entry_blacklist, entry);
-
- ret = bch2_journal_seq_blacklist_add(c,
- le64_to_cpu(bl_entry->seq),
- le64_to_cpu(bl_entry->seq) + 1);
- break;
- }
- case BCH_JSET_ENTRY_blacklist_v2: {
- struct jset_entry_blacklist_v2 *bl_entry =
- container_of(entry, struct jset_entry_blacklist_v2, entry);
-
- ret = bch2_journal_seq_blacklist_add(c,
- le64_to_cpu(bl_entry->start),
- le64_to_cpu(bl_entry->end) + 1);
- break;
- }
- case BCH_JSET_ENTRY_clock: {
- struct jset_entry_clock *clock =
- container_of(entry, struct jset_entry_clock, entry);
-
- atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
- }
- }
-fsck_err:
- return ret;
-}
-
-static int journal_replay_early(struct bch_fs *c,
- struct bch_sb_field_clean *clean)
-{
- if (clean) {
- for (struct jset_entry *entry = clean->start;
- entry != vstruct_end(&clean->field);
- entry = vstruct_next(entry)) {
- int ret = journal_replay_entry_early(c, entry);
- if (ret)
- return ret;
- }
- } else {
- struct genradix_iter iter;
- struct journal_replay *i, **_i;
-
- genradix_for_each(&c->journal_entries, iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- vstruct_for_each(&i->j, entry) {
- int ret = journal_replay_entry_early(c, entry);
- if (ret)
- return ret;
- }
- }
- }
-
- return 0;
-}
-
-/* sb clean section: */
-
-static int read_btree_roots(struct bch_fs *c)
-{
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
- struct btree_root *r = bch2_btree_id_root(c, i);
-
- if (!r->alive)
- continue;
-
- printbuf_reset(&buf);
- bch2_btree_id_level_to_text(&buf, i, r->level);
-
- if (mustfix_fsck_err_on((ret = r->error),
- c, btree_root_bkey_invalid,
- "invalid btree root %s",
- buf.buf) ||
- mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)),
- c, btree_root_read_error,
- "error reading btree root %s: %s",
- buf.buf, bch2_err_str(ret))) {
- if (btree_id_is_alloc(i))
- r->error = 0;
-
- ret = bch2_btree_lost_data(c, i);
- BUG_ON(ret);
- }
- }
-
- for (unsigned i = 0; i < BTREE_ID_NR; i++) {
- struct btree_root *r = bch2_btree_id_root(c, i);
-
- if (!r->b && !r->error) {
- r->alive = false;
- r->level = 0;
- bch2_btree_root_alloc_fake(c, i, 0);
- }
- }
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static bool check_version_upgrade(struct bch_fs *c)
-{
- unsigned latest_version = bcachefs_metadata_version_current;
- unsigned latest_compatible = min(latest_version,
- bch2_latest_compatible_version(c->sb.version));
- unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
- unsigned new_version = 0;
- bool ret = false;
-
- if (old_version < bcachefs_metadata_required_upgrade_below) {
- if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible ||
- latest_compatible < bcachefs_metadata_required_upgrade_below)
- new_version = latest_version;
- else
- new_version = latest_compatible;
- } else {
- switch (c->opts.version_upgrade) {
- case BCH_VERSION_UPGRADE_compatible:
- new_version = latest_compatible;
- break;
- case BCH_VERSION_UPGRADE_incompatible:
- new_version = latest_version;
- break;
- case BCH_VERSION_UPGRADE_none:
- new_version = min(old_version, latest_version);
- break;
- }
- }
-
- if (new_version > old_version) {
- struct printbuf buf = PRINTBUF;
-
- if (old_version < bcachefs_metadata_required_upgrade_below)
- prt_str(&buf, "Version upgrade required:\n");
-
- if (old_version != c->sb.version) {
- prt_str(&buf, "Version upgrade from ");
- bch2_version_to_text(&buf, c->sb.version_upgrade_complete);
- prt_str(&buf, " to ");
- bch2_version_to_text(&buf, c->sb.version);
- prt_str(&buf, " incomplete\n");
- }
-
- prt_printf(&buf, "Doing %s version upgrade from ",
- BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version)
- ? "incompatible" : "compatible");
- bch2_version_to_text(&buf, old_version);
- prt_str(&buf, " to ");
- bch2_version_to_text(&buf, new_version);
- prt_newline(&buf);
-
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- __le64 passes = ext->recovery_passes_required[0];
- bch2_sb_set_upgrade(c, old_version, new_version);
- passes = ext->recovery_passes_required[0] & ~passes;
-
- if (passes) {
- prt_str(&buf, " running recovery passes: ");
- prt_bitflags(&buf, bch2_recovery_passes,
- bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
- }
-
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
-
- ret = true;
- }
-
- if (new_version > c->sb.version_incompat &&
- c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "Now allowing incompatible features up to ");
- bch2_version_to_text(&buf, new_version);
- prt_str(&buf, ", previously allowed up to ");
- bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
- prt_newline(&buf);
-
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
-
- ret = true;
- }
-
- if (ret)
- bch2_sb_upgrade(c, new_version,
- c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible);
-
- return ret;
-}
-
-int bch2_fs_recovery(struct bch_fs *c)
-{
- struct bch_sb_field_clean *clean = NULL;
- struct jset *last_journal_entry = NULL;
- u64 last_seq = 0, blacklist_seq, journal_seq;
- int ret = 0;
-
- if (c->sb.clean) {
- clean = bch2_read_superblock_clean(c);
- ret = PTR_ERR_OR_ZERO(clean);
- if (ret)
- goto err;
-
- bch_info(c, "recovering from clean shutdown, journal seq %llu",
- le64_to_cpu(clean->journal_seq));
- } else {
- bch_info(c, "recovering from unclean shutdown");
- }
-
- if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
- bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
- ret = -EINVAL;
- goto err;
- }
-
- if (!c->sb.clean &&
- !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
- bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
- ret = -EINVAL;
- goto err;
- }
-
- if (c->opts.norecovery) {
- c->opts.recovery_pass_last = c->opts.recovery_pass_last
- ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read)
- : BCH_RECOVERY_PASS_snapshots_read;
- c->opts.nochanges = true;
- c->opts.read_only = true;
- }
-
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- bool write_sb = false;
-
- if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
- ext->recovery_passes_required[0] |=
- cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
- write_sb = true;
- }
-
- u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
- if (sb_passes) {
- struct printbuf buf = PRINTBUF;
- prt_str(&buf, "superblock requires following recovery passes to be run:\n ");
- prt_bitflags(&buf, bch2_recovery_passes, sb_passes);
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- if (bch2_check_version_downgrade(c)) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "Version downgrade required:");
-
- __le64 passes = ext->recovery_passes_required[0];
- bch2_sb_set_downgrade(c,
- BCH_VERSION_MINOR(bcachefs_metadata_version_current),
- BCH_VERSION_MINOR(c->sb.version));
- passes = ext->recovery_passes_required[0] & ~passes;
- if (passes) {
- prt_str(&buf, "\n running recovery passes: ");
- prt_bitflags(&buf, bch2_recovery_passes,
- bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
- }
-
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
- write_sb = true;
- }
-
- if (check_version_upgrade(c))
- write_sb = true;
-
- c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
-
- if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) {
- SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe);
- write_sb = true;
- }
-
- if (write_sb)
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- if (c->opts.fsck)
- set_bit(BCH_FS_fsck_running, &c->flags);
- if (c->sb.clean)
- set_bit(BCH_FS_clean_recovery, &c->flags);
- set_bit(BCH_FS_recovery_running, &c->flags);
-
- ret = bch2_blacklist_table_initialize(c);
- if (ret) {
- bch_err(c, "error initializing blacklist table");
- goto err;
- }
-
- bch2_journal_pos_from_member_info_resume(c);
-
- if (!c->sb.clean || c->opts.retain_recovery_info) {
- struct genradix_iter iter;
- struct journal_replay **i;
-
- bch_verbose(c, "starting journal read");
- ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq);
- if (ret)
- goto err;
-
- /*
- * note: cmd_list_journal needs the blacklist table fully up to date so
- * it can asterisk ignored journal entries:
- */
- if (c->opts.read_journal_only)
- goto out;
-
- genradix_for_each_reverse(&c->journal_entries, iter, i)
- if (!journal_replay_ignore(*i)) {
- last_journal_entry = &(*i)->j;
- break;
- }
-
- if (mustfix_fsck_err_on(c->sb.clean &&
- last_journal_entry &&
- !journal_entry_empty(last_journal_entry), c,
- clean_but_journal_not_empty,
- "filesystem marked clean but journal not empty")) {
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
- SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
- c->sb.clean = false;
- }
-
- if (!last_journal_entry) {
- fsck_err_on(!c->sb.clean, c,
- dirty_but_no_journal_entries,
- "no journal entries found");
- if (clean)
- goto use_clean;
-
- genradix_for_each_reverse(&c->journal_entries, iter, i)
- if (*i) {
- last_journal_entry = &(*i)->j;
- (*i)->ignore_blacklisted = false;
-				(*i)->ignore_not_dirty = false;
- /*
- * This was probably a NO_FLUSH entry,
- * so last_seq was garbage - but we know
- * we're only using a single journal
- * entry, set it here:
- */
- (*i)->j.last_seq = (*i)->j.seq;
- break;
- }
- }
-
- ret = bch2_journal_keys_sort(c);
- if (ret)
- goto err;
-
- if (c->sb.clean && last_journal_entry) {
- ret = bch2_verify_superblock_clean(c, &clean,
- last_journal_entry);
- if (ret)
- goto err;
- }
- } else {
-use_clean:
- if (!clean) {
- bch_err(c, "no superblock clean section found");
- ret = -BCH_ERR_fsck_repair_impossible;
- goto err;
-		}
- blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
- }
-
- c->journal_replay_seq_start = last_seq;
- c->journal_replay_seq_end = blacklist_seq - 1;
-
- zero_out_btree_mem_ptr(&c->journal_keys);
-
- ret = journal_replay_early(c, clean);
- if (ret)
- goto err;
-
- if (c->opts.reconstruct_alloc)
- bch2_reconstruct_alloc(c);
-
- /*
-	 * After an unclean shutdown, skip the next few journal sequence
- * numbers as they may have been referenced by btree writes that
- * happened before their corresponding journal writes - those btree
- * writes need to be ignored, by skipping and blacklisting the next few
- * journal sequence numbers:
- */
- if (!c->sb.clean)
- journal_seq += JOURNAL_BUF_NR * 4;
-
- if (blacklist_seq != journal_seq) {
- ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
- blacklist_seq, journal_seq) ?:
- bch2_journal_seq_blacklist_add(c,
- blacklist_seq, journal_seq);
- if (ret) {
- bch_err_msg(c, ret, "error creating new journal seq blacklist entry");
- goto err;
- }
- }
-
- ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu",
- journal_seq, last_seq, blacklist_seq - 1) ?:
- bch2_fs_journal_start(&c->journal, journal_seq);
- if (ret)
- goto err;
-
- /*
-	 * Skip past versions that might have been used (as nonces),
- * but hadn't had their pointers written:
- */
- if (c->sb.encryption_type && !c->sb.clean)
- atomic64_add(1 << 16, &c->key_version);
-
- ret = read_btree_roots(c);
- if (ret)
- goto err;
-
- set_bit(BCH_FS_btree_running, &c->flags);
-
-	ret = bch2_sb_set_upgrade_extra(c) ?:
-		bch2_run_recovery_passes(c);
- if (ret)
- goto err;
-
- /*
- * Normally set by the appropriate recovery pass: when cleared, this
- * indicates we're in early recovery and btree updates should be done by
-	 * being applied to the journal replay keys. _Must_ be set before
- * multithreaded use:
- */
- set_bit(BCH_FS_may_go_rw, &c->flags);
- clear_bit(BCH_FS_fsck_running, &c->flags);
- clear_bit(BCH_FS_recovery_running, &c->flags);
-
- /* in case we don't run journal replay, i.e. norecovery mode */
- set_bit(BCH_FS_accounting_replay_done, &c->flags);
-
- bch2_async_btree_node_rewrites_flush(c);
-
- /* fsync if we fixed errors */
- if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
- bch2_journal_flush_all_pins(&c->journal);
- bch2_journal_meta(&c->journal);
- }
-
- /* If we fixed errors, verify that fs is actually clean now: */
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- test_bit(BCH_FS_errors_fixed, &c->flags) &&
- !test_bit(BCH_FS_errors_not_fixed, &c->flags) &&
- !test_bit(BCH_FS_error, &c->flags)) {
- bch2_flush_fsck_errs(c);
-
- bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
- clear_bit(BCH_FS_errors_fixed, &c->flags);
-
- c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
-
- ret = bch2_run_recovery_passes(c);
- if (ret)
- goto err;
-
- if (test_bit(BCH_FS_errors_fixed, &c->flags) ||
- test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
- bch_err(c, "Second fsck run was not clean");
- set_bit(BCH_FS_errors_not_fixed, &c->flags);
- }
-
- set_bit(BCH_FS_errors_fixed, &c->flags);
- }
-
- if (enabled_qtypes(c)) {
- bch_verbose(c, "reading quotas");
- ret = bch2_fs_quota_read(c);
- if (ret)
- goto err;
- bch_verbose(c, "quotas done");
- }
-
- mutex_lock(&c->sb_lock);
- ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- write_sb = false;
-
- if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
- SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version));
- write_sb = true;
- }
-
- if (!test_bit(BCH_FS_error, &c->flags) &&
- !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) {
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
- write_sb = true;
- }
-
- if (!test_bit(BCH_FS_error, &c->flags) &&
- !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) {
- memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
- write_sb = true;
- }
-
- if (c->opts.fsck &&
- !test_bit(BCH_FS_error, &c->flags) &&
- c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 &&
- ext->btrees_lost_data) {
- ext->btrees_lost_data = 0;
- write_sb = true;
- }
-
- if (c->opts.fsck &&
- !test_bit(BCH_FS_error, &c->flags) &&
- !test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
- SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
- SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0);
- write_sb = true;
- }
-
- if (bch2_blacklist_entries_gc(c))
- write_sb = true;
-
- if (write_sb)
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
- c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) {
- struct bch_move_stats stats;
-
- bch2_move_stats_init(&stats, "recovery");
-
- struct printbuf buf = PRINTBUF;
- bch2_version_to_text(&buf, c->sb.version_min);
- bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf);
- printbuf_exit(&buf);
-
- ret = bch2_fs_read_write_early(c) ?:
- bch2_scan_old_btree_nodes(c, &stats);
- if (ret)
- goto err;
- bch_info(c, "scanning for old btree nodes done");
- }
-
- ret = 0;
-out:
- bch2_flush_fsck_errs(c);
-
- if (!c->opts.retain_recovery_info) {
- bch2_journal_keys_put_initial(c);
- bch2_find_btree_nodes_exit(&c->found_btree_nodes);
- }
- if (!IS_ERR(clean))
- kfree(clean);
-
- if (!ret &&
- test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) &&
- !c->opts.nochanges) {
- bch2_fs_read_write_early(c);
- bch2_delete_dead_snapshots_async(c);
- }
-
- bch_err_fn(c, ret);
- return ret;
-err:
-fsck_err:
- bch2_fs_emergency_read_only(c);
- goto out;
-}
-
-int bch2_fs_initialize(struct bch_fs *c)
-{
- struct bch_inode_unpacked root_inode, lostfound_inode;
- struct bkey_inode_buf packed_inode;
- struct qstr lostfound = QSTR("lost+found");
- struct bch_member *m;
- int ret;
-
- bch_notice(c, "initializing new filesystem");
- set_bit(BCH_FS_new_fs, &c->flags);
-
- mutex_lock(&c->sb_lock);
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
-
- bch2_check_version_downgrade(c);
-
- if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) {
- bch2_sb_upgrade(c, bcachefs_metadata_version_current, false);
- SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
- bch2_write_super(c);
- }
-
- for_each_member_device(c, ca) {
- m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false);
- ca->mi = bch2_mi_to_cpu(m);
- }
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- set_bit(BCH_FS_btree_running, &c->flags);
- set_bit(BCH_FS_may_go_rw, &c->flags);
-
- for (unsigned i = 0; i < BTREE_ID_NR; i++)
- bch2_btree_root_alloc_fake(c, i, 0);
-
- ret = bch2_fs_journal_alloc(c);
- if (ret)
- goto err;
-
- /*
- * journal_res_get() will crash if called before this has
- * set up the journal.pin FIFO and journal.cur pointer:
- */
- bch2_fs_journal_start(&c->journal, 1);
- set_bit(BCH_FS_accounting_replay_done, &c->flags);
- bch2_journal_set_replay_done(&c->journal);
-
- ret = bch2_fs_read_write_early(c);
- if (ret)
- goto err;
-
- for_each_member_device(c, ca) {
- ret = bch2_dev_usage_init(ca, false);
- if (ret) {
- bch2_dev_put(ca);
- goto err;
- }
- }
-
- /*
- * Write out the superblock and journal buckets, now that we can do
- * btree updates
- */
- bch_verbose(c, "marking superblocks");
- ret = bch2_trans_mark_dev_sbs(c);
- bch_err_msg(c, ret, "marking superblocks");
- if (ret)
- goto err;
-
- ret = bch2_fs_freespace_init(c);
- if (ret)
- goto err;
-
- ret = bch2_initialize_subvolumes(c);
- if (ret)
- goto err;
-
- bch_verbose(c, "reading snapshots table");
- ret = bch2_snapshots_read(c);
- if (ret)
- goto err;
- bch_verbose(c, "reading snapshots done");
-
- bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL);
- root_inode.bi_inum = BCACHEFS_ROOT_INO;
- root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
- bch2_inode_pack(&packed_inode, &root_inode);
- packed_inode.inode.k.p.snapshot = U32_MAX;
-
- ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0, 0);
- bch_err_msg(c, ret, "creating root directory");
- if (ret)
- goto err;
-
- bch2_inode_init_early(c, &lostfound_inode);
-
- ret = bch2_trans_commit_do(c, NULL, NULL, 0,
- bch2_create_trans(trans,
- BCACHEFS_ROOT_SUBVOL_INUM,
- &root_inode, &lostfound_inode,
- &lostfound,
- 0, 0, S_IFDIR|0700, 0,
- NULL, NULL, (subvol_inum) { 0 }, 0));
- bch_err_msg(c, ret, "creating lost+found");
- if (ret)
- goto err;
-
- c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1;
-
- if (enabled_qtypes(c)) {
- ret = bch2_fs_quota_read(c);
- if (ret)
- goto err;
- }
-
- ret = bch2_journal_flush(&c->journal);
- bch_err_msg(c, ret, "writing first journal entry");
- if (ret)
- goto err;
-
- mutex_lock(&c->sb_lock);
- SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
- SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;
- return 0;
-err:
- bch_err_fn(c, ret);
- return ret;
-}
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
deleted file mode 100644
index b0d55754b21b..000000000000
--- a/fs/bcachefs/recovery.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_RECOVERY_H
-#define _BCACHEFS_RECOVERY_H
-
-int bch2_btree_lost_data(struct bch_fs *, enum btree_id);
-
-int bch2_journal_replay(struct bch_fs *);
-
-int bch2_fs_recovery(struct bch_fs *);
-int bch2_fs_initialize(struct bch_fs *);
-
-#endif /* _BCACHEFS_RECOVERY_H */
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
deleted file mode 100644
index 0b3c951c32da..000000000000
--- a/fs/bcachefs/recovery_passes.c
+++ /dev/null
@@ -1,316 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "backpointers.h"
-#include "btree_gc.h"
-#include "btree_node_scan.h"
-#include "disk_accounting.h"
-#include "ec.h"
-#include "fsck.h"
-#include "inode.h"
-#include "journal.h"
-#include "lru.h"
-#include "logged_ops.h"
-#include "rebalance.h"
-#include "recovery.h"
-#include "recovery_passes.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "super.h"
-#include "super-io.h"
-
-const char * const bch2_recovery_passes[] = {
-#define x(_fn, ...) #_fn,
- BCH_RECOVERY_PASSES()
-#undef x
- NULL
-};
-
-/* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */
-static int bch2_recovery_pass_empty(struct bch_fs *c)
-{
- return 0;
-}
-
-static int bch2_set_may_go_rw(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
-
- /*
-	 * After we go RW, the journal keys buffer can't be modified (except for
-	 * setting journal_key->overwritten): it will be accessed by multiple
-	 * threads
- */
- move_gap(keys, keys->nr);
-
- set_bit(BCH_FS_may_go_rw, &c->flags);
-
- if (keys->nr || !c->opts.read_only || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes)
- return bch2_fs_read_write_early(c);
- return 0;
-}
-
-struct recovery_pass_fn {
- int (*fn)(struct bch_fs *);
- unsigned when;
-};
-
-static struct recovery_pass_fn recovery_pass_fns[] = {
-#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when },
- BCH_RECOVERY_PASSES()
-#undef x
-};
-
-static const u8 passes_to_stable_map[] = {
-#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
- BCH_RECOVERY_PASSES()
-#undef x
-};
-
-static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
-{
- return passes_to_stable_map[pass];
-}
-
-u64 bch2_recovery_passes_to_stable(u64 v)
-{
- u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
- if (v & BIT_ULL(i))
- ret |= BIT_ULL(passes_to_stable_map[i]);
- return ret;
-}
-
-u64 bch2_recovery_passes_from_stable(u64 v)
-{
- static const u8 map[] = {
-#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
- BCH_RECOVERY_PASSES()
-#undef x
- };
-
- u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
- if (v & BIT_ULL(i))
- ret |= BIT_ULL(map[i]);
- return ret;
-}
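-
-/*
- * e.g. check_topology is pass 2 in the current run order but has stable id 4:
- * bch2_recovery_passes_to_stable(BIT_ULL(2)) == BIT_ULL(4), and
- * bch2_recovery_passes_from_stable(BIT_ULL(4)) == BIT_ULL(2)
- */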
-
-/*
- * For when we need to rewind recovery passes and run a pass we skipped:
- */
-static int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
- enum bch_recovery_pass pass)
-{
- if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns))
- return -BCH_ERR_not_in_recovery;
-
- if (c->recovery_passes_complete & BIT_ULL(pass))
- return 0;
-
- bool print = !(c->opts.recovery_passes & BIT_ULL(pass));
-
- if (pass < BCH_RECOVERY_PASS_set_may_go_rw &&
- c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) {
- if (print)
- bch_info(c, "need recovery pass %s (%u), but already rw",
- bch2_recovery_passes[pass], pass);
- return -BCH_ERR_cannot_rewind_recovery;
- }
-
- if (print)
- bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
- bch2_recovery_passes[pass], pass,
- bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
-
- c->opts.recovery_passes |= BIT_ULL(pass);
-
- if (c->curr_recovery_pass > pass) {
- c->next_recovery_pass = pass;
-		c->recovery_passes_complete &= ~(~0ULL << pass);
- return -BCH_ERR_restart_recovery;
- } else {
- return 0;
- }
-}
-
-int bch2_run_explicit_recovery_pass(struct bch_fs *c,
- enum bch_recovery_pass pass)
-{
- unsigned long flags;
- spin_lock_irqsave(&c->recovery_pass_lock, flags);
- int ret = __bch2_run_explicit_recovery_pass(c, pass);
- spin_unlock_irqrestore(&c->recovery_pass_lock, flags);
- return ret;
-}
-
-int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *c,
- enum bch_recovery_pass pass)
-{
- lockdep_assert_held(&c->sb_lock);
-
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required);
-
- return bch2_run_explicit_recovery_pass(c, pass);
-}
-
-int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c,
- enum bch_recovery_pass pass)
-{
- enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
-
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
- if (!test_bit_le64(s, ext->recovery_passes_required)) {
- __set_bit_le64(s, ext->recovery_passes_required);
- bch2_write_super(c);
- }
- mutex_unlock(&c->sb_lock);
-
- return bch2_run_explicit_recovery_pass(c, pass);
-}
-
-static void bch2_clear_recovery_pass_required(struct bch_fs *c,
- enum bch_recovery_pass pass)
-{
- enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
-
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
- if (test_bit_le64(s, ext->recovery_passes_required)) {
- __clear_bit_le64(s, ext->recovery_passes_required);
- bch2_write_super(c);
- }
- mutex_unlock(&c->sb_lock);
-}
-
-u64 bch2_fsck_recovery_passes(void)
-{
- u64 ret = 0;
-
- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
- if (recovery_pass_fns[i].when & PASS_FSCK)
- ret |= BIT_ULL(i);
- return ret;
-}
-
-static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
-
- if (c->opts.recovery_passes_exclude & BIT_ULL(pass))
- return false;
- if (c->opts.recovery_passes & BIT_ULL(pass))
- return true;
- if ((p->when & PASS_FSCK) && c->opts.fsck)
- return true;
- if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
- return true;
- if (p->when & PASS_ALWAYS)
- return true;
- return false;
-}
-
-static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
- int ret;
-
- if (!(p->when & PASS_SILENT))
- bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
- bch2_recovery_passes[pass]);
- ret = p->fn(c);
- if (ret)
- return ret;
- if (!(p->when & PASS_SILENT))
- bch2_print(c, KERN_CONT " done\n");
-
- return 0;
-}
-
-int bch2_run_online_recovery_passes(struct bch_fs *c)
-{
- int ret = 0;
-
- down_read(&c->state_lock);
-
- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
- struct recovery_pass_fn *p = recovery_pass_fns + i;
-
- if (!(p->when & PASS_ONLINE))
- continue;
-
- ret = bch2_run_recovery_pass(c, i);
- if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
- i = c->curr_recovery_pass;
- continue;
- }
- if (ret)
- break;
- }
-
- up_read(&c->state_lock);
-
- return ret;
-}
-
-int bch2_run_recovery_passes(struct bch_fs *c)
-{
- int ret = 0;
-
- /*
- * We can't allow set_may_go_rw to be excluded; that would cause us to
- * use the journal replay keys for updates where it's not expected.
- */
-	c->opts.recovery_passes_exclude &= ~BIT_ULL(BCH_RECOVERY_PASS_set_may_go_rw);
-
- while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) {
- c->next_recovery_pass = c->curr_recovery_pass + 1;
-
- spin_lock_irq(&c->recovery_pass_lock);
- unsigned pass = c->curr_recovery_pass;
-
- if (c->opts.recovery_pass_last &&
- c->curr_recovery_pass > c->opts.recovery_pass_last) {
- spin_unlock_irq(&c->recovery_pass_lock);
- break;
- }
-
- if (!should_run_recovery_pass(c, pass)) {
- c->curr_recovery_pass++;
- c->recovery_pass_done = max(c->recovery_pass_done, pass);
- spin_unlock_irq(&c->recovery_pass_lock);
- continue;
- }
- spin_unlock_irq(&c->recovery_pass_lock);
-
- ret = bch2_run_recovery_pass(c, pass) ?:
- bch2_journal_flush(&c->journal);
-
- if (!ret && !test_bit(BCH_FS_error, &c->flags))
- bch2_clear_recovery_pass_required(c, pass);
-
- spin_lock_irq(&c->recovery_pass_lock);
- if (c->next_recovery_pass < c->curr_recovery_pass) {
- /*
- * bch2_run_explicit_recovery_pass() was called: we
- * can't always catch -BCH_ERR_restart_recovery because
- * it may have been called from another thread (btree
- * node read completion)
- */
- ret = 0;
- c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass);
- } else {
- c->recovery_passes_complete |= BIT_ULL(pass);
- c->recovery_pass_done = max(c->recovery_pass_done, pass);
- }
- c->curr_recovery_pass = c->next_recovery_pass;
- spin_unlock_irq(&c->recovery_pass_lock);
- }
-
- return ret;
-}
diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h
deleted file mode 100644
index 7d7339c8fa29..000000000000
--- a/fs/bcachefs/recovery_passes.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef _BCACHEFS_RECOVERY_PASSES_H
-#define _BCACHEFS_RECOVERY_PASSES_H
-
-extern const char * const bch2_recovery_passes[];
-
-u64 bch2_recovery_passes_to_stable(u64 v);
-u64 bch2_recovery_passes_from_stable(u64 v);
-
-u64 bch2_fsck_recovery_passes(void);
-
-int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
-int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *, enum bch_recovery_pass);
-int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass);
-
-int bch2_run_online_recovery_passes(struct bch_fs *);
-int bch2_run_recovery_passes(struct bch_fs *);
-
-#endif /* _BCACHEFS_RECOVERY_PASSES_H */
diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h
deleted file mode 100644
index e89b9c783285..000000000000
--- a/fs/bcachefs/recovery_passes_types.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H
-#define _BCACHEFS_RECOVERY_PASSES_TYPES_H
-
-#define PASS_SILENT BIT(0)
-#define PASS_FSCK BIT(1)
-#define PASS_UNCLEAN BIT(2)
-#define PASS_ALWAYS BIT(3)
-#define PASS_ONLINE BIT(4)
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define PASS_FSCK_DEBUG BIT(1)
-#else
-#define PASS_FSCK_DEBUG 0
-#endif
-
-/*
- * Passes may be reordered, but the second field is a persistent identifier and
- * must never change:
- */
-#define BCH_RECOVERY_PASSES() \
- x(recovery_pass_empty, 41, PASS_SILENT) \
- x(scan_for_btree_nodes, 37, 0) \
- x(check_topology, 4, 0) \
- x(accounting_read, 39, PASS_ALWAYS) \
- x(alloc_read, 0, PASS_ALWAYS) \
- x(stripes_read, 1, 0) \
- x(initialize_subvolumes, 2, 0) \
- x(snapshots_read, 3, PASS_ALWAYS) \
- x(check_allocations, 5, PASS_FSCK) \
- x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \
- x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \
- x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \
- x(journal_replay, 9, PASS_ALWAYS) \
- x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \
- x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \
- x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \
- x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \
- x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \
- x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \
- x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
- x(bucket_gens_init, 17, 0) \
- x(reconstruct_snapshots, 38, 0) \
- x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
- x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
- x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
- x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \
- x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
- x(fs_upgrade_for_subvolumes, 22, 0) \
- x(check_inodes, 24, PASS_FSCK) \
- x(check_extents, 25, PASS_FSCK) \
- x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \
- x(check_dirents, 27, PASS_FSCK) \
- x(check_xattrs, 28, PASS_FSCK) \
- x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
- x(check_unreachable_inodes, 40, PASS_FSCK) \
- x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
- x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
- x(check_nlinks, 31, PASS_FSCK) \
- x(resume_logged_ops, 23, PASS_ALWAYS) \
- x(delete_dead_inodes, 32, PASS_ALWAYS) \
- x(fix_reflink_p, 33, 0) \
- x(set_fs_needs_rebalance, 34, 0)
-
-/* We normally enumerate recovery passes in the order we run them: */
-enum bch_recovery_pass {
-#define x(n, id, when) BCH_RECOVERY_PASS_##n,
- BCH_RECOVERY_PASSES()
-#undef x
- BCH_RECOVERY_PASS_NR
-};
-
-/* But we also need stable identifiers that can be used in the superblock */
-enum bch_recovery_pass_stable {
-#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id,
- BCH_RECOVERY_PASSES()
-#undef x
-};
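-
-/*
- * e.g. x(check_topology, 4, 0) expands to BCH_RECOVERY_PASS_check_topology
- * (position 2 in run order) above, and to
- * BCH_RECOVERY_PASS_STABLE_check_topology = 4 (the persistent id) here
- */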
-
-#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
deleted file mode 100644
index 68172c6eba21..000000000000
--- a/fs/bcachefs/reflink.c
+++ /dev/null
@@ -1,860 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "error.h"
-#include "extents.h"
-#include "inode.h"
-#include "io_misc.h"
-#include "io_write.h"
-#include "rebalance.h"
-#include "reflink.h"
-#include "subvolume.h"
-#include "super-io.h"
-
-#include <linux/sched/signal.h>
-
-static inline bool bkey_extent_is_reflink_data(const struct bkey *k)
-{
- switch (k->type) {
- case KEY_TYPE_reflink_v:
- case KEY_TYPE_indirect_inline_data:
- return true;
- default:
- return false;
- }
-}
-
-static inline unsigned bkey_type_to_indirect(const struct bkey *k)
-{
- switch (k->type) {
- case KEY_TYPE_extent:
- return KEY_TYPE_reflink_v;
- case KEY_TYPE_inline_data:
- return KEY_TYPE_indirect_inline_data;
- default:
- return 0;
- }
-}
-
-/* reflink pointers */
-
-int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- int ret = 0;
-
- bkey_fsck_err_on(REFLINK_P_IDX(p.v) < le32_to_cpu(p.v->front_pad),
- c, reflink_p_front_pad_bad,
- "idx < front_pad (%llu < %u)",
- REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad));
-fsck_err:
- return ret;
-}
-
-void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
-
- prt_printf(out, "idx %llu front_pad %u back_pad %u",
- REFLINK_P_IDX(p.v),
- le32_to_cpu(p.v->front_pad),
- le32_to_cpu(p.v->back_pad));
-}
-
-bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
-{
- struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l);
- struct bkey_s_c_reflink_p r = bkey_s_c_to_reflink_p(_r);
-
- /*
- * Disabled for now, the triggers code needs to be reworked for merging
- * of reflink pointers to work:
- */
- return false;
-
- if (REFLINK_P_IDX(l.v) + l.k->size != REFLINK_P_IDX(r.v))
- return false;
-
- if (REFLINK_P_ERROR(l.v) != REFLINK_P_ERROR(r.v))
- return false;
-
- bch2_key_resize(l.k, l.k->size + r.k->size);
- return true;
-}
-
-/* indirect extents */
-
-int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, REFLINK_P_IDX_MAX)),
- c, reflink_v_pos_bad,
- "indirect extent above maximum position 0:%llu",
- REFLINK_P_IDX_MAX);
-
- ret = bch2_bkey_ptrs_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
-
- prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
-
- bch2_bkey_ptrs_to_text(out, c, k);
-}
-
-#if 0
-Currently disabled, needs to be debugged:
-
-bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
-{
- struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l);
- struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r);
-
- return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
-}
-#endif
-
-/* indirect inline data */
-
-int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- return 0;
-}
-
-void bch2_indirect_inline_data_to_text(struct printbuf *out,
- struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
- unsigned datalen = bkey_inline_data_bytes(k.k);
-
- prt_printf(out, "refcount %llu datalen %u: %*phN",
- le64_to_cpu(d.v->refcount), datalen,
- min(datalen, 32U), d.v->data);
-}
-
-/* lookup */
-
-static int bch2_indirect_extent_not_missing(struct btree_trans *trans, struct bkey_s_c_reflink_p p,
- bool should_commit)
-{
- struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
- int ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- return ret;
-
- SET_REFLINK_P_ERROR(&new->v, false);
- ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun);
- if (ret)
- return ret;
-
- if (!should_commit)
- return 0;
-
- return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_nested;
-}
-
-static int bch2_indirect_extent_missing_error(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p,
- u64 missing_start, u64 missing_end,
- bool should_commit)
-{
- if (REFLINK_P_ERROR(p.v))
- return 0;
-
- struct bch_fs *c = trans->c;
- u64 live_start = REFLINK_P_IDX(p.v);
- u64 live_end = REFLINK_P_IDX(p.v) + p.k->size;
- u64 refd_start = live_start - le32_to_cpu(p.v->front_pad);
- u64 refd_end = live_end + le32_to_cpu(p.v->back_pad);
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- BUG_ON(missing_start < refd_start);
- BUG_ON(missing_end > refd_end);
-
- struct bpos missing_pos = bkey_start_pos(p.k);
- missing_pos.offset += missing_start - live_start;
-
- prt_printf(&buf, "pointer to missing indirect extent in ");
- ret = bch2_inum_snap_offset_err_msg_trans(trans, &buf, missing_pos);
- if (ret)
- goto err;
-
- prt_printf(&buf, "-%llu\n ", (missing_pos.offset + (missing_end - missing_start)) << 9);
- bch2_bkey_val_to_text(&buf, c, p.s_c);
-
- prt_printf(&buf, "\n missing reflink btree range %llu-%llu",
- missing_start, missing_end);
-
- if (fsck_err(trans, reflink_p_to_missing_reflink_v, "%s", buf.buf)) {
- struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto err;
-
- /*
- * Is the missing range not actually needed?
- *
- * p.v->idx refers to the data that we actually want, but if the
- * indirect extent we point to was bigger, front_pad and back_pad
- * indicate the range we took a reference on.
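-		 *
-		 * e.g. a live range of 100-120 with front_pad 10 and back_pad 5
-		 * holds refs on 90-125; if 90-95 is missing it lies entirely
-		 * before the live range, so we just shrink front_pad to 5.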
- */
-
- if (missing_end <= live_start) {
- new->v.front_pad = cpu_to_le32(live_start - missing_end);
- } else if (missing_start >= live_end) {
- new->v.back_pad = cpu_to_le32(missing_start - live_end);
- } else {
- struct bpos new_start = bkey_start_pos(&new->k);
- struct bpos new_end = new->k.p;
-
- if (missing_start > live_start)
- new_start.offset += missing_start - live_start;
- if (missing_end < live_end)
- new_end.offset -= live_end - missing_end;
-
- bch2_cut_front(new_start, &new->k_i);
- bch2_cut_back(new_end, &new->k_i);
-
- SET_REFLINK_P_ERROR(&new->v, true);
- }
-
- ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun);
- if (ret)
- goto err;
-
- if (should_commit)
- ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_nested;
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-/*
- * This is used from the read path, which doesn't expect to have to do a
- * transaction commit, and from triggers, which should not be doing a commit:
- */
-struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans,
- struct btree_iter *iter,
- s64 *offset_into_extent,
- struct bkey_s_c_reflink_p p,
- bool should_commit,
- unsigned iter_flags)
-{
- BUG_ON(*offset_into_extent < -((s64) le32_to_cpu(p.v->front_pad)));
- BUG_ON(*offset_into_extent >= p.k->size + le32_to_cpu(p.v->back_pad));
-
- u64 reflink_offset = REFLINK_P_IDX(p.v) + *offset_into_extent;
-
- struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_reflink,
- POS(0, reflink_offset), iter_flags);
- if (bkey_err(k))
- return k;
-
- if (unlikely(!bkey_extent_is_reflink_data(k.k))) {
- unsigned size = min((u64) k.k->size,
- REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) -
- reflink_offset);
- bch2_key_resize(&iter->k, size);
-
- int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset,
- k.k->p.offset, should_commit);
- if (ret) {
- bch2_trans_iter_exit(trans, iter);
- return bkey_s_c_err(ret);
- }
- } else if (unlikely(REFLINK_P_ERROR(p.v))) {
- int ret = bch2_indirect_extent_not_missing(trans, p, should_commit);
- if (ret) {
- bch2_trans_iter_exit(trans, iter);
- return bkey_s_c_err(ret);
- }
- }
-
- *offset_into_extent = reflink_offset - bkey_start_offset(k.k);
- return k;
-}
-
-/* reflink pointer trigger */
-
-static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p, u64 *idx,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
-
- s64 offset_into_extent = *idx - REFLINK_P_IDX(p.v);
- struct btree_iter iter;
- struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, &offset_into_extent, p, false,
- BTREE_ITER_intent|
- BTREE_ITER_with_updates);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (!bkey_refcount_c(k)) {
- if (!(flags & BTREE_TRIGGER_overwrite))
- ret = -BCH_ERR_missing_indirect_extent;
- goto next;
- }
-
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto err;
-
- __le64 *refcount = bkey_refcount(bkey_i_to_s(new));
- if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) {
- bch2_bkey_val_to_text(&buf, c, p.s_c);
- prt_printf(&buf, "\n ");
- bch2_bkey_val_to_text(&buf, c, k);
- log_fsck_err(trans, reflink_refcount_underflow,
- "indirect extent refcount underflow while marking\n %s",
- buf.buf);
- goto next;
- }
-
- if (flags & BTREE_TRIGGER_insert) {
- struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
- u64 pad;
-
- pad = max_t(s64, le32_to_cpu(v->front_pad),
- REFLINK_P_IDX(v) - bkey_start_offset(&new->k));
- BUG_ON(pad > U32_MAX);
- v->front_pad = cpu_to_le32(pad);
-
- pad = max_t(s64, le32_to_cpu(v->back_pad),
- new->k.p.offset - p.k->size - REFLINK_P_IDX(v));
- BUG_ON(pad > U32_MAX);
- v->back_pad = cpu_to_le32(pad);
- }
-
- le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1);
-
- bch2_btree_iter_set_pos_to_extent_start(&iter);
- ret = bch2_trans_update(trans, &iter, new, 0);
- if (ret)
- goto err;
-next:
- *idx = k.k->p.offset;
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p, u64 *idx,
- enum btree_iter_update_trigger_flags flags,
- size_t r_idx)
-{
- struct bch_fs *c = trans->c;
- struct reflink_gc *r;
- int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1;
- u64 next_idx = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad);
- s64 ret = 0;
- struct printbuf buf = PRINTBUF;
-
- if (r_idx >= c->reflink_gc_nr)
- goto not_found;
-
- r = genradix_ptr(&c->reflink_gc_table, r_idx);
- next_idx = min(next_idx, r->offset - r->size);
- if (*idx < next_idx)
- goto not_found;
-
- BUG_ON((s64) r->refcount + add < 0);
-
- if (flags & BTREE_TRIGGER_gc)
- r->refcount += add;
- *idx = r->offset;
- return 0;
-not_found:
- if (flags & BTREE_TRIGGER_check_repair) {
- ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false);
- if (ret)
- goto err;
- }
-
- *idx = next_idx;
-err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static int __trigger_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level, struct bkey_s_c k,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- int ret = 0;
-
- u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad);
- u64 end = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad);
-
- if (flags & BTREE_TRIGGER_transactional) {
- while (idx < end && !ret)
- ret = trans_trigger_reflink_p_segment(trans, p, &idx, flags);
- }
-
- if (flags & (BTREE_TRIGGER_check_repair|BTREE_TRIGGER_gc)) {
- size_t l = 0, r = c->reflink_gc_nr;
-
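-		/* binary search for the first gc entry with offset > idx */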
- while (l < r) {
- size_t m = l + (r - l) / 2;
- struct reflink_gc *ref = genradix_ptr(&c->reflink_gc_table, m);
- if (ref->offset <= idx)
- l = m + 1;
- else
- r = m;
- }
-
- while (idx < end && !ret)
- ret = gc_trigger_reflink_p_segment(trans, p, &idx, flags, l++);
- }
-
- return ret;
-}
-
-int bch2_trigger_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- if ((flags & BTREE_TRIGGER_transactional) &&
- (flags & BTREE_TRIGGER_insert)) {
- struct bch_reflink_p *v = bkey_s_to_reflink_p(new).v;
-
- v->front_pad = v->back_pad = 0;
- }
-
- return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags);
-}
-
-/* indirect extent trigger */
-
-static inline void
-check_indirect_extent_deleting(struct bkey_s new,
- enum btree_iter_update_trigger_flags *flags)
-{
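-	/*
-	 * A zero refcount means the indirect extent is no longer referenced:
-	 * turn the update into a deletion
-	 */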
- if ((*flags & BTREE_TRIGGER_insert) && !*bkey_refcount(new)) {
- new.k->type = KEY_TYPE_deleted;
- new.k->size = 0;
- set_bkey_val_u64s(new.k, 0);
- *flags &= ~BTREE_TRIGGER_insert;
- }
-}
-
-int bch2_trigger_reflink_v(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- if ((flags & BTREE_TRIGGER_transactional) &&
- (flags & BTREE_TRIGGER_insert))
- check_indirect_extent_deleting(new, &flags);
-
- return bch2_trigger_extent(trans, btree_id, level, old, new, flags);
-}
-
-int bch2_trigger_indirect_inline_data(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- check_indirect_extent_deleting(new, &flags);
-
- return 0;
-}
-
-/* create */
-
-static int bch2_make_extent_indirect(struct btree_trans *trans,
- struct btree_iter *extent_iter,
- struct bkey_i *orig,
- bool reflink_p_may_update_opts_field)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter reflink_iter = { NULL };
- struct bkey_s_c k;
- struct bkey_i *r_v;
- struct bkey_i_reflink_p *r_p;
- __le64 *refcount;
- int ret;
-
- if (orig->k.type == KEY_TYPE_inline_data)
- bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data);
-
- bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX,
- BTREE_ITER_intent);
- k = bch2_btree_iter_peek_prev(&reflink_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- /*
- * XXX: we're assuming that 56 bits will be enough for the life of the
- * filesystem: we need to implement wraparound, with a cursor in the
- * logged ops btree:
- */
-	if (bkey_ge(reflink_iter.pos, POS(0, REFLINK_P_IDX_MAX - orig->k.size))) {
-		ret = -ENOSPC;
-		goto err;
-	}
-
- r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k));
- ret = PTR_ERR_OR_ZERO(r_v);
- if (ret)
- goto err;
-
- bkey_init(&r_v->k);
- r_v->k.type = bkey_type_to_indirect(&orig->k);
- r_v->k.p = reflink_iter.pos;
- bch2_key_resize(&r_v->k, orig->k.size);
- r_v->k.bversion = orig->k.bversion;
-
- set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));
-
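-	/* the indirect extent's value is a refcount followed by orig's value */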
- refcount = bkey_refcount(bkey_i_to_s(r_v));
- *refcount = 0;
- memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k));
-
- ret = bch2_trans_update(trans, &reflink_iter, r_v, 0);
- if (ret)
- goto err;
-
- /*
-	 * orig is in a bkey_buf which statically allocates 5 u64s for the val,
- * so we know it will be big enough:
- */
- orig->k.type = KEY_TYPE_reflink_p;
- r_p = bkey_i_to_reflink_p(orig);
- set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
-
- /* FORTIFY_SOURCE is broken here, and doesn't provide unsafe_memset() */
-#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE)
- __underlying_memset(&r_p->v, 0, sizeof(r_p->v));
-#else
- memset(&r_p->v, 0, sizeof(r_p->v));
-#endif
-
- SET_REFLINK_P_IDX(&r_p->v, bkey_start_offset(&r_v->k));
-
- if (reflink_p_may_update_opts_field)
- SET_REFLINK_P_MAY_UPDATE_OPTIONS(&r_p->v, true);
-
- ret = bch2_trans_update(trans, extent_iter, &r_p->k_i,
- BTREE_UPDATE_internal_snapshot_node);
-err:
- bch2_trans_iter_exit(trans, &reflink_iter);
-
- return ret;
-}
-
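-/* find the next extent containing data, up to @end, skipping unwritten extents */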
-static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
-{
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_max_continue_norestart(*iter, end, 0, k, ret) {
- if (bkey_extent_is_unwritten(k))
- continue;
-
- if (bkey_extent_is_data(k.k))
- return k;
- }
-
- if (bkey_ge(iter->pos, end))
- bch2_btree_iter_set_pos(iter, end);
- return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
-}
-
-s64 bch2_remap_range(struct bch_fs *c,
- subvol_inum dst_inum, u64 dst_offset,
- subvol_inum src_inum, u64 src_offset,
- u64 remap_sectors,
- u64 new_i_size, s64 *i_sectors_delta,
- bool may_change_src_io_path_opts)
-{
- struct btree_trans *trans;
- struct btree_iter dst_iter, src_iter;
- struct bkey_s_c src_k;
- struct bkey_buf new_dst, new_src;
- struct bpos dst_start = POS(dst_inum.inum, dst_offset);
- struct bpos src_start = POS(src_inum.inum, src_offset);
- struct bpos dst_end = dst_start, src_end = src_start;
- struct bch_io_opts opts;
- struct bpos src_want;
- u64 dst_done = 0;
- u32 dst_snapshot, src_snapshot;
- bool reflink_p_may_update_opts_field =
- !bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
- int ret = 0, ret2 = 0;
-
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink))
- return -BCH_ERR_erofs_no_writes;
-
- bch2_check_set_feature(c, BCH_FEATURE_reflink);
-
- dst_end.offset += remap_sectors;
- src_end.offset += remap_sectors;
-
- bch2_bkey_buf_init(&new_dst);
- bch2_bkey_buf_init(&new_src);
- trans = bch2_trans_get(c);
-
- ret = bch2_inum_opts_get(trans, src_inum, &opts);
- if (ret)
- goto err;
-
- bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
- BTREE_ITER_intent);
- bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
- BTREE_ITER_intent);
-
- while ((ret == 0 ||
- bch2_err_matches(ret, BCH_ERR_transaction_restart)) &&
- bkey_lt(dst_iter.pos, dst_end)) {
- struct disk_reservation disk_res = { 0 };
-
- bch2_trans_begin(trans);
-
- if (fatal_signal_pending(current)) {
- ret = -EINTR;
- break;
- }
-
- ret = bch2_subvolume_get_snapshot(trans, src_inum.subvol,
- &src_snapshot);
- if (ret)
- continue;
-
- bch2_btree_iter_set_snapshot(&src_iter, src_snapshot);
-
- ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol,
- &dst_snapshot);
- if (ret)
- continue;
-
- bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
-
- if (dst_inum.inum < src_inum.inum) {
- /* Avoid some lock cycle transaction restarts */
- ret = bch2_btree_iter_traverse(&dst_iter);
- if (ret)
- continue;
- }
-
- dst_done = dst_iter.pos.offset - dst_start.offset;
- src_want = POS(src_start.inode, src_start.offset + dst_done);
- bch2_btree_iter_set_pos(&src_iter, src_want);
-
- src_k = get_next_src(&src_iter, src_end);
- ret = bkey_err(src_k);
- if (ret)
- continue;
-
- if (bkey_lt(src_want, src_iter.pos)) {
- ret = bch2_fpunch_at(trans, &dst_iter, dst_inum,
- min(dst_end.offset,
- dst_iter.pos.offset +
- src_iter.pos.offset - src_want.offset),
- i_sectors_delta);
- continue;
- }
-
- if (src_k.k->type != KEY_TYPE_reflink_p) {
- bch2_btree_iter_set_pos_to_extent_start(&src_iter);
-
- bch2_bkey_buf_reassemble(&new_src, c, src_k);
- src_k = bkey_i_to_s_c(new_src.k);
-
- ret = bch2_make_extent_indirect(trans, &src_iter,
- new_src.k,
- reflink_p_may_update_opts_field);
- if (ret)
- continue;
-
- BUG_ON(src_k.k->type != KEY_TYPE_reflink_p);
- }
-
- if (src_k.k->type == KEY_TYPE_reflink_p) {
- struct bkey_s_c_reflink_p src_p =
- bkey_s_c_to_reflink_p(src_k);
- struct bkey_i_reflink_p *dst_p =
- bkey_reflink_p_init(new_dst.k);
-
- u64 offset = REFLINK_P_IDX(src_p.v) +
- (src_want.offset -
- bkey_start_offset(src_k.k));
-
- SET_REFLINK_P_IDX(&dst_p->v, offset);
-
- if (reflink_p_may_update_opts_field &&
- may_change_src_io_path_opts)
- SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true);
- } else {
- BUG();
- }
-
- new_dst.k->k.p = dst_iter.pos;
- bch2_key_resize(&new_dst.k->k,
- min(src_k.k->p.offset - src_want.offset,
- dst_end.offset - dst_iter.pos.offset));
-
- ret = bch2_bkey_set_needs_rebalance(c, &opts, new_dst.k) ?:
- bch2_extent_update(trans, dst_inum, &dst_iter,
- new_dst.k, &disk_res,
- new_i_size, i_sectors_delta,
- true);
- bch2_disk_reservation_put(c, &disk_res);
- }
- bch2_trans_iter_exit(trans, &dst_iter);
- bch2_trans_iter_exit(trans, &src_iter);
-
- BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end));
- BUG_ON(bkey_gt(dst_iter.pos, dst_end));
-
- dst_done = dst_iter.pos.offset - dst_start.offset;
- new_i_size = min(dst_iter.pos.offset << 9, new_i_size);
-
- do {
- struct bch_inode_unpacked inode_u;
- struct btree_iter inode_iter = { NULL };
-
- bch2_trans_begin(trans);
-
- ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u,
- dst_inum, BTREE_ITER_intent);
-
- if (!ret2 &&
- inode_u.bi_size < new_i_size) {
- inode_u.bi_size = new_i_size;
- ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
- }
-
- bch2_trans_iter_exit(trans, &inode_iter);
- } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
-err:
- bch2_trans_put(trans);
- bch2_bkey_buf_exit(&new_src, c);
- bch2_bkey_buf_exit(&new_dst, c);
-
- bch2_write_ref_put(c, BCH_WRITE_REF_reflink);
-
- return dst_done ?: ret ?: ret2;
-}
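
The remap loop above derives everything from destination progress: dst_done is how many sectors have been remapped so far, src_want is the matching source position, and the new reflink pointer's index is shifted by how far into the indirect extent that position falls. A minimal userspace sketch of the same arithmetic, with made-up sector numbers (not kernel code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t src_start = 1024, dst_start = 4096;	/* sector offsets */
	uint64_t dst_iter_pos = 4160;			/* 64 sectors remapped */

	/* dst_done = dst_iter.pos.offset - dst_start.offset */
	uint64_t dst_done = dst_iter_pos - dst_start;

	/* src_want = POS(src_inode, src_start.offset + dst_done) */
	uint64_t src_want = src_start + dst_done;

	/* an indirect extent starting at 1000 with REFLINK_P_IDX 5000 */
	uint64_t extent_start = 1000, reflink_p_idx = 5000;

	/* offset = REFLINK_P_IDX + (src_want - bkey_start_offset(src_k.k)) */
	uint64_t idx = reflink_p_idx + (src_want - extent_start);

	assert(dst_done == 64 && src_want == 1088 && idx == 5088);
	printf("dst_done=%llu src_want=%llu idx=%llu\n",
	       (unsigned long long) dst_done,
	       (unsigned long long) src_want,
	       (unsigned long long) idx);
	return 0;
}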
-
-/* fsck */
-
-static int bch2_gc_write_reflink_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- size_t *idx)
-{
- struct bch_fs *c = trans->c;
- const __le64 *refcount = bkey_refcount_c(k);
- struct printbuf buf = PRINTBUF;
- struct reflink_gc *r;
- int ret = 0;
-
- if (!refcount)
- return 0;
-
- while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
- r->offset < k.k->p.offset)
- ++*idx;
-
- if (!r ||
- r->offset != k.k->p.offset ||
- r->size != k.k->size) {
- bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
- return -EINVAL;
- }
-
- if (fsck_err_on(r->refcount != le64_to_cpu(*refcount),
- trans, reflink_v_refcount_wrong,
- "reflink key has wrong refcount:\n"
- " %s\n"
- " should be %u",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
- r->refcount)) {
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto out;
-
- if (!r->refcount)
- new->k.type = KEY_TYPE_deleted;
- else
- *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
- ret = bch2_trans_update(trans, iter, new, 0);
- }
-out:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_gc_reflink_done(struct bch_fs *c)
-{
- size_t idx = 0;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_gc_write_reflink_key(trans, &iter, k, &idx)));
- c->reflink_gc_nr = 0;
- return ret;
-}
-
-int bch2_gc_reflink_start(struct bch_fs *c)
-{
- c->reflink_gc_nr = 0;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_prefetch, k, ({
- const __le64 *refcount = bkey_refcount_c(k);
-
- if (!refcount)
- continue;
-
- struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
- c->reflink_gc_nr++, GFP_KERNEL);
- if (!r) {
- ret = -BCH_ERR_ENOMEM_gc_reflink_start;
- break;
- }
-
- r->offset = k.k->p.offset;
- r->size = k.k->size;
- r->refcount = 0;
- 0;
- })));
-
- bch_err_fn(c, ret);
- return ret;
-}
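
The two fsck passes above work as a pair: bch2_gc_reflink_start() populates a genradix table with one slot per indirect extent, refcounts zeroed (the GC mark path elsewhere bumps them; the trailing "0;" is just the success value of the for_each_btree_key() statement-expression), and bch2_gc_reflink_done() walks the reflink btree in the same order, comparing each on-disk refcount against its slot and rewriting or deleting keys that disagree. A standalone miniature of that compare-and-repair walk, over hypothetical data:

#include <stdint.h>
#include <stdio.h>

struct gc_entry { uint64_t offset, refcount; };

int main(void)
{
	/* refcounts expected by GC vs what the btree currently says */
	struct gc_entry expect[] = { {10, 2}, {20, 0}, {30, 1} };
	struct gc_entry disk[]   = { {10, 2}, {20, 3}, {30, 1} };

	for (int i = 0; i < 3; i++)
		if (disk[i].refcount != expect[i].refcount) {
			printf("key at %llu: refcount %llu should be %llu\n",
			       (unsigned long long) disk[i].offset,
			       (unsigned long long) disk[i].refcount,
			       (unsigned long long) expect[i].refcount);
			/* refcount 0 deletes the key; otherwise rewrite it */
			disk[i].refcount = expect[i].refcount;
		}
	return 0;
}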
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
deleted file mode 100644
index 1632780bdf18..000000000000
--- a/fs/bcachefs/reflink.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REFLINK_H
-#define _BCACHEFS_REFLINK_H
-
-int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \
- .key_validate = bch2_reflink_p_validate, \
- .val_to_text = bch2_reflink_p_to_text, \
- .key_merge = bch2_reflink_p_merge, \
- .trigger = bch2_trigger_reflink_p, \
- .min_val_size = 16, \
-})
-
-int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \
- .key_validate = bch2_reflink_v_validate, \
- .val_to_text = bch2_reflink_v_to_text, \
- .swab = bch2_ptr_swab, \
- .trigger = bch2_trigger_reflink_v, \
- .min_val_size = 8, \
-})
-
-int bch2_indirect_inline_data_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_indirect_inline_data_to_text(struct printbuf *,
- struct bch_fs *, struct bkey_s_c);
-int bch2_trigger_indirect_inline_data(struct btree_trans *,
- enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \
- .key_validate = bch2_indirect_inline_data_validate, \
- .val_to_text = bch2_indirect_inline_data_to_text, \
- .trigger = bch2_trigger_indirect_inline_data, \
- .min_val_size = 8, \
-})
-
-static inline const __le64 *bkey_refcount_c(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_reflink_v:
- return &bkey_s_c_to_reflink_v(k).v->refcount;
- case KEY_TYPE_indirect_inline_data:
- return &bkey_s_c_to_indirect_inline_data(k).v->refcount;
- default:
- return NULL;
- }
-}
-
-static inline __le64 *bkey_refcount(struct bkey_s k)
-{
- switch (k.k->type) {
- case KEY_TYPE_reflink_v:
- return &bkey_s_to_reflink_v(k).v->refcount;
- case KEY_TYPE_indirect_inline_data:
- return &bkey_s_to_indirect_inline_data(k).v->refcount;
- default:
- return NULL;
- }
-}
-
-struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *, struct btree_iter *,
- s64 *, struct bkey_s_c_reflink_p,
- bool, unsigned);
-
-s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64,
- subvol_inum, u64, u64, u64, s64 *,
- bool);
-
-int bch2_gc_reflink_done(struct bch_fs *);
-int bch2_gc_reflink_start(struct bch_fs *);
-
-#endif /* _BCACHEFS_REFLINK_H */
diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h
deleted file mode 100644
index 92995e4f898e..000000000000
--- a/fs/bcachefs/reflink_format.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REFLINK_FORMAT_H
-#define _BCACHEFS_REFLINK_FORMAT_H
-
-struct bch_reflink_p {
- struct bch_val v;
- __le64 idx_flags;
- /*
- * A reflink pointer might point to an indirect extent which is then
- * later split (by copygc or rebalance). If we only pointed to part of
- * the original indirect extent, and then one of the fragments is
- * outside the range we point to, we'd leak a refcount: so when creating
- * reflink pointers, we need to store pad values to remember the full
- * range we were taking a reference on.
- */
- __le32 front_pad;
- __le32 back_pad;
-} __packed __aligned(8);
-
-LE64_BITMASK(REFLINK_P_IDX, struct bch_reflink_p, idx_flags, 0, 56);
-LE64_BITMASK(REFLINK_P_ERROR, struct bch_reflink_p, idx_flags, 56, 57);
-LE64_BITMASK(REFLINK_P_MAY_UPDATE_OPTIONS,
- struct bch_reflink_p, idx_flags, 57, 58);
-
-struct bch_reflink_v {
- struct bch_val v;
- __le64 refcount;
- union bch_extent_entry start[0];
- __u64 _data[];
-} __packed __aligned(8);
-
-struct bch_indirect_inline_data {
- struct bch_val v;
- __le64 refcount;
- u8 data[];
-};
-
-#endif /* _BCACHEFS_REFLINK_FORMAT_H */
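
The LE64_BITMASK() declarations above carve idx_flags into a 56-bit index plus two flag bits. A standalone sketch of that packing; the helpers are hand-rolled stand-ins for the generated accessors and skip the little-endian conversion the real macros perform:

#include <assert.h>
#include <stdint.h>

#define IDX_BITS	56

static uint64_t p_idx(uint64_t v)		{ return v & ((1ULL << IDX_BITS) - 1); }
static uint64_t p_error(uint64_t v)		{ return (v >> 56) & 1; }
static uint64_t p_may_update_opts(uint64_t v)	{ return (v >> 57) & 1; }

int main(void)
{
	uint64_t idx_flags = 123456;	/* index into the reflink btree */

	idx_flags |= 1ULL << 57;	/* SET_REFLINK_P_MAY_UPDATE_OPTIONS */

	assert(p_idx(idx_flags) == 123456);
	assert(p_error(idx_flags) == 0);
	assert(p_may_update_opts(idx_flags) == 1);
	return 0;
}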
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
deleted file mode 100644
index 477ef0997949..000000000000
--- a/fs/bcachefs/replicas.c
+++ /dev/null
@@ -1,919 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "buckets.h"
-#include "disk_accounting.h"
-#include "journal.h"
-#include "replicas.h"
-#include "super-io.h"
-
-#include <linux/sort.h>
-
-static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
- struct bch_replicas_cpu *);
-
-/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
-static int bch2_memcmp(const void *l, const void *r, const void *priv)
-{
- size_t size = (size_t) priv;
- return memcmp(l, r, size);
-}
-
-/* Replicas tracking - in memory: */
-
-static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- BUG_ON(!e->nr_devs);
- BUG_ON(e->nr_required > 1 &&
- e->nr_required >= e->nr_devs);
-
- for (unsigned i = 0; i + 1 < e->nr_devs; i++)
- BUG_ON(e->devs[i] >= e->devs[i + 1]);
-#endif
-}
-
-void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
-{
- bubble_sort(e->devs, e->nr_devs, u8_cmp);
-}
-
-static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
-{
- eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
- bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
-}
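
eytzinger0_sort_r() above keeps the table in eytzinger order: a sorted set stored in the breadth-first layout of a complete binary tree, so searches walk from index 0 to child 2i+1 or 2i+2 and the hot early comparisons stay clustered at the front of the array. A simplified standalone search over that layout (equality-only; the kernel's eytzinger0_find() takes a comparator, but the not-found convention of returning nr is the same):

#include <assert.h>
#include <stddef.h>

static size_t eytzinger0_search(const int *a, size_t nr, int key)
{
	size_t i = 0;

	while (i < nr) {
		if (a[i] == key)
			return i;
		i = 2 * i + 1 + (a[i] < key);	/* left or right child */
	}
	return nr;				/* not found */
}

int main(void)
{
	/* the sorted set {1..7} in breadth-first (eytzinger) order */
	const int a[] = { 4, 2, 6, 1, 3, 5, 7 };

	assert(eytzinger0_search(a, 7, 5) == 5);
	assert(eytzinger0_search(a, 7, 8) == 7);
	return 0;
}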
-
-static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
- struct bch_replicas_entry_v0 *e)
-{
- bch2_prt_data_type(out, e->data_type);
-
- prt_printf(out, ": %u [", e->nr_devs);
- for (unsigned i = 0; i < e->nr_devs; i++)
- prt_printf(out, i ? " %u" : "%u", e->devs[i]);
- prt_printf(out, "]");
-}
-
-void bch2_replicas_entry_to_text(struct printbuf *out,
- struct bch_replicas_entry_v1 *e)
-{
- bch2_prt_data_type(out, e->data_type);
-
- prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
- for (unsigned i = 0; i < e->nr_devs; i++)
- prt_printf(out, i ? " %u" : "%u", e->devs[i]);
- prt_printf(out, "]");
-}
-
-static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r,
- struct bch_sb *sb,
- struct printbuf *err)
-{
- if (!r->nr_devs) {
- prt_printf(err, "no devices in entry ");
- goto bad;
- }
-
- if (r->nr_required > 1 &&
- r->nr_required >= r->nr_devs) {
- prt_printf(err, "bad nr_required in entry ");
- goto bad;
- }
-
- for (unsigned i = 0; i < r->nr_devs; i++)
- if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
- !bch2_member_exists(sb, r->devs[i])) {
- prt_printf(err, "invalid device %u in entry ", r->devs[i]);
- goto bad;
- }
-
- return 0;
-bad:
- bch2_replicas_entry_to_text(err, r);
- return -BCH_ERR_invalid_replicas_entry;
-}
-
-int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
- struct bch_fs *c,
- struct printbuf *err)
-{
- if (!r->nr_devs) {
- prt_printf(err, "no devices in entry ");
- goto bad;
- }
-
- if (r->nr_required > 1 &&
- r->nr_required >= r->nr_devs) {
- prt_printf(err, "bad nr_required in entry ");
- goto bad;
- }
-
- for (unsigned i = 0; i < r->nr_devs; i++)
- if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
- !bch2_dev_exists(c, r->devs[i])) {
- prt_printf(err, "invalid device %u in entry ", r->devs[i]);
- goto bad;
- }
-
- return 0;
-bad:
- bch2_replicas_entry_to_text(err, r);
- return -BCH_ERR_invalid_replicas_entry;
-}
-
-void bch2_cpu_replicas_to_text(struct printbuf *out,
- struct bch_replicas_cpu *r)
-{
- struct bch_replicas_entry_v1 *e;
- bool first = true;
-
- for_each_cpu_replicas_entry(r, e) {
- if (!first)
- prt_printf(out, " ");
- first = false;
-
- bch2_replicas_entry_to_text(out, e);
- }
-}
-
-static void extent_to_replicas(struct bkey_s_c k,
- struct bch_replicas_entry_v1 *r)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- r->nr_required = 1;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.ptr.cached)
- continue;
-
- if (!p.has_ec)
- replicas_entry_add_dev(r, p.ptr.dev);
- else
- r->nr_required = 0;
- }
-}
-
-static void stripe_to_replicas(struct bkey_s_c k,
- struct bch_replicas_entry_v1 *r)
-{
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
- const struct bch_extent_ptr *ptr;
-
- r->nr_required = s.v->nr_blocks - s.v->nr_redundant;
-
- for (ptr = s.v->ptrs;
- ptr < s.v->ptrs + s.v->nr_blocks;
- ptr++)
- replicas_entry_add_dev(r, ptr->dev);
-}
-
-void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
- struct bkey_s_c k)
-{
- e->nr_devs = 0;
-
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- e->data_type = BCH_DATA_btree;
- extent_to_replicas(k, e);
- break;
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- e->data_type = BCH_DATA_user;
- extent_to_replicas(k, e);
- break;
- case KEY_TYPE_stripe:
- e->data_type = BCH_DATA_parity;
- stripe_to_replicas(k, e);
- break;
- }
-
- bch2_replicas_entry_sort(e);
-}
-
-void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
- enum bch_data_type data_type,
- struct bch_devs_list devs)
-{
- BUG_ON(!data_type ||
- data_type == BCH_DATA_sb ||
- data_type >= BCH_DATA_NR);
-
- e->data_type = data_type;
- e->nr_devs = 0;
- e->nr_required = 1;
-
- darray_for_each(devs, i)
- replicas_entry_add_dev(e, *i);
-
- bch2_replicas_entry_sort(e);
-}
-
-static struct bch_replicas_cpu
-cpu_replicas_add_entry(struct bch_fs *c,
- struct bch_replicas_cpu *old,
- struct bch_replicas_entry_v1 *new_entry)
-{
- struct bch_replicas_cpu new = {
- .nr = old->nr + 1,
- .entry_size = max_t(unsigned, old->entry_size,
- replicas_entry_bytes(new_entry)),
- };
-
- new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
- if (!new.entries)
- return new;
-
- for (unsigned i = 0; i < old->nr; i++)
- memcpy(cpu_replicas_entry(&new, i),
- cpu_replicas_entry(old, i),
- old->entry_size);
-
- memcpy(cpu_replicas_entry(&new, old->nr),
- new_entry,
- replicas_entry_bytes(new_entry));
-
- bch2_cpu_replicas_sort(&new);
- return new;
-}
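
cpu_replicas_add_entry() above grows a flat table in which every slot shares one stride, sized to the largest variable-length entry seen so far; that fixed stride is what lets cpu_replicas_entry() (see replicas.h below) be plain pointer arithmetic and lets the table be sorted and searched in place. A userspace model of the layout and grow-on-insert, with all names illustrative:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct table { unsigned nr, entry_size; uint8_t *entries; };

static uint8_t *slot(struct table *t, unsigned i)
{
	return t->entries + (size_t) t->entry_size * i;
}

static int table_add(struct table *t, const void *e, unsigned bytes)
{
	unsigned stride = bytes > t->entry_size ? bytes : t->entry_size;
	uint8_t *new = calloc(t->nr + 1, stride);

	if (!new)
		return -1;

	/* re-stride the old entries, then append the new one */
	for (unsigned i = 0; i < t->nr; i++)
		memcpy(new + (size_t) stride * i, slot(t, i), t->entry_size);
	memcpy(new + (size_t) stride * t->nr, e, bytes);

	free(t->entries);
	*t = (struct table) { t->nr + 1, stride, new };
	return 0;
}

int main(void)
{
	struct table t = { 0 };
	uint8_t e[6] = { 1, 3, 1, 0, 1, 2 };

	return table_add(&t, e, sizeof(e));
}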
-
-static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
- struct bch_replicas_entry_v1 *search)
-{
- int idx, entry_size = replicas_entry_bytes(search);
-
- if (unlikely(entry_size > r->entry_size))
- return -1;
-
-#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size)
- idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
- entry_cmp, search);
-#undef entry_cmp
-
- return idx < r->nr ? idx : -1;
-}
-
-int bch2_replicas_entry_idx(struct bch_fs *c,
- struct bch_replicas_entry_v1 *search)
-{
- bch2_replicas_entry_sort(search);
-
- return __replicas_entry_idx(&c->replicas, search);
-}
-
-static bool __replicas_has_entry(struct bch_replicas_cpu *r,
- struct bch_replicas_entry_v1 *search)
-{
- return __replicas_entry_idx(r, search) >= 0;
-}
-
-bool bch2_replicas_marked_locked(struct bch_fs *c,
- struct bch_replicas_entry_v1 *search)
-{
- verify_replicas_entry(search);
-
- return !search->nr_devs ||
- (__replicas_has_entry(&c->replicas, search) &&
-	       (likely(!c->replicas_gc.entries) ||
- __replicas_has_entry(&c->replicas_gc, search)));
-}
-
-bool bch2_replicas_marked(struct bch_fs *c,
- struct bch_replicas_entry_v1 *search)
-{
- percpu_down_read(&c->mark_lock);
- bool ret = bch2_replicas_marked_locked(c, search);
- percpu_up_read(&c->mark_lock);
-
- return ret;
-}
-
-noinline
-static int bch2_mark_replicas_slowpath(struct bch_fs *c,
- struct bch_replicas_entry_v1 *new_entry)
-{
- struct bch_replicas_cpu new_r, new_gc;
- int ret = 0;
-
- verify_replicas_entry(new_entry);
-
- memset(&new_r, 0, sizeof(new_r));
- memset(&new_gc, 0, sizeof(new_gc));
-
- mutex_lock(&c->sb_lock);
-
- if (c->replicas_gc.entries &&
- !__replicas_has_entry(&c->replicas_gc, new_entry)) {
- new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
- if (!new_gc.entries) {
- ret = -BCH_ERR_ENOMEM_cpu_replicas;
- goto err;
- }
- }
-
- if (!__replicas_has_entry(&c->replicas, new_entry)) {
- new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
- if (!new_r.entries) {
- ret = -BCH_ERR_ENOMEM_cpu_replicas;
- goto err;
- }
-
- ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
- if (ret)
- goto err;
- }
-
- if (!new_r.entries &&
- !new_gc.entries)
- goto out;
-
- /* allocations done, now commit: */
-
- if (new_r.entries)
- bch2_write_super(c);
-
- /* don't update in memory replicas until changes are persistent */
- percpu_down_write(&c->mark_lock);
- if (new_r.entries)
- swap(c->replicas, new_r);
- if (new_gc.entries)
- swap(new_gc, c->replicas_gc);
- percpu_up_write(&c->mark_lock);
-out:
- mutex_unlock(&c->sb_lock);
-
- kfree(new_r.entries);
- kfree(new_gc.entries);
-
- return ret;
-err:
- bch_err_msg(c, ret, "adding replicas entry");
- goto out;
-}
-
-int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
-{
- return likely(bch2_replicas_marked(c, r))
- ? 0 : bch2_mark_replicas_slowpath(c, r);
-}
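
bch2_mark_replicas() above is a classic fast-path/slow-path split: a lock-free lookup first, and only on a miss a slowpath that re-checks under the lock before allocating, persisting, and publishing. The shape of that pattern as a self-contained sketch, with a plain mutex standing in for sb_lock/mark_lock and a bool array standing in for the table:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static bool table[16];

static bool marked(unsigned e)		/* lock-free fast-path check */
{
	return e < 16 && table[e];
}

static int mark_slowpath(unsigned e)
{
	int ret = 0;

	pthread_mutex_lock(&table_lock);
	if (!marked(e)) {		/* re-check under the lock */
		if (e >= 16)
			ret = -1;	/* stand-in for -ENOMEM etc. */
		else
			table[e] = true; /* "persist, then publish" */
	}
	pthread_mutex_unlock(&table_lock);
	return ret;
}

static int mark(unsigned e)
{
	return marked(e) ? 0 : mark_slowpath(e);
}

int main(void)
{
	int ret = mark(3);
	return ret ? ret : mark(3);	/* second call takes the fast path */
}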
-
-/*
- * Old replicas_gc mechanism: only used for journal replicas entries now, should
- * die at some point:
- */
-
-int bch2_replicas_gc_end(struct bch_fs *c, int ret)
-{
- lockdep_assert_held(&c->replicas_gc_lock);
-
- mutex_lock(&c->sb_lock);
- percpu_down_write(&c->mark_lock);
-
- ret = ret ?:
- bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
- if (!ret)
- swap(c->replicas, c->replicas_gc);
-
- kfree(c->replicas_gc.entries);
- c->replicas_gc.entries = NULL;
-
- percpu_up_write(&c->mark_lock);
-
- if (!ret)
- bch2_write_super(c);
-
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
-{
- struct bch_replicas_entry_v1 *e;
- unsigned i = 0;
-
- lockdep_assert_held(&c->replicas_gc_lock);
-
- mutex_lock(&c->sb_lock);
- BUG_ON(c->replicas_gc.entries);
-
- c->replicas_gc.nr = 0;
- c->replicas_gc.entry_size = 0;
-
- for_each_cpu_replicas_entry(&c->replicas, e) {
- /* Preserve unknown data types */
- if (e->data_type >= BCH_DATA_NR ||
- !((1 << e->data_type) & typemask)) {
- c->replicas_gc.nr++;
- c->replicas_gc.entry_size =
- max_t(unsigned, c->replicas_gc.entry_size,
- replicas_entry_bytes(e));
- }
- }
-
- c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
- c->replicas_gc.entry_size,
- GFP_KERNEL);
- if (!c->replicas_gc.entries) {
- mutex_unlock(&c->sb_lock);
- bch_err(c, "error allocating c->replicas_gc");
- return -BCH_ERR_ENOMEM_replicas_gc;
- }
-
- for_each_cpu_replicas_entry(&c->replicas, e)
- if (e->data_type >= BCH_DATA_NR ||
- !((1 << e->data_type) & typemask))
- memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
- e, c->replicas_gc.entry_size);
-
- bch2_cpu_replicas_sort(&c->replicas_gc);
- mutex_unlock(&c->sb_lock);
-
- return 0;
-}
-
-/*
- * New much simpler mechanism for clearing out unneeded replicas entries - drop
- * replicas entries that have 0 sectors used.
- *
- * However, we don't track sector counts for journal usage, so this doesn't drop
- * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism
- * is retained for that.
- */
-int bch2_replicas_gc2(struct bch_fs *c)
-{
- struct bch_replicas_cpu new = { 0 };
- unsigned nr;
- int ret = 0;
-
- bch2_accounting_mem_gc(c);
-retry:
- nr = READ_ONCE(c->replicas.nr);
- new.entry_size = READ_ONCE(c->replicas.entry_size);
- new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL);
- if (!new.entries) {
- bch_err(c, "error allocating c->replicas_gc");
- return -BCH_ERR_ENOMEM_replicas_gc;
- }
-
- mutex_lock(&c->sb_lock);
- percpu_down_write(&c->mark_lock);
-
- if (nr != c->replicas.nr ||
- new.entry_size != c->replicas.entry_size) {
- percpu_up_write(&c->mark_lock);
- mutex_unlock(&c->sb_lock);
- kfree(new.entries);
- goto retry;
- }
-
- for (unsigned i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry_v1 *e =
- cpu_replicas_entry(&c->replicas, i);
-
- struct disk_accounting_pos k = {
- .type = BCH_DISK_ACCOUNTING_replicas,
- };
-
- unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e),
- "embedded variable length struct");
-
- struct bpos p = disk_accounting_pos_to_bpos(&k);
-
- struct bch_accounting_mem *acc = &c->accounting;
- bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, &p) >= acc->k.nr;
-
- if (e->data_type == BCH_DATA_journal || !kill)
- memcpy(cpu_replicas_entry(&new, new.nr++),
- e, new.entry_size);
- }
-
- bch2_cpu_replicas_sort(&new);
-
- ret = bch2_cpu_replicas_to_sb_replicas(c, &new);
-
- if (!ret)
- swap(c->replicas, new);
-
- kfree(new.entries);
-
- percpu_up_write(&c->mark_lock);
-
- if (!ret)
- bch2_write_super(c);
-
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
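
bch2_replicas_gc2() above cannot allocate while holding mark_lock, so it reads the table dimensions racily, allocates, then re-checks under the lock and retries from scratch if it lost a race with a resize. The same optimistic-allocation loop, standalone (tbl_snapshot() and the table are made-up names, and plain reads stand in for READ_ONCE()):

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct { unsigned nr, entry_size; char *entries; } tbl;

static char *tbl_snapshot(unsigned *nr_out)
{
	for (;;) {
		/* racy size reads, taken before the lock */
		unsigned nr = tbl.nr, es = tbl.entry_size;
		char *copy = calloc(nr ? nr : 1, es ? es : 1);

		if (!copy)
			return NULL;

		pthread_mutex_lock(&lock);
		if (nr == tbl.nr && es == tbl.entry_size) {
			if (nr)
				memcpy(copy, tbl.entries, (size_t) nr * es);
			pthread_mutex_unlock(&lock);
			*nr_out = nr;
			return copy;
		}
		pthread_mutex_unlock(&lock);
		free(copy);	/* dimensions changed: retry */
	}
}

int main(void)
{
	unsigned nr;

	free(tbl_snapshot(&nr));
	return 0;
}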
-
-/* Replicas tracking - superblock: */
-
-static int
-__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
- struct bch_replicas_cpu *cpu_r)
-{
- struct bch_replicas_entry_v1 *e, *dst;
- unsigned nr = 0, entry_size = 0, idx = 0;
-
- for_each_replicas_entry(sb_r, e) {
- entry_size = max_t(unsigned, entry_size,
- replicas_entry_bytes(e));
- nr++;
- }
-
- cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
- if (!cpu_r->entries)
- return -BCH_ERR_ENOMEM_cpu_replicas;
-
- cpu_r->nr = nr;
- cpu_r->entry_size = entry_size;
-
- for_each_replicas_entry(sb_r, e) {
- dst = cpu_replicas_entry(cpu_r, idx++);
- memcpy(dst, e, replicas_entry_bytes(e));
- bch2_replicas_entry_sort(dst);
- }
-
- return 0;
-}
-
-static int
-__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
- struct bch_replicas_cpu *cpu_r)
-{
- struct bch_replicas_entry_v0 *e;
- unsigned nr = 0, entry_size = 0, idx = 0;
-
- for_each_replicas_entry(sb_r, e) {
- entry_size = max_t(unsigned, entry_size,
- replicas_entry_bytes(e));
- nr++;
- }
-
- entry_size += sizeof(struct bch_replicas_entry_v1) -
- sizeof(struct bch_replicas_entry_v0);
-
- cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
- if (!cpu_r->entries)
- return -BCH_ERR_ENOMEM_cpu_replicas;
-
- cpu_r->nr = nr;
- cpu_r->entry_size = entry_size;
-
- for_each_replicas_entry(sb_r, e) {
- struct bch_replicas_entry_v1 *dst =
- cpu_replicas_entry(cpu_r, idx++);
-
- dst->data_type = e->data_type;
- dst->nr_devs = e->nr_devs;
- dst->nr_required = 1;
- memcpy(dst->devs, e->devs, e->nr_devs);
- bch2_replicas_entry_sort(dst);
- }
-
- return 0;
-}
-
-int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
-{
- struct bch_sb_field_replicas *sb_v1;
- struct bch_sb_field_replicas_v0 *sb_v0;
- struct bch_replicas_cpu new_r = { 0, 0, NULL };
- int ret = 0;
-
- if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas)))
- ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
- else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0)))
- ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
- if (ret)
- return ret;
-
- bch2_cpu_replicas_sort(&new_r);
-
- percpu_down_write(&c->mark_lock);
- swap(c->replicas, new_r);
- percpu_up_write(&c->mark_lock);
-
- kfree(new_r.entries);
-
- return 0;
-}
-
-static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
- struct bch_replicas_cpu *r)
-{
- struct bch_sb_field_replicas_v0 *sb_r;
- struct bch_replicas_entry_v0 *dst;
- struct bch_replicas_entry_v1 *src;
- size_t bytes;
-
- bytes = sizeof(struct bch_sb_field_replicas);
-
- for_each_cpu_replicas_entry(r, src)
- bytes += replicas_entry_bytes(src) - 1;
-
- sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
- DIV_ROUND_UP(bytes, sizeof(u64)));
- if (!sb_r)
- return -BCH_ERR_ENOSPC_sb_replicas;
-
- bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
- sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);
-
- memset(&sb_r->entries, 0,
- vstruct_end(&sb_r->field) -
- (void *) &sb_r->entries);
-
- dst = sb_r->entries;
- for_each_cpu_replicas_entry(r, src) {
- dst->data_type = src->data_type;
- dst->nr_devs = src->nr_devs;
- memcpy(dst->devs, src->devs, src->nr_devs);
-
- dst = replicas_entry_next(dst);
-
- BUG_ON((void *) dst > vstruct_end(&sb_r->field));
- }
-
- return 0;
-}
-
-static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
- struct bch_replicas_cpu *r)
-{
- struct bch_sb_field_replicas *sb_r;
- struct bch_replicas_entry_v1 *dst, *src;
- bool need_v1 = false;
- size_t bytes;
-
- bytes = sizeof(struct bch_sb_field_replicas);
-
- for_each_cpu_replicas_entry(r, src) {
- bytes += replicas_entry_bytes(src);
- if (src->nr_required != 1)
- need_v1 = true;
- }
-
- if (!need_v1)
- return bch2_cpu_replicas_to_sb_replicas_v0(c, r);
-
- sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
- DIV_ROUND_UP(bytes, sizeof(u64)));
- if (!sb_r)
- return -BCH_ERR_ENOSPC_sb_replicas;
-
- bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
- sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);
-
- memset(&sb_r->entries, 0,
- vstruct_end(&sb_r->field) -
- (void *) &sb_r->entries);
-
- dst = sb_r->entries;
- for_each_cpu_replicas_entry(r, src) {
- memcpy(dst, src, replicas_entry_bytes(src));
-
- dst = replicas_entry_next(dst);
-
- BUG_ON((void *) dst > vstruct_end(&sb_r->field));
- }
-
- return 0;
-}
-
-static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
- struct bch_sb *sb,
- struct printbuf *err)
-{
- unsigned i;
-
- sort_r(cpu_r->entries,
- cpu_r->nr,
- cpu_r->entry_size,
- bch2_memcmp, NULL,
- (void *)(size_t)cpu_r->entry_size);
-
- for (i = 0; i < cpu_r->nr; i++) {
- struct bch_replicas_entry_v1 *e =
- cpu_replicas_entry(cpu_r, i);
-
- int ret = bch2_replicas_entry_sb_validate(e, sb, err);
- if (ret)
- return ret;
-
- if (i + 1 < cpu_r->nr) {
- struct bch_replicas_entry_v1 *n =
- cpu_replicas_entry(cpu_r, i + 1);
-
- BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
-
- if (!memcmp(e, n, cpu_r->entry_size)) {
- prt_printf(err, "duplicate replicas entry ");
- bch2_replicas_entry_to_text(err, e);
- return -BCH_ERR_invalid_sb_replicas;
- }
- }
- }
-
- return 0;
-}
-
-static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
- struct bch_replicas_cpu cpu_r;
- int ret;
-
- ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r);
- if (ret)
- return ret;
-
- ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
- kfree(cpu_r.entries);
- return ret;
-}
-
-static void bch2_sb_replicas_to_text(struct printbuf *out,
- struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_replicas *r = field_to_type(f, replicas);
- struct bch_replicas_entry_v1 *e;
- bool first = true;
-
- for_each_replicas_entry(r, e) {
- if (!first)
- prt_printf(out, " ");
- first = false;
-
- bch2_replicas_entry_to_text(out, e);
- }
- prt_newline(out);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
- .validate = bch2_sb_replicas_validate,
- .to_text = bch2_sb_replicas_to_text,
-};
-
-static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
- struct bch_replicas_cpu cpu_r;
- int ret;
-
- ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r);
- if (ret)
- return ret;
-
- ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
- kfree(cpu_r.entries);
- return ret;
-}
-
-static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
- struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
- struct bch_replicas_entry_v0 *e;
- bool first = true;
-
- for_each_replicas_entry(sb_r, e) {
- if (!first)
- prt_printf(out, " ");
- first = false;
-
- bch2_replicas_entry_v0_to_text(out, e);
- }
- prt_newline(out);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
- .validate = bch2_sb_replicas_v0_validate,
- .to_text = bch2_sb_replicas_v0_to_text,
-};
-
-/* Query replicas: */
-
-bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
- unsigned flags, bool print)
-{
- struct bch_replicas_entry_v1 *e;
- bool ret = true;
-
- percpu_down_read(&c->mark_lock);
- for_each_cpu_replicas_entry(&c->replicas, e) {
- unsigned nr_online = 0, nr_failed = 0, dflags = 0;
- bool metadata = e->data_type < BCH_DATA_user;
-
- if (e->data_type == BCH_DATA_cached)
- continue;
-
- rcu_read_lock();
- for (unsigned i = 0; i < e->nr_devs; i++) {
- if (e->devs[i] == BCH_SB_MEMBER_INVALID) {
- nr_failed++;
- continue;
- }
-
- nr_online += test_bit(e->devs[i], devs.d);
-
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
- nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
- }
- rcu_read_unlock();
-
- if (nr_online + nr_failed == e->nr_devs)
- continue;
-
- if (nr_online < e->nr_required)
- dflags |= metadata
- ? BCH_FORCE_IF_METADATA_LOST
- : BCH_FORCE_IF_DATA_LOST;
-
- if (nr_online < e->nr_devs)
- dflags |= metadata
- ? BCH_FORCE_IF_METADATA_DEGRADED
- : BCH_FORCE_IF_DATA_DEGRADED;
-
- if (dflags & ~flags) {
- if (print) {
- struct printbuf buf = PRINTBUF;
-
- bch2_replicas_entry_to_text(&buf, e);
- bch_err(c, "insufficient devices online (%u) for replicas entry %s",
- nr_online, buf.buf);
- printbuf_exit(&buf);
- }
- ret = false;
- break;
- }
- }
- percpu_up_read(&c->mark_lock);
-
- return ret;
-}
-
-unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
-{
- struct bch_sb_field_replicas *replicas;
- struct bch_sb_field_replicas_v0 *replicas_v0;
- unsigned data_has = 0;
-
- replicas = bch2_sb_field_get(sb, replicas);
- replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
-
- if (replicas) {
- struct bch_replicas_entry_v1 *r;
-
- for_each_replicas_entry(replicas, r) {
- if (r->data_type >= sizeof(data_has) * 8)
- continue;
-
- for (unsigned i = 0; i < r->nr_devs; i++)
- if (r->devs[i] == dev)
- data_has |= 1 << r->data_type;
- }
-
- } else if (replicas_v0) {
- struct bch_replicas_entry_v0 *r;
-
- for_each_replicas_entry_v0(replicas_v0, r) {
- if (r->data_type >= sizeof(data_has) * 8)
- continue;
-
- for (unsigned i = 0; i < r->nr_devs; i++)
- if (r->devs[i] == dev)
- data_has |= 1 << r->data_type;
- }
- }
-
- return data_has;
-}
-
-unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
-{
- mutex_lock(&c->sb_lock);
- unsigned ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-void bch2_fs_replicas_exit(struct bch_fs *c)
-{
- kfree(c->replicas.entries);
- kfree(c->replicas_gc.entries);
-}
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
deleted file mode 100644
index 5aba2c1ce133..000000000000
--- a/fs/bcachefs/replicas.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REPLICAS_H
-#define _BCACHEFS_REPLICAS_H
-
-#include "bkey.h"
-#include "eytzinger.h"
-#include "replicas_types.h"
-
-void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
-void bch2_replicas_entry_to_text(struct printbuf *,
- struct bch_replicas_entry_v1 *);
-int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *,
- struct bch_fs *, struct printbuf *);
-void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
-
-static inline struct bch_replicas_entry_v1 *
-cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
-{
- return (void *) r->entries + r->entry_size * i;
-}
-
-int bch2_replicas_entry_idx(struct bch_fs *,
- struct bch_replicas_entry_v1 *);
-
-void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *,
- enum bch_data_type,
- struct bch_devs_list);
-
-bool bch2_replicas_marked_locked(struct bch_fs *,
- struct bch_replicas_entry_v1 *);
-bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *);
-int bch2_mark_replicas(struct bch_fs *,
- struct bch_replicas_entry_v1 *);
-
-void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c);
-
-static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e,
- unsigned dev)
-{
- e->data_type = BCH_DATA_cached;
- e->nr_devs = 1;
- e->nr_required = 1;
- e->devs[0] = dev;
-}
-
-bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask,
- unsigned, bool);
-
-unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned);
-unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
-
-int bch2_replicas_gc_end(struct bch_fs *, int);
-int bch2_replicas_gc_start(struct bch_fs *, unsigned);
-int bch2_replicas_gc2(struct bch_fs *);
-
-#define for_each_cpu_replicas_entry(_r, _i) \
- for (_i = (_r)->entries; \
- (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
- _i = (void *) (_i) + (_r)->entry_size)
-
-/* iterate over superblock replicas - used by userspace tools: */
-
-#define replicas_entry_next(_i) \
- ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i)))
-
-#define for_each_replicas_entry(_r, _i) \
- for (_i = (_r)->entries; \
- (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
- (_i) = replicas_entry_next(_i))
-
-#define for_each_replicas_entry_v0(_r, _i) \
- for (_i = (_r)->entries; \
- (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
- (_i) = replicas_entry_next(_i))
-
-int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_replicas;
-extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0;
-
-void bch2_fs_replicas_exit(struct bch_fs *);
-
-#endif /* _BCACHEFS_REPLICAS_H */
diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h
deleted file mode 100644
index b7eff904acdb..000000000000
--- a/fs/bcachefs/replicas_format.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REPLICAS_FORMAT_H
-#define _BCACHEFS_REPLICAS_FORMAT_H
-
-struct bch_replicas_entry_v0 {
- __u8 data_type;
- __u8 nr_devs;
- __u8 devs[] __counted_by(nr_devs);
-} __packed;
-
-struct bch_sb_field_replicas_v0 {
- struct bch_sb_field field;
- struct bch_replicas_entry_v0 entries[];
-} __packed __aligned(8);
-
-struct bch_replicas_entry_v1 {
- __u8 data_type;
- __u8 nr_devs;
- __u8 nr_required;
- __u8 devs[] __counted_by(nr_devs);
-} __packed;
-
-struct bch_sb_field_replicas {
- struct bch_sb_field field;
- struct bch_replicas_entry_v1 entries[];
-} __packed __aligned(8);
-
-#define replicas_entry_bytes(_i) \
- (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
-
-#define replicas_entry_add_dev(e, d) ({ \
- (e)->nr_devs++; \
- (e)->devs[(e)->nr_devs - 1] = (d); \
-})
-
-#endif /* _BCACHEFS_REPLICAS_FORMAT_H */
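
Both on-disk entry versions above are variable length, and replicas_entry_bytes() is just the header size plus nr_devs; a v1 entry costs exactly one byte more than a v0 entry (the nr_required field), which is where the "+ sizeof(v1) - sizeof(v0)" and "- 1" size adjustments in replicas.c come from. A quick standalone check of that arithmetic (GNU C, for the typeof in the macro, matching the header above):

#include <assert.h>
#include <stddef.h>

struct entry_v0 {
	unsigned char data_type;
	unsigned char nr_devs;
	unsigned char devs[];
};

struct entry_v1 {
	unsigned char data_type;
	unsigned char nr_devs;
	unsigned char nr_required;
	unsigned char devs[];
};

#define entry_bytes(_i) (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)

int main(void)
{
	struct entry_v1 e1 = { .data_type = 1, .nr_devs = 3, .nr_required = 1 };
	struct entry_v0 e0 = { .data_type = 1, .nr_devs = 3 };

	assert(entry_bytes(&e1) == 6);	/* 3-byte header + 3 dev indices */
	assert(entry_bytes(&e0) == 5);	/* 2-byte header + 3 dev indices */
	return 0;
}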
diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h
deleted file mode 100644
index fed71c861fe7..000000000000
--- a/fs/bcachefs/replicas_types.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REPLICAS_TYPES_H
-#define _BCACHEFS_REPLICAS_TYPES_H
-
-struct bch_replicas_cpu {
- unsigned nr;
- unsigned entry_size;
- struct bch_replicas_entry_v1 *entries;
-};
-
-#endif /* _BCACHEFS_REPLICAS_TYPES_H */
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
deleted file mode 100644
index 59c8770e4a0e..000000000000
--- a/fs/bcachefs/sb-clean.c
+++ /dev/null
@@ -1,340 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "error.h"
-#include "journal_io.h"
-#include "replicas.h"
-#include "sb-clean.h"
-#include "super-io.h"
-
-/*
- * BCH_SB_FIELD_clean:
- *
- * Btree roots, and a few other things, are recovered from the journal after an
- * unclean shutdown - but after a clean shutdown, to avoid having to read the
- * journal, we can store them in the superblock.
- *
- * bch_sb_field_clean simply contains a list of journal entries, stored exactly
- * as they would be in the journal:
- */
-
-int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean,
- int write)
-{
- struct bkey_validate_context from = {
- .flags = write,
- .from = BKEY_VALIDATE_superblock,
- };
- struct jset_entry *entry;
- int ret;
-
- for (entry = clean->start;
- entry < (struct jset_entry *) vstruct_end(&clean->field);
- entry = vstruct_next(entry)) {
- if (vstruct_end(entry) > vstruct_end(&clean->field)) {
- bch_err(c, "journal entry (u64s %u) overran end of superblock clean section (u64s %u) by %zu",
- le16_to_cpu(entry->u64s), le32_to_cpu(clean->field.u64s),
- (u64 *) vstruct_end(entry) - (u64 *) vstruct_end(&clean->field));
- bch2_sb_error_count(c, BCH_FSCK_ERR_sb_clean_entry_overrun);
- return -BCH_ERR_fsck_repair_unimplemented;
- }
-
- ret = bch2_journal_entry_validate(c, NULL, entry,
- le16_to_cpu(c->disk_sb.sb->version),
- BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
- from);
- if (ret)
- return ret;
- }
-
- return 0;
-}
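
The validation loop above walks the clean section as a packed run of variable-size entries: each entry declares its payload length in u64s, the cursor advances past header plus payload, and every entry is bounds-checked against the section end before being trusted. A standalone model of that walk; the layout is simplified and the names are stand-ins for jset_entry / vstruct_next():

#include <stdint.h>
#include <stdio.h>

struct entry {
	uint16_t u64s;		/* payload length, in u64s */
	uint16_t type;
	uint32_t pad;		/* header totals one u64 */
	uint64_t data[];
};

static const struct entry *entry_next(const struct entry *e)
{
	return (const void *) (e->data + e->u64s);
}

int main(void)
{
	uint64_t buf[8] = { 0 };
	struct entry *e = (struct entry *) buf;

	e->u64s = 1;		/* one entry: header + 1 u64 of payload */
	e->type = 7;
	e->data[0] = 42;

	for (const struct entry *i = (const struct entry *) buf;
	     (const uint64_t *) i < buf + 8; i = entry_next(i)) {
		if ((const uint64_t *) entry_next(i) > buf + 8) {
			puts("entry overruns section");
			break;
		}
		if (!i->u64s && !i->type)
			break;		/* zeroed tail of the section */
		printf("type %u, %u u64s of payload\n", i->type, i->u64s);
	}
	return 0;
}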
-
-static struct bkey_i *btree_root_find(struct bch_fs *c,
- struct bch_sb_field_clean *clean,
- struct jset *j,
- enum btree_id id, unsigned *level)
-{
- struct bkey_i *k;
- struct jset_entry *entry, *start, *end;
-
- if (clean) {
- start = clean->start;
- end = vstruct_end(&clean->field);
- } else {
- start = j->start;
- end = vstruct_last(j);
- }
-
- for (entry = start; entry < end; entry = vstruct_next(entry))
- if (entry->type == BCH_JSET_ENTRY_btree_root &&
- entry->btree_id == id)
- goto found;
-
- return NULL;
-found:
- if (!entry->u64s)
- return ERR_PTR(-EINVAL);
-
- k = entry->start;
- *level = entry->level;
- return k;
-}
-
-int bch2_verify_superblock_clean(struct bch_fs *c,
- struct bch_sb_field_clean **cleanp,
- struct jset *j)
-{
- unsigned i;
- struct bch_sb_field_clean *clean = *cleanp;
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
- int ret = 0;
-
- if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
- sb_clean_journal_seq_mismatch,
- "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
- le64_to_cpu(clean->journal_seq),
- le64_to_cpu(j->seq))) {
- kfree(clean);
- *cleanp = NULL;
- return 0;
- }
-
- for (i = 0; i < BTREE_ID_NR; i++) {
- struct bkey_i *k1, *k2;
- unsigned l1 = 0, l2 = 0;
-
- k1 = btree_root_find(c, clean, NULL, i, &l1);
- k2 = btree_root_find(c, NULL, j, i, &l2);
-
- if (!k1 && !k2)
- continue;
-
- printbuf_reset(&buf1);
- printbuf_reset(&buf2);
-
- if (k1)
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1));
- else
- prt_printf(&buf1, "(none)");
-
- if (k2)
- bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2));
- else
- prt_printf(&buf2, "(none)");
-
- mustfix_fsck_err_on(!k1 || !k2 ||
- IS_ERR(k1) ||
- IS_ERR(k2) ||
- k1->k.u64s != k2->k.u64s ||
- memcmp(k1, k2, bkey_bytes(&k1->k)) ||
- l1 != l2, c,
- sb_clean_btree_root_mismatch,
- "superblock btree root %u doesn't match journal after clean shutdown\n"
- "sb: l=%u %s\n"
- "journal: l=%u %s\n", i,
- l1, buf1.buf,
- l2, buf2.buf);
- }
-fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- return ret;
-}
-
-struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c)
-{
- struct bch_sb_field_clean *clean, *sb_clean;
- int ret;
-
- mutex_lock(&c->sb_lock);
- sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean);
-
- if (fsck_err_on(!sb_clean, c,
- sb_clean_missing,
- "superblock marked clean but clean section not present")) {
- SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
- c->sb.clean = false;
- mutex_unlock(&c->sb_lock);
- return ERR_PTR(-BCH_ERR_invalid_sb_clean);
- }
-
- clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
- GFP_KERNEL);
- if (!clean) {
- mutex_unlock(&c->sb_lock);
- return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean);
- }
-
- ret = bch2_sb_clean_validate_late(c, clean, READ);
- if (ret) {
- kfree(clean);
- mutex_unlock(&c->sb_lock);
- return ERR_PTR(ret);
- }
-
- mutex_unlock(&c->sb_lock);
-
- return clean;
-fsck_err:
- mutex_unlock(&c->sb_lock);
- return ERR_PTR(ret);
-}
-
-void bch2_journal_super_entries_add_common(struct bch_fs *c,
- struct jset_entry **end,
- u64 journal_seq)
-{
- {
- struct jset_entry_usage *u =
- container_of(jset_entry_init(end, sizeof(*u)),
- struct jset_entry_usage, entry);
-
- u->entry.type = BCH_JSET_ENTRY_usage;
- u->entry.btree_id = BCH_FS_USAGE_key_version;
- u->v = cpu_to_le64(atomic64_read(&c->key_version));
- }
-
- for (unsigned i = 0; i < 2; i++) {
- struct jset_entry_clock *clock =
- container_of(jset_entry_init(end, sizeof(*clock)),
- struct jset_entry_clock, entry);
-
- clock->entry.type = BCH_JSET_ENTRY_clock;
- clock->rw = i;
- clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
- }
-}
-
-static int bch2_sb_clean_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_clean *clean = field_to_type(f, clean);
-
- if (vstruct_bytes(&clean->field) < sizeof(*clean)) {
- prt_printf(err, "wrong size (got %zu should be %zu)",
- vstruct_bytes(&clean->field), sizeof(*clean));
- return -BCH_ERR_invalid_sb_clean;
- }
-
- for (struct jset_entry *entry = clean->start;
- entry != vstruct_end(&clean->field);
- entry = vstruct_next(entry)) {
- if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) {
- prt_str(err, "entry type ");
- bch2_prt_jset_entry_type(err, entry->type);
- prt_str(err, " overruns end of section");
- return -BCH_ERR_invalid_sb_clean;
- }
- }
-
- return 0;
-}
-
-static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_clean *clean = field_to_type(f, clean);
- struct jset_entry *entry;
-
- prt_printf(out, "flags: %x\n", le32_to_cpu(clean->flags));
- prt_printf(out, "journal_seq: %llu\n", le64_to_cpu(clean->journal_seq));
-
- for (entry = clean->start;
- entry != vstruct_end(&clean->field);
- entry = vstruct_next(entry)) {
- if ((void *) vstruct_next(entry) > vstruct_end(&clean->field))
- break;
-
- if (entry->type == BCH_JSET_ENTRY_btree_keys &&
- !entry->u64s)
- continue;
-
- bch2_journal_entry_to_text(out, NULL, entry);
- prt_newline(out);
- }
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_clean = {
- .validate = bch2_sb_clean_validate,
- .to_text = bch2_sb_clean_to_text,
-};
-
-int bch2_fs_mark_dirty(struct bch_fs *c)
-{
- int ret;
-
- /*
- * Unconditionally write superblock, to verify it hasn't changed before
- * we go rw:
- */
-
- mutex_lock(&c->sb_lock);
- SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
- c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
-
- ret = bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-void bch2_fs_mark_clean(struct bch_fs *c)
-{
- struct bch_sb_field_clean *sb_clean;
- struct jset_entry *entry;
- unsigned u64s;
- int ret;
-
- mutex_lock(&c->sb_lock);
- if (BCH_SB_CLEAN(c->disk_sb.sb))
- goto out;
-
- SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
-
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata);
- c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates));
- c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled));
-
- u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
-
- sb_clean = bch2_sb_field_resize(&c->disk_sb, clean, u64s);
- if (!sb_clean) {
- bch_err(c, "error resizing superblock while setting filesystem clean");
- goto out;
- }
-
- sb_clean->flags = 0;
- sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq));
-
- /* Trying to catch outstanding bug: */
- BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
-
- entry = sb_clean->start;
- bch2_journal_super_entries_add_common(c, &entry, 0);
- entry = bch2_btree_roots_to_journal_entries(c, entry, 0);
- BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
-
- memset(entry, 0,
- vstruct_end(&sb_clean->field) - (void *) entry);
-
- /*
- * this should be in the write path, and we should be validating every
- * superblock section:
- */
- ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE);
- if (ret) {
-		bch_err(c, "error marking filesystem clean: validate error");
- goto out;
- }
-
- bch2_journal_pos_from_member_info_set(c);
-
- bch2_write_super(c);
-out:
- mutex_unlock(&c->sb_lock);
-}
diff --git a/fs/bcachefs/sb-clean.h b/fs/bcachefs/sb-clean.h
deleted file mode 100644
index 71caef281239..000000000000
--- a/fs/bcachefs/sb-clean.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_CLEAN_H
-#define _BCACHEFS_SB_CLEAN_H
-
-int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int);
-int bch2_verify_superblock_clean(struct bch_fs *, struct bch_sb_field_clean **,
- struct jset *);
-struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *);
-void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64);
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_clean;
-
-int bch2_fs_mark_dirty(struct bch_fs *);
-void bch2_fs_mark_clean(struct bch_fs *);
-
-#endif /* _BCACHEFS_SB_CLEAN_H */
diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c
deleted file mode 100644
index 2b4b8445d418..000000000000
--- a/fs/bcachefs/sb-counters.c
+++ /dev/null
@@ -1,147 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "super-io.h"
-#include "sb-counters.h"
-
-/* BCH_SB_FIELD_counters */
-
-static const u8 counters_to_stable_map[] = {
-#define x(n, id, ...) [BCH_COUNTER_##n] = BCH_COUNTER_STABLE_##n,
- BCH_PERSISTENT_COUNTERS()
-#undef x
-};
-
-const char * const bch2_counter_names[] = {
-#define x(t, n, ...) (#t),
- BCH_PERSISTENT_COUNTERS()
-#undef x
- NULL
-};
-
-static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs)
-{
- if (!ctrs)
- return 0;
-
- return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0];
-}
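
counters_to_stable_map above decouples in-memory enum order from on-disk slots: the numeric column in the x-macro list is a stable ID, so counters can be reordered or added mid-list without corrupting counters already persisted in existing superblocks. The same trick as a self-contained miniature:

#include <assert.h>

#define MY_COUNTERS()	\
	x(reads,  0)	\
	x(writes, 1)	\
	x(moves,  2)

enum counter {
#define x(n, id) COUNTER_##n,
	MY_COUNTERS()
#undef x
	COUNTER_NR
};

static const unsigned char to_stable[] = {
#define x(n, id) [COUNTER_##n] = id,
	MY_COUNTERS()
#undef x
};

int main(void)
{
	/* the on-disk index comes from the table, never from enum order */
	assert(to_stable[COUNTER_writes] == 1);
	assert(COUNTER_NR == 3);
	return 0;
}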
-
-static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- return 0;
-}
-
-static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_counters *ctrs = field_to_type(f, counters);
- unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
-
- for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
- unsigned stable = counters_to_stable_map[i];
- if (stable < nr)
- prt_printf(out, "%s \t%llu\n",
- bch2_counter_names[i],
- le64_to_cpu(ctrs->d[stable]));
- }
-}
-
-int bch2_sb_counters_to_cpu(struct bch_fs *c)
-{
- struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
- unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
-
- for (unsigned i = 0; i < BCH_COUNTER_NR; i++)
- c->counters_on_mount[i] = 0;
-
- for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
- unsigned stable = counters_to_stable_map[i];
- if (stable < nr) {
- u64 v = le64_to_cpu(ctrs->d[stable]);
- percpu_u64_set(&c->counters[i], v);
- c->counters_on_mount[i] = v;
- }
- }
-
- return 0;
-}
-
-int bch2_sb_counters_from_cpu(struct bch_fs *c)
-{
- struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
- struct bch_sb_field_counters *ret;
- unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
-
- if (nr < BCH_COUNTER_NR) {
- ret = bch2_sb_field_resize(&c->disk_sb, counters,
- sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
- if (ret) {
- ctrs = ret;
- nr = bch2_sb_counter_nr_entries(ctrs);
- }
- }
-
- for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
- unsigned stable = counters_to_stable_map[i];
- if (stable < nr)
- ctrs->d[stable] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
- }
-
- return 0;
-}
-
-void bch2_fs_counters_exit(struct bch_fs *c)
-{
- free_percpu(c->counters);
-}
-
-int bch2_fs_counters_init(struct bch_fs *c)
-{
- c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64));
- if (!c->counters)
- return -BCH_ERR_ENOMEM_fs_counters_init;
-
- return bch2_sb_counters_to_cpu(c);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_counters = {
- .validate = bch2_sb_counters_validate,
- .to_text = bch2_sb_counters_to_text,
-};
-
-#ifndef NO_BCACHEFS_CHARDEV
-long bch2_ioctl_query_counters(struct bch_fs *c,
- struct bch_ioctl_query_counters __user *user_arg)
-{
- struct bch_ioctl_query_counters arg;
- int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg));
- if (ret)
- return ret;
-
- if ((arg.flags & ~BCH_IOCTL_QUERY_COUNTERS_MOUNT) ||
- arg.pad)
- return -EINVAL;
-
- arg.nr = min(arg.nr, BCH_COUNTER_NR);
- ret = put_user(arg.nr, &user_arg->nr);
- if (ret)
- return ret;
-
- for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
- unsigned stable = counters_to_stable_map[i];
-
- if (stable < arg.nr) {
- u64 v = !(arg.flags & BCH_IOCTL_QUERY_COUNTERS_MOUNT)
- ? percpu_u64_get(&c->counters[i])
- : c->counters_on_mount[i];
-
- ret = put_user(v, &user_arg->d[stable]);
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-#endif
diff --git a/fs/bcachefs/sb-counters.h b/fs/bcachefs/sb-counters.h
deleted file mode 100644
index a4329ad8dd1b..000000000000
--- a/fs/bcachefs/sb-counters.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_COUNTERS_H
-#define _BCACHEFS_SB_COUNTERS_H
-
-#include "bcachefs.h"
-#include "super-io.h"
-
-int bch2_sb_counters_to_cpu(struct bch_fs *);
-int bch2_sb_counters_from_cpu(struct bch_fs *);
-
-void bch2_fs_counters_exit(struct bch_fs *);
-int bch2_fs_counters_init(struct bch_fs *);
-
-extern const char * const bch2_counter_names[];
-extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
-
-long bch2_ioctl_query_counters(struct bch_fs *,
- struct bch_ioctl_query_counters __user *);
-
-#endif // _BCACHEFS_SB_COUNTERS_H
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
deleted file mode 100644
index fa27ec59a647..000000000000
--- a/fs/bcachefs/sb-counters_format.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H
-#define _BCACHEFS_SB_COUNTERS_FORMAT_H
-
-enum counters_flags {
- TYPE_COUNTER = BIT(0), /* event counters */
- TYPE_SECTORS = BIT(1), /* amount counters, the unit is sectors */
-};
-
-#define BCH_PERSISTENT_COUNTERS() \
- x(io_read, 0, TYPE_SECTORS) \
- x(io_read_inline, 80, TYPE_SECTORS) \
- x(io_read_hole, 81, TYPE_SECTORS) \
- x(io_read_promote, 30, TYPE_COUNTER) \
- x(io_read_bounce, 31, TYPE_COUNTER) \
- x(io_read_split, 33, TYPE_COUNTER) \
- x(io_read_reuse_race, 34, TYPE_COUNTER) \
- x(io_read_retry, 32, TYPE_COUNTER) \
- x(io_write, 1, TYPE_SECTORS) \
- x(io_move, 2, TYPE_SECTORS) \
- x(io_move_read, 35, TYPE_SECTORS) \
- x(io_move_write, 36, TYPE_SECTORS) \
- x(io_move_finish, 37, TYPE_SECTORS) \
- x(io_move_fail, 38, TYPE_COUNTER) \
- x(io_move_write_fail, 82, TYPE_COUNTER) \
- x(io_move_start_fail, 39, TYPE_COUNTER) \
- x(bucket_invalidate, 3, TYPE_COUNTER) \
- x(bucket_discard, 4, TYPE_COUNTER) \
- x(bucket_discard_fast, 79, TYPE_COUNTER) \
- x(bucket_alloc, 5, TYPE_COUNTER) \
- x(bucket_alloc_fail, 6, TYPE_COUNTER) \
- x(btree_cache_scan, 7, TYPE_COUNTER) \
- x(btree_cache_reap, 8, TYPE_COUNTER) \
- x(btree_cache_cannibalize, 9, TYPE_COUNTER) \
- x(btree_cache_cannibalize_lock, 10, TYPE_COUNTER) \
- x(btree_cache_cannibalize_lock_fail, 11, TYPE_COUNTER) \
- x(btree_cache_cannibalize_unlock, 12, TYPE_COUNTER) \
- x(btree_node_write, 13, TYPE_COUNTER) \
- x(btree_node_read, 14, TYPE_COUNTER) \
- x(btree_node_compact, 15, TYPE_COUNTER) \
- x(btree_node_merge, 16, TYPE_COUNTER) \
- x(btree_node_split, 17, TYPE_COUNTER) \
- x(btree_node_rewrite, 18, TYPE_COUNTER) \
- x(btree_node_alloc, 19, TYPE_COUNTER) \
- x(btree_node_free, 20, TYPE_COUNTER) \
- x(btree_node_set_root, 21, TYPE_COUNTER) \
- x(btree_path_relock_fail, 22, TYPE_COUNTER) \
- x(btree_path_upgrade_fail, 23, TYPE_COUNTER) \
- x(btree_reserve_get_fail, 24, TYPE_COUNTER) \
- x(journal_entry_full, 25, TYPE_COUNTER) \
- x(journal_full, 26, TYPE_COUNTER) \
- x(journal_reclaim_finish, 27, TYPE_COUNTER) \
- x(journal_reclaim_start, 28, TYPE_COUNTER) \
- x(journal_write, 29, TYPE_COUNTER) \
- x(copygc, 40, TYPE_COUNTER) \
- x(copygc_wait, 41, TYPE_COUNTER) \
- x(gc_gens_end, 42, TYPE_COUNTER) \
- x(gc_gens_start, 43, TYPE_COUNTER) \
- x(trans_blocked_journal_reclaim, 44, TYPE_COUNTER) \
- x(trans_restart_btree_node_reused, 45, TYPE_COUNTER) \
- x(trans_restart_btree_node_split, 46, TYPE_COUNTER) \
- x(trans_restart_fault_inject, 47, TYPE_COUNTER) \
- x(trans_restart_iter_upgrade, 48, TYPE_COUNTER) \
- x(trans_restart_journal_preres_get, 49, TYPE_COUNTER) \
- x(trans_restart_journal_reclaim, 50, TYPE_COUNTER) \
- x(trans_restart_journal_res_get, 51, TYPE_COUNTER) \
- x(trans_restart_key_cache_key_realloced, 52, TYPE_COUNTER) \
- x(trans_restart_key_cache_raced, 53, TYPE_COUNTER) \
- x(trans_restart_mark_replicas, 54, TYPE_COUNTER) \
- x(trans_restart_mem_realloced, 55, TYPE_COUNTER) \
- x(trans_restart_memory_allocation_failure, 56, TYPE_COUNTER) \
- x(trans_restart_relock, 57, TYPE_COUNTER) \
- x(trans_restart_relock_after_fill, 58, TYPE_COUNTER) \
- x(trans_restart_relock_key_cache_fill, 59, TYPE_COUNTER) \
- x(trans_restart_relock_next_node, 60, TYPE_COUNTER) \
- x(trans_restart_relock_parent_for_fill, 61, TYPE_COUNTER) \
- x(trans_restart_relock_path, 62, TYPE_COUNTER) \
- x(trans_restart_relock_path_intent, 63, TYPE_COUNTER) \
- x(trans_restart_too_many_iters, 64, TYPE_COUNTER) \
- x(trans_restart_traverse, 65, TYPE_COUNTER) \
- x(trans_restart_upgrade, 66, TYPE_COUNTER) \
- x(trans_restart_would_deadlock, 67, TYPE_COUNTER) \
- x(trans_restart_would_deadlock_write, 68, TYPE_COUNTER) \
- x(trans_restart_injected, 69, TYPE_COUNTER) \
- x(trans_restart_key_cache_upgrade, 70, TYPE_COUNTER) \
- x(trans_traverse_all, 71, TYPE_COUNTER) \
- x(transaction_commit, 72, TYPE_COUNTER) \
- x(write_super, 73, TYPE_COUNTER) \
- x(trans_restart_would_deadlock_recursion_limit, 74, TYPE_COUNTER) \
- x(trans_restart_write_buffer_flush, 75, TYPE_COUNTER) \
- x(trans_restart_split_race, 76, TYPE_COUNTER) \
- x(write_buffer_flush_slowpath, 77, TYPE_COUNTER) \
- x(write_buffer_flush_sync, 78, TYPE_COUNTER)
-
-enum bch_persistent_counters {
-#define x(t, n, ...) BCH_COUNTER_##t,
- BCH_PERSISTENT_COUNTERS()
-#undef x
- BCH_COUNTER_NR
-};
-
-enum bch_persistent_counters_stable {
-#define x(t, n, ...) BCH_COUNTER_STABLE_##t = n,
- BCH_PERSISTENT_COUNTERS()
-#undef x
- BCH_COUNTER_STABLE_NR
-};
-
-struct bch_sb_field_counters {
- struct bch_sb_field field;
- __le64 d[];
-};
-
-#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
deleted file mode 100644
index acb5d845841e..000000000000
--- a/fs/bcachefs/sb-downgrade.c
+++ /dev/null
@@ -1,443 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/*
- * Superblock section that contains a list of recovery passes to run when
- * downgrading past a given version
- */
-
-#include "bcachefs.h"
-#include "darray.h"
-#include "recovery_passes.h"
-#include "sb-downgrade.h"
-#include "sb-errors.h"
-#include "super-io.h"
-
-#define RECOVERY_PASS_ALL_FSCK BIT_ULL(63)
-
-/*
- * Upgrade, downgrade tables - run certain recovery passes, fix certain errors
- *
- * x(version, recovery_passes, errors...)
- */
-#define UPGRADE_TABLE() \
- x(backpointers, \
- RECOVERY_PASS_ALL_FSCK) \
- x(inode_v3, \
- RECOVERY_PASS_ALL_FSCK) \
- x(unwritten_extents, \
- RECOVERY_PASS_ALL_FSCK) \
- x(bucket_gens, \
- BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \
- RECOVERY_PASS_ALL_FSCK) \
- x(lru_v2, \
- RECOVERY_PASS_ALL_FSCK) \
- x(fragmentation_lru, \
- RECOVERY_PASS_ALL_FSCK) \
- x(no_bps_in_alloc_keys, \
- RECOVERY_PASS_ALL_FSCK) \
- x(snapshot_trees, \
- RECOVERY_PASS_ALL_FSCK) \
- x(snapshot_skiplists, \
- BIT_ULL(BCH_RECOVERY_PASS_check_snapshots), \
- BCH_FSCK_ERR_snapshot_bad_depth, \
- BCH_FSCK_ERR_snapshot_bad_skiplist) \
- x(deleted_inodes, \
- BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
- BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \
- x(rebalance_work, \
- BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \
- x(subvolume_fs_parent, \
- BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \
- BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \
- x(btree_subvolume_children, \
- BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \
- BCH_FSCK_ERR_subvol_children_not_set) \
- x(mi_btree_bitmap, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_btree_bitmap_not_marked) \
- x(disk_accounting_v2, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_bkey_version_in_future, \
- BCH_FSCK_ERR_dev_usage_buckets_wrong, \
- BCH_FSCK_ERR_dev_usage_sectors_wrong, \
- BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
- BCH_FSCK_ERR_accounting_mismatch) \
- x(disk_accounting_v3, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_bkey_version_in_future, \
- BCH_FSCK_ERR_dev_usage_buckets_wrong, \
- BCH_FSCK_ERR_dev_usage_sectors_wrong, \
- BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
- BCH_FSCK_ERR_accounting_mismatch, \
- BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
- BCH_FSCK_ERR_accounting_key_replicas_nr_required_bad, \
- BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \
- BCH_FSCK_ERR_accounting_key_junk_at_end) \
- x(disk_accounting_inum, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_accounting_mismatch) \
- x(rebalance_work_acct_fix, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_accounting_mismatch) \
- x(inode_has_child_snapshots, \
- BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
- BCH_FSCK_ERR_inode_has_child_snapshots_wrong) \
- x(backpointer_bucket_gen, \
- BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
- BCH_FSCK_ERR_backpointer_to_missing_ptr, \
- BCH_FSCK_ERR_ptr_to_missing_backpointer) \
- x(disk_accounting_big_endian, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_accounting_mismatch, \
- BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
- BCH_FSCK_ERR_accounting_key_junk_at_end) \
- x(cached_backpointers, \
- BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
- BCH_FSCK_ERR_ptr_to_missing_backpointer) \
- x(stripe_backpointers, \
- BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
- BCH_FSCK_ERR_ptr_to_missing_backpointer)
-
-#define DOWNGRADE_TABLE() \
- x(bucket_stripe_sectors, \
- 0) \
- x(disk_accounting_v2, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_dev_usage_buckets_wrong, \
- BCH_FSCK_ERR_dev_usage_sectors_wrong, \
- BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
- BCH_FSCK_ERR_fs_usage_hidden_wrong, \
- BCH_FSCK_ERR_fs_usage_btree_wrong, \
- BCH_FSCK_ERR_fs_usage_data_wrong, \
- BCH_FSCK_ERR_fs_usage_cached_wrong, \
- BCH_FSCK_ERR_fs_usage_reserved_wrong, \
- BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \
- BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \
- BCH_FSCK_ERR_fs_usage_replicas_wrong, \
- BCH_FSCK_ERR_bkey_version_in_future) \
- x(disk_accounting_v3, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_dev_usage_buckets_wrong, \
- BCH_FSCK_ERR_dev_usage_sectors_wrong, \
- BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
- BCH_FSCK_ERR_fs_usage_hidden_wrong, \
- BCH_FSCK_ERR_fs_usage_btree_wrong, \
- BCH_FSCK_ERR_fs_usage_data_wrong, \
- BCH_FSCK_ERR_fs_usage_cached_wrong, \
- BCH_FSCK_ERR_fs_usage_reserved_wrong, \
- BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \
- BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \
- BCH_FSCK_ERR_fs_usage_replicas_wrong, \
- BCH_FSCK_ERR_accounting_replicas_not_marked, \
- BCH_FSCK_ERR_bkey_version_in_future) \
- x(rebalance_work_acct_fix, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_accounting_mismatch, \
- BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
- BCH_FSCK_ERR_accounting_key_junk_at_end) \
- x(backpointer_bucket_gen, \
- BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
- BCH_FSCK_ERR_backpointer_bucket_offset_wrong, \
- BCH_FSCK_ERR_backpointer_to_missing_ptr, \
- BCH_FSCK_ERR_ptr_to_missing_backpointer) \
- x(disk_accounting_big_endian, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_accounting_mismatch, \
- BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
- BCH_FSCK_ERR_accounting_key_junk_at_end)
-
-struct upgrade_downgrade_entry {
- u64 recovery_passes;
- u16 version;
- u16 nr_errors;
- const u16 *errors;
-};
-
-#define x(ver, passes, ...) static const u16 upgrade_##ver##_errors[] = { __VA_ARGS__ };
-UPGRADE_TABLE()
-#undef x
-
-static const struct upgrade_downgrade_entry upgrade_table[] = {
-#define x(ver, passes, ...) { \
- .recovery_passes = passes, \
- .version = bcachefs_metadata_version_##ver,\
- .nr_errors = ARRAY_SIZE(upgrade_##ver##_errors), \
- .errors = upgrade_##ver##_errors, \
-},
-UPGRADE_TABLE()
-#undef x
-};
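
As a reading aid: here is what one UPGRADE_TABLE() entry expands to across the two x() passes above. The snapshot_skiplists values are taken straight from the table; the excerpt array name is invented for illustration (in the real expansion the initializer is simply one element of upgrade_table[]).

/* pass 1: x(ver, passes, ...) emits the per-version error list */
static const u16 upgrade_snapshot_skiplists_errors[] = {
	BCH_FSCK_ERR_snapshot_bad_depth,
	BCH_FSCK_ERR_snapshot_bad_skiplist,
};

/* pass 2: the same x() emits the table element that points at it */
static const struct upgrade_downgrade_entry upgrade_table_excerpt[] = {
	{
		.recovery_passes	= BIT_ULL(BCH_RECOVERY_PASS_check_snapshots),
		.version		= bcachefs_metadata_version_snapshot_skiplists,
		.nr_errors		= ARRAY_SIZE(upgrade_snapshot_skiplists_errors),
		.errors			= upgrade_snapshot_skiplists_errors,
	},
};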
-
-static int have_stripes(struct bch_fs *c)
-{
- if (IS_ERR_OR_NULL(c->btree_roots_known[BTREE_ID_stripes].b))
- return 0;
-
- return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b);
-}
-
-int bch2_sb_set_upgrade_extra(struct bch_fs *c)
-{
- unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
- unsigned new_version = c->sb.version;
- bool write_sb = false;
- int ret = 0;
-
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
- if (old_version < bcachefs_metadata_version_bucket_stripe_sectors &&
- new_version >= bcachefs_metadata_version_bucket_stripe_sectors &&
-	    ((ret = have_stripes(c)) > 0)) {
- __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_sectors_wrong, ext->errors_silent);
- write_sb = true;
- }
-
- if (write_sb)
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return ret < 0 ? ret : 0;
-}
-
-void bch2_sb_set_upgrade(struct bch_fs *c,
- unsigned old_version,
- unsigned new_version)
-{
- lockdep_assert_held(&c->sb_lock);
-
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
- for (const struct upgrade_downgrade_entry *i = upgrade_table;
- i < upgrade_table + ARRAY_SIZE(upgrade_table);
- i++)
- if (i->version > old_version && i->version <= new_version) {
- u64 passes = i->recovery_passes;
-
- if (passes & RECOVERY_PASS_ALL_FSCK)
- passes |= bch2_fsck_recovery_passes();
- passes &= ~RECOVERY_PASS_ALL_FSCK;
-
- ext->recovery_passes_required[0] |=
- cpu_to_le64(bch2_recovery_passes_to_stable(passes));
-
- for (const u16 *e = i->errors; e < i->errors + i->nr_errors; e++)
- __set_bit_le64(*e, ext->errors_silent);
- }
-}
-
-#define x(ver, passes, ...) static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ };
-DOWNGRADE_TABLE()
-#undef x
-
-static const struct upgrade_downgrade_entry downgrade_table[] = {
-#define x(ver, passes, ...) { \
- .recovery_passes = passes, \
- .version = bcachefs_metadata_version_##ver,\
- .nr_errors = ARRAY_SIZE(downgrade_##ver##_errors), \
- .errors = downgrade_##ver##_errors, \
-},
-DOWNGRADE_TABLE()
-#undef x
-};
-
-static int downgrade_table_extra(struct bch_fs *c, darray_char *table)
-{
- struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table);
- unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
- int ret = 0;
-
- unsigned nr_errors = le16_to_cpu(dst->nr_errors);
-
- switch (le16_to_cpu(dst->version)) {
- case bcachefs_metadata_version_bucket_stripe_sectors:
- if (have_stripes(c)) {
- bytes += sizeof(dst->errors[0]) * 2;
-
-			ret = darray_make_room(table, bytes);
-			if (ret)
-				return ret;
-
-			/* darray_make_room() may reallocate the table: refetch dst */
-			dst = (void *) &darray_top(*table);
-
- /* open coded __set_bit_le64, as dst is packed and
- * dst->recovery_passes is misaligned */
- unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations;
- dst->recovery_passes[b / 64] |= cpu_to_le64(BIT_ULL(b % 64));
-
- dst->errors[nr_errors++] = cpu_to_le16(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong);
- }
- break;
- }
-
- dst->nr_errors = cpu_to_le16(nr_errors);
- return ret;
-}
-
-static inline const struct bch_sb_field_downgrade_entry *
-downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e)
-{
- return (void *) &e->errors[le16_to_cpu(e->nr_errors)];
-}
-
-#define for_each_downgrade_entry(_d, _i) \
- for (const struct bch_sb_field_downgrade_entry *_i = (_d)->entries; \
- (void *) _i < vstruct_end(&(_d)->field) && \
- (void *) &_i->errors[0] <= vstruct_end(&(_d)->field) && \
- (void *) downgrade_entry_next_c(_i) <= vstruct_end(&(_d)->field); \
- _i = downgrade_entry_next_c(_i))
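
The entries this macro walks are variable length, so there is no array indexing: each step re-derives the next entry's address from the current entry's nr_errors, and the bounds checks guard against a truncated or corrupt section. A minimal host-endian sketch of the same traversal, with hypothetical demo_* names standing in for the __le16/__le64 on-disk types:

#include <stdint.h>
#include <stdio.h>

struct demo_entry {
	uint16_t	version;
	uint64_t	recovery_passes[2];
	uint16_t	nr_errors;
	uint16_t	errors[];
} __attribute__((packed, aligned(2)));

static const struct demo_entry *demo_entry_next(const struct demo_entry *e)
{
	/* the next entry starts just past errors[nr_errors] */
	return (const void *) &e->errors[e->nr_errors];
}

int main(void)
{
	uint8_t buf[64] = {0};
	struct demo_entry *e1 = (void *) buf;

	e1->version	= 1;
	e1->nr_errors	= 2;	/* entry spans 20 + 2 * 2 = 24 bytes */

	struct demo_entry *e2 = (void *) demo_entry_next(e1);
	e2->version	= 2;	/* no errors: 20 bytes */

	const void *end = demo_entry_next(e2);

	for (const struct demo_entry *i = (void *) buf;
	     (const void *) i < end;
	     i = demo_entry_next(i))
		printf("version %u, %u errors\n",
		       (unsigned) i->version, (unsigned) i->nr_errors);
	return 0;
}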
-
-static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_downgrade *e = field_to_type(f, downgrade);
-
- for (const struct bch_sb_field_downgrade_entry *i = e->entries;
- (void *) i < vstruct_end(&e->field);
- i = downgrade_entry_next_c(i)) {
- /*
- * Careful: sb_field_downgrade_entry is only 2 byte aligned, but
- * section sizes are 8 byte aligned - an empty entry spanning
- * the end of the section is allowed (and ignored):
- */
- if ((void *) &i->errors[0] > vstruct_end(&e->field))
- break;
-
- if (flags & BCH_VALIDATE_write &&
- (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field)) {
- prt_printf(err, "downgrade entry overruns end of superblock section");
- return -BCH_ERR_invalid_sb_downgrade;
- }
-
- if (BCH_VERSION_MAJOR(le16_to_cpu(i->version)) !=
- BCH_VERSION_MAJOR(le16_to_cpu(sb->version))) {
- prt_printf(err, "downgrade entry with mismatched major version (%u != %u)",
- BCH_VERSION_MAJOR(le16_to_cpu(i->version)),
- BCH_VERSION_MAJOR(le16_to_cpu(sb->version)));
- return -BCH_ERR_invalid_sb_downgrade;
- }
- }
-
- return 0;
-}
-
-static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_downgrade *e = field_to_type(f, downgrade);
-
- if (out->nr_tabstops <= 1)
- printbuf_tabstop_push(out, 16);
-
- for_each_downgrade_entry(e, i) {
- prt_str(out, "version:\t");
- bch2_version_to_text(out, le16_to_cpu(i->version));
- prt_newline(out);
-
- prt_str(out, "recovery passes:\t");
- prt_bitflags(out, bch2_recovery_passes,
- bch2_recovery_passes_from_stable(le64_to_cpu(i->recovery_passes[0])));
- prt_newline(out);
-
- prt_str(out, "errors:\t");
- bool first = true;
- for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
- if (!first)
- prt_char(out, ',');
- first = false;
- bch2_sb_error_id_to_text(out, le16_to_cpu(i->errors[j]));
- }
- prt_newline(out);
- }
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_downgrade = {
- .validate = bch2_sb_downgrade_validate,
- .to_text = bch2_sb_downgrade_to_text,
-};
-
-int bch2_sb_downgrade_update(struct bch_fs *c)
-{
- if (!test_bit(BCH_FS_btree_running, &c->flags))
- return 0;
-
- darray_char table = {};
- int ret = 0;
-
- for (const struct upgrade_downgrade_entry *src = downgrade_table;
- src < downgrade_table + ARRAY_SIZE(downgrade_table);
- src++) {
- if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
- continue;
-
- struct bch_sb_field_downgrade_entry *dst;
- unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors;
-
- ret = darray_make_room(&table, bytes);
- if (ret)
- goto out;
-
- dst = (void *) &darray_top(table);
- dst->version = cpu_to_le16(src->version);
- dst->recovery_passes[0] = cpu_to_le64(bch2_recovery_passes_to_stable(src->recovery_passes));
- dst->recovery_passes[1] = 0;
- dst->nr_errors = cpu_to_le16(src->nr_errors);
- for (unsigned i = 0; i < src->nr_errors; i++)
- dst->errors[i] = cpu_to_le16(src->errors[i]);
-
- ret = downgrade_table_extra(c, &table);
- if (ret)
- goto out;
-
- if (!dst->recovery_passes[0] &&
- !dst->recovery_passes[1] &&
- !dst->nr_errors)
- continue;
-
- table.nr += sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
- }
-
- struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade);
-
- unsigned sb_u64s = DIV_ROUND_UP(sizeof(*d) + table.nr, sizeof(u64));
-
- if (d && le32_to_cpu(d->field.u64s) > sb_u64s)
- goto out;
-
- d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s);
- if (!d) {
- ret = -BCH_ERR_ENOSPC_sb_downgrade;
- goto out;
- }
-
- memcpy(d->entries, table.data, table.nr);
- memset_u64s_tail(d->entries, 0, table.nr);
-out:
- darray_exit(&table);
- return ret;
-}
-
-void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_minor)
-{
- struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade);
- if (!d)
- return;
-
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
- for_each_downgrade_entry(d, i) {
- unsigned minor = BCH_VERSION_MINOR(le16_to_cpu(i->version));
- if (new_minor < minor && minor <= old_minor) {
- ext->recovery_passes_required[0] |= i->recovery_passes[0];
- ext->recovery_passes_required[1] |= i->recovery_passes[1];
-
- for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
- unsigned e = le16_to_cpu(i->errors[j]);
- if (e < BCH_FSCK_ERR_MAX)
- __set_bit(e, c->sb.errors_silent);
- if (e < sizeof(ext->errors_silent) * 8)
- __set_bit_le64(e, ext->errors_silent);
- }
- }
- }
-}
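
Note the half-open window in the minor-version comparison above: an entry fires only when its minor lies strictly above the version being downgraded to and at or below the version downgraded from. A one-line sketch of the predicate (hypothetical name); downgrading 1.7 to 1.4 applies the entries for minors 5, 6 and 7:

/* fires for entry_minor in (new_minor, old_minor] */
static int demo_entry_applies(unsigned new_minor, unsigned old_minor,
			      unsigned entry_minor)
{
	return new_minor < entry_minor && entry_minor <= old_minor;
}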
diff --git a/fs/bcachefs/sb-downgrade.h b/fs/bcachefs/sb-downgrade.h
deleted file mode 100644
index 095b7cc9bb47..000000000000
--- a/fs/bcachefs/sb-downgrade.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_DOWNGRADE_H
-#define _BCACHEFS_SB_DOWNGRADE_H
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade;
-
-int bch2_sb_downgrade_update(struct bch_fs *);
-void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned);
-int bch2_sb_set_upgrade_extra(struct bch_fs *);
-void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned);
-
-#endif /* _BCACHEFS_SB_DOWNGRADE_H */
diff --git a/fs/bcachefs/sb-downgrade_format.h b/fs/bcachefs/sb-downgrade_format.h
deleted file mode 100644
index cffd932be3ec..000000000000
--- a/fs/bcachefs/sb-downgrade_format.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_DOWNGRADE_FORMAT_H
-#define _BCACHEFS_SB_DOWNGRADE_FORMAT_H
-
-struct bch_sb_field_downgrade_entry {
- __le16 version;
- __le64 recovery_passes[2];
- __le16 nr_errors;
- __le16 errors[] __counted_by(nr_errors);
-} __packed __aligned(2);
-
-struct bch_sb_field_downgrade {
- struct bch_sb_field field;
- struct bch_sb_field_downgrade_entry entries[];
-};
-
-#endif /* _BCACHEFS_SB_DOWNGRADE_FORMAT_H */
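
One consequence of __packed __aligned(2) is worth spelling out: recovery_passes lands at byte offset 2, so it is never naturally aligned for 64-bit loads and stores, which is exactly why downgrade_table_extra() open codes __set_bit_le64. A standalone mirror of the layout with plain integer types, checking both facts at compile time:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct demo_downgrade_entry {
	uint16_t	version;
	uint64_t	recovery_passes[2];
	uint16_t	nr_errors;
	uint16_t	errors[];
} __attribute__((packed, aligned(2)));

static_assert(offsetof(struct demo_downgrade_entry, recovery_passes) == 2,
	      "64-bit field misaligned by design");
static_assert(sizeof(struct demo_downgrade_entry) == 20,
	      "2 + 16 + 2 bytes, no padding");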
diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c
deleted file mode 100644
index 013a96883b4e..000000000000
--- a/fs/bcachefs/sb-errors.c
+++ /dev/null
@@ -1,176 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "sb-errors.h"
-#include "super-io.h"
-
-const char * const bch2_sb_error_strs[] = {
-#define x(t, n, ...) [n] = #t,
- BCH_SB_ERRS()
-#undef x
-};
-
-void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id)
-{
- if (id < BCH_FSCK_ERR_MAX)
- prt_str(out, bch2_sb_error_strs[id]);
- else
- prt_printf(out, "(unknown error %u)", id);
-}
-
-static inline unsigned bch2_sb_field_errors_nr_entries(struct bch_sb_field_errors *e)
-{
- return bch2_sb_field_nr_entries(e);
-}
-
-static inline unsigned bch2_sb_field_errors_u64s(unsigned nr)
-{
- return (sizeof(struct bch_sb_field_errors) +
- sizeof(struct bch_sb_field_error_entry) * nr) / sizeof(u64);
-}
-
-static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_errors *e = field_to_type(f, errors);
- unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
-
- for (i = 0; i < nr; i++) {
- if (!BCH_SB_ERROR_ENTRY_NR(&e->entries[i])) {
- prt_printf(err, "entry with count 0 (id ");
- bch2_sb_error_id_to_text(err, BCH_SB_ERROR_ENTRY_ID(&e->entries[i]));
- prt_printf(err, ")");
- return -BCH_ERR_invalid_sb_errors;
- }
-
- if (i + 1 < nr &&
- BCH_SB_ERROR_ENTRY_ID(&e->entries[i]) >=
- BCH_SB_ERROR_ENTRY_ID(&e->entries[i + 1])) {
- prt_printf(err, "entries out of order");
- return -BCH_ERR_invalid_sb_errors;
- }
- }
-
- return 0;
-}
-
-static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_errors *e = field_to_type(f, errors);
- unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
-
- if (out->nr_tabstops <= 1)
- printbuf_tabstop_push(out, 16);
-
- for (i = 0; i < nr; i++) {
- bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(&e->entries[i]));
- prt_tab(out);
- prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i]));
- prt_tab(out);
- bch2_prt_datetime(out, le64_to_cpu(e->entries[i].last_error_time));
- prt_newline(out);
- }
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_errors = {
- .validate = bch2_sb_errors_validate,
- .to_text = bch2_sb_errors_to_text,
-};
-
-void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err)
-{
- bch_sb_errors_cpu *e = &c->fsck_error_counts;
- struct bch_sb_error_entry_cpu n = {
- .id = err,
- .nr = 1,
- .last_error_time = ktime_get_real_seconds()
- };
- unsigned i;
-
- mutex_lock(&c->fsck_error_counts_lock);
- for (i = 0; i < e->nr; i++) {
- if (err == e->data[i].id) {
- e->data[i].nr++;
- e->data[i].last_error_time = n.last_error_time;
- goto out;
- }
- if (err < e->data[i].id)
- break;
- }
-
- if (darray_make_room(e, 1))
- goto out;
-
- darray_insert_item(e, i, n);
-out:
- mutex_unlock(&c->fsck_error_counts_lock);
-}
-
-void bch2_sb_errors_from_cpu(struct bch_fs *c)
-{
- bch_sb_errors_cpu *src = &c->fsck_error_counts;
- struct bch_sb_field_errors *dst;
- unsigned i;
-
- mutex_lock(&c->fsck_error_counts_lock);
-
- dst = bch2_sb_field_resize(&c->disk_sb, errors,
- bch2_sb_field_errors_u64s(src->nr));
-
- if (!dst)
- goto err;
-
- for (i = 0; i < src->nr; i++) {
- SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id);
- SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr);
- dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time);
- }
-
-err:
- mutex_unlock(&c->fsck_error_counts_lock);
-}
-
-static int bch2_sb_errors_to_cpu(struct bch_fs *c)
-{
- struct bch_sb_field_errors *src = bch2_sb_field_get(c->disk_sb.sb, errors);
- bch_sb_errors_cpu *dst = &c->fsck_error_counts;
- unsigned i, nr = bch2_sb_field_errors_nr_entries(src);
- int ret;
-
- if (!nr)
- return 0;
-
- mutex_lock(&c->fsck_error_counts_lock);
- ret = darray_make_room(dst, nr);
- if (ret)
- goto err;
-
- dst->nr = nr;
-
- for (i = 0; i < nr; i++) {
- dst->data[i].id = BCH_SB_ERROR_ENTRY_ID(&src->entries[i]);
- dst->data[i].nr = BCH_SB_ERROR_ENTRY_NR(&src->entries[i]);
- dst->data[i].last_error_time = le64_to_cpu(src->entries[i].last_error_time);
- }
-err:
- mutex_unlock(&c->fsck_error_counts_lock);
-
- return ret;
-}
-
-void bch2_fs_sb_errors_exit(struct bch_fs *c)
-{
- darray_exit(&c->fsck_error_counts);
-}
-
-void bch2_fs_sb_errors_init_early(struct bch_fs *c)
-{
- mutex_init(&c->fsck_error_counts_lock);
- darray_init(&c->fsck_error_counts);
-}
-
-int bch2_fs_sb_errors_init(struct bch_fs *c)
-{
- return bch2_sb_errors_to_cpu(c);
-}
diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h
deleted file mode 100644
index b2357b8e6107..000000000000
--- a/fs/bcachefs/sb-errors.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_ERRORS_H
-#define _BCACHEFS_SB_ERRORS_H
-
-#include "sb-errors_types.h"
-
-extern const char * const bch2_sb_error_strs[];
-
-void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id);
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_errors;
-
-void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id);
-
-void bch2_sb_errors_from_cpu(struct bch_fs *);
-
-void bch2_fs_sb_errors_exit(struct bch_fs *);
-void bch2_fs_sb_errors_init_early(struct bch_fs *);
-int bch2_fs_sb_errors_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_SB_ERRORS_H */
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
deleted file mode 100644
index 67455beb8358..000000000000
--- a/fs/bcachefs/sb-errors_format.h
+++ /dev/null
@@ -1,339 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_ERRORS_FORMAT_H
-#define _BCACHEFS_SB_ERRORS_FORMAT_H
-
-enum bch_fsck_flags {
- FSCK_CAN_FIX = 1 << 0,
- FSCK_CAN_IGNORE = 1 << 1,
- FSCK_NO_RATELIMIT = 1 << 2,
- FSCK_AUTOFIX = 1 << 3,
-};
-
-#define BCH_SB_ERRS() \
- x(clean_but_journal_not_empty, 0, 0) \
- x(dirty_but_no_journal_entries, 1, 0) \
- x(dirty_but_no_journal_entries_post_drop_nonflushes, 2, 0) \
- x(sb_clean_journal_seq_mismatch, 3, 0) \
- x(sb_clean_btree_root_mismatch, 4, 0) \
- x(sb_clean_missing, 5, 0) \
- x(jset_unsupported_version, 6, 0) \
- x(jset_unknown_csum, 7, 0) \
- x(jset_last_seq_newer_than_seq, 8, 0) \
- x(jset_past_bucket_end, 9, 0) \
- x(jset_seq_blacklisted, 10, 0) \
- x(journal_entries_missing, 11, 0) \
- x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \
- x(journal_entry_past_jset_end, 13, 0) \
- x(journal_entry_replicas_data_mismatch, 14, 0) \
- x(journal_entry_bkey_u64s_0, 15, 0) \
- x(journal_entry_bkey_past_end, 16, 0) \
- x(journal_entry_bkey_bad_format, 17, 0) \
- x(journal_entry_bkey_invalid, 18, 0) \
- x(journal_entry_btree_root_bad_size, 19, 0) \
- x(journal_entry_blacklist_bad_size, 20, 0) \
- x(journal_entry_blacklist_v2_bad_size, 21, 0) \
- x(journal_entry_blacklist_v2_start_past_end, 22, 0) \
- x(journal_entry_usage_bad_size, 23, 0) \
- x(journal_entry_data_usage_bad_size, 24, 0) \
- x(journal_entry_clock_bad_size, 25, 0) \
- x(journal_entry_clock_bad_rw, 26, 0) \
- x(journal_entry_dev_usage_bad_size, 27, 0) \
- x(journal_entry_dev_usage_bad_dev, 28, 0) \
- x(journal_entry_dev_usage_bad_pad, 29, 0) \
- x(btree_node_unreadable, 30, 0) \
- x(btree_node_fault_injected, 31, 0) \
- x(btree_node_bad_magic, 32, 0) \
- x(btree_node_bad_seq, 33, 0) \
- x(btree_node_unsupported_version, 34, 0) \
- x(btree_node_bset_older_than_sb_min, 35, 0) \
- x(btree_node_bset_newer_than_sb, 36, 0) \
- x(btree_node_data_missing, 37, 0) \
- x(btree_node_bset_after_end, 38, 0) \
- x(btree_node_replicas_sectors_written_mismatch, 39, 0) \
- x(btree_node_replicas_data_mismatch, 40, 0) \
- x(bset_unknown_csum, 41, 0) \
- x(bset_bad_csum, 42, 0) \
- x(bset_past_end_of_btree_node, 43, 0) \
- x(bset_wrong_sector_offset, 44, 0) \
- x(bset_empty, 45, 0) \
- x(bset_bad_seq, 46, 0) \
- x(bset_blacklisted_journal_seq, 47, FSCK_AUTOFIX) \
- x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \
- x(btree_node_bad_btree, 49, 0) \
- x(btree_node_bad_level, 50, 0) \
- x(btree_node_bad_min_key, 51, 0) \
- x(btree_node_bad_max_key, 52, 0) \
- x(btree_node_bad_format, 53, 0) \
- x(btree_node_bkey_past_bset_end, 54, 0) \
- x(btree_node_bkey_bad_format, 55, 0) \
- x(btree_node_bad_bkey, 56, 0) \
- x(btree_node_bkey_out_of_order, 57, FSCK_AUTOFIX) \
- x(btree_root_bkey_invalid, 58, FSCK_AUTOFIX) \
- x(btree_root_read_error, 59, FSCK_AUTOFIX) \
- x(btree_root_bad_min_key, 60, 0) \
- x(btree_root_bad_max_key, 61, 0) \
- x(btree_node_read_error, 62, FSCK_AUTOFIX) \
- x(btree_node_topology_bad_min_key, 63, FSCK_AUTOFIX) \
- x(btree_node_topology_bad_max_key, 64, FSCK_AUTOFIX) \
- x(btree_node_topology_overwritten_by_prev_node, 65, FSCK_AUTOFIX) \
- x(btree_node_topology_overwritten_by_next_node, 66, FSCK_AUTOFIX) \
- x(btree_node_topology_interior_node_empty, 67, FSCK_AUTOFIX) \
- x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \
- x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \
- x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \
- x(fs_usage_cached_wrong, 71, FSCK_AUTOFIX) \
- x(fs_usage_reserved_wrong, 72, FSCK_AUTOFIX) \
- x(fs_usage_persistent_reserved_wrong, 73, FSCK_AUTOFIX) \
- x(fs_usage_nr_inodes_wrong, 74, FSCK_AUTOFIX) \
- x(fs_usage_replicas_wrong, 75, FSCK_AUTOFIX) \
- x(dev_usage_buckets_wrong, 76, FSCK_AUTOFIX) \
- x(dev_usage_sectors_wrong, 77, FSCK_AUTOFIX) \
- x(dev_usage_fragmented_wrong, 78, FSCK_AUTOFIX) \
- x(dev_usage_buckets_ec_wrong, 79, FSCK_AUTOFIX) \
- x(bkey_version_in_future, 80, 0) \
- x(bkey_u64s_too_small, 81, 0) \
- x(bkey_invalid_type_for_btree, 82, 0) \
- x(bkey_extent_size_zero, 83, 0) \
- x(bkey_extent_size_greater_than_offset, 84, 0) \
- x(bkey_size_nonzero, 85, 0) \
- x(bkey_snapshot_nonzero, 86, 0) \
- x(bkey_snapshot_zero, 87, 0) \
- x(bkey_at_pos_max, 88, 0) \
- x(bkey_before_start_of_btree_node, 89, 0) \
- x(bkey_after_end_of_btree_node, 90, 0) \
- x(bkey_val_size_nonzero, 91, 0) \
- x(bkey_val_size_too_small, 92, 0) \
- x(alloc_v1_val_size_bad, 93, 0) \
- x(alloc_v2_unpack_error, 94, 0) \
- x(alloc_v3_unpack_error, 95, 0) \
- x(alloc_v4_val_size_bad, 96, 0) \
- x(alloc_v4_backpointers_start_bad, 97, 0) \
- x(alloc_key_data_type_bad, 98, 0) \
- x(alloc_key_empty_but_have_data, 99, 0) \
- x(alloc_key_dirty_sectors_0, 100, 0) \
- x(alloc_key_data_type_inconsistency, 101, 0) \
- x(alloc_key_to_missing_dev_bucket, 102, 0) \
- x(alloc_key_cached_inconsistency, 103, 0) \
- x(alloc_key_cached_but_read_time_zero, 104, FSCK_AUTOFIX) \
- x(alloc_key_to_missing_lru_entry, 105, FSCK_AUTOFIX) \
- x(alloc_key_data_type_wrong, 106, FSCK_AUTOFIX) \
- x(alloc_key_gen_wrong, 107, FSCK_AUTOFIX) \
- x(alloc_key_dirty_sectors_wrong, 108, FSCK_AUTOFIX) \
- x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \
- x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \
- x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \
- x(alloc_key_journal_seq_in_future, 298, FSCK_AUTOFIX) \
- x(bucket_sector_count_overflow, 112, 0) \
- x(bucket_metadata_type_mismatch, 113, 0) \
- x(need_discard_key_wrong, 114, FSCK_AUTOFIX) \
- x(freespace_key_wrong, 115, FSCK_AUTOFIX) \
- x(freespace_hole_missing, 116, FSCK_AUTOFIX) \
- x(bucket_gens_val_size_bad, 117, 0) \
- x(bucket_gens_key_wrong, 118, FSCK_AUTOFIX) \
- x(bucket_gens_hole_wrong, 119, FSCK_AUTOFIX) \
- x(bucket_gens_to_invalid_dev, 120, FSCK_AUTOFIX) \
- x(bucket_gens_to_invalid_buckets, 121, FSCK_AUTOFIX) \
- x(bucket_gens_nonzero_for_invalid_buckets, 122, FSCK_AUTOFIX) \
- x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \
- x(need_discard_freespace_key_bad, 124, 0) \
- x(discarding_bucket_not_in_need_discard_btree, 291, 0) \
- x(backpointer_bucket_offset_wrong, 125, 0) \
- x(backpointer_level_bad, 294, 0) \
- x(backpointer_dev_bad, 297, 0) \
- x(backpointer_to_missing_device, 126, FSCK_AUTOFIX) \
- x(backpointer_to_missing_alloc, 127, FSCK_AUTOFIX) \
- x(backpointer_to_missing_ptr, 128, FSCK_AUTOFIX) \
- x(lru_entry_at_time_0, 129, FSCK_AUTOFIX) \
- x(lru_entry_to_invalid_bucket, 130, FSCK_AUTOFIX) \
- x(lru_entry_bad, 131, FSCK_AUTOFIX) \
- x(btree_ptr_val_too_big, 132, 0) \
- x(btree_ptr_v2_val_too_big, 133, 0) \
- x(btree_ptr_has_non_ptr, 134, 0) \
- x(extent_ptrs_invalid_entry, 135, 0) \
- x(extent_ptrs_no_ptrs, 136, 0) \
- x(extent_ptrs_too_many_ptrs, 137, 0) \
- x(extent_ptrs_redundant_crc, 138, 0) \
- x(extent_ptrs_redundant_stripe, 139, 0) \
- x(extent_ptrs_unwritten, 140, 0) \
- x(extent_ptrs_written_and_unwritten, 141, 0) \
- x(ptr_to_invalid_device, 142, 0) \
- x(ptr_to_duplicate_device, 143, 0) \
- x(ptr_after_last_bucket, 144, 0) \
- x(ptr_before_first_bucket, 145, 0) \
- x(ptr_spans_multiple_buckets, 146, 0) \
- x(ptr_to_missing_backpointer, 147, FSCK_AUTOFIX) \
- x(ptr_to_missing_alloc_key, 148, FSCK_AUTOFIX) \
- x(ptr_to_missing_replicas_entry, 149, FSCK_AUTOFIX) \
- x(ptr_to_missing_stripe, 150, 0) \
- x(ptr_to_incorrect_stripe, 151, 0) \
- x(ptr_gen_newer_than_bucket_gen, 152, 0) \
- x(ptr_too_stale, 153, 0) \
- x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \
- x(ptr_bucket_data_type_mismatch, 155, 0) \
- x(ptr_cached_and_erasure_coded, 156, 0) \
- x(ptr_crc_uncompressed_size_too_small, 157, 0) \
- x(ptr_crc_uncompressed_size_too_big, 161, 0) \
- x(ptr_crc_uncompressed_size_mismatch, 300, 0) \
- x(ptr_crc_csum_type_unknown, 158, 0) \
- x(ptr_crc_compression_type_unknown, 159, 0) \
- x(ptr_crc_redundant, 160, 0) \
- x(ptr_crc_nonce_mismatch, 162, 0) \
- x(ptr_stripe_redundant, 163, 0) \
- x(extent_flags_not_at_start, 306, 0) \
- x(reservation_key_nr_replicas_invalid, 164, 0) \
- x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \
- x(reflink_v_pos_bad, 292, 0) \
- x(reflink_p_to_missing_reflink_v, 166, FSCK_AUTOFIX) \
- x(reflink_refcount_underflow, 293, 0) \
- x(stripe_pos_bad, 167, 0) \
- x(stripe_val_size_bad, 168, 0) \
- x(stripe_csum_granularity_bad, 290, 0) \
- x(stripe_sector_count_wrong, 169, 0) \
- x(snapshot_tree_pos_bad, 170, 0) \
- x(snapshot_tree_to_missing_snapshot, 171, 0) \
- x(snapshot_tree_to_missing_subvol, 172, 0) \
- x(snapshot_tree_to_wrong_subvol, 173, 0) \
- x(snapshot_tree_to_snapshot_subvol, 174, 0) \
- x(snapshot_pos_bad, 175, 0) \
- x(snapshot_parent_bad, 176, 0) \
- x(snapshot_children_not_normalized, 177, 0) \
- x(snapshot_child_duplicate, 178, 0) \
- x(snapshot_child_bad, 179, 0) \
- x(snapshot_skiplist_not_normalized, 180, 0) \
- x(snapshot_skiplist_bad, 181, 0) \
- x(snapshot_should_not_have_subvol, 182, 0) \
- x(snapshot_to_bad_snapshot_tree, 183, FSCK_AUTOFIX) \
- x(snapshot_bad_depth, 184, 0) \
- x(snapshot_bad_skiplist, 185, 0) \
- x(subvol_pos_bad, 186, 0) \
- x(subvol_not_master_and_not_snapshot, 187, 0) \
- x(subvol_to_missing_root, 188, 0) \
- x(subvol_root_wrong_bi_subvol, 189, 0) \
- x(bkey_in_missing_snapshot, 190, 0) \
- x(inode_pos_inode_nonzero, 191, 0) \
- x(inode_pos_blockdev_range, 192, 0) \
- x(inode_alloc_cursor_inode_bad, 301, 0) \
- x(inode_unpack_error, 193, 0) \
- x(inode_str_hash_invalid, 194, 0) \
- x(inode_v3_fields_start_bad, 195, 0) \
- x(inode_snapshot_mismatch, 196, 0) \
- x(inode_unlinked_but_clean, 197, 0) \
- x(inode_unlinked_but_nlink_nonzero, 198, 0) \
- x(inode_unlinked_and_not_open, 281, 0) \
- x(inode_unlinked_but_has_dirent, 285, 0) \
- x(inode_checksum_type_invalid, 199, 0) \
- x(inode_compression_type_invalid, 200, 0) \
- x(inode_subvol_root_but_not_dir, 201, 0) \
- x(inode_i_size_dirty_but_clean, 202, FSCK_AUTOFIX) \
- x(inode_i_sectors_dirty_but_clean, 203, FSCK_AUTOFIX) \
- x(inode_i_sectors_wrong, 204, FSCK_AUTOFIX) \
- x(inode_dir_wrong_nlink, 205, FSCK_AUTOFIX) \
- x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \
- x(inode_dir_missing_backpointer, 284, FSCK_AUTOFIX) \
- x(inode_dir_unlinked_but_not_empty, 286, FSCK_AUTOFIX) \
- x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \
- x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \
- x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \
- x(inode_has_child_snapshots_wrong, 287, 0) \
- x(inode_unreachable, 210, FSCK_AUTOFIX) \
- x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \
- x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \
- x(deleted_inode_missing, 212, FSCK_AUTOFIX) \
- x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \
- x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \
- x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \
- x(extent_overlapping, 215, 0) \
- x(key_in_missing_inode, 216, 0) \
- x(key_in_wrong_inode_type, 217, 0) \
- x(extent_past_end_of_inode, 218, 0) \
- x(dirent_empty_name, 219, 0) \
- x(dirent_val_too_big, 220, 0) \
- x(dirent_name_too_long, 221, 0) \
- x(dirent_name_embedded_nul, 222, 0) \
- x(dirent_name_dot_or_dotdot, 223, 0) \
- x(dirent_name_has_slash, 224, 0) \
- x(dirent_d_type_wrong, 225, 0) \
- x(inode_bi_parent_wrong, 226, 0) \
- x(dirent_in_missing_dir_inode, 227, 0) \
- x(dirent_in_non_dir_inode, 228, 0) \
- x(dirent_to_missing_inode, 229, 0) \
- x(dirent_to_overwritten_inode, 302, 0) \
- x(dirent_to_missing_subvol, 230, 0) \
- x(dirent_to_itself, 231, 0) \
- x(quota_type_invalid, 232, 0) \
- x(xattr_val_size_too_small, 233, 0) \
- x(xattr_val_size_too_big, 234, 0) \
- x(xattr_invalid_type, 235, 0) \
- x(xattr_name_invalid_chars, 236, 0) \
- x(xattr_in_missing_inode, 237, 0) \
- x(root_subvol_missing, 238, 0) \
- x(root_dir_missing, 239, 0) \
- x(root_inode_not_dir, 240, 0) \
- x(dir_loop, 241, 0) \
- x(hash_table_key_duplicate, 242, 0) \
- x(hash_table_key_wrong_offset, 243, 0) \
- x(unlinked_inode_not_on_deleted_list, 244, FSCK_AUTOFIX) \
- x(reflink_p_front_pad_bad, 245, 0) \
- x(journal_entry_dup_same_device, 246, 0) \
- x(inode_bi_subvol_missing, 247, 0) \
- x(inode_bi_subvol_wrong, 248, 0) \
- x(inode_points_to_missing_dirent, 249, FSCK_AUTOFIX) \
- x(inode_points_to_wrong_dirent, 250, FSCK_AUTOFIX) \
- x(inode_bi_parent_nonzero, 251, 0) \
- x(dirent_to_missing_parent_subvol, 252, 0) \
- x(dirent_not_visible_in_parent_subvol, 253, 0) \
- x(subvol_fs_path_parent_wrong, 254, 0) \
- x(subvol_root_fs_path_parent_nonzero, 255, 0) \
- x(subvol_children_not_set, 256, 0) \
- x(subvol_children_bad, 257, 0) \
- x(subvol_loop, 258, 0) \
- x(subvol_unreachable, 259, FSCK_AUTOFIX) \
- x(btree_node_bkey_bad_u64s, 260, 0) \
- x(btree_node_topology_empty_interior_node, 261, 0) \
- x(btree_ptr_v2_min_key_bad, 262, 0) \
- x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \
- x(snapshot_node_missing, 264, 0) \
- x(dup_backpointer_to_bad_csum_extent, 265, 0) \
- x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \
- x(sb_clean_entry_overrun, 267, 0) \
- x(btree_ptr_v2_written_0, 268, 0) \
- x(subvol_snapshot_bad, 269, 0) \
- x(subvol_inode_bad, 270, 0) \
- x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \
- x(accounting_mismatch, 272, FSCK_AUTOFIX) \
- x(accounting_replicas_not_marked, 273, 0) \
- x(accounting_to_invalid_device, 289, 0) \
- x(invalid_btree_id, 274, 0) \
- x(alloc_key_io_time_bad, 275, 0) \
- x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \
- x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \
- x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \
- x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \
- x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
- x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
- x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
- x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \
- x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \
- x(directory_size_mismatch, 303, FSCK_AUTOFIX) \
- x(dirent_cf_name_too_big, 304, 0) \
- x(dirent_stray_data_after_cf_name, 305, 0) \
- x(MAX, 307, 0)
-
-enum bch_sb_error_id {
-#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
- BCH_SB_ERRS()
-#undef x
-};
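
Expanded, the enum's first and last members look like the excerpt below (values come from the table above; since ids are assigned explicitly they need not be dense or appear in order, and existing ids stay stable as new errors are added):

enum bch_sb_error_id {
	BCH_FSCK_ERR_clean_but_journal_not_empty	= 0,
	BCH_FSCK_ERR_dirty_but_no_journal_entries	= 1,
	/* ... */
	BCH_FSCK_ERR_MAX				= 307,
};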
-
-struct bch_sb_field_errors {
- struct bch_sb_field field;
- struct bch_sb_field_error_entry {
- __le64 v;
- __le64 last_error_time;
- } entries[];
-};
-
-LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16);
-LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64);
-
-#endif /* _BCACHEFS_SB_ERRORS_FORMAT_H */
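
The two LE64_BITMASK declarations pack an error id into bits 0-15 of v and a 48-bit occurrence count into bits 16-63, so one __le64 carries both. A host-endian sketch of the same packing (demo_* names are illustrative, not from the tree):

#include <stdint.h>

static inline uint64_t demo_error_entry_pack(uint16_t id, uint64_t nr)
{
	return (uint64_t) id | (nr << 16);	/* nr must fit in 48 bits */
}

static inline uint16_t demo_error_entry_id(uint64_t v)
{
	return v & 0xffff;
}

static inline uint64_t demo_error_entry_nr(uint64_t v)
{
	return v >> 16;
}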
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
deleted file mode 100644
index 40325239c3b0..000000000000
--- a/fs/bcachefs/sb-errors_types.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_ERRORS_TYPES_H
-#define _BCACHEFS_SB_ERRORS_TYPES_H
-
-#include "darray.h"
-
-struct bch_sb_error_entry_cpu {
- u64 id:16,
- nr:48;
- u64 last_error_time;
-};
-
-typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu;
-
-#endif /* _BCACHEFS_SB_ERRORS_TYPES_H */
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
deleted file mode 100644
index 116131f95815..000000000000
--- a/fs/bcachefs/sb-members.c
+++ /dev/null
@@ -1,532 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "disk_groups.h"
-#include "error.h"
-#include "opts.h"
-#include "replicas.h"
-#include "sb-members.h"
-#include "super-io.h"
-
-void bch2_dev_missing(struct bch_fs *c, unsigned dev)
-{
- if (dev != BCH_SB_MEMBER_INVALID)
- bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev);
-}
-
-void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket)
-{
- bch2_fs_inconsistent(c, "pointer to nonexistent bucket %llu:%llu", bucket.inode, bucket.offset);
-}
-
-#define x(t, n, ...) [n] = #t,
-static const char * const bch2_iops_measurements[] = {
- BCH_IOPS_MEASUREMENTS()
- NULL
-};
-
-char * const bch2_member_error_strs[] = {
- BCH_MEMBER_ERROR_TYPES()
- NULL
-};
-#undef x
-
-/* Code for bch_sb_field_members_v1: */
-
-struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i)
-{
- return __bch2_members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i);
-}
-
-static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i)
-{
- struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i);
- memset(&ret, 0, sizeof(ret));
- memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret)));
- return ret;
-}
-
-static struct bch_member *members_v1_get_mut(struct bch_sb_field_members_v1 *mi, int i)
-{
- return (void *) mi->_members + (i * BCH_MEMBER_V1_BYTES);
-}
-
-static struct bch_member members_v1_get(struct bch_sb_field_members_v1 *mi, int i)
-{
- struct bch_member ret, *p = members_v1_get_mut(mi, i);
- memset(&ret, 0, sizeof(ret));
- memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret)));
- return ret;
-}
-
-struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i)
-{
- struct bch_sb_field_members_v2 *mi2 = bch2_sb_field_get(sb, members_v2);
- if (mi2)
- return members_v2_get(mi2, i);
- struct bch_sb_field_members_v1 *mi1 = bch2_sb_field_get(sb, members_v1);
- return members_v1_get(mi1, i);
-}
-
-static int sb_members_v2_resize_entries(struct bch_fs *c)
-{
- struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
-
- if (le16_to_cpu(mi->member_bytes) < sizeof(struct bch_member)) {
- unsigned u64s = DIV_ROUND_UP((sizeof(*mi) + sizeof(mi->_members[0]) *
- c->disk_sb.sb->nr_devices), 8);
-
- mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
- if (!mi)
- return -BCH_ERR_ENOSPC_sb_members_v2;
-
- for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) {
- void *dst = (void *) mi->_members + (i * sizeof(struct bch_member));
- memmove(dst, __bch2_members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes));
- memset(dst + le16_to_cpu(mi->member_bytes),
- 0, (sizeof(struct bch_member) - le16_to_cpu(mi->member_bytes)));
- }
- mi->member_bytes = cpu_to_le16(sizeof(struct bch_member));
- }
- return 0;
-}
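
The backwards loop is the key detail here: widening entries in place means destinations overlap later sources, and starting from the last entry guarantees every narrow entry is copied out before anything is written over it. A toy, self-contained model of the same move:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* three 2-byte entries, to be widened to 4 bytes each in place */
	unsigned char buf[12] = { 'a', 'A', 'b', 'B', 'c', 'C' };
	unsigned old_sz = 2, new_sz = 4, nr = 3;

	for (int i = nr - 1; i >= 0; --i) {
		memmove(buf + i * new_sz, buf + i * old_sz, old_sz);
		memset(buf + i * new_sz + old_sz, 0, new_sz - old_sz);
	}

	for (unsigned i = 0; i < sizeof(buf); i++)
		putchar(buf[i] ? buf[i] : '.');
	putchar('\n');	/* prints: aA..bB..cC.. */
	return 0;
}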
-
-int bch2_sb_members_v2_init(struct bch_fs *c)
-{
- struct bch_sb_field_members_v1 *mi1;
- struct bch_sb_field_members_v2 *mi2;
-
- if (!bch2_sb_field_get(c->disk_sb.sb, members_v2)) {
- mi2 = bch2_sb_field_resize(&c->disk_sb, members_v2,
- DIV_ROUND_UP(sizeof(*mi2) +
- sizeof(struct bch_member) * c->sb.nr_devices,
- sizeof(u64)));
- mi1 = bch2_sb_field_get(c->disk_sb.sb, members_v1);
- memcpy(&mi2->_members[0], &mi1->_members[0],
- BCH_MEMBER_V1_BYTES * c->sb.nr_devices);
- memset(&mi2->pad[0], 0, sizeof(mi2->pad));
- mi2->member_bytes = cpu_to_le16(BCH_MEMBER_V1_BYTES);
- }
-
- return sb_members_v2_resize_entries(c);
-}
-
-int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb)
-{
- struct bch_sb_field_members_v1 *mi1;
- struct bch_sb_field_members_v2 *mi2;
-
- mi1 = bch2_sb_field_resize(disk_sb, members_v1,
- DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES *
- disk_sb->sb->nr_devices, sizeof(u64)));
- if (!mi1)
- return -BCH_ERR_ENOSPC_sb_members;
-
- mi2 = bch2_sb_field_get(disk_sb->sb, members_v2);
-
- for (unsigned i = 0; i < disk_sb->sb->nr_devices; i++)
- memcpy(members_v1_get_mut(mi1, i), __bch2_members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES);
-
- return 0;
-}
-
-static int validate_member(struct printbuf *err,
- struct bch_member m,
- struct bch_sb *sb,
- int i)
-{
- if (le64_to_cpu(m.nbuckets) > BCH_MEMBER_NBUCKETS_MAX) {
- prt_printf(err, "device %u: too many buckets (got %llu, max %u)",
- i, le64_to_cpu(m.nbuckets), BCH_MEMBER_NBUCKETS_MAX);
- return -BCH_ERR_invalid_sb_members;
- }
-
- if (le64_to_cpu(m.nbuckets) -
- le16_to_cpu(m.first_bucket) < BCH_MIN_NR_NBUCKETS) {
-		prt_printf(err, "device %u: not enough buckets (got %llu, min %u)",
- i, le64_to_cpu(m.nbuckets), BCH_MIN_NR_NBUCKETS);
- return -BCH_ERR_invalid_sb_members;
- }
-
- if (le16_to_cpu(m.bucket_size) <
- le16_to_cpu(sb->block_size)) {
- prt_printf(err, "device %u: bucket size %u smaller than block size %u",
- i, le16_to_cpu(m.bucket_size), le16_to_cpu(sb->block_size));
- return -BCH_ERR_invalid_sb_members;
- }
-
- if (le16_to_cpu(m.bucket_size) <
- BCH_SB_BTREE_NODE_SIZE(sb)) {
- prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu",
- i, le16_to_cpu(m.bucket_size), BCH_SB_BTREE_NODE_SIZE(sb));
- return -BCH_ERR_invalid_sb_members;
- }
-
- if (m.btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX) {
- prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift);
- return -BCH_ERR_invalid_sb_members;
- }
-
- return 0;
-}
-
-static void member_to_text(struct printbuf *out,
- struct bch_member m,
- struct bch_sb_field_disk_groups *gi,
- struct bch_sb *sb,
- int i)
-{
- unsigned data_have = bch2_sb_dev_has_data(sb, i);
- u64 bucket_size = le16_to_cpu(m.bucket_size);
- u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size;
-
- if (!bch2_member_alive(&m))
- return;
-
- prt_printf(out, "Device:\t%u\n", i);
-
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "Label:\t");
- if (BCH_MEMBER_GROUP(&m)) {
- unsigned idx = BCH_MEMBER_GROUP(&m) - 1;
-
- if (idx < disk_groups_nr(gi))
- prt_printf(out, "%s (%u)",
- gi->entries[idx].label, idx);
- else
- prt_printf(out, "(bad disk labels section)");
- } else {
- prt_printf(out, "(none)");
- }
- prt_newline(out);
-
- prt_printf(out, "UUID:\t");
- pr_uuid(out, m.uuid.b);
- prt_newline(out);
-
- prt_printf(out, "Size:\t");
- prt_units_u64(out, device_size << 9);
- prt_newline(out);
-
- for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
- prt_printf(out, "%s errors:\t%llu\n", bch2_member_error_strs[i], le64_to_cpu(m.errors[i]));
-
- for (unsigned i = 0; i < BCH_IOPS_NR; i++)
- prt_printf(out, "%s iops:\t%u\n", bch2_iops_measurements[i], le32_to_cpu(m.iops[i]));
-
- prt_printf(out, "Bucket size:\t");
- prt_units_u64(out, bucket_size << 9);
- prt_newline(out);
-
- prt_printf(out, "First bucket:\t%u\n", le16_to_cpu(m.first_bucket));
- prt_printf(out, "Buckets:\t%llu\n", le64_to_cpu(m.nbuckets));
-
- prt_printf(out, "Last mount:\t");
- if (m.last_mount)
- bch2_prt_datetime(out, le64_to_cpu(m.last_mount));
- else
- prt_printf(out, "(never)");
- prt_newline(out);
-
- prt_printf(out, "Last superblock write:\t%llu\n", le64_to_cpu(m.seq));
-
- prt_printf(out, "State:\t%s\n",
- BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR
- ? bch2_member_states[BCH_MEMBER_STATE(&m)]
- : "unknown");
-
- prt_printf(out, "Data allowed:\t");
- if (BCH_MEMBER_DATA_ALLOWED(&m))
- prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
- else
- prt_printf(out, "(none)");
- prt_newline(out);
-
- prt_printf(out, "Has data:\t");
- if (data_have)
- prt_bitflags(out, __bch2_data_types, data_have);
- else
- prt_printf(out, "(none)");
- prt_newline(out);
-
- prt_printf(out, "Btree allocated bitmap blocksize:\t");
- if (m.btree_bitmap_shift < 64)
- prt_units_u64(out, 1ULL << m.btree_bitmap_shift);
- else
- prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift);
- prt_newline(out);
-
- prt_printf(out, "Btree allocated bitmap:\t");
- bch2_prt_u64_base2_nbits(out, le64_to_cpu(m.btree_allocated_bitmap), 64);
- prt_newline(out);
-
- prt_printf(out, "Durability:\t%llu\n", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1);
-
- prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m));
- prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m));
-
- printbuf_indent_sub(out, 2);
-}
-
-static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
- unsigned i;
-
- if ((void *) members_v1_get_mut(mi, sb->nr_devices) > vstruct_end(&mi->field)) {
- prt_printf(err, "too many devices for section size");
- return -BCH_ERR_invalid_sb_members;
- }
-
- for (i = 0; i < sb->nr_devices; i++) {
- struct bch_member m = members_v1_get(mi, i);
-
- int ret = validate_member(err, m, sb, i);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
- struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
- unsigned i;
-
- for (i = 0; i < sb->nr_devices; i++)
- member_to_text(out, members_v1_get(mi, i), gi, sb, i);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_members_v1 = {
- .validate = bch2_sb_members_v1_validate,
- .to_text = bch2_sb_members_v1_to_text,
-};
-
-static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
- struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
- unsigned i;
-
- for (i = 0; i < sb->nr_devices; i++)
- member_to_text(out, members_v2_get(mi, i), gi, sb, i);
-}
-
-static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
- size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) -
- (void *) mi;
-
- if (mi_bytes > vstruct_bytes(&mi->field)) {
- prt_printf(err, "section too small (%zu > %zu)",
- mi_bytes, vstruct_bytes(&mi->field));
- return -BCH_ERR_invalid_sb_members;
- }
-
- for (unsigned i = 0; i < sb->nr_devices; i++) {
- int ret = validate_member(err, members_v2_get(mi, i), sb, i);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = {
- .validate = bch2_sb_members_v2_validate,
- .to_text = bch2_sb_members_v2_to_text,
-};
-
-void bch2_sb_members_from_cpu(struct bch_fs *c)
-{
- struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
-
- rcu_read_lock();
- for_each_member_device_rcu(c, ca, NULL) {
- struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx);
-
- for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++)
- m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e]));
- }
- rcu_read_unlock();
-}
-
-void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- struct bch_member m;
-
- mutex_lock(&ca->fs->sb_lock);
- m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);
- mutex_unlock(&ca->fs->sb_lock);
-
- printbuf_tabstop_push(out, 12);
-
- prt_str(out, "IO errors since filesystem creation");
- prt_newline(out);
-
- printbuf_indent_add(out, 2);
- for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
- prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], atomic64_read(&ca->errors[i]));
- printbuf_indent_sub(out, 2);
-
- prt_str(out, "IO errors since ");
- bch2_pr_time_units(out, (ktime_get_real_seconds() - le64_to_cpu(m.errors_reset_time)) * NSEC_PER_SEC);
- prt_str(out, " ago");
- prt_newline(out);
-
- printbuf_indent_add(out, 2);
- for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
- prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i],
- atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i]));
- printbuf_indent_sub(out, 2);
-}
-
-void bch2_dev_errors_reset(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- struct bch_member *m;
-
- mutex_lock(&c->sb_lock);
- m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++)
- m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i]));
- m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds());
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-}
-
-/*
- * Per member "range has btree nodes" bitmap:
- *
- * This is so that if we ever have to run the btree node scan to repair we don't
- * have to scan full devices:
- */
-
-bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k)
-{
- bool ret = true;
- rcu_read_lock();
- bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ca)
- continue;
-
- if (!bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) {
- ret = false;
- break;
- }
- }
- rcu_read_unlock();
- return ret;
-}
-
-static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev,
- u64 start, unsigned sectors)
-{
- struct bch_member *m = __bch2_members_v2_get_mut(mi, dev);
- u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap);
-
- u64 end = start + sectors;
-
- int resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6);
- if (resize > 0) {
- u64 new_bitmap = 0;
-
- for (unsigned i = 0; i < 64; i++)
- if (bitmap & BIT_ULL(i))
- new_bitmap |= BIT_ULL(i >> resize);
- bitmap = new_bitmap;
- m->btree_bitmap_shift += resize;
- }
-
- BUG_ON(m->btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX);
- BUG_ON(end > 64ULL << m->btree_bitmap_shift);
-
- for (unsigned bit = start >> m->btree_bitmap_shift;
- (u64) bit << m->btree_bitmap_shift < end;
- bit++)
- bitmap |= BIT_ULL(bit);
-
- m->btree_allocated_bitmap = cpu_to_le64(bitmap);
-}
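
When a mark would land past the range the 64 bits can currently describe, the bitmap is coarsened: the shift grows and existing bits are folded together, halving resolution per step. The original computes the new shift in one ilog2() step; the sketch below loops one doubling at a time, which is equivalent and easier to follow (host-endian, hypothetical demo_* names):

#include <stdint.h>
#include <stdio.h>

static void demo_bitmap_mark(uint64_t *bitmap, unsigned *shift,
			     uint64_t start, uint64_t end)
{
	/* coarsen until 64 buckets of 2^shift sectors reach 'end' */
	while (end > 64ULL << *shift) {
		uint64_t nb = 0;

		for (unsigned i = 0; i < 64; i++)
			if (*bitmap & (1ULL << i))
				nb |= 1ULL << (i >> 1);
		*bitmap = nb;
		(*shift)++;
	}

	for (uint64_t bit = start >> *shift;
	     bit << *shift < end;
	     bit++)
		*bitmap |= 1ULL << bit;
}

int main(void)
{
	uint64_t bitmap = 1;	/* bit 0: sectors [0, 2^10) hold btree nodes */
	unsigned shift = 10;

	demo_bitmap_mark(&bitmap, &shift, 1ULL << 17, (1ULL << 17) + 512);
	printf("shift %u bitmap %#llx\n", shift, (unsigned long long) bitmap);
	/* prints: shift 12 bitmap 0x100000001 */
	return 0;
}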
-
-void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k)
-{
- lockdep_assert_held(&c->sb_lock);
-
- struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
- if (!bch2_member_exists(c->disk_sb.sb, ptr->dev))
- continue;
-
- __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c));
- }
-}
-
-unsigned bch2_sb_nr_devices(const struct bch_sb *sb)
-{
- unsigned nr = 0;
-
- for (unsigned i = 0; i < sb->nr_devices; i++)
- nr += bch2_member_exists((struct bch_sb *) sb, i);
- return nr;
-}
-
-int bch2_sb_member_alloc(struct bch_fs *c)
-{
- unsigned dev_idx = c->sb.nr_devices;
- struct bch_sb_field_members_v2 *mi;
- unsigned nr_devices;
- unsigned u64s;
- int best = -1;
- u64 best_last_mount = 0;
-
- if (dev_idx < BCH_SB_MEMBERS_MAX)
- goto have_slot;
-
- for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) {
- /* eventually BCH_SB_MEMBERS_MAX will be raised */
- if (dev_idx == BCH_SB_MEMBER_INVALID)
- continue;
-
- struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
- if (bch2_member_alive(&m))
- continue;
-
- u64 last_mount = le64_to_cpu(m.last_mount);
- if (best < 0 || last_mount < best_last_mount) {
- best = dev_idx;
- best_last_mount = last_mount;
- }
- }
- if (best >= 0) {
- dev_idx = best;
- goto have_slot;
- }
-
- return -BCH_ERR_ENOSPC_sb_members;
-have_slot:
- nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
-
- mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
- le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
-
- mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
- if (!mi)
- return -BCH_ERR_ENOSPC_sb_members;
-
- c->disk_sb.sb->nr_devices = nr_devices;
- return dev_idx;
-}
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
deleted file mode 100644
index 38261638a611..000000000000
--- a/fs/bcachefs/sb-members.h
+++ /dev/null
@@ -1,381 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_MEMBERS_H
-#define _BCACHEFS_SB_MEMBERS_H
-
-#include "darray.h"
-#include "bkey_types.h"
-
-extern char * const bch2_member_error_strs[];
-
-static inline struct bch_member *
-__bch2_members_v2_get_mut(struct bch_sb_field_members_v2 *mi, unsigned i)
-{
- return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes));
-}
-
-int bch2_sb_members_v2_init(struct bch_fs *c);
-int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb);
-struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i);
-struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i);
-
-static inline bool bch2_dev_is_online(struct bch_dev *ca)
-{
- return !percpu_ref_is_zero(&ca->io_ref);
-}
-
-static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned);
-
-static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev)
-{
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, dev);
- bool ret = ca && bch2_dev_is_online(ca);
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline bool bch2_dev_is_healthy(struct bch_dev *ca)
-{
- return bch2_dev_is_online(ca) &&
- ca->mi.state != BCH_MEMBER_STATE_failed;
-}
-
-static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
-{
- return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
-}
-
-static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
- unsigned dev)
-{
- darray_for_each(devs, i)
- if (*i == dev)
- return true;
- return false;
-}
-
-static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
- unsigned dev)
-{
- darray_for_each(*devs, i)
- if (*i == dev) {
- darray_remove_item(devs, i);
- return;
- }
-}
-
-static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
- unsigned dev)
-{
- if (!bch2_dev_list_has_dev(*devs, dev)) {
- BUG_ON(devs->nr >= ARRAY_SIZE(devs->data));
- devs->data[devs->nr++] = dev;
- }
-}
-
-static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
-{
- return (struct bch_devs_list) { .nr = 1, .data[0] = dev };
-}
-
-static inline struct bch_dev *__bch2_next_dev_idx(struct bch_fs *c, unsigned idx,
- const struct bch_devs_mask *mask)
-{
- struct bch_dev *ca = NULL;
-
- while ((idx = mask
- ? find_next_bit(mask->d, c->sb.nr_devices, idx)
- : idx) < c->sb.nr_devices &&
- !(ca = rcu_dereference_check(c->devs[idx],
- lockdep_is_held(&c->state_lock))))
- idx++;
-
- return ca;
-}
-
-static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *ca,
- const struct bch_devs_mask *mask)
-{
- return __bch2_next_dev_idx(c, ca ? ca->dev_idx + 1 : 0, mask);
-}
-
-#define for_each_member_device_rcu(_c, _ca, _mask) \
- for (struct bch_dev *_ca = NULL; \
- (_ca = __bch2_next_dev((_c), _ca, (_mask)));)
-
-static inline void bch2_dev_get(struct bch_dev *ca)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- BUG_ON(atomic_long_inc_return(&ca->ref) <= 1L);
-#else
- percpu_ref_get(&ca->ref);
-#endif
-}
-
-static inline void __bch2_dev_put(struct bch_dev *ca)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- long r = atomic_long_dec_return(&ca->ref);
- if (r < (long) !ca->dying)
- panic("bch_dev->ref underflow, last put: %pS\n", (void *) ca->last_put);
- ca->last_put = _THIS_IP_;
- if (!r)
- complete(&ca->ref_completion);
-#else
- percpu_ref_put(&ca->ref);
-#endif
-}
-
-static inline void bch2_dev_put(struct bch_dev *ca)
-{
- if (ca)
- __bch2_dev_put(ca);
-}
-
-static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca)
-{
- rcu_read_lock();
- bch2_dev_put(ca);
- if ((ca = __bch2_next_dev(c, ca, NULL)))
- bch2_dev_get(ca);
- rcu_read_unlock();
-
- return ca;
-}
-
-/*
- * If you break early, you must drop your ref on the current device
- */
-#define __for_each_member_device(_c, _ca) \
- for (; (_ca = bch2_get_next_dev(_c, _ca));)
-
-#define for_each_member_device(_c, _ca) \
- for (struct bch_dev *_ca = NULL; \
- (_ca = bch2_get_next_dev(_c, _ca));)
-
-static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
- struct bch_dev *ca,
- unsigned state_mask)
-{
- rcu_read_lock();
- if (ca)
- percpu_ref_put(&ca->io_ref);
-
- while ((ca = __bch2_next_dev(c, ca, NULL)) &&
- (!((1 << ca->mi.state) & state_mask) ||
- !percpu_ref_tryget(&ca->io_ref)))
- ;
- rcu_read_unlock();
-
- return ca;
-}
-
-#define __for_each_online_member(_c, _ca, state_mask) \
- for (struct bch_dev *_ca = NULL; \
- (_ca = bch2_get_next_online_dev(_c, _ca, state_mask));)
-
-#define for_each_online_member(c, ca) \
- __for_each_online_member(c, ca, ~0)
-
-#define for_each_rw_member(c, ca) \
- __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw))
-
-#define for_each_readable_member(c, ca) \
-	__for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro))
-
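
A hedged usage sketch, not from the tree: the iterator macros above declare the cursor themselves, and bch2_get_next_online_dev() hands the io_ref from one device to the next, so a full walk needs no manual ref management:

/* hypothetical helper counting currently-writable members: */
static unsigned demo_nr_rw_members(struct bch_fs *c)
{
	unsigned nr = 0;

	for_each_rw_member(c, ca)
		nr++;

	return nr;
}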
-static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev)
-{
- return dev < c->sb.nr_devices && c->devs[dev];
-}
-
-static inline bool bucket_valid(const struct bch_dev *ca, u64 b)
-{
- return b - ca->mi.first_bucket < ca->mi.nbuckets_minus_first;
-}
-
-static inline struct bch_dev *bch2_dev_have_ref(const struct bch_fs *c, unsigned dev)
-{
- EBUG_ON(!bch2_dev_exists(c, dev));
-
- return rcu_dereference_check(c->devs[dev], 1);
-}
-
-static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev)
-{
- EBUG_ON(!bch2_dev_exists(c, dev));
-
- return rcu_dereference_protected(c->devs[dev],
- lockdep_is_held(&c->sb_lock) ||
- lockdep_is_held(&c->state_lock));
-}
-
-static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned dev)
-{
- return c && dev < c->sb.nr_devices
- ? rcu_dereference(c->devs[dev])
- : NULL;
-}
-
-void bch2_dev_missing(struct bch_fs *, unsigned);
-
-static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
-{
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
- if (unlikely(!ca))
- bch2_dev_missing(c, dev);
- return ca;
-}
-
-static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev)
-{
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
- if (ca)
- bch2_dev_get(ca);
- rcu_read_unlock();
- return ca;
-}
-
-static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev)
-{
- struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
- if (unlikely(!ca))
- bch2_dev_missing(c, dev);
- return ca;
-}
-
-static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket)
-{
- struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode);
- if (ca && !bucket_valid(ca, bucket.offset)) {
- bch2_dev_put(ca);
- ca = NULL;
- }
- return ca;
-}
-
-void bch2_dev_bucket_missing(struct bch_fs *, struct bpos);
-
-static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket)
-{
- struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, bucket);
- if (!ca)
- bch2_dev_bucket_missing(c, bucket);
- return ca;
-}
-
-static inline struct bch_dev *bch2_dev_iterate_noerror(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx)
-{
- if (ca && ca->dev_idx == dev_idx)
- return ca;
- bch2_dev_put(ca);
- return bch2_dev_tryget_noerror(c, dev_idx);
-}
-
-static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx)
-{
- if (ca && ca->dev_idx == dev_idx)
- return ca;
- bch2_dev_put(ca);
- return bch2_dev_tryget(c, dev_idx);
-}
-
-static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw)
-{
- might_sleep();
-
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, dev);
- if (ca && !percpu_ref_tryget(&ca->io_ref))
- ca = NULL;
- rcu_read_unlock();
-
- if (ca &&
- (ca->mi.state == BCH_MEMBER_STATE_rw ||
- (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)))
- return ca;
-
- if (ca)
- percpu_ref_put(&ca->io_ref);
- return NULL;
-}
-
-/* XXX kill, move to struct bch_fs */
-static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
-{
- struct bch_devs_mask devs;
-
- memset(&devs, 0, sizeof(devs));
- for_each_online_member(c, ca)
- __set_bit(ca->dev_idx, devs.d);
- return devs;
-}
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1;
-extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2;
-
-static inline bool bch2_member_alive(struct bch_member *m)
-{
- return !bch2_is_zero(&m->uuid, sizeof(m->uuid));
-}
-
-static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev)
-{
- if (dev < sb->nr_devices) {
- struct bch_member m = bch2_sb_member_get(sb, dev);
- return bch2_member_alive(&m);
- }
- return false;
-}
-
-unsigned bch2_sb_nr_devices(const struct bch_sb *);
-
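-/*
- * Note on the decoding below: the on-disk durability field is biased by one
- * so that zero can mean "unset"; a stored 0 decodes to the default durability
- * of 1, and a stored value n decodes to n - 1.
- */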
-static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
-{
- return (struct bch_member_cpu) {
- .nbuckets = le64_to_cpu(mi->nbuckets),
- .nbuckets_minus_first = le64_to_cpu(mi->nbuckets) -
- le16_to_cpu(mi->first_bucket),
- .first_bucket = le16_to_cpu(mi->first_bucket),
- .bucket_size = le16_to_cpu(mi->bucket_size),
- .group = BCH_MEMBER_GROUP(mi),
- .state = BCH_MEMBER_STATE(mi),
- .discard = BCH_MEMBER_DISCARD(mi),
- .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi),
- .durability = BCH_MEMBER_DURABILITY(mi)
- ? BCH_MEMBER_DURABILITY(mi) - 1
- : 1,
- .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
- .valid = bch2_member_alive(mi),
- .btree_bitmap_shift = mi->btree_bitmap_shift,
- .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap),
- };
-}
-
-void bch2_sb_members_from_cpu(struct bch_fs *);
-
-void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *);
-void bch2_dev_errors_reset(struct bch_dev *);
-
-static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors)
-{
- u64 end = start + sectors;
-
- if (end > 64ULL << ca->mi.btree_bitmap_shift)
- return false;
-
- for (unsigned bit = start >> ca->mi.btree_bitmap_shift;
- (u64) bit << ca->mi.btree_bitmap_shift < end;
- bit++)
- if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit)))
- return false;
- return true;
-}
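-
-/*
- * Worked example (illustrative): with btree_bitmap_shift = 10, each bit of
- * btree_allocated_bitmap covers 1 << 10 = 1024 sectors and the whole bitmap
- * spans 64 << 10 sectors. The range [2048, 2560) falls entirely within bit 2,
- * so it is marked iff BIT_ULL(2) is set.
- */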
-
-bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
-void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
-
-int bch2_sb_member_alloc(struct bch_fs *);
-
-#endif /* _BCACHEFS_SB_MEMBERS_H */
diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h
deleted file mode 100644
index 3affec823b3f..000000000000
--- a/fs/bcachefs/sb-members_format.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_MEMBERS_FORMAT_H
-#define _BCACHEFS_SB_MEMBERS_FORMAT_H
-
-/*
- * We refer to members with bitmasks in various places - but we need to get rid
- * of this limit:
- */
-#define BCH_SB_MEMBERS_MAX 64
-
-/*
- * Sentinel value - indicates a device that does not exist
- */
-#define BCH_SB_MEMBER_INVALID 255
-
-#define BCH_MIN_NR_NBUCKETS (1 << 6)
-
-#define BCH_IOPS_MEASUREMENTS() \
- x(seqread, 0) \
- x(seqwrite, 1) \
- x(randread, 2) \
- x(randwrite, 3)
-
-enum bch_iops_measurement {
-#define x(t, n) BCH_IOPS_##t = n,
- BCH_IOPS_MEASUREMENTS()
-#undef x
- BCH_IOPS_NR
-};
-
-#define BCH_MEMBER_ERROR_TYPES() \
- x(read, 0) \
- x(write, 1) \
- x(checksum, 2)
-
-enum bch_member_error_type {
-#define x(t, n) BCH_MEMBER_ERROR_##t = n,
- BCH_MEMBER_ERROR_TYPES()
-#undef x
- BCH_MEMBER_ERROR_NR
-};
-
-struct bch_member {
- __uuid_t uuid;
- __le64 nbuckets; /* device size */
- __le16 first_bucket; /* index of first bucket used */
- __le16 bucket_size; /* sectors */
- __u8 btree_bitmap_shift;
- __u8 pad[3];
- __le64 last_mount; /* time_t */
-
- __le64 flags;
- __le32 iops[4];
- __le64 errors[BCH_MEMBER_ERROR_NR];
- __le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
- __le64 errors_reset_time;
- __le64 seq;
- __le64 btree_allocated_bitmap;
- /*
- * On recovery from a clean shutdown we don't normally read the journal,
- * but we still want to resume writing from where we left off so we
- * don't overwrite more than is necessary, for list journal debugging:
- */
- __le32 last_journal_bucket;
- __le32 last_journal_bucket_offset;
-};
-
-/*
- * btree_allocated_bitmap can represent sector addresses spanning the full
- * range of a u64: the bitmap itself has 64 bits, so the maximum shift is
- * 64 - ilog2(64) = 58
- */
-#define BCH_MI_BTREE_BITMAP_SHIFT_MAX 58
-
-/*
- * This limit comes from the bucket_gens array - it's a single allocation, and
- * kernel allocations are limited to INT_MAX
- */
-#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64)
-
-#define BCH_MEMBER_V1_BYTES 56
-
-LE16_BITMASK(BCH_MEMBER_BUCKET_SIZE, struct bch_member, bucket_size, 0, 16)
-LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4)
-/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
-LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15)
-LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20)
-LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28)
-LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30)
-LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
- struct bch_member, flags, 30, 31)
-
-#if 0
-LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
-LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
-#endif
-
-#define BCH_MEMBER_STATES() \
- x(rw, 0) \
- x(ro, 1) \
- x(failed, 2) \
- x(spare, 3)
-
-enum bch_member_state {
-#define x(t, n) BCH_MEMBER_STATE_##t = n,
- BCH_MEMBER_STATES()
-#undef x
- BCH_MEMBER_STATE_NR
-};
-
-struct bch_sb_field_members_v1 {
- struct bch_sb_field field;
- struct bch_member _members[]; // members are now variable size
-};
-
-struct bch_sb_field_members_v2 {
- struct bch_sb_field field;
- __le16 member_bytes; // size of a single member entry
- u8 pad[6];
- struct bch_member _members[];
-};
-
-#endif /* _BCACHEFS_SB_MEMBERS_FORMAT_H */
diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h
deleted file mode 100644
index c0eda888fe39..000000000000
--- a/fs/bcachefs/sb-members_types.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_MEMBERS_TYPES_H
-#define _BCACHEFS_SB_MEMBERS_TYPES_H
-
-struct bch_member_cpu {
- u64 nbuckets; /* device size */
- u64 nbuckets_minus_first;
- u16 first_bucket; /* index of first bucket used */
- u16 bucket_size; /* sectors */
- u16 group;
- u8 state;
- u8 discard;
- u8 data_allowed;
- u8 durability;
- u8 freespace_initialized;
- u8 valid;
- u8 btree_bitmap_shift;
- u64 btree_allocated_bitmap;
-};
-
-#endif /* _BCACHEFS_SB_MEMBERS_TYPES_H */
diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h
deleted file mode 100644
index c4b3d8d3f414..000000000000
--- a/fs/bcachefs/seqmutex.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SEQMUTEX_H
-#define _BCACHEFS_SEQMUTEX_H
-
-#include <linux/mutex.h>
-
-struct seqmutex {
- struct mutex lock;
- u32 seq;
-};
-
-#define seqmutex_init(_lock) mutex_init(&(_lock)->lock)
-
-static inline bool seqmutex_trylock(struct seqmutex *lock)
-{
- return mutex_trylock(&lock->lock);
-}
-
-static inline void seqmutex_lock(struct seqmutex *lock)
-{
- mutex_lock(&lock->lock);
- lock->seq++;
-}
-
-static inline u32 seqmutex_unlock(struct seqmutex *lock)
-{
- u32 seq = lock->seq;
- mutex_unlock(&lock->lock);
- return seq;
-}
-
-static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq)
-{
- if (lock->seq != seq || !mutex_trylock(&lock->lock))
- return false;
-
- if (lock->seq != seq) {
- mutex_unlock(&lock->lock);
- return false;
- }
-
- return true;
-}
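-
-/*
- * Illustrative usage (a sketch, not from the original source; 'x' is a
- * hypothetical struct containing a seqmutex): drop the lock across a blocking
- * operation, then detect whether anyone else took it in the meantime:
- *
- *   seqmutex_lock(&x->lock);
- *   ...
- *   u32 seq = seqmutex_unlock(&x->lock);
- *
- *   ... blocking operation ...
- *
- *   if (seqmutex_relock(&x->lock, seq)) {
- *           ... no one took the lock in between; state is unchanged ...
- *           seqmutex_unlock(&x->lock);
- *   }
- */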
-
-#endif /* _BCACHEFS_SEQMUTEX_H */
diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c
deleted file mode 100644
index a1cc44e66c7e..000000000000
--- a/fs/bcachefs/siphash.c
+++ /dev/null
@@ -1,173 +0,0 @@
-// SPDX-License-Identifier: BSD-3-Clause
-/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */
-
-/*-
- * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the author may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d
- * are the number of compression rounds and the number of finalization rounds.
- * A compression round is identical to a finalization round and this round
- * function is called SipRound. Given a 128-bit key k and a (possibly empty)
- * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m).
- *
- * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18,
- * by Jean-Philippe Aumasson and Daniel J. Bernstein,
- * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa
- * https://131002.net/siphash/siphash.pdf
- * https://131002.net/siphash/
- */
-
-#include <asm/byteorder.h>
-#include <linux/unaligned.h>
-#include <linux/bitops.h>
-#include <linux/string.h>
-
-#include "siphash.h"
-
-static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
-{
- while (rounds--) {
- ctx->v[0] += ctx->v[1];
- ctx->v[2] += ctx->v[3];
- ctx->v[1] = rol64(ctx->v[1], 13);
- ctx->v[3] = rol64(ctx->v[3], 16);
-
- ctx->v[1] ^= ctx->v[0];
- ctx->v[3] ^= ctx->v[2];
- ctx->v[0] = rol64(ctx->v[0], 32);
-
- ctx->v[2] += ctx->v[1];
- ctx->v[0] += ctx->v[3];
- ctx->v[1] = rol64(ctx->v[1], 17);
- ctx->v[3] = rol64(ctx->v[3], 21);
-
- ctx->v[1] ^= ctx->v[2];
- ctx->v[3] ^= ctx->v[0];
- ctx->v[2] = rol64(ctx->v[2], 32);
- }
-}
-
-static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds)
-{
- u64 m = get_unaligned_le64(ptr);
-
- ctx->v[3] ^= m;
- SipHash_Rounds(ctx, rounds);
- ctx->v[0] ^= m;
-}
-
-void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
-{
- u64 k0, k1;
-
- k0 = le64_to_cpu(key->k0);
- k1 = le64_to_cpu(key->k1);
-
- ctx->v[0] = 0x736f6d6570736575ULL ^ k0;
- ctx->v[1] = 0x646f72616e646f6dULL ^ k1;
- ctx->v[2] = 0x6c7967656e657261ULL ^ k0;
- ctx->v[3] = 0x7465646279746573ULL ^ k1;
-
- memset(ctx->buf, 0, sizeof(ctx->buf));
- ctx->bytes = 0;
-}
-
-void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf,
- const void *src, size_t len)
-{
- const u8 *ptr = src;
- size_t left, used;
-
- if (len == 0)
- return;
-
- used = ctx->bytes % sizeof(ctx->buf);
- ctx->bytes += len;
-
- if (used > 0) {
- left = sizeof(ctx->buf) - used;
-
- if (len >= left) {
- memcpy(&ctx->buf[used], ptr, left);
- SipHash_CRounds(ctx, ctx->buf, rc);
- len -= left;
- ptr += left;
- } else {
- memcpy(&ctx->buf[used], ptr, len);
- return;
- }
- }
-
- while (len >= sizeof(ctx->buf)) {
- SipHash_CRounds(ctx, ptr, rc);
- len -= sizeof(ctx->buf);
- ptr += sizeof(ctx->buf);
- }
-
- if (len > 0)
- memcpy(&ctx->buf[used], ptr, len);
-}
-
-void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
-{
- u64 r;
-
- r = SipHash_End(ctx, rc, rf);
-
- *((__le64 *) dst) = cpu_to_le64(r);
-}
-
-u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
-{
- u64 r;
- size_t left, used;
-
- used = ctx->bytes % sizeof(ctx->buf);
- left = sizeof(ctx->buf) - used;
- memset(&ctx->buf[used], 0, left - 1);
- ctx->buf[7] = ctx->bytes;
-
- SipHash_CRounds(ctx, ctx->buf, rc);
- ctx->v[2] ^= 0xff;
- SipHash_Rounds(ctx, rf);
-
- r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]);
- memset(ctx, 0, sizeof(*ctx));
- return r;
-}
-
-u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
-{
- SIPHASH_CTX ctx;
-
- SipHash_Init(&ctx, key);
- SipHash_Update(&ctx, rc, rf, src, len);
- return SipHash_End(&ctx, rc, rf);
-}
diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h
deleted file mode 100644
index 3dfaf34a43b2..000000000000
--- a/fs/bcachefs/siphash.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause */
-/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */
-/*-
- * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the author may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-/*
- * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions)
- * optimized for speed on short messages, returning a 64-bit hash/digest value.
- *
- * The number of rounds is selected by the macro family used:
- *  SipHash24_*() for the fast and reasonably strong version
- *  SipHash48_*() for the stronger version (half as fast)
- *
- * SIPHASH_CTX ctx;
- * SIPHASH_KEY key = { .k0 = ..., .k1 = ... };
- * SipHash24_Init(&ctx, &key);
- * SipHash24_Update(&ctx, pointer_to_string, length_of_string);
- * SipHash24_Final(output, &ctx);
- */
-
-#ifndef _SIPHASH_H_
-#define _SIPHASH_H_
-
-#include <linux/types.h>
-
-#define SIPHASH_BLOCK_LENGTH 8
-#define SIPHASH_KEY_LENGTH 16
-#define SIPHASH_DIGEST_LENGTH 8
-
-typedef struct _SIPHASH_CTX {
- u64 v[4];
- u8 buf[SIPHASH_BLOCK_LENGTH];
- u32 bytes;
-} SIPHASH_CTX;
-
-typedef struct {
- __le64 k0;
- __le64 k1;
-} SIPHASH_KEY;
-
-void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *);
-void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t);
-u64 SipHash_End(SIPHASH_CTX *, int, int);
-void SipHash_Final(void *, SIPHASH_CTX *, int, int);
-u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t);
-
-#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k))
-#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l))
-#define SipHash24_End(_d) SipHash_End((_d), 2, 4)
-#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4)
-#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l))
-
-#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k))
-#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l))
-#define SipHash48_End(_d) SipHash_End((_d), 4, 8)
-#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8)
-#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l))
-
-#endif /* _SIPHASH_H_ */
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
deleted file mode 100644
index 7c403427fbdb..000000000000
--- a/fs/bcachefs/six.c
+++ /dev/null
@@ -1,881 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/export.h>
-#include <linux/log2.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/rcupdate.h>
-#include <linux/sched.h>
-#include <linux/sched/clock.h>
-#include <linux/sched/rt.h>
-#include <linux/sched/task.h>
-#include <linux/slab.h>
-
-#include <trace/events/lock.h>
-
-#include "six.h"
-
-#ifdef DEBUG
-#define EBUG_ON(cond) BUG_ON(cond)
-#else
-#define EBUG_ON(cond) do {} while (0)
-#endif
-
-#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip)
-#define six_release(l, ip) lock_release(l, ip)
-
-static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type);
-
-#define SIX_LOCK_HELD_read_OFFSET 0
-#define SIX_LOCK_HELD_read ~(~0U << 26)
-#define SIX_LOCK_HELD_intent (1U << 26)
-#define SIX_LOCK_HELD_write (1U << 27)
-#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read))
-#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write))
-#define SIX_LOCK_NOSPIN (1U << 31)
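-
-/*
- * Layout of the 32-bit lock state implied by the masks above:
- *
- *   bits  0-25  reader count (when not using percpu readers)
- *   bit     26  held for intent
- *   bit     27  held for write
- *   bits 28-30  per-type waitlist flags (SIX_LOCK_WAITING_read << type)
- *   bit     31  NOSPIN: optimistic spinning disabled
- */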
-
-struct six_lock_vals {
- /* Value we add to the lock in order to take the lock: */
- u32 lock_val;
-
- /* If the lock has this value (used as a mask), taking the lock fails: */
- u32 lock_fail;
-
- /* Mask that indicates lock is held for this type: */
- u32 held_mask;
-
- /* Waitlist we wakeup when releasing the lock: */
- enum six_lock_type unlock_wakeup;
-};
-
-static const struct six_lock_vals l[] = {
- [SIX_LOCK_read] = {
- .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET,
- .lock_fail = SIX_LOCK_HELD_write,
- .held_mask = SIX_LOCK_HELD_read,
- .unlock_wakeup = SIX_LOCK_write,
- },
- [SIX_LOCK_intent] = {
- .lock_val = SIX_LOCK_HELD_intent,
- .lock_fail = SIX_LOCK_HELD_intent,
- .held_mask = SIX_LOCK_HELD_intent,
- .unlock_wakeup = SIX_LOCK_intent,
- },
- [SIX_LOCK_write] = {
- .lock_val = SIX_LOCK_HELD_write,
- .lock_fail = SIX_LOCK_HELD_read,
- .held_mask = SIX_LOCK_HELD_write,
- .unlock_wakeup = SIX_LOCK_read,
- },
-};
-
-static inline void six_set_bitmask(struct six_lock *lock, u32 mask)
-{
- if ((atomic_read(&lock->state) & mask) != mask)
- atomic_or(mask, &lock->state);
-}
-
-static inline void six_clear_bitmask(struct six_lock *lock, u32 mask)
-{
- if (atomic_read(&lock->state) & mask)
- atomic_and(~mask, &lock->state);
-}
-
-static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
- u32 old, struct task_struct *owner)
-{
- if (type != SIX_LOCK_intent)
- return;
-
- if (!(old & SIX_LOCK_HELD_intent)) {
- EBUG_ON(lock->owner);
- lock->owner = owner;
- } else {
- EBUG_ON(lock->owner != current);
- }
-}
-
-static inline unsigned pcpu_read_count(struct six_lock *lock)
-{
- unsigned read_count = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- read_count += *per_cpu_ptr(lock->readers, cpu);
- return read_count;
-}
-
-/*
- * __do_six_trylock() - main trylock routine
- *
- * Returns 1 on success, 0 on failure
- *
- * In percpu reader mode, a failed trylock may cause a spurious trylock failure
- * for another thread taking the competing lock type, and we may have to do a
- * wakeup: when a wakeup is required, we return -1 - wakeup_type.
- */
-static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
- struct task_struct *task, bool try)
-{
- int ret;
- u32 old;
-
- EBUG_ON(type == SIX_LOCK_write && lock->owner != task);
- EBUG_ON(type == SIX_LOCK_write &&
- (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write)));
-
- /*
- * Percpu reader mode:
- *
- * The basic idea behind this algorithm is that you can implement a lock
- * between two threads without any atomics, just memory barriers:
- *
- * For two threads you'll need two variables, one variable for "thread a
- * has the lock" and another for "thread b has the lock".
- *
- * To take the lock, a thread sets its variable indicating that it holds
- * the lock, then issues a full memory barrier, then reads from the
- * other thread's variable to check if the other thread thinks it has
- * the lock. If we raced, we backoff and retry/sleep.
- *
- * Failure to take the lock may cause a spurious trylock failure in
- * another thread, because we temporarily set the lock to indicate that
- * we held it. This would be a problem for a thread in six_lock() that is
- * calling trylock after adding itself to the waitlist, prior to sleeping.
- *
- * Therefore, if we fail to get the lock, and there were waiters of the
- * type we conflict with, we will have to issue a wakeup.
- *
- * Since we may be called under wait_lock (and by the wakeup code
- * itself), we return that the wakeup has to be done instead of doing it
- * here.
- */
- if (type == SIX_LOCK_read && lock->readers) {
- preempt_disable();
- this_cpu_inc(*lock->readers); /* signal that we own lock */
-
- smp_mb();
-
- old = atomic_read(&lock->state);
- ret = !(old & l[type].lock_fail);
-
- this_cpu_sub(*lock->readers, !ret);
- preempt_enable();
-
- if (!ret) {
- smp_mb();
- if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write)
- ret = -1 - SIX_LOCK_write;
- }
- } else if (type == SIX_LOCK_write && lock->readers) {
- if (try)
- atomic_add(SIX_LOCK_HELD_write, &lock->state);
-
- /*
- * Make sure atomic_add happens before pcpu_read_count and
- * six_set_bitmask in slow path happens before pcpu_read_count.
- *
- * Paired with the smp_mb() in read lock fast path (per-cpu mode)
- * and the one before atomic_read in read unlock path.
- */
- smp_mb();
- ret = !pcpu_read_count(lock);
-
- if (try && !ret) {
- old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state);
- if (old & SIX_LOCK_WAITING_read)
- ret = -1 - SIX_LOCK_read;
- }
- } else {
- old = atomic_read(&lock->state);
- do {
- ret = !(old & l[type].lock_fail);
- if (!ret || (type == SIX_LOCK_write && !try)) {
- smp_mb();
- break;
- }
- } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val));
-
- EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask));
- }
-
- if (ret > 0)
- six_set_owner(lock, type, old, task);
-
- EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 &&
- (atomic_read(&lock->state) & SIX_LOCK_HELD_write));
-
- return ret;
-}
-
-static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type)
-{
- struct six_lock_waiter *w, *next;
- struct task_struct *task;
- bool saw_one;
- int ret;
-again:
- ret = 0;
- saw_one = false;
- raw_spin_lock(&lock->wait_lock);
-
- list_for_each_entry_safe(w, next, &lock->wait_list, list) {
- if (w->lock_want != lock_type)
- continue;
-
- if (saw_one && lock_type != SIX_LOCK_read)
- goto unlock;
- saw_one = true;
-
- ret = __do_six_trylock(lock, lock_type, w->task, false);
- if (ret <= 0)
- goto unlock;
-
- /*
- * Similar to percpu_rwsem_wake_function(), we need to guard
- * against the wakee noticing w->lock_acquired, returning, and
- * then exiting before we do the wakeup:
- */
- task = get_task_struct(w->task);
- __list_del(w->list.prev, w->list.next);
- /*
- * The release barrier here ensures the ordering of the
- * __list_del before setting w->lock_acquired; @w is on the
- * stack of the thread doing the waiting and will be reused
- * after it sees w->lock_acquired with no other locking:
- * pairs with smp_load_acquire() in six_lock_slowpath()
- */
- smp_store_release(&w->lock_acquired, true);
- wake_up_process(task);
- put_task_struct(task);
- }
-
- six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type);
-unlock:
- raw_spin_unlock(&lock->wait_lock);
-
- if (ret < 0) {
- lock_type = -ret - 1;
- goto again;
- }
-}
-
-__always_inline
-static void six_lock_wakeup(struct six_lock *lock, u32 state,
- enum six_lock_type lock_type)
-{
- if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read))
- return;
-
- if (!(state & (SIX_LOCK_WAITING_read << lock_type)))
- return;
-
- __six_lock_wakeup(lock, lock_type);
-}
-
-__always_inline
-static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try)
-{
- int ret;
-
- ret = __do_six_trylock(lock, type, current, try);
- if (ret < 0)
- __six_lock_wakeup(lock, -ret - 1);
-
- return ret > 0;
-}
-
-/**
- * six_trylock_ip - attempt to take a six lock without blocking
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * Return: true on success, false on failure.
- */
-bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
-{
- if (!do_six_trylock(lock, type, true))
- return false;
-
- if (type != SIX_LOCK_write)
- six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
- return true;
-}
-EXPORT_SYMBOL_GPL(six_trylock_ip);
-
-/**
- * six_relock_ip - attempt to re-take a lock that was held previously
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @seq: lock sequence number obtained from six_lock_seq() while lock was
- * held previously
- * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * Return: true on success, false on failure.
- */
-bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
- unsigned seq, unsigned long ip)
-{
- if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip))
- return false;
-
- if (six_lock_seq(lock) != seq) {
- six_unlock_ip(lock, type, ip);
- return false;
- }
-
- return true;
-}
-EXPORT_SYMBOL_GPL(six_relock_ip);
-
-#ifdef CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN
-
-static inline bool six_owner_running(struct six_lock *lock)
-{
- /*
- * When there's no owner, we might have preempted the owner between it
- * acquiring the lock and setting the owner field. If we're an RT task,
- * spinning will live-lock, because we won't let the owner complete.
- */
- rcu_read_lock();
- struct task_struct *owner = READ_ONCE(lock->owner);
- bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current);
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline bool six_optimistic_spin(struct six_lock *lock,
- struct six_lock_waiter *wait,
- enum six_lock_type type)
-{
- unsigned loop = 0;
- u64 end_time;
-
- if (type == SIX_LOCK_write)
- return false;
-
- if (lock->wait_list.next != &wait->list)
- return false;
-
- if (atomic_read(&lock->state) & SIX_LOCK_NOSPIN)
- return false;
-
- preempt_disable();
- end_time = sched_clock() + 10 * NSEC_PER_USEC;
-
- while (!need_resched() && six_owner_running(lock)) {
- /*
- * Ensures that writes to the waitlist entry happen after we see
- * wait->lock_acquired: pairs with the smp_store_release in
- * __six_lock_wakeup
- */
- if (smp_load_acquire(&wait->lock_acquired)) {
- preempt_enable();
- return true;
- }
-
- if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
- six_set_bitmask(lock, SIX_LOCK_NOSPIN);
- break;
- }
-
- /*
- * The cpu_relax() call is a compiler barrier which forces
- * everything in this loop to be re-loaded. We don't need
- * memory barriers as we'll eventually observe the right
- * values at the cost of a few extra spins.
- */
- cpu_relax();
- }
-
- preempt_enable();
- return false;
-}
-
-#else /* !CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN */
-
-static inline bool six_optimistic_spin(struct six_lock *lock,
- struct six_lock_waiter *wait,
- enum six_lock_type type)
-{
- return false;
-}
-
-#endif
-
-noinline
-static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
- struct six_lock_waiter *wait,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip)
-{
- int ret = 0;
-
- if (type == SIX_LOCK_write) {
- EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
- atomic_add(SIX_LOCK_HELD_write, &lock->state);
- smp_mb__after_atomic();
- }
-
- trace_contention_begin(lock, 0);
- lock_contended(&lock->dep_map, ip);
-
- wait->task = current;
- wait->lock_want = type;
- wait->lock_acquired = false;
-
- raw_spin_lock(&lock->wait_lock);
- six_set_bitmask(lock, SIX_LOCK_WAITING_read << type);
- /*
- * Retry taking the lock after taking waitlist lock, in case we raced
- * with an unlock:
- */
- ret = __do_six_trylock(lock, type, current, false);
- if (ret <= 0) {
- wait->start_time = local_clock();
-
- if (!list_empty(&lock->wait_list)) {
- struct six_lock_waiter *last =
- list_last_entry(&lock->wait_list,
- struct six_lock_waiter, list);
-
- if (time_before_eq64(wait->start_time, last->start_time))
- wait->start_time = last->start_time + 1;
- }
-
- list_add_tail(&wait->list, &lock->wait_list);
- }
- raw_spin_unlock(&lock->wait_lock);
-
- if (unlikely(ret > 0)) {
- ret = 0;
- goto out;
- }
-
- if (unlikely(ret < 0)) {
- __six_lock_wakeup(lock, -ret - 1);
- ret = 0;
- }
-
- if (six_optimistic_spin(lock, wait, type))
- goto out;
-
- while (1) {
- set_current_state(TASK_UNINTERRUPTIBLE);
-
- /*
- * Ensures that writes to the waitlist entry happen after we see
- * wait->lock_acquired: pairs with the smp_store_release in
- * __six_lock_wakeup
- */
- if (smp_load_acquire(&wait->lock_acquired))
- break;
-
- ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
- if (unlikely(ret)) {
- bool acquired;
-
- /*
- * If should_sleep_fn() returns an error, we are
- * required to return that error even if we already
- * acquired the lock - should_sleep_fn() might have
- * modified external state (e.g. when the deadlock cycle
- * detector in bcachefs issued a transaction restart)
- */
- raw_spin_lock(&lock->wait_lock);
- acquired = wait->lock_acquired;
- if (!acquired)
- list_del(&wait->list);
- raw_spin_unlock(&lock->wait_lock);
-
- if (unlikely(acquired)) {
- do_six_unlock_type(lock, type);
- } else if (type == SIX_LOCK_write) {
- six_clear_bitmask(lock, SIX_LOCK_HELD_write);
- six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read);
- }
- break;
- }
-
- schedule();
- }
-
- __set_current_state(TASK_RUNNING);
-out:
- trace_contention_end(lock, 0);
-
- return ret;
-}
-
-/**
- * six_lock_ip_waiter - take a lock, with full waitlist interface
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @wait: pointer to wait object, which will be added to lock's waitlist
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- * to scheduling
- * @p: passed through to @should_sleep_fn
- * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * This is the most general six_lock() variant, with parameters to support full
- * cycle detection for deadlock avoidance.
- *
- * The code calling this function must implement tracking of held locks, and the
- * @wait object should be embedded into the struct that tracks held locks -
- * which must also be accessible in a thread-safe way.
- *
- * @should_sleep_fn should invoke the cycle detector; it should walk each
- * lock's waiters, and for each waiter recursively walk their held locks.
- *
- * When this function must block, @wait will be added to @lock's waitlist before
- * calling trylock, and before calling @should_sleep_fn, and @wait will not be
- * removed from the lock waitlist until the lock has been successfully acquired,
- * or we abort.
- *
- * @wait.start_time will be monotonically increasing for any given waitlist, and
- * thus may be used as a loop cursor.
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
- struct six_lock_waiter *wait,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip)
-{
- int ret;
-
- wait->start_time = 0;
-
- if (type != SIX_LOCK_write)
- six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip);
-
- ret = do_six_trylock(lock, type, true) ? 0
- : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip);
-
- if (ret && type != SIX_LOCK_write)
- six_release(&lock->dep_map, ip);
- if (!ret)
- lock_acquired(&lock->dep_map, ip);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(six_lock_ip_waiter);
-
-__always_inline
-static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
-{
- u32 state;
-
- if (type == SIX_LOCK_intent)
- lock->owner = NULL;
-
- if (type == SIX_LOCK_read &&
- lock->readers) {
- smp_mb(); /* unlock barrier */
- this_cpu_dec(*lock->readers);
- smp_mb(); /* between unlocking and checking for waiters */
- state = atomic_read(&lock->state);
- } else {
- u32 v = l[type].lock_val;
-
- if (type != SIX_LOCK_read)
- v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN;
-
- EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask));
- state = atomic_sub_return_release(v, &lock->state);
- }
-
- six_lock_wakeup(lock, state, l[type].unlock_wakeup);
-}
-
-/**
- * six_unlock_ip - drop a six lock
- * @lock: lock to unlock
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * When a lock is held multiple times (because six_lock_increment() was used),
- * this decrements the 'lock held' counter by one.
- *
- * For example:
- * six_lock_read(&foo->lock); read count 1
- * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2
- * six_unlock_type(&foo->lock, SIX_LOCK_read); read count 1
- * six_unlock_type(&foo->lock, SIX_LOCK_read); read count 0
- */
-void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
-{
- EBUG_ON(type == SIX_LOCK_write &&
- !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
- EBUG_ON((type == SIX_LOCK_write ||
- type == SIX_LOCK_intent) &&
- lock->owner != current);
-
- if (type != SIX_LOCK_write)
- six_release(&lock->dep_map, ip);
-
- if (type == SIX_LOCK_intent &&
- lock->intent_lock_recurse) {
- --lock->intent_lock_recurse;
- return;
- }
-
- if (type == SIX_LOCK_write &&
- lock->write_lock_recurse) {
- --lock->write_lock_recurse;
- return;
- }
-
- if (type == SIX_LOCK_write)
- lock->seq++;
-
- do_six_unlock_type(lock, type);
-}
-EXPORT_SYMBOL_GPL(six_unlock_ip);
-
-/**
- * six_lock_downgrade - convert an intent lock to a read lock
- * @lock: lock to downgrade
- *
- * @lock will have read count incremented and intent count decremented
- */
-void six_lock_downgrade(struct six_lock *lock)
-{
- six_lock_increment(lock, SIX_LOCK_read);
- six_unlock_intent(lock);
-}
-EXPORT_SYMBOL_GPL(six_lock_downgrade);
-
-/**
- * six_lock_tryupgrade - attempt to convert read lock to an intent lock
- * @lock: lock to upgrade
- *
- * On success, @lock will have intent count incremented and read count
- * decremented
- *
- * Return: true on success, false on failure
- */
-bool six_lock_tryupgrade(struct six_lock *lock)
-{
- u32 old = atomic_read(&lock->state), new;
-
- do {
- new = old;
-
- if (new & SIX_LOCK_HELD_intent)
- return false;
-
- if (!lock->readers) {
- EBUG_ON(!(new & SIX_LOCK_HELD_read));
- new -= l[SIX_LOCK_read].lock_val;
- }
-
- new |= SIX_LOCK_HELD_intent;
- } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new));
-
- if (lock->readers)
- this_cpu_dec(*lock->readers);
-
- six_set_owner(lock, SIX_LOCK_intent, old, current);
-
- return true;
-}
-EXPORT_SYMBOL_GPL(six_lock_tryupgrade);
-
-/**
- * six_trylock_convert - attempt to convert a held lock from one type to another
- * @lock: lock to upgrade
- * @from: SIX_LOCK_read or SIX_LOCK_intent
- * @to: SIX_LOCK_read or SIX_LOCK_intent
- *
- * On success, the lock held on @lock will have been converted from @from to
- * @to.
- *
- * Return: true on success, false on failure
- */
-bool six_trylock_convert(struct six_lock *lock,
- enum six_lock_type from,
- enum six_lock_type to)
-{
- EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write);
-
- if (to == from)
- return true;
-
- if (to == SIX_LOCK_read) {
- six_lock_downgrade(lock);
- return true;
- } else {
- return six_lock_tryupgrade(lock);
- }
-}
-EXPORT_SYMBOL_GPL(six_trylock_convert);
-
-/**
- * six_lock_increment - increase held lock count on a lock that is already held
- * @lock: lock to increment
- * @type: SIX_LOCK_read or SIX_LOCK_intent
- *
- * @lock must already be held, with a lock type that is greater than or equal to
- * @type
- *
- * A corresponding six_unlock_type() call will be required for @lock to be fully
- * unlocked.
- */
-void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
-{
- six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_);
-
- /* XXX: assert already locked, and that we don't overflow: */
-
- switch (type) {
- case SIX_LOCK_read:
- if (lock->readers) {
- this_cpu_inc(*lock->readers);
- } else {
- EBUG_ON(!(atomic_read(&lock->state) &
- (SIX_LOCK_HELD_read|
- SIX_LOCK_HELD_intent)));
- atomic_add(l[type].lock_val, &lock->state);
- }
- break;
- case SIX_LOCK_write:
- lock->write_lock_recurse++;
- fallthrough;
- case SIX_LOCK_intent:
- EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
- lock->intent_lock_recurse++;
- break;
- }
-}
-EXPORT_SYMBOL_GPL(six_lock_increment);
-
-/**
- * six_lock_wakeup_all - wake up all waiters on @lock
- * @lock: lock to wake up waiters for
- *
- * Waking up waiters will cause them to re-run should_sleep_fn, which may then
- * abort the lock operation.
- *
- * This function is never needed in a bug-free program; it's only useful in
- * debug code, e.g. to determine if a cycle detector is at fault.
- */
-void six_lock_wakeup_all(struct six_lock *lock)
-{
- u32 state = atomic_read(&lock->state);
- struct six_lock_waiter *w;
-
- six_lock_wakeup(lock, state, SIX_LOCK_read);
- six_lock_wakeup(lock, state, SIX_LOCK_intent);
- six_lock_wakeup(lock, state, SIX_LOCK_write);
-
- raw_spin_lock(&lock->wait_lock);
- list_for_each_entry(w, &lock->wait_list, list)
- wake_up_process(w->task);
- raw_spin_unlock(&lock->wait_lock);
-}
-EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
-
-/**
- * six_lock_counts - return held lock counts, for each lock type
- * @lock: lock to return counters for
- *
- * Return: the number of times a lock is held for read, intent and write.
- */
-struct six_lock_count six_lock_counts(struct six_lock *lock)
-{
- struct six_lock_count ret;
-
- ret.n[SIX_LOCK_read] = !lock->readers
- ? atomic_read(&lock->state) & SIX_LOCK_HELD_read
- : pcpu_read_count(lock);
- ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) +
- lock->intent_lock_recurse;
- ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(six_lock_counts);
-
-/**
- * six_lock_readers_add - directly manipulate reader count of a lock
- * @lock: lock to add/subtract readers for
- * @nr: reader count to add/subtract
- *
- * When an upper layer is implementing lock reentrancy, we may have both read
- * and intent locks on the same lock.
- *
- * When we need to take a write lock, the read locks will cause self-deadlock,
- * because six locks themselves do not track which read locks are held by the
- * current thread and which are held by a different thread - they do no
- * per-thread tracking of held locks.
- *
- * The upper layer that is tracking held locks may, however, if trylock() has
- * failed, count up its own read locks, subtract them, take the write lock, and
- * then re-add them.
- *
- * As in any other situation when taking a write lock, @lock must be held for
- * intent one (or more) times, so @lock will never be left unlocked.
- */
-void six_lock_readers_add(struct six_lock *lock, int nr)
-{
- if (lock->readers) {
- this_cpu_add(*lock->readers, nr);
- } else {
- EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0);
- /* reader count starts at bit 0 */
- atomic_add(nr, &lock->state);
- }
-}
-EXPORT_SYMBOL_GPL(six_lock_readers_add);
-
-/**
- * six_lock_exit - release resources held by a lock prior to freeing
- * @lock: lock to exit
- *
- * When a lock was initialized in percpu mode (SIX_LOCK_INIT_PCPU), this is
- * required to free the percpu read counts.
- */
-void six_lock_exit(struct six_lock *lock)
-{
- WARN_ON(lock->readers && pcpu_read_count(lock));
- WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read);
-
- free_percpu(lock->readers);
- lock->readers = NULL;
-}
-EXPORT_SYMBOL_GPL(six_lock_exit);
-
-void __six_lock_init(struct six_lock *lock, const char *name,
- struct lock_class_key *key, enum six_lock_init_flags flags,
- gfp_t gfp)
-{
- atomic_set(&lock->state, 0);
- raw_spin_lock_init(&lock->wait_lock);
- INIT_LIST_HEAD(&lock->wait_list);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- debug_check_no_locks_freed((void *) lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key, 0);
-#endif
-
- /*
- * Don't assume that we have real percpu variables available in
- * userspace:
- */
-#ifdef __KERNEL__
- if (flags & SIX_LOCK_INIT_PCPU) {
- /*
- * We don't return an error here on memory allocation failure
- * since percpu is an optimization, and locks will work with the
- * same semantics in non-percpu mode: callers can check for
- * failure if they wish by checking lock->readers, but generally
- * will not want to treat it as an error.
- */
- lock->readers = alloc_percpu_gfp(unsigned, gfp);
- }
-#endif
-}
-EXPORT_SYMBOL_GPL(__six_lock_init);
diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h
deleted file mode 100644
index 59b851cf8bac..000000000000
--- a/fs/bcachefs/six.h
+++ /dev/null
@@ -1,388 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _LINUX_SIX_H
-#define _LINUX_SIX_H
-
-/**
- * DOC: SIX locks overview
- *
- * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores
- * but with an additional state: read/shared, intent, exclusive/write
- *
- * The purpose of the intent state is to allow for greater concurrency on tree
- * structures without deadlocking. In general, a read can't be upgraded to a
- * write lock without deadlocking, so an operation that updates multiple nodes
- * will have to take write locks for the full duration of the operation.
- *
- * But by adding an intent state, which is exclusive with other intent locks but
- * not with readers, we can take intent locks at the start of the operation,
- * and then take write locks only for the actual update to each individual
- * node, without deadlocking.
- *
- * Example usage:
- * six_lock_read(&foo->lock);
- * six_unlock_read(&foo->lock);
- *
- * An intent lock must be held before taking a write lock:
- * six_lock_intent(&foo->lock);
- * six_lock_write(&foo->lock);
- * six_unlock_write(&foo->lock);
- * six_unlock_intent(&foo->lock);
- *
- * Other operations:
- * six_trylock_read()
- * six_trylock_intent()
- * six_trylock_write()
- *
- * six_lock_downgrade() convert from intent to read
- * six_lock_tryupgrade() attempt to convert from read to intent, may fail
- *
- * There are also interfaces that take the lock type as an enum:
- *
- * six_lock_type(&foo->lock, SIX_LOCK_read);
- * six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent)
- * six_lock_type(&foo->lock, SIX_LOCK_write);
- * six_unlock_type(&foo->lock, SIX_LOCK_write);
- * six_unlock_type(&foo->lock, SIX_LOCK_intent);
- *
- * Lock sequence numbers - unlock(), relock():
- *
- * Locks embed sequence numbers, which are incremented on write lock/unlock.
- * This allows locks to be dropped and then retaken iff the state they protect
- * hasn't changed; this makes it much easier to avoid holding locks while e.g.
- * doing IO or allocating memory.
- *
- * Example usage:
- * six_lock_read(&foo->lock);
- * u32 seq = six_lock_seq(&foo->lock);
- * six_unlock_read(&foo->lock);
- *
- * some_operation_that_may_block();
- *
- * if (six_relock_read(&foo->lock, seq)) { ... }
- *
- * If the relock operation succeeds, it is as if the lock was never unlocked.
- *
- * Reentrancy:
- *
- * Six locks are not by themselves reentrant, but have counters for both the
- * read and intent states that can be used to provide reentrancy by an upper
- * layer that tracks held locks. If a lock is known to already be held in the
- * read or intent state, six_lock_increment() can be used to bump the "lock
- * held in this state" counter, increasing the number of unlock calls that
- * will be required to fully unlock it.
- *
- * Example usage:
- * six_lock_read(&foo->lock);
- * six_lock_increment(&foo->lock, SIX_LOCK_read);
- * six_unlock_read(&foo->lock);
- * six_unlock_read(&foo->lock);
- * foo->lock is now fully unlocked.
- *
- * Since the intent state supersedes read, it's legal to increment the read
- * counter when holding an intent lock, but not the reverse.
- *
- * A write lock may also be taken recursively with six_lock_increment(..,
- * SIX_LOCK_write); this bumps a write recursion counter, and each increment
- * requires a matching unlock.
- *
- * should_sleep_fn:
- *
- * There is a six_lock() variant that takes a function pointer that is called
- * immediately prior to schedule() when blocking, and may return an error to
- * abort.
- *
- * One possible use for this feature is when objects being locked are part of
- * a cache and may be reused, and lock ordering is based on a property of the
- * object that will change when the object is reused - i.e. logical key order.
- *
- * If looking up an object in the cache may race with object reuse, and lock
- * ordering is required to prevent deadlock, object reuse may change the
- * correct lock order for that object and cause a deadlock. should_sleep_fn
- * can be used to check if the object is still the object we want and avoid
- * this deadlock.
- *
- * Wait list entry interface:
- *
- * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a
- * wait list entry. By embedding six_lock_waiter into another object, and by
- * traversing lock waitlists, it is then possible for an upper layer to
- * implement full cycle detection for deadlock avoidance.
- *
- * should_sleep_fn should be used for invoking the cycle detector, walking the
- * graph of held locks to check for a deadlock. The upper layer must track
- * held locks for each thread, and each thread's held locks must be reachable
- * from its six_lock_waiter object.
- *
- * six_lock_waiter() will add the wait object to the waitlist before retrying
- * the lock and before calling should_sleep_fn, and the wait object will not
- * be removed from the waitlist until either the lock has been successfully
- * acquired, or we aborted because should_sleep_fn returned an error.
- *
- * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will
- * have timestamps in strictly ascending order - this is so the timestamp can
- * be used as a cursor for lock graph traversal.
- */
-
-#include <linux/lockdep.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-
-enum six_lock_type {
- SIX_LOCK_read,
- SIX_LOCK_intent,
- SIX_LOCK_write,
-};
-
-struct six_lock {
- atomic_t state;
- u32 seq;
- unsigned intent_lock_recurse;
- unsigned write_lock_recurse;
- struct task_struct *owner;
- unsigned __percpu *readers;
- raw_spinlock_t wait_lock;
- struct list_head wait_list;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
-#endif
-};
-
-struct six_lock_waiter {
- struct list_head list;
- struct task_struct *task;
- enum six_lock_type lock_want;
- bool lock_acquired;
- u64 start_time;
-};
-
-typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
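-
-/*
- * Minimal should_sleep_fn sketch (illustrative; struct foo and
- * foo_should_sleep are hypothetical): return nonzero to abort the lock
- * attempt, e.g. if the object being locked was reused while we slept:
- *
- *   static int foo_should_sleep(struct six_lock *lock, void *p)
- *   {
- *           struct foo *foo = p;
- *
- *           return lock == &foo->lock ? 0 : -ESTALE;
- *   }
- *
- *   ret = six_lock_read(&foo->lock, foo_should_sleep, foo);
- */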
-
-void six_lock_exit(struct six_lock *lock);
-
-enum six_lock_init_flags {
- SIX_LOCK_INIT_PCPU = 1U << 0,
-};
-
-void __six_lock_init(struct six_lock *lock, const char *name,
- struct lock_class_key *key, enum six_lock_init_flags flags,
- gfp_t gfp);
-
-/**
- * six_lock_init - initialize a six lock
- * @lock: lock to initialize
- * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU
- * @gfp: allocation flags for the percpu reader counts, used when
- *       SIX_LOCK_INIT_PCPU is set
- */
-#define six_lock_init(lock, flags, gfp) \
-do { \
- static struct lock_class_key __key; \
- \
- __six_lock_init((lock), #lock, &__key, flags, gfp); \
-} while (0)
-
-/**
- * six_lock_seq - obtain current lock sequence number
- * @lock: six_lock to obtain sequence number for
- *
- * @lock should be held for read or intent, and not write
- *
- * By saving the lock sequence number, we can unlock @lock and then (typically
- * after some blocking operation) attempt to relock it: the relock will succeed
- * if the sequence number hasn't changed, meaning no write locks have been taken
- * and state corresponding to what @lock protects is still valid.
- */
-static inline u32 six_lock_seq(const struct six_lock *lock)
-{
- return lock->seq;
-}
-
-bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
-
-/**
- * six_trylock_type - attempt to take a six lock without blocking
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- *
- * Return: true on success, false on failure.
- */
-static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
-{
- return six_trylock_ip(lock, type, _THIS_IP_);
-}
-
-int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
- struct six_lock_waiter *wait,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip);
-
-/**
- * six_lock_waiter - take a lock, with full waitlist interface
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @wait: pointer to wait object, which will be added to lock's waitlist
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- * to scheduling
- * @p: passed through to @should_sleep_fn
- *
- * This is a convenience wrapper around six_lock_ip_waiter(), see that function
- * for full documentation.
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type,
- struct six_lock_waiter *wait,
- six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
- return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_);
-}
-
-/**
- * six_lock_ip - take a six lock
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- * to scheduling
- * @p: passed through to @should_sleep_fn
- * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip)
-{
- struct six_lock_waiter wait;
-
- return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip);
-}
-
-/**
- * six_lock_type - take a six lock
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- * to scheduling
- * @p: passed through to @should_sleep_fn
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
- struct six_lock_waiter wait;
-
- return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_);
-}
-
-bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
- unsigned seq, unsigned long ip);
-
-/**
- * six_relock_type - attempt to re-take a lock that was held previously
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @seq: lock sequence number obtained from six_lock_seq() while lock was
- * held previously
- *
- * Return: true on success, false on failure.
- */
-static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
- unsigned seq)
-{
- return six_relock_ip(lock, type, seq, _THIS_IP_);
-}
-
-void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
-
-/**
- * six_unlock_type - drop a six lock
- * @lock: lock to unlock
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- *
- * When a lock is held multiple times (because six_lock_increment() was used),
- * this decrements the 'lock held' counter by one.
- *
- * For example:
- * six_lock_read(&foo->lock); read count 1
- * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2
- * six_unlock_type(&foo->lock, SIX_LOCK_read); read count 1
- * six_unlock_type(&foo->lock, SIX_LOCK_read); read count 0
- */
-static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
-{
- six_unlock_ip(lock, type, _THIS_IP_);
-}
-
-#define __SIX_LOCK(type) \
-static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\
-{ \
- return six_trylock_ip(lock, SIX_LOCK_##type, ip); \
-} \
- \
-static inline bool six_trylock_##type(struct six_lock *lock) \
-{ \
- return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \
-} \
- \
-static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \
- struct six_lock_waiter *wait, \
- six_lock_should_sleep_fn should_sleep_fn, void *p,\
- unsigned long ip) \
-{ \
- return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\
-} \
- \
-static inline int six_lock_ip_##type(struct six_lock *lock, \
- six_lock_should_sleep_fn should_sleep_fn, void *p, \
- unsigned long ip) \
-{ \
- return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\
-} \
- \
-static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\
-{ \
- return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \
-} \
- \
-static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \
-{ \
- return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \
-} \
- \
-static inline int six_lock_##type(struct six_lock *lock, \
- six_lock_should_sleep_fn fn, void *p)\
-{ \
- return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \
-} \
- \
-static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \
-{ \
- six_unlock_ip(lock, SIX_LOCK_##type, ip); \
-} \
- \
-static inline void six_unlock_##type(struct six_lock *lock) \
-{ \
- six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \
-}
-
-__SIX_LOCK(read)
-__SIX_LOCK(intent)
-__SIX_LOCK(write)
-#undef __SIX_LOCK
-
-void six_lock_downgrade(struct six_lock *);
-bool six_lock_tryupgrade(struct six_lock *);
-bool six_trylock_convert(struct six_lock *, enum six_lock_type,
- enum six_lock_type);
-
-void six_lock_increment(struct six_lock *, enum six_lock_type);
-
-void six_lock_wakeup_all(struct six_lock *);
-
-struct six_lock_count {
- unsigned n[3];
-};
-
-struct six_lock_count six_lock_counts(struct six_lock *);
-void six_lock_readers_add(struct six_lock *, int);
-
-#endif /* _LINUX_SIX_H */
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
deleted file mode 100644
index e7f197896db1..000000000000
--- a/fs/bcachefs/snapshot.c
+++ /dev/null
@@ -1,1749 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "errcode.h"
-#include "error.h"
-#include "fs.h"
-#include "recovery_passes.h"
-#include "snapshot.h"
-
-#include <linux/random.h>
-
-/*
- * Snapshot trees:
- *
- * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
- * exist to provide a stable identifier for the whole lifetime of a snapshot
- * tree.
- */
-
-void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
-
- prt_printf(out, "subvol %u root snapshot %u",
- le32_to_cpu(t.v->master_subvol),
- le32_to_cpu(t.v->root_snapshot));
-}
-
-int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
- bkey_lt(k.k->p, POS(0, 1)),
- c, snapshot_tree_pos_bad,
- "bad pos");
-fsck_err:
- return ret;
-}
-
-int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
- struct bch_snapshot_tree *s)
-{
- int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
- BTREE_ITER_with_updates, snapshot_tree, s);
-
- if (bch2_err_matches(ret, ENOENT))
- ret = -BCH_ERR_ENOENT_snapshot_tree;
- return ret;
-}
-
-struct bkey_i_snapshot_tree *
-__bch2_snapshot_tree_create(struct btree_trans *trans)
-{
- struct btree_iter iter;
- int ret = bch2_bkey_get_empty_slot(trans, &iter,
- BTREE_ID_snapshot_trees, POS(0, U32_MAX));
- struct bkey_i_snapshot_tree *s_t;
-
- if (ret == -BCH_ERR_ENOSPC_btree_slot)
- ret = -BCH_ERR_ENOSPC_snapshot_tree;
- if (ret)
- return ERR_PTR(ret);
-
- s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
- ret = PTR_ERR_OR_ZERO(s_t);
- bch2_trans_iter_exit(trans, &iter);
- return ret ? ERR_PTR(ret) : s_t;
-}
-
-static int bch2_snapshot_tree_create(struct btree_trans *trans,
- u32 root_id, u32 subvol_id, u32 *tree_id)
-{
- struct bkey_i_snapshot_tree *n_tree =
- __bch2_snapshot_tree_create(trans);
-
- if (IS_ERR(n_tree))
- return PTR_ERR(n_tree);
-
- n_tree->v.master_subvol = cpu_to_le32(subvol_id);
- n_tree->v.root_snapshot = cpu_to_le32(root_id);
- *tree_id = n_tree->k.p.offset;
- return 0;
-}
-
-/* Snapshot nodes: */
-
-static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
-{
- while (id && id < ancestor) {
- const struct snapshot_t *s = __snapshot_t(t, id);
- id = s ? s->parent : 0;
- }
- return id == ancestor;
-}
-
-static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
-{
- rcu_read_lock();
- bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
-{
- const struct snapshot_t *s = __snapshot_t(t, id);
- if (!s)
- return 0;
-
- if (s->skip[2] <= ancestor)
- return s->skip[2];
- if (s->skip[1] <= ancestor)
- return s->skip[1];
- if (s->skip[0] <= ancestor)
- return s->skip[0];
- return s->parent;
-}
-
-static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
-{
- const struct snapshot_t *s = __snapshot_t(t, id);
- if (!s)
- return false;
-
- return test_bit(ancestor - id - 1, s->is_ancestor);
-}
-
-bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
-{
- bool ret;
-
- rcu_read_lock();
- struct snapshot_table *t = rcu_dereference(c->snapshots);
-
- if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
- ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
- goto out;
- }
-
- if (likely(ancestor >= IS_ANCESTOR_BITMAP))
- while (id && id < ancestor - IS_ANCESTOR_BITMAP)
- id = get_ancestor_below(t, id, ancestor);
-
- ret = id && id < ancestor
- ? test_ancestor_bitmap(t, id, ancestor)
- : id == ancestor;
-
- EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
-out:
- rcu_read_unlock();
-
- return ret;
-}
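/*
 * A standalone sketch of the fast path above, with the RCU-protected
 * snapshot_table replaced by a hypothetical get_node() accessor and
 * BITMAP_BITS standing in for IS_ANCESTOR_BITMAP (whose actual value
 * isn't shown in this file):
 */
#include <stdbool.h>
#include <stdint.h>

#define BITMAP_BITS	128			/* assumed width */

struct node {
	uint32_t parent;
	uint32_t skip[3];			/* ascending links to ancestors */
	uint8_t  is_ancestor[BITMAP_BITS / 8];
};

extern const struct node *get_node(uint32_t id);	/* hypothetical lookup */

static bool is_ancestor(uint32_t id, uint32_t ancestor)
{
	/* phase 1: skiplist hops toward the root, until close to ancestor */
	while (id && ancestor >= BITMAP_BITS && id < ancestor - BITMAP_BITS) {
		const struct node *n = get_node(id);

		if (n->skip[2] <= ancestor)
			id = n->skip[2];
		else if (n->skip[1] <= ancestor)
			id = n->skip[1];
		else if (n->skip[0] <= ancestor)
			id = n->skip[0];
		else
			id = n->parent;
	}

	if (!id || id >= ancestor)
		return id == ancestor;

	/* phase 2: one bit per ancestor within the next BITMAP_BITS IDs */
	uint32_t bit = ancestor - id - 1;

	return get_node(id)->is_ancestor[bit / 8] & (1 << (bit % 8));
}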
-
-static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
-{
- size_t idx = U32_MAX - id;
- struct snapshot_table *new, *old;
-
- size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
- size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);
-
- if (unlikely(new_bytes > INT_MAX))
- return NULL;
-
- new = kvzalloc(new_bytes, GFP_KERNEL);
- if (!new)
- return NULL;
-
- new->nr = new_size;
-
- old = rcu_dereference_protected(c->snapshots, true);
- if (old)
- memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);
-
- rcu_assign_pointer(c->snapshots, new);
- kvfree_rcu(old, rcu);
-
- return &rcu_dereference_protected(c->snapshots,
- lockdep_is_held(&c->snapshot_table_lock))->s[idx];
-}
-
-static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
-{
- size_t idx = U32_MAX - id;
- struct snapshot_table *table =
- rcu_dereference_protected(c->snapshots,
- lockdep_is_held(&c->snapshot_table_lock));
-
- lockdep_assert_held(&c->snapshot_table_lock);
-
- if (likely(table && idx < table->nr))
- return &table->s[idx];
-
- return __snapshot_t_mut(c, id);
-}
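/*
 * The grow path above allocates a larger copy of the whole table and
 * republishes it; a rough userspace analogue of the copy-and-swap, with
 * the RCU publication reduced to a comment (table layout assumed):
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct table {
	size_t	 nr;
	uint32_t entry[];
};

static struct table *table_grow(struct table *old, size_t want)
{
	struct table *new = calloc(1, sizeof(*new) + want * sizeof(new->entry[0]));

	if (!new)
		return NULL;

	new->nr = want;
	if (old)
		memcpy(new->entry, old->entry, old->nr * sizeof(old->entry[0]));

	/* kernel: rcu_assign_pointer(c->snapshots, new); kvfree_rcu(old, rcu); */
	return new;
}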
-
-void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
-
- prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
- BCH_SNAPSHOT_SUBVOL(s.v),
- BCH_SNAPSHOT_DELETED(s.v),
- le32_to_cpu(s.v->parent),
- le32_to_cpu(s.v->children[0]),
- le32_to_cpu(s.v->children[1]),
- le32_to_cpu(s.v->subvol),
- le32_to_cpu(s.v->tree));
-
- if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
- prt_printf(out, " depth %u skiplist %u %u %u",
- le32_to_cpu(s.v->depth),
- le32_to_cpu(s.v->skip[0]),
- le32_to_cpu(s.v->skip[1]),
- le32_to_cpu(s.v->skip[2]));
-}
-
-int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_snapshot s;
- u32 i, id;
- int ret = 0;
-
- bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
- bkey_lt(k.k->p, POS(0, 1)),
- c, snapshot_pos_bad,
- "bad pos");
-
- s = bkey_s_c_to_snapshot(k);
-
- id = le32_to_cpu(s.v->parent);
- bkey_fsck_err_on(id && id <= k.k->p.offset,
- c, snapshot_parent_bad,
- "bad parent node (%u <= %llu)",
- id, k.k->p.offset);
-
- bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]),
- c, snapshot_children_not_normalized,
- "children not normalized");
-
- bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1],
- c, snapshot_child_duplicate,
- "duplicate child nodes");
-
- for (i = 0; i < 2; i++) {
- id = le32_to_cpu(s.v->children[i]);
-
- bkey_fsck_err_on(id >= k.k->p.offset,
- c, snapshot_child_bad,
- "bad child node (%u >= %llu)",
- id, k.k->p.offset);
- }
-
- if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
- bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
- le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]),
- c, snapshot_skiplist_not_normalized,
- "skiplist not normalized");
-
- for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
- id = le32_to_cpu(s.v->skip[i]);
-
- bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent),
- c, snapshot_skiplist_bad,
- "bad skiplist node %u", id);
- }
- }
-fsck_err:
- return ret;
-}
-
-static int __bch2_mark_snapshot(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct snapshot_t *t;
- u32 id = new.k->p.offset;
- int ret = 0;
-
- mutex_lock(&c->snapshot_table_lock);
-
- t = snapshot_t_mut(c, id);
- if (!t) {
- ret = -BCH_ERR_ENOMEM_mark_snapshot;
- goto err;
- }
-
- if (new.k->type == KEY_TYPE_snapshot) {
- struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
-
- t->live = true;
- t->parent = le32_to_cpu(s.v->parent);
- t->children[0] = le32_to_cpu(s.v->children[0]);
- t->children[1] = le32_to_cpu(s.v->children[1]);
- t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
- t->tree = le32_to_cpu(s.v->tree);
-
- if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
- t->depth = le32_to_cpu(s.v->depth);
- t->skip[0] = le32_to_cpu(s.v->skip[0]);
- t->skip[1] = le32_to_cpu(s.v->skip[1]);
- t->skip[2] = le32_to_cpu(s.v->skip[2]);
- } else {
- t->depth = 0;
- t->skip[0] = 0;
- t->skip[1] = 0;
- t->skip[2] = 0;
- }
-
- u32 parent = id;
-
- while ((parent = bch2_snapshot_parent_early(c, parent)) &&
- parent - id - 1 < IS_ANCESTOR_BITMAP)
- __set_bit(parent - id - 1, t->is_ancestor);
-
- if (BCH_SNAPSHOT_DELETED(s.v)) {
- set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
- bch2_delete_dead_snapshots_async(c);
- }
- } else {
- memset(t, 0, sizeof(*t));
- }
-err:
- mutex_unlock(&c->snapshot_table_lock);
- return ret;
-}
-
-int bch2_mark_snapshot(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
-}
-
-int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
- struct bch_snapshot *s)
-{
- return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
- BTREE_ITER_with_updates, snapshot, s);
-}
-
-/* fsck: */
-
-static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
-{
- return snapshot_t(c, id)->children[child];
-}
-
-static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
-{
- return bch2_snapshot_child(c, id, 0);
-}
-
-static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
-{
- return bch2_snapshot_child(c, id, 1);
-}
-
-static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
-{
- u32 n, parent;
-
- n = bch2_snapshot_left_child(c, id);
- if (n)
- return n;
-
- while ((parent = bch2_snapshot_parent(c, id))) {
- n = bch2_snapshot_right_child(c, parent);
- if (n && n != id)
- return n;
- id = parent;
- }
-
- return 0;
-}
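/*
 * bch2_snapshot_tree_next() is a pre-order successor walk that needs only
 * parent/child links; the same shape as a self-contained sketch, with a
 * hypothetical snap_get() in place of snapshot_t():
 */
#include <stdint.h>

struct snap { uint32_t parent, children[2]; };

extern const struct snap *snap_get(uint32_t id);	/* hypothetical lookup */

static uint32_t tree_next(uint32_t id)
{
	uint32_t n = snap_get(id)->children[0];
	if (n)
		return n;			/* descend first */

	/* otherwise climb until some parent has an unvisited second child */
	for (uint32_t p; (p = snap_get(id)->parent); id = p) {
		n = snap_get(p)->children[1];
		if (n && n != id)
			return n;
	}

	return 0;				/* back above the root: done */
}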
-
-u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
-{
- u32 id = snapshot_root;
- u32 subvol = 0, s;
-
- rcu_read_lock();
- while (id) {
- s = snapshot_t(c, id)->subvol;
-
- if (s && (!subvol || s < subvol))
- subvol = s;
-
- id = bch2_snapshot_tree_next(c, id);
- }
- rcu_read_unlock();
-
- return subvol;
-}
-
-static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
- u32 snapshot_root, u32 *subvol_id)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- bool found = false;
- int ret;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
- 0, k, ret) {
- if (k.k->type != KEY_TYPE_subvolume)
- continue;
-
- struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
- if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
- continue;
- if (!BCH_SUBVOLUME_SNAP(s.v)) {
- *subvol_id = s.k->p.offset;
- found = true;
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (!ret && !found) {
- struct bkey_i_subvolume *u;
-
- *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);
-
- u = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_subvolumes, POS(0, *subvol_id),
- 0, subvolume);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- return ret;
-
- SET_BCH_SUBVOLUME_SNAP(&u->v, false);
- }
-
- return ret;
-}
-
-static int check_snapshot_tree(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c_snapshot_tree st;
- struct bch_snapshot s;
- struct bch_subvolume subvol;
- struct printbuf buf = PRINTBUF;
- struct btree_iter snapshot_iter = {};
- u32 root_id;
- int ret;
-
- if (k.k->type != KEY_TYPE_snapshot_tree)
- return 0;
-
- st = bkey_s_c_to_snapshot_tree(k);
- root_id = le32_to_cpu(st.v->root_snapshot);
-
- struct bkey_s_c_snapshot snapshot_k =
- bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots,
- POS(0, root_id), 0, snapshot);
- ret = bkey_err(snapshot_k);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (!ret)
- bkey_val_copy(&s, snapshot_k);
-
- if (fsck_err_on(ret ||
- root_id != bch2_snapshot_root(c, root_id) ||
- st.k->p.offset != le32_to_cpu(s.tree),
- trans, snapshot_tree_to_missing_snapshot,
- "snapshot tree points to missing/incorrect snapshot:\n %s",
- (bch2_bkey_val_to_text(&buf, c, st.s_c),
- prt_newline(&buf),
- ret
- ? prt_printf(&buf, "(%s)", bch2_err_str(ret))
- : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c),
- buf.buf))) {
- ret = bch2_btree_delete_at(trans, iter, 0);
- goto err;
- }
-
- if (!st.v->master_subvol)
- goto out;
-
- ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (fsck_err_on(ret,
- trans, snapshot_tree_to_missing_subvol,
- "snapshot tree points to missing subvolume:\n %s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
- fsck_err_on(!bch2_snapshot_is_ancestor(c,
- le32_to_cpu(subvol.snapshot),
- root_id),
- trans, snapshot_tree_to_wrong_subvol,
- "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
- fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
- trans, snapshot_tree_to_snapshot_subvol,
- "snapshot tree points to snapshot subvolume:\n %s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
- struct bkey_i_snapshot_tree *u;
- u32 subvol_id;
-
- ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
- bch_err_fn(c, ret);
-
- if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
- ret = 0;
- goto err;
- }
-
- if (ret)
- goto err;
-
- u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- u->v.master_subvol = cpu_to_le32(subvol_id);
- st = snapshot_tree_i_to_s_c(u);
- }
-out:
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &snapshot_iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-/*
- * For each snapshot_tree, make sure it points to the root of a snapshot tree
- * and that the snapshot entry points back to it, or delete it.
- *
- * And, make sure it points to a subvolume within that snapshot tree, or correct
- * it to point to the oldest subvolume within that snapshot tree.
- */
-int bch2_check_snapshot_trees(struct bch_fs *c)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_snapshot_trees, POS_MIN,
- BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_snapshot_tree(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-/*
- * Look up snapshot tree for @tree_id and find root,
- * make sure @snap_id is a descendant:
- */
-static int snapshot_tree_ptr_good(struct btree_trans *trans,
- u32 snap_id, u32 tree_id)
-{
- struct bch_snapshot_tree s_t;
- int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
-
- if (bch2_err_matches(ret, ENOENT))
- return 0;
- if (ret)
- return ret;
-
- return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
-}
-
-u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
-{
- const struct snapshot_t *s;
-
- if (!id)
- return 0;
-
- rcu_read_lock();
- s = snapshot_t(c, id);
- if (s->parent)
- id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
- rcu_read_unlock();
-
- return id;
-}
-
-static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
-{
- unsigned i;
-
- for (i = 0; i < 3; i++)
- if (!s.parent) {
- if (s.skip[i])
- return false;
- } else {
- if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
- return false;
- }
-
- return true;
-}
-
-/*
- * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
- * its snapshot_tree pointer is correct (allocate new one if necessary), then
- * update this node's pointer to root node's pointer:
- */
-static int snapshot_tree_ptr_repair(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- struct bch_snapshot *s)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter root_iter;
- struct bch_snapshot_tree s_t;
- struct bkey_s_c_snapshot root;
- struct bkey_i_snapshot *u;
- u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
- int ret;
-
- root = bch2_bkey_get_iter_typed(trans, &root_iter,
- BTREE_ID_snapshots, POS(0, root_id),
- BTREE_ITER_with_updates, snapshot);
- ret = bkey_err(root);
- if (ret)
- goto err;
-
- tree_id = le32_to_cpu(root.v->tree);
-
- ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
- u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(u) ?:
- bch2_snapshot_tree_create(trans, root_id,
- bch2_snapshot_tree_oldest_subvol(c, root_id),
- &tree_id);
- if (ret)
- goto err;
-
- u->v.tree = cpu_to_le32(tree_id);
- if (k.k->p.offset == root_id)
- *s = u->v;
- }
-
- if (k.k->p.offset != root_id) {
- u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- u->v.tree = cpu_to_le32(tree_id);
- *s = u->v;
- }
-err:
- bch2_trans_iter_exit(trans, &root_iter);
- return ret;
-}
-
-static int check_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bch_snapshot s;
- struct bch_subvolume subvol;
- struct bch_snapshot v;
- struct bkey_i_snapshot *u;
- u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
- u32 real_depth;
- struct printbuf buf = PRINTBUF;
- u32 i, id;
- int ret = 0;
-
- if (k.k->type != KEY_TYPE_snapshot)
- return 0;
-
- memset(&s, 0, sizeof(s));
- memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));
-
- id = le32_to_cpu(s.parent);
- if (id) {
- ret = bch2_snapshot_lookup(trans, id, &v);
- if (bch2_err_matches(ret, ENOENT))
- bch_err(c, "snapshot with nonexistent parent:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- if (ret)
- goto err;
-
- if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
- le32_to_cpu(v.children[1]) != k.k->p.offset) {
- bch_err(c, "snapshot parent %u missing pointer to child %llu",
- id, k.k->p.offset);
- ret = -EINVAL;
- goto err;
- }
- }
-
- for (i = 0; i < 2 && s.children[i]; i++) {
- id = le32_to_cpu(s.children[i]);
-
- ret = bch2_snapshot_lookup(trans, id, &v);
- if (bch2_err_matches(ret, ENOENT))
- bch_err(c, "snapshot node %llu has nonexistent child %u",
- k.k->p.offset, id);
- if (ret)
- goto err;
-
- if (le32_to_cpu(v.parent) != k.k->p.offset) {
- bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
- id, le32_to_cpu(v.parent), k.k->p.offset);
- ret = -EINVAL;
- goto err;
- }
- }
-
- bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
- !BCH_SNAPSHOT_DELETED(&s);
-
- if (should_have_subvol) {
- id = le32_to_cpu(s.subvol);
- ret = bch2_subvolume_get(trans, id, false, &subvol);
- if (bch2_err_matches(ret, ENOENT))
- bch_err(c, "snapshot points to nonexistent subvolume:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- if (ret)
- goto err;
-
- if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
- bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
- k.k->p.offset);
- ret = -EINVAL;
- goto err;
- }
- } else {
- if (fsck_err_on(s.subvol,
- trans, snapshot_should_not_have_subvol,
- "snapshot should not point to subvol:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- u->v.subvol = 0;
- s = u->v;
- }
- }
-
- ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
- if (ret < 0)
- goto err;
-
- if (fsck_err_on(!ret,
- trans, snapshot_to_bad_snapshot_tree,
- "snapshot points to missing/incorrect tree:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
- if (ret)
- goto err;
- }
- ret = 0;
-
- real_depth = bch2_snapshot_depth(c, parent_id);
-
- if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
- trans, snapshot_bad_depth,
- "snapshot with incorrect depth field, should be %u:\n %s",
- real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- u->v.depth = cpu_to_le32(real_depth);
- s = u->v;
- }
-
- ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
- if (ret < 0)
- goto err;
-
- if (fsck_err_on(!ret,
- trans, snapshot_bad_skiplist,
- "snapshot with bad skiplist field:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
- u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));
-
- bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
- s = u->v;
- }
- ret = 0;
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_check_snapshots(struct bch_fs *c)
-{
- /*
- * We iterate backwards as checking/fixing the depth field requires that
- * the parent's depth already be correct:
- */
- int ret = bch2_trans_run(c,
- for_each_btree_key_reverse_commit(trans, iter,
- BTREE_ID_snapshots, POS_MAX,
- BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_snapshot(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
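/*
 * Concretely, the invariant being repaired is depth(node) == depth(parent) + 1,
 * with the root at depth 0; since a snapshot's ancestors always have larger
 * IDs, the descending-ID scan above visits each parent before its children,
 * so the parent's depth is already trustworthy when a child is checked.
 */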
-
-static int check_snapshot_exists(struct btree_trans *trans, u32 id)
-{
- struct bch_fs *c = trans->c;
-
- if (bch2_snapshot_exists(c, id))
- return 0;
-
- /* Do we need to reconstruct the snapshot_tree entry as well? */
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
- u32 tree_id = 0;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
- 0, k, ret) {
- if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
- tree_id = k.k->p.offset;
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- return ret;
-
- if (!tree_id) {
- ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
- if (ret)
- return ret;
- }
-
- struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
- ret = PTR_ERR_OR_ZERO(snapshot);
- if (ret)
- return ret;
-
- bkey_snapshot_init(&snapshot->k_i);
- snapshot->k.p = POS(0, id);
- snapshot->v.tree = cpu_to_le32(tree_id);
- snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c));
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
- 0, k, ret) {
- if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
- snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
- SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
- bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
- bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0);
-}
-
-/* Figure out which snapshot nodes belong in the same tree: */
-struct snapshot_tree_reconstruct {
- enum btree_id btree;
- struct bpos cur_pos;
- snapshot_id_list cur_ids;
- DARRAY(snapshot_id_list) trees;
-};
-
-static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
-{
- darray_for_each(r->trees, i)
- darray_exit(i);
- darray_exit(&r->trees);
- darray_exit(&r->cur_ids);
-}
-
-static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
-{
- return r->btree == BTREE_ID_inodes
- ? r->cur_pos.offset == pos.offset
- : r->cur_pos.inode == pos.inode;
-}
-
-static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
-{
- darray_for_each(*l, i)
- if (snapshot_list_has_id(r, *i))
- return true;
- return false;
-}
-
-static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
-{
- bool first = true;
- darray_for_each(*s, i) {
- if (!first)
- prt_char(out, ' ');
- first = false;
- prt_printf(out, "%u", *i);
- }
-}
-
-static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
-{
- if (r->cur_ids.nr) {
- darray_for_each(r->trees, i)
- if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
- int ret = snapshot_list_merge(c, i, &r->cur_ids);
- if (ret)
- return ret;
- goto out;
- }
- darray_push(&r->trees, r->cur_ids);
- darray_init(&r->cur_ids);
- }
-out:
- r->cur_ids.nr = 0;
- return 0;
-}
-
-static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
-{
- if (!same_snapshot(r, pos))
- snapshot_tree_reconstruct_next(c, r);
- r->cur_pos = pos;
- return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
-}
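/*
 * Worked example: if the scan sees keys whose (position, snapshot) pairs are
 * (A,4) (A,7) (B,7) (B,2) (C,9), positions A and B contribute the ID lists
 * {4,7} and {2,7}; they share ID 7, so they merge into one candidate tree
 * {2,4,7}, while C's {9} starts a second tree.
 */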
-
-int bch2_reconstruct_snapshots(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct printbuf buf = PRINTBUF;
- struct snapshot_tree_reconstruct r = {};
- int ret = 0;
-
- for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
- if (btree_type_has_snapshots(btree)) {
- r.btree = btree;
-
- ret = for_each_btree_key(trans, iter, btree, POS_MIN,
- BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
- get_snapshot_trees(c, &r, k.k->p);
- }));
- if (ret)
- goto err;
-
- snapshot_tree_reconstruct_next(c, &r);
- }
- }
-
- darray_for_each(r.trees, t) {
- printbuf_reset(&buf);
- snapshot_id_list_to_text(&buf, t);
-
- darray_for_each(*t, id) {
- if (fsck_err_on(!bch2_snapshot_exists(c, *id),
- trans, snapshot_node_missing,
- "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
- if (t->nr > 1) {
- bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
- ret = -BCH_ERR_fsck_repair_unimplemented;
- goto err;
- }
-
- ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_snapshot_exists(trans, *id));
- if (ret)
- goto err;
- }
- }
- }
-fsck_err:
-err:
- bch2_trans_put(trans);
- snapshot_tree_reconstruct_exit(&r);
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_check_key_has_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot),
- trans, bkey_in_missing_snapshot,
- "key in missing snapshot %s, delete?",
- (bch2_btree_id_to_text(&buf, iter->btree_id),
- prt_char(&buf, ' '),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- ret = bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_internal_snapshot_node) ?: 1;
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-/*
- * Mark a snapshot as deleted, for future cleanup:
- */
-int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
-{
- struct btree_iter iter;
- struct bkey_i_snapshot *s =
- bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_snapshots, POS(0, id),
- 0, snapshot);
- int ret = PTR_ERR_OR_ZERO(s);
- if (unlikely(ret)) {
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
- trans->c, "missing snapshot %u", id);
- return ret;
- }
-
- /* already deleted? */
- if (BCH_SNAPSHOT_DELETED(&s->v))
- goto err;
-
- SET_BCH_SNAPSHOT_DELETED(&s->v, true);
- SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
- s->v.subvol = 0;
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
-{
- if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
- swap(s->children[0], s->children[1]);
-}
-
-static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
- struct btree_iter c_iter = (struct btree_iter) { NULL };
- struct btree_iter tree_iter = (struct btree_iter) { NULL };
- struct bkey_s_c_snapshot s;
- u32 parent_id, child_id;
- unsigned i;
- int ret = 0;
-
- s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
- BTREE_ITER_intent, snapshot);
- ret = bkey_err(s);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "missing snapshot %u", id);
-
- if (ret)
- goto err;
-
- BUG_ON(s.v->children[1]);
-
- parent_id = le32_to_cpu(s.v->parent);
- child_id = le32_to_cpu(s.v->children[0]);
-
- if (parent_id) {
- struct bkey_i_snapshot *parent;
-
- parent = bch2_bkey_get_mut_typed(trans, &p_iter,
- BTREE_ID_snapshots, POS(0, parent_id),
- 0, snapshot);
- ret = PTR_ERR_OR_ZERO(parent);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "missing snapshot %u", parent_id);
- if (unlikely(ret))
- goto err;
-
- /* find entry in parent->children for node being deleted */
- for (i = 0; i < 2; i++)
- if (le32_to_cpu(parent->v.children[i]) == id)
- break;
-
- if (bch2_fs_inconsistent_on(i == 2, c,
- "snapshot %u missing child pointer to %u",
- parent_id, id))
- goto err;
-
- parent->v.children[i] = cpu_to_le32(child_id);
-
- normalize_snapshot_child_pointers(&parent->v);
- }
-
- if (child_id) {
- struct bkey_i_snapshot *child;
-
- child = bch2_bkey_get_mut_typed(trans, &c_iter,
- BTREE_ID_snapshots, POS(0, child_id),
- 0, snapshot);
- ret = PTR_ERR_OR_ZERO(child);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "missing snapshot %u", child_id);
- if (unlikely(ret))
- goto err;
-
- child->v.parent = cpu_to_le32(parent_id);
-
- if (!child->v.parent) {
- child->v.skip[0] = 0;
- child->v.skip[1] = 0;
- child->v.skip[2] = 0;
- }
- }
-
- if (!parent_id) {
- /*
- * We're deleting the root of a snapshot tree: update the
- * snapshot_tree entry to point to the new root, or delete it if
- * this is the last snapshot ID in this tree:
- */
- struct bkey_i_snapshot_tree *s_t;
-
- BUG_ON(s.v->children[1]);
-
- s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
- BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
- 0, snapshot_tree);
- ret = PTR_ERR_OR_ZERO(s_t);
- if (ret)
- goto err;
-
- if (s.v->children[0]) {
- s_t->v.root_snapshot = s.v->children[0];
- } else {
- s_t->k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&s_t->k, 0);
- }
- }
-
- ret = bch2_btree_delete_at(trans, &iter, 0);
-err:
- bch2_trans_iter_exit(trans, &tree_iter);
- bch2_trans_iter_exit(trans, &p_iter);
- bch2_trans_iter_exit(trans, &c_iter);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
- u32 *new_snapids,
- u32 *snapshot_subvols,
- unsigned nr_snapids)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_i_snapshot *n;
- struct bkey_s_c k;
- unsigned i, j;
- u32 depth = bch2_snapshot_depth(c, parent);
- int ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
- POS_MIN, BTREE_ITER_intent);
- k = bch2_btree_iter_peek(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- for (i = 0; i < nr_snapids; i++) {
- k = bch2_btree_iter_prev_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (!k.k || !k.k->p.offset) {
- ret = -BCH_ERR_ENOSPC_snapshot_create;
- goto err;
- }
-
- n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto err;
-
- n->v.flags = 0;
- n->v.parent = cpu_to_le32(parent);
- n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
- n->v.tree = cpu_to_le32(tree);
- n->v.depth = cpu_to_le32(depth);
- n->v.btime.lo = cpu_to_le64(bch2_current_time(c));
- n->v.btime.hi = 0;
-
- for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
- n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
-
- bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
- SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
-
- ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
- bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
- if (ret)
- goto err;
-
- new_snapids[i] = iter.pos.offset;
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/*
- * Create new snapshot IDs as children of an existing snapshot ID:
- */
-static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
- u32 *new_snapids,
- u32 *snapshot_subvols,
- unsigned nr_snapids)
-{
- struct btree_iter iter;
- struct bkey_i_snapshot *n_parent;
- int ret = 0;
-
- n_parent = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_snapshots, POS(0, parent),
- 0, snapshot);
- ret = PTR_ERR_OR_ZERO(n_parent);
- if (unlikely(ret)) {
- if (bch2_err_matches(ret, ENOENT))
- bch_err(trans->c, "snapshot %u not found", parent);
- return ret;
- }
-
- if (n_parent->v.children[0] || n_parent->v.children[1]) {
- bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
- ret = -EINVAL;
- goto err;
- }
-
- ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
- new_snapids, snapshot_subvols, nr_snapids);
- if (ret)
- goto err;
-
- n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
- n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
- n_parent->v.subvol = 0;
- SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/*
- * Create a snapshot node that is the root of a new tree:
- */
-static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
- u32 *new_snapids,
- u32 *snapshot_subvols,
- unsigned nr_snapids)
-{
- struct bkey_i_snapshot_tree *n_tree;
- int ret;
-
- n_tree = __bch2_snapshot_tree_create(trans);
- ret = PTR_ERR_OR_ZERO(n_tree) ?:
- create_snapids(trans, 0, n_tree->k.p.offset,
- new_snapids, snapshot_subvols, nr_snapids);
- if (ret)
- return ret;
-
- n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
- n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
- return 0;
-}
-
-int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
- u32 *new_snapids,
- u32 *snapshot_subvols,
- unsigned nr_snapids)
-{
- BUG_ON((parent == 0) != (nr_snapids == 1));
- BUG_ON((parent != 0) != (nr_snapids == 2));
-
- return parent
- ? bch2_snapshot_node_create_children(trans, parent,
- new_snapids, snapshot_subvols, nr_snapids)
- : bch2_snapshot_node_create_tree(trans,
- new_snapids, snapshot_subvols, nr_snapids);
-
-}
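/*
 * Calling convention, as enforced by the BUG_ON()s: parent == 0 means one
 * new ID (the root of a brand-new tree), nonzero parent means exactly two.
 * In the two-child case both children are created with BCH_SNAPSHOT_SUBVOL
 * set, and the parent's subvol pointer is cleared: the parent becomes a
 * pure interior node.
 */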
-
-/*
- * If we have an unlinked inode in an internal snapshot node, and the inode
- * really has been deleted in all child snapshots, how does this get cleaned up?
- *
- * first there is the problem of how keys that have been overwritten in all
- * child snapshots get deleted (unimplemented?), but inodes may perhaps be
- * special?
- *
- * also: an unlinked inode in an internal snapshot appears not to get deleted
- * correctly if the inode doesn't exist in leaf snapshots
- *
- * solution:
- *
- * for a key in an interior snapshot node that needs work to be done that
- * requires it to be mutated: iterate over all descendant leaf nodes and copy
- * that key to snapshot leaf nodes, where we can mutate it
- */
-
-struct snapshot_interior_delete {
- u32 id;
- u32 live_child;
-};
-typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;
-
-static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
-{
- darray_for_each(*l, i)
- if (i->id == id)
- return i->live_child;
- return 0;
-}
-
-static unsigned __live_child(struct snapshot_table *t, u32 id,
- snapshot_id_list *delete_leaves,
- interior_delete_list *delete_interior)
-{
- struct snapshot_t *s = __snapshot_t(t, id);
- if (!s)
- return 0;
-
- for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++)
- if (s->children[i] &&
- !snapshot_list_has_id(delete_leaves, s->children[i]) &&
- !interior_delete_has_id(delete_interior, s->children[i]))
- return s->children[i];
-
- for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) {
- u32 live_child = s->children[i]
- ? __live_child(t, s->children[i], delete_leaves, delete_interior)
- : 0;
- if (live_child)
- return live_child;
- }
-
- return 0;
-}
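/*
 * E.g. when interior node I has children {X, Y} and X is itself being
 * deleted, the first loop skips X and returns Y; if both children are dying,
 * the second loop recurses into them until a surviving descendant turns up.
 * A return of 0 means the entire subtree is going away.
 */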
-
-static unsigned live_child(struct bch_fs *c, u32 id,
- snapshot_id_list *delete_leaves,
- interior_delete_list *delete_interior)
-{
- rcu_read_lock();
- u32 ret = __live_child(rcu_dereference(c->snapshots), id,
- delete_leaves, delete_interior);
- rcu_read_unlock();
- return ret;
-}
-
-static int delete_dead_snapshots_process_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- snapshot_id_list *delete_leaves,
- interior_delete_list *delete_interior)
-{
- if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot))
- return bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_internal_snapshot_node);
-
- u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot);
- if (live_child) {
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
- int ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- return ret;
-
- new->k.p.snapshot = live_child;
-
- struct btree_iter dst_iter;
- struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter,
- iter->btree_id, new->k.p,
- BTREE_ITER_all_snapshots|
- BTREE_ITER_intent);
- ret = bkey_err(dst_k);
- if (ret)
- return ret;
-
- ret = (bkey_deleted(dst_k.k)
- ? bch2_trans_update(trans, &dst_iter, new,
- BTREE_UPDATE_internal_snapshot_node)
- : 0) ?:
- bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_internal_snapshot_node);
- bch2_trans_iter_exit(trans, &dst_iter);
- return ret;
- }
-
- return 0;
-}
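/*
 * So for a key at snapshot S: if S is a dying leaf, the key is simply
 * deleted; if S is a dying interior node with live child C, the key is
 * rewritten to snapshot C - unless C already has its own version at that
 * position, in which case the interior version is just dropped.
 */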
-
-/*
- * For a given snapshot, if it doesn't have a subvolume that points to it, and
- * it doesn't have child snapshot nodes - it's now redundant and we can mark it
- * as deleted.
- */
-static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k,
- snapshot_id_list *delete_leaves,
- interior_delete_list *delete_interior)
-{
- if (k.k->type != KEY_TYPE_snapshot)
- return 0;
-
- struct bch_fs *c = trans->c;
- struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
- unsigned live_children = 0;
-
- if (BCH_SNAPSHOT_SUBVOL(s.v))
- return 0;
-
- for (unsigned i = 0; i < 2; i++) {
- u32 child = le32_to_cpu(s.v->children[i]);
-
- live_children += child &&
- !snapshot_list_has_id(delete_leaves, child);
- }
-
- if (live_children == 0) {
- return snapshot_list_add(c, delete_leaves, s.k->p.offset);
- } else if (live_children == 1) {
- struct snapshot_interior_delete d = {
- .id = s.k->p.offset,
- .live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior),
- };
-
- if (!d.live_child) {
- bch_err(c, "error finding live child of snapshot %u", d.id);
- return -EINVAL;
- }
-
- return darray_push(delete_interior, d);
- } else {
- return 0;
- }
-}
-
-static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
- interior_delete_list *skip)
-{
- rcu_read_lock();
- while (interior_delete_has_id(skip, id))
- id = __bch2_snapshot_parent(c, id);
-
- while (n--) {
- do {
- id = __bch2_snapshot_parent(c, id);
- } while (interior_delete_has_id(skip, id));
- }
- rcu_read_unlock();
-
- return id;
-}
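/*
 * E.g. with parent chain 10 -> 20 -> 30 -> 40 where 20 and 30 are on the
 * interior-delete list: nth_parent_skip(10, 1) leaves 10 alone (it isn't
 * being deleted), then takes parents until it lands on a survivor,
 * returning 40.
 */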
-
-static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
- struct btree_iter *iter, struct bkey_s_c k,
- interior_delete_list *deleted)
-{
- struct bch_fs *c = trans->c;
- u32 nr_deleted_ancestors = 0;
- struct bkey_i_snapshot *s;
- int ret;
-
- if (k.k->type != KEY_TYPE_snapshot)
- return 0;
-
- if (interior_delete_has_id(deleted, k.k->p.offset))
- return 0;
-
- s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
- ret = PTR_ERR_OR_ZERO(s);
- if (ret)
- return ret;
-
- darray_for_each(*deleted, i)
- nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);
-
- if (!nr_deleted_ancestors)
- return 0;
-
- le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);
-
- if (!s->v.depth) {
- s->v.skip[0] = 0;
- s->v.skip[1] = 0;
- s->v.skip[2] = 0;
- } else {
- u32 depth = le32_to_cpu(s->v.depth);
- u32 parent = bch2_snapshot_parent(c, s->k.p.offset);
-
- for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
- u32 id = le32_to_cpu(s->v.skip[j]);
-
- if (interior_delete_has_id(deleted, id)) {
- id = bch2_snapshot_nth_parent_skip(c,
- parent,
- depth > 1
- ? get_random_u32_below(depth - 1)
- : 0,
- deleted);
- s->v.skip[j] = cpu_to_le32(id);
- }
- }
-
- bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
- }
-
- return bch2_trans_update(trans, iter, &s->k_i, 0);
-}
-
-int bch2_delete_dead_snapshots(struct bch_fs *c)
-{
- if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
- return 0;
-
- struct btree_trans *trans = bch2_trans_get(c);
- snapshot_id_list delete_leaves = {};
- interior_delete_list delete_interior = {};
- int ret = 0;
-
- /*
-	 * For every snapshot node: if it has no live children and it's not
- * pointed to by a subvolume, delete it:
- */
- ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
- check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior));
- if (!bch2_err_matches(ret, EROFS))
- bch_err_msg(c, ret, "walking snapshots");
- if (ret)
- goto err;
-
- if (!delete_leaves.nr && !delete_interior.nr)
- goto err;
-
- {
- struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "deleting leaves");
- darray_for_each(delete_leaves, i)
- prt_printf(&buf, " %u", *i);
-
- prt_printf(&buf, " interior");
- darray_for_each(delete_interior, i)
- prt_printf(&buf, " %u->%u", i->id, i->live_child);
-
- ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
- printbuf_exit(&buf);
- if (ret)
- goto err;
- }
-
- for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
- struct disk_reservation res = { 0 };
-
- if (!btree_type_has_snapshots(btree))
- continue;
-
- ret = for_each_btree_key_commit(trans, iter,
- btree, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- &res, NULL, BCH_TRANS_COMMIT_no_enospc,
- delete_dead_snapshots_process_key(trans, &iter, k,
- &delete_leaves,
- &delete_interior));
-
- bch2_disk_reservation_put(c, &res);
-
- if (!bch2_err_matches(ret, EROFS))
- bch_err_msg(c, ret, "deleting keys from dying snapshots");
- if (ret)
- goto err;
- }
-
- darray_for_each(delete_leaves, i) {
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_snapshot_node_delete(trans, *i));
- if (!bch2_err_matches(ret, EROFS))
- bch_err_msg(c, ret, "deleting snapshot %u", *i);
- if (ret)
- goto err;
- }
-
- /*
-	 * Fixing children of deleted snapshots can't be done completely
-	 * atomically; if we crash between here and when we delete the interior
-	 * nodes, some depth fields will be off:
- */
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
- BTREE_ITER_intent, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior));
- if (ret)
- goto err;
-
- darray_for_each(delete_interior, i) {
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_snapshot_node_delete(trans, i->id));
- if (!bch2_err_matches(ret, EROFS))
- bch_err_msg(c, ret, "deleting snapshot %u", i->id);
- if (ret)
- goto err;
- }
-err:
- darray_exit(&delete_interior);
- darray_exit(&delete_leaves);
- bch2_trans_put(trans);
- if (!bch2_err_matches(ret, EROFS))
- bch_err_fn(c, ret);
- return ret;
-}
-
-void bch2_delete_dead_snapshots_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
-
- set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);
-
- bch2_delete_dead_snapshots(c);
- bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
-}
-
-void bch2_delete_dead_snapshots_async(struct bch_fs *c)
-{
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots))
- return;
-
- BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
-
- if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
-}
-
-int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
- enum btree_id id,
- struct bpos pos)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos),
- BTREE_ITER_not_extents|
- BTREE_ITER_all_snapshots,
- k, ret) {
- if (!bkey_eq(pos, k.k->p))
- break;
-
- if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
- ret = 1;
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
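/*
 * In other words: returns 1 when some version of the key at @pos exists in
 * a descendant snapshot, i.e. the key has since been overwritten in a child.
 * The reverse scan works because a snapshot's descendants always have
 * smaller IDs than it does.
 */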
-
-static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap)
-{
- /* If there's one child, it's redundant and keys will be moved to the child */
- return !!snap.v->children[0] + !!snap.v->children[1] == 1;
-}
-
-static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
-{
- if (k.k->type != KEY_TYPE_snapshot)
- return 0;
-
- struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
- if (BCH_SNAPSHOT_DELETED(snap.v) ||
- interior_snapshot_needs_delete(snap))
- set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);
-
- return 0;
-}
-
-int bch2_snapshots_read(struct bch_fs *c)
-{
- /*
- * Initializing the is_ancestor bitmaps requires ancestors to already be
- * initialized - so mark in reverse:
- */
- int ret = bch2_trans_run(c,
- for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots,
- POS_MAX, 0, k,
- __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
- bch2_check_snapshot_needs_deletion(trans, k)));
- bch_err_fn(c, ret);
-
- /*
- * It's important that we check if we need to reconstruct snapshots
- * before going RW, so we mark that pass as required in the superblock -
- * otherwise, we could end up deleting keys with missing snapshot nodes
- * instead
- */
- BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
- test_bit(BCH_FS_may_go_rw, &c->flags));
-
- if (bch2_err_matches(ret, EIO) ||
- (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
- ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);
-
- return ret;
-}
-
-void bch2_fs_snapshots_exit(struct bch_fs *c)
-{
- kvfree(rcu_dereference_protected(c->snapshots, true));
-}
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
deleted file mode 100644
index 81180181d7c9..000000000000
--- a/fs/bcachefs/snapshot.h
+++ /dev/null
@@ -1,265 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SNAPSHOT_H
-#define _BCACHEFS_SNAPSHOT_H
-
-void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_snapshot_tree_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-
-#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \
- .key_validate = bch2_snapshot_tree_validate, \
- .val_to_text = bch2_snapshot_tree_to_text, \
- .min_val_size = 8, \
-})
-
-struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *);
-
-int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *);
-
-void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \
- .key_validate = bch2_snapshot_validate, \
- .val_to_text = bch2_snapshot_to_text, \
- .trigger = bch2_mark_snapshot, \
- .min_val_size = 24, \
-})
-
-static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id)
-{
- u32 idx = U32_MAX - id;
-
- return likely(t && idx < t->nr)
- ? &t->s[idx]
- : NULL;
-}
-
-static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
-{
- return __snapshot_t(rcu_dereference(c->snapshots), id);
-}
-
-static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
-{
- rcu_read_lock();
- const struct snapshot_t *s = snapshot_t(c, id);
- id = s ? s->tree : 0;
- rcu_read_unlock();
-
- return id;
-}
-
-static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
-{
- const struct snapshot_t *s = snapshot_t(c, id);
- return s ? s->parent : 0;
-}
-
-static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
-{
- rcu_read_lock();
- id = __bch2_snapshot_parent_early(c, id);
- rcu_read_unlock();
-
- return id;
-}
-
-static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
-{
- const struct snapshot_t *s = snapshot_t(c, id);
- if (!s)
- return 0;
-
- u32 parent = s->parent;
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- parent &&
- s->depth != snapshot_t(c, parent)->depth + 1)
- panic("id %u depth=%u parent %u depth=%u\n",
- id, snapshot_t(c, id)->depth,
- parent, snapshot_t(c, parent)->depth);
-
- return parent;
-}
-
-static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
-{
- rcu_read_lock();
- id = __bch2_snapshot_parent(c, id);
- rcu_read_unlock();
-
- return id;
-}
-
-static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
-{
- rcu_read_lock();
- while (n--)
- id = __bch2_snapshot_parent(c, id);
- rcu_read_unlock();
-
- return id;
-}
-
-u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *, u32);
-u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32);
-
-static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
-{
- u32 parent;
-
- rcu_read_lock();
- while ((parent = __bch2_snapshot_parent(c, id)))
- id = parent;
- rcu_read_unlock();
-
- return id;
-}
-
-static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id)
-{
- const struct snapshot_t *s = snapshot_t(c, id);
- return s ? s->live : 0;
-}
-
-static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id)
-{
- rcu_read_lock();
- bool ret = __bch2_snapshot_exists(c, id);
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
-{
- rcu_read_lock();
- const struct snapshot_t *s = snapshot_t(c, id);
- int ret = s ? s->children[0] : -BCH_ERR_invalid_snapshot_node;
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
-{
- int ret = bch2_snapshot_is_internal_node(c, id);
- if (ret < 0)
- return ret;
- return !ret;
-}
-
-static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
-{
- u32 depth;
-
- rcu_read_lock();
- depth = parent ? snapshot_t(c, parent)->depth + 1 : 0;
- rcu_read_unlock();
-
- return depth;
-}
-
-bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32);
-
-static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
-{
- return id == ancestor
- ? true
- : __bch2_snapshot_is_ancestor(c, id, ancestor);
-}
-
-static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
-{
- rcu_read_lock();
- const struct snapshot_t *t = snapshot_t(c, id);
- bool ret = t && (t->children[0]|t->children[1]) != 0;
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
-{
- darray_for_each(*s, i)
- if (*i == id)
- return true;
- return false;
-}
-
-static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id)
-{
- darray_for_each(*s, i)
- if (bch2_snapshot_is_ancestor(c, id, *i))
- return true;
- return false;
-}
-
-static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
-{
- BUG_ON(snapshot_list_has_id(s, id));
- int ret = darray_push(s, id);
- if (ret)
- bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
- return ret;
-}
-
-static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id)
-{
- int ret = snapshot_list_has_id(s, id)
- ? 0
- : darray_push(s, id);
- if (ret)
- bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
- return ret;
-}
-
-static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src)
-{
- darray_for_each(*src, i) {
- int ret = snapshot_list_add_nodup(c, dst, *i);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
- struct bch_snapshot *s);
-int bch2_snapshot_get_subvol(struct btree_trans *, u32,
- struct bch_subvolume *);
-
-/* only exported for tests: */
-int bch2_snapshot_node_create(struct btree_trans *, u32,
- u32 *, u32 *, unsigned);
-
-int bch2_check_snapshot_trees(struct bch_fs *);
-int bch2_check_snapshots(struct bch_fs *);
-int bch2_reconstruct_snapshots(struct bch_fs *);
-int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c);
-
-int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
-void bch2_delete_dead_snapshots_work(struct work_struct *);
-
-int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos);
-
-static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
- enum btree_id id,
- struct bpos pos)
-{
- if (!btree_type_has_snapshots(id) ||
- bch2_snapshot_is_leaf(trans->c, pos.snapshot) > 0)
- return 0;
-
- return __bch2_key_has_snapshot_overwrites(trans, id, pos);
-}
-
-int bch2_snapshots_read(struct bch_fs *);
-void bch2_fs_snapshots_exit(struct bch_fs *);
-
-#endif /* _BCACHEFS_SNAPSHOT_H */
diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h
deleted file mode 100644
index aabcd3a74cd9..000000000000
--- a/fs/bcachefs/snapshot_format.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SNAPSHOT_FORMAT_H
-#define _BCACHEFS_SNAPSHOT_FORMAT_H
-
-struct bch_snapshot {
- struct bch_val v;
- __le32 flags;
- __le32 parent;
- __le32 children[2];
- __le32 subvol;
- /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
- __le32 tree;
- __le32 depth;
- __le32 skip[3];
- bch_le128 btime;
-};
-
-LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
-
-/* True if a subvolume points to this snapshot node: */
-LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
-
-/*
- * Snapshot trees:
- *
- * The snapshot_trees btree gives us a persistent identifier for each tree of
- * bch_snapshot nodes, and allows us to record and easily find the root/master
- * subvolume that other snapshots were created from:
- */
-struct bch_snapshot_tree {
- struct bch_val v;
- __le32 master_subvol;
- __le32 root_snapshot;
-};
-
-#endif /* _BCACHEFS_SNAPSHOT_FORMAT_H */
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
deleted file mode 100644
index 93e71119e5a4..000000000000
--- a/fs/bcachefs/str_hash.c
+++ /dev/null
@@ -1,295 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_update.h"
-#include "dirent.h"
-#include "fsck.h"
-#include "str_hash.h"
-#include "subvolume.h"
-
-static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d)
-{
- if (d.v->d_type == DT_SUBVOL) {
- struct bch_subvolume subvol;
- int ret = bch2_subvolume_get(trans, le32_to_cpu(d.v->d_child_subvol),
- false, &subvol);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
- return !ret;
- } else {
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- ret = bkey_is_inode(k.k);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
- }
-}
-
-static noinline int fsck_rename_dirent(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct bkey_s_c_dirent old)
-{
- struct qstr old_name = bch2_dirent_get_name(old);
- struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32);
- int ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- return ret;
-
- bkey_dirent_init(&new->k_i);
- dirent_copy_target(new, old);
- new->k.p = old.k->p;
-
- for (unsigned i = 0; i < 1000; i++) {
- unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u",
- old_name.len, old_name.name, i);
- unsigned u64s = BKEY_U64s + dirent_val_u64s(len, 0);
-
- if (u64s > U8_MAX)
- return -EINVAL;
-
- new->k.u64s = u64s;
-
- ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
- (subvol_inum) { 0, old.k->p.inode },
- old.k->p.snapshot, &new->k_i,
- BTREE_UPDATE_internal_snapshot_node);
- if (!bch2_err_matches(ret, EEXIST))
- break;
- }
-
- if (ret)
- return ret;
-
- return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
-}
-
-static noinline int hash_pick_winner(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct bkey_s_c k1,
- struct bkey_s_c k2)
-{
- if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) &&
- !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k)))
- return 0;
-
- switch (desc.btree_id) {
- case BTREE_ID_dirents: {
- int ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k1));
- if (ret < 0)
- return ret;
- if (!ret)
- return 0;
-
- ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k2));
- if (ret < 0)
- return ret;
- if (!ret)
- return 1;
- return 2;
- }
- default:
- return 0;
- }
-}
-
-static int repair_inode_hash_info(struct btree_trans *trans,
- struct bch_inode_unpacked *snapshot_root)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
- SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot - 1),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != snapshot_root->bi_inum)
- break;
- if (!bkey_is_inode(k.k))
- continue;
-
- struct bch_inode_unpacked inode;
- ret = bch2_inode_unpack(k, &inode);
- if (ret)
- break;
-
- if (fsck_err_on(inode.bi_hash_seed != snapshot_root->bi_hash_seed ||
- INODE_STR_HASH(&inode) != INODE_STR_HASH(snapshot_root),
- trans, inode_snapshot_mismatch,
- "inode hash info in different snapshots don't match")) {
- inode.bi_hash_seed = snapshot_root->bi_hash_seed;
- SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root));
- ret = __bch2_fsck_write_inode(trans, &inode) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_nested;
- break;
- }
- }
-fsck_err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/*
- * All versions of the same inode in different snapshots must have the same hash
- * seed/type: verify that the hash info we're using matches the root
- */
-static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum,
- struct bch_hash_info *hash_info)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, U32_MAX),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inum)
- break;
- if (bkey_is_inode(k.k))
- goto found;
- }
- bch_err(c, "%s(): inum %llu not found", __func__, inum);
- ret = -BCH_ERR_fsck_repair_unimplemented;
- goto err;
-found:;
- struct bch_inode_unpacked inode;
- ret = bch2_inode_unpack(k, &inode);
- if (ret)
- goto err;
-
- struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode);
- if (hash_info->type != hash2.type ||
- memcmp(&hash_info->siphash_key, &hash2.siphash_key, sizeof(hash2.siphash_key))) {
- ret = repair_inode_hash_info(trans, &inode);
- if (!ret) {
- bch_err(c, "inode hash info mismatch with root, but mismatch not found\n"
- "%u %llx %llx\n"
- "%u %llx %llx",
- hash_info->type,
- hash_info->siphash_key.k0,
- hash_info->siphash_key.k1,
- hash2.type,
- hash2.siphash_key.k0,
- hash2.siphash_key.k1);
- ret = -BCH_ERR_fsck_repair_unimplemented;
- }
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int __bch2_str_hash_check_key(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc *desc,
- struct bch_hash_info *hash_info,
- struct btree_iter *k_iter, struct bkey_s_c hash_k)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter = { NULL };
- struct printbuf buf = PRINTBUF;
- struct bkey_s_c k;
- int ret = 0;
-
- u64 hash = desc->hash_bkey(hash_info, hash_k);
- if (hash_k.k->p.offset < hash)
- goto bad_hash;
-
- for_each_btree_key_norestart(trans, iter, desc->btree_id,
- SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
- BTREE_ITER_slots, k, ret) {
- if (bkey_eq(k.k->p, hash_k.k->p))
- break;
-
- if (k.k->type == desc->key_type &&
- !desc->cmp_bkey(k, hash_k))
- goto duplicate_entries;
-
- if (bkey_deleted(k.k)) {
- bch2_trans_iter_exit(trans, &iter);
- goto bad_hash;
- }
- }
-out:
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
- return ret;
-bad_hash:
- /*
- * Before doing any repair, check hash_info itself:
- */
- ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info);
- if (ret)
- goto out;
-
- if (fsck_err(trans, hash_table_key_wrong_offset,
- "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s",
- bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k);
- if (IS_ERR(new))
- return PTR_ERR(new);
-
- k = bch2_hash_set_or_get_in_snapshot(trans, &iter, *desc, hash_info,
- (subvol_inum) { 0, hash_k.k->p.inode },
- hash_k.k->p.snapshot, new,
- STR_HASH_must_create|
- BTREE_ITER_with_updates|
- BTREE_UPDATE_internal_snapshot_node);
- ret = bkey_err(k);
- if (ret)
- goto out;
- if (k.k)
- goto duplicate_entries;
-
- ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
- BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_nested;
- goto out;
- }
-fsck_err:
- goto out;
-duplicate_entries:
- ret = hash_pick_winner(trans, *desc, hash_info, hash_k, k);
- if (ret < 0)
- goto out;
-
- if (!fsck_err(trans, hash_table_key_duplicate,
- "duplicate hash table keys%s:\n%s",
- ret != 2 ? "" : ", both point to valid inodes",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, hash_k),
- prt_newline(&buf),
- bch2_bkey_val_to_text(&buf, c, k),
- buf.buf)))
- goto out;
-
- switch (ret) {
- case 0:
- ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
- break;
- case 1:
- ret = bch2_hash_delete_at(trans, *desc, hash_info, &iter, 0);
- break;
- case 2:
- ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?:
- bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
- goto out;
- }
-
- ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
- -BCH_ERR_transaction_restart_nested;
- goto out;
-}
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
deleted file mode 100644
index 575ad1e03904..000000000000
--- a/fs/bcachefs/str_hash.h
+++ /dev/null
@@ -1,418 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_STR_HASH_H
-#define _BCACHEFS_STR_HASH_H
-
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "checksum.h"
-#include "error.h"
-#include "inode.h"
-#include "siphash.h"
-#include "subvolume.h"
-#include "super.h"
-
-#include <linux/crc32c.h>
-#include <crypto/sha2.h>
-
-static inline enum bch_str_hash_type
-bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
-{
- switch (opt) {
- case BCH_STR_HASH_OPT_crc32c:
- return BCH_STR_HASH_crc32c;
- case BCH_STR_HASH_OPT_crc64:
- return BCH_STR_HASH_crc64;
- case BCH_STR_HASH_OPT_siphash:
- return c->sb.features & (1ULL << BCH_FEATURE_new_siphash)
- ? BCH_STR_HASH_siphash
- : BCH_STR_HASH_siphash_old;
- default:
- BUG();
- }
-}
-
-struct bch_hash_info {
- u8 type;
- struct unicode_map *cf_encoding;
- /*
- * For the crc32c and crc64 string hashes, only the first word of
- * the siphash_key (k0) is used, as the hash seed.
- */
- SIPHASH_KEY siphash_key;
-};
-
-static inline struct bch_hash_info
-bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
-{
- /* XXX ick */
- struct bch_hash_info info = {
- .type = INODE_STR_HASH(bi),
-#ifdef CONFIG_UNICODE
- .cf_encoding = !!(bi->bi_flags & BCH_INODE_casefolded) ? c->cf_encoding : NULL,
-#endif
- .siphash_key = { .k0 = bi->bi_hash_seed }
- };
-
- if (unlikely(info.type == BCH_STR_HASH_siphash_old)) {
- u8 digest[SHA256_DIGEST_SIZE];
-
- sha256((const u8 *)&bi->bi_hash_seed,
- sizeof(bi->bi_hash_seed), digest);
- memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
- }
-
- return info;
-}
-
-struct bch_str_hash_ctx {
- union {
- u32 crc32c;
- u64 crc64;
- SIPHASH_CTX siphash;
- };
-};
-
-static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx,
- const struct bch_hash_info *info)
-{
- switch (info->type) {
- case BCH_STR_HASH_crc32c:
- ctx->crc32c = crc32c(~0, &info->siphash_key.k0,
- sizeof(info->siphash_key.k0));
- break;
- case BCH_STR_HASH_crc64:
- ctx->crc64 = crc64_be(~0, &info->siphash_key.k0,
- sizeof(info->siphash_key.k0));
- break;
- case BCH_STR_HASH_siphash_old:
- case BCH_STR_HASH_siphash:
- SipHash24_Init(&ctx->siphash, &info->siphash_key);
- break;
- default:
- BUG();
- }
-}
-
-static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx,
- const struct bch_hash_info *info,
- const void *data, size_t len)
-{
- switch (info->type) {
- case BCH_STR_HASH_crc32c:
- ctx->crc32c = crc32c(ctx->crc32c, data, len);
- break;
- case BCH_STR_HASH_crc64:
- ctx->crc64 = crc64_be(ctx->crc64, data, len);
- break;
- case BCH_STR_HASH_siphash_old:
- case BCH_STR_HASH_siphash:
- SipHash24_Update(&ctx->siphash, data, len);
- break;
- default:
- BUG();
- }
-}
-
-static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
- const struct bch_hash_info *info)
-{
- switch (info->type) {
- case BCH_STR_HASH_crc32c:
- return ctx->crc32c;
- case BCH_STR_HASH_crc64:
- return ctx->crc64 >> 1;
- case BCH_STR_HASH_siphash_old:
- case BCH_STR_HASH_siphash:
- return SipHash24_End(&ctx->siphash) >> 1;
- default:
- BUG();
- }
-}
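-
-/*
- * Illustrative sketch (editor's addition, not part of the original file): the
- * helpers above are meant to be used in an init/update/end sequence; the
- * function name here is hypothetical:
- */
-static inline u64 bch2_str_hash_example(const struct bch_hash_info *info,
-					const char *name, size_t len)
-{
-	struct bch_str_hash_ctx ctx;
-
-	bch2_str_hash_init(&ctx, info);
-	bch2_str_hash_update(&ctx, info, name, len);
-	return bch2_str_hash_end(&ctx, info);
-}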
-
-struct bch_hash_desc {
- enum btree_id btree_id;
- u8 key_type;
-
- u64 (*hash_key)(const struct bch_hash_info *, const void *);
- u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c);
- bool (*cmp_key)(struct bkey_s_c, const void *);
- bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
- bool (*is_visible)(subvol_inum inum, struct bkey_s_c);
-};
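-
-/*
- * Example (editor's addition, not from this file): each hashed btree defines
- * one of these descriptors; dirent.c's descriptor for the dirents btree looks
- * roughly like the following (callback names approximate):
- *
- *	const struct bch_hash_desc bch2_dirent_hash_desc = {
- *		.btree_id	= BTREE_ID_dirents,
- *		.key_type	= KEY_TYPE_dirent,
- *		.hash_key	= dirent_hash_key,
- *		.hash_bkey	= dirent_hash_bkey,
- *		.cmp_key	= dirent_cmp_key,
- *		.cmp_bkey	= dirent_cmp_bkey,
- *		.is_visible	= dirent_is_visible,
- *	};
- */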
-
-static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k)
-{
- return k.k->type == desc.key_type &&
- (!desc.is_visible ||
- !inum.inum ||
- desc.is_visible(inum, k));
-}
-
-static __always_inline struct bkey_s_c
-bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum, const void *key,
- enum btree_iter_update_trigger_flags flags,
- u32 snapshot)
-{
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_max_norestart(trans, *iter, desc.btree_id,
- SPOS(inum.inum, desc.hash_key(info, key), snapshot),
- POS(inum.inum, U64_MAX),
- BTREE_ITER_slots|flags, k, ret) {
- if (is_visible_key(desc, inum, k)) {
- if (!desc.cmp_key(k, key))
- return k;
- } else if (k.k->type == KEY_TYPE_hash_whiteout) {
- ;
- } else {
- /* hole, not found */
- break;
- }
- }
- bch2_trans_iter_exit(trans, iter);
-
- return bkey_s_c_err(ret ?: -BCH_ERR_ENOENT_str_hash_lookup);
-}
-
-static __always_inline struct bkey_s_c
-bch2_hash_lookup(struct btree_trans *trans,
- struct btree_iter *iter,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum, const void *key,
- enum btree_iter_update_trigger_flags flags)
-{
- u32 snapshot;
- int ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- return bkey_s_c_err(ret);
-
- return bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot);
-}
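-
-/*
- * Usage sketch (editor's addition, not in the original file): an existence
- * check via bch2_hash_lookup(); 'desc' would be e.g. the dirent descriptor
- * and 'key' the name being looked up. The function name is hypothetical:
- */
-static inline int bch2_hash_lookup_example(struct btree_trans *trans,
-					   const struct bch_hash_desc desc,
-					   const struct bch_hash_info *info,
-					   subvol_inum inum, const void *key)
-{
-	struct btree_iter iter;
-	struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info,
-					     inum, key, 0);
-	int ret = bkey_err(k);
-
-	/* on success the iterator points at the key; release it when done */
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}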
-
-static __always_inline int
-bch2_hash_hole(struct btree_trans *trans,
- struct btree_iter *iter,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum, const void *key)
-{
- struct bkey_s_c k;
- u32 snapshot;
- int ret;
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- return ret;
-
- for_each_btree_key_max_norestart(trans, *iter, desc.btree_id,
- SPOS(inum.inum, desc.hash_key(info, key), snapshot),
- POS(inum.inum, U64_MAX),
- BTREE_ITER_slots|BTREE_ITER_intent, k, ret)
- if (!is_visible_key(desc, inum, k))
- return 0;
- bch2_trans_iter_exit(trans, iter);
-
- return ret ?: -BCH_ERR_ENOSPC_str_hash_create;
-}
-
-static __always_inline
-int bch2_hash_needs_whiteout(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct btree_iter *start)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- bch2_trans_copy_iter(&iter, start);
-
- bch2_btree_iter_advance(&iter);
-
- for_each_btree_key_continue_norestart(iter, BTREE_ITER_slots, k, ret) {
- if (k.k->type != desc.key_type &&
- k.k->type != KEY_TYPE_hash_whiteout)
- break;
-
- if (k.k->type == desc.key_type &&
- desc.hash_bkey(info, k) <= start->pos.offset) {
- ret = 1;
- break;
- }
- }
-
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
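-
-/*
- * (Editor's note, not in the original file): these hash tables use linear
- * probing, so a deleted slot can't simply become a hole - any later key that
- * hashed to this offset or earlier would become unreachable. When the helper
- * above returns 1, deletion must leave a KEY_TYPE_hash_whiteout tombstone
- * rather than an empty slot.
- */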
-
-static __always_inline
-struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum, u32 snapshot,
- struct bkey_i *insert,
- enum btree_iter_update_trigger_flags flags)
-{
- struct btree_iter slot = {};
- struct bkey_s_c k;
- bool found = false;
- int ret;
-
- for_each_btree_key_max_norestart(trans, *iter, desc.btree_id,
- SPOS(insert->k.p.inode,
- desc.hash_bkey(info, bkey_i_to_s_c(insert)),
- snapshot),
- POS(insert->k.p.inode, U64_MAX),
- BTREE_ITER_slots|BTREE_ITER_intent|flags, k, ret) {
- if (is_visible_key(desc, inum, k)) {
- if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
- goto found;
-
- /* hash collision: */
- continue;
- }
-
- if (!slot.path && !(flags & STR_HASH_must_replace))
- bch2_trans_copy_iter(&slot, iter);
-
- if (k.k->type != KEY_TYPE_hash_whiteout)
- goto not_found;
- }
-
- if (!ret)
- ret = -BCH_ERR_ENOSPC_str_hash_create;
-out:
- bch2_trans_iter_exit(trans, &slot);
- bch2_trans_iter_exit(trans, iter);
- return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
-found:
- found = true;
-not_found:
- if (found && (flags & STR_HASH_must_create)) {
- bch2_trans_iter_exit(trans, &slot);
- return k;
- } else if (!found && (flags & STR_HASH_must_replace)) {
- ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
- } else {
- if (!found && slot.path)
- swap(*iter, slot);
-
- insert->k.p = iter->pos;
- ret = bch2_trans_update(trans, iter, insert, flags);
- }
-
- goto out;
-}
-
-static __always_inline
-int bch2_hash_set_in_snapshot(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum, u32 snapshot,
- struct bkey_i *insert,
- enum btree_iter_update_trigger_flags flags)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, info, inum,
- snapshot, insert, flags);
- int ret = bkey_err(k);
- if (ret)
- return ret;
- if (k.k) {
- bch2_trans_iter_exit(trans, &iter);
- return -BCH_ERR_EEXIST_str_hash_set;
- }
-
- return 0;
-}
-
-static __always_inline
-int bch2_hash_set(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum,
- struct bkey_i *insert,
- enum btree_iter_update_trigger_flags flags)
-{
- insert->k.p.inode = inum.inum;
-
- u32 snapshot;
- return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
- bch2_hash_set_in_snapshot(trans, desc, info, inum,
- snapshot, insert, flags);
-}
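-
-/*
- * (Editor's note, not in the original file): the 'a ?: b' chains used here
- * and throughout are the GNU ?: extension - 'b' is evaluated only when 'a'
- * is zero - so a chain of int-returning helpers stops at the first nonzero
- * error code.
- */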
-
-static __always_inline
-int bch2_hash_delete_at(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct btree_iter *iter,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bkey_i *delete;
- int ret;
-
- delete = bch2_trans_kmalloc(trans, sizeof(*delete));
- ret = PTR_ERR_OR_ZERO(delete);
- if (ret)
- return ret;
-
- ret = bch2_hash_needs_whiteout(trans, desc, info, iter);
- if (ret < 0)
- return ret;
-
- bkey_init(&delete->k);
- delete->k.p = iter->pos;
- delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted;
-
- return bch2_trans_update(trans, iter, delete, flags);
-}
-
-static __always_inline
-int bch2_hash_delete(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum, const void *key)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
- BTREE_ITER_intent);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- ret = bch2_hash_delete_at(trans, desc, info, &iter, 0);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-struct snapshots_seen;
-int __bch2_str_hash_check_key(struct btree_trans *,
- struct snapshots_seen *,
- const struct bch_hash_desc *,
- struct bch_hash_info *,
- struct btree_iter *, struct bkey_s_c);
-
-static inline int bch2_str_hash_check_key(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc *desc,
- struct bch_hash_info *hash_info,
- struct btree_iter *k_iter, struct bkey_s_c hash_k)
-{
- if (hash_k.k->type != desc->key_type)
- return 0;
-
- if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset))
- return 0;
-
- return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k);
-}
-
-#endif /* _BCACHEFS_STR_HASH_H */
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
deleted file mode 100644
index b7b96283c316..000000000000
--- a/fs/bcachefs/subvolume.c
+++ /dev/null
@@ -1,724 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "errcode.h"
-#include "error.h"
-#include "fs.h"
-#include "snapshot.h"
-#include "subvolume.h"
-
-#include <linux/random.h>
-
-static int bch2_subvolume_delete(struct btree_trans *, u32);
-
-static struct bpos subvolume_children_pos(struct bkey_s_c k)
-{
- if (k.k->type != KEY_TYPE_subvolume)
- return POS_MIN;
-
- struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
- if (!s.v->fs_path_parent)
- return POS_MIN;
- return POS(le32_to_cpu(s.v->fs_path_parent), s.k->p.offset);
-}
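-
-/*
- * Example (editor's addition): a subvolume key at POS(0, 7) whose
- * fs_path_parent is 3 maps to POS(3, 7) in the subvolume_children btree;
- * subvolumes with no fs_path_parent (and non-subvolume keys) map to POS_MIN,
- * which subvolume_children_mod() below treats as "no entry".
- */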
-
-static int check_subvol(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c_subvolume subvol;
- struct btree_iter subvol_children_iter = {};
- struct bch_snapshot snapshot;
- struct printbuf buf = PRINTBUF;
- unsigned snapid;
- int ret = 0;
-
- if (k.k->type != KEY_TYPE_subvolume)
- return 0;
-
- subvol = bkey_s_c_to_subvolume(k);
- snapid = le32_to_cpu(subvol.v->snapshot);
- ret = bch2_snapshot_lookup(trans, snapid, &snapshot);
-
- if (bch2_err_matches(ret, ENOENT))
- bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
- k.k->p.offset, snapid);
- if (ret)
- return ret;
-
- if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
- ret = bch2_subvolume_delete(trans, iter->pos.offset);
- bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
- return ret ?: -BCH_ERR_transaction_restart_nested;
- }
-
- if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL &&
- subvol.v->fs_path_parent,
- trans, subvol_root_fs_path_parent_nonzero,
- "root subvolume has nonzero fs_path_parent\n%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- struct bkey_i_subvolume *n =
- bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto err;
-
- n->v.fs_path_parent = 0;
- }
-
- if (subvol.v->fs_path_parent) {
- struct bpos pos = subvolume_children_pos(k);
-
- struct bkey_s_c subvol_children_k =
- bch2_bkey_get_iter(trans, &subvol_children_iter,
- BTREE_ID_subvolume_children, pos, 0);
- ret = bkey_err(subvol_children_k);
- if (ret)
- goto err;
-
- if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set,
- trans, subvol_children_not_set,
- "subvolume not set in subvolume_children btree at %llu:%llu\n%s",
- pos.inode, pos.offset,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true);
- if (ret)
- goto err;
- }
- }
-
- struct bch_inode_unpacked inode;
- ret = bch2_inode_find_by_inum_nowarn_trans(trans,
- (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) },
- &inode);
- if (!ret) {
- if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
- trans, subvol_root_wrong_bi_subvol,
- "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
- inode.bi_inum, inode.bi_snapshot,
- inode.bi_subvol, subvol.k->p.offset)) {
- inode.bi_subvol = subvol.k->p.offset;
- inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot);
- ret = __bch2_fsck_write_inode(trans, &inode);
- if (ret)
- goto err;
- }
- } else if (bch2_err_matches(ret, ENOENT)) {
- if (fsck_err(trans, subvol_to_missing_root,
- "subvolume %llu points to missing subvolume root %llu:%u",
- k.k->p.offset, le64_to_cpu(subvol.v->inode),
- le32_to_cpu(subvol.v->snapshot))) {
- ret = bch2_subvolume_delete(trans, iter->pos.offset);
- bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
- ret = ret ?: -BCH_ERR_transaction_restart_nested;
- goto err;
- }
- } else {
- goto err;
- }
-
- if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
- u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
- u32 snapshot_tree;
- struct bch_snapshot_tree st;
-
- rcu_read_lock();
- snapshot_tree = snapshot_t(c, snapshot_root)->tree;
- rcu_read_unlock();
-
- ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st);
-
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "%s: snapshot tree %u not found", __func__, snapshot_tree);
-
- if (ret)
- goto err;
-
- if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset,
- trans, subvol_not_master_and_not_snapshot,
- "subvolume %llu is not set as snapshot but is not master subvolume",
- k.k->p.offset)) {
- struct bkey_i_subvolume *s =
- bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
- ret = PTR_ERR_OR_ZERO(s);
- if (ret)
- goto err;
-
- SET_BCH_SUBVOLUME_SNAP(&s->v, true);
- }
- }
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &subvol_children_iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_check_subvols(struct bch_fs *c)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_subvol(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_subvol_child(struct btree_trans *trans,
- struct btree_iter *child_iter,
- struct bkey_s_c child_k)
-{
- struct bch_subvolume s;
- int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset),
- 0, subvolume, &s);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if (fsck_err_on(ret ||
- le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode,
- trans, subvol_children_bad,
- "incorrect entry in subvolume_children btree %llu:%llu",
- child_k.k->p.inode, child_k.k->p.offset)) {
- ret = bch2_btree_delete_at(trans, child_iter, 0);
- if (ret)
- goto err;
- }
-err:
-fsck_err:
- return ret;
-}
-
-int bch2_check_subvol_children(struct bch_fs *c)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_subvol_child(trans, &iter, k)));
- bch_err_fn(c, ret);
- return 0;
-}
-
-/* Subvolumes: */
-
-int bch2_subvolume_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k);
- int ret = 0;
-
- bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
- bkey_gt(k.k->p, SUBVOL_POS_MAX),
- c, subvol_pos_bad,
- "invalid pos");
-
- bkey_fsck_err_on(!subvol.v->snapshot,
- c, subvol_snapshot_bad,
- "invalid snapshot");
-
- bkey_fsck_err_on(!subvol.v->inode,
- c, subvol_inode_bad,
- "invalid inode");
-fsck_err:
- return ret;
-}
-
-void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
-
- prt_printf(out, "root %llu snapshot id %u",
- le64_to_cpu(s.v->inode),
- le32_to_cpu(s.v->snapshot));
-
- if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) {
- prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent));
- prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent));
- }
-}
-
-static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set)
-{
- return !bpos_eq(pos, POS_MIN)
- ? bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, set)
- : 0;
-}
-
-int bch2_subvolume_trigger(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- if (flags & BTREE_TRIGGER_transactional) {
- struct bpos children_pos_old = subvolume_children_pos(old);
- struct bpos children_pos_new = subvolume_children_pos(new.s_c);
-
- if (!bpos_eq(children_pos_old, children_pos_new)) {
- int ret = subvolume_children_mod(trans, children_pos_old, false) ?:
- subvolume_children_mod(trans, children_pos_new, true);
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-
-int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol)
-{
- struct btree_iter iter;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0);
- struct bkey_s_c k = bch2_btree_iter_peek(&iter);
- bch2_trans_iter_exit(trans, &iter);
-
- return bkey_err(k) ?: k.k && k.k->p.inode == subvol
- ? -BCH_ERR_ENOTEMPTY_subvol_not_empty
- : 0;
-}
-
-static __always_inline int
-bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol,
- bool inconsistent_if_not_found,
- struct bch_subvolume *s)
-{
- int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol),
- BTREE_ITER_cached|
- BTREE_ITER_with_updates, subvolume, s);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) &&
- inconsistent_if_not_found,
- trans->c, "missing subvolume %u", subvol);
- return ret;
-}
-
-int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
- bool inconsistent_if_not_found,
- struct bch_subvolume *s)
-{
- return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, s);
-}
-
-int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol)
-{
- struct bch_subvolume s;
- int ret = bch2_subvolume_get_inlined(trans, subvol, true, &s);
- if (ret)
- return ret;
-
- if (BCH_SUBVOLUME_RO(&s))
- return -EROFS;
- return 0;
-}
-
-int bch2_subvol_is_ro(struct bch_fs *c, u32 subvol)
-{
- return bch2_trans_do(c, bch2_subvol_is_ro_trans(trans, subvol));
-}
-
-int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
- struct bch_subvolume *subvol)
-{
- struct bch_snapshot snap;
-
- return bch2_snapshot_lookup(trans, snapshot, &snap) ?:
- bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, subvol);
-}
-
-int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
- u32 *snapid, bool warn)
-{
- struct btree_iter iter;
- struct bkey_s_c_subvolume subvol;
- int ret;
-
- subvol = bch2_bkey_get_iter_typed(trans, &iter,
- BTREE_ID_subvolumes, POS(0, subvolid),
- BTREE_ITER_cached|BTREE_ITER_with_updates,
- subvolume);
- ret = bkey_err(subvol);
-
- bch2_fs_inconsistent_on(warn && bch2_err_matches(ret, ENOENT), trans->c,
- "missing subvolume %u", subvolid);
-
- if (likely(!ret))
- *snapid = le32_to_cpu(subvol.v->snapshot);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
- u32 *snapid)
-{
- return __bch2_subvolume_get_snapshot(trans, subvolid, snapid, true);
-}
-
-static int bch2_subvolume_reparent(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- u32 old_parent, u32 new_parent)
-{
- struct bkey_i_subvolume *s;
- int ret;
-
- if (k.k->type != KEY_TYPE_subvolume)
- return 0;
-
- if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, creation_parent) &&
- le32_to_cpu(bkey_s_c_to_subvolume(k).v->creation_parent) != old_parent)
- return 0;
-
- s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume);
- ret = PTR_ERR_OR_ZERO(s);
- if (ret)
- return ret;
-
- s->v.creation_parent = cpu_to_le32(new_parent);
- return 0;
-}
-
-/*
- * Separate from the snapshot tree in the snapshots btree, we record the tree
- * structure of how snapshot subvolumes were created - the parent subvolume of
- * each snapshot subvolume.
- *
- * When a subvolume is deleted, we scan for child subvolumes and reparent them,
- * to avoid dangling references:
- */
-static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete)
-{
- struct bch_subvolume s;
-
- return lockrestart_do(trans,
- bch2_subvolume_get(trans, subvolid_to_delete, true, &s)) ?:
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_subvolume_reparent(trans, &iter, k,
- subvolid_to_delete, le32_to_cpu(s.creation_parent)));
-}
-
-/*
- * Delete subvolume, mark snapshot ID as deleted, queue up snapshot
- * deletion/cleanup:
- */
-static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
-{
- struct btree_iter subvol_iter = {}, snapshot_iter = {}, snapshot_tree_iter = {};
-
- struct bkey_s_c_subvolume subvol =
- bch2_bkey_get_iter_typed(trans, &subvol_iter,
- BTREE_ID_subvolumes, POS(0, subvolid),
- BTREE_ITER_cached|BTREE_ITER_intent,
- subvolume);
- int ret = bkey_err(subvol);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
- "missing subvolume %u", subvolid);
- if (ret)
- goto err;
-
- u32 snapid = le32_to_cpu(subvol.v->snapshot);
-
- struct bkey_s_c_snapshot snapshot =
- bch2_bkey_get_iter_typed(trans, &snapshot_iter,
- BTREE_ID_snapshots, POS(0, snapid),
- 0, snapshot);
- ret = bkey_err(snapshot);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
- "missing snapshot %u", snapid);
- if (ret)
- goto err;
-
- u32 treeid = le32_to_cpu(snapshot.v->tree);
-
- struct bkey_s_c_snapshot_tree snapshot_tree =
- bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter,
- BTREE_ID_snapshot_trees, POS(0, treeid),
- 0, snapshot_tree);
- ret = bkey_err(snapshot_tree);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
- "missing snapshot tree %u", treeid);
- if (ret)
- goto err;
-
- if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) {
- struct bkey_i_snapshot_tree *snapshot_tree_mut =
- bch2_bkey_make_mut_typed(trans, &snapshot_tree_iter,
- &snapshot_tree.s_c,
- 0, snapshot_tree);
- ret = PTR_ERR_OR_ZERO(snapshot_tree_mut);
- if (ret)
- goto err;
-
- snapshot_tree_mut->v.master_subvol = 0;
- }
-
- ret = bch2_btree_delete_at(trans, &subvol_iter, 0) ?:
- bch2_snapshot_node_set_deleted(trans, snapid);
-err:
- bch2_trans_iter_exit(trans, &snapshot_tree_iter);
- bch2_trans_iter_exit(trans, &snapshot_iter);
- bch2_trans_iter_exit(trans, &subvol_iter);
- return ret;
-}
-
-static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
-{
- return bch2_subvolumes_reparent(trans, subvolid) ?:
- commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_subvolume_delete(trans, subvolid));
-}
-
-static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work, struct bch_fs,
- snapshot_wait_for_pagecache_and_delete_work);
- snapshot_id_list s;
- u32 *id;
- int ret = 0;
-
- while (!ret) {
- mutex_lock(&c->snapshots_unlinked_lock);
- s = c->snapshots_unlinked;
- darray_init(&c->snapshots_unlinked);
- mutex_unlock(&c->snapshots_unlinked_lock);
-
- if (!s.nr)
- break;
-
- bch2_evict_subvolume_inodes(c, &s);
-
- for (id = s.data; id < s.data + s.nr; id++) {
- ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id));
- bch_err_msg(c, ret, "deleting subvolume %u", *id);
- if (ret)
- break;
- }
-
- darray_exit(&s);
- }
-
- bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
-}
-
-struct subvolume_unlink_hook {
- struct btree_trans_commit_hook h;
- u32 subvol;
-};
-
-static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
- struct btree_trans_commit_hook *_h)
-{
- struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h);
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- mutex_lock(&c->snapshots_unlinked_lock);
- if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
- ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol);
- mutex_unlock(&c->snapshots_unlinked_lock);
-
- if (ret)
- return ret;
-
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache))
- return -EROFS;
-
- if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
- return 0;
-}
-
-int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
-{
- struct btree_iter iter;
- struct bkey_i_subvolume *n;
- struct subvolume_unlink_hook *h;
- int ret = 0;
-
- h = bch2_trans_kmalloc(trans, sizeof(*h));
- ret = PTR_ERR_OR_ZERO(h);
- if (ret)
- return ret;
-
- h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook;
- h->subvol = subvolid;
- bch2_trans_commit_hook(trans, &h->h);
-
- n = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_subvolumes, POS(0, subvolid),
- BTREE_ITER_cached, subvolume);
- ret = PTR_ERR_OR_ZERO(n);
- if (unlikely(ret)) {
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
- "missing subvolume %u", subvolid);
- return ret;
- }
-
- SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
- u32 parent_subvolid,
- u32 src_subvolid,
- u32 *new_subvolid,
- u32 *new_snapshotid,
- bool ro)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
- struct bkey_i_subvolume *new_subvol = NULL;
- struct bkey_i_subvolume *src_subvol = NULL;
- u32 parent = 0, new_nodes[2], snapshot_subvols[2];
- int ret = 0;
-
- ret = bch2_bkey_get_empty_slot(trans, &dst_iter,
- BTREE_ID_subvolumes, POS(0, U32_MAX));
- if (ret == -BCH_ERR_ENOSPC_btree_slot)
- ret = -BCH_ERR_ENOSPC_subvolume_create;
- if (ret)
- return ret;
-
- snapshot_subvols[0] = dst_iter.pos.offset;
- snapshot_subvols[1] = src_subvolid;
-
- if (src_subvolid) {
- /* Creating a snapshot: */
-
- src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter,
- BTREE_ID_subvolumes, POS(0, src_subvolid),
- BTREE_ITER_cached, subvolume);
- ret = PTR_ERR_OR_ZERO(src_subvol);
- if (unlikely(ret)) {
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "subvolume %u not found", src_subvolid);
- goto err;
- }
-
- parent = le32_to_cpu(src_subvol->v.snapshot);
- }
-
- ret = bch2_snapshot_node_create(trans, parent, new_nodes,
- snapshot_subvols,
- src_subvolid ? 2 : 1);
- if (ret)
- goto err;
-
- if (src_subvolid) {
- src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
- ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
- if (ret)
- goto err;
- }
-
- new_subvol = bch2_bkey_alloc(trans, &dst_iter, 0, subvolume);
- ret = PTR_ERR_OR_ZERO(new_subvol);
- if (ret)
- goto err;
-
- new_subvol->v.flags = 0;
- new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]);
- new_subvol->v.inode = cpu_to_le64(inode);
- new_subvol->v.creation_parent = cpu_to_le32(src_subvolid);
- new_subvol->v.fs_path_parent = cpu_to_le32(parent_subvolid);
- new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c));
- new_subvol->v.otime.hi = 0;
-
- SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
- SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
-
- *new_subvolid = new_subvol->k.p.offset;
- *new_snapshotid = new_nodes[0];
-err:
- bch2_trans_iter_exit(trans, &src_iter);
- bch2_trans_iter_exit(trans, &dst_iter);
- return ret;
-}
-
-int bch2_initialize_subvolumes(struct bch_fs *c)
-{
- struct bkey_i_snapshot_tree root_tree;
- struct bkey_i_snapshot root_snapshot;
- struct bkey_i_subvolume root_volume;
- int ret;
-
- bkey_snapshot_tree_init(&root_tree.k_i);
- root_tree.k.p.offset = 1;
- root_tree.v.master_subvol = cpu_to_le32(1);
- root_tree.v.root_snapshot = cpu_to_le32(U32_MAX);
-
- bkey_snapshot_init(&root_snapshot.k_i);
- root_snapshot.k.p.offset = U32_MAX;
- root_snapshot.v.flags = 0;
- root_snapshot.v.parent = 0;
- root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
- root_snapshot.v.tree = cpu_to_le32(1);
- SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
-
- bkey_subvolume_init(&root_volume.k_i);
- root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
- root_volume.v.flags = 0;
- root_volume.v.snapshot = cpu_to_le32(U32_MAX);
- root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);
-
- ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0, 0) ?:
- bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0, 0) ?:
- bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0, 0);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_inode_unpacked inode;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (!bkey_is_inode(k.k)) {
- bch_err(trans->c, "root inode not found");
- ret = -BCH_ERR_ENOENT_inode;
- goto err;
- }
-
- ret = bch2_inode_unpack(k, &inode);
- BUG_ON(ret);
-
- inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
-
- ret = bch2_inode_write(trans, &iter, &inode);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/* set bi_subvol on root inode */
-int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
-{
- int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_fs_upgrade_for_subvolumes(trans));
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_fs_subvolumes_init(struct bch_fs *c)
-{
- INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
- INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
- bch2_subvolume_wait_for_pagecache_and_delete);
- mutex_init(&c->snapshots_unlinked_lock);
- return 0;
-}
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
deleted file mode 100644
index 910f6196700e..000000000000
--- a/fs/bcachefs/subvolume.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUBVOLUME_H
-#define _BCACHEFS_SUBVOLUME_H
-
-#include "darray.h"
-#include "subvolume_types.h"
-
-int bch2_check_subvols(struct bch_fs *);
-int bch2_check_subvol_children(struct bch_fs *);
-
-int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \
- .key_validate = bch2_subvolume_validate, \
- .val_to_text = bch2_subvolume_to_text, \
- .trigger = bch2_subvolume_trigger, \
- .min_val_size = 16, \
-})
-
-int bch2_subvol_has_children(struct btree_trans *, u32);
-int bch2_subvolume_get(struct btree_trans *, unsigned,
- bool, struct bch_subvolume *);
-int __bch2_subvolume_get_snapshot(struct btree_trans *, u32,
- u32 *, bool);
-int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
-
-int bch2_subvol_is_ro_trans(struct btree_trans *, u32);
-int bch2_subvol_is_ro(struct bch_fs *, u32);
-
-static inline struct bkey_s_c
-bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos end,
- u32 subvolid, unsigned flags)
-{
- u32 snapshot;
- int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot);
- if (ret)
- return bkey_s_c_err(ret);
-
- bch2_btree_iter_set_snapshot(iter, snapshot);
- return bch2_btree_iter_peek_max_type(iter, end, flags);
-}
-
-#define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \
- _end, _subvolid, _flags, _k, _do) \
-({ \
- struct bkey_s_c _k; \
- int _ret3 = 0; \
- \
- do { \
- _ret3 = lockrestart_do(_trans, ({ \
- (_k) = bch2_btree_iter_peek_in_subvolume_max_type(&(_iter), \
- _end, _subvolid, (_flags)); \
- if (!(_k).k) \
- break; \
- \
- bkey_err(_k) ?: (_do); \
- })); \
- } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \
- \
- bch2_trans_iter_exit((_trans), &(_iter)); \
- _ret3; \
-})
-
-#define for_each_btree_key_in_subvolume_max(_trans, _iter, _btree_id, \
- _start, _end, _subvolid, _flags, _k, _do) \
-({ \
- struct btree_iter _iter; \
- bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- \
- for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \
- _end, _subvolid, _flags, _k, _do); \
-})
-
-int bch2_delete_dead_snapshots(struct bch_fs *);
-void bch2_delete_dead_snapshots_async(struct bch_fs *);
-
-int bch2_subvolume_unlink(struct btree_trans *, u32);
-int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool);
-
-int bch2_initialize_subvolumes(struct bch_fs *);
-int bch2_fs_upgrade_for_subvolumes(struct bch_fs *);
-
-int bch2_fs_subvolumes_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_SUBVOLUME_H */
diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h
deleted file mode 100644
index e029df7ba89f..000000000000
--- a/fs/bcachefs/subvolume_format.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUBVOLUME_FORMAT_H
-#define _BCACHEFS_SUBVOLUME_FORMAT_H
-
-#define SUBVOL_POS_MIN POS(0, 1)
-#define SUBVOL_POS_MAX POS(0, S32_MAX)
-#define BCACHEFS_ROOT_SUBVOL 1
-
-struct bch_subvolume {
- struct bch_val v;
- __le32 flags;
- __le32 snapshot;
- __le64 inode;
- /*
- * Snapshot subvolumes form a tree, separate from the snapshot nodes
- * tree - if this subvolume is a snapshot, this is the ID of the
- * subvolume it was created from:
- *
- * This is _not_ necessarily the subvolume of the directory containing
- * this subvolume:
- */
- __le32 creation_parent;
- __le32 fs_path_parent;
- bch_le128 otime;
-};
-
-LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
-/*
- * We need to know whether a subvolume is a snapshot so we can know whether we
- * can delete it (or whether it should just be rm -rf'd)
- */
-LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
-LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3)
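-
-/*
- * Illustrative sketch (editor's addition, not in the original file): each
- * LE32_BITMASK() above generates endian-safe get/set helpers, used elsewhere
- * along these lines:
- *
- *	if (BCH_SUBVOLUME_UNLINKED(&subvol))	// read bit 2 of flags
- *		...
- *	SET_BCH_SUBVOLUME_RO(&subvol, true);	// set bit 0 of flags
- */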
-
-#endif /* _BCACHEFS_SUBVOLUME_FORMAT_H */
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
deleted file mode 100644
index 1549d6daf7af..000000000000
--- a/fs/bcachefs/subvolume_types.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
-#define _BCACHEFS_SUBVOLUME_TYPES_H
-
-#include "darray.h"
-
-typedef DARRAY(u32) snapshot_id_list;
-
-#define IS_ANCESTOR_BITMAP 128
-
-struct snapshot_t {
- bool live;
- u32 parent;
- u32 skip[3];
- u32 depth;
- u32 children[2];
- u32 subvol; /* Nonzero only if a subvolume points to this node: */
- u32 tree;
- unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)];
-};
-
-struct snapshot_table {
- struct rcu_head rcu;
- size_t nr;
-#ifndef RUST_BINDGEN
- DECLARE_FLEX_ARRAY(struct snapshot_t, s);
-#else
- struct snapshot_t s[0];
-#endif
-};
-
-typedef struct {
- /* we can't have padding in this struct: */
- u64 subvol;
- u64 inum;
-} subvol_inum;
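-
-/*
- * (Editor's note, not in the original file): subvol_inum gets compared and
- * hashed as raw bytes in places, so compiler-inserted padding could make
- * logically equal values compare unequal; two naturally aligned u64s have
- * none.
- */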
-
-#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
deleted file mode 100644
index 572b06bfa0b8..000000000000
--- a/fs/bcachefs/super-io.c
+++ /dev/null
@@ -1,1503 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "checksum.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_sb.h"
-#include "journal_seq_blacklist.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "quota.h"
-#include "sb-clean.h"
-#include "sb-counters.h"
-#include "sb-downgrade.h"
-#include "sb-errors.h"
-#include "sb-members.h"
-#include "super-io.h"
-#include "super.h"
-#include "trace.h"
-#include "vstructs.h"
-
-#include <linux/backing-dev.h>
-#include <linux/sort.h>
-#include <linux/string_choices.h>
-
-struct bch2_metadata_version {
- u16 version;
- const char *name;
-};
-
-static const struct bch2_metadata_version bch2_metadata_versions[] = {
-#define x(n, v) { \
- .version = v, \
- .name = #n, \
-},
- BCH_METADATA_VERSIONS()
-#undef x
-};
-
-void bch2_version_to_text(struct printbuf *out, enum bcachefs_metadata_version v)
-{
- const char *str = "(unknown version)";
-
- for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++)
- if (bch2_metadata_versions[i].version == v) {
- str = bch2_metadata_versions[i].name;
- break;
- }
-
- prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str);
-}
-
-enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version v)
-{
- if (!BCH_VERSION_MAJOR(v))
- return v;
-
- for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++)
- if (bch2_metadata_versions[i].version > v &&
- BCH_VERSION_MAJOR(bch2_metadata_versions[i].version) ==
- BCH_VERSION_MAJOR(v))
- v = bch2_metadata_versions[i].version;
-
- return v;
-}
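-
-/*
- * Example (editor's addition): with known versions 1.2, 1.4 and 2.0, passing
- * v = 1.3 returns 1.4 - the newest known version sharing v's major number -
- * while 2.0 is ignored and a version with major number 0 is returned as-is.
- */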
-
-int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
-{
- int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) &&
- version <= c->sb.version_incompat_allowed)
- ? 0
- : -BCH_ERR_may_not_use_incompat_feature;
-
- if (!ret) {
- mutex_lock(&c->sb_lock);
- SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
- max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
-
- return ret;
-}
-
-const char * const bch2_sb_fields[] = {
-#define x(name, nr) #name,
- BCH_SB_FIELDS()
-#undef x
- NULL
-};
-
-static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *,
- enum bch_validate_flags, struct printbuf *);
-
-struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb,
- enum bch_sb_field_type type)
-{
- /* XXX: need locking around superblock to access optional fields */
-
- vstruct_for_each(sb, f)
- if (le32_to_cpu(f->type) == type)
- return f;
- return NULL;
-}
-
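-/*
- * (Editor's note, not in the original file): resize field 'f' to 'u64s' u64s
- * in place: trailing fields are memmove'd to their new position, newly opened
- * space is zeroed when growing, and u64s == 0 deletes the field entirely.
- * Callers must have grown the buffer beforehand (BUG_ON otherwise).
- */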
-static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb,
- struct bch_sb_field *f,
- unsigned u64s)
-{
- unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;
- unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s;
-
- BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size);
-
- if (!f && !u64s) {
- /* nothing to do: */
- } else if (!f) {
- f = vstruct_last(sb->sb);
- memset(f, 0, sizeof(u64) * u64s);
- f->u64s = cpu_to_le32(u64s);
- f->type = 0;
- } else {
- void *src, *dst;
-
- src = vstruct_end(f);
-
- if (u64s) {
- f->u64s = cpu_to_le32(u64s);
- dst = vstruct_end(f);
- } else {
- dst = f;
- }
-
- memmove(dst, src, vstruct_end(sb->sb) - src);
-
- if (dst > src)
- memset(src, 0, dst - src);
- }
-
- sb->sb->u64s = cpu_to_le32(sb_u64s);
-
- return u64s ? f : NULL;
-}
-
-void bch2_sb_field_delete(struct bch_sb_handle *sb,
- enum bch_sb_field_type type)
-{
- struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
-
- if (f)
- __bch2_sb_field_resize(sb, f, 0);
-}
-
-/* Superblock realloc/free: */
-
-void bch2_free_super(struct bch_sb_handle *sb)
-{
- kfree(sb->bio);
- if (!IS_ERR_OR_NULL(sb->s_bdev_file))
- bdev_fput(sb->s_bdev_file);
- kfree(sb->holder);
- kfree(sb->sb_name);
-
- kfree(sb->sb);
- memset(sb, 0, sizeof(*sb));
-}
-
-int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
-{
- size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s);
- size_t new_buffer_size;
- struct bch_sb *new_sb;
- struct bio *bio;
-
- if (sb->bdev)
- new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev));
-
- new_buffer_size = roundup_pow_of_two(new_bytes);
-
- if (sb->sb && sb->buffer_size >= new_buffer_size)
- return 0;
-
- if (sb->sb && sb->have_layout) {
- u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
-
- if (new_bytes > max_bytes) {
- struct printbuf buf = PRINTBUF;
-
- prt_bdevname(&buf, sb->bdev);
- prt_printf(&buf, ": superblock too big: want %zu but have %llu", new_bytes, max_bytes);
- pr_err("%s", buf.buf);
- printbuf_exit(&buf);
- return -BCH_ERR_ENOSPC_sb;
- }
- }
-
- if (sb->buffer_size >= new_buffer_size && sb->sb)
- return 0;
-
- if (dynamic_fault("bcachefs:add:super_realloc"))
- return -BCH_ERR_ENOMEM_sb_realloc_injected;
-
- new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO);
- if (!new_sb)
- return -BCH_ERR_ENOMEM_sb_buf_realloc;
-
- sb->sb = new_sb;
-
- if (sb->have_bio) {
- unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size);
-
- bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
- if (!bio)
- return -BCH_ERR_ENOMEM_sb_bio_realloc;
-
- bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0);
-
- kfree(sb->bio);
- sb->bio = bio;
- }
-
- sb->buffer_size = new_buffer_size;
-
- return 0;
-}
-
-struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
- enum bch_sb_field_type type,
- unsigned u64s)
-{
- struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
- ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
- ssize_t d = -old_u64s + u64s;
-
- if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
- return NULL;
-
- if (sb->fs_sb) {
- struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb);
-
- lockdep_assert_held(&c->sb_lock);
-
- /* XXX: we're not checking that offline devices have enough space */
-
- for_each_online_member(c, ca) {
- struct bch_sb_handle *dev_sb = &ca->disk_sb;
-
- if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
- percpu_ref_put(&ca->io_ref);
- return NULL;
- }
- }
- }
-
- f = bch2_sb_field_get_id(sb->sb, type);
- f = __bch2_sb_field_resize(sb, f, u64s);
- if (f)
- f->type = cpu_to_le32(type);
- return f;
-}
-
-struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *sb,
- enum bch_sb_field_type type,
- unsigned u64s)
-{
- struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
-
- if (!f || le32_to_cpu(f->u64s) < u64s)
- f = bch2_sb_field_resize_id(sb, type, u64s);
- return f;
-}
-
-/* Superblock validate: */
-
-static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out)
-{
- u64 offset, prev_offset, max_sectors;
- unsigned i;
-
- BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
-
- if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) &&
- !uuid_equal(&layout->magic, &BCHFS_MAGIC)) {
- prt_printf(out, "Not a bcachefs superblock layout");
- return -BCH_ERR_invalid_sb_layout;
- }
-
- if (layout->layout_type != 0) {
- prt_printf(out, "Invalid superblock layout type %u",
- layout->layout_type);
- return -BCH_ERR_invalid_sb_layout_type;
- }
-
- if (!layout->nr_superblocks) {
- prt_printf(out, "Invalid superblock layout: no superblocks");
- return -BCH_ERR_invalid_sb_layout_nr_superblocks;
- }
-
- if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) {
- prt_printf(out, "Invalid superblock layout: too many superblocks");
- return -BCH_ERR_invalid_sb_layout_nr_superblocks;
- }
-
- if (layout->sb_max_size_bits > BCH_SB_LAYOUT_SIZE_BITS_MAX) {
- prt_printf(out, "Invalid superblock layout: max_size_bits too high");
- return -BCH_ERR_invalid_sb_layout_sb_max_size_bits;
- }
-
- max_sectors = 1 << layout->sb_max_size_bits;
-
- prev_offset = le64_to_cpu(layout->sb_offset[0]);
-
- for (i = 1; i < layout->nr_superblocks; i++) {
- offset = le64_to_cpu(layout->sb_offset[i]);
-
- if (offset < prev_offset + max_sectors) {
- prt_printf(out, "Invalid superblock layout: superblocks overlap\n"
- " (sb %u ends at %llu next starts at %llu",
- i - 1, prev_offset + max_sectors, offset);
- return -BCH_ERR_invalid_sb_layout_superblocks_overlap;
- }
- prev_offset = offset;
- }
-
- return 0;
-}
-
-static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
-{
- u16 version = le16_to_cpu(sb->version);
- u16 version_min = le16_to_cpu(sb->version_min);
-
- if (!bch2_version_compatible(version)) {
- prt_str(out, "Unsupported superblock version ");
- bch2_version_to_text(out, version);
- prt_str(out, " (min ");
- bch2_version_to_text(out, bcachefs_metadata_version_min);
- prt_str(out, ", max ");
- bch2_version_to_text(out, bcachefs_metadata_version_current);
- prt_str(out, ")");
- return -BCH_ERR_invalid_sb_version;
- }
-
- if (!bch2_version_compatible(version_min)) {
- prt_str(out, "Unsupported superblock version_min ");
- bch2_version_to_text(out, version_min);
- prt_str(out, " (min ");
- bch2_version_to_text(out, bcachefs_metadata_version_min);
- prt_str(out, ", max ");
- bch2_version_to_text(out, bcachefs_metadata_version_current);
- prt_str(out, ")");
- return -BCH_ERR_invalid_sb_version;
- }
-
- if (version_min > version) {
- prt_str(out, "Bad minimum version ");
- bch2_version_to_text(out, version_min);
- prt_str(out, ", greater than version field ");
- bch2_version_to_text(out, version);
- return -BCH_ERR_invalid_sb_version;
- }
-
- return 0;
-}
-
-int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
- enum bch_validate_flags flags, struct printbuf *out)
-{
- struct bch_sb_field_members_v1 *mi;
- enum bch_opt_id opt_id;
- int ret;
-
- ret = bch2_sb_compatible(sb, out);
- if (ret)
- return ret;
-
- u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR);
- unsigned incompat_bit = 0;
- if (incompat)
- incompat_bit = __ffs64(incompat);
- else if (sb->features[1])
- incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1]));
-
- if (incompat_bit) {
- prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)",
- incompat_bit,
- bch2_sb_features[BCH_FEATURE_NR - 1],
- BCH_FEATURE_NR - 1);
- return -BCH_ERR_invalid_sb_features;
- }
-
- if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
- BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
- prt_str(out, "Filesystem has incompatible version ");
- bch2_version_to_text(out, le16_to_cpu(sb->version));
- prt_str(out, ", current version ");
- bch2_version_to_text(out, bcachefs_metadata_version_current);
- return -BCH_ERR_invalid_sb_features;
- }
-
- if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
- prt_printf(out, "Bad user UUID (got zeroes)");
- return -BCH_ERR_invalid_sb_uuid;
- }
-
- if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) {
- prt_printf(out, "Bad internal UUID (got zeroes)");
- return -BCH_ERR_invalid_sb_uuid;
- }
-
- if (!(flags & BCH_VALIDATE_write) &&
- le64_to_cpu(sb->offset) != read_offset) {
- prt_printf(out, "Bad sb offset (got %llu, read from %llu)",
- le64_to_cpu(sb->offset), read_offset);
- return -BCH_ERR_invalid_sb_offset;
- }
-
- if (!sb->nr_devices ||
- sb->nr_devices > BCH_SB_MEMBERS_MAX) {
- prt_printf(out, "Bad number of member devices %u (max %u)",
- sb->nr_devices, BCH_SB_MEMBERS_MAX);
- return -BCH_ERR_invalid_sb_too_many_members;
- }
-
- if (sb->dev_idx >= sb->nr_devices) {
- prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)",
- sb->dev_idx, sb->nr_devices);
- return -BCH_ERR_invalid_sb_dev_idx;
- }
-
- if (!sb->time_precision ||
- le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) {
- prt_printf(out, "Invalid time precision: %u (min 1, max %lu)",
- le32_to_cpu(sb->time_precision), NSEC_PER_SEC);
- return -BCH_ERR_invalid_sb_time_precision;
- }
-
- /* old versions didn't know to downgrade this field */
- if (BCH_SB_VERSION_INCOMPAT_ALLOWED(sb) > le16_to_cpu(sb->version))
- SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, le16_to_cpu(sb->version));
-
- if (BCH_SB_VERSION_INCOMPAT(sb) > BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)) {
- prt_printf(out, "Invalid version_incompat ");
- bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb));
- prt_str(out, " > incompat_allowed ");
- bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb));
- if (flags & BCH_VALIDATE_write)
- return -BCH_ERR_invalid_sb_version;
- else
- SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb));
- }
-
- if (!flags) {
- /*
- * Been seeing a bug where these are getting inexplicably
- * zeroed, so we're now validating them, but we have to be
- * careful not to prevent people's filesystems from mounting:
- */
- if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
- SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
- if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
- SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000);
-
- if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb))
- SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version));
-
- if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 &&
- !BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb))
- SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30);
-
- if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2)
- SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
-
- if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
- SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);
-
- if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags &&
- !BCH_SB_CSUM_ERR_RETRY_NR(sb))
- SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3);
- }
-
-#ifdef __KERNEL__
- if (!BCH_SB_SHARD_INUMS_NBITS(sb))
- SET_BCH_SB_SHARD_INUMS_NBITS(sb, ilog2(roundup_pow_of_two(num_online_cpus())));
-#endif
-
- for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
- const struct bch_option *opt = bch2_opt_table + opt_id;
-
- if (opt->get_sb) {
- u64 v = bch2_opt_from_sb(sb, opt_id, -1);
-
- prt_printf(out, "Invalid option ");
- ret = bch2_opt_validate(opt, v, out);
- if (ret)
- return ret;
-
- printbuf_reset(out);
- }
- }
-
- /* validate layout */
- ret = validate_sb_layout(&sb->layout, out);
- if (ret)
- return ret;
-
- vstruct_for_each(sb, f) {
- if (!f->u64s) {
- prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)",
- le32_to_cpu(f->type));
- return -BCH_ERR_invalid_sb_field_size;
- }
-
- if (vstruct_next(f) > vstruct_last(sb)) {
- prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)",
- le32_to_cpu(f->type));
- return -BCH_ERR_invalid_sb_field_size;
- }
- }
-
- /* members must be validated first: */
- mi = bch2_sb_field_get(sb, members_v1);
- if (!mi) {
- prt_printf(out, "Invalid superblock: member info area missing");
- return -BCH_ERR_invalid_sb_members_missing;
- }
-
- ret = bch2_sb_field_validate(sb, &mi->field, flags, out);
- if (ret)
- return ret;
-
- vstruct_for_each(sb, f) {
- if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1)
- continue;
-
- ret = bch2_sb_field_validate(sb, f, flags, out);
- if (ret)
- return ret;
- }
-
- if ((flags & BCH_VALIDATE_write) &&
- bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) {
- prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu",
- le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq),
- le64_to_cpu(sb->seq));
- return -BCH_ERR_invalid_sb_members_missing;
- }
-
- return 0;
-}
-
-/* device open: */
-
-static unsigned long le_ulong_to_cpu(unsigned long v)
-{
- return sizeof(unsigned long) == 8
- ? le64_to_cpu(v)
- : le32_to_cpu(v);
-}
-
-static void le_bitvector_to_cpu(unsigned long *dst, unsigned long *src, unsigned nr)
-{
- BUG_ON(nr & (BITS_PER_TYPE(long) - 1));
-
- for (unsigned i = 0; i < BITS_TO_LONGS(nr); i++)
- dst[i] = le_ulong_to_cpu(src[i]);
-}
-
-static void bch2_sb_update(struct bch_fs *c)
-{
- struct bch_sb *src = c->disk_sb.sb;
-
- lockdep_assert_held(&c->sb_lock);
-
- c->sb.uuid = src->uuid;
- c->sb.user_uuid = src->user_uuid;
- c->sb.version = le16_to_cpu(src->version);
- c->sb.version_incompat = BCH_SB_VERSION_INCOMPAT(src);
- c->sb.version_incompat_allowed
- = BCH_SB_VERSION_INCOMPAT_ALLOWED(src);
- c->sb.version_min = le16_to_cpu(src->version_min);
- c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src);
- c->sb.nr_devices = src->nr_devices;
- c->sb.clean = BCH_SB_CLEAN(src);
- c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src);
-
- c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision);
- c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit;
-
- /* XXX this is wrong, we need a 96 or 128 bit integer type */
- c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo),
- c->sb.nsec_per_time_unit);
- c->sb.time_base_hi = le32_to_cpu(src->time_base_hi);
-
- c->sb.features = le64_to_cpu(src->features[0]);
- c->sb.compat = le64_to_cpu(src->compat[0]);
-
- memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent));
-
- struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext);
- if (ext) {
- le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
- sizeof(c->sb.errors_silent) * 8);
- c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data);
- }
-
- for_each_member_device(c, ca) {
- struct bch_member m = bch2_sb_member_get(src, ca->dev_idx);
- ca->mi = bch2_mi_to_cpu(&m);
- }
-}
-
-static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
-{
- struct bch_sb_field *src_f, *dst_f;
- struct bch_sb *dst = dst_handle->sb;
- unsigned i;
-
- dst->version = src->version;
- dst->version_min = src->version_min;
- dst->seq = src->seq;
- dst->uuid = src->uuid;
- dst->user_uuid = src->user_uuid;
- memcpy(dst->label, src->label, sizeof(dst->label));
-
- dst->block_size = src->block_size;
- dst->nr_devices = src->nr_devices;
-
- dst->time_base_lo = src->time_base_lo;
- dst->time_base_hi = src->time_base_hi;
- dst->time_precision = src->time_precision;
- dst->write_time = src->write_time;
-
- memcpy(dst->flags, src->flags, sizeof(dst->flags));
- memcpy(dst->features, src->features, sizeof(dst->features));
- memcpy(dst->compat, src->compat, sizeof(dst->compat));
-
- for (i = 0; i < BCH_SB_FIELD_NR; i++) {
- int d;
-
- if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS)
- continue;
-
- src_f = bch2_sb_field_get_id(src, i);
- dst_f = bch2_sb_field_get_id(dst, i);
-
- d = (src_f ? le32_to_cpu(src_f->u64s) : 0) -
- (dst_f ? le32_to_cpu(dst_f->u64s) : 0);
- if (d > 0) {
- int ret = bch2_sb_realloc(dst_handle,
- le32_to_cpu(dst_handle->sb->u64s) + d);
-
- if (ret)
- return ret;
-
- dst = dst_handle->sb;
- dst_f = bch2_sb_field_get_id(dst, i);
- }
-
- dst_f = __bch2_sb_field_resize(dst_handle, dst_f,
- src_f ? le32_to_cpu(src_f->u64s) : 0);
-
- if (src_f)
- memcpy(dst_f, src_f, vstruct_bytes(src_f));
- }
-
- return 0;
-}
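
/*
 * Worth noting in __copy_super() above: after bch2_sb_realloc() both dst and
 * dst_f are recomputed, because growing an allocation can move it and leave
 * old pointers into it dangling. A tiny standalone illustration of the same
 * rule, using plain libc (nothing bcachefs-specific):
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char *buf = malloc(16);
	if (!buf)
		return 1;
	strcpy(buf, "field");

	char *field = buf;			/* pointer into the allocation */

	char *n = realloc(buf, 1 << 20);	/* may move the block */
	if (!n)
		return 1;
	buf = n;
	field = buf;				/* must refresh: old value may dangle */

	printf("%s\n", field);
	free(buf);
	return 0;
}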
-
-int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
-{
- int ret;
-
- lockdep_assert_held(&c->sb_lock);
-
- ret = bch2_sb_realloc(&c->disk_sb, 0) ?:
- __copy_super(&c->disk_sb, src) ?:
- bch2_sb_replicas_to_cpu_replicas(c) ?:
- bch2_sb_disk_groups_to_cpu(c);
- if (ret)
- return ret;
-
- bch2_sb_update(c);
- return 0;
-}
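
/*
 * The "ret = a() ?: b() ?: ..." chain above uses the GNU C conditional with
 * an omitted middle operand: "x ?: y" yields x when x is nonzero (evaluating
 * x only once), else y - so the chain runs each step until the first nonzero
 * error code. A minimal sketch of the idiom (compile with gcc or clang):
 */
#include <stdio.h>

static int step_one(void)   { puts("step one");   return 0;  }
static int step_two(void)   { puts("step two");   return -5; }	/* fails */
static int step_three(void) { puts("step three"); return 0;  }

int main(void)
{
	/* step_three() never runs: the chain stops at the first error */
	int ret = step_one() ?: step_two() ?: step_three();

	printf("ret = %d\n", ret);
	return 0;
}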
-
-int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
-{
- return __copy_super(&ca->disk_sb, c->disk_sb.sb);
-}
-
-/* read superblock: */
-
-static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err)
-{
- size_t bytes;
- int ret;
-reread:
- bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
- sb->bio->bi_iter.bi_sector = offset;
- bch2_bio_map(sb->bio, sb->sb, sb->buffer_size);
-
- ret = submit_bio_wait(sb->bio);
- if (ret) {
- prt_printf(err, "IO error: %i", ret);
- return ret;
- }
-
- if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) &&
- !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) {
- prt_str(err, "Not a bcachefs superblock (got magic ");
- pr_uuid(err, sb->sb->magic.b);
- prt_str(err, ")");
- return -BCH_ERR_invalid_sb_magic;
- }
-
- ret = bch2_sb_compatible(sb->sb, err);
- if (ret)
- return ret;
-
- bytes = vstruct_bytes(sb->sb);
-
- u64 sb_size = 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits);
- if (bytes > sb_size) {
- prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %llu)",
- bytes, sb_size);
- return -BCH_ERR_invalid_sb_too_big;
- }
-
- if (bytes > sb->buffer_size) {
- ret = bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s));
- if (ret)
- return ret;
- goto reread;
- }
-
- enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb);
- if (csum_type >= BCH_CSUM_NR ||
- bch2_csum_type_is_encryption(csum_type)) {
- prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
- return -BCH_ERR_invalid_sb_csum_type;
- }
-
- /* XXX: verify MACs */
- struct bch_csum csum = csum_vstruct(NULL, csum_type, null_nonce(), sb->sb);
- if (bch2_crc_cmp(csum, sb->sb->csum)) {
- bch2_csum_err_msg(err, csum_type, sb->sb->csum, csum);
- return -BCH_ERR_invalid_sb_csum;
- }
-
- sb->seq = le64_to_cpu(sb->sb->seq);
-
- return 0;
-}
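
/*
 * The reread: loop above is a "probe, grow, retry" pattern for variable-size
 * on-disk structures: read into a fixed-size buffer first, and if the header
 * says the structure is bigger, reallocate and read again. A self-contained
 * userspace sketch of the same shape, with an in-memory "disk" standing in
 * for the bio submission (all names here are illustrative):
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct header { unsigned total_bytes; };

/* Simulated device: 64 bytes whose leading header gives the true size */
static unsigned char disk[64];

static int read_at(void *buf, size_t len, size_t off)
{
	if (off + len > sizeof(disk))
		return -1;
	memcpy(buf, disk + off, len);
	return 0;
}

static int read_variable_header(void **buf, size_t *bufsize, size_t off)
{
	for (;;) {
		int ret = read_at(*buf, *bufsize, off);
		if (ret)
			return ret;

		size_t want = ((struct header *) *buf)->total_bytes;
		if (want <= *bufsize)
			return 0;		/* whole structure read */

		void *n = realloc(*buf, want);	/* grow, then reread */
		if (!n)
			return -1;
		*buf = n;
		*bufsize = want;
	}
}

int main(void)
{
	((struct header *) disk)->total_bytes = 48;

	size_t bufsize = sizeof(struct header);
	void *buf = malloc(bufsize);
	if (!buf)
		return 1;

	int ret = read_variable_header(&buf, &bufsize, 0);
	printf("ret=%d bufsize=%zu\n", ret, bufsize);
	free(buf);
	return 0;
}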
-
-static int __bch2_read_super(const char *path, struct bch_opts *opts,
- struct bch_sb_handle *sb, bool ignore_notbchfs_msg)
-{
- u64 offset = opt_get(*opts, sb);
- struct bch_sb_layout layout;
- struct printbuf err = PRINTBUF;
- struct printbuf err2 = PRINTBUF;
- __le64 *i;
- int ret;
-#ifndef __KERNEL__
-retry:
-#endif
- memset(sb, 0, sizeof(*sb));
- sb->mode = BLK_OPEN_READ;
- sb->have_bio = true;
- sb->holder = kzalloc(sizeof(*sb->holder), GFP_KERNEL);
- if (!sb->holder)
- return -ENOMEM;
-
- sb->sb_name = kstrdup(path, GFP_KERNEL);
- if (!sb->sb_name) {
- ret = -ENOMEM;
- prt_printf(&err, "error allocating memory for sb_name");
- goto err;
- }
-
-#ifndef __KERNEL__
- if (opt_get(*opts, direct_io) == false)
- sb->mode |= BLK_OPEN_BUFFERED;
-#endif
-
- if (!opt_get(*opts, noexcl))
- sb->mode |= BLK_OPEN_EXCL;
-
- if (!opt_get(*opts, nochanges))
- sb->mode |= BLK_OPEN_WRITE;
-
- sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (IS_ERR(sb->s_bdev_file) &&
- PTR_ERR(sb->s_bdev_file) == -EACCES &&
- opt_get(*opts, read_only)) {
- sb->mode &= ~BLK_OPEN_WRITE;
-
- sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (!IS_ERR(sb->s_bdev_file))
- opt_set(*opts, nochanges, true);
- }
-
- if (IS_ERR(sb->s_bdev_file)) {
- ret = PTR_ERR(sb->s_bdev_file);
- prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret));
- goto err;
- }
- sb->bdev = file_bdev(sb->s_bdev_file);
-
- ret = bch2_sb_realloc(sb, 0);
- if (ret) {
- prt_printf(&err, "error allocating memory for superblock");
- goto err;
- }
-
- if (bch2_fs_init_fault("read_super")) {
- prt_printf(&err, "dynamic fault");
- ret = -EFAULT;
- goto err;
- }
-
- ret = read_one_super(sb, offset, &err);
- if (!ret)
- goto got_super;
-
- if (opt_defined(*opts, sb))
- goto err;
-
- prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n",
- path, err.buf);
- if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg)
- bch2_print_opts(opts, KERN_INFO "%s", err2.buf);
- else
- bch2_print_opts(opts, KERN_ERR "%s", err2.buf);
-
- printbuf_exit(&err2);
- printbuf_reset(&err);
-
- /*
- * Error reading primary superblock - read location of backup
- * superblocks:
- */
- bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
- sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
- /*
- * use sb buffer to read layout, since sb buffer is page aligned but
- * layout won't be:
- */
- bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout));
-
- ret = submit_bio_wait(sb->bio);
- if (ret) {
- prt_printf(&err, "IO error: %i", ret);
- goto err;
- }
-
- memcpy(&layout, sb->sb, sizeof(layout));
- ret = validate_sb_layout(&layout, &err);
- if (ret)
- goto err;
-
- for (i = layout.sb_offset;
- i < layout.sb_offset + layout.nr_superblocks; i++) {
- offset = le64_to_cpu(*i);
-
- if (offset == opt_get(*opts, sb)) {
- ret = -BCH_ERR_invalid;
- continue;
- }
-
- ret = read_one_super(sb, offset, &err);
- if (!ret)
- goto got_super;
- }
-
- goto err;
-
-got_super:
- if (le16_to_cpu(sb->sb->block_size) << 9 <
- bdev_logical_block_size(sb->bdev) &&
- opt_get(*opts, direct_io)) {
-#ifndef __KERNEL__
- opt_set(*opts, direct_io, false);
- bch2_free_super(sb);
- goto retry;
-#endif
- prt_printf(&err, "block size (%u) smaller than device block size (%u)",
- le16_to_cpu(sb->sb->block_size) << 9,
- bdev_logical_block_size(sb->bdev));
- ret = -BCH_ERR_block_size_too_small;
- goto err;
- }
-
- sb->have_layout = true;
-
- ret = bch2_sb_validate(sb->sb, offset, 0, &err);
- if (ret) {
- bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
- path, err.buf);
- goto err_no_print;
- }
-out:
- printbuf_exit(&err);
- return ret;
-err:
- bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
- path, err.buf);
-err_no_print:
- bch2_free_super(sb);
- goto out;
-}
-
-int bch2_read_super(const char *path, struct bch_opts *opts,
- struct bch_sb_handle *sb)
-{
- return __bch2_read_super(path, opts, sb, false);
-}
-
-/* provide a silenced version for mount.bcachefs */
-
-int bch2_read_super_silent(const char *path, struct bch_opts *opts,
- struct bch_sb_handle *sb)
-{
- return __bch2_read_super(path, opts, sb, true);
-}
-
-/* write superblock: */
-
-static void write_super_endio(struct bio *bio)
-{
- struct bch_dev *ca = bio->bi_private;
-
- bch2_account_io_success_fail(ca, bio_data_dir(bio), !bio->bi_status);
-
- /* XXX: return errors directly */
-
- if (bio->bi_status) {
- bch_err_dev_ratelimited(ca, "superblock %s error: %s",
- str_write_read(bio_data_dir(bio)),
- bch2_blk_status_to_str(bio->bi_status));
- ca->sb_write_error = 1;
- }
-
- closure_put(&ca->fs->sb_write);
- percpu_ref_put(&ca->io_ref);
-}
-
-static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bch_sb *sb = ca->disk_sb.sb;
- struct bio *bio = ca->disk_sb.bio;
-
- memset(ca->sb_read_scratch, 0, BCH_SB_READ_SCRATCH_BUF_SIZE);
-
- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
- bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]);
- bio->bi_end_io = write_super_endio;
- bio->bi_private = ca;
- bch2_bio_map(bio, ca->sb_read_scratch, BCH_SB_READ_SCRATCH_BUF_SIZE);
-
- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio));
-
- percpu_ref_get(&ca->io_ref);
- closure_bio_submit(bio, &c->sb_write);
-}
-
-static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
-{
- struct bch_sb *sb = ca->disk_sb.sb;
- struct bio *bio = ca->disk_sb.bio;
-
- sb->offset = sb->layout.sb_offset[idx];
-
- SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false));
- sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
- null_nonce(), sb);
-
- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
- bio->bi_iter.bi_sector = le64_to_cpu(sb->offset);
- bio->bi_end_io = write_super_endio;
- bio->bi_private = ca;
- bch2_bio_map(bio, sb,
- roundup((size_t) vstruct_bytes(sb),
- bdev_logical_block_size(ca->disk_sb.bdev)));
-
- this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb],
- bio_sectors(bio));
-
- percpu_ref_get(&ca->io_ref);
- closure_bio_submit(bio, &c->sb_write);
-}
-
-int bch2_write_super(struct bch_fs *c)
-{
- struct closure *cl = &c->sb_write;
- struct printbuf err = PRINTBUF;
- unsigned sb = 0, nr_wrote;
- struct bch_devs_mask sb_written;
- bool wrote, can_mount_without_written, can_mount_with_written;
- unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
- DARRAY(struct bch_dev *) online_devices = {};
- int ret = 0;
-
- trace_and_count(c, write_super, c, _RET_IP_);
-
- if (c->opts.very_degraded)
- degraded_flags |= BCH_FORCE_IF_LOST;
-
- lockdep_assert_held(&c->sb_lock);
-
- closure_init_stack(cl);
- memset(&sb_written, 0, sizeof(sb_written));
-
- for_each_online_member(c, ca) {
- ret = darray_push(&online_devices, ca);
- if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) {
- percpu_ref_put(&ca->io_ref);
- goto out;
- }
- percpu_ref_get(&ca->io_ref);
- }
-
- /* Make sure we're using the new magic numbers: */
- c->disk_sb.sb->magic = BCHFS_MAGIC;
- c->disk_sb.sb->layout.magic = BCHFS_MAGIC;
-
- le64_add_cpu(&c->disk_sb.sb->seq, 1);
-
- struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- darray_for_each(online_devices, ca)
- __bch2_members_v2_get_mut(mi, (*ca)->dev_idx)->seq = c->disk_sb.sb->seq;
- c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds());
-
- if (test_bit(BCH_FS_error, &c->flags))
- SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
- if (test_bit(BCH_FS_topology_error, &c->flags))
- SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1);
-
- SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
-
- bch2_sb_counters_from_cpu(c);
- bch2_sb_members_from_cpu(c);
- bch2_sb_members_cpy_v2_v1(&c->disk_sb);
- bch2_sb_errors_from_cpu(c);
- bch2_sb_downgrade_update(c);
-
- darray_for_each(online_devices, ca)
- bch2_sb_from_fs(c, (*ca));
-
- darray_for_each(online_devices, ca) {
- printbuf_reset(&err);
-
- ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err);
- if (ret) {
- bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
- goto out;
- }
- }
-
- if (c->opts.nochanges)
- goto out;
-
- /*
- * Defer writing the superblock until filesystem initialization is
- * complete - don't write out a partly initialized superblock:
- */
- if (!BCH_SB_INITIALIZED(c->disk_sb.sb))
- goto out;
-
- if (le16_to_cpu(c->disk_sb.sb->version) > bcachefs_metadata_version_current) {
- struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "attempting to write superblock that wasn't version downgraded (");
- bch2_version_to_text(&buf, le16_to_cpu(c->disk_sb.sb->version));
- prt_str(&buf, " > ");
- bch2_version_to_text(&buf, bcachefs_metadata_version_current);
- prt_str(&buf, ")");
- bch2_fs_fatal_error(c, ": %s", buf.buf);
- printbuf_exit(&buf);
- return -BCH_ERR_sb_not_downgraded;
- }
-
- darray_for_each(online_devices, ca) {
- __set_bit((*ca)->dev_idx, sb_written.d);
- (*ca)->sb_write_error = 0;
- }
-
- darray_for_each(online_devices, ca)
- read_back_super(c, *ca);
- closure_sync(cl);
-
- darray_for_each(online_devices, cap) {
- struct bch_dev *ca = *cap;
-
- if (ca->sb_write_error)
- continue;
-
- if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
- struct printbuf buf = PRINTBUF;
- prt_char(&buf, ' ');
- prt_bdevname(&buf, ca->disk_sb.bdev);
- prt_printf(&buf,
- ": Superblock write was silently dropped! (seq %llu expected %llu)",
- le64_to_cpu(ca->sb_read_scratch->seq),
- ca->disk_sb.seq);
-
- if (c->opts.errors != BCH_ON_ERROR_continue &&
- c->opts.errors != BCH_ON_ERROR_fix_safe) {
- ret = -BCH_ERR_erofs_sb_err;
- bch2_fs_fatal_error(c, "%s", buf.buf);
- } else {
- bch_err(c, "%s", buf.buf);
- }
-
- printbuf_exit(&buf);
- }
-
- if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
- struct printbuf buf = PRINTBUF;
- prt_char(&buf, ' ');
- prt_bdevname(&buf, ca->disk_sb.bdev);
- prt_printf(&buf,
- ": Superblock modified by another process (seq %llu expected %llu)",
- le64_to_cpu(ca->sb_read_scratch->seq),
- ca->disk_sb.seq);
- bch2_fs_fatal_error(c, "%s", buf.buf);
- printbuf_exit(&buf);
- ret = -BCH_ERR_erofs_sb_err;
- }
- }
-
- if (ret)
- goto out;
-
- do {
- wrote = false;
- darray_for_each(online_devices, cap) {
- struct bch_dev *ca = *cap;
- if (!ca->sb_write_error &&
- sb < ca->disk_sb.sb->layout.nr_superblocks) {
- write_one_super(c, ca, sb);
- wrote = true;
- }
- }
- closure_sync(cl);
- sb++;
- } while (wrote);
-
- darray_for_each(online_devices, cap) {
- struct bch_dev *ca = *cap;
- if (ca->sb_write_error)
- __clear_bit(ca->dev_idx, sb_written.d);
- else
- ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq);
- }
-
- nr_wrote = dev_mask_nr(&sb_written);
-
- can_mount_with_written =
- bch2_have_enough_devs(c, sb_written, degraded_flags, false);
-
- for (unsigned i = 0; i < ARRAY_SIZE(sb_written.d); i++)
- sb_written.d[i] = ~sb_written.d[i];
-
- can_mount_without_written =
- bch2_have_enough_devs(c, sb_written, degraded_flags, false);
-
- /*
- * If we would be able to mount _without_ the devices we successfully
- * wrote superblocks to, we weren't able to write to enough devices:
- *
- * Exception: if we can mount without the successes because we haven't
- * written anything (new filesystem), we continue if we'd be able to
- * mount with the devices we did successfully write to:
- */
- if (bch2_fs_fatal_err_on(!nr_wrote ||
- !can_mount_with_written ||
- (can_mount_without_written &&
- !can_mount_with_written), c,
- ": Unable to write superblock to sufficient devices (from %ps)",
- (void *) _RET_IP_))
- ret = -BCH_ERR_erofs_sb_err;
-out:
- /* Make new options visible after they're persistent: */
- bch2_sb_update(c);
- darray_for_each(online_devices, ca)
- percpu_ref_put(&(*ca)->io_ref);
- darray_exit(&online_devices);
- printbuf_exit(&err);
- return ret;
-}
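
/*
 * A reduced sketch of the mount-ability check that ends bch2_write_super()
 * above: the same "do we have enough devices" predicate is asked of the
 * written mask and of its complement, and being mountable from the unwritten
 * devices alone is the telltale that too few superblocks went out.
 * have_enough() below is a stand-in that just counts bits; the real
 * bch2_have_enough_devs() weighs replicas and degraded flags:
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_DEVS		4
#define MIN_DEVS	3	/* toy redundancy requirement */

static bool have_enough(unsigned mask)
{
	return __builtin_popcount(mask) >= MIN_DEVS;
}

int main(void)
{
	unsigned written = 0x3;	/* superblock reached devices 0 and 1 only */
	unsigned unwritten = ~written & ((1U << NR_DEVS) - 1);

	bool can_mount_with    = have_enough(written);
	bool can_mount_without = have_enough(unwritten);

	printf("with=%d without=%d\n", can_mount_with, can_mount_without);

	/* The kernel also insists nr_wrote != 0; the essence is this test: */
	if (!can_mount_with)
		puts("not enough superblocks written: go read-only");
	else
		puts("superblock write ok");
	return 0;
}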
-
-void __bch2_check_set_feature(struct bch_fs *c, unsigned feat)
-{
- mutex_lock(&c->sb_lock);
- if (!(c->sb.features & (1ULL << feat))) {
- c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat);
-
- bch2_write_super(c);
- }
- mutex_unlock(&c->sb_lock);
-}
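
/*
 * __bch2_check_set_feature() above is the slow half of a check/lock/re-check
 * pattern - its caller, bch2_check_set_feature() in super-io.h below, tests
 * the feature bit without the lock first, and the test is repeated here
 * under sb_lock so two racing callers don't both rewrite the superblock.
 * A pthread sketch of the same shape (the unlocked read mirrors the kernel's
 * benign race: the bit is only ever set, never cleared):
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t features;

static void slow_set_feature(unsigned feat)
{
	pthread_mutex_lock(&lock);
	if (!(features & (1ULL << feat))) {
		/* ... expensive persist step happens exactly once ... */
		features |= 1ULL << feat;
	}
	pthread_mutex_unlock(&lock);
}

static void check_set_feature(unsigned feat)
{
	if (!(features & (1ULL << feat)))	/* unlocked fast path */
		slow_set_feature(feat);		/* re-check under the lock */
}

int main(void)
{
	check_set_feature(3);
	check_set_feature(3);	/* second call stays on the fast path */
	printf("features = %#jx\n", (uintmax_t) features);
	return 0;
}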
-
-/* Downgrade if superblock is at a higher version than currently supported: */
-bool bch2_check_version_downgrade(struct bch_fs *c)
-{
- bool ret = bcachefs_metadata_version_current < c->sb.version;
-
- lockdep_assert_held(&c->sb_lock);
-
- /*
- * Downgrade, if superblock is at a higher version than currently
- * supported:
- *
- * c->sb will be checked before we write the superblock, so update it as
- * well:
- */
- if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current)
- SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
- if (BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb) > bcachefs_metadata_version_current)
- SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, bcachefs_metadata_version_current);
- if (c->sb.version > bcachefs_metadata_version_current)
- c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
- if (c->sb.version_min > bcachefs_metadata_version_current)
- c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current);
- c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1);
- return ret;
-}
-
-void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat)
-{
- lockdep_assert_held(&c->sb_lock);
-
- if (BCH_VERSION_MAJOR(new_version) >
- BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
- bch2_sb_field_resize(&c->disk_sb, downgrade, 0);
-
- c->disk_sb.sb->version = cpu_to_le16(new_version);
-
- if (incompat) {
- c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
- SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
- max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version));
- }
-}
-
-static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- if (vstruct_bytes(f) < 88) {
- prt_printf(err, "field too small (%zu < %u)", vstruct_bytes(f), 88);
- return -BCH_ERR_invalid_sb_ext;
- }
-
- return 0;
-}
-
-static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_ext *e = field_to_type(f, ext);
-
- prt_printf(out, "Recovery passes required:\t");
- prt_bitflags(out, bch2_recovery_passes,
- bch2_recovery_passes_from_stable(le64_to_cpu(e->recovery_passes_required[0])));
- prt_newline(out);
-
- unsigned long *errors_silent = kmalloc(sizeof(e->errors_silent), GFP_KERNEL);
- if (errors_silent) {
- le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8);
-
- prt_printf(out, "Errors to silently fix:\t");
- prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent,
- min(BCH_FSCK_ERR_MAX, sizeof(e->errors_silent) * 8));
- prt_newline(out);
-
- kfree(errors_silent);
- }
-
- prt_printf(out, "Btrees with missing data:\t");
- prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data));
- prt_newline(out);
-}
-
-static const struct bch_sb_field_ops bch_sb_field_ops_ext = {
- .validate = bch2_sb_ext_validate,
- .to_text = bch2_sb_ext_to_text,
-};
-
-static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
-#define x(f, nr) \
- [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
- BCH_SB_FIELDS()
-#undef x
-};
-
-static const struct bch_sb_field_ops bch2_sb_field_null_ops;
-
-static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type)
-{
- return likely(type < ARRAY_SIZE(bch2_sb_field_ops))
- ? bch2_sb_field_ops[type]
- : &bch2_sb_field_null_ops;
-}
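
/*
 * bch2_sb_field_type_ops() above backs an ops table with a shared, all-NULL
 * fallback so callers can test "if (ops->validate)" without range-checking
 * the type themselves. The same shape as a standalone sketch:
 */
#include <stdio.h>

struct ops {
	int  (*validate)(int);
	void (*print)(int);
};

static int  validate_a(int v) { return v < 0; }
static void print_a(int v)    { printf("type 0: %d\n", v); }

static const struct ops ops_table[] = {
	[0] = { .validate = validate_a, .print = print_a },
	[1] = { 0 },	/* known type with no callbacks */
};

static const struct ops null_ops;	/* fallback for unknown types */

static const struct ops *type_ops(unsigned type)
{
	return type < sizeof(ops_table) / sizeof(ops_table[0])
		? &ops_table[type]
		: &null_ops;
}

int main(void)
{
	for (unsigned t = 0; t < 3; t++) {
		const struct ops *ops = type_ops(t);

		if (ops->print)
			ops->print(t);
		else
			printf("type %u: nothing to do\n", t);
	}
	return 0;
}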
-
-static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- unsigned type = le32_to_cpu(f->type);
- struct printbuf field_err = PRINTBUF;
- const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
- int ret;
-
- ret = ops->validate ? ops->validate(sb, f, flags, &field_err) : 0;
- if (ret) {
- prt_printf(err, "Invalid superblock section %s: %s",
- bch2_sb_fields[type], field_err.buf);
- prt_newline(err);
- bch2_sb_field_to_text(err, sb, f);
- }
-
- printbuf_exit(&field_err);
- return ret;
-}
-
-void __bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- unsigned type = le32_to_cpu(f->type);
- const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
-
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 32);
-
- if (ops->to_text)
- ops->to_text(out, sb, f);
-}
-
-void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- unsigned type = le32_to_cpu(f->type);
-
- if (type < BCH_SB_FIELD_NR)
- prt_printf(out, "%s", bch2_sb_fields[type]);
- else
- prt_printf(out, "(unknown field %u)", type);
-
- prt_printf(out, " (size %zu):", vstruct_bytes(f));
- prt_newline(out);
-
- __bch2_sb_field_to_text(out, sb, f);
-}
-
-void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
-{
- unsigned i;
-
- prt_printf(out, "Type: %u", l->layout_type);
- prt_newline(out);
-
- prt_str(out, "Superblock max size: ");
- prt_units_u64(out, 512 << l->sb_max_size_bits);
- prt_newline(out);
-
- prt_printf(out, "Nr superblocks: %u", l->nr_superblocks);
- prt_newline(out);
-
- prt_str(out, "Offsets: ");
- for (i = 0; i < l->nr_superblocks; i++) {
- if (i)
- prt_str(out, ", ");
- prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i]));
- }
- prt_newline(out);
-}
-
-void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
- bool print_layout, unsigned fields)
-{
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 44);
-
- prt_printf(out, "External UUID:\t");
- pr_uuid(out, sb->user_uuid.b);
- prt_newline(out);
-
- prt_printf(out, "Internal UUID:\t");
- pr_uuid(out, sb->uuid.b);
- prt_newline(out);
-
- prt_printf(out, "Magic number:\t");
- pr_uuid(out, sb->magic.b);
- prt_newline(out);
-
- prt_printf(out, "Device index:\t%u\n", sb->dev_idx);
-
- prt_printf(out, "Label:\t");
- if (!strlen(sb->label))
- prt_printf(out, "(none)");
- else
- prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label);
- prt_newline(out);
-
- prt_printf(out, "Version:\t");
- bch2_version_to_text(out, le16_to_cpu(sb->version));
- prt_newline(out);
-
- prt_printf(out, "Incompatible features allowed:\t");
- bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb));
- prt_newline(out);
-
- prt_printf(out, "Incompatible features in use:\t");
- bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb));
- prt_newline(out);
-
- prt_printf(out, "Version upgrade complete:\t");
- bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb));
- prt_newline(out);
-
- prt_printf(out, "Oldest version on disk:\t");
- bch2_version_to_text(out, le16_to_cpu(sb->version_min));
- prt_newline(out);
-
- prt_printf(out, "Created:\t");
- if (sb->time_base_lo)
- bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
- else
- prt_printf(out, "(not set)");
- prt_newline(out);
-
- prt_printf(out, "Sequence number:\t");
- prt_printf(out, "%llu", le64_to_cpu(sb->seq));
- prt_newline(out);
-
- prt_printf(out, "Time of last write:\t");
- bch2_prt_datetime(out, le64_to_cpu(sb->write_time));
- prt_newline(out);
-
- prt_printf(out, "Superblock size:\t");
- prt_units_u64(out, vstruct_bytes(sb));
- prt_str(out, "/");
- prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits);
- prt_newline(out);
-
- prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb));
- prt_printf(out, "Devices:\t%u\n", bch2_sb_nr_devices(sb));
-
- prt_printf(out, "Sections:\t");
- u64 fields_have = 0;
- vstruct_for_each(sb, f)
- fields_have |= 1 << le32_to_cpu(f->type);
- prt_bitflags(out, bch2_sb_fields, fields_have);
- prt_newline(out);
-
- prt_printf(out, "Features:\t");
- prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0]));
- prt_newline(out);
-
- prt_printf(out, "Compat features:\t");
- prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0]));
- prt_newline(out);
-
- prt_newline(out);
- prt_printf(out, "Options:");
- prt_newline(out);
- printbuf_indent_add(out, 2);
- {
- enum bch_opt_id id;
-
- for (id = 0; id < bch2_opts_nr; id++) {
- const struct bch_option *opt = bch2_opt_table + id;
-
- if (opt->get_sb) {
- u64 v = bch2_opt_from_sb(sb, id, -1);
-
- prt_printf(out, "%s:\t", opt->attr.name);
- bch2_opt_to_text(out, NULL, sb, opt, v,
- OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST);
- prt_newline(out);
- }
- }
- }
-
- printbuf_indent_sub(out, 2);
-
- if (print_layout) {
- prt_newline(out);
- prt_printf(out, "layout:");
- prt_newline(out);
- printbuf_indent_add(out, 2);
- bch2_sb_layout_to_text(out, &sb->layout);
- printbuf_indent_sub(out, 2);
- }
-
- vstruct_for_each(sb, f)
- if (fields & (1 << le32_to_cpu(f->type))) {
- prt_newline(out);
- bch2_sb_field_to_text(out, sb, f);
- }
-}
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
deleted file mode 100644
index 78f708a6fbcd..000000000000
--- a/fs/bcachefs/super-io.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUPER_IO_H
-#define _BCACHEFS_SUPER_IO_H
-
-#include "extents.h"
-#include "eytzinger.h"
-#include "super_types.h"
-#include "super.h"
-#include "sb-members.h"
-
-#include <asm/byteorder.h>
-
-#define BCH_SB_READ_SCRATCH_BUF_SIZE 4096
-
-static inline bool bch2_version_compatible(u16 version)
-{
- return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) &&
- version >= bcachefs_metadata_version_min;
-}
-
-void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version);
-enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version);
-
-int bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);
-
-static inline int bch2_request_incompat_feature(struct bch_fs *c,
- enum bcachefs_metadata_version version)
-{
- return likely(version <= c->sb.version_incompat)
- ? 0
- : bch2_set_version_incompat(c, version);
-}
-
-static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)
-{
- return le32_to_cpu(f->u64s) * sizeof(u64);
-}
-
-#define field_to_type(_f, _name) \
- container_of_or_null(_f, struct bch_sb_field_##_name, field)
-
-struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *, enum bch_sb_field_type);
-#define bch2_sb_field_get(_sb, _name) \
- field_to_type(bch2_sb_field_get_id(_sb, BCH_SB_FIELD_##_name), _name)
-
-struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *,
- enum bch_sb_field_type, unsigned);
-#define bch2_sb_field_resize(_sb, _name, _u64s) \
- field_to_type(bch2_sb_field_resize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name)
-
-struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *,
- enum bch_sb_field_type, unsigned);
-#define bch2_sb_field_get_minsize(_sb, _name, _u64s) \
- field_to_type(bch2_sb_field_get_minsize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name)
-
-#define bch2_sb_field_nr_entries(_f) \
- (_f ? ((bch2_sb_field_bytes(&_f->field) - sizeof(*_f)) / \
- sizeof(_f->entries[0])) \
- : 0)
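
/*
 * bch2_sb_field_nr_entries() above recovers the element count of a trailing
 * flexible array from the field's total size in bytes. The same arithmetic
 * on a standalone struct (LP64 sizes assumed in the example numbers):
 */
#include <stddef.h>
#include <stdio.h>

struct table {
	unsigned long hdr;
	unsigned long entries[];	/* flexible array member */
};

static size_t nr_entries(size_t total_bytes)
{
	return total_bytes >= sizeof(struct table)
		? (total_bytes - sizeof(struct table)) / sizeof(unsigned long)
		: 0;
}

int main(void)
{
	/* an 8-byte header followed by 8 entries -> 72 bytes total */
	printf("%zu entries\n",
	       nr_entries(sizeof(struct table) + 8 * sizeof(unsigned long)));
	return 0;
}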
-
-void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type);
-
-extern const char * const bch2_sb_fields[];
-
-struct bch_sb_field_ops {
- int (*validate)(struct bch_sb *, struct bch_sb_field *,
- enum bch_validate_flags, struct printbuf *);
- void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *);
-};
-
-static inline __le64 bch2_sb_magic(struct bch_fs *c)
-{
- __le64 ret;
-
- memcpy(&ret, &c->sb.uuid, sizeof(ret));
- return ret;
-}
-
-static inline __u64 jset_magic(struct bch_fs *c)
-{
- return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC);
-}
-
-static inline __u64 bset_magic(struct bch_fs *c)
-{
- return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC);
-}
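
/*
 * jset_magic()/bset_magic() above derive the magic numbers for journal and
 * btree blocks by XORing per-structure constants into the filesystem's own
 * magic, so an on-disk block identifies both its type and its filesystem.
 * A sketch of the trick with made-up constants (not the real JSET_MAGIC /
 * BSET_MAGIC values):
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_JSET_XOR	0x1111111111111111ULL
#define DEMO_BSET_XOR	0x2222222222222222ULL

int main(void)
{
	uint64_t fs_magic = 0x0123456789abcdefULL;	/* per-fs, from its uuid */

	/* A block stamped by a different filesystem can't match either: */
	printf("jset magic %016jx\n", (uintmax_t) (fs_magic ^ DEMO_JSET_XOR));
	printf("bset magic %016jx\n", (uintmax_t) (fs_magic ^ DEMO_BSET_XOR));
	return 0;
}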
-
-int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *);
-int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
-
-void bch2_free_super(struct bch_sb_handle *);
-int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
-
-int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *);
-
-int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
-int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
-int bch2_write_super(struct bch_fs *);
-void __bch2_check_set_feature(struct bch_fs *, unsigned);
-
-static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
-{
- if (!(c->sb.features & (1ULL << feat)))
- __bch2_check_set_feature(c, feat);
-}
-
-bool bch2_check_version_downgrade(struct bch_fs *);
-void bch2_sb_upgrade(struct bch_fs *, unsigned, bool);
-
-void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
- struct bch_sb_field *);
-void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
- struct bch_sb_field *);
-void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *);
-void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned);
-
-#endif /* _BCACHEFS_SUPER_IO_H */
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
deleted file mode 100644
index 99f9a0aaa380..000000000000
--- a/fs/bcachefs/super.c
+++ /dev/null
@@ -1,2265 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * bcachefs setup/teardown code, and some metadata io - read a superblock and
- * figure out what to do with it.
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "bkey_sort.h"
-#include "btree_cache.h"
-#include "btree_gc.h"
-#include "btree_journal_iter.h"
-#include "btree_key_cache.h"
-#include "btree_node_scan.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
-#include "btree_write_buffer.h"
-#include "buckets_waiting_for_journal.h"
-#include "chardev.h"
-#include "checksum.h"
-#include "clock.h"
-#include "compress.h"
-#include "debug.h"
-#include "disk_accounting.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "errcode.h"
-#include "error.h"
-#include "fs.h"
-#include "fs-io.h"
-#include "fs-io-buffered.h"
-#include "fs-io-direct.h"
-#include "fsck.h"
-#include "inode.h"
-#include "io_read.h"
-#include "io_write.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "journal_seq_blacklist.h"
-#include "move.h"
-#include "migrate.h"
-#include "movinggc.h"
-#include "nocow_locking.h"
-#include "quota.h"
-#include "rebalance.h"
-#include "recovery.h"
-#include "replicas.h"
-#include "sb-clean.h"
-#include "sb-counters.h"
-#include "sb-errors.h"
-#include "sb-members.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "super.h"
-#include "super-io.h"
-#include "sysfs.h"
-#include "thread_with_file.h"
-#include "trace.h"
-
-#include <linux/backing-dev.h>
-#include <linux/blkdev.h>
-#include <linux/debugfs.h>
-#include <linux/device.h>
-#include <linux/idr.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
-#include <linux/random.h>
-#include <linux/sysfs.h>
-#include <crypto/hash.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
-MODULE_DESCRIPTION("bcachefs filesystem");
-MODULE_SOFTDEP("pre: chacha20");
-MODULE_SOFTDEP("pre: poly1305");
-MODULE_SOFTDEP("pre: xxhash");
-
-const char * const bch2_fs_flag_strs[] = {
-#define x(n) #n,
- BCH_FS_FLAGS()
-#undef x
- NULL
-};
-
-void bch2_print_str(struct bch_fs *c, const char *str)
-{
-#ifdef __KERNEL__
- struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
-
- if (unlikely(stdio)) {
- bch2_stdio_redirect_printf(stdio, true, "%s", str);
- return;
- }
-#endif
- bch2_print_string_as_lines(KERN_ERR, str);
-}
-
-__printf(2, 0)
-static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
-{
-#ifdef __KERNEL__
- if (unlikely(stdio)) {
- if (fmt[0] == KERN_SOH[0])
- fmt += 2;
-
- bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
- return;
- }
-#endif
- vprintk(fmt, args);
-}
-
-void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)
-{
- struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;
-
- va_list args;
- va_start(args, fmt);
- bch2_print_maybe_redirect(stdio, fmt, args);
- va_end(args);
-}
-
-void __bch2_print(struct bch_fs *c, const char *fmt, ...)
-{
- struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
-
- va_list args;
- va_start(args, fmt);
- bch2_print_maybe_redirect(stdio, fmt, args);
- va_end(args);
-}
-
-#define KTYPE(type) \
-static const struct attribute_group type ## _group = { \
- .attrs = type ## _files \
-}; \
- \
-static const struct attribute_group *type ## _groups[] = { \
- &type ## _group, \
- NULL \
-}; \
- \
-static const struct kobj_type type ## _ktype = { \
- .release = type ## _release, \
- .sysfs_ops = &type ## _sysfs_ops, \
- .default_groups = type ## _groups \
-}
-
-static void bch2_fs_release(struct kobject *);
-static void bch2_dev_release(struct kobject *);
-static void bch2_fs_counters_release(struct kobject *k)
-{
-}
-
-static void bch2_fs_internal_release(struct kobject *k)
-{
-}
-
-static void bch2_fs_opts_dir_release(struct kobject *k)
-{
-}
-
-static void bch2_fs_time_stats_release(struct kobject *k)
-{
-}
-
-KTYPE(bch2_fs);
-KTYPE(bch2_fs_counters);
-KTYPE(bch2_fs_internal);
-KTYPE(bch2_fs_opts_dir);
-KTYPE(bch2_fs_time_stats);
-KTYPE(bch2_dev);
-
-static struct kset *bcachefs_kset;
-static LIST_HEAD(bch_fs_list);
-static DEFINE_MUTEX(bch_fs_list_lock);
-
-DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);
-
-static void bch2_dev_unlink(struct bch_dev *);
-static void bch2_dev_free(struct bch_dev *);
-static int bch2_dev_alloc(struct bch_fs *, unsigned);
-static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
-static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
-
-struct bch_fs *bch2_dev_to_fs(dev_t dev)
-{
- struct bch_fs *c;
-
- mutex_lock(&bch_fs_list_lock);
- rcu_read_lock();
-
- list_for_each_entry(c, &bch_fs_list, list)
- for_each_member_device_rcu(c, ca, NULL)
- if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
- closure_get(&c->cl);
- goto found;
- }
- c = NULL;
-found:
- rcu_read_unlock();
- mutex_unlock(&bch_fs_list_lock);
-
- return c;
-}
-
-static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
-{
- struct bch_fs *c;
-
- lockdep_assert_held(&bch_fs_list_lock);
-
- list_for_each_entry(c, &bch_fs_list, list)
- if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid)))
- return c;
-
- return NULL;
-}
-
-struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
-{
- struct bch_fs *c;
-
- mutex_lock(&bch_fs_list_lock);
- c = __bch2_uuid_to_fs(uuid);
- if (c)
- closure_get(&c->cl);
- mutex_unlock(&bch_fs_list_lock);
-
- return c;
-}
-
-/* Filesystem RO/RW: */
-
-/*
- * For startup/shutdown of RW stuff, the dependencies are:
- *
- * - foreground writes depend on copygc and rebalance (to free up space)
- *
- * - copygc and rebalance depend on mark and sweep gc (they actually probably
- * don't because they either reserve ahead of time or don't block if
- * allocations fail, but allocations can require mark and sweep gc to run
- * because of generation number wraparound)
- *
- * - all of the above depends on the allocator threads
- *
- * - allocator depends on the journal (when it rewrites prios and gens)
- */
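
/*
 * The dependency list above is what dictates the stop order in
 * __bch2_fs_read_only() below: consumers are shut down before the services
 * they depend on. The shape of that teardown, reduced to a table of stop
 * hooks run front to back (names are illustrative, not the actual hooks):
 */
#include <stdio.h>

static void stop_foreground(void) { puts("stop foreground writes"); }
static void stop_copygc(void)     { puts("stop copygc/rebalance");  }
static void stop_allocator(void)  { puts("stop allocators");        }
static void stop_journal(void)    { puts("stop journal");           }

/* Ordered so nothing is stopped while something earlier still needs it: */
static void (*const stop_order[])(void) = {
	stop_foreground,
	stop_copygc,
	stop_allocator,
	stop_journal,
};

int main(void)
{
	for (unsigned i = 0; i < sizeof(stop_order) / sizeof(stop_order[0]); i++)
		stop_order[i]();
	return 0;
}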
-
-static void __bch2_fs_read_only(struct bch_fs *c)
-{
- unsigned clean_passes = 0;
- u64 seq = 0;
-
- bch2_fs_ec_stop(c);
- bch2_open_buckets_stop(c, NULL, true);
- bch2_rebalance_stop(c);
- bch2_copygc_stop(c);
- bch2_fs_ec_flush(c);
-
- bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",
- journal_cur_seq(&c->journal));
-
- do {
- clean_passes++;
-
- if (bch2_btree_interior_updates_flush(c) ||
- bch2_btree_write_buffer_flush_going_ro(c) ||
- bch2_journal_flush_all_pins(&c->journal) ||
- bch2_btree_flush_all_writes(c) ||
- seq != atomic64_read(&c->journal.seq)) {
- seq = atomic64_read(&c->journal.seq);
- clean_passes = 0;
- }
- } while (clean_passes < 2);
-
- bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu",
- journal_cur_seq(&c->journal));
-
- if (test_bit(JOURNAL_replay_done, &c->journal.flags) &&
- !test_bit(BCH_FS_emergency_ro, &c->flags))
- set_bit(BCH_FS_clean_shutdown, &c->flags);
-
- bch2_fs_journal_stop(&c->journal);
-
- bch_info(c, "%sclean shutdown complete, journal seq %llu",
- test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un",
- c->journal.seq_ondisk);
-
- /*
- * After stopping journal:
- */
- for_each_member_device(c, ca)
- bch2_dev_allocator_remove(c, ca);
-}
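
/*
 * The clean_passes loop above keeps flushing until it sees two consecutive
 * passes with no new work: one flush can generate more work (journal pins,
 * btree writes), so a single quiet pass isn't proof of quiescence. The bare
 * pattern as a sketch, with a counter standing in for the real flush calls:
 */
#include <stdbool.h>
#include <stdio.h>

static int pending = 5;		/* simulated outstanding work items */

static bool flush_some_work(void)
{
	if (!pending)
		return false;	/* nothing flushed this pass */
	pending--;
	return true;
}

int main(void)
{
	unsigned clean_passes = 0, passes = 0;

	do {
		clean_passes++;
		passes++;

		/* Any progress resets the count of consecutive clean passes */
		if (flush_some_work())
			clean_passes = 0;
	} while (clean_passes < 2);

	printf("quiescent after %u passes\n", passes);
	return 0;
}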
-
-#ifndef BCH_WRITE_REF_DEBUG
-static void bch2_writes_disabled(struct percpu_ref *writes)
-{
- struct bch_fs *c = container_of(writes, struct bch_fs, writes);
-
- set_bit(BCH_FS_write_disable_complete, &c->flags);
- wake_up(&bch2_read_only_wait);
-}
-#endif
-
-void bch2_fs_read_only(struct bch_fs *c)
-{
- if (!test_bit(BCH_FS_rw, &c->flags)) {
- bch2_journal_reclaim_stop(&c->journal);
- return;
- }
-
- BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags));
-
- bch_verbose(c, "going read-only");
-
- /*
- * Block new foreground-end write operations from starting - any new
- * writes will return -EROFS:
- */
- set_bit(BCH_FS_going_ro, &c->flags);
-#ifndef BCH_WRITE_REF_DEBUG
- percpu_ref_kill(&c->writes);
-#else
- for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
- bch2_write_ref_put(c, i);
-#endif
-
- /*
- * If we're not doing an emergency shutdown, we want to wait on
- * outstanding writes to complete so they don't see spurious errors due
- * to shutting down the allocator:
- *
- * If we are doing an emergency shutdown outstanding writes may
- * hang until we shutdown the allocator so we don't want to wait
- * on outstanding writes before shutting everything down - but
- * we do need to wait on them before returning and signalling
- * that going RO is complete:
- */
- wait_event(bch2_read_only_wait,
- test_bit(BCH_FS_write_disable_complete, &c->flags) ||
- test_bit(BCH_FS_emergency_ro, &c->flags));
-
- bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags);
- if (writes_disabled)
- bch_verbose(c, "finished waiting for writes to stop");
-
- __bch2_fs_read_only(c);
-
- wait_event(bch2_read_only_wait,
- test_bit(BCH_FS_write_disable_complete, &c->flags));
-
- if (!writes_disabled)
- bch_verbose(c, "finished waiting for writes to stop");
-
- clear_bit(BCH_FS_write_disable_complete, &c->flags);
- clear_bit(BCH_FS_going_ro, &c->flags);
- clear_bit(BCH_FS_rw, &c->flags);
-
- if (!bch2_journal_error(&c->journal) &&
- !test_bit(BCH_FS_error, &c->flags) &&
- !test_bit(BCH_FS_emergency_ro, &c->flags) &&
- test_bit(BCH_FS_started, &c->flags) &&
- test_bit(BCH_FS_clean_shutdown, &c->flags) &&
- c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) {
- BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
- BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty));
- BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
- BUG_ON(c->btree_write_buffer.inc.keys.nr);
- BUG_ON(c->btree_write_buffer.flushing.keys.nr);
- bch2_verify_accounting_clean(c);
-
- bch_verbose(c, "marking filesystem clean");
- bch2_fs_mark_clean(c);
- } else {
- bch_verbose(c, "done going read-only, filesystem not clean");
- }
-}
-
-static void bch2_fs_read_only_work(struct work_struct *work)
-{
- struct bch_fs *c =
- container_of(work, struct bch_fs, read_only_work);
-
- down_write(&c->state_lock);
- bch2_fs_read_only(c);
- up_write(&c->state_lock);
-}
-
-static void bch2_fs_read_only_async(struct bch_fs *c)
-{
- queue_work(system_long_wq, &c->read_only_work);
-}
-
-bool bch2_fs_emergency_read_only(struct bch_fs *c)
-{
- bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
-
- bch2_journal_halt(&c->journal);
- bch2_fs_read_only_async(c);
-
- wake_up(&bch2_read_only_wait);
- return ret;
-}
-
-bool bch2_fs_emergency_read_only_locked(struct bch_fs *c)
-{
- bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
-
- bch2_journal_halt_locked(&c->journal);
- bch2_fs_read_only_async(c);
-
- wake_up(&bch2_read_only_wait);
- return ret;
-}
-
-static int bch2_fs_read_write_late(struct bch_fs *c)
-{
- int ret;
-
- /*
- * Data move operations can't run until after check_snapshots has
- * completed, and bch2_snapshot_is_ancestor() is available.
- *
- * Ideally we'd start copygc/rebalance earlier instead of waiting for
- * all of recovery/fsck to complete:
- */
- ret = bch2_copygc_start(c);
- if (ret) {
- bch_err(c, "error starting copygc thread");
- return ret;
- }
-
- ret = bch2_rebalance_start(c);
- if (ret) {
- bch_err(c, "error starting rebalance thread");
- return ret;
- }
-
- return 0;
-}
-
-static int __bch2_fs_read_write(struct bch_fs *c, bool early)
-{
- int ret;
-
- BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
-
- if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
- bch_err(c, "cannot go rw, unfixed btree errors");
- return -BCH_ERR_erofs_unfixed_errors;
- }
-
- if (test_bit(BCH_FS_rw, &c->flags))
- return 0;
-
- bch_info(c, "going read-write");
-
- ret = bch2_sb_members_v2_init(c);
- if (ret)
- goto err;
-
- ret = bch2_fs_mark_dirty(c);
- if (ret)
- goto err;
-
- clear_bit(BCH_FS_clean_shutdown, &c->flags);
-
- /*
- * First journal write must be a flush write: after a clean shutdown we
- * don't read the journal, so the first journal write may end up
- * overwriting whatever was there previously, and there must always be
- * at least one non-flush write in the journal or recovery will fail:
- */
- set_bit(JOURNAL_need_flush_write, &c->journal.flags);
- set_bit(JOURNAL_running, &c->journal.flags);
-
- for_each_rw_member(c, ca)
- bch2_dev_allocator_add(c, ca);
- bch2_recalc_capacity(c);
-
- set_bit(BCH_FS_rw, &c->flags);
- set_bit(BCH_FS_was_rw, &c->flags);
-
-#ifndef BCH_WRITE_REF_DEBUG
- percpu_ref_reinit(&c->writes);
-#else
- for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) {
- BUG_ON(atomic_long_read(&c->writes[i]));
- atomic_long_inc(&c->writes[i]);
- }
-#endif
-
- ret = bch2_journal_reclaim_start(&c->journal);
- if (ret)
- goto err;
-
- if (!early) {
- ret = bch2_fs_read_write_late(c);
- if (ret)
- goto err;
- }
-
- bch2_do_discards(c);
- bch2_do_invalidates(c);
- bch2_do_stripe_deletes(c);
- bch2_do_pending_node_rewrites(c);
- return 0;
-err:
- if (test_bit(BCH_FS_rw, &c->flags))
- bch2_fs_read_only(c);
- else
- __bch2_fs_read_only(c);
- return ret;
-}
-
-int bch2_fs_read_write(struct bch_fs *c)
-{
- if (c->opts.recovery_pass_last &&
- c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
- return -BCH_ERR_erofs_norecovery;
-
- if (c->opts.nochanges)
- return -BCH_ERR_erofs_nochanges;
-
- return __bch2_fs_read_write(c, false);
-}
-
-int bch2_fs_read_write_early(struct bch_fs *c)
-{
- lockdep_assert_held(&c->state_lock);
-
- return __bch2_fs_read_write(c, true);
-}
-
-/* Filesystem startup/shutdown: */
-
-static void __bch2_fs_free(struct bch_fs *c)
-{
- for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++)
- bch2_time_stats_exit(&c->times[i]);
-
- bch2_find_btree_nodes_exit(&c->found_btree_nodes);
- bch2_free_pending_node_rewrites(c);
- bch2_fs_accounting_exit(c);
- bch2_fs_sb_errors_exit(c);
- bch2_fs_counters_exit(c);
- bch2_fs_snapshots_exit(c);
- bch2_fs_quota_exit(c);
- bch2_fs_fs_io_direct_exit(c);
- bch2_fs_fs_io_buffered_exit(c);
- bch2_fs_fsio_exit(c);
- bch2_fs_vfs_exit(c);
- bch2_fs_ec_exit(c);
- bch2_fs_encryption_exit(c);
- bch2_fs_nocow_locking_exit(c);
- bch2_fs_io_write_exit(c);
- bch2_fs_io_read_exit(c);
- bch2_fs_buckets_waiting_for_journal_exit(c);
- bch2_fs_btree_interior_update_exit(c);
- bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
- bch2_fs_btree_cache_exit(c);
- bch2_fs_btree_iter_exit(c);
- bch2_fs_replicas_exit(c);
- bch2_fs_journal_exit(&c->journal);
- bch2_io_clock_exit(&c->io_clock[WRITE]);
- bch2_io_clock_exit(&c->io_clock[READ]);
- bch2_fs_compress_exit(c);
- bch2_fs_btree_gc_exit(c);
- bch2_journal_keys_put_initial(c);
- bch2_find_btree_nodes_exit(&c->found_btree_nodes);
- BUG_ON(atomic_read(&c->journal_keys.ref));
- bch2_fs_btree_write_buffer_exit(c);
- percpu_free_rwsem(&c->mark_lock);
- if (c->online_reserved) {
- u64 v = percpu_u64_get(c->online_reserved);
- WARN(v, "online_reserved not 0 at shutdown: %lli", v);
- free_percpu(c->online_reserved);
- }
-
- darray_exit(&c->btree_roots_extra);
- free_percpu(c->pcpu);
- free_percpu(c->usage);
- mempool_exit(&c->large_bkey_pool);
- mempool_exit(&c->btree_bounce_pool);
- bioset_exit(&c->btree_bio);
- mempool_exit(&c->fill_iter);
-#ifndef BCH_WRITE_REF_DEBUG
- percpu_ref_exit(&c->writes);
-#endif
- kfree(rcu_dereference_protected(c->disk_groups, 1));
- kfree(c->journal_seq_blacklist_table);
-
- if (c->write_ref_wq)
- destroy_workqueue(c->write_ref_wq);
- if (c->btree_write_submit_wq)
- destroy_workqueue(c->btree_write_submit_wq);
- if (c->btree_read_complete_wq)
- destroy_workqueue(c->btree_read_complete_wq);
- if (c->copygc_wq)
- destroy_workqueue(c->copygc_wq);
- if (c->btree_io_complete_wq)
- destroy_workqueue(c->btree_io_complete_wq);
- if (c->btree_update_wq)
- destroy_workqueue(c->btree_update_wq);
-
- bch2_free_super(&c->disk_sb);
- kvfree(c);
- module_put(THIS_MODULE);
-}
-
-static void bch2_fs_release(struct kobject *kobj)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
-
- __bch2_fs_free(c);
-}
-
-void __bch2_fs_stop(struct bch_fs *c)
-{
- bch_verbose(c, "shutting down");
-
- set_bit(BCH_FS_stopping, &c->flags);
-
- down_write(&c->state_lock);
- bch2_fs_read_only(c);
- up_write(&c->state_lock);
-
- for_each_member_device(c, ca)
- bch2_dev_unlink(ca);
-
- if (c->kobj.state_in_sysfs)
- kobject_del(&c->kobj);
-
- bch2_fs_debug_exit(c);
- bch2_fs_chardev_exit(c);
-
- bch2_ro_ref_put(c);
- wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref));
-
- kobject_put(&c->counters_kobj);
- kobject_put(&c->time_stats);
- kobject_put(&c->opts_dir);
- kobject_put(&c->internal);
-
- /* btree prefetch might have kicked off reads in the background: */
- bch2_btree_flush_all_reads(c);
-
- for_each_member_device(c, ca)
- cancel_work_sync(&ca->io_error_work);
-
- cancel_work_sync(&c->read_only_work);
-}
-
-void bch2_fs_free(struct bch_fs *c)
-{
- unsigned i;
-
- mutex_lock(&bch_fs_list_lock);
- list_del(&c->list);
- mutex_unlock(&bch_fs_list_lock);
-
- closure_sync(&c->cl);
- closure_debug_destroy(&c->cl);
-
- for (i = 0; i < c->sb.nr_devices; i++) {
- struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
-
- if (ca) {
- EBUG_ON(atomic_long_read(&ca->ref) != 1);
- bch2_free_super(&ca->disk_sb);
- bch2_dev_free(ca);
- }
- }
-
- bch_verbose(c, "shutdown complete");
-
- kobject_put(&c->kobj);
-}
-
-void bch2_fs_stop(struct bch_fs *c)
-{
- __bch2_fs_stop(c);
- bch2_fs_free(c);
-}
-
-static int bch2_fs_online(struct bch_fs *c)
-{
- int ret = 0;
-
- lockdep_assert_held(&bch_fs_list_lock);
-
- if (__bch2_uuid_to_fs(c->sb.uuid)) {
- bch_err(c, "filesystem UUID already open");
- return -EINVAL;
- }
-
- ret = bch2_fs_chardev_init(c);
- if (ret) {
- bch_err(c, "error creating character device");
- return ret;
- }
-
- bch2_fs_debug_init(c);
-
- ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:
- kobject_add(&c->internal, &c->kobj, "internal") ?:
- kobject_add(&c->opts_dir, &c->kobj, "options") ?:
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
- kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
-#endif
- kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
- bch2_opts_create_sysfs_files(&c->opts_dir, OPT_FS);
- if (ret) {
- bch_err(c, "error creating sysfs objects");
- return ret;
- }
-
- down_write(&c->state_lock);
-
- for_each_member_device(c, ca) {
- ret = bch2_dev_sysfs_online(c, ca);
- if (ret) {
- bch_err(c, "error creating sysfs objects");
- bch2_dev_put(ca);
- goto err;
- }
- }
-
- BUG_ON(!list_empty(&c->list));
- list_add(&c->list, &bch_fs_list);
-err:
- up_write(&c->state_lock);
- return ret;
-}
-
-static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
-{
- struct bch_fs *c;
- struct printbuf name = PRINTBUF;
- unsigned i, iter_size;
- int ret = 0;
-
- c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
- if (!c) {
- c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
- goto out;
- }
-
- c->stdio = (void *)(unsigned long) opts.stdio;
-
- __module_get(THIS_MODULE);
-
- closure_init(&c->cl, NULL);
-
- c->kobj.kset = bcachefs_kset;
- kobject_init(&c->kobj, &bch2_fs_ktype);
- kobject_init(&c->internal, &bch2_fs_internal_ktype);
- kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
- kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
- kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype);
-
- c->minor = -1;
- c->disk_sb.fs_sb = true;
-
- init_rwsem(&c->state_lock);
- mutex_init(&c->sb_lock);
- mutex_init(&c->replicas_gc_lock);
- mutex_init(&c->btree_root_lock);
- INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
-
- refcount_set(&c->ro_ref, 1);
- init_waitqueue_head(&c->ro_ref_wait);
- spin_lock_init(&c->recovery_pass_lock);
- sema_init(&c->online_fsck_mutex, 1);
-
- for (i = 0; i < BCH_TIME_STAT_NR; i++)
- bch2_time_stats_init(&c->times[i]);
-
- bch2_fs_copygc_init(c);
- bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
- bch2_fs_btree_iter_init_early(c);
- bch2_fs_btree_interior_update_init_early(c);
- bch2_fs_journal_keys_init(c);
- bch2_fs_allocator_background_init(c);
- bch2_fs_allocator_foreground_init(c);
- bch2_fs_rebalance_init(c);
- bch2_fs_quota_init(c);
- bch2_fs_ec_init_early(c);
- bch2_fs_move_init(c);
- bch2_fs_sb_errors_init_early(c);
-
- INIT_LIST_HEAD(&c->list);
-
- mutex_init(&c->bio_bounce_pages_lock);
- mutex_init(&c->snapshot_table_lock);
- init_rwsem(&c->snapshot_create_lock);
-
- spin_lock_init(&c->btree_write_error_lock);
-
- INIT_LIST_HEAD(&c->journal_iters);
-
- INIT_LIST_HEAD(&c->fsck_error_msgs);
- mutex_init(&c->fsck_error_msgs_lock);
-
- seqcount_init(&c->usage_lock);
-
- sema_init(&c->io_in_flight, 128);
-
- INIT_LIST_HEAD(&c->vfs_inodes_list);
- mutex_init(&c->vfs_inodes_lock);
-
- c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write];
- c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
- c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
-
- bch2_fs_btree_cache_init_early(&c->btree_cache);
-
- mutex_init(&c->sectors_available_lock);
-
- ret = percpu_init_rwsem(&c->mark_lock);
- if (ret)
- goto err;
-
- mutex_lock(&c->sb_lock);
- ret = bch2_sb_to_fs(c, sb);
- mutex_unlock(&c->sb_lock);
-
- if (ret)
- goto err;
-
-#ifdef CONFIG_UNICODE
- /* Default encoding until we can potentially have more as an option. */
- c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING);
- if (IS_ERR(c->cf_encoding)) {
- printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u",
- unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
- unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
- unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
- ret = -EINVAL;
- goto err;
- }
-#else
- if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) {
- printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n");
- ret = -EINVAL;
- goto err;
- }
-#endif
-
- pr_uuid(&name, c->sb.user_uuid.b);
- ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
- if (ret)
- goto err;
-
- strscpy(c->name, name.buf, sizeof(c->name));
- printbuf_exit(&name);
-
- /* Compat: */
- if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
- !BCH_SB_JOURNAL_FLUSH_DELAY(sb))
- SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
-
- if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
- !BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
- SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100);
-
- c->opts = bch2_opts_default;
- ret = bch2_opts_from_sb(&c->opts, sb);
- if (ret)
- goto err;
-
- bch2_opts_apply(&c->opts, opts);
-
- c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
- if (c->opts.inodes_use_key_cache)
- c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
- c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops;
-
- c->block_bits = ilog2(block_sectors(c));
- c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
-
- if (bch2_fs_init_fault("fs_alloc")) {
- bch_err(c, "fs_alloc fault injected");
- ret = -EFAULT;
- goto err;
- }
-
- iter_size = sizeof(struct sort_iter) +
- (btree_blocks(c) + 1) * 2 *
- sizeof(struct sort_iter_set);
-
- if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
- !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
- !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
- !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
- !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
- !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
- WQ_FREEZABLE, 0)) ||
-#ifndef BCH_WRITE_REF_DEBUG
- percpu_ref_init(&c->writes, bch2_writes_disabled,
- PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
-#endif
- mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
- bioset_init(&c->btree_bio, 1,
- max(offsetof(struct btree_read_bio, bio),
- offsetof(struct btree_write_bio, wbio.bio)),
- BIOSET_NEED_BVECS) ||
- !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
- !(c->usage = alloc_percpu(struct bch_fs_usage_base)) ||
- !(c->online_reserved = alloc_percpu(u64)) ||
- mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
- c->opts.btree_node_size) ||
- mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) {
- ret = -BCH_ERR_ENOMEM_fs_other_alloc;
- goto err;
- }
-
- ret = bch2_fs_counters_init(c) ?:
- bch2_fs_sb_errors_init(c) ?:
- bch2_io_clock_init(&c->io_clock[READ]) ?:
- bch2_io_clock_init(&c->io_clock[WRITE]) ?:
- bch2_fs_journal_init(&c->journal) ?:
- bch2_fs_btree_iter_init(c) ?:
- bch2_fs_btree_cache_init(c) ?:
- bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
- bch2_fs_btree_interior_update_init(c) ?:
- bch2_fs_btree_gc_init(c) ?:
- bch2_fs_buckets_waiting_for_journal_init(c) ?:
- bch2_fs_btree_write_buffer_init(c) ?:
- bch2_fs_subvolumes_init(c) ?:
- bch2_fs_io_read_init(c) ?:
- bch2_fs_io_write_init(c) ?:
- bch2_fs_nocow_locking_init(c) ?:
- bch2_fs_encryption_init(c) ?:
- bch2_fs_compress_init(c) ?:
- bch2_fs_ec_init(c) ?:
- bch2_fs_vfs_init(c) ?:
- bch2_fs_fsio_init(c) ?:
- bch2_fs_fs_io_buffered_init(c) ?:
- bch2_fs_fs_io_direct_init(c);
- if (ret)
- goto err;
-
- for (i = 0; i < c->sb.nr_devices; i++) {
- if (!bch2_member_exists(c->disk_sb.sb, i))
- continue;
- ret = bch2_dev_alloc(c, i);
- if (ret)
- goto err;
- }
-
- bch2_journal_entry_res_resize(&c->journal,
- &c->btree_root_journal_res,
- BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX));
- bch2_journal_entry_res_resize(&c->journal,
- &c->clock_journal_res,
- (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);
-
- mutex_lock(&bch_fs_list_lock);
- ret = bch2_fs_online(c);
- mutex_unlock(&bch_fs_list_lock);
-
- if (ret)
- goto err;
-out:
- return c;
-err:
- bch2_fs_free(c);
- c = ERR_PTR(ret);
- goto out;
-}
-
-noinline_for_stack
-static void print_mount_opts(struct bch_fs *c)
-{
- enum bch_opt_id i;
- struct printbuf p = PRINTBUF;
- bool first = true;
-
- prt_str(&p, "starting version ");
- bch2_version_to_text(&p, c->sb.version);
-
- if (c->opts.read_only) {
- prt_str(&p, " opts=");
- first = false;
- prt_printf(&p, "ro");
- }
-
- for (i = 0; i < bch2_opts_nr; i++) {
- const struct bch_option *opt = &bch2_opt_table[i];
- u64 v = bch2_opt_get_by_id(&c->opts, i);
-
- if (!(opt->flags & OPT_MOUNT))
- continue;
-
- if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
- continue;
-
- prt_str(&p, first ? " opts=" : ",");
- first = false;
- bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
- }
-
- bch_info(c, "%s", p.buf);
- printbuf_exit(&p);
-}
-
-int bch2_fs_start(struct bch_fs *c)
-{
- time64_t now = ktime_get_real_seconds();
- int ret;
-
- print_mount_opts(c);
-
- down_write(&c->state_lock);
-
- BUG_ON(test_bit(BCH_FS_started, &c->flags));
-
- mutex_lock(&c->sb_lock);
-
- ret = bch2_sb_members_v2_init(c);
- if (ret) {
- mutex_unlock(&c->sb_lock);
- goto err;
- }
-
- for_each_online_member(c, ca)
- bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now);
-
- struct bch_sb_field_ext *ext =
- bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
- mutex_unlock(&c->sb_lock);
-
- if (!ext) {
- bch_err(c, "insufficient space in superblock for sb_field_ext");
- ret = -BCH_ERR_ENOSPC_sb;
- goto err;
- }
-
- for_each_rw_member(c, ca)
- bch2_dev_allocator_add(c, ca);
- bch2_recalc_capacity(c);
-
- c->recovery_task = current;
- ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
- ? bch2_fs_recovery(c)
- : bch2_fs_initialize(c);
- c->recovery_task = NULL;
-
- if (ret)
- goto err;
-
- ret = bch2_opts_check_may_set(c);
- if (ret)
- goto err;
-
- if (bch2_fs_init_fault("fs_start")) {
- bch_err(c, "fs_start fault injected");
- ret = -EINVAL;
- goto err;
- }
-
- set_bit(BCH_FS_started, &c->flags);
- wake_up(&c->ro_ref_wait);
-
- if (c->opts.read_only) {
- bch2_fs_read_only(c);
- } else {
- ret = !test_bit(BCH_FS_rw, &c->flags)
- ? bch2_fs_read_write(c)
- : bch2_fs_read_write_late(c);
- if (ret)
- goto err;
- }
-
- ret = 0;
-err:
- if (ret)
- bch_err_msg(c, ret, "starting filesystem");
- else
- bch_verbose(c, "done starting filesystem");
- up_write(&c->state_lock);
- return ret;
-}
-
-static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
-{
- struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
-
- if (le16_to_cpu(sb->block_size) != block_sectors(c))
- return -BCH_ERR_mismatched_block_size;
-
- if (le16_to_cpu(m.bucket_size) <
- BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
- return -BCH_ERR_bucket_size_too_small;
-
- return 0;
-}
-
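-/*
- * Check that @sb belongs to the filesystem described by @fs: same UUID,
- * still a member, matching block size - and, on metadata versions that
- * track per-member sequence numbers, detect split brain:
- */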
-static int bch2_dev_in_fs(struct bch_sb_handle *fs,
- struct bch_sb_handle *sb,
- struct bch_opts *opts)
-{
- if (fs == sb)
- return 0;
-
- if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid))
- return -BCH_ERR_device_not_a_member_of_filesystem;
-
- if (!bch2_member_exists(fs->sb, sb->sb->dev_idx))
- return -BCH_ERR_device_has_been_removed;
-
- if (fs->sb->block_size != sb->sb->block_size)
- return -BCH_ERR_mismatched_block_size;
-
- if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq ||
- le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq)
- return 0;
-
- if (fs->sb->seq == sb->sb->seq &&
- fs->sb->write_time != sb->sb->write_time) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "Split brain detected between ");
- prt_bdevname(&buf, sb->bdev);
- prt_str(&buf, " and ");
- prt_bdevname(&buf, fs->bdev);
- prt_char(&buf, ':');
- prt_newline(&buf);
- prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq));
- prt_newline(&buf);
-
- prt_bdevname(&buf, fs->bdev);
- prt_char(&buf, ' ');
- bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));
- prt_newline(&buf);
-
- prt_bdevname(&buf, sb->bdev);
- prt_char(&buf, ' ');
- bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));
- prt_newline(&buf);
-
- if (!opts->no_splitbrain_check)
- prt_printf(&buf, "Not using older sb");
-
- pr_err("%s", buf.buf);
- printbuf_exit(&buf);
-
- if (!opts->no_splitbrain_check)
- return -BCH_ERR_device_splitbrain;
- }
-
- struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
- u64 seq_from_fs = le64_to_cpu(m.seq);
- u64 seq_from_member = le64_to_cpu(sb->sb->seq);
-
- if (seq_from_fs && seq_from_fs < seq_from_member) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "Split brain detected between ");
- prt_bdevname(&buf, sb->bdev);
- prt_str(&buf, " and ");
- prt_bdevname(&buf, fs->bdev);
- prt_char(&buf, ':');
- prt_newline(&buf);
-
- prt_bdevname(&buf, fs->bdev);
- prt_str(&buf, " believes seq of ");
- prt_bdevname(&buf, sb->bdev);
- prt_printf(&buf, " to be %llu, but ", seq_from_fs);
- prt_bdevname(&buf, sb->bdev);
- prt_printf(&buf, " has %llu\n", seq_from_member);
-
- if (!opts->no_splitbrain_check) {
- prt_str(&buf, "Not using ");
- prt_bdevname(&buf, sb->bdev);
- }
-
- pr_err("%s", buf.buf);
- printbuf_exit(&buf);
-
- if (!opts->no_splitbrain_check)
- return -BCH_ERR_device_splitbrain;
- }
-
- return 0;
-}
-
-/* Device startup/shutdown: */
-
-static void bch2_dev_release(struct kobject *kobj)
-{
- struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
-
- kfree(ca);
-}
-
-static void bch2_dev_free(struct bch_dev *ca)
-{
- cancel_work_sync(&ca->io_error_work);
-
- bch2_dev_unlink(ca);
-
- if (ca->kobj.state_in_sysfs)
- kobject_del(&ca->kobj);
-
- bch2_free_super(&ca->disk_sb);
- bch2_dev_allocator_background_exit(ca);
- bch2_dev_journal_exit(ca);
-
- free_percpu(ca->io_done);
- bch2_dev_buckets_free(ca);
- kfree(ca->sb_read_scratch);
-
- bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
- bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
-
- percpu_ref_exit(&ca->io_ref);
-#ifndef CONFIG_BCACHEFS_DEBUG
- percpu_ref_exit(&ca->ref);
-#endif
- kobject_put(&ca->kobj);
-}
-
-static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
-{
- lockdep_assert_held(&c->state_lock);
-
- if (percpu_ref_is_zero(&ca->io_ref))
- return;
-
- __bch2_dev_read_only(c, ca);
-
- reinit_completion(&ca->io_ref_completion);
- percpu_ref_kill(&ca->io_ref);
- wait_for_completion(&ca->io_ref_completion);
-
- bch2_dev_unlink(ca);
-
- bch2_free_super(&ca->disk_sb);
- bch2_dev_journal_exit(ca);
-}
-
-#ifndef CONFIG_BCACHEFS_DEBUG
-static void bch2_dev_ref_complete(struct percpu_ref *ref)
-{
- struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
-
- complete(&ca->ref_completion);
-}
-#endif
-
-static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
-{
- struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
-
- complete(&ca->io_ref_completion);
-}
-
-static void bch2_dev_unlink(struct bch_dev *ca)
-{
- struct kobject *b;
-
- /*
- * This is racy w.r.t. the underlying block device being hot-removed,
- * which removes it from sysfs.
- *
- * It'd be lovely if we had a way to handle this race, but the sysfs
- * code doesn't appear to provide a good method and block/holder.c is
- * susceptible as well:
- */
- if (ca->kobj.state_in_sysfs &&
- ca->disk_sb.bdev &&
- (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) {
- sysfs_remove_link(b, "bcachefs");
- sysfs_remove_link(&ca->kobj, "block");
- }
-}
-
-static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
-{
- int ret;
-
- if (!c->kobj.state_in_sysfs)
- return 0;
-
- if (!ca->kobj.state_in_sysfs) {
- ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx) ?:
- bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE);
- if (ret)
- return ret;
- }
-
- if (ca->disk_sb.bdev) {
- struct kobject *block = bdev_kobj(ca->disk_sb.bdev);
-
- ret = sysfs_create_link(block, &ca->kobj, "bcachefs");
- if (ret)
- return ret;
-
- ret = sysfs_create_link(&ca->kobj, block, "block");
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
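-/*
- * Allocate and minimally initialize a bch_dev from its superblock member
- * entry; the block device itself is attached later:
- */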
-static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
- struct bch_member *member)
-{
- struct bch_dev *ca;
- unsigned i;
-
- ca = kzalloc(sizeof(*ca), GFP_KERNEL);
- if (!ca)
- return NULL;
-
- kobject_init(&ca->kobj, &bch2_dev_ktype);
- init_completion(&ca->ref_completion);
- init_completion(&ca->io_ref_completion);
-
- INIT_WORK(&ca->io_error_work, bch2_io_error_work);
-
- bch2_time_stats_quantiles_init(&ca->io_latency[READ]);
- bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]);
-
- ca->mi = bch2_mi_to_cpu(member);
-
- for (i = 0; i < ARRAY_SIZE(member->errors); i++)
- atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i]));
-
- ca->uuid = member->uuid;
-
- ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
- ca->mi.bucket_size / btree_sectors(c));
-
-#ifndef CONFIG_BCACHEFS_DEBUG
- if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL))
- goto err;
-#else
- atomic_long_set(&ca->ref, 1);
-#endif
-
- bch2_dev_allocator_background_init(ca);
-
- if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
- PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
- !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) ||
- bch2_dev_buckets_alloc(c, ca) ||
- !(ca->io_done = alloc_percpu(*ca->io_done)))
- goto err;
-
- return ca;
-err:
- bch2_dev_free(ca);
- return NULL;
-}
-
-static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
- unsigned dev_idx)
-{
- ca->dev_idx = dev_idx;
- __set_bit(ca->dev_idx, ca->self.d);
- scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
-
- ca->fs = c;
- rcu_assign_pointer(c->devs[ca->dev_idx], ca);
-
- if (bch2_dev_sysfs_online(c, ca))
- pr_warn("error creating sysfs objects");
-}
-
-static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
-{
- struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
- struct bch_dev *ca = NULL;
-
- if (bch2_fs_init_fault("dev_alloc"))
- goto err;
-
- ca = __bch2_dev_alloc(c, &member);
- if (!ca)
- goto err;
-
- bch2_dev_attach(c, ca, dev_idx);
- return 0;
-err:
- return -BCH_ERR_ENOMEM_dev_alloc;
-}
-
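-/*
- * Take ownership of an opened superblock handle: after validating device
- * size and initializing the journal, the handle is moved into ca->disk_sb
- * and the io ref is brought back up:
- */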
-static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
-{
-	int ret;
-
- if (bch2_dev_is_online(ca)) {
- bch_err(ca, "already have device online in slot %u",
- sb->sb->dev_idx);
- return -BCH_ERR_device_already_online;
- }
-
- if (get_capacity(sb->bdev->bd_disk) <
- ca->mi.bucket_size * ca->mi.nbuckets) {
- bch_err(ca, "cannot online: device too small");
- return -BCH_ERR_device_size_too_small;
- }
-
- BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
-
- ret = bch2_dev_journal_init(ca, sb->sb);
- if (ret)
- return ret;
-
- /* Commit: */
- ca->disk_sb = *sb;
- memset(sb, 0, sizeof(*sb));
-
- /*
- * Stash pointer to the filesystem for blk_holder_ops - note that once
- * attached to a filesystem, we will always close the block device
- * before tearing down the filesystem object.
- */
- ca->disk_sb.holder->c = ca->fs;
-
- ca->dev = ca->disk_sb.bdev->bd_dev;
-
- percpu_ref_reinit(&ca->io_ref);
-
- return 0;
-}
-
-static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
-{
- struct bch_dev *ca;
- int ret;
-
- lockdep_assert_held(&c->state_lock);
-
- if (le64_to_cpu(sb->sb->seq) >
- le64_to_cpu(c->disk_sb.sb->seq))
- bch2_sb_to_fs(c, sb->sb);
-
- BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx));
-
- ca = bch2_dev_locked(c, sb->sb->dev_idx);
-
- ret = __bch2_dev_attach_bdev(ca, sb);
- if (ret)
- return ret;
-
- bch2_dev_sysfs_online(c, ca);
-
- struct printbuf name = PRINTBUF;
- prt_bdevname(&name, ca->disk_sb.bdev);
-
- if (c->sb.nr_devices == 1)
- strscpy(c->name, name.buf, sizeof(c->name));
- strscpy(ca->name, name.buf, sizeof(ca->name));
-
- printbuf_exit(&name);
-
- rebalance_wakeup(c);
- return 0;
-}
-
-/* Device management: */
-
-/*
- * Note: this function is also used by the error paths - when a particular
- * device sees an error, we call it to determine whether we can just set the
- * device RO, or - if this function returns false - we'll set the whole
- * filesystem RO:
- *
- * XXX: maybe we should be more explicit about whether we're changing state
- * because we got an error or what have you?
- */
-bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
- enum bch_member_state new_state, int flags)
-{
- struct bch_devs_mask new_online_devs;
- int nr_rw = 0, required;
-
- lockdep_assert_held(&c->state_lock);
-
- switch (new_state) {
- case BCH_MEMBER_STATE_rw:
- return true;
- case BCH_MEMBER_STATE_ro:
- if (ca->mi.state != BCH_MEMBER_STATE_rw)
- return true;
-
- /* do we have enough devices to write to? */
- for_each_member_device(c, ca2)
- if (ca2 != ca)
- nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;
-
- required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
- ? c->opts.metadata_replicas
- : metadata_replicas_required(c),
- !(flags & BCH_FORCE_IF_DATA_DEGRADED)
- ? c->opts.data_replicas
- : data_replicas_required(c));
-
- return nr_rw >= required;
- case BCH_MEMBER_STATE_failed:
- case BCH_MEMBER_STATE_spare:
- if (ca->mi.state != BCH_MEMBER_STATE_rw &&
- ca->mi.state != BCH_MEMBER_STATE_ro)
- return true;
-
- /* do we have enough devices to read from? */
- new_online_devs = bch2_online_devs(c);
- __clear_bit(ca->dev_idx, new_online_devs.d);
-
- return bch2_have_enough_devs(c, new_online_devs, flags, false);
- default:
- BUG();
- }
-}
-
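-/*
- * Can we start with the set of devices currently online? Unless a degraded
- * mount was requested, every rw or ro member must be present:
- */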
-static bool bch2_fs_may_start(struct bch_fs *c)
-{
- struct bch_dev *ca;
- unsigned i, flags = 0;
-
- if (c->opts.very_degraded)
- flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
-
- if (c->opts.degraded)
- flags |= BCH_FORCE_IF_DEGRADED;
-
- if (!c->opts.degraded &&
- !c->opts.very_degraded) {
- mutex_lock(&c->sb_lock);
-
- for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
- if (!bch2_member_exists(c->disk_sb.sb, i))
- continue;
-
- ca = bch2_dev_locked(c, i);
-
- if (!bch2_dev_is_online(ca) &&
- (ca->mi.state == BCH_MEMBER_STATE_rw ||
- ca->mi.state == BCH_MEMBER_STATE_ro)) {
- mutex_unlock(&c->sb_lock);
- return false;
- }
- }
- mutex_unlock(&c->sb_lock);
- }
-
- return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true);
-}
-
-static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
-{
- /*
- * The allocator thread itself allocates btree nodes, so stop it first:
- */
- bch2_dev_allocator_remove(c, ca);
- bch2_recalc_capacity(c);
- bch2_dev_journal_stop(&c->journal, ca);
-}
-
-static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
-{
- lockdep_assert_held(&c->state_lock);
-
- BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw);
-
- bch2_dev_allocator_add(c, ca);
- bch2_recalc_capacity(c);
- bch2_dev_do_discards(ca);
-}
-
-int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
- enum bch_member_state new_state, int flags)
-{
- struct bch_member *m;
- int ret = 0;
-
- if (ca->mi.state == new_state)
- return 0;
-
- if (!bch2_dev_state_allowed(c, ca, new_state, flags))
- return -BCH_ERR_device_state_not_allowed;
-
- if (new_state != BCH_MEMBER_STATE_rw)
- __bch2_dev_read_only(c, ca);
-
- bch_notice(ca, "%s", bch2_member_states[new_state]);
-
- mutex_lock(&c->sb_lock);
- m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- SET_BCH_MEMBER_STATE(m, new_state);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- if (new_state == BCH_MEMBER_STATE_rw)
- __bch2_dev_read_write(c, ca);
-
- rebalance_wakeup(c);
-
- return ret;
-}
-
-int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
- enum bch_member_state new_state, int flags)
-{
- int ret;
-
- down_write(&c->state_lock);
- ret = __bch2_dev_set_state(c, ca, new_state, flags);
- up_write(&c->state_lock);
-
- return ret;
-}
-
-/* Device add/removal: */
-
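-/*
- * Device removal: migrate data off the device, flush the journal so no keys
- * reference it anymore, verify it's empty, then release its member slot:
- */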
-int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
-{
- struct bch_member *m;
- unsigned dev_idx = ca->dev_idx, data;
- int ret;
-
- down_write(&c->state_lock);
-
- /*
- * We consume a reference to ca->ref, regardless of whether we succeed
- * or fail:
- */
- bch2_dev_put(ca);
-
- if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
- bch_err(ca, "Cannot remove without losing data");
- ret = -BCH_ERR_device_state_not_allowed;
- goto err;
- }
-
- __bch2_dev_read_only(c, ca);
-
- ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
- bch_err_msg(ca, ret, "bch2_dev_data_drop()");
- if (ret)
- goto err;
-
- ret = bch2_dev_remove_alloc(c, ca);
- bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
- if (ret)
- goto err;
-
- /*
- * We need to flush the entire journal to get rid of keys that reference
- * the device being removed before removing the superblock entry
- */
- bch2_journal_flush_all_pins(&c->journal);
-
- /*
- * this is really just needed for the bch2_replicas_gc_(start|end)
- * calls, and could be cleaned up:
- */
- ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
- bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");
- if (ret)
- goto err;
-
- ret = bch2_journal_flush(&c->journal);
- bch_err_msg(ca, ret, "bch2_journal_flush()");
- if (ret)
- goto err;
-
- ret = bch2_replicas_gc2(c);
- bch_err_msg(ca, ret, "bch2_replicas_gc2()");
- if (ret)
- goto err;
-
- data = bch2_dev_has_data(c, ca);
- if (data) {
- struct printbuf data_has = PRINTBUF;
-
- prt_bitflags(&data_has, __bch2_data_types, data);
- bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
- printbuf_exit(&data_has);
- ret = -EBUSY;
- goto err;
- }
-
- __bch2_dev_offline(c, ca);
-
- mutex_lock(&c->sb_lock);
- rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
- mutex_unlock(&c->sb_lock);
-
-#ifndef CONFIG_BCACHEFS_DEBUG
- percpu_ref_kill(&ca->ref);
-#else
- ca->dying = true;
- bch2_dev_put(ca);
-#endif
- wait_for_completion(&ca->ref_completion);
-
- bch2_dev_free(ca);
-
- /*
- * Free this device's slot in the bch_member array - all pointers to
- * this device must be gone:
- */
- mutex_lock(&c->sb_lock);
- m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
- memset(&m->uuid, 0, sizeof(m->uuid));
-
- bch2_write_super(c);
-
- mutex_unlock(&c->sb_lock);
- up_write(&c->state_lock);
- return 0;
-err:
- if (ca->mi.state == BCH_MEMBER_STATE_rw &&
- !percpu_ref_is_zero(&ca->io_ref))
- __bch2_dev_read_write(c, ca);
- up_write(&c->state_lock);
- return ret;
-}
-
-/* Add new device to running filesystem: */
-int bch2_dev_add(struct bch_fs *c, const char *path)
-{
- struct bch_opts opts = bch2_opts_empty();
- struct bch_sb_handle sb;
- struct bch_dev *ca = NULL;
- struct printbuf errbuf = PRINTBUF;
- struct printbuf label = PRINTBUF;
- int ret;
-
- ret = bch2_read_super(path, &opts, &sb);
- bch_err_msg(c, ret, "reading super");
- if (ret)
- goto err;
-
- struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
-
- if (BCH_MEMBER_GROUP(&dev_mi)) {
- bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
- if (label.allocation_failure) {
- ret = -ENOMEM;
- goto err;
- }
- }
-
- ret = bch2_dev_may_add(sb.sb, c);
- if (ret)
- goto err;
-
- ca = __bch2_dev_alloc(c, &dev_mi);
- if (!ca) {
- ret = -ENOMEM;
- goto err;
- }
-
- ret = __bch2_dev_attach_bdev(ca, &sb);
- if (ret)
- goto err;
-
- down_write(&c->state_lock);
- mutex_lock(&c->sb_lock);
-
- ret = bch2_sb_from_fs(c, ca);
- bch_err_msg(c, ret, "setting up new superblock");
- if (ret)
- goto err_unlock;
-
-	/* fault injection must fail the add, not silently return success: */
-	if (dynamic_fault("bcachefs:add:no_slot")) {
-		ret = -BCH_ERR_ENOSPC_sb_members;
-		goto err_unlock;
-	}
-
- ret = bch2_sb_member_alloc(c);
- if (ret < 0) {
- bch_err_msg(c, ret, "setting up new superblock");
- goto err_unlock;
- }
- unsigned dev_idx = ret;
-
- /* success: */
-
- dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds());
- *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi;
-
- ca->disk_sb.sb->dev_idx = dev_idx;
- bch2_dev_attach(c, ca, dev_idx);
-
- if (BCH_MEMBER_GROUP(&dev_mi)) {
- ret = __bch2_dev_group_set(c, ca, label.buf);
- bch_err_msg(c, ret, "creating new label");
- if (ret)
- goto err_unlock;
- }
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- ret = bch2_dev_usage_init(ca, false);
- if (ret)
- goto err_late;
-
- ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
- bch_err_msg(ca, ret, "marking new superblock");
- if (ret)
- goto err_late;
-
- ret = bch2_fs_freespace_init(c);
- bch_err_msg(ca, ret, "initializing free space");
- if (ret)
- goto err_late;
-
- if (ca->mi.state == BCH_MEMBER_STATE_rw)
- __bch2_dev_read_write(c, ca);
-
- ret = bch2_dev_journal_alloc(ca, false);
- bch_err_msg(c, ret, "allocating journal");
- if (ret)
- goto err_late;
-
- up_write(&c->state_lock);
-out:
- printbuf_exit(&label);
- printbuf_exit(&errbuf);
- bch_err_fn(c, ret);
- return ret;
-
-err_unlock:
- mutex_unlock(&c->sb_lock);
- up_write(&c->state_lock);
-err:
- if (ca)
- bch2_dev_free(ca);
- bch2_free_super(&sb);
- goto out;
-err_late:
- up_write(&c->state_lock);
- ca = NULL;
- goto err;
-}
-
-/* Hot add existing device to running filesystem: */
-int bch2_dev_online(struct bch_fs *c, const char *path)
-{
- struct bch_opts opts = bch2_opts_empty();
- struct bch_sb_handle sb = { NULL };
- struct bch_dev *ca;
- unsigned dev_idx;
- int ret;
-
- down_write(&c->state_lock);
-
- ret = bch2_read_super(path, &opts, &sb);
- if (ret) {
- up_write(&c->state_lock);
- return ret;
- }
-
- dev_idx = sb.sb->dev_idx;
-
- ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
- bch_err_msg(c, ret, "bringing %s online", path);
- if (ret)
- goto err;
-
- ret = bch2_dev_attach_bdev(c, &sb);
- if (ret)
- goto err;
-
- ca = bch2_dev_locked(c, dev_idx);
-
- ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
- bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
- if (ret)
- goto err;
-
- if (ca->mi.state == BCH_MEMBER_STATE_rw)
- __bch2_dev_read_write(c, ca);
-
- if (!ca->mi.freespace_initialized) {
- ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
- bch_err_msg(ca, ret, "initializing free space");
- if (ret)
- goto err;
- }
-
- if (!ca->journal.nr) {
- ret = bch2_dev_journal_alloc(ca, false);
- bch_err_msg(ca, ret, "allocating journal");
- if (ret)
- goto err;
- }
-
- mutex_lock(&c->sb_lock);
- bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
- cpu_to_le64(ktime_get_real_seconds());
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- up_write(&c->state_lock);
- return 0;
-err:
- up_write(&c->state_lock);
- bch2_free_super(&sb);
- return ret;
-}
-
-int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
-{
- down_write(&c->state_lock);
-
- if (!bch2_dev_is_online(ca)) {
- bch_err(ca, "Already offline");
- up_write(&c->state_lock);
- return 0;
- }
-
- if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
- bch_err(ca, "Cannot offline required disk");
- up_write(&c->state_lock);
- return -BCH_ERR_device_state_not_allowed;
- }
-
- __bch2_dev_offline(c, ca);
-
- up_write(&c->state_lock);
- return 0;
-}
-
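-/* Grow a member device to @nbuckets buckets; shrinking isn't supported yet: */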
-int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
-{
- struct bch_member *m;
- u64 old_nbuckets;
- int ret = 0;
-
- down_write(&c->state_lock);
- old_nbuckets = ca->mi.nbuckets;
-
- if (nbuckets < ca->mi.nbuckets) {
- bch_err(ca, "Cannot shrink yet");
- ret = -EINVAL;
- goto err;
- }
-
- if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
- bch_err(ca, "New device size too big (%llu greater than max %u)",
- nbuckets, BCH_MEMBER_NBUCKETS_MAX);
- ret = -BCH_ERR_device_size_too_big;
- goto err;
- }
-
- if (bch2_dev_is_online(ca) &&
- get_capacity(ca->disk_sb.bdev->bd_disk) <
- ca->mi.bucket_size * nbuckets) {
- bch_err(ca, "New size larger than device");
- ret = -BCH_ERR_device_size_too_small;
- goto err;
- }
-
- ret = bch2_dev_buckets_resize(c, ca, nbuckets);
- bch_err_msg(ca, ret, "resizing buckets");
- if (ret)
- goto err;
-
- ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
- if (ret)
- goto err;
-
- mutex_lock(&c->sb_lock);
- m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- m->nbuckets = cpu_to_le64(nbuckets);
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- if (ca->mi.freespace_initialized) {
- u64 v[3] = { nbuckets - old_nbuckets, 0, 0 };
-
- ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0,
- bch2_disk_accounting_mod2(trans, false, v, dev_data_type,
- .dev = ca->dev_idx,
- .data_type = BCH_DATA_free)) ?:
- bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
- if (ret)
- goto err;
- }
-
- bch2_recalc_capacity(c);
-err:
- up_write(&c->state_lock);
- return ret;
-}
-
-/* return with ref on ca->ref: */
-struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
-{
- if (!strncmp(name, "/dev/", strlen("/dev/")))
- name += strlen("/dev/");
-
- for_each_member_device(c, ca)
- if (!strcmp(name, ca->name))
- return ca;
- return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
-}
-
-/* blk_holder_ops: */
-
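-/*
- * Map a block device back to its filesystem, taking a ro ref so the
- * filesystem can't be torn down while a holder op is running:
- */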
-static struct bch_fs *bdev_get_fs(struct block_device *bdev)
- __releases(&bdev->bd_holder_lock)
-{
- struct bch_sb_handle_holder *holder = bdev->bd_holder;
- struct bch_fs *c = holder->c;
-
- if (c && !bch2_ro_ref_tryget(c))
- c = NULL;
-
- mutex_unlock(&bdev->bd_holder_lock);
-
- if (c)
- wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags));
- return c;
-}
-
-/* returns with ref on ca->ref */
-static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev)
-{
- for_each_member_device(c, ca)
- if (ca->disk_sb.bdev == bdev)
- return ca;
- return NULL;
-}
-
-static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
-{
- struct bch_fs *c = bdev_get_fs(bdev);
- if (!c)
- return;
-
- struct super_block *sb = c->vfs_sb;
- if (sb) {
- /*
- * Not necessary, c->ro_ref guards against the filesystem being
- * unmounted - we only take this to avoid a warning in
- * sync_filesystem:
- */
- down_read(&sb->s_umount);
- }
-
- down_write(&c->state_lock);
- struct bch_dev *ca = bdev_to_bch_dev(c, bdev);
- if (!ca)
- goto unlock;
-
- if (bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, BCH_FORCE_IF_DEGRADED)) {
- __bch2_dev_offline(c, ca);
- } else {
- if (sb) {
- if (!surprise)
- sync_filesystem(sb);
- shrink_dcache_sb(sb);
- evict_inodes(sb);
- }
-
- bch2_journal_flush(&c->journal);
- bch2_fs_emergency_read_only(c);
- }
-
- bch2_dev_put(ca);
-unlock:
- if (sb)
- up_read(&sb->s_umount);
- up_write(&c->state_lock);
- bch2_ro_ref_put(c);
-}
-
-static void bch2_fs_bdev_sync(struct block_device *bdev)
-{
- struct bch_fs *c = bdev_get_fs(bdev);
- if (!c)
- return;
-
- struct super_block *sb = c->vfs_sb;
- if (sb) {
- /*
- * Not necessary, c->ro_ref guards against the filesystem being
- * unmounted - we only take this to avoid a warning in
- * sync_filesystem:
- */
- down_read(&sb->s_umount);
- sync_filesystem(sb);
- up_read(&sb->s_umount);
- }
-
- bch2_ro_ref_put(c);
-}
-
-const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
- .mark_dead = bch2_fs_bdev_mark_dead,
- .sync = bch2_fs_bdev_sync,
-};
-
-/* Filesystem open: */
-
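-/* Order superblocks by seq, then write_time - used to pick the newest copy: */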
-static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
-{
- return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?:
- cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time));
-}
-
-struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
- struct bch_opts opts)
-{
- DARRAY(struct bch_sb_handle) sbs = { 0 };
- struct bch_fs *c = NULL;
- struct bch_sb_handle *best = NULL;
- struct printbuf errbuf = PRINTBUF;
- int ret = 0;
-
- if (!try_module_get(THIS_MODULE))
- return ERR_PTR(-ENODEV);
-
- if (!nr_devices) {
- ret = -EINVAL;
- goto err;
- }
-
- ret = darray_make_room(&sbs, nr_devices);
- if (ret)
- goto err;
-
- for (unsigned i = 0; i < nr_devices; i++) {
- struct bch_sb_handle sb = { NULL };
-
- ret = bch2_read_super(devices[i], &opts, &sb);
- if (ret)
- goto err;
-
- BUG_ON(darray_push(&sbs, sb));
- }
-
- if (opts.nochanges && !opts.read_only) {
- ret = -BCH_ERR_erofs_nochanges;
- goto err_print;
- }
-
- darray_for_each(sbs, sb)
- if (!best || sb_cmp(sb->sb, best->sb) > 0)
- best = sb;
-
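-	/* Prune devices that have been removed, or that lost a split brain: */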
- darray_for_each_reverse(sbs, sb) {
- ret = bch2_dev_in_fs(best, sb, &opts);
-
- if (ret == -BCH_ERR_device_has_been_removed ||
- ret == -BCH_ERR_device_splitbrain) {
- bch2_free_super(sb);
- darray_remove_item(&sbs, sb);
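-			/* removing an item shifted everything after it down: */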
- best -= best > sb;
- ret = 0;
- continue;
- }
-
- if (ret)
- goto err_print;
- }
-
- c = bch2_fs_alloc(best->sb, opts);
- ret = PTR_ERR_OR_ZERO(c);
- if (ret)
- goto err;
-
- down_write(&c->state_lock);
- darray_for_each(sbs, sb) {
- ret = bch2_dev_attach_bdev(c, sb);
- if (ret) {
- up_write(&c->state_lock);
- goto err;
- }
- }
- up_write(&c->state_lock);
-
- if (!bch2_fs_may_start(c)) {
- ret = -BCH_ERR_insufficient_devices_to_start;
- goto err_print;
- }
-
- if (!c->opts.nostart) {
- ret = bch2_fs_start(c);
- if (ret)
- goto err;
- }
-out:
- darray_for_each(sbs, sb)
- bch2_free_super(sb);
- darray_exit(&sbs);
- printbuf_exit(&errbuf);
- module_put(THIS_MODULE);
- return c;
-err_print:
- pr_err("bch_fs_open err opening %s: %s",
- devices[0], bch2_err_str(ret));
-err:
- if (!IS_ERR_OR_NULL(c))
- bch2_fs_stop(c);
- c = ERR_PTR(ret);
- goto out;
-}
-
-/* Global interfaces/init */
-
-static void bcachefs_exit(void)
-{
- bch2_debug_exit();
- bch2_vfs_exit();
- bch2_chardev_exit();
- bch2_btree_key_cache_exit();
- if (bcachefs_kset)
- kset_unregister(bcachefs_kset);
-}
-
-static int __init bcachefs_init(void)
-{
- bch2_bkey_pack_test();
-
- if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
- bch2_btree_key_cache_init() ||
- bch2_chardev_init() ||
- bch2_vfs_init() ||
- bch2_debug_init())
- goto err;
-
- return 0;
-err:
- bcachefs_exit();
- return -ENOMEM;
-}
-
-#define BCH_DEBUG_PARAM(name, description) \
- bool bch2_##name; \
- module_param_named(name, bch2_##name, bool, 0644); \
- MODULE_PARM_DESC(name, description);
-BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
-__maybe_unused
-static unsigned bch2_metadata_version = bcachefs_metadata_version_current;
-module_param_named(version, bch2_metadata_version, uint, 0400);
-
-module_exit(bcachefs_exit);
-module_init(bcachefs_init);
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
deleted file mode 100644
index 23533bce5709..000000000000
--- a/fs/bcachefs/super.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUPER_H
-#define _BCACHEFS_SUPER_H
-
-#include "extents.h"
-
-#include "bcachefs_ioctl.h"
-
-#include <linux/math64.h>
-
-extern const char * const bch2_fs_flag_strs[];
-
-struct bch_fs *bch2_dev_to_fs(dev_t);
-struct bch_fs *bch2_uuid_to_fs(__uuid_t);
-
-bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
- enum bch_member_state, int);
-int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
- enum bch_member_state, int);
-int bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
- enum bch_member_state, int);
-
-int bch2_dev_fail(struct bch_dev *, int);
-int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int);
-int bch2_dev_add(struct bch_fs *, const char *);
-int bch2_dev_online(struct bch_fs *, const char *);
-int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
-int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
-struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
-
-bool bch2_fs_emergency_read_only(struct bch_fs *);
-bool bch2_fs_emergency_read_only_locked(struct bch_fs *);
-void bch2_fs_read_only(struct bch_fs *);
-
-int bch2_fs_read_write(struct bch_fs *);
-int bch2_fs_read_write_early(struct bch_fs *);
-
-void __bch2_fs_stop(struct bch_fs *);
-void bch2_fs_free(struct bch_fs *);
-void bch2_fs_stop(struct bch_fs *);
-
-int bch2_fs_start(struct bch_fs *);
-struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
-
-extern const struct blk_holder_ops bch2_sb_handle_bdev_ops;
-
-#endif /* _BCACHEFS_SUPER_H */
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
deleted file mode 100644
index 3a899f799d1d..000000000000
--- a/fs/bcachefs/super_types.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUPER_TYPES_H
-#define _BCACHEFS_SUPER_TYPES_H
-
-struct bch_fs;
-
-struct bch_sb_handle_holder {
- struct bch_fs *c;
-};
-
-struct bch_sb_handle {
- struct bch_sb *sb;
- struct file *s_bdev_file;
- struct block_device *bdev;
- char *sb_name;
- struct bio *bio;
- struct bch_sb_handle_holder *holder;
- size_t buffer_size;
- blk_mode_t mode;
- unsigned have_layout:1;
- unsigned have_bio:1;
- unsigned fs_sb:1;
- u64 seq;
-};
-
-struct bch_devs_mask {
- unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
-};
-
-struct bch_devs_list {
- u8 nr;
- u8 data[BCH_BKEY_PTRS_MAX];
-};
-
-#endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
deleted file mode 100644
index 251ba8224c1f..000000000000
--- a/fs/bcachefs/sysfs.c
+++ /dev/null
@@ -1,888 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * bcache sysfs interfaces
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#ifndef NO_BCACHEFS_SYSFS
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "sysfs.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_gc.h"
-#include "buckets.h"
-#include "clock.h"
-#include "compress.h"
-#include "disk_accounting.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "inode.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "keylist.h"
-#include "move.h"
-#include "movinggc.h"
-#include "nocow_locking.h"
-#include "opts.h"
-#include "rebalance.h"
-#include "replicas.h"
-#include "super-io.h"
-#include "tests.h"
-
-#include <linux/blkdev.h>
-#include <linux/sort.h>
-#include <linux/sched/clock.h>
-
-#include "util.h"
-
-#define SYSFS_OPS(type) \
-const struct sysfs_ops type ## _sysfs_ops = { \
- .show = type ## _show, \
- .store = type ## _store \
-}
-
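-/*
- * SHOW() wraps a fn_to_text() implementation in a sysfs ->show() method,
- * adding a trailing newline and handling printbuf allocation failure:
- */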
-#define SHOW(fn) \
-static ssize_t fn ## _to_text(struct printbuf *, \
- struct kobject *, struct attribute *); \
- \
-static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
- char *buf) \
-{ \
- struct printbuf out = PRINTBUF; \
- ssize_t ret = fn ## _to_text(&out, kobj, attr); \
- \
- if (out.pos && out.buf[out.pos - 1] != '\n') \
- prt_newline(&out); \
- \
- if (!ret && out.allocation_failure) \
- ret = -ENOMEM; \
- \
- if (!ret) { \
- ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \
- memcpy(buf, out.buf, ret); \
- } \
- printbuf_exit(&out); \
- return bch2_err_class(ret); \
-} \
- \
-static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\
- struct attribute *attr)
-
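-/* STORE() wraps a store implementation, mapping private error codes for userspace: */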
-#define STORE(fn) \
-static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\
- const char *, size_t); \
- \
-static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
- const char *buf, size_t size) \
-{ \
- return bch2_err_class(fn##_store_inner(kobj, attr, buf, size)); \
-} \
- \
-static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\
- const char *buf, size_t size)
-
-#define __sysfs_attribute(_name, _mode) \
- static struct attribute sysfs_##_name = \
- { .name = #_name, .mode = _mode }
-
-#define write_attribute(n) __sysfs_attribute(n, 0200)
-#define read_attribute(n) __sysfs_attribute(n, 0444)
-#define rw_attribute(n) __sysfs_attribute(n, 0644)
-
-#define sysfs_printf(file, fmt, ...) \
-do { \
- if (attr == &sysfs_ ## file) \
- prt_printf(out, fmt "\n", __VA_ARGS__); \
-} while (0)
-
-#define sysfs_print(file, var) \
-do { \
- if (attr == &sysfs_ ## file) \
- snprint(out, var); \
-} while (0)
-
-#define sysfs_hprint(file, val) \
-do { \
- if (attr == &sysfs_ ## file) \
- prt_human_readable_s64(out, val); \
-} while (0)
-
-#define sysfs_strtoul(file, var) \
-do { \
- if (attr == &sysfs_ ## file) \
- return strtoul_safe(buf, var) ?: (ssize_t) size; \
-} while (0)
-
-#define sysfs_strtoul_clamp(file, var, min, max) \
-do { \
- if (attr == &sysfs_ ## file) \
- return strtoul_safe_clamp(buf, var, min, max) \
- ?: (ssize_t) size; \
-} while (0)
-
-#define strtoul_or_return(cp) \
-({ \
- unsigned long _v; \
- int _r = kstrtoul(cp, 10, &_v); \
- if (_r) \
- return _r; \
- _v; \
-})
-
-write_attribute(trigger_gc);
-write_attribute(trigger_discards);
-write_attribute(trigger_invalidates);
-write_attribute(trigger_journal_flush);
-write_attribute(trigger_journal_writes);
-write_attribute(trigger_btree_cache_shrink);
-write_attribute(trigger_btree_key_cache_shrink);
-write_attribute(trigger_freelist_wakeup);
-write_attribute(trigger_btree_updates);
-read_attribute(gc_gens_pos);
-
-read_attribute(uuid);
-read_attribute(minor);
-read_attribute(flags);
-read_attribute(first_bucket);
-read_attribute(nbuckets);
-read_attribute(io_done);
-read_attribute(io_errors);
-write_attribute(io_errors_reset);
-
-read_attribute(io_latency_read);
-read_attribute(io_latency_write);
-read_attribute(io_latency_stats_read);
-read_attribute(io_latency_stats_write);
-read_attribute(congested);
-
-read_attribute(btree_write_stats);
-
-read_attribute(btree_cache_size);
-read_attribute(compression_stats);
-read_attribute(journal_debug);
-read_attribute(btree_cache);
-read_attribute(btree_key_cache);
-read_attribute(btree_reserve_cache);
-read_attribute(open_buckets);
-read_attribute(open_buckets_partial);
-read_attribute(nocow_lock_table);
-
-#ifdef BCH_WRITE_REF_DEBUG
-read_attribute(write_refs);
-
-static const char * const bch2_write_refs[] = {
-#define x(n) #n,
- BCH_WRITE_REFS()
-#undef x
- NULL
-};
-
-static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c)
-{
- bch2_printbuf_tabstop_push(out, 24);
-
- for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++)
- prt_printf(out, "%s\t%li\n", bch2_write_refs[i], atomic_long_read(&c->writes[i]));
-}
-#endif
-
-read_attribute(internal_uuid);
-read_attribute(disk_groups);
-
-read_attribute(has_data);
-read_attribute(alloc_debug);
-read_attribute(usage_base);
-
-#define x(t, n, ...) read_attribute(t);
-BCH_PERSISTENT_COUNTERS()
-#undef x
-
-rw_attribute(label);
-
-read_attribute(copy_gc_wait);
-
-sysfs_pd_controller_attribute(rebalance);
-read_attribute(rebalance_status);
-
-read_attribute(new_stripes);
-
-read_attribute(io_timers_read);
-read_attribute(io_timers_write);
-
-read_attribute(moving_ctxts);
-
-#ifdef CONFIG_BCACHEFS_TESTS
-write_attribute(perf_test);
-#endif /* CONFIG_BCACHEFS_TESTS */
-
-#define x(_name) \
- static struct attribute sysfs_time_stat_##_name = \
- { .name = #_name, .mode = 0644 };
- BCH_TIME_STATS()
-#undef x
-
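-/* Total memory used by cached btree nodes, in bytes: */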
-static size_t bch2_btree_cache_size(struct bch_fs *c)
-{
- struct btree_cache *bc = &c->btree_cache;
- size_t ret = 0;
- struct btree *b;
-
- mutex_lock(&bc->lock);
- list_for_each_entry(b, &bc->live[0].list, list)
- ret += btree_buf_bytes(b);
- list_for_each_entry(b, &bc->live[1].list, list)
- ret += btree_buf_bytes(b);
- list_for_each_entry(b, &bc->freeable, list)
- ret += btree_buf_bytes(b);
- mutex_unlock(&bc->lock);
- return ret;
-}
-
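-/* Per-compression-type totals, read from the disk accounting subsystem: */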
-static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
-{
- printbuf_tabstop_push(out, 12);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 24);
- prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n");
-
- for (unsigned i = 1; i < BCH_COMPRESSION_TYPE_NR; i++) {
- struct disk_accounting_pos a = {
- .type = BCH_DISK_ACCOUNTING_compression,
- .compression.type = i,
- };
- struct bpos p = disk_accounting_pos_to_bpos(&a);
- u64 v[3];
- bch2_accounting_mem_read(c, p, v, ARRAY_SIZE(v));
-
- u64 nr_extents = v[0];
- u64 sectors_uncompressed = v[1];
- u64 sectors_compressed = v[2];
-
- bch2_prt_compression_type(out, i);
- prt_tab(out);
-
- prt_human_readable_u64(out, sectors_compressed << 9);
- prt_tab_rjust(out);
-
- prt_human_readable_u64(out, sectors_uncompressed << 9);
- prt_tab_rjust(out);
-
- prt_human_readable_u64(out, nr_extents
- ? div64_u64(sectors_uncompressed << 9, nr_extents)
- : 0);
- prt_tab_rjust(out);
- prt_newline(out);
- }
-
- return 0;
-}
-
-static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
-{
- bch2_btree_id_to_text(out, c->gc_gens_btree);
- prt_printf(out, ": ");
- bch2_bpos_to_text(out, c->gc_gens_pos);
- prt_printf(out, "\n");
-}
-
-static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct bch_fs_usage_base b = {};
-
- acc_u64s_percpu(&b.hidden, &c->usage->hidden, sizeof(b) / sizeof(u64));
-
- prt_printf(out, "hidden:\t\t%llu\n", b.hidden);
- prt_printf(out, "btree:\t\t%llu\n", b.btree);
- prt_printf(out, "data:\t\t%llu\n", b.data);
- prt_printf(out, "cached:\t%llu\n", b.cached);
- prt_printf(out, "reserved:\t\t%llu\n", b.reserved);
- prt_printf(out, "nr_inodes:\t%llu\n", b.nr_inodes);
-}
-
-SHOW(bch2_fs)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
-
- sysfs_print(minor, c->minor);
- sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
-
- if (attr == &sysfs_flags)
- prt_bitflags(out, bch2_fs_flag_strs, c->flags);
-
- sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
-
- if (attr == &sysfs_btree_write_stats)
- bch2_btree_write_stats_to_text(out, c);
-
- if (attr == &sysfs_gc_gens_pos)
- bch2_gc_gens_pos_to_text(out, c);
-
- sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */
-
- if (attr == &sysfs_copy_gc_wait)
- bch2_copygc_wait_to_text(out, c);
-
- if (attr == &sysfs_rebalance_status)
- bch2_rebalance_status_to_text(out, c);
-
- /* Debugging: */
-
- if (attr == &sysfs_journal_debug)
- bch2_journal_debug_to_text(out, &c->journal);
-
- if (attr == &sysfs_btree_cache)
- bch2_btree_cache_to_text(out, &c->btree_cache);
-
- if (attr == &sysfs_btree_key_cache)
- bch2_btree_key_cache_to_text(out, &c->btree_key_cache);
-
- if (attr == &sysfs_btree_reserve_cache)
- bch2_btree_reserve_cache_to_text(out, c);
-
- if (attr == &sysfs_open_buckets)
- bch2_open_buckets_to_text(out, c, NULL);
-
- if (attr == &sysfs_open_buckets_partial)
- bch2_open_buckets_partial_to_text(out, c);
-
- if (attr == &sysfs_compression_stats)
- bch2_compression_stats_to_text(out, c);
-
- if (attr == &sysfs_new_stripes)
- bch2_new_stripes_to_text(out, c);
-
- if (attr == &sysfs_io_timers_read)
- bch2_io_timers_to_text(out, &c->io_clock[READ]);
-
- if (attr == &sysfs_io_timers_write)
- bch2_io_timers_to_text(out, &c->io_clock[WRITE]);
-
- if (attr == &sysfs_moving_ctxts)
- bch2_fs_moving_ctxts_to_text(out, c);
-
-#ifdef BCH_WRITE_REF_DEBUG
- if (attr == &sysfs_write_refs)
- bch2_write_refs_to_text(out, c);
-#endif
-
- if (attr == &sysfs_nocow_lock_table)
- bch2_nocow_locks_to_text(out, &c->nocow_locks);
-
- if (attr == &sysfs_disk_groups)
- bch2_disk_groups_to_text(out, c);
-
- if (attr == &sysfs_alloc_debug)
- bch2_fs_alloc_debug_to_text(out, c);
-
- if (attr == &sysfs_usage_base)
- bch2_fs_usage_base_to_text(out, c);
-
- return 0;
-}
-
-STORE(bch2_fs)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
-
- sysfs_pd_controller_store(rebalance, &c->rebalance.pd);
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return -EPERM;
-
- /* Debugging: */
-
- if (attr == &sysfs_trigger_btree_updates)
- queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
-
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))
- return -EROFS;
-
- if (attr == &sysfs_trigger_btree_cache_shrink) {
- struct btree_cache *bc = &c->btree_cache;
- struct shrink_control sc;
-
- sc.gfp_mask = GFP_KERNEL;
- sc.nr_to_scan = strtoul_or_return(buf);
- bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc);
- }
-
- if (attr == &sysfs_trigger_btree_key_cache_shrink) {
- struct shrink_control sc;
-
- sc.gfp_mask = GFP_KERNEL;
- sc.nr_to_scan = strtoul_or_return(buf);
- c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc);
- }
-
- if (attr == &sysfs_trigger_gc)
- bch2_gc_gens(c);
-
- if (attr == &sysfs_trigger_discards)
- bch2_do_discards(c);
-
- if (attr == &sysfs_trigger_invalidates)
- bch2_do_invalidates(c);
-
- if (attr == &sysfs_trigger_journal_flush) {
- bch2_journal_flush_all_pins(&c->journal);
- bch2_journal_meta(&c->journal);
- }
-
- if (attr == &sysfs_trigger_journal_writes)
- bch2_journal_do_writes(&c->journal);
-
- if (attr == &sysfs_trigger_freelist_wakeup)
- closure_wake_up(&c->freelist_wait);
-
-#ifdef CONFIG_BCACHEFS_TESTS
- if (attr == &sysfs_perf_test) {
- char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
- char *test = strsep(&p, " \t\n");
- char *nr_str = strsep(&p, " \t\n");
- char *threads_str = strsep(&p, " \t\n");
- unsigned threads;
- u64 nr;
- int ret = -EINVAL;
-
- if (threads_str &&
- !(ret = kstrtouint(threads_str, 10, &threads)) &&
- !(ret = bch2_strtoull_h(nr_str, &nr)))
- ret = bch2_btree_perf_test(c, test, nr, threads);
- kfree(tmp);
-
- if (ret)
- size = ret;
- }
-#endif
- bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
- return size;
-}
-SYSFS_OPS(bch2_fs);
-
-struct attribute *bch2_fs_files[] = {
- &sysfs_minor,
- &sysfs_btree_cache_size,
- &sysfs_btree_write_stats,
-
- &sysfs_rebalance_status,
-
- &sysfs_compression_stats,
-
-#ifdef CONFIG_BCACHEFS_TESTS
- &sysfs_perf_test,
-#endif
- NULL
-};
-
-/* counters dir */
-
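-/* Each counter file reports totals both since mount and since filesystem creation: */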
-SHOW(bch2_fs_counters)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj);
- u64 counter = 0;
- u64 counter_since_mount = 0;
-
- printbuf_tabstop_push(out, 32);
-
- #define x(t, n, f, ...) \
- if (attr == &sysfs_##t) { \
- counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\
- counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\
- if (f & TYPE_SECTORS) { \
- counter <<= 9; \
- counter_since_mount <<= 9; \
- } \
- \
- prt_printf(out, "since mount:\t"); \
- (f & TYPE_COUNTER) ? prt_u64(out, counter_since_mount) :\
- prt_human_readable_u64(out, counter_since_mount); \
- prt_newline(out); \
- \
- prt_printf(out, "since filesystem creation:\t"); \
- (f & TYPE_COUNTER) ? prt_u64(out, counter) : \
- prt_human_readable_u64(out, counter); \
- prt_newline(out); \
- }
- BCH_PERSISTENT_COUNTERS()
- #undef x
- return 0;
-}
-
-STORE(bch2_fs_counters)
-{
-	return 0;
-}
-
-SYSFS_OPS(bch2_fs_counters);
-
-struct attribute *bch2_fs_counters_files[] = {
-#define x(t, ...) \
- &sysfs_##t,
- BCH_PERSISTENT_COUNTERS()
-#undef x
- NULL
-};
-
-/* internal dir - just a wrapper */
-
-SHOW(bch2_fs_internal)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
-
- return bch2_fs_to_text(out, &c->kobj, attr);
-}
-
-STORE(bch2_fs_internal)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
-
- return bch2_fs_store(&c->kobj, attr, buf, size);
-}
-SYSFS_OPS(bch2_fs_internal);
-
-struct attribute *bch2_fs_internal_files[] = {
- &sysfs_flags,
- &sysfs_journal_debug,
- &sysfs_btree_cache,
- &sysfs_btree_key_cache,
- &sysfs_btree_reserve_cache,
- &sysfs_new_stripes,
- &sysfs_open_buckets,
- &sysfs_open_buckets_partial,
-#ifdef BCH_WRITE_REF_DEBUG
- &sysfs_write_refs,
-#endif
- &sysfs_nocow_lock_table,
- &sysfs_io_timers_read,
- &sysfs_io_timers_write,
-
- &sysfs_trigger_gc,
- &sysfs_trigger_discards,
- &sysfs_trigger_invalidates,
- &sysfs_trigger_journal_flush,
- &sysfs_trigger_journal_writes,
- &sysfs_trigger_btree_cache_shrink,
- &sysfs_trigger_btree_key_cache_shrink,
- &sysfs_trigger_freelist_wakeup,
- &sysfs_trigger_btree_updates,
-
- &sysfs_gc_gens_pos,
-
- &sysfs_copy_gc_wait,
-
- sysfs_pd_controller_files(rebalance),
-
- &sysfs_moving_ctxts,
-
- &sysfs_internal_uuid,
-
- &sysfs_disk_groups,
- &sysfs_alloc_debug,
- &sysfs_usage_base,
- NULL
-};
-
-/* options */
-
-static ssize_t sysfs_opt_show(struct bch_fs *c,
- struct bch_dev *ca,
- enum bch_opt_id id,
- struct printbuf *out)
-{
- const struct bch_option *opt = bch2_opt_table + id;
- u64 v;
-
- if (opt->flags & OPT_FS) {
- v = bch2_opt_get_by_id(&c->opts, id);
- } else if ((opt->flags & OPT_DEVICE) && opt->get_member) {
- v = bch2_opt_from_sb(c->disk_sb.sb, id, ca->dev_idx);
- } else {
- return -EINVAL;
- }
-
- bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST);
- prt_char(out, '\n');
- return 0;
-}
-
-static ssize_t sysfs_opt_store(struct bch_fs *c,
- struct bch_dev *ca,
- enum bch_opt_id id,
- const char *buf, size_t size)
-{
- const struct bch_option *opt = bch2_opt_table + id;
- int ret = 0;
-
- /*
- * We don't need to take c->writes for correctness, but it eliminates an
- * unsightly error message in the dmesg log when we're RO:
- */
- if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)))
- return -EROFS;
-
- down_write(&c->state_lock);
-
- char *tmp = kstrdup(buf, GFP_KERNEL);
- if (!tmp) {
- ret = -ENOMEM;
- goto err;
- }
-
- u64 v;
- ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?:
- bch2_opt_check_may_set(c, ca, id, v);
- kfree(tmp);
-
- if (ret < 0)
- goto err;
-
- bch2_opt_set_sb(c, ca, opt, v);
- bch2_opt_set_by_id(&c->opts, id, v);
-
- if (v &&
- (id == Opt_background_target ||
- (id == Opt_foreground_target && !c->opts.background_target) ||
- id == Opt_background_compression ||
- (id == Opt_compression && !c->opts.background_compression)))
- bch2_set_rebalance_needs_scan(c, 0);
-
- if (v && id == Opt_rebalance_enabled)
- rebalance_wakeup(c);
-
- if (v && id == Opt_copygc_enabled &&
- c->copygc_thread)
- wake_up_process(c->copygc_thread);
-
- if (id == Opt_discard && !ca) {
- mutex_lock(&c->sb_lock);
- for_each_member_device(c, ca)
- opt->set_member(bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx), v);
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
-
- ret = size;
-err:
- up_write(&c->state_lock);
- bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
- return ret;
-}
-
-SHOW(bch2_fs_opts_dir)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
- int id = bch2_opt_lookup(attr->name);
- if (id < 0)
- return 0;
-
- return sysfs_opt_show(c, NULL, id, out);
-}
-
-STORE(bch2_fs_opts_dir)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
- int id = bch2_opt_lookup(attr->name);
- if (id < 0)
- return 0;
-
- return sysfs_opt_store(c, NULL, id, buf, size);
-}
-SYSFS_OPS(bch2_fs_opts_dir);
-
-struct attribute *bch2_fs_opts_dir_files[] = { NULL };
-
-int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type)
-{
- for (const struct bch_option *i = bch2_opt_table;
- i < bch2_opt_table + bch2_opts_nr;
- i++) {
- if (i->flags & OPT_HIDDEN)
- continue;
- if (!(i->flags & type))
- continue;
-
- int ret = sysfs_create_file(kobj, &i->attr);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/* time stats */
-
-SHOW(bch2_fs_time_stats)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
-
-#define x(name) \
- if (attr == &sysfs_time_stat_##name) \
- bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]);
- BCH_TIME_STATS()
-#undef x
-
- return 0;
-}
-
-STORE(bch2_fs_time_stats)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
-
-#define x(name) \
- if (attr == &sysfs_time_stat_##name) \
- bch2_time_stats_reset(&c->times[BCH_TIME_##name]);
- BCH_TIME_STATS()
-#undef x
- return size;
-}
-SYSFS_OPS(bch2_fs_time_stats);
-
-struct attribute *bch2_fs_time_stats_files[] = {
-#define x(name) \
- &sysfs_time_stat_##name,
- BCH_TIME_STATS()
-#undef x
- NULL
-};
-
-static const char * const bch2_rw[] = {
- "read",
- "write",
- NULL
-};
-
-static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca)
-{
- int rw, i;
-
- for (rw = 0; rw < 2; rw++) {
- prt_printf(out, "%s:\n", bch2_rw[rw]);
-
- for (i = 1; i < BCH_DATA_NR; i++)
- prt_printf(out, "%-12s:%12llu\n",
- bch2_data_type_str(i),
- percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
- }
-}
-
-SHOW(bch2_dev)
-{
- struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
- struct bch_fs *c = ca->fs;
-
-	sysfs_printf(uuid,		"%pU", ca->uuid.b);
-
- sysfs_print(first_bucket, ca->mi.first_bucket);
- sysfs_print(nbuckets, ca->mi.nbuckets);
-
- if (attr == &sysfs_label) {
- if (ca->mi.group)
- bch2_disk_path_to_text(out, c, ca->mi.group - 1);
- prt_char(out, '\n');
- }
-
- if (attr == &sysfs_has_data) {
- prt_bitflags(out, __bch2_data_types, bch2_dev_has_data(c, ca));
- prt_char(out, '\n');
- }
-
- if (attr == &sysfs_io_done)
- dev_io_done_to_text(out, ca);
-
- if (attr == &sysfs_io_errors)
- bch2_dev_io_errors_to_text(out, ca);
-
- sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ]));
- sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
-
- if (attr == &sysfs_io_latency_stats_read)
- bch2_time_stats_to_text(out, &ca->io_latency[READ].stats);
-
- if (attr == &sysfs_io_latency_stats_write)
- bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats);
-
- sysfs_printf(congested, "%u%%",
- clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
- * 100 / CONGESTED_MAX);
-
- if (attr == &sysfs_alloc_debug)
- bch2_dev_alloc_debug_to_text(out, ca);
-
- if (attr == &sysfs_open_buckets)
- bch2_open_buckets_to_text(out, c, ca);
-
- int opt_id = bch2_opt_lookup(attr->name);
- if (opt_id >= 0)
- return sysfs_opt_show(c, ca, opt_id, out);
-
- return 0;
-}
-
-STORE(bch2_dev)
-{
- struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
- struct bch_fs *c = ca->fs;
-
- if (attr == &sysfs_label) {
- char *tmp;
- int ret;
-
- tmp = kstrdup(buf, GFP_KERNEL);
- if (!tmp)
- return -ENOMEM;
-
- ret = bch2_dev_group_set(c, ca, strim(tmp));
- kfree(tmp);
- if (ret)
- return ret;
- }
-
- if (attr == &sysfs_io_errors_reset)
- bch2_dev_errors_reset(ca);
-
- int opt_id = bch2_opt_lookup(attr->name);
- if (opt_id >= 0)
- return sysfs_opt_store(c, ca, opt_id, buf, size);
-
- return size;
-}
-SYSFS_OPS(bch2_dev);
-
-struct attribute *bch2_dev_files[] = {
- &sysfs_uuid,
- &sysfs_first_bucket,
- &sysfs_nbuckets,
-
- /* settings: */
- &sysfs_label,
-
- &sysfs_has_data,
- &sysfs_io_done,
- &sysfs_io_errors,
- &sysfs_io_errors_reset,
-
- &sysfs_io_latency_read,
- &sysfs_io_latency_write,
- &sysfs_io_latency_stats_read,
- &sysfs_io_latency_stats_write,
- &sysfs_congested,
-
- /* debug: */
- &sysfs_alloc_debug,
- &sysfs_open_buckets,
- NULL
-};
-
-#endif /* NO_BCACHEFS_SYSFS */
diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h
deleted file mode 100644
index 303e0433c702..000000000000
--- a/fs/bcachefs/sysfs.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SYSFS_H_
-#define _BCACHEFS_SYSFS_H_
-
-#include <linux/sysfs.h>
-
-#ifndef NO_BCACHEFS_SYSFS
-
-struct attribute;
-struct sysfs_ops;
-
-extern struct attribute *bch2_fs_files[];
-extern struct attribute *bch2_fs_counters_files[];
-extern struct attribute *bch2_fs_internal_files[];
-extern struct attribute *bch2_fs_opts_dir_files[];
-extern struct attribute *bch2_fs_time_stats_files[];
-extern struct attribute *bch2_dev_files[];
-
-extern const struct sysfs_ops bch2_fs_sysfs_ops;
-extern const struct sysfs_ops bch2_fs_counters_sysfs_ops;
-extern const struct sysfs_ops bch2_fs_internal_sysfs_ops;
-extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
-extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
-extern const struct sysfs_ops bch2_dev_sysfs_ops;
-
-int bch2_opts_create_sysfs_files(struct kobject *, unsigned);
-
-#else
-
-static struct attribute *bch2_fs_files[] = {};
-static struct attribute *bch2_fs_counters_files[] = {};
-static struct attribute *bch2_fs_internal_files[] = {};
-static struct attribute *bch2_fs_opts_dir_files[] = {};
-static struct attribute *bch2_fs_time_stats_files[] = {};
-static struct attribute *bch2_dev_files[] = {};
-
-static const struct sysfs_ops bch2_fs_sysfs_ops;
-static const struct sysfs_ops bch2_fs_counters_sysfs_ops;
-static const struct sysfs_ops bch2_fs_internal_sysfs_ops;
-static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
-static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
-static const struct sysfs_ops bch2_dev_sysfs_ops;
-
-static inline int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type)
-{ return 0; }
-
-#endif /* NO_BCACHEFS_SYSFS */
-
-#endif /* _BCACHEFS_SYSFS_H_ */
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
deleted file mode 100644
index 6c6469814637..000000000000
--- a/fs/bcachefs/tests.c
+++ /dev/null
@@ -1,887 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifdef CONFIG_BCACHEFS_TESTS
-
-#include "bcachefs.h"
-#include "btree_update.h"
-#include "journal_reclaim.h"
-#include "snapshot.h"
-#include "tests.h"
-
-#include <linux/kthread.h>
-#include <linux/random.h>
-
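-/* Wipe any keys left over from a previous test run: */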
-static void delete_test_keys(struct bch_fs *c)
-{
- int ret;
-
- ret = bch2_btree_delete_range(c, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX),
- POS(0, U64_MAX),
- 0, NULL);
- BUG_ON(ret);
-
- ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX),
- POS(0, U64_MAX),
- 0, NULL);
- BUG_ON(ret);
-}
-
-/* unit tests */
-
-static int test_delete(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_i_cookie k;
- int ret;
-
- bkey_cookie_init(&k.k_i);
- k.k.p.snapshot = U32_MAX;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
- BTREE_ITER_intent);
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(trans, &iter, &k.k_i, 0));
- bch_err_msg(c, ret, "update error");
- if (ret)
- goto err;
-
- pr_info("deleting once");
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
- bch2_btree_delete_at(trans, &iter, 0));
- bch_err_msg(c, ret, "delete error (first)");
- if (ret)
- goto err;
-
- pr_info("deleting twice");
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
- bch2_btree_delete_at(trans, &iter, 0));
- bch_err_msg(c, ret, "delete error (second)");
- if (ret)
- goto err;
-err:
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
-}
-
-static int test_delete_written(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_i_cookie k;
- int ret;
-
- bkey_cookie_init(&k.k_i);
- k.k.p.snapshot = U32_MAX;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
- BTREE_ITER_intent);
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(trans, &iter, &k.k_i, 0));
- bch_err_msg(c, ret, "update error");
- if (ret)
- goto err;
-
- bch2_trans_unlock(trans);
- bch2_journal_flush_all_pins(&c->journal);
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
- bch2_btree_delete_at(trans, &iter, 0));
- bch_err_msg(c, ret, "delete error");
- if (ret)
- goto err;
-err:
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
-}
-
-static int test_iterate(struct bch_fs *c, u64 nr)
-{
- u64 i;
- int ret = 0;
-
- delete_test_keys(c);
-
- pr_info("inserting test keys");
-
- for (i = 0; i < nr; i++) {
- struct bkey_i_cookie ck;
-
- bkey_cookie_init(&ck.k_i);
- ck.k.p.offset = i;
- ck.k.p.snapshot = U32_MAX;
-
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0);
- bch_err_msg(c, ret, "insert error");
- if (ret)
- return ret;
- }
-
- pr_info("iterating forwards");
- i = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(k.k->p.offset != i++);
- 0;
- })));
- bch_err_msg(c, ret, "error iterating forwards");
- if (ret)
- return ret;
-
- BUG_ON(i != nr);
-
- pr_info("iterating backwards");
-
- ret = bch2_trans_run(c,
- for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs,
- SPOS(0, U64_MAX, U32_MAX), 0, k, ({
- BUG_ON(k.k->p.offset != --i);
- 0;
- })));
- bch_err_msg(c, ret, "error iterating backwards");
- if (ret)
- return ret;
-
- BUG_ON(i);
- return 0;
-}
-
-static int test_iterate_extents(struct bch_fs *c, u64 nr)
-{
- u64 i;
- int ret = 0;
-
- delete_test_keys(c);
-
- pr_info("inserting test extents");
-
- for (i = 0; i < nr; i += 8) {
- struct bkey_i_cookie ck;
-
- bkey_cookie_init(&ck.k_i);
- ck.k.p.offset = i + 8;
- ck.k.p.snapshot = U32_MAX;
- ck.k.size = 8;
-
- ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0);
- bch_err_msg(c, ret, "insert error");
- if (ret)
- return ret;
- }
-
- pr_info("iterating forwards");
- i = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(bkey_start_offset(k.k) != i);
- i = k.k->p.offset;
- 0;
- })));
- bch_err_msg(c, ret, "error iterating forwards");
- if (ret)
- return ret;
-
- BUG_ON(i != nr);
-
- pr_info("iterating backwards");
-
- ret = bch2_trans_run(c,
- for_each_btree_key_reverse(trans, iter, BTREE_ID_extents,
- SPOS(0, U64_MAX, U32_MAX), 0, k, ({
- BUG_ON(k.k->p.offset != i);
- i = bkey_start_offset(k.k);
- 0;
- })));
- bch_err_msg(c, ret, "error iterating backwards");
- if (ret)
- return ret;
-
- BUG_ON(i);
- return 0;
-}
-
-static int test_iterate_slots(struct bch_fs *c, u64 nr)
-{
- u64 i;
- int ret = 0;
-
- delete_test_keys(c);
-
- pr_info("inserting test keys");
-
- for (i = 0; i < nr; i++) {
- struct bkey_i_cookie ck;
-
- bkey_cookie_init(&ck.k_i);
- ck.k.p.offset = i * 2;
- ck.k.p.snapshot = U32_MAX;
-
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0);
- bch_err_msg(c, ret, "insert error");
- if (ret)
- return ret;
- }
-
- pr_info("iterating forwards");
- i = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(k.k->p.offset != i);
- i += 2;
- 0;
- })));
- bch_err_msg(c, ret, "error iterating forwards");
- if (ret)
- return ret;
-
- BUG_ON(i != nr * 2);
-
- pr_info("iterating forwards by slots");
- i = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- BTREE_ITER_slots, k, ({
- if (i >= nr * 2)
- break;
-
- BUG_ON(k.k->p.offset != i);
- BUG_ON(bkey_deleted(k.k) != (i & 1));
-
- i++;
- 0;
- })));
- bch_err_msg(c, ret, "error iterating forwards by slots");
- return ret;
-}
-
-static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
-{
- u64 i;
- int ret = 0;
-
- delete_test_keys(c);
-
- pr_info("inserting test keys");
-
- for (i = 0; i < nr; i += 16) {
- struct bkey_i_cookie ck;
-
- bkey_cookie_init(&ck.k_i);
- ck.k.p.offset = i + 16;
- ck.k.p.snapshot = U32_MAX;
- ck.k.size = 8;
-
- ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0);
- bch_err_msg(c, ret, "insert error");
- if (ret)
- return ret;
- }
-
- pr_info("iterating forwards");
- i = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(bkey_start_offset(k.k) != i + 8);
- BUG_ON(k.k->size != 8);
- i += 16;
- 0;
- })));
- bch_err_msg(c, ret, "error iterating forwards");
- if (ret)
- return ret;
-
- BUG_ON(i != nr);
-
- pr_info("iterating forwards by slots");
- i = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- BTREE_ITER_slots, k, ({
- if (i == nr)
- break;
- BUG_ON(bkey_deleted(k.k) != !(i % 16));
-
- BUG_ON(bkey_start_offset(k.k) != i);
- BUG_ON(k.k->size != 8);
- i = k.k->p.offset;
- 0;
- })));
- bch_err_msg(c, ret, "error iterating forwards by slots");
- return ret;
-}
-
-/*
- * XXX: we really want to make sure we've got a btree with depth > 0 for these
- * tests
- */
-static int test_peek_end(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), 0);
-
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
- BUG_ON(k.k);
-
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
- BUG_ON(k.k);
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return 0;
-}
-
-static int test_peek_end_extents(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), 0);
-
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
- BUG_ON(k.k);
-
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
- BUG_ON(k.k);
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return 0;
-}
-
-/* extent unit tests */
-
-static u64 test_version;
-
-static int insert_test_extent(struct bch_fs *c,
- u64 start, u64 end)
-{
- struct bkey_i_cookie k;
- int ret;
-
- bkey_cookie_init(&k.k_i);
- k.k_i.k.p.offset = end;
- k.k_i.k.p.snapshot = U32_MAX;
- k.k_i.k.size = end - start;
- k.k_i.k.bversion.lo = test_version++;
-
- ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int __test_extent_overwrite(struct bch_fs *c,
- u64 e1_start, u64 e1_end,
- u64 e2_start, u64 e2_end)
-{
- int ret;
-
- ret = insert_test_extent(c, e1_start, e1_end) ?:
- insert_test_extent(c, e2_start, e2_end);
-
- delete_test_keys(c);
- return ret;
-}
-
-static int test_extent_overwrite_front(struct bch_fs *c, u64 nr)
-{
- return __test_extent_overwrite(c, 0, 64, 0, 32) ?:
- __test_extent_overwrite(c, 8, 64, 0, 32);
-}
-
-static int test_extent_overwrite_back(struct bch_fs *c, u64 nr)
-{
- return __test_extent_overwrite(c, 0, 64, 32, 64) ?:
- __test_extent_overwrite(c, 0, 64, 32, 72);
-}
-
-static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr)
-{
- return __test_extent_overwrite(c, 0, 64, 32, 40);
-}
-
-static int test_extent_overwrite_all(struct bch_fs *c, u64 nr)
-{
- return __test_extent_overwrite(c, 32, 64, 0, 64) ?:
- __test_extent_overwrite(c, 32, 64, 0, 128) ?:
- __test_extent_overwrite(c, 32, 64, 32, 64) ?:
- __test_extent_overwrite(c, 32, 64, 32, 128);
-}
-
-static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, u32 len, u32 snapid)
-{
- struct bkey_i_cookie k;
- int ret;
-
- bkey_cookie_init(&k.k_i);
- k.k_i.k.p.inode = inum;
- k.k_i.k.p.offset = start + len;
- k.k_i.k.p.snapshot = snapid;
- k.k_i.k.size = len;
-
- ret = bch2_trans_commit_do(c, NULL, NULL, 0,
- bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i,
- BTREE_UPDATE_internal_snapshot_node));
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int test_extent_create_overlapping(struct bch_fs *c, u64 inum)
-{
- return insert_test_overlapping_extent(c, inum, 0, 16, U32_MAX - 2) ?: /* overwrite entire */
- insert_test_overlapping_extent(c, inum, 2, 8, U32_MAX - 2) ?:
- insert_test_overlapping_extent(c, inum, 4, 4, U32_MAX) ?:
- insert_test_overlapping_extent(c, inum, 32, 8, U32_MAX - 2) ?: /* overwrite front/back */
- insert_test_overlapping_extent(c, inum, 36, 8, U32_MAX) ?:
- insert_test_overlapping_extent(c, inum, 60, 8, U32_MAX - 2) ?:
- insert_test_overlapping_extent(c, inum, 64, 8, U32_MAX);
-}
-
-/* snapshot unit tests */
-
-/* Test skipping over keys in unrelated snapshots: */
-static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
-{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_i_cookie cookie;
- int ret;
-
- bkey_cookie_init(&cookie.k_i);
- cookie.k.p.snapshot = snapid_hi;
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0);
- if (ret)
- return ret;
-
- trans = bch2_trans_get(c);
- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
- SPOS(0, 0, snapid_lo), 0);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
-
- BUG_ON(k.k->p.snapshot != U32_MAX);
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
-}
-
-static int test_snapshots(struct bch_fs *c, u64 nr)
-{
- struct bkey_i_cookie cookie;
- u32 snapids[2];
- u32 snapid_subvols[2] = { 1, 1 };
- int ret;
-
- bkey_cookie_init(&cookie.k_i);
- cookie.k.p.snapshot = U32_MAX;
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0);
- if (ret)
- return ret;
-
- ret = bch2_trans_commit_do(c, NULL, NULL, 0,
- bch2_snapshot_node_create(trans, U32_MAX,
- snapids,
- snapid_subvols,
- 2));
- if (ret)
- return ret;
-
- if (snapids[0] > snapids[1])
- swap(snapids[0], snapids[1]);
-
- ret = test_snapshot_filter(c, snapids[0], snapids[1]);
- bch_err_msg(c, ret, "from test_snapshot_filter");
- return ret;
-}
-
-/* perf tests */
-
-static u64 test_rand(void)
-{
- u64 v;
-
- get_random_bytes(&v, sizeof(v));
- return v;
-}
-
-static int rand_insert(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bkey_i_cookie k;
- int ret = 0;
- u64 i;
-
- for (i = 0; i < nr; i++) {
- bkey_cookie_init(&k.k_i);
- k.k.p.offset = test_rand();
- k.k.p.snapshot = U32_MAX;
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k.k_i, 0));
- if (ret)
- break;
- }
-
- bch2_trans_put(trans);
- return ret;
-}
-
-static int rand_insert_multi(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bkey_i_cookie k[8];
- int ret = 0;
- unsigned j;
- u64 i;
-
- for (i = 0; i < nr; i += ARRAY_SIZE(k)) {
- for (j = 0; j < ARRAY_SIZE(k); j++) {
- bkey_cookie_init(&k[j].k_i);
- k[j].k.p.offset = test_rand();
- k[j].k.p.snapshot = U32_MAX;
- }
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[7].k_i, 0));
- if (ret)
- break;
- }
-
- bch2_trans_put(trans);
- return ret;
-}
-
-static int rand_lookup(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
- u64 i;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), 0);
-
- for (i = 0; i < nr; i++) {
- bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX));
-
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
- ret = bkey_err(k);
- if (ret)
- break;
- }
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
-}
-
-static int rand_mixed_trans(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i_cookie *cookie,
- u64 i, u64 pos)
-{
- struct bkey_s_c k;
- int ret;
-
- bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX));
-
- k = bch2_btree_iter_peek(iter);
- ret = bkey_err(k);
- bch_err_msg(trans->c, ret, "lookup error");
- if (ret)
- return ret;
-
- if (!(i & 3) && k.k) {
- bkey_cookie_init(&cookie->k_i);
- cookie->k.p = iter->pos;
- ret = bch2_trans_update(trans, iter, &cookie->k_i, 0);
- }
-
- return ret;
-}
-
-static int rand_mixed(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_i_cookie cookie;
- int ret = 0;
- u64 i, rand;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), 0);
-
- for (i = 0; i < nr; i++) {
- rand = test_rand();
- ret = commit_do(trans, NULL, NULL, 0,
- rand_mixed_trans(trans, &iter, &cookie, i, rand));
- if (ret)
- break;
- }
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
-}
-
-static int __do_delete(struct btree_trans *trans, struct bpos pos)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
- BTREE_ITER_intent);
- k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX));
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (!k.k)
- goto err;
-
- ret = bch2_btree_delete_at(trans, &iter, 0);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int rand_delete(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- int ret = 0;
- u64 i;
-
- for (i = 0; i < nr; i++) {
- struct bpos pos = SPOS(0, test_rand(), U32_MAX);
-
- ret = commit_do(trans, NULL, NULL, 0,
- __do_delete(trans, pos));
- if (ret)
- break;
- }
-
- bch2_trans_put(trans);
- return ret;
-}
-
-static int seq_insert(struct bch_fs *c, u64 nr)
-{
- struct bkey_i_cookie insert;
-
- bkey_cookie_init(&insert.k_i);
-
- return bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX),
- BTREE_ITER_slots|BTREE_ITER_intent, k,
- NULL, NULL, 0, ({
- if (iter.pos.offset >= nr)
- break;
- insert.k.p = iter.pos;
- bch2_trans_update(trans, &iter, &insert.k_i, 0);
- })));
-}
-
-static int seq_lookup(struct bch_fs *c, u64 nr)
-{
- return bch2_trans_run(c,
- for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k,
- 0));
-}
-
-static int seq_overwrite(struct bch_fs *c, u64 nr)
-{
- return bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX),
- BTREE_ITER_intent, k,
- NULL, NULL, 0, ({
- struct bkey_i_cookie u;
-
- bkey_reassemble(&u.k_i, k);
- bch2_trans_update(trans, &iter, &u.k_i, 0);
- })));
-}
-
-static int seq_delete(struct bch_fs *c, u64 nr)
-{
- return bch2_btree_delete_range(c, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX),
- POS(0, U64_MAX),
- 0, NULL);
-}
-
-typedef int (*perf_test_fn)(struct bch_fs *, u64);
-
-struct test_job {
- struct bch_fs *c;
- u64 nr;
- unsigned nr_threads;
- perf_test_fn fn;
-
- atomic_t ready;
- wait_queue_head_t ready_wait;
-
- atomic_t done;
- struct completion done_completion;
-
- u64 start;
- u64 finish;
- int ret;
-};
-
-static int btree_perf_test_thread(void *data)
-{
- struct test_job *j = data;
- int ret;
-
- if (atomic_dec_and_test(&j->ready)) {
- wake_up(&j->ready_wait);
- j->start = sched_clock();
- } else {
- wait_event(j->ready_wait, !atomic_read(&j->ready));
- }
-
- ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads));
- if (ret) {
- bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret));
- j->ret = ret;
- }
-
- if (atomic_dec_and_test(&j->done)) {
- j->finish = sched_clock();
- complete(&j->done_completion);
- }
-
- return 0;
-}
-
-int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
- u64 nr, unsigned nr_threads)
-{
- struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads };
- char name_buf[20];
- struct printbuf nr_buf = PRINTBUF;
- struct printbuf per_sec_buf = PRINTBUF;
- unsigned i;
- u64 time;
-
- if (nr == 0 || nr_threads == 0) {
- pr_err("nr of iterations or threads is not allowed to be 0");
- return -EINVAL;
- }
-
- atomic_set(&j.ready, nr_threads);
- init_waitqueue_head(&j.ready_wait);
-
- atomic_set(&j.done, nr_threads);
- init_completion(&j.done_completion);
-
-#define perf_test(_test) \
- if (!strcmp(testname, #_test)) j.fn = _test
-
- perf_test(rand_insert);
- perf_test(rand_insert_multi);
- perf_test(rand_lookup);
- perf_test(rand_mixed);
- perf_test(rand_delete);
-
- perf_test(seq_insert);
- perf_test(seq_lookup);
- perf_test(seq_overwrite);
- perf_test(seq_delete);
-
-	/* unit tests, not perf tests: */
- perf_test(test_delete);
- perf_test(test_delete_written);
- perf_test(test_iterate);
- perf_test(test_iterate_extents);
- perf_test(test_iterate_slots);
- perf_test(test_iterate_slots_extents);
- perf_test(test_peek_end);
- perf_test(test_peek_end_extents);
-
- perf_test(test_extent_overwrite_front);
- perf_test(test_extent_overwrite_back);
- perf_test(test_extent_overwrite_middle);
- perf_test(test_extent_overwrite_all);
- perf_test(test_extent_create_overlapping);
-
- perf_test(test_snapshots);
-
- if (!j.fn) {
- pr_err("unknown test %s", testname);
- return -EINVAL;
- }
-
- //pr_info("running test %s:", testname);
-
- if (nr_threads == 1)
- btree_perf_test_thread(&j);
- else
- for (i = 0; i < nr_threads; i++)
- kthread_run(btree_perf_test_thread, &j,
- "bcachefs perf test[%u]", i);
-
- while (wait_for_completion_interruptible(&j.done_completion))
- ;
-
- time = j.finish - j.start;
-
- scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
- prt_human_readable_u64(&nr_buf, nr);
- prt_human_readable_u64(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time));
- printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
- name_buf, nr_buf.buf, nr_threads,
- div_u64(time, NSEC_PER_SEC),
- div_u64(time * nr_threads, nr),
- per_sec_buf.buf);
- printbuf_exit(&per_sec_buf);
- printbuf_exit(&nr_buf);
- return j.ret;
-}
-
-#endif /* CONFIG_BCACHEFS_TESTS */
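
The harness above synchronizes its workers with two countdown gates rather than a
barrier primitive: each thread decrements j->ready, and the last to arrive records
the start timestamp and wakes the rest; symmetrically, the last thread to decrement
j->done records the finish timestamp and completes the waiting caller. A minimal
sketch of that pattern as used by btree_perf_test_thread(); struct gate and the
helper names here are illustrative, only the kernel primitives are real:

#include <linux/atomic.h>
#include <linux/completion.h>
#include <linux/sched/clock.h>
#include <linux/wait.h>

struct gate {
	atomic_t		ready;		/* counts down as threads arrive */
	wait_queue_head_t	ready_wait;
	atomic_t		done;		/* counts down as threads finish */
	struct completion	done_completion;
	u64			start, finish;	/* sched_clock() timestamps */
};

static void gate_init(struct gate *g, unsigned nr_threads)
{
	atomic_set(&g->ready, nr_threads);
	init_waitqueue_head(&g->ready_wait);
	atomic_set(&g->done, nr_threads);
	init_completion(&g->done_completion);
}

static void gate_enter(struct gate *g)
{
	if (atomic_dec_and_test(&g->ready)) {
		/* last thread in starts the clock and releases everyone */
		g->start = sched_clock();
		wake_up(&g->ready_wait);
	} else {
		wait_event(g->ready_wait, !atomic_read(&g->ready));
	}
}

static void gate_exit(struct gate *g)
{
	if (atomic_dec_and_test(&g->done)) {
		/* last thread out stops the clock and wakes the caller,
		 * who then reads g->finish - g->start */
		g->finish = sched_clock();
		complete(&g->done_completion);
	}
}
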
diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h
deleted file mode 100644
index c73b18aea7e0..000000000000
--- a/fs/bcachefs/tests.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_TEST_H
-#define _BCACHEFS_TEST_H
-
-struct bch_fs;
-
-#ifdef CONFIG_BCACHEFS_TESTS
-
-int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
-
-#else
-
-#endif /* CONFIG_BCACHEFS_TESTS */
-
-#endif /* _BCACHEFS_TEST_H */
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
deleted file mode 100644
index dea73bc1cb51..000000000000
--- a/fs/bcachefs/thread_with_file.c
+++ /dev/null
@@ -1,492 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "thread_with_file.h"
-
-#include <linux/anon_inodes.h>
-#include <linux/file.h>
-#include <linux/kthread.h>
-#include <linux/pagemap.h>
-#include <linux/poll.h>
-#include <linux/sched/sysctl.h>
-
-void bch2_thread_with_file_exit(struct thread_with_file *thr)
-{
- if (thr->task) {
- kthread_stop(thr->task);
- put_task_struct(thr->task);
- }
-}
-
-int bch2_run_thread_with_file(struct thread_with_file *thr,
- const struct file_operations *fops,
- int (*fn)(void *))
-{
- struct file *file = NULL;
- int ret, fd = -1;
- unsigned fd_flags = O_CLOEXEC;
-
- if (fops->read && fops->write)
- fd_flags |= O_RDWR;
- else if (fops->read)
- fd_flags |= O_RDONLY;
- else if (fops->write)
- fd_flags |= O_WRONLY;
-
- char name[TASK_COMM_LEN];
- get_task_comm(name, current);
-
- thr->ret = 0;
- thr->task = kthread_create(fn, thr, "%s", name);
- ret = PTR_ERR_OR_ZERO(thr->task);
- if (ret)
- return ret;
-
- ret = get_unused_fd_flags(fd_flags);
- if (ret < 0)
- goto err;
- fd = ret;
-
- file = anon_inode_getfile(name, fops, thr, fd_flags);
- ret = PTR_ERR_OR_ZERO(file);
- if (ret)
- goto err;
-
- get_task_struct(thr->task);
- wake_up_process(thr->task);
- fd_install(fd, file);
- return fd;
-err:
- if (fd >= 0)
- put_unused_fd(fd);
- if (thr->task)
- kthread_stop(thr->task);
- return ret;
-}
-
-/* stdio_redirect */
-
-static bool stdio_redirect_has_more_input(struct stdio_redirect *stdio, size_t seen)
-{
- return stdio->input.buf.nr > seen || stdio->done;
-}
-
-static bool stdio_redirect_has_input(struct stdio_redirect *stdio)
-{
- return stdio_redirect_has_more_input(stdio, 0);
-}
-
-static bool stdio_redirect_has_output(struct stdio_redirect *stdio)
-{
- return stdio->output.buf.nr || stdio->done;
-}
-
-#define STDIO_REDIRECT_BUFSIZE 4096
-
-static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio)
-{
- return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
-}
-
-static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio)
-{
- return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
-}
-
-static void stdio_buf_init(struct stdio_buf *buf)
-{
- spin_lock_init(&buf->lock);
- init_waitqueue_head(&buf->wait);
- darray_init(&buf->buf);
-}
-
-/* thread_with_stdio */
-
-static void thread_with_stdio_done(struct thread_with_stdio *thr)
-{
- thr->thr.done = true;
- thr->stdio.done = true;
- wake_up(&thr->stdio.input.wait);
- wake_up(&thr->stdio.output.wait);
-}
-
-static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf,
- size_t len, loff_t *ppos)
-{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
- struct stdio_buf *buf = &thr->stdio.output;
- size_t copied = 0, b;
- int ret = 0;
-
- if (!(file->f_flags & O_NONBLOCK)) {
- ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio));
- if (ret)
- return ret;
- } else if (!stdio_redirect_has_output(&thr->stdio))
- return -EAGAIN;
-
- while (len && buf->buf.nr) {
- if (fault_in_writeable(ubuf, len) == len) {
- ret = -EFAULT;
- break;
- }
-
- spin_lock_irq(&buf->lock);
- b = min_t(size_t, len, buf->buf.nr);
-
- if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) {
- ubuf += b;
- len -= b;
- copied += b;
- buf->buf.nr -= b;
- memmove(buf->buf.data,
- buf->buf.data + b,
- buf->buf.nr);
- }
- spin_unlock_irq(&buf->lock);
- }
-
- return copied ?: ret;
-}
-
-static int thread_with_stdio_release(struct inode *inode, struct file *file)
-{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
-
- thread_with_stdio_done(thr);
- bch2_thread_with_file_exit(&thr->thr);
- darray_exit(&thr->stdio.input.buf);
- darray_exit(&thr->stdio.output.buf);
- thr->ops->exit(thr);
- return 0;
-}
-
-static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf,
- size_t len, loff_t *ppos)
-{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
- struct stdio_buf *buf = &thr->stdio.input;
- size_t copied = 0;
- ssize_t ret = 0;
-
- while (len) {
- if (thr->thr.done) {
- ret = -EPIPE;
- break;
- }
-
- size_t b = len - fault_in_readable(ubuf, len);
- if (!b) {
- ret = -EFAULT;
- break;
- }
-
- spin_lock(&buf->lock);
- size_t makeroom = b;
- if (!buf->waiting_for_line || memchr(buf->buf.data, '\n', buf->buf.nr))
- makeroom = min_t(ssize_t, makeroom,
- max_t(ssize_t, STDIO_REDIRECT_BUFSIZE - buf->buf.nr,
- 0));
- darray_make_room_gfp(&buf->buf, makeroom, GFP_NOWAIT);
-
- b = min(len, darray_room(buf->buf));
-
- if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) {
- buf->buf.nr += b;
- ubuf += b;
- len -= b;
- copied += b;
- }
- spin_unlock(&buf->lock);
-
- if (b) {
- wake_up(&buf->wait);
- } else {
- if ((file->f_flags & O_NONBLOCK)) {
- ret = -EAGAIN;
- break;
- }
-
- ret = wait_event_interruptible(buf->wait,
- stdio_redirect_has_input_space(&thr->stdio));
- if (ret)
- break;
- }
- }
-
- return copied ?: ret;
-}
-
-static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_struct *wait)
-{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
-
- poll_wait(file, &thr->stdio.output.wait, wait);
- poll_wait(file, &thr->stdio.input.wait, wait);
-
- __poll_t mask = 0;
-
- if (stdio_redirect_has_output(&thr->stdio))
- mask |= EPOLLIN;
- if (stdio_redirect_has_input_space(&thr->stdio))
- mask |= EPOLLOUT;
- if (thr->thr.done)
- mask |= EPOLLHUP|EPOLLERR;
- return mask;
-}
-
-static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait)
-{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
-
- poll_wait(file, &thr->stdio.output.wait, wait);
-
- __poll_t mask = 0;
-
- if (stdio_redirect_has_output(&thr->stdio))
- mask |= EPOLLIN;
- if (thr->thr.done)
- mask |= EPOLLHUP|EPOLLERR;
- return mask;
-}
-
-static int thread_with_stdio_flush(struct file *file, fl_owner_t id)
-{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
-
- return thr->thr.ret;
-}
-
-static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p)
-{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
-
- if (thr->ops->unlocked_ioctl)
- return thr->ops->unlocked_ioctl(thr, cmd, p);
- return -ENOTTY;
-}
-
-static const struct file_operations thread_with_stdio_fops = {
- .read = thread_with_stdio_read,
- .write = thread_with_stdio_write,
- .poll = thread_with_stdio_poll,
- .flush = thread_with_stdio_flush,
- .release = thread_with_stdio_release,
- .unlocked_ioctl = thread_with_stdio_ioctl,
-};
-
-static const struct file_operations thread_with_stdout_fops = {
- .read = thread_with_stdio_read,
- .poll = thread_with_stdout_poll,
- .flush = thread_with_stdio_flush,
- .release = thread_with_stdio_release,
- .unlocked_ioctl = thread_with_stdio_ioctl,
-};
-
-static int thread_with_stdio_fn(void *arg)
-{
- struct thread_with_stdio *thr = arg;
-
- thr->thr.ret = thr->ops->fn(thr);
-
- thread_with_stdio_done(thr);
- return 0;
-}
-
-void bch2_thread_with_stdio_init(struct thread_with_stdio *thr,
- const struct thread_with_stdio_ops *ops)
-{
- stdio_buf_init(&thr->stdio.input);
- stdio_buf_init(&thr->stdio.output);
- thr->ops = ops;
-}
-
-int __bch2_run_thread_with_stdio(struct thread_with_stdio *thr)
-{
- return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn);
-}
-
-int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
- const struct thread_with_stdio_ops *ops)
-{
- bch2_thread_with_stdio_init(thr, ops);
-
- return __bch2_run_thread_with_stdio(thr);
-}
-
-int bch2_run_thread_with_stdout(struct thread_with_stdio *thr,
- const struct thread_with_stdio_ops *ops)
-{
- stdio_buf_init(&thr->stdio.input);
- stdio_buf_init(&thr->stdio.output);
- thr->ops = ops;
-
- return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn);
-}
-EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout);
-
-int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len)
-{
- struct stdio_buf *buf = &stdio->input;
-
- /*
- * we're waiting on user input (or for the file descriptor to be
-	 * closed), so we don't want a hung task warning:
- */
- do {
- wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
- sysctl_hung_task_timeout_secs * HZ / 2);
- } while (!stdio_redirect_has_input(stdio));
-
- if (stdio->done)
- return -1;
-
- spin_lock(&buf->lock);
- int ret = min(len, buf->buf.nr);
- buf->buf.nr -= ret;
- memcpy(ubuf, buf->buf.data, ret);
- memmove(buf->buf.data,
- buf->buf.data + ret,
- buf->buf.nr);
- spin_unlock(&buf->lock);
-
- wake_up(&buf->wait);
- return ret;
-}
-
-int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *stdio,
- darray_char *line,
- unsigned long timeout)
-{
- unsigned long until = jiffies + timeout, t;
- struct stdio_buf *buf = &stdio->input;
- size_t seen = 0;
-again:
- t = timeout != MAX_SCHEDULE_TIMEOUT
- ? max_t(long, until - jiffies, 0)
- : timeout;
-
- t = min(t, sysctl_hung_task_timeout_secs * HZ / 2);
-
- wait_event_timeout(buf->wait, stdio_redirect_has_more_input(stdio, seen), t);
-
- if (stdio->done)
- return -1;
-
- spin_lock(&buf->lock);
- seen = buf->buf.nr;
- char *n = memchr(buf->buf.data, '\n', seen);
-
- if (!n && timeout != MAX_SCHEDULE_TIMEOUT && time_after_eq(jiffies, until)) {
- spin_unlock(&buf->lock);
- return -ETIME;
- }
-
- if (!n) {
- buf->waiting_for_line = true;
- spin_unlock(&buf->lock);
- goto again;
- }
-
- size_t b = n + 1 - buf->buf.data;
- if (b > line->size) {
- spin_unlock(&buf->lock);
- int ret = darray_resize(line, b);
- if (ret)
- return ret;
- seen = 0;
- goto again;
- }
-
- buf->buf.nr -= b;
- memcpy(line->data, buf->buf.data, b);
- memmove(buf->buf.data,
- buf->buf.data + b,
- buf->buf.nr);
- line->nr = b;
-
- buf->waiting_for_line = false;
- spin_unlock(&buf->lock);
-
- wake_up(&buf->wait);
- return 0;
-}
-
-int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, darray_char *line)
-{
- return bch2_stdio_redirect_readline_timeout(stdio, line, MAX_SCHEDULE_TIMEOUT);
-}
-
-__printf(3, 0)
-static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args)
-{
- ssize_t ret;
-
- do {
- va_list args2;
- size_t len;
-
- va_copy(args2, args);
- len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2);
- va_end(args2);
-
- if (len + 1 <= darray_room(*out)) {
- out->nr += len;
- return len;
- }
-
- ret = darray_make_room_gfp(out, len + 1, gfp);
- } while (ret == 0);
-
- return ret;
-}
-
-ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking,
- const char *fmt, va_list args)
-{
- struct stdio_buf *buf = &stdio->output;
- unsigned long flags;
- ssize_t ret;
-
-again:
- spin_lock_irqsave(&buf->lock, flags);
- ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args);
- spin_unlock_irqrestore(&buf->lock, flags);
-
- if (ret < 0) {
- if (nonblocking)
- return -EAGAIN;
-
- ret = wait_event_interruptible(buf->wait,
- stdio_redirect_has_output_space(stdio));
- if (ret)
- return ret;
- goto again;
- }
-
- wake_up(&buf->wait);
- return ret;
-}
-
-ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking,
- const char *fmt, ...)
-{
- va_list args;
- ssize_t ret;
-
- va_start(args, fmt);
- ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args);
- va_end(args);
-
- return ret;
-}
-
-#endif /* NO_BCACHEFS_FS */
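
On the kthread side of a thread_with_stdio, input is consumed with
bch2_stdio_redirect_readline(), which blocks until a full line is buffered (the
trailing '\n' is included and the line is not NUL-terminated) and returns nonzero
once the file descriptor is closed. A hedged sketch of a command loop; handle_cmd()
is a hypothetical helper, everything else follows the code above:

#include "thread_with_file.h"

static int my_stdio_fn(struct thread_with_stdio *thr)
{
	darray_char line = {};
	int ret;

	while (!(ret = bch2_stdio_redirect_readline(&thr->stdio, &line))) {
		line.data[line.nr - 1] = '\0';	/* overwrite '\n' for C-string use */
		handle_cmd(thr, line.data);	/* hypothetical helper */
	}

	darray_exit(&line);
	return ret == -1 ? 0 : ret;	/* -1 just means the fd was closed */
}
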
diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h
deleted file mode 100644
index 72497b921911..000000000000
--- a/fs/bcachefs/thread_with_file.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_THREAD_WITH_FILE_H
-#define _BCACHEFS_THREAD_WITH_FILE_H
-
-#include "thread_with_file_types.h"
-
-/*
- * Thread with file: Run a kthread and connect it to a file descriptor, so that
- * it can be interacted with via fd read/write methods and closing the file
- * descriptor stops the kthread.
- *
- * We have two different APIs:
- *
- * thread_with_file, the low level version.
- * You get to define the full file_operations, including your release function,
- * which means that you must call bch2_thread_with_file_exit() from your
- * .release method
- *
- * thread_with_stdio, the higher level version
- * This implements full piping of input and output, including .poll.
- *
- * Notes on behaviour:
- * - kthread shutdown behaves like writing or reading from a pipe that has been
- * closed
- * - Input and output buffers are 4096 bytes, although buffers may in some
- * situations slightly exceed that limit so as to avoid chopping off a
- * message in the middle in nonblocking mode.
- * - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations -
- * should be fine but might change in future revisions.
- * - Output buffer may grow past 4096 bytes to deal with messages that are
- * bigger than 4096 bytes
- * - Writing may be done blocking or nonblocking; in nonblocking mode, we only
- * drop entire messages.
- *
- * To write, use stdio_redirect_printf()
- * To read, use stdio_redirect_read() or stdio_redirect_readline()
- */
-
-struct task_struct;
-
-struct thread_with_file {
- struct task_struct *task;
- int ret;
- bool done;
-};
-
-void bch2_thread_with_file_exit(struct thread_with_file *);
-int bch2_run_thread_with_file(struct thread_with_file *,
- const struct file_operations *,
- int (*fn)(void *));
-
-struct thread_with_stdio;
-
-struct thread_with_stdio_ops {
- void (*exit)(struct thread_with_stdio *);
- int (*fn)(struct thread_with_stdio *);
- long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long);
-};
-
-struct thread_with_stdio {
- struct thread_with_file thr;
- struct stdio_redirect stdio;
- const struct thread_with_stdio_ops *ops;
-};
-
-void bch2_thread_with_stdio_init(struct thread_with_stdio *,
- const struct thread_with_stdio_ops *);
-int __bch2_run_thread_with_stdio(struct thread_with_stdio *);
-int bch2_run_thread_with_stdio(struct thread_with_stdio *,
- const struct thread_with_stdio_ops *);
-int bch2_run_thread_with_stdout(struct thread_with_stdio *,
- const struct thread_with_stdio_ops *);
-int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
-
-int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *, darray_char *, unsigned long);
-int bch2_stdio_redirect_readline(struct stdio_redirect *, darray_char *);
-
-__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list);
-__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...);
-
-#endif /* _BCACHEFS_THREAD_WITH_FILE_H */
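
Wired together, the higher-level API amounts to: embed a thread_with_stdio in your
own state (container_of() is used throughout), fill in a thread_with_stdio_ops with
at least .fn and .exit, and hand the returned file descriptor to userspace. A sketch
under those assumptions; the my_* names are made up, only the bch2_* calls and the
struct layout come from this header:

#include <linux/slab.h>
#include "thread_with_file.h"

struct my_state {
	struct thread_with_stdio thr;	/* must be embedded */
	/* ... caller state ... */
};

static int my_fn(struct thread_with_stdio *thr)
{
	/* blocking write to the read side of the fd: */
	bch2_stdio_redirect_printf(&thr->stdio, false, "hello from the kthread\n");
	return 0;
}

static void my_exit(struct thread_with_stdio *thr)
{
	kfree(container_of(thr, struct my_state, thr));
}

static const struct thread_with_stdio_ops my_ops = {
	.exit	= my_exit,
	.fn	= my_fn,
};

int my_start(void)	/* returns an fd, or a negative error */
{
	struct my_state *s = kzalloc(sizeof(*s), GFP_KERNEL);
	int fd;

	if (!s)
		return -ENOMEM;

	fd = bch2_run_thread_with_stdio(&s->thr, &my_ops);
	if (fd < 0)
		kfree(s);	/* .exit only runs via ->release on success */
	return fd;
}
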
diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h
deleted file mode 100644
index f4d484d44f63..000000000000
--- a/fs/bcachefs/thread_with_file_types.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H
-#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H
-
-#include "darray.h"
-
-struct stdio_buf {
- spinlock_t lock;
- wait_queue_head_t wait;
- darray_char buf;
- bool waiting_for_line;
-};
-
-struct stdio_redirect {
- struct stdio_buf input;
- struct stdio_buf output;
- bool done;
-};
-
-#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */
diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c
deleted file mode 100644
index 3fe82757f93a..000000000000
--- a/fs/bcachefs/time_stats.c
+++ /dev/null
@@ -1,179 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/jiffies.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/time.h>
-#include <linux/spinlock.h>
-
-#include "eytzinger.h"
-#include "time_stats.h"
-
-static const struct time_unit time_units[] = {
- { "ns", 1 },
- { "us", NSEC_PER_USEC },
- { "ms", NSEC_PER_MSEC },
- { "s", NSEC_PER_SEC },
- { "m", (u64) NSEC_PER_SEC * 60},
- { "h", (u64) NSEC_PER_SEC * 3600},
- { "d", (u64) NSEC_PER_SEC * 3600 * 24},
- { "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7},
-	{ "y",          (u64) NSEC_PER_SEC * ((3600 * 24 * 365) + (3600 * (24 / 4)))},	/* 365.25d */
- { "eon", U64_MAX },
-};
-
-const struct time_unit *bch2_pick_time_units(u64 ns)
-{
- const struct time_unit *u;
-
- for (u = time_units;
- u + 1 < time_units + ARRAY_SIZE(time_units) &&
- ns >= u[1].nsecs << 1;
- u++)
- ;
-
- return u;
-}
-
-static void quantiles_update(struct quantiles *q, u64 v)
-{
- unsigned i = 0;
-
- while (i < ARRAY_SIZE(q->entries)) {
- struct quantile_entry *e = q->entries + i;
-
- if (unlikely(!e->step)) {
- e->m = v;
- e->step = max_t(unsigned, v / 2, 1024);
- } else if (e->m > v) {
- e->m = e->m >= e->step
- ? e->m - e->step
- : 0;
- } else if (e->m < v) {
- e->m = e->m + e->step > e->m
- ? e->m + e->step
- : U32_MAX;
- }
-
- if ((e->m > v ? e->m - v : v - e->m) < e->step)
- e->step = max_t(unsigned, e->step / 2, 1);
-
- if (v >= e->m)
- break;
-
- i = eytzinger0_child(i, v > e->m);
- }
-}
-
-static inline void time_stats_update_one(struct bch2_time_stats *stats,
- u64 start, u64 end)
-{
- u64 duration, freq;
- bool initted = stats->last_event != 0;
-
- if (time_after64(end, start)) {
- struct quantiles *quantiles = time_stats_to_quantiles(stats);
-
- duration = end - start;
- mean_and_variance_update(&stats->duration_stats, duration);
- mean_and_variance_weighted_update(&stats->duration_stats_weighted,
- duration, initted, TIME_STATS_MV_WEIGHT);
- stats->max_duration = max(stats->max_duration, duration);
- stats->min_duration = min(stats->min_duration, duration);
- stats->total_duration += duration;
-
- if (quantiles)
- quantiles_update(quantiles, duration);
- }
-
- if (stats->last_event && time_after64(end, stats->last_event)) {
- freq = end - stats->last_event;
- mean_and_variance_update(&stats->freq_stats, freq);
- mean_and_variance_weighted_update(&stats->freq_stats_weighted,
- freq, initted, TIME_STATS_MV_WEIGHT);
- stats->max_freq = max(stats->max_freq, freq);
- stats->min_freq = min(stats->min_freq, freq);
- }
-
- stats->last_event = end;
-}
-
-void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
- struct time_stat_buffer *b)
-{
- for (struct time_stat_buffer_entry *i = b->entries;
- i < b->entries + ARRAY_SIZE(b->entries);
- i++)
- time_stats_update_one(stats, i->start, i->end);
- b->nr = 0;
-}
-
-static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats,
- struct time_stat_buffer *b)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&stats->lock, flags);
- __bch2_time_stats_clear_buffer(stats, b);
- spin_unlock_irqrestore(&stats->lock, flags);
-}
-
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
-{
- unsigned long flags;
-
- if (!stats->buffer) {
- spin_lock_irqsave(&stats->lock, flags);
- time_stats_update_one(stats, start, end);
-
- if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
- stats->duration_stats.n > 1024)
- stats->buffer =
- alloc_percpu_gfp(struct time_stat_buffer,
- GFP_ATOMIC);
- spin_unlock_irqrestore(&stats->lock, flags);
- } else {
- struct time_stat_buffer *b;
-
- preempt_disable();
- b = this_cpu_ptr(stats->buffer);
-
- BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
- b->entries[b->nr++] = (struct time_stat_buffer_entry) {
- .start = start,
- .end = end
- };
-
- if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
- time_stats_clear_buffer(stats, b);
- preempt_enable();
- }
-}
-
-void bch2_time_stats_reset(struct bch2_time_stats *stats)
-{
- spin_lock_irq(&stats->lock);
- unsigned offset = offsetof(struct bch2_time_stats, min_duration);
- memset((void *) stats + offset, 0, sizeof(*stats) - offset);
-
- if (stats->buffer) {
- int cpu;
- for_each_possible_cpu(cpu)
- per_cpu_ptr(stats->buffer, cpu)->nr = 0;
- }
- spin_unlock_irq(&stats->lock);
-}
-
-void bch2_time_stats_exit(struct bch2_time_stats *stats)
-{
- free_percpu(stats->buffer);
-}
-
-void bch2_time_stats_init(struct bch2_time_stats *stats)
-{
- memset(stats, 0, sizeof(*stats));
- stats->min_duration = U64_MAX;
- stats->min_freq = U64_MAX;
- spin_lock_init(&stats->lock);
-}
diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h
deleted file mode 100644
index dc6493f7bbab..000000000000
--- a/fs/bcachefs/time_stats.h
+++ /dev/null
@@ -1,160 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * bch2_time_stats - collect statistics on events that have a duration, with nicely
- * formatted textual output on demand
- *
- * - percpu buffering of event collection: cheap enough to shotgun
- * everywhere without worrying about overhead
- *
- * tracks:
- * - number of events
- * - maximum event duration ever seen
- * - sum of all event durations
- * - average event duration, standard and weighted
- * - standard deviation of event durations, standard and weighted
- * and analogous statistics for the frequency of events
- *
- * We provide both mean and weighted mean (exponentially weighted), and standard
- * deviation and weighted standard deviation, to give an efficient-to-compute
- * view of current behaviour versus average behaviour - "did this event source
- * just become wonky, or is this typical?".
- *
- * Particularly useful for tracking down latency issues.
- */
-#ifndef _BCACHEFS_TIME_STATS_H
-#define _BCACHEFS_TIME_STATS_H
-
-#include <linux/sched/clock.h>
-#include <linux/spinlock_types.h>
-#include <linux/string.h>
-
-#include "mean_and_variance.h"
-
-struct time_unit {
- const char *name;
- u64 nsecs;
-};
-
-/*
- * given a nanosecond value, pick the preferred time units for printing:
- */
-const struct time_unit *bch2_pick_time_units(u64 ns);
-
-/*
- * quantiles - do not use:
- *
- * Only enabled if bch2_time_stats->have_quantiles has been manually set - don't
- * use in new code.
- */
-
-#define NR_QUANTILES 15
-#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
-#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
-#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
-
-struct quantiles {
- struct quantile_entry {
- u64 m;
- u64 step;
- } entries[NR_QUANTILES];
-};
-
-struct time_stat_buffer {
- unsigned nr;
- struct time_stat_buffer_entry {
- u64 start;
- u64 end;
- } entries[31];
-};
-
-struct bch2_time_stats {
- spinlock_t lock;
- bool have_quantiles;
- struct time_stat_buffer __percpu *buffer;
- /* all fields are in nanoseconds */
- u64 min_duration;
- u64 max_duration;
- u64 total_duration;
- u64 max_freq;
- u64 min_freq;
- u64 last_event;
- u64 last_event_start;
-
- struct mean_and_variance duration_stats;
- struct mean_and_variance freq_stats;
-
-/* default weight for weighted mean and variance calculations */
-#define TIME_STATS_MV_WEIGHT 8
-
- struct mean_and_variance_weighted duration_stats_weighted;
- struct mean_and_variance_weighted freq_stats_weighted;
-};
-
-struct bch2_time_stats_quantiles {
- struct bch2_time_stats stats;
- struct quantiles quantiles;
-};
-
-static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats)
-{
- return stats->have_quantiles
- ? &container_of(stats, struct bch2_time_stats_quantiles, stats)->quantiles
- : NULL;
-}
-
-void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *);
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
-
-/**
- * bch2_time_stats_update - collect a new event being tracked
- *
- * @stats - bch2_time_stats to update
- * @start - start time of event, recorded with local_clock()
- *
- * The end time of the event will be the current time
- */
-static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
-{
- __bch2_time_stats_update(stats, start, local_clock());
-}
-
-/**
- * track_event_change - track state change events
- *
- * @stats - bch2_time_stats to update
- * @v - new state, true or false
- *
- * Use this when tracking time stats for state changes, e.g. resource X becoming
- * blocked/unblocked.
- */
-static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
-{
- if (v != !!stats->last_event_start) {
- if (!v) {
- bch2_time_stats_update(stats, stats->last_event_start);
- stats->last_event_start = 0;
- } else {
- stats->last_event_start = local_clock() ?: 1;
- return true;
- }
- }
-
- return false;
-}
-
-void bch2_time_stats_reset(struct bch2_time_stats *);
-void bch2_time_stats_exit(struct bch2_time_stats *);
-void bch2_time_stats_init(struct bch2_time_stats *);
-
-static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq)
-{
- bch2_time_stats_exit(&statq->stats);
-}
-static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq)
-{
- bch2_time_stats_init(&statq->stats);
- statq->stats.have_quantiles = true;
- memset(&statq->quantiles, 0, sizeof(statq->quantiles));
-}
-
-#endif /* _BCACHEFS_TIME_STATS_H */
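
The intended calling convention, per the comments above, is init once, then wrap
each timed region by capturing local_clock() at its start; the end time is taken
implicitly at the update call. A small sketch; my_stats and do_work() are
illustrative names, the bch2_* calls come from this header:

#include "time_stats.h"

static struct bch2_time_stats my_stats;

static void my_setup(void)
{
	bch2_time_stats_init(&my_stats);
}

static void my_timed_op(void)
{
	u64 start = local_clock();

	do_work();					/* illustrative */
	bch2_time_stats_update(&my_stats, start);	/* end time = now */
}

static void my_teardown(void)
{
	bch2_time_stats_exit(&my_stats);	/* frees the percpu buffer */
}
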
diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c
deleted file mode 100644
index dfad1d06633d..000000000000
--- a/fs/bcachefs/trace.c
+++ /dev/null
@@ -1,18 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "alloc_types.h"
-#include "buckets.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_update_interior.h"
-#include "keylist.h"
-#include "move_types.h"
-#include "opts.h"
-#include "six.h"
-
-#include <linux/blktrace_api.h>
-
-#define CREATE_TRACE_POINTS
-#include "trace.h"
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
deleted file mode 100644
index 519d00d62ae7..000000000000
--- a/fs/bcachefs/trace.h
+++ /dev/null
@@ -1,1949 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM bcachefs
-
-#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ)
-
-#include <linux/tracepoint.h>
-
-#define TRACE_BPOS_entries(name) \
- __field(u64, name##_inode ) \
- __field(u64, name##_offset ) \
- __field(u32, name##_snapshot )
-
-#define TRACE_BPOS_assign(dst, src) \
- __entry->dst##_inode = (src).inode; \
- __entry->dst##_offset = (src).offset; \
- __entry->dst##_snapshot = (src).snapshot
-
-DECLARE_EVENT_CLASS(bpos,
- TP_PROTO(const struct bpos *p),
- TP_ARGS(p),
-
- TP_STRUCT__entry(
- TRACE_BPOS_entries(p)
- ),
-
- TP_fast_assign(
- TRACE_BPOS_assign(p, *p);
- ),
-
- TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot)
-);
-
-DECLARE_EVENT_CLASS(fs_str,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __string(str, str )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __assign_str(str);
- ),
-
- TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
-);
-
-DECLARE_EVENT_CLASS(trans_str,
- TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
- TP_ARGS(trans, caller_ip, str),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __string(str, str )
- ),
-
- TP_fast_assign(
- __entry->dev = trans->c->dev;
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __assign_str(str);
- ),
-
- TP_printk("%d,%d %s %pS %s",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->trans_fn, (void *) __entry->caller_ip, __get_str(str))
-);
-
-DECLARE_EVENT_CLASS(trans_str_nocaller,
- TP_PROTO(struct btree_trans *trans, const char *str),
- TP_ARGS(trans, str),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __array(char, trans_fn, 32 )
- __string(str, str )
- ),
-
- TP_fast_assign(
- __entry->dev = trans->c->dev;
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __assign_str(str);
- ),
-
- TP_printk("%d,%d %s %s",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->trans_fn, __get_str(str))
-);
-
-DECLARE_EVENT_CLASS(btree_node_nofs,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u8, level )
- __field(u8, btree_id )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->level = b->c.level;
- __entry->btree_id = b->c.btree_id;
- TRACE_BPOS_assign(pos, b->key.k.p);
- ),
-
- TP_printk("%d,%d %u %s %llu:%llu:%u",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->level,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
-);
-
-DECLARE_EVENT_CLASS(btree_node,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __array(char, trans_fn, 32 )
- __field(u8, level )
- __field(u8, btree_id )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- __entry->dev = trans->c->dev;
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->level = b->c.level;
- __entry->btree_id = b->c.btree_id;
- TRACE_BPOS_assign(pos, b->key.k.p);
- ),
-
- TP_printk("%d,%d %s %u %s %llu:%llu:%u",
- MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn,
- __entry->level,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
-);
-
-DECLARE_EVENT_CLASS(bch_fs,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- ),
-
- TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
-);
-
-DECLARE_EVENT_CLASS(btree_trans,
- TP_PROTO(struct btree_trans *trans),
- TP_ARGS(trans),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __array(char, trans_fn, 32 )
- ),
-
- TP_fast_assign(
- __entry->dev = trans->c->dev;
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- ),
-
- TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn)
-);
-
-DECLARE_EVENT_CLASS(bio,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(sector_t, sector )
- __field(unsigned int, nr_sector )
- __array(char, rwbs, 6 )
- ),
-
- TP_fast_assign(
- __entry->dev = bio->bi_bdev ? bio_dev(bio) : 0;
- __entry->sector = bio->bi_iter.bi_sector;
- __entry->nr_sector = bio->bi_iter.bi_size >> 9;
- blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
- ),
-
- TP_printk("%d,%d %s %llu + %u",
- MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
- (unsigned long long)__entry->sector, __entry->nr_sector)
-);
-
-/* disk_accounting.c */
-
-TRACE_EVENT(accounting_mem_insert,
- TP_PROTO(struct bch_fs *c, const char *acc),
- TP_ARGS(c, acc),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(unsigned, new_nr )
- __string(acc, acc )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->new_nr = c->accounting.k.nr;
- __assign_str(acc);
- ),
-
- TP_printk("%d,%d entries %u added %s",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->new_nr,
- __get_str(acc))
-);
-
-/* fs.c: */
-TRACE_EVENT(bch2_sync_fs,
- TP_PROTO(struct super_block *sb, int wait),
-
- TP_ARGS(sb, wait),
-
- TP_STRUCT__entry(
- __field( dev_t, dev )
- __field( int, wait )
-
- ),
-
- TP_fast_assign(
- __entry->dev = sb->s_dev;
- __entry->wait = wait;
- ),
-
- TP_printk("dev %d,%d wait %d",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->wait)
-);
-
-/* fs-io.c: */
-TRACE_EVENT(bch2_fsync,
- TP_PROTO(struct file *file, int datasync),
-
- TP_ARGS(file, datasync),
-
- TP_STRUCT__entry(
- __field( dev_t, dev )
- __field( ino_t, ino )
- __field( ino_t, parent )
- __field( int, datasync )
- ),
-
- TP_fast_assign(
- struct dentry *dentry = file->f_path.dentry;
-
- __entry->dev = dentry->d_sb->s_dev;
- __entry->ino = d_inode(dentry)->i_ino;
- __entry->parent = d_inode(dentry->d_parent)->i_ino;
- __entry->datasync = datasync;
- ),
-
- TP_printk("dev %d,%d ino %lu parent %lu datasync %d ",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- (unsigned long) __entry->ino,
- (unsigned long) __entry->parent, __entry->datasync)
-);
-
-/* super-io.c: */
-TRACE_EVENT(write_super,
- TP_PROTO(struct bch_fs *c, unsigned long ip),
- TP_ARGS(c, ip),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(unsigned long, ip )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->ip = ip;
- ),
-
- TP_printk("%d,%d for %pS",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- (void *) __entry->ip)
-);
-
-/* io.c: */
-
-DEFINE_EVENT(bio, io_read_promote,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-TRACE_EVENT(io_read_nopromote,
- TP_PROTO(struct bch_fs *c, int ret),
- TP_ARGS(c, ret),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __array(char, ret, 32 )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
- ),
-
- TP_printk("%d,%d ret %s",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ret)
-);
-
-DEFINE_EVENT(bio, io_read_bounce,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, io_read_split,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, io_read_retry,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, io_read_reuse_race,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-/* ec.c */
-
-TRACE_EVENT(stripe_create,
- TP_PROTO(struct bch_fs *c, u64 idx, int ret),
- TP_ARGS(c, idx, ret),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, idx )
- __field(int, ret )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->idx = idx;
- __entry->ret = ret;
- ),
-
- TP_printk("%d,%d idx %llu ret %i",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->idx,
- __entry->ret)
-);
-
-/* Journal */
-
-DEFINE_EVENT(bch_fs, journal_full,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(fs_str, journal_entry_full,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, journal_entry_close,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(bio, journal_write,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-TRACE_EVENT(journal_reclaim_start,
- TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
- u64 min_nr, u64 min_key_cache,
- u64 btree_cache_dirty, u64 btree_cache_total,
- u64 btree_key_cache_dirty, u64 btree_key_cache_total),
- TP_ARGS(c, direct, kicked, min_nr, min_key_cache,
- btree_cache_dirty, btree_cache_total,
- btree_key_cache_dirty, btree_key_cache_total),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(bool, direct )
- __field(bool, kicked )
- __field(u64, min_nr )
- __field(u64, min_key_cache )
- __field(u64, btree_cache_dirty )
- __field(u64, btree_cache_total )
- __field(u64, btree_key_cache_dirty )
- __field(u64, btree_key_cache_total )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->direct = direct;
- __entry->kicked = kicked;
- __entry->min_nr = min_nr;
- __entry->min_key_cache = min_key_cache;
- __entry->btree_cache_dirty = btree_cache_dirty;
- __entry->btree_cache_total = btree_cache_total;
- __entry->btree_key_cache_dirty = btree_key_cache_dirty;
- __entry->btree_key_cache_total = btree_key_cache_total;
- ),
-
- TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->direct,
- __entry->kicked,
- __entry->min_nr,
- __entry->min_key_cache,
- __entry->btree_cache_dirty,
- __entry->btree_cache_total,
- __entry->btree_key_cache_dirty,
- __entry->btree_key_cache_total)
-);
-
-TRACE_EVENT(journal_reclaim_finish,
- TP_PROTO(struct bch_fs *c, u64 nr_flushed),
- TP_ARGS(c, nr_flushed),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, nr_flushed )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->nr_flushed = nr_flushed;
- ),
-
- TP_printk("%d,%d flushed %llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->nr_flushed)
-);
-
-/* bset.c: */
-
-DEFINE_EVENT(bpos, bkey_pack_pos_fail,
- TP_PROTO(const struct bpos *p),
- TP_ARGS(p)
-);
-
-/* Btree cache: */
-
-TRACE_EVENT(btree_cache_scan,
- TP_PROTO(long nr_to_scan, long can_free, long ret),
- TP_ARGS(nr_to_scan, can_free, ret),
-
- TP_STRUCT__entry(
- __field(long, nr_to_scan )
- __field(long, can_free )
- __field(long, ret )
- ),
-
- TP_fast_assign(
- __entry->nr_to_scan = nr_to_scan;
- __entry->can_free = can_free;
- __entry->ret = ret;
- ),
-
- TP_printk("scanned for %li nodes, can free %li, ret %li",
- __entry->nr_to_scan, __entry->can_free, __entry->ret)
-);
-
-DEFINE_EVENT(btree_node_nofs, btree_cache_reap,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock_fail,
- TP_PROTO(struct btree_trans *trans),
- TP_ARGS(trans)
-);
-
-DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock,
- TP_PROTO(struct btree_trans *trans),
- TP_ARGS(trans)
-);
-
-DEFINE_EVENT(btree_trans, btree_cache_cannibalize,
- TP_PROTO(struct btree_trans *trans),
- TP_ARGS(trans)
-);
-
-DEFINE_EVENT(btree_trans, btree_cache_cannibalize_unlock,
- TP_PROTO(struct btree_trans *trans),
- TP_ARGS(trans)
-);
-
-/* Btree */
-
-DEFINE_EVENT(btree_node, btree_node_read,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-TRACE_EVENT(btree_node_write,
- TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors),
- TP_ARGS(b, bytes, sectors),
-
- TP_STRUCT__entry(
- __field(enum btree_node_type, type)
- __field(unsigned, bytes )
- __field(unsigned, sectors )
- ),
-
- TP_fast_assign(
- __entry->type = btree_node_type(b);
- __entry->bytes = bytes;
- __entry->sectors = sectors;
- ),
-
- TP_printk("bkey type %u bytes %u sectors %u",
-		__entry->type, __entry->bytes, __entry->sectors)
-);
-
-DEFINE_EVENT(btree_node, btree_node_alloc,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_free,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-TRACE_EVENT(btree_reserve_get_fail,
- TP_PROTO(const char *trans_fn,
- unsigned long caller_ip,
- size_t required,
- int ret),
- TP_ARGS(trans_fn, caller_ip, required, ret),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(size_t, required )
- __array(char, ret, 32 )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->required = required;
- strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
- ),
-
- TP_printk("%s %pS required %zu ret %s",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- __entry->required,
- __entry->ret)
-);
-
-DEFINE_EVENT(btree_node, btree_node_compact,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_merge,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_split,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_rewrite,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_set_root,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-TRACE_EVENT(btree_path_relock_fail,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path,
- unsigned level),
- TP_ARGS(trans, caller_ip, path, level),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(u8, btree_id )
- __field(u8, level )
- __field(u8, path_idx)
- TRACE_BPOS_entries(pos)
- __array(char, node, 24 )
- __field(u8, self_read_count )
- __field(u8, self_intent_count)
- __field(u8, read_count )
- __field(u8, intent_count )
- __field(u32, iter_lock_seq )
- __field(u32, node_lock_seq )
- ),
-
- TP_fast_assign(
- struct btree *b = btree_path_node(path, level);
- struct six_lock_count c;
-
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->btree_id = path->btree_id;
- __entry->level = level;
- __entry->path_idx = path - trans->paths;
- TRACE_BPOS_assign(pos, path->pos);
-
- c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
- __entry->self_read_count = c.n[SIX_LOCK_read];
- __entry->self_intent_count = c.n[SIX_LOCK_intent];
-
- if (IS_ERR(b)) {
- strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node));
- } else {
- c = six_lock_counts(&path->l[level].b->c.lock);
- __entry->read_count = c.n[SIX_LOCK_read];
- __entry->intent_count = c.n[SIX_LOCK_intent];
- scnprintf(__entry->node, sizeof(__entry->node), "%px", &b->c);
- }
- __entry->iter_lock_seq = path->l[level].lock_seq;
- __entry->node_lock_seq = is_btree_node(path, level)
- ? six_lock_seq(&path->l[level].b->c.lock)
- : 0;
- ),
-
- TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- __entry->path_idx,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->level,
- __entry->node,
- __entry->self_read_count,
- __entry->self_intent_count,
- __entry->read_count,
- __entry->intent_count,
- __entry->iter_lock_seq,
- __entry->node_lock_seq)
-);
-
-TRACE_EVENT(btree_path_upgrade_fail,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path,
- unsigned level),
- TP_ARGS(trans, caller_ip, path, level),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(u8, btree_id )
- __field(u8, level )
- __field(u8, path_idx)
- TRACE_BPOS_entries(pos)
- __field(u8, locked )
- __field(u8, self_read_count )
- __field(u8, self_intent_count)
- __field(u8, read_count )
- __field(u8, intent_count )
- __field(u32, iter_lock_seq )
- __field(u32, node_lock_seq )
- ),
-
- TP_fast_assign(
- struct six_lock_count c;
-
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->btree_id = path->btree_id;
- __entry->level = level;
- __entry->path_idx = path - trans->paths;
- TRACE_BPOS_assign(pos, path->pos);
- __entry->locked = btree_node_locked(path, level);
-
- c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
- __entry->self_read_count = c.n[SIX_LOCK_read];
- __entry->self_intent_count = c.n[SIX_LOCK_intent];
- c = six_lock_counts(&path->l[level].b->c.lock);
- __entry->read_count = c.n[SIX_LOCK_read];
- __entry->intent_count = c.n[SIX_LOCK_intent];
- __entry->iter_lock_seq = path->l[level].lock_seq;
- __entry->node_lock_seq = is_btree_node(path, level)
- ? six_lock_seq(&path->l[level].b->c.lock)
- : 0;
- ),
-
- TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- __entry->path_idx,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->level,
- __entry->locked,
- __entry->self_read_count,
- __entry->self_intent_count,
- __entry->read_count,
- __entry->intent_count,
- __entry->iter_lock_seq,
- __entry->node_lock_seq)
-);
-
-/* Garbage collection */
-
-DEFINE_EVENT(bch_fs, gc_gens_start,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, gc_gens_end,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-/* Allocator */
-
-DEFINE_EVENT(fs_str, bucket_alloc,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, bucket_alloc_fail,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DECLARE_EVENT_CLASS(discard_buckets_class,
- TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
- u64 need_journal_commit, u64 discarded, const char *err),
- TP_ARGS(c, seen, open, need_journal_commit, discarded, err),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, seen )
- __field(u64, open )
- __field(u64, need_journal_commit )
- __field(u64, discarded )
- __array(char, err, 16 )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->seen = seen;
- __entry->open = open;
- __entry->need_journal_commit = need_journal_commit;
- __entry->discarded = discarded;
- strscpy(__entry->err, err, sizeof(__entry->err));
- ),
-
- TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->seen,
- __entry->open,
- __entry->need_journal_commit,
- __entry->discarded,
- __entry->err)
-);
-
-DEFINE_EVENT(discard_buckets_class, discard_buckets,
- TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
- u64 need_journal_commit, u64 discarded, const char *err),
- TP_ARGS(c, seen, open, need_journal_commit, discarded, err)
-);
-
-DEFINE_EVENT(discard_buckets_class, discard_buckets_fast,
- TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
- u64 need_journal_commit, u64 discarded, const char *err),
- TP_ARGS(c, seen, open, need_journal_commit, discarded, err)
-);
-
-TRACE_EVENT(bucket_invalidate,
- TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors),
- TP_ARGS(c, dev, bucket, sectors),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u32, dev_idx )
- __field(u32, sectors )
- __field(u64, bucket )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->dev_idx = dev;
- __entry->sectors = sectors;
- __entry->bucket = bucket;
- ),
-
- TP_printk("%d:%d invalidated %u:%llu cached sectors %u",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->dev_idx, __entry->bucket,
- __entry->sectors)
-);
-
-/* Moving IO */
-
-DEFINE_EVENT(fs_str, io_move,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_read,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_write,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_finish,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_fail,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_write_fail,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_start_fail,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-TRACE_EVENT(move_data,
- TP_PROTO(struct bch_fs *c,
- struct bch_move_stats *stats),
- TP_ARGS(c, stats),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, keys_moved )
- __field(u64, keys_raced )
- __field(u64, sectors_seen )
- __field(u64, sectors_moved )
- __field(u64, sectors_raced )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->keys_moved = atomic64_read(&stats->keys_moved);
- __entry->keys_raced = atomic64_read(&stats->keys_raced);
- __entry->sectors_seen = atomic64_read(&stats->sectors_seen);
- __entry->sectors_moved = atomic64_read(&stats->sectors_moved);
- __entry->sectors_raced = atomic64_read(&stats->sectors_raced);
- ),
-
- TP_printk("%d,%d keys moved %llu raced %llu"
- "sectors seen %llu moved %llu raced %llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->keys_moved,
- __entry->keys_raced,
- __entry->sectors_seen,
- __entry->sectors_moved,
- __entry->sectors_raced)
-);
-
-TRACE_EVENT(copygc,
- TP_PROTO(struct bch_fs *c,
- u64 buckets,
- u64 sectors_seen,
- u64 sectors_moved),
- TP_ARGS(c, buckets, sectors_seen, sectors_moved),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, buckets )
- __field(u64, sectors_seen )
- __field(u64, sectors_moved )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->buckets = buckets;
- __entry->sectors_seen = sectors_seen;
- __entry->sectors_moved = sectors_moved;
- ),
-
- TP_printk("%d,%d buckets %llu sectors seen %llu moved %llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->buckets,
- __entry->sectors_seen,
- __entry->sectors_moved)
-);
-
-TRACE_EVENT(copygc_wait,
- TP_PROTO(struct bch_fs *c,
- u64 wait_amount, u64 until),
- TP_ARGS(c, wait_amount, until),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, wait_amount )
- __field(u64, until )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->wait_amount = wait_amount;
- __entry->until = until;
- ),
-
- TP_printk("%d,%u waiting for %llu sectors until %llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->wait_amount, __entry->until)
-);
-
-/* btree transactions: */
-
-DECLARE_EVENT_CLASS(transaction_event,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- ),
-
- TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
-);
-
-DEFINE_EVENT(transaction_event, transaction_commit,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
-DEFINE_EVENT(transaction_event, trans_restart_injected,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
-TRACE_EVENT(trans_restart_split_race,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree *b),
- TP_ARGS(trans, caller_ip, b),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(u8, level )
- __field(u16, written )
- __field(u16, blocks )
- __field(u16, u64s_remaining )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->level = b->c.level;
- __entry->written = b->written;
- __entry->blocks = btree_blocks(trans->c);
- __entry->u64s_remaining = bch2_btree_keys_u64s_remaining(b);
- ),
-
- TP_printk("%s %pS l=%u written %u/%u u64s remaining %u",
- __entry->trans_fn, (void *) __entry->caller_ip,
- __entry->level,
- __entry->written, __entry->blocks,
- __entry->u64s_remaining)
-);
-
-TRACE_EVENT(trans_blocked_journal_reclaim,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
-
- __field(unsigned long, key_cache_nr_keys )
- __field(unsigned long, key_cache_nr_dirty )
- __field(long, must_wait )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->key_cache_nr_keys = atomic_long_read(&trans->c->btree_key_cache.nr_keys);
- __entry->key_cache_nr_dirty = atomic_long_read(&trans->c->btree_key_cache.nr_dirty);
- __entry->must_wait = __bch2_btree_key_cache_must_wait(trans->c);
- ),
-
- TP_printk("%s %pS key cache keys %lu dirty %lu must_wait %li",
- __entry->trans_fn, (void *) __entry->caller_ip,
- __entry->key_cache_nr_keys,
- __entry->key_cache_nr_dirty,
- __entry->must_wait)
-);
-
-TRACE_EVENT(trans_restart_journal_preres_get,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- unsigned flags),
- TP_ARGS(trans, caller_ip, flags),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(unsigned, flags )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->flags = flags;
- ),
-
- TP_printk("%s %pS %x", __entry->trans_fn,
- (void *) __entry->caller_ip,
- __entry->flags)
-);
-
-DEFINE_EVENT(transaction_event, trans_restart_fault_inject,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
-DEFINE_EVENT(transaction_event, trans_traverse_all,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
-DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
-DEFINE_EVENT(trans_str, trans_restart_too_many_iters,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- const char *paths),
- TP_ARGS(trans, caller_ip, paths)
-);
-
-DECLARE_EVENT_CLASS(transaction_restart_iter,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(u8, btree_id )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->btree_id = path->btree_id;
- TRACE_BPOS_assign(pos, path->pos)
- ),
-
- TP_printk("%s %pS btree %s pos %llu:%llu:%u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-TRACE_EVENT(trans_restart_upgrade,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path,
- unsigned old_locks_want,
- unsigned new_locks_want,
- struct get_locks_fail *f),
- TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(u8, btree_id )
- __field(u8, old_locks_want )
- __field(u8, new_locks_want )
- __field(u8, level )
- __field(u32, path_seq )
- __field(u32, node_seq )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->btree_id = path->btree_id;
- __entry->old_locks_want = old_locks_want;
- __entry->new_locks_want = new_locks_want;
- __entry->level = f->l;
- __entry->path_seq = path->l[f->l].lock_seq;
- __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq;
- TRACE_BPOS_assign(pos, path->pos)
- ),
-
- TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->old_locks_want,
- __entry->new_locks_want,
- __entry->level,
- __entry->path_seq,
- __entry->node_seq)
-);
-
-DEFINE_EVENT(trans_str, trans_restart_relock,
- TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
- TP_ARGS(trans, caller_ip, str)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_event, trans_restart_key_cache_upgrade,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(trans_str_nocaller, trans_restart_would_deadlock,
- TP_PROTO(struct btree_trans *trans,
- const char *cycle),
- TP_ARGS(trans, cycle)
-);
-
-DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
-TRACE_EVENT(trans_restart_would_deadlock_write,
- TP_PROTO(struct btree_trans *trans),
- TP_ARGS(trans),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- ),
-
- TP_printk("%s", __entry->trans_fn)
-);
-
-TRACE_EVENT(trans_restart_mem_realloced,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- unsigned long bytes),
- TP_ARGS(trans, caller_ip, bytes),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(unsigned long, bytes )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->bytes = bytes;
- ),
-
- TP_printk("%s %pS bytes %lu",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- __entry->bytes)
-);
-
-TRACE_EVENT(trans_restart_key_cache_key_realloced,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path,
- unsigned old_u64s,
- unsigned new_u64s),
- TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(enum btree_id, btree_id )
- TRACE_BPOS_entries(pos)
- __field(u32, old_u64s )
- __field(u32, new_u64s )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
-
- __entry->btree_id = path->btree_id;
- TRACE_BPOS_assign(pos, path->pos);
- __entry->old_u64s = old_u64s;
- __entry->new_u64s = new_u64s;
- ),
-
- TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->old_u64s,
- __entry->new_u64s)
-);
-
-DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
-TRACE_EVENT(path_downgrade,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path,
- unsigned old_locks_want),
- TP_ARGS(trans, caller_ip, path, old_locks_want),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(unsigned, old_locks_want )
- __field(unsigned, new_locks_want )
- __field(unsigned, btree )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->old_locks_want = old_locks_want;
- __entry->new_locks_want = path->locks_want;
- __entry->btree = path->btree_id;
- TRACE_BPOS_assign(pos, path->pos);
- ),
-
- TP_printk("%s %pS locks_want %u -> %u %s %llu:%llu:%u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- __entry->old_locks_want,
- __entry->new_locks_want,
- bch2_btree_id_str(__entry->btree),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot)
-);
-
-TRACE_EVENT(key_cache_fill,
- TP_PROTO(struct btree_trans *trans, const char *key),
- TP_ARGS(trans, key),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __string(key, key )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __assign_str(key);
- ),
-
- TP_printk("%s %s", __entry->trans_fn, __get_str(key))
-);
-
-TRACE_EVENT(write_buffer_flush,
- TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t size),
- TP_ARGS(trans, nr, skipped, fast, size),
-
- TP_STRUCT__entry(
- __field(size_t, nr )
- __field(size_t, skipped )
- __field(size_t, fast )
- __field(size_t, size )
- ),
-
- TP_fast_assign(
- __entry->nr = nr;
- __entry->skipped = skipped;
- __entry->fast = fast;
- __entry->size = size;
- ),
-
- TP_printk("%zu/%zu skipped %zu fast %zu",
- __entry->nr, __entry->size, __entry->skipped, __entry->fast)
-);
-
-TRACE_EVENT(write_buffer_flush_sync,
- TP_PROTO(struct btree_trans *trans, unsigned long caller_ip),
- TP_ARGS(trans, caller_ip),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- ),
-
- TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
-);
-
-TRACE_EVENT(write_buffer_flush_slowpath,
- TP_PROTO(struct btree_trans *trans, size_t slowpath, size_t total),
- TP_ARGS(trans, slowpath, total),
-
- TP_STRUCT__entry(
- __field(size_t, slowpath )
- __field(size_t, total )
- ),
-
- TP_fast_assign(
- __entry->slowpath = slowpath;
- __entry->total = total;
- ),
-
- TP_printk("%zu/%zu", __entry->slowpath, __entry->total)
-);
-
-TRACE_EVENT(write_buffer_maybe_flush,
- TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *key),
- TP_ARGS(trans, caller_ip, key),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __string(key, key )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __assign_str(key);
- ),
-
- TP_printk("%s %pS %s", __entry->trans_fn, (void *) __entry->caller_ip, __get_str(key))
-);
-
-DEFINE_EVENT(fs_str, rebalance_extent,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, data_update,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-TRACE_EVENT(error_downcast,
- TP_PROTO(int bch_err, int std_err, unsigned long ip),
- TP_ARGS(bch_err, std_err, ip),
-
- TP_STRUCT__entry(
- __array(char, bch_err, 32 )
- __array(char, std_err, 32 )
- __array(char, ip, 32 )
- ),
-
- TP_fast_assign(
- strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err));
- strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err));
- snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
- ),
-
- TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip)
-);
-
-#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS
-
-TRACE_EVENT(update_by_path,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path,
- struct btree_insert_entry *i, bool overwrite),
- TP_ARGS(trans, path, i, overwrite),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(btree_path_idx_t, path_idx )
- __field(u8, btree_id )
- TRACE_BPOS_entries(pos)
- __field(u8, overwrite )
- __field(btree_path_idx_t, update_idx )
- __field(btree_path_idx_t, nr_updates )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->path_idx = path - trans->paths;
- __entry->btree_id = path->btree_id;
- TRACE_BPOS_assign(pos, path->pos);
- __entry->overwrite = overwrite;
- __entry->update_idx = i - trans->updates;
- __entry->nr_updates = trans->nr_updates;
- ),
-
- TP_printk("%s path %3u btree %s pos %llu:%llu:%u overwrite %u update %u/%u",
- __entry->trans_fn,
- __entry->path_idx,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->overwrite,
- __entry->update_idx,
- __entry->nr_updates)
-);
-
-TRACE_EVENT(btree_path_lock,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_bkey_cached_common *b),
- TP_ARGS(trans, caller_ip, b),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(u8, btree_id )
- __field(u8, level )
- __array(char, node, 24 )
- __field(u32, lock_seq )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->btree_id = b->btree_id;
- __entry->level = b->level;
-
- scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
- __entry->lock_seq = six_lock_seq(&b->lock);
- ),
-
- TP_printk("%s %pS\nbtree %s level %u node %s lock seq %u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- bch2_btree_id_str(__entry->btree_id),
- __entry->level,
- __entry->node,
- __entry->lock_seq)
-);
-
-DECLARE_EVENT_CLASS(btree_path_ev,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path),
- TP_ARGS(trans, path),
-
- TP_STRUCT__entry(
- __field(u16, idx )
- __field(u8, ref )
- __field(u8, btree_id )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- __entry->idx = path - trans->paths;
- __entry->ref = path->ref;
- __entry->btree_id = path->btree_id;
- TRACE_BPOS_assign(pos, path->pos);
- ),
-
- TP_printk("path %3u ref %u btree %s pos %llu:%llu:%u",
- __entry->idx, __entry->ref,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot)
-);
-
-DEFINE_EVENT(btree_path_ev, btree_path_get_ll,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path),
- TP_ARGS(trans, path)
-);
-
-DEFINE_EVENT(btree_path_ev, btree_path_put_ll,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path),
- TP_ARGS(trans, path)
-);
-
-DEFINE_EVENT(btree_path_ev, btree_path_should_be_locked,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path),
- TP_ARGS(trans, path)
-);
-
-TRACE_EVENT(btree_path_alloc,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path),
- TP_ARGS(trans, path),
-
- TP_STRUCT__entry(
- __field(btree_path_idx_t, idx )
- __field(u8, locks_want )
- __field(u8, btree_id )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- __entry->idx = path - trans->paths;
- __entry->locks_want = path->locks_want;
- __entry->btree_id = path->btree_id;
- TRACE_BPOS_assign(pos, path->pos);
- ),
-
- TP_printk("path %3u btree %s locks_want %u pos %llu:%llu:%u",
- __entry->idx,
- bch2_btree_id_str(__entry->btree_id),
- __entry->locks_want,
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot)
-);
-
-TRACE_EVENT(btree_path_get,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos),
- TP_ARGS(trans, path, new_pos),
-
- TP_STRUCT__entry(
- __field(btree_path_idx_t, idx )
- __field(u8, ref )
- __field(u8, preserve )
- __field(u8, locks_want )
- __field(u8, btree_id )
- TRACE_BPOS_entries(old_pos)
- TRACE_BPOS_entries(new_pos)
- ),
-
- TP_fast_assign(
- __entry->idx = path - trans->paths;
- __entry->ref = path->ref;
- __entry->preserve = path->preserve;
- __entry->locks_want = path->locks_want;
- __entry->btree_id = path->btree_id;
- TRACE_BPOS_assign(old_pos, path->pos);
- TRACE_BPOS_assign(new_pos, *new_pos);
- ),
-
- TP_printk(" path %3u ref %u preserve %u btree %s locks_want %u pos %llu:%llu:%u -> %llu:%llu:%u",
- __entry->idx,
- __entry->ref,
- __entry->preserve,
- bch2_btree_id_str(__entry->btree_id),
- __entry->locks_want,
- __entry->old_pos_inode,
- __entry->old_pos_offset,
- __entry->old_pos_snapshot,
- __entry->new_pos_inode,
- __entry->new_pos_offset,
- __entry->new_pos_snapshot)
-);
-
-DECLARE_EVENT_CLASS(btree_path_clone,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
- TP_ARGS(trans, path, new),
-
- TP_STRUCT__entry(
- __field(btree_path_idx_t, idx )
- __field(u8, new_idx )
- __field(u8, btree_id )
- __field(u8, ref )
- __field(u8, preserve )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- __entry->idx = path - trans->paths;
- __entry->new_idx = new - trans->paths;
- __entry->btree_id = path->btree_id;
- __entry->ref = path->ref;
- __entry->preserve = path->preserve;
- TRACE_BPOS_assign(pos, path->pos);
- ),
-
- TP_printk(" path %3u ref %u preserve %u btree %s %llu:%llu:%u -> %u",
- __entry->idx,
- __entry->ref,
- __entry->preserve,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->new_idx)
-);
-
-DEFINE_EVENT(btree_path_clone, btree_path_clone,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
- TP_ARGS(trans, path, new)
-);
-
-DEFINE_EVENT(btree_path_clone, btree_path_save_pos,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
- TP_ARGS(trans, path, new)
-);
-
-DECLARE_EVENT_CLASS(btree_path_traverse,
- TP_PROTO(struct btree_trans *trans,
- struct btree_path *path),
- TP_ARGS(trans, path),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(btree_path_idx_t, idx )
- __field(u8, ref )
- __field(u8, preserve )
- __field(u8, should_be_locked )
- __field(u8, btree_id )
- __field(u8, level )
- TRACE_BPOS_entries(pos)
- __field(u8, locks_want )
- __field(u8, nodes_locked )
- __array(char, node0, 24 )
- __array(char, node1, 24 )
- __array(char, node2, 24 )
- __array(char, node3, 24 )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-
- __entry->idx = path - trans->paths;
- __entry->ref = path->ref;
- __entry->preserve = path->preserve;
- __entry->btree_id = path->btree_id;
- __entry->level = path->level;
- TRACE_BPOS_assign(pos, path->pos);
-
- __entry->locks_want = path->locks_want;
- __entry->nodes_locked = path->nodes_locked;
- struct btree *b = path->l[0].b;
- if (IS_ERR(b))
- strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
- else
- scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c);
- b = path->l[1].b;
- if (IS_ERR(b))
- strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node1));
- else
- scnprintf(__entry->node1, sizeof(__entry->node1), "%px", &b->c);
- b = path->l[2].b;
- if (IS_ERR(b))
- strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node2));
- else
- scnprintf(__entry->node2, sizeof(__entry->node2), "%px", &b->c);
- b = path->l[3].b;
- if (IS_ERR(b))
- strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node3));
- else
- scnprintf(__entry->node3, sizeof(__entry->node3), "%px", &b->c);
- ),
-
- TP_printk("%s\npath %3u ref %u preserve %u btree %s %llu:%llu:%u level %u locks_want %u\n"
- "locks %u %u %u %u node %s %s %s %s",
- __entry->trans_fn,
- __entry->idx,
- __entry->ref,
- __entry->preserve,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->level,
- __entry->locks_want,
- (__entry->nodes_locked >> 6) & 3,
- (__entry->nodes_locked >> 4) & 3,
- (__entry->nodes_locked >> 2) & 3,
- (__entry->nodes_locked >> 0) & 3,
- __entry->node3,
- __entry->node2,
- __entry->node1,
- __entry->node0)
-);
-
-DEFINE_EVENT(btree_path_traverse, btree_path_traverse_start,
- TP_PROTO(struct btree_trans *trans,
- struct btree_path *path),
- TP_ARGS(trans, path)
-);
-
-DEFINE_EVENT(btree_path_traverse, btree_path_traverse_end,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path),
- TP_ARGS(trans, path)
-);
-
-TRACE_EVENT(btree_path_set_pos,
- TP_PROTO(struct btree_trans *trans,
- struct btree_path *path,
- struct bpos *new_pos),
- TP_ARGS(trans, path, new_pos),
-
- TP_STRUCT__entry(
- __field(btree_path_idx_t, idx )
- __field(u8, ref )
- __field(u8, preserve )
- __field(u8, btree_id )
- TRACE_BPOS_entries(old_pos)
- TRACE_BPOS_entries(new_pos)
- __field(u8, locks_want )
- __field(u8, nodes_locked )
- __array(char, node0, 24 )
- __array(char, node1, 24 )
- __array(char, node2, 24 )
- __array(char, node3, 24 )
- ),
-
- TP_fast_assign(
- __entry->idx = path - trans->paths;
- __entry->ref = path->ref;
- __entry->preserve = path->preserve;
- __entry->btree_id = path->btree_id;
- TRACE_BPOS_assign(old_pos, path->pos);
- TRACE_BPOS_assign(new_pos, *new_pos);
-
- __entry->nodes_locked = path->nodes_locked;
- struct btree *b = path->l[0].b;
- if (IS_ERR(b))
- strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
- else
- scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c);
- b = path->l[1].b;
- if (IS_ERR(b))
- strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node1));
- else
- scnprintf(__entry->node1, sizeof(__entry->node1), "%px", &b->c);
- b = path->l[2].b;
- if (IS_ERR(b))
- strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node2));
- else
- scnprintf(__entry->node2, sizeof(__entry->node2), "%px", &b->c);
- b = path->l[3].b;
- if (IS_ERR(b))
- strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node3));
- else
- scnprintf(__entry->node3, sizeof(__entry->node3), "%px", &b->c);
- ),
-
- TP_printk("\npath %3u ref %u preserve %u btree %s %llu:%llu:%u -> %llu:%llu:%u\n"
- "locks %u %u %u %u node %s %s %s %s",
- __entry->idx,
- __entry->ref,
- __entry->preserve,
- bch2_btree_id_str(__entry->btree_id),
- __entry->old_pos_inode,
- __entry->old_pos_offset,
- __entry->old_pos_snapshot,
- __entry->new_pos_inode,
- __entry->new_pos_offset,
- __entry->new_pos_snapshot,
- (__entry->nodes_locked >> 6) & 3,
- (__entry->nodes_locked >> 4) & 3,
- (__entry->nodes_locked >> 2) & 3,
- (__entry->nodes_locked >> 0) & 3,
- __entry->node3,
- __entry->node2,
- __entry->node1,
- __entry->node0)
-);
-
-TRACE_EVENT(btree_path_free,
- TP_PROTO(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup),
- TP_ARGS(trans, path, dup),
-
- TP_STRUCT__entry(
- __field(btree_path_idx_t, idx )
- __field(u8, preserve )
- __field(u8, should_be_locked)
- __field(s8, dup )
- __field(u8, dup_locked )
- ),
-
- TP_fast_assign(
- __entry->idx = path;
- __entry->preserve = trans->paths[path].preserve;
- __entry->should_be_locked = trans->paths[path].should_be_locked;
- __entry->dup = dup ? dup - trans->paths : -1;
- __entry->dup_locked = dup ? btree_node_locked(dup, dup->level) : 0;
- ),
-
- TP_printk(" path %3u %c %c dup %2i locked %u", __entry->idx,
- __entry->preserve ? 'P' : ' ',
- __entry->should_be_locked ? 'S' : ' ',
- __entry->dup,
- __entry->dup_locked)
-);
-
-TRACE_EVENT(btree_path_free_trans_begin,
- TP_PROTO(btree_path_idx_t path),
- TP_ARGS(path),
-
- TP_STRUCT__entry(
- __field(btree_path_idx_t, idx )
- ),
-
- TP_fast_assign(
- __entry->idx = path;
- ),
-
- TP_printk(" path %3u", __entry->idx)
-);
-
-#else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
-#ifndef _TRACE_BCACHEFS_H
-
-static inline void trace_update_by_path(struct btree_trans *trans, struct btree_path *path,
- struct btree_insert_entry *i, bool overwrite) {}
-static inline void trace_btree_path_lock(struct btree_trans *trans, unsigned long caller_ip, struct btree_bkey_cached_common *b) {}
-static inline void trace_btree_path_get_ll(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_put_ll(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_should_be_locked(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_alloc(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_get(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
-static inline void trace_btree_path_clone(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
-static inline void trace_btree_path_save_pos(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
-static inline void trace_btree_path_traverse_start(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
-static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {}
-static inline void trace_btree_path_free_trans_begin(btree_path_idx_t path) {}
-
-#endif
-#endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
-
-#define _TRACE_BCACHEFS_H
-#endif /* _TRACE_BCACHEFS_H */
-
-/* This part must be outside protection */
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH ../../fs/bcachefs
-
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE trace
-
-#include <trace/define_trace.h>
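
The TRACE_INCLUDE_PATH / TRACE_INCLUDE_FILE block above is the standard
kernel tracepoint idiom: this header is built to be included more than once,
and exactly one compilation unit defines CREATE_TRACE_POINTS first so that
<trace/define_trace.h> expands each TRACE_EVENT() into real tracepoint code.
A minimal sketch of how such a header is consumed -- the example() wrapper is
hypothetical, the rest follows the stock tracepoint convention:

    /* In exactly one .c file (conventionally a dedicated trace.c): */
    #define CREATE_TRACE_POINTS
    #include "trace.h"      /* expands every TRACE_EVENT() into code */

    /* Everywhere else, plain inclusion provides the trace_*() stubs: */
    #include "trace.h"

    static void example(struct btree_trans *trans)
    {
            /* compiles to a static-branch no-op unless the event is enabled */
            trace_write_buffer_flush_sync(trans, _RET_IP_);
    }
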
diff --git a/fs/bcachefs/two_state_shared_lock.c b/fs/bcachefs/two_state_shared_lock.c
deleted file mode 100644
index 9764c2e6a910..000000000000
--- a/fs/bcachefs/two_state_shared_lock.c
+++ /dev/null
@@ -1,8 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "two_state_shared_lock.h"
-
-void __bch2_two_state_lock(two_state_lock_t *lock, int s)
-{
- __wait_event(lock->wait, bch2_two_state_trylock(lock, s));
-}
diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h
deleted file mode 100644
index 7f647846b511..000000000000
--- a/fs/bcachefs/two_state_shared_lock.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_TWO_STATE_LOCK_H
-#define _BCACHEFS_TWO_STATE_LOCK_H
-
-#include <linux/atomic.h>
-#include <linux/sched.h>
-#include <linux/wait.h>
-
-#include "util.h"
-
-/*
- * Two-state lock - can be taken for add or block - both states are shared,
- * like the read side of an rwsem, but each state conflicts with the other:
- */
-typedef struct {
- atomic_long_t v;
- wait_queue_head_t wait;
-} two_state_lock_t;
-
-static inline void two_state_lock_init(two_state_lock_t *lock)
-{
- atomic_long_set(&lock->v, 0);
- init_waitqueue_head(&lock->wait);
-}
-
-static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s)
-{
- long i = s ? 1 : -1;
-
- EBUG_ON(atomic_long_read(&lock->v) == 0);
-
- if (atomic_long_sub_return_release(i, &lock->v) == 0)
- wake_up_all(&lock->wait);
-}
-
-static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s)
-{
- long i = s ? 1 : -1;
- long old;
-
- old = atomic_long_read(&lock->v);
- do {
- if (i > 0 ? old < 0 : old > 0)
- return false;
- } while (!atomic_long_try_cmpxchg_acquire(&lock->v, &old, old + i));
-
- return true;
-}
-
-void __bch2_two_state_lock(two_state_lock_t *, int);
-
-static inline void bch2_two_state_lock(two_state_lock_t *lock, int s)
-{
- if (!bch2_two_state_trylock(lock, s))
- __bch2_two_state_lock(lock, s);
-}
-
-#endif /* _BCACHEFS_TWO_STATE_LOCK_H */
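
The atomic_long encodes both shared states in one counter: v > 0 means
"state 1" holders, v < 0 means "state 0" holders, and bch2_two_state_trylock()
refuses to take the sign opposite to the current one. A usage sketch built
only from the API above (the concrete scenario is illustrative; bcachefs uses
this pattern e.g. to arbitrate pagecache add vs. block):

    two_state_lock_t lock;

    two_state_lock_init(&lock);

    bch2_two_state_lock(&lock, 0);          /* v = -1: state 0 held  */
    bch2_two_state_lock(&lock, 0);          /* shared:  v = -2       */

    /* bch2_two_state_trylock(&lock, 1) would fail here: v < 0 */

    bch2_two_state_unlock(&lock, 0);        /* v = -1                */
    bch2_two_state_unlock(&lock, 0);        /* v = 0: waiters woken  */
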
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
deleted file mode 100644
index 553de8d8e3e5..000000000000
--- a/fs/bcachefs/util.c
+++ /dev/null
@@ -1,1040 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * random utility code, for bcache but in theory not specific to bcache
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/console.h>
-#include <linux/ctype.h>
-#include <linux/debugfs.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/log2.h>
-#include <linux/math64.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/random.h>
-#include <linux/seq_file.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/sched/clock.h>
-
-#include "eytzinger.h"
-#include "mean_and_variance.h"
-#include "util.h"
-
-static const char si_units[] = "?kMGTPEZY";
-
-/* string_get_size units: */
-static const char *const units_2[] = {
- "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"
-};
-static const char *const units_10[] = {
- "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"
-};
-
-static int parse_u64(const char *cp, u64 *res)
-{
- const char *start = cp;
- u64 v = 0;
-
- if (!isdigit(*cp))
- return -EINVAL;
-
- do {
- if (v > U64_MAX / 10)
- return -ERANGE;
- v *= 10;
- if (v > U64_MAX - (*cp - '0'))
- return -ERANGE;
- v += *cp - '0';
- cp++;
- } while (isdigit(*cp));
-
- *res = v;
- return cp - start;
-}
-
-static int bch2_pow(u64 n, u64 p, u64 *res)
-{
- *res = 1;
-
- while (p--) {
- if (*res > div64_u64(U64_MAX, n))
- return -ERANGE;
- *res *= n;
- }
- return 0;
-}
-
-static int parse_unit_suffix(const char *cp, u64 *res)
-{
- const char *start = cp;
- u64 base = 1024;
- unsigned u;
- int ret;
-
- if (*cp == ' ')
- cp++;
-
- for (u = 1; u < strlen(si_units); u++)
- if (*cp == si_units[u]) {
- cp++;
- goto got_unit;
- }
-
- for (u = 0; u < ARRAY_SIZE(units_2); u++)
- if (!strncmp(cp, units_2[u], strlen(units_2[u]))) {
- cp += strlen(units_2[u]);
- goto got_unit;
- }
-
- for (u = 0; u < ARRAY_SIZE(units_10); u++)
- if (!strncmp(cp, units_10[u], strlen(units_10[u]))) {
- cp += strlen(units_10[u]);
- base = 1000;
- goto got_unit;
- }
-
- *res = 1;
- return 0;
-got_unit:
- ret = bch2_pow(base, u, res);
- if (ret)
- return ret;
-
- return cp - start;
-}
-
-#define parse_or_ret(cp, _f) \
-do { \
- int _ret = _f; \
- if (_ret < 0) \
- return _ret; \
- cp += _ret; \
-} while (0)
-
-static int __bch2_strtou64_h(const char *cp, u64 *res)
-{
- const char *start = cp;
- u64 v = 0, b, f_n = 0, f_d = 1;
- int ret;
-
- parse_or_ret(cp, parse_u64(cp, &v));
-
- if (*cp == '.') {
- cp++;
- ret = parse_u64(cp, &f_n);
- if (ret < 0)
- return ret;
- cp += ret;
-
- ret = bch2_pow(10, ret, &f_d);
- if (ret)
- return ret;
- }
-
- parse_or_ret(cp, parse_unit_suffix(cp, &b));
-
- if (v > div64_u64(U64_MAX, b))
- return -ERANGE;
- v *= b;
-
- if (f_n > div64_u64(U64_MAX, b))
- return -ERANGE;
-
- f_n = div64_u64(f_n * b, f_d);
- if (v + f_n < v)
- return -ERANGE;
- v += f_n;
-
- *res = v;
- return cp - start;
-}
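
Every parser here follows the same convention -- return the number of bytes
consumed on success, a negative errno on failure -- which is what lets
parse_or_ret() chain them. A worked example for __bch2_strtou64_h("3.5G", &res):

    parse_u64("3.5G")           -> v = 3,   1 byte consumed
    '.' seen, parse_u64("5G")   -> f_n = 5, 1 byte consumed
    bch2_pow(10, 1, &f_d)       -> f_d = 10
    parse_unit_suffix("G")      -> b = 1024^3 = 1073741824

    v   = 3 * b                 = 3221225472
    f_n = (5 * b) / 10          =  536870912
    res = v + f_n               = 3758096384   /* 3.5 GiB */
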
-
-static int __bch2_strtoh(const char *cp, u64 *res,
- u64 t_max, bool t_signed)
-{
- bool positive = *cp != '-';
- u64 v = 0;
-
- if (*cp == '+' || *cp == '-')
- cp++;
-
- parse_or_ret(cp, __bch2_strtou64_h(cp, &v));
-
- if (*cp == '\n')
- cp++;
- if (*cp)
- return -EINVAL;
-
- if (positive) {
- if (v > t_max)
- return -ERANGE;
- } else {
- if (v && !t_signed)
- return -ERANGE;
-
- if (v > t_max + 1)
- return -ERANGE;
- v = -v;
- }
-
- *res = v;
- return 0;
-}
-
-#define STRTO_H(name, type) \
-int bch2_ ## name ## _h(const char *cp, type *res) \
-{ \
- u64 v = 0; \
- int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \
- ANYSINT_MAX(type) != ((type) ~0ULL)); \
- *res = v; \
- return ret; \
-}
-
-STRTO_H(strtoint, int)
-STRTO_H(strtouint, unsigned int)
-STRTO_H(strtoll, long long)
-STRTO_H(strtoull, unsigned long long)
-STRTO_H(strtou64, u64)
-
-u64 bch2_read_flag_list(const char *opt, const char * const list[])
-{
- u64 ret = 0;
- char *p, *s, *d = kstrdup(opt, GFP_KERNEL);
-
- if (!d)
- return -ENOMEM;
-
- s = strim(d);
-
- while ((p = strsep(&s, ",;"))) {
- int flag = match_string(list, -1, p);
-
- if (flag < 0) {
- ret = -1;
- break;
- }
-
- ret |= BIT_ULL(flag);
- }
-
- kfree(d);
-
- return ret;
-}
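
match_string() is called with a size of -1, so the table must be
NULL-terminated; each comma- or semicolon-separated token sets one bit, and
an unknown name makes the whole call return (u64) -1. A sketch with a
hypothetical flag table:

    static const char * const my_flags[] = {
            "foo", "bar", "baz", NULL
    };

    u64 mask = bch2_read_flag_list("foo,baz", my_flags);
    /* mask == BIT_ULL(0) | BIT_ULL(2) == 0x5 */
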
-
-bool bch2_is_zero(const void *_p, size_t n)
-{
- const char *p = _p;
- size_t i;
-
- for (i = 0; i < n; i++)
- if (p[i])
- return false;
- return true;
-}
-
-void bch2_prt_u64_base2_nbits(struct printbuf *out, u64 v, unsigned nr_bits)
-{
- while (nr_bits)
- prt_char(out, '0' + ((v >> --nr_bits) & 1));
-}
-
-void bch2_prt_u64_base2(struct printbuf *out, u64 v)
-{
- bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1);
-}
-
-static void __bch2_print_string_as_lines(const char *prefix, const char *lines,
- bool nonblocking)
-{
- bool locked = false;
- const char *p;
-
- if (!lines) {
- printk("%s (null)\n", prefix);
- return;
- }
-
- if (!nonblocking) {
- console_lock();
- locked = true;
- } else {
- locked = console_trylock();
- }
-
- while (1) {
- p = strchrnul(lines, '\n');
- printk("%s%.*s\n", prefix, (int) (p - lines), lines);
- if (!*p)
- break;
- lines = p + 1;
- }
- if (locked)
- console_unlock();
-}
-
-void bch2_print_string_as_lines(const char *prefix, const char *lines)
-{
- return __bch2_print_string_as_lines(prefix, lines, false);
-}
-
-void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines)
-{
- return __bch2_print_string_as_lines(prefix, lines, true);
-}
-
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr,
- gfp_t gfp)
-{
-#ifdef CONFIG_STACKTRACE
- unsigned nr_entries = 0;
-
- stack->nr = 0;
- int ret = darray_make_room_gfp(stack, 32, gfp);
- if (ret)
- return ret;
-
- if (!down_read_trylock(&task->signal->exec_update_lock))
- return -1;
-
- do {
- nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1);
- } while (nr_entries == stack->size &&
- !(ret = darray_make_room_gfp(stack, stack->size * 2, gfp)));
-
- stack->nr = nr_entries;
- up_read(&task->signal->exec_update_lock);
-
- return ret;
-#else
- return 0;
-#endif
-}
-
-void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack)
-{
- darray_for_each(*stack, i) {
- prt_printf(out, "[<0>] %pB", (void *) *i);
- prt_newline(out);
- }
-}
-
-int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp)
-{
- bch_stacktrace stack = { 0 };
- int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp);
-
- bch2_prt_backtrace(out, &stack);
- darray_exit(&stack);
- return ret;
-}
-
-#ifndef __KERNEL__
-#include <time.h>
-void bch2_prt_datetime(struct printbuf *out, time64_t sec)
-{
- time_t t = sec;
- char buf[64];
- ctime_r(&t, buf);
- strim(buf);
- prt_str(out, buf);
-}
-#else
-void bch2_prt_datetime(struct printbuf *out, time64_t sec)
-{
- char buf[64];
- snprintf(buf, sizeof(buf), "%ptT", &sec);
- prt_str(out, buf);
-}
-#endif
-
-void bch2_pr_time_units(struct printbuf *out, u64 ns)
-{
- const struct time_unit *u = bch2_pick_time_units(ns);
-
- prt_printf(out, "%llu %s", div64_u64(ns, u->nsecs), u->name);
-}
-
-static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
-{
- const struct time_unit *u = bch2_pick_time_units(ns);
-
- prt_printf(out, "%llu \r%s", div64_u64(ns, u->nsecs), u->name);
-}
-
-static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
-{
- prt_printf(out, "%s\t", name);
- bch2_pr_time_units_aligned(out, ns);
- prt_newline(out);
-}
-
-#define TABSTOP_SIZE 12
-
-void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
-{
- struct quantiles *quantiles = time_stats_to_quantiles(stats);
- s64 f_mean = 0, d_mean = 0;
- u64 f_stddev = 0, d_stddev = 0;
-
- if (stats->buffer) {
- int cpu;
-
- spin_lock_irq(&stats->lock);
- for_each_possible_cpu(cpu)
- __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
- spin_unlock_irq(&stats->lock);
- }
-
- /*
- * avoid divide by zero
- */
- if (stats->freq_stats.n) {
- f_mean = mean_and_variance_get_mean(stats->freq_stats);
- f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
- d_mean = mean_and_variance_get_mean(stats->duration_stats);
- d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
- }
-
- printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE);
- prt_printf(out, "count:\t%llu\n", stats->duration_stats.n);
- printbuf_tabstop_pop(out);
-
- printbuf_tabstops_reset(out);
-
- printbuf_tabstop_push(out, out->indent + 20);
- printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
- printbuf_tabstop_push(out, 0);
- printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
-
- prt_printf(out, "\tsince mount\r\trecent\r\n");
-
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, out->indent + 20);
- printbuf_tabstop_push(out, TABSTOP_SIZE);
- printbuf_tabstop_push(out, 2);
- printbuf_tabstop_push(out, TABSTOP_SIZE);
-
- prt_printf(out, "duration of events\n");
- printbuf_indent_add(out, 2);
-
- pr_name_and_units(out, "min:", stats->min_duration);
- pr_name_and_units(out, "max:", stats->max_duration);
- pr_name_and_units(out, "total:", stats->total_duration);
-
- prt_printf(out, "mean:\t");
- bch2_pr_time_units_aligned(out, d_mean);
- prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
- prt_newline(out);
-
- prt_printf(out, "stddev:\t");
- bch2_pr_time_units_aligned(out, d_stddev);
- prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
-
- printbuf_indent_sub(out, 2);
- prt_newline(out);
-
- prt_printf(out, "time between events\n");
- printbuf_indent_add(out, 2);
-
- pr_name_and_units(out, "min:", stats->min_freq);
- pr_name_and_units(out, "max:", stats->max_freq);
-
- prt_printf(out, "mean:\t");
- bch2_pr_time_units_aligned(out, f_mean);
- prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
- prt_newline(out);
-
- prt_printf(out, "stddev:\t");
- bch2_pr_time_units_aligned(out, f_stddev);
- prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
-
- printbuf_indent_sub(out, 2);
- prt_newline(out);
-
- printbuf_tabstops_reset(out);
-
- if (quantiles) {
- int i = eytzinger0_first(NR_QUANTILES);
- const struct time_unit *u =
- bch2_pick_time_units(quantiles->entries[i].m);
- u64 last_q = 0;
-
- prt_printf(out, "quantiles (%s):\t", u->name);
- eytzinger0_for_each(j, NR_QUANTILES) {
- bool is_last = eytzinger0_next(j, NR_QUANTILES) == -1;
-
- u64 q = max(quantiles->entries[j].m, last_q);
- prt_printf(out, "%llu ", div64_u64(q, u->nsecs));
- if (is_last)
- prt_newline(out);
- last_q = q;
- }
- }
-}
-
-/* ratelimit: */
-
-/**
- * bch2_ratelimit_delay() - return how long to delay until the next time to do
- * some work
- * @d: the struct bch_ratelimit to update
- * Returns: the amount of time to delay by, in jiffies
- */
-u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
-{
- u64 now = local_clock();
-
- return time_after64(d->next, now)
- ? nsecs_to_jiffies(d->next - now)
- : 0;
-}
-
-/**
- * bch2_ratelimit_increment() - increment @d by the amount of work done
- * @d: the struct bch_ratelimit to update
- * @done: the amount of work done, in arbitrary units
- */
-void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
-{
- u64 now = local_clock();
-
- d->next += div_u64(done * NSEC_PER_SEC, d->rate);
-
- if (time_before64(now + NSEC_PER_SEC, d->next))
- d->next = now + NSEC_PER_SEC;
-
- if (time_after64(now - NSEC_PER_SEC * 2, d->next))
- d->next = now - NSEC_PER_SEC * 2;
-}
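
Together these two functions implement token-bucket style pacing: account
each unit of work with bch2_ratelimit_increment(), then sleep for whatever
bch2_ratelimit_delay() returns (in jiffies). A sketch of the intended loop --
have_work() and do_one_chunk() are placeholders:

    static void paced_loop(struct bch_ratelimit *d)
    {
            while (have_work()) {
                    u64 done = do_one_chunk();

                    bch2_ratelimit_increment(d, done);

                    u64 delay = bch2_ratelimit_delay(d);
                    if (delay)
                            schedule_timeout_interruptible(delay);
            }
    }
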
-
-/* pd controller: */
-
-/*
- * Updates pd_controller. Attempts to scale input values to units per second.
- * @target: desired value
- * @actual: current value
- *
- * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing
- * it makes actual go down.
- */
-void bch2_pd_controller_update(struct bch_pd_controller *pd,
- s64 target, s64 actual, int sign)
-{
- s64 proportional, derivative, change;
-
- unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ;
-
- if (seconds_since_update == 0)
- return;
-
- pd->last_update = jiffies;
-
- proportional = actual - target;
- proportional *= seconds_since_update;
- proportional = div_s64(proportional, pd->p_term_inverse);
-
- derivative = actual - pd->last_actual;
- derivative = div_s64(derivative, seconds_since_update);
- derivative = ewma_add(pd->smoothed_derivative, derivative,
- (pd->d_term / seconds_since_update) ?: 1);
- derivative = derivative * pd->d_term;
- derivative = div_s64(derivative, pd->p_term_inverse);
-
- change = proportional + derivative;
-
- /* Don't increase rate if not keeping up */
- if (change > 0 &&
- pd->backpressure &&
- time_after64(local_clock(),
- pd->rate.next + NSEC_PER_MSEC))
- change = 0;
-
- change *= (sign * -1);
-
- pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change,
- 1, UINT_MAX);
-
- pd->last_actual = actual;
- pd->last_derivative = derivative;
- pd->last_proportional = proportional;
- pd->last_change = change;
- pd->last_target = target;
-}
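
The controller is meant to be initialized once and then fed (target, actual)
samples periodically; the embedded pd->rate is the same struct bch_ratelimit
as above, so its output plugs straight into the pacing loop. A wiring sketch,
with the once-per-second caller assumed:

    struct bch_pd_controller pd;

    bch2_pd_controller_init(&pd);

    /* called periodically, e.g. once per second: */
    bch2_pd_controller_update(&pd, target, actual, 1);

    /* pd.rate.rate is now nudged so `actual` converges on `target`;
     * pd.rate then feeds bch2_ratelimit_increment()/_delay() as above. */
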
-
-void bch2_pd_controller_init(struct bch_pd_controller *pd)
-{
- pd->rate.rate = 1024;
- pd->last_update = jiffies;
- pd->p_term_inverse = 6000;
- pd->d_term = 30;
- pd->d_smooth = pd->d_term;
- pd->backpressure = 1;
-}
-
-void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd)
-{
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 20);
-
- prt_printf(out, "rate:\t");
- prt_human_readable_s64(out, pd->rate.rate);
- prt_newline(out);
-
- prt_printf(out, "target:\t");
- prt_human_readable_u64(out, pd->last_target);
- prt_newline(out);
-
- prt_printf(out, "actual:\t");
- prt_human_readable_u64(out, pd->last_actual);
- prt_newline(out);
-
- prt_printf(out, "proportional:\t");
- prt_human_readable_s64(out, pd->last_proportional);
- prt_newline(out);
-
- prt_printf(out, "derivative:\t");
- prt_human_readable_s64(out, pd->last_derivative);
- prt_newline(out);
-
- prt_printf(out, "change:\t");
- prt_human_readable_s64(out, pd->last_change);
- prt_newline(out);
-
- prt_printf(out, "next io:\t%llims\n", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC));
-}
-
-/* misc: */
-
-void bch2_bio_map(struct bio *bio, void *base, size_t size)
-{
- while (size) {
- struct page *page = is_vmalloc_addr(base)
- ? vmalloc_to_page(base)
- : virt_to_page(base);
- unsigned offset = offset_in_page(base);
- unsigned len = min_t(size_t, PAGE_SIZE - offset, size);
-
- BUG_ON(!bio_add_page(bio, page, len, offset));
- size -= len;
- base += len;
- }
-}
-
-int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
-{
- while (size) {
- struct page *page = alloc_pages(gfp_mask, 0);
- unsigned len = min_t(size_t, PAGE_SIZE, size);
-
- if (!page)
- return -ENOMEM;
-
- if (unlikely(!bio_add_page(bio, page, len, 0))) {
- __free_page(page);
- break;
- }
-
- size -= len;
- }
-
- return 0;
-}
-
-u64 bch2_get_random_u64_below(u64 ceil)
-{
- if (ceil <= U32_MAX)
- return __get_random_u32_below(ceil);
-
- /* this is the same (clever) algorithm as in __get_random_u32_below() */
- u64 rand = get_random_u64();
- u64 mult = ceil * rand;
-
- if (unlikely(mult < ceil)) {
- u64 bound;
- div64_u64_rem(-ceil, ceil, &bound);
- while (unlikely(mult < bound)) {
- rand = get_random_u64();
- mult = ceil * rand;
- }
- }
-
- return mul_u64_u64_shr(ceil, rand, 64);
-}
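
This is Lemire's multiply-shift method for bounded random numbers, sketched
as math:

    result = floor(ceil * rand / 2^64)    /* high 64 bits of the 128-bit product */

Each result value corresponds to either floor(2^64/ceil) or
floor(2^64/ceil) + 1 values of rand; rejecting products whose low 64 bits
fall below bound = 2^64 mod ceil (computed as -ceil mod ceil in 64-bit
arithmetic) trims every bucket to exactly floor(2^64/ceil) values, so the
result is uniform without a 64-bit divide on the fast path.
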
-
-void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src)
-{
- struct bio_vec bv;
- struct bvec_iter iter;
-
- __bio_for_each_segment(bv, dst, iter, dst_iter) {
- void *dstp = kmap_local_page(bv.bv_page);
-
- memcpy(dstp + bv.bv_offset, src, bv.bv_len);
- kunmap_local(dstp);
-
- src += bv.bv_len;
- }
-}
-
-void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
-{
- struct bio_vec bv;
- struct bvec_iter iter;
-
- __bio_for_each_segment(bv, src, iter, src_iter) {
- void *srcp = kmap_local_page(bv.bv_page);
-
- memcpy(dst, srcp + bv.bv_offset, bv.bv_len);
- kunmap_local(srcp);
-
- dst += bv.bv_len;
- }
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_corrupt_bio(struct bio *bio)
-{
- struct bvec_iter iter;
- struct bio_vec bv;
- unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64));
-
- bio_for_each_segment(bv, bio, iter) {
- unsigned u64s = bv.bv_len / sizeof(u64);
-
- if (offset < u64s) {
- u64 *segment = bvec_kmap_local(&bv);
- segment[offset] = get_random_u64();
- kunmap_local(segment);
- return;
- }
- offset -= u64s;
- }
-}
-#endif
-
-#if 0
-void eytzinger1_test(void)
-{
- unsigned inorder, size;
-
- pr_info("1 based eytzinger test:\n");
-
- for (size = 2;
- size < 65536;
- size++) {
- unsigned extra = eytzinger1_extra(size);
-
- if (!(size % 4096))
- pr_info("tree size %u\n", size);
-
- inorder = 1;
- eytzinger1_for_each(eytz, size) {
- BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz);
- BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder);
- BUG_ON(eytz != eytzinger1_last(size) &&
- eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz);
-
- inorder++;
- }
- BUG_ON(inorder - 1 != size);
- }
-}
-
-void eytzinger0_test(void)
-{
-
- unsigned inorder, size;
-
- pr_info("0 based eytzinger test:\n");
-
- for (size = 1;
- size < 65536;
- size++) {
- unsigned extra = eytzinger0_extra(size);
-
- if (!(size % 4096))
- pr_info("tree size %u\n", size);
-
- inorder = 0;
- eytzinger0_for_each(eytz, size) {
- BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz);
- BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder);
- BUG_ON(eytz != eytzinger0_last(size) &&
- eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz);
-
- inorder++;
- }
- BUG_ON(inorder != size);
-
- inorder = size - 1;
- eytzinger0_for_each_prev(eytz, size) {
- BUG_ON(eytz != eytzinger0_first(size) &&
- eytzinger0_next(eytzinger0_prev(eytz, size), size) != eytz);
-
- inorder--;
- }
- BUG_ON(inorder != -1);
- }
-}
-
-static inline int cmp_u16(const void *_l, const void *_r)
-{
- const u16 *l = _l, *r = _r;
-
- return (*l > *r) - (*r > *l);
-}
-
-static void eytzinger0_find_test_le(u16 *test_array, unsigned nr, u16 search)
-{
- int r, s;
- bool bad;
-
- r = eytzinger0_find_le(test_array, nr,
- sizeof(test_array[0]),
- cmp_u16, &search);
- if (r >= 0) {
- if (test_array[r] > search) {
- bad = true;
- } else {
- s = eytzinger0_next(r, nr);
- bad = s >= 0 && test_array[s] <= search;
- }
- } else {
- s = eytzinger0_last(nr);
- bad = s >= 0 && test_array[s] <= search;
- }
-
- if (bad) {
- s = -1;
- eytzinger0_for_each_prev(j, nr) {
- if (test_array[j] <= search) {
- s = j;
- break;
- }
- }
-
- eytzinger0_for_each(j, nr)
- pr_info("[%3u] = %12u\n", j, test_array[j]);
- pr_info("find_le(%12u) = %3i should be %3i\n",
- search, r, s);
- BUG();
- }
-}
-
-static void eytzinger0_find_test_gt(u16 *test_array, unsigned nr, u16 search)
-{
- int r, s;
- bool bad;
-
- r = eytzinger0_find_gt(test_array, nr,
- sizeof(test_array[0]),
- cmp_u16, &search);
- if (r >= 0) {
- if (test_array[r] <= search) {
- bad = true;
- } else {
- s = eytzinger0_prev(r, nr);
- bad = s >= 0 && test_array[s] > search;
- }
- } else {
- s = eytzinger0_first(nr);
- bad = s >= 0 && test_array[s] > search;
- }
-
- if (bad) {
- s = -1;
- eytzinger0_for_each(j, nr) {
- if (test_array[j] > search) {
- s = j;
- break;
- }
- }
-
- eytzinger0_for_each(j, nr)
- pr_info("[%3u] = %12u\n", j, test_array[j]);
- pr_info("find_gt(%12u) = %3i should be %3i\n",
- search, r, s);
- BUG();
- }
-}
-
-static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search)
-{
- int r, s;
- bool bad;
-
- r = eytzinger0_find_ge(test_array, nr,
- sizeof(test_array[0]),
- cmp_u16, &search);
- if (r >= 0) {
- if (test_array[r] < search) {
- bad = true;
- } else {
- s = eytzinger0_prev(r, nr);
- bad = s >= 0 && test_array[s] >= search;
- }
- } else {
- s = eytzinger0_first(nr);
- bad = s >= 0 && test_array[s] >= search;
- }
-
- if (bad) {
- s = -1;
- eytzinger0_for_each(j, nr) {
- if (test_array[j] >= search) {
- s = j;
- break;
- }
- }
-
- eytzinger0_for_each(j, nr)
- pr_info("[%3u] = %12u\n", j, test_array[j]);
- pr_info("find_ge(%12u) = %3i should be %3i\n",
- search, r, s);
- BUG();
- }
-}
-
-static void eytzinger0_find_test_eq(u16 *test_array, unsigned nr, u16 search)
-{
- unsigned r;
- int s;
- bool bad;
-
- r = eytzinger0_find(test_array, nr,
- sizeof(test_array[0]),
- cmp_u16, &search);
-
- if (r < nr) {
- bad = test_array[r] != search;
- } else {
- s = eytzinger0_find_le(test_array, nr,
- sizeof(test_array[0]),
- cmp_u16, &search);
- bad = s >= 0 && test_array[s] == search;
- }
-
- if (bad) {
- eytzinger0_for_each(j, nr)
- pr_info("[%3u] = %12u\n", j, test_array[j]);
- pr_info("find(%12u) = %3i is incorrect\n",
- search, r);
- BUG();
- }
-}
-
-static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
-{
- eytzinger0_find_test_le(test_array, nr, search);
- eytzinger0_find_test_gt(test_array, nr, search);
- eytzinger0_find_test_ge(test_array, nr, search);
- eytzinger0_find_test_eq(test_array, nr, search);
-}
-
-void eytzinger0_find_test(void)
-{
- unsigned i, nr, allocated = 1 << 12;
- u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL);
-
- for (nr = 1; nr < allocated; nr++) {
- u16 prev = 0;
-
- pr_info("testing %u elems\n", nr);
-
- get_random_bytes(test_array, nr * sizeof(test_array[0]));
- eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL);
-
- /* verify array is sorted correctly: */
- eytzinger0_for_each(j, nr) {
- BUG_ON(test_array[j] < prev);
- prev = test_array[j];
- }
-
- for (i = 0; i < U16_MAX; i += 1 << 12)
- eytzinger0_find_test_val(test_array, nr, i);
-
- for (i = 0; i < nr; i++) {
- eytzinger0_find_test_val(test_array, nr, test_array[i] - 1);
- eytzinger0_find_test_val(test_array, nr, test_array[i]);
- eytzinger0_find_test_val(test_array, nr, test_array[i] + 1);
- }
- }
-
- kfree(test_array);
-}
-#endif
-
-/*
- * Accumulate percpu counters onto one cpu's copy - only valid when concurrent
- * access to the percpu counters is excluded by other locking
- */
-u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
-{
- u64 *ret;
- int cpu;
-
- /* access to pcpu vars has to be blocked by other locking */
- preempt_disable();
- ret = this_cpu_ptr(p);
- preempt_enable();
-
- for_each_possible_cpu(cpu) {
- u64 *i = per_cpu_ptr(p, cpu);
-
- if (i != ret) {
- acc_u64s(ret, i, nr);
- memset(i, 0, nr * sizeof(u64));
- }
- }
-
- return ret;
-}
-
-void bch2_darray_str_exit(darray_str *d)
-{
- darray_for_each(*d, i)
- kfree(*i);
- darray_exit(d);
-}
-
-int bch2_split_devs(const char *_dev_name, darray_str *ret)
-{
- darray_init(ret);
-
- char *dev_name, *s, *orig;
-
- dev_name = orig = kstrdup(_dev_name, GFP_KERNEL);
- if (!dev_name)
- return -ENOMEM;
-
- while ((s = strsep(&dev_name, ":"))) {
- char *p = kstrdup(s, GFP_KERNEL);
- if (!p)
- goto err;
-
- if (darray_push(ret, p)) {
- kfree(p);
- goto err;
- }
- }
-
- kfree(orig);
- return 0;
-err:
- bch2_darray_str_exit(ret);
- kfree(orig);
- return -ENOMEM;
-}
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
deleted file mode 100644
index 7d921fc920a0..000000000000
--- a/fs/bcachefs/util.h
+++ /dev/null
@@ -1,741 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_UTIL_H
-#define _BCACHEFS_UTIL_H
-
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/closure.h>
-#include <linux/errno.h>
-#include <linux/freezer.h>
-#include <linux/kernel.h>
-#include <linux/min_heap.h>
-#include <linux/sched/clock.h>
-#include <linux/llist.h>
-#include <linux/log2.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/ratelimit.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/workqueue.h>
-
-#include "mean_and_variance.h"
-
-#include "darray.h"
-#include "time_stats.h"
-
-struct closure;
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define EBUG_ON(cond) BUG_ON(cond)
-#else
-#define EBUG_ON(cond)
-#endif
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define CPU_BIG_ENDIAN 0
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-#define CPU_BIG_ENDIAN 1
-#endif
-
-/* type hackery */
-
-#define type_is_exact(_val, _type) \
- __builtin_types_compatible_p(typeof(_val), _type)
-
-#define type_is(_val, _type) \
- (__builtin_types_compatible_p(typeof(_val), _type) || \
- __builtin_types_compatible_p(typeof(_val), const _type))
-
-/* Userspace doesn't align allocations as nicely as the kernel allocators: */
-static inline size_t buf_pages(void *p, size_t len)
-{
- return DIV_ROUND_UP(len +
- ((unsigned long) p & (PAGE_SIZE - 1)),
- PAGE_SIZE);
-}
-
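-/*
- * kvmalloc() won't service allocations of INT_MAX or larger, hence the
- * direct vmalloc() fallback; __GFP_ZERO is masked off and the zeroing done
- * by hand so that both paths behave the same:
- */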
-static inline void *bch2_kvmalloc(size_t n, gfp_t flags)
-{
- void *p = unlikely(n >= INT_MAX)
- ? vmalloc(n)
- : kvmalloc(n, flags & ~__GFP_ZERO);
- if (p && (flags & __GFP_ZERO))
- memset(p, 0, n);
- return p;
-}
-
-#define init_heap(heap, _size, gfp) \
-({ \
- (heap)->nr = 0; \
- (heap)->size = (_size); \
- (heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\
- (gfp)); \
-})
-
-#define free_heap(heap) \
-do { \
- kvfree((heap)->data); \
- (heap)->data = NULL; \
-} while (0)
-
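-/* Maximum value representable by any signed integer type t, e.g. 127 for s8: */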
-#define ANYSINT_MAX(t) \
- ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
-
-#include "printbuf.h"
-
-#define prt_vprintf(_out, ...) bch2_prt_vprintf(_out, __VA_ARGS__)
-#define prt_printf(_out, ...) bch2_prt_printf(_out, __VA_ARGS__)
-#define printbuf_str(_buf) bch2_printbuf_str(_buf)
-#define printbuf_exit(_buf) bch2_printbuf_exit(_buf)
-
-#define printbuf_tabstops_reset(_buf) bch2_printbuf_tabstops_reset(_buf)
-#define printbuf_tabstop_pop(_buf) bch2_printbuf_tabstop_pop(_buf)
-#define printbuf_tabstop_push(_buf, _n) bch2_printbuf_tabstop_push(_buf, _n)
-
-#define printbuf_indent_add(_out, _n) bch2_printbuf_indent_add(_out, _n)
-#define printbuf_indent_sub(_out, _n) bch2_printbuf_indent_sub(_out, _n)
-
-#define prt_newline(_out) bch2_prt_newline(_out)
-#define prt_tab(_out) bch2_prt_tab(_out)
-#define prt_tab_rjust(_out) bch2_prt_tab_rjust(_out)
-
-#define prt_bytes_indented(...) bch2_prt_bytes_indented(__VA_ARGS__)
-#define prt_u64(_out, _v) prt_printf(_out, "%llu", (u64) (_v))
-#define prt_human_readable_u64(...) bch2_prt_human_readable_u64(__VA_ARGS__)
-#define prt_human_readable_s64(...) bch2_prt_human_readable_s64(__VA_ARGS__)
-#define prt_units_u64(...) bch2_prt_units_u64(__VA_ARGS__)
-#define prt_units_s64(...) bch2_prt_units_s64(__VA_ARGS__)
-#define prt_string_option(...) bch2_prt_string_option(__VA_ARGS__)
-#define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__)
-#define prt_bitflags_vector(...) bch2_prt_bitflags_vector(__VA_ARGS__)
-
-void bch2_pr_time_units(struct printbuf *, u64);
-void bch2_prt_datetime(struct printbuf *, time64_t);
-
-#ifdef __KERNEL__
-static inline void uuid_unparse_lower(u8 *uuid, char *out)
-{
- sprintf(out, "%pUb", uuid);
-}
-#else
-#include <uuid/uuid.h>
-#endif
-
-static inline void pr_uuid(struct printbuf *out, u8 *uuid)
-{
- char uuid_str[40];
-
- uuid_unparse_lower(uuid, uuid_str);
- prt_printf(out, "%s", uuid_str);
-}
-
-int bch2_strtoint_h(const char *, int *);
-int bch2_strtouint_h(const char *, unsigned int *);
-int bch2_strtoll_h(const char *, long long *);
-int bch2_strtoull_h(const char *, unsigned long long *);
-int bch2_strtou64_h(const char *, u64 *);
-
-static inline int bch2_strtol_h(const char *cp, long *res)
-{
-#if BITS_PER_LONG == 32
- return bch2_strtoint_h(cp, (int *) res);
-#else
- return bch2_strtoll_h(cp, (long long *) res);
-#endif
-}
-
-static inline int bch2_strtoul_h(const char *cp, long *res)
-{
-#if BITS_PER_LONG == 32
- return bch2_strtouint_h(cp, (unsigned int *) res);
-#else
- return bch2_strtoull_h(cp, (unsigned long long *) res);
-#endif
-}
-
-#define strtoi_h(cp, res) \
- ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\
- : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\
- : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\
- : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\
- : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\
- : type_is(*res, unsigned long long) ? bch2_strtoull_h(cp, (void *) res)\
- : -EINVAL)
-
-#define strtoul_safe(cp, var) \
-({ \
- unsigned long _v; \
- int _r = kstrtoul(cp, 10, &_v); \
- if (!_r) \
- var = _v; \
- _r; \
-})
-
-#define strtoul_safe_clamp(cp, var, min, max) \
-({ \
- unsigned long _v; \
- int _r = kstrtoul(cp, 10, &_v); \
- if (!_r) \
- var = clamp_t(typeof(var), _v, min, max); \
- _r; \
-})
-
-#define strtoul_safe_restrict(cp, var, min, max) \
-({ \
- unsigned long _v; \
- int _r = kstrtoul(cp, 10, &_v); \
- if (!_r && _v >= min && _v <= max) \
- var = _v; \
- else \
- _r = -EINVAL; \
- _r; \
-})
-
-#define snprint(out, var) \
- prt_printf(out, \
- type_is(var, int) ? "%i\n" \
- : type_is(var, unsigned) ? "%u\n" \
- : type_is(var, long) ? "%li\n" \
- : type_is(var, unsigned long) ? "%lu\n" \
- : type_is(var, s64) ? "%lli\n" \
- : type_is(var, u64) ? "%llu\n" \
- : type_is(var, char *) ? "%s\n" \
- : "%i\n", var)
-
-bool bch2_is_zero(const void *, size_t);
-
-u64 bch2_read_flag_list(const char *, const char * const[]);
-
-void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
-void bch2_prt_u64_base2(struct printbuf *, u64);
-
-void bch2_print_string_as_lines(const char *prefix, const char *lines);
-void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines);
-
-typedef DARRAY(unsigned long) bch_stacktrace;
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t);
-void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *);
-int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned, gfp_t);
-
-static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
-{
-#ifdef __KERNEL__
- prt_printf(out, "%pg", bdev);
-#else
- prt_str(out, bdev->name);
-#endif
-}
-
-void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
-
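-/*
- * Exponentially weighted moving average: returns the updated average,
- * i.e. ewma + (val - ewma) / 2^weight evaluated with a single rounding
- * step - with weight 3, each update moves the average 1/8th of the way
- * towards val.
- */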
-#define ewma_add(ewma, val, weight) \
-({ \
- typeof(ewma) _ewma = (ewma); \
- typeof(weight) _weight = (weight); \
- \
- (((_ewma << _weight) - _ewma) + (val)) >> _weight; \
-})
-
-struct bch_ratelimit {
- /* Next time we want to do some work, in nanoseconds */
- u64 next;
-
- /*
-	 * Rate at which we want to do work, in units per nanosecond.
- * The units here correspond to the units passed to
- * bch2_ratelimit_increment()
- */
- unsigned rate;
-};
-
-static inline void bch2_ratelimit_reset(struct bch_ratelimit *d)
-{
- d->next = local_clock();
-}
-
-u64 bch2_ratelimit_delay(struct bch_ratelimit *);
-void bch2_ratelimit_increment(struct bch_ratelimit *, u64);
-
-struct bch_pd_controller {
- struct bch_ratelimit rate;
- unsigned long last_update;
-
- s64 last_actual;
- s64 smoothed_derivative;
-
- unsigned p_term_inverse;
- unsigned d_smooth;
- unsigned d_term;
-
- /* for exporting to sysfs (no effect on behavior) */
- s64 last_derivative;
- s64 last_proportional;
- s64 last_change;
- s64 last_target;
-
- /*
- * If true, the rate will not increase if bch2_ratelimit_delay()
- * is not being called often enough.
- */
- bool backpressure;
-};
-
-void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int);
-void bch2_pd_controller_init(struct bch_pd_controller *);
-void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *);
-
-#define sysfs_pd_controller_attribute(name) \
- rw_attribute(name##_rate); \
- rw_attribute(name##_rate_bytes); \
- rw_attribute(name##_rate_d_term); \
- rw_attribute(name##_rate_p_term_inverse); \
- read_attribute(name##_rate_debug)
-
-#define sysfs_pd_controller_files(name) \
- &sysfs_##name##_rate, \
- &sysfs_##name##_rate_bytes, \
- &sysfs_##name##_rate_d_term, \
- &sysfs_##name##_rate_p_term_inverse, \
- &sysfs_##name##_rate_debug
-
-#define sysfs_pd_controller_show(name, var) \
-do { \
- sysfs_hprint(name##_rate, (var)->rate.rate); \
- sysfs_print(name##_rate_bytes, (var)->rate.rate); \
- sysfs_print(name##_rate_d_term, (var)->d_term); \
- sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \
- \
- if (attr == &sysfs_##name##_rate_debug) \
- bch2_pd_controller_debug_to_text(out, var); \
-} while (0)
-
-#define sysfs_pd_controller_store(name, var) \
-do { \
- sysfs_strtoul_clamp(name##_rate, \
- (var)->rate.rate, 1, UINT_MAX); \
- sysfs_strtoul_clamp(name##_rate_bytes, \
- (var)->rate.rate, 1, UINT_MAX); \
- sysfs_strtoul(name##_rate_d_term, (var)->d_term); \
- sysfs_strtoul_clamp(name##_rate_p_term_inverse, \
- (var)->p_term_inverse, 1, INT_MAX); \
-} while (0)
-
-#define container_of_or_null(ptr, type, member) \
-({ \
- typeof(ptr) _ptr = ptr; \
- _ptr ? container_of(_ptr, type, member) : NULL; \
-})
-
-static inline struct list_head *list_pop(struct list_head *head)
-{
- if (list_empty(head))
- return NULL;
-
- struct list_head *ret = head->next;
- list_del_init(ret);
- return ret;
-}
-
-#define list_pop_entry(head, type, member) \
- container_of_or_null(list_pop(head), type, member)
-
-/* Does linear interpolation between powers of two */
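-/*
- * i.e. x is a fixed point number with @fract_bits fractional bits, and the
- * result approximates 2^x: fract_exp_two(6, 2) reads x as 1.5 and returns
- * 2 + ((2 * 2) >> 2) = 3, close to 2^1.5 ~= 2.83.
- */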
-static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
-{
- unsigned fract = x & ~(~0 << fract_bits);
-
- x >>= fract_bits;
- x = 1 << x;
- x += (x * fract) >> fract_bits;
-
- return x;
-}
-
-void bch2_bio_map(struct bio *bio, void *base, size_t);
-int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
-
-#define closure_bio_submit(bio, cl) \
-do { \
- closure_get(cl); \
- submit_bio(bio); \
-} while (0)
-
-#define kthread_wait(cond) \
-({ \
- int _ret = 0; \
- \
- while (1) { \
- set_current_state(TASK_INTERRUPTIBLE); \
- if (kthread_should_stop()) { \
- _ret = -1; \
- break; \
- } \
- \
- if (cond) \
- break; \
- \
- schedule(); \
- } \
- set_current_state(TASK_RUNNING); \
- _ret; \
-})
-
-#define kthread_wait_freezable(cond) \
-({ \
- int _ret = 0; \
- while (1) { \
- set_current_state(TASK_INTERRUPTIBLE); \
- if (kthread_should_stop()) { \
- _ret = -1; \
- break; \
- } \
- \
- if (cond) \
- break; \
- \
- schedule(); \
- try_to_freeze(); \
- } \
- set_current_state(TASK_RUNNING); \
- _ret; \
-})
-
-u64 bch2_get_random_u64_below(u64);
-
-void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
-void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_corrupt_bio(struct bio *);
-
-static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio)
-{
- if (ratio && !get_random_u32_below(ratio))
- bch2_corrupt_bio(bio);
-}
-#else
-#define bch2_maybe_corrupt_bio(...) do {} while (0)
-#endif
-
-static inline void memcpy_u64s_small(void *dst, const void *src,
- unsigned u64s)
-{
- u64 *d = dst;
- const u64 *s = src;
-
- while (u64s--)
- *d++ = *s++;
-}
-
-static inline void __memcpy_u64s(void *dst, const void *src,
- unsigned u64s)
-{
-#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN)
- long d0, d1, d2;
-
- asm volatile("rep ; movsq"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (u64s), "1" (dst), "2" (src)
- : "memory");
-#else
- u64 *d = dst;
- const u64 *s = src;
-
- while (u64s--)
- *d++ = *s++;
-#endif
-}
-
-static inline void memcpy_u64s(void *dst, const void *src,
- unsigned u64s)
-{
- EBUG_ON(!(dst >= src + u64s * sizeof(u64) ||
- dst + u64s * sizeof(u64) <= src));
-
- __memcpy_u64s(dst, src, u64s);
-}
-
-static inline void __memmove_u64s_down(void *dst, const void *src,
- unsigned u64s)
-{
- __memcpy_u64s(dst, src, u64s);
-}
-
-static inline void memmove_u64s_down(void *dst, const void *src,
- unsigned u64s)
-{
- EBUG_ON(dst > src);
-
- __memmove_u64s_down(dst, src, u64s);
-}
-
-static inline void __memmove_u64s_down_small(void *dst, const void *src,
- unsigned u64s)
-{
- memcpy_u64s_small(dst, src, u64s);
-}
-
-static inline void memmove_u64s_down_small(void *dst, const void *src,
- unsigned u64s)
-{
- EBUG_ON(dst > src);
-
- __memmove_u64s_down_small(dst, src, u64s);
-}
-
-static inline void __memmove_u64s_up_small(void *_dst, const void *_src,
- unsigned u64s)
-{
- u64 *dst = (u64 *) _dst + u64s;
- u64 *src = (u64 *) _src + u64s;
-
- while (u64s--)
- *--dst = *--src;
-}
-
-static inline void memmove_u64s_up_small(void *dst, const void *src,
- unsigned u64s)
-{
- EBUG_ON(dst < src);
-
- __memmove_u64s_up_small(dst, src, u64s);
-}
-
-static inline void __memmove_u64s_up(void *_dst, const void *_src,
- unsigned u64s)
-{
- u64 *dst = (u64 *) _dst + u64s - 1;
- u64 *src = (u64 *) _src + u64s - 1;
-
-#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN)
- long d0, d1, d2;
-
- asm volatile("std ;\n"
- "rep ; movsq\n"
- "cld ;\n"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (u64s), "1" (dst), "2" (src)
- : "memory");
-#else
- while (u64s--)
- *dst-- = *src--;
-#endif
-}
-
-static inline void memmove_u64s_up(void *dst, const void *src,
- unsigned u64s)
-{
- EBUG_ON(dst < src);
-
- __memmove_u64s_up(dst, src, u64s);
-}
-
-static inline void memmove_u64s(void *dst, const void *src,
- unsigned u64s)
-{
- if (dst < src)
- __memmove_u64s_down(dst, src, u64s);
- else
- __memmove_u64s_up(dst, src, u64s);
-}
-
-/* Set the bytes from @s + @bytes up to the next u64 boundary to @c: */
-static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
-{
- unsigned rem = round_up(bytes, sizeof(u64)) - bytes;
-
- memset(s + bytes, c, rem);
-}
-
-/* just the memmove, doesn't update @_nr */
-#define __array_insert_item(_array, _nr, _pos) \
- memmove(&(_array)[(_pos) + 1], \
- &(_array)[(_pos)], \
- sizeof((_array)[0]) * ((_nr) - (_pos)))
-
-#define array_insert_item(_array, _nr, _pos, _new_item) \
-do { \
- __array_insert_item(_array, _nr, _pos); \
- (_nr)++; \
- (_array)[(_pos)] = (_new_item); \
-} while (0)
-
-#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \
-do { \
- (_nr) -= (_nr_to_remove); \
- memmove(&(_array)[(_pos)], \
- &(_array)[(_pos) + (_nr_to_remove)], \
- sizeof((_array)[0]) * ((_nr) - (_pos))); \
-} while (0)
-
-#define array_remove_item(_array, _nr, _pos) \
- array_remove_items(_array, _nr, _pos, 1)
-
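-/*
- * Gap buffer helpers: of an array with @size slots holding @nr elements,
- * [0, gap) and [gap + size - nr, size) are live and the slots in between
- * are the gap; moving the gap only memmoves the elements it passes over.
- */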
-static inline void __move_gap(void *array, size_t element_size,
- size_t nr, size_t size,
- size_t old_gap, size_t new_gap)
-{
- size_t gap_end = old_gap + size - nr;
-
- if (new_gap < old_gap) {
- size_t move = old_gap - new_gap;
-
- memmove(array + element_size * (gap_end - move),
- array + element_size * (old_gap - move),
- element_size * move);
- } else if (new_gap > old_gap) {
- size_t move = new_gap - old_gap;
-
- memmove(array + element_size * old_gap,
- array + element_size * gap_end,
- element_size * move);
- }
-}
-
-/* Move the gap in a gap buffer: */
-#define move_gap(_d, _new_gap) \
-do { \
- BUG_ON(_new_gap > (_d)->nr); \
- BUG_ON((_d)->gap > (_d)->nr); \
- \
- __move_gap((_d)->data, sizeof((_d)->data[0]), \
- (_d)->nr, (_d)->size, (_d)->gap, _new_gap); \
- (_d)->gap = _new_gap; \
-} while (0)
-
-#define bubble_sort(_base, _nr, _cmp) \
-do { \
- ssize_t _i, _last; \
- bool _swapped = true; \
- \
-	for (_last = (ssize_t) (_nr) - 1; _last > 0 && _swapped; --_last) {\
- _swapped = false; \
- for (_i = 0; _i < _last; _i++) \
- if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \
- swap((_base)[_i], (_base)[_i + 1]); \
- _swapped = true; \
- } \
- } \
-} while (0)
-
-#define per_cpu_sum(_p) \
-({ \
- typeof(*_p) _ret = 0; \
- \
- int cpu; \
- for_each_possible_cpu(cpu) \
- _ret += *per_cpu_ptr(_p, cpu); \
- _ret; \
-})
-
-static inline u64 percpu_u64_get(u64 __percpu *src)
-{
- return per_cpu_sum(src);
-}
-
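-/*
- * Zero every cpu's copy and store @src on the local cpu, so that a
- * subsequent percpu_u64_get() sums back to @src:
- */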
-static inline void percpu_u64_set(u64 __percpu *dst, u64 src)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- *per_cpu_ptr(dst, cpu) = 0;
- this_cpu_write(*dst, src);
-}
-
-static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr)
-{
- for (unsigned i = 0; i < nr; i++)
- acc[i] += src[i];
-}
-
-static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src,
- unsigned nr)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- acc_u64s(acc, per_cpu_ptr(src, cpu), nr);
-}
-
-static inline void percpu_memset(void __percpu *p, int c, size_t bytes)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- memset(per_cpu_ptr(p, cpu), c, bytes);
-}
-
-u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
-
-#define cmp_int(l, r) ((l > r) - (l < r))
-
-static inline int u8_cmp(u8 l, u8 r)
-{
- return cmp_int(l, r);
-}
-
-static inline int cmp_le32(__le32 l, __le32 r)
-{
- return cmp_int(le32_to_cpu(l), le32_to_cpu(r));
-}
-
-#include <linux/uuid.h>
-
-static inline bool qstr_eq(const struct qstr l, const struct qstr r)
-{
- return l.len == r.len && !memcmp(l.name, r.name, l.len);
-}
-
-void bch2_darray_str_exit(darray_str *);
-int bch2_split_devs(const char *, darray_str *);
-
-#ifdef __KERNEL__
-
-__must_check
-static inline int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
-{
- return copy_to_user(to, from, n) ? -EFAULT : 0;
-}
-
-__must_check
-static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n)
-{
- return copy_from_user(to, from, n) ? -EFAULT : 0;
-}
-
-#endif
-
-static inline void mod_bit(long nr, volatile unsigned long *addr, bool v)
-{
- if (v)
- set_bit(nr, addr);
- else
- clear_bit(nr, addr);
-}
-
-static inline void __set_bit_le64(size_t bit, __le64 *addr)
-{
- addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64));
-}
-
-static inline void __clear_bit_le64(size_t bit, __le64 *addr)
-{
- addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64));
-}
-
-static inline bool test_bit_le64(size_t bit, __le64 *addr)
-{
- return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0;
-}
-
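-/* Copy @len bytes from @_src to @_dst, reversing the byte order: */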
-static inline void memcpy_swab(void *_dst, void *_src, size_t len)
-{
- u8 *dst = _dst + len;
- u8 *src = _src;
-
- while (len--)
- *--dst = *src++;
-}
-
-#endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c
deleted file mode 100644
index 6620ecae26af..000000000000
--- a/fs/bcachefs/varint.c
+++ /dev/null
@@ -1,130 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/bitops.h>
-#include <linux/math.h>
-#include <linux/string.h>
-#include <linux/unaligned.h>
-
-#ifdef CONFIG_VALGRIND
-#include <valgrind/memcheck.h>
-#endif
-
-#include "errcode.h"
-#include "varint.h"
-
-/**
- * bch2_varint_encode - encode a variable length integer
- * @out: destination to encode to
- * @v: unsigned integer to encode
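- *
- * Wire format: the count of trailing 1 bits in the first byte, terminated
- * by a 0 bit, gives the number of bytes that follow; the value occupies
- * the remaining bits, little endian. E.g. v = 300 needs 9 bits and thus
- * 2 bytes: (300 << 2) | 1 = 0x4b1 is written as the bytes b1 04, and the
- * decoder recovers the length as ffz(0xb1) + 1 == 2.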
- * Returns: size in bytes of the encoded integer - at most 9 bytes
- */
-int bch2_varint_encode(u8 *out, u64 v)
-{
- unsigned bits = fls64(v|1);
- unsigned bytes = DIV_ROUND_UP(bits, 7);
- __le64 v_le;
-
- if (likely(bytes < 9)) {
- v <<= bytes;
- v |= ~(~0 << (bytes - 1));
- v_le = cpu_to_le64(v);
- memcpy(out, &v_le, bytes);
- } else {
- *out++ = 255;
- bytes = 9;
- put_unaligned_le64(v, out);
- }
-
- return bytes;
-}
-
-/**
- * bch2_varint_decode - decode a variable length integer
- * @in: varint to decode
- * @end: end of buffer to decode from
- * @out: on success, decoded integer
- * Returns: size in bytes of the decoded integer - or a negative error code on
- * failure (would have read past the end of the buffer)
- */
-int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
-{
- unsigned bytes = likely(in < end)
- ? ffz(*in & 255) + 1
- : 1;
- u64 v;
-
- if (unlikely(in + bytes > end))
- return -BCH_ERR_varint_decode_error;
-
- if (likely(bytes < 9)) {
- __le64 v_le = 0;
-
- memcpy(&v_le, in, bytes);
- v = le64_to_cpu(v_le);
- v >>= bytes;
- } else {
- v = get_unaligned_le64(++in);
- }
-
- *out = v;
- return bytes;
-}
-
-/**
- * bch2_varint_encode_fast - fast version of bch2_varint_encode
- * @out: destination to encode to
- * @v: unsigned integer to encode
- * Returns: size in bytes of the encoded integer - at most 9 bytes
- *
- * This version assumes it's always safe to write 8 bytes to @out, even if the
- * encoded integer would be smaller.
- */
-int bch2_varint_encode_fast(u8 *out, u64 v)
-{
- unsigned bits = fls64(v|1);
- unsigned bytes = DIV_ROUND_UP(bits, 7);
-
- if (likely(bytes < 9)) {
- v <<= bytes;
- v |= ~(~0U << (bytes - 1));
- } else {
- *out++ = 255;
- bytes = 9;
- }
-
- put_unaligned_le64(v, out);
- return bytes;
-}
-
-/**
- * bch2_varint_decode_fast - fast version of bch2_varint_decode
- * @in: varint to decode
- * @end: end of buffer to decode from
- * @out: on success, decoded integer
- * Returns: size in bytes of the decoded integer - or a negative error code on
- * failure (would have read past the end of the buffer)
- *
- * This version assumes that it is safe to read at most 8 bytes past the end of
- * @end (we still return an error if the varint extends past @end).
- */
-int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out)
-{
-#ifdef CONFIG_VALGRIND
- VALGRIND_MAKE_MEM_DEFINED(in, 8);
-#endif
- u64 v = get_unaligned_le64(in);
- unsigned bytes = ffz(*in) + 1;
-
- if (unlikely(in + bytes > end))
- return -BCH_ERR_varint_decode_error;
-
- if (likely(bytes < 9)) {
- v >>= bytes;
- v &= ~(~0ULL << (7 * bytes));
- } else {
- v = get_unaligned_le64(++in);
- }
-
- *out = v;
- return bytes;
-}
diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h
deleted file mode 100644
index 92a182fb3d7a..000000000000
--- a/fs/bcachefs/varint.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_VARINT_H
-#define _BCACHEFS_VARINT_H
-
-int bch2_varint_encode(u8 *, u64);
-int bch2_varint_decode(const u8 *, const u8 *, u64 *);
-
-int bch2_varint_encode_fast(u8 *, u64);
-int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *);
-
-#endif /* _BCACHEFS_VARINT_H */
diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h
deleted file mode 100644
index 2ad338e282da..000000000000
--- a/fs/bcachefs/vstructs.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _VSTRUCTS_H
-#define _VSTRUCTS_H
-
-#include "util.h"
-
-/*
- * NOTE: we can't differentiate between __le64 and u64 with type_is - this
- * assumes u64 is little endian:
- */
-#define __vstruct_u64s(_s) \
-({ \
- ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \
- : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \
- : type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s) \
- : ((__force u8) ((_s)->u64s))); \
-})
-
-#define __vstruct_bytes(_type, _u64s) \
-({ \
- BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \
- \
- (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \
-})
-
-#define vstruct_bytes(_s) \
- __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s))
-
-#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \
- (round_up(__vstruct_bytes(_type, _u64s), \
- 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits)))
-
-#define vstruct_blocks(_s, _sector_block_bits) \
- __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s))
-
-#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \
- __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \
- __vstruct_u64s(_s) + (_u64s))
-
-#define vstruct_sectors(_s, _sector_block_bits) \
- (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9)
-
-#define vstruct_next(_s) \
- ((typeof(_s)) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
-#define vstruct_last(_s) \
- ((typeof(&(_s)->start[0])) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
-#define vstruct_end(_s) \
- ((void *) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
-
-#define vstruct_for_each(_s, _i) \
- for (typeof(&(_s)->start[0]) _i = (_s)->start; \
- _i < vstruct_last(_s); \
- _i = vstruct_next(_i))
-
-#define vstruct_for_each_safe(_s, _i) \
- for (typeof(&(_s)->start[0]) _next, _i = (_s)->start; \
- _i < vstruct_last(_s) && (_next = vstruct_next(_i), true); \
- _i = _next)
-
-#define vstruct_idx(_s, _idx) \
- ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx)))
-
-#endif /* _VSTRUCTS_H */
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
deleted file mode 100644
index f9667b944c0d..000000000000
--- a/fs/bcachefs/xattr.c
+++ /dev/null
@@ -1,631 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "acl.h"
-#include "bkey_methods.h"
-#include "btree_update.h"
-#include "extents.h"
-#include "fs.h"
-#include "rebalance.h"
-#include "str_hash.h"
-#include "xattr.h"
-
-#include <linux/dcache.h>
-#include <linux/posix_acl_xattr.h>
-#include <linux/xattr.h>
-
-static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned);
-
-static u64 bch2_xattr_hash(const struct bch_hash_info *info,
- const struct xattr_search_key *key)
-{
- struct bch_str_hash_ctx ctx;
-
- bch2_str_hash_init(&ctx, info);
- bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type));
- bch2_str_hash_update(&ctx, info, key->name.name, key->name.len);
-
- return bch2_str_hash_end(&ctx, info);
-}
-
-static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key)
-{
- return bch2_xattr_hash(info, key);
-}
-
-static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
-{
- struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k);
-
- return bch2_xattr_hash(info,
- &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len));
-}
-
-static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
-{
- struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
- const struct xattr_search_key *r = _r;
-
- return l.v->x_type != r->type ||
- l.v->x_name_len != r->name.len ||
- memcmp(l.v->x_name, r->name.name, r->name.len);
-}
-
-static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
-{
- struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
- struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r);
-
- return l.v->x_type != r.v->x_type ||
- l.v->x_name_len != r.v->x_name_len ||
- memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len);
-}
-
-const struct bch_hash_desc bch2_xattr_hash_desc = {
- .btree_id = BTREE_ID_xattrs,
- .key_type = KEY_TYPE_xattr,
- .hash_key = xattr_hash_key,
- .hash_bkey = xattr_hash_bkey,
- .cmp_key = xattr_cmp_key,
- .cmp_bkey = xattr_cmp_bkey,
-};
-
-int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
- unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len,
- le16_to_cpu(xattr.v->x_val_len));
- int ret = 0;
-
- bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s,
- c, xattr_val_size_too_small,
- "value too small (%zu < %u)",
- bkey_val_u64s(k.k), val_u64s);
-
- /* XXX why +4 ? */
- val_u64s = xattr_val_u64s(xattr.v->x_name_len,
- le16_to_cpu(xattr.v->x_val_len) + 4);
-
- bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s,
- c, xattr_val_size_too_big,
- "value too big (%zu > %u)",
- bkey_val_u64s(k.k), val_u64s);
-
- bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type),
- c, xattr_invalid_type,
- "invalid type (%u)", xattr.v->x_type);
-
- bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len),
- c, xattr_name_invalid_chars,
- "xattr name has invalid characters");
-fsck_err:
- return ret;
-}
-
-void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- const struct xattr_handler *handler;
- struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
-
- handler = bch2_xattr_type_to_handler(xattr.v->x_type);
- if (handler && handler->prefix)
- prt_printf(out, "%s", handler->prefix);
- else if (handler)
- prt_printf(out, "(type %u)", xattr.v->x_type);
- else
- prt_printf(out, "(unknown type %u)", xattr.v->x_type);
-
- unsigned name_len = xattr.v->x_name_len;
- unsigned val_len = le16_to_cpu(xattr.v->x_val_len);
- unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) -
- offsetof(struct bch_xattr, x_name);
-
- val_len = min_t(int, val_len, max_name_val_bytes - name_len);
- name_len = min(name_len, max_name_val_bytes);
-
- prt_printf(out, "%.*s:%.*s",
- name_len, xattr.v->x_name,
- val_len, (char *) xattr_val(xattr.v));
-
- if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS ||
- xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) {
- prt_char(out, ' ');
- bch2_acl_to_text(out, xattr_val(xattr.v),
- le16_to_cpu(xattr.v->x_val_len));
- }
-}
-
-static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode,
- const char *name, void *buffer, size_t size, int type)
-{
- struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode);
- struct xattr_search_key search = X_SEARCH(type, name, strlen(name));
- struct btree_iter iter;
- struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
- inode_inum(inode), &search, 0);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
- ret = le16_to_cpu(xattr.v->x_val_len);
- if (buffer) {
- if (ret > size)
- ret = -ERANGE;
- else
- memcpy(buffer, xattr_val(xattr.v), ret);
- }
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
- struct bch_inode_unpacked *inode_u,
- const struct bch_hash_info *hash_info,
- const char *name, const void *value, size_t size,
- int type, int flags)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter inode_iter = { NULL };
- int ret;
-
- ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?:
- bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent);
- if (ret)
- return ret;
-
- inode_u->bi_ctime = bch2_current_time(c);
-
- ret = bch2_inode_write(trans, &inode_iter, inode_u);
- bch2_trans_iter_exit(trans, &inode_iter);
-
- if (ret)
- return ret;
-
- if (value) {
- struct bkey_i_xattr *xattr;
- unsigned namelen = strlen(name);
- unsigned u64s = BKEY_U64s +
- xattr_val_u64s(namelen, size);
-
- if (u64s > U8_MAX)
- return -ERANGE;
-
- xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
- if (IS_ERR(xattr))
- return PTR_ERR(xattr);
-
- bkey_xattr_init(&xattr->k_i);
- xattr->k.u64s = u64s;
- xattr->v.x_type = type;
- xattr->v.x_name_len = namelen;
- xattr->v.x_val_len = cpu_to_le16(size);
- memcpy(xattr->v.x_name, name, namelen);
- memcpy(xattr_val(&xattr->v), value, size);
-
- ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
- inum, &xattr->k_i,
- (flags & XATTR_CREATE ? STR_HASH_must_create : 0)|
- (flags & XATTR_REPLACE ? STR_HASH_must_replace : 0));
- } else {
- struct xattr_search_key search =
- X_SEARCH(type, name, strlen(name));
-
- ret = bch2_hash_delete(trans, bch2_xattr_hash_desc,
- hash_info, inum, &search);
- }
-
- if (bch2_err_matches(ret, ENOENT))
- ret = flags & XATTR_REPLACE ? -ENODATA : 0;
-
- return ret;
-}
-
-struct xattr_buf {
- char *buf;
- size_t len;
- size_t used;
-};
-
-static int __bch2_xattr_emit(const char *prefix,
- const char *name, size_t name_len,
- struct xattr_buf *buf)
-{
- const size_t prefix_len = strlen(prefix);
- const size_t total_len = prefix_len + name_len + 1;
-
- if (buf->buf) {
- if (buf->used + total_len > buf->len)
- return -ERANGE;
-
- memcpy(buf->buf + buf->used, prefix, prefix_len);
- memcpy(buf->buf + buf->used + prefix_len,
- name, name_len);
- buf->buf[buf->used + prefix_len + name_len] = '\0';
- }
-
- buf->used += total_len;
- return 0;
-}
-
-static inline const char *bch2_xattr_prefix(unsigned type, struct dentry *dentry)
-{
- const struct xattr_handler *handler = bch2_xattr_type_to_handler(type);
-
- if (!xattr_handler_can_list(handler, dentry))
- return NULL;
-
- return xattr_prefix(handler);
-}
-
-static int bch2_xattr_emit(struct dentry *dentry,
- const struct bch_xattr *xattr,
- struct xattr_buf *buf)
-{
- const char *prefix;
-
- prefix = bch2_xattr_prefix(xattr->x_type, dentry);
- if (!prefix)
- return 0;
-
- return __bch2_xattr_emit(prefix, xattr->x_name, xattr->x_name_len, buf);
-}
-
-static int bch2_xattr_list_bcachefs(struct bch_fs *c,
- struct bch_inode_unpacked *inode,
- struct xattr_buf *buf,
- bool all)
-{
- const char *prefix = all ? "bcachefs_effective." : "bcachefs.";
- unsigned id;
- int ret = 0;
- u64 v;
-
- for (id = 0; id < Inode_opt_nr; id++) {
- v = bch2_inode_opt_get(inode, id);
- if (!v)
- continue;
-
- if (!all &&
- !(inode->bi_fields_set & (1 << id)))
- continue;
-
- ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id],
- strlen(bch2_inode_opts[id]), buf);
- if (ret)
- break;
- }
-
- return ret;
-}
-
-ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
-{
- struct bch_fs *c = dentry->d_sb->s_fs_info;
- struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
- struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
- u64 offset = 0, inum = inode->ei_inode.bi_inum;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_xattrs,
- POS(inum, offset),
- POS(inum, U64_MAX),
- inode->ei_inum.subvol, 0, k, ({
- if (k.k->type != KEY_TYPE_xattr)
- continue;
-
- bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
- }))) ?:
- bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false) ?:
- bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
-
- return ret ? bch2_err_class(ret) : buf.used;
-}
-
-static int bch2_xattr_get_handler(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *vinode,
- const char *name, void *buffer, size_t size)
-{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret = bch2_trans_do(c,
- bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags));
-
- if (ret < 0 && bch2_err_matches(ret, ENOENT))
- ret = -ENODATA;
-
- return bch2_err_class(ret);
-}
-
-static int bch2_xattr_set_handler(const struct xattr_handler *handler,
- struct mnt_idmap *idmap,
- struct dentry *dentry, struct inode *vinode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
- struct bch_inode_unpacked inode_u;
- int ret;
-
- ret = bch2_trans_run(c,
- commit_do(trans, NULL, NULL, 0,
- bch2_xattr_set(trans, inode_inum(inode), &inode_u,
- &hash, name, value, size,
- handler->flags, flags)) ?:
- (bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0));
-
- return bch2_err_class(ret);
-}
-
-static const struct xattr_handler bch_xattr_user_handler = {
- .prefix = XATTR_USER_PREFIX,
- .get = bch2_xattr_get_handler,
- .set = bch2_xattr_set_handler,
- .flags = KEY_TYPE_XATTR_INDEX_USER,
-};
-
-static bool bch2_xattr_trusted_list(struct dentry *dentry)
-{
- return capable(CAP_SYS_ADMIN);
-}
-
-static const struct xattr_handler bch_xattr_trusted_handler = {
- .prefix = XATTR_TRUSTED_PREFIX,
- .list = bch2_xattr_trusted_list,
- .get = bch2_xattr_get_handler,
- .set = bch2_xattr_set_handler,
- .flags = KEY_TYPE_XATTR_INDEX_TRUSTED,
-};
-
-static const struct xattr_handler bch_xattr_security_handler = {
- .prefix = XATTR_SECURITY_PREFIX,
- .get = bch2_xattr_get_handler,
- .set = bch2_xattr_set_handler,
- .flags = KEY_TYPE_XATTR_INDEX_SECURITY,
-};
-
-#ifndef NO_BCACHEFS_FS
-
-static int opt_to_inode_opt(int id)
-{
- switch (id) {
-#define x(name, ...) \
- case Opt_##name: return Inode_opt_##name;
- BCH_INODE_OPTS()
-#undef x
- default:
- return -1;
- }
-}
-
-static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *vinode,
- const char *name, void *buffer, size_t size,
- bool all)
-{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_opts opts =
- bch2_inode_opts_to_opts(&inode->ei_inode);
- const struct bch_option *opt;
- int id, inode_opt_id;
- struct printbuf out = PRINTBUF;
- int ret;
- u64 v;
-
- id = bch2_opt_lookup(name);
- if (id < 0 || !bch2_opt_is_inode_opt(id))
- return -EINVAL;
-
- inode_opt_id = opt_to_inode_opt(id);
- if (inode_opt_id < 0)
- return -EINVAL;
-
- opt = bch2_opt_table + id;
-
- if (!bch2_opt_defined_by_id(&opts, id))
- return -ENODATA;
-
- if (!all &&
- !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id)))
- return -ENODATA;
-
- v = bch2_opt_get_by_id(&opts, id);
- bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0);
-
- ret = out.pos;
-
- if (out.allocation_failure) {
- ret = -ENOMEM;
- } else if (buffer) {
- if (out.pos > size)
- ret = -ERANGE;
- else
- memcpy(buffer, out.buf, out.pos);
- }
-
- printbuf_exit(&out);
- return ret;
-}
-
-static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *vinode,
- const char *name, void *buffer, size_t size)
-{
- return __bch2_xattr_bcachefs_get(handler, dentry, vinode,
- name, buffer, size, false);
-}
-
-struct inode_opt_set {
- int id;
- u64 v;
- bool defined;
-};
-
-static int inode_opt_set_fn(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct inode_opt_set *s = p;
-
- if (s->defined)
- bi->bi_fields_set |= 1U << s->id;
- else
- bi->bi_fields_set &= ~(1U << s->id);
-
- bch2_inode_opt_set(bi, s->id, s->v);
-
- return 0;
-}
-
-static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
- struct mnt_idmap *idmap,
- struct dentry *dentry, struct inode *vinode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- const struct bch_option *opt;
- char *buf;
- struct inode_opt_set s;
- int opt_id, inode_opt_id, ret;
-
- opt_id = bch2_opt_lookup(name);
- if (opt_id < 0)
- return -EINVAL;
-
- opt = bch2_opt_table + opt_id;
-
- inode_opt_id = opt_to_inode_opt(opt_id);
- if (inode_opt_id < 0)
- return -EINVAL;
-
- s.id = inode_opt_id;
-
- if (value) {
- u64 v = 0;
-
- buf = kmalloc(size + 1, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
- memcpy(buf, value, size);
- buf[size] = '\0';
-
- ret = bch2_opt_parse(c, opt, buf, &v, NULL);
- kfree(buf);
-
- if (ret < 0)
- goto err_class_exit;
-
- ret = bch2_opt_check_may_set(c, NULL, opt_id, v);
- if (ret < 0)
- goto err_class_exit;
-
- s.v = v + 1;
- s.defined = true;
- } else {
- /*
-		 * Check if this option was set on the parent - if so, switch
-		 * back to inheriting from the parent:
- *
- * rename() also has to deal with keeping inherited options up
- * to date - see bch2_reinherit_attrs()
- */
- spin_lock(&dentry->d_lock);
- if (!IS_ROOT(dentry)) {
- struct bch_inode_info *dir =
- to_bch_ei(d_inode(dentry->d_parent));
-
- s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id);
- } else {
- s.v = 0;
- }
- spin_unlock(&dentry->d_lock);
-
- s.defined = false;
- }
-
- mutex_lock(&inode->ei_update_lock);
- if (inode_opt_id == Inode_opt_project) {
- /*
- * inode fields accessible via the xattr interface are stored
- * with a +1 bias, so that 0 means unset:
- */
- ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0);
- if (ret)
- goto err;
- }
-
- ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0);
-err:
- mutex_unlock(&inode->ei_update_lock);
-err_class_exit:
- return bch2_err_class(ret);
-}
-
-static const struct xattr_handler bch_xattr_bcachefs_handler = {
- .prefix = "bcachefs.",
- .get = bch2_xattr_bcachefs_get,
- .set = bch2_xattr_bcachefs_set,
-};
-
-static int bch2_xattr_bcachefs_get_effective(
- const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *vinode,
- const char *name, void *buffer, size_t size)
-{
- return __bch2_xattr_bcachefs_get(handler, dentry, vinode,
- name, buffer, size, true);
-}
-
-/* Noop - xattrs in the bcachefs_effective namespace are inherited */
-static int bch2_xattr_bcachefs_set_effective(const struct xattr_handler *handler,
- struct mnt_idmap *idmap,
- struct dentry *dentry, struct inode *vinode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- return 0;
-}
-
-static const struct xattr_handler bch_xattr_bcachefs_effective_handler = {
- .prefix = "bcachefs_effective.",
- .get = bch2_xattr_bcachefs_get_effective,
- .set = bch2_xattr_bcachefs_set_effective,
-};
-
-#endif /* NO_BCACHEFS_FS */
-
-const struct xattr_handler * const bch2_xattr_handlers[] = {
- &bch_xattr_user_handler,
- &bch_xattr_trusted_handler,
- &bch_xattr_security_handler,
-#ifndef NO_BCACHEFS_FS
- &bch_xattr_bcachefs_handler,
- &bch_xattr_bcachefs_effective_handler,
-#endif
- NULL
-};
-
-static const struct xattr_handler *bch_xattr_handler_map[] = {
- [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler,
- [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] =
- &nop_posix_acl_access,
- [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] =
- &nop_posix_acl_default,
- [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler,
- [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler,
-};
-
-static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type)
-{
- return type < ARRAY_SIZE(bch_xattr_handler_map)
- ? bch_xattr_handler_map[type]
- : NULL;
-}
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
deleted file mode 100644
index 132fbbd15a66..000000000000
--- a/fs/bcachefs/xattr.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_XATTR_H
-#define _BCACHEFS_XATTR_H
-
-#include "str_hash.h"
-
-extern const struct bch_hash_desc bch2_xattr_hash_desc;
-
-int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_xattr ((struct bkey_ops) { \
- .key_validate = bch2_xattr_validate, \
- .val_to_text = bch2_xattr_to_text, \
- .min_val_size = 8, \
-})
-
-static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
-{
- return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) +
- name_len + val_len, sizeof(u64));
-}
-
-#define xattr_val(_xattr) \
- ((void *) (_xattr)->x_name + (_xattr)->x_name_len)
-
-struct xattr_search_key {
- u8 type;
- struct qstr name;
-};
-
-#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \
- { .type = _type, .name = QSTR_INIT(_name, _len) })
-
-struct dentry;
-struct xattr_handler;
-struct bch_hash_info;
-struct bch_inode_info;
-
-/* Exported for cmd_migrate.c in tools: */
-int bch2_xattr_set(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *, const struct bch_hash_info *,
- const char *, const void *, size_t, int, int);
-
-ssize_t bch2_xattr_list(struct dentry *, char *, size_t);
-
-extern const struct xattr_handler * const bch2_xattr_handlers[];
-
-#endif /* _BCACHEFS_XATTR_H */
diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h
deleted file mode 100644
index c7916011ef34..000000000000
--- a/fs/bcachefs/xattr_format.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_XATTR_FORMAT_H
-#define _BCACHEFS_XATTR_FORMAT_H
-
-#define KEY_TYPE_XATTR_INDEX_USER 0
-#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1
-#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
-#define KEY_TYPE_XATTR_INDEX_TRUSTED 3
-#define KEY_TYPE_XATTR_INDEX_SECURITY 4
-
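-/*
- * The value is stored inline immediately after x_name, so a full xattr
- * takes offsetof(struct bch_xattr, x_name) + x_name_len + x_val_len bytes,
- * rounded up to a multiple of sizeof(u64):
- */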
-struct bch_xattr {
- struct bch_val v;
- __u8 x_type;
- __u8 x_name_len;
- __le16 x_val_len;
- __u8 x_name[] __counted_by(x_name_len);
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_XATTR_FORMAT_H */
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 8f430ff8e445..9fcfdd6b8189 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -307,7 +307,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
befs_ino = BEFS_I(inode);
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index fa66a09e496a..d33d6bde992b 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -27,7 +27,7 @@ const struct file_operations bfs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.splice_read = filemap_splice_read,
};
@@ -170,9 +170,10 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to)
truncate_pagecache(inode, inode->i_size);
}
-static int bfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int bfs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int ret;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index db81570c9637..ce6f83234b67 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -17,6 +17,7 @@
#include <linux/writeback.h>
#include <linux/uio.h>
#include <linux/uaccess.h>
+#include <linux/fs_context.h>
#include "bfs.h"
MODULE_AUTHOR("Tigran Aivazian <aivazian.tigran@gmail.com>");
@@ -41,7 +42,7 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(inode->i_sb)->si_lasti)) {
@@ -60,7 +61,19 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
di = (struct bfs_inode *)bh->b_data + off;
- inode->i_mode = 0x0000FFFF & le32_to_cpu(di->i_mode);
+ /*
+ * https://martin.hinner.info/fs/bfs/bfs-structure.html explains that
+	 * BFS in the SCO UnixWare environment used only the lower 9 bits of
+	 * the di->i_mode value. This means that, although bfs_write_inode()
+	 * saves the whole of inode->i_mode (which includes the S_IFMT and
+	 * S_IS{UID,GID,VTX} bits), the middle 7 bits of di->i_mode can be
+	 * garbage when they were not saved by bfs_write_inode().
+	 * Since we can't tell whether the middle 7 bits are garbage, use only
+	 * the lower 12 bits (i.e. tolerate possibly-garbage S_IS{UID,GID,VTX}
+	 * bits) and reconstruct the S_IFMT bits for Linux from the
+	 * di->i_vtype value.
+ */
+ inode->i_mode = 0x00000FFF & le32_to_cpu(di->i_mode);
if (le32_to_cpu(di->i_vtype) == BFS_VDIR) {
inode->i_mode |= S_IFDIR;
inode->i_op = &bfs_dir_inops;
@@ -70,6 +83,11 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &bfs_file_inops;
inode->i_fop = &bfs_file_operations;
inode->i_mapping->a_ops = &bfs_aops;
+ } else {
+ brelse(bh);
+ printf("Unknown vtype=%u %s:%08lx\n",
+ le32_to_cpu(di->i_vtype), inode->i_sb->s_id, ino);
+ goto error;
}
BFS_I(inode)->i_sblock = le32_to_cpu(di->i_sblock);
@@ -305,7 +323,7 @@ void bfs_dump_imap(const char *prefix, struct super_block *s)
#endif
}
-static int bfs_fill_super(struct super_block *s, void *data, int silent)
+static int bfs_fill_super(struct super_block *s, struct fs_context *fc)
{
struct buffer_head *bh, *sbh;
struct bfs_super_block *bfs_sb;
@@ -314,6 +332,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
struct bfs_sb_info *info;
int ret = -EINVAL;
unsigned long i_sblock, i_eblock, i_eoff, s_size;
+ int silent = fc->sb_flags & SB_SILENT;
info = kzalloc(sizeof(*info), GFP_KERNEL);
if (!info)
@@ -446,18 +465,28 @@ out:
return ret;
}
-static struct dentry *bfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int bfs_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, bfs_fill_super);
+}
+
+static const struct fs_context_operations bfs_context_ops = {
+ .get_tree = bfs_get_tree,
+};
+
+static int bfs_init_fs_context(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, bfs_fill_super);
+ fc->ops = &bfs_context_ops;
+
+ return 0;
}
static struct file_system_type bfs_fs_type = {
- .owner = THIS_MODULE,
- .name = "bfs",
- .mount = bfs_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .owner = THIS_MODULE,
+ .name = "bfs",
+ .init_fs_context = bfs_init_fs_context,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("bfs");
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 584fa89bc877..3eb734c192e9 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -46,7 +46,7 @@
#include <linux/cred.h>
#include <linux/dax.h>
#include <linux/uaccess.h>
-#include <linux/rseq.h>
+#include <uapi/linux/rseq.h>
#include <asm/param.h>
#include <asm/page.h>
@@ -68,12 +68,6 @@
static int load_elf_binary(struct linux_binprm *bprm);
-#ifdef CONFIG_USELIB
-static int load_elf_library(struct file *);
-#else
-#define load_elf_library NULL
-#endif
-
/*
* If we don't support core dumping, then supply a NULL so we
* don't even try.
@@ -101,7 +95,6 @@ static int elf_core_dump(struct coredump_params *cprm);
static struct linux_binfmt elf_format = {
.module = THIS_MODULE,
.load_binary = load_elf_binary,
- .load_shlib = load_elf_library,
#ifdef CONFIG_COREDUMP
.core_dump = elf_core_dump,
.min_coredump = ELF_EXEC_PAGESIZE,
@@ -110,6 +103,21 @@ static struct linux_binfmt elf_format = {
#define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE))
+static inline void elf_coredump_set_mm_eflags(struct mm_struct *mm, u32 flags)
+{
+#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS
+ mm->saved_e_flags = flags;
+#endif
+}
+
+static inline u32 elf_coredump_get_mm_eflags(struct mm_struct *mm, u32 flags)
+{
+#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS
+ flags = mm->saved_e_flags;
+#endif
+ return flags;
+}
+
/*
* We need to explicitly zero any trailing portion of the page that follows
* p_filesz when it ends before the page ends (e.g. bss), otherwise this
@@ -526,7 +534,7 @@ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
/* Sanity check the number of program headers... */
/* ...and their total size. */
size = sizeof(struct elf_phdr) * elf_ex->e_phnum;
- if (size == 0 || size > 65536 || size > ELF_MIN_ALIGN)
+ if (size == 0 || size > 65536)
goto out;
elf_phdata = kmalloc(size, GFP_KERNEL);
@@ -653,7 +661,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
if (!elf_check_arch(interp_elf_ex) ||
elf_check_fdpic(interp_elf_ex))
goto out;
- if (!interpreter->f_op->mmap)
+ if (!can_mmap_file(interpreter))
goto out;
total_size = total_mapping_size(interp_elf_phdata,
@@ -830,6 +838,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
struct elf_phdr *elf_property_phdata = NULL;
unsigned long elf_brk;
+ bool brk_moved = false;
int retval, i;
unsigned long elf_entry;
unsigned long e_entry;
@@ -854,7 +863,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
goto out;
if (elf_check_fdpic(elf_ex))
goto out;
- if (!bprm->file->f_op->mmap)
+ if (!can_mmap_file(bprm->file))
goto out;
elf_phdata = load_elf_phdrs(elf_ex, bprm->file);
@@ -1097,15 +1106,19 @@ out_free_interp:
/* Calculate any requested alignment. */
alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
- /*
- * There are effectively two types of ET_DYN
- * binaries: programs (i.e. PIE: ET_DYN with PT_INTERP)
- * and loaders (ET_DYN without PT_INTERP, since they
- * _are_ the ELF interpreter). The loaders must
- * be loaded away from programs since the program
- * may otherwise collide with the loader (especially
- * for ET_EXEC which does not have a randomized
- * position). For example to handle invocations of
+ /**
+ * DOC: PIE handling
+ *
+ * There are effectively two types of ET_DYN ELF
+ * binaries: programs (i.e. PIE: ET_DYN with
+ * PT_INTERP) and loaders (i.e. static PIE: ET_DYN
+ * without PT_INTERP, usually the ELF interpreter
+ * itself). Loaders must be loaded away from programs
+ * since the program may otherwise collide with the
+ * loader (especially for ET_EXEC which does not have
+ * a randomized position).
+ *
+ * For example, to handle invocations of
* "./ld.so someprog" to test out a new version of
* the loader, the subsequent program that the
* loader loads must avoid the loader itself, so
@@ -1118,6 +1131,9 @@ out_free_interp:
* ELF_ET_DYN_BASE and loaders are loaded into the
* independently randomized mmap region (0 load_bias
* without MAP_FIXED nor MAP_FIXED_NOREPLACE).
+ *
+ * See below for "brk" handling details, which is
+ * also affected by program vs loader and ASLR.
*/
if (interpreter) {
/* On ET_DYN with PT_INTERP, we do the ASLR. */
@@ -1234,8 +1250,6 @@ out_free_interp:
start_data += load_bias;
end_data += load_bias;
- current->mm->start_brk = current->mm->brk = ELF_PAGEALIGN(elf_brk);
-
if (interpreter) {
elf_entry = load_elf_interp(interp_elf_ex,
interpreter,
@@ -1291,27 +1305,46 @@ out_free_interp:
mm->end_data = end_data;
mm->start_stack = bprm->p;
- if ((current->flags & PF_RANDOMIZE) && (snapshot_randomize_va_space > 1)) {
+ elf_coredump_set_mm_eflags(mm, elf_ex->e_flags);
+
+ /**
+ * DOC: "brk" handling
+ *
+ * For architectures with ELF randomization, when executing a
+ * loader directly (i.e. static PIE: ET_DYN without PT_INTERP),
+ * move the brk area out of the mmap region and into the unused
+ * ELF_ET_DYN_BASE region. Since "brk" grows up it may collide
+ * early with the stack growing down or other regions being put
+ * into the mmap region by the kernel (e.g. vdso).
+ *
+ * In the CONFIG_COMPAT_BRK case, though, everything is turned
+ * off because we're not allowed to move the brk at all.
+ */
+ if (!IS_ENABLED(CONFIG_COMPAT_BRK) &&
+ IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) &&
+ elf_ex->e_type == ET_DYN && !interpreter) {
+ elf_brk = ELF_ET_DYN_BASE;
+ /* This counts as moving the brk, so let brk(2) know. */
+ brk_moved = true;
+ }
+ mm->start_brk = mm->brk = ELF_PAGEALIGN(elf_brk);
+
+ if ((current->flags & PF_RANDOMIZE) && snapshot_randomize_va_space > 1) {
/*
- * For architectures with ELF randomization, when executing
- * a loader directly (i.e. no interpreter listed in ELF
- * headers), move the brk area out of the mmap region
- * (since it grows up, and may collide early with the stack
- * growing down), and into the unused ELF_ET_DYN_BASE region.
+ * If we didn't move the brk to ELF_ET_DYN_BASE (above),
+ * leave a gap between .bss and brk.
*/
- if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) &&
- elf_ex->e_type == ET_DYN && !interpreter) {
- mm->brk = mm->start_brk = ELF_ET_DYN_BASE;
- } else {
- /* Otherwise leave a gap between .bss and brk. */
+ if (!brk_moved)
mm->brk = mm->start_brk = mm->brk + PAGE_SIZE;
- }
mm->brk = mm->start_brk = arch_randomize_brk(mm);
+ brk_moved = true;
+ }
+
#ifdef compat_brk_randomized
+ if (brk_moved)
current->brk_randomized = 1;
#endif
- }
if (current->personality & MMAP_PAGE_ZERO) {
/* Why this, you ask??? Well SVr4 maps page 0 as read-only,
@@ -1361,75 +1394,6 @@ out_free_ph:
goto out;
}
-#ifdef CONFIG_USELIB
-/* This is really simpleminded and specialized - we are loading an
- a.out library that is given an ELF header. */
-static int load_elf_library(struct file *file)
-{
- struct elf_phdr *elf_phdata;
- struct elf_phdr *eppnt;
- int retval, error, i, j;
- struct elfhdr elf_ex;
-
- error = -ENOEXEC;
- retval = elf_read(file, &elf_ex, sizeof(elf_ex), 0);
- if (retval < 0)
- goto out;
-
- if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
- goto out;
-
- /* First of all, some simple consistency checks */
- if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 ||
- !elf_check_arch(&elf_ex) || !file->f_op->mmap)
- goto out;
- if (elf_check_fdpic(&elf_ex))
- goto out;
-
- /* Now read in all of the header information */
-
- j = sizeof(struct elf_phdr) * elf_ex.e_phnum;
- /* j < ELF_MIN_ALIGN because elf_ex.e_phnum <= 2 */
-
- error = -ENOMEM;
- elf_phdata = kmalloc(j, GFP_KERNEL);
- if (!elf_phdata)
- goto out;
-
- eppnt = elf_phdata;
- error = -ENOEXEC;
- retval = elf_read(file, eppnt, j, elf_ex.e_phoff);
- if (retval < 0)
- goto out_free_ph;
-
- for (j = 0, i = 0; i<elf_ex.e_phnum; i++)
- if ((eppnt + i)->p_type == PT_LOAD)
- j++;
- if (j != 1)
- goto out_free_ph;
-
- while (eppnt->p_type != PT_LOAD)
- eppnt++;
-
- /* Now use mmap to map the library into memory. */
- error = elf_load(file, ELF_PAGESTART(eppnt->p_vaddr),
- eppnt,
- PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FIXED_NOREPLACE | MAP_PRIVATE,
- 0);
-
- if (error != ELF_PAGESTART(eppnt->p_vaddr))
- goto out_free_ph;
-
- error = 0;
-
-out_free_ph:
- kfree(elf_phdata);
-out:
- return error;
-}
-#endif /* #ifdef CONFIG_USELIB */
-
#ifdef CONFIG_ELF_CORE
/*
* ELF core dumper
@@ -1503,8 +1467,8 @@ static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset)
phdr->p_align = 4;
}
-static void fill_note(struct memelfnote *note, const char *name, int type,
- unsigned int sz, void *data)
+static void __fill_note(struct memelfnote *note, const char *name, int type,
+ unsigned int sz, void *data)
{
note->name = name;
note->type = type;
@@ -1512,6 +1476,9 @@ static void fill_note(struct memelfnote *note, const char *name, int type,
note->data = data;
}
+#define fill_note(note, type, sz, data) \
+ __fill_note(note, NN_ ## type, NT_ ## type, sz, data)
+
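The wrapper derives both the note name and the note type from a single token, so name/type pairs can no longer drift apart. For example:

	fill_note(&t->notes[0], PRSTATUS, PRSTATUS_SIZE, &t->prstatus);
	/* expands to: */
	__fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS,
		    PRSTATUS_SIZE, &t->prstatus);

which is why the call sites below lose their explicit NN_*/NT_* argument pairs.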
/*
* fill up all the fields in prstatus from the given task struct, except
* registers which need to be filled up separately.
@@ -1602,14 +1569,14 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
do
i += 2;
while (auxv[i - 2] != AT_NULL);
- fill_note(note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv);
+ fill_note(note, AUXV, i * sizeof(elf_addr_t), auxv);
}
static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
const kernel_siginfo_t *siginfo)
{
copy_siginfo_to_external(csigdata, siginfo);
- fill_note(note, NN_SIGINFO, NT_SIGINFO, sizeof(*csigdata), csigdata);
+ fill_note(note, SIGINFO, sizeof(*csigdata), csigdata);
}
/*
@@ -1705,7 +1672,7 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm
}
size = name_curpos - (char *)data;
- fill_note(note, NN_FILE, NT_FILE, size, data);
+ fill_note(note, FILE, size, data);
return 0;
}
@@ -1766,8 +1733,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
regset_get(t->task, &view->regsets[0],
sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg);
- fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS,
- PRSTATUS_SIZE, &t->prstatus);
+ fill_note(&t->notes[0], PRSTATUS, PRSTATUS_SIZE, &t->prstatus);
info->size += notesize(&t->notes[0]);
do_thread_regset_writeback(t->task, &view->regsets[0]);
@@ -1780,6 +1746,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
for (view_iter = 1; view_iter < view->n; ++view_iter) {
const struct user_regset *regset = &view->regsets[view_iter];
int note_type = regset->core_note_type;
+ const char *note_name = regset->core_note_name;
bool is_fpreg = note_type == NT_PRFPREG;
void *data;
int ret;
@@ -1800,8 +1767,16 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
if (is_fpreg)
SET_PR_FPVALID(&t->prstatus);
- fill_note(&t->notes[note_iter], is_fpreg ? NN_PRFPREG : "LINUX",
- note_type, ret, data);
+ /* There should be a note name, but if not, guess: */
+ if (WARN_ON_ONCE(!note_name))
+ note_name = "LINUX";
+ else
+ /* Warn on non-legacy-compatible names, for now. */
+ WARN_ON_ONCE(strcmp(note_name,
+ is_fpreg ? "CORE" : "LINUX"));
+
+ __fill_note(&t->notes[note_iter], note_name, note_type,
+ ret, data);
info->size += notesize(&t->notes[note_iter]);
note_iter++;
@@ -1820,8 +1795,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
fill_prstatus(&t->prstatus.common, p, signr);
elf_core_copy_task_regs(p, &t->prstatus.pr_reg);
- fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus),
- &(t->prstatus));
+ fill_note(&t->notes[0], PRSTATUS, sizeof(t->prstatus), &t->prstatus);
info->size += notesize(&t->notes[0]);
fpu = kzalloc(sizeof(elf_fpregset_t), GFP_KERNEL);
@@ -1831,7 +1805,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
}
t->prstatus.pr_fpvalid = 1;
- fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(*fpu), fpu);
+ fill_note(&t->notes[1], PRFPREG, sizeof(*fpu), fpu);
info->size += notesize(&t->notes[1]);
return 1;
@@ -1847,11 +1821,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_thread_core_info *t;
struct elf_prpsinfo *psinfo;
struct core_thread *ct;
+ u16 machine;
+ u32 flags;
psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
if (!psinfo)
return 0;
- fill_note(&info->psinfo, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), psinfo);
+ fill_note(&info->psinfo, PRPSINFO, sizeof(*psinfo), psinfo);
#ifdef CORE_DUMP_USE_REGSET
view = task_user_regset_view(dump_task);
@@ -1874,30 +1850,37 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
return 0;
}
- /*
- * Initialize the ELF file header.
- */
- fill_elf_header(elf, phdrs,
- view->e_machine, view->e_flags);
+ machine = view->e_machine;
+ flags = view->e_flags;
#else
view = NULL;
info->thread_notes = 2;
- fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
+ machine = ELF_ARCH;
+ flags = ELF_CORE_EFLAGS;
#endif
/*
+	 * Override the ELF e_flags with the value saved in the process's mm,
+	 * if the architecture needs that.
+ */
+ flags = elf_coredump_get_mm_eflags(dump_task->mm, flags);
+
+ /*
+ * Initialize the ELF file header.
+ */
+ fill_elf_header(elf, phdrs, machine, flags);
+
+ /*
* Allocate a structure for each thread.
*/
- info->thread = kzalloc(offsetof(struct elf_thread_core_info,
- notes[info->thread_notes]),
- GFP_KERNEL);
+ info->thread = kzalloc(struct_size(info->thread, notes, info->thread_notes),
+ GFP_KERNEL);
if (unlikely(!info->thread))
return 0;
info->thread->task = dump_task;
for (ct = dump_task->signal->core_state->dumper.next; ct; ct = ct->next) {
- t = kzalloc(offsetof(struct elf_thread_core_info,
- notes[info->thread_notes]),
+ t = kzalloc(struct_size(t, notes, info->thread_notes),
GFP_KERNEL);
if (unlikely(!t))
return 0;
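struct_size() replaces the open-coded offsetof() sizing for the flexible notes[] array and adds overflow checking; a sketch of the equivalence:

	/* struct elf_thread_core_info ends in a flexible array member:
	 *	struct memelfnote notes[];
	 *
	 * old: kzalloc(offsetof(struct elf_thread_core_info,
	 *		notes[info->thread_notes]), GFP_KERNEL);
	 * new, computing sizeof(*t) plus the array size, but saturating
	 * to SIZE_MAX if the multiplication or addition would overflow,
	 * so the allocation fails instead of being under-sized:
	 */
	t = kzalloc(struct_size(t, notes, info->thread_notes), GFP_KERNEL);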
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 9133f3827f90..48fd2de3bca0 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -109,7 +109,7 @@ static int is_elf(struct elfhdr *hdr, struct file *file)
return 0;
if (!elf_check_arch(hdr))
return 0;
- if (!file->f_op->mmap)
+ if (!can_mmap_file(file))
return 0;
return 1;
}
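can_mmap_file() replaces the direct file->f_op->mmap checks in all three binfmt loaders. A plausible definition, assuming the helper simply tests for a usable mmap hook including the newer ->mmap_prepare variant (the real definition in the VFS headers may differ):

	static inline bool can_mmap_file(struct file *file)
	{
		/* assumption: either hook makes the file mappable */
		return file->f_op->mmap || file->f_op->mmap_prepare;
	}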
@@ -1275,8 +1275,8 @@ static inline void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offs
return;
}
-static inline void fill_note(struct memelfnote *note, const char *name, int type,
- unsigned int sz, void *data)
+static inline void __fill_note(struct memelfnote *note, const char *name, int type,
+ unsigned int sz, void *data)
{
note->name = name;
note->type = type;
@@ -1285,6 +1285,9 @@ static inline void fill_note(struct memelfnote *note, const char *name, int type
return;
}
+#define fill_note(note, type, sz, data) \
+ __fill_note(note, NN_ ## type, NT_ ## type, sz, data)
+
/*
* fill up all the fields in prstatus from the given task struct, except
* registers which need to be filled up separately.
@@ -1398,8 +1401,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_
regset_get(p, &view->regsets[0],
sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg);
- fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus),
- &t->prstatus);
+ fill_note(&t->notes[0], PRSTATUS, sizeof(t->prstatus), &t->prstatus);
t->num_notes++;
*sz += notesize(&t->notes[0]);
@@ -1416,8 +1418,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_
}
if (t->prstatus.pr_fpvalid) {
- fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(t->fpu),
- &t->fpu);
+ fill_note(&t->notes[1], PRFPREG, sizeof(t->fpu), &t->fpu);
t->num_notes++;
*sz += notesize(&t->notes[1]);
}
@@ -1531,7 +1532,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
*/
fill_psinfo(psinfo, current->group_leader, current->mm);
- fill_note(&psinfo_note, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), psinfo);
+ fill_note(&psinfo_note, PRPSINFO, sizeof(*psinfo), psinfo);
thread_status_size += notesize(&psinfo_note);
auxv = (elf_addr_t *) current->mm->saved_auxv;
@@ -1539,7 +1540,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
do
i += 2;
while (auxv[i - 2] != AT_NULL);
- fill_note(&auxv_note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv);
+ fill_note(&auxv_note, AUXV, i * sizeof(elf_addr_t), auxv);
thread_status_size += notesize(&auxv_note);
offset = sizeof(*elf); /* ELF header */
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 5a7ebd160724..8cb1a94339b8 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -675,44 +675,6 @@ static void bm_evict_inode(struct inode *inode)
}
/**
- * unlink_binfmt_dentry - remove the dentry for the binary type handler
- * @dentry: dentry associated with the binary type handler
- *
- * Do the actual filesystem work to remove a dentry for a registered binary
- * type handler. Since binfmt_misc only allows simple files to be created
- * directly under the root dentry of the filesystem we ensure that we are
- * indeed passed a dentry directly beneath the root dentry, that the inode
- * associated with the root dentry is locked, and that it is a regular file we
- * are asked to remove.
- */
-static void unlink_binfmt_dentry(struct dentry *dentry)
-{
- struct dentry *parent = dentry->d_parent;
- struct inode *inode, *parent_inode;
-
- /* All entries are immediate descendants of the root dentry. */
- if (WARN_ON_ONCE(dentry->d_sb->s_root != parent))
- return;
-
- /* We only expect to be called on regular files. */
- inode = d_inode(dentry);
- if (WARN_ON_ONCE(!S_ISREG(inode->i_mode)))
- return;
-
- /* The parent inode must be locked. */
- parent_inode = d_inode(parent);
- if (WARN_ON_ONCE(!inode_is_locked(parent_inode)))
- return;
-
- if (simple_positive(dentry)) {
- dget(dentry);
- simple_unlink(parent_inode, dentry);
- d_delete(dentry);
- dput(dentry);
- }
-}
-
-/**
* remove_binfmt_handler - remove a binary type handler
* @misc: handle to binfmt_misc instance
* @e: binary type handler to remove
@@ -729,7 +691,7 @@ static void remove_binfmt_handler(struct binfmt_misc *misc, Node *e)
write_lock(&misc->entries_lock);
list_del_init(&e->list);
write_unlock(&misc->entries_lock);
- unlink_binfmt_dentry(e->dentry);
+ locked_recursive_removal(e->dentry, NULL);
}
/* /<entry> */
@@ -772,7 +734,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
case 3:
/* Delete this handler. */
inode = d_inode(inode->i_sb->s_root);
- inode_lock(inode);
+ inode_lock_nested(inode, I_MUTEX_PARENT);
/*
* In order to add new element or remove elements from the list
@@ -803,14 +765,41 @@ static const struct file_operations bm_entry_operations = {
/* /register */
+/* add to filesystem */
+static int add_entry(Node *e, struct super_block *sb)
+{
+ struct dentry *dentry = simple_start_creating(sb->s_root, e->name);
+ struct inode *inode;
+ struct binfmt_misc *misc;
+
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ inode = bm_get_inode(sb, S_IFREG | 0644);
+ if (unlikely(!inode)) {
+ simple_done_creating(dentry);
+ return -ENOMEM;
+ }
+
+ refcount_set(&e->users, 1);
+ e->dentry = dentry;
+ inode->i_private = e;
+ inode->i_fop = &bm_entry_operations;
+
+ d_make_persistent(dentry, inode);
+ misc = i_binfmt_misc(inode);
+ write_lock(&misc->entries_lock);
+ list_add(&e->list, &misc->entries);
+ write_unlock(&misc->entries_lock);
+ simple_done_creating(dentry);
+ return 0;
+}
+
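add_entry() is a compact instance of the libfs create-under-root sequence; in outline, using only the helpers visible in this hunk (error handling elided):

	dentry = simple_start_creating(sb->s_root, name); /* locks parent, handles -EEXIST */
	inode = bm_get_inode(sb, S_IFREG | 0644);         /* allocate and fill the inode */
	d_make_persistent(dentry, inode);                 /* instantiate; pin until removal */
	simple_done_creating(dentry);                     /* unlock the parent */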
static ssize_t bm_register_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
Node *e;
- struct inode *inode;
struct super_block *sb = file_inode(file)->i_sb;
- struct dentry *root = sb->s_root, *dentry;
- struct binfmt_misc *misc;
int err = 0;
struct file *f = NULL;
@@ -820,8 +809,6 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
return PTR_ERR(e);
if (e->flags & MISC_FMT_OPEN_FILE) {
- const struct cred *old_cred;
-
/*
* Now that we support unprivileged binfmt_misc mounts make
* sure we use the credentials that the register @file was
@@ -829,9 +816,8 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
* didn't matter much as only a privileged process could open
* the register file.
*/
- old_cred = override_creds(file->f_cred);
- f = open_exec(e->interpreter);
- revert_creds(old_cred);
+ scoped_with_creds(file->f_cred)
+ f = open_exec(e->interpreter);
if (IS_ERR(f)) {
pr_notice("register: failed to install interpreter file %s\n",
e->interpreter);
@@ -841,42 +827,12 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
e->interp_file = f;
}
- inode_lock(d_inode(root));
- dentry = lookup_one_len(e->name, root, strlen(e->name));
- err = PTR_ERR(dentry);
- if (IS_ERR(dentry))
- goto out;
-
- err = -EEXIST;
- if (d_really_is_positive(dentry))
- goto out2;
-
- inode = bm_get_inode(sb, S_IFREG | 0644);
-
- err = -ENOMEM;
- if (!inode)
- goto out2;
-
- refcount_set(&e->users, 1);
- e->dentry = dget(dentry);
- inode->i_private = e;
- inode->i_fop = &bm_entry_operations;
-
- d_instantiate(dentry, inode);
- misc = i_binfmt_misc(inode);
- write_lock(&misc->entries_lock);
- list_add(&e->list, &misc->entries);
- write_unlock(&misc->entries_lock);
-
- err = 0;
-out2:
- dput(dentry);
-out:
- inode_unlock(d_inode(root));
-
+ err = add_entry(e, sb);
if (err) {
- if (f)
+ if (f) {
+ exe_file_allow_write_access(f);
filp_close(f, NULL);
+ }
kfree(e);
return err;
}
@@ -922,7 +878,7 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer,
case 3:
/* Delete all handlers. */
inode = d_inode(file_inode(file)->i_sb->s_root);
- inode_lock(inode);
+ inode_lock_nested(inode, I_MUTEX_PARENT);
/*
* In order to add new element or remove elements from the list
@@ -1066,7 +1022,7 @@ static struct file_system_type bm_fs_type = {
.name = "binfmt_misc",
.init_fs_context = bm_init_fs_context,
.fs_flags = FS_USERNS_MOUNT,
- .kill_sb = kill_litter_super,
+ .kill_sb = kill_anon_super,
};
MODULE_ALIAS_FS("binfmt_misc");
diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c
index 3fe9f59ef867..5ace2511fec5 100644
--- a/fs/bpf_fs_kfuncs.c
+++ b/fs/bpf_fs_kfuncs.c
@@ -2,11 +2,14 @@
/* Copyright (c) 2024 Google LLC. */
#include <linux/bpf.h>
+#include <linux/bpf_lsm.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/dcache.h>
#include <linux/fs.h>
+#include <linux/fsnotify.h>
#include <linux/file.h>
+#include <linux/kernfs.h>
#include <linux/mm.h>
#include <linux/xattr.h>
@@ -76,7 +79,7 @@ __bpf_kfunc void bpf_put_file(struct file *file)
* pathname in *buf*, including the NUL termination character. On error, a
* negative integer is returned.
*/
-__bpf_kfunc int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz)
+__bpf_kfunc int bpf_path_d_path(const struct path *path, char *buf, size_t buf__sz)
{
int len;
char *ret;
@@ -93,6 +96,24 @@ __bpf_kfunc int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz)
return len;
}
+static bool match_security_bpf_prefix(const char *name__str)
+{
+ return !strncmp(name__str, XATTR_NAME_BPF_LSM, XATTR_NAME_BPF_LSM_LEN);
+}
+
+static int bpf_xattr_read_permission(const char *name, struct inode *inode)
+{
+ if (WARN_ON(!inode))
+ return -EINVAL;
+
+ /* Allow reading xattr with user. and security.bpf. prefix */
+ if (strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
+ !match_security_bpf_prefix(name))
+ return -EPERM;
+
+ return inode_permission(&nop_mnt_idmap, inode, MAY_READ);
+}
+
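Concretely, the read-side filter above admits exactly two namespaces, assuming XATTR_NAME_BPF_LSM carries the "security.bpf." prefix:

	/* "user.foo"           -> allowed (then inode_permission(MAY_READ))
	 * "security.bpf.hash"  -> allowed (then inode_permission(MAY_READ))
	 * "security.selinux"   -> -EPERM
	 * "trusted.overlay.x"  -> -EPERM
	 */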
/**
* bpf_get_dentry_xattr - get xattr of a dentry
* @dentry: dentry to get xattr from
@@ -101,9 +122,10 @@ __bpf_kfunc int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz)
*
* Get xattr *name__str* of *dentry* and store the output in *value_ptr*.
*
- * For security reasons, only *name__str* with prefix "user." is allowed.
+ * For security reasons, only *name__str* with prefixes "user." or
+ * "security.bpf." are allowed.
*
- * Return: 0 on success, a negative value on error.
+ * Return: length of the xattr value on success, a negative value on error.
*/
__bpf_kfunc int bpf_get_dentry_xattr(struct dentry *dentry, const char *name__str,
struct bpf_dynptr *value_p)
@@ -114,18 +136,12 @@ __bpf_kfunc int bpf_get_dentry_xattr(struct dentry *dentry, const char *name__st
void *value;
int ret;
- if (WARN_ON(!inode))
- return -EINVAL;
-
- if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
- return -EPERM;
-
value_len = __bpf_dynptr_size(value_ptr);
value = __bpf_dynptr_data_rw(value_ptr, value_len);
if (!value)
return -EINVAL;
- ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ);
+ ret = bpf_xattr_read_permission(name__str, inode);
if (ret)
return ret;
return __vfs_getxattr(dentry, inode, name__str, value, value_len);
@@ -139,9 +155,10 @@ __bpf_kfunc int bpf_get_dentry_xattr(struct dentry *dentry, const char *name__st
*
* Get xattr *name__str* of *file* and store the output in *value_ptr*.
*
- * For security reasons, only *name__str* with prefix "user." is allowed.
+ * For security reasons, only *name__str* with prefixes "user." or
+ * "security.bpf." are allowed.
*
- * Return: 0 on success, a negative value on error.
+ * Return: length of the xattr value on success, a negative value on error.
*/
__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str,
struct bpf_dynptr *value_p)
@@ -154,6 +171,193 @@ __bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str,
__bpf_kfunc_end_defs();
+static int bpf_xattr_write_permission(const char *name, struct inode *inode)
+{
+ if (WARN_ON(!inode))
+ return -EINVAL;
+
+ /* Only allow setting and removing security.bpf. xattrs */
+ if (!match_security_bpf_prefix(name))
+ return -EPERM;
+
+ return inode_permission(&nop_mnt_idmap, inode, MAY_WRITE);
+}
+
+/**
+ * bpf_set_dentry_xattr_locked - set a xattr of a dentry
+ * @dentry: dentry to set the xattr on
+ * @name__str: name of the xattr
+ * @value_p: xattr value
+ * @flags: flags to pass into filesystem operations
+ *
+ * Set xattr *name__str* of *dentry* to the value in *value_p*.
+ *
+ * For security reasons, only *name__str* with prefix "security.bpf."
+ * is allowed.
+ *
+ * The caller already locked dentry->d_inode.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str,
+ const struct bpf_dynptr *value_p, int flags)
+{
+ struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p;
+ struct inode *inode = d_inode(dentry);
+ const void *value;
+ u32 value_len;
+ int ret;
+
+ value_len = __bpf_dynptr_size(value_ptr);
+ value = __bpf_dynptr_data(value_ptr, value_len);
+ if (!value)
+ return -EINVAL;
+
+ ret = bpf_xattr_write_permission(name__str, inode);
+ if (ret)
+ return ret;
+
+ ret = __vfs_setxattr(&nop_mnt_idmap, dentry, inode, name__str,
+ value, value_len, flags);
+ if (!ret) {
+ fsnotify_xattr(dentry);
+
+ /* This xattr is set by BPF LSM, so we do not call
+ * security_inode_post_setxattr. Otherwise, we would
+ * risk deadlocks by calling back to the same kfunc.
+ *
+ * This is the same as security_inode_setsecurity().
+ */
+ }
+ return ret;
+}
+
+/**
+ * bpf_remove_dentry_xattr_locked - remove a xattr of a dentry
+ * @dentry: dentry to remove the xattr from
+ * @name__str: name of the xattr
+ *
+ * Remove xattr *name__str* of *dentry*.
+ *
+ * For security reasons, only *name__str* with prefix "security.bpf."
+ * is allowed.
+ *
+ * The caller already locked dentry->d_inode.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str)
+{
+ struct inode *inode = d_inode(dentry);
+ int ret;
+
+ ret = bpf_xattr_write_permission(name__str, inode);
+ if (ret)
+ return ret;
+
+ ret = __vfs_removexattr(&nop_mnt_idmap, dentry, name__str);
+ if (!ret) {
+ fsnotify_xattr(dentry);
+
+ /* This xattr is removed by BPF LSM, so we do not call
+ * security_inode_post_removexattr. Otherwise, we would
+ * risk deadlocks by calling back to the same kfunc.
+ */
+ }
+ return ret;
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_set_dentry_xattr - set a xattr of a dentry
+ * @dentry: dentry to set the xattr on
+ * @name__str: name of the xattr
+ * @value_p: xattr value
+ * @flags: flags to pass into filesystem operations
+ *
+ * Set xattr *name__str* of *dentry* to the value in *value_p*.
+ *
+ * For security reasons, only *name__str* with prefix "security.bpf."
+ * is allowed.
+ *
+ * The caller has not locked dentry->d_inode.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_set_dentry_xattr(struct dentry *dentry, const char *name__str,
+ const struct bpf_dynptr *value_p, int flags)
+{
+ struct inode *inode = d_inode(dentry);
+ int ret;
+
+ inode_lock(inode);
+ ret = bpf_set_dentry_xattr_locked(dentry, name__str, value_p, flags);
+ inode_unlock(inode);
+ return ret;
+}
+
+/**
+ * bpf_remove_dentry_xattr - remove a xattr of a dentry
+ * @dentry: dentry to remove the xattr from
+ * @name__str: name of the xattr
+ *
+ * Remove xattr *name__str* of *dentry*.
+ *
+ * For security reasons, only *name__str* with prefix "security.bpf."
+ * is allowed.
+ *
+ * The caller has not locked dentry->d_inode.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_remove_dentry_xattr(struct dentry *dentry, const char *name__str)
+{
+ struct inode *inode = d_inode(dentry);
+ int ret;
+
+ inode_lock(inode);
+ ret = bpf_remove_dentry_xattr_locked(dentry, name__str);
+ inode_unlock(inode);
+ return ret;
+}
+
+#ifdef CONFIG_CGROUPS
+/**
+ * bpf_cgroup_read_xattr - read xattr of a cgroup's node in cgroupfs
+ * @cgroup: cgroup to get xattr from
+ * @name__str: name of the xattr
+ * @value_p: output buffer of the xattr value
+ *
+ * Get xattr *name__str* of *cgroup* and store the output in *value_p*.
+ *
+ * For security reasons, only *name__str* with prefix "user." is allowed.
+ *
+ * Return: length of the xattr value on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__str,
+ struct bpf_dynptr *value_p)
+{
+ struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p;
+ u32 value_len;
+ void *value;
+
+ /* Only allow reading "user.*" xattrs */
+ if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+ return -EPERM;
+
+ value_len = __bpf_dynptr_size(value_ptr);
+ value = __bpf_dynptr_data_rw(value_ptr, value_len);
+ if (!value)
+ return -EINVAL;
+
+ return kernfs_xattr_get(cgroup->kn, name__str, value, value_len);
+}
+#endif /* CONFIG_CGROUPS */
+
+__bpf_kfunc_end_defs();
+
BTF_KFUNCS_START(bpf_fs_kfunc_set_ids)
BTF_ID_FLAGS(func, bpf_get_task_exe_file,
KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL)
@@ -161,6 +365,8 @@ BTF_ID_FLAGS(func, bpf_put_file, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_path_d_path, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_get_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_set_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_remove_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_fs_kfunc_set_ids)
static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id)
@@ -171,6 +377,37 @@ static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id)
return -EACCES;
}
+/* The bpf_[set|remove]_dentry_xattr* kfuncs have KF_TRUSTED_ARGS and
+ * KF_SLEEPABLE, so they are only available to sleepable hooks with
+ * dentry arguments.
+ *
+ * Setting and removing an xattr requires an exclusive lock on
+ * dentry->d_inode. Some hooks are called with d_inode already locked,
+ * while others are not, so we need different kfuncs for different hooks.
+ * Specifically, hooks in the following list (d_inode_locked_hooks)
+ * should call bpf_[set|remove]_dentry_xattr_locked, while all other
+ * hooks should call bpf_[set|remove]_dentry_xattr.
+ */
+BTF_SET_START(d_inode_locked_hooks)
+BTF_ID(func, bpf_lsm_inode_post_removexattr)
+BTF_ID(func, bpf_lsm_inode_post_setattr)
+BTF_ID(func, bpf_lsm_inode_post_setxattr)
+BTF_ID(func, bpf_lsm_inode_removexattr)
+BTF_ID(func, bpf_lsm_inode_rmdir)
+BTF_ID(func, bpf_lsm_inode_setattr)
+BTF_ID(func, bpf_lsm_inode_setxattr)
+BTF_ID(func, bpf_lsm_inode_unlink)
+#ifdef CONFIG_SECURITY_PATH
+BTF_ID(func, bpf_lsm_path_unlink)
+BTF_ID(func, bpf_lsm_path_rmdir)
+#endif /* CONFIG_SECURITY_PATH */
+BTF_SET_END(d_inode_locked_hooks)
+
+bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog)
+{
+ return btf_id_set_contains(&d_inode_locked_hooks, prog->aux->attach_btf_id);
+}
+
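bpf_lsm_has_d_inode_locked() gives the BPF core a load-time predicate for routing a program to the right variant. Conceptually, the fixup would look like the following (a sketch only; insn_calls() and redirect_call() are hypothetical stand-ins for the real kfunc-call fixup logic in the verifier):

	/* for programs attached to a hook in d_inode_locked_hooks,
	 * redirect the call to the _locked variant: */
	if (insn_calls(insn, bpf_set_dentry_xattr) &&
	    bpf_lsm_has_d_inode_locked(env->prog))
		redirect_call(insn, bpf_set_dentry_xattr_locked);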
static const struct btf_kfunc_id_set bpf_fs_kfunc_set = {
.owner = THIS_MODULE,
.set = &bpf_fs_kfunc_set_ids,
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index fa8515598341..4438637c8900 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -3,9 +3,9 @@
config BTRFS_FS
tristate "Btrfs filesystem support"
select BLK_CGROUP_PUNT_BIO
+ select CRC32
select CRYPTO
select CRYPTO_CRC32C
- select LIBCRC32C
select CRYPTO_XXHASH
select CRYPTO_SHA256
select CRYPTO_BLAKE2B
@@ -52,20 +52,24 @@ config BTRFS_FS_RUN_SANITY_TESTS
bool "Btrfs will run sanity tests upon loading"
depends on BTRFS_FS
help
- This will run some basic sanity tests on the free space cache
- code to make sure it is acting as it should. These are mostly
- regression tests and are only really interesting to btrfs
- developers.
+ This will run sanity tests for core functionality like free space,
+ extent maps, extent io, extent buffers, inodes, qgroups and others,
+ at module load time. These are mostly regression tests and are only
+ interesting to developers.
If unsure, say N.
config BTRFS_DEBUG
bool "Btrfs debugging support"
depends on BTRFS_FS
+ select REF_TRACKER if STACKTRACE_SUPPORT
help
- Enable run-time debugging support for the btrfs filesystem. This may
- enable additional and expensive checks with negative impact on
- performance, or export extra information via sysfs.
+ Enable run-time debugging support for the btrfs filesystem.
+
+	  Additional, potentially expensive checks, debugging functionality and
+	  sysfs-exported information are enabled, like leak checking of internal
+	  objects, optional forced space fragmentation and /sys/fs/btrfs/debug.
+	  This has a negative impact on performance.
If unsure, say N.
@@ -73,8 +77,10 @@ config BTRFS_ASSERT
bool "Btrfs assert support"
depends on BTRFS_FS
help
- Enable run-time assertion checking. This will result in panics if
- any of the assertions trip. This is meant for btrfs developers only.
+	  Enable run-time assertion checking. Additional safety checks are
+	  done, simple enough not to affect performance, that verify the
+	  invariants and assumptions the code relies on. This may result in
+	  panics, and is meant for developers but can be enabled in general.
If unsure, say N.
@@ -89,7 +95,14 @@ config BTRFS_EXPERIMENTAL
Current list:
- - extent map shrinker - performance problems with too frequent shrinks
+ - COW fixup worker warning - last warning before removing the
+ functionality catching out-of-band page
+ dirtying, not necessary since 5.8
+
+ - RAID mirror read policy - additional read policies for balancing
+ reading from redundant block group
+ profiles (currently: pid, round-robin,
+ fixed devid)
- send stream protocol v3 - fs-verity support
@@ -102,15 +115,6 @@ config BTRFS_EXPERIMENTAL
- extent tree v2 - complex rework of extent tracking
- If unsure, say N.
-
-config BTRFS_FS_REF_VERIFY
- bool "Btrfs with the ref verify tool compiled in"
- depends on BTRFS_FS
- default n
- help
- Enable run-time extent reference verification instrumentation. This
- is meant to be used by btrfs developers for tracking down extent
- reference problems or verifying they didn't break something.
+ - large folio support
If unsure, say N.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 2d5f0482678b..743d7677b175 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -36,7 +36,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
lru_cache.o raid-stripe-tree.o fiemap.o direct-io.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
-btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
+btrfs-$(CONFIG_BTRFS_DEBUG) += ref-verify.o
btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
btrfs-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c
index e3716516ca38..1248aa2535d3 100644
--- a/fs/btrfs/accessors.c
+++ b/fs/btrfs/accessors.c
@@ -9,27 +9,24 @@
#include "fs.h"
#include "accessors.h"
-static bool check_setget_bounds(const struct extent_buffer *eb,
- const void *ptr, unsigned off, int size)
+static void __cold report_setget_bounds(const struct extent_buffer *eb,
+ const void *ptr, unsigned off, int size)
{
- const unsigned long member_offset = (unsigned long)ptr + off;
+ unsigned long member_offset = (unsigned long)ptr + off;
- if (unlikely(member_offset + size > eb->len)) {
- btrfs_warn(eb->fs_info,
- "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d",
- (member_offset > eb->len ? "start" : "end"),
- (unsigned long)ptr, eb->start, member_offset, size);
- return false;
- }
-
- return true;
+ btrfs_warn(eb->fs_info,
+ "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d",
+ (member_offset > eb->len ? "start" : "end"),
+ (unsigned long)ptr, eb->start, member_offset, size);
}
-void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb)
+/* Copy @len1 bytes from @src1, then the remaining @total - @len1 bytes from @src2, into @dest. */
+static __always_inline void memcpy_split_src(char *dest, const char *src1,
+ const char *src2, const size_t len1,
+ const size_t total)
{
- token->eb = eb;
- token->kaddr = folio_address(eb->folios[0]);
- token->offset = 0;
+ memcpy(dest, src1, len1);
+ memcpy(dest + len1, src2, total - len1);
}
/*
@@ -41,134 +38,77 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e
* - btrfs_set_8 (for 8/16/32/64)
* - btrfs_get_8 (for 8/16/32/64)
*
- * Generic helpers with a token (cached address of the most recently accessed
- * page):
- * - btrfs_set_token_8 (for 8/16/32/64)
- * - btrfs_get_token_8 (for 8/16/32/64)
- *
* The set/get functions handle data spanning two pages transparently, in case
* metadata block size is larger than page. Every pointer to metadata items is
* an offset into the extent buffer page array, cast to a specific type. This
* gives us all the type checking.
*
* The extent buffer pages stored in the array folios may not form a contiguous
- * phyusical range, but the API functions assume the linear offset to the range
+ * physical range, but the API functions assume the linear offset to the range
* from 0 to metadata node size.
*/
#define DEFINE_BTRFS_SETGET_BITS(bits) \
-u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
- const void *ptr, unsigned long off) \
-{ \
- const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
- const unsigned long oil = get_eb_offset_in_folio(token->eb, \
- member_offset);\
- const int unit_size = token->eb->folio_size; \
- const int unit_shift = token->eb->folio_shift; \
- const int size = sizeof(u##bits); \
- u8 lebytes[sizeof(u##bits)]; \
- const int part = unit_size - oil; \
- \
- ASSERT(token); \
- ASSERT(token->kaddr); \
- ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
- if (token->offset <= member_offset && \
- member_offset + size <= token->offset + unit_size) { \
- return get_unaligned_le##bits(token->kaddr + oil); \
- } \
- token->kaddr = folio_address(token->eb->folios[idx]); \
- token->offset = idx << unit_shift; \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
- return get_unaligned_le##bits(token->kaddr + oil); \
- \
- memcpy(lebytes, token->kaddr + oil, part); \
- token->kaddr = folio_address(token->eb->folios[idx + 1]); \
- token->offset = (idx + 1) << unit_shift; \
- memcpy(lebytes + part, token->kaddr, size - part); \
- return get_unaligned_le##bits(lebytes); \
-} \
u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
const void *ptr, unsigned long off) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
const unsigned long idx = get_eb_folio_index(eb, member_offset);\
- const unsigned long oil = get_eb_offset_in_folio(eb, \
- member_offset);\
- const int unit_size = eb->folio_size; \
- char *kaddr = folio_address(eb->folios[idx]); \
- const int size = sizeof(u##bits); \
- const int part = unit_size - oil; \
- u8 lebytes[sizeof(u##bits)]; \
- \
- ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
- return get_unaligned_le##bits(kaddr + oil); \
- \
- memcpy(lebytes, kaddr + oil, part); \
- kaddr = folio_address(eb->folios[idx + 1]); \
- memcpy(lebytes + part, kaddr, size - part); \
- return get_unaligned_le##bits(lebytes); \
-} \
-void btrfs_set_token_##bits(struct btrfs_map_token *token, \
- const void *ptr, unsigned long off, \
- u##bits val) \
-{ \
- const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
- const unsigned long oil = get_eb_offset_in_folio(token->eb, \
+ const unsigned long oif = get_eb_offset_in_folio(eb, \
member_offset);\
- const int unit_size = token->eb->folio_size; \
- const int unit_shift = token->eb->folio_shift; \
- const int size = sizeof(u##bits); \
+ char *kaddr = folio_address(eb->folios[idx]) + oif; \
+ const int part = eb->folio_size - oif; \
u8 lebytes[sizeof(u##bits)]; \
- const int part = unit_size - oil; \
\
- ASSERT(token); \
- ASSERT(token->kaddr); \
- ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
- if (token->offset <= member_offset && \
- member_offset + size <= token->offset + unit_size) { \
- put_unaligned_le##bits(val, token->kaddr + oil); \
- return; \
+ if (unlikely(member_offset + sizeof(u##bits) > eb->len)) { \
+ report_setget_bounds(eb, ptr, off, sizeof(u##bits)); \
+ return 0; \
} \
- token->kaddr = folio_address(token->eb->folios[idx]); \
- token->offset = idx << unit_shift; \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || \
- oil + size <= unit_size) { \
- put_unaligned_le##bits(val, token->kaddr + oil); \
- return; \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || sizeof(u##bits) == 1 || \
+ likely(sizeof(u##bits) <= part)) \
+ return get_unaligned_le##bits(kaddr); \
+ \
+ if (sizeof(u##bits) == 2) { \
+ lebytes[0] = *kaddr; \
+ kaddr = folio_address(eb->folios[idx + 1]); \
+ lebytes[1] = *kaddr; \
+ } else { \
+ memcpy_split_src(lebytes, kaddr, \
+ folio_address(eb->folios[idx + 1]), \
+ part, sizeof(u##bits)); \
} \
- put_unaligned_le##bits(val, lebytes); \
- memcpy(token->kaddr + oil, lebytes, part); \
- token->kaddr = folio_address(token->eb->folios[idx + 1]); \
- token->offset = (idx + 1) << unit_shift; \
- memcpy(token->kaddr, lebytes + part, size - part); \
+ return get_unaligned_le##bits(lebytes); \
} \
void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
unsigned long off, u##bits val) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
const unsigned long idx = get_eb_folio_index(eb, member_offset);\
- const unsigned long oil = get_eb_offset_in_folio(eb, \
+ const unsigned long oif = get_eb_offset_in_folio(eb, \
member_offset);\
- const int unit_size = eb->folio_size; \
- char *kaddr = folio_address(eb->folios[idx]); \
- const int size = sizeof(u##bits); \
- const int part = unit_size - oil; \
+ char *kaddr = folio_address(eb->folios[idx]) + oif; \
+ const int part = eb->folio_size - oif; \
u8 lebytes[sizeof(u##bits)]; \
\
- ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || \
- oil + size <= unit_size) { \
- put_unaligned_le##bits(val, kaddr + oil); \
+ if (unlikely(member_offset + sizeof(u##bits) > eb->len)) { \
+ report_setget_bounds(eb, ptr, off, sizeof(u##bits)); \
+ return; \
+ } \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || sizeof(u##bits) == 1 || \
+ likely(sizeof(u##bits) <= part)) { \
+ put_unaligned_le##bits(val, kaddr); \
return; \
} \
- \
put_unaligned_le##bits(val, lebytes); \
- memcpy(kaddr + oil, lebytes, part); \
- kaddr = folio_address(eb->folios[idx + 1]); \
- memcpy(kaddr, lebytes + part, size - part); \
+ if (sizeof(u##bits) == 2) { \
+ *kaddr = lebytes[0]; \
+ kaddr = folio_address(eb->folios[idx + 1]); \
+ *kaddr = lebytes[1]; \
+ } else { \
+ memcpy(kaddr, lebytes, part); \
+ kaddr = folio_address(eb->folios[idx + 1]); \
+ memcpy(kaddr, lebytes + part, sizeof(u##bits) - part); \
+ } \
}
DEFINE_BTRFS_SETGET_BITS(8)
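A worked example of the split path (numbers illustrative): with a 4 KiB folio and a u64 member at offset-in-folio 4092, part = 4096 - 4092 = 4, so the accessor stitches the value together from the tail of one folio and the head of the next:

	u8 lebytes[sizeof(u64)];

	memcpy_split_src(lebytes,
			 kaddr,                              /* last 4 bytes of folio idx */
			 folio_address(eb->folios[idx + 1]), /* first 4 of folio idx + 1 */
			 part /* == 4 */, sizeof(u64));
	val = get_unaligned_le64(lebytes);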
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index 15ea6348800b..78721412951c 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -12,18 +12,11 @@
#include <linux/string.h>
#include <linux/mm.h>
#include <uapi/linux/btrfs_tree.h>
+#include "fs.h"
#include "extent_io.h"
struct extent_buffer;
-struct btrfs_map_token {
- struct extent_buffer *eb;
- char *kaddr;
- unsigned long offset;
-};
-
-void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb);
-
/*
* Some macros to generate set/get functions for the struct fields. This
* assumes there is a lefoo_to_cpu for every type, so lets make a simple one
@@ -56,11 +49,6 @@ static inline void put_unaligned_le8(u8 val, void *p)
sizeof_field(type, member)))
#define DECLARE_BTRFS_SETGET_BITS(bits) \
-u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
- const void *ptr, unsigned long off); \
-void btrfs_set_token_##bits(struct btrfs_map_token *token, \
- const void *ptr, unsigned long off, \
- u##bits val); \
u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
const void *ptr, unsigned long off); \
void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
@@ -83,18 +71,6 @@ static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \
{ \
static_assert(sizeof(u##bits) == sizeof_field(type, member)); \
btrfs_set_##bits(eb, s, offsetof(type, member), val); \
-} \
-static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \
- const type *s) \
-{ \
- static_assert(sizeof(u##bits) == sizeof_field(type, member)); \
- return btrfs_get_token_##bits(token, s, offsetof(type, member));\
-} \
-static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
- type *s, u##bits val) \
-{ \
- static_assert(sizeof(u##bits) == sizeof_field(type, member)); \
- btrfs_set_token_##bits(token, s, offsetof(type, member), val); \
}
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
@@ -479,18 +455,6 @@ static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \
int slot, u32 val) \
{ \
btrfs_set_raw_item_##member(eb, btrfs_item_nr(eb, slot), val); \
-} \
-static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \
- int slot) \
-{ \
- struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \
- return btrfs_token_raw_item_##member(token, item); \
-} \
-static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \
- int slot, u32 val) \
-{ \
- struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \
- btrfs_set_token_raw_item_##member(token, item, val); \
}
BTRFS_ITEM_SETGET_FUNCS(offset)
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index e0ba00d64ea0..c336e2ab7f8a 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -14,12 +14,13 @@
#include "ctree.h"
#include "xattr.h"
#include "acl.h"
+#include "misc.h"
struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu)
{
int size;
const char *name;
- char *value = NULL;
+ char AUTO_KFREE(value);
struct posix_acl *acl;
if (rcu)
@@ -49,7 +50,6 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu)
acl = NULL;
else
acl = ERR_PTR(size);
- kfree(value);
return acl;
}
@@ -59,7 +59,7 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
{
int ret, size = 0;
const char *name;
- char *value = NULL;
+ char AUTO_KFREE(value);
switch (type) {
case ACL_TYPE_ACCESS:
@@ -85,28 +85,23 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
nofs_flag = memalloc_nofs_save();
value = kmalloc(size, GFP_KERNEL);
memalloc_nofs_restore(nofs_flag);
- if (!value) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!value)
+ return -ENOMEM;
ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
if (ret < 0)
- goto out;
+ return ret;
}
if (trans)
ret = btrfs_setxattr(trans, inode, name, value, size, 0);
else
ret = btrfs_setxattr_trans(inode, name, value, size, 0);
+ if (ret < 0)
+ return ret;
-out:
- kfree(value);
-
- if (!ret)
- set_cached_acl(inode, type, acl);
-
- return ret;
+ set_cached_acl(inode, type, acl);
+ return 0;
}
int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
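AUTO_KFREE is what lets the manual kfree(value) calls and the out: label disappear above. A plausible definition, assuming it wraps the scope-based cleanup helpers from <linux/cleanup.h> (the real macro in fs/btrfs/misc.h may differ):

	#define AUTO_KFREE(name)	*name __free(kfree) = NULL

	/* "char AUTO_KFREE(value);" then declares
	 *	char *value __free(kfree) = NULL;
	 * and kfree(value) runs automatically on every return path once
	 * value goes out of scope. */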
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index f3bffe08b290..6c6f3bb58f4e 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -219,8 +219,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq,
spin_lock_irqsave(lock, flags);
if (list_empty(list))
break;
- work = list_entry(list->next, struct btrfs_work,
- ordered_list);
+ work = list_first_entry(list, struct btrfs_work, ordered_list);
if (!test_bit(WORK_DONE_BIT, &work->flags))
break;
/*
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 5936cff80ff3..78da47a3d00e 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -666,10 +666,9 @@ static int resolve_indirect_ref(struct btrfs_backref_walk_ctx *ctx,
ret = btrfs_search_old_slot(root, &search_key, path, ctx->time_seq);
btrfs_debug(ctx->fs_info,
- "search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)",
- ref->root_id, level, ref->count, ret,
- ref->key_for_search.objectid, ref->key_for_search.type,
- ref->key_for_search.offset);
+"search slot in root %llu (level %d, ref count %d) returned %d for key " BTRFS_KEY_FMT,
+ ref->root_id, level, ref->count, ret,
+ BTRFS_KEY_FMT_VALUE(&ref->key_for_search));
if (ret < 0)
goto out;
@@ -733,7 +732,6 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx,
struct preftrees *preftrees,
struct share_check *sc)
{
- int err;
int ret = 0;
struct ulist *parents;
struct ulist_node *node;
@@ -752,6 +750,7 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx,
*/
while ((rnode = rb_first_cached(&preftrees->indirect.root))) {
struct prelim_ref *ref;
+ int ret2;
ref = rb_entry(rnode, struct prelim_ref, rbnode);
if (WARN(ref->parent,
@@ -773,18 +772,18 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx,
ret = BACKREF_FOUND_SHARED;
goto out;
}
- err = resolve_indirect_ref(ctx, path, preftrees, ref, parents);
+ ret2 = resolve_indirect_ref(ctx, path, preftrees, ref, parents);
/*
* we can only tolerate ENOENT; otherwise we should catch the error
* and return directly.
*/
- if (err == -ENOENT) {
+ if (ret2 == -ENOENT) {
prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref,
NULL);
continue;
- } else if (err) {
+ } else if (ret2) {
free_pref(ref);
- ret = err;
+ ret = ret2;
goto out;
}
@@ -859,7 +858,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
free_pref(ref);
return PTR_ERR(eb);
}
- if (!extent_buffer_uptodate(eb)) {
+ if (unlikely(!extent_buffer_uptodate(eb))) {
free_pref(ref);
free_extent_buffer(eb);
return -EIO;
@@ -1062,7 +1061,7 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx,
iref = (struct btrfs_extent_inline_ref *)ptr;
type = btrfs_get_extent_inline_ref_type(leaf, iref,
BTRFS_REF_TYPE_ANY);
- if (type == BTRFS_REF_TYPE_INVALID)
+ if (unlikely(type == BTRFS_REF_TYPE_INVALID))
return -EUCLEAN;
offset = btrfs_extent_inline_ref_offset(leaf, iref);
@@ -1409,12 +1408,12 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx,
if (!path)
return -ENOMEM;
if (!ctx->trans) {
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
}
if (ctx->time_seq == BTRFS_SEQ_LAST)
- path->skip_locking = 1;
+ path->skip_locking = true;
again:
head = NULL;
@@ -1422,7 +1421,7 @@ again:
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
@@ -1561,7 +1560,7 @@ again:
btrfs_release_path(path);
- ret = add_missing_keys(ctx->fs_info, &preftrees, path->skip_locking == 0);
+ ret = add_missing_keys(ctx->fs_info, &preftrees, !path->skip_locking);
if (ret)
goto out;
@@ -1614,7 +1613,7 @@ again:
ret = PTR_ERR(eb);
goto out;
}
- if (!extent_buffer_uptodate(eb)) {
+ if (unlikely(!extent_buffer_uptodate(eb))) {
free_extent_buffer(eb);
ret = -EIO;
goto out;
@@ -1652,7 +1651,7 @@ again:
* case.
*/
ASSERT(eie);
- if (!eie) {
+ if (unlikely(!eie)) {
ret = -EUCLEAN;
goto out;
}
@@ -1690,7 +1689,7 @@ out:
* @ctx->bytenr and @ctx->extent_item_pos. The bytenr of the found leaves are
* added to the ulist at @ctx->refs, and that ulist is allocated by this
* function. The caller should free the ulist with free_leaf_list() if
- * @ctx->ignore_extent_item_pos is false, otherwise a fimple ulist_free() is
+ * @ctx->ignore_extent_item_pos is false, otherwise a simple ulist_free() is
* enough.
*
* Returns 0 on success and < 0 on error. On error @ctx->refs is not allocated.
@@ -2201,7 +2200,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
int ret;
u64 flags;
u64 size = 0;
- u32 item_size;
const struct extent_buffer *eb;
struct btrfs_extent_item *ei;
struct btrfs_key key;
@@ -2216,7 +2214,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
@@ -2244,7 +2242,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
}
eb = path->nodes[0];
- item_size = btrfs_item_size(eb, path->slots[0]);
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
flags = btrfs_extent_flags(eb, ei);
@@ -2252,7 +2249,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
btrfs_debug(fs_info,
"logical %llu is at position %llu within the extent (%llu EXTENT_ITEM %llu) flags %#llx size %u",
logical, logical - found_key->objectid, found_key->objectid,
- found_key->offset, flags, item_size);
+ found_key->offset, flags, btrfs_item_size(eb, path->slots[0]));
WARN_ON(!flags_ret);
if (flags_ret) {
@@ -2314,7 +2311,7 @@ static int get_extent_inline_ref(unsigned long *ptr,
*out_eiref = (struct btrfs_extent_inline_ref *)(*ptr);
*out_type = btrfs_get_extent_inline_ref_type(eb, *out_eiref,
BTRFS_REF_TYPE_ANY);
- if (*out_type == BTRFS_REF_TYPE_INVALID)
+ if (unlikely(*out_type == BTRFS_REF_TYPE_INVALID))
return -EUCLEAN;
*ptr += btrfs_extent_inline_ref_size(*out_type);
@@ -2548,17 +2545,20 @@ static int build_ino_list(u64 inum, u64 offset, u64 num_bytes, u64 root, void *c
}
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
void *ctx, bool ignore_offset)
{
struct btrfs_backref_walk_ctx walk_ctx = { 0 };
int ret;
u64 flags = 0;
struct btrfs_key found_key;
- int search_commit_root = path->search_commit_root;
+ struct btrfs_path *path;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
- btrfs_release_path(path);
+ btrfs_free_path(path);
if (ret < 0)
return ret;
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
@@ -2571,8 +2571,7 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
walk_ctx.extent_item_pos = logical - found_key.objectid;
walk_ctx.fs_info = fs_info;
- return iterate_extent_inodes(&walk_ctx, search_commit_root,
- build_ino_list, ctx);
+ return iterate_extent_inodes(&walk_ctx, false, build_ino_list, ctx);
}
static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
@@ -2786,7 +2785,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes)
* allocates space to return multiple file system paths for an inode.
* total_bytes to allocate are passed, note that space usable for actual path
* information will be total_bytes - sizeof(struct inode_fs_paths).
- * the returned pointer must be freed with free_ipath() in the end.
+ * the returned pointer must be freed with __free_inode_fs_paths() when done.
*/
struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
struct btrfs_path *path)
@@ -2811,14 +2810,6 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
return ifp;
}
-void free_ipath(struct inode_fs_paths *ipath)
-{
- if (!ipath)
- return;
- kvfree(ipath->fspath);
- kfree(ipath);
-}
-
struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info)
{
struct btrfs_backref_iter *ret;
@@ -2834,8 +2825,8 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_inf
}
/* Current backref iterator only supports iteration in commit root */
- ret->path->search_commit_root = 1;
- ret->path->skip_locking = 1;
+ ret->path->search_commit_root = true;
+ ret->path->skip_locking = true;
ret->fs_info = fs_info;
return ret;
@@ -2868,7 +2859,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
@@ -2876,8 +2867,8 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
ret = -EUCLEAN;
goto release;
}
- if (path->slots[0] == 0) {
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ if (unlikely(path->slots[0] == 0)) {
+ DEBUG_WARN();
ret = -EUCLEAN;
goto release;
}
@@ -3134,8 +3125,8 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
return;
while (!list_empty(&node->upper)) {
- edge = list_entry(node->upper.next, struct btrfs_backref_edge,
- list[LOWER]);
+ edge = list_first_entry(&node->upper, struct btrfs_backref_edge,
+ list[LOWER]);
list_del(&edge->list[LOWER]);
list_del(&edge->list[UPPER]);
btrfs_backref_free_edge(cache, edge);
@@ -3161,18 +3152,14 @@ void btrfs_backref_release_cache(struct btrfs_backref_cache *cache)
ASSERT(!cache->nr_edges);
}
-void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
- struct btrfs_backref_node *lower,
- struct btrfs_backref_node *upper,
- int link_which)
+static void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
+ struct btrfs_backref_node *lower,
+ struct btrfs_backref_node *upper)
{
ASSERT(upper && lower && upper->level == lower->level + 1);
edge->node[LOWER] = lower;
edge->node[UPPER] = upper;
- if (link_which & LINK_LOWER)
- list_add_tail(&edge->list[LOWER], &lower->upper);
- if (link_which & LINK_UPPER)
- list_add_tail(&edge->list[UPPER], &upper->lower);
+ list_add_tail(&edge->list[LOWER], &lower->upper);
}
/*
* Handle direct tree backref
@@ -3242,7 +3229,7 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
ASSERT(upper->checked);
INIT_LIST_HEAD(&edge->list[UPPER]);
}
- btrfs_backref_link_edge(edge, cur, upper, LINK_LOWER);
+ btrfs_backref_link_edge(edge, cur, upper);
return 0;
}
@@ -3312,8 +3299,8 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
level = cur->level + 1;
/* Search the tree to find parent blocks referring to the block */
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
path->lowest_level = level;
ret = btrfs_search_slot(NULL, root, tree_key, path, 0, 0);
path->lowest_level = 0;
@@ -3327,9 +3314,9 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
eb = path->nodes[level];
if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) {
btrfs_err(fs_info,
-"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
+"couldn't find block (%llu) (level %d) in tree (%llu) with key " BTRFS_KEY_FMT,
cur->bytenr, level - 1, btrfs_root_id(root),
- tree_key->objectid, tree_key->type, tree_key->offset);
+ BTRFS_KEY_FMT_VALUE(tree_key));
btrfs_put_root(root);
ret = -ENOENT;
goto out;
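The BTRFS_KEY_FMT helpers used in the new message are defined elsewhere in the btrfs headers. A minimal sketch consistent with the removed format string, assuming the usual pairing of a printk format macro with an argument macro (the exact in-tree definitions may differ):

/* Sketch only, inferred from the removed "(%llu %u %llu)" format. */
#define BTRFS_KEY_FMT			"(%llu %u %llu)"
#define BTRFS_KEY_FMT_VALUE(key)	\
	(key)->objectid, (key)->type, (key)->offset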
@@ -3412,7 +3399,7 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
if (!upper->owner)
upper->owner = btrfs_header_owner(eb);
}
- btrfs_backref_link_edge(edge, lower, upper, LINK_LOWER);
+ btrfs_backref_link_edge(edge, lower, upper);
if (rb_node) {
btrfs_put_root(root);
@@ -3461,7 +3448,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
if (ret < 0)
goto out;
/* No extra backref? This means the tree block is corrupted */
- if (ret > 0) {
+ if (unlikely(ret > 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -3473,8 +3460,8 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
* type BTRFS_TREE_BLOCK_REF_KEY
*/
ASSERT(list_is_singular(&cur->upper));
- edge = list_entry(cur->upper.next, struct btrfs_backref_edge,
- list[LOWER]);
+ edge = list_first_entry(&cur->upper, struct btrfs_backref_edge,
+ list[LOWER]);
ASSERT(list_empty(&edge->list[UPPER]));
exist = edge->node[UPPER];
/*
@@ -3504,7 +3491,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
((unsigned long)iter->cur_ptr);
type = btrfs_get_extent_inline_ref_type(eb, iref,
BTRFS_REF_TYPE_BLOCK);
- if (type == BTRFS_REF_TYPE_INVALID) {
+ if (unlikely(type == BTRFS_REF_TYPE_INVALID)) {
ret = -EUCLEAN;
goto out;
}
@@ -3570,7 +3557,7 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
ASSERT(start->checked);
- rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, &start->rb_node);
+ rb_node = rb_simple_insert(&cache->rb_root, &start->simple_node);
if (rb_node)
btrfs_backref_panic(cache->fs_info, start->bytenr, -EEXIST);
@@ -3616,13 +3603,12 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
}
/* Sanity check, we shouldn't have any unchecked nodes */
- if (!upper->checked) {
- ASSERT(0);
+ if (unlikely(!upper->checked)) {
+ DEBUG_WARN("we should not have any unchecked nodes");
return -EUCLEAN;
}
- rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
- &upper->rb_node);
+ rb_node = rb_simple_insert(&cache->rb_root, &upper->simple_node);
if (unlikely(rb_node)) {
btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST);
return -EUCLEAN;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 74e614031274..1d009b0f4c69 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -190,7 +190,7 @@ struct btrfs_backref_share_check_ctx {
* It's very common to have several file extent items that point to the
* same extent (bytenr) but with different offsets and lengths. This
* typically happens for COW writes, partial writes into prealloc
- * extents, NOCOW writes after snapshoting a root, hole punching or
+ * extents, NOCOW writes after snapshotting a root, hole punching or
* reflinking within the same file (less common perhaps).
* So keep a small cache with the lookup results for the extent pointed
* by the last few file extent items. This cache is checked, with a
@@ -226,8 +226,7 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx,
iterate_extent_inodes_t *iterate, void *user_ctx);
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
- struct btrfs_path *path, void *ctx,
- bool ignore_offset);
+ void *ctx, bool ignore_offset);
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
@@ -242,7 +241,12 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
struct btrfs_data_container *init_data_container(u32 total_bytes);
struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
struct btrfs_path *path);
-void free_ipath(struct inode_fs_paths *ipath);
+
+DEFINE_FREE(inode_fs_paths, struct inode_fs_paths *,
+ if (_T) {
+ kvfree(_T->fspath);
+ kfree(_T);
+ })
int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
u64 start_off, struct btrfs_path *path,
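free_ipath() is replaced by a DEFINE_FREE() cleanup helper from <linux/cleanup.h>, which ties the ipath lifetime to a scope. A hypothetical caller, assuming the usual __free() pattern (the 4096 size is illustrative, and the cleanup must be disarmed before returning an ERR_PTR since the helper only checks for NULL):

	struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL;

	ipath = init_ipath(4096, fs_root, path);
	if (IS_ERR(ipath)) {
		int ret = PTR_ERR(ipath);

		/* Disarm the cleanup: it must not run on an ERR_PTR. */
		ipath = NULL;
		return ret;
	}
	/* Use ipath; it is freed automatically when it goes out of scope. */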
@@ -313,10 +317,15 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter);
* Represent a tree block in the backref cache
*/
struct btrfs_backref_node {
- struct {
- struct rb_node rb_node;
- u64 bytenr;
- }; /* Use rb_simple_node for search/insert */
+ union {
+ /* Use rb_simple_node for search/insert */
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ };
+
+ struct rb_simple_node simple_node;
+ };
/*
* This is a sanity check, whenever we COW a block we will update
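The union relies on struct rb_simple_node having exactly the same layout as the anonymous struct it overlays, so existing users of node->rb_node and node->bytenr keep working while rb_simple_insert() can take a single &node->simple_node argument. Presumed shape of the helper type (a sketch; see the btrfs misc helpers for the real definition):

struct rb_simple_node {
	struct rb_node rb_node;
	u64 bytenr;
};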
@@ -410,7 +419,7 @@ struct btrfs_backref_cache {
/*
* Whether this cache is for relocation
*
- * Reloction backref cache require more info for reloc root compared
+ * Relocation backref cache requires more info for reloc root compared
* to generic backref cache.
*/
bool is_reloc;
@@ -423,13 +432,6 @@ struct btrfs_backref_node *btrfs_backref_alloc_node(
struct btrfs_backref_edge *btrfs_backref_alloc_edge(
struct btrfs_backref_cache *cache);
-#define LINK_LOWER (1 << 0)
-#define LINK_UPPER (1 << 1)
-
-void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
- struct btrfs_backref_node *lower,
- struct btrfs_backref_node *upper,
- int link_which);
void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *node);
void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 8c2eee1f1878..fa1d321a2fb8 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -27,12 +27,12 @@ struct btrfs_failed_bio {
};
/* Is this a data path I/O that needs storage layer checksum and repair? */
-static inline bool is_data_bbio(struct btrfs_bio *bbio)
+static inline bool is_data_bbio(const struct btrfs_bio *bbio)
{
return bbio->inode && is_data_inode(bbio->inode);
}
-static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
+static bool bbio_has_ordered_extent(const struct btrfs_bio *bbio)
{
return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
}
@@ -41,13 +41,17 @@ static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
* Initialize a btrfs_bio structure. This skips the embedded bio itself as it
* is already initialized by the block layer.
*/
-void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
+void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, u64 file_offset,
btrfs_bio_end_io_t end_io, void *private)
{
+ /* @inode parameter is mandatory. */
+ ASSERT(inode);
+
memset(bbio, 0, offsetof(struct btrfs_bio, bio));
- bbio->fs_info = fs_info;
+ bbio->inode = inode;
bbio->end_io = end_io;
bbio->private = private;
+ bbio->file_offset = file_offset;
atomic_set(&bbio->pending_ios, 1);
WRITE_ONCE(bbio->status, BLK_STS_OK);
}
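With fs_info gone from struct btrfs_bio (see the bio.h hunk below), the mandatory inode is the only route back to the filesystem, and file_offset is fixed at init time. A hypothetical read submission using the new signatures; my_end_io, submit_one_read, inode, file_offset and logical are illustrative, not from the patch:

static void my_end_io(struct btrfs_bio *bbio)
{
	/* Runs in task context; see the ASSERT(in_task()) below. */
}

static void submit_one_read(struct btrfs_inode *inode, u64 file_offset,
			    u64 logical)
{
	struct btrfs_bio *bbio;

	bbio = btrfs_bio_alloc(1, REQ_OP_READ, inode, file_offset,
			       my_end_io, NULL);
	bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
	/* ... add block-aligned pages to bbio->bio, then submit: */
	btrfs_submit_bbio(bbio, 0);
}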
@@ -60,7 +64,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
* a mempool.
*/
struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
- struct btrfs_fs_info *fs_info,
+ struct btrfs_inode *inode, u64 file_offset,
btrfs_bio_end_io_t end_io, void *private)
{
struct btrfs_bio *bbio;
@@ -68,7 +72,7 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
bbio = btrfs_bio(bio);
- btrfs_bio_init(bbio, fs_info, end_io, private);
+ btrfs_bio_init(bbio, inode, file_offset, end_io, private);
return bbio;
}
@@ -85,20 +89,27 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
return ERR_CAST(bio);
bbio = btrfs_bio(bio);
- btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
- bbio->inode = orig_bbio->inode;
- bbio->file_offset = orig_bbio->file_offset;
+ btrfs_bio_init(bbio, orig_bbio->inode, orig_bbio->file_offset, NULL, orig_bbio);
orig_bbio->file_offset += map_length;
if (bbio_has_ordered_extent(bbio)) {
refcount_inc(&orig_bbio->ordered->refs);
bbio->ordered = orig_bbio->ordered;
+ bbio->orig_logical = orig_bbio->orig_logical;
+ orig_bbio->orig_logical += map_length;
}
+ bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root;
atomic_inc(&orig_bbio->pending_ios);
return bbio;
}
void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
+ /* Make sure we're already in task context. */
+ ASSERT(in_task());
+
+ if (bbio->async_csum)
+ wait_for_completion(&bbio->csum_done);
+
bbio->bio.bi_status = status;
if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
struct btrfs_bio *orig_bbio = bbio->private;
@@ -134,14 +145,14 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
}
}
-static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
+static int next_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
{
if (cur_mirror == fbio->num_copies)
return cur_mirror + 1 - fbio->num_copies;
return cur_mirror + 1;
}
-static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
+static int prev_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
{
if (cur_mirror == 1)
return fbio->num_copies;
@@ -162,17 +173,30 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
struct btrfs_failed_bio *fbio = repair_bbio->private;
struct btrfs_inode *inode = repair_bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
- int mirror = repair_bbio->mirror_num;
-
/*
- * We can only trigger this for data bio, which doesn't support larger
- * folios yet.
+ * We must not advance the saved_iter, as it will be used again
+ * later by repair_bbio.
*/
- ASSERT(folio_order(page_folio(bv->bv_page)) == 0);
+ struct bvec_iter saved_iter = repair_bbio->saved_iter;
+ const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
+ const u64 logical = repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT;
+ const u32 nr_steps = repair_bbio->saved_iter.bi_size / step;
+ int mirror = repair_bbio->mirror_num;
+ phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
+ phys_addr_t paddr;
+ unsigned int slot = 0;
+
+ /* The repair bbio should be exactly one block in size. */
+ ASSERT(repair_bbio->saved_iter.bi_size == fs_info->sectorsize);
+
+ btrfs_bio_for_each_block(paddr, &repair_bbio->bio, &saved_iter, step) {
+ ASSERT(slot < nr_steps);
+ paddrs[slot] = paddr;
+ slot++;
+ }
if (repair_bbio->bio.bi_status ||
- !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
+ !btrfs_data_csum_ok(repair_bbio, dev, 0, paddrs)) {
bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
repair_bbio->bio.bi_iter = repair_bbio->saved_iter;
@@ -191,8 +215,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
mirror = prev_repair_mirror(fbio, mirror);
btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
repair_bbio->file_offset, fs_info->sectorsize,
- repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
- page_folio(bv->bv_page), bv->bv_offset, mirror);
+ logical, paddrs, step, mirror);
} while (mirror != fbio->bbio->mirror_num);
done:
@@ -209,13 +232,20 @@ done:
*/
static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
u32 bio_offset,
- struct bio_vec *bv,
+ phys_addr_t paddrs[],
struct btrfs_failed_bio *fbio)
{
struct btrfs_inode *inode = failed_bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
const u32 sectorsize = fs_info->sectorsize;
- const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT);
+ const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
+ const u32 nr_steps = sectorsize / step;
+ /*
+ * For block size > page size cases, the saved_iter may have been
+ * partially advanced. In that case round it down to the block boundary.
+ */
+ const u64 logical = round_down(failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
+ sectorsize);
struct btrfs_bio *repair_bbio;
struct bio *repair_bio;
int num_copies;
@@ -240,15 +270,22 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
atomic_inc(&fbio->repair_count);
- repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
+ repair_bio = bio_alloc_bioset(NULL, nr_steps, REQ_OP_READ, GFP_NOFS,
&btrfs_repair_bioset);
- repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
- __bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);
+ repair_bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
+ for (int i = 0; i < nr_steps; i++) {
+ int ret;
+
+ ASSERT(offset_in_page(paddrs[i]) + step <= PAGE_SIZE);
+
+ ret = bio_add_page(repair_bio, phys_to_page(paddrs[i]), step,
+ offset_in_page(paddrs[i]));
+ ASSERT(ret == step);
+ }
repair_bbio = btrfs_bio(repair_bio);
- btrfs_bio_init(repair_bbio, fs_info, NULL, fbio);
- repair_bbio->inode = failed_bbio->inode;
- repair_bbio->file_offset = failed_bbio->file_offset + bio_offset;
+ btrfs_bio_init(repair_bbio, failed_bbio->inode, failed_bbio->file_offset + bio_offset,
+ NULL, fbio);
mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
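The step/nr_steps arithmetic above only matters once the block size can exceed the page size. A worked example (not part of the patch):

/*
 * sectorsize = 16K, PAGE_SIZE = 4K:
 *   step     = min(16K, 4K) = 4K
 *   nr_steps = 16K / 4K     = 4
 * so the repair bio is allocated with four bvec slots and one
 * page-sized bio_add_page() call is made per paddrs[] entry.
 * With 4K blocks on 4K pages, step == sectorsize and nr_steps == 1,
 * which degenerates to the old single-page behaviour.
 */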
@@ -260,10 +297,14 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
{
struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- u32 sectorsize = fs_info->sectorsize;
+ const u32 sectorsize = fs_info->sectorsize;
+ const u32 step = min(sectorsize, PAGE_SIZE);
+ const u32 nr_steps = sectorsize / step;
struct bvec_iter *iter = &bbio->saved_iter;
blk_status_t status = bbio->bio.bi_status;
struct btrfs_failed_bio *fbio = NULL;
+ phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
+ phys_addr_t paddr;
u32 offset = 0;
/* Read-repair requires the inode field to be set by the submitter. */
@@ -281,19 +322,19 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
/* Clear the I/O error. A failed repair will reset it. */
bbio->bio.bi_status = BLK_STS_OK;
- while (iter->bi_size) {
- struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter);
-
- bv.bv_len = min(bv.bv_len, sectorsize);
- if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv))
- fbio = repair_one_sector(bbio, offset, &bv, fbio);
+ btrfs_bio_for_each_block(paddr, &bbio->bio, iter, step) {
+ paddrs[(offset / step) % nr_steps] = paddr;
+ offset += step;
- bio_advance_iter_single(&bbio->bio, iter, sectorsize);
- offset += sectorsize;
+ if (IS_ALIGNED(offset, sectorsize)) {
+ if (status ||
+ !btrfs_data_csum_ok(bbio, dev, offset - sectorsize, paddrs))
+ fbio = repair_one_sector(bbio, offset - sectorsize,
+ paddrs, fbio);
+ }
}
-
if (bbio->csum != bbio->csum_inline)
- kfree(bbio->csum);
+ kvfree(bbio->csum);
if (fbio)
btrfs_repair_done(fbio);
@@ -301,7 +342,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
-static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
+static void btrfs_log_dev_io_error(const struct bio *bio, struct btrfs_device *dev)
{
if (!dev || !dev->bdev)
return;
@@ -316,44 +357,43 @@ static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
}
-static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
- struct bio *bio)
+static struct workqueue_struct *btrfs_end_io_wq(const struct btrfs_fs_info *fs_info,
+ const struct bio *bio)
{
if (bio->bi_opf & REQ_META)
return fs_info->endio_meta_workers;
return fs_info->endio_workers;
}
-static void btrfs_end_bio_work(struct work_struct *work)
+static void simple_end_io_work(struct work_struct *work)
{
struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
+ struct bio *bio = &bbio->bio;
- /* Metadata reads are checked and repaired by the submitter. */
- if (is_data_bbio(bbio))
- btrfs_check_read_bio(bbio, bbio->bio.bi_private);
- else
- btrfs_bio_end_io(bbio, bbio->bio.bi_status);
+ if (bio_op(bio) == REQ_OP_READ) {
+ /* Metadata reads are checked and repaired by the submitter. */
+ if (is_data_bbio(bbio))
+ return btrfs_check_read_bio(bbio, bbio->bio.bi_private);
+ return btrfs_bio_end_io(bbio, bbio->bio.bi_status);
+ }
+ if (bio_is_zone_append(bio) && !bio->bi_status)
+ btrfs_record_physical_zoned(bbio);
+ btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
static void btrfs_simple_end_io(struct bio *bio)
{
struct btrfs_bio *bbio = btrfs_bio(bio);
struct btrfs_device *dev = bio->bi_private;
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
btrfs_bio_counter_dec(fs_info);
if (bio->bi_status)
btrfs_log_dev_io_error(bio, dev);
- if (bio_op(bio) == REQ_OP_READ) {
- INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
- queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
- } else {
- if (bio_is_zone_append(bio) && !bio->bi_status)
- btrfs_record_physical_zoned(bbio);
- btrfs_bio_end_io(bbio, bbio->bio.bi_status);
- }
+ INIT_WORK(&bbio->end_io_work, simple_end_io_work);
+ queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
}
static void btrfs_raid56_end_io(struct bio *bio)
@@ -361,6 +401,9 @@ static void btrfs_raid56_end_io(struct bio *bio)
struct btrfs_io_context *bioc = bio->bi_private;
struct btrfs_bio *bbio = btrfs_bio(bio);
+ /* RAID56 endio is always handled in a workqueue. */
+ ASSERT(in_task());
+
btrfs_bio_counter_dec(bioc->fs_info);
bbio->mirror_num = bioc->mirror_num;
if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
@@ -371,11 +414,12 @@ static void btrfs_raid56_end_io(struct bio *bio)
btrfs_put_bioc(bioc);
}
-static void btrfs_orig_write_end_io(struct bio *bio)
+static void orig_write_end_io_work(struct work_struct *work)
{
+ struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
+ struct bio *bio = &bbio->bio;
struct btrfs_io_stripe *stripe = bio->bi_private;
struct btrfs_io_context *bioc = stripe->bioc;
- struct btrfs_bio *bbio = btrfs_bio(bio);
btrfs_bio_counter_dec(bioc->fs_info);
@@ -400,8 +444,18 @@ static void btrfs_orig_write_end_io(struct bio *bio)
btrfs_put_bioc(bioc);
}
-static void btrfs_clone_write_end_io(struct bio *bio)
+static void btrfs_orig_write_end_io(struct bio *bio)
+{
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+
+ INIT_WORK(&bbio->end_io_work, orig_write_end_io_work);
+ queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work);
+}
+
+static void clone_write_end_io_work(struct work_struct *work)
{
+ struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
+ struct bio *bio = &bbio->bio;
struct btrfs_io_stripe *stripe = bio->bi_private;
if (bio->bi_status) {
@@ -416,6 +470,14 @@ static void btrfs_clone_write_end_io(struct bio *bio)
bio_put(bio);
}
+static void btrfs_clone_write_end_io(struct bio *bio)
+{
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+
+ INIT_WORK(&bbio->end_io_work, clone_write_end_io_work);
+ queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work);
+}
+
static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
{
if (!dev || !dev->bdev ||
@@ -439,7 +501,7 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
ASSERT(btrfs_dev_is_sequential(dev, physical));
bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
}
- btrfs_debug_in_rcu(dev->fs_info,
+ btrfs_debug(dev->fs_info,
"%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
__func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
@@ -462,6 +524,7 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
struct bio *orig_bio = bioc->orig_bio, *bio;
+ struct btrfs_bio *orig_bbio = btrfs_bio(orig_bio);
ASSERT(bio_op(orig_bio) != REQ_OP_READ);
@@ -470,8 +533,11 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
bio = orig_bio;
bio->bi_end_io = btrfs_orig_write_end_io;
} else {
- bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
+ /* We need to use endio_work to run end_io in task context. */
+ bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &btrfs_bioset);
bio_inc_remaining(orig_bio);
+ btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode,
+ orig_bbio->file_offset, NULL, NULL);
bio->bi_end_io = btrfs_clone_write_end_io;
}
@@ -512,11 +578,15 @@ static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
}
}
-static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio)
+static int btrfs_bio_csum(struct btrfs_bio *bbio)
{
if (bbio->bio.bi_opf & REQ_META)
return btree_csum_one_bio(bbio);
- return btrfs_csum_one_bio(bbio);
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ return btrfs_csum_one_bio(bbio, true);
+#else
+ return btrfs_csum_one_bio(bbio, false);
+#endif
}
/*
@@ -543,11 +613,11 @@ static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async =
container_of(work, struct async_submit_bio, work);
- blk_status_t ret;
+ int ret;
ret = btrfs_bio_csum(async->bbio);
if (ret)
- async->bbio->bio.bi_status = ret;
+ async->bbio->bio.bi_status = errno_to_blk_status(ret);
}
/*
@@ -588,20 +658,25 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free)
static bool should_async_write(struct btrfs_bio *bbio)
{
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
bool auto_csum_mode = true;
#ifdef CONFIG_BTRFS_EXPERIMENTAL
- struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode);
- if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_OFF)
- return false;
-
- auto_csum_mode = (csum_mode == BTRFS_OFFLOAD_CSUM_AUTO);
+ if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_ON)
+ return true;
+ /*
+ * Write bios calculate the checksum and submit the bio at the same time.
+ * Unless explicitly requested, don't offload the serial csum calculation
+ * and bio submission to a workqueue.
+ */
+ return false;
#endif
/* Submit synchronously if the checksum implementation is fast. */
- if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
+ if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
return false;
/*
@@ -612,7 +687,7 @@ static bool should_async_write(struct btrfs_bio *bbio)
return false;
/* Zoned devices require I/O to be submitted in order. */
- if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info))
+ if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(fs_info))
return false;
return true;
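The resulting checksum-offload policy is easier to see as a table. This summary is an editor's reading of the code above, not part of the patch:

/*
 * With CONFIG_BTRFS_EXPERIMENTAL:
 *   offload_csum_mode == FORCE_ON -> offload csum to a workqueue
 *   any other mode                -> checksum inline at submit time
 * Without it, auto mode applies:
 *   fast csum implementation      -> inline
 *   metadata write on zoned fs    -> inline (ordering requirement)
 *   (one further inline case is elided from the hunks shown)
 *   otherwise                     -> offload
 */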
@@ -627,7 +702,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
struct btrfs_io_context *bioc,
struct btrfs_io_stripe *smap, int mirror_num)
{
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
struct async_submit_bio *async;
async = kmalloc(sizeof(*async), GFP_NOFS);
@@ -646,11 +721,12 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length)
{
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
unsigned int nr_segs;
int sector_offset;
- map_length = min(map_length, bbio->fs_info->max_zone_append_size);
- sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits,
+ map_length = min(map_length, fs_info->max_zone_append_size);
+ sector_offset = bio_split_rw_at(&bbio->bio, &fs_info->limits,
&nr_segs, map_length);
if (sector_offset) {
/*
@@ -658,7 +734,7 @@ static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length)
* sectorsize and thus cause unaligned I/Os. Fix that by
* always rounding down to the nearest boundary.
*/
- return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize);
+ return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, fs_info->sectorsize);
}
return map_length;
}
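A worked example of the rounding above, assuming a 4K sectorsize:

/*
 * If bio_split_rw_at() picks a split point of 9 sectors, then
 * sector_offset << SECTOR_SHIFT = 9 * 512 = 4608 bytes, which is not
 * block aligned; ALIGN_DOWN(4608, 4096) = 4096 moves the split back
 * to the previous block boundary so the zone-append bio stays aligned.
 */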
@@ -666,7 +742,7 @@ static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length)
static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
{
struct btrfs_inode *inode = bbio->inode;
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio = &bbio->bio;
u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
u64 length = bio->bi_iter.bi_size;
@@ -674,23 +750,31 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
bool use_append = btrfs_use_zone_append(bbio);
struct btrfs_io_context *bioc = NULL;
struct btrfs_io_stripe smap;
- blk_status_t ret;
- int error;
+ blk_status_t status;
+ int ret;
- if (!bbio->inode || btrfs_is_data_reloc_root(inode->root))
+ if (bbio->is_scrub || btrfs_is_data_reloc_root(inode->root))
smap.rst_search_commit_root = true;
else
smap.rst_search_commit_root = false;
btrfs_bio_counter_inc_blocked(fs_info);
- error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
- &bioc, &smap, &mirror_num);
- if (error) {
- ret = errno_to_blk_status(error);
+ ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
+ &bioc, &smap, &mirror_num);
+ if (ret) {
+ status = errno_to_blk_status(ret);
btrfs_bio_counter_dec(fs_info);
goto end_bbio;
}
+ /*
+ * For fscrypt writes we will get the encrypted bio after we've remapped
+ * our bio to the physical disk location, so we need to save the
+ * original bytenr so we know what we're checksumming.
+ */
+ if (bio_op(bio) == REQ_OP_WRITE && is_data_bbio(bbio))
+ bbio->orig_logical = logical;
+
map_length = min(map_length, length);
if (use_append)
map_length = btrfs_append_map_length(bbio, map_length);
@@ -700,7 +784,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
split = btrfs_split_bio(fs_info, bbio, map_length);
if (IS_ERR(split)) {
- ret = errno_to_blk_status(PTR_ERR(split));
+ status = errno_to_blk_status(PTR_ERR(split));
btrfs_bio_counter_dec(fs_info);
goto end_bbio;
}
@@ -715,7 +799,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
bbio->saved_iter = bio->bi_iter;
ret = btrfs_lookup_bio_sums(bbio);
- if (ret)
+ status = errno_to_blk_status(ret);
+ if (status)
goto fail;
}
@@ -740,7 +825,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
* Csum items for reloc roots have already been cloned at this
* point, so they are handled as part of the no-checksum case.
*/
- if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
+ if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
!test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) &&
!btrfs_is_data_reloc_root(inode->root)) {
if (should_async_write(bbio) &&
@@ -748,13 +833,15 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
goto done;
ret = btrfs_bio_csum(bbio);
- if (ret)
+ status = errno_to_blk_status(ret);
+ if (status)
goto fail;
} else if (use_append ||
(btrfs_is_zoned(fs_info) && inode &&
inode->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_alloc_dummy_sum(bbio);
- if (ret)
+ status = errno_to_blk_status(ret);
+ if (status)
goto fail;
}
}
@@ -775,19 +862,48 @@ fail:
ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset);
ASSERT(remaining);
- btrfs_bio_end_io(remaining, ret);
+ btrfs_bio_end_io(remaining, status);
}
end_bbio:
- btrfs_bio_end_io(bbio, ret);
+ btrfs_bio_end_io(bbio, status);
/* Do not submit another chunk */
return true;
}
+static void assert_bbio_alignment(struct btrfs_bio *bbio)
+{
+#ifdef CONFIG_BTRFS_ASSERT
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+ const u32 blocksize = fs_info->sectorsize;
+ const u32 alignment = min(blocksize, PAGE_SIZE);
+ const u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
+ const u32 length = bbio->bio.bi_iter.bi_size;
+
+ /* The logical and length should still be aligned to blocksize. */
+ ASSERT(IS_ALIGNED(logical, blocksize) && IS_ALIGNED(length, blocksize) &&
+ length != 0, "root=%llu inode=%llu logical=%llu length=%u",
+ btrfs_root_id(bbio->inode->root),
+ btrfs_ino(bbio->inode), logical, length);
+
+ bio_for_each_bvec(bvec, &bbio->bio, iter)
+ ASSERT(IS_ALIGNED(bvec.bv_offset, alignment) &&
+ IS_ALIGNED(bvec.bv_len, alignment),
+ "root=%llu inode=%llu logical=%llu length=%u index=%u bv_offset=%u bv_len=%u",
+ btrfs_root_id(bbio->inode->root),
+ btrfs_ino(bbio->inode), logical, length, iter.bi_idx,
+ bvec.bv_offset, bvec.bv_len);
+#endif
+}
+
void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
{
/* If bbio->inode is not populated, its file_offset must be 0. */
ASSERT(bbio->inode || bbio->file_offset == 0);
+ assert_bbio_alignment(bbio);
+
while (!btrfs_submit_chunk(bbio, mirror_num))
;
}
@@ -801,19 +917,36 @@ void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
*
* The I/O is issued synchronously to block the repair read completion from
* freeing the bio.
+ *
+ * @ino: Offending inode number
+ * @fileoff: File offset inside the inode
+ * @length: Length of the repair write
+ * @logical: Logical address of the range
+ * @paddrs: Physical address array of the content
+ * @step: Length of each entry in @paddrs
+ * @mirror_num: Mirror number to write to; must not be zero
*/
-int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct folio *folio,
- unsigned int folio_offset, int mirror_num)
+int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 fileoff,
+ u32 length, u64 logical, const phys_addr_t paddrs[],
+ unsigned int step, int mirror_num)
{
+ const u32 nr_steps = DIV_ROUND_UP_POW2(length, step);
struct btrfs_io_stripe smap = { 0 };
- struct bio_vec bvec;
- struct bio bio;
+ struct bio *bio = NULL;
int ret = 0;
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
BUG_ON(!mirror_num);
+ /* Basic alignment checks. */
+ ASSERT(IS_ALIGNED(logical, fs_info->sectorsize));
+ ASSERT(IS_ALIGNED(length, fs_info->sectorsize));
+ ASSERT(IS_ALIGNED(fileoff, fs_info->sectorsize));
+ /* Either it's a single data or metadata block. */
+ ASSERT(length <= BTRFS_MAX_BLOCKSIZE);
+ ASSERT(step <= length);
+ ASSERT(is_power_of_2(step));
+
if (btrfs_repair_one_zone(fs_info, logical))
return 0;
@@ -827,31 +960,33 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
if (ret < 0)
goto out_counter_dec;
- if (!smap.dev->bdev ||
- !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) {
+ if (unlikely(!smap.dev->bdev ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state))) {
ret = -EIO;
goto out_counter_dec;
}
- bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
- bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
- ret = bio_add_folio(&bio, folio, length, folio_offset);
- ASSERT(ret);
- ret = submit_bio_wait(&bio);
+ bio = bio_alloc(smap.dev->bdev, nr_steps, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS);
+ bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
+ for (int i = 0; i < nr_steps; i++) {
+ ret = bio_add_page(bio, phys_to_page(paddrs[i]), step, offset_in_page(paddrs[i]));
+ /* We should have allocated enough slots to contain all the different pages. */
+ ASSERT(ret == step);
+ }
+ ret = submit_bio_wait(bio);
+ bio_put(bio);
if (ret) {
/* try to remap that extent elsewhere? */
btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
- goto out_bio_uninit;
+ goto out_counter_dec;
}
- btrfs_info_rl_in_rcu(fs_info,
+ btrfs_info_rl(fs_info,
"read error corrected: ino %llu off %llu (dev %s sector %llu)",
- ino, start, btrfs_dev_name(smap.dev),
+ ino, fileoff, btrfs_dev_name(smap.dev),
smap.physical >> SECTOR_SHIFT);
ret = 0;
-out_bio_uninit:
- bio_uninit(&bio);
out_counter_dec:
btrfs_bio_counter_dec(fs_info);
return ret;
@@ -864,16 +999,16 @@ out_counter_dec:
*/
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
{
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
u64 length = bbio->bio.bi_iter.bi_size;
struct btrfs_io_stripe smap = { 0 };
int ret;
- ASSERT(fs_info);
ASSERT(mirror_num > 0);
ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
- ASSERT(!bbio->inode);
+ ASSERT(!is_data_inode(bbio->inode));
+ ASSERT(bbio->is_scrub);
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
@@ -900,22 +1035,18 @@ int __init btrfs_bioset_init(void)
return -ENOMEM;
if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE,
offsetof(struct btrfs_bio, bio), 0))
- goto out_free_bioset;
+ goto out;
if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE,
offsetof(struct btrfs_bio, bio),
BIOSET_NEED_BVECS))
- goto out_free_clone_bioset;
+ goto out;
if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE,
sizeof(struct btrfs_failed_bio)))
- goto out_free_repair_bioset;
+ goto out;
return 0;
-out_free_repair_bioset:
- bioset_exit(&btrfs_repair_bioset);
-out_free_clone_bioset:
- bioset_exit(&btrfs_clone_bioset);
-out_free_bioset:
- bioset_exit(&btrfs_bioset);
+out:
+ btrfs_bioset_exit();
return -ENOMEM;
}
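Collapsing the three error labels into one is only safe if btrfs_bioset_exit() tolerates biosets and pools that were never initialized. bioset_exit() and mempool_exit() are no-ops on zeroed state, so a teardown of this shape (a sketch, assuming the usual reverse-init order) covers every partial-init case:

void __cold btrfs_bioset_exit(void)
{
	/* Each call is safe even if the corresponding init never ran. */
	mempool_exit(&btrfs_failed_bio_pool);
	bioset_exit(&btrfs_repair_bioset);
	bioset_exit(&btrfs_clone_bioset);
	bioset_exit(&btrfs_bioset);
}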
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index e2fe16074ad6..1be74209f0b8 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -18,13 +18,6 @@ struct btrfs_inode;
#define BTRFS_BIO_INLINE_CSUM_SIZE 64
-/*
- * Maximum number of sectors for a single bio to limit the size of the
- * checksum array. This matches the number of bio_vecs per bio and thus the
- * I/O size for buffered I/O.
- */
-#define BTRFS_MAX_BIO_SECTORS (256)
-
typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
/*
@@ -34,7 +27,10 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
struct btrfs_bio {
/*
* Inode and offset into it that this I/O operates on.
- * Only set for data I/O.
+ *
+ * If the inode is a data inode, csum verification and read-repair
+ * are done automatically.
+ * If it is a metadata inode, everything is handled by the caller.
*/
struct btrfs_inode *inode;
u64 file_offset;
@@ -56,11 +52,16 @@ struct btrfs_bio {
* - pointer to the checksums for this bio
* - original physical address from the allocator
* (for zone append only)
+ * - original logical address, used for checksumming fscrypt bios
*/
struct {
struct btrfs_ordered_extent *ordered;
struct btrfs_ordered_sum *sums;
+ struct work_struct csum_work;
+ struct completion csum_done;
+ struct bvec_iter csum_saved_iter;
u64 orig_physical;
+ u64 orig_logical;
};
/* For metadata reads: parentness verification. */
@@ -76,12 +77,21 @@ struct btrfs_bio {
atomic_t pending_ios;
struct work_struct end_io_work;
- /* File system that this I/O operates on. */
- struct btrfs_fs_info *fs_info;
-
/* Save the first error status of split bio. */
blk_status_t status;
+ /* Use the commit root to look up csums (data read bio only). */
+ bool csum_search_commit_root;
+
+ /*
+ * Since scrub reuses the btree inode, we need this flag to
+ * distinguish scrub bios.
+ */
+ bool is_scrub;
+
+ /* Whether the csum generation for data write is async. */
+ bool async_csum;
+
/*
* This member must come last, bio_alloc_bioset will allocate enough
* bytes for entire btrfs_bio but relies on bio being last.
@@ -97,10 +107,10 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
int __init btrfs_bioset_init(void);
void __cold btrfs_bioset_exit(void);
-void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
+void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, u64 file_offset,
btrfs_bio_end_io_t end_io, void *private);
struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
- struct btrfs_fs_info *fs_info,
+ struct btrfs_inode *inode, u64 file_offset,
btrfs_bio_end_io_t end_io, void *private);
void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status);
@@ -109,8 +119,8 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status);
void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num);
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace);
-int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct folio *folio,
- unsigned int folio_offset, int mirror_num);
+int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 fileoff,
+ u32 length, u64 logical, const phys_addr_t paddrs[],
+ unsigned int step, int mirror_num);
#endif
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index a8129f1ce78c..08b14449fabe 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -34,6 +34,19 @@ int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group
}
#endif
+static inline bool has_unwritten_metadata(struct btrfs_block_group *block_group)
+{
+ /* The meta_write_pointer is available only on the zoned setup. */
+ if (!btrfs_is_zoned(block_group->fs_info))
+ return false;
+
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+ return false;
+
+ return block_group->start + block_group->alloc_offset >
+ block_group->meta_write_pointer;
+}
+
/*
* Return target flags in extended format or 0 if restripe for this chunk_type
* is not in progress
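A numeric example of the has_unwritten_metadata() condition above, with illustrative values for a zoned metadata block group:

/*
 * start              = 1G
 * alloc_offset       = 8M   (allocation cursor within the zone)
 * meta_write_pointer = 1G + 6M
 *
 * start + alloc_offset = 1G + 8M > 1G + 6M, so 2M of allocated
 * metadata has not been written back yet and the helper returns true.
 */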
@@ -525,10 +538,9 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start,
*total_added_ret = 0;
while (start < end) {
- if (!find_first_extent_bit(&info->excluded_extents, start,
- &extent_start, &extent_end,
- EXTENT_DIRTY | EXTENT_UPTODATE,
- NULL))
+ if (!btrfs_find_first_extent_bit(&info->excluded_extents, start,
+ &extent_start, &extent_end,
+ EXTENT_DIRTY, NULL))
break;
if (extent_start <= start) {
@@ -601,8 +613,8 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_
extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start,
BTRFS_SUPER_INFO_OFFSET));
- path->skip_locking = 1;
- path->search_commit_root = 1;
+ path->skip_locking = true;
+ path->search_commit_root = true;
path->reada = READA_FORWARD;
search_offset = index * div_u64(block_group->length, max_index);
@@ -701,7 +713,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
struct btrfs_block_group *block_group = caching_ctl->block_group;
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *extent_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key key;
u64 total_found = 0;
@@ -732,8 +744,8 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
* root to add free space. So we skip locking and search the commit
* root, since it's read-only
*/
- path->skip_locking = 1;
- path->search_commit_root = 1;
+ path->skip_locking = true;
+ path->search_commit_root = true;
path->reada = READA_FORWARD;
key.objectid = last;
@@ -828,14 +840,13 @@ next:
block_group->start + block_group->length,
NULL);
out:
- btrfs_free_path(path);
return ret;
}
static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg)
{
- clear_extent_bits(&bg->fs_info->excluded_extents, bg->start,
- bg->start + bg->length - 1, EXTENT_UPTODATE);
+ btrfs_clear_extent_bit(&bg->fs_info->excluded_extents, bg->start,
+ bg->start + bg->length - 1, EXTENT_DIRTY, NULL);
}
static noinline void caching_thread(struct btrfs_work *work)
@@ -879,7 +890,7 @@ static noinline void caching_thread(struct btrfs_work *work)
*/
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
!(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
- ret = load_free_space_tree(caching_ctl);
+ ret = btrfs_load_free_space_tree(caching_ctl);
else
ret = load_extent_tree_free(caching_ctl);
done:
@@ -1054,7 +1065,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_chunk_map *map)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_block_group *block_group;
struct btrfs_free_cluster *cluster;
struct inode *inode;
@@ -1237,7 +1248,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* another task to attempt to create another block group with the same
* item key (and failing with -EEXIST and a transaction abort).
*/
- ret = remove_block_group_free_space(trans, block_group);
+ ret = btrfs_remove_block_group_free_space(trans, block_group);
if (ret)
goto out;
@@ -1246,6 +1257,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
goto out;
spin_lock(&block_group->lock);
+ /*
+ * Hitting this WARN means we removed a block group with an unwritten
+ * region. It will cause "unable to find chunk map for logical" errors.
+ */
+ if (WARN_ON(has_unwritten_metadata(block_group)))
+ btrfs_warn(fs_info,
+ "block group %llu is removed before metadata write out",
+ block_group->start);
+
set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
/*
@@ -1285,7 +1305,6 @@ out:
btrfs_put_block_group(block_group);
if (remove_rsv)
btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
- btrfs_free_path(path);
return ret;
}
@@ -1338,7 +1357,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
* data in this block group. That check should be done by relocation routine,
* not this function.
*/
-static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
+static int inc_block_group_ro(struct btrfs_block_group *cache, bool force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
@@ -1383,8 +1402,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
* BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
* leeway to allow us to mark this block group as read only.
*/
- if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
- BTRFS_RESERVE_NO_FLUSH))
+ if (btrfs_can_overcommit(sinfo, num_bytes, BTRFS_RESERVE_NO_FLUSH))
ret = 0;
}
@@ -1405,7 +1423,7 @@ out:
if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
btrfs_info(cache->fs_info,
"unable to make block group %llu ro", cache->start);
- btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
+ btrfs_dump_space_info(cache->space_info, 0, false);
}
return ret;
}
@@ -1420,9 +1438,8 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
int ret;
spin_lock(&fs_info->trans_lock);
- if (trans->transaction->list.prev != &fs_info->trans_list) {
- prev_trans = list_last_entry(&trans->transaction->list,
- struct btrfs_transaction, list);
+ if (!list_is_first(&trans->transaction->list, &fs_info->trans_list)) {
+ prev_trans = list_prev_entry(trans->transaction, list);
refcount_inc(&prev_trans->use_count);
}
spin_unlock(&fs_info->trans_lock);
@@ -1439,14 +1456,14 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex);
if (prev_trans) {
- ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
- EXTENT_DIRTY);
+ ret = btrfs_clear_extent_bit(&prev_trans->pinned_extents, start, end,
+ EXTENT_DIRTY, NULL);
if (ret)
goto out;
}
- ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
- EXTENT_DIRTY);
+ ret = btrfs_clear_extent_bit(&trans->transaction->pinned_extents, start, end,
+ EXTENT_DIRTY, NULL);
out:
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
if (prev_trans)
@@ -1589,8 +1606,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* needing to allocate extents from the block group.
*/
used = btrfs_space_info_used(space_info, true);
- if (space_info->total_bytes - block_group->length < used &&
- block_group->zone_unusable < block_group->length) {
+ if ((space_info->total_bytes - block_group->length < used &&
+ block_group->zone_unusable < block_group->length) ||
+ has_unwritten_metadata(block_group)) {
/*
* Add a reference for the list, compensate for the ref
* drop under the "next" label for the
@@ -1619,8 +1637,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
ret = btrfs_zone_finish(block_group);
if (ret < 0) {
btrfs_dec_block_group_ro(block_group);
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN) {
+ btrfs_link_bg_list(block_group, &retry_list);
ret = 0;
+ }
goto next;
}
@@ -1773,7 +1793,14 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
bg1 = list_entry(a, struct btrfs_block_group, bg_list);
bg2 = list_entry(b, struct btrfs_block_group, bg_list);
- return bg1->used > bg2->used;
+ /*
+ * Some other task may be updating the ->used field concurrently, but it
+ * is not serious if we get a stale value or load/store tearing issues,
+ * as sorting the list of block groups to reclaim is not critical and an
+ * occasional imperfect order is ok. So silence KCSAN and avoid the
+ * overhead of locking or any other synchronization.
+ */
+ return data_race(bg1->used > bg2->used);
}
static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info)
@@ -1821,12 +1848,10 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
if (!btrfs_should_reclaim(fs_info))
return;
- sb_start_write(fs_info->sb);
+ guard(super_write)(fs_info->sb);
- if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
- sb_end_write(fs_info->sb);
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
return;
- }
/*
* Long running balances can keep us blocked here for eternity, so
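guard(super_write)(fs_info->sb) is a <linux/cleanup.h> scope guard; presumably it is declared along the lines of the sketch below, so every early return in this function implicitly drops the write access that sb_start_write() took, which is why the explicit sb_end_write() calls are removed:

/* Sketch of the assumed guard definition; the real one may differ. */
DEFINE_GUARD(super_write, struct super_block *,
	     sb_start_write(_T), sb_end_write(_T))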
@@ -1834,7 +1859,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
*/
if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
btrfs_exclop_finish(fs_info);
- sb_end_write(fs_info->sb);
return;
}
@@ -1846,7 +1870,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
*/
list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
while (!list_empty(&fs_info->reclaim_bgs)) {
- u64 zone_unusable;
u64 used;
u64 reserved;
int ret = 0;
@@ -1913,23 +1936,13 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
goto next;
}
- /*
- * Cache the zone_unusable value before turning the block group
- * to read only. As soon as the block group is read only it's
- * zone_unusable value gets moved to the block group's read-only
- * bytes and isn't available for calculations anymore. We also
- * cache it before unlocking the block group, to prevent races
- * (reports from KCSAN and such tools) with tasks updating it.
- */
- zone_unusable = bg->zone_unusable;
-
spin_unlock(&bg->lock);
spin_unlock(&space_info->lock);
/*
* Get out fast, in case we're read-only or unmounting the
* filesystem. It is OK to drop block groups from the list even
- * for the read-only case. As we did sb_start_write(),
+ * for the read-only case. As we did take the super write lock,
* "mount -o remount,ro" won't happen and read-only filesystem
* means it is forced read-only due to a fatal error. So, it
* never gets back to read-write to let us reclaim again.
@@ -1953,7 +1966,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
* called, which is where we will transfer a reserved extent's
* size from the "reserved" counter to the "used" counter - this
* happens when running delayed references. When we relocate the
- * chunk below, relocation first flushes dellaloc, waits for
+ * chunk below, relocation first flushes delalloc, waits for
* ordered extent completion (which is where we create delayed
* references for data extents) and commits the current
* transaction (which runs delayed references), and only after
@@ -1966,14 +1979,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
reserved = bg->reserved;
spin_unlock(&bg->lock);
- btrfs_info(fs_info,
- "reclaiming chunk %llu with %llu%% used %llu%% reserved %llu%% unusable",
- bg->start,
- div64_u64(used * 100, bg->length),
- div64_u64(reserved * 100, bg->length),
- div64_u64(zone_unusable * 100, bg->length));
trace_btrfs_reclaim_block_group(bg);
- ret = btrfs_relocate_chunk(fs_info, bg->start);
+ ret = btrfs_relocate_chunk(fs_info, bg->start, false);
if (ret) {
btrfs_dec_block_group_ro(bg);
btrfs_err(fs_info, "error relocating chunk %llu",
@@ -2018,7 +2025,6 @@ end:
list_splice_tail(&retry_list, &fs_info->reclaim_bgs);
spin_unlock(&fs_info->unused_bgs_lock);
btrfs_exclop_finish(fs_info);
- sb_end_write(fs_info->sb);
}
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
@@ -2026,7 +2032,7 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
btrfs_reclaim_sweep(fs_info);
spin_lock(&fs_info->unused_bgs_lock);
if (!list_empty(&fs_info->reclaim_bgs))
- queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
+ queue_work(system_dfl_wq, &fs_info->reclaim_bgs_work);
spin_unlock(&fs_info->unused_bgs_lock);
}
@@ -2059,7 +2065,7 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key
return -ENOENT;
}
- if (map->start != key->objectid || map->chunk_len != key->offset) {
+ if (unlikely(map->start != key->objectid || map->chunk_len != key->offset)) {
btrfs_err(fs_info,
"block group %llu len %llu mismatch with chunk %llu len %llu",
key->objectid, key->offset, map->start, map->chunk_len);
@@ -2072,7 +2078,7 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key
flags = btrfs_stack_block_group_flags(&bg) &
BTRFS_BLOCK_GROUP_TYPE_MASK;
- if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ if (unlikely(flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
btrfs_err(fs_info,
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
key->objectid, key->offset, flags,
@@ -2218,9 +2224,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
cache->bytes_super += stripe_len;
- ret = set_extent_bit(&fs_info->excluded_extents, cache->start,
- cache->start + stripe_len - 1,
- EXTENT_UPTODATE, NULL);
+ ret = btrfs_set_extent_bit(&fs_info->excluded_extents, cache->start,
+ cache->start + stripe_len - 1,
+ EXTENT_DIRTY, NULL);
if (ret)
return ret;
}
@@ -2233,7 +2239,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
return ret;
/* Shouldn't have super stripes in sequential zones */
- if (zoned && nr) {
+ if (unlikely(zoned && nr)) {
kfree(logical);
btrfs_err(fs_info,
"zoned: block group %llu must not contain super block",
@@ -2246,9 +2252,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
cache->start + cache->length - logical[nr]);
cache->bytes_super += len;
- ret = set_extent_bit(&fs_info->excluded_extents, logical[nr],
- logical[nr] + len - 1,
- EXTENT_UPTODATE, NULL);
+ ret = btrfs_set_extent_bit(&fs_info->excluded_extents,
+ logical[nr], logical[nr] + len - 1,
+ EXTENT_DIRTY, NULL);
if (ret) {
kfree(logical);
return ret;
@@ -2324,7 +2330,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
break;
bg = btrfs_lookup_block_group(fs_info, map->start);
- if (!bg) {
+ if (unlikely(!bg)) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu doesn't have corresponding block group",
map->start, map->chunk_len);
@@ -2332,9 +2338,9 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
btrfs_free_chunk_map(map);
break;
}
- if (bg->start != map->start || bg->length != map->chunk_len ||
- (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
- (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ if (unlikely(bg->start != map->start || bg->length != map->chunk_len ||
+ (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
+ (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
map->start, map->chunk_len,
@@ -2373,8 +2379,9 @@ static int read_one_block_group(struct btrfs_fs_info *info,
cache->commit_used = cache->used;
cache->flags = btrfs_stack_block_group_flags(bgi);
cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
+ cache->space_info = btrfs_find_space_info(info, cache->flags);
- set_free_space_tree_thresholds(cache);
+ btrfs_set_free_space_tree_thresholds(cache);
if (need_clear) {
/*
@@ -2451,6 +2458,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
btrfs_remove_free_space_cache(cache);
goto error;
}
+
trace_btrfs_add_block_group(info, cache, 0);
btrfs_add_bg_to_space_info(info, cache);
@@ -2495,6 +2503,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
bg->cached = BTRFS_CACHE_FINISHED;
bg->used = map->chunk_len;
bg->flags = map->type;
+ bg->space_info = btrfs_find_space_info(fs_info, bg->flags);
ret = btrfs_add_block_group_cache(bg);
/*
* We may have some valid block group cache added already, in
@@ -2791,7 +2800,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
block_group->length);
if (ret)
btrfs_abort_transaction(trans, ret);
- add_block_group_free_space(trans, block_group);
+ btrfs_add_block_group_free_space(trans, block_group);
/*
* If we restriped during balance, we may have added a new raid
@@ -2824,7 +2833,7 @@ next:
* space or none at all (due to no need to COW, extent buffers
* were already COWed in the current transaction and still
* unwritten, tree heights lower than the maximum possible
- * height, etc). For data we generally reserve the axact amount
+ * height, etc). For data we generally reserve the exact amount
* of space we are going to allocate later, the exception is
* when using compression, as we must reserve space based on the
* uncompressed data size, because the compression is only done
@@ -2868,8 +2877,8 @@ static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 off
}
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
- u64 type,
- u64 chunk_offset, u64 size)
+ struct btrfs_space_info *space_info,
+ u64 type, u64 chunk_offset, u64 size)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *cache;
@@ -2889,7 +2898,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags);
cache->length = size;
- set_free_space_tree_thresholds(cache);
+ btrfs_set_free_space_tree_thresholds(cache);
cache->flags = type;
cache->cached = BTRFS_CACHE_FINISHED;
cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
@@ -2923,7 +2932,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
* assigned to our block group. We want our bg to be added to the rbtree
* with its ->space_info set.
*/
- cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
+ cache->space_info = space_info;
ASSERT(cache->space_info);
ret = btrfs_add_block_group_cache(cache);
@@ -2968,6 +2977,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
bool do_chunk_alloc)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_space_info *space_info = cache->space_info;
struct btrfs_trans_handle *trans;
struct btrfs_root *root = btrfs_block_group_root(fs_info);
u64 alloc_flags;
@@ -3020,7 +3030,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
*/
alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
if (alloc_flags != cache->flags) {
- ret = btrfs_chunk_alloc(trans, alloc_flags,
+ ret = btrfs_chunk_alloc(trans, space_info, alloc_flags,
CHUNK_ALLOC_FORCE);
/*
* ENOSPC is allowed here, we may have enough space
@@ -3048,15 +3058,15 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
(cache->flags & BTRFS_BLOCK_GROUP_SYSTEM))
goto unlock_out;
- alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
- ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+ alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
+ ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
/*
* We have allocated a new chunk. We also need to activate that chunk to
* grant metadata tickets for zoned filesystem.
*/
- ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true);
+ ret = btrfs_zoned_activate_one_bg(space_info, true);
if (ret < 0)
goto out;
@@ -3232,7 +3242,7 @@ again:
*/
BTRFS_I(inode)->generation = 0;
ret = btrfs_update_inode(trans, BTRFS_I(inode));
- if (ret) {
+ if (unlikely(ret)) {
/*
* So theoretically we could recover from this, simply set the
* super cache generation to 0 so we know to invalidate the
@@ -3635,9 +3645,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
ret = update_block_group_item(trans, path, cache);
- }
- if (ret)
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ } else if (ret) {
btrfs_abort_transaction(trans, ret);
+ }
}
/* If its not on the io list, we need to put the block group */
@@ -3738,8 +3750,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
- set_extent_bit(&trans->transaction->pinned_extents, bytenr,
- bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
+ btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr,
+ bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
}
spin_lock(&trans->transaction->dirty_bgs_lock);
@@ -3785,7 +3797,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
* reservation and return -EAGAIN, otherwise this function always succeeds.
*/
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
- u64 ram_bytes, u64 num_bytes, int delalloc,
+ u64 ram_bytes, u64 num_bytes, bool delalloc,
bool force_wrong_size_class)
{
struct btrfs_space_info *space_info = cache->space_info;
@@ -3796,30 +3808,38 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
spin_lock(&cache->lock);
if (cache->ro) {
ret = -EAGAIN;
- goto out;
+ goto out_error;
}
if (btrfs_block_group_should_use_size_class(cache)) {
size_class = btrfs_calc_block_group_size_class(num_bytes);
ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class);
if (ret)
- goto out;
+ goto out_error;
}
+
cache->reserved += num_bytes;
- space_info->bytes_reserved += num_bytes;
+ if (delalloc)
+ cache->delalloc_bytes += num_bytes;
+
trace_btrfs_space_reservation(cache->fs_info, "space_info",
space_info->flags, num_bytes, 1);
+ spin_unlock(&cache->lock);
+
+ space_info->bytes_reserved += num_bytes;
btrfs_space_info_update_bytes_may_use(space_info, -ram_bytes);
- if (delalloc)
- cache->delalloc_bytes += num_bytes;
/*
* Compression can use less space than we reserved, so wake tickets if
* that happens.
*/
if (num_bytes < ram_bytes)
- btrfs_try_granting_tickets(cache->fs_info, space_info);
-out:
+ btrfs_try_granting_tickets(space_info);
+ spin_unlock(&space_info->lock);
+
+ return 0;
+
+out_error:
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
return ret;
@@ -3828,35 +3848,38 @@ out:
/*
* Update the block_group and space info counters.
*
- * @cache: The cache we are manipulating
- * @num_bytes: The number of bytes in question
- * @delalloc: The blocks are allocated for the delalloc write
+ * @cache: The cache we are manipulating.
+ * @num_bytes: The number of bytes in question.
+ * @is_delalloc: Whether the blocks are allocated for a delalloc write.
*
* This is called by somebody who is freeing space that was never actually used
* on disk. For example if you reserve some space for a new leaf in transaction
* A and before transaction A commits you free that leaf, you call this in
* order to clear the reservation.
*/
-void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
- u64 num_bytes, int delalloc)
+void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes,
+ bool is_delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
+ bool bg_ro;
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
- if (cache->ro)
+ bg_ro = cache->ro;
+ cache->reserved -= num_bytes;
+ if (is_delalloc)
+ cache->delalloc_bytes -= num_bytes;
+ spin_unlock(&cache->lock);
+
+ if (bg_ro)
space_info->bytes_readonly += num_bytes;
else if (btrfs_is_zoned(cache->fs_info))
space_info->bytes_zone_unusable += num_bytes;
- cache->reserved -= num_bytes;
+
space_info->bytes_reserved -= num_bytes;
space_info->max_extent_size = 0;
- if (delalloc)
- cache->delalloc_bytes -= num_bytes;
- spin_unlock(&cache->lock);
-
- btrfs_try_granting_tickets(cache->fs_info, space_info);
+ btrfs_try_granting_tickets(space_info);
spin_unlock(&space_info->lock);
}
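The kernel-doc above describes the reserve/free pairing only in prose; the sketch below shows a hypothetical caller using the new bool-typed parameters from this patch. It is illustrative only, assuming the usual in-kernel btrfs context (a held block group reference); example_reserve_then_cancel() is not a real call site.

    static int example_reserve_then_cancel(struct btrfs_block_group *bg, u64 len)
    {
            int ret;

            /* Reserve len bytes; not a delalloc write, normal size class. */
            ret = btrfs_add_reserved_bytes(bg, len, len, false, false);
            if (ret)
                    return ret;

            /* ... the planned allocation is abandoned before anything hits disk ... */

            /* Hand the never-used bytes back to the space_info counters. */
            btrfs_free_reserved_bytes(bg, len, false);
            return 0;
    }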
@@ -3871,14 +3894,14 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
}
}
-static int should_alloc_chunk(const struct btrfs_fs_info *fs_info,
- const struct btrfs_space_info *sinfo, int force)
+static bool should_alloc_chunk(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_space_info *sinfo, int force)
{
u64 bytes_used = btrfs_space_info_used(sinfo, false);
u64 thresh;
if (force == CHUNK_ALLOC_FORCE)
- return 1;
+ return true;
/*
* in limited mode, we want to have some free space up to
@@ -3889,22 +3912,31 @@ static int should_alloc_chunk(const struct btrfs_fs_info *fs_info,
thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1));
if (sinfo->total_bytes - bytes_used < thresh)
- return 1;
+ return true;
}
if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80))
- return 0;
- return 1;
+ return false;
+ return true;
}
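A quick numeric reading of should_alloc_chunk()'s 80% heuristic may help; the figures below are made up for illustration.

    /*
     * Illustrative numbers, total_bytes = 100G (so the threshold is 80G):
     *   bytes_used = 70G: 70G + 2M <  80G -> returns false, no new chunk
     *   bytes_used = 85G: 85G + 2M >= 80G -> returns true, try to allocate
     */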
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
{
u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
+ struct btrfs_space_info *space_info;
- return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+ space_info = btrfs_find_space_info(trans->fs_info, type);
+ if (!space_info) {
+ DEBUG_WARN();
+ return -EINVAL;
+ }
+
+ return btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
}
-static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
+static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_space_info *space_info,
+ u64 flags)
{
struct btrfs_block_group *bg;
int ret;
@@ -3917,7 +3949,7 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans
*/
check_system_chunk(trans, flags);
- bg = btrfs_create_chunk(trans, flags);
+ bg = btrfs_create_chunk(trans, space_info, flags);
if (IS_ERR(bg)) {
ret = PTR_ERR(bg);
goto out;
@@ -3965,8 +3997,16 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans
if (ret == -ENOSPC) {
const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
struct btrfs_block_group *sys_bg;
+ struct btrfs_space_info *sys_space_info;
- sys_bg = btrfs_create_chunk(trans, sys_flags);
+ sys_space_info = btrfs_find_space_info(trans->fs_info, sys_flags);
+ if (unlikely(!sys_space_info)) {
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ sys_bg = btrfs_create_chunk(trans, sys_space_info, sys_flags);
if (IS_ERR(sys_bg)) {
ret = PTR_ERR(sys_bg);
btrfs_abort_transaction(trans, ret);
@@ -3974,17 +4014,17 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans
}
ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- } else if (ret) {
+ } else if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -4097,6 +4137,8 @@ out:
*
* This function, btrfs_chunk_alloc(), belongs to phase 1.
*
+ * @space_info: specifies which space_info the new chunk should belong to.
+ *
* If @force is CHUNK_ALLOC_FORCE:
* - return 1 if it successfully allocates a chunk,
* - return errors including -ENOSPC otherwise.
@@ -4105,11 +4147,11 @@ out:
* - return 1 if it successfully allocates a chunk,
* - return errors including -ENOSPC otherwise.
*/
-int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
+int btrfs_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_space_info *space_info, u64 flags,
enum btrfs_chunk_alloc_enum force)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_space_info *space_info;
struct btrfs_block_group *ret_bg;
bool wait_for_alloc = false;
bool should_alloc = false;
@@ -4148,9 +4190,6 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
return -ENOSPC;
- space_info = btrfs_find_space_info(fs_info, flags);
- ASSERT(space_info);
-
do {
spin_lock(&space_info->lock);
if (force < space_info->force_alloc)
@@ -4158,11 +4197,11 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
should_alloc = should_alloc_chunk(fs_info, space_info, force);
if (space_info->full) {
/* No more free physical space */
+ spin_unlock(&space_info->lock);
if (should_alloc)
ret = -ENOSPC;
else
ret = 0;
- spin_unlock(&space_info->lock);
return ret;
} else if (!should_alloc) {
spin_unlock(&space_info->lock);
@@ -4174,16 +4213,16 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
* recheck if we should continue with our allocation
* attempt.
*/
+ spin_unlock(&space_info->lock);
wait_for_alloc = true;
force = CHUNK_ALLOC_NO_FORCE;
- spin_unlock(&space_info->lock);
mutex_lock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->chunk_mutex);
} else {
/* Proceed with allocation */
- space_info->chunk_alloc = 1;
- wait_for_alloc = false;
+ space_info->chunk_alloc = true;
spin_unlock(&space_info->lock);
+ wait_for_alloc = false;
}
cond_resched();
@@ -4211,7 +4250,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
force_metadata_allocation(fs_info);
}
- ret_bg = do_chunk_alloc(trans, flags);
+ ret_bg = do_chunk_alloc(trans, space_info, flags);
trans->allocating_chunk = false;
if (IS_ERR(ret_bg)) {
@@ -4230,7 +4269,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
spin_lock(&space_info->lock);
if (ret < 0) {
if (ret == -ENOSPC)
- space_info->full = 1;
+ space_info->full = true;
else
goto out;
} else {
@@ -4240,7 +4279,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
out:
- space_info->chunk_alloc = 0;
+ space_info->chunk_alloc = false;
spin_unlock(&space_info->lock);
mutex_unlock(&fs_info->chunk_mutex);
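Given the doc comment above, a caller of the reworked btrfs_chunk_alloc() now supplies the space_info explicitly and interprets the 0/1/negative return convention roughly as below. This is a hedged sketch, not a real call site; "trans" and "flags" are assumed from the surrounding context.

    struct btrfs_space_info *sinfo = btrfs_find_space_info(trans->fs_info, flags);
    int ret;

    ASSERT(sinfo);
    ret = btrfs_chunk_alloc(trans, sinfo, flags, CHUNK_ALLOC_NO_FORCE);
    if (ret < 0 && ret != -ENOSPC)
            return ret;     /* hard error */
    /* ret == 1: a chunk was allocated; ret == 0 or -ENOSPC: nothing new. */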
@@ -4281,12 +4320,16 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans,
if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
left, bytes, type);
- btrfs_dump_space_info(fs_info, info, 0, 0);
+ btrfs_dump_space_info(info, 0, false);
}
if (left < bytes) {
u64 flags = btrfs_system_alloc_profile(fs_info);
struct btrfs_block_group *bg;
+ struct btrfs_space_info *space_info;
+
+ space_info = btrfs_find_space_info(fs_info, flags);
+ ASSERT(space_info);
/*
* Ignore failure to create system chunk. We might end up not
@@ -4294,7 +4337,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans,
* the paths we visit in the chunk tree (they were already COWed
* or created in the current transaction for example).
*/
- bg = btrfs_create_chunk(trans, flags);
+ bg = btrfs_create_chunk(trans, space_info, flags);
if (IS_ERR(bg)) {
ret = PTR_ERR(bg);
} else {
@@ -4302,7 +4345,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans,
* We have a new chunk. We also need to activate it for
* zoned filesystem.
*/
- ret = btrfs_zoned_activate_one_bg(fs_info, info, true);
+ ret = btrfs_zoned_activate_one_bg(info, true);
if (ret < 0)
return;
@@ -4402,6 +4445,43 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
}
}
+static void check_removing_space_info(struct btrfs_space_info *space_info)
+{
+ struct btrfs_fs_info *info = space_info->fs_info;
+
+ if (space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY) {
+ /* This is a top space_info, proceed with its children first. */
+ for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) {
+ if (space_info->sub_group[i]) {
+ check_removing_space_info(space_info->sub_group[i]);
+ kfree(space_info->sub_group[i]);
+ space_info->sub_group[i] = NULL;
+ }
+ }
+ }
+
+ /*
+ * Do not hide this behind enospc_debug; this is actually important and
+ * indicates a real bug if it happens.
+ */
+ if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_may_use > 0))
+ btrfs_dump_space_info(space_info, 0, false);
+
+ /*
+ * If there was a failure to cleanup a log tree, very likely due to an
+ * IO failure on a writeback attempt of one or more of its extent
+ * buffers, we could not do proper (and cheap) unaccounting of their
+ * reserved space, so don't warn on bytes_reserved > 0 in that case.
+ */
+ if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
+ if (WARN_ON(space_info->bytes_reserved > 0))
+ btrfs_dump_space_info(space_info, 0, false);
+ }
+
+ WARN_ON(space_info->reclaim_size > 0);
+}
+
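check_removing_space_info() leak-checks and frees the children before falling through to the primary; the layout it expects looks roughly like the sketch below. The TREELOG subgroup is inferred from btrfs_init_global_block_rsv() later in this series, so treat it as an assumption.

    /*
     * metadata space_info (subgroup_id == BTRFS_SUB_GROUP_PRIMARY)
     *     sub_group[0] == BTRFS_SUB_GROUP_TREELOG   (zoned mode only)
     *
     * Each child is leak-checked and kfree()d first; the caller,
     * btrfs_free_block_groups(), then unlinks and frees the primary.
     */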
/*
* Must be called only after stopping all workers, since we could have block
* group caching kthreads running, and therefore they could race with us if we
@@ -4427,8 +4507,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
write_lock(&info->block_group_cache_lock);
while (!list_empty(&info->caching_block_groups)) {
- caching_ctl = list_entry(info->caching_block_groups.next,
- struct btrfs_caching_control, list);
+ caching_ctl = list_first_entry(&info->caching_block_groups,
+ struct btrfs_caching_control, list);
list_del(&caching_ctl->list);
btrfs_put_caching_control(caching_ctl);
}
@@ -4499,32 +4579,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
btrfs_release_global_block_rsv(info);
while (!list_empty(&info->space_info)) {
- space_info = list_entry(info->space_info.next,
- struct btrfs_space_info,
- list);
-
- /*
- * Do not hide this behind enospc_debug, this is actually
- * important and indicates a real bug if this happens.
- */
- if (WARN_ON(space_info->bytes_pinned > 0 ||
- space_info->bytes_may_use > 0))
- btrfs_dump_space_info(info, space_info, 0, 0);
-
- /*
- * If there was a failure to cleanup a log tree, very likely due
- * to an IO failure on a writeback attempt of one or more of its
- * extent buffers, we could not do proper (and cheap) unaccounting
- * of their reserved space, so don't warn on bytes_reserved > 0 in
- * that case.
- */
- if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
- !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
- if (WARN_ON(space_info->bytes_reserved > 0))
- btrfs_dump_space_info(info, space_info, 0, 0);
- }
+ space_info = list_first_entry(&info->space_info,
+ struct btrfs_space_info, list);
- WARN_ON(space_info->reclaim_size > 0);
+ check_removing_space_info(space_info);
list_del(&space_info->list);
btrfs_sysfs_remove_space_info(space_info);
}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 36937eeab9b8..5f933455118c 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -63,7 +63,7 @@ enum btrfs_discard_state {
* CHUNK_ALLOC_FORCE means it must try to allocate one
*
* CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from
- * find_free_extent() that also activaes the zone
+ * find_free_extent() that also activates the zone
*/
enum btrfs_chunk_alloc_enum {
CHUNK_ALLOC_NO_FORCE,
@@ -83,6 +83,8 @@ enum btrfs_block_group_flags {
BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
/* Does the block group need to be added to the free space tree? */
BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
+ /* Set after we add a new block group to the free space tree. */
+ BLOCK_GROUP_FLAG_FREE_SPACE_ADDED,
/* Indicate that the block group is placed on a sequential zone */
BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE,
/*
@@ -244,6 +246,11 @@ struct btrfs_block_group {
/* Lock for free space tree operations. */
struct mutex free_space_lock;
+ /* Protected by @free_space_lock. */
+ bool using_free_space_bitmaps;
+ /* Protected by @free_space_lock. */
+ bool using_free_space_bitmaps_cached;
+
/*
* Number of extents in this block group used for swap files.
* All accesses protected by the spinlock 'lock'.
@@ -326,8 +333,8 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
int btrfs_read_block_groups(struct btrfs_fs_info *info);
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
- u64 type,
- u64 chunk_offset, u64 size);
+ struct btrfs_space_info *space_info,
+ u64 type, u64 chunk_offset, u64 size);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
bool do_chunk_alloc);
@@ -338,11 +345,12 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, bool alloc);
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
- u64 ram_bytes, u64 num_bytes, int delalloc,
+ u64 ram_bytes, u64 num_bytes, bool delalloc,
bool force_wrong_size_class);
-void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
- u64 num_bytes, int delalloc);
-int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
+void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes,
+ bool is_delalloc);
+int btrfs_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_space_info *space_info, u64 flags,
enum btrfs_chunk_alloc_enum force);
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 3f3608299c0b..96cf7a162987 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -218,8 +218,7 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
if (num_bytes == 0)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
- num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, num_bytes, flush);
if (!ret)
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
@@ -259,8 +258,7 @@ int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
if (!ret)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
- num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, num_bytes, flush);
if (!ret) {
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
return 0;
@@ -387,7 +385,7 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
num_bytes = block_rsv->reserved - block_rsv->size;
btrfs_space_info_update_bytes_may_use(sinfo, -num_bytes);
block_rsv->reserved = block_rsv->size;
- btrfs_try_granting_tickets(fs_info, sinfo);
+ btrfs_try_granting_tickets(sinfo);
}
block_rsv->full = (block_rsv->reserved == block_rsv->size);
@@ -418,6 +416,9 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root)
case BTRFS_CHUNK_TREE_OBJECTID:
root->block_rsv = &fs_info->chunk_block_rsv;
break;
+ case BTRFS_TREE_LOG_OBJECTID:
+ root->block_rsv = &fs_info->treelog_rsv;
+ break;
default:
root->block_rsv = NULL;
break;
@@ -438,6 +439,14 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
fs_info->delayed_block_rsv.space_info = space_info;
fs_info->delayed_refs_rsv.space_info = space_info;
+ /* The treelog_rsv uses a dedicated space_info in zoned mode. */
+ if (!btrfs_is_zoned(fs_info)) {
+ fs_info->treelog_rsv.space_info = space_info;
+ } else {
+ ASSERT(space_info->sub_group[0]->subgroup_id == BTRFS_SUB_GROUP_TREELOG);
+ fs_info->treelog_rsv.space_info = space_info->sub_group[0];
+ }
+
btrfs_update_global_block_rsv(fs_info);
}
@@ -519,8 +528,8 @@ again:
block_rsv->type, ret);
}
try_reserve:
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
- blocksize, BTRFS_RESERVE_NO_FLUSH);
+ ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, blocksize,
+ BTRFS_RESERVE_NO_FLUSH);
if (!ret)
return block_rsv;
/*
@@ -541,7 +550,7 @@ try_reserve:
* one last time to force a reservation if there's enough actual space
* on disk to make the reservation.
*/
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, blocksize,
+ ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, blocksize,
BTRFS_RESERVE_FLUSH_EMERGENCY);
if (!ret)
return block_rsv;
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index d12b1fac5c74..79ae9d05cd91 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -24,6 +24,7 @@ enum btrfs_rsv_type {
BTRFS_BLOCK_RSV_CHUNK,
BTRFS_BLOCK_RSV_DELOPS,
BTRFS_BLOCK_RSV_DELREFS,
+ BTRFS_BLOCK_RSV_TREELOG,
BTRFS_BLOCK_RSV_EMPTY,
BTRFS_BLOCK_RSV_TEMP,
};
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 4e2952cf5766..73602ee8de3f 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -18,20 +18,20 @@
#include <linux/lockdep.h>
#include <uapi/linux/btrfs_tree.h>
#include <trace/events/btrfs.h>
+#include "ctree.h"
#include "block-rsv.h"
#include "extent_map.h"
-#include "extent_io.h"
#include "extent-io-tree.h"
-#include "ordered-data.h"
-#include "delayed-inode.h"
-struct extent_state;
struct posix_acl;
struct iov_iter;
struct writeback_control;
struct btrfs_root;
struct btrfs_fs_info;
struct btrfs_trans_handle;
+struct btrfs_bio;
+struct btrfs_file_extent;
+struct btrfs_delayed_node;
/*
* Since we search a directory based on f_pos (struct dir_context::pos) we have
@@ -248,7 +248,7 @@ struct btrfs_inode {
u64 new_delalloc_bytes;
/*
* The offset of the last dir index key that was logged.
- * This is used only for directories.
+ * This is used only for directories. Protected by 'log_mutex'.
*/
u64 last_dir_index_offset;
};
@@ -338,6 +338,11 @@ struct btrfs_inode {
struct list_head delayed_iput;
struct rw_semaphore i_mmap_lock;
+
+#ifdef CONFIG_FS_VERITY
+ struct fsverity_info *i_verity_info;
+#endif
+
struct inode vfs_inode;
};
@@ -525,14 +530,27 @@ static inline void btrfs_update_inode_mapping_flags(struct btrfs_inode *inode)
mapping_set_stable_writes(inode->vfs_inode.i_mapping);
}
-/* Array of bytes with variable length, hexadecimal format 0x1234 */
-#define CSUM_FMT "0x%*phN"
-#define CSUM_FMT_VALUE(size, bytes) size, bytes
+static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode)
+{
+ /* Metadata inode should not reach here. */
+ ASSERT(is_data_inode(inode));
+
+ /* We only allow BITS_PER_LONG blocks for each bitmap. */
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ mapping_set_folio_order_range(inode->vfs_inode.i_mapping,
+ inode->root->fs_info->block_min_order,
+ inode->root->fs_info->block_max_order);
+#endif
+}
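As a worked example of the order range set above (illustrative; the exact field values depend on the superblock's block size):

    /*
     * Illustrative: block size 16K on 4K pages gives
     *   block_min_order = 2  -> smallest folio is 4K << 2 = 16K,
     * so the page cache hands this inode only folios of at least one
     * block, keeping every folio block-aligned; block_max_order bounds
     * the largest folio the mapping may use.
     */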
-int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
- u32 pgoff, u8 *csum, const u8 * const csum_expected);
+void btrfs_calculate_block_csum_folio(struct btrfs_fs_info *fs_info,
+ const phys_addr_t paddr, u8 *dest);
+void btrfs_calculate_block_csum_pages(struct btrfs_fs_info *fs_info,
+ const phys_addr_t paddrs[], u8 *dest);
+int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum,
+ const u8 * const csum_expected);
bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
- u32 bio_offset, struct bio_vec *bv);
+ u32 bio_offset, const phys_addr_t paddrs[]);
noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
struct btrfs_file_extent *file_extent,
bool nowait);
@@ -545,10 +563,9 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name);
int btrfs_add_link(struct btrfs_trans_handle *trans,
struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
- const struct fscrypt_str *name, int add_backref, u64 index);
+ const struct fscrypt_str *name, bool add_backref, u64 index);
int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry);
-int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
- int front);
+int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end);
int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index e7f8ee5d48a4..6b3357287b42 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -67,9 +67,7 @@ static struct compressed_bio *alloc_compressed_bio(struct btrfs_inode *inode,
bbio = btrfs_bio(bio_alloc_bioset(NULL, BTRFS_MAX_COMPRESSED_PAGES, op,
GFP_NOFS, &btrfs_compressed_bioset));
- btrfs_bio_init(bbio, inode->root->fs_info, end_io, NULL);
- bbio->inode = inode;
- bbio->file_offset = start;
+ btrfs_bio_init(bbio, inode, start, end_io, NULL);
return to_compressed_bio(bbio);
}
@@ -90,19 +88,19 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len)
}
static int compression_compress_pages(int type, struct list_head *ws,
- struct address_space *mapping, u64 start,
+ struct btrfs_inode *inode, u64 start,
struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
switch (type) {
case BTRFS_COMPRESS_ZLIB:
- return zlib_compress_folios(ws, mapping, start, folios,
+ return zlib_compress_folios(ws, inode, start, folios,
out_folios, total_in, total_out);
case BTRFS_COMPRESS_LZO:
- return lzo_compress_folios(ws, mapping, start, folios,
+ return lzo_compress_folios(ws, inode, start, folios,
out_folios, total_in, total_out);
case BTRFS_COMPRESS_ZSTD:
- return zstd_compress_folios(ws, mapping, start, folios,
+ return zstd_compress_folios(ws, inode, start, folios,
out_folios, total_in, total_out);
case BTRFS_COMPRESS_NONE:
default:
@@ -194,15 +192,13 @@ static unsigned long btrfs_compr_pool_count(struct shrinker *sh, struct shrink_c
static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_control *sc)
{
- struct list_head remove;
+ LIST_HEAD(remove);
struct list_head *tmp, *next;
int freed;
if (compr_pool.count == 0)
return SHRINK_STOP;
- INIT_LIST_HEAD(&remove);
-
/* For now, just simply drain the whole list. */
spin_lock(&compr_pool.lock);
list_splice_init(&compr_pool.list, &remove);
@@ -223,10 +219,14 @@ static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_co
/*
* Common wrappers for page allocation from compression wrappers
*/
-struct folio *btrfs_alloc_compr_folio(void)
+struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info)
{
struct folio *folio = NULL;
+ /* For bs > ps cases, no cached folio pool for now. */
+ if (fs_info->block_min_order)
+ goto alloc;
+
spin_lock(&compr_pool.lock);
if (compr_pool.count > 0) {
folio = list_first_entry(&compr_pool.list, struct folio, lru);
@@ -238,13 +238,18 @@ struct folio *btrfs_alloc_compr_folio(void)
if (folio)
return folio;
- return folio_alloc(GFP_NOFS, 0);
+alloc:
+ return folio_alloc(GFP_NOFS, fs_info->block_min_order);
}
void btrfs_free_compr_folio(struct folio *folio)
{
bool do_free = false;
+ /* The folio is from a bs > ps fs; no cached pool for now. */
+ if (folio_order(folio))
+ goto free;
+
spin_lock(&compr_pool.lock);
if (compr_pool.count > compr_pool.thresh) {
do_free = true;
@@ -257,6 +262,7 @@ void btrfs_free_compr_folio(struct folio *folio)
if (!do_free)
return;
+free:
ASSERT(folio_ref_count(folio) == 1);
folio_put(folio);
}
@@ -282,15 +288,15 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb)
{
struct inode *inode = &cb->bbio.inode->vfs_inode;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- unsigned long index = cb->start >> PAGE_SHIFT;
- unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
+ pgoff_t index = cb->start >> PAGE_SHIFT;
+ const pgoff_t end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
struct folio_batch fbatch;
- const int error = blk_status_to_errno(cb->bbio.bio.bi_status);
int i;
int ret;
- if (error)
- mapping_set_error(inode->i_mapping, error);
+ ret = blk_status_to_errno(cb->bbio.bio.bi_status);
+ if (ret)
+ mapping_set_error(inode->i_mapping, ret);
folio_batch_init(&fbatch);
while (index <= end_index) {
@@ -311,22 +317,6 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb)
/* the inode may be gone now */
}
-static void btrfs_finish_compressed_write_work(struct work_struct *work)
-{
- struct compressed_bio *cb =
- container_of(work, struct compressed_bio, write_end_work);
-
- btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len,
- cb->bbio.bio.bi_status == BLK_STS_OK);
-
- if (cb->writeback)
- end_compressed_writeback(cb);
- /* Note, our inode could be gone now */
-
- btrfs_free_compressed_folios(cb);
- bio_put(&cb->bbio.bio);
-}
-
/*
* Do the cleanup once all the compressed pages hit the disk. This will clear
* writeback on the file pages and free the compressed pages.
@@ -337,25 +327,33 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
static void end_bbio_compressed_write(struct btrfs_bio *bbio)
{
struct compressed_bio *cb = to_compressed_bio(bbio);
- struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
- queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
+ btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len,
+ cb->bbio.bio.bi_status == BLK_STS_OK);
+
+ if (cb->writeback)
+ end_compressed_writeback(cb);
+ /* Note, our inode could be gone now. */
+ btrfs_free_compressed_folios(cb);
+ bio_put(&cb->bbio.bio);
}
static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb)
{
struct bio *bio = &cb->bbio.bio;
u32 offset = 0;
+ unsigned int findex = 0;
while (offset < cb->compressed_len) {
+ struct folio *folio = cb->compressed_folios[findex];
+ u32 len = min_t(u32, cb->compressed_len - offset, folio_size(folio));
int ret;
- u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE);
/* Maximum compressed extent is smaller than bio size limit. */
- ret = bio_add_folio(bio, cb->compressed_folios[offset >> PAGE_SHIFT],
- len, 0);
+ ret = bio_add_folio(bio, folio, len, 0);
ASSERT(ret);
offset += len;
+ findex++;
}
}
@@ -389,7 +387,6 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
cb->compressed_folios = compressed_folios;
cb->compressed_len = ordered->disk_num_bytes;
cb->writeback = writeback;
- INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
cb->nr_folios = nr_folios;
cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT;
cb->bbio.ordered = ordered;
@@ -415,7 +412,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
int *memstall, unsigned long *pflags)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- unsigned long end_index;
+ pgoff_t end_index;
struct bio *orig_bio = &cb->orig_bbio->bio;
u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size;
u64 isize = i_size_read(inode);
@@ -443,11 +440,15 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (fs_info->sectorsize < PAGE_SIZE)
return 0;
+ /* For bs > ps cases, we don't support readahead for compressed folios for now. */
+ if (fs_info->block_min_order)
+ return 0;
+
end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
while (cur < compressed_end) {
- u64 page_end;
- u64 pg_index = cur >> PAGE_SHIFT;
+ pgoff_t page_end;
+ pgoff_t pg_index = cur >> PAGE_SHIFT;
u32 add_size;
if (pg_index > end_index)
@@ -474,8 +475,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
continue;
}
- folio = filemap_alloc_folio(mapping_gfp_constraint(mapping,
- ~__GFP_FS), 0);
+ folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, ~__GFP_FS),
+ 0, NULL);
if (!folio)
break;
@@ -499,9 +500,9 @@ static noinline int add_ra_bio_pages(struct inode *inode,
}
page_end = (pg_index << PAGE_SHIFT) + folio_size(folio) - 1;
- lock_extent(tree, cur, page_end, NULL);
+ btrfs_lock_extent(tree, cur, page_end, NULL);
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
+ em = btrfs_lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
read_unlock(&em_tree->lock);
/*
@@ -510,20 +511,20 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* to this compressed extent on disk.
*/
if (!em || cur < em->start ||
- (cur + fs_info->sectorsize > extent_map_end(em)) ||
- (extent_map_block_start(em) >> SECTOR_SHIFT) !=
+ (cur + fs_info->sectorsize > btrfs_extent_map_end(em)) ||
+ (btrfs_extent_map_block_start(em) >> SECTOR_SHIFT) !=
orig_bio->bi_iter.bi_sector) {
- free_extent_map(em);
- unlock_extent(tree, cur, page_end, NULL);
+ btrfs_free_extent_map(em);
+ btrfs_unlock_extent(tree, cur, page_end, NULL);
folio_unlock(folio);
folio_put(folio);
break;
}
add_size = min(em->start + em->len, page_end + 1) - cur;
- free_extent_map(em);
- unlock_extent(tree, cur, page_end, NULL);
+ btrfs_free_extent_map(em);
+ btrfs_unlock_extent(tree, cur, page_end, NULL);
- if (folio->index == end_index) {
+ if (folio_contains(folio, end_index)) {
size_t zero_offset = offset_in_folio(folio, isize);
if (zero_offset) {
@@ -576,19 +577,19 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
struct extent_map *em;
unsigned long pflags;
int memstall = 0;
- blk_status_t ret;
- int ret2;
+ blk_status_t status;
+ int ret;
/* we need the actual starting offset of this extent in the file */
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
+ em = btrfs_lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
read_unlock(&em_tree->lock);
if (!em) {
- ret = BLK_STS_IOERR;
+ status = BLK_STS_IOERR;
goto out;
}
- ASSERT(extent_map_is_compressed(em));
+ ASSERT(btrfs_extent_map_is_compressed(em));
compressed_len = em->disk_num_bytes;
cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ,
@@ -600,21 +601,23 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
cb->len = bbio->bio.bi_iter.bi_size;
cb->compressed_len = compressed_len;
- cb->compress_type = extent_map_compression(em);
+ cb->compress_type = btrfs_extent_map_compression(em);
cb->orig_bbio = bbio;
+ cb->bbio.csum_search_commit_root = bbio->csum_search_commit_root;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
- cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct page *), GFP_NOFS);
+ cb->nr_folios = DIV_ROUND_UP(compressed_len, btrfs_min_folio_size(fs_info));
+ cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS);
if (!cb->compressed_folios) {
- ret = BLK_STS_RESOURCE;
+ status = BLK_STS_RESOURCE;
goto out_free_bio;
}
- ret2 = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios);
- if (ret2) {
- ret = BLK_STS_RESOURCE;
+ ret = btrfs_alloc_folio_array(cb->nr_folios, fs_info->block_min_order,
+ cb->compressed_folios);
+ if (ret) {
+ status = BLK_STS_RESOURCE;
goto out_free_compressed_pages;
}
@@ -637,7 +640,7 @@ out_free_compressed_pages:
out_free_bio:
bio_put(&cb->bbio.bio);
out:
- btrfs_bio_end_io(bbio, ret);
+ btrfs_bio_end_io(bbio, status);
}
/*
@@ -687,8 +690,6 @@ struct heuristic_ws {
struct list_head list;
};
-static struct workspace_manager heuristic_wsm;
-
static void free_heuristic_ws(struct list_head *ws)
{
struct heuristic_ws *workspace;
@@ -701,7 +702,7 @@ static void free_heuristic_ws(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *alloc_heuristic_ws(void)
+static struct list_head *alloc_heuristic_ws(struct btrfs_fs_info *fs_info)
{
struct heuristic_ws *ws;
@@ -728,11 +729,9 @@ fail:
return ERR_PTR(-ENOMEM);
}
-const struct btrfs_compress_op btrfs_heuristic_compress = {
- .workspace_manager = &heuristic_wsm,
-};
+const struct btrfs_compress_levels btrfs_heuristic_compress = { 0 };
-static const struct btrfs_compress_op * const btrfs_compress_op[] = {
+static const struct btrfs_compress_levels * const btrfs_compress_levels[] = {
/* The heuristic is represented as compression type 0 */
&btrfs_heuristic_compress,
&btrfs_zlib_compress,
@@ -740,13 +739,13 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = {
&btrfs_zstd_compress,
};
-static struct list_head *alloc_workspace(int type, int level)
+static struct list_head *alloc_workspace(struct btrfs_fs_info *fs_info, int type, int level)
{
switch (type) {
- case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws();
- case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
- case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace();
- case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
+ case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(fs_info);
+ case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(fs_info, level);
+ case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(fs_info);
+ case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(fs_info, level);
default:
/*
* This can't happen, the type is validated several times
@@ -772,44 +771,58 @@ static void free_workspace(int type, struct list_head *ws)
}
}
-static void btrfs_init_workspace_manager(int type)
+static int alloc_workspace_manager(struct btrfs_fs_info *fs_info,
+ enum btrfs_compression_type type)
{
- struct workspace_manager *wsm;
+ struct workspace_manager *gwsm;
struct list_head *workspace;
- wsm = btrfs_compress_op[type]->workspace_manager;
- INIT_LIST_HEAD(&wsm->idle_ws);
- spin_lock_init(&wsm->ws_lock);
- atomic_set(&wsm->total_ws, 0);
- init_waitqueue_head(&wsm->ws_wait);
+ ASSERT(fs_info->compr_wsm[type] == NULL);
+ gwsm = kzalloc(sizeof(*gwsm), GFP_KERNEL);
+ if (!gwsm)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&gwsm->idle_ws);
+ spin_lock_init(&gwsm->ws_lock);
+ atomic_set(&gwsm->total_ws, 0);
+ init_waitqueue_head(&gwsm->ws_wait);
+ fs_info->compr_wsm[type] = gwsm;
/*
* Preallocate one workspace for each compression type so we can
* guarantee forward progress in the worst case
*/
- workspace = alloc_workspace(type, 0);
+ workspace = alloc_workspace(fs_info, type, 0);
if (IS_ERR(workspace)) {
- pr_warn(
- "BTRFS: cannot preallocate compression workspace, will try later\n");
+ btrfs_warn(fs_info,
+ "cannot preallocate compression workspace for %s, will try later",
+ btrfs_compress_type2str(type));
} else {
- atomic_set(&wsm->total_ws, 1);
- wsm->free_ws = 1;
- list_add(workspace, &wsm->idle_ws);
+ atomic_set(&gwsm->total_ws, 1);
+ gwsm->free_ws = 1;
+ list_add(workspace, &gwsm->idle_ws);
}
+ return 0;
}
-static void btrfs_cleanup_workspace_manager(int type)
+static void free_workspace_manager(struct btrfs_fs_info *fs_info,
+ enum btrfs_compression_type type)
{
- struct workspace_manager *wsman;
struct list_head *ws;
+ struct workspace_manager *gwsm = fs_info->compr_wsm[type];
- wsman = btrfs_compress_op[type]->workspace_manager;
- while (!list_empty(&wsman->idle_ws)) {
- ws = wsman->idle_ws.next;
+ /* ZSTD uses its own workspace manager, should not enter here. */
+ ASSERT(type != BTRFS_COMPRESS_ZSTD && type < BTRFS_NR_COMPRESS_TYPES);
+ if (!gwsm)
+ return;
+ fs_info->compr_wsm[type] = NULL;
+ while (!list_empty(&gwsm->idle_ws)) {
+ ws = gwsm->idle_ws.next;
list_del(ws);
free_workspace(type, ws);
- atomic_dec(&wsman->total_ws);
+ atomic_dec(&gwsm->total_ws);
}
+ kfree(gwsm);
}
/*
@@ -818,9 +831,9 @@ static void btrfs_cleanup_workspace_manager(int type)
* Preallocation makes a forward progress guarantee and we do not return
* errors.
*/
-struct list_head *btrfs_get_workspace(int type, int level)
+struct list_head *btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, int level)
{
- struct workspace_manager *wsm;
+ struct workspace_manager *wsm = fs_info->compr_wsm[type];
struct list_head *workspace;
int cpus = num_online_cpus();
unsigned nofs_flag;
@@ -830,7 +843,7 @@ struct list_head *btrfs_get_workspace(int type, int level)
wait_queue_head_t *ws_wait;
int *free_ws;
- wsm = btrfs_compress_op[type]->workspace_manager;
+ ASSERT(wsm);
idle_ws = &wsm->idle_ws;
ws_lock = &wsm->ws_lock;
total_ws = &wsm->total_ws;
@@ -866,7 +879,7 @@ again:
* context of btrfs_compress_bio/btrfs_compress_pages
*/
nofs_flag = memalloc_nofs_save();
- workspace = alloc_workspace(type, level);
+ workspace = alloc_workspace(fs_info, type, level);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(workspace)) {
@@ -888,22 +901,22 @@ again:
/* once per minute */ 60 * HZ,
/* no burst */ 1);
- if (__ratelimit(&_rs)) {
- pr_warn("BTRFS: no compression workspaces, low memory, retrying\n");
- }
+ if (__ratelimit(&_rs))
+ btrfs_warn(fs_info,
+ "no compression workspaces, low memory, retrying");
}
goto again;
}
return workspace;
}
-static struct list_head *get_workspace(int type, int level)
+static struct list_head *get_workspace(struct btrfs_fs_info *fs_info, int type, int level)
{
switch (type) {
- case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level);
- case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level);
- case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(type, level);
- case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level);
+ case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(fs_info, type, level);
+ case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(fs_info, level);
+ case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(fs_info, type, level);
+ case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(fs_info, level);
default:
/*
* This can't happen, the type is validated several times
@@ -917,21 +930,21 @@ static struct list_head *get_workspace(int type, int level)
* put a workspace struct back on the list or free it if we have enough
* idle ones sitting around
*/
-void btrfs_put_workspace(int type, struct list_head *ws)
+void btrfs_put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws)
{
- struct workspace_manager *wsm;
+ struct workspace_manager *gwsm = fs_info->compr_wsm[type];
struct list_head *idle_ws;
spinlock_t *ws_lock;
atomic_t *total_ws;
wait_queue_head_t *ws_wait;
int *free_ws;
- wsm = btrfs_compress_op[type]->workspace_manager;
- idle_ws = &wsm->idle_ws;
- ws_lock = &wsm->ws_lock;
- total_ws = &wsm->total_ws;
- ws_wait = &wsm->ws_wait;
- free_ws = &wsm->free_ws;
+ ASSERT(gwsm);
+ idle_ws = &gwsm->idle_ws;
+ ws_lock = &gwsm->ws_lock;
+ total_ws = &gwsm->total_ws;
+ ws_wait = &gwsm->ws_wait;
+ free_ws = &gwsm->free_ws;
spin_lock(ws_lock);
if (*free_ws <= num_online_cpus()) {
@@ -948,13 +961,13 @@ wake:
cond_wake_up(ws_wait);
}
-static void put_workspace(int type, struct list_head *ws)
+static void put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws)
{
switch (type) {
- case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws);
- case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws);
- case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(type, ws);
- case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws);
+ case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(fs_info, type, ws);
+ case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(fs_info, type, ws);
+ case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(fs_info, type, ws);
+ case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(fs_info, ws);
default:
/*
* This can't happen, the type is validated several times
@@ -970,12 +983,12 @@ static void put_workspace(int type, struct list_head *ws)
*/
static int btrfs_compress_set_level(unsigned int type, int level)
{
- const struct btrfs_compress_op *ops = btrfs_compress_op[type];
+ const struct btrfs_compress_levels *levels = btrfs_compress_levels[type];
if (level == 0)
- level = ops->default_level;
+ level = levels->default_level;
else
- level = min(max(level, ops->min_level), ops->max_level);
+ level = clamp(level, levels->min_level, levels->max_level);
return level;
}
@@ -985,9 +998,9 @@ static int btrfs_compress_set_level(unsigned int type, int level)
*/
bool btrfs_compress_level_valid(unsigned int type, int level)
{
- const struct btrfs_compress_op *ops = btrfs_compress_op[type];
+ const struct btrfs_compress_levels *levels = btrfs_compress_levels[type];
- return ops->min_level <= level && level <= ops->max_level;
+ return levels->min_level <= level && level <= levels->max_level;
}
/* Wrapper around find_get_page(), with extra error message. */
@@ -1022,44 +1035,46 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
* - compression algo are 0-3
* - the level are bits 4-7
*
- * @out_pages is an in/out parameter, holds maximum number of pages to allocate
- * and returns number of actually allocated pages
+ * @out_folios is an in/out parameter: it holds the maximum number of folios to
+ * allocate and returns the number of actually allocated folios
*
* @total_in is used to return the number of bytes actually read. It
* may be smaller than the input length if we had to exit early because we
- * ran out of room in the pages array or because we cross the
+ * ran out of room in the folios array or because we crossed the
* max_out threshold.
*
* @total_out is an in/out parameter, must be set to the input length and will
* be also used to return the total number of compressed bytes
*/
-int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping,
+int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
const unsigned long orig_len = *total_out;
struct list_head *workspace;
int ret;
level = btrfs_compress_set_level(type, level);
- workspace = get_workspace(type, level);
- ret = compression_compress_pages(type, workspace, mapping, start, folios,
+ workspace = get_workspace(fs_info, type, level);
+ ret = compression_compress_pages(type, workspace, inode, start, folios,
out_folios, total_in, total_out);
/* The total read-in bytes should be no larger than the input. */
ASSERT(*total_in <= orig_len);
- put_workspace(type, workspace);
+ put_workspace(fs_info, type, workspace);
return ret;
}
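To make the in/out parameter contract above concrete, here is a hedged caller sketch; names such as "inode", "start" and "folios" are assumed from context, and BTRFS_MAX_COMPRESSED_PAGES is used as the folio array capacity.

    unsigned long nr_folios = BTRFS_MAX_COMPRESSED_PAGES; /* in: capacity */
    unsigned long total_in = 0;                           /* out: bytes read */
    unsigned long total_out = SZ_128K;                    /* in: input length */
    int ret;

    ret = btrfs_compress_folios(BTRFS_COMPRESS_ZSTD, 3, inode, start,
                                folios, &nr_folios, &total_in, &total_out);
    /* On return: nr_folios = folios actually allocated,
     * total_in = input bytes consumed, total_out = compressed size. */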
static int btrfs_decompress_bio(struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
struct list_head *workspace;
int ret;
int type = cb->compress_type;
- workspace = get_workspace(type, 0);
+ workspace = get_workspace(fs_info, type, 0);
ret = compression_decompress_bio(workspace, cb);
- put_workspace(type, workspace);
+ put_workspace(fs_info, type, workspace);
if (!ret)
zero_fill_bio(&cb->orig_bbio->bio);
@@ -1069,7 +1084,8 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
/*
* a less complex decompression routine. Our compressed data fits in a
* single page, and we want to read a single page out of it.
- * start_byte tells us the offset into the compressed data we're interested in
+ * dest_pgoff tells us the offset into the destination folio where we write the
+ * decompressed data.
*/
int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
unsigned long dest_pgoff, size_t srclen, size_t destlen)
@@ -1080,20 +1096,50 @@ int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
int ret;
/*
- * The full destination page range should not exceed the page size.
+ * The full destination folio range should not exceed the folio size.
* And the @destlen should not exceed sectorsize, as this is only called for
* inline file extents, which should not exceed sectorsize.
*/
- ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize);
+ ASSERT(dest_pgoff + destlen <= folio_size(dest_folio) && destlen <= sectorsize);
- workspace = get_workspace(type, 0);
+ workspace = get_workspace(fs_info, type, 0);
ret = compression_decompress(type, workspace, data_in, dest_folio,
dest_pgoff, srclen, destlen);
- put_workspace(type, workspace);
+ put_workspace(fs_info, type, workspace);
return ret;
}
+int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+
+ ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_NONE);
+ if (ret < 0)
+ goto error;
+ ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB);
+ if (ret < 0)
+ goto error;
+ ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_LZO);
+ if (ret < 0)
+ goto error;
+ ret = zstd_alloc_workspace_manager(fs_info);
+ if (ret < 0)
+ goto error;
+ return 0;
+error:
+ btrfs_free_compress_wsm(fs_info);
+ return ret;
+}
+
+void btrfs_free_compress_wsm(struct btrfs_fs_info *fs_info)
+{
+ free_workspace_manager(fs_info, BTRFS_COMPRESS_NONE);
+ free_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB);
+ free_workspace_manager(fs_info, BTRFS_COMPRESS_LZO);
+ zstd_free_workspace_manager(fs_info);
+}
+
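With workspace managers now per-filesystem, setup and teardown pair at mount time rather than module init/exit. A hypothetical caller (the real call sites live elsewhere in the series) might look like:

    static int example_mount_compression(struct btrfs_fs_info *fs_info)
    {
            int ret;

            ret = btrfs_alloc_compress_wsm(fs_info);  /* one manager per type */
            if (ret)
                    return ret;

            /* ... filesystem lifetime: workspaces come from fs_info->compr_wsm ... */

            btrfs_free_compress_wsm(fs_info);         /* unmount path */
            return 0;
    }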
int __init btrfs_init_compress(void)
{
if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE,
@@ -1105,11 +1151,6 @@ int __init btrfs_init_compress(void)
if (!compr_pool.shrinker)
return -ENOMEM;
- btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
- btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
- btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
- zstd_init_workspace_manager();
-
spin_lock_init(&compr_pool.lock);
INIT_LIST_HEAD(&compr_pool.list);
compr_pool.count = 0;
@@ -1130,14 +1171,26 @@ void __cold btrfs_exit_compress(void)
btrfs_compr_pool_scan(NULL, NULL);
shrinker_free(compr_pool.shrinker);
- btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
- btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
- btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
- zstd_cleanup_workspace_manager();
bioset_exit(&btrfs_compressed_bioset);
}
/*
+ * The bvec is a single-page bvec from a bio that contains folios from a filemap.
+ *
+ * The folio may be a large one, and if the bv_page is not the head page of a
+ * large folio, page->index is unreliable.
+ *
+ * Thus we need this helper to grab the proper file offset.
+ */
+static u64 file_offset_from_bvec(const struct bio_vec *bvec)
+{
+ const struct page *page = bvec->bv_page;
+ const struct folio *folio = page_folio(page);
+
+ return (page_pgoff(folio, page) << PAGE_SHIFT) + bvec->bv_offset;
+}
+
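A worked example of why the helper matters (the numbers are illustrative):

    /*
     * Illustrative: a 16K folio caches file range [0, 16K). A bvec whose
     * bv_page is the folio's second (tail) page covers bytes 4096-8191.
     * page->index is only meaningful on the head page, so computing the
     * offset from bv_page directly could be bogus; page_pgoff() resolves
     * the tail page to pgoff 1, giving
     *   (1 << PAGE_SHIFT) + bv_offset
     * as the correct file offset.
     */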
+/*
* Copy decompressed data from working buffer to pages.
*
* @buf: The decompressed data buffer
@@ -1182,13 +1235,14 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
u32 copy_start;
/* Offset inside the full decompressed extent */
u32 bvec_offset;
+ void *kaddr;
bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter);
/*
* cb->start may underflow, but subtracting that value can still
* give us correct offset inside the full decompressed extent.
*/
- bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start;
+ bvec_offset = file_offset_from_bvec(&bvec) - cb->start;
/* Haven't reached the bvec range, exit */
if (decompressed + buf_len <= bvec_offset)
@@ -1204,10 +1258,12 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
* @buf + @buf_len.
*/
ASSERT(copy_start - decompressed < buf_len);
- memcpy_to_page(bvec.bv_page, bvec.bv_offset,
- buf + copy_start - decompressed, copy_len);
- cur_offset += copy_len;
+ kaddr = bvec_kmap_local(&bvec);
+ memcpy(kaddr, buf + copy_start - decompressed, copy_len);
+ kunmap_local(kaddr);
+
+ cur_offset += copy_len;
bio_advance(orig_bio, copy_len);
/* Finished the bio */
if (!orig_bio->bi_iter.bi_size)
@@ -1237,7 +1293,7 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
#define ENTROPY_LVL_HIGH (80)
/*
- * For increasead precision in shannon_entropy calculation,
+ * For increased precision in shannon_entropy calculation,
* let's do pow(n, M) to save more digits after comma:
*
* - maximum int bit length is 64
@@ -1463,7 +1519,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
struct heuristic_ws *ws)
{
struct page *page;
- u64 index, index_end;
+ pgoff_t index, index_end;
u32 i, curr_sample_pos;
u8 *in_data;
@@ -1523,7 +1579,8 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
*/
int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end)
{
- struct list_head *ws_list = get_workspace(0, 0);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct list_head *ws_list = get_workspace(fs_info, 0, 0);
struct heuristic_ws *ws;
u32 i;
u8 byte;
@@ -1592,30 +1649,34 @@ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end)
}
out:
- put_workspace(0, ws_list);
+ put_workspace(fs_info, 0, ws_list);
return ret;
}
/*
- * Convert the compression suffix (eg. after "zlib" starting with ":") to
- * level, unrecognized string will set the default level. Negative level
- * numbers are allowed.
+ * Convert the compression suffix (eg. after "zlib" starting with ":") to level.
+ *
+ * If the resulting level exceeds the algo's supported levels, it will be clamped.
+ *
+ * Return <0 if no valid string can be found.
+ * Return 0 if everything is fine.
*/
-int btrfs_compress_str2level(unsigned int type, const char *str)
+int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret)
{
int level = 0;
int ret;
- if (!type)
+ if (!type) {
+ *level_ret = btrfs_compress_set_level(type, level);
return 0;
+ }
if (str[0] == ':') {
ret = kstrtoint(str + 1, 10, &level);
if (ret)
- level = 0;
+ return ret;
}
- level = btrfs_compress_set_level(type, level);
-
- return level;
+ *level_ret = btrfs_compress_set_level(type, level);
+ return 0;
}
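Under the new contract, a malformed suffix is reported instead of silently falling back to the default level; a hypothetical caller:

    int level;

    /* "zstd:15" -> type BTRFS_COMPRESS_ZSTD, suffix ":15" */
    if (btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, ":15", &level) < 0)
            return -EINVAL; /* e.g. ":abc" now fails instead of defaulting */
    /* level == 15, clamped into zstd's supported range if out of bounds. */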
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index df198623cc08..e0228017e861 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -11,14 +11,15 @@
#include <linux/list.h>
#include <linux/workqueue.h>
#include <linux/wait.h>
+#include <linux/pagemap.h>
#include "bio.h"
+#include "fs.h"
+#include "btrfs_inode.h"
struct address_space;
-struct page;
struct inode;
struct btrfs_inode;
struct btrfs_ordered_extent;
-struct btrfs_bio;
/*
* We want to make sure that amount of RAM required to uncompress an extent is
@@ -62,33 +63,39 @@ struct compressed_bio {
/* Whether this is a write for writeback. */
bool writeback;
- union {
- /* For reads, this is the bio we are copying the data into */
- struct btrfs_bio *orig_bbio;
- struct work_struct write_end_work;
- };
+ /* For reads, this is the bio we are copying the data into. */
+ struct btrfs_bio *orig_bbio;
/* Must be last. */
struct btrfs_bio bbio;
};
-/* @range_end must be exclusive. */
-static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur)
+static inline struct btrfs_fs_info *cb_to_fs_info(const struct compressed_bio *cb)
{
- u64 page_end = round_down(cur, PAGE_SIZE) + PAGE_SIZE;
+ return cb->bbio.inode->root->fs_info;
+}
- return min(range_end, page_end) - cur;
+/* @range_end must be exclusive. */
+static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u64 cur)
+{
+ /* @cur must be inside the folio. */
+ ASSERT(folio_pos(folio) <= cur);
+ ASSERT(cur < folio_next_pos(folio));
+ return umin(range_end, folio_next_pos(folio)) - cur;
}
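The folio-aware bound changes the arithmetic for large folios; illustrative numbers comparing the removed PAGE_SIZE-based version with the new one:

    /*
     * Illustrative: 16K folio at pos 0, cur = 5000, range_end = 20000.
     *   old, PAGE_SIZE-based: min(20000, 8192)  - 5000 = 3192
     *   new, folio-based:     min(20000, 16384) - 5000 = 11384
     */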
+int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info);
+void btrfs_free_compress_wsm(struct btrfs_fs_info *fs_info);
+
int __init btrfs_init_compress(void);
void __cold btrfs_exit_compress(void);
bool btrfs_compress_level_valid(unsigned int type, int level);
-int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping,
+int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
- unsigned long start_byte, size_t srclen, size_t destlen);
+ unsigned long dest_pgoff, size_t srclen, size_t destlen);
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
struct compressed_bio *cb, u32 decompressed);
@@ -98,19 +105,11 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
bool writeback);
void btrfs_submit_compressed_read(struct btrfs_bio *bbio);
-int btrfs_compress_str2level(unsigned int type, const char *str);
+int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret);
-struct folio *btrfs_alloc_compr_folio(void);
+struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info);
void btrfs_free_compr_folio(struct folio *folio);
-enum btrfs_compression_type {
- BTRFS_COMPRESS_NONE = 0,
- BTRFS_COMPRESS_ZLIB = 1,
- BTRFS_COMPRESS_LZO = 2,
- BTRFS_COMPRESS_ZSTD = 3,
- BTRFS_NR_COMPRESS_TYPES = 4,
-};
-
struct workspace_manager {
struct list_head idle_ws;
spinlock_t ws_lock;
@@ -122,11 +121,10 @@ struct workspace_manager {
wait_queue_head_t ws_wait;
};
-struct list_head *btrfs_get_workspace(int type, int level);
-void btrfs_put_workspace(int type, struct list_head *ws);
+struct list_head *btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, int level);
+void btrfs_put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws);
-struct btrfs_compress_op {
- struct workspace_manager *workspace_manager;
+struct btrfs_compress_levels {
/* Maximum level supported by the compression algorithm */
int min_level;
int max_level;
@@ -136,10 +134,10 @@ struct btrfs_compress_op {
/* The heuristic workspaces are managed via the 0th workspace manager */
#define BTRFS_NR_WORKSPACE_MANAGERS BTRFS_NR_COMPRESS_TYPES
-extern const struct btrfs_compress_op btrfs_heuristic_compress;
-extern const struct btrfs_compress_op btrfs_zlib_compress;
-extern const struct btrfs_compress_op btrfs_lzo_compress;
-extern const struct btrfs_compress_op btrfs_zstd_compress;
+extern const struct btrfs_compress_levels btrfs_heuristic_compress;
+extern const struct btrfs_compress_levels btrfs_zlib_compress;
+extern const struct btrfs_compress_levels btrfs_lzo_compress;
+extern const struct btrfs_compress_levels btrfs_zstd_compress;
const char* btrfs_compress_type2str(enum btrfs_compression_type type);
bool btrfs_compress_is_valid_type(const char *str, size_t len);
@@ -149,39 +147,39 @@ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end);
int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
struct folio **in_folio_ret);
-int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
+int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zlib_decompress(struct list_head *ws, const u8 *data_in,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
-struct list_head *zlib_alloc_workspace(unsigned int level);
+struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned int level);
void zlib_free_workspace(struct list_head *ws);
-struct list_head *zlib_get_workspace(unsigned int level);
+struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level);
-int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
+int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int lzo_decompress(struct list_head *ws, const u8 *data_in,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
-struct list_head *lzo_alloc_workspace(void);
+struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info);
void lzo_free_workspace(struct list_head *ws);
-int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
+int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zstd_decompress(struct list_head *ws, const u8 *data_in,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
-void zstd_init_workspace_manager(void);
-void zstd_cleanup_workspace_manager(void);
-struct list_head *zstd_alloc_workspace(int level);
+int zstd_alloc_workspace_manager(struct btrfs_fs_info *fs_info);
+void zstd_free_workspace_manager(struct btrfs_fs_info *fs_info);
+struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level);
void zstd_free_workspace(struct list_head *ws);
-struct list_head *zstd_get_workspace(int level);
-void zstd_put_workspace(struct list_head *ws);
+struct list_head *zstd_get_workspace(struct btrfs_fs_info *fs_info, int level);
+void zstd_put_workspace(struct btrfs_fs_info *fs_info, struct list_head *ws);
#endif
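Taken together, the new signatures thread a btrfs_fs_info through the whole workspace layer. A minimal sketch of a hypothetical zstd call site under that assumption (illustrative, not code from this patch):

	static int example_zstd_compress(struct btrfs_inode *inode, u64 start,
					 struct folio **folios,
					 unsigned long *out_folios,
					 unsigned long *total_in,
					 unsigned long *total_out)
	{
		struct btrfs_fs_info *fs_info = inode->root->fs_info;
		struct list_head *ws;
		int ret;

		/* Workspaces are now looked up per filesystem, not via globals. */
		ws = btrfs_get_workspace(fs_info, BTRFS_COMPRESS_ZSTD, 3);
		ret = zstd_compress_folios(ws, inode, start, folios, out_folios,
					   total_in, total_out);
		btrfs_put_workspace(fs_info, BTRFS_COMPRESS_ZSTD, ws);
		return ret;
	}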
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a2e7979372cc..a48b4befbee7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -30,10 +30,10 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *ins_key, struct btrfs_path *path,
- int data_size, int extend);
+ int data_size, bool extend);
static int push_node_left(struct btrfs_trans_handle *trans,
struct extent_buffer *dst,
- struct extent_buffer *src, int empty);
+ struct extent_buffer *src, bool empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
struct extent_buffer *dst_buf,
struct extent_buffer *src_buf);
@@ -198,7 +198,7 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
* the inc_not_zero dance and if it doesn't work then
* synchronize_rcu and try again.
*/
- if (atomic_inc_not_zero(&eb->refs)) {
+ if (refcount_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
break;
}
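The atomic_inc_not_zero() to refcount_inc_not_zero() conversion keeps the lockless acquire shape; the gain is that refcount_t saturates on overflow and catches misuse such as increment-from-zero, which plain atomic_t does not. A self-contained sketch of the pattern (illustrative only, retry loop omitted):

	static struct extent_buffer *example_grab_root(struct btrfs_root *root)
	{
		struct extent_buffer *eb;

		rcu_read_lock();
		eb = rcu_dereference(root->node);
		/* A zero refcount means the buffer is being freed; do not revive it. */
		if (!refcount_inc_not_zero(&eb->refs))
			eb = NULL;
		rcu_read_unlock();
		return eb;
	}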
@@ -283,15 +283,26 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
- WARN_ON(btrfs_header_generation(buf) > trans->transid);
- if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+ if (unlikely(btrfs_header_generation(buf) > trans->transid)) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
ret = btrfs_inc_ref(trans, root, cow, 1);
- else
+ if (unlikely(ret))
+ btrfs_abort_transaction(trans, ret);
+ } else {
ret = btrfs_inc_ref(trans, root, cow, 0);
+ if (unlikely(ret))
+ btrfs_abort_transaction(trans, ret);
+ }
if (ret) {
btrfs_tree_unlock(cow);
free_extent_buffer(cow);
- btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -303,9 +314,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
/*
* check if the tree block can be shared by multiple trees
*/
-bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf)
+bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans,
+ const struct btrfs_root *root,
+ const struct extent_buffer *buf)
{
const u64 buf_gen = btrfs_header_generation(buf);
@@ -525,14 +536,14 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
@@ -545,18 +556,18 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
parent_start = buf->start;
ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
- atomic_inc(&cow->refs);
+ refcount_inc(&cow->refs);
rcu_assign_pointer(root->node, cow);
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
parent_start, last_ref);
free_extent_buffer(buf);
add_root_to_dirty_list(root);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
@@ -564,7 +575,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
WARN_ON(trans->transid != btrfs_header_generation(parent));
ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
BTRFS_MOD_LOG_KEY_REPLACE);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
@@ -575,14 +586,14 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(trans, parent);
if (last_ref) {
ret = btrfs_tree_mod_log_free_eb(buf);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
}
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
parent_start, last_ref);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
@@ -602,15 +613,12 @@ error_unlock_cow:
return ret;
}
-static inline int should_cow_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf)
+static inline bool should_cow_block(const struct btrfs_trans_handle *trans,
+ const struct btrfs_root *root,
+ const struct extent_buffer *buf)
{
if (btrfs_is_testing(root->fs_info))
- return 0;
-
- /* Ensure we can see the FORCE_COW bit */
- smp_mb__before_atomic();
+ return false;
/*
* We do not need to cow a block if
@@ -623,13 +631,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
* after we've finished copying src root, we must COW the shared
* block to ensure the metadata consistency.
*/
- if (btrfs_header_generation(buf) == trans->transid &&
- !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
- !(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
- btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
- !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
- return 0;
- return 1;
+
+ if (btrfs_header_generation(buf) != trans->transid)
+ return true;
+
+ if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN))
+ return true;
+
+ /* Ensure we can see the FORCE_COW bit. */
+ smp_mb__before_atomic();
+ if (test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
+ return true;
+
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
+ return false;
+
+ if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
+ return true;
+
+ return false;
}
/*
@@ -724,7 +744,7 @@ int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_ke
* Slot may point to the total number of items (i.e. one position beyond the last
* key) if the key is bigger than the last key in the extent buffer.
*/
-int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
+int btrfs_bin_search(const struct extent_buffer *eb, int first_slot,
const struct btrfs_key *key, int *slot)
{
unsigned long p;
@@ -833,7 +853,7 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
&check);
if (IS_ERR(eb))
return eb;
- if (!extent_buffer_uptodate(eb)) {
+ if (unlikely(!extent_buffer_uptodate(eb))) {
free_extent_buffer(eb);
return ERR_PTR(-EIO);
}
@@ -842,6 +862,75 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
}
/*
+ * Promote a child node to become the new tree root.
+ *
+ * @trans: Transaction handle
+ * @root: Tree root structure to update
+ * @path: Path holding nodes and locks
+ * @level: Level of the parent (old root)
+ * @parent: The parent (old root) with exactly one item
+ *
+ * This helper is called during rebalancing when the root node contains only
+ * a single item (nritems == 1). We can reduce the tree height by promoting
+ * that child to become the new root and freeing the old root node. The path
+ * locks and references are updated accordingly.
+ *
+ * Return: 0 on success, negative errno on failure. The transaction is aborted
+ * on critical errors.
+ */
+static int promote_child_to_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ int level, struct extent_buffer *parent)
+{
+ struct extent_buffer *child;
+ int ret;
+
+ ASSERT(btrfs_header_nritems(parent) == 1);
+
+ child = btrfs_read_node_slot(parent, 0);
+ if (IS_ERR(child))
+ return PTR_ERR(child);
+
+ btrfs_tree_lock(child);
+ ret = btrfs_cow_block(trans, root, child, parent, 0, &child, BTRFS_NESTING_COW);
+ if (ret) {
+ btrfs_tree_unlock(child);
+ free_extent_buffer(child);
+ return ret;
+ }
+
+ ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
+ if (unlikely(ret < 0)) {
+ btrfs_tree_unlock(child);
+ free_extent_buffer(child);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ rcu_assign_pointer(root->node, child);
+
+ add_root_to_dirty_list(root);
+ btrfs_tree_unlock(child);
+
+ path->locks[level] = 0;
+ path->nodes[level] = NULL;
+ btrfs_clear_buffer_dirty(trans, parent);
+ btrfs_tree_unlock(parent);
+ /* Once for the path. */
+ free_extent_buffer(parent);
+
+ root_sub_used_bytes(root);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root), parent, 0, 1);
+ /* Once for the root ptr. */
+ free_extent_buffer_stale(parent);
+ if (unlikely(ret < 0)) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
* node level balancing, used to make sure nodes are in proper order for
* item deletion. We balance from the top down, so we have to make sure
* that a deletion won't leave a node completely empty later on.
@@ -880,55 +969,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
* by promoting the node below to a root
*/
if (!parent) {
- struct extent_buffer *child;
-
if (btrfs_header_nritems(mid) != 1)
return 0;
- /* promote the child to a root */
- child = btrfs_read_node_slot(mid, 0);
- if (IS_ERR(child)) {
- ret = PTR_ERR(child);
- goto out;
- }
-
- btrfs_tree_lock(child);
- ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
- BTRFS_NESTING_COW);
- if (ret) {
- btrfs_tree_unlock(child);
- free_extent_buffer(child);
- goto out;
- }
-
- ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
- if (ret < 0) {
- btrfs_tree_unlock(child);
- free_extent_buffer(child);
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
- rcu_assign_pointer(root->node, child);
-
- add_root_to_dirty_list(root);
- btrfs_tree_unlock(child);
-
- path->locks[level] = 0;
- path->nodes[level] = NULL;
- btrfs_clear_buffer_dirty(trans, mid);
- btrfs_tree_unlock(mid);
- /* once for the path */
- free_extent_buffer(mid);
-
- root_sub_used_bytes(root);
- ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
- /* once for the root ptr */
- free_extent_buffer_stale(mid);
- if (ret < 0) {
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
- return 0;
+ return promote_child_to_root(trans, root, path, level, mid);
}
if (btrfs_header_nritems(mid) >
BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
@@ -999,7 +1043,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
right, 0, 1);
free_extent_buffer_stale(right);
right = NULL;
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1008,7 +1052,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_node_key(right, &right_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
BTRFS_MOD_LOG_KEY_REPLACE);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1060,7 +1104,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
free_extent_buffer_stale(mid);
mid = NULL;
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1070,7 +1114,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_node_key(mid, &mid_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot,
BTRFS_MOD_LOG_KEY_REPLACE);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1081,11 +1125,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* update the path */
if (left) {
if (btrfs_header_nritems(left) > orig_slot) {
- atomic_inc(&left->refs);
/* left was locked after cow */
path->nodes[level] = left;
path->slots[level + 1] -= 1;
path->slots[level] = orig_slot;
+ /* Left is now owned by path. */
+ left = NULL;
if (mid) {
btrfs_tree_unlock(mid);
free_extent_buffer(mid);
@@ -1105,8 +1150,7 @@ out:
free_extent_buffer(right);
}
if (left) {
- if (path->nodes[level] != left)
- btrfs_tree_unlock(left);
+ btrfs_tree_unlock(left);
free_extent_buffer(left);
}
return ret;
@@ -1175,7 +1219,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
btrfs_node_key(mid, &disk_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot,
BTRFS_MOD_LOG_KEY_REPLACE);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_tree_unlock(left);
free_extent_buffer(left);
btrfs_abort_transaction(trans, ret);
@@ -1235,7 +1279,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
btrfs_node_key(right, &disk_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
BTRFS_MOD_LOG_KEY_REPLACE);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_tree_unlock(right);
free_extent_buffer(right);
btrfs_abort_transaction(trans, ret);
@@ -1268,7 +1312,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
* to the block in 'slot', and triggering ra on them.
*/
static void reada_for_search(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
+ const struct btrfs_path *path,
int level, int slot, u64 objectid)
{
struct extent_buffer *node;
@@ -1350,7 +1394,7 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
}
}
-static noinline void reada_for_balance(struct btrfs_path *path, int level)
+static noinline void reada_for_balance(const struct btrfs_path *path, int level)
{
struct extent_buffer *parent;
int slot;
@@ -1415,8 +1459,8 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
}
if (i >= lowest_unlock && i > skip_level) {
- check_skip = false;
btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
+ check_skip = false;
path->locks[i] = 0;
if (write_lock_level &&
i > min_write_lock_level &&
@@ -1446,8 +1490,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
u64 blocknr;
struct extent_buffer *tmp = NULL;
int ret = 0;
+ int ret2;
int parent_level;
- int err;
bool read_tmp = false;
bool tmp_locked = false;
bool path_released = false;
@@ -1473,13 +1517,13 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
reada_for_search(fs_info, p, parent_level, slot, key->objectid);
/* first we do an atomic uptodate check */
- if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) {
+ if (btrfs_buffer_uptodate(tmp, check.transid, true) > 0) {
/*
* Do extra check for first_key, eb can be stale due to
* being cached, read from scrub, or having multiple
* parents (shared tree blocks).
*/
- if (btrfs_verify_level_key(tmp, &check)) {
+ if (unlikely(btrfs_verify_level_key(tmp, &check))) {
ret = -EUCLEAN;
goto out;
}
@@ -1505,9 +1549,9 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
}
/* Now we're allowed to do a blocking uptodate check. */
- err = btrfs_read_extent_buffer(tmp, &check);
- if (err) {
- ret = err;
+ ret2 = btrfs_read_extent_buffer(tmp, &check);
+ if (ret2) {
+ ret = ret2;
goto out;
}
@@ -1548,9 +1592,9 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
}
/* Now we're allowed to do a blocking uptodate check. */
- err = btrfs_read_extent_buffer(tmp, &check);
- if (err) {
- ret = err;
+ ret2 = btrfs_read_extent_buffer(tmp, &check);
+ if (ret2) {
+ ret = ret2;
goto out;
}
@@ -1560,7 +1604,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
* and give up so that our caller doesn't loop forever
* on our EAGAINs.
*/
- if (!extent_buffer_uptodate(tmp)) {
+ if (unlikely(!extent_buffer_uptodate(tmp))) {
ret = -EIO;
goto out;
}
@@ -1685,13 +1729,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
if (p->search_commit_root) {
b = root->commit_root;
- atomic_inc(&b->refs);
+ refcount_inc(&b->refs);
level = btrfs_header_level(b);
/*
* Ensure that all callers have set skip_locking when
- * p->search_commit_root = 1.
+ * p->search_commit_root is true.
*/
- ASSERT(p->skip_locking == 1);
+ ASSERT(p->skip_locking);
goto out;
}
@@ -1741,7 +1785,7 @@ out:
* The root may have failed to write out at some point, and thus is no
* longer valid, return an error in this case.
*/
- if (!extent_buffer_uptodate(b)) {
+ if (unlikely(!extent_buffer_uptodate(b))) {
if (root_lock)
btrfs_tree_unlock_rw(b, root_lock);
free_extent_buffer(b);
@@ -1794,7 +1838,7 @@ static int finish_need_commit_sem_search(struct btrfs_path *path)
return 0;
}
-static inline int search_for_key_slot(struct extent_buffer *eb,
+static inline int search_for_key_slot(const struct extent_buffer *eb,
int search_low_slot,
const struct btrfs_key *key,
int prev_cmp,
@@ -1928,15 +1972,14 @@ static int search_leaf(struct btrfs_trans_handle *trans,
ASSERT(leaf_free_space >= 0);
if (leaf_free_space < ins_len) {
- int err;
-
- err = split_leaf(trans, root, key, path, ins_len,
- (ret == 0));
- ASSERT(err <= 0);
- if (WARN_ON(err > 0))
- err = -EUCLEAN;
- if (err)
- ret = err;
+ int ret2;
+
+ ret2 = split_leaf(trans, root, key, path, ins_len, (ret == 0));
+ ASSERT(ret2 <= 0);
+ if (WARN_ON(ret2 > 0))
+ ret2 = -EUCLEAN;
+ if (ret2)
+ ret = ret2;
}
}
@@ -1982,7 +2025,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *b;
int slot;
int ret;
- int err;
int level;
int lowest_unlock = 1;
/* everything at write_lock_level or lower must be write locked */
@@ -2053,6 +2095,7 @@ again:
while (b) {
int dec = 0;
+ int ret2;
level = btrfs_header_level(b);
@@ -2081,16 +2124,15 @@ again:
}
if (last_level)
- err = btrfs_cow_block(trans, root, b, NULL, 0,
- &b,
- BTRFS_NESTING_COW);
+ ret2 = btrfs_cow_block(trans, root, b, NULL, 0,
+ &b, BTRFS_NESTING_COW);
else
- err = btrfs_cow_block(trans, root, b,
- p->nodes[level + 1],
- p->slots[level + 1], &b,
- BTRFS_NESTING_COW);
- if (err) {
- ret = err;
+ ret2 = btrfs_cow_block(trans, root, b,
+ p->nodes[level + 1],
+ p->slots[level + 1], &b,
+ BTRFS_NESTING_COW);
+ if (ret2) {
+ ret = ret2;
goto done;
}
}
@@ -2138,12 +2180,12 @@ cow_done:
slot--;
}
p->slots[level] = slot;
- err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
- &write_lock_level);
- if (err == -EAGAIN)
+ ret2 = setup_nodes_for_search(trans, root, p, b, level, ins_len,
+ &write_lock_level);
+ if (ret2 == -EAGAIN)
goto again;
- if (err) {
- ret = err;
+ if (ret2) {
+ ret = ret2;
goto done;
}
b = p->nodes[level];
@@ -2169,11 +2211,11 @@ cow_done:
goto done;
}
- err = read_block_for_search(root, p, &b, slot, key);
- if (err == -EAGAIN && !p->nowait)
+ ret2 = read_block_for_search(root, p, &b, slot, key);
+ if (ret2 == -EAGAIN && !p->nowait)
goto again;
- if (err) {
- ret = err;
+ if (ret2) {
+ ret = ret2;
goto done;
}
@@ -2236,7 +2278,6 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
struct extent_buffer *b;
int slot;
int ret;
- int err;
int level;
int lowest_unlock = 1;
u8 lowest_level = 0;
@@ -2252,7 +2293,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
again:
b = btrfs_get_old_root(root, time_seq);
- if (!b) {
+ if (unlikely(!b)) {
ret = -EIO;
goto done;
}
@@ -2261,6 +2302,7 @@ again:
while (b) {
int dec = 0;
+ int ret2;
level = btrfs_header_level(b);
p->nodes[level] = b;
@@ -2296,11 +2338,11 @@ again:
goto done;
}
- err = read_block_for_search(root, p, &b, slot, key);
- if (err == -EAGAIN && !p->nowait)
+ ret2 = read_block_for_search(root, p, &b, slot, key);
+ if (ret2 == -EAGAIN && !p->nowait)
goto again;
- if (err) {
- ret = err;
+ if (ret2) {
+ ret = ret2;
goto done;
}
@@ -2581,12 +2623,11 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) {
btrfs_print_leaf(eb);
btrfs_crit(fs_info,
- "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
+ "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT,
slot, btrfs_disk_key_objectid(&disk_key),
btrfs_disk_key_type(&disk_key),
btrfs_disk_key_offset(&disk_key),
- new_key->objectid, new_key->type,
- new_key->offset);
+ BTRFS_KEY_FMT_VALUE(new_key));
BUG();
}
}
@@ -2595,12 +2636,11 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) {
btrfs_print_leaf(eb);
btrfs_crit(fs_info,
- "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
+ "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT,
slot, btrfs_disk_key_objectid(&disk_key),
btrfs_disk_key_type(&disk_key),
btrfs_disk_key_offset(&disk_key),
- new_key->objectid, new_key->type,
- new_key->offset);
+ BTRFS_KEY_FMT_VALUE(new_key));
BUG();
}
}
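BTRFS_KEY_FMT and BTRFS_KEY_FMT_VALUE() are not defined in this excerpt; from the usage they can be assumed to expand to the old "(%llu %u %llu)" format string and the objectid/type/offset triple of a cpu key:

	/* Assumed semantics, inferred from this hunk: */
	btrfs_crit(fs_info, "unexpected key " BTRFS_KEY_FMT,
		   BTRFS_KEY_FMT_VALUE(&key));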
@@ -2659,10 +2699,9 @@ static bool check_sibling_keys(const struct extent_buffer *left,
btrfs_crit(left->fs_info, "right extent buffer:");
btrfs_print_tree(right, false);
btrfs_crit(left->fs_info,
-"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
- left_last.objectid, left_last.type,
- left_last.offset, right_first.objectid,
- right_first.type, right_first.offset);
+"bad key order, sibling blocks, left last " BTRFS_KEY_FMT " right first " BTRFS_KEY_FMT,
+ BTRFS_KEY_FMT_VALUE(&left_last),
+ BTRFS_KEY_FMT_VALUE(&right_first));
return true;
}
return false;
@@ -2677,7 +2716,7 @@ static bool check_sibling_keys(const struct extent_buffer *left,
*/
static int push_node_left(struct btrfs_trans_handle *trans,
struct extent_buffer *dst,
- struct extent_buffer *src, int empty)
+ struct extent_buffer *src, bool empty)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int push_items = 0;
@@ -2713,13 +2752,13 @@ static int push_node_left(struct btrfs_trans_handle *trans,
push_items = min(src_nritems - 8, push_items);
/* dst is the left eb, src is the middle eb */
- if (check_sibling_keys(dst, src)) {
+ if (unlikely(check_sibling_keys(dst, src))) {
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
return ret;
}
ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -2787,7 +2826,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
push_items = max_push;
/* dst is the right eb, src is the middle eb */
- if (check_sibling_keys(src, dst)) {
+ if (unlikely(check_sibling_keys(src, dst))) {
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
return ret;
@@ -2804,7 +2843,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
push_items);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -2872,8 +2911,9 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
if (ret < 0) {
int ret2;
+ btrfs_clear_buffer_dirty(trans, c);
ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
- if (ret2 < 0)
+ if (unlikely(ret2 < 0))
btrfs_abort_transaction(trans, ret2);
btrfs_tree_unlock(c);
free_extent_buffer(c);
@@ -2885,7 +2925,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
free_extent_buffer(old);
add_root_to_dirty_list(root);
- atomic_inc(&c->refs);
+ refcount_inc(&c->refs);
path->nodes[level] = c;
path->locks[level] = BTRFS_WRITE_LOCK;
path->slots[level] = 0;
@@ -2918,7 +2958,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans,
if (level) {
ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
slot, nritems - slot);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -2931,7 +2971,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans,
if (level) {
ret = btrfs_tree_mod_log_insert_key(lower, slot,
BTRFS_MOD_LOG_KEY_ADD);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -3007,7 +3047,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
ASSERT(btrfs_header_level(c) == level);
ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_tree_unlock(split);
free_extent_buffer(split);
btrfs_abort_transaction(trans, ret);
@@ -3076,7 +3116,7 @@ int btrfs_leaf_free_space(const struct extent_buffer *leaf)
int ret;
ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_crit(fs_info,
"leaf free space ret %d, leaf data size %lu, used %d nritems %d",
ret,
@@ -3092,7 +3132,7 @@ int btrfs_leaf_free_space(const struct extent_buffer *leaf)
*/
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
- int data_size, int empty,
+ int data_size, bool empty,
struct extent_buffer *right,
int free_space, u32 left_nritems,
u32 min_slot)
@@ -3100,7 +3140,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = right->fs_info;
struct extent_buffer *left = path->nodes[0];
struct extent_buffer *upper = path->nodes[1];
- struct btrfs_map_token token;
struct btrfs_disk_key disk_key;
int slot;
u32 i;
@@ -3174,13 +3213,12 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
copy_leaf_items(right, left, 0, left_nritems - push_items, push_items);
/* update the item pointers */
- btrfs_init_map_token(&token, right);
right_nritems += push_items;
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
- push_space -= btrfs_token_item_size(&token, i);
- btrfs_set_token_item_offset(&token, i, push_space);
+ push_space -= btrfs_item_size(right, i);
+ btrfs_set_item_offset(right, i, push_space);
}
left_nritems -= push_items;
@@ -3200,10 +3238,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
/* then fixup the leaf pointer in the path */
if (path->slots[0] >= left_nritems) {
path->slots[0] -= left_nritems;
- if (btrfs_header_nritems(path->nodes[0]) == 0)
- btrfs_clear_buffer_dirty(trans, path->nodes[0]);
- btrfs_tree_unlock(path->nodes[0]);
- free_extent_buffer(path->nodes[0]);
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
path->nodes[0] = right;
path->slots[1] += 1;
} else {
@@ -3231,7 +3267,7 @@ out_unlock:
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path,
int min_data_size, int data_size,
- int empty, u32 min_slot)
+ bool empty, u32 min_slot)
{
struct extent_buffer *left = path->nodes[0];
struct extent_buffer *right;
@@ -3270,7 +3306,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (left_nritems == 0)
goto out_unlock;
- if (check_sibling_keys(left, right)) {
+ if (unlikely(check_sibling_keys(left, right))) {
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
btrfs_tree_unlock(right);
@@ -3308,7 +3344,7 @@ out_unlock:
*/
static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
struct btrfs_path *path, int data_size,
- int empty, struct extent_buffer *left,
+ bool empty, struct extent_buffer *left,
int free_space, u32 right_nritems,
u32 max_slot)
{
@@ -3323,7 +3359,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
int ret = 0;
u32 this_item_size;
u32 old_left_item_size;
- struct btrfs_map_token token;
if (empty)
nr = min(right_nritems, max_slot);
@@ -3371,21 +3406,24 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
old_left_nritems = btrfs_header_nritems(left);
BUG_ON(old_left_nritems <= 0);
- btrfs_init_map_token(&token, left);
old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i,
+ ioff = btrfs_item_offset(left, i);
+ btrfs_set_item_offset(left, i,
ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
}
btrfs_set_header_nritems(left, old_left_nritems + push_items);
/* fixup right node */
- if (push_items > right_nritems)
- WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
- right_nritems);
+ if (unlikely(push_items > right_nritems)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ btrfs_crit(fs_info, "push items (%d) > right leaf items (%u)",
+ push_items, right_nritems);
+ goto out;
+ }
if (push_items < right_nritems) {
push_space = btrfs_item_offset(right, push_items - 1) -
@@ -3398,13 +3436,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
btrfs_header_nritems(right) - push_items);
}
- btrfs_init_map_token(&token, right);
right_nritems -= push_items;
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
- push_space = push_space - btrfs_token_item_size(&token, i);
- btrfs_set_token_item_offset(&token, i, push_space);
+ push_space = push_space - btrfs_item_size(right, i);
+ btrfs_set_item_offset(right, i, push_space);
}
btrfs_mark_buffer_dirty(trans, left);
@@ -3419,8 +3456,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
/* then fixup the leaf pointer in the path */
if (path->slots[0] < push_items) {
path->slots[0] += old_left_nritems;
- btrfs_tree_unlock(path->nodes[0]);
- free_extent_buffer(path->nodes[0]);
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
path->nodes[0] = left;
path->slots[1] -= 1;
} else {
@@ -3489,7 +3526,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
- if (check_sibling_keys(left, right)) {
+ if (unlikely(check_sibling_keys(left, right))) {
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3518,7 +3555,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
int i;
int ret;
struct btrfs_disk_key disk_key;
- struct btrfs_map_token token;
nritems = nritems - mid;
btrfs_set_header_nritems(right, nritems);
@@ -3531,12 +3567,11 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
- btrfs_init_map_token(&token, right);
for (i = 0; i < nritems; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
+ ioff = btrfs_item_offset(right, i);
+ btrfs_set_item_offset(right, i, ioff + rt_data_off);
}
btrfs_set_header_nritems(l, mid);
@@ -3639,7 +3674,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const struct btrfs_key *ins_key,
struct btrfs_path *path, int data_size,
- int extend)
+ bool extend)
{
struct btrfs_disk_key disk_key;
struct extent_buffer *l;
@@ -3849,10 +3884,10 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- path->keep_locks = 1;
- path->search_for_split = 1;
+ path->keep_locks = true;
+ path->search_for_split = true;
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
- path->search_for_split = 0;
+ path->search_for_split = false;
if (ret > 0)
ret = -EAGAIN;
if (ret < 0)
@@ -3879,11 +3914,11 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
if (ret)
goto err;
- path->keep_locks = 0;
+ path->keep_locks = false;
btrfs_unlock_up_safe(path, 1);
return 0;
err:
- path->keep_locks = 0;
+ path->keep_locks = false;
return ret;
}
@@ -4002,7 +4037,6 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans,
unsigned int old_size;
unsigned int size_diff;
int i;
- struct btrfs_map_token token;
leaf = path->nodes[0];
slot = path->slots[0];
@@ -4025,12 +4059,11 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans,
* item0..itemN ... dataN.offset..dataN.size .. data0.size
*/
/* first correct the data pointers */
- btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i, ioff + size_diff);
+ ioff = btrfs_item_offset(leaf, i);
+ btrfs_set_item_offset(leaf, i, ioff + size_diff);
}
/* shift the data */
@@ -4074,7 +4107,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans,
btrfs_set_item_size(leaf, slot, new_size);
btrfs_mark_buffer_dirty(trans, leaf);
- if (btrfs_leaf_free_space(leaf) < 0) {
+ if (unlikely(btrfs_leaf_free_space(leaf) < 0)) {
btrfs_print_leaf(leaf);
BUG();
}
@@ -4093,14 +4126,13 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
unsigned int old_data;
unsigned int old_size;
int i;
- struct btrfs_map_token token;
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(leaf);
- if (btrfs_leaf_free_space(leaf) < data_size) {
+ if (unlikely(btrfs_leaf_free_space(leaf) < data_size)) {
btrfs_print_leaf(leaf);
BUG();
}
@@ -4108,7 +4140,7 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
old_data = btrfs_item_data_end(leaf, slot);
BUG_ON(slot < 0);
- if (slot >= nritems) {
+ if (unlikely(slot >= nritems)) {
btrfs_print_leaf(leaf);
btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
slot, nritems);
@@ -4119,24 +4151,22 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
* item0..itemN ... dataN.offset..dataN.size .. data0.size
*/
/* first correct the data pointers */
- btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i, ioff - data_size);
+ ioff = btrfs_item_offset(leaf, i);
+ btrfs_set_item_offset(leaf, i, ioff - data_size);
}
/* shift the data */
memmove_leaf_data(leaf, data_end - data_size, data_end,
old_data - data_end);
- data_end = old_data;
old_size = btrfs_item_size(leaf, slot);
btrfs_set_item_size(leaf, slot, old_size + data_size);
btrfs_mark_buffer_dirty(trans, leaf);
- if (btrfs_leaf_free_space(leaf) < 0) {
+ if (unlikely(btrfs_leaf_free_space(leaf) < 0)) {
btrfs_print_leaf(leaf);
BUG();
}
@@ -4164,7 +4194,6 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
struct extent_buffer *leaf;
int slot;
- struct btrfs_map_token token;
u32 total_size;
/*
@@ -4185,18 +4214,17 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
data_end = leaf_data_end(leaf);
total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
- if (btrfs_leaf_free_space(leaf) < total_size) {
+ if (unlikely(btrfs_leaf_free_space(leaf) < total_size)) {
btrfs_print_leaf(leaf);
btrfs_crit(fs_info, "not enough freespace need %u have %d",
total_size, btrfs_leaf_free_space(leaf));
BUG();
}
- btrfs_init_map_token(&token, leaf);
if (slot != nritems) {
unsigned int old_data = btrfs_item_data_end(leaf, slot);
- if (old_data < data_end) {
+ if (unlikely(old_data < data_end)) {
btrfs_print_leaf(leaf);
btrfs_crit(fs_info,
"item at slot %d with data offset %u beyond data end of leaf %u",
@@ -4210,8 +4238,8 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
for (i = slot; i < nritems; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i,
+ ioff = btrfs_item_offset(leaf, i);
+ btrfs_set_item_offset(leaf, i,
ioff - batch->total_data_size);
}
/* shift the items */
@@ -4228,14 +4256,14 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
btrfs_set_item_key(leaf, &disk_key, slot + i);
data_end -= batch->data_sizes[i];
- btrfs_set_token_item_offset(&token, slot + i, data_end);
- btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
+ btrfs_set_item_offset(leaf, slot + i, data_end);
+ btrfs_set_item_size(leaf, slot + i, batch->data_sizes[i]);
}
btrfs_set_header_nritems(leaf, nritems + batch->nr);
btrfs_mark_buffer_dirty(trans, leaf);
- if (btrfs_leaf_free_space(leaf) < 0) {
+ if (unlikely(btrfs_leaf_free_space(leaf) < 0)) {
btrfs_print_leaf(leaf);
BUG();
}
@@ -4377,7 +4405,7 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (level) {
ret = btrfs_tree_mod_log_insert_move(parent, slot,
slot + 1, nritems - slot - 1);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -4390,7 +4418,7 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
} else if (level) {
ret = btrfs_tree_mod_log_insert_key(parent, slot,
BTRFS_MOD_LOG_KEY_REMOVE);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -4442,7 +4470,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
root_sub_used_bytes(root);
- atomic_inc(&leaf->refs);
+ refcount_inc(&leaf->refs);
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
free_extent_buffer_stale(leaf);
if (ret < 0)
@@ -4469,7 +4497,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (slot + nr != nritems) {
const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
const int data_end = leaf_data_end(leaf);
- struct btrfs_map_token token;
u32 dsize = 0;
int i;
@@ -4479,12 +4506,11 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
memmove_leaf_data(leaf, data_end + dsize, data_end,
last_off - data_end);
- btrfs_init_map_token(&token, leaf);
for (i = slot + nr; i < nritems; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i, ioff + dsize);
+ ioff = btrfs_item_offset(leaf, i);
+ btrfs_set_item_offset(leaf, i, ioff + dsize);
}
memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
@@ -4494,9 +4520,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
/* delete the leaf if we've emptied it */
if (nritems == 0) {
- if (leaf == root->node) {
- btrfs_set_header_level(leaf, 0);
- } else {
+ if (leaf != root->node) {
btrfs_clear_buffer_dirty(trans, leaf);
ret = btrfs_del_leaf(trans, root, path, leaf);
if (ret < 0)
@@ -4527,7 +4551,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
* for possible call to btrfs_del_ptr below
*/
slot = path->slots[1];
- atomic_inc(&leaf->refs);
+ refcount_inc(&leaf->refs);
/*
* We want to be able to at least push one item to the
* left neighbour leaf, and that's the first item.
@@ -4562,10 +4586,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (btrfs_header_nritems(leaf) == 0) {
path->slots[1] = slot;
ret = btrfs_del_leaf(trans, root, path, leaf);
+ free_extent_buffer(leaf);
if (ret < 0)
return ret;
- free_extent_buffer(leaf);
- ret = 0;
} else {
/* if we're still in the path, make sure
* we're dirty. Otherwise, one of the
@@ -4585,16 +4608,13 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
/*
* A helper function to walk down the tree starting at min_key, and looking
- * for nodes or leaves that are have a minimum transaction id.
+ * for leaves that have a minimum transaction id.
* This is used by the btree defrag code and tree logging.
*
* This does not cow, but it does stuff the starting key it finds back
* into min_key, so you can call btrfs_search_slot with cow=1 on the
* key and get a writable path.
*
- * This honors path->lowest_level to prevent descent past a given level
- * of the tree.
- *
* min_trans indicates the oldest transaction that you are interested
* in walking through. Any nodes or leaves older than min_trans are
* skipped over (without reading them).
@@ -4612,10 +4632,11 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
u32 nritems;
int level;
int ret = 1;
- int keep_locks = path->keep_locks;
+ const bool keep_locks = path->keep_locks;
ASSERT(!path->nowait);
- path->keep_locks = 1;
+ ASSERT(path->lowest_level == 0);
+ path->keep_locks = true;
again:
cur = btrfs_read_lock_root_node(root);
level = btrfs_header_level(cur);
@@ -4636,8 +4657,8 @@ again:
goto out;
}
- /* at the lowest level, we're done, setup the path and exit */
- if (level == path->lowest_level) {
+ /* At level 0 we're done, setup the path and exit. */
+ if (level == 0) {
if (slot >= nritems)
goto find_next_key;
ret = 0;
@@ -4678,12 +4699,6 @@ find_next_key:
goto out;
}
}
- if (level == path->lowest_level) {
- ret = 0;
- /* Save our key for returning back. */
- btrfs_node_key_to_cpu(cur, min_key, slot);
- goto out;
- }
cur = btrfs_read_node_slot(cur, slot);
if (IS_ERR(cur)) {
ret = PTR_ERR(cur);
@@ -4699,7 +4714,7 @@ find_next_key:
out:
path->keep_locks = keep_locks;
if (ret == 0)
- btrfs_unlock_up_safe(path, path->lowest_level + 1);
+ btrfs_unlock_up_safe(path, 1);
return ret;
}
@@ -4711,7 +4726,7 @@ out:
* 0 is returned if another key is found, < 0 if there are any errors
* and 1 is returned if there are no higher keys in the tree
*
- * path->keep_locks should be set to 1 on the search made before
+ * path->keep_locks should be set to true on the search made before
* calling this function.
*/
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
@@ -4810,13 +4825,13 @@ again:
next = NULL;
btrfs_release_path(path);
- path->keep_locks = 1;
+ path->keep_locks = true;
if (time_seq) {
ret = btrfs_search_old_slot(root, &key, path, time_seq);
} else {
if (path->need_commit_sem) {
- path->need_commit_sem = 0;
+ path->need_commit_sem = false;
need_commit_sem = true;
if (path->nowait) {
if (!down_read_trylock(&fs_info->commit_root_sem)) {
@@ -4829,41 +4844,30 @@ again:
}
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
}
- path->keep_locks = 0;
+ path->keep_locks = false;
if (ret < 0)
goto done;
nritems = btrfs_header_nritems(path->nodes[0]);
/*
- * by releasing the path above we dropped all our locks. A balance
- * could have added more items next to the key that used to be
- * at the very end of the block. So, check again here and
- * advance the path if there are now more items available.
- */
- if (nritems > 0 && path->slots[0] < nritems - 1) {
- if (ret == 0)
- path->slots[0]++;
- ret = 0;
- goto done;
- }
- /*
- * So the above check misses one case:
- * - after releasing the path above, someone has removed the item that
- * used to be at the very end of the block, and balance between leafs
- * gets another one with bigger key.offset to replace it.
+ * By releasing the path above we dropped all our locks. A balance
+ * could have happened and either:
*
- * This one should be returned as well, or we can get leaf corruption
- * later(esp. in __btrfs_drop_extents()).
+ * 1. added more items after the previous last item
+ * 2. deleted the previous last item
*
- * And a bit more explanation about this check,
- * with ret > 0, the key isn't found, the path points to the slot
- * where it should be inserted, so the path->slots[0] item must be the
- * bigger one.
+ * So, check again here and advance the path if there are now more
+ * items available.
*/
- if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
- ret = 0;
- goto done;
+ if (nritems > 0 && path->slots[0] <= nritems - 1) {
+ if (ret == 0 && path->slots[0] != nritems - 1) {
+ path->slots[0]++;
+ goto done;
+ } else if (ret > 0) {
+ ret = 0;
+ goto done;
+ }
}
while (level < BTRFS_MAX_LEVEL) {
@@ -4968,7 +4972,7 @@ done:
if (need_commit_sem) {
int ret2;
- path->need_commit_sem = 1;
+ path->need_commit_sem = true;
ret2 = finish_need_commit_sem_search(path);
up_read(&fs_info->commit_root_sem);
if (ret2)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 075a06db43a1..692370fc07b2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -17,9 +17,7 @@
#include <linux/refcount.h>
#include <uapi/linux/btrfs_tree.h>
#include "locking.h"
-#include "fs.h"
#include "accessors.h"
-#include "extent-io-tree.h"
struct extent_buffer;
struct btrfs_block_rsv;
@@ -61,27 +59,27 @@ struct btrfs_path {
/* if there is real range locking, this locks field will change */
u8 locks[BTRFS_MAX_LEVEL];
u8 reada;
- /* keep some upper locks as we walk down */
u8 lowest_level;
/*
* set by btrfs_split_item, tells search_slot to keep all locks
* and to force calls to keep space in the nodes
*/
- unsigned int search_for_split:1;
- unsigned int keep_locks:1;
- unsigned int skip_locking:1;
- unsigned int search_commit_root:1;
- unsigned int need_commit_sem:1;
- unsigned int skip_release_on_error:1;
+ bool search_for_split:1;
+ /* Keep some upper locks as we walk down. */
+ bool keep_locks:1;
+ bool skip_locking:1;
+ bool search_commit_root:1;
+ bool need_commit_sem:1;
+ bool skip_release_on_error:1;
/*
* Indicate that new item (btrfs_search_slot) is extending already
* existing item and ins_len contains only the data size and not item
* header (ie. sizeof(struct btrfs_item) is not included).
*/
- unsigned int search_for_extension:1;
+ bool search_for_extension:1;
/* Stop search if any locks need to be taken (for read) */
- unsigned int nowait:1;
+ bool nowait:1;
};
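With the flags declared as bool bitfields, the call sites converted elsewhere in this patch read as plain boolean assignments; a typical setup looks like (standard btrfs_alloc_path() usage, shown for illustration):

	struct btrfs_path *path = btrfs_alloc_path();

	if (!path)
		return -ENOMEM;
	/* Commit-root searches must also skip locking, per the ASSERT in
	 * btrfs_search_slot_get_root() above. */
	path->search_commit_root = true;
	path->skip_locking = true;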
#define BTRFS_PATH_AUTO_FREE(path_name) \
@@ -224,16 +222,10 @@ struct btrfs_root {
struct list_head root_list;
- /*
- * Xarray that keeps track of in-memory inodes, protected by the lock
- * @inode_lock.
- */
+ /* Xarray that keeps track of in-memory inodes. */
struct xarray inodes;
- /*
- * Xarray that keeps track of delayed nodes of every inode, protected
- * by @inode_lock.
- */
+ /* Xarray that keeps track of delayed nodes of every inode. */
struct xarray delayed_nodes;
/*
* right now this just gets used so that a root has its own devid
@@ -508,7 +500,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
int __init btrfs_ctree_init(void);
void __cold btrfs_ctree_exit(void);
-int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
+int btrfs_bin_search(const struct extent_buffer *eb, int first_slot,
const struct btrfs_key *key, int *slot);
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
@@ -576,9 +568,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
struct extent_buffer **cow_ret, u64 new_root_objectid);
-bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf);
+bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans,
+ const struct btrfs_root *root,
+ const struct extent_buffer *buf);
int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int level, int slot);
void btrfs_extend_item(struct btrfs_trans_handle *trans,
@@ -727,13 +719,18 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
}
int btrfs_leaf_free_space(const struct extent_buffer *leaf);
-static inline int is_fstree(u64 rootid)
+static inline bool btrfs_is_fstree(u64 rootid)
{
- if (rootid == BTRFS_FS_TREE_OBJECTID ||
- ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID &&
- !btrfs_qgroup_level(rootid)))
- return 1;
- return 0;
+ if (rootid == BTRFS_FS_TREE_OBJECTID)
+ return true;
+
+ if ((s64)rootid < (s64)BTRFS_FIRST_FREE_OBJECTID)
+ return false;
+
+ if (btrfs_qgroup_level(rootid) != 0)
+ return false;
+
+ return true;
}
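Worked examples for the decomposed checks, using the objectid constants from the on-disk format headers (BTRFS_FS_TREE_OBJECTID is 5, BTRFS_FIRST_FREE_OBJECTID is 256, and the qgroup level occupies the top 16 bits of the id):

	/* The default fs tree. */
	ASSERT(btrfs_is_fstree(BTRFS_FS_TREE_OBJECTID));
	/* Any dynamically created subvolume with a zero qgroup level. */
	ASSERT(btrfs_is_fstree(BTRFS_FIRST_FREE_OBJECTID));
	/* A level-1 qgroup id such as "1/100" is not an fs tree. */
	ASSERT(!btrfs_is_fstree((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) | 100));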
static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index d4310d93f532..b81e224d4a27 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -15,6 +15,7 @@
#include "defrag.h"
#include "file-item.h"
#include "super.h"
+#include "compression.h"
static struct kmem_cache *btrfs_inode_defrag_cachep;
@@ -60,6 +61,14 @@ static int compare_inode_defrag(const struct inode_defrag *defrag1,
return 0;
}
+static int inode_defrag_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct inode_defrag *new_defrag = rb_entry(new, struct inode_defrag, rb_node);
+ const struct inode_defrag *existing_defrag = rb_entry(existing, struct inode_defrag, rb_node);
+
+ return compare_inode_defrag(new_defrag, existing_defrag);
+}
+
/*
* Insert a record for an inode into the defrag tree. The lock must be held
* already.
@@ -71,49 +80,35 @@ static int btrfs_insert_inode_defrag(struct btrfs_inode *inode,
struct inode_defrag *defrag)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct inode_defrag *entry;
- struct rb_node **p;
- struct rb_node *parent = NULL;
- int ret;
+ struct rb_node *node;
- p = &fs_info->defrag_inodes.rb_node;
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct inode_defrag, rb_node);
+ node = rb_find_add(&defrag->rb_node, &fs_info->defrag_inodes, inode_defrag_cmp);
+ if (node) {
+ struct inode_defrag *entry;
- ret = compare_inode_defrag(defrag, entry);
- if (ret < 0)
- p = &parent->rb_left;
- else if (ret > 0)
- p = &parent->rb_right;
- else {
- /*
- * If we're reinserting an entry for an old defrag run,
- * make sure to lower the transid of our existing
- * record.
- */
- if (defrag->transid < entry->transid)
- entry->transid = defrag->transid;
- entry->extent_thresh = min(defrag->extent_thresh,
- entry->extent_thresh);
- return -EEXIST;
- }
+ entry = rb_entry(node, struct inode_defrag, rb_node);
+ /*
+ * If we're reinserting an entry for an old defrag run, make
+ * sure to lower the transid of our existing record.
+ */
+ if (defrag->transid < entry->transid)
+ entry->transid = defrag->transid;
+ entry->extent_thresh = min(defrag->extent_thresh, entry->extent_thresh);
+ return -EEXIST;
}
set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
- rb_link_node(&defrag->rb_node, parent, p);
- rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
return 0;
}
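rb_find_add() from <linux/rbtree.h> owns the descent that was previously open-coded: it links and rebalances the new node when no equal entry exists, and otherwise returns the existing node untouched. The contract in miniature:

	/*
	 * cmp(new, existing) convention: < 0 walk left, > 0 walk right,
	 * == 0 stop; the existing node is returned and nothing is inserted.
	 */
	node = rb_find_add(&defrag->rb_node, &fs_info->defrag_inodes,
			   inode_defrag_cmp);
	if (node) {
		/* Duplicate: defrag->rb_node was not linked into the tree. */
	}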
-static inline int need_auto_defrag(struct btrfs_fs_info *fs_info)
+static inline bool need_auto_defrag(struct btrfs_fs_info *fs_info)
{
if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
- return 0;
+ return false;
if (btrfs_fs_closing(fs_info))
- return 0;
+ return false;
- return 1;
+ return true;
}
/*
@@ -159,7 +154,7 @@ void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh)
}
/*
- * Pick the defragable inode that we want, if it doesn't exist, we will get the
+ * Pick the defraggable inode that we want; if it doesn't exist, we will get the
* next one.
*/
static struct inode_defrag *btrfs_pick_defrag_inode(
@@ -191,10 +186,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode(
if (parent && compare_inode_defrag(&tmp, entry) > 0) {
parent = rb_next(parent);
- if (parent)
- entry = rb_entry(parent, struct inode_defrag, rb_node);
- else
- entry = NULL;
+ entry = rb_entry_safe(parent, struct inode_defrag, rb_node);
}
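rb_entry_safe() is the NULL-propagating form of rb_entry(); the replaced if/else above is equivalent to:

	entry = parent ? rb_entry(parent, struct inode_defrag, rb_node) : NULL;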
out:
if (entry)
@@ -263,10 +255,9 @@ again:
range.extent_thresh = defrag->extent_thresh;
file_ra_state_init(ra, inode->vfs_inode.i_mapping);
- sb_start_write(fs_info->sb);
- ret = btrfs_defrag_file(inode, ra, &range, defrag->transid,
- BTRFS_DEFRAG_BATCH);
- sb_end_write(fs_info->sb);
+ scoped_guard(super_write, fs_info->sb)
+ ret = btrfs_defrag_file(inode, ra, &range,
+ defrag->transid, BTRFS_DEFRAG_BATCH);
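scoped_guard(super_write, ...) comes from the cleanup.h guard infrastructure: it takes sb_start_write() on entry and guarantees sb_end_write() when the statement scope exits, on every path. It is roughly equivalent to the open-coded form it replaces:

	sb_start_write(fs_info->sb);
	ret = btrfs_defrag_file(inode, ra, &range, defrag->transid,
				BTRFS_DEFRAG_BATCH);
	sb_end_write(fs_info->sb);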
iput(&inode->vfs_inode);
if (ret < 0)
@@ -480,7 +471,7 @@ static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
memcpy(&key, &root->defrag_progress, sizeof(key));
}
- path->keep_locks = 1;
+ path->keep_locks = true;
ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
if (ret < 0)
@@ -523,7 +514,7 @@ static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
/*
* Now that we reallocated the node we can find the next key. Note that
* btrfs_find_next_key() can release our path and do another search
- * without COWing, this is because even with path->keep_locks = 1,
+ * without COWing; this is because even with path->keep_locks == true,
* btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
* node when path->slots[node_level - 1] does not point to the last
* item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
@@ -624,7 +615,7 @@ static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
u64 ino = btrfs_ino(inode);
int ret;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
ret = -ENOMEM;
goto err;
@@ -734,12 +725,12 @@ next:
not_found:
btrfs_release_path(&path);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return NULL;
err:
btrfs_release_path(&path);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return ERR_PTR(ret);
}
@@ -756,7 +747,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
* full extent lock.
*/
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, sectorsize);
+ em = btrfs_lookup_extent_mapping(em_tree, start, sectorsize);
read_unlock(&em_tree->lock);
/*
@@ -769,7 +760,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
* file extent items in the inode's subvolume tree).
*/
if (em && (em->flags & EXTENT_FLAG_MERGED)) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = NULL;
}
@@ -779,10 +770,10 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
/* Get the big lock and read metadata off disk. */
if (!locked)
- lock_extent(io_tree, start, end, &cached);
+ btrfs_lock_extent(io_tree, start, end, &cached);
em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
if (!locked)
- unlock_extent(io_tree, start, end, &cached);
+ btrfs_unlock_extent(io_tree, start, end, &cached);
if (IS_ERR(em))
return NULL;
@@ -794,7 +785,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
const struct extent_map *em)
{
- if (extent_map_is_compressed(em))
+ if (btrfs_extent_map_is_compressed(em))
return BTRFS_MAX_COMPRESSED;
return fs_info->max_extent_size;
}
@@ -837,7 +828,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
ret = true;
out:
- free_extent_map(next);
+ btrfs_free_extent_map(next);
return ret;
}
@@ -857,13 +848,14 @@ static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t
{
struct address_space *mapping = inode->vfs_inode.i_mapping;
gfp_t mask = btrfs_alloc_write_mask(mapping);
- u64 page_start = (u64)index << PAGE_SHIFT;
- u64 page_end = page_start + PAGE_SIZE - 1;
+ u64 lock_start;
+ u64 lock_end;
struct extent_state *cached_state = NULL;
struct folio *folio;
int ret;
again:
+ /* TODO: Add FGP order flags when large folios are fully enabled. */
folio = __filemap_get_folio(mapping, index,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
if (IS_ERR(folio))
@@ -871,13 +863,16 @@ again:
/*
* Since we can defragment files opened read-only, we can encounter
- * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
- * can't do I/O using huge pages yet, so return an error for now.
+ * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS).
+ *
+ * The IO for such large folios is not fully tested, thus return
+ * an error to reject such folios unless it's an experimental build.
+ *
* Filesystem transparent huge pages are typically only used for
* executables that explicitly enable them, so this isn't very
* restrictive.
*/
- if (folio_test_large(folio)) {
+ if (!IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && folio_test_large(folio)) {
folio_unlock(folio);
folio_put(folio);
return ERR_PTR(-ETXTBSY);
@@ -890,14 +885,15 @@ again:
return ERR_PTR(ret);
}
+ lock_start = folio_pos(folio);
+ lock_end = folio_next_pos(folio) - 1;
/* Wait for any existing ordered extent in the range */
while (1) {
struct btrfs_ordered_extent *ordered;
- lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
- ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
- unlock_extent(&inode->io_tree, page_start, page_end,
- &cached_state);
+ btrfs_lock_extent(&inode->io_tree, lock_start, lock_end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, lock_start, folio_size(folio));
+ btrfs_unlock_extent(&inode->io_tree, lock_start, lock_end, &cached_state);
if (!ordered)
break;
@@ -928,7 +924,7 @@ again:
folio_put(folio);
goto again;
}
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
folio_unlock(folio);
folio_put(folio);
return ERR_PTR(-EIO);
@@ -951,7 +947,7 @@ struct defrag_target_range {
* @extent_thresh: file extent size threshold, any extent size >= this value
* will be ignored
* @newer_than: only defrag extents newer than this value
- * @do_compress: whether the defrag is doing compression
+ * @do_compress: whether the defrag is applying compression or forced no-compression
* if true, @extent_thresh will be ignored and all regular
* file extents meeting @newer_than will be targets.
* @locked: if the range has already held extent lock
@@ -1027,8 +1023,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
* very likely resulting in a larger extent after writeback is
* triggered (except in a case of free space fragmentation).
*/
- if (test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1,
- EXTENT_DELALLOC))
+ if (btrfs_test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1,
+ EXTENT_DELALLOC))
goto next;
/*
@@ -1066,8 +1062,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
/* Empty target list, no way to merge with last entry */
if (list_empty(target_list))
goto next;
- last = list_entry(target_list->prev,
- struct defrag_target_range, list);
+ last = list_last_entry(target_list,
+ struct defrag_target_range, list);
/* Not mergeable with last entry */
if (last->start + last->len != cur)
goto next;
@@ -1077,7 +1073,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
add:
last_is_target = true;
- range_len = min(extent_map_end(em), start + len) - cur;
+ range_len = min(btrfs_extent_map_end(em), start + len) - cur;
/*
* This one is a good target, check if it can be merged into
* last range of the target list.
@@ -1085,8 +1081,8 @@ add:
if (!list_empty(target_list)) {
struct defrag_target_range *last;
- last = list_entry(target_list->prev,
- struct defrag_target_range, list);
+ last = list_last_entry(target_list,
+ struct defrag_target_range, list);
ASSERT(last->start + last->len <= cur);
if (last->start + last->len == cur) {
/* Mergeable, enlarge the last entry */
@@ -1099,7 +1095,7 @@ add:
/* Allocate new defrag_target_range */
new = kmalloc(sizeof(*new), GFP_NOFS);
if (!new) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ret = -ENOMEM;
break;
}
@@ -1108,8 +1104,8 @@ add:
list_add_tail(&new->list, target_list);
next:
- cur = extent_map_end(em);
- free_extent_map(em);
+ cur = btrfs_extent_map_end(em);
+ btrfs_free_extent_map(em);
}
if (ret < 0) {
struct defrag_target_range *entry;
@@ -1162,27 +1158,31 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
struct extent_changeset *data_reserved = NULL;
const u64 start = target->start;
const u64 len = target->len;
- unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
- unsigned long start_index = start >> PAGE_SHIFT;
- unsigned long first_index = folios[0]->index;
int ret = 0;
- int i;
-
- ASSERT(last_index - first_index + 1 <= nr_pages);
ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
if (ret < 0)
return ret;
- clear_extent_bit(&inode->io_tree, start, start + len - 1,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, cached_state);
- set_extent_bit(&inode->io_tree, start, start + len - 1,
- EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);
-
- /* Update the page status */
- for (i = start_index - first_index; i <= last_index - first_index; i++) {
- folio_clear_checked(folios[i]);
- btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len);
+ btrfs_clear_extent_bit(&inode->io_tree, start, start + len - 1,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, cached_state);
+ btrfs_set_extent_bit(&inode->io_tree, start, start + len - 1,
+ EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);
+
+ /*
+ * Update the page status.
+ * Due to possible large folios, we have to check all folios one by one.
+ */
+ for (int i = 0; i < nr_pages && folios[i]; i++) {
+ struct folio *folio = folios[i];
+
+ if (start >= folio_next_pos(folio) ||
+ start + len <= folio_pos(folio))
+ continue;
+ btrfs_folio_clamp_clear_checked(fs_info, folio, start, len);
+ btrfs_folio_clamp_set_dirty(fs_info, folio, start, len);
}
btrfs_delalloc_release_extents(inode, len);
extent_changeset_free(data_reserved);
@@ -1200,11 +1200,10 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
LIST_HEAD(target_list);
struct folio **folios;
const u32 sectorsize = inode->root->fs_info->sectorsize;
- u64 last_index = (start + len - 1) >> PAGE_SHIFT;
- u64 start_index = start >> PAGE_SHIFT;
- unsigned int nr_pages = last_index - start_index + 1;
+ u64 cur = start;
+ const unsigned int nr_pages = ((start + len - 1) >> PAGE_SHIFT) -
+ (start >> PAGE_SHIFT) + 1;
int ret = 0;
- int i;
ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
@@ -1214,21 +1213,25 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
return -ENOMEM;
/* Prepare all pages */
- for (i = 0; i < nr_pages; i++) {
- folios[i] = defrag_prepare_one_folio(inode, start_index + i);
+ for (int i = 0; cur < start + len && i < nr_pages; i++) {
+ folios[i] = defrag_prepare_one_folio(inode, cur >> PAGE_SHIFT);
if (IS_ERR(folios[i])) {
ret = PTR_ERR(folios[i]);
- nr_pages = i;
+ folios[i] = NULL;
goto free_folios;
}
+ cur = folio_next_pos(folios[i]);
}
- for (i = 0; i < nr_pages; i++)
+ for (int i = 0; i < nr_pages; i++) {
+ if (!folios[i])
+ break;
folio_wait_writeback(folios[i]);
+ }
+ /* We should get at least one folio. */
+ ASSERT(folios[0]);
/* Lock the pages range */
- lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
- (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
- &cached_state);
+ btrfs_lock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state);
/*
* Now we have a consistent view about the extent map, re-check
* which range really needs to be defragged.
@@ -1254,11 +1257,11 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
kfree(entry);
}
unlock_extent:
- unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
- (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
- &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state);
free_folios:
- for (i = 0; i < nr_pages; i++) {
+ for (int i = 0; i < nr_pages; i++) {
+ if (!folios[i])
+ break;
folio_unlock(folios[i]);
folio_put(folios[i]);
}
@@ -1362,6 +1365,7 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
u64 cur;
u64 last_byte;
bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
+ bool no_compress = (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS);
int compress_type = BTRFS_COMPRESS_ZLIB;
int compress_level = 0;
int ret = 0;
@@ -1392,6 +1396,9 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
if (range->compress_type)
compress_type = range->compress_type;
}
+ } else if (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS) {
+ compress_type = BTRFS_DEFRAG_DONT_COMPRESS;
+ compress_level = 1;
}
if (extent_thresh == 0)
@@ -1442,13 +1449,14 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
btrfs_inode_unlock(inode, 0);
break;
}
- if (do_compress) {
+ if (do_compress || no_compress) {
inode->defrag_compress = compress_type;
inode->defrag_compress_level = compress_level;
}
ret = defrag_one_cluster(inode, ra, cur,
cluster_end + 1 - cur, extent_thresh,
- newer_than, do_compress, &sectors_defragged,
+ newer_than, do_compress || no_compress,
+ &sectors_defragged,
max_to_defrag, &last_scanned);
if (sectors_defragged > prev_sectors_defragged)
@@ -1487,7 +1495,7 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
ret = sectors_defragged;
}
- if (do_compress) {
+ if (do_compress || no_compress) {
btrfs_inode_lock(inode, 0);
inode->defrag_compress = BTRFS_COMPRESS_NONE;
btrfs_inode_unlock(inode, 0);
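
The defrag hunks above replace PAGE_SIZE arithmetic with ranges derived from the folio itself. A minimal sketch of that relationship, not part of the patch, assuming folio_next_pos(folio) == folio_pos(folio) + folio_size(folio), with a hypothetical function name:

/*
 * Illustrative only: how the reworked defrag derives an extent lock
 * range from a (possibly large) folio.
 */
static void example_lock_folio_range(struct btrfs_inode *inode,
				     struct folio *folio,
				     struct extent_state **cached)
{
	const u64 lock_start = folio_pos(folio);	/* first byte covered */
	const u64 lock_end = folio_next_pos(folio) - 1;	/* last byte covered */

	/* Valid for any folio order; nothing here assumes PAGE_SIZE. */
	btrfs_lock_extent(&inode->io_tree, lock_start, lock_end, cached);
	/* ... e.g. look up ordered extents over folio_size(folio) bytes ... */
	btrfs_unlock_extent(&inode->io_tree, lock_start, lock_end, cached);
}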
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 88e900e5a43d..0970799d0aa4 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -111,6 +111,18 @@
* making error handling and cleanup easier.
*/
+static inline struct btrfs_space_info *data_sinfo_for_inode(const struct btrfs_inode *inode)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ if (btrfs_is_zoned(fs_info) && btrfs_is_data_reloc_root(inode->root)) {
+ ASSERT(fs_info->data_sinfo->sub_group[0]->subgroup_id ==
+ BTRFS_SUB_GROUP_DATA_RELOC);
+ return fs_info->data_sinfo->sub_group[0];
+ }
+ return fs_info->data_sinfo;
+}
+
int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes)
{
struct btrfs_root *root = inode->root;
@@ -123,7 +135,7 @@ int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes)
if (btrfs_is_free_space_inode(inode))
flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
- return btrfs_reserve_data_bytes(fs_info, bytes, flush);
+ return btrfs_reserve_data_bytes(data_sinfo_for_inode(inode), bytes, flush);
}
int btrfs_check_data_free_space(struct btrfs_inode *inode,
@@ -144,14 +156,14 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode,
else if (btrfs_is_free_space_inode(inode))
flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
- ret = btrfs_reserve_data_bytes(fs_info, len, flush);
+ ret = btrfs_reserve_data_bytes(data_sinfo_for_inode(inode), len, flush);
if (ret < 0)
return ret;
/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
if (ret < 0) {
- btrfs_free_reserved_data_space_noquota(fs_info, len);
+ btrfs_free_reserved_data_space_noquota(inode, len);
extent_changeset_free(*reserved);
*reserved = NULL;
} else {
@@ -168,15 +180,13 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode,
* which we can't sleep and is sure it won't affect qgroup reserved space.
* Like clear_bit_hook().
*/
-void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
- u64 len)
+void btrfs_free_reserved_data_space_noquota(struct btrfs_inode *inode, u64 len)
{
- struct btrfs_space_info *data_sinfo;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
ASSERT(IS_ALIGNED(len, fs_info->sectorsize));
- data_sinfo = fs_info->data_sinfo;
- btrfs_space_info_free_bytes_may_use(data_sinfo, len);
+ btrfs_space_info_free_bytes_may_use(data_sinfo_for_inode(inode), len);
}
/*
@@ -196,7 +206,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
round_down(start, fs_info->sectorsize);
start = round_down(start, fs_info->sectorsize);
- btrfs_free_reserved_data_space_noquota(fs_info, len);
+ btrfs_free_reserved_data_space_noquota(inode, len);
btrfs_qgroup_free_data(inode, reserved, start, len, NULL);
}
@@ -348,8 +358,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
noflush);
if (ret)
return ret;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
- meta_reserve, flush);
+ ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, meta_reserve,
+ flush);
if (ret) {
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
return ret;
@@ -439,6 +449,29 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
btrfs_inode_rsv_release(inode, true);
}
+/* Shrink the metadata reservation for a delalloc range from @reserved_len to @new_len. */
+void btrfs_delalloc_shrink_extents(struct btrfs_inode *inode, u64 reserved_len, u64 new_len)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u32 reserved_num_extents = count_max_extents(fs_info, reserved_len);
+ const u32 new_num_extents = count_max_extents(fs_info, new_len);
+ const int diff_num_extents = new_num_extents - reserved_num_extents;
+
+ ASSERT(new_len <= reserved_len);
+ if (new_num_extents == reserved_num_extents)
+ return;
+
+ spin_lock(&inode->lock);
+ btrfs_mod_outstanding_extents(inode, diff_num_extents);
+ btrfs_calculate_inode_block_rsv_size(fs_info, inode);
+ spin_unlock(&inode->lock);
+
+ if (btrfs_is_testing(fs_info))
+ return;
+
+ btrfs_inode_rsv_release(inode, true);
+}
+
/*
* Reserve data and metadata space for delalloc
*
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
index 3f32953c0a80..6119c0d3f883 100644
--- a/fs/btrfs/delalloc-space.h
+++ b/fs/btrfs/delalloc-space.h
@@ -18,8 +18,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
void btrfs_delalloc_release_space(struct btrfs_inode *inode,
struct extent_changeset *reserved,
u64 start, u64 len, bool qgroup_free);
-void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
- u64 len);
+void btrfs_free_reserved_data_space_noquota(struct btrfs_inode *inode, u64 len);
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
bool qgroup_free);
int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
@@ -27,5 +26,6 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
u64 disk_num_bytes, bool noflush);
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
+void btrfs_delalloc_shrink_extents(struct btrfs_inode *inode, u64 reserved_len, u64 new_len);
#endif /* BTRFS_DELALLOC_SPACE_H */
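
A hypothetical caller sketch for the new btrfs_delalloc_shrink_extents() helper; everything except the helper itself is illustrative:

/* Sketch: give back part of a delalloc metadata reservation. */
static void example_short_write(struct btrfs_inode *inode,
				u64 reserved_len, u64 written_len)
{
	/* Metadata was reserved assuming reserved_len bytes of delalloc. */
	ASSERT(written_len <= reserved_len);

	/*
	 * Re-count the worst-case extents for the smaller length and
	 * release the difference; a no-op when both lengths map to the
	 * same number of extents.
	 */
	btrfs_delalloc_shrink_extents(inode, reserved_len, written_len);
}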
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 3f1551d8a5c6..ce6e9f8812e0 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -57,6 +57,7 @@ static inline void btrfs_init_delayed_node(
delayed_node->root = root;
delayed_node->inode_id = inode_id;
refcount_set(&delayed_node->refs, 0);
+ btrfs_delayed_node_ref_tracker_dir_init(delayed_node);
delayed_node->ins_root = RB_ROOT_CACHED;
delayed_node->del_root = RB_ROOT_CACHED;
mutex_init(&delayed_node->mutex);
@@ -65,7 +66,8 @@ static inline void btrfs_init_delayed_node(
}
static struct btrfs_delayed_node *btrfs_get_delayed_node(
- struct btrfs_inode *btrfs_inode)
+ struct btrfs_inode *btrfs_inode,
+ struct btrfs_ref_tracker *tracker)
{
struct btrfs_root *root = btrfs_inode->root;
u64 ino = btrfs_ino(btrfs_inode);
@@ -74,6 +76,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
node = READ_ONCE(btrfs_inode->delayed_node);
if (node) {
refcount_inc(&node->refs);
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_NOFS);
return node;
}
@@ -83,6 +86,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
if (node) {
if (btrfs_inode->delayed_node) {
refcount_inc(&node->refs); /* can be accessed */
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
BUG_ON(btrfs_inode->delayed_node != node);
xa_unlock(&root->delayed_nodes);
return node;
@@ -106,6 +110,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
*/
if (refcount_inc_not_zero(&node->refs)) {
refcount_inc(&node->refs);
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
+ btrfs_delayed_node_ref_tracker_alloc(node, &node->inode_cache_tracker,
+ GFP_ATOMIC);
btrfs_inode->delayed_node = node;
} else {
node = NULL;
@@ -119,9 +126,15 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
return NULL;
}
-/* Will return either the node or PTR_ERR(-ENOMEM) */
+/*
+ * Look up an existing delayed node associated with @btrfs_inode or create a new
+ * one and insert it to the delayed nodes of the root.
+ *
+ * Return the delayed node, or error pointer on failure.
+ */
static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
- struct btrfs_inode *btrfs_inode)
+ struct btrfs_inode *btrfs_inode,
+ struct btrfs_ref_tracker *tracker)
{
struct btrfs_delayed_node *node;
struct btrfs_root *root = btrfs_inode->root;
@@ -130,7 +143,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
void *ptr;
again:
- node = btrfs_get_delayed_node(btrfs_inode);
+ node = btrfs_get_delayed_node(btrfs_inode, tracker);
if (node)
return node;
@@ -139,12 +152,10 @@ again:
return ERR_PTR(-ENOMEM);
btrfs_init_delayed_node(node, root, ino);
- /* Cached in the inode and can be accessed. */
- refcount_set(&node->refs, 2);
-
/* Allocate and reserve the slot, from now it can return a NULL from xa_load(). */
ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS);
if (ret == -ENOMEM) {
+ btrfs_delayed_node_ref_tracker_dir_exit(node);
kmem_cache_free(delayed_node_cache, node);
return ERR_PTR(-ENOMEM);
}
@@ -153,6 +164,7 @@ again:
if (ptr) {
/* Somebody inserted it, go back and read it. */
xa_unlock(&root->delayed_nodes);
+ btrfs_delayed_node_ref_tracker_dir_exit(node);
kmem_cache_free(delayed_node_cache, node);
node = NULL;
goto again;
@@ -161,6 +173,12 @@ again:
ASSERT(xa_err(ptr) != -EINVAL);
ASSERT(xa_err(ptr) != -ENOMEM);
ASSERT(ptr == NULL);
+
+ /* Cached in the inode and can be accessed. */
+ refcount_set(&node->refs, 2);
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
+ btrfs_delayed_node_ref_tracker_alloc(node, &node->inode_cache_tracker, GFP_ATOMIC);
+
btrfs_inode->delayed_node = node;
xa_unlock(&root->delayed_nodes);
@@ -186,6 +204,8 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
list_add_tail(&node->n_list, &root->node_list);
list_add_tail(&node->p_list, &root->prepare_list);
refcount_inc(&node->refs); /* inserted into list */
+ btrfs_delayed_node_ref_tracker_alloc(node, &node->node_list_tracker,
+ GFP_ATOMIC);
root->nodes++;
set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
}
@@ -199,6 +219,7 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
spin_lock(&root->lock);
if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
root->nodes--;
+ btrfs_delayed_node_ref_tracker_free(node, &node->node_list_tracker);
refcount_dec(&node->refs); /* not in the list */
list_del_init(&node->n_list);
if (!list_empty(&node->p_list))
@@ -209,26 +230,26 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
}
static struct btrfs_delayed_node *btrfs_first_delayed_node(
- struct btrfs_delayed_root *delayed_root)
+ struct btrfs_delayed_root *delayed_root,
+ struct btrfs_ref_tracker *tracker)
{
- struct list_head *p;
- struct btrfs_delayed_node *node = NULL;
+ struct btrfs_delayed_node *node;
spin_lock(&delayed_root->lock);
- if (list_empty(&delayed_root->node_list))
- goto out;
-
- p = delayed_root->node_list.next;
- node = list_entry(p, struct btrfs_delayed_node, n_list);
- refcount_inc(&node->refs);
-out:
+ node = list_first_entry_or_null(&delayed_root->node_list,
+ struct btrfs_delayed_node, n_list);
+ if (node) {
+ refcount_inc(&node->refs);
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
+ }
spin_unlock(&delayed_root->lock);
return node;
}
static struct btrfs_delayed_node *btrfs_next_delayed_node(
- struct btrfs_delayed_node *node)
+ struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker)
{
struct btrfs_delayed_root *delayed_root;
struct list_head *p;
@@ -248,6 +269,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node(
next = list_entry(p, struct btrfs_delayed_node, n_list);
refcount_inc(&next->refs);
+ btrfs_delayed_node_ref_tracker_alloc(next, tracker, GFP_ATOMIC);
out:
spin_unlock(&delayed_root->lock);
@@ -256,7 +278,7 @@ out:
static void __btrfs_release_delayed_node(
struct btrfs_delayed_node *delayed_node,
- int mod)
+ int mod, struct btrfs_ref_tracker *tracker)
{
struct btrfs_delayed_root *delayed_root;
@@ -272,6 +294,7 @@ static void __btrfs_release_delayed_node(
btrfs_dequeue_delayed_node(delayed_root, delayed_node);
mutex_unlock(&delayed_node->mutex);
+ btrfs_delayed_node_ref_tracker_free(delayed_node, tracker);
if (refcount_dec_and_test(&delayed_node->refs)) {
struct btrfs_root *root = delayed_node->root;
@@ -281,39 +304,41 @@ static void __btrfs_release_delayed_node(
* back up. We can delete it now.
*/
ASSERT(refcount_read(&delayed_node->refs) == 0);
+ btrfs_delayed_node_ref_tracker_dir_exit(delayed_node);
kmem_cache_free(delayed_node_cache, delayed_node);
}
}
-static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
+static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker)
{
- __btrfs_release_delayed_node(node, 0);
+ __btrfs_release_delayed_node(node, 0, tracker);
}
static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
- struct btrfs_delayed_root *delayed_root)
+ struct btrfs_delayed_root *delayed_root,
+ struct btrfs_ref_tracker *tracker)
{
- struct list_head *p;
- struct btrfs_delayed_node *node = NULL;
+ struct btrfs_delayed_node *node;
spin_lock(&delayed_root->lock);
- if (list_empty(&delayed_root->prepare_list))
- goto out;
-
- p = delayed_root->prepare_list.next;
- list_del_init(p);
- node = list_entry(p, struct btrfs_delayed_node, p_list);
- refcount_inc(&node->refs);
-out:
+ node = list_first_entry_or_null(&delayed_root->prepare_list,
+ struct btrfs_delayed_node, p_list);
+ if (node) {
+ list_del_init(&node->p_list);
+ refcount_inc(&node->refs);
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
+ }
spin_unlock(&delayed_root->lock);
return node;
}
static inline void btrfs_release_prepared_delayed_node(
- struct btrfs_delayed_node *node)
+ struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker)
{
- __btrfs_release_delayed_node(node, 1);
+ __btrfs_release_delayed_node(node, 1, tracker);
}
static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
@@ -336,6 +361,20 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
return item;
}
+static int delayed_item_index_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *index = key;
+ const struct btrfs_delayed_item *delayed_item = rb_entry(node,
+ struct btrfs_delayed_item, rb_node);
+
+ if (delayed_item->index < *index)
+ return 1;
+ else if (delayed_item->index > *index)
+ return -1;
+
+ return 0;
+}
+
/*
* Look up the delayed item by key.
*
@@ -349,21 +388,10 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
struct rb_root *root,
u64 index)
{
- struct rb_node *node = root->rb_node;
- struct btrfs_delayed_item *delayed_item = NULL;
+ struct rb_node *node;
- while (node) {
- delayed_item = rb_entry(node, struct btrfs_delayed_item,
- rb_node);
- if (delayed_item->index < index)
- node = node->rb_right;
- else if (delayed_item->index > index)
- node = node->rb_left;
- else
- return delayed_item;
- }
-
- return NULL;
+ node = rb_find(&index, root, delayed_item_index_cmp);
+ return rb_entry_safe(node, struct btrfs_delayed_item, rb_node);
}
static int btrfs_delayed_item_cmp(const struct rb_node *new,
@@ -371,14 +399,8 @@ static int btrfs_delayed_item_cmp(const struct rb_node *new,
{
const struct btrfs_delayed_item *new_item =
rb_entry(new, struct btrfs_delayed_item, rb_node);
- const struct btrfs_delayed_item *exist_item =
- rb_entry(exist, struct btrfs_delayed_item, rb_node);
- if (new_item->index < exist_item->index)
- return -1;
- if (new_item->index > exist_item->index)
- return 1;
- return 0;
+ return delayed_item_index_cmp(&new_item->index, exist);
}
static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
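
The comparator unification above is the generic rb_find() pattern from <linux/rbtree.h>: one key-vs-node function serves both lookup and insertion. A self-contained sketch of the same idiom, with illustrative type and function names:

#include <linux/rbtree.h>
#include <linux/types.h>

struct example_item {
	struct rb_node rb_node;
	u64 index;
};

/* Same shape as delayed_item_index_cmp(): compare a key to a tree node. */
static int example_index_cmp(const void *key, const struct rb_node *node)
{
	const u64 *index = key;
	const struct example_item *item =
		rb_entry(node, struct example_item, rb_node);

	if (item->index < *index)
		return 1;
	if (item->index > *index)
		return -1;
	return 0;
}

static struct example_item *example_lookup(struct rb_root *root, u64 index)
{
	struct rb_node *node = rb_find(&index, root, example_index_cmp);

	return rb_entry_safe(node, struct example_item, rb_node);
}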
@@ -454,40 +476,25 @@ static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
struct btrfs_delayed_node *delayed_node)
{
- struct rb_node *p;
- struct btrfs_delayed_item *item = NULL;
+ struct rb_node *p = rb_first_cached(&delayed_node->ins_root);
- p = rb_first_cached(&delayed_node->ins_root);
- if (p)
- item = rb_entry(p, struct btrfs_delayed_item, rb_node);
-
- return item;
+ return rb_entry_safe(p, struct btrfs_delayed_item, rb_node);
}
static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
struct btrfs_delayed_node *delayed_node)
{
- struct rb_node *p;
- struct btrfs_delayed_item *item = NULL;
-
- p = rb_first_cached(&delayed_node->del_root);
- if (p)
- item = rb_entry(p, struct btrfs_delayed_item, rb_node);
+ struct rb_node *p = rb_first_cached(&delayed_node->del_root);
- return item;
+ return rb_entry_safe(p, struct btrfs_delayed_item, rb_node);
}
static struct btrfs_delayed_item *__btrfs_next_delayed_item(
struct btrfs_delayed_item *item)
{
- struct rb_node *p;
- struct btrfs_delayed_item *next = NULL;
-
- p = rb_next(&item->rb_node);
- if (p)
- next = rb_entry(p, struct btrfs_delayed_item, rb_node);
+ struct rb_node *p = rb_next(&item->rb_node);
- return next;
+ return rb_entry_safe(p, struct btrfs_delayed_item, rb_node);
}
static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
@@ -661,7 +668,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
struct btrfs_key first_key;
const u32 first_data_size = first_item->data_len;
int total_size;
- char *ins_data = NULL;
+ char AUTO_KFREE(ins_data);
int ret;
bool continuous_keys_only = false;
@@ -731,12 +738,10 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
u32 *ins_sizes;
int i = 0;
- ins_data = kmalloc(batch.nr * sizeof(u32) +
- batch.nr * sizeof(struct btrfs_key), GFP_NOFS);
- if (!ins_data) {
- ret = -ENOMEM;
- goto out;
- }
+ ins_data = kmalloc_array(batch.nr,
+ sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data)
+ return -ENOMEM;
ins_sizes = (u32 *)ins_data;
ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32));
batch.keys = ins_keys;
@@ -752,7 +757,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_items(trans, root, path, &batch);
if (ret)
- goto out;
+ return ret;
list_for_each_entry(curr, &item_list, tree_list) {
char *data_ptr;
@@ -807,9 +812,8 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
list_del(&curr->tree_list);
btrfs_release_delayed_item(curr);
}
-out:
- kfree(ins_data);
- return ret;
+
+ return 0;
}
static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
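
The AUTO_KFREE() annotation above reads as a scope-based cleanup attribute, which is what lets the error paths drop the out: label and return directly. Assuming it is equivalent to the stock __free(kfree) attribute from <linux/cleanup.h>, the idiom looks like this:

#include <linux/cleanup.h>
#include <linux/slab.h>

/* Sketch: buf is kfree()d automatically on every return path. */
static int example_scoped_buffer(size_t nr, size_t elem_size)
{
	char *buf __free(kfree) = kmalloc_array(nr, elem_size, GFP_NOFS);

	if (!buf)
		return -ENOMEM;

	/* ... fill and use buf; no explicit kfree() or unwind label ... */
	return 0;
}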
@@ -1025,8 +1029,16 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_inode(trans, root, path, &key, mod);
if (ret > 0)
ret = -ENOENT;
- if (ret < 0)
+ if (ret < 0) {
+ /*
+ * If we fail to update the delayed inode we need to abort the
+ * transaction, because we could leave the inode with the
+ * improper counts behind.
+ */
+ if (unlikely(ret != -ENOENT))
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
@@ -1051,8 +1063,10 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0)
+ if (unlikely(ret < 0)) {
+ btrfs_abort_transaction(trans, ret);
goto err_out;
+ }
ASSERT(ret > 0);
ASSERT(path->slots[0] > 0);
ret = 0;
@@ -1074,21 +1088,14 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
* in the same item doesn't exist.
*/
ret = btrfs_del_item(trans, root, path);
+ if (ret < 0)
+ btrfs_abort_transaction(trans, ret);
out:
btrfs_release_delayed_iref(node);
btrfs_release_path(path);
err_out:
btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0));
btrfs_release_delayed_inode(node);
-
- /*
- * If we fail to update the delayed inode we need to abort the
- * transaction, because we could leave the inode with the improper
- * counts behind.
- */
- if (ret && ret != -ENOENT)
- btrfs_abort_transaction(trans, ret);
-
return ret;
}
@@ -1143,6 +1150,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_root *delayed_root;
struct btrfs_delayed_node *curr_node, *prev_node;
+ struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker;
struct btrfs_path *path;
struct btrfs_block_rsv *block_rsv;
int ret = 0;
@@ -1160,17 +1168,18 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
delayed_root = fs_info->delayed_root;
- curr_node = btrfs_first_delayed_node(delayed_root);
+ curr_node = btrfs_first_delayed_node(delayed_root, &curr_delayed_node_tracker);
while (curr_node && (!count || nr--)) {
ret = __btrfs_commit_inode_delayed_items(trans, path,
curr_node);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
prev_node = curr_node;
- curr_node = btrfs_next_delayed_node(curr_node);
+ prev_delayed_node_tracker = curr_delayed_node_tracker;
+ curr_node = btrfs_next_delayed_node(curr_node, &curr_delayed_node_tracker);
/*
* See the comment below about releasing path before releasing
* node. If the commit of delayed items was successful the path
@@ -1178,7 +1187,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
* point to locked extent buffers (a leaf at the very least).
*/
ASSERT(path->nodes[0] == NULL);
- btrfs_release_delayed_node(prev_node);
+ btrfs_release_delayed_node(prev_node, &prev_delayed_node_tracker);
}
/*
@@ -1191,7 +1200,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
btrfs_free_path(path);
if (curr_node)
- btrfs_release_delayed_node(curr_node);
+ btrfs_release_delayed_node(curr_node, &curr_delayed_node_tracker);
trans->block_rsv = block_rsv;
return ret;
@@ -1210,7 +1219,9 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr)
int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
- struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+ struct btrfs_ref_tracker delayed_node_tracker;
+ struct btrfs_delayed_node *delayed_node =
+ btrfs_get_delayed_node(inode, &delayed_node_tracker);
BTRFS_PATH_AUTO_FREE(path);
struct btrfs_block_rsv *block_rsv;
int ret;
@@ -1221,14 +1232,14 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
mutex_lock(&delayed_node->mutex);
if (!delayed_node->count) {
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return 0;
}
mutex_unlock(&delayed_node->mutex);
path = btrfs_alloc_path();
if (!path) {
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return -ENOMEM;
}
@@ -1237,7 +1248,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
trans->block_rsv = block_rsv;
return ret;
@@ -1247,18 +1258,20 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_trans_handle *trans;
- struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+ struct btrfs_ref_tracker delayed_node_tracker;
+ struct btrfs_delayed_node *delayed_node;
struct btrfs_path *path;
struct btrfs_block_rsv *block_rsv;
int ret;
+ delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!delayed_node)
return 0;
mutex_lock(&delayed_node->mutex);
if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return 0;
}
mutex_unlock(&delayed_node->mutex);
@@ -1292,7 +1305,7 @@ trans_out:
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
out:
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return ret;
}
@@ -1306,7 +1319,8 @@ void btrfs_remove_delayed_node(struct btrfs_inode *inode)
return;
inode->delayed_node = NULL;
- btrfs_release_delayed_node(delayed_node);
+
+ btrfs_release_delayed_node(delayed_node, &delayed_node->inode_cache_tracker);
}
struct btrfs_async_delayed_work {
@@ -1322,6 +1336,7 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
struct btrfs_trans_handle *trans;
struct btrfs_path *path;
struct btrfs_delayed_node *delayed_node = NULL;
+ struct btrfs_ref_tracker delayed_node_tracker;
struct btrfs_root *root;
struct btrfs_block_rsv *block_rsv;
int total_done = 0;
@@ -1338,7 +1353,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
BTRFS_DELAYED_BACKGROUND / 2)
break;
- delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
+ delayed_node = btrfs_first_prepared_delayed_node(delayed_root,
+ &delayed_node_tracker);
if (!delayed_node)
break;
@@ -1347,7 +1363,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
btrfs_release_path(path);
- btrfs_release_prepared_delayed_node(delayed_node);
+ btrfs_release_prepared_delayed_node(delayed_node,
+ &delayed_node_tracker);
total_done++;
continue;
}
@@ -1362,7 +1379,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
btrfs_btree_balance_dirty_nodelay(root->fs_info);
btrfs_release_path(path);
- btrfs_release_prepared_delayed_node(delayed_node);
+ btrfs_release_prepared_delayed_node(delayed_node,
+ &delayed_node_tracker);
total_done++;
} while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK)
@@ -1394,20 +1412,28 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info)
{
- WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root));
+ struct btrfs_ref_tracker delayed_node_tracker;
+ struct btrfs_delayed_node *node;
+
+ node = btrfs_first_delayed_node(fs_info->delayed_root, &delayed_node_tracker);
+ if (WARN_ON(node)) {
+ btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker);
+ refcount_dec(&node->refs);
+ }
}
-static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
+static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
{
int val = atomic_read(&delayed_root->items_seq);
if (val < seq || val >= seq + BTRFS_DELAYED_BATCH)
- return 1;
+ return true;
if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
- return 1;
+ return true;
- return 0;
+ return false;
}
void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
@@ -1468,13 +1494,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = trans->fs_info;
const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info);
struct btrfs_delayed_node *delayed_node;
+ struct btrfs_ref_tracker delayed_node_tracker;
struct btrfs_delayed_item *delayed_item;
struct btrfs_dir_item *dir_item;
bool reserve_leaf_space;
u32 data_len;
int ret;
- delayed_node = btrfs_get_or_create_delayed_node(dir);
+ delayed_node = btrfs_get_or_create_delayed_node(dir, &delayed_node_tracker);
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
@@ -1550,12 +1577,12 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
mutex_unlock(&delayed_node->mutex);
release_node:
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return ret;
}
-static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node,
- u64 index)
+static bool btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node,
+ u64 index)
{
struct btrfs_delayed_item *item;
@@ -1563,7 +1590,7 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node,
item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index);
if (!item) {
mutex_unlock(&node->mutex);
- return 1;
+ return false;
}
/*
@@ -1598,23 +1625,25 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node,
}
mutex_unlock(&node->mutex);
- return 0;
+ return true;
}
int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir, u64 index)
{
struct btrfs_delayed_node *node;
+ struct btrfs_ref_tracker delayed_node_tracker;
struct btrfs_delayed_item *item;
int ret;
- node = btrfs_get_or_create_delayed_node(dir);
+ node = btrfs_get_or_create_delayed_node(dir, &delayed_node_tracker);
if (IS_ERR(node))
return PTR_ERR(node);
- ret = btrfs_delete_delayed_insertion_item(node, index);
- if (!ret)
+ if (btrfs_delete_delayed_insertion_item(node, index)) {
+ ret = 0;
goto end;
+ }
item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM);
if (!item) {
@@ -1631,7 +1660,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
*/
if (ret < 0) {
btrfs_err(trans->fs_info,
-"metadata reservation failed for delayed dir item deltiona, should have been reserved");
+"metadata reservation failed for delayed dir item deletion, index: %llu, root: %llu, inode: %llu, error: %d",
+ index, btrfs_root_id(node->root), node->inode_id, ret);
btrfs_release_delayed_item(item);
goto end;
}
@@ -1640,22 +1670,23 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
ret = __btrfs_add_delayed_item(node, item);
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
- "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
- index, btrfs_root_id(node->root),
- node->inode_id, ret);
+"failed to add delayed dir index item, root: %llu, inode: %llu, index: %llu, error: %d",
+ index, btrfs_root_id(node->root), node->inode_id, ret);
btrfs_delayed_item_release_metadata(dir->root, item);
btrfs_release_delayed_item(item);
}
mutex_unlock(&node->mutex);
end:
- btrfs_release_delayed_node(node);
+ btrfs_release_delayed_node(node, &delayed_node_tracker);
return ret;
}
int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
{
- struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+ struct btrfs_ref_tracker delayed_node_tracker;
+ struct btrfs_delayed_node *delayed_node;
+ delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!delayed_node)
return -ENOENT;
@@ -1665,12 +1696,12 @@ int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
* is updated now. So we needn't lock the delayed node.
*/
if (!delayed_node->index_cnt) {
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return -EINVAL;
}
inode->index_cnt = delayed_node->index_cnt;
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return 0;
}
@@ -1681,8 +1712,9 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
{
struct btrfs_delayed_node *delayed_node;
struct btrfs_delayed_item *item;
+ struct btrfs_ref_tracker delayed_node_tracker;
- delayed_node = btrfs_get_delayed_node(inode);
+ delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!delayed_node)
return false;
@@ -1717,6 +1749,7 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
* insert/delete delayed items in this period. So we also needn't
* requeue or dequeue this delayed node.
*/
+ btrfs_delayed_node_ref_tracker_free(delayed_node, &delayed_node_tracker);
refcount_dec(&delayed_node->refs);
return true;
@@ -1747,17 +1780,16 @@ void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode,
downgrade_write(&inode->vfs_inode.i_rwsem);
}
-int btrfs_should_delete_dir_index(const struct list_head *del_list,
- u64 index)
+bool btrfs_should_delete_dir_index(const struct list_head *del_list, u64 index)
{
struct btrfs_delayed_item *curr;
- int ret = 0;
+ bool ret = false;
list_for_each_entry(curr, del_list, readdir_list) {
if (curr->index > index)
break;
if (curr->index == index) {
- ret = 1;
+ ret = true;
break;
}
}
@@ -1767,15 +1799,14 @@ int btrfs_should_delete_dir_index(const struct list_head *del_list,
/*
* Read dir info stored in the delayed tree.
*/
-int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- const struct list_head *ins_list)
+bool btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
+ const struct list_head *ins_list)
{
struct btrfs_dir_item *di;
struct btrfs_delayed_item *curr, *next;
struct btrfs_key location;
char *name;
int name_len;
- int over = 0;
unsigned char d_type;
/*
@@ -1784,6 +1815,8 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
* directory, nobody can delete any directory indexes now.
*/
list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
+ bool over;
+
list_del(&curr->readdir_list);
if (curr->index < ctx->pos) {
@@ -1801,17 +1834,16 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
d_type = fs_ftype_to_dtype(btrfs_dir_flags_to_ftype(di->type));
btrfs_disk_key_to_cpu(&location, &di->location);
- over = !dir_emit(ctx, name, name_len,
- location.objectid, d_type);
+ over = !dir_emit(ctx, name, name_len, location.objectid, d_type);
if (refcount_dec_and_test(&curr->refs))
kfree(curr);
if (over)
- return 1;
+ return true;
ctx->pos++;
}
- return 0;
+ return false;
}
static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
@@ -1857,19 +1889,19 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_delayed_node *delayed_node;
+ struct btrfs_ref_tracker delayed_node_tracker;
struct btrfs_inode_item *inode_item;
struct inode *vfs_inode = &inode->vfs_inode;
- delayed_node = btrfs_get_delayed_node(inode);
+ delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!delayed_node)
return -ENOENT;
mutex_lock(&delayed_node->mutex);
if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return -ENOENT;
}
@@ -1878,8 +1910,6 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item));
i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item));
btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
- btrfs_inode_set_file_extent_range(inode, 0,
- round_up(i_size_read(vfs_inode), fs_info->sectorsize));
vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item);
set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item));
inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item));
@@ -1909,7 +1939,7 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
inode->index_cnt = (u64)-1;
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return 0;
}
@@ -1918,9 +1948,10 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = inode->root;
struct btrfs_delayed_node *delayed_node;
+ struct btrfs_ref_tracker delayed_node_tracker;
int ret = 0;
- delayed_node = btrfs_get_or_create_delayed_node(inode);
+ delayed_node = btrfs_get_or_create_delayed_node(inode, &delayed_node_tracker);
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
@@ -1940,7 +1971,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
atomic_inc(&root->fs_info->delayed_root->items);
release_node:
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return ret;
}
@@ -1948,6 +1979,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_delayed_node *delayed_node;
+ struct btrfs_ref_tracker delayed_node_tracker;
/*
* we don't do delayed inode updates during log recovery because it
@@ -1957,7 +1989,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
return -EAGAIN;
- delayed_node = btrfs_get_or_create_delayed_node(inode);
+ delayed_node = btrfs_get_or_create_delayed_node(inode, &delayed_node_tracker);
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
@@ -1976,15 +2008,12 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
* It is very rare.
*/
mutex_lock(&delayed_node->mutex);
- if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
- goto release_node;
-
- set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
- delayed_node->count++;
- atomic_inc(&fs_info->delayed_root->items);
-release_node:
+ if (!test_and_set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) {
+ delayed_node->count++;
+ atomic_inc(&fs_info->delayed_root->items);
+ }
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return 0;
}
@@ -2028,19 +2057,21 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
{
struct btrfs_delayed_node *delayed_node;
+ struct btrfs_ref_tracker delayed_node_tracker;
- delayed_node = btrfs_get_delayed_node(inode);
+ delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!delayed_node)
return;
__btrfs_kill_delayed_node(delayed_node);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
}
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
{
unsigned long index = 0;
struct btrfs_delayed_node *delayed_nodes[8];
+ struct btrfs_ref_tracker delayed_node_trackers[8];
while (1) {
struct btrfs_delayed_node *node;
@@ -2059,6 +2090,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
* about to be removed from the tree in the loop below
*/
if (refcount_inc_not_zero(&node->refs)) {
+ btrfs_delayed_node_ref_tracker_alloc(node,
+ &delayed_node_trackers[count],
+ GFP_ATOMIC);
delayed_nodes[count] = node;
count++;
}
@@ -2070,7 +2104,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
for (int i = 0; i < count; i++) {
__btrfs_kill_delayed_node(delayed_nodes[i]);
- btrfs_release_delayed_node(delayed_nodes[i]);
+ btrfs_delayed_node_ref_tracker_dir_print(delayed_nodes[i]);
+ btrfs_release_delayed_node(delayed_nodes[i],
+ &delayed_node_trackers[i]);
}
}
}
@@ -2078,14 +2114,17 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info)
{
struct btrfs_delayed_node *curr_node, *prev_node;
+ struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker;
- curr_node = btrfs_first_delayed_node(fs_info->delayed_root);
+ curr_node = btrfs_first_delayed_node(fs_info->delayed_root,
+ &curr_delayed_node_tracker);
while (curr_node) {
__btrfs_kill_delayed_node(curr_node);
prev_node = curr_node;
- curr_node = btrfs_next_delayed_node(curr_node);
- btrfs_release_delayed_node(prev_node);
+ prev_delayed_node_tracker = curr_delayed_node_tracker;
+ curr_node = btrfs_next_delayed_node(curr_node, &curr_delayed_node_tracker);
+ btrfs_release_delayed_node(prev_node, &prev_delayed_node_tracker);
}
}
@@ -2095,8 +2134,9 @@ void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
{
struct btrfs_delayed_node *node;
struct btrfs_delayed_item *item;
+ struct btrfs_ref_tracker delayed_node_tracker;
- node = btrfs_get_delayed_node(inode);
+ node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!node)
return;
@@ -2154,6 +2194,7 @@ void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
* delete delayed items.
*/
ASSERT(refcount_read(&node->refs) > 1);
+ btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker);
refcount_dec(&node->refs);
}
@@ -2164,8 +2205,9 @@ void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
struct btrfs_delayed_node *node;
struct btrfs_delayed_item *item;
struct btrfs_delayed_item *next;
+ struct btrfs_ref_tracker delayed_node_tracker;
- node = btrfs_get_delayed_node(inode);
+ node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!node)
return;
@@ -2197,5 +2239,6 @@ void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
* delete delayed items.
*/
ASSERT(refcount_read(&node->refs) > 1);
+ btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker);
refcount_dec(&node->refs);
}
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index c4b4ba122beb..b09d4ec8c77d 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -16,6 +16,7 @@
#include <linux/fs.h>
#include <linux/atomic.h>
#include <linux/refcount.h>
+#include <linux/ref_tracker.h>
#include "ctree.h"
struct btrfs_disk_key;
@@ -44,6 +45,22 @@ struct btrfs_delayed_root {
wait_queue_head_t wait;
};
+struct btrfs_ref_tracker_dir {
+#ifdef CONFIG_BTRFS_DEBUG
+ struct ref_tracker_dir dir;
+#else
+ struct {} tracker;
+#endif
+};
+
+struct btrfs_ref_tracker {
+#ifdef CONFIG_BTRFS_DEBUG
+ struct ref_tracker *tracker;
+#else
+ struct {} tracker;
+#endif
+};
+
#define BTRFS_DELAYED_NODE_IN_LIST 0
#define BTRFS_DELAYED_NODE_INODE_DIRTY 1
#define BTRFS_DELAYED_NODE_DEL_IREF 2
@@ -78,6 +95,12 @@ struct btrfs_delayed_node {
* actual number of leaves we end up using. Protected by @mutex.
*/
u32 index_item_leaves;
+ /* Track all references to this delayed node. */
+ struct btrfs_ref_tracker_dir ref_dir;
+ /* Track delayed node reference stored in node list. */
+ struct btrfs_ref_tracker node_list_tracker;
+ /* Track delayed node reference stored in inode cache. */
+ struct btrfs_ref_tracker inode_cache_tracker;
};
struct btrfs_delayed_item {
@@ -150,10 +173,9 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode,
struct list_head *ins_list,
struct list_head *del_list);
-int btrfs_should_delete_dir_index(const struct list_head *del_list,
- u64 index);
-int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- const struct list_head *ins_list);
+bool btrfs_should_delete_dir_index(const struct list_head *del_list, u64 index);
+bool btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
+ const struct list_head *ins_list);
/* Used during directory logging. */
void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
@@ -170,4 +192,81 @@ void __cold btrfs_delayed_inode_exit(void);
/* for debugging */
void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info);
+#define BTRFS_DELAYED_NODE_REF_TRACKER_QUARANTINE_COUNT 16
+#define BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT 16
+
+#ifdef CONFIG_BTRFS_DEBUG
+static inline void btrfs_delayed_node_ref_tracker_dir_init(struct btrfs_delayed_node *node)
+{
+ if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
+ return;
+
+ ref_tracker_dir_init(&node->ref_dir.dir,
+ BTRFS_DELAYED_NODE_REF_TRACKER_QUARANTINE_COUNT,
+ "delayed_node");
+}
+
+static inline void btrfs_delayed_node_ref_tracker_dir_exit(struct btrfs_delayed_node *node)
+{
+ if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
+ return;
+
+ ref_tracker_dir_exit(&node->ref_dir.dir);
+}
+
+static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed_node *node)
+{
+ if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
+ return;
+
+ /*
+ * Only print if there are leaked references. The caller is
+ * holding one reference, so if refs == 1 there is no leak.
+ */
+ if (refcount_read(&node->refs) == 1)
+ return;
+
+ ref_tracker_dir_print(&node->ref_dir.dir,
+ BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT);
+}
+
+static inline int btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker,
+ gfp_t gfp)
+{
+ if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
+ return 0;
+
+ return ref_tracker_alloc(&node->ref_dir.dir, &tracker->tracker, gfp);
+}
+
+static inline int btrfs_delayed_node_ref_tracker_free(struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker)
+{
+ if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
+ return 0;
+
+ return ref_tracker_free(&node->ref_dir.dir, &tracker->tracker);
+}
+#else
+static inline void btrfs_delayed_node_ref_tracker_dir_init(struct btrfs_delayed_node *node) { }
+
+static inline void btrfs_delayed_node_ref_tracker_dir_exit(struct btrfs_delayed_node *node) { }
+
+static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed_node *node) { }
+
+static inline int btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker,
+ gfp_t gfp)
+{
+ return 0;
+}
+
+static inline int btrfs_delayed_node_ref_tracker_free(struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker)
+{
+ return 0;
+}
+#endif
+
#endif
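
To make the intended discipline concrete, here is a sketch of the acquire/release pairing the wrappers above support; the function name is hypothetical:

/*
 * Every refcount_inc() on a delayed node is mirrored by a tracker
 * allocation, and the same tracker is freed before the matching
 * refcount_dec().
 */
static void example_use_delayed_node(struct btrfs_delayed_node *node)
{
	struct btrfs_ref_tracker tracker;

	refcount_inc(&node->refs);
	btrfs_delayed_node_ref_tracker_alloc(node, &tracker, GFP_NOFS);

	/* ... work with the node ... */

	btrfs_delayed_node_ref_tracker_free(node, &tracker);
	refcount_dec(&node->refs);
}

With CONFIG_BTRFS_DEBUG disabled, or the REF_TRACKER mount option unset, both wrapper calls are no-ops, so the pairing costs nothing outside debug setups.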
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 98c5b61dabe8..e8bc37453336 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -228,7 +228,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
if (!num_bytes)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(space_info, num_bytes, flush);
if (ret)
return ret;
@@ -331,12 +331,9 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
struct btrfs_delayed_ref_node *ins)
{
struct rb_node *node = &ins->ref_node;
- struct rb_node *exist;
+ struct rb_node *exist = rb_find_add_cached(node, root, cmp_refs_node);
- exist = rb_find_add_cached(node, root, cmp_refs_node);
- if (exist)
- return rb_entry(exist, struct btrfs_delayed_ref_node, ref_node);
- return NULL;
+ return rb_entry_safe(exist, struct btrfs_delayed_ref_node, ref_node);
}
static struct btrfs_delayed_ref_head *find_first_ref_head(
@@ -801,9 +798,13 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
}
/*
- * helper function to actually insert a head node into the rbtree.
- * this does all the dirty work in terms of maintaining the correct
- * overall modification count.
+ * Helper function to actually insert a head node into the xarray. This does all
+ * the dirty work in terms of maintaining the correct overall modification
+ * count.
+ *
+ * The caller is responsible for calling kfree() on @qrecord. More specifically,
+ * if this function reports that it did not insert it as noted in
+ * @qrecord_inserted_ret, then it's safe to call kfree() on it.
*
* Returns an error pointer in case of an error.
*/
@@ -817,7 +818,14 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_root *delayed_refs;
const unsigned long index = (head_ref->bytenr >> fs_info->sectorsize_bits);
- bool qrecord_inserted = false;
+
+ /*
+ * If 'qrecord_inserted_ret' is provided, initialize it to false up front
+ * in case we take an exit path before trying to insert the record.
+ */
+ if (qrecord_inserted_ret)
+ *qrecord_inserted_ret = false;
delayed_refs = &trans->transaction->delayed_refs;
lockdep_assert_held(&delayed_refs->lock);
@@ -836,6 +844,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
/* Record qgroup extent info if provided */
if (qrecord) {
+ /*
+ * Setting 'qrecord' but not 'qrecord_inserted_ret' will likely
+ * result in a memory leak.
+ */
+ ASSERT(qrecord_inserted_ret != NULL);
+
int ret;
ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord,
@@ -843,12 +857,10 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
if (ret) {
/* Clean up if insertion fails or item exists. */
xa_release(&delayed_refs->dirty_extents, index);
- /* Caller responsible for freeing qrecord on error. */
if (ret < 0)
return ERR_PTR(ret);
- kfree(qrecord);
- } else {
- qrecord_inserted = true;
+ } else if (qrecord_inserted_ret) {
+ *qrecord_inserted_ret = true;
}
}
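
Condensing the ownership rule into a simplified caller sketch (parameter lists and head-ref cleanup are elided; compare with the add_delayed_ref() changes further below):

bool qrecord_inserted = false;

head_ref = add_delayed_ref_head(trans, /* ..., */ qrecord, /* ..., */
				&qrecord_inserted);
if (IS_ERR(head_ref)) {
	if (!qrecord_inserted)
		kfree(qrecord);	/* never handed over, still ours */
	return PTR_ERR(head_ref);
}
/* ... */
if (!qrecord_inserted)
	kfree(qrecord);		/* e.g. a duplicate record already existed */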
@@ -891,14 +903,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
}
- if (qrecord_inserted_ret)
- *qrecord_inserted_ret = qrecord_inserted;
return head_ref;
}
/*
- * Initialize the structure which represents a modification to a an extent.
+ * Initialize the structure which represents a modification to an extent.
*
* @fs_info: Internal to the mounted filesystem mount structure.
*
@@ -931,7 +941,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
- if (is_fstree(generic_ref->ref_root))
+ if (btrfs_is_fstree(generic_ref->ref_root))
seq = atomic64_read(&fs_info->tree_mod_seq);
refcount_set(&ref->refs, 1);
@@ -955,14 +965,14 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
bool skip_qgroup)
{
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+#ifdef CONFIG_BTRFS_DEBUG
/* If @real_root not set, use @root as fallback */
generic_ref->real_root = mod_root ?: generic_ref->ref_root;
#endif
generic_ref->tree_ref.level = level;
generic_ref->type = BTRFS_REF_METADATA;
- if (skip_qgroup || !(is_fstree(generic_ref->ref_root) &&
- (!mod_root || is_fstree(mod_root))))
+ if (skip_qgroup || !(btrfs_is_fstree(generic_ref->ref_root) &&
+ (!mod_root || btrfs_is_fstree(mod_root))))
generic_ref->skip_qgroup = true;
else
generic_ref->skip_qgroup = false;
@@ -972,15 +982,15 @@ void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset,
u64 mod_root, bool skip_qgroup)
{
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+#ifdef CONFIG_BTRFS_DEBUG
/* If @real_root not set, use @root as fallback */
generic_ref->real_root = mod_root ?: generic_ref->ref_root;
#endif
generic_ref->data_ref.objectid = ino;
generic_ref->data_ref.offset = offset;
generic_ref->type = BTRFS_REF_DATA;
- if (skip_qgroup || !(is_fstree(generic_ref->ref_root) &&
- (!mod_root || is_fstree(mod_root))))
+ if (skip_qgroup || !(btrfs_is_fstree(generic_ref->ref_root) &&
+ (!mod_root || btrfs_is_fstree(mod_root))))
generic_ref->skip_qgroup = true;
else
generic_ref->skip_qgroup = false;
@@ -1052,6 +1062,14 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
xa_release(&delayed_refs->head_refs, index);
spin_unlock(&delayed_refs->lock);
ret = PTR_ERR(new_head_ref);
+
+ /*
+ * It's only safe to call kfree() on 'qrecord' if
+ * add_delayed_ref_head() has _not_ inserted it for qgroup
+ * tracing. If it was inserted, skip freeing the record here.
+ */
+ if (!qrecord_reserved || qrecord_inserted)
+ goto free_head_ref;
goto free_record;
}
head_ref = new_head_ref;
@@ -1074,6 +1092,8 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
if (qrecord_inserted)
return btrfs_qgroup_trace_extent_post(trans, record, generic_ref->bytenr);
+
+ kfree(record);
return 0;
free_record:
@@ -1254,7 +1274,6 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
{
struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
struct btrfs_fs_info *fs_info = trans->fs_info;
- bool testing = btrfs_is_testing(fs_info);
spin_lock(&delayed_refs->lock);
while (true) {
@@ -1284,7 +1303,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
spin_unlock(&delayed_refs->lock);
mutex_unlock(&head->mutex);
- if (!testing && pin_bytes) {
+ if (!btrfs_is_testing(fs_info) && pin_bytes) {
struct btrfs_block_group *bg;
bg = btrfs_lookup_block_group(fs_info, head->bytenr);
@@ -1315,14 +1334,14 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
btrfs_error_unpin_extent_range(fs_info, head->bytenr,
head->bytenr + head->num_bytes - 1);
}
- if (!testing)
+ if (!btrfs_is_testing(fs_info))
btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
btrfs_put_delayed_ref_head(head);
cond_resched();
spin_lock(&delayed_refs->lock);
}
- if (!testing)
+ if (!btrfs_is_testing(fs_info))
btrfs_qgroup_destroy_extent_records(trans);
spin_unlock(&delayed_refs->lock);
@@ -1339,7 +1358,7 @@ int __init btrfs_delayed_ref_init(void)
{
btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0);
if (!btrfs_delayed_ref_head_cachep)
- goto fail;
+ return -ENOMEM;
btrfs_delayed_ref_node_cachep = KMEM_CACHE(btrfs_delayed_ref_node, 0);
if (!btrfs_delayed_ref_node_cachep)
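For illustration, a condensed sketch of the caller-side ownership contract established above (the allocation is hypothetical and other arguments are elided): the record may be freed only when add_delayed_ref_head() reported, via the out parameter, that it was not inserted for qgroup tracing.

	struct btrfs_qgroup_extent_record *qrecord;
	bool qrecord_inserted = false;
	struct btrfs_delayed_ref_head *head;

	qrecord = kzalloc(sizeof(*qrecord), GFP_NOFS);
	if (!qrecord)
		return -ENOMEM;

	head = add_delayed_ref_head(trans, head_ref, qrecord, action,
				    &qrecord_inserted);
	/* Error or not, free the record only if it was never handed
	 * over to qgroup tracing. */
	if (!qrecord_inserted)
		kfree(qrecord);
	if (IS_ERR(head))
		return PTR_ERR(head);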
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f5ae880308d3..5ce940532144 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -262,7 +262,6 @@ enum btrfs_ref_type {
BTRFS_REF_NOT_SET,
BTRFS_REF_DATA,
BTRFS_REF_METADATA,
- BTRFS_REF_LAST,
} __packed;
struct btrfs_ref {
@@ -277,10 +276,6 @@ struct btrfs_ref {
*/
bool skip_qgroup;
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- /* Through which root is this modification. */
- u64 real_root;
-#endif
u64 bytenr;
u64 num_bytes;
u64 owning_root;
@@ -297,6 +292,11 @@ struct btrfs_ref {
struct btrfs_data_ref data_ref;
struct btrfs_tree_ref tree_ref;
};
+
+#ifdef CONFIG_BTRFS_DEBUG
+ /* Through which root is this modification. */
+ u64 real_root;
+#endif
};
extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
@@ -421,7 +421,7 @@ bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
u64 root, u64 parent);
void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans);
-static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
+static inline u64 btrfs_delayed_ref_owner(const struct btrfs_delayed_ref_node *node)
{
if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
node->type == BTRFS_SHARED_DATA_REF_KEY)
@@ -429,7 +429,7 @@ static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
return node->tree_ref.level;
}
-static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node)
+static inline u64 btrfs_delayed_ref_offset(const struct btrfs_delayed_ref_node *node)
{
if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
node->type == BTRFS_SHARED_DATA_REF_KEY)
@@ -437,7 +437,7 @@ static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node)
return 0;
}
-static inline u8 btrfs_ref_type(struct btrfs_ref *ref)
+static inline u8 btrfs_ref_type(const struct btrfs_ref *ref)
{
ASSERT(ref->type == BTRFS_REF_DATA || ref->type == BTRFS_REF_METADATA);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 53d7d85cb4be..b6c7da8e1bc8 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -98,7 +98,7 @@ no_valid_dev_replace_entry_found:
* We don't have a replace item or it's corrupted. If there is
* a replace target, fail the mount.
*/
- if (btrfs_find_device(fs_info->fs_devices, &args)) {
+ if (unlikely(btrfs_find_device(fs_info->fs_devices, &args))) {
btrfs_err(fs_info,
"found replace target device without a valid replace item");
return -EUCLEAN;
@@ -158,7 +158,7 @@ no_valid_dev_replace_entry_found:
* We don't have an active replace item but if there is a
* replace target, fail the mount.
*/
- if (btrfs_find_device(fs_info->fs_devices, &args)) {
+ if (unlikely(btrfs_find_device(fs_info->fs_devices, &args))) {
btrfs_err(fs_info,
"replace without active item, run 'device scan --forget' on the target device");
ret = -EUCLEAN;
@@ -177,8 +177,7 @@ no_valid_dev_replace_entry_found:
* allow 'btrfs dev replace_cancel' if src/tgt device is
* missing
*/
- if (!dev_replace->srcdev &&
- !btrfs_test_opt(fs_info, DEGRADED)) {
+ if (unlikely(!dev_replace->srcdev && !btrfs_test_opt(fs_info, DEGRADED))) {
ret = -EIO;
btrfs_warn(fs_info,
"cannot mount because device replace operation is ongoing and");
@@ -186,8 +185,7 @@ no_valid_dev_replace_entry_found:
"srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
src_devid);
}
- if (!dev_replace->tgtdev &&
- !btrfs_test_opt(fs_info, DEGRADED)) {
+ if (unlikely(!dev_replace->tgtdev && !btrfs_test_opt(fs_info, DEGRADED))) {
ret = -EIO;
btrfs_warn(fs_info,
"cannot mount because device replace operation is ongoing and");
@@ -250,7 +248,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
}
bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
- fs_info->bdev_holder, NULL);
+ fs_info->sb, &fs_holder_ops);
if (IS_ERR(bdev_file)) {
btrfs_err(fs_info, "target device %s is invalid!", device_path);
return PTR_ERR(bdev_file);
@@ -327,7 +325,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return 0;
error:
- fput(bdev_file);
+ bdev_fput(bdev_file);
return ret;
}
@@ -491,8 +489,8 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
}
path->reada = READA_FORWARD;
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
key.objectid = src_dev->devid;
key.type = BTRFS_DEV_EXTENT_KEY;
@@ -600,7 +598,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return PTR_ERR(src_device);
if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"cannot replace device %s (devid %llu) due to active swapfile",
btrfs_dev_name(src_device), src_device->devid);
return -ETXTBSY;
@@ -637,7 +635,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
- ASSERT(0);
+ DEBUG_WARN("unexpected STARTED or SUSPENDED dev-replace state");
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
up_write(&dev_replace->rwsem);
goto leave;
@@ -647,7 +645,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
dev_replace->srcdev = src_device;
dev_replace->tgtdev = tgt_device;
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"dev_replace from %s (devid %llu) to %s started",
btrfs_dev_name(src_device),
src_device->devid,
@@ -794,17 +792,17 @@ static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
- while (find_first_extent_bit(&srcdev->alloc_state, start,
- &found_start, &found_end,
- CHUNK_ALLOCATED, &cached_state)) {
- ret = set_extent_bit(&tgtdev->alloc_state, found_start,
- found_end, CHUNK_ALLOCATED, NULL);
+ while (btrfs_find_first_extent_bit(&srcdev->alloc_state, start,
+ &found_start, &found_end,
+ CHUNK_ALLOCATED, &cached_state)) {
+ ret = btrfs_set_extent_bit(&tgtdev->alloc_state, found_start,
+ found_end, CHUNK_ALLOCATED, NULL);
if (ret)
break;
start = found_end + 1;
}
- free_extent_state(cached_state);
+ btrfs_free_extent_state(cached_state);
return ret;
}
@@ -943,7 +941,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
tgt_device);
} else {
if (scrub_ret != -ECANCELED)
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"btrfs_scrub_dev(%s, %llu, %s) failed %d",
btrfs_dev_name(src_device),
src_device->devid,
@@ -961,7 +959,7 @@ error:
return scrub_ret;
}
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"dev_replace from %s (devid %llu) to %s finished",
btrfs_dev_name(src_device),
src_device->devid,
@@ -1109,7 +1107,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
* btrfs_dev_replace_finishing() will handle the
* cleanup part
*/
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"dev_replace from %s (devid %llu) to %s canceled",
btrfs_dev_name(src_device), src_device->devid,
btrfs_dev_name(tgt_device));
@@ -1143,7 +1141,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
ret = btrfs_commit_transaction(trans);
WARN_ON(ret);
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"suspended dev_replace from %s (devid %llu) to %s canceled",
btrfs_dev_name(src_device), src_device->devid,
btrfs_dev_name(tgt_device));
@@ -1247,7 +1245,7 @@ static int btrfs_dev_replace_kthread(void *data)
progress = btrfs_dev_replace_progress(fs_info);
progress = div_u64(progress, 10);
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"continuing dev_replace from %s (devid %llu) to target %s @%u%%",
btrfs_dev_name(dev_replace->srcdev),
dev_replace->srcdev->devid,
@@ -1265,16 +1263,16 @@ static int btrfs_dev_replace_kthread(void *data)
return 0;
}
-int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
+bool __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
{
if (!dev_replace->is_valid)
- return 0;
+ return false;
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
- return 0;
+ return false;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
/*
@@ -1289,7 +1287,7 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
*/
break;
}
- return 1;
+ return true;
}
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 23e480efe5e6..b35cecf388f2 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -25,7 +25,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
-int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
+bool __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
struct btrfs_block_group *cache,
u64 physical);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index b29cc31a7c4a..085a83ae9e62 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -9,6 +9,7 @@
#include "transaction.h"
#include "accessors.h"
#include "dir-item.h"
+#include "delayed-inode.h"
/*
* insert a name into a directory, doing overflow properly if there is a hash
@@ -111,7 +112,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
int ret = 0;
int ret2 = 0;
struct btrfs_root *root = dir->root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dir_item *dir_item;
struct extent_buffer *leaf;
unsigned long name_ptr;
@@ -163,7 +164,6 @@ second_insert:
ret2 = btrfs_insert_delayed_dir_index(trans, name->name, name->len, dir,
&disk_key, type, index);
out_free:
- btrfs_free_path(path);
if (ret)
return ret;
if (ret2)
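For context, the BTRFS_PATH_AUTO_FREE() declaration used above replaces the btrfs_free_path() call removed from the exit label with scope-based cleanup. A sketch of the presumed shape of the macro, built on the kernel's cleanup.h helpers (the exact DEFINE_FREE() wiring is an assumption here):

	#include <linux/cleanup.h>

	DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T))

	/* Declare a path that is freed automatically when it goes out of
	 * scope, so individual return paths can drop the manual free. */
	#define BTRFS_PATH_AUTO_FREE(path_name) \
		struct btrfs_path *path_name __free(btrfs_free_path) = NULL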
@@ -227,7 +227,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
return di;
}
-int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir_ino,
const struct fscrypt_str *name)
{
int ret;
@@ -242,7 +242,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
if (!path)
return -ENOMEM;
- key.objectid = dir;
+ key.objectid = dir_ino;
key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name->name, name->len);
diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h
index 8462579a95f4..e52174a8baf9 100644
--- a/fs/btrfs/dir-item.h
+++ b/fs/btrfs/dir-item.h
@@ -14,7 +14,7 @@ struct btrfs_inode;
struct btrfs_root;
struct btrfs_trans_handle;
-int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir_ino,
const struct fscrypt_str *name);
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name, struct btrfs_inode *dir,
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index a374ce7a1813..07e19e88ba4b 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -10,6 +10,8 @@
#include "fs.h"
#include "transaction.h"
#include "volumes.h"
+#include "bio.h"
+#include "ordered-data.h"
struct btrfs_dio_data {
ssize_t submitted;
@@ -42,21 +44,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
/* Direct lock must be taken before the extent lock. */
if (nowait) {
- if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
+ if (!btrfs_try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
return -EAGAIN;
} else {
- lock_dio_extent(io_tree, lockstart, lockend, cached_state);
+ btrfs_lock_dio_extent(io_tree, lockstart, lockend, cached_state);
}
while (1) {
if (nowait) {
- if (!try_lock_extent(io_tree, lockstart, lockend,
- cached_state)) {
+ if (!btrfs_try_lock_extent(io_tree, lockstart, lockend,
+ cached_state)) {
ret = -EAGAIN;
break;
}
} else {
- lock_extent(io_tree, lockstart, lockend, cached_state);
+ btrfs_lock_extent(io_tree, lockstart, lockend, cached_state);
}
/*
* We're concerned with the entire range that we're going to be
@@ -78,7 +80,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
lockstart, lockend)))
break;
- unlock_extent(io_tree, lockstart, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, lockstart, lockend, cached_state);
if (ordered) {
if (nowait) {
@@ -131,7 +133,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
}
if (ret)
- unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
+ btrfs_unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
return ret;
}
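In condensed form, the locking order the function above maintains, including the nowait fast path that surfaces -EAGAIN so iomap can retry in a blocking context (a sketch of the invariant only, not the full ordered-extent retry loop):

	/* The DIO extent lock always comes first; drop it again if we
	 * bail out before the regular extent lock is acquired. */
	if (!btrfs_try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
		return -EAGAIN;
	if (!btrfs_try_lock_extent(io_tree, lockstart, lockend, cached_state)) {
		btrfs_unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
		return -EAGAIN;
	}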
@@ -151,11 +153,11 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
}
ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
- (1 << type) |
- (1 << BTRFS_ORDERED_DIRECT));
+ (1U << type) |
+ (1U << BTRFS_ORDERED_DIRECT));
if (IS_ERR(ordered)) {
if (em) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
btrfs_drop_extent_map_range(inode, start,
start + file_extent->num_bytes - 1, false);
}
@@ -184,7 +186,7 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);
again:
ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
- 0, alloc_hint, &ins, 1, 1);
+ 0, alloc_hint, &ins, true, true);
if (ret == -EAGAIN) {
ASSERT(btrfs_is_zoned(fs_info));
wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
@@ -204,8 +206,7 @@ again:
BTRFS_ORDERED_REGULAR);
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
if (IS_ERR(em))
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
- 1);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
return em;
}
@@ -246,7 +247,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
else
type = BTRFS_ORDERED_NOCOW;
len = min(len, em->len - (start - em->start));
- block_start = extent_map_block_start(em) + (start - em->start);
+ block_start = btrfs_extent_map_block_start(em) + (start - em->start);
if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent,
false) == 1) {
@@ -265,7 +266,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
nowait);
if (ret < 0) {
/* Our caller expects us to free the input extent map. */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
*map = NULL;
btrfs_dec_nocow_writers(bg);
if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
@@ -278,7 +279,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
&file_extent, type);
btrfs_dec_nocow_writers(bg);
if (type == BTRFS_ORDERED_PREALLOC) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
*map = em2;
em = em2;
}
@@ -291,7 +292,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
dio_data->nocow_done = true;
} else {
/* Our caller expects us to free the input extent map. */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
*map = NULL;
if (nowait) {
@@ -386,7 +387,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
* to allocate a contiguous array for the checksums.
*/
if (!write)
- len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
+ len = min_t(u64, len, fs_info->sectorsize * BIO_MAX_VECS);
lockstart = start;
lockend = start + len - 1;
@@ -440,8 +441,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
start, data_alloc_len, false);
if (!ret)
dio_data->data_space_reserved = true;
- else if (ret && !(BTRFS_I(inode)->flags &
- (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ else if (!(BTRFS_I(inode)->flags &
+ (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
goto err;
}
@@ -474,8 +475,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
* to buffered IO. Don't blame me, this is the price we pay for using
* the generic code.
*/
- if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
- free_extent_map(em);
+ if (btrfs_extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
+ btrfs_free_extent_map(em);
/*
* If we are in a NOWAIT context, return -EAGAIN in order to
* fallback to buffered IO. This is not only because we can
@@ -516,7 +517,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
* after we have submitted bios for all the extents in the range.
*/
if ((flags & IOMAP_NOWAIT) && len < length) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ret = -EAGAIN;
goto unlock_err;
}
@@ -558,13 +559,13 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_HOLE;
} else {
- iomap->addr = extent_map_block_start(em) + (start - em->start);
+ iomap->addr = btrfs_extent_map_block_start(em) + (start - em->start);
iomap->type = IOMAP_MAPPED;
}
iomap->offset = start;
iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
iomap->length = len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/*
* Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
@@ -575,13 +576,13 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
if (write)
unlock_bits |= EXTENT_DIO_LOCKED;
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- unlock_bits, &cached_state);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ unlock_bits, &cached_state);
/* We didn't use everything, unlock the dio extent for the remainder. */
if (!write && (start + len) < lockend)
- unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
- lockend, NULL);
+ btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
+ lockend, NULL);
return 0;
@@ -591,8 +592,8 @@ unlock_err:
* to update this, be explicit that we expect EXTENT_LOCKED and
* EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.
*/
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
err:
if (dio_data->data_space_reserved) {
btrfs_free_reserved_data_space(BTRFS_I(inode),
@@ -615,8 +616,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
if (!write && (iomap->type == IOMAP_HOLE)) {
/* If reading from a hole, unlock and return */
- unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
- pos + length - 1, NULL);
+ btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1, NULL);
return 0;
}
@@ -627,8 +628,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
btrfs_finish_ordered_extent(dio_data->ordered, NULL,
pos, length, false);
else
- unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
- pos + length - 1, NULL);
+ btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1, NULL);
ret = -ENOTBLK;
}
if (write) {
@@ -660,8 +661,8 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio)
dip->file_offset, dip->bytes,
!bio->bi_status);
} else {
- unlock_dio_extent(&inode->io_tree, dip->file_offset,
- dip->file_offset + dip->bytes - 1, NULL);
+ btrfs_unlock_dio_extent(&inode->io_tree, dip->file_offset,
+ dip->file_offset + dip->bytes - 1, NULL);
}
bbio->bio.bi_private = bbio->private;
@@ -692,9 +693,9 @@ static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
* a pre-existing one.
*/
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
- ret = split_extent_map(bbio->inode, bbio->file_offset,
- ordered->num_bytes, len,
- ordered->disk_bytenr);
+ ret = btrfs_split_extent_map(bbio->inode, bbio->file_offset,
+ ordered->num_bytes, len,
+ ordered->disk_bytenr);
if (ret)
return ret;
}
@@ -714,10 +715,8 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
container_of(bbio, struct btrfs_dio_private, bbio);
struct btrfs_dio_data *dio_data = iter->private;
- btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
+ btrfs_bio_init(bbio, BTRFS_I(iter->inode), file_offset,
btrfs_dio_end_io, bio->bi_private);
- bbio->inode = BTRFS_I(iter->inode);
- bbio->file_offset = file_offset;
dip->file_offset = file_offset;
dip->bytes = bio->bi_iter.bi_size;
@@ -787,6 +786,18 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
if (iov_iter_alignment(iter) & blocksize_mask)
return -EINVAL;
+ /*
+ * For bs > ps support, we heavily rely on large folios to make sure no
+ * block will cross large folio boundaries.
+ *
+ * But memory provided by direct IO is only virtually contiguous, not
+ * physically contiguous, which breaks btrfs' large folio requirement.
+ *
+ * So for bs > ps support, all direct IOs should fall back to buffered ones.
+ */
+ if (fs_info->sectorsize > PAGE_SIZE)
+ return -EINVAL;
+
return 0;
}
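A worked example of the constraint in the comment above: with a 16K block size on 4K pages, one block spans sectorsize >> PAGE_SHIFT == 4 pages. The page cache can guarantee that via physically contiguous large folios, but a user buffer passed to direct IO may be backed by four unrelated physical pages. A minimal sketch of the resulting policy (mirroring the check added above):

	/* bs > ps: never attempt direct IO; returning an error here makes
	 * callers fall back to buffered IO, which goes through large folios. */
	if (fs_info->sectorsize > PAGE_SIZE)
		return -EINVAL;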
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index d6eef4bd9e9d..89fe85778115 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -94,8 +94,6 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
{
lockdep_assert_held(&discard_ctl->lock);
- if (!btrfs_run_discard_work(discard_ctl))
- return;
if (list_empty(&block_group->discard_list) ||
block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
@@ -118,6 +116,9 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
if (!btrfs_is_block_group_data_only(block_group))
return;
+ if (!btrfs_run_discard_work(discard_ctl))
+ return;
+
spin_lock(&discard_ctl->lock);
__add_to_discard_list(discard_ctl, block_group);
spin_unlock(&discard_ctl->lock);
@@ -244,6 +245,20 @@ again:
block_group->used != 0) {
if (btrfs_is_block_group_data_only(block_group)) {
__add_to_discard_list(discard_ctl, block_group);
+ /*
+ * The block group must have been moved to another
+ * discard list even if discard was disabled in
+ * the meantime or a transaction abort happened,
+ * otherwise we can end up in an infinite loop,
+ * always jumping back to the 'again' label and
+ * getting this block group over and over when
+ * there are no other block groups in the
+ * discard lists.
+ */
+ ASSERT(block_group->discard_index !=
+ BTRFS_DISCARD_INDEX_UNUSED,
+ "discard_index=%d",
+ block_group->discard_index);
} else {
list_del_init(&block_group->discard_list);
btrfs_put_block_group(block_group);
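Condensed, the loop shape the new assertion guards (the requeue predicate is a simplified stand-in for the used/data-only checks in the surrounding code): once a block group is requeued via __add_to_discard_list(), its discard_index must have left BTRFS_DISCARD_INDEX_UNUSED, or the next iteration finds the same group again forever.

	again:
		block_group = find_next_block_group(discard_ctl, now);
		if (block_group && still_has_data_to_trim(block_group)) {
			__add_to_discard_list(discard_ctl, block_group);
			/* Requeueing must move it to a later index. */
			ASSERT(block_group->discard_index != BTRFS_DISCARD_INDEX_UNUSED);
			goto again;
		}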
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1a916716cefe..89149fac804c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,6 +50,7 @@
#include "relocation.h"
#include "scrub.h"
#include "super.h"
+#include "delayed-inode.h"
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
@@ -116,7 +117,7 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
* detect blocks that either didn't get written at all or got written
* in the wrong place.
*/
-int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic)
+int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, bool atomic)
{
if (!extent_buffer_uptodate(eb))
return 0;
@@ -182,25 +183,33 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
int mirror_num)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
+ const u32 step = min(fs_info->nodesize, PAGE_SIZE);
+ const u32 nr_steps = eb->len / step;
+ phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
int ret = 0;
if (sb_rdonly(fs_info->sb))
return -EROFS;
- for (int i = 0; i < num_extent_folios(eb); i++) {
+ for (int i = 0; i < num_extent_pages(eb); i++) {
struct folio *folio = eb->folios[i];
- u64 start = max_t(u64, eb->start, folio_pos(folio));
- u64 end = min_t(u64, eb->start + eb->len,
- folio_pos(folio) + eb->folio_size);
- u32 len = end - start;
-
- ret = btrfs_repair_io_failure(fs_info, 0, start, len,
- start, folio, offset_in_folio(folio, start),
- mirror_num);
- if (ret)
- break;
+
+ /* No large folio support yet. */
+ ASSERT(folio_order(folio) == 0);
+ ASSERT(i < nr_steps);
+
+ /*
+ * For nodesize < page size, there is just one paddr, with some
+ * offset inside the page.
+ *
+ * For nodesize >= page size, there are one or more paddrs, and
+ * eb->start must be aligned to a page boundary.
+ */
+ paddrs[i] = page_to_phys(&folio->page) + offset_in_page(eb->start);
}
+ ret = btrfs_repair_io_failure(fs_info, 0, eb->start, eb->len, eb->start,
+ paddrs, step, mirror_num);
return ret;
}
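Two concrete cases for the step/nr_steps computation above, as a worked example (values follow directly from step = min(nodesize, PAGE_SIZE) and nr_steps = eb->len / step):

	/*
	 * nodesize 16K, PAGE_SIZE 4K:  step = 4K, nr_steps = 4;
	 *     eb->start is page aligned, offset_in_page(eb->start) == 0,
	 *     so paddrs[0..3] name four consecutive physical pages.
	 *
	 * nodesize 4K, PAGE_SIZE 64K:  step = 4K, nr_steps = 1;
	 *     the single paddr points at some offset inside one 64K page.
	 */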
@@ -224,7 +233,6 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb,
ASSERT(check);
while (1) {
- clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = read_extent_buffer_pages(eb, mirror_num, check);
if (!ret)
break;
@@ -256,7 +264,7 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb,
/*
* Checksum a dirty tree block before IO.
*/
-blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
+int btree_csum_one_bio(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
struct btrfs_fs_info *fs_info = eb->fs_info;
@@ -267,9 +275,9 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
/* Btree blocks are always contiguous on disk. */
if (WARN_ON_ONCE(bbio->file_offset != eb->start))
- return BLK_STS_IOERR;
+ return -EIO;
if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len))
- return BLK_STS_IOERR;
+ return -EIO;
/*
* If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't
@@ -278,13 +286,13 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
*/
if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) {
memzero_extent_buffer(eb, 0, eb->len);
- return BLK_STS_OK;
+ return 0;
}
if (WARN_ON_ONCE(found_start != eb->start))
- return BLK_STS_IOERR;
+ return -EIO;
if (WARN_ON(!btrfs_meta_folio_test_uptodate(eb->folios[0], eb)))
- return BLK_STS_IOERR;
+ return -EIO;
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
offsetof(struct btrfs_header, fsid),
@@ -312,7 +320,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
goto error;
}
write_extent_buffer(eb, result, 0, fs_info->csum_size);
- return BLK_STS_OK;
+ return 0;
error:
btrfs_print_tree(eb, 0);
@@ -326,7 +334,7 @@ error:
*/
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID);
- return errno_to_blk_status(ret);
+ return ret;
}
static bool check_tree_block_fsid(struct extent_buffer *eb)
@@ -370,21 +378,21 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
ASSERT(check);
found_start = btrfs_header_bytenr(eb);
- if (found_start != eb->start) {
+ if (unlikely(found_start != eb->start)) {
btrfs_err_rl(fs_info,
"bad tree block start, mirror %u want %llu have %llu",
eb->read_mirror, eb->start, found_start);
ret = -EIO;
goto out;
}
- if (check_tree_block_fsid(eb)) {
+ if (unlikely(check_tree_block_fsid(eb))) {
btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
eb->start, eb->read_mirror);
ret = -EIO;
goto out;
}
found_level = btrfs_header_level(eb);
- if (found_level >= BTRFS_MAX_LEVEL) {
+ if (unlikely(found_level >= BTRFS_MAX_LEVEL)) {
btrfs_err(fs_info,
"bad tree block level, mirror %u level %d on logical %llu",
eb->read_mirror, btrfs_header_level(eb), eb->start);
@@ -398,19 +406,19 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
if (memcmp(result, header_csum, csum_size) != 0) {
btrfs_warn_rl(fs_info,
-"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d%s",
+"checksum verify failed on logical %llu mirror %u wanted " BTRFS_CSUM_FMT " found " BTRFS_CSUM_FMT " level %d%s",
eb->start, eb->read_mirror,
- CSUM_FMT_VALUE(csum_size, header_csum),
- CSUM_FMT_VALUE(csum_size, result),
+ BTRFS_CSUM_FMT_VALUE(csum_size, header_csum),
+ BTRFS_CSUM_FMT_VALUE(csum_size, result),
btrfs_header_level(eb),
ignore_csum ? ", ignored" : "");
- if (!ignore_csum) {
+ if (unlikely(!ignore_csum)) {
ret = -EUCLEAN;
goto out;
}
}
- if (found_level != check->level) {
+ if (unlikely(found_level != check->level)) {
btrfs_err(fs_info,
"level verify failed on logical %llu mirror %u wanted %u found %u",
eb->start, eb->read_mirror, check->level, found_level);
@@ -452,15 +460,9 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
goto out;
}
- /*
- * If this is a leaf block and it is corrupt, set the corrupt bit so
- * that we don't try and read the other copies of this block, just
- * return -EIO.
- */
- if (found_level == 0 && btrfs_check_leaf(eb)) {
- set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
+ /* If this is a leaf block and it is corrupt, just return -EIO. */
+ if (found_level == 0 && btrfs_check_leaf(eb))
ret = -EIO;
- }
if (found_level > 0 && btrfs_check_node(eb))
ret = -EIO;
@@ -641,25 +643,19 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
}
-static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
- u64 objectid)
+static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
+ u64 objectid, gfp_t flags)
{
- bool dummy = btrfs_is_testing(fs_info);
+ struct btrfs_root *root;
+
+ root = kzalloc(sizeof(*root), flags);
+ if (!root)
+ return NULL;
- memset(&root->root_key, 0, sizeof(root->root_key));
- memset(&root->root_item, 0, sizeof(root->root_item));
- memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
root->fs_info = fs_info;
root->root_key.objectid = objectid;
- root->node = NULL;
- root->commit_root = NULL;
- root->state = 0;
RB_CLEAR_NODE(&root->rb_node);
- btrfs_set_root_last_trans(root, 0);
- root->free_objectid = 0;
- root->nr_delalloc_inodes = 0;
- root->nr_ordered_extents = 0;
xa_init(&root->inodes);
xa_init(&root->delayed_nodes);
@@ -693,15 +689,12 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
refcount_set(&root->refs, 1);
atomic_set(&root->snapshot_force_cow, 0);
atomic_set(&root->nr_swapfiles, 0);
- btrfs_set_root_log_transid(root, 0);
root->log_transid_committed = -1;
- btrfs_set_root_last_log_commit(root, 0);
- root->anon_dev = 0;
- if (!dummy) {
- extent_io_tree_init(fs_info, &root->dirty_log_pages,
- IO_TREE_ROOT_DIRTY_LOG_PAGES);
- extent_io_tree_init(fs_info, &root->log_csum_range,
- IO_TREE_LOG_CSUM_RANGE);
+ if (!btrfs_is_testing(fs_info)) {
+ btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages,
+ IO_TREE_ROOT_DIRTY_LOG_PAGES);
+ btrfs_extent_io_tree_init(fs_info, &root->log_csum_range,
+ IO_TREE_LOG_CSUM_RANGE);
}
spin_lock_init(&root->root_item_lock);
@@ -712,14 +705,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
list_add_tail(&root->leak_list, &fs_info->allocated_roots);
spin_unlock(&fs_info->fs_roots_radix_lock);
#endif
-}
-static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
- u64 objectid, gfp_t flags)
-{
- struct btrfs_root *root = kzalloc(sizeof(*root), flags);
- if (root)
- __setup_root(root, fs_info, objectid);
return root;
}
@@ -892,7 +878,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
btrfs_set_root_used(&root->root_item, leaf->len);
btrfs_set_root_last_snapshot(&root->root_item, 0);
btrfs_set_root_dirid(&root->root_item, 0);
- if (is_fstree(objectid))
+ if (btrfs_is_fstree(objectid))
generate_random_guid(root->root_item.uuid);
else
export_guid(root->root_item.uuid, &guid_null);
@@ -1055,7 +1041,7 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
root->node = NULL;
goto fail;
}
- if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
+ if (unlikely(!btrfs_buffer_uptodate(root->node, generation, false))) {
ret = -EIO;
goto fail;
}
@@ -1064,10 +1050,10 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
* For real fs, and not log/reloc trees, root owner must
* match its root node owner
*/
- if (!btrfs_is_testing(fs_info) &&
- btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
- btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
- btrfs_root_id(root) != btrfs_header_owner(root->node)) {
+ if (unlikely(!btrfs_is_testing(fs_info) &&
+ btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
+ btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
+ btrfs_root_id(root) != btrfs_header_owner(root->node))) {
btrfs_crit(fs_info,
"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
btrfs_root_id(root), root->node->start,
@@ -1112,7 +1098,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
!btrfs_is_data_reloc_root(root) &&
- is_fstree(btrfs_root_id(root))) {
+ btrfs_is_fstree(btrfs_root_id(root))) {
set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
btrfs_check_and_init_root_item(&root->root_item);
}
@@ -1121,7 +1107,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
* Don't assign anonymous block device to roots that are not exposed to
* userspace, the id pool is limited to 1M
*/
- if (is_fstree(btrfs_root_id(root)) &&
+ if (btrfs_is_fstree(btrfs_root_id(root)) &&
btrfs_root_refs(&root->root_item) > 0) {
if (!anon_dev) {
ret = get_anon_bdev(&root->anon_dev);
@@ -1254,6 +1240,9 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
struct percpu_counter *em_counter = &fs_info->evictable_extent_maps;
+ if (fs_info->fs_devices)
+ btrfs_close_devices(fs_info->fs_devices);
+ btrfs_free_compress_wsm(fs_info);
percpu_counter_destroy(&fs_info->stats_read_blocks);
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
@@ -1323,7 +1312,7 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
* This is namely for free-space-tree and quota tree, which can change
* at runtime and should only be grabbed from fs_info.
*/
- if (!is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (!btrfs_is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
return ERR_PTR(-ENOENT);
again:
root = btrfs_lookup_fs_root(fs_info, objectid);
@@ -1564,7 +1553,7 @@ static int transaction_kthread(void *arg)
do {
cannot_commit = false;
- delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
+ delay = secs_to_jiffies(fs_info->commit_interval);
mutex_lock(&fs_info->transaction_kthread_mutex);
spin_lock(&fs_info->trans_lock);
@@ -1579,9 +1568,9 @@ static int transaction_kthread(void *arg)
cur->state < TRANS_STATE_COMMIT_PREP &&
delta < fs_info->commit_interval) {
spin_unlock(&fs_info->trans_lock);
- delay -= msecs_to_jiffies((delta - 1) * 1000);
+ delay -= secs_to_jiffies(delta - 1);
delay = min(delay,
- msecs_to_jiffies(fs_info->commit_interval * 1000));
+ secs_to_jiffies(fs_info->commit_interval));
goto sleep;
}
transid = cur->transid;
@@ -1779,8 +1768,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
destroy_workqueue(fs_info->endio_workers);
if (fs_info->rmw_workers)
destroy_workqueue(fs_info->rmw_workers);
- if (fs_info->compressed_write_workers)
- destroy_workqueue(fs_info->compressed_write_workers);
btrfs_destroy_workqueue(fs_info->endio_write_workers);
btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
btrfs_destroy_workqueue(fs_info->delayed_workers);
@@ -1843,6 +1830,8 @@ void btrfs_put_root(struct btrfs_root *root)
if (refcount_dec_and_test(&root->refs)) {
if (WARN_ON(!xa_empty(&root->inodes)))
xa_destroy(&root->inodes);
+ if (WARN_ON(!xa_empty(&root->delayed_nodes)))
+ xa_destroy(&root->delayed_nodes);
WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
if (root->anon_dev)
free_anon_bdev(root->anon_dev);
@@ -1863,8 +1852,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
int i;
while (!list_empty(&fs_info->dead_roots)) {
- gang[0] = list_entry(fs_info->dead_roots.next,
- struct btrfs_root, root_list);
+ gang[0] = list_first_entry(&fs_info->dead_roots,
+ struct btrfs_root, root_list);
list_del(&gang[0]->root_list);
if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
@@ -1927,13 +1916,14 @@ static int btrfs_init_btree_inode(struct super_block *sb)
inode->i_mapping->a_ops = &btree_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
- extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
- IO_TREE_BTREE_INODE_IO);
- extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
+ btrfs_extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
+ IO_TREE_BTREE_INODE_IO);
+ btrfs_extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
__insert_inode_hash(inode, hash);
+ set_bit(AS_KERNEL_FILE, &inode->i_mapping->flags);
fs_info->btree_inode = inode;
return 0;
@@ -1953,7 +1943,6 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
fs_info->qgroup_tree = RB_ROOT;
INIT_LIST_HEAD(&fs_info->dirty_qgroups);
fs_info->qgroup_seq = 1;
- fs_info->qgroup_ulist = NULL;
fs_info->qgroup_rescan_running = false;
fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT;
mutex_init(&fs_info->qgroup_rescan_lock);
@@ -1963,7 +1952,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
{
u32 max_active = fs_info->thread_pool_size;
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
- unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE;
+ unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU;
fs_info->workers =
btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
@@ -1990,8 +1979,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
fs_info->endio_write_workers =
btrfs_alloc_workqueue(fs_info, "endio-write", flags,
max_active, 2);
- fs_info->compressed_write_workers =
- alloc_workqueue("btrfs-compressed-write", flags, max_active);
fs_info->endio_freespace_worker =
btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
max_active, 0);
@@ -2002,12 +1989,11 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan",
ordered_flags);
fs_info->discard_ctl.discard_workers =
- alloc_ordered_workqueue("btrfs_discard", WQ_FREEZABLE);
+ alloc_ordered_workqueue("btrfs-discard", WQ_FREEZABLE);
if (!(fs_info->workers &&
fs_info->delalloc_workers && fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
- fs_info->compressed_write_workers &&
fs_info->endio_write_workers &&
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->fixup_workers &&
@@ -2034,14 +2020,10 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
fs_info->csum_shash = csum_shash;
- /*
- * Check if the checksum implementation is a fast accelerated one.
- * As-is this is a bit of a hack and should be replaced once the csum
- * implementations provide that information themselves.
- */
+ /* Check if the checksum implementation is a fast accelerated one. */
switch (csum_type) {
case BTRFS_CSUM_TYPE_CRC32:
- if (!strstr(crypto_shash_driver_name(csum_shash), "generic"))
+ if (crc32_optimizations() & CRC32C_OPTIMIZATION)
set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
break;
case BTRFS_CSUM_TYPE_XXHASH:
@@ -2067,7 +2049,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
u64 bytenr = btrfs_super_log_root(disk_super);
int level = btrfs_super_log_root_level(disk_super);
- if (fs_devices->rw_devices == 0) {
+ if (unlikely(fs_devices->rw_devices == 0)) {
btrfs_warn(fs_info, "log replay required on RO media");
return -EIO;
}
@@ -2088,7 +2070,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
btrfs_put_root(log_tree_root);
return ret;
}
- if (!extent_buffer_uptodate(log_tree_root->node)) {
+ if (unlikely(!extent_buffer_uptodate(log_tree_root->node))) {
btrfs_err(fs_info, "failed to read log tree");
btrfs_put_root(log_tree_root);
return -EIO;
@@ -2096,10 +2078,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
/* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
+ btrfs_put_root(log_tree_root);
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
"Failed to recover log tree");
- btrfs_put_root(log_tree_root);
return ret;
}
@@ -2164,8 +2146,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
found = true;
root = read_tree_root_path(tree_root, path, &key);
if (IS_ERR(root)) {
- if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
- ret = PTR_ERR(root);
+ ret = PTR_ERR(root);
break;
}
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
@@ -2334,7 +2315,7 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info,
const u32 sectorsize = btrfs_super_sectorsize(sb);
u32 sys_array_size = btrfs_super_sys_array_size(sb);
- if (sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+ if (unlikely(sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)) {
btrfs_err(fs_info, "system chunk array too big %u > %u",
sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
return -EUCLEAN;
@@ -2352,12 +2333,12 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info,
disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur);
len = sizeof(*disk_key);
- if (cur + len > sys_array_size)
+ if (unlikely(cur + len > sys_array_size))
goto short_read;
cur += len;
btrfs_disk_key_to_cpu(&key, disk_key);
- if (key.type != BTRFS_CHUNK_ITEM_KEY) {
+ if (unlikely(key.type != BTRFS_CHUNK_ITEM_KEY)) {
btrfs_err(fs_info,
"unexpected item type %u in sys_array at offset %u",
key.type, cur);
@@ -2365,10 +2346,10 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info,
}
chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur);
num_stripes = btrfs_stack_chunk_num_stripes(chunk);
- if (cur + btrfs_chunk_item_size(num_stripes) > sys_array_size)
+ if (unlikely(cur + btrfs_chunk_item_size(num_stripes) > sys_array_size))
goto short_read;
type = btrfs_stack_chunk_type(chunk);
- if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
+ if (unlikely(!(type & BTRFS_BLOCK_GROUP_SYSTEM))) {
btrfs_err(fs_info,
"invalid chunk type %llu in sys_array at offset %u",
type, cur);
@@ -2448,21 +2429,7 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
- /*
- * We only support at most 3 sectorsizes: 4K, PAGE_SIZE, MIN_BLOCKSIZE.
- *
- * For 4K page sized systems with non-debug builds, all 3 matches (4K).
- * For 4K page sized systems with debug builds, there are two block sizes
- * supported. (4K and 2K)
- *
- * We can support 16K sectorsize with 64K page size without problem,
- * but such sectorsize/pagesize combination doesn't make much sense.
- * 4K will be our future standard, PAGE_SIZE is supported from the very
- * beginning.
- */
- if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K &&
- sectorsize != PAGE_SIZE &&
- sectorsize != BTRFS_MIN_BLOCKSIZE)) {
+ if (!btrfs_supported_blocksize(sectorsize)) {
btrfs_err(fs_info,
"sectorsize %llu not yet supported for page size %lu",
sectorsize, PAGE_SIZE);
@@ -2629,13 +2596,13 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
ret = btrfs_validate_super(fs_info, sb, -1);
if (ret < 0)
goto out;
- if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
+ if (unlikely(!btrfs_supported_super_csum(btrfs_super_csum_type(sb)))) {
ret = -EUCLEAN;
btrfs_err(fs_info, "invalid csum type, has %u want %u",
btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
goto out;
}
- if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
+ if (unlikely(btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP)) {
ret = -EUCLEAN;
btrfs_err(fs_info,
"invalid incompat flags, has 0x%llx valid mask 0x%llx",
@@ -2665,7 +2632,7 @@ static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int lev
root->node = NULL;
return ret;
}
- if (!extent_buffer_uptodate(root->node)) {
+ if (unlikely(!extent_buffer_uptodate(root->node))) {
free_extent_buffer(root->node);
root->node = NULL;
return -EIO;
@@ -2769,10 +2736,21 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
return ret;
}
+/*
+ * Lockdep gets confused between our buffer_tree, which requires IRQ locking
+ * because we modify marks in IRQ context, and our delayed inode xarray, which
+ * doesn't have these requirements. Use a class key so lockdep doesn't get
+ * them mixed up.
+ */
+static struct lock_class_key buffer_xa_class;
+
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
{
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
- INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+
+ /* Use the same flags as mapping->i_pages. */
+ xa_init_flags(&fs_info->buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
+ lockdep_set_class(&fs_info->buffer_tree.xa_lock, &buffer_xa_class);
+
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
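The class-key pattern above is the standard lockdep idiom for splitting two locks of the same type into separate classes; a self-contained sketch with generic xarrays (the names here are illustrative, not btrfs code):

	#include <linux/xarray.h>
	#include <linux/lockdep.h>

	static struct lock_class_key irq_xa_class;

	struct demo {
		struct xarray irq_tree;		/* modified from IRQ context */
		struct xarray task_tree;	/* task context only */
	};

	static void demo_init(struct demo *d)
	{
		xa_init_flags(&d->irq_tree, XA_FLAGS_LOCK_IRQ);
		/* Give the IRQ-safe tree its own class so lockdep does not
		 * conflate it with same-typed task-context xarrays. */
		lockdep_set_class(&d->irq_tree.xa_lock, &irq_xa_class);
		xa_init(&d->task_tree);
	}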
@@ -2784,7 +2762,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->super_lock);
- spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
spin_lock_init(&fs_info->treelog_bg_lock);
spin_lock_init(&fs_info->zone_active_bgs_lock);
@@ -2829,6 +2806,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
BTRFS_BLOCK_RSV_GLOBAL);
btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
+ btrfs_init_block_rsv(&fs_info->treelog_rsv, BTRFS_BLOCK_RSV_TREELOG);
btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
BTRFS_BLOCK_RSV_DELOPS);
@@ -2862,8 +2840,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
rwlock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT_CACHED;
- extent_io_tree_init(fs_info, &fs_info->excluded_extents,
- IO_TREE_FS_EXCLUDED_EXTENTS);
+ btrfs_extent_io_tree_init(fs_info, &fs_info->excluded_extents,
+ IO_TREE_FS_EXCLUDED_EXTENTS);
mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
@@ -3255,13 +3233,13 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
}
/*
- * Subpage runtime limitation on v1 cache.
+ * Subpage/bs > ps runtime limitation on v1 cache.
*
- * V1 space cache still has some hard codeed PAGE_SIZE usage, while
+ * V1 space cache still has some hard coded PAGE_SIZE usage, while
* we're already defaulting to v2 cache, no need to bother v1 as it's
* going to be deprecated anyway.
*/
- if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
+ if (fs_info->sectorsize != PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
btrfs_warn(fs_info,
"v1 space cache is not supported for page size %lu with sectorsize %u",
PAGE_SIZE, fs_info->sectorsize);
@@ -3315,7 +3293,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
/*
* Read super block and check the signature bytes only
*/
- disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
+ disk_super = btrfs_read_disk_super(fs_devices->latest_dev->bdev, 0, false);
if (IS_ERR(disk_super)) {
ret = PTR_ERR(disk_super);
goto fail_alloc;
@@ -3392,12 +3370,19 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
fs_info->nodesize = nodesize;
+ fs_info->nodesize_bits = ilog2(nodesize);
fs_info->sectorsize = sectorsize;
fs_info->sectorsize_bits = ilog2(sectorsize);
+ fs_info->block_min_order = ilog2(round_up(sectorsize, PAGE_SIZE) >> PAGE_SHIFT);
+ fs_info->block_max_order = ilog2((BITS_PER_LONG << fs_info->sectorsize_bits) >> PAGE_SHIFT);
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
fs_info->fs_devices->fs_info = fs_info;
+ if (fs_info->sectorsize > PAGE_SIZE)
+ btrfs_warn(fs_info,
+ "support for block size %u with page size %lu is experimental, some features may be missing",
+ fs_info->sectorsize, PAGE_SIZE);
/*
* Handle the space caching options appropriately now that we have the
* super block loaded and validated.
@@ -3419,6 +3404,9 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
*/
fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
+ ret = btrfs_alloc_compress_wsm(fs_info);
+ if (ret)
+ goto fail_sb_buffer;
ret = btrfs_init_workqueues(fs_info);
if (ret)
goto fail_sb_buffer;
@@ -3466,7 +3454,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
* below in btrfs_init_dev_replace().
*/
btrfs_free_extra_devids(fs_devices);
- if (!fs_devices->latest_dev->bdev) {
+ if (unlikely(!fs_devices->latest_dev->bdev)) {
btrfs_err(fs_info, "failed to read devices");
ret = -EIO;
goto fail_tree_roots;
@@ -3557,6 +3545,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_sysfs;
}
+ btrfs_zoned_reserve_data_reloc_bg(fs_info);
btrfs_free_zone_cache(fs_info);
btrfs_check_active_zone_reservation(fs_info);
@@ -3677,7 +3666,6 @@ fail_alloc:
iput(fs_info->btree_inode);
fail:
- btrfs_close_devices(fs_info->fs_devices);
ASSERT(ret < 0);
return ret;
}
@@ -3690,7 +3678,7 @@ static void btrfs_end_super_write(struct bio *bio)
bio_for_each_folio_all(fi, bio) {
if (bio->bi_status) {
- btrfs_warn_rl_in_rcu(device->fs_info,
+ btrfs_warn_rl(device->fs_info,
"lost super block write due to IO error on %s (%d)",
btrfs_dev_name(device),
blk_status_to_errno(bio->bi_status));
@@ -3710,85 +3698,6 @@ static void btrfs_end_super_write(struct bio *bio)
bio_put(bio);
}
-struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
- int copy_num, bool drop_cache)
-{
- struct btrfs_super_block *super;
- struct page *page;
- u64 bytenr, bytenr_orig;
- struct address_space *mapping = bdev->bd_mapping;
- int ret;
-
- bytenr_orig = btrfs_sb_offset(copy_num);
- ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
- if (ret == -ENOENT)
- return ERR_PTR(-EINVAL);
- else if (ret)
- return ERR_PTR(ret);
-
- if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
- return ERR_PTR(-EINVAL);
-
- if (drop_cache) {
- /* This should only be called with the primary sb. */
- ASSERT(copy_num == 0);
-
- /*
- * Drop the page of the primary superblock, so later read will
- * always read from the device.
- */
- invalidate_inode_pages2_range(mapping,
- bytenr >> PAGE_SHIFT,
- (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT);
- }
-
- page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
- if (IS_ERR(page))
- return ERR_CAST(page);
-
- super = page_address(page);
- if (btrfs_super_magic(super) != BTRFS_MAGIC) {
- btrfs_release_disk_super(super);
- return ERR_PTR(-ENODATA);
- }
-
- if (btrfs_super_bytenr(super) != bytenr_orig) {
- btrfs_release_disk_super(super);
- return ERR_PTR(-EINVAL);
- }
-
- return super;
-}
-
-
-struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
-{
- struct btrfs_super_block *super, *latest = NULL;
- int i;
- u64 transid = 0;
-
- /* we would like to check all the supers, but that would make
- * a btrfs mount succeed after a mkfs from a different FS.
- * So, we need to add a special mount option to scan for
- * later supers, using BTRFS_SUPER_MIRROR_MAX instead
- */
- for (i = 0; i < 1; i++) {
- super = btrfs_read_dev_one_super(bdev, i, false);
- if (IS_ERR(super))
- continue;
-
- if (!latest || btrfs_super_generation(super) > transid) {
- if (latest)
- btrfs_release_disk_super(super);
-
- latest = super;
- transid = btrfs_super_generation(super);
- }
- }
-
- return super;
-}
-
/*
* Write superblock @sb to the @device. Do not wait for completion, all the
* folios we use for writing are locked.
@@ -3828,8 +3737,8 @@ static int write_dev_supers(struct btrfs_device *device,
continue;
} else if (ret < 0) {
btrfs_err(device->fs_info,
- "couldn't get super block location for mirror %d",
- i);
+ "couldn't get super block location for mirror %d error %d",
+ i, ret);
atomic_inc(&device->sb_write_errors);
continue;
}
@@ -3848,12 +3757,11 @@ static int write_dev_supers(struct btrfs_device *device,
GFP_NOFS);
if (IS_ERR(folio)) {
btrfs_err(device->fs_info,
- "couldn't get super block page for bytenr %llu",
- bytenr);
+ "couldn't get super block page for bytenr %llu error %ld",
+ bytenr, PTR_ERR(folio));
atomic_inc(&device->sb_write_errors);
continue;
}
- ASSERT(folio_order(folio) == 0);
offset = offset_in_folio(folio, bytenr);
disk_super = folio_address(folio) + offset;
@@ -3926,7 +3834,6 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
/* If the folio has been removed, then we know it completed. */
if (IS_ERR(folio))
continue;
- ASSERT(folio_order(folio) == 0);
/* Folio will be unlocked once the write completes. */
folio_wait_locked(folio);
@@ -4041,7 +3948,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
* Checks last_flush_error of disks in order to determine the device
* state.
*/
- if (errors_wait && !btrfs_check_rw_degradable(info, NULL))
+ if (unlikely(errors_wait && !btrfs_check_rw_degradable(info, NULL)))
return -EIO;
return 0;
@@ -4069,7 +3976,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
}
if (min_tolerated == INT_MAX) {
- pr_warn("BTRFS: unknown raid flag: %llu", flags);
+ btrfs_warn(NULL, "unknown raid flag: %llu", flags);
min_tolerated = 0;
}
@@ -4143,7 +4050,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
ret = btrfs_validate_write_super(fs_info, sb);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
btrfs_handle_fs_error(fs_info, -EUCLEAN,
"unexpected superblock corruption detected");
@@ -4154,7 +4061,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
if (ret)
total_errors++;
}
- if (total_errors > max_errors) {
+ if (unlikely(total_errors > max_errors)) {
btrfs_err(fs_info, "%d errors while writing supers",
total_errors);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -4179,7 +4086,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
total_errors++;
}
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- if (total_errors > max_errors) {
+ if (unlikely(total_errors > max_errors)) {
btrfs_handle_fs_error(fs_info, -EIO,
"%d errors while writing supers",
total_errors);
@@ -4246,8 +4153,9 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
u64 found_end;
found = true;
- while (find_first_extent_bit(&trans->dirty_pages, cur,
- &found_start, &found_end, EXTENT_DIRTY, &cached)) {
+ while (btrfs_find_first_extent_bit(&trans->dirty_pages, cur,
+ &found_start, &found_end,
+ EXTENT_DIRTY, &cached)) {
dirty_bytes += found_end + 1 - found_start;
cur = found_end + 1;
}
@@ -4366,7 +4274,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
/*
* When finishing a compressed write bio we schedule a work queue item
- * to finish an ordered extent - btrfs_finish_compressed_write_work()
+ * to finish an ordered extent - end_bbio_compressed_write()
* calls btrfs_finish_ordered_extent() which in turns does a call to
* btrfs_queue_ordered_fn(), and that queues the ordered extent
* completion either in the endio_write_workers work queue or in the
@@ -4374,7 +4282,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
* below, so before we flush them we must flush this queue for the
* workers of compressed writes.
*/
- flush_workqueue(fs_info->compressed_write_workers);
+ flush_workqueue(fs_info->endio_workers);
/*
* After we parked the cleaner kthread, ordered extents may have
@@ -4387,8 +4295,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
*
* So wait for all ongoing ordered extents to complete and then run
* delayed iputs. This works because once we reach this point no one
- * can either create new ordered extents nor create delayed iputs
- * through some other means.
+ * can create new ordered extents, but delayed iputs can still be added
+ * by a reclaim worker (see comments further below).
*
* Also note that btrfs_wait_ordered_roots() is not safe here, because
* it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
@@ -4399,15 +4307,29 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_flush_workqueue(fs_info->endio_write_workers);
/* Ordered extents for free space inodes. */
btrfs_flush_workqueue(fs_info->endio_freespace_worker);
+ /*
+ * Run delayed iputs in case an async reclaim worker is waiting for them
+ * to be run as mentioned above.
+ */
btrfs_run_delayed_iputs(fs_info);
- /* There should be no more workload to generate new delayed iputs. */
- set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state);
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
cancel_work_sync(&fs_info->preempt_reclaim_work);
cancel_work_sync(&fs_info->em_shrinker_work);
+ /*
+ * Run delayed iputs again because an async reclaim worker may have
+ * added new ones if it was flushing delalloc:
+ *
+ * shrink_delalloc() -> btrfs_start_delalloc_roots() ->
+ * start_delalloc_inodes() -> btrfs_add_delayed_iput()
+ */
+ btrfs_run_delayed_iputs(fs_info);
+
+ /* There should be no more workload to generate new delayed iputs. */
+ set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state);
+
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
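
The drain/cancel/drain ordering above exists because an async reclaim worker
can queue new delayed iputs while it is being cancelled, so a single drain
before the cancellations could leave work behind. A toy single-threaded model
of that ordering (hypothetical names, not kernel code):

#include <stdbool.h>
#include <stdio.h>

static int pending_iputs;   /* stands in for the delayed iput list */
static bool no_new_iputs;   /* stands in for the NO_DELAYED_IPUT flag */

static void run_delayed_iputs(void)
{
	printf("ran %d delayed iputs\n", pending_iputs);
	pending_iputs = 0;
}

static void cancel_reclaim_worker(void)
{
	/* Flushing delalloc on the way out queues more iputs. */
	pending_iputs += 2;
}

int main(void)
{
	pending_iputs = 3;

	run_delayed_iputs();      /* drain what ordered extents left behind */
	cancel_reclaim_worker();  /* may queue new iputs while stopping */
	run_delayed_iputs();      /* drain again, the queue now stays empty */
	no_new_iputs = true;      /* only now is it safe to forbid new ones */

	printf("new iputs forbidden: %d\n", no_new_iputs);
	return 0;
}
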
@@ -4443,7 +4365,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
if (btrfs_check_quota_leak(fs_info)) {
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ DEBUG_WARN("qgroup reserved space leaked");
btrfs_err(fs_info, "qgroup reserved space leaked");
}
@@ -4490,7 +4412,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
iput(fs_info->btree_inode);
btrfs_mapping_tree_free(fs_info);
- btrfs_close_devices(fs_info->fs_devices);
}
void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
@@ -4700,9 +4621,9 @@ static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
u64 start = 0;
u64 end;
- while (find_first_extent_bit(dirty_pages, start, &start, &end,
- mark, NULL)) {
- clear_extent_bits(dirty_pages, start, end, mark);
+ while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark, NULL)) {
+ btrfs_clear_extent_bit(dirty_pages, start, end, mark, NULL);
while (start <= end) {
eb = find_extent_buffer(fs_info, start);
start += fs_info->nodesize;
@@ -4735,14 +4656,14 @@ static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
* the same extent range.
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex);
- if (!find_first_extent_bit(unpin, 0, &start, &end,
- EXTENT_DIRTY, &cached_state)) {
+ if (!btrfs_find_first_extent_bit(unpin, 0, &start, &end,
+ EXTENT_DIRTY, &cached_state)) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
}
- clear_extent_dirty(unpin, start, end, &cached_state);
- free_extent_state(cached_state);
+ btrfs_clear_extent_dirty(unpin, start, end, &cached_state);
+ btrfs_free_extent_state(cached_state);
btrfs_error_unpin_extent_range(fs_info, start, end);
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
cond_resched();
@@ -4945,7 +4866,7 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist a root
* with such id, but this is out of valid range.
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 587842991b24..5320da83d0cf 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -9,7 +9,8 @@
#include <linux/sizes.h>
#include <linux/compiler_types.h>
#include "ctree.h"
-#include "fs.h"
+#include "bio.h"
+#include "ordered-data.h"
struct block_device;
struct super_block;
@@ -58,9 +59,6 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *sb, int mirror_num);
int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
-struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
-struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
- int copy_num, bool drop_cache);
int btrfs_commit_super(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
const struct btrfs_key *key);
@@ -109,12 +107,11 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
void btrfs_put_root(struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *buf);
-int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
- int atomic);
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, bool atomic);
int btrfs_read_extent_buffer(struct extent_buffer *buf,
const struct btrfs_tree_parent_check *check);
-blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio);
+int btree_csum_one_bio(struct btrfs_bio *bbio);
int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 7fc8a3200b40..230d9326b685 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -23,7 +23,11 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
int type;
if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
- *max_len = BTRFS_FID_SIZE_CONNECTABLE;
+ if (btrfs_root_id(BTRFS_I(inode)->root) !=
+ btrfs_root_id(BTRFS_I(parent)->root))
+ *max_len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
+ else
+ *max_len = BTRFS_FID_SIZE_CONNECTABLE;
return FILEID_INVALID;
} else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
*max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
@@ -45,6 +49,8 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
parent_root_id = btrfs_root_id(BTRFS_I(parent)->root);
if (parent_root_id != fid->root_objectid) {
+ if (*max_len < BTRFS_FID_SIZE_CONNECTABLE_ROOT)
+ return FILEID_INVALID;
fid->parent_root_objectid = parent_root_id;
len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
type = FILEID_BTRFS_WITH_PARENT_ROOT;
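
Both fixes in this hunk lean on the same export_operations contract: when the
caller's buffer is too small, encode_fh stores the required length (counted in
32-bit words) in *max_len and returns FILEID_INVALID so the caller can retry.
A standalone sketch of that contract, with made-up sizes standing in for the
BTRFS_FID_SIZE_* constants:

#include <stdio.h>

#define FILEID_INVALID 0xff

/* Hypothetical sizes in 32-bit words. */
#define FID_CONNECTABLE       8
#define FID_CONNECTABLE_ROOT 10

static int encode_fh(int *max_len, int want_parent, int cross_subvolume)
{
	int need = cross_subvolume ? FID_CONNECTABLE_ROOT : FID_CONNECTABLE;

	if (want_parent && *max_len < need) {
		*max_len = need;        /* report how much is really needed */
		return FILEID_INVALID;  /* caller retries with a bigger buffer */
	}
	*max_len = need;
	return 1;  /* stand-in for a real fid type */
}

int main(void)
{
	int len = 8;

	/* A cross-subvolume parent needs the larger root-carrying handle. */
	if (encode_fh(&len, 1, 1) == FILEID_INVALID)
		printf("retry with %d words\n", len);  /* 10 */
	return 0;
}
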
@@ -174,7 +180,7 @@ struct dentry *btrfs_get_parent(struct dentry *child)
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto fail;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset of -1 found, there would have to exist an
* inode with such number or a root with such id.
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index 13de6af279e5..bb2ca1c9c7b0 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -42,8 +42,9 @@ static inline void btrfs_extent_state_leak_debug_check(void)
struct extent_state *state;
while (!list_empty(&states)) {
- state = list_entry(states.next, struct extent_state, leak_list);
- pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
+ state = list_first_entry(&states, struct extent_state, leak_list);
+ btrfs_err(NULL,
+ "state leak: start %llu end %llu state %u in tree %d refs %d",
state->start, state->end, state->state,
extent_state_in_tree(state),
refcount_read(&state->refs));
@@ -59,13 +60,12 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
struct extent_io_tree *tree,
u64 start, u64 end)
{
- const struct btrfs_inode *inode;
+ const struct btrfs_inode *inode = tree->inode;
u64 isize;
if (tree->owner != IO_TREE_INODE_IO)
return;
- inode = extent_io_tree_to_inode_const(tree);
isize = i_size_read(&inode->vfs_inode);
if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
btrfs_debug_rl(inode->root->fs_info,
@@ -80,25 +80,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
-
-/*
- * The only tree allowed to set the inode is IO_TREE_INODE_IO.
- */
-static bool is_inode_io_tree(const struct extent_io_tree *tree)
-{
- return tree->owner == IO_TREE_INODE_IO;
-}
-
-/* Return the inode if it's valid for the given tree, otherwise NULL. */
-struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree)
-{
- if (tree->owner == IO_TREE_INODE_IO)
- return tree->inode;
- return NULL;
-}
-
/* Read-only access to the inode. */
-const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree)
+const struct btrfs_inode *btrfs_extent_io_tree_to_inode(const struct extent_io_tree *tree)
{
if (tree->owner == IO_TREE_INODE_IO)
return tree->inode;
@@ -106,15 +89,15 @@ const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_t
}
/* For read-only access to fs_info. */
-const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree)
+const struct btrfs_fs_info *btrfs_extent_io_tree_to_fs_info(const struct extent_io_tree *tree)
{
if (tree->owner == IO_TREE_INODE_IO)
return tree->inode->root->fs_info;
return tree->fs_info;
}
-void extent_io_tree_init(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *tree, unsigned int owner)
+void btrfs_extent_io_tree_init(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *tree, unsigned int owner)
{
tree->state = RB_ROOT;
spin_lock_init(&tree->lock);
@@ -129,7 +112,7 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info,
* aren't any waiters on any extent state record (EXTENT_LOCK_BITS are never
* set on any extent state when calling this function).
*/
-void extent_io_tree_release(struct extent_io_tree *tree)
+void btrfs_extent_io_tree_release(struct extent_io_tree *tree)
{
struct rb_root root;
struct extent_state *state;
@@ -148,7 +131,7 @@ void extent_io_tree_release(struct extent_io_tree *tree)
* (see wait_extent_bit()).
*/
ASSERT(!waitqueue_active(&state->wq));
- free_extent_state(state);
+ btrfs_free_extent_state(state);
cond_resched_lock(&tree->lock);
}
/*
@@ -176,7 +159,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
btrfs_leak_debug_add_state(state);
refcount_set(&state->refs, 1);
init_waitqueue_head(&state->wq);
- trace_alloc_extent_state(state, mask, _RET_IP_);
+ trace_btrfs_alloc_extent_state(state, mask, _RET_IP_);
return state;
}
@@ -188,14 +171,14 @@ static struct extent_state *alloc_extent_state_atomic(struct extent_state *preal
return prealloc;
}
-void free_extent_state(struct extent_state *state)
+void btrfs_free_extent_state(struct extent_state *state)
{
if (!state)
return;
if (refcount_dec_and_test(&state->refs)) {
WARN_ON(extent_state_in_tree(state));
btrfs_leak_debug_del_state(state);
- trace_free_extent_state(state, _RET_IP_);
+ trace_btrfs_free_extent_state(state, _RET_IP_);
kmem_cache_free(extent_state_cache, state);
}
}
@@ -222,38 +205,34 @@ static inline struct extent_state *next_state(struct extent_state *state)
{
struct rb_node *next = rb_next(&state->rb_node);
- if (next)
- return rb_entry(next, struct extent_state, rb_node);
- else
- return NULL;
+ return rb_entry_safe(next, struct extent_state, rb_node);
}
static inline struct extent_state *prev_state(struct extent_state *state)
{
struct rb_node *next = rb_prev(&state->rb_node);
- if (next)
- return rb_entry(next, struct extent_state, rb_node);
- else
- return NULL;
+ return rb_entry_safe(next, struct extent_state, rb_node);
}
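
The rb_entry_safe() form used above folds the NULL check into the
container_of() step, which is what lets next_state() and prev_state() collapse
to a single return. A self-contained model of the pattern over a hand-rolled
node type (the kernel macro evaluates its argument only once via a statement
expression; this sketch does not bother):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* NULL-propagating container_of, in the spirit of rb_entry_safe(). */
#define entry_safe(ptr, type, member) \
	((ptr) ? container_of(ptr, type, member) : (type *)NULL)

struct node { struct node *next; };
struct state { int id; struct node rb; };

static struct state *next_state(struct state *s)
{
	return entry_safe(s->rb.next, struct state, rb);
}

int main(void)
{
	struct state b = { 2, { NULL } };
	struct state a = { 1, { &b.rb } };

	printf("%d\n", next_state(&a)->id);     /* 2 */
	printf("%p\n", (void *)next_state(&b)); /* (nil), no next entry */
	return 0;
}
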
/*
- * Search @tree for an entry that contains @offset. Such entry would have
- * entry->start <= offset && entry->end >= offset.
+ * Search @tree for an entry that contains @offset or, if none exists, for the
+ * first entry that starts and ends after that offset.
*
* @tree: the tree to search
- * @offset: offset that should fall within an entry in @tree
+ * @offset: search offset
* @node_ret: pointer where new node should be anchored (used when inserting an
* entry in the tree)
* @parent_ret: points to entry which would have been the parent of the entry,
* containing @offset
*
- * Return a pointer to the entry that contains @offset byte address and don't change
- * @node_ret and @parent_ret.
+ * Return a pointer to the entry that contains @offset byte address.
+ *
+ * If no such entry exists, return the first entry that starts and ends after
+ * @offset if one exists, otherwise NULL.
*
- * If no such entry exists, return pointer to entry that ends before @offset
- * and fill parameters @node_ret and @parent_ret, ie. does not return NULL.
+ * If the returned entry starts at @offset, then @node_ret and @parent_ret
+ * aren't changed.
*/
static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree,
u64 offset,
@@ -282,7 +261,11 @@ static inline struct extent_state *tree_search_for_insert(struct extent_io_tree
if (parent_ret)
*parent_ret = prev;
- /* Search neighbors until we find the first one past the end */
+ /*
+ * Return either the current entry if it contains offset (it ends after
+ * or at offset) or the first entry that starts and ends after offset if
+ * one exists, or NULL.
+ */
while (entry && offset > entry->end)
entry = next_state(entry);
@@ -351,7 +334,7 @@ static void __cold extent_io_tree_panic(const struct extent_io_tree *tree,
const char *opname,
int err)
{
- btrfs_panic(extent_io_tree_to_fs_info(tree), err,
+ btrfs_panic(btrfs_extent_io_tree_to_fs_info(tree), err,
"extent io tree error on %s state start %llu end %llu",
opname, state->start, state->end);
}
@@ -362,13 +345,12 @@ static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *s
prev = prev_state(state);
if (prev && prev->end == state->start - 1 && prev->state == state->state) {
- if (is_inode_io_tree(tree))
- btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
- state, prev);
+ if (tree->owner == IO_TREE_INODE_IO)
+ btrfs_merge_delalloc_extent(tree->inode, state, prev);
state->start = prev->start;
rb_erase(&prev->rb_node, &tree->state);
RB_CLEAR_NODE(&prev->rb_node);
- free_extent_state(prev);
+ btrfs_free_extent_state(prev);
}
}
@@ -378,13 +360,12 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s
next = next_state(state);
if (next && next->start == state->end + 1 && next->state == state->state) {
- if (is_inode_io_tree(tree))
- btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
- state, next);
+ if (tree->owner == IO_TREE_INODE_IO)
+ btrfs_merge_delalloc_extent(tree->inode, state, next);
state->end = next->end;
rb_erase(&next->rb_node, &tree->state);
RB_CLEAR_NODE(&next->rb_node);
- free_extent_state(next);
+ btrfs_free_extent_state(next);
}
}
@@ -413,8 +394,8 @@ static void set_state_bits(struct extent_io_tree *tree,
u32 bits_to_set = bits & ~EXTENT_CTLBITS;
int ret;
- if (is_inode_io_tree(tree))
- btrfs_set_delalloc_extent(extent_io_tree_to_inode(tree), state, bits);
+ if (tree->owner == IO_TREE_INODE_IO)
+ btrfs_set_delalloc_extent(tree->inode, state, bits);
ret = add_extent_changeset(state, bits_to_set, changeset, 1);
BUG_ON(ret < 0);
@@ -459,10 +440,9 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
if (state->end < entry->start) {
if (try_merge && end == entry->start &&
state->state == entry->state) {
- if (is_inode_io_tree(tree))
- btrfs_merge_delalloc_extent(
- extent_io_tree_to_inode(tree),
- state, entry);
+ if (tree->owner == IO_TREE_INODE_IO)
+ btrfs_merge_delalloc_extent(tree->inode,
+ state, entry);
entry->start = state->start;
merge_prev_state(tree, entry);
state->state = 0;
@@ -472,10 +452,9 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
} else if (state->end > entry->end) {
if (try_merge && entry->end == start &&
state->state == entry->state) {
- if (is_inode_io_tree(tree))
- btrfs_merge_delalloc_extent(
- extent_io_tree_to_inode(tree),
- state, entry);
+ if (tree->owner == IO_TREE_INODE_IO)
+ btrfs_merge_delalloc_extent(tree->inode,
+ state, entry);
entry->end = state->end;
merge_next_state(tree, entry);
state->state = 0;
@@ -527,9 +506,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
struct rb_node *parent = NULL;
struct rb_node **node;
- if (is_inode_io_tree(tree))
- btrfs_split_delalloc_extent(extent_io_tree_to_inode(tree), orig,
- split);
+ if (tree->owner == IO_TREE_INODE_IO)
+ btrfs_split_delalloc_extent(tree->inode, orig, split);
prealloc->start = orig->start;
prealloc->end = split - 1;
@@ -549,7 +527,7 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
} else if (prealloc->end > entry->end) {
node = &(*node)->rb_right;
} else {
- free_extent_state(prealloc);
+ btrfs_free_extent_state(prealloc);
return -EEXIST;
}
}
@@ -561,6 +539,18 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
}
/*
+ * Use this during tree iteration to avoid doing next node searches when it's
+ * not needed (the current record ends at or after the target range's end).
+ */
+static inline struct extent_state *next_search_state(struct extent_state *state, u64 end)
+{
+ if (state->end < end)
+ return next_state(state);
+
+ return NULL;
+}
+
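
To see what the helper above buys, compare a plain walk with one that stops as
soon as the current record reaches the end of the target range. A standalone
sketch over a sorted array of inclusive ranges:

#include <stdio.h>

struct range { unsigned long long start, end; };  /* inclusive bounds */

/* Hand out the next record only if @cur doesn't already reach @end. */
static const struct range *next_search(const struct range *cur,
				       const struct range *last,
				       unsigned long long end)
{
	if (cur->end < end && cur < last)
		return cur + 1;
	return NULL;
}

int main(void)
{
	const struct range r[] = { { 0, 4095 }, { 4096, 8191 }, { 8192, 12287 } };
	const struct range *cur = &r[0];
	int visited = 0;

	while (cur) {
		visited++;
		cur = next_search(cur, &r[2], 8191);
	}
	printf("visited %d records\n", visited);  /* 2, the third is skipped */
	return 0;
}

The skipped third record is exactly the next-node search the helper avoids.
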
+/*
* Utility function to clear some bits in an extent state struct. It will
* optionally wake up anyone waiting on this state (wake == 1).
*
@@ -569,16 +559,15 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
*/
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
struct extent_state *state,
- u32 bits, int wake,
+ u32 bits, int wake, u64 end,
struct extent_changeset *changeset)
{
struct extent_state *next;
u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
int ret;
- if (is_inode_io_tree(tree))
- btrfs_clear_delalloc_extent(extent_io_tree_to_inode(tree), state,
- bits);
+ if (tree->owner == IO_TREE_INODE_IO)
+ btrfs_clear_delalloc_extent(tree->inode, state, bits);
ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
BUG_ON(ret < 0);
@@ -586,17 +575,17 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
if (wake)
wake_up(&state->wq);
if (state->state == 0) {
- next = next_state(state);
+ next = next_search_state(state, end);
if (extent_state_in_tree(state)) {
rb_erase(&state->rb_node, &tree->state);
RB_CLEAR_NODE(&state->rb_node);
- free_extent_state(state);
+ btrfs_free_extent_state(state);
} else {
WARN_ON(1);
}
} else {
merge_state(tree, state);
- next = next_state(state);
+ next = next_search_state(state, end);
}
return next;
}
@@ -620,18 +609,18 @@ static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask)
*
* This takes the tree lock, and returns 0 on success and < 0 on error.
*/
-int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_state **cached_state,
- struct extent_changeset *changeset)
+int btrfs_clear_extent_bit_changeset(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state,
+ struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *cached;
struct extent_state *prealloc = NULL;
u64 last_end;
- int err;
- int clear = 0;
- int wake;
- int delete = (bits & EXTENT_CLEAR_ALL_BITS);
+ int ret = 0;
+ bool clear;
+ bool wake;
+ const bool delete = (bits & EXTENT_CLEAR_ALL_BITS);
gfp_t mask;
set_gfp_mask_from_bits(&bits, &mask);
@@ -644,9 +633,8 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (bits & EXTENT_DELALLOC)
bits |= EXTENT_NORESERVE;
- wake = ((bits & EXTENT_LOCK_BITS) ? 1 : 0);
- if (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY))
- clear = 1;
+ wake = (bits & EXTENT_LOCK_BITS);
+ clear = (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY));
again:
if (!prealloc) {
/*
@@ -676,7 +664,7 @@ again:
goto hit_next;
}
if (clear)
- free_extent_state(cached);
+ btrfs_free_extent_state(cached);
}
/* This search will find the extents that end after our range starts. */
@@ -691,7 +679,7 @@ hit_next:
/* The state doesn't have the wanted bits, go ahead. */
if (!(state->state & bits)) {
- state = next_state(state);
+ state = next_search_state(state, end);
goto next;
}
@@ -714,18 +702,24 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc)
goto search_again;
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, state, "split", err);
-
+ ret = split_state(tree, state, prealloc, start);
prealloc = NULL;
- if (err)
+ if (ret) {
+ extent_io_tree_panic(tree, state, "split", ret);
goto out;
+ }
if (state->end <= end) {
- state = clear_state_bit(tree, state, bits, wake, changeset);
+ state = clear_state_bit(tree, state, bits, wake, end,
+ changeset);
goto next;
}
- goto search_again;
+ if (need_resched())
+ goto search_again;
+ /*
+ * Fallthrough and try atomic extent state allocation if needed.
+	 * If it fails we'll jump to 'search_again', retry the allocation
+	 * in non-atomic mode and start the search again.
+ */
}
/*
* | ---- desired range ---- |
@@ -736,30 +730,31 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc)
goto search_again;
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, state, "split", err);
+ ret = split_state(tree, state, prealloc, end + 1);
+ if (ret) {
+ extent_io_tree_panic(tree, state, "split", ret);
+ prealloc = NULL;
+ goto out;
+ }
if (wake)
wake_up(&state->wq);
- clear_state_bit(tree, prealloc, bits, wake, changeset);
+ clear_state_bit(tree, prealloc, bits, wake, end, changeset);
prealloc = NULL;
goto out;
}
- state = clear_state_bit(tree, state, bits, wake, changeset);
+ state = clear_state_bit(tree, state, bits, wake, end, changeset);
next:
- if (last_end == (u64)-1)
+ if (last_end >= end)
goto out;
start = last_end + 1;
- if (start <= end && state && !need_resched())
+ if (state && !need_resched())
goto hit_next;
search_again:
- if (start > end)
- goto out;
spin_unlock(&tree->lock);
if (gfpflags_allow_blocking(mask))
cond_resched();
@@ -767,10 +762,9 @@ search_again:
out:
spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
+ btrfs_free_extent_state(prealloc);
- return 0;
+ return ret;
}
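
A recurring detail in the function above is the split at end + 1: split_state()
hands the preallocated record [orig->start, split - 1] and leaves
[split, orig->end] in the original, so clearing bits only on the prealloc
leaves the tail of the state untouched. The bookkeeping in isolation:

#include <stdio.h>

struct state { unsigned long long start, end; };  /* inclusive bounds */

/* Mirror of the split_state() bookkeeping: cut @orig at @split_at. */
static void split(struct state *orig, struct state *prealloc,
		  unsigned long long split_at)
{
	prealloc->start = orig->start;
	prealloc->end = split_at - 1;
	orig->start = split_at;
}

int main(void)
{
	struct state s = { 0, 16383 }, pre;
	unsigned long long clear_end = 8191;

	split(&s, &pre, clear_end + 1);
	/* clear on [0, 8191], keep [8192, 16383] */
	printf("clear on [%llu, %llu], keep [%llu, %llu]\n",
	       pre.start, pre.end, s.start, s.end);
	return 0;
}
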
@@ -820,7 +814,7 @@ process_node:
schedule();
spin_lock(&tree->lock);
finish_wait(&state->wq, &wait);
- free_extent_state(state);
+ btrfs_free_extent_state(state);
goto again;
}
start = state->end + 1;
@@ -838,7 +832,7 @@ out:
if (cached_state && *cached_state) {
state = *cached_state;
*cached_state = NULL;
- free_extent_state(state);
+ btrfs_free_extent_state(state);
}
spin_unlock(&tree->lock);
}
@@ -877,7 +871,7 @@ static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *t
*/
state = tree_search(tree, start);
while (state) {
- if (state->end >= start && (state->state & bits))
+ if (state->state & bits)
return state;
state = next_state(state);
}
@@ -892,9 +886,9 @@ static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *t
* Return true if we find something, and update @start_ret and @end_ret.
* Return false if we found nothing.
*/
-bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits,
- struct extent_state **cached_state)
+bool btrfs_find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits,
+ struct extent_state **cached_state)
{
struct extent_state *state;
bool ret = false;
@@ -914,13 +908,13 @@ bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
* again. If we haven't found any, clear as well since
* it's now useless.
*/
- free_extent_state(*cached_state);
+ btrfs_free_extent_state(*cached_state);
*cached_state = NULL;
if (state)
goto got_it;
goto out;
}
- free_extent_state(*cached_state);
+ btrfs_free_extent_state(*cached_state);
*cached_state = NULL;
}
@@ -952,14 +946,17 @@ out:
* contiguous area for given bits. We will search to the first bit we find, and
* then walk down the tree until we find a non-contiguous area. The area
* returned will be the full contiguous area with the bits set.
+ *
+ * Returns true if we found a range with the given bits set, in which case
+ * @start_ret and @end_ret are updated, or false if no range was found.
*/
-int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits)
+bool btrfs_find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits)
{
struct extent_state *state;
- int ret = 1;
+ bool ret = false;
- ASSERT(!btrfs_fs_incompat(extent_io_tree_to_fs_info(tree), NO_HOLES));
+ ASSERT(!btrfs_fs_incompat(btrfs_extent_io_tree_to_fs_info(tree), NO_HOLES));
spin_lock(&tree->lock);
state = find_first_extent_bit_state(tree, start, bits);
@@ -971,7 +968,7 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
break;
*end_ret = state->end;
}
- ret = 0;
+ ret = true;
}
spin_unlock(&tree->lock);
return ret;
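
The walk in btrfs_find_contiguous_extent_bit() extends the result only while
records line up back to back. A standalone sketch of the merge-while-adjacent
loop (ignoring the bit matching the real function also requires):

#include <stdbool.h>
#include <stdio.h>

struct range { unsigned long long start, end; };  /* inclusive bounds */

/* Grow [*s, *e] while the next record starts at the previous end + 1. */
static bool find_contiguous(const struct range *r, int n,
			    unsigned long long *s, unsigned long long *e)
{
	if (n == 0)
		return false;
	*s = r[0].start;
	*e = r[0].end;
	for (int i = 1; i < n; i++) {
		if (r[i].start != *e + 1)
			break;
		*e = r[i].end;
	}
	return true;
}

int main(void)
{
	/* The third record leaves a hole, so the result stops at 8191. */
	const struct range r[] = { { 0, 4095 }, { 4096, 8191 }, { 12288, 16383 } };
	unsigned long long s, e;

	if (find_contiguous(r, 3, &s, &e))
		printf("[%llu, %llu]\n", s, e);
	return 0;
}
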
@@ -1046,11 +1043,11 @@ out:
*
* [start, end] is inclusive This takes the tree lock.
*/
-static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, u64 *failed_start,
- struct extent_state **failed_state,
- struct extent_state **cached_state,
- struct extent_changeset *changeset)
+static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u64 *failed_start,
+ struct extent_state **failed_state,
+ struct extent_state **cached_state,
+ struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -1129,12 +1126,11 @@ hit_next:
set_state_bits(tree, state, bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
- if (last_end == (u64)-1)
+ if (last_end >= end)
goto out;
start = last_end + 1;
state = next_state(state);
- if (start < end && state && state->start == start &&
- !need_resched())
+ if (state && state->start == start && !need_resched())
goto hit_next;
goto search_again;
}
@@ -1186,12 +1182,11 @@ hit_next:
set_state_bits(tree, state, bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
- if (last_end == (u64)-1)
+ if (last_end >= end)
goto out;
start = last_end + 1;
state = next_state(state);
- if (start < end && state && state->start == start &&
- !need_resched())
+ if (state && state->start == start && !need_resched())
goto hit_next;
}
goto search_again;
@@ -1204,14 +1199,8 @@ hit_next:
* extent we found.
*/
if (state->start > start) {
- u64 this_end;
struct extent_state *inserted_state;
- if (end < last_start)
- this_end = end;
- else
- this_end = last_start - 1;
-
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc)
goto search_again;
@@ -1221,17 +1210,38 @@ hit_next:
* extent.
*/
prealloc->start = start;
- prealloc->end = this_end;
+ if (end < last_start)
+ prealloc->end = end;
+ else
+ prealloc->end = last_start - 1;
+
inserted_state = insert_state(tree, prealloc, bits, changeset);
if (IS_ERR(inserted_state)) {
ret = PTR_ERR(inserted_state);
extent_io_tree_panic(tree, prealloc, "insert", ret);
+ goto out;
}
cache_state(inserted_state, cached_state);
if (inserted_state == prealloc)
prealloc = NULL;
- start = this_end + 1;
+ start = inserted_state->end + 1;
+
+ /* Beyond target range, stop. */
+ if (start > end)
+ goto out;
+
+ if (need_resched())
+ goto search_again;
+
+ state = next_search_state(inserted_state, end);
+ /*
+ * If there's a next state, whether contiguous or not, we don't
+	 * need to unlock and start the search again. If it's not contiguous
+	 * we will end up back here and try to allocate a prealloc state to insert.
+ */
+ if (state)
+ goto hit_next;
goto search_again;
}
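
The rewrite above drops the this_end temporary in favor of clamping the new
record directly: it ends at the caller's end or at the byte just before the
next existing state, whichever comes first. The arithmetic in isolation:

#include <stdio.h>

static unsigned long long gap_fill_end(unsigned long long end,
				       unsigned long long last_start)
{
	/* Stop at the range end or just before the next state. */
	return end < last_start ? end : last_start - 1;
}

int main(void)
{
	/* Next state starts at 8192, so a set up to 16383 must stop at 8191. */
	printf("%llu\n", gap_fill_end(16383, 8192));  /* 8191 */
	printf("%llu\n", gap_fill_end(4095, 8192));   /* 4095 */
	return 0;
}
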
/*
@@ -1252,8 +1262,11 @@ hit_next:
if (!prealloc)
goto search_again;
ret = split_state(tree, state, prealloc, end + 1);
- if (ret)
+ if (ret) {
extent_io_tree_panic(tree, state, "split", ret);
+ prealloc = NULL;
+ goto out;
+ }
set_state_bits(tree, prealloc, bits, changeset);
cache_state(prealloc, cached_state);
@@ -1272,18 +1285,16 @@ search_again:
out:
spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
+ btrfs_free_extent_state(prealloc);
return ret;
}
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_state **cached_state)
+int btrfs_set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state)
{
- return __set_extent_bit(tree, start, end, bits, NULL, NULL,
- cached_state, NULL);
+ return set_extent_bit(tree, start, end, bits, NULL, NULL, cached_state, NULL);
}
/*
@@ -1304,9 +1315,9 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
*
* All allocations are done with GFP_NOFS.
*/
-int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, u32 clear_bits,
- struct extent_state **cached_state)
+int btrfs_convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u32 clear_bits,
+ struct extent_state **cached_state)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -1374,12 +1385,11 @@ hit_next:
if (state->start == start && state->end <= end) {
set_state_bits(tree, state, bits, NULL);
cache_state(state, cached_state);
- state = clear_state_bit(tree, state, clear_bits, 0, NULL);
- if (last_end == (u64)-1)
+ state = clear_state_bit(tree, state, clear_bits, 0, end, NULL);
+ if (last_end >= end)
goto out;
start = last_end + 1;
- if (start < end && state && state->start == start &&
- !need_resched())
+ if (state && state->start == start && !need_resched())
goto hit_next;
goto search_again;
}
@@ -1406,20 +1416,19 @@ hit_next:
goto out;
}
ret = split_state(tree, state, prealloc, start);
- if (ret)
- extent_io_tree_panic(tree, state, "split", ret);
prealloc = NULL;
- if (ret)
+ if (ret) {
+ extent_io_tree_panic(tree, state, "split", ret);
goto out;
+ }
if (state->end <= end) {
set_state_bits(tree, state, bits, NULL);
cache_state(state, cached_state);
- state = clear_state_bit(tree, state, clear_bits, 0, NULL);
- if (last_end == (u64)-1)
+ state = clear_state_bit(tree, state, clear_bits, 0, end, NULL);
+ if (last_end >= end)
goto out;
start = last_end + 1;
- if (start < end && state && state->start == start &&
- !need_resched())
+ if (state && state->start == start && !need_resched())
goto hit_next;
}
goto search_again;
@@ -1432,14 +1441,8 @@ hit_next:
* extent we found.
*/
if (state->start > start) {
- u64 this_end;
struct extent_state *inserted_state;
- if (end < last_start)
- this_end = end;
- else
- this_end = last_start - 1;
-
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
ret = -ENOMEM;
@@ -1451,16 +1454,37 @@ hit_next:
* extent.
*/
prealloc->start = start;
- prealloc->end = this_end;
+ if (end < last_start)
+ prealloc->end = end;
+ else
+ prealloc->end = last_start - 1;
+
inserted_state = insert_state(tree, prealloc, bits, NULL);
if (IS_ERR(inserted_state)) {
ret = PTR_ERR(inserted_state);
extent_io_tree_panic(tree, prealloc, "insert", ret);
+ goto out;
}
cache_state(inserted_state, cached_state);
if (inserted_state == prealloc)
prealloc = NULL;
- start = this_end + 1;
+ start = inserted_state->end + 1;
+
+ /* Beyond target range, stop. */
+ if (start > end)
+ goto out;
+
+ if (need_resched())
+ goto search_again;
+
+ state = next_search_state(inserted_state, end);
+ /*
+ * If there's a next state, whether contiguous or not, we don't
+	 * need to unlock and start the search again. If it's not contiguous
+	 * we will end up back here and try to allocate a prealloc state to insert.
+ */
+ if (state)
+ goto hit_next;
goto search_again;
}
/*
@@ -1477,12 +1501,15 @@ hit_next:
}
ret = split_state(tree, state, prealloc, end + 1);
- if (ret)
+ if (ret) {
extent_io_tree_panic(tree, state, "split", ret);
+ prealloc = NULL;
+ goto out;
+ }
set_state_bits(tree, prealloc, bits, NULL);
cache_state(prealloc, cached_state);
- clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
+ clear_state_bit(tree, prealloc, clear_bits, 0, end, NULL);
prealloc = NULL;
goto out;
}
@@ -1497,8 +1524,7 @@ search_again:
out:
spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
+ btrfs_free_extent_state(prealloc);
return ret;
}
@@ -1518,8 +1544,8 @@ out:
* spans (last_range_end, end of device]. In this case it's up to the caller to
* trim @end_ret to the appropriate size.
*/
-void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits)
+void btrfs_find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits)
{
struct extent_state *state;
struct extent_state *prev = NULL, *next = NULL;
@@ -1636,10 +1662,10 @@ out:
* all given bits set. If the returned number of bytes is greater than zero
* then @start is updated with the offset of the first byte with the bits set.
*/
-u64 count_range_bits(struct extent_io_tree *tree,
- u64 *start, u64 search_end, u64 max_bytes,
- u32 bits, int contig,
- struct extent_state **cached_state)
+u64 btrfs_count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end, u64 max_bytes,
+ u32 bits, bool contig,
+ struct extent_state **cached_state)
{
struct extent_state *state = NULL;
struct extent_state *cached;
@@ -1710,7 +1736,7 @@ search:
}
if (cached_state) {
- free_extent_state(*cached_state);
+ btrfs_free_extent_state(*cached_state);
*cached_state = state;
if (state)
refcount_inc(&state->refs);
@@ -1724,16 +1750,16 @@ search:
/*
* Check if the single @bit exists in the given range.
*/
-bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit)
+bool btrfs_test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit)
{
- struct extent_state *state = NULL;
+ struct extent_state *state;
bool bitset = false;
ASSERT(is_power_of_2(bit));
spin_lock(&tree->lock);
state = tree_search(tree, start);
- while (state && start <= end) {
+ while (state) {
if (state->start > end)
break;
@@ -1742,9 +1768,7 @@ bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32
break;
}
- /* If state->end is (u64)-1, start will overflow to 0 */
- start = state->end + 1;
- if (start > end || start == 0)
+ if (state->end >= end)
break;
state = next_state(state);
}
@@ -1752,16 +1776,51 @@ bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32
return bitset;
}
+void btrfs_get_range_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 *bits,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+
+ /*
+ * The cached state is currently mandatory and not used to start the
+ * search, only to cache the first state record found in the range.
+ */
+ ASSERT(cached_state != NULL);
+ ASSERT(*cached_state == NULL);
+
+ *bits = 0;
+
+ spin_lock(&tree->lock);
+ state = tree_search(tree, start);
+ if (state && state->start < end) {
+ *cached_state = state;
+ refcount_inc(&state->refs);
+ }
+ while (state) {
+ if (state->start > end)
+ break;
+
+ *bits |= state->state;
+
+ if (state->end >= end)
+ break;
+
+ state = next_state(state);
+ }
+ spin_unlock(&tree->lock);
+}
+
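
btrfs_get_range_bits() above answers a different question from the test
helpers: it ORs together the state bits of every record overlapping the range
(and pins the first one in *cached_state). A standalone sketch of the
accumulation over a sorted array:

#include <stdio.h>

struct state { unsigned long long start, end; unsigned bits; };

/* OR together the bits of all records overlapping [start, end]. */
static unsigned get_range_bits(const struct state *s, int n,
			       unsigned long long start, unsigned long long end)
{
	unsigned bits = 0;

	for (int i = 0; i < n; i++) {
		if (s[i].end < start)
			continue;   /* entirely before the range */
		if (s[i].start > end)
			break;      /* entirely after it, done */
		bits |= s[i].bits;
	}
	return bits;
}

int main(void)
{
	const struct state s[] = {
		{ 0, 4095, 0x1 }, { 4096, 8191, 0x2 }, { 8192, 12287, 0x8 },
	};

	/* [4096, 12287] overlaps the last two records: 0x2 | 0x8. */
	printf("0x%x\n", get_range_bits(s, 3, 4096, 12287));  /* 0xa */
	return 0;
}
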
/*
* Check if the whole range [@start,@end) contains the single @bit set.
*/
-bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
- struct extent_state *cached)
+bool btrfs_test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
+ struct extent_state *cached)
{
- struct extent_state *state = NULL;
+ struct extent_state *state;
bool bitset = true;
ASSERT(is_power_of_2(bit));
+ ASSERT(start < end);
spin_lock(&tree->lock);
if (cached && extent_state_in_tree(cached) && cached->start <= start &&
@@ -1769,30 +1828,22 @@ bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
state = cached;
else
state = tree_search(tree, start);
- while (state && start <= end) {
+ while (state) {
if (state->start > start) {
bitset = false;
break;
}
- if (state->start > end)
- break;
-
if ((state->state & bit) == 0) {
bitset = false;
break;
}
- if (state->end == (u64)-1)
+ if (state->end >= end)
break;
- /*
- * Last entry (if state->end is (u64)-1 and overflow happens),
- * or next entry starts after the range.
- */
+ /* Next state must start where this one ends. */
start = state->end + 1;
- if (start > end || start == 0)
- break;
state = next_state(state);
}
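
btrfs_test_range_bit() is the strict variant: every byte of [start, end] must
be covered by records carrying @bit, so a record starting later than where the
previous one ended fails the check at once. A standalone sketch:

#include <stdbool.h>
#include <stdio.h>

struct state { unsigned long long start, end; unsigned bits; };

/* True only if records with @bit cover every byte of [start, end]. */
static bool range_bit_set(const struct state *s, int n,
			  unsigned long long start, unsigned long long end,
			  unsigned bit)
{
	for (int i = 0; i < n; i++) {
		if (s[i].end < start)
			continue;
		if (s[i].start > start || !(s[i].bits & bit))
			return false;  /* gap before @start, or bit missing */
		if (s[i].end >= end)
			return true;   /* whole range covered */
		start = s[i].end + 1;  /* next record must begin right here */
	}
	return false;
}

int main(void)
{
	const struct state s[] = { { 0, 4095, 0x1 }, { 8192, 12287, 0x1 } };

	printf("%d\n", range_bit_set(s, 2, 0, 4095, 0x1));   /* 1 */
	printf("%d\n", range_bit_set(s, 2, 0, 12287, 0x1));  /* 0, hole at 4096 */
	return 0;
}
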
@@ -1804,8 +1855,8 @@ bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
}
/* Wrappers around set/clear extent bit */
-int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_changeset *changeset)
+int btrfs_set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset)
{
/*
* We don't support EXTENT_LOCK_BITS yet, as current changeset will
@@ -1814,11 +1865,11 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
*/
ASSERT(!(bits & EXTENT_LOCK_BITS));
- return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset);
+ return set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset);
}
-int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_changeset *changeset)
+int btrfs_clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset)
{
/*
* Don't support EXTENT_LOCK_BITS case, same reason as
@@ -1826,20 +1877,20 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
*/
ASSERT(!(bits & EXTENT_LOCK_BITS));
- return __clear_extent_bit(tree, start, end, bits, NULL, changeset);
+ return btrfs_clear_extent_bit_changeset(tree, start, end, bits, NULL, changeset);
}
-bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- struct extent_state **cached)
+bool btrfs_try_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached)
{
- int err;
+ int ret;
u64 failed_start;
- err = __set_extent_bit(tree, start, end, bits, &failed_start,
- NULL, cached, NULL);
- if (err == -EEXIST) {
+ ret = set_extent_bit(tree, start, end, bits, &failed_start, NULL, cached, NULL);
+ if (ret == -EEXIST) {
if (failed_start > start)
- clear_extent_bit(tree, start, failed_start - 1, bits, cached);
+ btrfs_clear_extent_bit(tree, start, failed_start - 1,
+ bits, cached);
return 0;
}
return 1;
@@ -1849,35 +1900,54 @@ bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits
 * Either insert or lock the state struct between start and end; use mask to
 * tell us if waiting is desired.
*/
-int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- struct extent_state **cached_state)
+int btrfs_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ struct extent_state **cached_state)
{
struct extent_state *failed_state = NULL;
- int err;
+ int ret;
u64 failed_start;
- err = __set_extent_bit(tree, start, end, bits, &failed_start,
- &failed_state, cached_state, NULL);
- while (err == -EEXIST) {
+ ret = set_extent_bit(tree, start, end, bits, &failed_start,
+ &failed_state, cached_state, NULL);
+ while (ret == -EEXIST) {
if (failed_start != start)
- clear_extent_bit(tree, start, failed_start - 1,
- bits, cached_state);
+ btrfs_clear_extent_bit(tree, start, failed_start - 1,
+ bits, cached_state);
wait_extent_bit(tree, failed_start, end, bits, &failed_state);
- err = __set_extent_bit(tree, start, end, bits,
- &failed_start, &failed_state,
- cached_state, NULL);
+ ret = set_extent_bit(tree, start, end, bits, &failed_start,
+ &failed_state, cached_state, NULL);
}
- return err;
+ return ret;
+}
+
+/*
+ * Get the extent state that follows the given extent state.
+ * This is meant to be used in a context where we know no other tasks can
+ * concurrently modify the tree.
+ */
+struct extent_state *btrfs_next_extent_state(struct extent_io_tree *tree,
+ struct extent_state *state)
+{
+ struct extent_state *next;
+
+ spin_lock(&tree->lock);
+ ASSERT(extent_state_in_tree(state));
+ next = next_state(state);
+ if (next)
+ refcount_inc(&next->refs);
+ spin_unlock(&tree->lock);
+
+ return next;
}
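
btrfs_next_extent_state() pins the successor with refcount_inc() while still
holding the tree lock, so the record cannot go away once the lock is dropped.
A compact model of that pattern, with a C11 atomic standing in for refcount_t
and the lock reduced to comments:

#include <stdatomic.h>
#include <stdio.h>

struct state {
	atomic_int refs;
	struct state *next;
};

/* Grab the successor under the (elided) lock and pin it before returning. */
static struct state *next_state_get(struct state *s)
{
	/* spin_lock(&tree->lock) would go here */
	struct state *next = s->next;

	if (next)
		atomic_fetch_add(&next->refs, 1);  /* pin across the unlock */
	/* spin_unlock(&tree->lock) would go here */
	return next;
}

static void state_put(struct state *s)
{
	if (atomic_fetch_sub(&s->refs, 1) == 1)
		printf("last ref dropped, would free\n");
}

int main(void)
{
	struct state b = { 1, NULL };  /* one ref held by the "tree" */
	struct state a = { 1, &b };
	struct state *n = next_state_get(&a);

	/* n stays valid here even if the tree drops its own reference. */
	state_put(n);  /* the tree's reference */
	state_put(n);  /* ours: prints the free message */
	return 0;
}
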
-void __cold extent_state_free_cachep(void)
+void __cold btrfs_extent_state_free_cachep(void)
{
btrfs_extent_state_leak_debug_check();
kmem_cache_destroy(extent_state_cache);
}
-int __init extent_state_init_cachep(void)
+int __init btrfs_extent_state_init_cachep(void)
{
extent_state_cache = kmem_cache_create("btrfs_extent_state",
sizeof(struct extent_state), 0, 0,
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 6ffef1cd37c1..6f07b965e8da 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -17,10 +17,10 @@ struct btrfs_inode;
/* Bits for the extent state */
enum {
ENUM_BIT(EXTENT_DIRTY),
- ENUM_BIT(EXTENT_UPTODATE),
ENUM_BIT(EXTENT_LOCKED),
ENUM_BIT(EXTENT_DIO_LOCKED),
- ENUM_BIT(EXTENT_NEW),
+ ENUM_BIT(EXTENT_DIRTY_LOG1),
+ ENUM_BIT(EXTENT_DIRTY_LOG2),
ENUM_BIT(EXTENT_DELALLOC),
ENUM_BIT(EXTENT_DEFRAG),
ENUM_BIT(EXTENT_BOUNDARY),
@@ -39,6 +39,11 @@ enum {
*/
ENUM_BIT(EXTENT_DELALLOC_NEW),
/*
+ * Mark that a range is being locked for finishing an ordered extent.
+ * Used together with EXTENT_LOCKED.
+ */
+ ENUM_BIT(EXTENT_FINISHING_ORDERED),
+ /*
* When an ordered extent successfully completes for a region marked as
* a new delalloc range, use this flag when clearing a new delalloc
* range to indicate that the VFS' inode number of bytes should be
@@ -130,117 +135,110 @@ struct extent_state {
#endif
};
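
The extent bits at the top of this header are declared with btrfs's ENUM_BIT()
helper, which hands each name a distinct power of two without hand-maintained
shift counts. A standalone approximation of the trick (the extra _SEQ
enumerator resets the implicit counter so the next entry lands on the
following bit):

#include <stdio.h>

#define ENUM_BIT(name)                             \
	__ ## name ## _BIT,                        \
	name = (1U << __ ## name ## _BIT),         \
	__ ## name ## _SEQ = __ ## name ## _BIT

enum {
	ENUM_BIT(FLAG_A),
	ENUM_BIT(FLAG_B),
	ENUM_BIT(FLAG_C),
};

int main(void)
{
	/* Consecutive single bits: 0x1 0x2 0x4. */
	printf("0x%x 0x%x 0x%x\n",
	       (unsigned)FLAG_A, (unsigned)FLAG_B, (unsigned)FLAG_C);
	return 0;
}
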
-struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree);
-const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree);
-const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree);
+const struct btrfs_inode *btrfs_extent_io_tree_to_inode(const struct extent_io_tree *tree);
+const struct btrfs_fs_info *btrfs_extent_io_tree_to_fs_info(const struct extent_io_tree *tree);
-void extent_io_tree_init(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *tree, unsigned int owner);
-void extent_io_tree_release(struct extent_io_tree *tree);
-int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- struct extent_state **cached);
-bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- struct extent_state **cached);
+void btrfs_extent_io_tree_init(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *tree, unsigned int owner);
+void btrfs_extent_io_tree_release(struct extent_io_tree *tree);
+int btrfs_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ struct extent_state **cached);
+bool btrfs_try_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached);
-static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached)
+static inline int btrfs_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached)
{
- return __lock_extent(tree, start, end, EXTENT_LOCKED, cached);
+ return btrfs_lock_extent_bits(tree, start, end, EXTENT_LOCKED, cached);
}
-static inline bool try_lock_extent(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
+static inline bool btrfs_try_lock_extent(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
{
- return __try_lock_extent(tree, start, end, EXTENT_LOCKED, cached);
+ return btrfs_try_lock_extent_bits(tree, start, end, EXTENT_LOCKED, cached);
}
-int __init extent_state_init_cachep(void);
-void __cold extent_state_free_cachep(void);
-
-u64 count_range_bits(struct extent_io_tree *tree,
- u64 *start, u64 search_end,
- u64 max_bytes, u32 bits, int contig,
- struct extent_state **cached_state);
-
-void free_extent_state(struct extent_state *state);
-bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
- struct extent_state *cached_state);
-bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit);
-int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_changeset *changeset);
-int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_state **cached,
- struct extent_changeset *changeset);
-
-static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 end, u32 bits,
- struct extent_state **cached)
-{
- return __clear_extent_bit(tree, start, end, bits, cached, NULL);
-}
+int __init btrfs_extent_state_init_cachep(void);
+void __cold btrfs_extent_state_free_cachep(void);
-static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached)
-{
- return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, NULL);
-}
+u64 btrfs_count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end,
+ u64 max_bytes, u32 bits, bool contig,
+ struct extent_state **cached_state);
-static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
- u64 end, u32 bits)
+void btrfs_free_extent_state(struct extent_state *state);
+bool btrfs_test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
+ struct extent_state *cached_state);
+bool btrfs_test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit);
+void btrfs_get_range_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 *bits,
+ struct extent_state **cached_state);
+int btrfs_clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset);
+int btrfs_clear_extent_bit_changeset(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached,
+ struct extent_changeset *changeset);
+
+static inline int btrfs_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 end, u32 bits,
+ struct extent_state **cached)
{
- return clear_extent_bit(tree, start, end, bits, NULL);
+ return btrfs_clear_extent_bit_changeset(tree, start, end, bits, cached, NULL);
}
-int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_changeset *changeset);
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_state **cached_state);
-
-static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state)
+static inline int btrfs_unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached)
{
- return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
- cached_state, NULL);
+ return btrfs_clear_extent_bit_changeset(tree, start, end, EXTENT_LOCKED,
+ cached, NULL);
}
-static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
+int btrfs_set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset);
+int btrfs_set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state);
+
+static inline int btrfs_clear_extent_dirty(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
{
- return clear_extent_bit(tree, start, end,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, cached);
+ return btrfs_clear_extent_bit(tree, start, end,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, cached);
}
-int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, u32 clear_bits,
- struct extent_state **cached_state);
-
-bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits,
- struct extent_state **cached_state);
-void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits);
-int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits);
+int btrfs_convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u32 clear_bits,
+ struct extent_state **cached_state);
+
+bool btrfs_find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits,
+ struct extent_state **cached_state);
+void btrfs_find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits);
+bool btrfs_find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits);
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state);
-static inline int lock_dio_extent(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
+static inline int btrfs_lock_dio_extent(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
{
- return __lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached);
+ return btrfs_lock_extent_bits(tree, start, end, EXTENT_DIO_LOCKED, cached);
}
-static inline bool try_lock_dio_extent(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
+static inline bool btrfs_try_lock_dio_extent(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
{
- return __try_lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached);
+ return btrfs_try_lock_extent_bits(tree, start, end, EXTENT_DIO_LOCKED, cached);
}
-static inline int unlock_dio_extent(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
+static inline int btrfs_unlock_dio_extent(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
{
- return __clear_extent_bit(tree, start, end, EXTENT_DIO_LOCKED, cached, NULL);
+ return btrfs_clear_extent_bit_changeset(tree, start, end, EXTENT_DIO_LOCKED,
+ cached, NULL);
}
+struct extent_state *btrfs_next_extent_state(struct extent_io_tree *tree,
+ struct extent_state *state);
+
#endif /* BTRFS_EXTENT_IO_TREE_H */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 957230abd827..e4cae34620d1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -40,13 +40,14 @@
#include "orphan.h"
#include "tree-checker.h"
#include "raid-stripe-tree.h"
+#include "delayed-inode.h"
#undef SCRAMBLE_DELAYED_REFS
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
struct extent_buffer *leaf,
@@ -56,12 +57,12 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 flags, u64 owner, u64 offset,
struct btrfs_key *ins, int ref_mod, u64 oref_root);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op);
-static int find_next_key(struct btrfs_path *path, int level,
+static int find_next_key(const struct btrfs_path *path, int level,
struct btrfs_key *key);
-static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
+static int block_group_bits(const struct btrfs_block_group *cache, u64 bits)
{
return (cache->flags & bits) == bits;
}
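
As block_group_bits() above shows, (flags & bits) == bits tests that all of
the requested bits are present, whereas a bare flags & bits is true if any one
of them is. A short demonstration:

#include <stdio.h>

int main(void)
{
	unsigned long long flags = 0x5;  /* bits 0 and 2 set */

	printf("%d\n", (flags & 0x5ULL) == 0x5);  /* 1: all requested bits set */
	printf("%d\n", (flags & 0x3ULL) == 0x3);  /* 0: bit 1 is missing... */
	printf("%d\n", !!(flags & 0x3ULL));       /* 1: ...though bit 0 matches */
	return 0;
}
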
@@ -164,8 +165,8 @@ search_again:
if (unlikely(num_refs == 0)) {
ret = -EUCLEAN;
btrfs_err(fs_info,
- "unexpected zero reference count for extent item (%llu %u %llu)",
- key.objectid, key.type, key.offset);
+ "unexpected zero reference count for extent item " BTRFS_KEY_FMT,
+ BTRFS_KEY_FMT_VALUE(&key));
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -325,11 +326,11 @@ search_again:
/*
* is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
- * is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
+ * is_data == BTRFS_REF_TYPE_DATA, data type is required,
* is_data == BTRFS_REF_TYPE_ANY, either type is OK.
*/
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
- struct btrfs_extent_inline_ref *iref,
+ const struct btrfs_extent_inline_ref *iref,
enum btrfs_inline_ref_type is_data)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
@@ -401,23 +402,23 @@ u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
return ((u64)high_crc << 31) ^ (u64)low_crc;
}
-static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
- struct btrfs_extent_data_ref *ref)
+static u64 hash_extent_data_ref_item(const struct extent_buffer *leaf,
+ const struct btrfs_extent_data_ref *ref)
{
return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
btrfs_extent_data_ref_objectid(leaf, ref),
btrfs_extent_data_ref_offset(leaf, ref));
}
-static int match_extent_data_ref(struct extent_buffer *leaf,
- struct btrfs_extent_data_ref *ref,
- u64 root_objectid, u64 owner, u64 offset)
+static bool match_extent_data_ref(const struct extent_buffer *leaf,
+ const struct btrfs_extent_data_ref *ref,
+ u64 root_objectid, u64 owner, u64 offset)
{
if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
btrfs_extent_data_ref_offset(leaf, ref) != offset)
- return 0;
- return 1;
+ return false;
+ return true;
}
static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
@@ -497,7 +498,7 @@ fail:
static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
u64 bytenr)
{
struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
@@ -597,8 +598,8 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
num_refs = btrfs_shared_data_ref_count(leaf, ref2);
} else {
btrfs_err(trans->fs_info,
- "unrecognized backref key (%llu %u %llu)",
- key.objectid, key.type, key.offset);
+ "unrecognized backref key " BTRFS_KEY_FMT,
+ BTRFS_KEY_FMT_VALUE(&key));
btrfs_abort_transaction(trans, -EUCLEAN);
return -EUCLEAN;
}
@@ -617,13 +618,13 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
return ret;
}
-static noinline u32 extent_data_ref_count(struct btrfs_path *path,
- struct btrfs_extent_inline_ref *iref)
+static noinline u32 extent_data_ref_count(const struct btrfs_path *path,
+ const struct btrfs_extent_inline_ref *iref)
{
struct btrfs_key key;
struct extent_buffer *leaf;
- struct btrfs_extent_data_ref *ref1;
- struct btrfs_shared_data_ref *ref2;
+ const struct btrfs_extent_data_ref *ref1;
+ const struct btrfs_shared_data_ref *ref2;
u32 num_refs = 0;
int type;
@@ -638,10 +639,10 @@ static noinline u32 extent_data_ref_count(struct btrfs_path *path,
type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
ASSERT(type != BTRFS_REF_TYPE_INVALID);
if (type == BTRFS_EXTENT_DATA_REF_KEY) {
- ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
+ ref1 = (const struct btrfs_extent_data_ref *)(&iref->offset);
num_refs = btrfs_extent_data_ref_count(leaf, ref1);
} else {
- ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
+ ref2 = (const struct btrfs_shared_data_ref *)(iref + 1);
num_refs = btrfs_shared_data_ref_count(leaf, ref2);
}
} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
@@ -684,7 +685,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
u64 bytenr)
{
struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
@@ -722,7 +723,7 @@ static inline int extent_ref_type(u64 parent, u64 owner)
return type;
}
-static int find_next_key(struct btrfs_path *path, int level,
+static int find_next_key(const struct btrfs_path *path, int level,
struct btrfs_key *key)
{
@@ -788,7 +789,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
want = extent_ref_type(parent, owner);
if (insert) {
extra_size = btrfs_extent_inline_ref_size(want);
- path->search_for_extension = 1;
+ path->search_for_extension = true;
} else
extra_size = -1;
@@ -879,7 +880,7 @@ again:
ptr += btrfs_extent_inline_ref_size(type);
continue;
}
- if (type == BTRFS_REF_TYPE_INVALID) {
+ if (unlikely(type == BTRFS_REF_TYPE_INVALID)) {
ret = -EUCLEAN;
goto out;
}
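The unlikely() annotations added throughout this series compile down to __builtin_expect(), which moves corruption and error paths out of the hot instruction stream. A minimal standalone reproduction of the idiom (the macro body matches the usual <linux/compiler.h> definition):

#include <stdio.h>

/* Same definition as the kernel's <linux/compiler.h>. */
#define unlikely(x)	__builtin_expect(!!(x), 0)

static int check_ref_type(int type)
{
	if (unlikely(type < 0)) {	/* corruption: keep off the hot path */
		fprintf(stderr, "invalid ref type %d\n", type);
		return -1;
	}
	return type;
}

int main(void)
{
	printf("%d\n", check_ref_type(3));
	return 0;
}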
@@ -954,7 +955,7 @@ again:
if (!path->keep_locks) {
btrfs_release_path(path);
- path->keep_locks = 1;
+ path->keep_locks = true;
goto again;
}
@@ -975,11 +976,11 @@ out_no_entry:
*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
if (path->keep_locks) {
- path->keep_locks = 0;
+ path->keep_locks = false;
btrfs_unlock_up_safe(path, 1);
}
if (insert)
- path->search_for_extension = 0;
+ path->search_for_extension = false;
return ret;
}
@@ -1210,7 +1211,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
* We're adding refs to a tree block we already own, this
* should not happen at all.
*/
- if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ if (unlikely(owner < BTRFS_FIRST_FREE_OBJECTID)) {
btrfs_print_leaf(path->nodes[0]);
btrfs_crit(trans->fs_info,
"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u",
@@ -1480,7 +1481,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
*
*/
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op)
{
BTRFS_PATH_AUTO_FREE(path);
@@ -1522,19 +1523,21 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
/* now insert the actual backref */
- if (owner < BTRFS_FIRST_FREE_OBJECTID)
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
ret = insert_tree_block_ref(trans, path, node, bytenr);
- else
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ } else {
ret = insert_extent_data_ref(trans, path, node, bytenr);
-
- if (ret)
- btrfs_abort_transaction(trans, ret);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
return ret;
}
static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_head *href)
+ const struct btrfs_delayed_ref_head *href)
{
u64 root = href->owning_root;
@@ -1543,7 +1546,7 @@ static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
* where it has already been unset.
*/
if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE ||
- !href->is_data || !is_fstree(root))
+ !href->is_data || !btrfs_is_fstree(root))
return;
btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes,
@@ -1552,7 +1555,7 @@ static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
{
@@ -1620,7 +1623,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
}
static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *head,
+ const struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1707,7 +1710,7 @@ again:
static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
{
@@ -1754,7 +1757,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
/* helper function to actually process a single delayed ref entry */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
{
@@ -1762,7 +1765,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
if (TRANS_ABORTED(trans)) {
if (insert_reserved) {
- btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
+ btrfs_pin_extent(trans, node->bytenr, node->num_bytes);
free_head_ref_squota_rsv(trans->fs_info, href);
}
return 0;
@@ -1781,7 +1784,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
else
BUG();
if (ret && insert_reserved)
- btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
+ btrfs_pin_extent(trans, node->bytenr, node->num_bytes);
if (ret < 0)
btrfs_err(trans->fs_info,
"failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d",
@@ -1888,7 +1891,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
spin_unlock(&delayed_refs->lock);
if (head->must_insert_reserved) {
- btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
+ btrfs_pin_extent(trans, head->bytenr, head->num_bytes);
if (head->is_data) {
struct btrfs_root *csum_root;
@@ -2006,7 +2009,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
delayed_refs = &trans->transaction->delayed_refs;
if (min_bytes == 0) {
- max_count = delayed_refs->num_heads_ready;
+ /*
+ * We may be subject to a harmless race if some task is
+ * concurrently adding or removing a delayed ref, so silence
+ * KCSAN and similar tools.
+ */
+ max_count = data_race(delayed_refs->num_heads_ready);
min_bytes = U64_MAX;
}
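data_race() is the kernel's way of telling KCSAN that a plain, unsynchronized read is intentional. The userspace sketch below substitutes a no-op wrapper for the real macro (which brackets the expression with KCSAN annotations) purely to show the calling pattern; the counter name mirrors num_heads_ready but this is not kernel code:

#include <pthread.h>
#include <stdio.h>

/* No-op stand-in for the kernel's data_race(); the real macro tells
 * KCSAN the racy plain read is intentional and benign. */
#define data_race(expr)	(expr)

static unsigned long num_heads_ready;	/* bumped concurrently elsewhere */

static void *producer(void *arg)
{
	(void)arg;
	for (int i = 0; i < 1000; i++)
		__atomic_add_fetch(&num_heads_ready, 1, __ATOMIC_RELAXED);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, producer, NULL);
	/* A stale snapshot is harmless: it is only a work-amount hint,
	 * exactly how max_count is used in __btrfs_run_delayed_refs(). */
	unsigned long max_count = data_race(num_heads_ready);
	pthread_join(&t, NULL);
	printf("snapshot %lu, final %lu\n", max_count, num_heads_ready);
	return 0;
}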
@@ -2150,7 +2158,7 @@ again:
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
ret = __btrfs_run_delayed_refs(trans, min_bytes);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -2348,7 +2356,7 @@ static noinline int check_committed_ref(struct btrfs_inode *inode,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
@@ -2450,7 +2458,7 @@ out:
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- int full_backref, int inc)
+ bool full_backref, bool inc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 parent;
@@ -2536,15 +2544,15 @@ fail:
}
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref)
+ struct extent_buffer *buf, bool full_backref)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, true);
}
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref)
+ struct extent_buffer *buf, bool full_backref)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, false);
}
static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
@@ -2584,34 +2592,34 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
}
static int pin_down_extent(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *cache,
- u64 bytenr, u64 num_bytes, int reserved)
+ struct btrfs_block_group *bg,
+ u64 bytenr, u64 num_bytes, bool reserved)
{
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- cache->pinned += num_bytes;
- btrfs_space_info_update_bytes_pinned(cache->space_info, num_bytes);
- if (reserved) {
- cache->reserved -= num_bytes;
- cache->space_info->bytes_reserved -= num_bytes;
- }
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
-
- set_extent_bit(&trans->transaction->pinned_extents, bytenr,
- bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
+ struct btrfs_space_info *space_info = bg->space_info;
+ const u64 reserved_bytes = (reserved ? num_bytes : 0);
+
+ spin_lock(&space_info->lock);
+ spin_lock(&bg->lock);
+ bg->pinned += num_bytes;
+ bg->reserved -= reserved_bytes;
+ spin_unlock(&bg->lock);
+ space_info->bytes_reserved -= reserved_bytes;
+ btrfs_space_info_update_bytes_pinned(space_info, num_bytes);
+ spin_unlock(&space_info->lock);
+
+ btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr,
+ bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
return 0;
}
-int btrfs_pin_extent(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, int reserved)
+int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes)
{
struct btrfs_block_group *cache;
cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
BUG_ON(!cache); /* Logic error */
- pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
+ pin_down_extent(trans, cache, bytenr, num_bytes, true);
btrfs_put_block_group(cache);
return 0;
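The pin_down_extent() rework folds the reserved flag into a single const delta so both counters are updated unconditionally inside a shorter critical section. A toy model of that accounting change, with invented field names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy accounting model; field names echo the btrfs structs but this is
 * not the kernel code. */
struct toy_bg {
	uint64_t pinned;
	uint64_t reserved;
};

static void pin_down(struct toy_bg *bg, uint64_t num_bytes, bool reserved)
{
	const uint64_t reserved_bytes = (reserved ? num_bytes : 0);

	/* Both updates run unconditionally; the second is a no-op when
	 * the range was not reserved. */
	bg->pinned   += num_bytes;
	bg->reserved -= reserved_bytes;
}

int main(void)
{
	struct toy_bg bg = { .pinned = 0, .reserved = 4096 };

	pin_down(&bg, 4096, true);
	printf("pinned=%llu reserved=%llu\n",
	       (unsigned long long)bg.pinned,
	       (unsigned long long)bg.reserved);
	return 0;
}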
@@ -2635,7 +2643,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- pin_down_extent(trans, cache, eb->start, eb->len, 0);
+ pin_down_extent(trans, cache, eb->start, eb->len, false);
/* remove us from the free space cache (if we're there at all) */
ret = btrfs_remove_free_space(cache, eb->start, eb->len);
@@ -2740,23 +2748,20 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
struct btrfs_free_cluster *cluster = NULL;
u64 total_unpinned = 0;
u64 empty_cluster = 0;
- bool readonly;
- int ret = 0;
while (start <= end) {
u64 len;
+ bool readonly;
- readonly = false;
if (!cache ||
start >= cache->start + cache->length) {
if (cache)
btrfs_put_block_group(cache);
total_unpinned = 0;
cache = btrfs_lookup_block_group(fs_info, start);
- if (cache == NULL) {
+ if (unlikely(cache == NULL)) {
/* Logic error, something removed the block group. */
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
cluster = fetch_cluster_info(fs_info,
@@ -2790,27 +2795,28 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
+ readonly = cache->ro;
cache->pinned -= len;
+ spin_unlock(&cache->lock);
+
btrfs_space_info_update_bytes_pinned(space_info, -len);
space_info->max_extent_size = 0;
- if (cache->ro) {
+
+ if (readonly) {
space_info->bytes_readonly += len;
- readonly = true;
} else if (btrfs_is_zoned(fs_info)) {
/* Need reset before reusing in a zoned block group */
btrfs_space_info_update_bytes_zone_unusable(space_info, len);
- readonly = true;
- }
- spin_unlock(&cache->lock);
- if (!readonly && return_free_space)
+ } else if (return_free_space) {
btrfs_return_free_space(space_info, len);
+ }
spin_unlock(&space_info->lock);
}
if (cache)
btrfs_put_block_group(cache);
-out:
- return ret;
+
+ return 0;
}
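unpin_extent_range() now snapshots cache->ro into a local while the lock is held and branches on the snapshot afterwards, instead of threading a readonly flag through the loop. A standalone sketch of that snapshot-under-lock idiom, using a pthread mutex in place of the kernel spinlocks:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_cache {
	pthread_mutex_t lock;
	bool ro;
	unsigned long pinned;
};

static void unpin(struct toy_cache *c, unsigned long len)
{
	bool readonly;

	pthread_mutex_lock(&c->lock);
	readonly = c->ro;	/* snapshot while protected */
	c->pinned -= len;
	pthread_mutex_unlock(&c->lock);

	/* Decide on the snapshot after dropping the inner lock. */
	if (readonly)
		printf("%lu bytes go back to the read-only pool\n", len);
	else
		printf("%lu bytes go back as free space\n", len);
}

int main(void)
{
	struct toy_cache c = { PTHREAD_MUTEX_INITIALIZER, false, 8192 };

	unpin(&c, 4096);
	return 0;
}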
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
@@ -2818,34 +2824,63 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *block_group, *tmp;
struct list_head *deleted_bgs;
- struct extent_io_tree *unpin;
+ struct extent_io_tree *unpin = &trans->transaction->pinned_extents;
+ struct extent_state *cached_state = NULL;
u64 start;
u64 end;
+ int unpin_error = 0;
int ret;
- unpin = &trans->transaction->pinned_extents;
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
+ btrfs_find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, &cached_state);
- while (!TRANS_ABORTED(trans)) {
- struct extent_state *cached_state = NULL;
-
- mutex_lock(&fs_info->unused_bg_unpin_mutex);
- if (!find_first_extent_bit(unpin, 0, &start, &end,
- EXTENT_DIRTY, &cached_state)) {
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- break;
- }
+ while (!TRANS_ABORTED(trans) && cached_state) {
+ struct extent_state *next_state;
if (btrfs_test_opt(fs_info, DISCARD_SYNC))
ret = btrfs_discard_extent(fs_info, start,
end + 1 - start, NULL);
- clear_extent_dirty(unpin, start, end, &cached_state);
+ next_state = btrfs_next_extent_state(unpin, cached_state);
+ btrfs_clear_extent_dirty(unpin, start, end, &cached_state);
ret = unpin_extent_range(fs_info, start, end, true);
- BUG_ON(ret);
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- free_extent_state(cached_state);
- cond_resched();
+ /*
+ * If we get an error unpinning an extent range, store the first
+ * error to return later after trying to unpin all ranges and do
+ * the sync discards. Our caller will abort the transaction
+ * (which already wrote new superblocks) and on the next mount
+ * the space will be available as it was pinned by in-memory
+ * only structures in this phase.
+ */
+ if (ret) {
+ btrfs_err_rl(fs_info,
+"failed to unpin extent range [%llu, %llu] when committing transaction %llu: %s (%d)",
+ start, end, trans->transid,
+ btrfs_decode_error(ret), ret);
+ if (!unpin_error)
+ unpin_error = ret;
+ }
+
+ btrfs_free_extent_state(cached_state);
+
+ if (need_resched()) {
+ btrfs_free_extent_state(next_state);
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ cond_resched();
+ cached_state = NULL;
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
+ btrfs_find_first_extent_bit(unpin, 0, &start, &end,
+ EXTENT_DIRTY, &cached_state);
+ } else {
+ cached_state = next_state;
+ if (cached_state) {
+ start = cached_state->start;
+ end = cached_state->end;
+ }
+ }
}
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ btrfs_free_extent_state(cached_state);
if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
btrfs_discard_calc_delay(&fs_info->discard_ctl);
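The rewritten loop in btrfs_finish_extent_commit() keeps the mutex across iterations, caches a pointer to the next extent state before the current one is cleared, and only drops the lock (and re-finds its position) when a reschedule is actually needed. A simplified userspace model of that iteration strategy, with the extent state tree reduced to a list:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct node {
	struct node *next;
	int val;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;		/* pending ranges, tree reduced to a list */

static bool need_resched_stub(void)	/* stand-in for need_resched() */
{
	return false;
}

static void walk(void)
{
	pthread_mutex_lock(&lock);
	struct node *cur = head;

	while (cur) {
		struct node *next = cur->next;	/* grab before cur is cleared */

		printf("unpinning range %d\n", cur->val);
		head = next;			/* range leaves the tree */

		if (need_resched_stub()) {
			pthread_mutex_unlock(&lock);
			/* cond_resched() would run here */
			pthread_mutex_lock(&lock);
			cur = head;		/* re-find: tree may have changed */
		} else {
			cur = next;		/* fast path: no lock cycling */
		}
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	struct node c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };

	head = &a;
	walk();
	return 0;
}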
@@ -2859,14 +2894,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
*/
deleted_bgs = &trans->transaction->deleted_bgs;
list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
- u64 trimmed = 0;
-
ret = -EROFS;
if (!TRANS_ABORTED(trans))
- ret = btrfs_discard_extent(fs_info,
- block_group->start,
- block_group->length,
- &trimmed);
+ ret = btrfs_discard_extent(fs_info, block_group->start,
+ block_group->length, NULL);
/*
* Not strictly necessary to lock, as the block_group should be
@@ -2888,7 +2919,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
}
}
- return 0;
+ return unpin_error;
}
/*
@@ -2950,26 +2981,26 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
csum_root = btrfs_csum_root(trans->fs_info, bytenr);
ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
}
ret = btrfs_record_squota_delta(trans->fs_info, delta);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
- ret = add_to_free_space_tree(trans, bytenr, num_bytes);
- if (ret) {
+ ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes);
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -3049,12 +3080,12 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
*/
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_key key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *extent_root;
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
@@ -3083,13 +3114,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
- if (!is_data && refs_to_drop != 1) {
+ if (unlikely(!is_data && refs_to_drop != 1)) {
btrfs_crit(info,
"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
node->bytenr, refs_to_drop);
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
if (is_data)
@@ -3130,19 +3161,18 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
if (!found_extent) {
- if (iref) {
+ if (unlikely(iref)) {
abort_and_dump(trans, path,
"invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref",
path->slots[0]);
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
/* Must be SHARED_* item, remove the backref first */
ret = remove_extent_backref(trans, extent_root, path,
NULL, refs_to_drop, is_data);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
btrfs_release_path(path);
@@ -3189,9 +3219,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
"umm, got %d back from search, was looking for %llu, slot %d",
ret, bytenr, path->slots[0]);
}
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
extent_slot = path->slots[0];
}
@@ -3200,10 +3230,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d",
bytenr, node->parent, node->ref_root, owner_objectid,
owner_offset, path->slots[0]);
- goto out;
+ return ret;
} else {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
leaf = path->nodes[0];
@@ -3214,7 +3244,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
"unexpected extent item size, has %u expect >= %zu",
item_size, sizeof(*ei));
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
ei = btrfs_item_ptr(leaf, extent_slot,
struct btrfs_extent_item);
@@ -3222,26 +3252,24 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
key.type == BTRFS_EXTENT_ITEM_KEY) {
struct btrfs_tree_block_info *bi;
- if (item_size < sizeof(*ei) + sizeof(*bi)) {
+ if (unlikely(item_size < sizeof(*ei) + sizeof(*bi))) {
abort_and_dump(trans, path,
"invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu",
key.objectid, key.type, key.offset,
path->slots[0], owner_objectid, item_size,
sizeof(*ei) + sizeof(*bi));
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
bi = (struct btrfs_tree_block_info *)(ei + 1);
WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
}
refs = btrfs_extent_refs(leaf, ei);
- if (refs < refs_to_drop) {
+ if (unlikely(refs < refs_to_drop)) {
abort_and_dump(trans, path,
"trying to drop %d refs but we only have %llu for bytenr %llu slot %u",
refs_to_drop, refs, bytenr, path->slots[0]);
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
refs -= refs_to_drop;
@@ -3253,12 +3281,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
* be updated by remove_extent_backref
*/
if (iref) {
- if (!found_extent) {
+ if (unlikely(!found_extent)) {
abort_and_dump(trans, path,
"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u",
path->slots[0]);
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
} else {
btrfs_set_extent_refs(leaf, ei, refs);
@@ -3266,9 +3293,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (found_extent) {
ret = remove_extent_backref(trans, extent_root, path,
iref, refs_to_drop, is_data);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
}
} else {
@@ -3282,23 +3309,21 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
/* In this branch refs == 1 */
if (found_extent) {
- if (is_data && refs_to_drop !=
- extent_data_ref_count(path, iref)) {
+ if (unlikely(is_data && refs_to_drop !=
+ extent_data_ref_count(path, iref))) {
abort_and_dump(trans, path,
"invalid refs_to_drop, current refs %u refs_to_drop %u slot %u",
extent_data_ref_count(path, iref),
refs_to_drop, path->slots[0]);
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
if (iref) {
- if (path->slots[0] != extent_slot) {
+ if (unlikely(path->slots[0] != extent_slot)) {
abort_and_dump(trans, path,
-"invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref",
- key.objectid, key.type,
- key.offset, path->slots[0]);
- ret = -EUCLEAN;
- goto out;
+"invalid iref, extent item key " BTRFS_KEY_FMT " slot %u doesn't have wanted iref",
+ BTRFS_KEY_FMT_VALUE(&key),
+ path->slots[0]);
+ return -EUCLEAN;
}
} else {
/*
@@ -3307,12 +3332,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
* | extent_slot ||extent_slot + 1|
* [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
*/
- if (path->slots[0] != extent_slot + 1) {
+ if (unlikely(path->slots[0] != extent_slot + 1)) {
abort_and_dump(trans, path,
"invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM",
path->slots[0]);
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
path->slots[0] = extent_slot;
num_to_del = 2;
@@ -3331,9 +3355,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
btrfs_release_path(path);
@@ -3341,8 +3365,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
-out:
- btrfs_free_path(path);
return ret;
}
@@ -3451,7 +3473,7 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
bg = btrfs_lookup_block_group(fs_info, buf->start);
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
- pin_down_extent(trans, bg, buf->start, buf->len, 1);
+ pin_down_extent(trans, bg, buf->start, buf->len, true);
btrfs_put_block_group(bg);
goto out;
}
@@ -3475,7 +3497,7 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)
|| btrfs_is_zoned(fs_info)) {
- pin_down_extent(trans, bg, buf->start, buf->len, 1);
+ pin_down_extent(trans, bg, buf->start, buf->len, true);
btrfs_put_block_group(bg);
goto out;
}
@@ -3483,17 +3505,11 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
btrfs_add_free_space(bg, buf->start, buf->len);
- btrfs_free_reserved_bytes(bg, buf->len, 0);
+ btrfs_free_reserved_bytes(bg, buf->len, false);
btrfs_put_block_group(bg);
trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
out:
-
- /*
- * Deleting the buffer, clear the corrupt flag since it doesn't
- * matter anymore.
- */
- clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
return 0;
}
@@ -3511,7 +3527,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
* tree, just update pinning info and exit early.
*/
if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) {
- btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1);
+ btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes);
ret = 0;
} else if (ref->type == BTRFS_REF_METADATA) {
ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
@@ -3562,15 +3578,14 @@ enum btrfs_loop_type {
};
static inline void
-btrfs_lock_block_group(struct btrfs_block_group *cache,
- int delalloc)
+btrfs_lock_block_group(struct btrfs_block_group *cache, bool delalloc)
{
if (delalloc)
down_read(&cache->data_rwsem);
}
static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
- int delalloc)
+ bool delalloc)
{
btrfs_get_block_group(cache);
if (delalloc)
@@ -3580,7 +3595,7 @@ static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
static struct btrfs_block_group *btrfs_lock_cluster(
struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
- int delalloc)
+ bool delalloc)
__acquires(&cluster->refill_lock)
{
struct btrfs_block_group *used_bg = NULL;
@@ -3617,14 +3632,28 @@ static struct btrfs_block_group *btrfs_lock_cluster(
}
static inline void
-btrfs_release_block_group(struct btrfs_block_group *cache,
- int delalloc)
+btrfs_release_block_group(struct btrfs_block_group *cache, bool delalloc)
{
if (delalloc)
up_read(&cache->data_rwsem);
btrfs_put_block_group(cache);
}
+static bool find_free_extent_check_size_class(const struct find_free_extent_ctl *ffe_ctl,
+ const struct btrfs_block_group *bg)
+{
+ if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
+ return true;
+ if (!btrfs_block_group_should_use_size_class(bg))
+ return true;
+ if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
+ return true;
+ if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
+ bg->size_class == BTRFS_BG_SZ_NONE)
+ return true;
+ return ffe_ctl->size_class == bg->size_class;
+}
+
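find_free_extent_check_size_class() is moved above its new caller in the clustered path, avoiding a forward declaration. Its shape, a cascade of always-acceptable early returns ending in a strict comparison, is easy to exercise in isolation; the enum values below are invented for the sketch:

#include <stdbool.h>
#include <stdio.h>

enum size_class { SZ_NONE, SZ_SMALL, SZ_LARGE };	/* invented values */

static bool size_class_ok(int loop, enum size_class want, enum size_class have)
{
	if (loop >= 2)			/* LOOP_WRONG_SIZE_CLASS analogue */
		return true;
	if (loop >= 1 && have == SZ_NONE)
		return true;		/* LOOP_UNSET_SIZE_CLASS analogue */
	return want == have;
}

int main(void)
{
	printf("%d %d %d\n",
	       size_class_ok(0, SZ_SMALL, SZ_LARGE),	/* 0: strict match only */
	       size_class_ok(1, SZ_SMALL, SZ_NONE),	/* 1: unset class accepted */
	       size_class_ok(2, SZ_SMALL, SZ_LARGE));	/* 1: any class accepted */
	return 0;
}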
/*
* Helper function for find_free_extent().
*
@@ -3646,7 +3675,8 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg,
if (!cluster_bg)
goto refill_cluster;
if (cluster_bg != bg && (cluster_bg->ro ||
- !block_group_bits(cluster_bg, ffe_ctl->flags)))
+ !block_group_bits(cluster_bg, ffe_ctl->flags) ||
+ !find_free_extent_check_size_class(ffe_ctl, cluster_bg)))
goto release_cluster;
offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
@@ -3992,7 +4022,7 @@ static int do_allocation(struct btrfs_block_group *block_group,
static void release_block_group(struct btrfs_block_group *block_group,
struct find_free_extent_ctl *ffe_ctl,
- int delalloc)
+ bool delalloc)
{
switch (ffe_ctl->policy) {
case BTRFS_EXTENT_ALLOC_CLUSTERED:
@@ -4111,6 +4141,7 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
struct btrfs_key *ins,
struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_space_info *space_info,
bool full_search)
{
struct btrfs_root *root = fs_info->chunk_root;
@@ -4165,7 +4196,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
return ret;
}
- ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
+ ret = btrfs_chunk_alloc(trans, space_info, ffe_ctl->flags,
CHUNK_ALLOC_FORCE_FOR_EXTENT);
/* Do not bail out on ENOSPC since we can do more. */
@@ -4202,21 +4233,6 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
return -ENOSPC;
}
-static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl,
- struct btrfs_block_group *bg)
-{
- if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
- return true;
- if (!btrfs_block_group_should_use_size_class(bg))
- return true;
- if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
- return true;
- if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
- bg->size_class == BTRFS_BG_SZ_NONE)
- return true;
- return ffe_ctl->size_class == bg->size_class;
-}
-
static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_space_info *space_info,
@@ -4269,7 +4285,8 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
}
static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
- struct find_free_extent_ctl *ffe_ctl)
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_space_info *space_info)
{
if (ffe_ctl->for_treelog) {
spin_lock(&fs_info->treelog_bg_lock);
@@ -4287,12 +4304,13 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
spin_lock(&fs_info->zone_active_bgs_lock);
list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
/*
- * No lock is OK here because avail is monotinically
+ * No lock is OK here because avail is monotonically
* decreasing, and this is just a hint.
*/
u64 avail = block_group->zone_capacity - block_group->alloc_offset;
if (block_group_bits(block_group, ffe_ctl->flags) &&
+ block_group->space_info == space_info &&
avail >= ffe_ctl->num_bytes) {
ffe_ctl->hint_byte = block_group->start;
break;
@@ -4314,7 +4332,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
return prepare_allocation_clustered(fs_info, ffe_ctl,
space_info, ins);
case BTRFS_EXTENT_ALLOC_ZONED:
- return prepare_allocation_zoned(fs_info, ffe_ctl);
+ return prepare_allocation_zoned(fs_info, ffe_ctl, space_info);
default:
BUG();
}
@@ -4382,11 +4400,22 @@ static noinline int find_free_extent(struct btrfs_root *root,
ins->objectid = 0;
ins->offset = 0;
- trace_find_free_extent(root, ffe_ctl);
+ trace_btrfs_find_free_extent(root, ffe_ctl);
space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
+ if (btrfs_is_zoned(fs_info) && space_info) {
+ /* Use dedicated sub-space_info for dedicated block group users. */
+ if (ffe_ctl->for_data_reloc) {
+ space_info = space_info->sub_group[0];
+ ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
+ } else if (ffe_ctl->for_treelog) {
+ space_info = space_info->sub_group[0];
+ ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_TREELOG);
+ }
+ }
if (!space_info) {
- btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
+ btrfs_err(fs_info, "no space info for %llu, tree-log %d, relocation %d",
+ ffe_ctl->flags, ffe_ctl->for_treelog, ffe_ctl->for_data_reloc);
return -ENOSPC;
}
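On zoned filesystems, tree-log and data-relocation allocations are redirected into dedicated sub-space_infos before the search starts. A minimal model of that sub_group indirection, with invented IDs and only one child slot populated:

#include <assert.h>
#include <stdio.h>

enum subgroup_id { SUB_PRIMARY, SUB_TREELOG, SUB_DATA_RELOC };	/* invented */

struct toy_space_info {
	enum subgroup_id subgroup_id;
	struct toy_space_info *sub_group[2];
};

int main(void)
{
	struct toy_space_info treelog = { .subgroup_id = SUB_TREELOG };
	struct toy_space_info parent = {
		.subgroup_id = SUB_PRIMARY,
		.sub_group = { &treelog, NULL },
	};
	struct toy_space_info *sinfo = &parent;
	int for_treelog = 1;

	if (for_treelog) {
		sinfo = sinfo->sub_group[0];
		assert(sinfo->subgroup_id == SUB_TREELOG);
	}
	printf("allocating from subgroup %d\n", (int)sinfo->subgroup_id);
	return 0;
}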
@@ -4408,6 +4437,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
* picked out then we don't care that the block group is cached.
*/
if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
+ block_group->space_info == space_info &&
block_group->cached != BTRFS_CACHE_NO) {
down_read(&space_info->groups_sem);
if (list_empty(&block_group->list) ||
@@ -4433,7 +4463,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
}
}
search:
- trace_find_free_extent_search_loop(root, ffe_ctl);
+ trace_btrfs_find_free_extent_search_loop(root, ffe_ctl);
ffe_ctl->have_caching_bg = false;
if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
ffe_ctl->index == 0)
@@ -4485,7 +4515,7 @@ search:
}
have_block_group:
- trace_find_free_extent_have_block_group(root, ffe_ctl, block_group);
+ trace_btrfs_find_free_extent_have_block_group(root, ffe_ctl, block_group);
ffe_ctl->cached = btrfs_block_group_done(block_group);
if (unlikely(!ffe_ctl->cached)) {
ffe_ctl->have_caching_bg = true;
@@ -4578,7 +4608,8 @@ loop:
}
up_read(&space_info->groups_sem);
- ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
+ ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, space_info,
+ full_search);
if (ret > 0)
goto search;
@@ -4647,7 +4678,7 @@ loop:
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
- struct btrfs_key *ins, int is_data, int delalloc)
+ struct btrfs_key *ins, bool is_data, bool delalloc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct find_free_extent_ctl ffe_ctl = {};
@@ -4692,16 +4723,15 @@ again:
"allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
flags, num_bytes, for_treelog, for_data_reloc);
if (sinfo)
- btrfs_dump_space_info(fs_info, sinfo,
- num_bytes, 1);
+ btrfs_dump_space_info(sinfo, num_bytes, 1);
}
}
return ret;
}
-int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 len, int delalloc)
+int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len,
+ bool is_delalloc)
{
struct btrfs_block_group *cache;
@@ -4713,7 +4743,7 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
}
btrfs_add_free_space(cache, start, len);
- btrfs_free_reserved_bytes(cache, len, delalloc);
+ btrfs_free_reserved_bytes(cache, len, is_delalloc);
trace_btrfs_reserved_extent_free(fs_info, start, len);
btrfs_put_block_group(cache);
@@ -4733,7 +4763,7 @@ int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
return -ENOSPC;
}
- ret = pin_down_extent(trans, cache, eb->start, eb->len, 1);
+ ret = pin_down_extent(trans, cache, eb->start, eb->len, true);
btrfs_put_block_group(cache);
return ret;
}
@@ -4744,7 +4774,7 @@ static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
- ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
+ ret = btrfs_remove_from_free_space_tree(trans, bytenr, num_bytes);
if (ret)
return ret;
@@ -4835,7 +4865,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
}
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -4923,7 +4953,7 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID);
- if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root))
+ if (btrfs_is_data_reloc_root(root) && btrfs_is_fstree(root->relocation_src_root))
generic_ref.owning_root = root->relocation_src_root;
btrfs_init_data_ref(&generic_ref, owner, offset, 0, false);
@@ -4945,7 +4975,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
- struct btrfs_squota_delta delta = {
+ const struct btrfs_squota_delta delta = {
.root = root_objectid,
.num_bytes = ins->offset,
.generation = trans->transid,
@@ -4979,7 +5009,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
offset, ins, 1, root_objectid);
if (ret)
- btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
+ btrfs_pin_extent(trans, ins->objectid, ins->offset);
ret = btrfs_record_squota_delta(fs_info, &delta);
btrfs_put_block_group(block_group);
return ret;
@@ -5020,7 +5050,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (IS_ERR(buf))
return buf;
- if (check_eb_lock_owner(buf)) {
+ if (unlikely(check_eb_lock_owner(buf))) {
free_extent_buffer(buf);
return ERR_PTR(-EUCLEAN);
}
@@ -5071,17 +5101,17 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
* EXTENT bit to differentiate dirty pages.
*/
if (buf->log_index == 0)
- set_extent_bit(&root->dirty_log_pages, buf->start,
- buf->start + buf->len - 1,
- EXTENT_DIRTY, NULL);
+ btrfs_set_extent_bit(&root->dirty_log_pages, buf->start,
+ buf->start + buf->len - 1,
+ EXTENT_DIRTY_LOG1, NULL);
else
- set_extent_bit(&root->dirty_log_pages, buf->start,
- buf->start + buf->len - 1,
- EXTENT_NEW, NULL);
+ btrfs_set_extent_bit(&root->dirty_log_pages, buf->start,
+ buf->start + buf->len - 1,
+ EXTENT_DIRTY_LOG2, NULL);
} else {
buf->log_index = -1;
- set_extent_bit(&trans->transaction->dirty_pages, buf->start,
- buf->start + buf->len - 1, EXTENT_DIRTY, NULL);
+ btrfs_set_extent_bit(&trans->transaction->dirty_pages, buf->start,
+ buf->start + buf->len - 1, EXTENT_DIRTY, NULL);
}
/* this returns a buffer locked for blocking */
return buf;
@@ -5125,7 +5155,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
return ERR_CAST(block_rsv);
ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
- empty_size, hint, &ins, 0, 0);
+ empty_size, hint, &ins, false, false);
if (ret)
goto out_unuse;
@@ -5187,7 +5217,7 @@ out_free_buf:
btrfs_tree_unlock(buf);
free_extent_buffer(buf);
out_free_reserved:
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, false);
out_unuse:
btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
return ERR_PTR(ret);
@@ -5429,17 +5459,17 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
if (!(wc->flags[level] & flag)) {
ASSERT(path->locks[level]);
ret = btrfs_inc_ref(trans, root, eb, 1);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
ret = btrfs_dec_ref(trans, root, eb, 0);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
ret = btrfs_set_disk_extent_flags(trans, eb, flag);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -5514,7 +5544,7 @@ again:
goto again;
}
- exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent);
+ exists = btrfs_find_delayed_tree_ref(head, btrfs_root_id(root), parent);
mutex_unlock(&head->mutex);
out:
spin_unlock(&delayed_refs->lock);
@@ -5541,7 +5571,7 @@ static int check_next_block_uptodate(struct btrfs_trans_handle *trans,
generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]);
- if (btrfs_buffer_uptodate(next, generation, 0))
+ if (btrfs_buffer_uptodate(next, generation, false))
return 0;
check.level = level - 1;
@@ -5570,7 +5600,7 @@ static int check_next_block_uptodate(struct btrfs_trans_handle *trans,
* If we are UPDATE_BACKREF then we will not, we need to update our backrefs.
*
* If we are DROP_REFERENCE this will figure out if we need to drop our current
- * reference, skipping it if we dropped it from a previous incompleted drop, or
+ * reference, skipping it if we dropped it from a previous incomplete drop, or
* dropping it if we still have a reference to it.
*/
static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -5595,7 +5625,7 @@ static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_r
ref.parent = path->nodes[level]->start;
} else {
ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level]));
- if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) {
+ if (unlikely(btrfs_root_id(root) != btrfs_header_owner(path->nodes[level]))) {
btrfs_err(root->fs_info, "mismatched block owner");
return -EIO;
}
@@ -5717,7 +5747,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
/*
* We have to walk down into this node, and if we're currently at the
- * DROP_REFERNCE stage and this block is shared then we need to switch
+ * DROP_REFERENCE stage and this block is shared then we need to switch
* to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF.
*/
if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) {
@@ -5731,7 +5761,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
level--;
ASSERT(level == btrfs_header_level(next));
- if (level != btrfs_header_level(next)) {
+ if (unlikely(level != btrfs_header_level(next))) {
btrfs_err(root->fs_info, "mismatched level");
ret = -EIO;
goto out_unlock;
@@ -5834,15 +5864,20 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (wc->refs[level] == 1) {
if (level == 0) {
- if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
ret = btrfs_dec_ref(trans, root, eb, 1);
- else
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ } else {
ret = btrfs_dec_ref(trans, root, eb, 0);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- return ret;
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
}
- if (is_fstree(btrfs_root_id(root))) {
+ if (btrfs_is_fstree(btrfs_root_id(root))) {
ret = btrfs_qgroup_trace_leaf_items(trans, eb);
if (ret) {
btrfs_err_rl(fs_info,
@@ -5862,13 +5897,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (eb == root->node) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = eb->start;
- else if (btrfs_root_id(root) != btrfs_header_owner(eb))
+ else if (unlikely(btrfs_root_id(root) != btrfs_header_owner(eb)))
goto owner_mismatch;
} else {
if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = path->nodes[level + 1]->start;
- else if (btrfs_root_id(root) !=
- btrfs_header_owner(path->nodes[level + 1]))
+ else if (unlikely(btrfs_root_id(root) !=
+ btrfs_header_owner(path->nodes[level + 1])))
goto owner_mismatch;
}
@@ -6003,9 +6038,9 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
* also make sure backrefs for the shared block and all lower level
* blocks are properly updated.
*
- * If called with for_reloc == 0, may exit early with -EAGAIN
+ * If called with for_reloc not set, may exit early with -EAGAIN
*/
-int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
+int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc)
{
const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID);
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -6013,7 +6048,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
struct btrfs_trans_handle *trans;
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root_item *root_item = &root->root_item;
- struct walk_control *wc;
+ struct walk_control AUTO_KFREE(wc);
struct btrfs_key key;
const u64 rootid = btrfs_root_id(root);
int ret = 0;
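AUTO_KFREE(wc) here, like BTRFS_PATH_AUTO_FREE(path), presumably rides on the compiler cleanup attribute that the kernel wraps via its __free()/DEFINE_FREE machinery; the exact btrfs macro bodies are not shown in this diff. A standalone sketch of the underlying idiom:

#include <stdio.h>
#include <stdlib.h>

/* The cleanup handler receives the address of the annotated variable. */
static void free_ptr(void *p)
{
	free(*(void **)p);
}

#define AUTO_FREE	__attribute__((cleanup(free_ptr)))

int main(void)
{
	AUTO_FREE char *buf = malloc(64);

	if (!buf)
		return 1;
	snprintf(buf, 64, "freed automatically on every return path");
	puts(buf);
	return 0;	/* no explicit free(): the cleanup runs here */
}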
@@ -6031,9 +6066,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
wc = kzalloc(sizeof(*wc), GFP_NOFS);
if (!wc) {
- btrfs_free_path(path);
ret = -ENOMEM;
- goto out;
+ goto out_free;
}
/*
@@ -6132,13 +6166,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
while (1) {
ret = walk_down_tree(trans, root, path, wc);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
break;
}
ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -6165,7 +6199,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
root_item);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -6201,7 +6235,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
goto out_end_trans;
ret = btrfs_del_root(trans, &root->root_key);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -6209,7 +6243,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
if (!is_reloc_root) {
ret = btrfs_find_root(tree_root, &root->root_key, path,
NULL, NULL);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
} else if (ret > 0) {
@@ -6243,7 +6277,6 @@ out_end_trans:
btrfs_end_transaction_throttle(trans);
out_free:
- kfree(wc);
btrfs_free_path(path);
out:
if (!ret && root_dropped) {
@@ -6286,7 +6319,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = root->fs_info;
BTRFS_PATH_AUTO_FREE(path);
- struct walk_control *wc;
+ struct walk_control AUTO_KFREE(wc);
int level;
int parent_level;
int ret = 0;
@@ -6303,7 +6336,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
btrfs_assert_tree_write_locked(parent);
parent_level = btrfs_header_level(parent);
- atomic_inc(&parent->refs);
+ refcount_inc(&parent->refs);
path->nodes[parent_level] = parent;
path->slots[parent_level] = btrfs_header_nritems(parent);
@@ -6325,18 +6358,17 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
while (1) {
ret = walk_down_tree(trans, root, path, wc);
if (ret < 0)
- break;
+ return ret;
ret = walk_up_tree(trans, root, path, wc, parent_level);
if (ret) {
- if (ret > 0)
- ret = 0;
+ if (ret < 0)
+ return ret;
break;
}
}
- kfree(wc);
- return ret;
+ return 0;
}
/*
@@ -6397,14 +6429,14 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
if (ret)
break;
- find_first_clear_extent_bit(&device->alloc_state, start,
- &start, &end,
- CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ btrfs_find_first_clear_extent_bit(&device->alloc_state, start,
+ &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
/* Check if there are any CHUNK_* bits left */
if (start > device->total_bytes) {
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
- btrfs_warn_in_rcu(fs_info,
+ DEBUG_WARN();
+ btrfs_warn(fs_info,
"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
start, end - start + 1,
btrfs_dev_name(device),
@@ -6436,8 +6468,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
ret = btrfs_issue_discard(device->bdev, start, len,
&bytes);
if (!ret)
- set_extent_bit(&device->alloc_state, start,
- start + bytes - 1, CHUNK_TRIMMED, NULL);
+ btrfs_set_extent_bit(&device->alloc_state, start,
+ start + bytes - 1, CHUNK_TRIMMED, NULL);
mutex_unlock(&fs_info->chunk_mutex);
if (ret)
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index 0ed682d9ed7b..71bb8109c969 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -30,7 +30,6 @@ struct find_free_extent_ctl {
u64 min_alloc_size;
u64 empty_size;
u64 flags;
- int delalloc;
/* Where to start the search inside the bg */
u64 search_start;
@@ -40,6 +39,7 @@ struct find_free_extent_ctl {
struct btrfs_free_cluster *last_ptr;
bool use_cluster;
+ bool delalloc;
bool have_caching_bg;
bool orig_have_caching_bg;
@@ -49,6 +49,16 @@ struct find_free_extent_ctl {
/* Allocation is called for data relocation */
bool for_data_reloc;
+ /*
+ * Set to true if we're retrying the allocation on this block group
+ * after waiting for caching progress, this is so that we retry only
+ * once before moving on to another block group.
+ */
+ bool retry_uncached;
+
+ /* Whether or not the allocator is currently following a hint. */
+ bool hinted;
+
/* RAID index, converted from flags */
int index;
@@ -57,13 +67,6 @@ struct find_free_extent_ctl {
*/
int loop;
- /*
- * Set to true if we're retrying the allocation on this block group
- * after waiting for caching progress, this is so that we retry only
- * once before moving on to another block group.
- */
- bool retry_uncached;
-
/* If current block group is cached */
int cached;
@@ -82,9 +85,6 @@ struct find_free_extent_ctl {
/* Allocation policy */
enum btrfs_extent_allocation_policy policy;
- /* Whether or not the allocator is currently following a hint */
- bool hinted;
-
/* Size class of block groups to prefer in early loops */
enum btrfs_block_group_size_class size_class;
};
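Besides the int-to-bool change, this header hunk groups the bool members of find_free_extent_ctl together, which lets the padding holes between 64-bit fields coalesce. A quick demonstration of the layout effect on a typical LP64 ABI (field names invented):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct scattered {	/* bools interleaved with 64-bit fields */
	uint64_t a;
	bool x;
	uint64_t b;
	bool y;
	uint64_t c;
};

struct grouped {	/* same members, bools packed together */
	uint64_t a, b, c;
	bool x, y;
};

int main(void)
{
	printf("scattered: %zu bytes\n", sizeof(struct scattered));	/* 40 */
	printf("grouped:   %zu bytes\n", sizeof(struct grouped));	/* 32 */
	return 0;
}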
@@ -97,7 +97,7 @@ enum btrfs_inline_ref_type {
};
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
- struct btrfs_extent_inline_ref *iref,
+ const struct btrfs_extent_inline_ref *iref,
enum btrfs_inline_ref_type is_data);
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
@@ -110,8 +110,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 offset, int metadata, u64 *refs, u64 *flags,
u64 *owner_root);
-int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
- int reserved);
+int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num);
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
const struct extent_buffer *eb);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
@@ -138,25 +137,24 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_key *ins);
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes,
u64 min_alloc_size, u64 empty_size, u64 hint_byte,
- struct btrfs_key *ins, int is_data, int delalloc);
+ struct btrfs_key *ins, bool is_data, bool delalloc);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref);
+ struct extent_buffer *buf, bool full_backref);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref);
+ struct extent_buffer *buf, bool full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, u64 flags);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf, int slot);
-int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 len, int delalloc);
+int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len,
+ bool is_delalloc);
int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
const struct extent_buffer *eb);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref);
-int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
- int for_reloc);
+int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 197f5e51c474..629fd5af4286 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -75,9 +75,9 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
while (!list_empty(&fs_info->allocated_ebs)) {
eb = list_first_entry(&fs_info->allocated_ebs,
struct extent_buffer, leak_list);
- pr_err(
- "BTRFS: buffer leak start %llu len %u refs %d bflags %lu owner %llu\n",
- eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
+ btrfs_err(fs_info,
+ "buffer leak start %llu len %u refs %d bflags %lu owner %llu",
+ eb->start, eb->len, refcount_read(&eb->refs), eb->bflags,
btrfs_header_owner(eb));
list_del(&eb->leak_list);
WARN_ON_ONCE(1);
@@ -96,9 +96,31 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
*/
struct btrfs_bio_ctrl {
struct btrfs_bio *bbio;
+ /* Last byte contained in bbio + 1. */
+ loff_t next_file_offset;
enum btrfs_compression_type compress_type;
u32 len_to_oe_boundary;
blk_opf_t opf;
+ /*
+ * For data read bios, we attempt to optimize csum lookups if the extent
+ * generation is older than the current one. To make this possible, we
+ * need to track the maximum generation of an extent in a bio_ctrl to
+ * make the decision when submitting the bio.
+ *
+ * The pattern between do_readpage(), submit_one_bio() and
+ * submit_extent_folio() is quite subtle, so tracking this is tricky.
+ *
+ * As we process extent E, we might submit a bio with existing built up
+ * extents before adding E to a new bio, or we might just add E to the
+ * bio. As a result, E's generation could apply to the current bio or
+ * to the next one, so we need to be careful to update the bio_ctrl's
+ * generation with E's only when we are sure E is added to bio_ctrl->bbio
+ * in submit_extent_folio().
+ *
+ * See the comment in btrfs_lookup_bio_sums() for more detail on the
+ * need for this optimization.
+ */
+ u64 generation;
btrfs_bio_end_io_t end_io_func;
struct writeback_control *wbc;
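The rule spelled out in the comment above, fold an extent's generation into bio_ctrl only once the extent is definitely part of bio_ctrl->bbio, can be illustrated with a toy bio that is flushed on discontiguity; names and flow are simplified stand-ins, not the btrfs code:

#include <stdint.h>
#include <stdio.h>

struct toy_bio {
	uint64_t generation;	/* max generation of extents in this bio */
	unsigned int len;
};

static void flush(struct toy_bio *bio)
{
	if (bio->len)
		printf("submit bio: max generation %llu\n",
		       (unsigned long long)bio->generation);
	bio->generation = 0;	/* reset for the next bio, as submit_one_bio() does */
	bio->len = 0;
}

static void add_extent(struct toy_bio *bio, unsigned int len, uint64_t gen,
		       int contiguous)
{
	if (!contiguous)
		flush(bio);	/* built-up extents keep the old generation */
	bio->len += len;
	if (gen > bio->generation)
		bio->generation = gen;	/* only now does gen apply to this bio */
}

int main(void)
{
	struct toy_bio bio = { 0, 0 };

	add_extent(&bio, 4096, 10, 1);
	add_extent(&bio, 4096, 7, 0);	/* discontiguous: submits with gen 10 */
	flush(&bio);			/* submits the rest with gen 7 */
	return 0;
}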
@@ -108,8 +130,47 @@ struct btrfs_bio_ctrl {
* This is to avoid touching ranges covered by compression/inline.
*/
unsigned long submit_bitmap;
+ struct readahead_control *ractl;
+
+ /*
+ * The start offset of the last used extent map by a read operation.
+ *
+ * This is for proper compressed read merge.
+ * U64_MAX means we are starting the read and have made no progress yet.
+ *
+ * The current btrfs_bio_is_contig() only uses disk_bytenr as
+ * the condition to check if the read can be merged with previous
+ * bio, which is not correct. E.g. two file extents pointing to the
+ * same extent but with different offset.
+ *
+ * So here we need to do extra checks to only merge reads that are
+ * covered by the same extent map.
+ * Just extent_map::start will be enough, as they are unique
+ * inside the same inode.
+ */
+ u64 last_em_start;
};
+/*
+ * Helper to set the csum search commit root option for a bio_ctrl's bbio
+ * before submitting the bio.
+ *
+ * Only for use by submit_one_bio().
+ */
+static void bio_set_csum_search_commit_root(struct btrfs_bio_ctrl *bio_ctrl)
+{
+ struct btrfs_bio *bbio = bio_ctrl->bbio;
+
+ ASSERT(bbio);
+
+ if (!(btrfs_op(&bbio->bio) == BTRFS_MAP_READ && is_data_inode(bbio->inode)))
+ return;
+
+ bio_ctrl->bbio->csum_search_commit_root =
+ (bio_ctrl->generation &&
+ bio_ctrl->generation < btrfs_get_fs_generation(bbio->inode->root->fs_info));
+}
+
static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
{
struct btrfs_bio *bbio = bio_ctrl->bbio;
@@ -120,6 +181,8 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
/* Caller should ensure the bio has at least some range added */
ASSERT(bbio->bio.bi_iter.bi_size);
+ bio_set_csum_search_commit_root(bio_ctrl);
+
if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ &&
bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
btrfs_submit_compressed_read(bbio);
@@ -128,6 +191,12 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
/* The bbio is owned by the end_io handler now */
bio_ctrl->bbio = NULL;
+ /*
+ * We used the generation to decide whether to lookup csums in the
+ * commit_root or not when we called bio_set_csum_search_commit_root()
+ * above. Now, reset the generation for the next bio.
+ */
+ bio_ctrl->generation = 0;
}
/*
@@ -221,22 +290,17 @@ static void __process_folios_contig(struct address_space *mapping,
}
static noinline void unlock_delalloc_folio(const struct inode *inode,
- const struct folio *locked_folio,
+ struct folio *locked_folio,
u64 start, u64 end)
{
- unsigned long index = start >> PAGE_SHIFT;
- unsigned long end_index = end >> PAGE_SHIFT;
-
ASSERT(locked_folio);
- if (index == locked_folio->index && end_index == index)
- return;
__process_folios_contig(inode->i_mapping, locked_folio, start, end,
PAGE_UNLOCK);
}
static noinline int lock_delalloc_folios(struct inode *inode,
- const struct folio *locked_folio,
+ struct folio *locked_folio,
u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
@@ -246,9 +310,6 @@ static noinline int lock_delalloc_folios(struct inode *inode,
u64 processed_end = start;
struct folio_batch fbatch;
- if (index == locked_folio->index && index == end_index)
- return 0;
-
folio_batch_init(&fbatch);
while (index <= end_index) {
unsigned int found_folios, i;
@@ -272,8 +333,7 @@ static noinline int lock_delalloc_folios(struct inode *inode,
goto out;
}
range_start = max_t(u64, folio_pos(folio), start);
- range_len = min_t(u64, folio_pos(folio) + folio_size(folio),
- end + 1) - range_start;
+ range_len = min_t(u64, folio_next_pos(folio), end + 1) - range_start;
btrfs_folio_set_lock(fs_info, folio, range_start, range_len);
processed_end = range_start + range_len - 1;
@@ -314,8 +374,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
const u64 orig_start = *start;
const u64 orig_end = *end;
- /* The sanity tests may not set a valid fs_info. */
- u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE;
+ u64 max_bytes = fs_info->max_extent_size;
u64 delalloc_start;
u64 delalloc_end;
bool found;
@@ -327,12 +386,19 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
ASSERT(orig_end > orig_start);
/* The range should at least cover part of the folio */
- ASSERT(!(orig_start >= folio_pos(locked_folio) + folio_size(locked_folio) ||
+ ASSERT(!(orig_start >= folio_next_pos(locked_folio) ||
orig_end <= folio_pos(locked_folio)));
again:
/* step one, find a bunch of delalloc bytes starting at start */
delalloc_start = *start;
delalloc_end = 0;
+
+ /*
+ * If @max_bytes is smaller than a block, btrfs_find_delalloc_range() can
+ * return early without handling any dirty ranges.
+ */
+ ASSERT(max_bytes >= fs_info->sectorsize);
+
found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
max_bytes, &cached_state);
if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
@@ -340,7 +406,7 @@ again:
/* @delalloc_end can be -1, never go beyond @orig_end */
*end = min(delalloc_end, orig_end);
- free_extent_state(cached_state);
+ btrfs_free_extent_state(cached_state);
return false;
}
@@ -358,18 +424,19 @@ again:
if (delalloc_end + 1 - delalloc_start > max_bytes)
delalloc_end = delalloc_start + max_bytes - 1;
- /* step two, lock all the folioss after the folios that has start */
+ /* step two, lock all the folios after the folio that has start */
ret = lock_delalloc_folios(inode, locked_folio, delalloc_start,
delalloc_end);
ASSERT(!ret || ret == -EAGAIN);
if (ret == -EAGAIN) {
- /* some of the folios are gone, lets avoid looping by
- * shortening the size of the delalloc range we're searching
+ /*
+ * Some of the folios are gone, let's avoid looping by
+ * shortening the size of the delalloc range we're searching.
*/
- free_extent_state(cached_state);
+ btrfs_free_extent_state(cached_state);
cached_state = NULL;
if (!loops) {
- max_bytes = PAGE_SIZE;
+ max_bytes = fs_info->sectorsize;
loops = 1;
goto again;
} else {
@@ -379,13 +446,13 @@ again:
}
/* step three, lock the state bits for the whole range */
- lock_extent(tree, delalloc_start, delalloc_end, &cached_state);
+ btrfs_lock_extent(tree, delalloc_start, delalloc_end, &cached_state);
/* then test to make sure it is all still delalloc */
- ret = test_range_bit(tree, delalloc_start, delalloc_end,
- EXTENT_DELALLOC, cached_state);
+ ret = btrfs_test_range_bit(tree, delalloc_start, delalloc_end,
+ EXTENT_DELALLOC, cached_state);
- unlock_extent(tree, delalloc_start, delalloc_end, &cached_state);
+ btrfs_unlock_extent(tree, delalloc_start, delalloc_end, &cached_state);
if (!ret) {
unlock_delalloc_folio(inode, locked_folio, delalloc_start,
delalloc_end);
@@ -403,7 +470,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached,
u32 clear_bits, unsigned long page_ops)
{
- clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached);
+ btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached);
__process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start,
end, page_ops);
@@ -425,7 +492,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
ASSERT(folio_pos(folio) <= start &&
- start + len <= folio_pos(folio) + folio_size(folio));
+ start + len <= folio_next_pos(folio));
if (uptodate && btrfs_verify_folio(folio, start, len))
btrfs_folio_set_uptodate(fs_info, folio, start, len);
@@ -450,7 +517,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le
*/
static void end_bbio_data_write(struct btrfs_bio *bbio)
{
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
struct bio *bio = &bbio->bio;
int error = blk_status_to_errno(bio->bi_status);
struct folio_iter fi;
@@ -462,9 +529,6 @@ static void end_bbio_data_write(struct btrfs_bio *bbio)
u64 start = folio_pos(folio) + fi.offset;
u32 len = fi.length;
- /* Only order 0 (single page) folios are allowed for data. */
- ASSERT(folio_order(folio) == 0);
-
/* Our read/write should always be sector aligned. */
if (!IS_ALIGNED(fi.offset, sectorsize))
btrfs_err(fs_info,
@@ -509,46 +573,25 @@ static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio)
*/
static void end_bbio_data_read(struct btrfs_bio *bbio)
{
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
struct bio *bio = &bbio->bio;
struct folio_iter fi;
- const u32 sectorsize = fs_info->sectorsize;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_folio_all(fi, &bbio->bio) {
bool uptodate = !bio->bi_status;
struct folio *folio = fi.folio;
struct inode *inode = folio->mapping->host;
- u64 start;
- u64 end;
- u32 len;
+ u64 start = folio_pos(folio) + fi.offset;
btrfs_debug(fs_info,
"%s: bi_sector=%llu, err=%d, mirror=%u",
__func__, bio->bi_iter.bi_sector, bio->bi_status,
bbio->mirror_num);
- /*
- * We always issue full-sector reads, but if some block in a
- * folio fails to read, blk_update_request() will advance
- * bv_offset and adjust bv_len to compensate. Print a warning
- * for unaligned offsets, and an error if they don't add up to
- * a full sector.
- */
- if (!IS_ALIGNED(fi.offset, sectorsize))
- btrfs_err(fs_info,
- "partial page read in btrfs with offset %zu and length %zu",
- fi.offset, fi.length);
- else if (!IS_ALIGNED(fi.offset + fi.length, sectorsize))
- btrfs_info(fs_info,
- "incomplete page read with offset %zu and length %zu",
- fi.offset, fi.length);
-
- start = folio_pos(folio) + fi.offset;
- end = start + fi.length - 1;
- len = fi.length;
if (likely(uptodate)) {
+ u64 end = start + fi.length - 1;
loff_t i_size = i_size_read(inode);
/*
@@ -573,7 +616,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
}
/* Update page status and unlock. */
- end_folio_read(folio, uptodate, start, len);
+ end_folio_read(folio, uptodate, start, fi.length);
}
bio_put(bio);
}
@@ -582,6 +625,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
* Populate every free slot in a provided array with folios using GFP_NOFS.
*
* @nr_folios: number of folios to allocate
+ * @order: the order of the folios to be allocated
* @folio_array: the array to fill with folios; any existing non-NULL entries in
* the array will be skipped
*
@@ -589,12 +633,13 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
 * -ENOMEM otherwise; any partially allocated folios are freed and the
 * array slots reset to NULL
*/
-int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array)
+int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order,
+ struct folio **folio_array)
{
for (int i = 0; i < nr_folios; i++) {
if (folio_array[i])
continue;
- folio_array[i] = folio_alloc(GFP_NOFS, 0);
+ folio_array[i] = folio_alloc(GFP_NOFS, order);
if (!folio_array[i])
goto error;
}
@@ -603,6 +648,7 @@ error:
for (int i = 0; i < nr_folios; i++) {
if (folio_array[i])
folio_put(folio_array[i]);
+ folio_array[i] = NULL;
}
return -ENOMEM;
}
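As a usage sketch (a hypothetical caller, not part of the patch): with the new @order parameter, callers that want plain PAGE_SIZE folios pass order 0, and the fixed error path now guarantees the array is safe to reuse after a failure:

	struct folio *folios[16] = { NULL };	/* hypothetical scratch array */
	int ret;

	/* Order 0: plain PAGE_SIZE folios, as the old callers got implicitly. */
	ret = btrfs_alloc_folio_array(ARRAY_SIZE(folios), 0, folios);
	if (ret)
		return ret;	/* every slot was freed and reset to NULL */

	/* ... use the folios, then drop the references ... */
	for (int i = 0; i < ARRAY_SIZE(folios); i++)
		folio_put(folios[i]);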
@@ -664,13 +710,10 @@ static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail)
}
static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
- struct folio *folio, u64 disk_bytenr,
- unsigned int pg_offset)
+ u64 disk_bytenr, loff_t file_offset)
{
struct bio *bio = &bio_ctrl->bbio->bio;
- struct bio_vec *bvec = bio_last_bvec_all(bio);
const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
- struct folio *bv_folio = page_folio(bvec->bv_page);
if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
/*
@@ -681,19 +724,11 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
}
/*
- * The contig check requires the following conditions to be met:
- *
- * 1) The folios are belonging to the same inode
- * This is implied by the call chain.
- *
- * 2) The range has adjacent logical bytenr
- *
- * 3) The range has adjacent file offset
- * This is required for the usage of btrfs_bio->file_offset.
+ * To merge into a bio both the disk sector and the logical offset in
+ * the file need to be contiguous.
*/
- return bio_end_sector(bio) == sector &&
- folio_pos(bv_folio) + bvec->bv_offset + bvec->bv_len ==
- folio_pos(folio) + pg_offset;
+ return bio_ctrl->next_file_offset == file_offset &&
+ bio_end_sector(bio) == sector;
}
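A standalone toy model of the simplified check (the constants and the is_contig() helper below are made up for illustration; only the two conditions mirror the kernel code above):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define SECTOR_SHIFT 9

	/* Toy model of the two merge conditions; not the kernel helper itself. */
	static bool is_contig(uint64_t bio_end_sector, uint64_t next_file_offset,
			      uint64_t disk_bytenr, uint64_t file_offset)
	{
		return next_file_offset == file_offset &&
		       bio_end_sector == (disk_bytenr >> SECTOR_SHIFT);
	}

	int main(void)
	{
		/* Current bio ends at disk byte 1 MiB (sector 2048), file offset 64 KiB. */
		printf("%d\n", is_contig(2048, 64 * 1024, 1024 * 1024, 64 * 1024));  /* 1 */
		/* Same disk location, but a non-adjacent file offset: no merge. */
		printf("%d\n", is_contig(2048, 64 * 1024, 1024 * 1024, 128 * 1024)); /* 0 */
		return 0;
	}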
static void alloc_new_bio(struct btrfs_inode *inode,
@@ -703,14 +738,13 @@ static void alloc_new_bio(struct btrfs_inode *inode,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_bio *bbio;
- bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info,
- bio_ctrl->end_io_func, NULL);
+ bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, inode,
+ file_offset, bio_ctrl->end_io_func, NULL);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
bbio->bio.bi_write_hint = inode->vfs_inode.i_write_hint;
- bbio->inode = inode;
- bbio->file_offset = file_offset;
bio_ctrl->bbio = bbio;
bio_ctrl->len_to_oe_boundary = U32_MAX;
+ bio_ctrl->next_file_offset = file_offset;
/* Limit data write bios to the ordered boundary. */
if (bio_ctrl->wbc) {
@@ -741,33 +775,35 @@ static void alloc_new_bio(struct btrfs_inode *inode,
 * @size:	portion of the folio that we want to write to
 * @pg_offset:	offset within the folio where the range starts; also used
 *		to check whether we are adding a contiguous range to the
 *		previous one
+ * @read_em_generation: generation of the extent_map we are submitting
+ * (only used for read)
*
 * This will either add the folio into the existing @bio_ctrl->bbio, or allocate a
* new one in @bio_ctrl->bbio.
- * The mirror number for this IO should already be initizlied in
+ * The mirror number for this IO should already be initialized in
* @bio_ctrl->mirror_num.
*/
static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
u64 disk_bytenr, struct folio *folio,
- size_t size, unsigned long pg_offset)
+ size_t size, unsigned long pg_offset,
+ u64 read_em_generation)
{
struct btrfs_inode *inode = folio_to_inode(folio);
+ loff_t file_offset = folio_pos(folio) + pg_offset;
ASSERT(pg_offset + size <= folio_size(folio));
ASSERT(bio_ctrl->end_io_func);
if (bio_ctrl->bbio &&
- !btrfs_bio_is_contig(bio_ctrl, folio, disk_bytenr, pg_offset))
+ !btrfs_bio_is_contig(bio_ctrl, disk_bytenr, file_offset))
submit_one_bio(bio_ctrl);
do {
u32 len = size;
/* Allocate new bio if needed */
- if (!bio_ctrl->bbio) {
- alloc_new_bio(inode, bio_ctrl, disk_bytenr,
- folio_pos(folio) + pg_offset);
- }
+ if (!bio_ctrl->bbio)
+ alloc_new_bio(inode, bio_ctrl, disk_bytenr, file_offset);
/* Cap to the current ordered extent boundary if there is one. */
if (len > bio_ctrl->len_to_oe_boundary) {
@@ -781,14 +817,20 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
submit_one_bio(bio_ctrl);
continue;
}
+ /*
+ * Now that the folio is definitely added to the bio, include its
+ * generation in the max generation calculation.
+ */
+ bio_ctrl->generation = max(bio_ctrl->generation, read_em_generation);
+ bio_ctrl->next_file_offset += len;
if (bio_ctrl->wbc)
- wbc_account_cgroup_owner(bio_ctrl->wbc, folio,
- len);
+ wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len);
size -= len;
pg_offset += len;
disk_bytenr += len;
+ file_offset += len;
/*
* len_to_oe_boundary defaults to U32_MAX, which isn't folio or
@@ -822,7 +864,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
static int attach_extent_buffer_folio(struct extent_buffer *eb,
struct folio *folio,
- struct btrfs_subpage *prealloc)
+ struct btrfs_folio_state *prealloc)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
int ret = 0;
@@ -846,7 +888,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb,
/* Already mapped, just free prealloc */
if (folio_test_private(folio)) {
- btrfs_free_subpage(prealloc);
+ btrfs_free_folio_state(prealloc);
return 0;
}
@@ -855,7 +897,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb,
folio_attach_private(folio, prealloc);
else
/* Do new allocation to attach subpage */
- ret = btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
+ ret = btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA);
return ret;
}
@@ -871,7 +913,7 @@ int set_folio_extent_mapped(struct folio *folio)
fs_info = folio_to_fs_info(folio);
if (btrfs_is_subpage(fs_info, folio))
- return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA);
+ return btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA);
folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE);
return 0;
@@ -888,7 +930,7 @@ void clear_folio_extent_mapped(struct folio *folio)
fs_info = folio_to_fs_info(folio);
if (btrfs_is_subpage(fs_info, folio))
- return btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA);
+ return btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA);
folio_detach_private(folio);
}
@@ -903,13 +945,13 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode,
if (*em_cached) {
em = *em_cached;
- if (extent_map_in_tree(em) && start >= em->start &&
- start < extent_map_end(em)) {
+ if (btrfs_extent_map_in_tree(em) && start >= em->start &&
+ start < btrfs_extent_map_end(em)) {
refcount_inc(&em->refs);
return em;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
*em_cached = NULL;
}
@@ -922,6 +964,25 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode,
return em;
}
+
+static void btrfs_readahead_expand(struct readahead_control *ractl,
+ const struct extent_map *em)
+{
+ const u64 ra_pos = readahead_pos(ractl);
+ const u64 ra_end = ra_pos + readahead_length(ractl);
+ const u64 em_end = em->start + em->len;
+
+ /* No expansion for holes and inline extents. */
+ if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE)
+ return;
+
+ ASSERT(em_end >= ra_pos,
+ "extent_map %llu %llu ends before current readahead position %llu",
+ em->start, em->len, ra_pos);
+ if (em_end > ra_end)
+ readahead_expand(ractl, ra_pos, em_end - ra_pos);
+}
+
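A worked example of the expansion arithmetic, with made-up offsets (a 256 KiB readahead window starting at 1 MiB, and a 512 KiB compressed extent starting at 1152 KiB):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t ra_pos = 1024 * 1024;		/* readahead_pos(ractl) */
		uint64_t ra_len = 256 * 1024;		/* readahead_length(ractl) */
		uint64_t em_start = 1152 * 1024;	/* compressed extent start */
		uint64_t em_len = 512 * 1024;

		uint64_t ra_end = ra_pos + ra_len;	/* 1280 KiB */
		uint64_t em_end = em_start + em_len;	/* 1664 KiB */

		/* The extent ends past the window, so grow the window to cover it. */
		if (em_end > ra_end)
			printf("readahead_expand(ractl, %llu, %llu)\n",
			       (unsigned long long)ra_pos,
			       (unsigned long long)(em_end - ra_pos)); /* 1 MiB, 640 KiB */
		return 0;
	}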
/*
* basic readpage implementation. Locked extent state structs are inserted
* into the tree that are removed when the IO is done (by the end_io
@@ -930,7 +991,7 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode,
* return 0 on success, otherwise return error
*/
static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
- struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start)
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
@@ -963,6 +1024,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
bool force_bio_submit = false;
u64 disk_bytenr;
u64 block_start;
+ u64 em_gen;
ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
if (cur >= last_byte) {
@@ -980,20 +1042,30 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
return PTR_ERR(em);
}
extent_offset = cur - em->start;
- BUG_ON(extent_map_end(em) <= cur);
+ BUG_ON(btrfs_extent_map_end(em) <= cur);
BUG_ON(end < cur);
- compress_type = extent_map_compression(em);
+ compress_type = btrfs_extent_map_compression(em);
+
+ /*
+		 * Only expand readahead for extents where add_ra_bio_pages() is
+		 * going to create the pages anyway, i.e. compressed extents in
+		 * the non-subpage case.
+ */
+ if (bio_ctrl->ractl &&
+ !btrfs_is_subpage(fs_info, folio) &&
+ compress_type != BTRFS_COMPRESS_NONE)
+ btrfs_readahead_expand(bio_ctrl->ractl, em);
if (compress_type != BTRFS_COMPRESS_NONE)
disk_bytenr = em->disk_bytenr;
else
- disk_bytenr = extent_map_block_start(em) + extent_offset;
+ disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset;
if (em->flags & EXTENT_FLAG_PREALLOC)
block_start = EXTENT_MAP_HOLE;
else
- block_start = extent_map_block_start(em);
+ block_start = btrfs_extent_map_block_start(em);
/*
* If we have a file range that points to a compressed extent
@@ -1030,14 +1102,14 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
* non-optimal behavior (submitting 2 bios for the same extent).
*/
if (compress_type != BTRFS_COMPRESS_NONE &&
- prev_em_start && *prev_em_start != (u64)-1 &&
- *prev_em_start != em->start)
+ bio_ctrl->last_em_start != U64_MAX &&
+ bio_ctrl->last_em_start != em->start)
force_bio_submit = true;
- if (prev_em_start)
- *prev_em_start = em->start;
+ bio_ctrl->last_em_start = em->start;
- free_extent_map(em);
+ em_gen = em->generation;
+ btrfs_free_extent_map(em);
em = NULL;
/* we've found a hole, just zero and go on */
@@ -1060,7 +1132,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
if (force_bio_submit)
submit_one_bio(bio_ctrl);
submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize,
- pg_offset);
+ pg_offset, em_gen);
}
return 0;
}
@@ -1126,7 +1198,7 @@ static bool can_skip_one_ordered_range(struct btrfs_inode *inode,
* finished our folio read and unlocked the folio.
*/
if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) {
- u64 range_len = min(folio_pos(folio) + folio_size(folio),
+ u64 range_len = umin(folio_next_pos(folio),
ordered->file_offset + ordered->num_bytes) - cur;
ret = true;
@@ -1148,7 +1220,7 @@ static bool can_skip_one_ordered_range(struct btrfs_inode *inode,
* So we return true and update @next_ret to the OE/folio boundary.
*/
if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
- u64 range_len = min(folio_pos(folio) + folio_size(folio),
+ u64 range_len = umin(folio_next_pos(folio),
ordered->file_offset + ordered->num_bytes) - cur;
/*
@@ -1212,7 +1284,7 @@ static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end,
ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE));
again:
- lock_extent(&inode->io_tree, start, end, cached_state);
+ btrfs_lock_extent(&inode->io_tree, start, end, cached_state);
cur_pos = start;
while (cur_pos < end) {
struct btrfs_ordered_extent *ordered;
@@ -1235,7 +1307,7 @@ again:
}
/* Now wait for the OE to finish. */
- unlock_extent(&inode->io_tree, start, end, cached_state);
+ btrfs_unlock_extent(&inode->io_tree, start, end, cached_state);
btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start);
btrfs_put_ordered_extent(ordered);
/* We have unlocked the whole range, restart from the beginning. */
@@ -1249,15 +1321,18 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
const u64 start = folio_pos(folio);
const u64 end = start + folio_size(folio) - 1;
struct extent_state *cached_state = NULL;
- struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .opf = REQ_OP_READ,
+ .last_em_start = U64_MAX,
+ };
struct extent_map *em_cached = NULL;
int ret;
lock_extents_for_read(inode, start, end, &cached_state);
- ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL);
- unlock_extent(&inode->io_tree, start, end, &cached_state);
+ ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
+ btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
- free_extent_map(em_cached);
+ btrfs_free_extent_map(em_cached);
/*
* If btrfs_do_readpage() failed we will want to submit the assembled
@@ -1443,8 +1518,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
* We've hit an error during previous delalloc range,
* have to cleanup the remaining locked ranges.
*/
- unlock_extent(&inode->io_tree, found_start,
- found_start + found_len - 1, NULL);
+ btrfs_unlock_extent(&inode->io_tree, found_start,
+ found_start + found_len - 1, NULL);
unlock_delalloc_folio(&inode->vfs_inode, folio,
found_start,
found_start + found_len - 1);
@@ -1523,7 +1598,7 @@ out:
/*
* Return 0 if we have submitted or queued the sector for submission.
- * Return <0 for critical errors.
+ * Return <0 for critical errors; the sector will have its dirty flag cleared.
*
* Caller should make sure filepos < i_size and handle filepos >= i_size case.
*/
@@ -1546,23 +1621,32 @@ static int submit_one_sector(struct btrfs_inode *inode,
ASSERT(filepos < i_size);
em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
- if (IS_ERR(em))
+ if (IS_ERR(em)) {
+ /*
+		 * When submission fails we should still clear the folio's dirty
+		 * flags, otherwise the folio would be written back again, but
+		 * without any ordered extent.
+ */
+ btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
+ btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
+ btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize);
return PTR_ERR(em);
+ }
extent_offset = filepos - em->start;
- em_end = extent_map_end(em);
+ em_end = btrfs_extent_map_end(em);
ASSERT(filepos <= em_end);
ASSERT(IS_ALIGNED(em->start, sectorsize));
ASSERT(IS_ALIGNED(em->len, sectorsize));
- block_start = extent_map_block_start(em);
- disk_bytenr = extent_map_block_start(em) + extent_offset;
+ block_start = btrfs_extent_map_block_start(em);
+ disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset;
- ASSERT(!extent_map_is_compressed(em));
+ ASSERT(!btrfs_extent_map_is_compressed(em));
ASSERT(block_start != EXTENT_MAP_HOLE);
ASSERT(block_start != EXTENT_MAP_INLINE);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = NULL;
/*
@@ -1582,7 +1666,7 @@ static int submit_one_sector(struct btrfs_inode *inode,
ASSERT(folio_test_writeback(folio));
submit_extent_folio(bio_ctrl, disk_bytenr, folio,
- sectorsize, filepos - folio_pos(folio));
+ sectorsize, filepos - folio_pos(folio), 0);
return 0;
}
@@ -1603,15 +1687,18 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long range_bitmap = 0;
bool submitted_io = false;
- bool error = false;
+ int found_error = 0;
+ const u64 end = start + len;
const u64 folio_start = folio_pos(folio);
+ const u64 folio_end = folio_start + folio_size(folio);
const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
u64 cur;
int bit;
int ret = 0;
- ASSERT(start >= folio_start &&
- start + len <= folio_start + folio_size(folio));
+ ASSERT(start >= folio_start, "start=%llu folio_start=%llu", start, folio_start);
+ ASSERT(end <= folio_end, "start=%llu len=%u folio_start=%llu folio_size=%zu",
+ start, len, folio_start, folio_size(folio));
ret = btrfs_writepage_cow_fixup(folio);
if (ret == -EAGAIN) {
@@ -1620,10 +1707,14 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
folio_unlock(folio);
return 1;
}
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_folio_clear_dirty(fs_info, folio, start, len);
+ btrfs_folio_set_writeback(fs_info, folio, start, len);
+ btrfs_folio_clear_writeback(fs_info, folio, start, len);
return ret;
+ }
- for (cur = start; cur < start + len; cur += fs_info->sectorsize)
+ for (cur = start; cur < end; cur += fs_info->sectorsize)
set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap);
bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap,
blocks_per_folio);
@@ -1634,8 +1725,24 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits);
if (cur >= i_size) {
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = btrfs_lookup_first_ordered_range(inode, cur,
+ folio_end - cur);
+ /*
+ * We have just run delalloc before getting here, so
+ * there must be an ordered extent.
+ */
+ ASSERT(ordered != NULL);
+ spin_lock(&inode->ordered_tree_lock);
+ set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
+ ordered->truncated_len = min(ordered->truncated_len,
+ cur - ordered->file_offset);
+ spin_unlock(&inode->ordered_tree_lock);
+ btrfs_put_ordered_extent(ordered);
+
btrfs_mark_ordered_io_finished(inode, folio, cur,
- start + len - cur, true);
+ end - cur, true);
/*
* This range is beyond i_size, thus we don't need to
* bother writing back.
@@ -1644,8 +1751,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
* writeback the sectors with subpage dirty bits,
* causing writeback without ordered extent.
*/
- btrfs_folio_clear_dirty(fs_info, folio, cur,
- start + len - cur);
+ btrfs_folio_clear_dirty(fs_info, folio, cur, end - cur);
break;
}
ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size);
@@ -1663,7 +1769,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
*/
btrfs_mark_ordered_io_finished(inode, folio, cur,
fs_info->sectorsize, false);
- error = true;
+ if (!found_error)
+ found_error = ret;
continue;
}
submitted_io = true;
@@ -1677,14 +1784,14 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
* Here we set writeback and clear for the range. If the full folio
* is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag.
*
- * If we hit any error, the corresponding sector will still be dirty
- * thus no need to clear PAGECACHE_TAG_DIRTY.
+ * If we hit any error, the corresponding sector will have its dirty
+ * flag cleared and writeback finished, thus no need to handle the error case.
*/
- if (!submitted_io && !error) {
+ if (!submitted_io && !found_error) {
btrfs_folio_set_writeback(fs_info, folio, start, len);
btrfs_folio_clear_writeback(fs_info, folio, start, len);
}
- return ret;
+ return found_error;
}
/*
@@ -1703,7 +1810,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
int ret;
size_t pg_offset;
loff_t i_size = i_size_read(&inode->vfs_inode);
- unsigned long end_index = i_size >> PAGE_SHIFT;
+ const pgoff_t end_index = i_size >> PAGE_SHIFT;
const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc);
@@ -1718,7 +1825,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
return 0;
}
- if (folio->index == end_index)
+ if (folio_contains(folio, end_index))
folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset);
/*
@@ -1744,7 +1851,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
btrfs_err_rl(fs_info,
"root %lld ino %llu folio %llu is marked dirty without notifying the fs",
- inode->root->root_key.objectid,
+ btrfs_root_id(inode->root),
btrfs_ino(inode), folio_pos(folio));
ret = -EUCLEAN;
goto done;
@@ -1764,7 +1871,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
folio_size(folio), bio_ctrl, i_size);
if (ret == 1)
return 0;
- if (ret < 0)
+ if (unlikely(ret < 0))
btrfs_err_rl(fs_info,
"failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d",
btrfs_root_id(inode->root), btrfs_ino(inode),
@@ -1814,8 +1921,19 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e
*/
spin_lock(&eb->refs_lock);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+ XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
+ unsigned long flags;
+
set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
spin_unlock(&eb->refs_lock);
+
+ xas_lock_irqsave(&xas, flags);
+ xas_load(&xas);
+ xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
+ xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
+ xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ xas_unlock_irqrestore(&xas, flags);
+
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
-eb->len,
@@ -1901,24 +2019,151 @@ static void set_btree_ioerr(struct extent_buffer *eb)
}
}
+static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
+ unsigned long flags;
+
+ xas_lock_irqsave(&xas, flags);
+ xas_load(&xas);
+ xas_set_mark(&xas, mark);
+ xas_unlock_irqrestore(&xas, flags);
+}
+
+static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
+ unsigned long flags;
+
+ xas_lock_irqsave(&xas, flags);
+ xas_load(&xas);
+ xas_clear_mark(&xas, mark);
+ xas_unlock_irqrestore(&xas, flags);
+}
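Loosely, these two helpers pair up around an extent buffer's writeback life cycle; a simplified sketch (the real lock_extent_buffer_for_io() open-codes the first half under a single xa_lock, as shown above):

	/* Starting writeback on one eb (simplified): */
	buffer_tree_clear_mark(eb, PAGECACHE_TAG_DIRTY);
	buffer_tree_set_mark(eb, PAGECACHE_TAG_WRITEBACK);

	/* On bio completion, as end_bbio_meta_write() does below: */
	buffer_tree_clear_mark(eb, PAGECACHE_TAG_WRITEBACK);
	clear_and_wake_up_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);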
+
+static void buffer_tree_tag_for_writeback(struct btrfs_fs_info *fs_info,
+ unsigned long start, unsigned long end)
+{
+ XA_STATE(xas, &fs_info->buffer_tree, start);
+ unsigned int tagged = 0;
+ void *eb;
+
+ xas_lock_irq(&xas);
+ xas_for_each_marked(&xas, eb, end, PAGECACHE_TAG_DIRTY) {
+ xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ if (++tagged % XA_CHECK_SCHED)
+ continue;
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
+ cond_resched();
+ xas_lock_irq(&xas);
+ }
+ xas_unlock_irq(&xas);
+}
+
+struct eb_batch {
+ unsigned int nr;
+ unsigned int cur;
+ struct extent_buffer *ebs[PAGEVEC_SIZE];
+};
+
+static inline bool eb_batch_add(struct eb_batch *batch, struct extent_buffer *eb)
+{
+ batch->ebs[batch->nr++] = eb;
+ return (batch->nr < PAGEVEC_SIZE);
+}
+
+static inline void eb_batch_init(struct eb_batch *batch)
+{
+ batch->nr = 0;
+ batch->cur = 0;
+}
+
+static inline struct extent_buffer *eb_batch_next(struct eb_batch *batch)
+{
+ if (batch->cur >= batch->nr)
+ return NULL;
+ return batch->ebs[batch->cur++];
+}
+
+static inline void eb_batch_release(struct eb_batch *batch)
+{
+ for (unsigned int i = 0; i < batch->nr; i++)
+ free_extent_buffer(batch->ebs[i]);
+ eb_batch_init(batch);
+}
+
+static inline struct extent_buffer *find_get_eb(struct xa_state *xas, unsigned long max,
+ xa_mark_t mark)
+{
+ struct extent_buffer *eb;
+
+retry:
+ eb = xas_find_marked(xas, max, mark);
+
+ if (xas_retry(xas, eb))
+ goto retry;
+
+ if (!eb)
+ return NULL;
+
+ if (!refcount_inc_not_zero(&eb->refs)) {
+ xas_reset(xas);
+ goto retry;
+ }
+
+ if (unlikely(eb != xas_reload(xas))) {
+ free_extent_buffer(eb);
+ xas_reset(xas);
+ goto retry;
+ }
+
+ return eb;
+}
+
+static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info,
+ unsigned long *start,
+ unsigned long end, xa_mark_t tag,
+ struct eb_batch *batch)
+{
+ XA_STATE(xas, &fs_info->buffer_tree, *start);
+ struct extent_buffer *eb;
+
+ rcu_read_lock();
+ while ((eb = find_get_eb(&xas, end, tag)) != NULL) {
+ if (!eb_batch_add(batch, eb)) {
+ *start = ((eb->start + eb->len) >> fs_info->nodesize_bits);
+ goto out;
+ }
+ }
+ if (end == ULONG_MAX)
+ *start = ULONG_MAX;
+ else
+ *start = end + 1;
+out:
+ rcu_read_unlock();
+
+ return batch->nr;
+}
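A sketch of the intended iteration pattern, assuming a caller that walks all dirty extent buffers (error handling omitted; btrfs_btree_wait_writeback_range() below drives it the same way):

	struct eb_batch batch;
	unsigned long index = start >> fs_info->nodesize_bits;	/* assumed inputs */
	unsigned long end = ULONG_MAX;

	eb_batch_init(&batch);
	while (buffer_tree_get_ebs_tag(fs_info, &index, end, PAGECACHE_TAG_DIRTY,
				       &batch)) {
		struct extent_buffer *eb;

		while ((eb = eb_batch_next(&batch)) != NULL) {
			/* Operate on eb; the batch holds a reference for us. */
		}
		eb_batch_release(&batch);	/* drops the batch's references */
		cond_resched();
	}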
+
/*
* The endio specific version which won't touch any unsafe spinlock in endio
* context.
*/
static struct extent_buffer *find_extent_buffer_nolock(
- const struct btrfs_fs_info *fs_info, u64 start)
+ struct btrfs_fs_info *fs_info, u64 start)
{
struct extent_buffer *eb;
+ unsigned long index = (start >> fs_info->nodesize_bits);
rcu_read_lock();
- eb = radix_tree_lookup(&fs_info->buffer_radix,
- start >> fs_info->sectorsize_bits);
- if (eb && atomic_inc_not_zero(&eb->refs)) {
- rcu_read_unlock();
- return eb;
- }
+ eb = xa_load(&fs_info->buffer_tree, index);
+ if (eb && !refcount_inc_not_zero(&eb->refs))
+ eb = NULL;
rcu_read_unlock();
- return NULL;
+ return eb;
}
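The xarray is indexed by nodesize-aligned block number rather than by sector or page index; a toy calculation, assuming a 16 KiB nodesize (so nodesize_bits == 14):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t nodesize_bits = 14;	/* 16 KiB nodes */
		uint64_t start = 30408704;	/* eb->start (a bytenr) */

		/* One xarray slot per nodesize-aligned metadata block. */
		printf("index = %llu\n",
		       (unsigned long long)(start >> nodesize_bits)); /* 1856 */
		return 0;
	}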
static void end_bbio_meta_write(struct btrfs_bio *bbio)
@@ -1933,10 +2178,8 @@ static void end_bbio_meta_write(struct btrfs_bio *bbio)
btrfs_meta_folio_clear_writeback(fi.folio, eb);
}
- clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
- smp_mb__after_atomic();
- wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
-
+ buffer_tree_clear_mark(eb, PAGECACHE_TAG_WRITEBACK);
+ clear_and_wake_up_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
bio_put(&bbio->bio);
}
@@ -1978,16 +2221,15 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc),
- eb->fs_info, end_bbio_meta_write, eb);
+ BTRFS_I(fs_info->btree_inode), eb->start,
+ end_bbio_meta_write, eb);
bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
wbc_init_bio(wbc, &bbio->bio);
- bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
- bbio->file_offset = eb->start;
for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
u64 range_start = max_t(u64, eb->start, folio_pos(folio));
- u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio),
+ u32 range_len = min_t(u64, folio_next_pos(folio),
eb->start + eb->len) - range_start;
folio_lock(folio);
@@ -2000,167 +2242,48 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
wbc_account_cgroup_owner(wbc, folio, range_len);
folio_unlock(folio);
}
+ /*
+ * If the fs is already in error status, do not submit any writeback
+ * but immediately finish it.
+ */
+ if (unlikely(BTRFS_FS_ERROR(fs_info))) {
+ btrfs_bio_end_io(bbio, errno_to_blk_status(BTRFS_FS_ERROR(fs_info)));
+ return;
+ }
btrfs_submit_bbio(bbio, 0);
}
/*
- * Submit one subpage btree page.
- *
- * The main difference to submit_eb_page() is:
- * - Page locking
- * For subpage, we don't rely on page locking at all.
- *
- * - Flush write bio
- * We only flush bio if we may be unable to fit current extent buffers into
- * current bio.
+ * Wait for all eb writeback in the given range to finish.
*
- * Return >=0 for the number of submitted extent buffers.
- * Return <0 for fatal error.
+ * @fs_info: The fs_info for this file system.
+ * @start: The offset of the range to start waiting on writeback.
+ * @end: The end of the range, inclusive. This is meant to be used in
+ * conjunction with wait_marked_extents, so this will usually be
+ * the_next_eb->start - 1.
*/
-static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc)
+void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start,
+ u64 end)
{
- struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
- int submitted = 0;
- u64 folio_start = folio_pos(folio);
- int bit_start = 0;
- int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
- const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
+ struct eb_batch batch;
+ unsigned long start_index = (start >> fs_info->nodesize_bits);
+ unsigned long end_index = (end >> fs_info->nodesize_bits);
- /* Lock and write each dirty extent buffers in the range */
- while (bit_start < blocks_per_folio) {
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ eb_batch_init(&batch);
+ while (start_index <= end_index) {
struct extent_buffer *eb;
- unsigned long flags;
- u64 start;
+ unsigned int nr_ebs;
- /*
- * Take private lock to ensure the subpage won't be detached
- * in the meantime.
- */
- spin_lock(&folio->mapping->i_private_lock);
- if (!folio_test_private(folio)) {
- spin_unlock(&folio->mapping->i_private_lock);
+ nr_ebs = buffer_tree_get_ebs_tag(fs_info, &start_index, end_index,
+ PAGECACHE_TAG_WRITEBACK, &batch);
+ if (!nr_ebs)
break;
- }
- spin_lock_irqsave(&subpage->lock, flags);
- if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * blocks_per_folio,
- subpage->bitmaps)) {
- spin_unlock_irqrestore(&subpage->lock, flags);
- spin_unlock(&folio->mapping->i_private_lock);
- bit_start++;
- continue;
- }
-
- start = folio_start + bit_start * fs_info->sectorsize;
- bit_start += sectors_per_node;
-
- /*
- * Here we just want to grab the eb without touching extra
- * spin locks, so call find_extent_buffer_nolock().
- */
- eb = find_extent_buffer_nolock(fs_info, start);
- spin_unlock_irqrestore(&subpage->lock, flags);
- spin_unlock(&folio->mapping->i_private_lock);
-
- /*
- * The eb has already reached 0 refs thus find_extent_buffer()
- * doesn't return it. We don't need to write back such eb
- * anyway.
- */
- if (!eb)
- continue;
- if (lock_extent_buffer_for_io(eb, wbc)) {
- write_one_eb(eb, wbc);
- submitted++;
- }
- free_extent_buffer(eb);
- }
- return submitted;
-}
-
-/*
- * Submit all page(s) of one extent buffer.
- *
- * @page: the page of one extent buffer
- * @eb_context: to determine if we need to submit this page, if current page
- * belongs to this eb, we don't need to submit
- *
- * The caller should pass each page in their bytenr order, and here we use
- * @eb_context to determine if we have submitted pages of one extent buffer.
- *
- * If we have, we just skip until we hit a new page that doesn't belong to
- * current @eb_context.
- *
- * If not, we submit all the page(s) of the extent buffer.
- *
- * Return >0 if we have submitted the extent buffer successfully.
- * Return 0 if we don't need to submit the page, as it's already submitted by
- * previous call.
- * Return <0 for fatal error.
- */
-static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ctx)
-{
- struct writeback_control *wbc = ctx->wbc;
- struct address_space *mapping = folio->mapping;
- struct extent_buffer *eb;
- int ret;
-
- if (!folio_test_private(folio))
- return 0;
-
- if (btrfs_meta_is_subpage(folio_to_fs_info(folio)))
- return submit_eb_subpage(folio, wbc);
-
- spin_lock(&mapping->i_private_lock);
- if (!folio_test_private(folio)) {
- spin_unlock(&mapping->i_private_lock);
- return 0;
- }
-
- eb = folio_get_private(folio);
-
- /*
- * Shouldn't happen and normally this would be a BUG_ON but no point
- * crashing the machine for something we can survive anyway.
- */
- if (WARN_ON(!eb)) {
- spin_unlock(&mapping->i_private_lock);
- return 0;
- }
-
- if (eb == ctx->eb) {
- spin_unlock(&mapping->i_private_lock);
- return 0;
- }
- ret = atomic_inc_not_zero(&eb->refs);
- spin_unlock(&mapping->i_private_lock);
- if (!ret)
- return 0;
-
- ctx->eb = eb;
-
- ret = btrfs_check_meta_write_pointer(eb->fs_info, ctx);
- if (ret) {
- if (ret == -EBUSY)
- ret = 0;
- free_extent_buffer(eb);
- return ret;
- }
-
- if (!lock_extent_buffer_for_io(eb, wbc)) {
- free_extent_buffer(eb);
- return 0;
- }
- /* Implies write in zoned mode. */
- if (ctx->zoned_bg) {
- /* Mark the last eb in the block group. */
- btrfs_schedule_zone_finish_bg(ctx->zoned_bg, eb);
- ctx->zoned_bg->meta_write_pointer += eb->len;
+ while ((eb = eb_batch_next(&batch)) != NULL)
+ wait_on_extent_buffer_writeback(eb);
+ eb_batch_release(&batch);
+ cond_resched();
}
- write_one_eb(eb, wbc);
- free_extent_buffer(eb);
- return 1;
}
int btree_write_cache_pages(struct address_space *mapping,
@@ -2171,25 +2294,27 @@ int btree_write_cache_pages(struct address_space *mapping,
int ret = 0;
int done = 0;
int nr_to_write_done = 0;
- struct folio_batch fbatch;
- unsigned int nr_folios;
- pgoff_t index;
- pgoff_t end; /* Inclusive */
+ struct eb_batch batch;
+ unsigned int nr_ebs;
+ unsigned long index;
+ unsigned long end;
int scanned = 0;
xa_mark_t tag;
- folio_batch_init(&fbatch);
+ eb_batch_init(&batch);
if (wbc->range_cyclic) {
- index = mapping->writeback_index; /* Start from prev offset */
+ index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->nodesize_bits);
end = -1;
+
/*
		 * Starting from the beginning does not require cycling over the
		 * range, so mark it as scanned.
*/
scanned = (index == 0);
} else {
- index = wbc->range_start >> PAGE_SHIFT;
- end = wbc->range_end >> PAGE_SHIFT;
+ index = (wbc->range_start >> fs_info->nodesize_bits);
+ end = (wbc->range_end >> fs_info->nodesize_bits);
+
scanned = 1;
}
if (wbc->sync_mode == WB_SYNC_ALL)
@@ -2199,31 +2324,39 @@ int btree_write_cache_pages(struct address_space *mapping,
btrfs_zoned_meta_io_lock(fs_info);
retry:
if (wbc->sync_mode == WB_SYNC_ALL)
- tag_pages_for_writeback(mapping, index, end);
+ buffer_tree_tag_for_writeback(fs_info, index, end);
while (!done && !nr_to_write_done && (index <= end) &&
- (nr_folios = filemap_get_folios_tag(mapping, &index, end,
- tag, &fbatch))) {
- unsigned i;
+ (nr_ebs = buffer_tree_get_ebs_tag(fs_info, &index, end, tag, &batch))) {
+ struct extent_buffer *eb;
- for (i = 0; i < nr_folios; i++) {
- struct folio *folio = fbatch.folios[i];
+ while ((eb = eb_batch_next(&batch)) != NULL) {
+ ctx.eb = eb;
+
+ ret = btrfs_check_meta_write_pointer(eb->fs_info, &ctx);
+ if (ret) {
+ if (ret == -EBUSY)
+ ret = 0;
- ret = submit_eb_page(folio, &ctx);
- if (ret == 0)
+ if (ret) {
+ done = 1;
+ break;
+ }
continue;
- if (ret < 0) {
- done = 1;
- break;
}
- /*
- * the filesystem may choose to bump up nr_to_write.
- * We have to make sure to honor the new nr_to_write
- * at any time
- */
- nr_to_write_done = wbc->nr_to_write <= 0;
+ if (!lock_extent_buffer_for_io(eb, wbc))
+ continue;
+
+ /* Implies write in zoned mode. */
+ if (ctx.zoned_bg) {
+ /* Mark the last eb in the block group. */
+ btrfs_schedule_zone_finish_bg(ctx.zoned_bg, eb);
+ ctx.zoned_bg->meta_write_pointer += eb->len;
+ }
+ write_one_eb(eb, wbc);
}
- folio_batch_release(&fbatch);
+ nr_to_write_done = (wbc->nr_to_write <= 0);
+ eb_batch_release(&batch);
cond_resched();
}
if (!scanned && !done) {
@@ -2349,10 +2482,7 @@ static int extent_write_cache_pages(struct address_space *mapping,
&BTRFS_I(inode)->runtime_flags))
wbc->tagged_writepages = 1;
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag = PAGECACHE_TAG_TOWRITE;
- else
- tag = PAGECACHE_TAG_DIRTY;
+ tag = wbc_to_tag(wbc);
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
@@ -2396,7 +2526,7 @@ retry:
* In above case, [32K, 96K) is asynchronously submitted
* for compression, and [124K, 128K) needs to be written back.
*
- * If we didn't wait wrtiteback for page 64K, [128K, 128K)
+		 * If we didn't wait for writeback of page 64K, [124K, 128K)
* won't be submitted as the page still has writeback flag
* and will be skipped in the next check.
*
@@ -2508,7 +2638,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
continue;
}
- cur_end = min_t(u64, folio_pos(folio) + folio_size(folio) - 1, end);
+ cur_end = min_t(u64, folio_next_pos(folio) - 1, end);
cur_len = cur_end + 1 - cur;
ASSERT(folio_test_locked(folio));
@@ -2560,24 +2690,27 @@ int btrfs_writepages(struct address_space *mapping, struct writeback_control *wb
void btrfs_readahead(struct readahead_control *rac)
{
- struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD };
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .opf = REQ_OP_READ | REQ_RAHEAD,
+ .ractl = rac,
+ .last_em_start = U64_MAX,
+ };
struct folio *folio;
struct btrfs_inode *inode = BTRFS_I(rac->mapping->host);
const u64 start = readahead_pos(rac);
const u64 end = start + readahead_length(rac) - 1;
struct extent_state *cached_state = NULL;
struct extent_map *em_cached = NULL;
- u64 prev_em_start = (u64)-1;
lock_extents_for_read(inode, start, end, &cached_state);
while ((folio = readahead_folio(rac)) != NULL)
- btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
+ btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
- unlock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
if (em_cached)
- free_extent_map(em_cached);
+ btrfs_free_extent_map(em_cached);
submit_one_bio(&bio_ctrl);
}
@@ -2601,7 +2734,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
if (start > end)
return 0;
- lock_extent(tree, start, end, &cached_state);
+ btrfs_lock_extent(tree, start, end, &cached_state);
folio_wait_writeback(folio);
/*
@@ -2609,46 +2742,54 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
* so here we only need to unlock the extent range to free any
* existing extent state.
*/
- unlock_extent(tree, start, end, &cached_state);
+ btrfs_unlock_extent(tree, start, end, &cached_state);
return 0;
}
/*
- * a helper for release_folio, this tests for areas of the page that
- * are locked or under IO and drops the related state bits if it is safe
- * to drop the page.
+ * A helper for struct address_space_operations::release_folio, this tests for
+ * areas of the folio that are locked or under IO and drops the related state
+ * bits if it is safe to drop the folio.
*/
static bool try_release_extent_state(struct extent_io_tree *tree,
struct folio *folio)
{
+ struct extent_state *cached_state = NULL;
u64 start = folio_pos(folio);
u64 end = start + folio_size(folio) - 1;
- bool ret;
+ u32 range_bits;
+ u32 clear_bits;
+ bool ret = false;
+ int ret2;
- if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) {
- ret = false;
- } else {
- u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
- EXTENT_DELALLOC_NEW | EXTENT_CTLBITS |
- EXTENT_QGROUP_RESERVED);
- int ret2;
+ btrfs_get_range_bits(tree, start, end, &range_bits, &cached_state);
- /*
- * At this point we can safely clear everything except the
- * locked bit, the nodatasum bit and the delalloc new bit.
- * The delalloc new bit will be cleared by ordered extent
- * completion.
- */
- ret2 = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL);
+ /*
+ * We can release the folio if it's locked only for ordered extent
+ * completion, since that doesn't require using the folio.
+ */
+ if ((range_bits & EXTENT_LOCKED) &&
+ !(range_bits & EXTENT_FINISHING_ORDERED))
+ goto out;
+
+ clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW |
+ EXTENT_CTLBITS | EXTENT_QGROUP_RESERVED |
+ EXTENT_FINISHING_ORDERED);
+ /*
+ * At this point we can safely clear everything except the locked,
+ * nodatasum, delalloc new and finishing ordered bits. The delalloc new
+ * bit will be cleared by ordered extent completion.
+ */
+ ret2 = btrfs_clear_extent_bit(tree, start, end, clear_bits, &cached_state);
+ /*
+	 * If btrfs_clear_extent_bit() failed due to -ENOMEM, we can't allow
+	 * the release to continue.
+ */
+ if (ret2 == 0)
+ ret = true;
+out:
+ btrfs_free_extent_state(cached_state);
- /* if clear_extent_bit failed for enomem reasons,
- * we can't allow the release to continue.
- */
- if (ret2 < 0)
- ret = false;
- else
- ret = true;
- }
return ret;
}
@@ -2671,18 +2812,19 @@ bool try_release_extent_mapping(struct folio *folio, gfp_t mask)
struct extent_map *em;
write_lock(&extent_tree->lock);
- em = lookup_extent_mapping(extent_tree, start, len);
+ em = btrfs_lookup_extent_mapping(extent_tree, start, len);
if (!em) {
write_unlock(&extent_tree->lock);
break;
}
if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) {
write_unlock(&extent_tree->lock);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
break;
}
- if (test_range_bit_exists(io_tree, em->start,
- extent_map_end(em) - 1, EXTENT_LOCKED))
+ if (btrfs_test_range_bit_exists(io_tree, em->start,
+ btrfs_extent_map_end(em) - 1,
+ EXTENT_LOCKED))
goto next;
/*
* If it's not in the list of modified extents, used by a fast
@@ -2709,15 +2851,15 @@ remove_em:
* fsync performance for workloads with a data size that exceeds
* or is close to the system's memory).
*/
- remove_extent_mapping(inode, em);
+ btrfs_remove_extent_mapping(inode, em);
/* Once for the inode's extent map tree. */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
next:
- start = extent_map_end(em);
+ start = btrfs_extent_map_end(em);
write_unlock(&extent_tree->lock);
/* Once for us, for the lookup_extent_mapping() reference. */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
if (need_resched()) {
/*
@@ -2741,13 +2883,13 @@ static int extent_buffer_under_io(const struct extent_buffer *eb)
static bool folio_range_has_eb(struct folio *folio)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
lockdep_assert_held(&folio->mapping->i_private_lock);
if (folio_test_private(folio)) {
- subpage = folio_get_private(folio);
- if (atomic_read(&subpage->eb_refs))
+ bfs = folio_get_private(folio);
+ if (atomic_read(&bfs->eb_refs))
return true;
}
return false;
@@ -2756,6 +2898,7 @@ static bool folio_range_has_eb(struct folio *folio)
static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct address_space *mapping = folio->mapping;
const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
/*
@@ -2763,21 +2906,20 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
* be done under the i_private_lock.
*/
if (mapped)
- spin_lock(&folio->mapping->i_private_lock);
+ spin_lock(&mapping->i_private_lock);
if (!folio_test_private(folio)) {
if (mapped)
- spin_unlock(&folio->mapping->i_private_lock);
+ spin_unlock(&mapping->i_private_lock);
return;
}
if (!btrfs_meta_is_subpage(fs_info)) {
/*
- * We do this since we'll remove the pages after we've
- * removed the eb from the radix tree, so we could race
- * and have this page now attached to the new eb. So
- * only clear folio if it's still connected to
- * this eb.
+ * We do this since we'll remove the pages after we've removed
+ * the eb from the xarray, so we could race and have this page
+ * now attached to the new eb. So only clear folio if it's
+ * still connected to this eb.
*/
if (folio_test_private(folio) && folio_get_private(folio) == eb) {
BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
@@ -2787,7 +2929,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
folio_detach_private(folio);
}
if (mapped)
- spin_unlock(&folio->mapping->i_private_lock);
+ spin_unlock(&mapping->i_private_lock);
return;
}
@@ -2797,7 +2939,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
* attached to one dummy eb, no sharing.
*/
if (!mapped) {
- btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
+ btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA);
return;
}
@@ -2808,9 +2950,9 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
* page range and no unfinished IO.
*/
if (!folio_range_has_eb(folio))
- btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
+ btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA);
- spin_unlock(&folio->mapping->i_private_lock);
+ spin_unlock(&mapping->i_private_lock);
}
/* Release all folios attached to the extent buffer */
@@ -2825,9 +2967,6 @@ static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb)
continue;
detach_extent_buffer_folio(eb, folio);
-
- /* One for when we allocated the folio. */
- folio_put(folio);
}
}
@@ -2855,16 +2994,35 @@ static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info
btrfs_leak_debug_add_eb(eb);
spin_lock_init(&eb->refs_lock);
- atomic_set(&eb->refs, 1);
+ refcount_set(&eb->refs, 1);
ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE);
return eb;
}
+/*
+ * For use in eb allocation error cleanup paths, as btrfs_release_extent_buffer()
+ * does not call folio_put(), and we need to set the folios to NULL so that
+ * btrfs_release_extent_buffer() will not detach them a second time.
+ */
+static void cleanup_extent_buffer_folios(struct extent_buffer *eb)
+{
+ const int num_folios = num_extent_folios(eb);
+
+ /* We cannot use num_extent_folios() as loop bound as eb->folios changes. */
+ for (int i = 0; i < num_folios; i++) {
+ ASSERT(eb->folios[i]);
+ detach_extent_buffer_folio(eb, eb->folios[i]);
+ folio_put(eb->folios[i]);
+ eb->folios[i] = NULL;
+ }
+}
+
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{
struct extent_buffer *new;
+ int num_folios;
int ret;
new = __alloc_extent_buffer(src->fs_info, src->start);
@@ -2879,25 +3037,34 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
ret = alloc_eb_folio_array(new, false);
- if (ret) {
- btrfs_release_extent_buffer(new);
- return NULL;
- }
+ if (ret)
+ goto release_eb;
- for (int i = 0; i < num_extent_folios(src); i++) {
+ ASSERT(num_extent_folios(src) == num_extent_folios(new),
+ "%d != %d", num_extent_folios(src), num_extent_folios(new));
+	/* Explicitly use the cached num_extent_folios() value from now on. */
+ num_folios = num_extent_folios(src);
+ for (int i = 0; i < num_folios; i++) {
struct folio *folio = new->folios[i];
ret = attach_extent_buffer_folio(new, folio, NULL);
- if (ret < 0) {
- btrfs_release_extent_buffer(new);
- return NULL;
- }
+ if (ret < 0)
+ goto cleanup_folios;
WARN_ON(folio_test_dirty(folio));
}
+ for (int i = 0; i < num_folios; i++)
+ folio_put(new->folios[i]);
+
copy_extent_buffer_full(new, src);
set_extent_buffer_uptodate(new);
return new;
+
+cleanup_folios:
+ cleanup_extent_buffer_folios(new);
+release_eb:
+ btrfs_release_extent_buffer(new);
+ return NULL;
}
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
@@ -2912,13 +3079,15 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
ret = alloc_eb_folio_array(eb, false);
if (ret)
- goto out;
+ goto release_eb;
for (int i = 0; i < num_extent_folios(eb); i++) {
ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL);
if (ret < 0)
- goto out_detach;
+ goto cleanup_folios;
}
+ for (int i = 0; i < num_extent_folios(eb); i++)
+ folio_put(eb->folios[i]);
set_extent_buffer_uptodate(eb);
btrfs_set_header_nritems(eb, 0);
@@ -2926,15 +3095,10 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
-out_detach:
- for (int i = 0; i < num_extent_folios(eb); i++) {
- if (eb->folios[i]) {
- detach_extent_buffer_folio(eb, eb->folios[i]);
- folio_put(eb->folios[i]);
- }
- }
-out:
- kmem_cache_free(extent_buffer_cache, eb);
+cleanup_folios:
+ cleanup_extent_buffer_folios(eb);
+release_eb:
+ btrfs_release_extent_buffer(eb);
return NULL;
}
@@ -2942,9 +3106,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
{
int refs;
/*
- * The TREE_REF bit is first set when the extent_buffer is added
- * to the radix tree. It is also reset, if unset, when a new reference
- * is created by find_extent_buffer.
+ * The TREE_REF bit is first set when the extent_buffer is added to the
+ * xarray. It is also reset, if unset, when a new reference is created
+ * by find_extent_buffer.
*
* It is only cleared in two cases: freeing the last non-tree
* reference to the extent_buffer when its STALE bit is set or
@@ -2956,21 +3120,20 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
* conditions between the calls to check_buffer_tree_ref in those
* codepaths and clearing TREE_REF in try_release_extent_buffer.
*
- * The actual lifetime of the extent_buffer in the radix tree is
- * adequately protected by the refcount, but the TREE_REF bit and
- * its corresponding reference are not. To protect against this
- * class of races, we call check_buffer_tree_ref from the codepaths
- * which trigger io. Note that once io is initiated, TREE_REF can no
- * longer be cleared, so that is the moment at which any such race is
- * best fixed.
+ * The actual lifetime of the extent_buffer in the xarray is adequately
+ * protected by the refcount, but the TREE_REF bit and its corresponding
+ * reference are not. To protect against this class of races, we call
+ * check_buffer_tree_ref() from the code paths which trigger io. Note that
+ * once io is initiated, TREE_REF can no longer be cleared, so that is
+ * the moment at which any such race is best fixed.
*/
- refs = atomic_read(&eb->refs);
+ refs = refcount_read(&eb->refs);
if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
return;
spin_lock(&eb->refs_lock);
if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
- atomic_inc(&eb->refs);
+ refcount_inc(&eb->refs);
spin_unlock(&eb->refs_lock);
}
@@ -3026,30 +3189,29 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
return ERR_PTR(-ENOMEM);
eb->fs_info = fs_info;
again:
- ret = radix_tree_preload(GFP_NOFS);
- if (ret) {
- exists = ERR_PTR(ret);
- goto free_eb;
+ xa_lock_irq(&fs_info->buffer_tree);
+ exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->nodesize_bits,
+ NULL, eb, GFP_NOFS);
+ if (xa_is_err(exists)) {
+ ret = xa_err(exists);
+ xa_unlock_irq(&fs_info->buffer_tree);
+ btrfs_release_extent_buffer(eb);
+ return ERR_PTR(ret);
}
- spin_lock(&fs_info->buffer_lock);
- ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> fs_info->sectorsize_bits, eb);
- spin_unlock(&fs_info->buffer_lock);
- radix_tree_preload_end();
- if (ret == -EEXIST) {
- exists = find_extent_buffer(fs_info, start);
- if (exists)
- goto free_eb;
- else
+ if (exists) {
+ if (!refcount_inc_not_zero(&exists->refs)) {
+ /* The extent buffer is being freed, retry. */
+ xa_unlock_irq(&fs_info->buffer_tree);
goto again;
+ }
+ xa_unlock_irq(&fs_info->buffer_tree);
+ btrfs_release_extent_buffer(eb);
+ return exists;
}
+ xa_unlock_irq(&fs_info->buffer_tree);
check_buffer_tree_ref(eb);
- set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
return eb;
-free_eb:
- btrfs_release_extent_buffer(eb);
- return exists;
#else
/* Stub to avoid linker error when compiled with optimizations turned off. */
return NULL;
@@ -3064,9 +3226,9 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
lockdep_assert_held(&folio->mapping->i_private_lock);
/*
- * For subpage case, we completely rely on radix tree to ensure we
- * don't try to insert two ebs for the same bytenr. So here we always
- * return NULL and just continue.
+ * For subpage case, we completely rely on xarray to ensure we don't try
+ * to insert two ebs for the same bytenr. So here we always return NULL
+ * and just continue.
*/
if (btrfs_meta_is_subpage(fs_info))
return NULL;
@@ -3082,7 +3244,7 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
* just overwrite folio private.
*/
exists = folio_get_private(folio);
- if (atomic_inc_not_zero(&exists->refs))
+ if (refcount_inc_not_zero(&exists->refs))
return exists;
WARN_ON(folio_test_dirty(folio));
@@ -3095,30 +3257,30 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
*/
static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
{
- if (!IS_ALIGNED(start, fs_info->sectorsize)) {
+ const u32 nodesize = fs_info->nodesize;
+
+ if (unlikely(!IS_ALIGNED(start, fs_info->sectorsize))) {
btrfs_err(fs_info, "bad tree block start %llu", start);
return true;
}
- if (fs_info->nodesize < PAGE_SIZE &&
- offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) {
+ if (unlikely(nodesize < PAGE_SIZE && !IS_ALIGNED(start, nodesize))) {
btrfs_err(fs_info,
- "tree block crosses page boundary, start %llu nodesize %u",
- start, fs_info->nodesize);
+ "tree block is not nodesize aligned, start %llu nodesize %u",
+ start, nodesize);
return true;
}
- if (fs_info->nodesize >= PAGE_SIZE &&
- !PAGE_ALIGNED(start)) {
+ if (unlikely(nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start))) {
btrfs_err(fs_info,
"tree block is not page aligned, start %llu nodesize %u",
- start, fs_info->nodesize);
+ start, nodesize);
return true;
}
- if (!IS_ALIGNED(start, fs_info->nodesize) &&
- !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) {
+ if (unlikely(!IS_ALIGNED(start, nodesize) &&
+ !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags))) {
btrfs_warn(fs_info,
"tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance",
- start, fs_info->nodesize);
+ start, nodesize);
}
return false;
}
@@ -3132,14 +3294,14 @@ static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
* The caller needs to free the existing folios and retry using the same order.
*/
static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
- struct btrfs_subpage *prealloc,
+ struct btrfs_folio_state *prealloc,
struct extent_buffer **found_eb_ret)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
- const unsigned long index = eb->start >> PAGE_SHIFT;
- struct folio *existing_folio = NULL;
+ const pgoff_t index = eb->start >> PAGE_SHIFT;
+ struct folio *existing_folio;
int ret;
ASSERT(found_eb_ret);
@@ -3148,6 +3310,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
ASSERT(eb->folios[i]);
retry:
+ existing_folio = NULL;
ret = filemap_add_folio(mapping, eb->folios[i], index + i,
GFP_NOFS | __GFP_NOFAIL);
if (!ret)
@@ -3155,10 +3318,8 @@ retry:
existing_folio = filemap_lock_folio(mapping, index + i);
/* The page cache only exists for a very short time, just retry. */
- if (IS_ERR(existing_folio)) {
- existing_folio = NULL;
+ if (IS_ERR(existing_folio))
goto retry;
- }
/* For now, we should only have single-page folios for btree inode. */
ASSERT(folio_nr_pages(existing_folio) == 1);
@@ -3199,7 +3360,7 @@ finish:
/*
	 * To inform that we have an extra eb under allocation, so that
* detach_extent_buffer_page() won't release the folio private when the
- * eb hasn't been inserted into radix tree yet.
+ * eb hasn't been inserted into the xarray yet.
*
* The ref will be decreased when the eb releases the page, in
* detach_extent_buffer_page(). Thus needs no special handling in the
@@ -3216,7 +3377,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
int attached = 0;
struct extent_buffer *eb;
struct extent_buffer *existing_eb = NULL;
- struct btrfs_subpage *prealloc = NULL;
+ struct btrfs_folio_state *prealloc = NULL;
u64 lockdep_owner = owner_root;
bool page_contig = true;
int uptodate = 1;
@@ -3261,7 +3422,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
* manually if we exit earlier.
*/
if (btrfs_meta_is_subpage(fs_info)) {
- prealloc = btrfs_alloc_subpage(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA);
+ prealloc = btrfs_alloc_folio_state(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA);
if (IS_ERR(prealloc)) {
ret = PTR_ERR(prealloc);
goto out;
@@ -3272,7 +3433,7 @@ reallocate:
/* Allocate all pages first. */
ret = alloc_eb_folio_array(eb, true);
if (ret < 0) {
- btrfs_free_subpage(prealloc);
+ btrfs_free_folio_state(prealloc);
goto out;
}
@@ -3306,7 +3467,7 @@ reallocate:
* using 0-order folios.
*/
if (unlikely(ret == -EAGAIN)) {
- ASSERT(0);
+ DEBUG_WARN("folio order mismatch between new eb and filemap");
goto reallocate;
}
attached++;
@@ -3333,10 +3494,9 @@ reallocate:
/*
* We can't unlock the pages just yet since the extent buffer
- * hasn't been properly inserted in the radix tree, this
- * opens a race with btree_release_folio which can free a page
- * while we are still filling in all pages for the buffer and
- * we could crash.
+	 * hasn't been properly inserted into the xarray; this opens a
+	 * race with btree_release_folio(), which can free a page while we
+	 * are still filling in all pages for the buffer and we could crash.
*/
}
if (uptodate)
@@ -3345,38 +3505,46 @@ reallocate:
if (page_contig)
eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start);
again:
- ret = radix_tree_preload(GFP_NOFS);
- if (ret)
+ xa_lock_irq(&fs_info->buffer_tree);
+ existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
+ start >> fs_info->nodesize_bits, NULL, eb,
+ GFP_NOFS);
+ if (xa_is_err(existing_eb)) {
+ ret = xa_err(existing_eb);
+ xa_unlock_irq(&fs_info->buffer_tree);
goto out;
-
- spin_lock(&fs_info->buffer_lock);
- ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> fs_info->sectorsize_bits, eb);
- spin_unlock(&fs_info->buffer_lock);
- radix_tree_preload_end();
- if (ret == -EEXIST) {
- ret = 0;
- existing_eb = find_extent_buffer(fs_info, start);
- if (existing_eb)
- goto out;
- else
+ }
+ if (existing_eb) {
+ if (!refcount_inc_not_zero(&existing_eb->refs)) {
+ xa_unlock_irq(&fs_info->buffer_tree);
goto again;
+ }
+ xa_unlock_irq(&fs_info->buffer_tree);
+ goto out;
}
+ xa_unlock_irq(&fs_info->buffer_tree);
+
/* add one reference for the tree */
check_buffer_tree_ref(eb);
- set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
/*
* Now it's safe to unlock the pages because any calls to
* btree_release_folio will correctly detect that a page belongs to a
	 * live buffer and won't free it prematurely.
*/
- for (int i = 0; i < num_extent_folios(eb); i++)
+ for (int i = 0; i < num_extent_folios(eb); i++) {
folio_unlock(eb->folios[i]);
+ /*
+ * A folio that has been added to an address_space mapping
+ * should not continue holding the refcount from its original
+ * allocation indefinitely.
+ */
+ folio_put(eb->folios[i]);
+ }
return eb;
out:
- WARN_ON(!atomic_dec_and_test(&eb->refs));
+ WARN_ON(!refcount_dec_and_test(&eb->refs));
/*
* Any attached folios need to be detached before we unlock them. This
@@ -3386,26 +3554,22 @@ out:
* want that to grab this eb, as we're getting ready to free it. So we
* have to detach it first and then unlock it.
*
- * We have to drop our reference and NULL it out here because in the
- * subpage case detaching does a btrfs_folio_dec_eb_refs() for our eb.
- * Below when we call btrfs_release_extent_buffer() we will call
- * detach_extent_buffer_folio() on our remaining pages in the !subpage
- * case. If we left eb->folios[i] populated in the subpage case we'd
- * double put our reference and be super sad.
+	 * Note: the bound is num_extent_pages(), as we need to go through all slots.
*/
- for (int i = 0; i < attached; i++) {
- ASSERT(eb->folios[i]);
- detach_extent_buffer_folio(eb, eb->folios[i]);
- folio_unlock(eb->folios[i]);
- folio_put(eb->folios[i]);
+ for (int i = 0; i < num_extent_pages(eb); i++) {
+ struct folio *folio = eb->folios[i];
+
+ if (i < attached) {
+ ASSERT(folio);
+ detach_extent_buffer_folio(eb, folio);
+ folio_unlock(folio);
+ } else if (!folio) {
+ continue;
+ }
+
+ folio_put(folio);
eb->folios[i] = NULL;
}
- /*
- * Now all pages of that extent buffer is unmapped, set UNMAPPED flag,
- * so it can be cleaned up without utilizing folio->mapping.
- */
- set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
-
btrfs_release_extent_buffer(eb);
if (ret < 0)
return ERR_PTR(ret);
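
The __xa_cmpxchg(..., NULL, eb, ...) above is the xarray idiom for "insert if the slot is empty, otherwise hand back the current occupant". A self-contained sketch of that shape, with illustrative names not taken from the patch:

	#include <linux/err.h>
	#include <linux/xarray.h>

	/*
	 * Returns NULL on a successful insert, the existing entry otherwise,
	 * or an ERR_PTR() if the xarray could not allocate internal nodes.
	 */
	static void *insert_or_find(struct xarray *tree, unsigned long index,
				    void *new)
	{
		void *old;

		xa_lock_irq(tree);
		old = __xa_cmpxchg(tree, index, NULL, new, GFP_NOFS);
		xa_unlock_irq(tree);

		if (xa_is_err(old))
			return ERR_PTR(xa_err(old));
		return old;
	}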
@@ -3426,20 +3590,28 @@ static int release_extent_buffer(struct extent_buffer *eb)
{
lockdep_assert_held(&eb->refs_lock);
- WARN_ON(atomic_read(&eb->refs) == 0);
- if (atomic_dec_and_test(&eb->refs)) {
- if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
- struct btrfs_fs_info *fs_info = eb->fs_info;
+ if (refcount_dec_and_test(&eb->refs)) {
+ struct btrfs_fs_info *fs_info = eb->fs_info;
- spin_unlock(&eb->refs_lock);
+ spin_unlock(&eb->refs_lock);
- spin_lock(&fs_info->buffer_lock);
- radix_tree_delete(&fs_info->buffer_radix,
- eb->start >> fs_info->sectorsize_bits);
- spin_unlock(&fs_info->buffer_lock);
- } else {
- spin_unlock(&eb->refs_lock);
- }
+ /*
+	 * We're erasing; in theory there will be no allocations, so just
+	 * use GFP_ATOMIC.
+ *
+ * We use cmpxchg instead of erase because we do not know if
+ * this eb is actually in the tree or not, we could be cleaning
+ * up an eb that we allocated but never inserted into the tree.
+ * Thus use cmpxchg to remove it from the tree if it is there,
+ * or leave the other entry if this isn't in the tree.
+ *
+	 * The documentation says that storing a NULL value is the same as
+	 * an erase as long as XA_FLAGS_ALLOC is not set, which it isn't in
+	 * this case.
+ */
+ xa_cmpxchg_irq(&fs_info->buffer_tree,
+ eb->start >> fs_info->nodesize_bits, eb, NULL,
+ GFP_ATOMIC);
btrfs_leak_debug_del_eb(eb);
/* Should be safe to release folios at this point. */
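
Expanding on the comment in the hunk above: a plain erase would clear the slot even if this eb was never inserted and another eb now occupies the index. The cmpxchg form removes the entry only when the slot still points at this eb (index standing in for eb->start >> fs_info->nodesize_bits):

	/* Wrong: clears the slot even if another eb has replaced ours. */
	xa_erase_irq(&fs_info->buffer_tree, index);

	/* Right: removes the entry only if the slot still holds @eb; storing
	 * NULL acts as an erase because XA_FLAGS_ALLOC is not set here.
	 */
	xa_cmpxchg_irq(&fs_info->buffer_tree, index, eb, NULL, GFP_ATOMIC);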
@@ -3464,22 +3636,26 @@ void free_extent_buffer(struct extent_buffer *eb)
if (!eb)
return;
- refs = atomic_read(&eb->refs);
+ refs = refcount_read(&eb->refs);
while (1) {
- if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
- || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
- refs == 1))
+ if (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) {
+ if (refs == 1)
+ break;
+ } else if (refs <= 3) {
break;
- if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1))
+ }
+
+ /* Optimization to avoid locking eb->refs_lock. */
+ if (atomic_try_cmpxchg(&eb->refs.refs, &refs, refs - 1))
return;
}
spin_lock(&eb->refs_lock);
- if (atomic_read(&eb->refs) == 2 &&
+ if (refcount_read(&eb->refs) == 2 &&
test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
!extent_buffer_under_io(eb) &&
test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
- atomic_dec(&eb->refs);
+ refcount_dec(&eb->refs);
/*
* I know this is terrible, but it's temporary until we stop tracking
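
One subtlety in the hunk above: refcount_t has no try-cmpxchg helper, so the lockless fast path reaches into the wrapped counter directly. That works because refcount_t is a thin wrapper (paraphrasing include/linux/refcount.h):

	typedef struct refcount_struct {
		atomic_t refs;
	} refcount_t;

	/* Hence the decrement-from-snapshot step can use the atomic_t member: */
	if (atomic_try_cmpxchg(&eb->refs.refs, &refs, refs - 1))
		return;	/* dropped one ref without taking eb->refs_lock */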
@@ -3496,9 +3672,9 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
spin_lock(&eb->refs_lock);
set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
- if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
+ if (refcount_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
- atomic_dec(&eb->refs);
+ refcount_dec(&eb->refs);
release_extent_buffer(eb);
}
@@ -3508,8 +3684,8 @@ static void btree_clear_folio_dirty_tag(struct folio *folio)
ASSERT(folio_test_locked(folio));
xa_lock_irq(&folio->mapping->i_pages);
if (!folio_test_dirty(folio))
- __xa_clear_mark(&folio->mapping->i_pages,
- folio_index(folio), PAGECACHE_TAG_DIRTY);
+ __xa_clear_mark(&folio->mapping->i_pages, folio->index,
+ PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&folio->mapping->i_pages);
}
@@ -3540,6 +3716,7 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
return;
+ buffer_tree_clear_mark(eb, PAGECACHE_TAG_DIRTY);
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len,
fs_info->dirty_metadata_batch);
@@ -3555,7 +3732,7 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
btree_clear_folio_dirty_tag(folio);
folio_unlock(folio);
}
- WARN_ON(atomic_read(&eb->refs) == 0);
+ WARN_ON(refcount_read(&eb->refs) == 0);
}
void set_extent_buffer_dirty(struct extent_buffer *eb)
@@ -3566,7 +3743,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb)
was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
- WARN_ON(atomic_read(&eb->refs) == 0);
+ WARN_ON(refcount_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags));
@@ -3588,6 +3765,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb)
folio_lock(eb->folios[0]);
for (int i = 0; i < num_extent_folios(eb); i++)
btrfs_meta_folio_set_dirty(eb->folios[i], eb);
+ buffer_tree_set_mark(eb, PAGECACHE_TAG_DIRTY);
if (subpage)
folio_unlock(eb->folios[0]);
percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes,
@@ -3624,9 +3802,7 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb)
static void clear_extent_buffer_reading(struct extent_buffer *eb)
{
- clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
- smp_mb__after_atomic();
- wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+ clear_and_wake_up_bit(EXTENT_BUFFER_READING, &eb->bflags);
}
static void end_bbio_meta_read(struct btrfs_bio *bbio)
@@ -3647,12 +3823,10 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio)
btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0)
uptodate = false;
- if (uptodate) {
+ if (uptodate)
set_extent_buffer_uptodate(eb);
- } else {
+ else
clear_extent_buffer_uptodate(eb);
- set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
- }
clear_extent_buffer_reading(eb);
free_extent_buffer(eb);
@@ -3663,6 +3837,7 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio)
int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num,
const struct btrfs_tree_parent_check *check)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
struct btrfs_bio *bbio;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
@@ -3691,22 +3866,19 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num,
return 0;
}
- clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = 0;
check_buffer_tree_ref(eb);
- atomic_inc(&eb->refs);
+ refcount_inc(&eb->refs);
bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
- REQ_OP_READ | REQ_META, eb->fs_info,
- end_bbio_meta_read, eb);
+ REQ_OP_READ | REQ_META, BTRFS_I(fs_info->btree_inode),
+ eb->start, end_bbio_meta_read, eb);
bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
- bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
- bbio->file_offset = eb->start;
memcpy(&bbio->parent_check, check, sizeof(*check));
for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
u64 range_start = max_t(u64, eb->start, folio_pos(folio));
- u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio),
+ u32 range_len = min_t(u64, folio_next_pos(folio),
eb->start + eb->len) - range_start;
bio_add_folio_nofail(&bbio->bio, folio, range_len,
@@ -3726,7 +3898,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num,
return ret;
wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE);
- if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
+ if (unlikely(!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)))
return -EIO;
return 0;
}
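
clear_and_wake_up_bit() is exactly the three-line sequence it replaces above (clear_bit(), smp_mb__after_atomic(), wake_up_bit()), and it pairs with the waiter in this hunk. The two sides of the handshake, side by side:

	/* Completion side (end_bbio_meta_read): */
	clear_and_wake_up_bit(EXTENT_BUFFER_READING, &eb->bflags);

	/* Waiter side (read_extent_buffer_pages): */
	wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE);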
@@ -3737,7 +3909,7 @@ static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
btrfs_warn(eb->fs_info,
"access to eb bytenr %llu len %u out of range start %lu len %lu",
eb->start, eb->len, start, len);
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ DEBUG_WARN();
return true;
}
@@ -4085,8 +4257,8 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
* @start: offset of the bitmap item in the extent buffer
* @nr: bit number to test
*/
-int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
- unsigned long nr)
+bool extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
+ unsigned long nr)
{
unsigned long i;
size_t offset;
@@ -4273,82 +4445,29 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
}
}
-#define GANG_LOOKUP_SIZE 16
-static struct extent_buffer *get_next_extent_buffer(
- const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr)
-{
- struct extent_buffer *gang[GANG_LOOKUP_SIZE];
- struct extent_buffer *found = NULL;
- u64 folio_start = folio_pos(folio);
- u64 cur = folio_start;
-
- ASSERT(in_range(bytenr, folio_start, PAGE_SIZE));
- lockdep_assert_held(&fs_info->buffer_lock);
-
- while (cur < folio_start + PAGE_SIZE) {
- int ret;
- int i;
-
- ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
- (void **)gang, cur >> fs_info->sectorsize_bits,
- min_t(unsigned int, GANG_LOOKUP_SIZE,
- PAGE_SIZE / fs_info->nodesize));
- if (ret == 0)
- goto out;
- for (i = 0; i < ret; i++) {
- /* Already beyond page end */
- if (gang[i]->start >= folio_start + PAGE_SIZE)
- goto out;
- /* Found one */
- if (gang[i]->start >= bytenr) {
- found = gang[i];
- goto out;
- }
- }
- cur = gang[ret - 1]->start + gang[ret - 1]->len;
- }
-out:
- return found;
-}
-
static int try_release_subpage_extent_buffer(struct folio *folio)
{
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
- u64 cur = folio_pos(folio);
- const u64 end = cur + PAGE_SIZE;
+ struct extent_buffer *eb;
+ unsigned long start = (folio_pos(folio) >> fs_info->nodesize_bits);
+ unsigned long index = start;
+ unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1;
int ret;
- while (cur < end) {
- struct extent_buffer *eb = NULL;
-
- /*
- * Unlike try_release_extent_buffer() which uses folio private
- * to grab buffer, for subpage case we rely on radix tree, thus
- * we need to ensure radix tree consistency.
- *
- * We also want an atomic snapshot of the radix tree, thus go
- * with spinlock rather than RCU.
- */
- spin_lock(&fs_info->buffer_lock);
- eb = get_next_extent_buffer(fs_info, folio, cur);
- if (!eb) {
- /* No more eb in the page range after or at cur */
- spin_unlock(&fs_info->buffer_lock);
- break;
- }
- cur = eb->start + eb->len;
-
+ rcu_read_lock();
+ xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) {
/*
* The same as try_release_extent_buffer(), to ensure the eb
* won't disappear out from under us.
*/
spin_lock(&eb->refs_lock);
- if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+ rcu_read_unlock();
+
+ if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
spin_unlock(&eb->refs_lock);
- spin_unlock(&fs_info->buffer_lock);
- break;
+ rcu_read_lock();
+ continue;
}
- spin_unlock(&fs_info->buffer_lock);
/*
* If tree ref isn't set then we know the ref on this eb is a
@@ -4366,7 +4485,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
* release_extent_buffer() will release the refs_lock.
*/
release_extent_buffer(eb);
+ rcu_read_lock();
}
+ rcu_read_unlock();
+
/*
	 * Finally, check whether we have cleared the folio private: if we have
	 * released all ebs in the page, the folio private should be cleared by now.
@@ -4378,7 +4500,6 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
ret = 0;
spin_unlock(&folio->mapping->i_private_lock);
return ret;
-
}
int try_release_extent_buffer(struct folio *folio)
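
The loop in try_release_subpage_extent_buffer() above follows a pattern worth spelling out: xa_for_each_range() needs only the RCU read lock, and the body drops RCU once the eb is pinned by its refs_lock, re-entering RCU before the next iteration. A skeletal sketch with illustrative names:

	rcu_read_lock();
	xa_for_each_range(tree, index, entry, start, end) {
		spin_lock(&entry->lock);
		rcu_read_unlock();	/* entry is now pinned by its lock */

		/* per-entry work; entry->lock is released in here */

		rcu_read_lock();	/* required before the next lookup */
	}
	rcu_read_unlock();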
@@ -4407,7 +4528,7 @@ int try_release_extent_buffer(struct folio *folio)
* this page.
*/
spin_lock(&eb->refs_lock);
- if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+ if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
spin_unlock(&eb->refs_lock);
spin_unlock(&folio->mapping->i_private_lock);
return 0;
@@ -4453,7 +4574,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
if (IS_ERR(eb))
return;
- if (btrfs_buffer_uptodate(eb, gen, 1)) {
+ if (btrfs_buffer_uptodate(eb, gen, true)) {
free_extent_buffer(eb);
return;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 2e261892c7bc..02ebb2f238af 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -12,7 +12,6 @@
#include <linux/rwsem.h>
#include <linux/list.h>
#include <linux/slab.h>
-#include "compression.h"
#include "messages.h"
#include "ulist.h"
#include "misc.h"
@@ -38,16 +37,10 @@ struct btrfs_tree_parent_check;
enum {
EXTENT_BUFFER_UPTODATE,
EXTENT_BUFFER_DIRTY,
- EXTENT_BUFFER_CORRUPT,
- /* this got triggered by readahead */
- EXTENT_BUFFER_READAHEAD,
EXTENT_BUFFER_TREE_REF,
EXTENT_BUFFER_STALE,
EXTENT_BUFFER_WRITEBACK,
- /* read IO error */
- EXTENT_BUFFER_READ_ERR,
EXTENT_BUFFER_UNMAPPED,
- EXTENT_BUFFER_IN_TREE,
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
/* Indicate the extent buffer is written zeroed out (for zoned) */
@@ -79,7 +72,7 @@ enum {
* single word in a bitmap may straddle two pages in the extent buffer.
*/
#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE)
-#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)
+#define BYTE_MASK ((1U << BITS_PER_BYTE) - 1)
#define BITMAP_FIRST_BYTE_MASK(start) \
((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK)
#define BITMAP_LAST_BYTE_MASK(nbits) \
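
(The 1U change makes the shift unsigned; the resulting value is unchanged since BITS_PER_BYTE is 8.) Worked examples, assuming the truncated BITMAP_LAST_BYTE_MASK() continues with its standard definition, (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))):

	/* BYTE_MASK == 0xff */
	BITMAP_FIRST_BYTE_MASK(3);	/* (0xff << 3) & 0xff == 0xf8: keeps bits 3..7 */
	BITMAP_LAST_BYTE_MASK(3);	/* 0xff >> 5 == 0x07: keeps bits 0..2 */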
@@ -104,7 +97,7 @@ struct extent_buffer {
void *addr;
spinlock_t refs_lock;
- atomic_t refs;
+ refcount_t refs;
int read_mirror;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
s8 log_index;
@@ -246,6 +239,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
+void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
void btrfs_readahead(struct readahead_control *rac);
int set_folio_extent_mapped(struct folio *folio);
void clear_folio_extent_mapped(struct folio *folio);
@@ -298,6 +292,8 @@ static inline int __pure num_extent_pages(const struct extent_buffer *eb)
*/
static inline int __pure num_extent_folios(const struct extent_buffer *eb)
{
+ if (!eb->folios[0])
+ return 0;
if (folio_order(eb->folios[0]))
return 1;
return num_extent_pages(eb);
@@ -348,8 +344,8 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
unsigned long len);
void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
unsigned long len);
-int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
- unsigned long pos);
+bool extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
+ unsigned long pos);
void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
unsigned long pos, unsigned long len);
void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
@@ -369,7 +365,8 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
bool nofail);
-int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array);
+int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order,
+ struct folio **folio_array);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 7f46abbd6311..7e38c23a0c1c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -13,7 +13,7 @@
static struct kmem_cache *extent_map_cache;
-int __init extent_map_init(void)
+int __init btrfs_extent_map_init(void)
{
extent_map_cache = kmem_cache_create("btrfs_extent_map",
sizeof(struct extent_map), 0, 0, NULL);
@@ -22,7 +22,7 @@ int __init extent_map_init(void)
return 0;
}
-void __cold extent_map_exit(void)
+void __cold btrfs_extent_map_exit(void)
{
kmem_cache_destroy(extent_map_cache);
}
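
For context, the init/exit pair above brackets a standard slab-cache lifecycle; every extent_map allocation and free in this file goes through the same cache. Condensed (the -ENOMEM check on creation is elided, as in the hunk):

	extent_map_cache = kmem_cache_create("btrfs_extent_map",
					     sizeof(struct extent_map), 0, 0, NULL);

	struct extent_map *em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
	kmem_cache_free(extent_map_cache, em);

	kmem_cache_destroy(extent_map_cache);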
@@ -31,7 +31,7 @@ void __cold extent_map_exit(void)
* Initialize the extent tree @tree. Should be called for each new inode or
* other user of the extent_map interface.
*/
-void extent_map_tree_init(struct extent_map_tree *tree)
+void btrfs_extent_map_tree_init(struct extent_map_tree *tree)
{
tree->root = RB_ROOT;
INIT_LIST_HEAD(&tree->modified_extents);
@@ -42,7 +42,7 @@ void extent_map_tree_init(struct extent_map_tree *tree)
* Allocate a new extent_map structure. The new structure is returned with a
* reference count of one and needs to be freed using free_extent_map()
*/
-struct extent_map *alloc_extent_map(void)
+struct extent_map *btrfs_alloc_extent_map(void)
{
struct extent_map *em;
em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
@@ -58,12 +58,12 @@ struct extent_map *alloc_extent_map(void)
* Drop the reference out on @em by one and free the structure if the reference
* count hits zero.
*/
-void free_extent_map(struct extent_map *em)
+void btrfs_free_extent_map(struct extent_map *em)
{
if (!em)
return;
if (refcount_dec_and_test(&em->refs)) {
- WARN_ON(extent_map_in_tree(em));
+ WARN_ON(btrfs_extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
kmem_cache_free(extent_map_cache, em);
}
@@ -84,7 +84,7 @@ static void remove_em(struct btrfs_inode *inode, struct extent_map *em)
rb_erase(&em->rb_node, &inode->extent_tree.root);
RB_CLEAR_NODE(&em->rb_node);
- if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root)))
+ if (!btrfs_is_testing(fs_info) && btrfs_is_fstree(btrfs_root_id(inode->root)))
percpu_counter_dec(&fs_info->evictable_extent_maps);
}
@@ -102,19 +102,19 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
if (em->start < entry->start)
p = &(*p)->rb_left;
- else if (em->start >= extent_map_end(entry))
+ else if (em->start >= btrfs_extent_map_end(entry))
p = &(*p)->rb_right;
else
return -EEXIST;
}
orig_parent = parent;
- while (parent && em->start >= extent_map_end(entry)) {
+ while (parent && em->start >= btrfs_extent_map_end(entry)) {
parent = rb_next(parent);
entry = rb_entry(parent, struct extent_map, rb_node);
}
if (parent)
- if (end > entry->start && em->start < extent_map_end(entry))
+ if (end > entry->start && em->start < btrfs_extent_map_end(entry))
return -EEXIST;
parent = orig_parent;
@@ -124,7 +124,7 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
entry = rb_entry(parent, struct extent_map, rb_node);
}
if (parent)
- if (end > entry->start && em->start < extent_map_end(entry))
+ if (end > entry->start && em->start < btrfs_extent_map_end(entry))
return -EEXIST;
rb_link_node(&em->rb_node, orig_parent, p);
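
tree_insert() above is the canonical rbtree insertion walk plus overlap rejection. Stripped of the overlap checks, the canonical core it follows is:

	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct extent_map *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct extent_map, rb_node);
		if (em->start < entry->start)
			p = &(*p)->rb_left;
		else
			p = &(*p)->rb_right;
	}
	rb_link_node(&em->rb_node, parent, p);
	rb_insert_color(&em->rb_node, root);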
@@ -136,8 +136,8 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
* Search through the tree for an extent_map with a given offset. If it can't
* be found, try to find some neighboring extents
*/
-static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
- struct rb_node **prev_or_next_ret)
+static struct rb_node *tree_search(struct rb_root *root, u64 offset,
+ struct rb_node **prev_or_next_ret)
{
struct rb_node *n = root->rb_node;
struct rb_node *prev = NULL;
@@ -154,14 +154,14 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
if (offset < entry->start)
n = n->rb_left;
- else if (offset >= extent_map_end(entry))
+ else if (offset >= btrfs_extent_map_end(entry))
n = n->rb_right;
else
return n;
}
orig_prev = prev;
- while (prev && offset >= extent_map_end(prev_entry)) {
+ while (prev && offset >= btrfs_extent_map_end(prev_entry)) {
prev = rb_next(prev);
prev_entry = rb_entry(prev, struct extent_map, rb_node);
}
@@ -188,14 +188,14 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
static inline u64 extent_map_block_len(const struct extent_map *em)
{
- if (extent_map_is_compressed(em))
+ if (btrfs_extent_map_is_compressed(em))
return em->disk_num_bytes;
return em->len;
}
static inline u64 extent_map_block_end(const struct extent_map *em)
{
- const u64 block_start = extent_map_block_start(em);
+ const u64 block_start = btrfs_extent_map_block_start(em);
const u64 block_end = block_start + extent_map_block_len(em);
if (block_end < block_start)
@@ -210,7 +210,7 @@ static bool can_merge_extent_map(const struct extent_map *em)
return false;
/* Don't merge compressed extents, we need to know their actual size. */
- if (extent_map_is_compressed(em))
+ if (btrfs_extent_map_is_compressed(em))
return false;
if (em->flags & EXTENT_FLAG_LOGGING)
@@ -230,7 +230,7 @@ static bool can_merge_extent_map(const struct extent_map *em)
/* Check to see if two extent_map structs are adjacent and safe to merge. */
static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next)
{
- if (extent_map_end(prev) != next->start)
+ if (btrfs_extent_map_end(prev) != next->start)
return false;
/*
@@ -242,7 +242,7 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma
return false;
if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1)
- return extent_map_block_start(next) == extent_map_block_end(prev);
+ return btrfs_extent_map_block_start(next) == extent_map_block_end(prev);
/* HOLES and INLINE extents. */
return next->disk_bytenr == prev->disk_bytenr;
@@ -270,8 +270,8 @@ static void merge_ondisk_extents(const struct extent_map *prev, const struct ext
u64 new_offset;
/* @prev and @next should not be compressed. */
- ASSERT(!extent_map_is_compressed(prev));
- ASSERT(!extent_map_is_compressed(next));
+ ASSERT(!btrfs_extent_map_is_compressed(prev));
+ ASSERT(!btrfs_extent_map_is_compressed(next));
/*
* There are two different cases where @prev and @next can be merged.
@@ -327,9 +327,9 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map
if (em->offset + em->len > em->ram_bytes)
dump_extent_map(fs_info, "ram_bytes too small", em);
if (em->offset + em->len > em->disk_num_bytes &&
- !extent_map_is_compressed(em))
+ !btrfs_extent_map_is_compressed(em))
dump_extent_map(fs_info, "disk_num_bytes too small", em);
- if (!extent_map_is_compressed(em) &&
+ if (!btrfs_extent_map_is_compressed(em) &&
em->ram_bytes != em->disk_num_bytes)
dump_extent_map(fs_info,
"ram_bytes mismatch with disk_num_bytes for non-compressed em",
@@ -361,8 +361,8 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
if (em->start != 0) {
rb = rb_prev(&em->rb_node);
- if (rb)
- merge = rb_entry(rb, struct extent_map, rb_node);
+ merge = rb_entry_safe(rb, struct extent_map, rb_node);
+
if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) {
em->start = merge->start;
em->len += merge->len;
@@ -374,13 +374,13 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
validate_extent_map(fs_info, em);
remove_em(inode, merge);
- free_extent_map(merge);
+ btrfs_free_extent_map(merge);
}
}
rb = rb_next(&em->rb_node);
- if (rb)
- merge = rb_entry(rb, struct extent_map, rb_node);
+ merge = rb_entry_safe(rb, struct extent_map, rb_node);
+
if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
em->len += merge->len;
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
@@ -389,7 +389,7 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
em->generation = max(em->generation, merge->generation);
em->flags |= EXTENT_FLAG_MERGED;
remove_em(inode, merge);
- free_extent_map(merge);
+ btrfs_free_extent_map(merge);
}
}
@@ -409,7 +409,7 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
* -ENOENT when the extent is not found in the tree
* -EUCLEAN if the found extent does not match the expected start
*/
-int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
+int btrfs_unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map_tree *tree = &inode->extent_tree;
@@ -417,7 +417,7 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
struct extent_map *em;
write_lock(&tree->lock);
- em = lookup_extent_mapping(tree, start, len);
+ em = btrfs_lookup_extent_mapping(tree, start, len);
if (WARN_ON(!em)) {
btrfs_warn(fs_info,
@@ -444,23 +444,23 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
out:
write_unlock(&tree->lock);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return ret;
}
-void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em)
+void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em)
{
lockdep_assert_held_write(&inode->extent_tree.lock);
em->flags &= ~EXTENT_FLAG_LOGGING;
- if (extent_map_in_tree(em))
+ if (btrfs_extent_map_in_tree(em))
try_merge_map(inode, em);
}
static inline void setup_extent_mapping(struct btrfs_inode *inode,
struct extent_map *em,
- int modified)
+ bool modified)
{
refcount_inc(&em->refs);
@@ -486,7 +486,7 @@ static inline void setup_extent_mapping(struct btrfs_inode *inode,
* taken, or a reference dropped if the merge attempt was successful.
*/
static int add_extent_mapping(struct btrfs_inode *inode,
- struct extent_map *em, int modified)
+ struct extent_map *em, bool modified)
{
struct extent_map_tree *tree = &inode->extent_tree;
struct btrfs_root *root = inode->root;
@@ -502,22 +502,21 @@ static int add_extent_mapping(struct btrfs_inode *inode,
setup_extent_mapping(inode, em, modified);
- if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(root)))
+ if (!btrfs_is_testing(fs_info) && btrfs_is_fstree(btrfs_root_id(root)))
percpu_counter_inc(&fs_info->evictable_extent_maps);
return 0;
}
-static struct extent_map *
-__lookup_extent_mapping(struct extent_map_tree *tree,
- u64 start, u64 len, int strict)
+static struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len, bool strict)
{
struct extent_map *em;
struct rb_node *rb_node;
struct rb_node *prev_or_next = NULL;
u64 end = range_end(start, len);
- rb_node = __tree_search(&tree->root, start, &prev_or_next);
+ rb_node = tree_search(&tree->root, start, &prev_or_next);
if (!rb_node) {
if (prev_or_next)
rb_node = prev_or_next;
@@ -527,7 +526,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
em = rb_entry(rb_node, struct extent_map, rb_node);
- if (strict && !(end > em->start && start < extent_map_end(em)))
+ if (strict && !(end > em->start && start < btrfs_extent_map_end(em)))
return NULL;
refcount_inc(&em->refs);
@@ -546,10 +545,10 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
* intersect, so check the object returned carefully to make sure that no
* additional lookups are needed.
*/
-struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
- u64 start, u64 len)
+struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len)
{
- return __lookup_extent_mapping(tree, start, len, 1);
+ return lookup_extent_mapping(tree, start, len, true);
}
/*
@@ -564,10 +563,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
*
* If one can't be found, any nearby extent may be returned
*/
-struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
- u64 start, u64 len)
+struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len)
{
- return __lookup_extent_mapping(tree, start, len, 0);
+ return lookup_extent_mapping(tree, start, len, false);
}
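
The strict flag is the only difference between the two exported wrappers: the lookup variant returns NULL unless the found extent actually overlaps [start, start + len), while the search variant may hand back a neighbor. From a caller's point of view:

	/* Must overlap the range; NULL otherwise. */
	em = btrfs_lookup_extent_mapping(&inode->extent_tree, start, len);

	/* Best effort: may return a nearby, non-overlapping extent. */
	em = btrfs_search_extent_mapping(&inode->extent_tree, start, len);

	/* Both take a reference; drop it with btrfs_free_extent_map(em). */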
/*
@@ -579,7 +578,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
* Remove @em from the extent tree of @inode. No reference counts are dropped,
* and no checks are done to see if the range is in use.
*/
-void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
+void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
{
struct extent_map_tree *tree = &inode->extent_tree;
@@ -595,7 +594,7 @@ void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
static void replace_extent_mapping(struct btrfs_inode *inode,
struct extent_map *cur,
struct extent_map *new,
- int modified)
+ bool modified)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map_tree *tree = &inode->extent_tree;
@@ -605,7 +604,7 @@ static void replace_extent_mapping(struct btrfs_inode *inode,
validate_extent_map(fs_info, new);
WARN_ON(cur->flags & EXTENT_FLAG_PINNED);
- ASSERT(extent_map_in_tree(cur));
+ ASSERT(btrfs_extent_map_in_tree(cur));
if (!(cur->flags & EXTENT_FLAG_LOGGING))
list_del_init(&cur->list);
rb_replace_node(&cur->rb_node, &new->rb_node, &tree->root);
@@ -651,7 +650,7 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode,
u64 end;
u64 start_diff;
- if (map_start < em->start || map_start >= extent_map_end(em))
+ if (map_start < em->start || map_start >= btrfs_extent_map_end(em))
return -EINVAL;
if (existing->start > map_start) {
@@ -662,16 +661,16 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode,
next = next_extent_map(prev);
}
- start = prev ? extent_map_end(prev) : em->start;
+ start = prev ? btrfs_extent_map_end(prev) : em->start;
start = max_t(u64, start, em->start);
- end = next ? next->start : extent_map_end(em);
- end = min_t(u64, end, extent_map_end(em));
+ end = next ? next->start : btrfs_extent_map_end(em);
+ end = min_t(u64, end, btrfs_extent_map_end(em));
start_diff = start - em->start;
em->start = start;
em->len = end - start;
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
em->offset += start_diff;
- return add_extent_mapping(inode, em, 0);
+ return add_extent_mapping(inode, em, false);
}
/*
@@ -708,7 +707,7 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode,
if (em->disk_bytenr == EXTENT_MAP_INLINE)
ASSERT(em->start == 0);
- ret = add_extent_mapping(inode, em, 0);
+ ret = add_extent_mapping(inode, em, false);
/* it is possible that someone inserted the extent into the tree
* while we had the lock dropped. It is also possible that
* an overlapping map exists in the tree
@@ -716,7 +715,7 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode,
if (ret == -EEXIST) {
struct extent_map *existing;
- existing = search_extent_mapping(&inode->extent_tree, start, len);
+ existing = btrfs_search_extent_mapping(&inode->extent_tree, start, len);
trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
@@ -725,8 +724,8 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode,
* extent causing the -EEXIST.
*/
if (start >= existing->start &&
- start < extent_map_end(existing)) {
- free_extent_map(em);
+ start < btrfs_extent_map_end(existing)) {
+ btrfs_free_extent_map(em);
*em_in = existing;
ret = 0;
} else {
@@ -739,14 +738,14 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode,
*/
ret = merge_extent_mapping(inode, existing, em, start);
if (WARN_ON(ret)) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
*em_in = NULL;
btrfs_warn(fs_info,
"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu",
- existing->start, extent_map_end(existing),
+ existing->start, btrfs_extent_map_end(existing),
orig_start, orig_start + orig_len, start);
}
- free_extent_map(existing);
+ btrfs_free_extent_map(existing);
}
}
@@ -772,8 +771,8 @@ static void drop_all_extent_maps_fast(struct btrfs_inode *inode)
em = rb_entry(node, struct extent_map, rb_node);
em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
- remove_extent_mapping(inode, em);
- free_extent_map(em);
+ btrfs_remove_extent_mapping(inode, em);
+ btrfs_free_extent_map(em);
if (cond_resched_rwlock_write(&tree->lock))
node = rb_first(&tree->root);
@@ -826,15 +825,15 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
* range ends after our range (and they might be the same extent map),
* because we need to split those two extent maps at the boundaries.
*/
- split = alloc_extent_map();
- split2 = alloc_extent_map();
+ split = btrfs_alloc_extent_map();
+ split2 = btrfs_alloc_extent_map();
write_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
+ em = btrfs_lookup_extent_mapping(em_tree, start, len);
while (em) {
		/* btrfs_extent_map_end() returns an exclusive value (last byte + 1). */
- const u64 em_end = extent_map_end(em);
+ const u64 em_end = btrfs_extent_map_end(em);
struct extent_map *next_em = NULL;
u64 gen;
unsigned long flags;
@@ -898,7 +897,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
split->generation = gen;
split->flags = flags;
replace_extent_mapping(inode, em, split, modified);
- free_extent_map(split);
+ btrfs_free_extent_map(split);
split = split2;
split2 = NULL;
}
@@ -925,7 +924,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
split->ram_bytes = split->len;
}
- if (extent_map_in_tree(em)) {
+ if (btrfs_extent_map_in_tree(em)) {
replace_extent_mapping(inode, em, split, modified);
} else {
int ret;
@@ -936,11 +935,11 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
if (WARN_ON(ret != 0) && modified)
btrfs_set_inode_full_sync(inode);
}
- free_extent_map(split);
+ btrfs_free_extent_map(split);
split = NULL;
}
remove_em:
- if (extent_map_in_tree(em)) {
+ if (btrfs_extent_map_in_tree(em)) {
/*
* If the extent map is still in the tree it means that
* either of the following is true:
@@ -965,25 +964,25 @@ remove_em:
ASSERT(!split);
btrfs_set_inode_full_sync(inode);
}
- remove_extent_mapping(inode, em);
+ btrfs_remove_extent_mapping(inode, em);
}
/*
* Once for the tree reference (we replaced or removed the
* extent map from the tree).
*/
- free_extent_map(em);
+ btrfs_free_extent_map(em);
next:
/* Once for us (for our lookup reference). */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = next_em;
}
write_unlock(&em_tree->lock);
- free_extent_map(split);
- free_extent_map(split2);
+ btrfs_free_extent_map(split);
+ btrfs_free_extent_map(split2);
}
/*
@@ -1007,7 +1006,7 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
struct extent_map_tree *tree = &inode->extent_tree;
int ret;
- ASSERT(!extent_map_in_tree(new_em));
+ ASSERT(!btrfs_extent_map_in_tree(new_em));
/*
* The caller has locked an appropriate file range in the inode's io
@@ -1033,8 +1032,8 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
*
* This function is used when an ordered_extent needs to be split.
*/
-int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
- u64 new_logical)
+int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
+ u64 new_logical)
{
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
@@ -1046,25 +1045,25 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
ASSERT(pre != 0);
ASSERT(pre < len);
- split_pre = alloc_extent_map();
+ split_pre = btrfs_alloc_extent_map();
if (!split_pre)
return -ENOMEM;
- split_mid = alloc_extent_map();
+ split_mid = btrfs_alloc_extent_map();
if (!split_mid) {
ret = -ENOMEM;
goto out_free_pre;
}
- lock_extent(&inode->io_tree, start, start + len - 1, NULL);
+ btrfs_lock_extent(&inode->io_tree, start, start + len - 1, NULL);
write_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
- if (!em) {
+ em = btrfs_lookup_extent_mapping(em_tree, start, len);
+ if (unlikely(!em)) {
ret = -EIO;
goto out_unlock;
}
ASSERT(em->len == len);
- ASSERT(!extent_map_is_compressed(em));
+ ASSERT(!btrfs_extent_map_is_compressed(em));
ASSERT(em->disk_bytenr < EXTENT_MAP_LAST_BYTE);
ASSERT(em->flags & EXTENT_FLAG_PINNED);
ASSERT(!(em->flags & EXTENT_FLAG_LOGGING));
@@ -1083,7 +1082,7 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
split_pre->flags = flags;
split_pre->generation = em->generation;
- replace_extent_mapping(inode, em, split_pre, 1);
+ replace_extent_mapping(inode, em, split_pre, true);
/*
* Now we only have an extent_map at:
@@ -1093,25 +1092,25 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
/* Insert the middle extent_map. */
split_mid->start = em->start + pre;
split_mid->len = em->len - pre;
- split_mid->disk_bytenr = extent_map_block_start(em) + pre;
+ split_mid->disk_bytenr = btrfs_extent_map_block_start(em) + pre;
split_mid->disk_num_bytes = split_mid->len;
split_mid->offset = 0;
split_mid->ram_bytes = split_mid->len;
split_mid->flags = flags;
split_mid->generation = em->generation;
- add_extent_mapping(inode, split_mid, 1);
+ add_extent_mapping(inode, split_mid, true);
/* Once for us */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Once for the tree */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
out_unlock:
write_unlock(&em_tree->lock);
- unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
- free_extent_map(split_mid);
+ btrfs_unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
+ btrfs_free_extent_map(split_mid);
out_free_pre:
- free_extent_map(split_pre);
+ btrfs_free_extent_map(split_pre);
return ret;
}
@@ -1168,10 +1167,10 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
if (!list_empty(&em->list) && em->generation >= cur_fs_gen)
btrfs_set_inode_full_sync(inode);
- remove_extent_mapping(inode, em);
+ btrfs_remove_extent_mapping(inode, em);
trace_btrfs_extent_map_shrinker_remove_em(inode, em);
/* Drop the reference for the tree. */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
nr_dropped++;
next:
if (ctx->scanned >= ctx->nr_to_scan)
@@ -1338,7 +1337,7 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
if (!root)
continue;
- if (is_fstree(btrfs_root_id(root)))
+ if (btrfs_is_fstree(btrfs_root_id(root)))
nr_dropped += btrfs_scan_root(root, &ctx);
btrfs_put_root(root);
@@ -1373,7 +1372,7 @@ void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
return;
- queue_work(system_unbound_wq, &fs_info->em_shrinker_work);
+ queue_work(system_dfl_wq, &fs_info->em_shrinker_work);
}
void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index cd123b266b64..6f685f3c9327 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -8,8 +8,7 @@
#include <linux/rbtree.h>
#include <linux/list.h>
#include <linux/refcount.h>
-#include "misc.h"
-#include "compression.h"
+#include "fs.h"
struct btrfs_inode;
struct btrfs_fs_info;
@@ -108,8 +107,8 @@ struct extent_map_tree {
struct btrfs_inode;
-static inline void extent_map_set_compression(struct extent_map *em,
- enum btrfs_compression_type type)
+static inline void btrfs_extent_map_set_compression(struct extent_map *em,
+ enum btrfs_compression_type type)
{
if (type == BTRFS_COMPRESS_ZLIB)
em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
@@ -119,7 +118,8 @@ static inline void extent_map_set_compression(struct extent_map *em,
em->flags |= EXTENT_FLAG_COMPRESS_ZSTD;
}
-static inline enum btrfs_compression_type extent_map_compression(const struct extent_map *em)
+static inline enum btrfs_compression_type btrfs_extent_map_compression(
+ const struct extent_map *em)
{
if (em->flags & EXTENT_FLAG_COMPRESS_ZLIB)
return BTRFS_COMPRESS_ZLIB;
@@ -137,50 +137,50 @@ static inline enum btrfs_compression_type extent_map_compression(const struct ex
 * A more efficient way to determine if the extent is compressed than using
 * 'btrfs_extent_map_compression() != BTRFS_COMPRESS_NONE'.
*/
-static inline bool extent_map_is_compressed(const struct extent_map *em)
+static inline bool btrfs_extent_map_is_compressed(const struct extent_map *em)
{
return (em->flags & (EXTENT_FLAG_COMPRESS_ZLIB |
EXTENT_FLAG_COMPRESS_LZO |
EXTENT_FLAG_COMPRESS_ZSTD)) != 0;
}
-static inline int extent_map_in_tree(const struct extent_map *em)
+static inline int btrfs_extent_map_in_tree(const struct extent_map *em)
{
return !RB_EMPTY_NODE(&em->rb_node);
}
-static inline u64 extent_map_block_start(const struct extent_map *em)
+static inline u64 btrfs_extent_map_block_start(const struct extent_map *em)
{
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
- if (extent_map_is_compressed(em))
+ if (btrfs_extent_map_is_compressed(em))
return em->disk_bytenr;
return em->disk_bytenr + em->offset;
}
return em->disk_bytenr;
}
-static inline u64 extent_map_end(const struct extent_map *em)
+static inline u64 btrfs_extent_map_end(const struct extent_map *em)
{
if (em->start + em->len < em->start)
return (u64)-1;
return em->start + em->len;
}
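
The guard above saturates instead of wrapping. A concrete case with assumed values:

	struct extent_map em = {
		.start = U64_MAX - SZ_4K,
		.len = SZ_16K,	/* start + len wraps past zero */
	};

	/* Returns (u64)-1 (saturated) rather than the wrapped ~12K value
	 * that would corrupt range comparisons.
	 */
	u64 end = btrfs_extent_map_end(&em);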
-void extent_map_tree_init(struct extent_map_tree *tree);
-struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
- u64 start, u64 len);
-void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em);
-int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
- u64 new_logical);
-
-struct extent_map *alloc_extent_map(void);
-void free_extent_map(struct extent_map *em);
-int __init extent_map_init(void);
-void __cold extent_map_exit(void);
-int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen);
-void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em);
-struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
- u64 start, u64 len);
+void btrfs_extent_map_tree_init(struct extent_map_tree *tree);
+struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len);
+void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em);
+int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
+ u64 new_logical);
+
+struct extent_map *btrfs_alloc_extent_map(void);
+void btrfs_free_extent_map(struct extent_map *em);
+int __init btrfs_extent_map_init(void);
+void __cold btrfs_extent_map_exit(void);
+int btrfs_unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen);
+void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em);
+struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len);
int btrfs_add_extent_mapping(struct btrfs_inode *inode,
struct extent_map **em_in, u64 start, u64 len);
void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c
index b80c07ad8c5e..f2eaaef8422b 100644
--- a/fs/btrfs/fiemap.c
+++ b/fs/btrfs/fiemap.c
@@ -153,7 +153,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
if (cache_end > offset) {
if (offset == cache->offset) {
/*
- * We cached a dealloc range (found in the io tree) for
+ * We cached a delalloc range (found in the io tree) for
* a hole or prealloc extent and we have now found a
* file extent item for the same offset. What we have
* now is more recent and up to date, so discard what
@@ -320,7 +320,7 @@ static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *p
* the cost of allocating a new one.
*/
ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
- atomic_inc(&clone->refs);
+ refcount_inc(&clone->refs);
ret = btrfs_next_leaf(inode->root, path);
if (ret != 0)
@@ -634,7 +634,7 @@ static int extent_fiemap(struct btrfs_inode *inode,
const u64 ino = btrfs_ino(inode);
struct extent_state *cached_state = NULL;
struct extent_state *delalloc_cached_state = NULL;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct fiemap_cache cache = { 0 };
struct btrfs_backref_share_check_ctx *backref_ctx;
u64 last_extent_end = 0;
@@ -661,7 +661,7 @@ restart:
range_end = round_up(start + len, sectorsize);
prev_extent_end = range_start;
- lock_extent(&inode->io_tree, range_start, range_end, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, range_start, range_end, &cached_state);
ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
if (ret < 0)
@@ -841,7 +841,7 @@ check_eof_delalloc:
}
out_unlock:
- unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);
if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
btrfs_release_path(path);
@@ -871,10 +871,9 @@ out_unlock:
ret = emit_last_fiemap_cache(fieinfo, &cache);
out:
- free_extent_state(delalloc_cached_state);
+ btrfs_free_extent_state(delalloc_cached_state);
kfree(cache.entries);
btrfs_free_backref_share_ctx(backref_ctx);
- btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 344b4db487a0..14e5257f0f04 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -18,6 +18,7 @@
#include "fs.h"
#include "accessors.h"
#include "file-item.h"
+#include "volumes.h"
#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) * 2) / \
@@ -46,7 +47,7 @@
void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size)
{
u64 start, end, i_size;
- int ret;
+ bool found;
spin_lock(&inode->lock);
i_size = new_i_size ?: i_size_read(&inode->vfs_inode);
@@ -55,9 +56,9 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
goto out_unlock;
}
- ret = find_contiguous_extent_bit(inode->file_extent_tree, 0, &start,
- &end, EXTENT_DIRTY);
- if (!ret && start == 0)
+ found = btrfs_find_contiguous_extent_bit(inode->file_extent_tree, 0, &start,
+ &end, EXTENT_DIRTY);
+ if (found && start == 0)
i_size = min(i_size, end + 1);
else
i_size = 0;
@@ -91,8 +92,8 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize));
- return set_extent_bit(inode->file_extent_tree, start, start + len - 1,
- EXTENT_DIRTY, NULL);
+ return btrfs_set_extent_bit(inode->file_extent_tree, start, start + len - 1,
+ EXTENT_DIRTY, NULL);
}
/*
@@ -121,8 +122,8 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) ||
len == (u64)-1);
- return clear_extent_bit(inode->file_extent_tree, start,
- start + len - 1, EXTENT_DIRTY, NULL);
+ return btrfs_clear_extent_bit(inode->file_extent_tree, start,
+ start + len - 1, EXTENT_DIRTY, NULL);
}
static size_t bytes_to_csum_size(const struct btrfs_fs_info *fs_info, u32 bytes)
@@ -336,7 +337,7 @@ out:
*
* Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
*/
-blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
+int btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
{
struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -347,12 +348,12 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
u32 orig_len = bio->bi_iter.bi_size;
u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits;
- blk_status_t ret = BLK_STS_OK;
+ int ret = 0;
u32 bio_offset = 0;
if ((inode->flags & BTRFS_INODE_NODATASUM) ||
test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state))
- return BLK_STS_OK;
+ return 0;
/*
* This function is only called for read bio.
@@ -369,12 +370,12 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
ASSERT(bio_op(bio) == REQ_OP_READ);
path = btrfs_alloc_path();
if (!path)
- return BLK_STS_RESOURCE;
+ return -ENOMEM;
if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
- bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS);
+ bbio->csum = kvcalloc(nblocks, csum_size, GFP_NOFS);
if (!bbio->csum)
- return BLK_STS_RESOURCE;
+ return -ENOMEM;
} else {
bbio->csum = bbio->csum_inline;
}
@@ -393,8 +394,38 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
* between reading the free space cache and updating the csum tree.
*/
if (btrfs_is_free_space_inode(inode)) {
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
+ }
+
+ /*
+ * If we are searching for a csum of an extent from a past
+ * transaction, we can search in the commit root and reduce
+ * lock contention on the csum tree extent buffers.
+ *
+ * This is important because that lock is an rwsem which gets
+ * pretty heavy write load under memory pressure and sustained
+ * csum overwrites, unlike the commit_root_sem. (Memory pressure
+	 * makes us write back the nodes multiple times per transaction,
+ * which makes us cow them each time, taking the write lock.)
+ *
+ * Due to how rwsem is implemented, there is a possible
+ * priority inversion where the readers holding the lock don't
+	 * get scheduled (say they're in a cgroup stuck in heavy reclaim),
+ * which then blocks writers, including transaction commit. By
+ * using a semaphore with fewer writers (only a commit switching
+ * the roots), we make this issue less likely.
+ *
+	 * Note that we don't rely on btrfs_search_slot() to take the
+	 * commit_root_sem for us: we call search_slot multiple times, which
+	 * would open a race where a commit lands between searches while we
+	 * are not holding the commit_root_sem, and we would get csums from
+	 * across transactions.
+ */
+ if (bbio->csum_search_commit_root) {
+ path->search_commit_root = true;
+ path->skip_locking = true;
+ down_read(&fs_info->commit_root_sem);
}
while (bio_offset < orig_len) {
@@ -406,9 +437,9 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
count = search_csum_tree(fs_info, path, cur_disk_bytenr,
orig_len - bio_offset, csum_dst);
if (count < 0) {
- ret = errno_to_blk_status(count);
+ ret = count;
if (bbio->csum != bbio->csum_inline)
- kfree(bbio->csum);
+ kvfree(bbio->csum);
bbio->csum = NULL;
break;
}
@@ -427,12 +458,12 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
memset(csum_dst, 0, csum_size);
count = 1;
- if (btrfs_root_id(inode->root) == BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ if (btrfs_is_data_reloc_root(inode->root)) {
u64 file_offset = bbio->file_offset + bio_offset;
- set_extent_bit(&inode->io_tree, file_offset,
- file_offset + sectorsize - 1,
- EXTENT_NODATASUM, NULL);
+ btrfs_set_extent_bit(&inode->io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM, NULL);
} else {
btrfs_warn_rl(fs_info,
"csum hole found for disk bytenr range [%llu, %llu)",
@@ -442,6 +473,8 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
bio_offset += count * sectorsize;
}
+ if (bbio->csum_search_commit_root)
+ up_read(&fs_info->commit_root_sem);
return ret;
}
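
The shape being set up by the new flag and comment above: hold commit_root_sem across every search so that all csums come from one committed root. Condensed from the function body, loop details elided:

	if (bbio->csum_search_commit_root) {
		path->search_commit_root = true;
		path->skip_locking = true;
		down_read(&fs_info->commit_root_sem);
	}

	/* ... every search_csum_tree() call happens under the sem ... */

	if (bbio->csum_search_commit_root)
		up_read(&fs_info->commit_root_sem);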
@@ -732,23 +765,55 @@ fail:
return ret;
}
+static void csum_one_bio(struct btrfs_bio *bbio, struct bvec_iter *src)
+{
+ struct btrfs_inode *inode = bbio->inode;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+ struct bio *bio = &bbio->bio;
+ struct btrfs_ordered_sum *sums = bbio->sums;
+ struct bvec_iter iter = *src;
+ phys_addr_t paddr;
+ const u32 blocksize = fs_info->sectorsize;
+ const u32 step = min(blocksize, PAGE_SIZE);
+ const u32 nr_steps = blocksize / step;
+ phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
+ u32 offset = 0;
+ int index = 0;
+
+ shash->tfm = fs_info->csum_shash;
+
+ btrfs_bio_for_each_block(paddr, bio, &iter, step) {
+ paddrs[(offset / step) % nr_steps] = paddr;
+ offset += step;
+
+ if (IS_ALIGNED(offset, blocksize)) {
+ btrfs_calculate_block_csum_pages(fs_info, paddrs, sums->sums + index);
+ index += fs_info->csum_size;
+ }
+ }
+}
+
+static void csum_one_bio_work(struct work_struct *work)
+{
+ struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, csum_work);
+
+ ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
+ ASSERT(bbio->async_csum == true);
+ csum_one_bio(bbio, &bbio->csum_saved_iter);
+ complete(&bbio->csum_done);
+}
+
/*
* Calculate checksums of the data contained inside a bio.
*/
-blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio)
+int btrfs_csum_one_bio(struct btrfs_bio *bbio, bool async)
{
struct btrfs_ordered_extent *ordered = bbio->ordered;
struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
struct bio *bio = &bbio->bio;
struct btrfs_ordered_sum *sums;
- char *data;
- struct bvec_iter iter;
- struct bio_vec bvec;
- int index;
- unsigned int blockcount;
- int i;
unsigned nofs_flag;
nofs_flag = memalloc_nofs_save();
@@ -757,35 +822,23 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio)
memalloc_nofs_restore(nofs_flag);
if (!sums)
- return BLK_STS_RESOURCE;
+ return -ENOMEM;
+ sums->logical = bbio->orig_logical;
sums->len = bio->bi_iter.bi_size;
INIT_LIST_HEAD(&sums->list);
-
- sums->logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
- index = 0;
-
- shash->tfm = fs_info->csum_shash;
-
- bio_for_each_segment(bvec, bio, iter) {
- blockcount = BTRFS_BYTES_TO_BLKS(fs_info,
- bvec.bv_len + fs_info->sectorsize
- - 1);
-
- for (i = 0; i < blockcount; i++) {
- data = bvec_kmap_local(&bvec);
- crypto_shash_digest(shash,
- data + (i * fs_info->sectorsize),
- fs_info->sectorsize,
- sums->sums + index);
- kunmap_local(data);
- index += fs_info->csum_size;
- }
-
- }
-
bbio->sums = sums;
btrfs_add_ordered_sum(ordered, sums);
+
+ if (!async) {
+ csum_one_bio(bbio, &bbio->bio.bi_iter);
+ return 0;
+ }
+ init_completion(&bbio->csum_done);
+ bbio->async_csum = true;
+ bbio->csum_saved_iter = bbio->bio.bi_iter;
+ INIT_WORK(&bbio->csum_work, csum_one_bio_work);
+ schedule_work(&bbio->csum_work);
return 0;
}
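The async path signals completion through bbio->csum_done, but the waiting side is outside this hunk. A hedged sketch of how a consumer would synchronize before trusting bbio->sums (wait_for_async_csum() is a hypothetical helper, not part of the patch):

	/* Sketch only: block until the offloaded csum work has finished. */
	static void wait_for_async_csum(struct btrfs_bio *bbio)
	{
		if (bbio->async_csum)
			wait_for_completion(&bbio->csum_done);
		/* bbio->sums is fully populated and already on the ordered extent */
	}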
@@ -794,11 +847,11 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio)
* record the updated logical address on Zone Append completion.
* Allocate just the structure with an empty sums array here for that case.
*/
-blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio)
+int btrfs_alloc_dummy_sum(struct btrfs_bio *bbio)
{
bbio->sums = kmalloc(sizeof(*bbio->sums), GFP_NOFS);
if (!bbio->sums)
- return BLK_STS_RESOURCE;
+ return -ENOMEM;
bbio->sums->len = bbio->bio.bi_iter.bi_size;
bbio->sums->logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
btrfs_add_ordered_sum(bbio->ordered, bbio->sums);
@@ -993,7 +1046,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
* item changed size or key
*/
ret = btrfs_split_item(trans, root, path, &key, offset);
- if (ret && ret != -EAGAIN) {
+ if (unlikely(ret && ret != -EAGAIN)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1048,7 +1101,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key file_key;
struct btrfs_key found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_csum_item *item;
struct btrfs_csum_item *item_end;
struct extent_buffer *leaf = NULL;
@@ -1124,10 +1177,10 @@ again:
}
btrfs_release_path(path);
- path->search_for_extension = 1;
+ path->search_for_extension = true;
ret = btrfs_search_slot(trans, root, &file_key, path,
csum_size, 1);
- path->search_for_extension = 0;
+ path->search_for_extension = false;
if (ret < 0)
goto out;
@@ -1259,7 +1312,6 @@ found:
goto again;
}
out:
- btrfs_free_path(path);
return ret;
}
@@ -1297,7 +1349,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
em->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
em->offset = btrfs_file_extent_offset(leaf, fi);
if (compress_type != BTRFS_COMPRESS_NONE) {
- extent_map_set_compression(em, compress_type);
+ btrfs_extent_map_set_compression(em, compress_type);
} else {
/*
* Older kernels can create regular non-hole data
@@ -1317,7 +1369,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
em->start = 0;
em->len = fs_info->sectorsize;
em->offset = 0;
- extent_map_set_compression(em, compress_type);
+ btrfs_extent_map_set_compression(em, compress_type);
} else {
btrfs_err(fs_info,
"unknown file extent item type %d, inode %llu, offset %llu, "
diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h
index 6181a70ec3ef..5645c5e3abdb 100644
--- a/fs/btrfs/file-item.h
+++ b/fs/btrfs/file-item.h
@@ -7,7 +7,7 @@
#include <linux/list.h>
#include <uapi/linux/btrfs_tree.h>
#include "ctree.h"
-#include "accessors.h"
+#include "ordered-data.h"
struct extent_map;
struct btrfs_file_extent_item;
@@ -53,7 +53,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
-blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio);
+int btrfs_lookup_bio_sums(struct btrfs_bio *bbio);
int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 objectid, u64 pos,
u64 num_bytes);
@@ -64,8 +64,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
-blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio);
-blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio);
+int btrfs_csum_one_bio(struct btrfs_bio *bbio, bool async);
+int btrfs_alloc_dummy_sum(struct btrfs_bio *bbio);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit,
bool nowait);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 262a707d8990..7a501e73d880 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -75,7 +75,7 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos
u64 num_bytes;
u64 start_pos;
u64 end_of_last_block;
- u64 end_pos = pos + write_bytes;
+ const u64 end_pos = pos + write_bytes;
loff_t isize = i_size_read(&inode->vfs_inode);
unsigned int extra_bits = 0;
@@ -86,11 +86,9 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos
extra_bits |= EXTENT_NORESERVE;
start_pos = round_down(pos, fs_info->sectorsize);
- num_bytes = round_up(write_bytes + pos - start_pos,
- fs_info->sectorsize);
+ num_bytes = round_up(end_pos - start_pos, fs_info->sectorsize);
ASSERT(num_bytes <= U32_MAX);
- ASSERT(folio_pos(folio) <= pos &&
- folio_pos(folio) + folio_size(folio) >= pos + write_bytes);
+ ASSERT(folio_pos(folio) <= pos && folio_next_pos(folio) >= end_pos);
end_of_last_block = start_pos + num_bytes - 1;
@@ -98,9 +96,9 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos
* The pages may have already been dirty, clear out old accounting so
* we can set things up properly
*/
- clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- cached);
+ btrfs_clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ cached);
ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
extra_bits, cached);
@@ -328,7 +326,7 @@ next_slot:
args->start - extent_offset,
0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -427,7 +425,7 @@ delete_extent_item:
key.offset - extent_offset,
0, false);
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -444,7 +442,7 @@ delete_extent_item:
ret = btrfs_del_items(trans, root, path, del_slot,
del_nr);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -508,20 +506,19 @@ out:
return ret;
}
-static int extent_mergeable(struct extent_buffer *leaf, int slot,
- u64 objectid, u64 bytenr, u64 orig_offset,
- u64 *start, u64 *end)
+static bool extent_mergeable(struct extent_buffer *leaf, int slot, u64 objectid,
+ u64 bytenr, u64 orig_offset, u64 *start, u64 *end)
{
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
u64 extent_end;
if (slot < 0 || slot >= btrfs_header_nritems(leaf))
- return 0;
+ return false;
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
- return 0;
+ return false;
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
@@ -530,15 +527,15 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot,
btrfs_file_extent_compression(leaf, fi) ||
btrfs_file_extent_encryption(leaf, fi) ||
btrfs_file_extent_other_encoding(leaf, fi))
- return 0;
+ return false;
extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
if ((*start && *start != key.offset) || (*end && *end != extent_end))
- return 0;
+ return false;
*start = key.offset;
*end = extent_end;
- return 1;
+ return true;
}
/*
@@ -553,7 +550,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_file_extent_item *fi;
struct btrfs_ref ref = { 0 };
struct btrfs_key key;
@@ -589,21 +586,20 @@ again:
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid != ino ||
- key.type != BTRFS_EXTENT_DATA_KEY) {
+ if (unlikely(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
}
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
+ if (unlikely(btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
}
extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
- if (key.offset > start || extent_end < end) {
+ if (unlikely(key.offset > start || extent_end < end)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -678,7 +674,7 @@ again:
btrfs_release_path(path);
goto again;
}
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -706,7 +702,7 @@ again:
ref.ref_root = btrfs_root_id(root);
btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -714,7 +710,7 @@ again:
if (split == start) {
key.offset = start;
} else {
- if (start != key.offset) {
+ if (unlikely(start != key.offset)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -746,7 +742,7 @@ again:
del_slot = path->slots[0] + 1;
del_nr++;
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -764,7 +760,7 @@ again:
del_slot = path->slots[0];
del_nr++;
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -785,13 +781,12 @@ again:
extent_end - key.offset);
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
}
out:
- btrfs_free_path(path);
return ret;
}
@@ -800,18 +795,17 @@ out:
* On success return a locked folio and 0
*/
static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
- u64 len, bool force_uptodate)
+ u64 len)
{
u64 clamp_start = max_t(u64, pos, folio_pos(folio));
- u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio));
+ u64 clamp_end = min_t(u64, pos + len, folio_next_pos(folio));
const u32 blocksize = inode_to_fs_info(inode)->sectorsize;
int ret = 0;
if (folio_test_uptodate(folio))
return 0;
- if (!force_uptodate &&
- IS_ALIGNED(clamp_start, blocksize) &&
+ if (IS_ALIGNED(clamp_start, blocksize) &&
IS_ALIGNED(clamp_end, blocksize))
return 0;
@@ -819,7 +813,7 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64
if (ret)
return ret;
folio_lock(folio);
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
folio_unlock(folio);
return -EIO;
}
@@ -858,32 +852,27 @@ static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
*/
static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret,
loff_t pos, size_t write_bytes,
- bool force_uptodate, bool nowait)
+ bool nowait)
{
- unsigned long index = pos >> PAGE_SHIFT;
+ const pgoff_t index = pos >> PAGE_SHIFT;
gfp_t mask = get_prepare_gfp_flags(inode, nowait);
- fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN);
+ fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) |
+ fgf_set_order(write_bytes);
struct folio *folio;
int ret = 0;
again:
folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
- if (IS_ERR(folio)) {
- if (nowait)
- ret = -EAGAIN;
- else
- ret = PTR_ERR(folio);
- return ret;
- }
- /* Only support page sized folio yet. */
- ASSERT(folio_order(folio) == 0);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+
ret = set_folio_extent_mapped(folio);
if (ret < 0) {
folio_unlock(folio);
folio_put(folio);
return ret;
}
- ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate);
+ ret = prepare_uptodate_folio(inode, folio, pos, write_bytes);
if (ret) {
/* The folio is already unlocked. */
folio_put(folio);
@@ -924,14 +913,15 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
struct btrfs_ordered_extent *ordered;
if (nowait) {
- if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
- cached_state)) {
+ if (!btrfs_try_lock_extent(&inode->io_tree, start_pos,
+ last_pos, cached_state)) {
folio_unlock(folio);
folio_put(folio);
return -EAGAIN;
}
} else {
- lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
+ btrfs_lock_extent(&inode->io_tree, start_pos, last_pos,
+ cached_state);
}
ordered = btrfs_lookup_ordered_range(inode, start_pos,
@@ -939,8 +929,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
if (ordered &&
ordered->file_offset + ordered->num_bytes > start_pos &&
ordered->file_offset <= last_pos) {
- unlock_extent(&inode->io_tree, start_pos, last_pos,
- cached_state);
+ btrfs_unlock_extent(&inode->io_tree, start_pos, last_pos,
+ cached_state);
folio_unlock(folio);
folio_put(folio);
btrfs_start_ordered_extent(ordered);
@@ -970,6 +960,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
* @pos: File offset.
* @write_bytes: The length to write, will be updated to the nocow writeable
* range.
+ * @nowait: Whether we are in a non-blocking IO context and must not block.
*
* This function will flush ordered extents in the range to ensure proper
* nocow checks.
@@ -977,8 +968,9 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
* Return:
* > 0 If we can nocow, and updates @write_bytes.
* 0 If we can't do a nocow write.
- * -EAGAIN If we can't do a nocow write because snapshoting of the inode's
- * root is in progress.
+ * -EAGAIN If we can't do a nocow write because snapshotting of the inode's
+ * root is in progress or because we are in a non-blocking IO
+ * context and need to block (@nowait is true).
* < 0 If an error happened.
*
* NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
@@ -990,8 +982,8 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
struct btrfs_root *root = inode->root;
struct extent_state *cached_state = NULL;
u64 lockstart, lockend;
- u64 num_bytes;
- int ret;
+ u64 cur_offset;
+ int ret = 0;
if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
return 0;
@@ -1002,7 +994,6 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
lockstart = round_down(pos, fs_info->sectorsize);
lockend = round_up(pos + *write_bytes,
fs_info->sectorsize) - 1;
- num_bytes = lockend - lockstart + 1;
if (nowait) {
if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
@@ -1014,13 +1005,35 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
&cached_state);
}
- ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, nowait);
- if (ret <= 0)
- btrfs_drew_write_unlock(&root->snapshot_lock);
- else
- *write_bytes = min_t(size_t, *write_bytes ,
- num_bytes - pos + lockstart);
- unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+
+ cur_offset = lockstart;
+ while (cur_offset < lockend) {
+ u64 num_bytes = lockend - cur_offset + 1;
+
+ ret = can_nocow_extent(inode, cur_offset, &num_bytes, NULL, nowait);
+ if (ret <= 0) {
+ /*
+ * If cur_offset == lockstart it means we haven't found
+ * any extent against which we can NOCOW, so unlock the
+ * snapshot lock.
+ */
+ if (cur_offset == lockstart)
+ btrfs_drew_write_unlock(&root->snapshot_lock);
+ break;
+ }
+ cur_offset += num_bytes;
+ }
+
+ btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+
+ /*
+ * cur_offset > lockstart means there's at least a partial range we can
+ * NOCOW, and that range can cover one or more extents.
+ */
+ if (cur_offset > lockstart) {
+ *write_bytes = min_t(size_t, *write_bytes, cur_offset - pos);
+ return 1;
+ }
return ret;
}
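A worked example of the loop's contract: with pos == lockstart == 0 and *write_bytes == 128K, if can_nocow_extent() reports a 64K NOCOW-able extent and then returns 0 for the range starting at 64K, the loop exits with cur_offset == 64K. Because cur_offset != lockstart the snapshot lock stays held, *write_bytes is clamped to 64K and the function returns 1. Only when nothing at all can be NOCOWed (cur_offset == lockstart) is the snapshot lock released.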
@@ -1077,241 +1090,306 @@ int btrfs_write_check(struct kiocb *iocb, size_t count)
return 0;
}
-ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
+static void release_space(struct btrfs_inode *inode, struct extent_changeset *data_reserved,
+ u64 start, u64 len, bool only_release_metadata)
{
- struct file *file = iocb->ki_filp;
- loff_t pos;
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct extent_changeset *data_reserved = NULL;
- u64 release_bytes = 0;
- u64 lockstart;
- u64 lockend;
- size_t num_written = 0;
- ssize_t ret;
- loff_t old_isize;
- unsigned int ilock_flags = 0;
- const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
- unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
- bool only_release_metadata = false;
+ if (len == 0)
+ return;
- if (nowait)
- ilock_flags |= BTRFS_ILOCK_TRY;
-
- ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
- if (ret < 0)
- return ret;
+ if (only_release_metadata) {
+ btrfs_check_nocow_unlock(inode);
+ btrfs_delalloc_release_metadata(inode, len, true);
+ } else {
+ const struct btrfs_fs_info *fs_info = inode->root->fs_info;
- /*
- * We can only trust the isize with inode lock held, or it can race with
- * other buffered writes and cause incorrect call of
- * pagecache_isize_extended() to overwrite existing data.
- */
- old_isize = i_size_read(inode);
+ btrfs_delalloc_release_space(inode, data_reserved,
+ round_down(start, fs_info->sectorsize),
+ len, true);
+ }
+}
- ret = generic_write_checks(iocb, i);
- if (ret <= 0)
- goto out;
+/*
+ * Reserve data and metadata space for this buffered write range.
+ *
+ * Return >0 for the number of bytes reserved, which is always block aligned.
+ * Return <0 for error.
+ */
+static ssize_t reserve_space(struct btrfs_inode *inode,
+ struct extent_changeset **data_reserved,
+ u64 start, size_t *len, bool nowait,
+ bool *only_release_metadata)
+{
+ const struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const unsigned int block_offset = (start & (fs_info->sectorsize - 1));
+ size_t reserve_bytes;
+ int ret;
- ret = btrfs_write_check(iocb, ret);
- if (ret < 0)
- goto out;
+ ret = btrfs_check_data_free_space(inode, data_reserved, start, *len, nowait);
+ if (ret < 0) {
+ int can_nocow;
- pos = iocb->ki_pos;
- while (iov_iter_count(i) > 0) {
- struct extent_state *cached_state = NULL;
- size_t offset = offset_in_page(pos);
- size_t sector_offset;
- size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset);
- size_t reserve_bytes;
- size_t copied;
- size_t dirty_sectors;
- size_t num_sectors;
- struct folio *folio = NULL;
- int extents_locked;
- bool force_page_uptodate = false;
+ if (nowait && (ret == -ENOSPC || ret == -EAGAIN))
+ return -EAGAIN;
/*
- * Fault pages before locking them in prepare_one_folio()
- * to avoid recursive lock
+ * If we don't have to COW at the offset, reserve metadata only.
+ * In that case *len may get smaller than requested.
*/
- if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
- ret = -EFAULT;
- break;
- }
+ can_nocow = btrfs_check_nocow_lock(inode, start, len, nowait);
+ if (can_nocow < 0)
+ ret = can_nocow;
+ if (can_nocow > 0)
+ ret = 0;
+ if (ret)
+ return ret;
+ *only_release_metadata = true;
+ }
- only_release_metadata = false;
- sector_offset = pos & (fs_info->sectorsize - 1);
+ reserve_bytes = round_up(*len + block_offset, fs_info->sectorsize);
+ WARN_ON(reserve_bytes == 0);
+ ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes,
+ reserve_bytes, nowait);
+ if (ret) {
+ if (!*only_release_metadata)
+ btrfs_free_reserved_data_space(inode, *data_reserved,
+ start, *len);
+ else
+ btrfs_check_nocow_unlock(inode);
- extent_changeset_release(data_reserved);
- ret = btrfs_check_data_free_space(BTRFS_I(inode),
- &data_reserved, pos,
- write_bytes, nowait);
- if (ret < 0) {
- int can_nocow;
+ if (nowait && ret == -ENOSPC)
+ ret = -EAGAIN;
+ return ret;
+ }
+ return reserve_bytes;
+}
- if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) {
- ret = -EAGAIN;
- break;
- }
+/* Shrink the reserved data and metadata space from @reserved_len to @new_len. */
+static void shrink_reserved_space(struct btrfs_inode *inode,
+ struct extent_changeset *data_reserved,
+ u64 reserved_start, u64 reserved_len,
+ u64 new_len, bool only_release_metadata)
+{
+ const u64 diff = reserved_len - new_len;
- /*
- * If we don't have to COW at the offset, reserve
- * metadata only. write_bytes may get smaller than
- * requested here.
- */
- can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos,
- &write_bytes, nowait);
- if (can_nocow < 0)
- ret = can_nocow;
- if (can_nocow > 0)
- ret = 0;
- if (ret)
- break;
- only_release_metadata = true;
- }
+ ASSERT(new_len <= reserved_len);
+ btrfs_delalloc_shrink_extents(inode, reserved_len, new_len);
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, diff, true);
+ else
+ btrfs_delalloc_release_space(inode, data_reserved,
+ reserved_start + new_len, diff, true);
+}
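For example, with a 4K block size and a block-aligned start: if 8K was reserved and a short copy lands only the first 3K, the caller computes new_len = round_up(3K, 4K) = 4K, so shrink_reserved_space() releases diff = 4K while keeping the reservation for the one block that was actually dirtied.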
- reserve_bytes = round_up(write_bytes + sector_offset,
- fs_info->sectorsize);
- WARN_ON(reserve_bytes == 0);
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
- reserve_bytes,
- reserve_bytes, nowait);
- if (ret) {
- if (!only_release_metadata)
- btrfs_free_reserved_data_space(BTRFS_I(inode),
- data_reserved, pos,
- write_bytes);
- else
- btrfs_check_nocow_unlock(BTRFS_I(inode));
+/* Calculate the maximum number of bytes we can write into one folio. */
+static size_t calc_write_bytes(const struct btrfs_inode *inode,
+ const struct iov_iter *iter, u64 start)
+{
+ const size_t max_folio_size = mapping_max_folio_size(inode->vfs_inode.i_mapping);
- if (nowait && ret == -ENOSPC)
- ret = -EAGAIN;
- break;
- }
+ return min(max_folio_size - (start & (max_folio_size - 1)),
+ iov_iter_count(iter));
+}
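For example, with a 2M maximum folio size and start == 1M + 4K, the folio boundary allows at most 2M - (1M + 4K) = 1M - 4K bytes, and calc_write_bytes() returns the smaller of that and the bytes remaining in the iterator.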
+
+/*
+ * Do the heavy-lifting work to copy one range into one folio of the page cache.
+ *
+ * Return >0 with the number of bytes copied, which may be fewer than requested.
+ * Return 0 if no bytes were copied, in which case the caller should retry.
+ * Return <0 on error.
+ */
+static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter,
+ struct extent_changeset **data_reserved, u64 start,
+ bool nowait)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_state *cached_state = NULL;
+ size_t write_bytes = calc_write_bytes(inode, iter, start);
+ size_t copied;
+ const u64 reserved_start = round_down(start, fs_info->sectorsize);
+ u64 reserved_len;
+ struct folio *folio = NULL;
+ int extents_locked;
+ u64 lockstart;
+ u64 lockend;
+ bool only_release_metadata = false;
+ const unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
+ int ret;
+
+ /*
+ * Fault all pages before locking them in prepare_one_folio() to avoid
+ * recursive lock.
+ */
+ if (unlikely(fault_in_iov_iter_readable(iter, write_bytes)))
+ return -EFAULT;
+ extent_changeset_release(*data_reserved);
+ ret = reserve_space(inode, data_reserved, start, &write_bytes, nowait,
+ &only_release_metadata);
+ if (ret < 0)
+ return ret;
+ reserved_len = ret;
+ /* Write range must be inside the reserved range. */
+ ASSERT(reserved_start <= start);
+ ASSERT(start + write_bytes <= reserved_start + reserved_len);
- release_bytes = reserve_bytes;
again:
- ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
- if (ret) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
- break;
- }
+ ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping,
+ bdp_flags);
+ if (ret) {
+ btrfs_delalloc_release_extents(inode, reserved_len);
+ release_space(inode, *data_reserved, reserved_start, reserved_len,
+ only_release_metadata);
+ return ret;
+ }
- ret = prepare_one_folio(inode, &folio, pos, write_bytes,
- force_page_uptodate, false);
- if (ret) {
- btrfs_delalloc_release_extents(BTRFS_I(inode),
- reserve_bytes);
- break;
- }
+ ret = prepare_one_folio(&inode->vfs_inode, &folio, start, write_bytes, false);
+ if (ret) {
+ btrfs_delalloc_release_extents(inode, reserved_len);
+ release_space(inode, *data_reserved, reserved_start, reserved_len,
+ only_release_metadata);
+ return ret;
+ }
- extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode),
- folio, pos, write_bytes, &lockstart,
- &lockend, nowait, &cached_state);
- if (extents_locked < 0) {
- if (!nowait && extents_locked == -EAGAIN)
- goto again;
+ /*
+ * The reserved range goes beyond the current folio; shrink the
+ * reserved space to the folio boundary.
+ */
+ if (reserved_start + reserved_len > folio_next_pos(folio)) {
+ const u64 last_block = folio_next_pos(folio);
+
+ shrink_reserved_space(inode, *data_reserved, reserved_start,
+ reserved_len, last_block - reserved_start,
+ only_release_metadata);
+ write_bytes = last_block - start;
+ reserved_len = last_block - reserved_start;
+ }
+
+ extents_locked = lock_and_cleanup_extent_if_need(inode, folio, start,
+ write_bytes, &lockstart,
+ &lockend, nowait,
+ &cached_state);
+ if (extents_locked < 0) {
+ if (!nowait && extents_locked == -EAGAIN)
+ goto again;
- btrfs_delalloc_release_extents(BTRFS_I(inode),
- reserve_bytes);
- ret = extents_locked;
- break;
- }
+ btrfs_delalloc_release_extents(inode, reserved_len);
+ release_space(inode, *data_reserved, reserved_start, reserved_len,
+ only_release_metadata);
+ ret = extents_locked;
+ return ret;
+ }
- copied = copy_folio_from_iter_atomic(folio,
- offset_in_folio(folio, pos), write_bytes, i);
- flush_dcache_folio(folio);
+ copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start),
+ write_bytes, iter);
+ flush_dcache_folio(folio);
+
+ if (unlikely(copied < write_bytes)) {
+ u64 last_block;
/*
- * If we get a partial write, we can end up with partially
- * uptodate page. Although if sector size < page size we can
- * handle it, but if it's not sector aligned it can cause
- * a lot of complexity, so make sure they don't happen by
- * forcing retry this copy.
+ * The original write range didn't need an uptodate folio because
+ * it was block aligned. But a short copy happened, and we cannot
+ * handle the now-partial range without an uptodate folio.
+ *
+ * So just revert the copied bytes and retry.
*/
- if (unlikely(copied < write_bytes)) {
- if (!folio_test_uptodate(folio)) {
- iov_iter_revert(i, copied);
- copied = 0;
- }
+ if (!folio_test_uptodate(folio)) {
+ iov_iter_revert(iter, copied);
+ copied = 0;
}
- num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
- dirty_sectors = round_up(copied + sector_offset,
- fs_info->sectorsize);
- dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
-
+ /* No copied bytes, unlock, release reserved space and exit. */
if (copied == 0) {
- force_page_uptodate = true;
- dirty_sectors = 0;
- } else {
- force_page_uptodate = false;
+ if (extents_locked)
+ btrfs_unlock_extent(&inode->io_tree, lockstart, lockend,
+ &cached_state);
+ else
+ btrfs_free_extent_state(cached_state);
+ btrfs_delalloc_release_extents(inode, reserved_len);
+ release_space(inode, *data_reserved, reserved_start, reserved_len,
+ only_release_metadata);
+ btrfs_drop_folio(fs_info, folio, start, copied);
+ return 0;
}
- if (num_sectors > dirty_sectors) {
- /* release everything except the sectors we dirtied */
- release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
- if (only_release_metadata) {
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- release_bytes, true);
- } else {
- u64 release_start = round_up(pos + copied,
- fs_info->sectorsize);
- btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved, release_start,
- release_bytes, true);
- }
- }
+ /* Release the reserved space beyond the last block. */
+ last_block = round_up(start + copied, fs_info->sectorsize);
- release_bytes = round_up(copied + sector_offset,
- fs_info->sectorsize);
+ shrink_reserved_space(inode, *data_reserved, reserved_start,
+ reserved_len, last_block - reserved_start,
+ only_release_metadata);
+ reserved_len = last_block - reserved_start;
+ }
- ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied,
- &cached_state, only_release_metadata);
+ ret = btrfs_dirty_folio(inode, folio, start, copied, &cached_state,
+ only_release_metadata);
+ /*
+ * If we have not locked the extent range, because the range's start
+ * offset is >= i_size, we might still have a non-NULL cached extent
+ * state, acquired while marking the extent range as delalloc through
+ * btrfs_dirty_folio(). Therefore free any possible cached extent state
+ * to avoid a memory leak.
+ */
+ if (extents_locked)
+ btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+ else
+ btrfs_free_extent_state(cached_state);
- /*
- * If we have not locked the extent range, because the range's
- * start offset is >= i_size, we might still have a non-NULL
- * cached extent state, acquired while marking the extent range
- * as delalloc through btrfs_dirty_page(). Therefore free any
- * possible cached extent state to avoid a memory leak.
- */
- if (extents_locked)
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
- else
- free_extent_state(cached_state);
+ btrfs_delalloc_release_extents(inode, reserved_len);
+ if (ret) {
+ btrfs_drop_folio(fs_info, folio, start, copied);
+ release_space(inode, *data_reserved, reserved_start, reserved_len,
+ only_release_metadata);
+ return ret;
+ }
+ if (only_release_metadata)
+ btrfs_check_nocow_unlock(inode);
- btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
- if (ret) {
- btrfs_drop_folio(fs_info, folio, pos, copied);
- break;
- }
+ btrfs_drop_folio(fs_info, folio, start, copied);
+ return copied;
+}
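Note the retry contract with the caller below: copy_one_range() returns 0 only after iov_iter_revert(), so the write loop re-enters it with the iterator restored, and the fault-in at the top of the function is what makes the retry likely to make progress. Every 0 or negative return has already released all reserved data and metadata space, so the loop never leaks reservations.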
- release_bytes = 0;
- if (only_release_metadata)
- btrfs_check_nocow_unlock(BTRFS_I(inode));
+ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ loff_t pos;
+ struct inode *inode = file_inode(file);
+ struct extent_changeset *data_reserved = NULL;
+ size_t num_written = 0;
+ ssize_t ret;
+ loff_t old_isize;
+ unsigned int ilock_flags = 0;
+ const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
- btrfs_drop_folio(fs_info, folio, pos, copied);
+ if (nowait)
+ ilock_flags |= BTRFS_ILOCK_TRY;
- cond_resched();
+ ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
+ if (ret < 0)
+ return ret;
- pos += copied;
- num_written += copied;
- }
+ /*
+ * We can only trust the isize with inode lock held, or it can race with
+ * other buffered writes and cause incorrect call of
+ * pagecache_isize_extended() to overwrite existing data.
+ */
+ old_isize = i_size_read(inode);
- if (release_bytes) {
- if (only_release_metadata) {
- btrfs_check_nocow_unlock(BTRFS_I(inode));
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- release_bytes, true);
- } else {
- btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved,
- round_down(pos, fs_info->sectorsize),
- release_bytes, true);
- }
+ ret = generic_write_checks(iocb, iter);
+ if (ret <= 0)
+ goto out;
+
+ ret = btrfs_write_check(iocb, ret);
+ if (ret < 0)
+ goto out;
+
+ pos = iocb->ki_pos;
+ while (iov_iter_count(iter) > 0) {
+ ret = copy_one_range(BTRFS_I(inode), iter, &data_reserved, pos, nowait);
+ if (ret < 0)
+ break;
+ pos += ret;
+ num_written += ret;
+ cond_resched();
}
extent_changeset_free(data_reserved);
@@ -1362,6 +1440,8 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
struct btrfs_inode *inode = BTRFS_I(file_inode(file));
ssize_t num_written, num_sync;
+ if (unlikely(btrfs_is_shutdown(inode->root->fs_info)))
+ return -EIO;
/*
* If the fs flips readonly due to some impossible error, although we
* have opened a file as writable, we have to stop this write operation
@@ -1406,7 +1486,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
if (private) {
kfree(private->filldir_buf);
- free_extent_state(private->llseek_cached_state);
+ btrfs_free_extent_state(private->llseek_cached_state);
kfree(private);
filp->private_data = NULL;
}
@@ -1774,28 +1854,25 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
struct folio *folio = page_folio(page);
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_inode *inode = BTRFS_I(file_inode(vmf->vma->vm_file));
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
unsigned long zero_start;
loff_t size;
size_t fsize = folio_size(folio);
- vm_fault_t ret;
- int ret2;
- int reserved = 0;
+ int ret;
+ bool only_release_metadata = false;
u64 reserved_space;
u64 page_start;
u64 page_end;
u64 end;
- ASSERT(folio_order(folio) == 0);
-
reserved_space = fsize;
- sb_start_pagefault(inode->i_sb);
+ sb_start_pagefault(inode->vfs_inode.i_sb);
page_start = folio_pos(folio);
page_end = page_start + folio_size(folio) - 1;
end = page_end;
@@ -1808,38 +1885,53 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
* end up waiting indefinitely to get a lock on the page currently
* being processed by btrfs_page_mkwrite() function.
*/
- ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- page_start, reserved_space);
- if (!ret2) {
- ret2 = file_update_time(vmf->vma->vm_file);
- reserved = 1;
- }
- if (ret2) {
- ret = vmf_error(ret2);
- if (reserved)
- goto out;
+ ret = btrfs_check_data_free_space(inode, &data_reserved, page_start,
+ reserved_space, false);
+ if (ret < 0) {
+ size_t write_bytes = reserved_space;
+
+ if (btrfs_check_nocow_lock(inode, page_start, &write_bytes, false) <= 0)
+ goto out_noreserve;
+
+ only_release_metadata = true;
+
+ /*
+ * Can't write the whole range: there may be shared extents or
+ * holes in it. Bail out with @only_release_metadata set to true
+ * so that we unlock the nocow lock before returning the error.
+ */
+ if (write_bytes < reserved_space)
+ goto out_noreserve;
+ }
+ ret = btrfs_delalloc_reserve_metadata(inode, reserved_space,
+ reserved_space, false);
+ if (ret < 0) {
+ if (!only_release_metadata)
+ btrfs_free_reserved_data_space(inode, data_reserved,
+ page_start, reserved_space);
goto out_noreserve;
}
- /* Make the VM retry the fault. */
- ret = VM_FAULT_NOPAGE;
+ ret = file_update_time(vmf->vma->vm_file);
+ if (ret < 0)
+ goto out;
again:
- down_read(&BTRFS_I(inode)->i_mmap_lock);
+ down_read(&inode->i_mmap_lock);
folio_lock(folio);
- size = i_size_read(inode);
+ size = i_size_read(&inode->vfs_inode);
- if ((folio->mapping != inode->i_mapping) ||
+ if ((folio->mapping != inode->vfs_inode.i_mapping) ||
(page_start >= size)) {
/* Page got truncated out from underneath us. */
goto out_unlock;
}
folio_wait_writeback(folio);
- lock_extent(io_tree, page_start, page_end, &cached_state);
- ret2 = set_folio_extent_mapped(folio);
- if (ret2 < 0) {
- ret = vmf_error(ret2);
- unlock_extent(io_tree, page_start, page_end, &cached_state);
+ btrfs_lock_extent(io_tree, page_start, page_end, &cached_state);
+ ret = set_folio_extent_mapped(folio);
+ if (ret < 0) {
+ btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
goto out_unlock;
}
@@ -1847,23 +1939,27 @@ again:
* We can't set the delalloc bits if there are pending ordered
* extents. Drop our locks and wait for them to finish.
*/
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, fsize);
+ ordered = btrfs_lookup_ordered_range(inode, page_start, fsize);
if (ordered) {
- unlock_extent(io_tree, page_start, page_end, &cached_state);
+ btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
folio_unlock(folio);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
+ up_read(&inode->i_mmap_lock);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
goto again;
}
- if (folio->index == ((size - 1) >> PAGE_SHIFT)) {
+ if (folio_contains(folio, (size - 1) >> PAGE_SHIFT)) {
reserved_space = round_up(size - page_start, fs_info->sectorsize);
if (reserved_space < fsize) {
+ const u64 to_free = fsize - reserved_space;
+
end = page_start + reserved_space - 1;
- btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved, page_start,
- fsize - reserved_space, true);
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, to_free, true);
+ else
+ btrfs_delalloc_release_space(inode, data_reserved,
+ end + 1, to_free, true);
}
}
@@ -1874,15 +1970,13 @@ again:
* clear any delalloc bits within this page range since we have to
* reserve data&meta space before lock_page() (see above comments).
*/
- clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, &cached_state);
-
- ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
- &cached_state);
- if (ret2) {
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- ret = VM_FAULT_SIGBUS;
+ btrfs_clear_extent_bit(io_tree, page_start, end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, &cached_state);
+
+ ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state);
+ if (ret < 0) {
+ btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
goto out_unlock;
}
@@ -1899,27 +1993,44 @@ again:
btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
- btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
+ btrfs_set_inode_last_sub_trans(inode);
+
+ if (only_release_metadata)
+ btrfs_set_extent_bit(io_tree, page_start, end, EXTENT_NORESERVE,
+ &cached_state);
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
+ btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
+ up_read(&inode->i_mmap_lock);
- btrfs_delalloc_release_extents(BTRFS_I(inode), fsize);
- sb_end_pagefault(inode->i_sb);
+ btrfs_delalloc_release_extents(inode, fsize);
+ if (only_release_metadata)
+ btrfs_check_nocow_unlock(inode);
+ sb_end_pagefault(inode->vfs_inode.i_sb);
extent_changeset_free(data_reserved);
return VM_FAULT_LOCKED;
out_unlock:
folio_unlock(folio);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
+ up_read(&inode->i_mmap_lock);
out:
- btrfs_delalloc_release_extents(BTRFS_I(inode), fsize);
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
- reserved_space, (ret != 0));
-out_noreserve:
- sb_end_pagefault(inode->i_sb);
+ btrfs_delalloc_release_extents(inode, fsize);
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, reserved_space, true);
+ else
+ btrfs_delalloc_release_space(inode, data_reserved, page_start,
+ reserved_space, true);
extent_changeset_free(data_reserved);
- return ret;
+out_noreserve:
+ if (only_release_metadata)
+ btrfs_check_nocow_unlock(inode);
+
+ sb_end_pagefault(inode->vfs_inode.i_sb);
+
+ if (ret < 0)
+ return vmf_error(ret);
+
+ /* Make the VM retry the fault. */
+ return VM_FAULT_NOPAGE;
}
static const struct vm_operations_struct btrfs_file_vm_ops = {
@@ -1928,46 +2039,49 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {
.page_mkwrite = btrfs_page_mkwrite,
};
-static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
+static int btrfs_file_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *filp = desc->file;
struct address_space *mapping = filp->f_mapping;
+ if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(filp)))))
+ return -EIO;
if (!mapping->a_ops->read_folio)
return -ENOEXEC;
file_accessed(filp);
- vma->vm_ops = &btrfs_file_vm_ops;
+ desc->vm_ops = &btrfs_file_vm_ops;
return 0;
}
-static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
- int slot, u64 start, u64 end)
+static bool hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
+ int slot, u64 start, u64 end)
{
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
if (slot < 0 || slot >= btrfs_header_nritems(leaf))
- return 0;
+ return false;
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid != btrfs_ino(inode) ||
key.type != BTRFS_EXTENT_DATA_KEY)
- return 0;
+ return false;
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
- return 0;
+ return false;
if (btrfs_file_extent_disk_bytenr(leaf, fi))
- return 0;
+ return false;
if (key.offset == end)
- return 1;
+ return true;
if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
- return 1;
- return 0;
+ return true;
+ return false;
}
static int fill_holes(struct btrfs_trans_handle *trans,
@@ -2041,7 +2155,7 @@ static int fill_holes(struct btrfs_trans_handle *trans,
out:
btrfs_release_path(path);
- hole_em = alloc_extent_map();
+ hole_em = btrfs_alloc_extent_map();
if (!hole_em) {
btrfs_drop_extent_map_range(inode, offset, end - 1, false);
btrfs_set_inode_full_sync(inode);
@@ -2055,7 +2169,7 @@ out:
hole_em->generation = trans->transid;
ret = btrfs_replace_extent_map_range(inode, hole_em, true);
- free_extent_map(hole_em);
+ btrfs_free_extent_map(hole_em);
if (ret)
btrfs_set_inode_full_sync(inode);
}
@@ -2088,15 +2202,33 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
0 : *start + *len - em->start - em->len;
*start = em->start + em->len;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return ret;
}
-static void btrfs_punch_hole_lock_range(struct inode *inode,
- const u64 lockstart,
- const u64 lockend,
- struct extent_state **cached_state)
+/*
+ * Check if there is no folio in the range.
+ *
+ * We cannot utilize filemap_range_has_page() in a filemap with large folios
+ * as we can hit the following false positive:
+ *
+ *         start             end
+ *             |               |
+ * |//|//|//|//| | | | | | | | |//|//|
+ *  \         /                 \   /
+ *    Folio A                Folio B
+ *
+ * There, large folios A and B cover the start and end indexes.
+ * In that case filemap_range_has_page() will always return true, even though
+ * such head/tail overlap is fine for btrfs_punch_hole_lock_range() usage.
+ *
+ * So here we only ensure that no other folio is in the range, excluding any
+ * head/tail large folio.
+ */
+static bool check_range_has_page(struct inode *inode, u64 start, u64 end)
{
+ struct folio_batch fbatch;
+ bool ret = false;
/*
* For subpage case, if the range is not at page boundary, we could
* have pages at the leading/tailing part of the range.
@@ -2104,15 +2236,48 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
* will always return true.
* So here we need to do extra page alignment for
* filemap_range_has_page().
+ *
+ * Also, do not decrement page_lockend yet: it can be 0, and subtracting 1 would underflow.
*/
- const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
- const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
+ const u64 page_lockstart = round_up(start, PAGE_SIZE);
+ const u64 page_lockend = round_down(end + 1, PAGE_SIZE);
+ const pgoff_t start_index = page_lockstart >> PAGE_SHIFT;
+ const pgoff_t end_index = (page_lockend - 1) >> PAGE_SHIFT;
+ pgoff_t tmp = start_index;
+ int found_folios;
+
+ /* The same page or adjacent pages. */
+ if (page_lockend <= page_lockstart)
+ return false;
+
+ folio_batch_init(&fbatch);
+ found_folios = filemap_get_folios(inode->i_mapping, &tmp, end_index, &fbatch);
+ for (int i = 0; i < found_folios; i++) {
+ struct folio *folio = fbatch.folios[i];
+
+ /* A large folio begins before the start. Not a target. */
+ if (folio->index < start_index)
+ continue;
+ /* A large folio extends beyond the end. Not a target. */
+ if (folio_next_index(folio) > end_index)
+ continue;
+ /* A folio doesn't cover the head/tail index. Found a target. */
+ ret = true;
+ break;
+ }
+ folio_batch_release(&fbatch);
+ return ret;
+}
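A worked example of the false positive being avoided: with 4K pages and one 16K folio caching indexes 0-3, punching bytes 4K-12K yields start_index == 1 and end_index == 2. filemap_get_folios() returns that folio, but folio->index (0) is below start_index, so it is skipped and the function reports no blocking page, precisely the case where filemap_range_has_page() would have returned true.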
+static void btrfs_punch_hole_lock_range(struct inode *inode,
+ const u64 lockstart, const u64 lockend,
+ struct extent_state **cached_state)
+{
while (1) {
truncate_pagecache_range(inode, lockstart, lockend);
- lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- cached_state);
+ btrfs_lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state);
/*
* We can't have ordered extents in the range, nor dirty/writeback
* pages, because we have locked the inode's VFS lock in exclusive
@@ -2123,12 +2288,11 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
* locking the range check if we have pages in the range, and if
* we do, unlock the range and retry.
*/
- if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
- page_lockend))
+ if (!check_range_has_page(inode, lockstart, lockend))
break;
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state);
}
btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
@@ -2241,7 +2405,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
struct btrfs_trans_handle *trans = NULL;
- struct btrfs_block_rsv *rsv;
+ struct btrfs_block_rsv rsv;
unsigned int rsv_count;
u64 cur_offset;
u64 len = end - start;
@@ -2250,13 +2414,9 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
if (end <= start)
return -EINVAL;
- rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
- if (!rsv) {
- ret = -ENOMEM;
- goto out;
- }
- rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
- rsv->failfast = true;
+ btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
+ rsv.size = btrfs_calc_insert_metadata_size(fs_info, 1);
+ rsv.failfast = true;
/*
* 1 - update the inode
@@ -2273,14 +2433,14 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
- goto out_free;
+ goto out_release;
}
- ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv,
min_size, false);
if (WARN_ON(ret))
goto out_trans;
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
cur_offset = start;
drop_args.path = path;
@@ -2302,9 +2462,9 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
* got EOPNOTSUPP via prealloc then we messed up and
* need to abort.
*/
- if (ret &&
- (ret != -EOPNOTSUPP ||
- (extent_info && extent_info->is_new_extent)))
+ if (unlikely(ret &&
+ (ret != -EOPNOTSUPP ||
+ (extent_info && extent_info->is_new_extent))))
btrfs_abort_transaction(trans, ret);
break;
}
@@ -2315,7 +2475,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
cur_offset < ino_size) {
ret = fill_holes(trans, inode, path, cur_offset,
drop_args.drop_end);
- if (ret) {
+ if (unlikely(ret)) {
/*
* If we failed then we didn't insert our hole
* entries for the area we dropped, so now the
@@ -2335,7 +2495,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_inode_clear_file_extent_range(inode,
cur_offset,
drop_args.drop_end - cur_offset);
- if (ret) {
+ if (unlikely(ret)) {
/*
* We couldn't clear our area, so we could
* presumably adjust up and corrupt the fs, so
@@ -2354,7 +2514,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_insert_replace_extent(trans, inode, path,
extent_info, replace_len,
drop_args.bytes_found);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -2396,10 +2556,10 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
}
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
- rsv, min_size, false);
+ &rsv, min_size, false);
if (WARN_ON(ret))
break;
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
cur_offset = drop_args.drop_end;
len = end - cur_offset;
@@ -2449,7 +2609,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
cur_offset < drop_args.drop_end) {
ret = fill_holes(trans, inode, path, cur_offset,
drop_args.drop_end);
- if (ret) {
+ if (unlikely(ret)) {
/* Same comment as above. */
btrfs_abort_transaction(trans, ret);
goto out_trans;
@@ -2458,7 +2618,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
/* See the comment in the loop above for the reasoning here. */
ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
drop_args.drop_end - cur_offset);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
@@ -2468,7 +2628,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_insert_replace_extent(trans, inode, path,
extent_info, extent_info->data_len,
drop_args.bytes_found);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
@@ -2476,16 +2636,15 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
out_trans:
if (!trans)
- goto out_free;
+ goto out_release;
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret)
btrfs_end_transaction(trans);
else
*trans_out = trans;
-out_free:
- btrfs_free_block_rsv(fs_info, rsv);
-out:
+out_release:
+ btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
return ret;
}
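Using an on-stack btrfs_block_rsv instead of btrfs_alloc_block_rsv() removes both the -ENOMEM failure path and the btrfs_free_block_rsv() call on exit; btrfs_block_rsv_release() with a (u64)-1 size, as done at out_release, hands back whatever reservation is still attached before the structure goes out of scope.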
@@ -2501,7 +2660,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
u64 lockend;
u64 tail_start;
u64 tail_len;
- u64 orig_start = offset;
+ const u64 orig_start = offset;
+ const u64 orig_end = offset + len - 1;
int ret = 0;
bool same_block;
u64 ino_size;
@@ -2533,18 +2693,14 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
/*
- * We needn't truncate any block which is beyond the end of the file
- * because we are sure there is no data there.
- */
- /*
* Only do this if we are in the same block and we aren't doing the
* entire block.
*/
if (same_block && len < fs_info->sectorsize) {
if (offset < ino_size) {
truncated_block = true;
- ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
- 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
+ orig_start, orig_end);
} else {
ret = 0;
}
@@ -2554,7 +2710,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
/* zero back part of the first block */
if (offset < ino_size) {
truncated_block = true;
- ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset, orig_start, orig_end);
if (ret) {
btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
return ret;
@@ -2591,8 +2747,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
if (tail_start + tail_len < ino_size) {
truncated_block = true;
ret = btrfs_truncate_block(BTRFS_I(inode),
- tail_start + tail_len,
- 0, 1);
+ tail_start + tail_len - 1,
+ orig_start, orig_end);
if (ret)
goto out_only_mutex;
}
@@ -2626,8 +2782,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
out:
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
out_only_mutex:
if (!updated_inode && truncated_block && !ret) {
/*
@@ -2701,12 +2857,22 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 range_start;
+ u64 range_end;
int ret;
int ret2;
if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
return 0;
+ range_start = round_down(i_size_read(inode), root->fs_info->sectorsize);
+ range_end = round_up(end, root->fs_info->sectorsize);
+
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), range_start,
+ range_end - range_start);
+ if (ret)
+ return ret;
+
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -2745,7 +2911,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
else
ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return ret;
}
@@ -2760,6 +2926,8 @@ static int btrfs_zero_range(struct inode *inode,
int ret;
u64 alloc_hint = 0;
const u64 sectorsize = fs_info->sectorsize;
+ const u64 orig_start = offset;
+ const u64 orig_end = offset + len - 1;
u64 alloc_start = round_down(offset, sectorsize);
u64 alloc_end = round_up(offset + len, sectorsize);
u64 bytes_to_reserve = 0;
@@ -2789,7 +2957,7 @@ static int btrfs_zero_range(struct inode *inode,
* do nothing except updating the inode's i_size if
* needed.
*/
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ret = btrfs_fallocate_update_isize(inode, offset + len,
mode);
goto out;
@@ -2802,9 +2970,9 @@ static int btrfs_zero_range(struct inode *inode,
ASSERT(IS_ALIGNED(alloc_start, sectorsize));
len = offset + len - alloc_start;
offset = alloc_start;
- alloc_hint = extent_map_block_start(em) + em->len;
+ alloc_hint = btrfs_extent_map_block_start(em) + em->len;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
@@ -2815,22 +2983,22 @@ static int btrfs_zero_range(struct inode *inode,
}
if (em->flags & EXTENT_FLAG_PREALLOC) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ret = btrfs_fallocate_update_isize(inode, offset + len,
mode);
goto out;
}
if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) {
- free_extent_map(em);
- ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
- 0);
+ btrfs_free_extent_map(em);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
+ orig_start, orig_end);
if (!ret)
ret = btrfs_fallocate_update_isize(inode,
offset + len,
mode);
return ret;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
alloc_start = round_down(offset, sectorsize);
alloc_end = alloc_start + sectorsize;
goto reserve_space;
@@ -2854,7 +3022,8 @@ static int btrfs_zero_range(struct inode *inode,
alloc_start = round_down(offset, sectorsize);
ret = 0;
} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
- ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset,
+ orig_start, orig_end);
if (ret)
goto out;
} else {
@@ -2871,8 +3040,8 @@ static int btrfs_zero_range(struct inode *inode,
alloc_end = round_up(offset + len, sectorsize);
ret = 0;
} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
- ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
- 0, 1);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
+ orig_start, orig_end);
if (ret)
goto out;
} else {
@@ -2897,16 +3066,16 @@ reserve_space:
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
alloc_start, bytes_to_reserve);
if (ret) {
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state);
goto out;
}
ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
alloc_end - alloc_start,
fs_info->sectorsize,
offset + len, &alloc_hint);
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
/* btrfs_prealloc_file_range releases reserved space on error */
if (ret) {
space_reserved = false;
@@ -2946,6 +3115,9 @@ static long btrfs_fallocate(struct file *file, int mode,
int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
int ret;
+ if (unlikely(btrfs_is_shutdown(inode_to_fs_info(inode))))
+ return -EIO;
+
/* Do not allow fallocate in ZONED mode */
if (btrfs_is_zoned(inode_to_fs_info(inode)))
return -EOPNOTSUPP;
@@ -2992,7 +3164,8 @@ static long btrfs_fallocate(struct file *file, int mode,
* need to zero out the end of the block if i_size lands in the
* middle of a block.
*/
- ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size,
+ inode->i_size, (u64)-1);
if (ret)
goto out;
}
@@ -3017,8 +3190,8 @@ static long btrfs_fallocate(struct file *file, int mode,
}
locked_end = alloc_end - 1;
- lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- &cached_state);
+ btrfs_lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state);
btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
@@ -3030,8 +3203,8 @@ static long btrfs_fallocate(struct file *file, int mode,
ret = PTR_ERR(em);
break;
}
- last_byte = min(extent_map_end(em), alloc_end);
- actual_end = min_t(u64, extent_map_end(em), offset + len);
+ last_byte = min(btrfs_extent_map_end(em), alloc_end);
+ actual_end = min_t(u64, btrfs_extent_map_end(em), offset + len);
last_byte = ALIGN(last_byte, blocksize);
if (em->disk_bytenr == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
@@ -3040,19 +3213,19 @@ static long btrfs_fallocate(struct file *file, int mode,
ret = add_falloc_range(&reserve_list, cur_offset, range_len);
if (ret < 0) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
break;
}
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
&data_reserved, cur_offset, range_len);
if (ret < 0) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
break;
}
qgroup_reserved += range_len;
data_space_needed += range_len;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
cur_offset = last_byte;
}
@@ -3106,8 +3279,8 @@ static long btrfs_fallocate(struct file *file, int mode,
*/
ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
out_unlock:
- unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state);
out:
btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
extent_changeset_free(data_reserved);
@@ -3141,10 +3314,10 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end
if (inode->delalloc_bytes > 0) {
spin_unlock(&inode->lock);
*delalloc_start_ret = start;
- delalloc_len = count_range_bits(&inode->io_tree,
- delalloc_start_ret, end,
- len, EXTENT_DELALLOC, 1,
- cached_state);
+ delalloc_len = btrfs_count_range_bits(&inode->io_tree,
+ delalloc_start_ret, end,
+ len, EXTENT_DELALLOC, 1,
+ cached_state);
} else {
spin_unlock(&inode->lock);
}
@@ -3187,7 +3360,7 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end
* We could also use the extent map tree to find such delalloc that is
* being flushed, but using the ordered extents tree is more efficient
* because it's usually much smaller as ordered extents are removed from
- * the tree once they complete. With the extent maps, we mau have them
+ * the tree once they complete. With the extent maps, we may have them
* in the extent map tree for a very long time, and they were either
* created by previous writes or loaded by read operations.
*/
@@ -3453,7 +3626,7 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
last_extent_end = lockstart;
- lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
@@ -3599,7 +3772,7 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
}
out:
- unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
btrfs_free_path(path);
if (ret < 0)
@@ -3636,6 +3809,9 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
{
int ret;
+ if (unlikely(btrfs_is_shutdown(inode_to_fs_info(inode))))
+ return -EIO;
+
filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
ret = fsverity_file_open(inode, filp);
@@ -3648,6 +3824,9 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
ssize_t ret = 0;
+ if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(iocb->ki_filp)))))
+ return -EIO;
+
if (iocb->ki_flags & IOCB_DIRECT) {
ret = btrfs_direct_read(iocb, to);
if (ret < 0 || !iov_iter_count(to) ||
@@ -3658,13 +3837,23 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
return filemap_read(iocb, to, ret);
}
+static ssize_t btrfs_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(in)))))
+ return -EIO;
+
+ return filemap_splice_read(in, ppos, pipe, len, flags);
+}
+
const struct file_operations btrfs_file_operations = {
.llseek = btrfs_file_llseek,
.read_iter = btrfs_file_read_iter,
- .splice_read = filemap_splice_read,
+ .splice_read = btrfs_file_splice_read,
.write_iter = btrfs_file_write_iter,
.splice_write = iter_file_splice_write,
- .mmap = btrfs_file_mmap,
+ .mmap_prepare = btrfs_file_mmap_prepare,
.open = btrfs_file_open,
.release = btrfs_release_file,
.get_unmapped_area = thp_get_unmapped_area,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 05e173311c1a..f0f72850fab2 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -308,8 +308,9 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
bool locked = false;
if (block_group) {
- struct btrfs_path *path = btrfs_alloc_path();
+ BTRFS_PATH_AUTO_FREE(path);
+ path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto fail;
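The BTRFS_PATH_AUTO_FREE() conversions in this series rely on the scope-based
cleanup attribute, which is why the explicit btrfs_free_path() calls can be
dropped. A sketch of the macro as defined in fs/btrfs/ctree.h (shown here for
context; consult the tree for the exact definition):

	DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T))
	#define BTRFS_PATH_AUTO_FREE(path_name) \
		struct btrfs_path *path_name __free(btrfs_free_path) = NULL

The pointer starts out NULL and is freed automatically when it goes out of
scope, including on early returns.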
@@ -330,13 +331,12 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_CLEAR;
spin_unlock(&block_group->lock);
- btrfs_free_path(path);
}
btrfs_i_size_write(inode, 0);
truncate_pagecache(vfs_inode, 0);
- lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
/*
@@ -348,7 +348,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
- unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
if (ret)
goto fail;
@@ -366,7 +366,7 @@ fail:
static void readahead_cache(struct inode *inode)
{
struct file_ra_state ra;
- unsigned long last_index;
+ pgoff_t last_index;
file_ra_state_init(&ra, inode->i_mapping);
last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
@@ -457,7 +457,7 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
mask);
if (IS_ERR(folio)) {
io_ctl_drop_pages(io_ctl);
- return -ENOMEM;
+ return PTR_ERR(folio);
}
ret = set_folio_extent_mapped(folio);
@@ -968,8 +968,8 @@ int load_free_space_cache(struct btrfs_block_group *block_group)
path = btrfs_alloc_path();
if (!path)
return 0;
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
/*
* We must pass a path with search_commit_root set to btrfs_iget in
@@ -1080,9 +1080,8 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
/* Get the cluster for this block_group if it exists */
if (block_group && !list_empty(&block_group->cluster_list)) {
- cluster = list_entry(block_group->cluster_list.next,
- struct btrfs_free_cluster,
- block_group_list);
+ cluster = list_first_entry(&block_group->cluster_list,
+ struct btrfs_free_cluster, block_group_list);
}
if (!node && cluster) {
@@ -1160,8 +1159,8 @@ update_cache_item(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DELALLOC, NULL);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+ EXTENT_DELALLOC, NULL);
goto fail;
}
leaf = path->nodes[0];
@@ -1172,9 +1171,9 @@ update_cache_item(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
found_key.offset != offset) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
- inode->i_size - 1, EXTENT_DELALLOC,
- NULL);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
+ inode->i_size - 1, EXTENT_DELALLOC,
+ NULL);
btrfs_release_path(path);
goto fail;
}
@@ -1219,9 +1218,9 @@ static noinline_for_stack int write_pinned_extent_entries(
start = block_group->start;
while (start < block_group->start + block_group->length) {
- if (!find_first_extent_bit(unpin, start,
- &extent_start, &extent_end,
- EXTENT_DIRTY, NULL))
+ if (!btrfs_find_first_extent_bit(unpin, start,
+ &extent_start, &extent_end,
+ EXTENT_DIRTY, NULL))
return 0;
/* This pinned extent is out of our range */
@@ -1267,8 +1266,8 @@ static int flush_dirty_cache(struct inode *inode)
ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
if (ret)
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DELALLOC, NULL);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+ EXTENT_DELALLOC, NULL);
return ret;
}
@@ -1288,8 +1287,8 @@ cleanup_write_cache_enospc(struct inode *inode,
struct extent_state **cached_state)
{
io_ctl_drop_pages(io_ctl);
- unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ cached_state);
}
static int __btrfs_wait_cache_io(struct btrfs_root *root,
@@ -1414,8 +1413,8 @@ static int __btrfs_write_out_cache(struct inode *inode,
if (ret)
goto out_unlock;
- lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- &cached_state);
+ btrfs_lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ &cached_state);
io_ctl_set_generation(io_ctl, trans->transid);
@@ -1475,8 +1474,8 @@ static int __btrfs_write_out_cache(struct inode *inode,
io_ctl_drop_pages(io_ctl);
io_ctl_free(io_ctl);
- unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ &cached_state);
/*
* at this point the pages are under IO and we're happy,
@@ -2283,7 +2282,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
* If this block group has some small extents we don't want to
* use up all of our free slots in the cache with them, we want
* to reserve them to larger extents, however if we have plenty
- * of cache left then go ahead an dadd them, no sense in adding
+ * of cache left then go ahead and add them, no sense in adding
* the overhead of a bitmap if we don't have to.
*/
if (info->bytes <= fs_info->sectorsize * 8) {
@@ -2342,9 +2341,8 @@ again:
struct rb_node *node;
struct btrfs_free_space *entry;
- cluster = list_entry(block_group->cluster_list.next,
- struct btrfs_free_cluster,
- block_group_list);
+ cluster = list_first_entry(&block_group->cluster_list,
+ struct btrfs_free_cluster, block_group_list);
spin_lock(&cluster->lock);
node = rb_first(&cluster->root);
if (!node) {
@@ -3194,7 +3192,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
u64 *max_extent_size)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
- int err;
+ int ret2;
u64 search_start = cluster->window_start;
u64 search_bytes = bytes;
u64 ret = 0;
@@ -3202,8 +3200,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
search_start = min_start;
search_bytes = bytes;
- err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
- if (err) {
+ ret2 = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
+ if (ret2) {
*max_extent_size = max(get_max_extent_size(entry),
*max_extent_size);
return 0;
@@ -3658,7 +3656,7 @@ static int do_trimming(struct btrfs_block_group *block_group,
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
int ret;
- int update = 0;
+ bool bg_ro;
const u64 end = start + bytes;
const u64 reserved_end = reserved_start + reserved_bytes;
enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
@@ -3666,12 +3664,14 @@ static int do_trimming(struct btrfs_block_group *block_group,
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- if (!block_group->ro) {
+ bg_ro = block_group->ro;
+ if (!bg_ro) {
block_group->reserved += reserved_bytes;
+ spin_unlock(&block_group->lock);
space_info->bytes_reserved += reserved_bytes;
- update = 1;
+ } else {
+ spin_unlock(&block_group->lock);
}
- spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
@@ -3692,14 +3692,16 @@ static int do_trimming(struct btrfs_block_group *block_group,
list_del(&trim_entry->list);
mutex_unlock(&ctl->cache_writeout_mutex);
- if (update) {
+ if (!bg_ro) {
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- if (block_group->ro)
- space_info->bytes_readonly += reserved_bytes;
+ bg_ro = block_group->ro;
block_group->reserved -= reserved_bytes;
- space_info->bytes_reserved -= reserved_bytes;
spin_unlock(&block_group->lock);
+
+ space_info->bytes_reserved -= reserved_bytes;
+ if (bg_ro)
+ space_info->bytes_readonly += reserved_bytes;
spin_unlock(&space_info->lock);
}
@@ -3831,7 +3833,7 @@ out_unlock:
/*
* If we break out of trimming a bitmap prematurely, we should reset the
- * trimming bit. In a rather contrieved case, it's possible to race here so
+ * trimming bit. In a rather contrived case, it's possible to race here so
* reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
*
* start = start of bitmap
@@ -4144,7 +4146,7 @@ int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool act
if (!active) {
set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
ret = cleanup_free_space_cache_v1(fs_info, trans);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 39c6b96a4c25..1ad2ad384b9e 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -35,7 +35,7 @@ static struct btrfs_root *btrfs_free_space_root(
return btrfs_global_root(block_group->fs_info, &key);
}
-void set_free_space_tree_thresholds(struct btrfs_block_group *cache)
+void btrfs_set_free_space_tree_thresholds(struct btrfs_block_group *cache)
{
u32 bitmap_range;
size_t bitmap_size;
@@ -82,22 +82,19 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info));
if (ret)
- goto out;
+ return ret;
leaf = path->nodes[0];
info = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_free_space_info);
btrfs_set_free_space_extent_count(leaf, info, 0);
btrfs_set_free_space_flags(leaf, info, 0);
-
- ret = 0;
-out:
btrfs_release_path(path);
- return ret;
+ return 0;
}
EXPORT_FOR_TESTS
-struct btrfs_free_space_info *search_free_space_info(
+struct btrfs_free_space_info *btrfs_search_free_space_info(
struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path, int cow)
@@ -117,7 +114,7 @@ struct btrfs_free_space_info *search_free_space_info(
if (ret != 0) {
btrfs_warn(fs_info, "missing free space info for %llu",
block_group->start);
- ASSERT(0);
+ DEBUG_WARN();
return ERR_PTR(-ENOENT);
}
@@ -140,13 +137,13 @@ static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans,
if (ret < 0)
return ret;
- if (ret == 0) {
- ASSERT(0);
+ if (unlikely(ret == 0)) {
+ DEBUG_WARN();
return -EIO;
}
- if (p->slots[0] == 0) {
- ASSERT(0);
+ if (unlikely(p->slots[0] == 0)) {
+ DEBUG_WARN("no previous slot found");
return -EIO;
}
p->slots[0]--;
@@ -168,11 +165,9 @@ static unsigned long *alloc_bitmap(u32 bitmap_size)
/*
* GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse
- * into the filesystem as the free space bitmap can be modified in the
- * critical section of a transaction commit.
- *
- * TODO: push the memalloc_nofs_{save,restore}() to the caller where we
- * know that recursion is unsafe.
+ * into the filesystem here. All callers hold a transaction handle
+ * open, so if a GFP_KERNEL allocation recurses into the filesystem
+ * and triggers a transaction commit, we would deadlock.
*/
nofs_flag = memalloc_nofs_save();
ret = kvzalloc(bitmap_rounded_size, GFP_KERNEL);
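The scoped-NOFS idiom described by the rewritten comment pairs the save with a
restore once the allocation is done; memalloc_nofs_save() and
memalloc_nofs_restore() come from include/linux/sched/mm.h. A minimal sketch:

	unsigned int nofs_flag;

	/* GFP_KERNEL allocations behave as GFP_NOFS until the restore. */
	nofs_flag = memalloc_nofs_save();
	ptr = kvzalloc(size, GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);

This is needed because, as the comment notes, the kvmalloc() family does not
accept GFP_NOFS directly.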
@@ -201,9 +196,9 @@ static void le_bitmap_set(unsigned long *map, unsigned int start, int len)
}
EXPORT_FOR_TESTS
-int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path)
+int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = btrfs_free_space_root(block_group);
@@ -221,10 +216,8 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
bitmap = alloc_bitmap(bitmap_size);
- if (!bitmap) {
- ret = -ENOMEM;
- goto out;
- }
+ if (unlikely(!bitmap))
+		return -ENOMEM;
start = block_group->start;
end = block_group->start + block_group->length;
@@ -235,8 +228,10 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
while (!done) {
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
- if (ret)
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
leaf = path->nodes[0];
nr = 0;
@@ -271,30 +266,35 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
}
ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
- if (ret)
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
btrfs_release_path(path);
}
- info = search_free_space_info(trans, block_group, path, 1);
+ info = btrfs_search_free_space_info(trans, block_group, path, 1);
if (IS_ERR(info)) {
ret = PTR_ERR(info);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
leaf = path->nodes[0];
flags = btrfs_free_space_flags(leaf, info);
flags |= BTRFS_FREE_SPACE_USING_BITMAPS;
+ block_group->using_free_space_bitmaps = true;
+ block_group->using_free_space_bitmaps_cached = true;
btrfs_set_free_space_flags(leaf, info, flags);
expected_extent_count = btrfs_free_space_extent_count(leaf, info);
btrfs_release_path(path);
- if (extent_count != expected_extent_count) {
+ if (unlikely(extent_count != expected_extent_count)) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
expected_extent_count);
- ASSERT(0);
ret = -EIO;
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -315,8 +315,10 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, root, path, &key,
data_size);
- if (ret)
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
leaf = path->nodes[0];
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
@@ -331,15 +333,13 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
ret = 0;
out:
kvfree(bitmap);
- if (ret)
- btrfs_abort_transaction(trans, ret);
return ret;
}
EXPORT_FOR_TESTS
-int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path)
+int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = btrfs_free_space_root(block_group);
@@ -356,10 +356,8 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
bitmap = alloc_bitmap(bitmap_size);
- if (!bitmap) {
- ret = -ENOMEM;
- goto out;
- }
+ if (unlikely(!bitmap))
+		return -ENOMEM;
start = block_group->start;
end = block_group->start + block_group->length;
@@ -370,8 +368,10 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
while (!done) {
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
- if (ret)
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
leaf = path->nodes[0];
nr = 0;
@@ -400,49 +400,56 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
data_size = free_space_bitmap_size(fs_info,
found_key.offset);
- ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1);
+ path->slots[0]--;
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
read_extent_buffer(leaf, bitmap_cursor, ptr,
data_size);
nr++;
- path->slots[0]--;
} else {
ASSERT(0);
}
}
ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
- if (ret)
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
btrfs_release_path(path);
}
- info = search_free_space_info(trans, block_group, path, 1);
+ info = btrfs_search_free_space_info(trans, block_group, path, 1);
if (IS_ERR(info)) {
ret = PTR_ERR(info);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
leaf = path->nodes[0];
flags = btrfs_free_space_flags(leaf, info);
flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS;
+ block_group->using_free_space_bitmaps = false;
+ block_group->using_free_space_bitmaps_cached = true;
btrfs_set_free_space_flags(leaf, info, flags);
expected_extent_count = btrfs_free_space_extent_count(leaf, info);
btrfs_release_path(path);
- nrbits = block_group->length >> block_group->fs_info->sectorsize_bits;
+ nrbits = block_group->length >> fs_info->sectorsize_bits;
start_bit = find_next_bit_le(bitmap, nrbits, 0);
while (start_bit < nrbits) {
end_bit = find_next_zero_bit_le(bitmap, nrbits, start_bit);
ASSERT(start_bit < end_bit);
- key.objectid = start + start_bit * block_group->fs_info->sectorsize;
+ key.objectid = start + start_bit * fs_info->sectorsize;
key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
- key.offset = (end_bit - start_bit) * block_group->fs_info->sectorsize;
+ key.offset = (end_bit - start_bit) * fs_info->sectorsize;
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
- if (ret)
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
btrfs_release_path(path);
extent_count++;
@@ -450,21 +457,19 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
start_bit = find_next_bit_le(bitmap, nrbits, end_bit);
}
- if (extent_count != expected_extent_count) {
+ if (unlikely(extent_count != expected_extent_count)) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
expected_extent_count);
- ASSERT(0);
ret = -EIO;
+ btrfs_abort_transaction(trans, ret);
goto out;
}
ret = 0;
out:
kvfree(bitmap);
- if (ret)
- btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -481,11 +486,10 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
if (new_extents == 0)
return 0;
- info = search_free_space_info(trans, block_group, path, 1);
- if (IS_ERR(info)) {
- ret = PTR_ERR(info);
- goto out;
- }
+ info = btrfs_search_free_space_info(trans, block_group, path, 1);
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+
flags = btrfs_free_space_flags(path->nodes[0], info);
extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
@@ -495,19 +499,18 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
extent_count > block_group->bitmap_high_thresh) {
- ret = convert_free_space_to_bitmaps(trans, block_group, path);
+ ret = btrfs_convert_free_space_to_bitmaps(trans, block_group, path);
} else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
extent_count < block_group->bitmap_low_thresh) {
- ret = convert_free_space_to_extents(trans, block_group, path);
+ ret = btrfs_convert_free_space_to_extents(trans, block_group, path);
}
-out:
return ret;
}
EXPORT_FOR_TESTS
-int free_space_test_bit(struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 offset)
+bool btrfs_free_space_test_bit(struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 offset)
{
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -525,13 +528,13 @@ int free_space_test_bit(struct btrfs_block_group *block_group,
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
i = div_u64(offset - found_start,
block_group->fs_info->sectorsize);
- return !!extent_buffer_test_bit(leaf, ptr, i);
+ return extent_buffer_test_bit(leaf, ptr, i);
}
-static void free_space_set_bits(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 *start, u64 *size,
- int bit)
+static void free_space_modify_bits(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 *start, u64 *size,
+ bool set_bits)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct extent_buffer *leaf;
@@ -555,7 +558,7 @@ static void free_space_set_bits(struct btrfs_trans_handle *trans,
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
first = (*start - found_start) >> fs_info->sectorsize_bits;
last = (end - found_start) >> fs_info->sectorsize_bits;
- if (bit)
+ if (set_bits)
extent_buffer_bitmap_set(leaf, ptr, first, last - first);
else
extent_buffer_bitmap_clear(leaf, ptr, first, last - first);
@@ -599,13 +602,14 @@ static int free_space_next_bitmap(struct btrfs_trans_handle *trans,
static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path,
- u64 start, u64 size, int remove)
+ u64 start, u64 size, bool remove)
{
struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_key key;
u64 end = start + size;
u64 cur_start, cur_size;
- int prev_bit, next_bit;
+ bool prev_bit_set = false;
+ bool next_bit_set = false;
int new_extents;
int ret;
@@ -622,16 +626,16 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
if (ret)
- goto out;
+ return ret;
- prev_bit = free_space_test_bit(block_group, path, prev_block);
+ prev_bit_set = btrfs_free_space_test_bit(block_group, path, prev_block);
/* The previous block may have been in the previous bitmap. */
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (start >= key.objectid + key.offset) {
ret = free_space_next_bitmap(trans, root, path);
if (ret)
- goto out;
+ return ret;
}
} else {
key.objectid = start;
@@ -640,9 +644,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
if (ret)
- goto out;
-
- prev_bit = -1;
+ return ret;
}
/*
@@ -652,13 +654,13 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
cur_start = start;
cur_size = size;
while (1) {
- free_space_set_bits(trans, block_group, path, &cur_start, &cur_size,
- !remove);
+ free_space_modify_bits(trans, block_group, path, &cur_start,
+ &cur_size, !remove);
if (cur_size == 0)
break;
ret = free_space_next_bitmap(trans, root, path);
if (ret)
- goto out;
+ return ret;
}
/*
@@ -671,42 +673,36 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
if (end >= key.objectid + key.offset) {
ret = free_space_next_bitmap(trans, root, path);
if (ret)
- goto out;
+ return ret;
}
- next_bit = free_space_test_bit(block_group, path, end);
- } else {
- next_bit = -1;
+ next_bit_set = btrfs_free_space_test_bit(block_group, path, end);
}
if (remove) {
new_extents = -1;
- if (prev_bit == 1) {
+ if (prev_bit_set) {
/* Leftover on the left. */
new_extents++;
}
- if (next_bit == 1) {
+ if (next_bit_set) {
/* Leftover on the right. */
new_extents++;
}
} else {
new_extents = 1;
- if (prev_bit == 1) {
+ if (prev_bit_set) {
/* Merging with neighbor on the left. */
new_extents--;
}
- if (next_bit == 1) {
+ if (next_bit_set) {
/* Merging with neighbor on the right. */
new_extents--;
}
}
btrfs_release_path(path);
- ret = update_free_space_extent_count(trans, block_group, path,
- new_extents);
-
-out:
- return ret;
+ return update_free_space_extent_count(trans, block_group, path, new_extents);
}
static int remove_free_space_extent(struct btrfs_trans_handle *trans,
@@ -727,7 +723,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
if (ret)
- goto out;
+ return ret;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -759,7 +755,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
/* Delete the existing key (cases 1-4). */
ret = btrfs_del_item(trans, root, path);
if (ret)
- goto out;
+ return ret;
/* Add a key for leftovers at the beginning (cases 3 and 4). */
if (start > found_start) {
@@ -770,7 +766,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
if (ret)
- goto out;
+ return ret;
new_extents++;
}
@@ -783,81 +779,89 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
if (ret)
- goto out;
+ return ret;
new_extents++;
}
btrfs_release_path(path);
- ret = update_free_space_extent_count(trans, block_group, path,
- new_extents);
-
-out:
- return ret;
+ return update_free_space_extent_count(trans, block_group, path, new_extents);
}
-EXPORT_FOR_TESTS
-int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 start, u64 size)
+static int using_bitmaps(struct btrfs_block_group *bg, struct btrfs_path *path)
{
struct btrfs_free_space_info *info;
u32 flags;
- int ret;
- if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
- ret = __add_block_group_free_space(trans, block_group, path);
- if (ret)
- return ret;
- }
+ if (bg->using_free_space_bitmaps_cached)
+ return bg->using_free_space_bitmaps;
- info = search_free_space_info(NULL, block_group, path, 0);
+ info = btrfs_search_free_space_info(NULL, bg, path, 0);
if (IS_ERR(info))
return PTR_ERR(info);
flags = btrfs_free_space_flags(path->nodes[0], info);
btrfs_release_path(path);
- if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+ bg->using_free_space_bitmaps = (flags & BTRFS_FREE_SPACE_USING_BITMAPS);
+ bg->using_free_space_bitmaps_cached = true;
+
+ return bg->using_free_space_bitmaps;
+}
+
+EXPORT_FOR_TESTS
+int __btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 start, u64 size)
+{
+ int ret;
+
+ ret = __add_block_group_free_space(trans, block_group, path);
+ if (ret)
+ return ret;
+
+ ret = using_bitmaps(block_group, path);
+ if (ret < 0)
+ return ret;
+
+ if (ret)
return modify_free_space_bitmap(trans, block_group, path,
- start, size, 1);
- } else {
- return remove_free_space_extent(trans, block_group, path,
- start, size);
- }
+ start, size, true);
+
+ return remove_free_space_extent(trans, block_group, path, start, size);
}
-int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- u64 start, u64 size)
+int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ u64 start, u64 size)
{
struct btrfs_block_group *block_group;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
return 0;
path = btrfs_alloc_path();
- if (!path) {
+ if (unlikely(!path)) {
ret = -ENOMEM;
- goto out;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
}
block_group = btrfs_lookup_block_group(trans->fs_info, start);
- if (!block_group) {
- ASSERT(0);
+ if (unlikely(!block_group)) {
+ DEBUG_WARN("no block group found for start=%llu", start);
ret = -ENOENT;
- goto out;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
}
mutex_lock(&block_group->free_space_lock);
- ret = __remove_from_free_space_tree(trans, block_group, path, start,
- size);
+ ret = __btrfs_remove_from_free_space_tree(trans, block_group, path, start, size);
mutex_unlock(&block_group->free_space_lock);
-
- btrfs_put_block_group(block_group);
-out:
- btrfs_free_path(path);
if (ret)
btrfs_abort_transaction(trans, ret);
+
+ btrfs_put_block_group(block_group);
+
return ret;
}
@@ -904,7 +908,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans,
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
if (ret)
- goto out;
+ return ret;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -927,7 +931,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans,
if (found_end == start) {
ret = btrfs_del_item(trans, root, path);
if (ret)
- goto out;
+ return ret;
new_key.objectid = found_start;
new_key.offset += key.offset;
new_extents--;
@@ -944,7 +948,7 @@ right:
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
if (ret)
- goto out;
+ return ret;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -968,7 +972,7 @@ right:
if (found_start == end) {
ret = btrfs_del_item(trans, root, path);
if (ret)
- goto out;
+ return ret;
new_key.offset += key.offset;
new_extents--;
}
@@ -978,78 +982,67 @@ insert:
/* Insert the new key (cases 1-4). */
ret = btrfs_insert_empty_item(trans, root, path, &new_key, 0);
if (ret)
- goto out;
+ return ret;
btrfs_release_path(path);
- ret = update_free_space_extent_count(trans, block_group, path,
- new_extents);
-
-out:
- return ret;
+ return update_free_space_extent_count(trans, block_group, path, new_extents);
}
EXPORT_FOR_TESTS
-int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 start, u64 size)
+int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 start, u64 size)
{
- struct btrfs_free_space_info *info;
- u32 flags;
int ret;
- if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
- ret = __add_block_group_free_space(trans, block_group, path);
- if (ret)
- return ret;
- }
+ ret = __add_block_group_free_space(trans, block_group, path);
+ if (ret)
+ return ret;
- info = search_free_space_info(NULL, block_group, path, 0);
- if (IS_ERR(info))
- return PTR_ERR(info);
- flags = btrfs_free_space_flags(path->nodes[0], info);
- btrfs_release_path(path);
+ ret = using_bitmaps(block_group, path);
+ if (ret < 0)
+ return ret;
- if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+ if (ret)
return modify_free_space_bitmap(trans, block_group, path,
- start, size, 0);
- } else {
- return add_free_space_extent(trans, block_group, path, start,
- size);
- }
+ start, size, false);
+
+ return add_free_space_extent(trans, block_group, path, start, size);
}
-int add_to_free_space_tree(struct btrfs_trans_handle *trans,
- u64 start, u64 size)
+int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ u64 start, u64 size)
{
struct btrfs_block_group *block_group;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
return 0;
path = btrfs_alloc_path();
- if (!path) {
+ if (unlikely(!path)) {
ret = -ENOMEM;
- goto out;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
}
block_group = btrfs_lookup_block_group(trans->fs_info, start);
- if (!block_group) {
- ASSERT(0);
+ if (unlikely(!block_group)) {
+ DEBUG_WARN("no block group found for start=%llu", start);
ret = -ENOENT;
- goto out;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
}
mutex_lock(&block_group->free_space_lock);
- ret = __add_to_free_space_tree(trans, block_group, path, start, size);
+ ret = __btrfs_add_to_free_space_tree(trans, block_group, path, start, size);
mutex_unlock(&block_group->free_space_lock);
-
- btrfs_put_block_group(block_group);
-out:
- btrfs_free_path(path);
if (ret)
btrfs_abort_transaction(trans, ret);
+
+ btrfs_put_block_group(block_group);
+
return ret;
}
@@ -1099,11 +1092,22 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0);
if (ret < 0)
goto out_locked;
- ASSERT(ret == 0);
-
+ /*
+ * If ret is 1 (no key found), it means this is an empty block group,
+ * without any extents allocated from it and there's no block group
+ * item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree
+ * because we are using the block group tree feature (so block group
+ * items are stored in the block group tree) or this is a new block
+ * group created in the current transaction and its block group item
+ * was not yet inserted in the extent tree (that happens in
+ * btrfs_create_pending_block_groups() -> insert_block_group_item()).
+ * It also means there are no extents allocated for block groups with a
+ * start offset beyond this block group's end offset (this is the last,
+ * highest, block group).
+ */
start = block_group->start;
end = block_group->start + block_group->length;
- while (1) {
+ while (ret == 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.type == BTRFS_EXTENT_ITEM_KEY ||
@@ -1112,11 +1116,11 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
break;
if (start < key.objectid) {
- ret = __add_to_free_space_tree(trans,
- block_group,
- path2, start,
- key.objectid -
- start);
+ ret = __btrfs_add_to_free_space_tree(trans,
+ block_group,
+ path2, start,
+ key.objectid -
+ start);
if (ret)
goto out_locked;
}
@@ -1133,12 +1137,10 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
ret = btrfs_next_item(extent_root, path);
if (ret < 0)
goto out_locked;
- if (ret)
- break;
}
if (start < end) {
- ret = __add_to_free_space_tree(trans, block_group, path2,
- start, end - start);
+ ret = __btrfs_add_to_free_space_tree(trans, block_group, path2,
+ start, end - start);
if (ret)
goto out_locked;
}
@@ -1174,7 +1176,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
goto out_clear;
}
ret = btrfs_global_root_insert(free_space_root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_put_root(free_space_root);
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
@@ -1186,7 +1188,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
ret = populate_free_space_tree(trans, block_group);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out_clear;
@@ -1217,6 +1219,7 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans,
{
BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
+ struct rb_node *node;
int nr;
int ret;
@@ -1245,6 +1248,16 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
}
+ node = rb_first_cached(&trans->fs_info->block_group_cache_tree);
+ while (node) {
+ struct btrfs_block_group *bg;
+
+ bg = rb_entry(node, struct btrfs_block_group, cache_node);
+ clear_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &bg->runtime_flags);
+ node = rb_next(node);
+ cond_resched();
+ }
+
return 0;
}
@@ -1268,14 +1281,14 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
ret = clear_free_space_tree(trans, free_space_root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
}
ret = btrfs_del_root(trans, &free_space_root->root_key);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -1293,7 +1306,7 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
ret = btrfs_free_tree_block(trans, btrfs_root_id(free_space_root),
free_space_root->node, 0, 1);
btrfs_put_root(free_space_root);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -1322,7 +1335,7 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
ret = clear_free_space_tree(trans, free_space_root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -1334,12 +1347,18 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
+
+ if (test_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED,
+ &block_group->runtime_flags))
+ goto next;
+
ret = populate_free_space_tree(trans, block_group);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
}
+next:
if (btrfs_should_end_transaction(trans)) {
btrfs_end_transaction(trans);
trans = btrfs_start_transaction(free_space_root, 1);
@@ -1362,54 +1381,82 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
+ bool own_path = false;
int ret;
- clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags);
+ if (!test_and_clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
+ &block_group->runtime_flags))
+ return 0;
+
+ /*
+ * While rebuilding the free space tree we may allocate new metadata
+ * block groups while modifying the free space tree.
+ *
+ * Because during the rebuild (at btrfs_rebuild_free_space_tree()) we
+ * can use multiple transactions, every time btrfs_end_transaction() is
+ * called at btrfs_rebuild_free_space_tree() we finish the creation of
+ * new block groups by calling btrfs_create_pending_block_groups(), and
+ * that in turn calls us, through add_block_group_free_space(), to add
+ * a free space info item and a free space extent item for the block
+ * group.
+ *
+ * Then later btrfs_rebuild_free_space_tree() may find such new block
+ * groups and process them with populate_free_space_tree(), which can
+ * fail with EEXIST since there are already items for the block group in
+ * the free space tree. Notice that we say "may find" because a new
+ * block group may be added to the block groups rbtree in a node before
+ * or after the block group currently being processed by the rebuild
+ * process. So signal the rebuild process to skip such new block groups
+ * if it finds them.
+ */
+ set_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &block_group->runtime_flags);
+
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (unlikely(!path)) {
+ btrfs_abort_transaction(trans, -ENOMEM);
+ return -ENOMEM;
+ }
+ own_path = true;
+ }
ret = add_new_free_space_info(trans, block_group, path);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = __btrfs_add_to_free_space_tree(trans, block_group, path,
+ block_group->start, block_group->length);
if (ret)
- return ret;
+ btrfs_abort_transaction(trans, ret);
+
+out:
+ if (own_path)
+ btrfs_free_path(path);
- return __add_to_free_space_tree(trans, block_group, path,
- block_group->start,
- block_group->length);
+ return ret;
}
-int add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group)
+int btrfs_add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_path *path = NULL;
- int ret = 0;
+ int ret;
- if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
return 0;
mutex_lock(&block_group->free_space_lock);
- if (!test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags))
- goto out;
-
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = __add_block_group_free_space(trans, block_group, path);
-
-out:
- btrfs_free_path(path);
+ ret = __add_block_group_free_space(trans, block_group, NULL);
mutex_unlock(&block_group->free_space_lock);
- if (ret)
- btrfs_abort_transaction(trans, ret);
return ret;
}
-int remove_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group)
+int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group)
{
struct btrfs_root *root = btrfs_free_space_root(block_group);
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key, found_key;
struct extent_buffer *leaf;
u64 start, end;
@@ -1425,9 +1472,10 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
}
path = btrfs_alloc_path();
- if (!path) {
+ if (unlikely(!path)) {
ret = -ENOMEM;
- goto out;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
}
start = block_group->start;
@@ -1439,8 +1487,10 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
while (!done) {
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
- if (ret)
- goto out;
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
leaf = path->nodes[0];
nr = 0;
@@ -1468,16 +1518,15 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
}
ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
- if (ret)
- goto out;
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
btrfs_release_path(path);
}
ret = 0;
-out:
- btrfs_free_path(path);
- if (ret)
- btrfs_abort_transaction(trans, ret);
+
return ret;
}
@@ -1489,7 +1538,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
struct btrfs_fs_info *fs_info;
struct btrfs_root *root;
struct btrfs_key key;
- int prev_bit = 0, bit;
+ bool prev_bit_set = false;
/* Initialize to silence GCC. */
u64 extent_start = 0;
u64 end, offset;
@@ -1506,7 +1555,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
while (1) {
ret = btrfs_next_item(root, path);
if (ret < 0)
- goto out;
+ return ret;
if (ret)
break;
@@ -1520,10 +1569,12 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
offset = key.objectid;
while (offset < key.objectid + key.offset) {
- bit = free_space_test_bit(block_group, path, offset);
- if (prev_bit == 0 && bit == 1) {
+ bool bit_set;
+
+ bit_set = btrfs_free_space_test_bit(block_group, path, offset);
+ if (!prev_bit_set && bit_set) {
extent_start = offset;
- } else if (prev_bit == 1 && bit == 0) {
+ } else if (prev_bit_set && !bit_set) {
u64 space_added;
ret = btrfs_add_new_free_space(block_group,
@@ -1531,7 +1582,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
offset,
&space_added);
if (ret)
- goto out;
+ return ret;
total_found += space_added;
if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
@@ -1539,30 +1590,27 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
}
extent_count++;
}
- prev_bit = bit;
+ prev_bit_set = bit_set;
offset += fs_info->sectorsize;
}
}
- if (prev_bit == 1) {
+ if (prev_bit_set) {
ret = btrfs_add_new_free_space(block_group, extent_start, end, NULL);
if (ret)
- goto out;
+ return ret;
extent_count++;
}
- if (extent_count != expected_extent_count) {
+ if (unlikely(extent_count != expected_extent_count)) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
expected_extent_count);
- ASSERT(0);
- ret = -EIO;
- goto out;
+ DEBUG_WARN();
+ return -EIO;
}
- ret = 0;
-out:
- return ret;
+ return 0;
}
static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
@@ -1589,7 +1637,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
ret = btrfs_next_item(root, path);
if (ret < 0)
- goto out;
+ return ret;
if (ret)
break;
@@ -1605,7 +1653,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
key.objectid + key.offset,
&space_added);
if (ret)
- goto out;
+ return ret;
total_found += space_added;
if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
@@ -1614,22 +1662,19 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
extent_count++;
}
- if (extent_count != expected_extent_count) {
+ if (unlikely(extent_count != expected_extent_count)) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
expected_extent_count);
- ASSERT(0);
- ret = -EIO;
- goto out;
+ DEBUG_WARN();
+ return -EIO;
}
- ret = 0;
-out:
- return ret;
+ return 0;
}
-int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
+int btrfs_load_free_space_tree(struct btrfs_caching_control *caching_ctl)
{
struct btrfs_block_group *block_group;
struct btrfs_free_space_info *info;
@@ -1646,11 +1691,11 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
* Just like caching_thread() doesn't want to deadlock on the extent
* tree, we don't want to deadlock on the free space tree.
*/
- path->skip_locking = 1;
- path->search_commit_root = 1;
+ path->skip_locking = true;
+ path->search_commit_root = true;
path->reada = READA_FORWARD;
- info = search_free_space_info(NULL, block_group, path, 0);
+ info = btrfs_search_free_space_info(NULL, block_group, path, 0);
if (IS_ERR(info))
return PTR_ERR(info);
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
index e6c6d6f4f221..3d9a5d4477fc 100644
--- a/fs/btrfs/free-space-tree.h
+++ b/fs/btrfs/free-space-tree.h
@@ -22,39 +22,39 @@ struct btrfs_trans_handle;
#define BTRFS_FREE_SPACE_BITMAP_SIZE 256
#define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE)
-void set_free_space_tree_thresholds(struct btrfs_block_group *block_group);
+void btrfs_set_free_space_tree_thresholds(struct btrfs_block_group *block_group);
int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info);
int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info);
int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info);
-int load_free_space_tree(struct btrfs_caching_control *caching_ctl);
-int add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group);
-int remove_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group);
-int add_to_free_space_tree(struct btrfs_trans_handle *trans,
- u64 start, u64 size);
-int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- u64 start, u64 size);
+int btrfs_load_free_space_tree(struct btrfs_caching_control *caching_ctl);
+int btrfs_add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group);
+int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group);
+int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ u64 start, u64 size);
+int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ u64 start, u64 size);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_free_space_info *
-search_free_space_info(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path, int cow);
-int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
+btrfs_search_free_space_info(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 start, u64 size);
-int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 start, u64 size);
-int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path);
-int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path);
-int free_space_test_bit(struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 offset);
+ struct btrfs_path *path, int cow);
+int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 start, u64 size);
+int __btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 start, u64 size);
+int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path);
+int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path);
+bool btrfs_free_space_test_bit(struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 offset);
#endif
#endif
diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c
index b2bb86f8d7cf..feb0a2faa837 100644
--- a/fs/btrfs/fs.c
+++ b/fs/btrfs/fs.c
@@ -55,6 +55,54 @@ size_t __attribute_const__ btrfs_get_num_csums(void)
}
/*
+ * We support the following block sizes for all systems:
+ *
+ * - 4K
+ * This is the most common block size. For PAGE_SIZE > 4K cases the subpage
+ * mode is used.
+ *
+ * - PAGE_SIZE
+ * The straightforward block size to support.
+ *
+ * And extra support for the following block sizes based on the kernel config:
+ *
+ * - MIN_BLOCKSIZE
+ * This is either 4K (regular builds) or 2K (debug builds).
+ * This allows testing subpage routines on x86_64.
+ */
+bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize)
+{
+ /* @blocksize should be validated first. */
+ ASSERT(is_power_of_2(blocksize) && blocksize >= BTRFS_MIN_BLOCKSIZE &&
+ blocksize <= BTRFS_MAX_BLOCKSIZE);
+
+ if (blocksize == PAGE_SIZE || blocksize == SZ_4K || blocksize == BTRFS_MIN_BLOCKSIZE)
+ return true;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ /*
+ * Support for bs > ps is implemented by specifying a minimal folio order
+ * for the filemap, thus implying large data folios.
+ * On HIGHMEM systems we cannot always access the content of a (large)
+ * folio in one go, but must go through it page by page.
+ *
+ * A lot of features don't implement a proper PAGE-sized loop for large
+ * folios yet; these include:
+ *
+ * - compression
+ * - verity
+ * - encoded write
+ *
+ * Considering HIGHMEM is such a pain to deal with and it's going
+ * to be deprecated eventually, just reject HIGHMEM && bs > ps cases.
+ */
+ if (IS_ENABLED(CONFIG_HIGHMEM) && blocksize > PAGE_SIZE)
+ return false;
+ return true;
+#endif
+ return false;
+}
+
+/*
* Start exclusive operation @type, return true on success.
*/
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
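As the ASSERT() in btrfs_supported_blocksize() notes, callers are expected to
range-check the value first. A hypothetical caller-side sequence (sketch only,
not taken from this series) could look like:

	if (!is_power_of_2(blocksize) ||
	    blocksize < BTRFS_MIN_BLOCKSIZE || blocksize > BTRFS_MAX_BLOCKSIZE)
		return -EINVAL;
	if (!btrfs_supported_blocksize(blocksize))
		return -EOPNOTSUPP;

is_power_of_2() comes from include/linux/log2.h; BTRFS_MAX_BLOCKSIZE is added
to fs.h below.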
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index bcca43046064..0f7e1ef27891 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -29,6 +29,7 @@
#include "extent-io-tree.h"
#include "async-thread.h"
#include "block-rsv.h"
+#include "messages.h"
struct inode;
struct super_block;
@@ -59,6 +60,8 @@ struct btrfs_space_info;
#define BTRFS_MIN_BLOCKSIZE (SZ_4K)
#endif
+#define BTRFS_MAX_BLOCKSIZE (SZ_64K)
+
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
#define BTRFS_OLDEST_GENERATION 0ULL
@@ -71,6 +74,13 @@ struct btrfs_space_info;
#define BTRFS_SUPER_INFO_SIZE 4096
static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
+/* Array of bytes with variable length, hexadecimal format 0x1234 */
+#define BTRFS_CSUM_FMT "0x%*phN"
+#define BTRFS_CSUM_FMT_VALUE(size, bytes) size, bytes
+
+#define BTRFS_KEY_FMT "(%llu %u %llu)"
+#define BTRFS_KEY_FMT_VALUE(key) (key)->objectid, (key)->type, (key)->offset
+
/*
* Number of metadata items necessary for an unlink operation:
*
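A hypothetical use of the BTRFS_CSUM_FMT/BTRFS_KEY_FMT helpers above (sketch
only; the real call sites land elsewhere), where csum is a byte array and key
is a struct btrfs_key:

	btrfs_warn(fs_info, "csum mismatch " BTRFS_CSUM_FMT " for key " BTRFS_KEY_FMT,
		   BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, csum),
		   BTRFS_KEY_FMT_VALUE(&key));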
@@ -102,6 +112,8 @@ enum {
BTRFS_FS_STATE_RO,
/* Track if a transaction abort has been reported on this filesystem */
BTRFS_FS_STATE_TRANS_ABORTED,
+ /* Track if log replay has failed. */
+ BTRFS_FS_STATE_LOG_REPLAY_ABORTED,
/*
* Bio operations should be blocked on this filesystem because a source
* or target device is being destroyed as part of a device replace
@@ -120,6 +132,12 @@ enum {
/* No more delayed iput can be queued. */
BTRFS_FS_STATE_NO_DELAYED_IPUT,
+ /*
+ * Emergency shutdown, a step further than transaction aborted by
+ * rejecting all operations.
+ */
+ BTRFS_FS_STATE_EMERGENCY_SHUTDOWN,
+
BTRFS_FS_STATE_COUNT
};
@@ -243,6 +261,7 @@ enum {
BTRFS_MOUNT_NOSPACECACHE = (1ULL << 30),
BTRFS_MOUNT_IGNOREMETACSUMS = (1ULL << 31),
BTRFS_MOUNT_IGNORESUPERFLAGS = (1ULL << 32),
+ BTRFS_MOUNT_REF_TRACKER = (1ULL << 33),
};
/*
@@ -280,7 +299,7 @@ enum {
#ifdef CONFIG_BTRFS_EXPERIMENTAL
/*
- * Features under developmen like Extent tree v2 support is enabled
+ * Features under development, like extent tree v2 support, are enabled
* only under CONFIG_BTRFS_EXPERIMENTAL
*/
#define BTRFS_FEATURE_INCOMPAT_SUPP \
@@ -300,8 +319,19 @@ enum {
#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
+#define BTRFS_WARNING_COMMIT_INTERVAL (300)
#define BTRFS_DEFAULT_MAX_INLINE (2048)
+enum btrfs_compression_type {
+ BTRFS_COMPRESS_NONE = 0,
+ BTRFS_COMPRESS_ZLIB = 1,
+ BTRFS_COMPRESS_LZO = 2,
+ BTRFS_COMPRESS_ZSTD = 3,
+ BTRFS_NR_COMPRESS_TYPES = 4,
+
+ BTRFS_DEFRAG_DONT_COMPRESS,
+};
+
struct btrfs_dev_replace {
/* See #define above */
u64 replace_state;
@@ -419,6 +449,8 @@ struct btrfs_commit_stats {
u64 last_commit_dur;
/* The total commit duration in ns */
u64 total_commit_dur;
+ /* Start of the last critical section in ns. */
+ u64 critical_section_start_time;
};
struct btrfs_fs_info {
@@ -471,6 +503,8 @@ struct btrfs_fs_info {
struct btrfs_block_rsv delayed_block_rsv;
/* Block reservation for delayed refs */
struct btrfs_block_rsv delayed_refs_rsv;
+ /* Block reservation for treelog tree */
+ struct btrfs_block_rsv treelog_rsv;
struct btrfs_block_rsv empty_block_rsv;
@@ -500,6 +534,9 @@ struct btrfs_fs_info {
u64 last_trans_log_full_commit;
unsigned long long mount_opt;
+ /* Compress related structures. */
+ void *compr_wsm[BTRFS_NR_COMPRESS_TYPES];
+
int compress_type;
int compress_level;
u32 commit_interval;
@@ -621,7 +658,6 @@ struct btrfs_fs_info {
struct workqueue_struct *endio_workers;
struct workqueue_struct *endio_meta_workers;
struct workqueue_struct *rmw_workers;
- struct workqueue_struct *compressed_write_workers;
struct btrfs_workqueue *endio_write_workers;
struct btrfs_workqueue *endio_freespace_worker;
struct btrfs_workqueue *caching_workers;
@@ -710,8 +746,6 @@ struct btrfs_fs_info {
u32 data_chunk_allocations;
u32 metadata_ratio;
- void *bdev_holder;
-
/* Private scrub information */
struct mutex scrub_lock;
atomic_t scrubs_running;
@@ -736,12 +770,6 @@ struct btrfs_fs_info {
spinlock_t qgroup_lock;
/*
- * Used to avoid frequently calling ulist_alloc()/ulist_free()
- * when doing qgroup accounting, it must be protected by qgroup_lock.
- */
- struct ulist *qgroup_ulist;
-
- /*
* Protect user change for quota operations. If a transaction is needed,
* it must be started before locking this lock.
*/
@@ -776,10 +804,8 @@ struct btrfs_fs_info {
struct btrfs_delayed_root *delayed_root;
- /* Extent buffer radix tree */
- spinlock_t buffer_lock;
- /* Entries are eb->start / sectorsize */
- struct radix_tree_root buffer_radix;
+ /* Entries are eb->start >> nodesize_bits */
+ struct xarray buffer_tree;
/* Next backup root to be overwritten */
int backup_root_index;
@@ -810,9 +836,12 @@ struct btrfs_fs_info {
/* Cached block sizes */
u32 nodesize;
+ u32 nodesize_bits;
u32 sectorsize;
/* ilog2 of sectorsize, use to avoid 64bit division */
u32 sectorsize_bits;
+ u32 block_min_order;
+ u32 block_max_order;
u32 csum_size;
u32 csums_per_leaf;
u32 stripesize;
@@ -882,12 +911,10 @@ struct btrfs_fs_info {
struct lockdep_map btrfs_trans_pending_ordered_map;
struct lockdep_map btrfs_ordered_extent_map;
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+#ifdef CONFIG_BTRFS_DEBUG
spinlock_t ref_verify_lock;
struct rb_root block_tree;
-#endif
-#ifdef CONFIG_BTRFS_DEBUG
struct kobject *debug_kobj;
struct list_head allocated_roots;
@@ -909,6 +936,12 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
return mapping_gfp_constraint(mapping, ~__GFP_FS);
}
+/* Return the minimal folio size of the fs. */
+static inline unsigned int btrfs_min_folio_size(struct btrfs_fs_info *fs_info)
+{
+ return 1U << (PAGE_SHIFT + fs_info->block_min_order);
+}
+
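A worked example with assumed values (not from this patch): 4 KiB pages (PAGE_SHIFT == 12) and block_min_order == 2, i.e. a 16 KiB block size:

	unsigned int min_folio = 1U << (12 + 2);	/* 16384: one folio always covers a block */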
static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info)
{
return READ_ONCE(fs_info->generation);
@@ -1001,6 +1034,7 @@ static inline unsigned int btrfs_blocks_per_folio(const struct btrfs_fs_info *fs
return folio_size(folio) >> fs_info->sectorsize_bits;
}
+bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize);
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type);
bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
@@ -1099,6 +1133,27 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
(unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \
&(fs_info)->fs_state)))
+static inline bool btrfs_is_shutdown(struct btrfs_fs_info *fs_info)
+{
+ return test_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state);
+}
+
+static inline void btrfs_force_shutdown(struct btrfs_fs_info *fs_info)
+{
+ /*
+ * Here we do not want to use handle_fs_error(), which will mark the fs
+ * read-only.
+	 * Some call sites, like the shutdown ioctl, mark the fs shut down
+	 * while it is frozen, and the thaw path handles RO and RW
+	 * filesystems differently.
+ *
+ * So here we only mark the fs error without flipping it RO.
+ */
+ WRITE_ONCE(fs_info->fs_error, -EIO);
+ if (!test_and_set_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state))
+ btrfs_crit(fs_info, "emergency shutdown");
+}
+
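The intended call pattern, as seen at the entry of cow_file_range() and run_delalloc_nocow() later in this diff:

	if (unlikely(btrfs_is_shutdown(fs_info)))
		return -EIO;	/* reject the operation outright */

	/* ...and on an unrecoverable event, e.g. the shutdown ioctl: */
	btrfs_force_shutdown(fs_info);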
/*
* We use folio flag owner_2 to indicate there is an ordered extent with
* unfinished IO.
@@ -1111,9 +1166,9 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
#define EXPORT_FOR_TESTS
-static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info)
+static inline bool btrfs_is_testing(const struct btrfs_fs_info *fs_info)
{
- return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+ return unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state));
}
void btrfs_test_destroy_inode(struct inode *inode);
@@ -1122,9 +1177,9 @@ void btrfs_test_destroy_inode(struct inode *inode);
#define EXPORT_FOR_TESTS static
-static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info)
+static inline bool btrfs_is_testing(const struct btrfs_fs_info *fs_info)
{
- return 0;
+ return false;
}
#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 3530de0618c8..b73e1dd97208 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -78,13 +78,10 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
}
/* Returns NULL if no extref found */
-struct btrfs_inode_extref *
-btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const struct fscrypt_str *name,
- u64 inode_objectid, u64 ref_objectid, int ins_len,
- int cow)
+struct btrfs_inode_extref *btrfs_lookup_inode_extref(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct fscrypt_str *name,
+ u64 inode_objectid, u64 ref_objectid)
{
int ret;
struct btrfs_key key;
@@ -93,7 +90,7 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len);
- ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ERR_PTR(ret);
if (ret > 0)
@@ -109,7 +106,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
u64 inode_objectid, u64 ref_objectid,
u64 *index)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_inode_extref *extref;
struct extent_buffer *leaf;
@@ -129,9 +126,9 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0)
- ret = -ENOENT;
+ return -ENOENT;
if (ret < 0)
- goto out;
+ return ret;
/*
* Sanity check - did we find the right item for this name?
@@ -140,10 +137,9 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
*/
extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
ref_objectid, name);
- if (!extref) {
+ if (unlikely(!extref)) {
btrfs_abort_transaction(trans, -ENOENT);
- ret = -ENOENT;
- goto out;
+ return -ENOENT;
}
leaf = path->nodes[0];
@@ -152,12 +148,8 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
*index = btrfs_inode_extref_index(leaf, extref);
if (del_len == item_size) {
- /*
- * Common case only one ref in the item, remove the
- * whole item.
- */
- ret = btrfs_del_item(trans, root, path);
- goto out;
+ /* Common case only one ref in the item, remove the whole item. */
+ return btrfs_del_item(trans, root, path);
}
ptr = (unsigned long)extref;
@@ -168,9 +160,6 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
btrfs_truncate_item(trans, path, item_size - del_len, 1);
-out:
- btrfs_free_path(path);
-
return ret;
}
@@ -260,7 +249,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
int ret;
int ins_len = name->len + sizeof(*extref);
unsigned long ptr;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *leaf;
@@ -279,13 +268,13 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
path->slots[0],
ref_objectid,
name))
- goto out;
+ return ret;
btrfs_extend_item(trans, path, ins_len);
ret = 0;
}
if (ret < 0)
- goto out;
+ return ret;
leaf = path->nodes[0];
ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
@@ -298,9 +287,8 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)&extref->name;
write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
-out:
- btrfs_free_path(path);
- return ret;
+
+ return 0;
}
/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */
@@ -324,7 +312,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->skip_release_on_error = 1;
+ path->skip_release_on_error = true;
ret = btrfs_insert_empty_item(trans, root, path, &key,
ins_len);
if (ret == -EEXIST) {
@@ -456,7 +444,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_truncate_control *control)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
@@ -639,7 +627,7 @@ delete:
if (control->clear_extent_range) {
ret = btrfs_inode_clear_file_extent_range(control->inode,
clear_start, clear_len);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -678,7 +666,7 @@ delete:
btrfs_init_data_ref(&ref, control->ino, extent_offset,
btrfs_root_id(root), false);
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -696,7 +684,7 @@ delete:
ret = btrfs_del_items(trans, root, path,
pending_del_slot,
pending_del_nr);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -729,13 +717,12 @@ delete:
}
out:
if (ret >= 0 && pending_del_nr) {
- int err;
+ int ret2;
- err = btrfs_del_items(trans, root, path, pending_del_slot,
- pending_del_nr);
- if (err) {
- btrfs_abort_transaction(trans, err);
- ret = err;
+ ret2 = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr);
+ if (unlikely(ret2)) {
+ btrfs_abort_transaction(trans, ret2);
+ ret = ret2;
}
}
@@ -743,6 +730,5 @@ out:
if (!ret && control->last_size > new_size)
control->last_size = new_size;
- btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index c11b97fdccc4..6d9f5ad20646 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -101,13 +101,10 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *location, int mod);
-struct btrfs_inode_extref *btrfs_lookup_inode_extref(
- struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const struct fscrypt_str *name,
- u64 inode_objectid, u64 ref_objectid, int ins_len,
- int cow);
+struct btrfs_inode_extref *btrfs_lookup_inode_extref(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct fscrypt_str *name,
+ u64 inode_objectid, u64 ref_objectid);
struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf,
int slot,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index cc67d1a2d611..c4bee47829ed 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9,6 +9,7 @@
#include <linux/blk-cgroup.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/fs_struct.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
@@ -71,6 +72,10 @@
#include "backref.h"
#include "raid-stripe-tree.h"
#include "fiemap.h"
+#include "delayed-inode.h"
+
+#define COW_FILE_RANGE_KEEP_LOCKED (1UL << 0)
+#define COW_FILE_RANGE_NO_INLINE (1UL << 1)
struct btrfs_iget_args {
u64 ino;
@@ -127,7 +132,7 @@ static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
struct btrfs_fs_info *fs_info = warn->fs_info;
struct extent_buffer *eb;
struct btrfs_inode_item *inode_item;
- struct inode_fs_paths *ipath = NULL;
+ struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL;
struct btrfs_root *local_root;
struct btrfs_key key;
unsigned int nofs_flag;
@@ -174,8 +179,10 @@ static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
return ret;
}
ret = paths_from_inode(inum, ipath);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_put_root(local_root);
goto err;
+ }
/*
* We deliberately ignore the bit ipath might have been too small to
@@ -190,7 +197,6 @@ static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
}
btrfs_put_root(local_root);
- free_ipath(ipath);
return 0;
err:
@@ -198,7 +204,6 @@ err:
"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d",
warn->logical, warn->mirror_num, root, inum, offset, ret);
- free_ipath(ipath);
return ret;
}
@@ -230,21 +235,21 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off
if (logical == U64_MAX) {
btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
btrfs_warn_rl(fs_info,
-"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
+"csum failed root %lld ino %llu off %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d",
btrfs_root_id(inode->root), btrfs_ino(inode), file_off,
- CSUM_FMT_VALUE(csum_size, csum),
- CSUM_FMT_VALUE(csum_size, csum_expected),
+ BTRFS_CSUM_FMT_VALUE(csum_size, csum),
+ BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected),
mirror_num);
return;
}
logical += file_off;
btrfs_warn_rl(fs_info,
-"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
+"csum failed root %lld ino %llu off %llu logical %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d",
btrfs_root_id(inode->root),
btrfs_ino(inode), file_off, logical,
- CSUM_FMT_VALUE(csum_size, csum),
- CSUM_FMT_VALUE(csum_size, csum_expected),
+ BTRFS_CSUM_FMT_VALUE(csum_size, csum),
+ BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected),
mirror_num);
ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags);
@@ -308,26 +313,26 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
const u32 csum_size = root->fs_info->csum_size;
/* For data reloc tree, it's better to do a backref lookup instead. */
- if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
return print_data_reloc_error(inode, logical_start, csum,
csum_expected, mirror_num);
/* Output without objectid, which is more meaningful */
if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) {
btrfs_warn_rl(root->fs_info,
-"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
+"csum failed root %lld ino %lld off %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d",
btrfs_root_id(root), btrfs_ino(inode),
logical_start,
- CSUM_FMT_VALUE(csum_size, csum),
- CSUM_FMT_VALUE(csum_size, csum_expected),
+ BTRFS_CSUM_FMT_VALUE(csum_size, csum),
+ BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected),
mirror_num);
} else {
btrfs_warn_rl(root->fs_info,
-"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
+"csum failed root %llu ino %llu off %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d",
btrfs_root_id(root), btrfs_ino(inode),
logical_start,
- CSUM_FMT_VALUE(csum_size, csum),
- CSUM_FMT_VALUE(csum_size, csum_expected),
+ BTRFS_CSUM_FMT_VALUE(csum_size, csum),
+ BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected),
mirror_num);
}
}
@@ -367,7 +372,7 @@ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
}
/*
- * Unock inode i_rwsem.
+ * Unlock inode i_rwsem.
*
* ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
* to decide whether the lock acquired is shared or exclusive.
@@ -395,16 +400,18 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
u64 offset, u64 bytes)
{
- unsigned long index = offset >> PAGE_SHIFT;
- unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
+ pgoff_t index = offset >> PAGE_SHIFT;
+ const pgoff_t end_index = (offset + bytes - 1) >> PAGE_SHIFT;
struct folio *folio;
while (index <= end_index) {
folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
- index++;
- if (IS_ERR(folio))
+ if (IS_ERR(folio)) {
+ index++;
continue;
+ }
+ index = folio_next_index(folio);
/*
* Here we just clear all Ordered bits for every page in the
* range, then btrfs_mark_ordered_io_finished() will handle
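The hunk above converts btrfs_cleanup_ordered_extents() to a large-folio-aware walk; a hedged sketch of the pattern (the per-folio cleanup is elided):

	pgoff_t index = offset >> PAGE_SHIFT;
	const pgoff_t end_index = (offset + bytes - 1) >> PAGE_SHIFT;

	while (index <= end_index) {
		struct folio *folio = filemap_get_folio(mapping, index);

		if (IS_ERR(folio)) {	/* hole in the page cache */
			index++;
			continue;
		}
		index = folio_next_index(folio);	/* skip the whole folio at once */
		/* ... per-folio cleanup work ... */
		folio_put(folio);
	}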
@@ -423,18 +430,18 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode);
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
struct btrfs_new_inode_args *args)
{
- int err;
+ int ret;
if (args->default_acl) {
- err = __btrfs_set_acl(trans, args->inode, args->default_acl,
+ ret = __btrfs_set_acl(trans, args->inode, args->default_acl,
ACL_TYPE_DEFAULT);
- if (err)
- return err;
+ if (ret)
+ return ret;
}
if (args->acl) {
- err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
- if (err)
- return err;
+ ret = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
+ if (ret)
+ return ret;
}
if (!args->default_acl && !args->acl)
cache_no_acl(args->inode);
@@ -586,6 +593,10 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode,
if (size < i_size_read(&inode->vfs_inode))
return false;
+ /* Encrypted file cannot be inlined. */
+ if (IS_ENCRYPTED(&inode->vfs_inode))
+ return false;
+
return true;
}
@@ -629,7 +640,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
drop_args.replace_extent = true;
drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -637,7 +648,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
size, compressed_size, compress_type,
compressed_folio, update_i_size);
- if (ret && ret != -ENOSPC) {
+ if (unlikely(ret && ret != -ENOSPC)) {
btrfs_abort_transaction(trans, ret);
goto out;
} else if (ret == -ENOSPC) {
@@ -647,7 +658,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
ret = btrfs_update_inode(trans, inode);
- if (ret && ret != -ENOSPC) {
+ if (unlikely(ret && ret != -ENOSPC)) {
btrfs_abort_transaction(trans, ret);
goto out;
} else if (ret == -ENOSPC) {
@@ -686,12 +697,12 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode,
if (!can_cow_file_range_inline(inode, offset, size, compressed_size))
return 1;
- lock_extent(&inode->io_tree, offset, end, &cached);
+ btrfs_lock_extent(&inode->io_tree, offset, end, &cached);
ret = __cow_file_range_inline(inode, size, compressed_size,
compress_type, compressed_folio,
update_i_size);
if (ret > 0) {
- unlock_extent(&inode->io_tree, offset, end, &cached);
+ btrfs_unlock_extent(&inode->io_tree, offset, end, &cached);
return ret;
}
@@ -777,33 +788,19 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
if (!btrfs_inode_can_compress(inode)) {
- WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
- KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
- btrfs_ino(inode));
+ DEBUG_WARN("BTRFS: unexpected compression for ino %llu", btrfs_ino(inode));
return 0;
}
- /*
- * Only enable sector perfect compression for experimental builds.
- *
- * This is a big feature change for subpage cases, and can hit
- * different corner cases, so only limit this feature for
- * experimental build for now.
- *
- * ETA for moving this out of experimental builds is 6.15.
- */
- if (fs_info->sectorsize < PAGE_SIZE &&
- !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
- if (!PAGE_ALIGNED(start) ||
- !PAGE_ALIGNED(end + 1))
- return 0;
- }
+ /* Defrag ioctl takes precedence over mount options and properties. */
+ if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS)
+ return 0;
+ if (BTRFS_COMPRESS_NONE < inode->defrag_compress &&
+ inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES)
+ return 1;
/* force compress */
if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
return 1;
- /* defrag ioctl */
- if (inode->defrag_compress)
- return 1;
/* bad compression ratios */
if (inode->flags & BTRFS_INODE_NOCOMPRESS)
return 0;
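Condensed, the compression decision order this hunk establishes (a reading of the diff, not the full function):

	if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS)
		return 0;	/* 1. defrag ioctl: compression forced off */
	if (BTRFS_COMPRESS_NONE < inode->defrag_compress &&
	    inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES)
		return 1;	/* 1. defrag ioctl: compression forced on */
	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
		return 1;	/* 2. compress-force mount option */
	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;	/* 3. heuristics marked the inode incompressible */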
@@ -825,12 +822,11 @@ static inline void inode_should_defrag(struct btrfs_inode *inode,
static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end)
{
- unsigned long end_index = end >> PAGE_SHIFT;
+ const pgoff_t end_index = end >> PAGE_SHIFT;
struct folio *folio;
int ret = 0;
- for (unsigned long index = start >> PAGE_SHIFT;
- index <= end_index; index++) {
+ for (pgoff_t index = start >> PAGE_SHIFT; index <= end_index; index++) {
folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
if (IS_ERR(folio)) {
if (!ret)
@@ -864,21 +860,26 @@ static void compress_file_range(struct btrfs_work *work)
struct btrfs_inode *inode = async_chunk->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct address_space *mapping = inode->vfs_inode.i_mapping;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
u64 blocksize = fs_info->sectorsize;
u64 start = async_chunk->start;
u64 end = async_chunk->end;
u64 actual_end;
u64 i_size;
int ret = 0;
- struct folio **folios;
+ struct folio **folios = NULL;
unsigned long nr_folios;
unsigned long total_compressed = 0;
unsigned long total_in = 0;
- unsigned int poff;
+ unsigned int loff;
int i;
int compress_type = fs_info->compress_type;
int compress_level = fs_info->compress_level;
+ if (unlikely(btrfs_is_shutdown(fs_info)))
+ goto cleanup_and_bail_uncompressed;
+
inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
/*
@@ -912,8 +913,8 @@ static void compress_file_range(struct btrfs_work *work)
actual_end = min_t(u64, i_size, end + 1);
again:
folios = NULL;
- nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
- nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES);
+ nr_folios = (end >> min_folio_shift) - (start >> min_folio_shift) + 1;
+ nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED >> min_folio_shift);
/*
* we don't want to send crud past the end of i_size through
@@ -960,7 +961,7 @@ again:
goto cleanup_and_bail_uncompressed;
}
- if (inode->defrag_compress) {
+ if (0 < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) {
compress_type = inode->defrag_compress;
compress_level = inode->defrag_compress_level;
} else if (inode->prop_compress) {
@@ -969,18 +970,18 @@ again:
/* Compression level is applied here. */
ret = btrfs_compress_folios(compress_type, compress_level,
- mapping, start, folios, &nr_folios, &total_in,
+ inode, start, folios, &nr_folios, &total_in,
&total_compressed);
if (ret)
goto mark_incompressible;
/*
- * Zero the tail end of the last page, as we might be sending it down
+ * Zero the tail end of the last folio, as we might be sending it down
* to disk.
*/
- poff = offset_in_page(total_compressed);
- if (poff)
- folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff);
+ loff = (total_compressed & (min_folio_size - 1));
+ if (loff)
+ folio_zero_range(folios[nr_folios - 1], loff, min_folio_size - loff);
/*
* Try to create an inline extent.
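A numeric example of the loff computation above (values assumed): min_folio_size == 16384 and total_compressed == 20000 give

	loff = 20000 & (16384 - 1);	/* == 3616 */

so folio_zero_range() wipes bytes [3616, 16384) of the last folio before it can be sent to disk.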
@@ -1109,6 +1110,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
struct extent_state *cached = NULL;
struct extent_map *em;
int ret = 0;
+ bool free_pages = false;
u64 start = async_extent->start;
u64 end = async_extent->start + async_extent->ram_size - 1;
@@ -1129,14 +1131,17 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
}
if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
+ ASSERT(!async_extent->folios);
+ ASSERT(async_extent->nr_folios == 0);
submit_uncompressed_range(inode, async_extent, locked_folio);
+ free_pages = true;
goto done;
}
ret = btrfs_reserve_extent(root, async_extent->ram_size,
async_extent->compressed_size,
async_extent->compressed_size,
- 0, *alloc_hint, &ins, 1, 1);
+ 0, *alloc_hint, &ins, true, true);
if (ret) {
/*
* We can't reserve contiguous space for the compressed size.
@@ -1145,10 +1150,11 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
* fall back to uncompressed.
*/
submit_uncompressed_range(inode, async_extent, locked_folio);
+ free_pages = true;
goto done;
}
- lock_extent(io_tree, start, end, &cached);
+ btrfs_lock_extent(io_tree, start, end, &cached);
/* Here we're doing allocation and writeback of the compressed pages */
file_extent.disk_bytenr = ins.objectid;
@@ -1163,10 +1169,10 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
ret = PTR_ERR(em);
goto out_free_reserve;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
- 1 << BTRFS_ORDERED_COMPRESSED);
+ 1U << BTRFS_ORDERED_COMPRESSED);
if (IS_ERR(ordered)) {
btrfs_drop_extent_map_range(inode, start, end, false);
ret = PTR_ERR(ordered);
@@ -1186,12 +1192,14 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
done:
if (async_chunk->blkcg_css)
kthread_associate_blkcg(NULL);
+ if (free_pages)
+ free_async_extent_pages(async_extent);
kfree(async_extent);
return;
out_free_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
extent_clear_unlock_delalloc(inode, start, end,
NULL, &cached,
@@ -1218,7 +1226,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
u64 alloc_hint = 0;
read_lock(&em_tree->lock);
- em = search_extent_mapping(em_tree, start, num_bytes);
+ em = btrfs_search_extent_mapping(em_tree, start, num_bytes);
if (em) {
/*
* if block start isn't an actual block number then find the
@@ -1226,15 +1234,15 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* block is also bogus then just don't worry about it.
*/
if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
- free_extent_map(em);
- em = search_extent_mapping(em_tree, 0, 0);
+ btrfs_free_extent_map(em);
+ em = btrfs_search_extent_mapping(em_tree, 0, 0);
if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
- alloc_hint = extent_map_block_start(em);
+ alloc_hint = btrfs_extent_map_block_start(em);
if (em)
- free_extent_map(em);
+ btrfs_free_extent_map(em);
} else {
- alloc_hint = extent_map_block_start(em);
- free_extent_map(em);
+ alloc_hint = btrfs_extent_map_block_start(em);
+ btrfs_free_extent_map(em);
}
}
read_unlock(&em_tree->lock);
@@ -1251,18 +1259,18 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* locked_folio is the folio that writepage had locked already. We use
* it to make sure we don't do extra locks or unlocks.
*
- * When this function fails, it unlocks all pages except @locked_folio.
+ * When this function fails, it unlocks all folios except @locked_folio.
*
* When this function successfully creates an inline extent, it returns 1 and
- * unlocks all pages including locked_folio and starts I/O on them.
- * (In reality inline extents are limited to a single page, so locked_folio is
- * the only page handled anyway).
+ * unlocks all folios including locked_folio and starts I/O on them.
+ * (In reality inline extents are limited to a single block, so locked_folio is
+ * the only folio handled anyway).
*
- * When this function succeed and creates a normal extent, the page locking
+ * When this function succeeds and creates a normal extent, the folio locking
* status depends on the passed in flags:
*
- * - If @keep_locked is set, all pages are kept locked.
- * - Else all pages except for @locked_folio are unlocked.
+ * - If COW_FILE_RANGE_KEEP_LOCKED flag is set, all folios are kept locked.
+ * - Else all folios except for @locked_folio are unlocked.
*
* When a failure happens in the second or later iteration of the
* while-loop, the ordered extents created in previous iterations are cleaned up.
@@ -1270,7 +1278,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
static noinline int cow_file_range(struct btrfs_inode *inode,
struct folio *locked_folio, u64 start,
u64 end, u64 *done_offset,
- bool keep_locked, bool no_inline)
+ unsigned long flags)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1287,6 +1295,11 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
unsigned long page_ops;
int ret = 0;
+ if (unlikely(btrfs_is_shutdown(fs_info))) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
if (btrfs_is_free_space_inode(inode)) {
ret = -EINVAL;
goto out_unlock;
@@ -1298,7 +1311,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
- if (!no_inline) {
+ if (!(flags & COW_FILE_RANGE_NO_INLINE)) {
/* lets try to make an inline extent */
ret = cow_file_range_inline(inode, locked_folio, start, end, 0,
BTRFS_COMPRESS_NONE, NULL, false);
@@ -1326,7 +1339,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* Do set the Ordered (Private2) bit so we know this page was properly
* setup for writepage.
*/
- page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
+ page_ops = ((flags & COW_FILE_RANGE_KEEP_LOCKED) ? 0 : PAGE_UNLOCK);
page_ops |= PAGE_SET_ORDERED;
/*
@@ -1351,7 +1364,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
ret = btrfs_reserve_extent(root, num_bytes, num_bytes,
min_alloc_size, 0, alloc_hint,
- &ins, 1, 1);
+ &ins, true, true);
if (ret == -EAGAIN) {
/*
* btrfs_reserve_extent only returns -EAGAIN for zoned
@@ -1397,24 +1410,24 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* Locked range will be released either during error clean up or
* after the whole range is finished.
*/
- lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
- &cached);
+ btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
+ &cached);
em = btrfs_create_io_em(inode, start, &file_extent,
BTRFS_ORDERED_REGULAR);
if (IS_ERR(em)) {
- unlock_extent(&inode->io_tree, start,
- start + cur_alloc_size - 1, &cached);
+ btrfs_unlock_extent(&inode->io_tree, start,
+ start + cur_alloc_size - 1, &cached);
ret = PTR_ERR(em);
goto out_reserve;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
- 1 << BTRFS_ORDERED_REGULAR);
+ 1U << BTRFS_ORDERED_REGULAR);
if (IS_ERR(ordered)) {
- unlock_extent(&inode->io_tree, start,
- start + cur_alloc_size - 1, &cached);
+ btrfs_unlock_extent(&inode->io_tree, start,
+ start + cur_alloc_size - 1, &cached);
ret = PTR_ERR(ordered);
goto out_drop_extent_cache;
}
@@ -1469,7 +1482,7 @@ out_drop_extent_cache:
btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false);
out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
out_unlock:
/*
* Now, we have three regions to clean up:
@@ -1537,10 +1550,11 @@ out_unlock:
btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size,
end - start - cur_alloc_size + 1, NULL);
}
- btrfs_err_rl(fs_info,
- "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
- __func__, btrfs_root_id(inode->root),
- btrfs_ino(inode), orig_start, end + 1 - orig_start, ret);
+ btrfs_err(fs_info,
+"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu cur_alloc_size=%llu: %d",
+ __func__, btrfs_root_id(inode->root),
+ btrfs_ino(inode), orig_start, end + 1 - orig_start,
+ start, cur_alloc_size, ret);
return ret;
}
@@ -1578,8 +1592,8 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_
PAGE_SHIFT;
while (!list_empty(&async_chunk->extents)) {
- async_extent = list_entry(async_chunk->extents.next,
- struct async_extent, list);
+ async_extent = list_first_entry(&async_chunk->extents,
+ struct async_extent, list);
list_del(&async_extent->list);
submit_one_async_extent(async_chunk, async_extent, &alloc_hint);
}
@@ -1693,7 +1707,7 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode,
while (start <= end) {
ret = cow_file_range(inode, locked_folio, start, end,
- &done_offset, true, false);
+ &done_offset, COW_FILE_RANGE_KEEP_LOCKED);
if (ret)
return ret;
extent_write_locked_range(&inode->vfs_inode, locked_folio,
@@ -1749,9 +1763,9 @@ static int fallback_to_cow(struct btrfs_inode *inode,
* group that contains that extent to RO mode and therefore force COW
* when starting writeback.
*/
- lock_extent(io_tree, start, end, &cached_state);
- count = count_range_bits(io_tree, &range_start, end, range_bytes,
- EXTENT_NORESERVE, 0, NULL);
+ btrfs_lock_extent(io_tree, start, end, &cached_state);
+ count = btrfs_count_range_bits(io_tree, &range_start, end, range_bytes,
+ EXTENT_NORESERVE, 0, NULL);
if (count > 0 || is_space_ino || is_reloc_ino) {
u64 bytes = count;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1765,18 +1779,24 @@ static int fallback_to_cow(struct btrfs_inode *inode,
spin_unlock(&sinfo->lock);
if (count > 0)
- clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
- NULL);
+ btrfs_clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
+ &cached_state);
}
- unlock_extent(io_tree, start, end, &cached_state);
+ btrfs_unlock_extent(io_tree, start, end, &cached_state);
/*
* Don't try to create inline extents, as a mix of inline extent that
* is written out and unlocked directly and a normal NOCOW extent
* doesn't work.
+ *
+	 * And here we do not unlock the folios after a successful run; they are
+	 * unlocked once everything is finished, or by the error handling.
+ *
+ * This is to ensure error handling won't need to clear dirty/ordered flags without
+ * a locked folio, which can race with writeback.
*/
- ret = cow_file_range(inode, locked_folio, start, end, NULL, false,
- true);
+ ret = cow_file_range(inode, locked_folio, start, end, NULL,
+ COW_FILE_RANGE_NO_INLINE | COW_FILE_RANGE_KEEP_LOCKED);
ASSERT(ret != 1);
return ret;
}
@@ -1919,64 +1939,17 @@ static int can_nocow_file_extent(struct btrfs_path *path,
return ret < 0 ? ret : can_nocow;
}
-/*
- * Cleanup the dirty folios which will never be submitted due to error.
- *
- * When running a delalloc range, we may need to split the ranges (due to
- * fragmentation or NOCOW). If we hit an error in the later part, we will error
- * out and previously successfully executed range will never be submitted, thus
- * we have to cleanup those folios by clearing their dirty flag, starting and
- * finishing the writeback.
- */
-static void cleanup_dirty_folios(struct btrfs_inode *inode,
- struct folio *locked_folio,
- u64 start, u64 end, int error)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct address_space *mapping = inode->vfs_inode.i_mapping;
- pgoff_t start_index = start >> PAGE_SHIFT;
- pgoff_t end_index = end >> PAGE_SHIFT;
- u32 len;
-
- ASSERT(end + 1 - start < U32_MAX);
- ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
- IS_ALIGNED(end + 1, fs_info->sectorsize));
- len = end + 1 - start;
-
- /*
- * Handle the locked folio first.
- * The btrfs_folio_clamp_*() helpers can handle range out of the folio case.
- */
- btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len);
-
- for (pgoff_t index = start_index; index <= end_index; index++) {
- struct folio *folio;
-
- /* Already handled at the beginning. */
- if (index == locked_folio->index)
- continue;
- folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS);
- /* Cache already dropped, no need to do any cleanup. */
- if (IS_ERR(folio))
- continue;
- btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len);
- folio_unlock(folio);
- folio_put(folio);
- }
- mapping_set_error(mapping, error);
-}
-
static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio,
struct extent_state **cached,
struct can_nocow_file_extent_args *nocow_args,
u64 file_pos, bool is_prealloc)
{
struct btrfs_ordered_extent *ordered;
- u64 len = nocow_args->file_extent.num_bytes;
- u64 end = file_pos + len - 1;
+ const u64 len = nocow_args->file_extent.num_bytes;
+ const u64 end = file_pos + len - 1;
int ret = 0;
- lock_extent(&inode->io_tree, file_pos, end, cached);
+ btrfs_lock_extent(&inode->io_tree, file_pos, end, cached);
if (is_prealloc) {
struct extent_map *em;
@@ -1984,21 +1957,21 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio
em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent,
BTRFS_ORDERED_PREALLOC);
if (IS_ERR(em)) {
- unlock_extent(&inode->io_tree, file_pos, end, cached);
- return PTR_ERR(em);
+ ret = PTR_ERR(em);
+ goto error;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
}
ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent,
is_prealloc
- ? (1 << BTRFS_ORDERED_PREALLOC)
- : (1 << BTRFS_ORDERED_NOCOW));
+ ? (1U << BTRFS_ORDERED_PREALLOC)
+ : (1U << BTRFS_ORDERED_NOCOW));
if (IS_ERR(ordered)) {
if (is_prealloc)
btrfs_drop_extent_map_range(inode, file_pos, end, false);
- unlock_extent(&inode->io_tree, file_pos, end, cached);
- return PTR_ERR(ordered);
+ ret = PTR_ERR(ordered);
+ goto error;
}
if (btrfs_is_data_reloc_root(inode->root))
@@ -2010,23 +1983,30 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio
ret = btrfs_reloc_clone_csums(ordered);
btrfs_put_ordered_extent(ordered);
+ if (ret < 0)
+ goto error;
extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_CLEAR_DATA_RESV,
- PAGE_UNLOCK | PAGE_SET_ORDERED);
- /*
- * On error, we need to cleanup the ordered extents we created.
- *
- * We do not clear the folio Dirty flags because they are set and
- * cleaered by the caller.
- */
- if (ret < 0)
- btrfs_cleanup_ordered_extents(inode, file_pos, end);
+ PAGE_SET_ORDERED);
+ return ret;
+
+error:
+ btrfs_cleanup_ordered_extents(inode, file_pos, len);
+ extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_CLEAR_DATA_RESV,
+ PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK);
+ btrfs_err(inode->root->fs_info,
+ "%s failed, root=%lld inode=%llu start=%llu len=%llu: %d",
+ __func__, btrfs_root_id(inode->root), btrfs_ino(inode),
+ file_pos, len, ret);
return ret;
}
/*
- * when nowcow writeback call back. This checks for snapshots or COW copies
+ * Called back for nocow writeback. This checks for snapshots or COW copies
* of the extents that exist in the file, and COWs the file as required.
*
* If no cow copies or snapshots exist, we write directly to the existing
@@ -2038,18 +2018,28 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
- struct btrfs_path *path;
+ struct btrfs_path *path = NULL;
u64 cow_start = (u64)-1;
/*
* If not 0, represents the inclusive end of the last fallback_to_cow()
* range. Only for error handling.
+ *
+	 * The same goes for nocow_end; it avoids double-cleaning the range
+	 * already cleaned by nocow_one_range().
*/
u64 cow_end = 0;
+ u64 nocow_end = 0;
u64 cur_offset = start;
int ret;
bool check_prev = true;
u64 ino = btrfs_ino(inode);
struct can_nocow_file_extent_args nocow_args = { 0 };
+ /* The range that has ordered extent(s). */
+ u64 oe_cleanup_start;
+ u64 oe_cleanup_len = 0;
+ /* The range that is untouched. */
+ u64 untouched_start;
+ u64 untouched_len = 0;
/*
* Normally on a zoned device we're only doing COW writes, but in case
@@ -2058,6 +2048,10 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
*/
ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));
+ if (unlikely(btrfs_is_shutdown(fs_info))) {
+ ret = -EIO;
+ goto error;
+ }
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -2129,12 +2123,13 @@ next_slot:
/*
* If the found extent starts after requested offset, then
- * adjust extent_end to be right before this extent begins
+ * adjust cur_offset to be right before this extent begins.
*/
if (found_key.offset > cur_offset) {
- extent_end = found_key.offset;
- extent_type = 0;
- goto must_cow;
+ if (cow_start == (u64)-1)
+ cow_start = cur_offset;
+ cur_offset = found_key.offset;
+ goto next_slot;
}
/*
@@ -2212,8 +2207,10 @@ must_cow:
&nocow_args, cur_offset,
extent_type == BTRFS_FILE_EXTENT_PREALLOC);
btrfs_dec_nocow_writers(nocow_bg);
- if (ret < 0)
+ if (ret < 0) {
+ nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1;
goto error;
+ }
cur_offset = extent_end;
}
btrfs_release_path(path);
@@ -2230,86 +2227,105 @@ must_cow:
cow_start = (u64)-1;
}
- btrfs_free_path(path);
- return 0;
-
-error:
/*
- * There are several error cases:
- *
- * 1) Failed without falling back to COW
- * start cur_offset end
- * |/////////////| |
+	 * Everything finished without an error; we can unlock the folios now.
*
- * In this case, cow_start should be (u64)-1.
- *
- * For range [start, cur_offset) the folios are already unlocked (except
- * @locked_folio), EXTENT_DELALLOC already removed.
- * Need to clear the dirty flags and finish the ordered extents.
- *
- * 2) Failed with error before calling fallback_to_cow()
- *
- * start cow_start end
- * |/////////////| |
- *
- * In this case, only @cow_start is set, @cur_offset is between
- * [cow_start, end)
- *
- * It's mostly the same as case 1), just replace @cur_offset with
- * @cow_start.
- *
- * 3) Failed with error from fallback_to_cow()
- *
- * start cow_start cow_end end
- * |/////////////|-----------| |
- *
- * In this case, both @cow_start and @cow_end is set.
- *
- * For range [start, cow_start) it's the same as case 1).
- * But for range [cow_start, cow_end), all the cleanup is handled by
- * cow_file_range(), we should not touch anything in that range.
- *
- * So for all above cases, if @cow_start is set, cleanup ordered extents
- * for range [start, @cow_start), other wise cleanup range [start, @cur_offset).
+ * No need to touch the io tree range nor set folio ordered flag, as
+ * fallback_to_cow() and nocow_one_range() have already handled them.
*/
- if (cow_start != (u64)-1)
- cur_offset = cow_start;
+ extent_clear_unlock_delalloc(inode, start, end, locked_folio, NULL, 0, PAGE_UNLOCK);
- if (cur_offset > start) {
- btrfs_cleanup_ordered_extents(inode, start, cur_offset - start);
- cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret);
- }
+ btrfs_free_path(path);
+ return 0;
- /*
- * If an error happened while a COW region is outstanding, cur_offset
- * needs to be reset to @cow_end + 1 to skip the COW range, as
- * cow_file_range() will do the proper cleanup at error.
- */
- if (cow_end)
- cur_offset = cow_end + 1;
+error:
+ if (cow_start == (u64)-1) {
+ /*
+ * case a)
+ * start cur_offset end
+ * | OE cleanup | Untouched |
+ *
+ * We finished a fallback_to_cow() or nocow_one_range() call,
+ * but failed to check the next range.
+ *
+ * or
+ * start cur_offset nocow_end end
+ * | OE cleanup | Skip | Untouched |
+ *
+ * nocow_one_range() failed, the range [cur_offset, nocow_end] is
+ * already cleaned up.
+ */
+ oe_cleanup_start = start;
+ oe_cleanup_len = cur_offset - start;
+ if (nocow_end)
+ untouched_start = nocow_end + 1;
+ else
+ untouched_start = cur_offset;
+ untouched_len = end + 1 - untouched_start;
+ } else if (cow_start != (u64)-1 && cow_end == 0) {
+ /*
+ * case b)
+ * start cow_start cur_offset end
+ * | OE cleanup | Untouched |
+ *
+		 * We got a range that needs COW but failed before hitting the next
+		 * NOCOW range, thus [cow_start, cur_offset) doesn't yet have any OE.
+ */
+ oe_cleanup_start = start;
+ oe_cleanup_len = cow_start - start;
+ untouched_start = cow_start;
+ untouched_len = end + 1 - untouched_start;
+ } else {
+ /*
+ * case c)
+ * start cow_start cow_end end
+ * | OE cleanup | Skip | Untouched |
+ *
+		 * fallback_to_cow() failed, and since fallback_to_cow() does the
+		 * cleanup for its own range, we shouldn't touch the range
+		 * [cow_start, cow_end].
+ */
+ ASSERT(cow_start != (u64)-1 && cow_end != 0);
+ oe_cleanup_start = start;
+ oe_cleanup_len = cow_start - start;
+ untouched_start = cow_end + 1;
+ untouched_len = end + 1 - untouched_start;
+ }
+
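A worked example of case a) with a failing nocow_one_range() (numbers assumed): start == 0, cur_offset == SZ_64K, nocow_end == SZ_128K - 1 and end == SZ_256K - 1 yield

	oe_cleanup_start = 0;		/* [0, 64K) had ordered extents */
	oe_cleanup_len   = SZ_64K;
	untouched_start  = SZ_128K;	/* nocow_end + 1 */
	untouched_len    = SZ_128K;	/* [128K, 256K) was never started */

and the skipped [64K, 128K) slice is exactly what nocow_one_range() already cleaned up.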
+ if (oe_cleanup_len) {
+ const u64 oe_cleanup_end = oe_cleanup_start + oe_cleanup_len - 1;
+ btrfs_cleanup_ordered_extents(inode, oe_cleanup_start, oe_cleanup_len);
+ extent_clear_unlock_delalloc(inode, oe_cleanup_start, oe_cleanup_end,
+ locked_folio, NULL,
+ EXTENT_LOCKED | EXTENT_DELALLOC,
+ PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK);
+ }
- /*
- * We need to lock the extent here because we're clearing DELALLOC and
- * we're not locked at this point.
- */
- if (cur_offset < end) {
+ if (untouched_len) {
struct extent_state *cached = NULL;
+ const u64 untouched_end = untouched_start + untouched_len - 1;
- lock_extent(&inode->io_tree, cur_offset, end, &cached);
- extent_clear_unlock_delalloc(inode, cur_offset, end,
+ /*
+ * We need to lock the extent here because we're clearing DELALLOC and
+ * we're not locked at this point.
+ */
+ btrfs_lock_extent(&inode->io_tree, untouched_start, untouched_end, &cached);
+ extent_clear_unlock_delalloc(inode, untouched_start, untouched_end,
locked_folio, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
- btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL);
+ btrfs_qgroup_free_data(inode, NULL, untouched_start, untouched_len, NULL);
}
btrfs_free_path(path);
- btrfs_err_rl(fs_info,
- "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
- __func__, btrfs_root_id(inode->root),
- btrfs_ino(inode), start, end + 1 - start, ret);
+ btrfs_err(fs_info,
+"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu oe_cleanup=%llu oe_cleanup_len=%llu untouched_start=%llu untouched_len=%llu: %d",
+ __func__, btrfs_root_id(inode->root), btrfs_ino(inode),
+ start, end + 1 - start, cur_offset, oe_cleanup_start, oe_cleanup_len,
+ untouched_start, untouched_len, ret);
return ret;
}
@@ -2317,7 +2333,7 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
{
if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
if (inode->defrag_bytes &&
- test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
+ btrfs_test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
return false;
return true;
}
@@ -2339,7 +2355,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol
* can confuse the caller.
*/
ASSERT(!(end <= folio_pos(locked_folio) ||
- start >= folio_pos(locked_folio) + folio_size(locked_folio)));
+ start >= folio_next_pos(locked_folio)));
if (should_nocow(inode, start, end)) {
ret = run_delalloc_nocow(inode, locked_folio, start, end);
@@ -2355,8 +2371,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol
ret = run_delalloc_cow(inode, locked_folio, start, end, wbc,
true);
else
- ret = cow_file_range(inode, locked_folio, start, end, NULL,
- false, false);
+ ret = cow_file_range(inode, locked_folio, start, end, NULL, 0);
return ret;
}
@@ -2606,7 +2621,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
!btrfs_is_free_space_inode(inode) &&
!(state->state & EXTENT_NORESERVE) &&
(bits & EXTENT_CLEAR_DATA_RESV))
- btrfs_free_reserved_data_space_noquota(fs_info, len);
+ btrfs_free_reserved_data_space_noquota(inode, len);
percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
fs_info->delalloc_batch);
@@ -2690,12 +2705,12 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
if (em_len > search_len)
em_len = search_len;
- ret = set_extent_bit(&inode->io_tree, search_start,
- search_start + em_len - 1,
- EXTENT_DELALLOC_NEW, cached_state);
+ ret = btrfs_set_extent_bit(&inode->io_tree, search_start,
+ search_start + em_len - 1,
+ EXTENT_DELALLOC_NEW, cached_state);
next:
- search_start = extent_map_end(em);
- free_extent_map(em);
+ search_start = btrfs_extent_map_end(em);
+ btrfs_free_extent_map(em);
if (ret)
return ret;
}
@@ -2725,8 +2740,8 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
return ret;
}
- return set_extent_bit(&inode->io_tree, start, end,
- EXTENT_DELALLOC | extra_bits, cached_state);
+ return btrfs_set_extent_bit(&inode->io_tree, start, end,
+ EXTENT_DELALLOC | extra_bits, cached_state);
}
/* see btrfs_writepage_start_hook for details on why this is required */
@@ -2747,7 +2762,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
struct btrfs_inode *inode = fixup->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 page_start = folio_pos(folio);
- u64 page_end = folio_pos(folio) + folio_size(folio) - 1;
+ u64 page_end = folio_next_pos(folio) - 1;
int ret = 0;
bool free_delalloc_space = true;
@@ -2801,7 +2816,7 @@ again:
if (ret)
goto out_page;
- lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
/* already ordered? We're done */
if (folio_test_ordered(folio))
@@ -2809,8 +2824,8 @@ again:
ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
if (ordered) {
- unlock_extent(&inode->io_tree, page_start, page_end,
- &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, page_start, page_end,
+ &cached_state);
folio_unlock(folio);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
@@ -2836,7 +2851,7 @@ out_reserved:
if (free_delalloc_space)
btrfs_delalloc_release_space(inode, data_reserved, page_start,
PAGE_SIZE, true);
- unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
out_page:
if (ret) {
/*
@@ -2888,10 +2903,10 @@ int btrfs_writepage_cow_fixup(struct folio *folio)
* We should not hit such out-of-band dirty folios anymore.
*/
if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ DEBUG_WARN();
btrfs_err_rl(fs_info,
"root %lld ino %llu folio %llu is marked dirty without notifying the fs",
- BTRFS_I(inode)->root->root_key.objectid,
+ btrfs_root_id(BTRFS_I(inode)->root),
btrfs_ino(BTRFS_I(inode)),
folio_pos(folio));
return -EUCLEAN;
@@ -2937,7 +2952,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = inode->root;
const u64 sectorsize = root->fs_info->sectorsize;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key ins;
u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
@@ -2992,7 +3007,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
* If we dropped an inline extent here, we know the range where it is
* was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
* number of bytes only for that range containing the inline extent.
- * The remaining of the range will be processed when clearning the
+	 * The remainder of the range will be processed when clearing the
* EXTENT_DELALLOC_BIT bit through the ordered extent completion.
*/
if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
@@ -3019,8 +3034,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
file_pos - offset,
qgroup_reserved, &ins);
out:
- btrfs_free_path(path);
-
return ret;
}
@@ -3110,14 +3123,15 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
if (!freespace_inode)
btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
- if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
+ if (unlikely(test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags))) {
ret = -EIO;
goto out;
}
- if (btrfs_is_zoned(fs_info))
- btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
- ordered_extent->disk_num_bytes);
+ ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
+ if (ret)
+ goto out;
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
truncated = true;
@@ -3136,8 +3150,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
* depending on their current state).
*/
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
- clear_bits |= EXTENT_LOCKED;
- lock_extent(io_tree, start, end, &cached_state);
+ clear_bits |= EXTENT_LOCKED | EXTENT_FINISHING_ORDERED;
+ btrfs_lock_extent_bits(io_tree, start, end,
+ EXTENT_LOCKED | EXTENT_FINISHING_ORDERED,
+ &cached_state);
}
if (freespace_inode)
@@ -3153,7 +3169,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
trans->block_rsv = &inode->block_rsv;
ret = btrfs_insert_raid_extent(trans, ordered_extent);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3161,7 +3177,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
/* Logic error */
ASSERT(list_empty(&ordered_extent->list));
- if (!list_empty(&ordered_extent->list)) {
+ if (unlikely(!list_empty(&ordered_extent->list))) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3169,7 +3185,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, inode);
- if (ret) {
+ if (unlikely(ret)) {
/* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
}
@@ -3196,20 +3212,20 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->disk_num_bytes);
}
}
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- ret = unpin_extent_cache(inode, ordered_extent->file_offset,
- ordered_extent->num_bytes, trans->transid);
- if (ret < 0) {
+ ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset,
+ ordered_extent->num_bytes, trans->transid);
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = add_pending_csums(trans, &ordered_extent->list);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3221,26 +3237,24 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
*/
if ((clear_bits & EXTENT_DELALLOC_NEW) &&
!test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
- clear_extent_bit(&inode->io_tree, start, end,
- EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
- &cached_state);
+ btrfs_clear_extent_bit(&inode->io_tree, start, end,
+ EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
+ &cached_state);
btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, inode);
- if (ret) { /* -ENOMEM or corruption */
+ if (unlikely(ret)) { /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
goto out;
}
out:
- clear_extent_bit(&inode->io_tree, start, end, clear_bits,
- &cached_state);
+ btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits,
+ &cached_state);
if (trans)
btrfs_end_transaction(trans);
if (ret || truncated) {
- u64 unwritten_start = start;
-
/*
* If we failed to finish this ordered extent for any reason we
* need to make sure BTRFS_ORDERED_IOERR is set on the ordered
@@ -3252,10 +3266,6 @@ out:
if (ret)
btrfs_mark_ordered_extent_error(ordered_extent);
- if (truncated)
- unwritten_start += logical_len;
- clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
-
/*
* Drop extent maps for the part of the extent we didn't write.
*
@@ -3270,9 +3280,15 @@ out:
* we don't mess with the extent map tree in the NOCOW case, but
* for now simply skip this if we are the free space inode.
*/
- if (!btrfs_is_free_space_inode(inode))
+ if (!btrfs_is_free_space_inode(inode)) {
+ u64 unwritten_start = start;
+
+ if (truncated)
+ unwritten_start += logical_len;
+
btrfs_drop_extent_map_range(inode, unwritten_start,
end, false);
+ }
/*
* If the ordered extent had an IOERR or something else went
@@ -3299,7 +3315,7 @@ out:
NULL);
btrfs_free_reserved_extent(fs_info,
ordered_extent->disk_bytenr,
- ordered_extent->disk_num_bytes, 1);
+ ordered_extent->disk_num_bytes, true);
/*
* Actually free the qgroup rsv which was released when
* the ordered extent was created.
@@ -3334,35 +3350,89 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
}
/*
- * Verify the checksum for a single sector without any extra action that depend
- * on the type of I/O.
+ * Calculate the checksum of an fs block at physical memory address @paddr,
+ * and save the result to @dest.
+ *
+ * The folio containing @paddr must be large enough to contain a full fs block.
*/
-int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
- u32 pgoff, u8 *csum, const u8 * const csum_expected)
+void btrfs_calculate_block_csum_folio(struct btrfs_fs_info *fs_info,
+ const phys_addr_t paddr, u8 *dest)
{
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- char *kaddr;
+ struct folio *folio = page_folio(phys_to_page(paddr));
+ const u32 blocksize = fs_info->sectorsize;
+ const u32 step = min(blocksize, PAGE_SIZE);
+ const u32 nr_steps = blocksize / step;
+ phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
- ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
+ /* The full block must be inside the folio. */
+ ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio));
- shash->tfm = fs_info->csum_shash;
+ for (int i = 0; i < nr_steps; i++) {
+ u32 pindex = offset_in_folio(folio, paddr + i * step) >> PAGE_SHIFT;
- kaddr = kmap_local_page(page) + pgoff;
- crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
- kunmap_local(kaddr);
+ /*
+ * For bs <= ps cases, we will only run the loop once, so the offset
+ * inside the page will only be added to paddrs[0].
+ *
+ * For bs > ps cases, the block must be page aligned, thus offset
+ * inside the page will always be 0.
+ */
+ paddrs[i] = page_to_phys(folio_page(folio, pindex)) + offset_in_page(paddr);
+ }
+ return btrfs_calculate_block_csum_pages(fs_info, paddrs, dest);
+}
+
+/*
+ * Calculate the checksum of an fs block backed by multiple noncontiguous pages
+ * at @paddrs[] and save the result to @dest.
+ *
+ * @paddrs[] must contain one physical address per page-sized step of the block.
+ */
+void btrfs_calculate_block_csum_pages(struct btrfs_fs_info *fs_info,
+ const phys_addr_t paddrs[], u8 *dest)
+{
+ const u32 blocksize = fs_info->sectorsize;
+ const u32 step = min(blocksize, PAGE_SIZE);
+ const u32 nr_steps = blocksize / step;
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+
+ shash->tfm = fs_info->csum_shash;
+ crypto_shash_init(shash);
+ for (int i = 0; i < nr_steps; i++) {
+ const phys_addr_t paddr = paddrs[i];
+ void *kaddr;
+
+ ASSERT(offset_in_page(paddr) + step <= PAGE_SIZE);
+ kaddr = kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr);
+ crypto_shash_update(shash, kaddr, step);
+ kunmap_local(kaddr);
+ }
+ crypto_shash_final(shash, dest);
+}
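
The helper above feeds the digest one page-sized step at a time through the
kernel shash API. A minimal sketch of that init/update/final pattern over two
hypothetical fragments (frag_a/frag_b and their lengths are illustrative, error
handling omitted):

	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	u8 digest[BTRFS_CSUM_SIZE];

	shash->tfm = fs_info->csum_shash;
	crypto_shash_init(shash);
	crypto_shash_update(shash, frag_a, len_a);	/* first fragment */
	crypto_shash_update(shash, frag_b, len_b);	/* second fragment */
	crypto_shash_final(shash, digest);	/* same digest as hashing them contiguously */

This incremental property is what allows an fs block to be backed by
noncontiguous pages without first copying it into a bounce buffer.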
- if (memcmp(csum, csum_expected, fs_info->csum_size))
+/*
+ * Verify the checksum of a single fs block without any extra action that
+ * depends on the type of I/O.
+ *
+ * The folio containing @paddr must be large enough to contain a full fs block.
+ */
+int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum,
+ const u8 * const csum_expected)
+{
+ btrfs_calculate_block_csum_folio(fs_info, paddr, csum);
+ if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0))
return -EIO;
return 0;
}
/*
- * Verify the checksum of a single data sector.
+ * Verify the checksum of a single data sector, which can be scattered across
+ * noncontiguous pages.
*
* @bbio: btrfs_io_bio which contains the csum
* @dev: device the sector is on
* @bio_offset: offset to the beginning of the bio (in bytes)
- * @bv: bio_vec to check
+ * @paddrs: physical addresses which back the fs block
*
* Check if the checksum on a data block is valid. When a checksum mismatch is
* detected, report the error and fill the corrupted range with zero.
@@ -3370,33 +3440,34 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
* Return %true if the sector is ok or had no checksum to start with, else %false.
*/
bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
- u32 bio_offset, struct bio_vec *bv)
+ u32 bio_offset, const phys_addr_t paddrs[])
{
struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u32 blocksize = fs_info->sectorsize;
+ const u32 step = min(blocksize, PAGE_SIZE);
+ const u32 nr_steps = blocksize / step;
u64 file_offset = bbio->file_offset + bio_offset;
- u64 end = file_offset + bv->bv_len - 1;
+ u64 end = file_offset + blocksize - 1;
u8 *csum_expected;
u8 csum[BTRFS_CSUM_SIZE];
- ASSERT(bv->bv_len == fs_info->sectorsize);
-
if (!bbio->csum)
return true;
if (btrfs_is_data_reloc_root(inode->root) &&
- test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
- NULL)) {
+ btrfs_test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
+ NULL)) {
/* Skip the range without csum for data reloc inode */
- clear_extent_bits(&inode->io_tree, file_offset, end,
- EXTENT_NODATASUM);
+ btrfs_clear_extent_bit(&inode->io_tree, file_offset, end,
+ EXTENT_NODATASUM, NULL);
return true;
}
csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
fs_info->csum_size;
- if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum,
- csum_expected))
+ btrfs_calculate_block_csum_pages(fs_info, paddrs, csum);
+ if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0))
goto zeroit;
return true;
@@ -3405,7 +3476,8 @@ zeroit:
bbio->mirror_num);
if (dev)
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
- memzero_bvec(bv);
+ for (int i = 0; i < nr_steps; i++)
+ memzero_page(phys_to_page(paddrs[i]), offset_in_page(paddrs[i]), step);
return false;
}
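
The expected checksum above is located by indexing bbio->csum with the block
number inside the bio. A worked example with assumed parameters (crc32c, so
csum_size = 4, and a 4K sector size, so sectorsize_bits = 12): for
bio_offset = 8192 the lookup resolves to

	csum_expected = bbio->csum + (8192 >> 12) * 4;	/* bytes 8..11 of bbio->csum */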
@@ -3519,7 +3591,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
int ret;
ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
- if (ret && ret != -EEXIST) {
+ if (unlikely(ret && ret != -EEXIST)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -3544,7 +3616,7 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
int btrfs_orphan_cleanup(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key key, found_key;
struct btrfs_trans_handle *trans;
@@ -3734,19 +3806,22 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
out:
if (ret)
btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
- btrfs_free_path(path);
return ret;
}
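
BTRFS_PATH_AUTO_FREE() switches the path to scope-based cleanup, which is why
the explicit btrfs_free_path() calls disappear in this and the following hunks.
A sketch of the mechanism (the macro body below is my paraphrase of the ctree.h
definition built on <linux/cleanup.h>; treat it as an assumption rather than a
quote):

	DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T))
	#define BTRFS_PATH_AUTO_FREE(path_name)	\
		struct btrfs_path *path_name __free(btrfs_free_path) = NULL

The compiler then emits the free call on every return path, including early
returns, which is what makes dropping the out: labels safe.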
/*
- * very simple check to peek ahead in the leaf looking for xattrs. If we
- * don't find any xattrs, we know there can't be any acls.
+ * Look ahead in the leaf for xattrs. If we don't find any then we know there
+ * can't be any ACLs.
*
- * slot is the slot the inode is in, objectid is the objectid of the inode
+ * @leaf: the eb leaf where to search
+ * @slot: the slot the inode is in
+ * @objectid: the objectid of the inode
+ *
+ * Return true if there is an xattr/ACL, false otherwise.
*/
-static noinline int acls_after_inode_item(struct extent_buffer *leaf,
- int slot, u64 objectid,
- int *first_xattr_slot)
+static noinline bool acls_after_inode_item(struct extent_buffer *leaf,
+ int slot, u64 objectid,
+ int *first_xattr_slot)
{
u32 nritems = btrfs_header_nritems(leaf);
struct btrfs_key found_key;
@@ -3766,45 +3841,50 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
while (slot < nritems) {
btrfs_item_key_to_cpu(leaf, &found_key, slot);
- /* we found a different objectid, there must not be acls */
+ /* We found a different objectid, there must be no ACLs. */
if (found_key.objectid != objectid)
- return 0;
+ return false;
- /* we found an xattr, assume we've got an acl */
+ /* We found an xattr, assume we've got an ACL. */
if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
if (*first_xattr_slot == -1)
*first_xattr_slot = slot;
if (found_key.offset == xattr_access ||
found_key.offset == xattr_default)
- return 1;
+ return true;
}
/*
- * we found a key greater than an xattr key, there can't
- * be any acls later on
+ * We found a key greater than an xattr key, there can't be any
+ * ACLs later on.
*/
if (found_key.type > BTRFS_XATTR_ITEM_KEY)
- return 0;
+ return false;
slot++;
scanned++;
/*
- * it goes inode, inode backrefs, xattrs, extents,
- * so if there are a ton of hard links to an inode there can
- * be a lot of backrefs. Don't waste time searching too hard,
- * this is just an optimization
+ * The item order goes like:
+ * - inode
+ * - inode backrefs
+ * - xattrs
+ * - extents,
+ *
+ * so if there are lots of hard links to an inode there can be
+ * a lot of backrefs. Don't waste time searching too hard,
+ * this is just an optimization.
*/
if (scanned >= 8)
break;
}
- /* we hit the end of the leaf before we found an xattr or
- * something larger than an xattr. We have to assume the inode
- * has acls
+ /*
+ * We hit the end of the leaf before we found an xattr or something
+ * larger than an xattr. We have to assume the inode has ACLs.
*/
if (*first_xattr_slot == -1)
*first_xattr_slot = slot;
- return 1;
+ return true;
}
static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
@@ -3824,7 +3904,8 @@ static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
if (!inode->file_extent_tree)
return -ENOMEM;
- extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT);
+ btrfs_extent_io_tree_init(fs_info, inode->file_extent_tree,
+ IO_TREE_INODE_FILE_EXTENT);
/* Lockdep class is set only for the file extent tree. */
lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class);
@@ -3855,7 +3936,7 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
ASSERT(ret != -ENOMEM);
return ret;
} else if (existing) {
- WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
+ WARN_ON(!(inode_state_read_once(&existing->vfs_inode) & (I_WILL_FREE | I_FREEING)));
}
return 0;
@@ -3882,10 +3963,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
bool filled = false;
int first_xattr_slot;
- ret = btrfs_init_file_extent_tree(inode);
- if (ret)
- goto out;
-
ret = btrfs_fill_inode(inode, &rdev);
if (!ret)
filled = true;
@@ -3917,8 +3994,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item));
i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item));
btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
- btrfs_inode_set_file_extent_range(inode, 0,
- round_up(i_size_read(vfs_inode), fs_info->sectorsize));
inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime),
btrfs_timespec_nsec(leaf, &inode_item->atime));
@@ -3947,8 +4022,14 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
&inode->flags, &inode->ro_flags);
btrfs_update_inode_mapping_flags(inode);
+ btrfs_set_inode_mapping_order(inode);
cache_index:
+ ret = btrfs_init_file_extent_tree(inode);
+ if (ret)
+ goto out;
+ btrfs_inode_set_file_extent_range(inode, 0,
+ round_up(i_size_read(vfs_inode), fs_info->sectorsize));
/*
* If we were modified in the current generation and evicted from memory
* and then re-read we need to do a full sync since we don't have any
@@ -4079,45 +4160,35 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *item,
struct inode *inode)
{
- struct btrfs_map_token token;
u64 flags;
- btrfs_init_map_token(&token, leaf);
-
- btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
- btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
- btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
- btrfs_set_token_inode_mode(&token, item, inode->i_mode);
- btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
-
- btrfs_set_token_timespec_sec(&token, &item->atime,
- inode_get_atime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->atime,
- inode_get_atime_nsec(inode));
-
- btrfs_set_token_timespec_sec(&token, &item->mtime,
- inode_get_mtime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->mtime,
- inode_get_mtime_nsec(inode));
-
- btrfs_set_token_timespec_sec(&token, &item->ctime,
- inode_get_ctime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->ctime,
- inode_get_ctime_nsec(inode));
-
- btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec);
- btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec);
-
- btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
- btrfs_set_token_inode_generation(&token, item,
- BTRFS_I(inode)->generation);
- btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
- btrfs_set_token_inode_transid(&token, item, trans->transid);
- btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
+ btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
+ btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
+ btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
+ btrfs_set_inode_mode(leaf, item, inode->i_mode);
+ btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+
+ btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode));
+
+ btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode));
+
+ btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode));
+
+ btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec);
+ btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec);
+
+ btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
+ btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
+ btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode));
+ btrfs_set_inode_transid(leaf, item, trans->transid);
+ btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
BTRFS_I(inode)->ro_flags);
- btrfs_set_token_inode_flags(&token, item, flags);
- btrfs_set_token_inode_block_group(&token, item, 0);
+ btrfs_set_inode_flags(leaf, item, flags);
+ btrfs_set_inode_block_group(leaf, item, 0);
}
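
fill_inode_item() now uses the plain setters instead of the map-token variants.
Both families are generated by the same accessor macros in the btrfs accessors
header; a sketch for one field (my reading of the BTRFS_SETGET_FUNCS()
convention, stated as an assumption):

	/* Emits btrfs_inode_uid(eb, item) and btrfs_set_inode_uid(eb, item, val). */
	BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);

The token variants cache the last mapped extent buffer page to speed up long
runs of sets; dropping them here trades that micro-optimization for simpler
code.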
/*
@@ -4127,7 +4198,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
struct btrfs_inode_item *inode_item;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key key;
int ret;
@@ -4141,7 +4212,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
if (ret) {
if (ret > 0)
ret = -ENOENT;
- goto failed;
+ return ret;
}
leaf = path->nodes[0];
@@ -4150,10 +4221,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
btrfs_set_inode_last_trans(trans, inode);
- ret = 0;
-failed:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
/*
@@ -4198,6 +4266,23 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
return ret;
}
+static void update_time_after_link_or_unlink(struct btrfs_inode *dir)
+{
+ struct timespec64 now;
+
+ /*
+ * If we are replaying a log tree, we do not want to update the mtime
+ * and ctime of the parent directory with the current time, since the
+ * log replay procedure is responsible for setting them to their correct
+ * values (the ones it had when the fsync was done).
+ */
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &dir->root->fs_info->flags))
+ return;
+
+ now = inode_set_ctime_current(&dir->vfs_inode);
+ inode_set_mtime_to_ts(&dir->vfs_inode, now);
+}
+
/*
* unlink helper that gets used here in inode.c and in the tree logging
* recovery code. It remove a link in a directory with a given name, and
@@ -4219,20 +4304,22 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
u64 dir_ino = btrfs_ino(dir);
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
if (IS_ERR_OR_NULL(di)) {
- ret = di ? PTR_ERR(di) : -ENOENT;
- goto err;
+ btrfs_free_path(path);
+ return di ? PTR_ERR(di) : -ENOENT;
}
ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ /*
+ * Down the call chains below we'll also need to allocate a path, so no
+ * need to hold on to this one for longer than necessary.
+ */
+ btrfs_free_path(path);
if (ret)
- goto err;
- btrfs_release_path(path);
+ return ret;
/*
* If we don't have dir index, we have to get it by looking up
@@ -4253,21 +4340,21 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
}
ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
- if (ret) {
- btrfs_info(fs_info,
- "failed to delete reference to %.*s, inode %llu parent %llu",
- name->len, name->name, ino, dir_ino);
+ if (unlikely(ret)) {
+ btrfs_crit(fs_info,
+ "failed to delete reference to %.*s, root %llu inode %llu parent %llu",
+ name->len, name->name, btrfs_root_id(root), ino, dir_ino);
btrfs_abort_transaction(trans, ret);
- goto err;
+ return ret;
}
skip_backref:
if (rename_ctx)
rename_ctx->index = index;
ret = btrfs_delete_delayed_dir_index(trans, dir, index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
- goto err;
+ return ret;
}
/*
@@ -4277,8 +4364,8 @@ skip_backref:
* operations on the log tree, increasing latency for applications.
*/
if (!rename_ctx) {
- btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
- btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
+ btrfs_del_inode_ref_in_log(trans, name, inode, dir);
+ btrfs_del_dir_entries_in_log(trans, name, dir, index);
}
/*
@@ -4291,19 +4378,14 @@ skip_backref:
* holding.
*/
btrfs_run_delayed_iput(fs_info, inode);
-err:
- btrfs_free_path(path);
- if (ret)
- goto out;
btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
inode_inc_iversion(&inode->vfs_inode);
inode_set_ctime_current(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
- inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
- ret = btrfs_update_inode(trans, dir);
-out:
- return ret;
+ update_time_after_link_or_unlink(dir);
+
+ return btrfs_update_inode(trans, dir);
}
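
The i_size update above subtracts name->len * 2 because a btrfs directory's
i_size counts each entry name twice, once for its DIR_ITEM and once for its
DIR_INDEX item. As a worked example, unlinking a file named "abc" shrinks the
parent directory's i_size by 6:

	btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);	/* 2 * 3 = 6 */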
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
@@ -4382,7 +4464,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = dir->root;
struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key key;
@@ -4425,7 +4507,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_dir_item_key_to_cpu(leaf, di, &key);
WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
ret = btrfs_delete_one_dir_name(trans, root, path, di);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -4456,14 +4538,14 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
ret = btrfs_del_root_ref(trans, objectid,
btrfs_root_id(root), dir_ino,
&index, &fname.disk_name);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
}
ret = btrfs_delete_delayed_dir_index(trans, dir, index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -4475,7 +4557,6 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
if (ret)
btrfs_abort_transaction(trans, ret);
out:
- btrfs_free_path(path);
fscrypt_free_filename(&fname);
return ret;
}
@@ -4487,7 +4568,7 @@ out:
static noinline int may_destroy_subvol(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dir_item *di;
struct btrfs_key key;
struct fscrypt_str name = FSTR_INIT("default", 7);
@@ -4509,7 +4590,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
btrfs_err(fs_info,
"deleting default subvolume %llu is not allowed",
key.objectid);
- goto out;
+ return ret;
}
btrfs_release_path(path);
}
@@ -4520,14 +4601,13 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0)
- goto out;
- if (ret == 0) {
+ return ret;
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist a root
* with such id, but this is out of valid range.
*/
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
ret = 0;
@@ -4537,8 +4617,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY)
ret = -ENOTEMPTY;
}
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -4554,7 +4633,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root)
inode = btrfs_find_first_inode(root, min_ino);
while (inode) {
- if (atomic_read(&inode->vfs_inode.i_count) > 1)
+ if (icount_read(&inode->vfs_inode) > 1)
d_prune_aliases(&inode->vfs_inode);
min_ino = btrfs_ino(inode) + 1;
@@ -4637,13 +4716,13 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
btrfs_record_snapshot_destroy(trans, dir);
ret = btrfs_unlink_subvol(trans, dir, dentry);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
ret = btrfs_record_root_in_trans(trans, dest);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -4657,7 +4736,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
ret = btrfs_insert_orphan_item(trans,
fs_info->tree_root,
btrfs_root_id(dest));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -4665,7 +4744,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest));
- if (ret && ret != -ENOENT) {
+ if (unlikely(ret && ret != -ENOENT)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -4674,7 +4753,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
dest->root_item.received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
btrfs_root_id(dest));
- if (ret && ret != -ENOENT) {
+ if (unlikely(ret && ret != -ENOENT)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -4710,68 +4789,68 @@ out_up_write:
return ret;
}
-static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
+static int btrfs_rmdir(struct inode *vfs_dir, struct dentry *dentry)
{
- struct inode *inode = d_inode(dentry);
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_inode *dir = BTRFS_I(vfs_dir);
+ struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret = 0;
struct btrfs_trans_handle *trans;
- u64 last_unlink_trans;
struct fscrypt_name fname;
- if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
+ if (inode->vfs_inode.i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
- if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
+ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
btrfs_err(fs_info,
"extent tree v2 doesn't support snapshot deletion yet");
return -EOPNOTSUPP;
}
- return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
+ return btrfs_delete_subvolume(dir, dentry);
}
- ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
+ ret = fscrypt_setup_filename(vfs_dir, &dentry->d_name, 1, &fname);
if (ret)
return ret;
/* This needs to handle no-key deletions later on */
- trans = __unlink_start_trans(BTRFS_I(dir));
+ trans = __unlink_start_trans(dir);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_notrans;
}
- if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
- ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
+ /*
+ * Propagate the last_unlink_trans value of the deleted dir to its
+ * parent directory. This is to prevent an unrecoverable log tree in the
+ * case we do something like this:
+ * 1) create dir foo
+ * 2) create snapshot under dir foo
+ * 3) delete the snapshot
+ * 4) rmdir foo
+ * 5) mkdir foo
+ * 6) fsync foo or some file inside foo
+ *
+ * This is because we can't unlink other roots when replaying the dir
+ * deletes for directory foo.
+ */
+ if (inode->last_unlink_trans >= trans->transid)
+ btrfs_record_snapshot_destroy(trans, dir);
+
+ if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+ ret = btrfs_unlink_subvol(trans, dir, dentry);
goto out;
}
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ ret = btrfs_orphan_add(trans, inode);
if (ret)
goto out;
- last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
-
/* now the directory is empty */
- ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
- &fname.disk_name);
- if (!ret) {
- btrfs_i_size_write(BTRFS_I(inode), 0);
- /*
- * Propagate the last_unlink_trans value of the deleted dir to
- * its parent directory. This is to prevent an unrecoverable
- * log tree in the case we do something like this:
- * 1) create dir foo
- * 2) create snapshot under dir foo
- * 3) delete the snapshot
- * 4) rmdir foo
- * 5) mkdir foo
- * 6) fsync foo or some file inside foo
- */
- if (last_unlink_trans >= trans->transid)
- BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
- }
+ ret = btrfs_unlink_inode(trans, dir, inode, &fname.disk_name);
+ if (!ret)
+ btrfs_i_size_write(inode, 0);
out:
btrfs_end_transaction(trans);
out_notrans:
@@ -4781,20 +4860,80 @@ out_notrans:
return ret;
}
+static bool is_inside_block(u64 bytenr, u64 blockstart, u32 blocksize)
+{
+ ASSERT(IS_ALIGNED(blockstart, blocksize), "blockstart=%llu blocksize=%u",
+ blockstart, blocksize);
+
+ return (blockstart <= bytenr && bytenr <= blockstart + blocksize - 1);
+}
+
+static int truncate_block_zero_beyond_eof(struct btrfs_inode *inode, u64 start)
+{
+ const pgoff_t index = (start >> PAGE_SHIFT);
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ struct folio *folio;
+ u64 zero_start;
+ u64 zero_end;
+ int ret = 0;
+
+again:
+ folio = filemap_lock_folio(mapping, index);
+ /* No folio present. */
+ if (IS_ERR(folio))
+ return 0;
+
+ if (!folio_test_uptodate(folio)) {
+ ret = btrfs_read_folio(NULL, folio);
+ folio_lock(folio);
+ if (folio->mapping != mapping) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto again;
+ }
+ if (unlikely(!folio_test_uptodate(folio))) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+ }
+ folio_wait_writeback(folio);
+
+ /*
+ * We do not need to lock the extent range nor wait for ordered extents,
+ * as the range is already beyond EOF.
+ */
+
+ zero_start = max_t(u64, folio_pos(folio), start);
+ zero_end = folio_next_pos(folio);
+ folio_zero_range(folio, zero_start - folio_pos(folio),
+ zero_end - zero_start);
+
+out_unlock:
+ folio_unlock(folio);
+ folio_put(folio);
+ return ret;
+}
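
The zeroing above exists because, with fs block size < page size, userspace can
dirty page-cache bytes beyond EOF through mmap. A userspace sketch of how such
pollution arises (hypothetical file and sizes, illustrative only):

	/* 6K file, 4K fs blocks, 16K pages: the EOF folio extends past i_size. */
	int fd = open("/mnt/file", O_RDWR);
	char *p = mmap(NULL, 16384, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	p[12000] = 1;	/* beyond EOF, but inside the EOF folio */

Those bytes never reach disk, but they sit in the page cache, so code that later
extends i_size must zero them explicitly.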
+
/*
- * Read, zero a chunk and write a block.
+ * Handle the truncation of a fs block.
+ *
+ * @inode - inode that we're zeroing
+ * @offset - the file offset of the block to truncate
+ * The value must be inside [@start, @end], and the function will
+ * check whether the block that covers @offset needs to be zeroed.
+ * @start - the start file offset of the range we want to zero
+ * @end - the end (inclusive) file offset of the range we want to zero.
*
- * @inode - inode that we're zeroing
- * @from - the offset to start zeroing
- * @len - the length to zero, 0 to zero the entire range respective to the
- * offset
- * @front - zero up to the offset instead of from the offset on
+ * If the range is not block aligned, read out the folio that covers @offset,
+ * and if needed zero blocks that are inside the folio and covered by [@start, @end].
+ * If @start or @end + 1 lands inside a block, that block will be marked dirty
+ * for writeback.
*
- * This will find the block for the "from" offset and cow the block and zero the
- * part we want to zero. This is used with truncate and hole punching.
+ * This is used by hole punching, zero range and file expansion.
*/
-int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
- int front)
+int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct address_space *mapping = inode->vfs_inode.i_mapping;
@@ -4804,27 +4943,66 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
struct extent_changeset *data_reserved = NULL;
bool only_release_metadata = false;
u32 blocksize = fs_info->sectorsize;
- pgoff_t index = from >> PAGE_SHIFT;
- unsigned offset = from & (blocksize - 1);
+ pgoff_t index = (offset >> PAGE_SHIFT);
struct folio *folio;
gfp_t mask = btrfs_alloc_write_mask(mapping);
- size_t write_bytes = blocksize;
int ret = 0;
+ const bool in_head_block = is_inside_block(offset, round_down(start, blocksize),
+ blocksize);
+ const bool in_tail_block = is_inside_block(offset, round_down(end, blocksize),
+ blocksize);
+ bool need_truncate_head = false;
+ bool need_truncate_tail = false;
+ u64 zero_start;
+ u64 zero_end;
u64 block_start;
u64 block_end;
- if (IS_ALIGNED(offset, blocksize) &&
- (!len || IS_ALIGNED(len, blocksize)))
+ /* @offset should be inside the range. */
+ ASSERT(start <= offset && offset <= end, "offset=%llu start=%llu end=%llu",
+ offset, start, end);
+
+ /* The range is aligned at both ends. */
+ if (IS_ALIGNED(start, blocksize) && IS_ALIGNED(end + 1, blocksize)) {
+ /*
+ * For block size < page size case, we may have polluted blocks
+ * beyond EOF. So we also need to zero them out.
+ */
+ if (end == (u64)-1 && blocksize < PAGE_SIZE)
+ ret = truncate_block_zero_beyond_eof(inode, start);
goto out;
+ }
- block_start = round_down(from, blocksize);
+ /*
+ * @offset may not be inside the head nor tail block. In that case we
+ * don't need to do anything.
+ */
+ if (!in_head_block && !in_tail_block)
+ goto out;
+
+ /*
+ * Skip the truncation if the range inside the target block is already
+ * block aligned. The seemingly complex checks also handle the case
+ * where the head and tail ranges fall in the same block.
+ */
+ if (in_head_block && !IS_ALIGNED(start, blocksize))
+ need_truncate_head = true;
+ if (in_tail_block && !IS_ALIGNED(end + 1, blocksize))
+ need_truncate_tail = true;
+ if (!need_truncate_head && !need_truncate_tail)
+ goto out;
+
+ block_start = round_down(offset, blocksize);
block_end = block_start + blocksize - 1;
ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
blocksize, false);
if (ret < 0) {
+ size_t write_bytes = blocksize;
+
if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
- /* For nocow case, no need to reserve data space */
+ /* For nocow case, no need to reserve data space. */
+ ASSERT(write_bytes == blocksize, "write_bytes=%zu blocksize=%u",
+ write_bytes, blocksize);
only_release_metadata = true;
} else {
goto out;
@@ -4841,10 +5019,13 @@ again:
folio = __filemap_get_folio(mapping, index,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
if (IS_ERR(folio)) {
- btrfs_delalloc_release_space(inode, data_reserved, block_start,
- blocksize, true);
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, blocksize, true);
+ else
+ btrfs_delalloc_release_space(inode, data_reserved,
+ block_start, blocksize, true);
btrfs_delalloc_release_extents(inode, blocksize);
- ret = -ENOMEM;
+ ret = PTR_ERR(folio);
goto out;
}
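
A worked example of the head/tail selection above, with assumed values: 4K
blocks and a zero range [start, end] = [5000, 14000], so the head block is
[4096, 8191] and the tail block is [12288, 16383]:

	btrfs_truncate_block(inode, 5000, 5000, 14000);		/* zeroes [5000, 8191] */
	btrfs_truncate_block(inode, 14000, 5000, 14000);	/* zeroes [12288, 14000] */
	btrfs_truncate_block(inode, 10000, 5000, 14000);	/* neither head nor tail: no-op */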
@@ -4856,7 +5037,7 @@ again:
folio_put(folio);
goto again;
}
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
ret = -EIO;
goto out_unlock;
}
@@ -4874,11 +5055,11 @@ again:
folio_wait_writeback(folio);
- lock_extent(io_tree, block_start, block_end, &cached_state);
+ btrfs_lock_extent(io_tree, block_start, block_end, &cached_state);
ordered = btrfs_lookup_ordered_extent(inode, block_start);
if (ordered) {
- unlock_extent(io_tree, block_start, block_end, &cached_state);
+ btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
folio_unlock(folio);
folio_put(folio);
btrfs_start_ordered_extent(ordered);
@@ -4886,37 +5067,46 @@ again:
goto again;
}
- clear_extent_bit(&inode->io_tree, block_start, block_end,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- &cached_state);
+ btrfs_clear_extent_bit(&inode->io_tree, block_start, block_end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ &cached_state);
ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
&cached_state);
if (ret) {
- unlock_extent(io_tree, block_start, block_end, &cached_state);
+ btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
goto out_unlock;
}
- if (offset != blocksize) {
- if (!len)
- len = blocksize - offset;
- if (front)
- folio_zero_range(folio, block_start - folio_pos(folio),
- offset);
- else
- folio_zero_range(folio,
- (block_start - folio_pos(folio)) + offset,
- len);
+ if (end == (u64)-1) {
+ /*
+ * We're truncating beyond EOF. The remaining blocks are normally
+ * already holes, thus there is no need to zero them again, but it's
+ * possible for fs block size < page size cases to have memory
+ * mapped writes pollute ranges beyond EOF.
+ *
+ * In that case, although such polluted blocks beyond EOF will
+ * not reach disk, they still affect our page cache.
+ */
+ zero_start = max_t(u64, folio_pos(folio), start);
+ zero_end = min_t(u64, folio_next_pos(folio) - 1, end);
+ } else {
+ zero_start = max_t(u64, block_start, start);
+ zero_end = min_t(u64, block_end, end);
}
+ folio_zero_range(folio, zero_start - folio_pos(folio),
+ zero_end - zero_start + 1);
+
btrfs_folio_clear_checked(fs_info, folio, block_start,
block_end + 1 - block_start);
btrfs_folio_set_dirty(fs_info, folio, block_start,
block_end + 1 - block_start);
- unlock_extent(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
- set_extent_bit(&inode->io_tree, block_start, block_end,
- EXTENT_NORESERVE, NULL);
+ btrfs_set_extent_bit(&inode->io_tree, block_start, block_end,
+ EXTENT_NORESERVE, &cached_state);
+
+ btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
out_unlock:
if (ret) {
@@ -4967,7 +5157,7 @@ static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len)
drop_args.drop_cache = true;
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -5009,7 +5199,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
* rest of the block before we expand the i_size, otherwise we could
* expose stale data.
*/
- ret = btrfs_truncate_block(inode, oldsize, 0, 0);
+ ret = btrfs_truncate_block(inode, oldsize, oldsize, -1);
if (ret)
return ret;
@@ -5026,7 +5216,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
em = NULL;
break;
}
- last_byte = min(extent_map_end(em), block_end);
+ last_byte = min(btrfs_extent_map_end(em), block_end);
last_byte = ALIGN(last_byte, fs_info->sectorsize);
hole_size = last_byte - cur_offset;
@@ -5042,7 +5232,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
if (ret)
break;
- hole_em = alloc_extent_map();
+ hole_em = btrfs_alloc_extent_map();
if (!hole_em) {
btrfs_drop_extent_map_range(inode, cur_offset,
cur_offset + hole_size - 1,
@@ -5059,7 +5249,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
hole_em->generation = btrfs_get_fs_generation(fs_info);
ret = btrfs_replace_extent_map_range(inode, hole_em, true);
- free_extent_map(hole_em);
+ btrfs_free_extent_map(hole_em);
} else {
ret = btrfs_inode_set_file_extent_range(inode,
cur_offset, hole_size);
@@ -5067,14 +5257,14 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
break;
}
next:
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = NULL;
cur_offset = last_byte;
if (cur_offset >= block_end)
break;
}
- free_extent_map(em);
- unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
+ btrfs_free_extent_map(em);
+ btrfs_unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
return ret;
}
@@ -5154,7 +5344,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
if (ret && inode->i_nlink) {
- int err;
+ int ret2;
/*
* Truncate failed, so fix up the in-memory size. We
@@ -5162,9 +5352,9 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
* wait for disk_i_size to be stable and then update the
* in-memory size to match.
*/
- err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
- if (err)
- return err;
+ ret2 = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
+ if (ret2)
+ return ret2;
i_size_write(inode, BTRFS_I(inode)->disk_i_size);
}
}
@@ -5177,31 +5367,31 @@ static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
{
struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
- int err;
+ int ret;
if (btrfs_root_readonly(root))
return -EROFS;
- err = setattr_prepare(idmap, dentry, attr);
- if (err)
- return err;
+ ret = setattr_prepare(idmap, dentry, attr);
+ if (ret)
+ return ret;
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
- err = btrfs_setsize(inode, attr);
- if (err)
- return err;
+ ret = btrfs_setsize(inode, attr);
+ if (ret)
+ return ret;
}
if (attr->ia_valid) {
setattr_copy(idmap, inode, attr);
inode_inc_iversion(inode);
- err = btrfs_dirty_inode(BTRFS_I(inode));
+ ret = btrfs_dirty_inode(BTRFS_I(inode));
- if (!err && attr->ia_valid & ATTR_MODE)
- err = posix_acl_chmod(idmap, dentry, inode->i_mode);
+ if (!ret && attr->ia_valid & ATTR_MODE)
+ ret = posix_acl_chmod(idmap, dentry, inode->i_mode);
}
- return err;
+ return ret;
}
/*
@@ -5222,7 +5412,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct rb_node *node;
- ASSERT(inode->i_state & I_FREEING);
+ ASSERT(inode_state_read_once(inode) & I_FREEING);
truncate_inode_pages_final(&inode->i_data);
btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
@@ -5258,7 +5448,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
state_flags = state->state;
spin_unlock(&io_tree->lock);
- lock_extent(io_tree, start, end, &cached_state);
+ btrfs_lock_extent(io_tree, start, end, &cached_state);
/*
* If still has DELALLOC flag, the extent didn't reach disk,
@@ -5272,9 +5462,9 @@ static void evict_inode_truncate_pages(struct inode *inode)
btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
end - start + 1, NULL);
- clear_extent_bit(io_tree, start, end,
- EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
- &cached_state);
+ btrfs_clear_extent_bit(io_tree, start, end,
+ EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
+ &cached_state);
cond_resched();
spin_lock(&io_tree->lock);
@@ -5335,7 +5525,7 @@ void btrfs_evict_inode(struct inode *inode)
struct btrfs_fs_info *fs_info;
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_block_rsv *rsv = NULL;
+ struct btrfs_block_rsv rsv;
int ret;
trace_btrfs_inode_evict(inode);
@@ -5383,11 +5573,9 @@ void btrfs_evict_inode(struct inode *inode)
*/
btrfs_kill_delayed_inode_items(BTRFS_I(inode));
- rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
- if (!rsv)
- goto out;
- rsv->size = btrfs_calc_metadata_size(fs_info, 1);
- rsv->failfast = true;
+ btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
+ rsv.size = btrfs_calc_metadata_size(fs_info, 1);
+ rsv.failfast = true;
btrfs_i_size_write(BTRFS_I(inode), 0);
@@ -5399,11 +5587,11 @@ void btrfs_evict_inode(struct inode *inode)
.min_type = 0,
};
- trans = evict_refill_and_join(root, rsv);
+ trans = evict_refill_and_join(root, &rsv);
if (IS_ERR(trans))
- goto out;
+ goto out_release;
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
ret = btrfs_truncate_inode_items(trans, root, &control);
trans->block_rsv = &fs_info->trans_block_rsv;
@@ -5415,7 +5603,7 @@ void btrfs_evict_inode(struct inode *inode)
*/
btrfs_btree_balance_dirty_nodelay(fs_info);
if (ret && ret != -ENOSPC && ret != -EAGAIN)
- goto out;
+ goto out_release;
else if (!ret)
break;
}
@@ -5429,16 +5617,17 @@ void btrfs_evict_inode(struct inode *inode)
* If it turns out that we are dropping too many of these, we might want
* to add a mechanism for retrying these after a commit.
*/
- trans = evict_refill_and_join(root, rsv);
+ trans = evict_refill_and_join(root, &rsv);
if (!IS_ERR(trans)) {
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
btrfs_orphan_del(trans, BTRFS_I(inode));
trans->block_rsv = &fs_info->trans_block_rsv;
btrfs_end_transaction(trans);
}
+out_release:
+ btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
out:
- btrfs_free_block_rsv(fs_info, rsv);
/*
* If we didn't successfully delete, the orphan item will still be in
* the tree and we'll retry on the next mount. Again, we might also want
@@ -5460,7 +5649,7 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
struct btrfs_key *location, u8 *type)
{
struct btrfs_dir_item *di;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = dir->root;
int ret = 0;
struct fscrypt_name fname;
@@ -5471,7 +5660,7 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
if (ret < 0)
- goto out;
+ return ret;
/*
* fscrypt_setup_filename() should never return a positive value, but
* gcc on sparc/parisc thinks it can, so assert that doesn't happen.
@@ -5488,19 +5677,18 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
- if (location->type != BTRFS_INODE_ITEM_KEY &&
- location->type != BTRFS_ROOT_ITEM_KEY) {
+ if (unlikely(location->type != BTRFS_INODE_ITEM_KEY &&
+ location->type != BTRFS_ROOT_ITEM_KEY)) {
ret = -EUCLEAN;
btrfs_warn(root->fs_info,
-"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
+"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location " BTRFS_KEY_FMT ")",
__func__, fname.disk_name.name, btrfs_ino(dir),
- location->objectid, location->type, location->offset);
+ BTRFS_KEY_FMT_VALUE(location));
}
if (!ret)
*type = btrfs_dir_ftype(path->nodes[0], di);
out:
fscrypt_free_filename(&fname);
- btrfs_free_path(path);
return ret;
}
@@ -5515,7 +5703,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
struct btrfs_key *location,
struct btrfs_root **sub_root)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *new_root;
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
@@ -5571,7 +5759,6 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
location->offset = 0;
err = 0;
out:
- btrfs_free_path(path);
fscrypt_free_filename(&fname);
return err;
}
@@ -5585,7 +5772,17 @@ static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
bool empty = false;
xa_lock(&root->inodes);
- entry = __xa_erase(&root->inodes, btrfs_ino(inode));
+ /*
+ * This btrfs_inode is being freed and has already been unhashed at this
+ * point. It's possible that another btrfs_inode has already been
+ * allocated for the same inode and inserted itself into the root, so
+ * don't delete it in that case.
+ *
+ * Note that this shouldn't need to allocate memory, so the gfp flags
+ * don't really matter.
+ */
+ entry = __xa_cmpxchg(&root->inodes, btrfs_ino(inode), inode, NULL,
+ GFP_ATOMIC);
if (entry == inode)
empty = xa_empty(&root->inodes);
xa_unlock(&root->inodes);
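
Replacing __xa_erase() with __xa_cmpxchg() makes the removal conditional on the
slot still pointing at this exact btrfs_inode. A sketch of the semantics as I
understand them (conceptual only; the real helper does this atomically under
the xarray lock and can also return xa_is_err() entries):

	/* Under xa_lock: erase the slot only if it still holds 'inode'. */
	entry = xa_load(&root->inodes, ino);
	if (entry == inode)
		__xa_store(&root->inodes, ino, NULL, GFP_ATOMIC);	/* NULL erases */
	/* __xa_cmpxchg() returns 'entry' so the caller can tell which case hit. */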
@@ -5653,7 +5850,7 @@ struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->vfs_inode.i_state & I_NEW))
+ if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW))
return inode;
ret = btrfs_read_locked_inode(inode, path);
@@ -5677,18 +5874,22 @@ struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root)
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->vfs_inode.i_state & I_NEW))
+ if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW))
return inode;
path = btrfs_alloc_path();
- if (!path)
+ if (!path) {
+ iget_failed(&inode->vfs_inode);
return ERR_PTR(-ENOMEM);
+ }
ret = btrfs_read_locked_inode(inode, path);
btrfs_free_path(path);
if (ret)
return ERR_PTR(ret);
+ if (S_ISDIR(inode->vfs_inode.i_mode))
+ inode->vfs_inode.i_opflags |= IOP_FASTPERM_MAY_EXEC;
unlock_new_inode(&inode->vfs_inode);
return inode;
}
@@ -5770,7 +5971,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
return ERR_CAST(inode);
/* Do extra check against inode mode with di_type */
- if (btrfs_inode_type(inode) != di_type) {
+ if (unlikely(btrfs_inode_type(inode) != di_type)) {
btrfs_crit(fs_info,
"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
inode->vfs_inode.i_mode, btrfs_inode_type(inode),
@@ -5848,7 +6049,7 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
{
struct btrfs_root *root = inode->root;
struct btrfs_key key, found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
int ret;
@@ -5862,15 +6063,14 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
/* FIXME: we should be able to handle this */
if (ret == 0)
- goto out;
- ret = 0;
+ return ret;
if (path->slots[0] == 0) {
inode->index_cnt = BTRFS_DIR_START_INDEX;
- goto out;
+ return 0;
}
path->slots[0]--;
@@ -5881,13 +6081,12 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
if (found_key.objectid != btrfs_ino(inode) ||
found_key.type != BTRFS_DIR_INDEX_KEY) {
inode->index_cnt = BTRFS_DIR_START_INDEX;
- goto out;
+ return 0;
}
inode->index_cnt = found_key.offset + 1;
-out:
- btrfs_free_path(path);
- return ret;
+
+ return 0;
}
static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
@@ -5990,7 +6189,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
struct btrfs_dir_item *di;
struct btrfs_key key;
struct btrfs_key found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
void *addr;
LIST_HEAD(ins_list);
LIST_HEAD(del_list);
@@ -6073,8 +6272,7 @@ again:
if (ret)
goto nopos;
- ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
- if (ret)
+ if (btrfs_readdir_delayed_dir_index(ctx, &ins_list))
goto nopos;
/*
@@ -6103,7 +6301,6 @@ nopos:
err:
if (put)
btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list);
- btrfs_free_path(path);
return ret;
}
@@ -6145,8 +6342,8 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode)
}
/*
- * This is a copy of file_update_time. We need this so we can return error on
- * ENOSPC for updating the inode in the case of file write and mmap writes.
+ * We need our own ->update_time so that we can return an error on ENOSPC
+ * when updating the inode for file writes and mmap writes.
*/
static int btrfs_update_time(struct inode *inode, int flags)
{
@@ -6361,6 +6558,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (!args->subvol)
btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));
+ btrfs_set_inode_mapping_order(BTRFS_I(inode));
if (S_ISREG(inode->i_mode)) {
if (btrfs_test_opt(fs_info, NODATASUM))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
@@ -6414,7 +6612,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
batch.nr = args->orphan ? 1 : 2;
ret = btrfs_insert_empty_items(trans, root, path, &batch);
- if (ret != 0) {
+ if (unlikely(ret != 0)) {
btrfs_abort_transaction(trans, ret);
goto discard;
}
@@ -6491,7 +6689,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
*/
if (!args->subvol) {
ret = btrfs_init_inode_security(trans, args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto discard;
}
@@ -6511,13 +6709,17 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (args->orphan) {
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
} else {
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
0, BTRFS_I(inode)->dir_index);
- }
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto discard;
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
}
return 0;
@@ -6545,7 +6747,7 @@ out:
*/
int btrfs_add_link(struct btrfs_trans_handle *trans,
struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
- const struct fscrypt_str *name, int add_backref, u64 index)
+ const struct fscrypt_str *name, bool add_backref, u64 index)
{
int ret = 0;
struct btrfs_key key;
@@ -6578,7 +6780,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
btrfs_inode_type(inode), index);
if (ret == -EEXIST || ret == -EOVERFLOW)
goto fail_dir_item;
- else if (ret) {
+ else if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -6586,15 +6788,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
name->len * 2);
inode_inc_iversion(&parent_inode->vfs_inode);
- /*
- * If we are replaying a log tree, we do not want to update the mtime
- * and ctime of the parent directory with the current time, since the
- * log replay procedure is responsible for setting them to their correct
- * values (the ones it had when the fsync was done).
- */
- if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
- inode_set_mtime_to_ts(&parent_inode->vfs_inode,
- inode_set_ctime_current(&parent_inode->vfs_inode));
+ update_time_after_link_or_unlink(parent_inode);
ret = btrfs_update_inode(trans, parent_inode);
if (ret)
@@ -6604,20 +6798,18 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
fail_dir_item:
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
u64 local_index;
- int err;
- err = btrfs_del_root_ref(trans, key.objectid,
- btrfs_root_id(root), parent_ino,
- &local_index, name);
- if (err)
- btrfs_abort_transaction(trans, err);
+ int ret2;
+
+ ret2 = btrfs_del_root_ref(trans, key.objectid, btrfs_root_id(root),
+ parent_ino, &local_index, name);
+ if (ret2)
+ btrfs_abort_transaction(trans, ret2);
} else if (add_backref) {
- u64 local_index;
- int err;
+ int ret2;
- err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino,
- &local_index);
- if (err)
- btrfs_abort_transaction(trans, err);
+ ret2 = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, NULL);
+ if (ret2)
+ btrfs_abort_transaction(trans, ret2);
}
/* Return the original error code */
@@ -6636,30 +6828,33 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
};
unsigned int trans_num_items;
struct btrfs_trans_handle *trans;
- int err;
+ int ret;
- err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
- if (err)
+ ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
+ if (ret)
goto out_inode;
trans = btrfs_start_transaction(root, trans_num_items);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_new_inode_args;
}
- err = btrfs_create_new_inode(trans, &new_inode_args);
- if (!err)
+ ret = btrfs_create_new_inode(trans, &new_inode_args);
+ if (!ret) {
+ if (S_ISDIR(inode->i_mode))
+ inode->i_opflags |= IOP_FASTPERM_MAY_EXEC;
d_instantiate_new(dentry, inode);
+ }
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
out_new_inode_args:
btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
- if (err)
+ if (ret)
iput(inode);
- return err;
+ return ret;
}
static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
@@ -6700,8 +6895,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct fscrypt_name fname;
u64 index;
- int err;
- int drop_inode = 0;
+ int ret;
/* do not allow sys_link's with other subvols of the same device */
if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root))
@@ -6710,12 +6904,12 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (inode->i_nlink >= BTRFS_LINK_MAX)
return -EMLINK;
- err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
- if (err)
+ ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
+ if (ret)
goto fail;
- err = btrfs_set_inode_index(BTRFS_I(dir), &index);
- if (err)
+ ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
+ if (ret)
goto fail;
/*
@@ -6726,53 +6920,52 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
*/
trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
trans = NULL;
goto fail;
}
/* There are several dir indexes for this inode, clear the cache. */
BTRFS_I(inode)->dir_index = 0ULL;
- inc_nlink(inode);
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
- ihold(inode);
- set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
- err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
&fname.disk_name, 1, index);
+ if (ret)
+ goto fail;
- if (err) {
- drop_inode = 1;
- } else {
- struct dentry *parent = dentry->d_parent;
+ /* Link added now we update the inode item with the new link count. */
+ inc_nlink(inode);
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
- err = btrfs_update_inode(trans, BTRFS_I(inode));
- if (err)
+ if (inode->i_nlink == 1) {
+ /*
+ * If the new hard link count is 1, it's a file created with the
+ * open(2) O_TMPFILE flag.
+ */
+ ret = btrfs_orphan_del(trans, BTRFS_I(inode));
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto fail;
- if (inode->i_nlink == 1) {
- /*
- * If new hard link count is 1, it's a file created
- * with open(2) O_TMPFILE flag.
- */
- err = btrfs_orphan_del(trans, BTRFS_I(inode));
- if (err)
- goto fail;
}
- d_instantiate(dentry, inode);
- btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
}
+ /* Grab reference for the new dentry passed to d_instantiate(). */
+ ihold(inode);
+ d_instantiate(dentry, inode);
+ btrfs_log_new_name(trans, old_dentry, NULL, 0, dentry->d_parent);
+
fail:
fscrypt_free_filename(&fname);
if (trans)
btrfs_end_transaction(trans);
- if (drop_inode) {
- inode_dec_link_count(inode);
- iput(inode);
- }
btrfs_btree_balance_dirty(fs_info);
- return err;
+ return ret;
}
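
The i_nlink == 1 branch above covers files created with O_TMPFILE and later
given a name, which is what turns the orphan item into a regularly linked
inode. A userspace sketch of that flow (hypothetical paths, error handling
omitted; linkat() with AT_EMPTY_PATH may require privileges, the /proc/self/fd
trick being the unprivileged alternative):

	int fd = open("/mnt", O_TMPFILE | O_WRONLY, 0600);
	/* ... write the file's contents through fd ... */
	linkat(fd, "", AT_FDCWD, "/mnt/file", AT_EMPTY_PATH);	/* takes this path */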
static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
@@ -6893,18 +7086,18 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct extent_map_tree *em_tree = &inode->extent_tree;
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
+ em = btrfs_lookup_extent_mapping(em_tree, start, len);
read_unlock(&em_tree->lock);
if (em) {
if (em->start > start || em->start + em->len <= start)
- free_extent_map(em);
+ btrfs_free_extent_map(em);
else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio)
- free_extent_map(em);
+ btrfs_free_extent_map(em);
else
goto out;
}
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
ret = -ENOMEM;
goto out;
@@ -6928,8 +7121,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
* point the commit_root has everything we need.
*/
if (btrfs_is_free_space_inode(inode)) {
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
}
ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
@@ -6964,7 +7157,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
/* Only regular file could have regular/prealloc extent */
- if (!S_ISREG(inode->vfs_inode.i_mode)) {
+ if (unlikely(!S_ISREG(inode->vfs_inode.i_mode))) {
ret = -EUCLEAN;
btrfs_crit(fs_info,
"regular/prealloc extent found for non-regular inode %llu",
@@ -7041,7 +7234,7 @@ not_found:
insert:
ret = 0;
btrfs_release_path(path);
- if (em->start > start || extent_map_end(em) <= start) {
+ if (unlikely(em->start > start || btrfs_extent_map_end(em) <= start)) {
btrfs_err(fs_info,
"bad extent! em: [%llu %llu] passed [%llu %llu]",
em->start, em->len, start, len);
@@ -7058,7 +7251,7 @@ out:
trace_btrfs_get_extent(root, inode, em);
if (ret) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return ERR_PTR(ret);
}
return em;
@@ -7102,7 +7295,7 @@ noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct can_nocow_file_extent_args nocow_args = { 0 };
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
struct extent_buffer *leaf;
struct extent_io_tree *io_tree = &inode->io_tree;
@@ -7118,13 +7311,12 @@ noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
offset, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret == 1) {
if (path->slots[0] == 0) {
- /* can't find the item, must cow */
- ret = 0;
- goto out;
+ /* Can't find the item, must COW. */
+ return 0;
}
path->slots[0]--;
}
@@ -7133,17 +7325,17 @@ noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != btrfs_ino(inode) ||
key.type != BTRFS_EXTENT_DATA_KEY) {
- /* not our file or wrong item type, must cow */
- goto out;
+ /* Not our file or wrong item type, must COW. */
+ return 0;
}
if (key.offset > offset) {
- /* Wrong offset, must cow */
- goto out;
+ /* Wrong offset, must COW. */
+ return 0;
}
if (btrfs_file_extent_end(path) <= offset)
- goto out;
+ return 0;
fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
found_type = btrfs_file_extent_type(leaf, fi);
@@ -7158,15 +7350,13 @@ noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
if (ret != 1) {
/* Treat errors as not being able to NOCOW. */
- ret = 0;
- goto out;
+ return 0;
}
- ret = 0;
if (btrfs_extent_readonly(fs_info,
nocow_args.file_extent.disk_bytenr +
nocow_args.file_extent.offset))
- goto out;
+ return 0;
if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
@@ -7174,21 +7364,18 @@ noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
range_end = round_up(offset + nocow_args.file_extent.num_bytes,
root->fs_info->sectorsize) - 1;
- ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC);
- if (ret) {
- ret = -EAGAIN;
- goto out;
- }
+ ret = btrfs_test_range_bit_exists(io_tree, offset, range_end,
+ EXTENT_DELALLOC);
+ if (ret)
+ return -EAGAIN;
}
if (file_extent)
memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent));
*len = nocow_args.file_extent.num_bytes;
- ret = 1;
-out:
- btrfs_free_path(path);
- return ret;
+
+ return 1;
}
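
With the gotos gone, the tri-state contract of can_nocow_extent() reads straight off the returns: negative means a hard error, 0 means the range must be COWed (including all the soft-failure cases above), and 1 means an in-place NOCOW write is allowed with *len trimmed to the extent. A caller-side sketch (the helper names are illustrative, the argument list abbreviated to what the hunks show):

	ret = can_nocow_extent(inode, offset, &len, &file_extent, nowait);
	if (ret < 0)
		return ret;				/* lookup or I/O error */
	if (ret == 0)
		ret = do_cow_write(inode, offset, len);	/* hypothetical helper */
	else
		ret = do_nocow_write(inode, offset, len, &file_extent);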
/* The callers of this must take lock_extent() */
@@ -7236,7 +7423,7 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
break;
}
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em)
return ERR_PTR(-ENOMEM);
@@ -7249,15 +7436,15 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
em->offset = file_extent->offset;
em->flags |= EXTENT_FLAG_PINNED;
if (type == BTRFS_ORDERED_COMPRESSED)
- extent_map_set_compression(em, file_extent->compression);
+ btrfs_extent_map_set_compression(em, file_extent->compression);
ret = btrfs_replace_extent_map_range(inode, em, true);
if (ret) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return ERR_PTR(ret);
}
- /* em got 2 refs now, callers needs to do free_extent_map once. */
+ /* The em has two refs now; callers need to do btrfs_free_extent_map() once. */

return em;
}
@@ -7271,13 +7458,13 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
static void wait_subpage_spinlock(struct folio *folio)
{
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
if (!btrfs_is_subpage(fs_info, folio))
return;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
- subpage = folio_get_private(folio);
+ bfs = folio_get_private(folio);
/*
* This may look insane as we just acquire the spinlock and release it,
@@ -7290,8 +7477,8 @@ static void wait_subpage_spinlock(struct folio *folio)
* Here we just acquire the spinlock so that all existing callers
* should exit and we're safe to release/invalidate the page.
*/
- spin_lock_irq(&subpage->lock);
- spin_unlock_irq(&subpage->lock);
+ spin_lock_irq(&bfs->lock);
+ spin_unlock_irq(&bfs->lock);
}
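
The rename from subpage to bfs is mechanical; the interesting part is the empty critical section above, which acts as a drain: once spin_lock_irq() returns, every caller that held the lock earlier has finished. The same pattern in isolation (a sketch, not a btrfs API):

/*
 * Drain pattern: acquiring and immediately releasing a lock proves
 * that all earlier critical sections have exited. Nothing protected
 * by the lock is touched.
 */
static void drain_lock_holders(spinlock_t *lock)
{
	spin_lock_irq(lock);
	spin_unlock_irq(lock);
}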
static int btrfs_launder_folio(struct folio *folio)
@@ -7324,7 +7511,7 @@ static int btrfs_migrate_folio(struct address_space *mapping,
{
int ret = filemap_migrate_folio(mapping, dst, src, mode);
- if (ret != MIGRATEPAGE_SUCCESS)
+ if (ret)
return ret;
if (folio_test_ordered(src)) {
@@ -7332,7 +7519,7 @@ static int btrfs_migrate_folio(struct address_space *mapping,
folio_set_ordered(dst);
}
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
#else
#define btrfs_migrate_folio NULL
@@ -7348,7 +7535,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
u64 page_start = folio_pos(folio);
u64 page_end = page_start + folio_size(folio) - 1;
u64 cur;
- int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
+ int inode_evicting = inode_state_read_once(&inode->vfs_inode) & I_FREEING;
/*
* We have folio locked so no new ordered extent can be created on this
@@ -7384,7 +7571,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
}
if (!inode_evicting)
- lock_extent(tree, page_start, page_end, &cached_state);
+ btrfs_lock_extent(tree, page_start, page_end, &cached_state);
cur = page_start;
while (cur < page_end) {
@@ -7440,16 +7627,16 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* btrfs_finish_ordered_io().
*/
if (!inode_evicting)
- clear_extent_bit(tree, cur, range_end,
- EXTENT_DELALLOC |
- EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, &cached_state);
+ btrfs_clear_extent_bit(tree, cur, range_end,
+ EXTENT_DELALLOC |
+ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, &cached_state);
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
ordered->truncated_len = min(ordered->truncated_len,
cur - ordered->file_offset);
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
/*
* If the ordered extent has finished, we're safe to delete all
@@ -7485,12 +7672,11 @@ next:
* Since the IO will never happen for this page.
*/
btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
- if (!inode_evicting) {
- clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
- EXTENT_DELALLOC | EXTENT_UPTODATE |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
- extra_flags, &cached_state);
- }
+ if (!inode_evicting)
+ btrfs_clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG | extra_flags,
+ &cached_state);
cur = range_end + 1;
}
/*
@@ -7512,19 +7698,22 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
.ino = btrfs_ino(inode),
.min_type = BTRFS_EXTENT_DATA_KEY,
.clear_extent_range = true,
+ .new_size = inode->vfs_inode.i_size,
};
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_block_rsv *rsv;
+ struct btrfs_block_rsv rsv;
int ret;
struct btrfs_trans_handle *trans;
- u64 mask = fs_info->sectorsize - 1;
const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
+ const u64 lock_start = round_down(inode->vfs_inode.i_size, fs_info->sectorsize);
+ const u64 i_size_up = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
+
+ /* Our inode is locked and the i_size can't be changed concurrently. */
+ btrfs_assert_inode_locked(inode);
if (!skip_writeback) {
- ret = btrfs_wait_ordered_range(inode,
- inode->vfs_inode.i_size & (~mask),
- (u64)-1);
+ ret = btrfs_wait_ordered_range(inode, lock_start, (u64)-1);
if (ret)
return ret;
}
@@ -7557,11 +7746,9 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
* 2) fs_info->trans_block_rsv - this will have 1 items worth left for
* updating the inode.
*/
- rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
- if (!rsv)
- return -ENOMEM;
- rsv->size = min_size;
- rsv->failfast = true;
+ btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
+ rsv.size = min_size;
+ rsv.failfast = true;
/*
* 1 for the truncate slack space
@@ -7574,7 +7761,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
}
/* Migrate the slack space for the truncate to our reserve */
- ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv,
min_size, false);
/*
* We have reserved 2 metadata units when we started the transaction and
@@ -7586,30 +7773,25 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
goto out;
}
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
while (1) {
struct extent_state *cached_state = NULL;
- const u64 new_size = inode->vfs_inode.i_size;
- const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
- control.new_size = new_size;
- lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
/*
* We want to drop from the next block forward in case this new
* size is not block aligned since we will be keeping the last
* block of the extent just the way it is.
*/
- btrfs_drop_extent_map_range(inode,
- ALIGN(new_size, fs_info->sectorsize),
- (u64)-1, false);
+ btrfs_drop_extent_map_range(inode, i_size_up, (u64)-1, false);
ret = btrfs_truncate_inode_items(trans, root, &control);
inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
- unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret != -ENOSPC && ret != -EAGAIN)
@@ -7629,9 +7811,9 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
break;
}
- btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
+ btrfs_block_rsv_release(fs_info, &rsv, -1, NULL);
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
- rsv, min_size, false);
+ &rsv, min_size, false);
/*
* We have reserved 2 metadata units when we started the
* transaction and min_size matches 1 unit, so this should never
@@ -7640,7 +7822,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
if (WARN_ON(ret))
break;
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
}
/*
@@ -7653,7 +7835,8 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
- ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0);
+ ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size,
+ inode->vfs_inode.i_size, (u64)-1);
if (ret)
goto out;
trans = btrfs_start_transaction(root, 1);
@@ -7678,7 +7861,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
btrfs_btree_balance_dirty(fs_info);
}
out:
- btrfs_free_block_rsv(fs_info, rsv);
+ btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
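
Moving the reserve onto the stack removes the -ENOMEM path of btrfs_alloc_block_rsv(); the price is that teardown switches from btrfs_free_block_rsv() to releasing whatever is left. Condensed, the lifecycle the hunks above implement is (a sketch of the pattern, not new API):

	struct btrfs_block_rsv rsv;

	btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
	rsv.size = min_size;
	rsv.failfast = true;
	/* ... migrate space in, run with trans->block_rsv = &rsv ... */
	btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);	/* drop the rest */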
/*
* So if we truncate and then write and fsync we normally would just
* write the extents that changed, which is a problem if we need to
@@ -7734,6 +7917,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->last_sub_trans = 0;
ei->logged_trans = 0;
ei->delalloc_bytes = 0;
+ /* new_delalloc_bytes and last_dir_index_offset are in a union. */
ei->new_delalloc_bytes = 0;
ei->defrag_bytes = 0;
ei->disk_i_size = 0;
@@ -7765,10 +7949,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->i_otime_nsec = 0;
inode = &ei->vfs_inode;
- extent_map_tree_init(&ei->extent_tree);
+ btrfs_extent_map_tree_init(&ei->extent_tree);
/* This io tree sets the valid inode. */
- extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
+ btrfs_extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
ei->io_tree.inode = ei;
ei->file_extent_tree = NULL;
@@ -7868,7 +8052,7 @@ int btrfs_drop_inode(struct inode *inode)
if (btrfs_root_refs(&root->root_item) == 0)
return 1;
else
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
}
static void init_once(void *foo)
@@ -7876,6 +8060,9 @@ static void init_once(void *foo)
struct btrfs_inode *ei = foo;
inode_init_once(&ei->vfs_inode);
+#ifdef CONFIG_FS_VERITY
+ ei->i_verity_info = NULL;
+#endif
}
void __cold btrfs_destroy_cachep(void)
@@ -7933,7 +8120,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
generic_fillattr(idmap, request_mask, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
- stat->subvol = BTRFS_I(inode)->root->root_key.objectid;
+ stat->subvol = btrfs_root_id(BTRFS_I(inode)->root);
stat->result_mask |= STATX_SUBVOL;
spin_lock(&BTRFS_I(inode)->lock);
@@ -7966,6 +8153,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
int ret;
int ret2;
bool need_abort = false;
+ bool logs_pinned = false;
struct fscrypt_name old_fname, new_fname;
struct fscrypt_str *old_name, *new_name;
@@ -8076,7 +8264,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
btrfs_ino(BTRFS_I(old_dir)),
new_idx);
if (ret) {
- if (need_abort)
+ if (unlikely(need_abort))
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8089,6 +8277,31 @@ static int btrfs_rename_exchange(struct inode *old_dir,
inode_inc_iversion(new_inode);
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID &&
+ new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * If we are renaming in the same directory (and it's not for
+ * root entries) pin the log early to prevent any concurrent
+ * task from logging the directory after we removed the old
+ * entries and before we add the new entries, otherwise that
+ * task can sync a log without any entry for the inodes we are
+ * renaming and therefore replaying that log, if a power failure
+ * happens after syncing the log, would result in deleting the
+ * inodes.
+ *
+ * If the rename affects two different directories, we want to
+ * make sure that there's no log commit that contains
+ * updates for only one of the directories but not for the
+ * other.
+ *
+ * If we are renaming an entry for a root, we don't care about
+ * log updates since we called btrfs_set_log_full_commit().
+ */
+ btrfs_pin_log_trans(root);
+ btrfs_pin_log_trans(dest);
+ logs_pinned = true;
+ }
+
if (old_dentry->d_parent != new_dentry->d_parent) {
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
BTRFS_I(old_inode), true);
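
Pinning both logs up front and unpinning at out_fail means a concurrent log sync can never observe the window where the old names are gone and the new ones are not yet inserted. Stripped to its control flow (a sketch; swap_names() and the old_idx/new_idx stand-ins for the rename_ctx indexes are illustrative):

	bool logs_pinned = false;

	if (old_ino != BTRFS_FIRST_FREE_OBJECTID &&
	    new_ino != BTRFS_FIRST_FREE_OBJECTID) {
		btrfs_pin_log_trans(root);	/* block log syncs of the src root */
		btrfs_pin_log_trans(dest);	/* and of the destination root */
		logs_pinned = true;
	}

	ret = swap_names(trans);		/* hypothetical: unlinks + add_links */
	if (ret)
		goto out_fail;

	if (logs_pinned) {
		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
				   old_idx, new_dentry->d_parent);
		btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
				   new_idx, old_dentry->d_parent);
	}
out_fail:
	if (logs_pinned) {
		btrfs_end_log_trans(root);
		btrfs_end_log_trans(dest);
	}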
@@ -8099,7 +8312,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8107,12 +8320,12 @@ static int btrfs_rename_exchange(struct inode *old_dir,
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
old_name, &old_rename_ctx);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8121,7 +8334,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* dest is a subvolume */
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8129,12 +8342,12 @@ static int btrfs_rename_exchange(struct inode *old_dir,
ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
new_name, &new_rename_ctx);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8142,14 +8355,14 @@ static int btrfs_rename_exchange(struct inode *old_dir,
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
new_name, 0, old_idx);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
old_name, 0, new_idx);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8160,30 +8373,23 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode)->dir_index = new_idx;
/*
- * Now pin the logs of the roots. We do it to ensure that no other task
- * can sync the logs while we are in progress with the rename, because
- * that could result in an inconsistency in case any of the inodes that
- * are part of this rename operation were logged before.
+ * Do the log updates for all inodes.
+ *
+ * If either entry is for a root we don't need to update the logs since
+ * we've called btrfs_set_log_full_commit() before.
*/
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
- btrfs_pin_log_trans(root);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
- btrfs_pin_log_trans(dest);
-
- /* Do the log updates for all inodes. */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ if (logs_pinned) {
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
old_rename_ctx.index, new_dentry->d_parent);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
new_rename_ctx.index, old_dentry->d_parent);
+ }
- /* Now unpin the logs. */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+out_fail:
+ if (logs_pinned) {
btrfs_end_log_trans(root);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
btrfs_end_log_trans(dest);
-out_fail:
+ }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -8233,6 +8439,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
struct fscrypt_name old_fname, new_fname;
+ bool logs_pinned = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -8367,13 +8574,36 @@ static int btrfs_rename(struct mnt_idmap *idmap,
inode_inc_iversion(old_inode);
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * If we are renaming in the same directory (and it's not a
+ * root entry) pin the log to prevent any concurrent task from
+ * logging the directory after we removed the old entry and
+ * before we add the new entry, otherwise that task can sync
+ * a log without any entry for the inode we are renaming and
+ * therefore replaying that log, if a power failure happens
+ * after syncing the log, would result in deleting the inode.
+ *
+ * If the rename affects two different directories, we want to
+ * make sure that there's no log commit that contains
+ * updates for only one of the directories but not for the
+ * other.
+ *
+ * If we are renaming an entry for a root, we don't care about
+ * log updates since we called btrfs_set_log_full_commit().
+ */
+ btrfs_pin_log_trans(root);
+ btrfs_pin_log_trans(dest);
+ logs_pinned = true;
+ }
+
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
BTRFS_I(old_inode), true);
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8381,12 +8611,12 @@ static int btrfs_rename(struct mnt_idmap *idmap,
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
&old_fname.disk_name, &rename_ctx);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8397,7 +8627,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8406,7 +8636,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(d_inode(new_dentry)),
&new_fname.disk_name);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8414,7 +8644,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (new_inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans,
BTRFS_I(d_inode(new_dentry)));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8423,7 +8653,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
&new_fname.disk_name, 0, index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8431,13 +8661,13 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (old_inode->i_nlink == 1)
BTRFS_I(old_inode)->dir_index = index;
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ if (logs_pinned)
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
rename_ctx.index, new_dentry->d_parent);
if (flags & RENAME_WHITEOUT) {
ret = btrfs_create_new_inode(trans, &whiteout_args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
} else {
@@ -8447,6 +8677,10 @@ static int btrfs_rename(struct mnt_idmap *idmap,
}
}
out_fail:
+ if (logs_pinned) {
+ btrfs_end_log_trans(root);
+ btrfs_end_log_trans(dest);
+ }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -8528,15 +8762,13 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-static int start_delalloc_inodes(struct btrfs_root *root,
- struct writeback_control *wbc, bool snapshot,
- bool in_reclaim_context)
+static int start_delalloc_inodes(struct btrfs_root *root, long *nr_to_write,
+ bool snapshot, bool in_reclaim_context)
{
struct btrfs_delalloc_work *work, *next;
LIST_HEAD(works);
LIST_HEAD(splice);
int ret = 0;
- bool full_flush = wbc->nr_to_write == LONG_MAX;
mutex_lock(&root->delalloc_mutex);
spin_lock(&root->delalloc_lock);
@@ -8545,7 +8777,7 @@ static int start_delalloc_inodes(struct btrfs_root *root,
struct btrfs_inode *inode;
struct inode *tmp_inode;
- inode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes);
+ inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes);
list_move_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
@@ -8562,10 +8794,10 @@ static int start_delalloc_inodes(struct btrfs_root *root,
if (snapshot)
set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags);
- if (full_flush) {
- work = btrfs_alloc_delalloc_work(&inode->vfs_inode);
+ if (nr_to_write == NULL) {
+ work = btrfs_alloc_delalloc_work(tmp_inode);
if (!work) {
- iput(&inode->vfs_inode);
+ iput(tmp_inode);
ret = -ENOMEM;
goto out;
}
@@ -8573,9 +8805,11 @@ static int start_delalloc_inodes(struct btrfs_root *root,
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
} else {
- ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc);
+ ret = filemap_flush_nr(tmp_inode->i_mapping,
+ nr_to_write);
btrfs_add_delayed_iput(inode);
- if (ret || wbc->nr_to_write <= 0)
+
+ if (ret || *nr_to_write <= 0)
goto out;
}
cond_resched();
@@ -8601,29 +8835,17 @@ out:
int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
{
- struct writeback_control wbc = {
- .nr_to_write = LONG_MAX,
- .sync_mode = WB_SYNC_NONE,
- .range_start = 0,
- .range_end = LLONG_MAX,
- };
struct btrfs_fs_info *fs_info = root->fs_info;
if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
-
- return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
+ return start_delalloc_inodes(root, NULL, true, in_reclaim_context);
}
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
bool in_reclaim_context)
{
- struct writeback_control wbc = {
- .nr_to_write = nr,
- .sync_mode = WB_SYNC_NONE,
- .range_start = 0,
- .range_end = LLONG_MAX,
- };
+ long *nr_to_write = nr == LONG_MAX ? NULL : &nr;
struct btrfs_root *root;
LIST_HEAD(splice);
int ret;
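
The writeback_control is replaced by a single long: callers that want a full flush pass nr == LONG_MAX, which start_delalloc_inodes() receives as a NULL pointer and turns into one async work item per inode, while a finite nr is passed by address so filemap_flush_nr() can decrement the shared budget across roots. The dispatch, condensed (a sketch of the convention the hunks establish):

	long *nr_to_write = (nr == LONG_MAX) ? NULL : &nr;

	if (!nr_to_write) {
		/* Full flush: queue async work, no budget to track. */
		work = btrfs_alloc_delalloc_work(tmp_inode);
		btrfs_queue_work(root->fs_info->flush_workers, &work->work);
	} else {
		/* Budgeted flush: *nr_to_write drops as pages go out. */
		ret = filemap_flush_nr(tmp_inode->i_mapping, nr_to_write);
		if (ret || *nr_to_write <= 0)
			goto out;	/* error or budget exhausted */
	}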
@@ -8635,13 +8857,6 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
spin_lock(&fs_info->delalloc_root_lock);
list_splice_init(&fs_info->delalloc_roots, &splice);
while (!list_empty(&splice)) {
- /*
- * Reset nr_to_write here so we know that we're doing a full
- * flush.
- */
- if (nr == LONG_MAX)
- wbc.nr_to_write = LONG_MAX;
-
root = list_first_entry(&splice, struct btrfs_root,
delalloc_root);
root = btrfs_grab_root(root);
@@ -8650,9 +8865,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
- ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
+ ret = start_delalloc_inodes(root, nr_to_write, false,
+ in_reclaim_context);
btrfs_put_root(root);
- if (ret < 0 || wbc.nr_to_write <= 0)
+ if (ret < 0 || nr <= 0)
goto out;
spin_lock(&fs_info->delalloc_root_lock);
}
@@ -8683,7 +8899,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
.dentry = dentry,
};
unsigned int trans_num_items;
- int err;
+ int ret;
int name_len;
int datasize;
unsigned long ptr;
@@ -8710,26 +8926,26 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
inode_set_bytes(inode, name_len);
new_inode_args.inode = inode;
- err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
- if (err)
+ ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
+ if (ret)
goto out_inode;
/* 1 additional item for the inline extent */
trans_num_items++;
trans = btrfs_start_transaction(root, trans_num_items);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_new_inode_args;
}
- err = btrfs_create_new_inode(trans, &new_inode_args);
- if (err)
+ ret = btrfs_create_new_inode(trans, &new_inode_args);
+ if (ret)
goto out;
path = btrfs_alloc_path();
- if (!path) {
- err = -ENOMEM;
- btrfs_abort_transaction(trans, err);
+ if (unlikely(!path)) {
+ ret = -ENOMEM;
+ btrfs_abort_transaction(trans, ret);
discard_new_inode(inode);
inode = NULL;
goto out;
@@ -8738,10 +8954,9 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = 0;
datasize = btrfs_file_extent_calc_inline_size(name_len);
- err = btrfs_insert_empty_item(trans, root, path, &key,
- datasize);
- if (err) {
- btrfs_abort_transaction(trans, err);
+ ret = btrfs_insert_empty_item(trans, root, path, &key, datasize);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
btrfs_free_path(path);
discard_new_inode(inode);
inode = NULL;
@@ -8763,16 +8978,16 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
btrfs_free_path(path);
d_instantiate_new(dentry, inode);
- err = 0;
+ ret = 0;
out:
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
out_new_inode_args:
btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
- if (err)
+ if (ret)
iput(inode);
- return err;
+ return ret;
}
static struct btrfs_trans_handle *insert_prealloc_file_extent(
@@ -8883,7 +9098,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
*/
cur_bytes = min(cur_bytes, last_alloc);
ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
- min_size, 0, *alloc_hint, &ins, 1, 0);
+ min_size, 0, *alloc_hint, &ins, true, false);
if (ret)
break;
@@ -8909,11 +9124,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_free_reserved_extent(fs_info, ins.objectid,
- ins.offset, 0);
+ ins.offset, false);
break;
}
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
cur_offset + ins.offset - 1, false);
@@ -8931,7 +9146,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
em->generation = trans->transid;
ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
next:
num_bytes -= ins.offset;
cur_offset += ins.offset;
@@ -8953,7 +9168,7 @@ next:
ret = btrfs_update_inode(trans, BTRFS_I(inode));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
if (own_trans)
btrfs_end_transaction(trans);
@@ -8989,6 +9204,11 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
min_size, actual_len, alloc_hint, trans);
}
+/*
+ * NOTE: in case you are adding a MAY_EXEC check for directories:
+ * we are marking them with IOP_FASTPERM_MAY_EXEC, allowing path lookup to
+ * elide calls here.
+ */
static int btrfs_permission(struct mnt_idmap *idmap,
struct inode *inode, int mask)
{
@@ -9103,7 +9323,7 @@ static ssize_t btrfs_encoded_read_inline(
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *io_tree = &inode->io_tree;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_file_extent_item *item;
u64 ram_bytes;
@@ -9113,21 +9333,19 @@ static ssize_t btrfs_encoded_read_inline(
const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
path->nowait = nowait;
ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
extent_start, 0);
if (ret) {
- if (ret > 0) {
+ if (unlikely(ret > 0)) {
/* The extent item disappeared? */
- ret = -EIO;
+ return -EIO;
}
- goto out;
+ return ret;
}
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
@@ -9140,17 +9358,16 @@ static ssize_t btrfs_encoded_read_inline(
ret = btrfs_encoded_io_compression_from_extent(fs_info,
btrfs_file_extent_compression(leaf, item));
if (ret < 0)
- goto out;
+ return ret;
encoded->compression = ret;
if (encoded->compression) {
size_t inline_size;
inline_size = btrfs_file_extent_inline_item_len(leaf,
path->slots[0]);
- if (inline_size > count) {
- ret = -ENOBUFS;
- goto out;
- }
+ if (inline_size > count)
+ return -ENOBUFS;
+
count = inline_size;
encoded->unencoded_len = ram_bytes;
encoded->unencoded_offset = iocb->ki_pos - extent_start;
@@ -9162,13 +9379,12 @@ static ssize_t btrfs_encoded_read_inline(
}
tmp = kmalloc(count, GFP_NOFS);
- if (!tmp) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!tmp)
+ return -ENOMEM;
+
read_extent_buffer(leaf, tmp, ptr, count);
btrfs_release_path(path);
- unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
*unlocked = true;
@@ -9176,8 +9392,7 @@ static ssize_t btrfs_encoded_read_inline(
if (ret != count)
ret = -EFAULT;
kfree(tmp);
-out:
- btrfs_free_path(path);
+
return ret;
}
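
All of the early returns above lean on BTRFS_PATH_AUTO_FREE, which attaches btrfs_free_path() as a scope-exit cleanup so the out: label and its btrfs_free_path() call can be deleted. The same mechanism with the generic guard from <linux/cleanup.h> (a minimal sketch using only the kfree guard):

#include <linux/cleanup.h>
#include <linux/slab.h>

static int example(size_t len)
{
	/* Freed automatically on every return path below. */
	char *buf __free(kfree) = kzalloc(len, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;
	if (len < 16)
		return -EINVAL;	/* buf is kfree()d here too */
	buf[0] = 1;
	return 0;		/* ... and here */
}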
@@ -9219,7 +9434,6 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
u64 disk_bytenr, u64 disk_io_size,
struct page **pages, void *uring_ctx)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_encoded_read_private *priv, sync_priv;
struct completion sync_reads;
unsigned long i = 0;
@@ -9244,10 +9458,9 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
priv->status = 0;
priv->uring_ctx = uring_ctx;
- bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
+ bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, inode, 0,
btrfs_encoded_read_endio, priv);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
- bbio->inode = inode;
do {
size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
@@ -9256,10 +9469,9 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
refcount_inc(&priv->pending_refs);
btrfs_submit_bbio(bbio, 0);
- bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
+ bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, inode, 0,
btrfs_encoded_read_endio, priv);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
- bbio->inode = inode;
continue;
}
@@ -9317,7 +9529,7 @@ ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
if (ret)
goto out;
- unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
*unlocked = true;
@@ -9394,7 +9606,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
goto out_unlock_inode;
}
- if (!try_lock_extent(io_tree, start, lockend, cached_state)) {
+ if (!btrfs_try_lock_extent(io_tree, start, lockend, cached_state)) {
ret = -EAGAIN;
goto out_unlock_inode;
}
@@ -9403,7 +9615,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
lockend - start + 1);
if (ordered) {
btrfs_put_ordered_extent(ordered);
- unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
ret = -EAGAIN;
goto out_unlock_inode;
}
@@ -9416,13 +9628,13 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
if (ret)
goto out_unlock_inode;
- lock_extent(io_tree, start, lockend, cached_state);
+ btrfs_lock_extent(io_tree, start, lockend, cached_state);
ordered = btrfs_lookup_ordered_range(inode, start,
lockend - start + 1);
if (!ordered)
break;
btrfs_put_ordered_extent(ordered);
- unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
cond_resched();
}
}
@@ -9440,7 +9652,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
* For inline extents we get everything we need out of the
* extent item.
*/
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = NULL;
ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
cached_state, extent_start,
@@ -9452,7 +9664,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
* We only want to return up to EOF even if the extent extends beyond
* that.
*/
- encoded->len = min_t(u64, extent_map_end(em),
+ encoded->len = min_t(u64, btrfs_extent_map_end(em),
inode->vfs_inode.i_size) - iocb->ki_pos;
if (em->disk_bytenr == EXTENT_MAP_HOLE ||
(em->flags & EXTENT_FLAG_PREALLOC)) {
@@ -9460,7 +9672,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
count = min_t(u64, count, encoded->len);
encoded->len = count;
encoded->unencoded_len = count;
- } else if (extent_map_is_compressed(em)) {
+ } else if (btrfs_extent_map_is_compressed(em)) {
*disk_bytenr = em->disk_bytenr;
/*
* Bail if the buffer isn't large enough to return the whole
@@ -9475,12 +9687,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
encoded->unencoded_len = em->ram_bytes;
encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
ret = btrfs_encoded_io_compression_from_extent(fs_info,
- extent_map_compression(em));
+ btrfs_extent_map_compression(em));
if (ret < 0)
goto out_em;
encoded->compression = ret;
} else {
- *disk_bytenr = extent_map_block_start(em) + (start - em->start);
+ *disk_bytenr = btrfs_extent_map_block_start(em) + (start - em->start);
if (encoded->len > count)
encoded->len = count;
/*
@@ -9493,11 +9705,11 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
encoded->unencoded_len = count;
*disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize);
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = NULL;
if (*disk_bytenr == EXTENT_MAP_HOLE) {
- unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
unlocked = true;
ret = iov_iter_zero(count, iter);
@@ -9509,11 +9721,11 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
}
out_em:
- free_extent_map(em);
+ btrfs_free_extent_map(em);
out_unlock_extent:
/* Leave inode and extent locked if we need to do a read. */
if (!unlocked && ret != -EIOCBQUEUED)
- unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
out_unlock_inode:
if (!unlocked && ret != -EIOCBQUEUED)
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
@@ -9650,8 +9862,6 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
}
for (;;) {
- struct btrfs_ordered_extent *ordered;
-
ret = btrfs_wait_ordered_range(inode, start, num_bytes);
if (ret)
goto out_folios;
@@ -9660,14 +9870,14 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
end >> PAGE_SHIFT);
if (ret)
goto out_folios;
- lock_extent(io_tree, start, end, &cached_state);
+ btrfs_lock_extent(io_tree, start, end, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
if (!ordered &&
!filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
break;
if (ordered)
btrfs_put_ordered_extent(ordered);
- unlock_extent(io_tree, start, end, &cached_state);
+ btrfs_unlock_extent(io_tree, start, end, &cached_state);
cond_resched();
}
@@ -9701,7 +9911,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
}
ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
- disk_num_bytes, 0, 0, &ins, 1, 1);
+ disk_num_bytes, 0, 0, &ins, true, true);
if (ret)
goto out_delalloc_release;
extent_reserved = true;
@@ -9717,11 +9927,11 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
ret = PTR_ERR(em);
goto out_free_reserved;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
- (1 << BTRFS_ORDERED_ENCODED) |
- (1 << BTRFS_ORDERED_COMPRESSED));
+ (1U << BTRFS_ORDERED_ENCODED) |
+ (1U << BTRFS_ORDERED_COMPRESSED));
if (IS_ERR(ordered)) {
btrfs_drop_extent_map_range(inode, start, end, false);
ret = PTR_ERR(ordered);
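
The switch from 1 << to 1U << is type hygiene for the flags mask: a plain int shifted into bit 31 is undefined, and the result would sign-extend when widened to the unsigned long flags word, while the unsigned literal stays well defined for any flag position. In isolation:

	unsigned long flags;

	flags = 1 << 31;	/* UB for 32-bit int; sign-extends if it "works" */
	flags = 1U << 31;	/* well defined: 0x80000000 */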
@@ -9732,7 +9942,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (start + encoded->len > inode->vfs_inode.i_size)
i_size_write(&inode->vfs_inode, start + encoded->len);
- unlock_extent(io_tree, start, end, &cached_state);
+ btrfs_unlock_extent(io_tree, start, end, &cached_state);
btrfs_delalloc_release_extents(inode, num_bytes);
@@ -9742,7 +9952,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
out_free_reserved:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
out_delalloc_release:
btrfs_delalloc_release_extents(inode, num_bytes);
btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
@@ -9755,9 +9965,9 @@ out_free_data_space:
* bytes_may_use.
*/
if (!extent_reserved)
- btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
+ btrfs_free_reserved_data_space_noquota(inode, disk_num_bytes);
out_unlock:
- unlock_extent(io_tree, start, end, &cached_state);
+ btrfs_unlock_extent(io_tree, start, end, &cached_state);
out_folios:
for (i = 0; i < nr_folios; i++) {
if (folios[i])
@@ -10022,7 +10232,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
- lock_extent(io_tree, 0, isize - 1, &cached_state);
+ btrfs_lock_extent(io_tree, 0, isize - 1, &cached_state);
while (prev_extent_end < isize) {
struct btrfs_key key;
struct extent_buffer *leaf;
@@ -10200,7 +10410,7 @@ out:
if (!IS_ERR_OR_NULL(map))
btrfs_free_chunk_map(map);
- unlock_extent(io_tree, 0, isize - 1, &cached_state);
+ btrfs_unlock_extent(io_tree, 0, isize - 1, &cached_state);
if (ret)
btrfs_swap_deactivate(file);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a13d81bb56a0..acb484546b1d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -245,7 +245,7 @@ static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_
* Set flags/xflags from the internal inode flags. The remaining items of
* fsxattr are zeroed.
*/
-int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int btrfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
const struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
@@ -254,7 +254,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
int btrfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
struct btrfs_root *root = inode->root;
@@ -376,13 +376,13 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
if (comp) {
ret = btrfs_set_prop(trans, inode, "btrfs.compression",
comp, strlen(comp), 0);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
} else {
ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, 0, 0);
- if (ret && ret != -ENODATA) {
+ if (unlikely(ret && ret != -ENODATA)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -503,7 +503,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct btrfs_trans_handle *trans;
struct btrfs_key key;
- struct btrfs_root_item *root_item;
+ struct btrfs_root_item AUTO_KFREE(root_item);
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
struct btrfs_root *root = BTRFS_I(dir)->root;
@@ -527,20 +527,18 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid);
if (ret)
- goto out_root_item;
+ return ret;
/*
* Don't create subvolume whose level is not zero. Or qgroup will be
* screwed up since it assumes subvolume qgroup's level to be 0.
*/
- if (btrfs_qgroup_level(objectid)) {
- ret = -ENOSPC;
- goto out_root_item;
- }
+ if (btrfs_qgroup_level(objectid))
+ return -ENOSPC;
ret = get_anon_bdev(&anon_dev);
if (ret < 0)
- goto out_root_item;
+ return ret;
new_inode_args.inode = btrfs_new_subvol_inode(idmap, dir);
if (!new_inode_args.inode) {
@@ -633,7 +631,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
btrfs_clear_buffer_dirty(trans, leaf);
btrfs_tree_unlock(leaf);
ret2 = btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
- if (ret2 < 0)
+ if (unlikely(ret2 < 0))
btrfs_abort_transaction(trans, ret2);
free_extent_buffer(leaf);
goto out;
@@ -654,26 +652,26 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
/* ... and new_root is owned by new_inode_args.inode now. */
ret = btrfs_record_root_in_trans(trans, new_root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_uuid_tree_add(trans, root_item->uuid,
BTRFS_UUID_KEY_SUBVOL, objectid);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
+ btrfs_record_new_subvolume(trans, BTRFS_I(dir));
+
ret = btrfs_create_new_inode(trans, &new_inode_args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- btrfs_record_new_subvolume(trans, BTRFS_I(dir));
-
d_instantiate_new(dentry, new_inode_args.inode);
new_inode_args.inode = NULL;
@@ -692,8 +690,7 @@ out_inode:
out_anon_dev:
if (anon_dev)
free_anon_bdev(anon_dev);
-out_root_item:
- kfree(root_item);
+
return ret;
}
@@ -841,7 +838,7 @@ free_pending:
static int btrfs_may_delete(struct mnt_idmap *idmap,
struct inode *dir, struct dentry *victim, int isdir)
{
- int error;
+ int ret;
if (d_really_is_negative(victim))
return -ENOENT;
@@ -851,9 +848,9 @@ static int btrfs_may_delete(struct mnt_idmap *idmap,
return -EINVAL;
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
- error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
- if (error)
- return error;
+ ret = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
+ if (ret)
+ return ret;
if (IS_APPEND(dir))
return -EPERM;
if (check_sticky(idmap, dir, d_inode(victim)) ||
@@ -892,39 +889,32 @@ static inline int btrfs_may_create(struct mnt_idmap *idmap,
* sys_mkdirat and vfs_mkdir, but we only do a single component lookup
* inside this filesystem so it's quite a bit simpler.
*/
-static noinline int btrfs_mksubvol(const struct path *parent,
+static noinline int btrfs_mksubvol(struct dentry *parent,
struct mnt_idmap *idmap,
- const char *name, int namelen,
- struct btrfs_root *snap_src,
+ struct qstr *qname, struct btrfs_root *snap_src,
bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
- struct inode *dir = d_inode(parent->dentry);
+ struct inode *dir = d_inode(parent);
struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct dentry *dentry;
- struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen);
- int error;
-
- error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
- if (error == -EINTR)
- return error;
+ struct fscrypt_str name_str = FSTR_INIT((char *)qname->name, qname->len);
+ int ret;
- dentry = lookup_one(idmap, name, parent->dentry, namelen);
- error = PTR_ERR(dentry);
+ dentry = start_creating_killable(idmap, parent, qname);
if (IS_ERR(dentry))
- goto out_unlock;
+ return PTR_ERR(dentry);
- error = btrfs_may_create(idmap, dir, dentry);
- if (error)
+ ret = btrfs_may_create(idmap, dir, dentry);
+ if (ret)
goto out_dput;
/*
* even if this name doesn't exist, we may get hash collisions.
* check for them now when we can safely fail
*/
- error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
- dir->i_ino, &name_str);
- if (error)
+ ret = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, dir->i_ino, &name_str);
+ if (ret)
goto out_dput;
down_read(&fs_info->subvol_sem);
@@ -933,24 +923,22 @@ static noinline int btrfs_mksubvol(const struct path *parent,
goto out_up_read;
if (snap_src)
- error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
+ ret = create_snapshot(snap_src, dir, dentry, readonly, inherit);
else
- error = create_subvol(idmap, dir, dentry, inherit);
+ ret = create_subvol(idmap, dir, dentry, inherit);
- if (!error)
+ if (!ret)
fsnotify_mkdir(dir, dentry);
out_up_read:
up_read(&fs_info->subvol_sem);
out_dput:
- dput(dentry);
-out_unlock:
- btrfs_inode_unlock(BTRFS_I(dir), 0);
- return error;
+ end_creating(dentry);
+ return ret;
}
-static noinline int btrfs_mksnapshot(const struct path *parent,
+static noinline int btrfs_mksnapshot(struct dentry *parent,
struct mnt_idmap *idmap,
- const char *name, int namelen,
+ struct qstr *qname,
struct btrfs_root *root,
bool readonly,
struct btrfs_qgroup_inherit *inherit)
@@ -959,7 +947,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
/*
* Force new buffered writes to reserve space even when NOCOW is
- * possible. This is to avoid later writeback (running dealloc) to
+ * possible. This is to avoid later writeback (running delalloc) to
* fall back to COW mode and unexpectedly fail with ENOSPC.
*/
btrfs_drew_read_lock(&root->snapshot_lock);
@@ -977,8 +965,8 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
btrfs_wait_ordered_extents(root, U64_MAX, NULL);
- ret = btrfs_mksubvol(parent, idmap, name, namelen,
- root, readonly, inherit);
+ ret = btrfs_mksubvol(parent, idmap, qname, root, readonly, inherit);
+
atomic_dec(&root->snapshot_force_cow);
out:
btrfs_drew_read_unlock(&root->snapshot_lock);
@@ -1169,7 +1157,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
} /* equal, nothing need to do */
if (ret == 0 && new_size != old_size)
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"resize device %s (devid %llu) from %llu to %llu",
btrfs_dev_name(device), device->devid,
old_size, new_size);
@@ -1184,12 +1172,12 @@ out_drop:
static noinline int __btrfs_ioctl_snap_create(struct file *file,
struct mnt_idmap *idmap,
- const char *name, unsigned long fd, int subvol,
+ const char *name, unsigned long fd, bool subvol,
bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
- int namelen;
int ret = 0;
+ struct qstr qname = QSTR_INIT(name, strlen(name));
if (!S_ISDIR(file_inode(file)->i_mode))
return -ENOTDIR;
@@ -1198,21 +1186,20 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
if (ret)
goto out;
- namelen = strlen(name);
if (strchr(name, '/')) {
ret = -EINVAL;
goto out_drop_write;
}
- if (name[0] == '.' &&
- (namelen == 1 || (name[1] == '.' && namelen == 2))) {
+ if (qname.name[0] == '.' &&
+ (qname.len == 1 || (qname.name[1] == '.' && qname.len == 2))) {
ret = -EEXIST;
goto out_drop_write;
}
if (subvol) {
- ret = btrfs_mksubvol(&file->f_path, idmap, name,
- namelen, NULL, readonly, inherit);
+ ret = btrfs_mksubvol(file_dentry(file), idmap, &qname, NULL,
+ readonly, inherit);
} else {
CLASS(fd, src)(fd);
struct inode *src_inode;
@@ -1242,8 +1229,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
*/
ret = -EINVAL;
} else {
- ret = btrfs_mksnapshot(&file->f_path, idmap,
- name, namelen,
+ ret = btrfs_mksnapshot(file_dentry(file), idmap, &qname,
BTRFS_I(src_inode)->root,
readonly, inherit);
}
@@ -1255,7 +1241,7 @@ out:
}
static noinline int btrfs_ioctl_snap_create(struct file *file,
- void __user *arg, int subvol)
+ void __user *arg, bool subvol)
{
struct btrfs_ioctl_vol_args *vol_args;
int ret;
@@ -1280,7 +1266,7 @@ out:
}
static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
- void __user *arg, int subvol)
+ void __user *arg, bool subvol)
{
struct btrfs_ioctl_vol_args_v2 *vol_args;
int ret;
@@ -1446,8 +1432,8 @@ out:
return ret;
}
-static noinline int key_in_sk(const struct btrfs_key *key,
- const struct btrfs_ioctl_search_key *sk)
+static noinline bool key_in_sk(const struct btrfs_key *key,
+ const struct btrfs_ioctl_search_key *sk)
{
struct btrfs_key test;
int ret;
@@ -1458,7 +1444,7 @@ static noinline int key_in_sk(const struct btrfs_key *key,
ret = btrfs_comp_cpu_keys(key, &test);
if (ret < 0)
- return 0;
+ return false;
test.objectid = sk->max_objectid;
test.type = sk->max_type;
@@ -1466,8 +1452,8 @@ static noinline int key_in_sk(const struct btrfs_key *key,
ret = btrfs_comp_cpu_keys(key, &test);
if (ret > 0)
- return 0;
- return 1;
+ return false;
+ return true;
}
static noinline int copy_to_sk(struct btrfs_path *path,
@@ -1610,7 +1596,7 @@ static noinline int search_ioctl(struct btrfs_root *root,
{
struct btrfs_fs_info *info = root->fs_info;
struct btrfs_key key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
int num_found = 0;
unsigned long sk_offset = 0;
@@ -1630,10 +1616,8 @@ static noinline int search_ioctl(struct btrfs_root *root,
} else {
/* Look up the root from the arguments. */
root = btrfs_get_fs_root(info, sk->tree_id, true);
- if (IS_ERR(root)) {
- btrfs_free_path(path);
+ if (IS_ERR(root))
return PTR_ERR(root);
- }
}
key.objectid = sk->min_objectid;
@@ -1667,7 +1651,6 @@ static noinline int search_ioctl(struct btrfs_root *root,
sk->nr_items = num_found;
btrfs_put_root(root);
- btrfs_free_path(path);
return ret;
}
@@ -1750,7 +1733,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
int total_len = 0;
struct btrfs_inode_ref *iref;
struct extent_buffer *l;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
name[0]='\0';
@@ -1811,7 +1794,6 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
ret = 0;
out:
btrfs_put_root(root);
- btrfs_free_path(path);
return ret;
}
@@ -1828,8 +1810,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
struct btrfs_inode_ref *iref;
struct btrfs_root_ref *rref;
struct btrfs_root *root = NULL;
- struct btrfs_path *path;
- struct btrfs_key key, key2;
+ BTRFS_PATH_AUTO_FREE(path);
+ struct btrfs_key key;
struct extent_buffer *leaf;
char *ptr;
int slot;
@@ -1849,10 +1831,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1];
root = btrfs_get_fs_root(fs_info, treeid, true);
- if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- goto out;
- }
+ if (IS_ERR(root))
+ return PTR_ERR(root);
key.objectid = dirid;
key.type = BTRFS_INODE_REF_KEY;
@@ -1884,24 +1864,6 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
read_extent_buffer(leaf, ptr,
(unsigned long)(iref + 1), len);
- /* Check the read+exec permission of this directory */
- ret = btrfs_previous_item(root, path, dirid,
- BTRFS_INODE_ITEM_KEY);
- if (ret < 0) {
- goto out_put;
- } else if (ret > 0) {
- ret = -ENOENT;
- goto out_put;
- }
-
- leaf = path->nodes[0];
- slot = path->slots[0];
- btrfs_item_key_to_cpu(leaf, &key2, slot);
- if (key2.objectid != dirid) {
- ret = -ENOENT;
- goto out_put;
- }
-
/*
* We don't need the path anymore, so release it and
* avoid deadlocks and lockdep warnings in case
@@ -1909,18 +1871,17 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
* btree and lock the same leaf.
*/
btrfs_release_path(path);
- temp_inode = btrfs_iget(key2.objectid, root);
+ temp_inode = btrfs_iget(key.offset, root);
if (IS_ERR(temp_inode)) {
ret = PTR_ERR(temp_inode);
goto out_put;
}
+ /* Check the read+exec permission of this directory. */
ret = inode_permission(idmap, &temp_inode->vfs_inode,
MAY_READ | MAY_EXEC);
iput(&temp_inode->vfs_inode);
- if (ret) {
- ret = -EACCES;
+ if (ret)
goto out_put;
- }
if (key.offset == upper_limit)
break;
@@ -1946,12 +1907,10 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
key.type = BTRFS_ROOT_REF_KEY;
key.offset = args->treeid;
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ return -ENOENT;
leaf = path->nodes[0];
slot = path->slots[0];
@@ -1961,10 +1920,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
item_len = btrfs_item_size(leaf, slot);
/* Check if dirid in ROOT_REF corresponds to passed dirid */
rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
- if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) {
- ret = -EINVAL;
- goto out;
- }
+ if (args->dirid != btrfs_root_ref_dirid(leaf, rref))
+ return -EINVAL;
/* Copy subvolume's name */
item_off += sizeof(struct btrfs_root_ref);
@@ -1974,8 +1931,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
out_put:
btrfs_put_root(root);
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -2137,7 +2093,7 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
ret = btrfs_next_leaf(fs_info->tree_root, path);
if (ret < 0) {
goto out;
- } else if (ret > 0) {
+ } else if (unlikely(ret > 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -2220,7 +2176,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
ret = btrfs_next_leaf(root, path);
if (ret < 0) {
goto out;
- } else if (ret > 0) {
+ } else if (unlikely(ret > 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -2249,7 +2205,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
ret = btrfs_next_item(root, path);
if (ret < 0) {
goto out;
- } else if (ret > 0) {
+ } else if (unlikely(ret > 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -2288,7 +2244,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL;
struct mnt_idmap *idmap = file_mnt_idmap(file);
char *subvol_name, *subvol_name_ptr = NULL;
- int subvol_namelen;
int ret = 0;
bool destroy_parent = false;
@@ -2411,10 +2366,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out;
}
- subvol_namelen = strlen(subvol_name);
-
if (strchr(subvol_name, '/') ||
- strncmp(subvol_name, "..", subvol_namelen) == 0) {
+ strcmp(subvol_name, "..") == 0) {
ret = -EINVAL;
goto free_subvol_name;
}
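
The old test strncmp(subvol_name, "..", strlen(subvol_name)) was a prefix comparison bounded by the name's own length, so it also matched "."; that was harmless here only because the later lookup rejects "." anyway, but strcmp() states the intent exactly. The difference in isolation (hypothetical standalone program):

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *name = ".";

	/* Bounded prefix compare: "." matches ".." when n == 1. */
	printf("%d\n", strncmp(name, "..", strlen(name)) == 0);	/* 1 */
	/* Exact compare, as the fixed code uses. */
	printf("%d\n", strcmp(name, "..") == 0);		/* 0 */
	return 0;
}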
@@ -2424,18 +2377,10 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto free_subvol_name;
}
- ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
- if (ret == -EINTR)
- goto free_subvol_name;
- dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen);
+ dentry = start_removing_killable(idmap, parent, &QSTR(subvol_name));
if (IS_ERR(dentry)) {
ret = PTR_ERR(dentry);
- goto out_unlock_dir;
- }
-
- if (d_really_is_negative(dentry)) {
- ret = -ENOENT;
- goto out_dput;
+ goto out_end_removing;
}
inode = d_inode(dentry);
@@ -2456,7 +2401,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
*/
ret = -EPERM;
if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED))
- goto out_dput;
+ goto out_end_removing;
/*
* Do not allow deletion if the parent dir is the same
@@ -2467,21 +2412,21 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
*/
ret = -EINVAL;
if (root == dest)
- goto out_dput;
+ goto out_end_removing;
ret = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC);
if (ret)
- goto out_dput;
+ goto out_end_removing;
}
/* check if subvolume may be deleted by a user */
ret = btrfs_may_delete(idmap, dir, dentry, 1);
if (ret)
- goto out_dput;
+ goto out_end_removing;
if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
ret = -EINVAL;
- goto out_dput;
+ goto out_end_removing;
}
btrfs_inode_lock(BTRFS_I(inode), 0);
@@ -2490,10 +2435,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (!ret)
d_delete_notify(dir, dentry);
-out_dput:
- dput(dentry);
-out_unlock_dir:
- btrfs_inode_unlock(BTRFS_I(dir), 0);
+out_end_removing:
+ end_removing(dentry);
free_subvol_name:
kfree(subvol_name_ptr);
free_parent:
@@ -2561,8 +2504,14 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
ret = -EOPNOTSUPP;
goto out;
}
- /* compression requires us to start the IO */
- if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS) &&
+ (range.flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS)) {
+ ret = -EINVAL;
+ goto out;
+ }
+	/* Compression or no-compression requires us to start the IO. */
+ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS) ||
+ (range.flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS)) {
range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
range.extent_thresh = (u32)-1;
}
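
The defrag hunk above rejects BTRFS_DEFRAG_RANGE_COMPRESS combined with the new BTRFS_DEFRAG_RANGE_NOCOMPRESS, and either flag alone forces BTRFS_DEFRAG_RANGE_START_IO with a maxed extent threshold. A minimal userspace sketch of driving it, assuming a UAPI header that already carries the new NOCOMPRESS flag:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

/* Hypothetical caller: rewrite a whole file without compression. */
static int defrag_nocompress(const char *path)
{
	struct btrfs_ioctl_defrag_range_args range;
	int fd, ret;

	fd = open(path, O_RDWR);
	if (fd < 0)
		return -1;
	memset(&range, 0, sizeof(range));
	range.len = (__u64)-1;	/* whole file */
	/* Assumed new flag; must not be combined with ..._COMPRESS. */
	range.flags = BTRFS_DEFRAG_RANGE_NOCOMPRESS;
	/* The kernel adds ..._START_IO and maxes extent_thresh itself. */
	ret = ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range);
	close(fd);
	return ret;
}
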
@@ -2703,7 +2652,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
err_drop:
mnt_drop_write_file(file);
if (bdev_file)
- fput(bdev_file);
+ bdev_fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
kfree(vol_args);
@@ -2754,7 +2703,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
mnt_drop_write_file(file);
if (bdev_file)
- fput(bdev_file);
+ bdev_fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
out_free:
@@ -2893,7 +2842,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
ret = PTR_ERR(new_root);
goto out;
}
- if (!is_fstree(btrfs_root_id(new_root))) {
+ if (!btrfs_is_fstree(btrfs_root_id(new_root))) {
ret = -ENOENT;
goto out_free;
}
@@ -2957,7 +2906,7 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_space_args space_args = { 0 };
struct btrfs_ioctl_space_info space;
struct btrfs_ioctl_space_info *dest;
- struct btrfs_ioctl_space_info *dest_orig;
+ struct btrfs_ioctl_space_info AUTO_KFREE(dest_orig);
struct btrfs_ioctl_space_info __user *user_dest;
struct btrfs_space_info *info;
static const u64 types[] = {
@@ -3078,9 +3027,8 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
(arg + sizeof(struct btrfs_ioctl_space_args));
if (copy_to_user(user_dest, dest_orig, alloc_size))
- ret = -EFAULT;
+ return -EFAULT;
- kfree(dest_orig);
out:
if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
ret = -EFAULT;
@@ -3142,7 +3090,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
return -EPERM;
if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
- btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet");
+ btrfs_err(fs_info, "scrub: extent tree v2 not yet supported");
return -EINVAL;
}
@@ -3299,7 +3247,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
u64 rel_ptr;
int size;
struct btrfs_ioctl_ino_path_args *ipa = NULL;
- struct inode_fs_paths *ipath = NULL;
+ struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL;
struct btrfs_path *path;
if (!capable(CAP_DAC_READ_SEARCH))
@@ -3347,7 +3295,6 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
out:
btrfs_free_path(path);
- free_ipath(ipath);
kfree(ipa);
return ret;
@@ -3360,7 +3307,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
int size;
struct btrfs_ioctl_logical_ino_args *loi;
struct btrfs_data_container *inodes = NULL;
- struct btrfs_path *path = NULL;
bool ignore_offset;
if (!capable(CAP_SYS_ADMIN))
@@ -3394,14 +3340,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
goto out_loi;
}
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
- ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
- inodes, ignore_offset);
- btrfs_free_path(path);
+ ret = iterate_inodes_from_logical(loi->logical, fs_info, inodes, ignore_offset);
if (ret == -EINVAL)
ret = -ENOENT;
if (ret < 0)
@@ -3620,7 +3559,7 @@ static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd)
static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info,
void __user *arg)
{
- struct btrfs_ioctl_balance_args *bargs;
+ struct btrfs_ioctl_balance_args AUTO_KFREE(bargs);
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
@@ -3642,8 +3581,6 @@ static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info,
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
-
- kfree(bargs);
out:
mutex_unlock(&fs_info->balance_mutex);
return ret;
@@ -3718,22 +3655,6 @@ drop_write:
return ret;
}
-/*
- * Quick check for ioctl handlers if quotas are enabled. Proper locking must be
- * done before any operations.
- */
-static bool qgroup_enabled(struct btrfs_fs_info *fs_info)
-{
- bool ret = true;
-
- mutex_lock(&fs_info->qgroup_ioctl_lock);
- if (!fs_info->quota_root)
- ret = false;
- mutex_unlock(&fs_info->qgroup_ioctl_lock);
-
- return ret;
-}
-
static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
@@ -3748,7 +3669,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!qgroup_enabled(root->fs_info))
+ if (!btrfs_qgroup_enabled(fs_info))
return -ENOTCONN;
ret = mnt_want_write_file(file);
@@ -3765,7 +3686,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
if (!prealloc) {
ret = -ENOMEM;
- goto drop_write;
+ goto out;
}
}
@@ -3818,7 +3739,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!qgroup_enabled(root->fs_info))
+ if (!btrfs_qgroup_enabled(root->fs_info))
return -ENOTCONN;
ret = mnt_want_write_file(file);
@@ -3836,7 +3757,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
goto out;
}
- if (sa->create && is_fstree(sa->qgroupid)) {
+ if (sa->create && btrfs_is_fstree(sa->qgroupid)) {
ret = -EINVAL;
goto out;
}
@@ -3877,7 +3798,7 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!qgroup_enabled(root->fs_info))
+ if (!btrfs_qgroup_enabled(root->fs_info))
return -ENOTCONN;
ret = mnt_want_write_file(file);
@@ -3925,7 +3846,7 @@ static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!qgroup_enabled(fs_info))
+ if (!btrfs_qgroup_enabled(fs_info))
return -ENOTCONN;
ret = mnt_want_write_file(file);
@@ -4033,7 +3954,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
btrfs_root_id(root));
- if (ret && ret != -ENOENT) {
+ if (unlikely(ret && ret != -ENOENT)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out;
@@ -4057,7 +3978,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
ret = btrfs_uuid_tree_add(trans, sa->uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
btrfs_root_id(root));
- if (ret < 0 && ret != -EEXIST) {
+ if (unlikely(ret < 0 && ret != -EEXIST)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out;
@@ -4203,7 +4124,7 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
}
spin_lock(&fs_info->super_lock);
- strcpy(super_block->label, label);
+ strscpy(super_block->label, label);
spin_unlock(&fs_info->super_lock);
ret = btrfs_commit_transaction(trans);
@@ -4253,7 +4174,7 @@ static int check_feature_bits(const struct btrfs_fs_info *fs_info,
u64 safe_set, u64 safe_clear)
{
const char *type = btrfs_feature_set_name(set);
- char *names;
+ const char AUTO_KFREE(names);
u64 disallowed, unsupported;
u64 set_mask = flags & change_mask;
u64 clear_mask = ~flags & change_mask;
@@ -4261,12 +4182,11 @@ static int check_feature_bits(const struct btrfs_fs_info *fs_info,
unsupported = set_mask & ~supported_flags;
if (unsupported) {
names = btrfs_printable_features(set, unsupported);
- if (names) {
+ if (names)
btrfs_warn(fs_info,
"this kernel does not support the %s feature bit%s",
names, strchr(names, ',') ? "s" : "");
- kfree(names);
- } else
+ else
btrfs_warn(fs_info,
"this kernel does not support %s bits 0x%llx",
type, unsupported);
@@ -4276,12 +4196,11 @@ static int check_feature_bits(const struct btrfs_fs_info *fs_info,
disallowed = set_mask & ~safe_set;
if (disallowed) {
names = btrfs_printable_features(set, disallowed);
- if (names) {
+ if (names)
btrfs_warn(fs_info,
"can't set the %s feature bit%s while mounted",
names, strchr(names, ',') ? "s" : "");
- kfree(names);
- } else
+ else
btrfs_warn(fs_info,
"can't set %s bits 0x%llx while mounted",
type, disallowed);
@@ -4291,12 +4210,11 @@ static int check_feature_bits(const struct btrfs_fs_info *fs_info,
disallowed = clear_mask & ~safe_clear;
if (disallowed) {
names = btrfs_printable_features(set, disallowed);
- if (names) {
+ if (names)
btrfs_warn(fs_info,
"can't clear the %s feature bit%s while mounted",
names, strchr(names, ',') ? "s" : "");
- kfree(names);
- } else
+ else
btrfs_warn(fs_info,
"can't clear %s bits 0x%llx while mounted",
type, disallowed);
@@ -4510,7 +4428,7 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
args.compression, &unlocked);
if (!unlocked) {
- unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, &cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
}
}
@@ -4632,6 +4550,13 @@ out_acct:
return ret;
}
+struct btrfs_uring_encoded_data {
+ struct btrfs_ioctl_encoded_io_args args;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov;
+ struct iov_iter iter;
+};
+
/*
* Context that's attached to an encoded read io_uring command, in cmd->pdu. It
* contains the fields in btrfs_uring_read_extent that are necessary to finish
@@ -4653,16 +4578,18 @@ struct btrfs_uring_priv {
};
struct io_btrfs_cmd {
+ struct btrfs_uring_encoded_data *data;
struct btrfs_uring_priv *priv;
};
-static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags)
+static void btrfs_uring_read_finished(struct io_tw_req tw_req, io_tw_token_t tw)
{
+ struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
struct btrfs_uring_priv *priv = bc->priv;
struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp));
struct extent_io_tree *io_tree = &inode->io_tree;
- unsigned long index;
+ pgoff_t index;
u64 cur;
size_t page_offset;
ssize_t ret;
@@ -4699,10 +4626,10 @@ static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int iss
ret = priv->count;
out:
- unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
+ btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
- io_uring_cmd_done(cmd, ret, 0, issue_flags);
+ io_uring_cmd_done(cmd, ret, IO_URING_CMD_TASK_WORK_ISSUE_FLAGS);
add_rchar(current, ret);
for (index = 0; index < priv->nr_pages; index++)
@@ -4711,6 +4638,7 @@ out:
kfree(priv->pages);
kfree(priv->iov);
kfree(priv);
+ kfree(bc->data);
}
void btrfs_uring_read_extent_endio(void *ctx, int err)
@@ -4788,51 +4716,45 @@ static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
return -EIOCBQUEUED;
out_fail:
- unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, &cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
kfree(priv);
return ret;
}
-struct btrfs_uring_encoded_data {
- struct btrfs_ioctl_encoded_io_args args;
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov;
- struct iov_iter iter;
-};
-
static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
+ struct file *file = cmd->file;
+ struct btrfs_inode *inode = BTRFS_I(file->f_inode);
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags);
size_t copy_end;
int ret;
u64 disk_bytenr, disk_io_size;
- struct file *file;
- struct btrfs_inode *inode;
- struct btrfs_fs_info *fs_info;
- struct extent_io_tree *io_tree;
loff_t pos;
struct kiocb kiocb;
struct extent_state *cached_state = NULL;
u64 start, lockend;
void __user *sqe_addr;
- struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data;
+ struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
+ struct btrfs_uring_encoded_data *data = NULL;
+
+ if (cmd->flags & IORING_URING_CMD_REISSUE)
+ data = bc->data;
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
goto out_acct;
}
- file = cmd->file;
- inode = BTRFS_I(file->f_inode);
- fs_info = inode->root->fs_info;
- io_tree = &inode->io_tree;
sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
if (issue_flags & IO_URING_F_COMPAT) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags);
#else
- return -ENOTTY;
+ ret = -ENOTTY;
+ goto out_acct;
#endif
} else {
copy_end = copy_end_kernel;
@@ -4845,7 +4767,7 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
goto out_acct;
}
- io_uring_cmd_get_async_data(cmd)->op_data = data;
+ bc->data = data;
if (issue_flags & IO_URING_F_COMPAT) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
@@ -4902,6 +4824,8 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state,
&disk_bytenr, &disk_io_size);
+ if (ret == -EAGAIN)
+ goto out_acct;
if (ret < 0 && ret != -EIOCBQUEUED)
goto out_free;
@@ -4911,7 +4835,7 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
(const char *)&data->args + copy_end_kernel,
sizeof(data->args) - copy_end_kernel)) {
if (ret == -EIOCBQUEUED) {
- unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, &cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
}
ret = -EFAULT;
@@ -4941,24 +4865,29 @@ out_acct:
add_rchar(current, ret);
inc_syscr(current);
+ if (ret != -EIOCBQUEUED && ret != -EAGAIN)
+ kfree(data);
+
return ret;
}
static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
+ struct file *file = cmd->file;
loff_t pos;
struct kiocb kiocb;
- struct file *file;
ssize_t ret;
void __user *sqe_addr;
- struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data;
+ struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
+ struct btrfs_uring_encoded_data *data = NULL;
+
+ if (cmd->flags & IORING_URING_CMD_REISSUE)
+ data = bc->data;
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
goto out_acct;
}
-
- file = cmd->file;
sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
if (!(file->f_mode & FMODE_WRITE)) {
@@ -4973,7 +4902,7 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu
goto out_acct;
}
- io_uring_cmd_get_async_data(cmd)->op_data = data;
+ bc->data = data;
if (issue_flags & IO_URING_F_COMPAT) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
@@ -5063,11 +4992,17 @@ out_acct:
if (ret > 0)
add_wchar(current, ret);
inc_syscw(current);
+
+ if (ret != -EAGAIN)
+ kfree(data);
return ret;
}
int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
+ if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(cmd->file)))))
+ return -EIO;
+
switch (cmd->cmd_op) {
case BTRFS_IOC_ENCODED_READ:
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
@@ -5211,6 +5146,43 @@ static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *a
return 0;
}
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+static int btrfs_ioctl_shutdown(struct btrfs_fs_info *fs_info, unsigned long arg)
+{
+ int ret = 0;
+ u32 flags;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (get_user(flags, (u32 __user *)arg))
+ return -EFAULT;
+
+ if (flags >= BTRFS_SHUTDOWN_FLAGS_LAST)
+ return -EINVAL;
+
+ if (btrfs_is_shutdown(fs_info))
+ return 0;
+
+ switch (flags) {
+ case BTRFS_SHUTDOWN_FLAGS_LOGFLUSH:
+ case BTRFS_SHUTDOWN_FLAGS_DEFAULT:
+ ret = freeze_super(fs_info->sb, FREEZE_HOLDER_KERNEL, NULL);
+ if (ret)
+ return ret;
+ btrfs_force_shutdown(fs_info);
+ ret = thaw_super(fs_info->sb, FREEZE_HOLDER_KERNEL, NULL);
+ if (ret)
+ return ret;
+ break;
+ case BTRFS_SHUTDOWN_FLAGS_NOLOGFLUSH:
+ btrfs_force_shutdown(fs_info);
+ break;
+ }
+ return ret;
+}
+#endif
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
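
btrfs_ioctl_shutdown() added above behaves much like XFS's GOINGDOWN ioctl: the DEFAULT and LOGFLUSH flags freeze and thaw the superblock around the forced shutdown, while NOLOGFLUSH shuts down immediately. A hedged userspace sketch, assuming BTRFS_IOC_SHUTDOWN and the flag constants are exported by an experimental UAPI header:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

/* Hypothetical caller: force-shutdown the filesystem at @mountpoint. */
static int btrfs_shutdown(const char *mountpoint, __u32 flags)
{
	int fd, ret;

	fd = open(mountpoint, O_RDONLY | O_DIRECTORY);
	if (fd < 0)
		return -1;
	/* The handler reads a single u32 of flags from the pointer. */
	ret = ioctl(fd, BTRFS_IOC_SHUTDOWN, &flags);
	close(fd);
	return ret;
}
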
@@ -5229,13 +5201,13 @@ long btrfs_ioctl(struct file *file, unsigned int
case FITRIM:
return btrfs_ioctl_fitrim(fs_info, argp);
case BTRFS_IOC_SNAP_CREATE:
- return btrfs_ioctl_snap_create(file, argp, 0);
+ return btrfs_ioctl_snap_create(file, argp, false);
case BTRFS_IOC_SNAP_CREATE_V2:
- return btrfs_ioctl_snap_create_v2(file, argp, 0);
+ return btrfs_ioctl_snap_create_v2(file, argp, false);
case BTRFS_IOC_SUBVOL_CREATE:
- return btrfs_ioctl_snap_create(file, argp, 1);
+ return btrfs_ioctl_snap_create(file, argp, true);
case BTRFS_IOC_SUBVOL_CREATE_V2:
- return btrfs_ioctl_snap_create_v2(file, argp, 1);
+ return btrfs_ioctl_snap_create_v2(file, argp, true);
case BTRFS_IOC_SNAP_DESTROY:
return btrfs_ioctl_snap_destroy(file, argp, false);
case BTRFS_IOC_SNAP_DESTROY_V2:
@@ -5366,6 +5338,10 @@ long btrfs_ioctl(struct file *file, unsigned int
#endif
case BTRFS_IOC_SUBVOL_SYNC_WAIT:
return btrfs_ioctl_subvol_sync(fs_info, argp);
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ case BTRFS_IOC_SHUTDOWN:
+ return btrfs_ioctl_shutdown(fs_info, arg);
+#endif
}
return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index e08ea446cf48..ccf6bed9cc24 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -8,7 +8,7 @@
struct file;
struct dentry;
struct mnt_idmap;
-struct fileattr;
+struct file_kattr;
struct io_uring_cmd;
struct btrfs_inode;
struct btrfs_fs_info;
@@ -16,9 +16,9 @@ struct btrfs_ioctl_balance_args;
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int btrfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
int btrfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 81e62b652e21..0035851d72b0 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -149,15 +149,15 @@ void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesti
/*
* Try-lock for read.
*
- * Return 1 if the rwlock has been taken, 0 otherwise
+ * Return true if the rwlock has been taken, false otherwise
*/
-int btrfs_try_tree_read_lock(struct extent_buffer *eb)
+bool btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
if (down_read_trylock(&eb->lock)) {
trace_btrfs_try_tree_read_lock(eb);
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/*
@@ -361,7 +361,7 @@ void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
atomic_inc(&lock->readers);
/*
- * Ensure the pending reader count is perceieved BEFORE this reader
+ * Ensure the pending reader count is perceived BEFORE this reader
* goes to sleep in case of active writers. This guarantees new writers
* won't be allowed and that the current reader will be woken up when
* the last active writer finishes its jobs.
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index c69e57ff804b..a4673e7d95d7 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -74,7 +74,7 @@ enum btrfs_lock_nesting {
BTRFS_NESTING_NEW_ROOT,
/*
- * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so
+ * We are limited to MAX_LOCKDEP_SUBCLASSES number of subclasses, so
* add this in here and add a static_assert to keep us from going over
* the limit. As of this writing we're limited to 8, and we're
* definitely using 8, hence this check to keep us from messing up in
@@ -189,7 +189,7 @@ static inline void btrfs_tree_read_lock(struct extent_buffer *eb)
}
void btrfs_tree_read_unlock(struct extent_buffer *eb);
-int btrfs_try_tree_read_lock(struct extent_buffer *eb);
+bool btrfs_try_tree_read_lock(struct extent_buffer *eb);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index a45bc11f8665..4758f66da449 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -58,9 +58,6 @@
* 0x1000 | SegHdr N+1| Data payload N+1 ... |
*/
-#define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE))
-#define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE))
-
struct workspace {
void *mem;
void *buf; /* where decompressed data goes */
@@ -68,7 +65,14 @@ struct workspace {
struct list_head list;
};
-static struct workspace_manager wsm;
+static u32 workspace_buf_length(const struct btrfs_fs_info *fs_info)
+{
+ return lzo1x_worst_compress(fs_info->sectorsize);
+}
+static u32 workspace_cbuf_length(const struct btrfs_fs_info *fs_info)
+{
+ return lzo1x_worst_compress(fs_info->sectorsize);
+}
void lzo_free_workspace(struct list_head *ws)
{
@@ -80,7 +84,7 @@ void lzo_free_workspace(struct list_head *ws)
kfree(workspace);
}
-struct list_head *lzo_alloc_workspace(void)
+struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info)
{
struct workspace *workspace;
@@ -89,8 +93,8 @@ struct list_head *lzo_alloc_workspace(void)
return ERR_PTR(-ENOMEM);
workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL | __GFP_NOWARN);
- workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL | __GFP_NOWARN);
- workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL | __GFP_NOWARN);
+ workspace->buf = kvmalloc(workspace_buf_length(fs_info), GFP_KERNEL | __GFP_NOWARN);
+ workspace->cbuf = kvmalloc(workspace_cbuf_length(fs_info), GFP_KERNEL | __GFP_NOWARN);
if (!workspace->mem || !workspace->buf || !workspace->cbuf)
goto fail;
@@ -128,19 +132,21 @@ static inline size_t read_compress_length(const char *buf)
*
* Will allocate new pages when needed.
*/
-static int copy_compressed_data_to_page(char *compressed_data,
+static int copy_compressed_data_to_page(struct btrfs_fs_info *fs_info,
+ char *compressed_data,
size_t compressed_size,
struct folio **out_folios,
unsigned long max_nr_folio,
- u32 *cur_out,
- const u32 sectorsize)
+ u32 *cur_out)
{
+ const u32 sectorsize = fs_info->sectorsize;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
u32 sector_bytes_left;
u32 orig_out;
struct folio *cur_folio;
char *kaddr;
- if ((*cur_out / PAGE_SIZE) >= max_nr_folio)
+ if ((*cur_out >> min_folio_shift) >= max_nr_folio)
return -E2BIG;
/*
@@ -149,18 +155,17 @@ static int copy_compressed_data_to_page(char *compressed_data,
*/
ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize);
- cur_folio = out_folios[*cur_out / PAGE_SIZE];
+ cur_folio = out_folios[*cur_out >> min_folio_shift];
/* Allocate a new page */
if (!cur_folio) {
- cur_folio = btrfs_alloc_compr_folio();
+ cur_folio = btrfs_alloc_compr_folio(fs_info);
if (!cur_folio)
return -ENOMEM;
- out_folios[*cur_out / PAGE_SIZE] = cur_folio;
+ out_folios[*cur_out >> min_folio_shift] = cur_folio;
}
- kaddr = kmap_local_folio(cur_folio, 0);
- write_compress_length(kaddr + offset_in_page(*cur_out),
- compressed_size);
+ kaddr = kmap_local_folio(cur_folio, offset_in_folio(cur_folio, *cur_out));
+ write_compress_length(kaddr, compressed_size);
*cur_out += LZO_LEN;
orig_out = *cur_out;
@@ -172,20 +177,20 @@ static int copy_compressed_data_to_page(char *compressed_data,
kunmap_local(kaddr);
- if ((*cur_out / PAGE_SIZE) >= max_nr_folio)
+ if ((*cur_out >> min_folio_shift) >= max_nr_folio)
return -E2BIG;
- cur_folio = out_folios[*cur_out / PAGE_SIZE];
+ cur_folio = out_folios[*cur_out >> min_folio_shift];
/* Allocate a new page */
if (!cur_folio) {
- cur_folio = btrfs_alloc_compr_folio();
+ cur_folio = btrfs_alloc_compr_folio(fs_info);
if (!cur_folio)
return -ENOMEM;
- out_folios[*cur_out / PAGE_SIZE] = cur_folio;
+ out_folios[*cur_out >> min_folio_shift] = cur_folio;
}
kaddr = kmap_local_folio(cur_folio, 0);
- memcpy(kaddr + offset_in_page(*cur_out),
+ memcpy(kaddr + offset_in_folio(cur_folio, *cur_out),
compressed_data + *cur_out - orig_out, copy_len);
*cur_out += copy_len;
@@ -209,12 +214,15 @@ out:
return 0;
}
-int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
+int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct workspace *workspace = list_entry(ws, struct workspace, list);
- const u32 sectorsize = inode_to_fs_info(mapping->host)->sectorsize;
+ const u32 sectorsize = fs_info->sectorsize;
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
struct folio *folio_in = NULL;
char *sizes_ptr;
const unsigned long max_nr_folio = *out_folios;
@@ -252,9 +260,8 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
/* Compress at most one sector of data each time */
in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
ASSERT(in_len);
- data_in = kmap_local_folio(folio_in, 0);
- ret = lzo1x_1_compress(data_in +
- offset_in_page(cur_in), in_len,
+ data_in = kmap_local_folio(folio_in, offset_in_folio(folio_in, cur_in));
+ ret = lzo1x_1_compress(data_in, in_len,
workspace->cbuf, &out_len,
workspace->mem);
kunmap_local(data_in);
@@ -264,9 +271,9 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
goto out;
}
- ret = copy_compressed_data_to_page(workspace->cbuf, out_len,
+ ret = copy_compressed_data_to_page(fs_info, workspace->cbuf, out_len,
folios, max_nr_folio,
- &cur_out, sectorsize);
+ &cur_out);
if (ret < 0)
goto out;
@@ -281,8 +288,8 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
goto out;
}
- /* Check if we have reached page boundary */
- if (PAGE_ALIGNED(cur_in)) {
+		/* Check if we have reached a folio boundary. */
+ if (IS_ALIGNED(cur_in, min_folio_size)) {
folio_put(folio_in);
folio_in = NULL;
}
@@ -299,7 +306,7 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
out:
if (folio_in)
folio_put(folio_in);
- *out_folios = DIV_ROUND_UP(cur_out, PAGE_SIZE);
+ *out_folios = DIV_ROUND_UP(cur_out, min_folio_size);
return ret;
}
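
The conversions above replace `cur_out / PAGE_SIZE` folio indexing with `cur_out >> min_folio_shift`, where min_folio_shift is PAGE_SHIFT + fs_info->block_min_order, so the index tracks the filesystem's minimum folio size rather than the page size. A standalone sketch of just that arithmetic (the PAGE_SHIFT value and block_min_order semantics are assumptions for illustration):

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed 4K pages */

/* Index of the minimum-size folio that covers byte offset @cur_out. */
static unsigned long folio_index(unsigned long long cur_out,
				 unsigned int block_min_order)
{
	unsigned int min_folio_shift = PAGE_SHIFT + block_min_order;

	/* Same as cur_out / (PAGE_SIZE << block_min_order). */
	return cur_out >> min_folio_shift;
}

int main(void)
{
	/* 4K pages, order-2 (16K) minimum folios: byte 40000 is in folio 2. */
	printf("%lu\n", folio_index(40000, 2));
	return 0;
}
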
@@ -311,15 +318,16 @@ out:
static void copy_compressed_segment(struct compressed_bio *cb,
char *dest, u32 len, u32 *cur_in)
{
+ struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
u32 orig_in = *cur_in;
while (*cur_in < orig_in + len) {
- struct folio *cur_folio;
- u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
- orig_in + len - *cur_in);
+ struct folio *cur_folio = cb->compressed_folios[*cur_in >> min_folio_shift];
+ u32 copy_len = min_t(u32, orig_in + len - *cur_in,
+ folio_size(cur_folio) - offset_in_folio(cur_folio, *cur_in));
ASSERT(copy_len);
- cur_folio = cb->compressed_folios[*cur_in / PAGE_SIZE];
memcpy_from_folio(dest + *cur_in - orig_in, cur_folio,
offset_in_folio(cur_folio, *cur_in), copy_len);
@@ -333,6 +341,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
struct workspace *workspace = list_entry(ws, struct workspace, list);
const struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info;
const u32 sectorsize = fs_info->sectorsize;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
char *kaddr;
int ret;
/* Compressed data length, can be unaligned */
@@ -379,14 +388,14 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
*/
ASSERT(cur_in / sectorsize ==
(cur_in + LZO_LEN - 1) / sectorsize);
- cur_folio = cb->compressed_folios[cur_in / PAGE_SIZE];
+ cur_folio = cb->compressed_folios[cur_in >> min_folio_shift];
ASSERT(cur_folio);
kaddr = kmap_local_folio(cur_folio, 0);
- seg_len = read_compress_length(kaddr + offset_in_page(cur_in));
+ seg_len = read_compress_length(kaddr + offset_in_folio(cur_folio, cur_in));
kunmap_local(kaddr);
cur_in += LZO_LEN;
- if (unlikely(seg_len > WORKSPACE_CBUF_LENGTH)) {
+ if (unlikely(seg_len > workspace_cbuf_length(fs_info))) {
struct btrfs_inode *inode = cb->bbio.inode;
/*
@@ -446,19 +455,19 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
const u32 sectorsize = fs_info->sectorsize;
size_t in_len;
size_t out_len;
- size_t max_segment_len = WORKSPACE_BUF_LENGTH;
+ size_t max_segment_len = workspace_buf_length(fs_info);
int ret = 0;
- if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)
+ if (unlikely(srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2))
return -EUCLEAN;
in_len = read_compress_length(data_in);
- if (in_len != srclen)
+ if (unlikely(in_len != srclen))
return -EUCLEAN;
data_in += LZO_LEN;
in_len = read_compress_length(data_in);
- if (in_len != srclen - LZO_LEN * 2) {
+ if (unlikely(in_len != srclen - LZO_LEN * 2)) {
ret = -EUCLEAN;
goto out;
}
@@ -488,8 +497,7 @@ out:
return ret;
}
-const struct btrfs_compress_op btrfs_lzo_compress = {
- .workspace_manager = &wsm,
+const struct btrfs_compress_levels btrfs_lzo_compress = {
.max_level = 1,
.default_level = 1,
};
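
With the fixed WORKSPACE_*_LENGTH constants gone, the workspace buffers are sized from the per-filesystem sectorsize via lzo1x_worst_compress(), which bounds how much LZO1X can expand incompressible input. The bound itself, reproduced from include/linux/lzo.h for illustration:

/* Worst-case LZO1X output size for @x input bytes (include/linux/lzo.h). */
#define lzo1x_worst_compress(x) ((x) + ((x) / 16) + 64 + 3)

/* One 4 KiB sector may expand to at most 4096 + 256 + 67 = 4419 bytes. */
_Static_assert(lzo1x_worst_compress(4096) == 4419, "LZO worst case for 4K");
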
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index 363fd28c0268..2f853de44473 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -18,11 +18,13 @@ static const char fs_state_chars[] = {
[BTRFS_FS_STATE_REMOUNTING] = 'M',
[BTRFS_FS_STATE_RO] = 0,
[BTRFS_FS_STATE_TRANS_ABORTED] = 'A',
+ [BTRFS_FS_STATE_LOG_REPLAY_ABORTED] = 'O',
[BTRFS_FS_STATE_DEV_REPLACING] = 'R',
[BTRFS_FS_STATE_DUMMY_FS_INFO] = 0,
[BTRFS_FS_STATE_NO_DATA_CSUMS] = 'C',
[BTRFS_FS_STATE_SKIP_META_CSUMS] = 'S',
[BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L',
+ [BTRFS_FS_STATE_EMERGENCY_SHUTDOWN] = 'E',
};
static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf)
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index 08a9272399d2..d8c0bd17dcda 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -36,106 +36,46 @@ void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
btrfs_no_printk(fs_info, fmt, ##args)
#endif
-#define btrfs_emerg(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_ALERT fmt, ##args)
-#define btrfs_crit(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_CRIT fmt, ##args)
-#define btrfs_err(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_ERR fmt, ##args)
-#define btrfs_warn(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_NOTICE fmt, ##args)
-#define btrfs_info(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_INFO fmt, ##args)
-
/*
- * Wrappers that use printk_in_rcu
+ * Print a message with filesystem info, enclosed in RCU protection.
*/
-#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args)
-#define btrfs_crit_in_rcu(fs_info, fmt, args...) \
+#define btrfs_crit(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args)
-#define btrfs_err_in_rcu(fs_info, fmt, args...) \
+#define btrfs_err(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args)
-#define btrfs_warn_in_rcu(fs_info, fmt, args...) \
+#define btrfs_warn(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
-#define btrfs_info_in_rcu(fs_info, fmt, args...) \
+#define btrfs_info(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args)
/*
- * Wrappers that use a ratelimited printk_in_rcu
- */
-#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args)
-#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
-#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
-#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
-#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
-
-/*
* Wrappers that use a ratelimited printk
*/
-#define btrfs_emerg_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args)
#define btrfs_crit_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
#define btrfs_err_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
#define btrfs_warn_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
#define btrfs_info_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
#if defined(CONFIG_DYNAMIC_DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
- _dynamic_func_call_no_desc(fmt, btrfs_printk, \
- fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
_dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \
fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
- _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \
- fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl(fs_info, fmt, args...) \
- _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \
fs_info, KERN_DEBUG fmt, ##args)
#elif defined(DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
#else
-#define btrfs_debug(fs_info, fmt, args...) \
- btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
- btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl(fs_info, fmt, args...) \
- btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
+/* When printk() is no_printk(), expand to a no-op. */
+#define btrfs_debug(fs_info, fmt, args...) do { (void)(fs_info); } while (0)
+#define btrfs_debug_rl(fs_info, fmt, args...) do { (void)(fs_info); } while (0)
#endif
#define btrfs_printk_in_rcu(fs_info, fmt, args...) \
@@ -145,40 +85,98 @@ do { \
rcu_read_unlock(); \
} while (0)
-#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \
-do { \
- rcu_read_lock(); \
- btrfs_no_printk(fs_info, fmt, ##args); \
- rcu_read_unlock(); \
-} while (0)
-
-#define btrfs_printk_ratelimited(fs_info, fmt, args...) \
+#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \
do { \
static DEFINE_RATELIMIT_STATE(_rs, \
DEFAULT_RATELIMIT_INTERVAL, \
DEFAULT_RATELIMIT_BURST); \
+ \
+ rcu_read_lock(); \
if (__ratelimit(&_rs)) \
btrfs_printk(fs_info, fmt, ##args); \
-} while (0)
-
-#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \
-do { \
- rcu_read_lock(); \
- btrfs_printk_ratelimited(fs_info, fmt, ##args); \
rcu_read_unlock(); \
} while (0)
#ifdef CONFIG_BTRFS_ASSERT
-#define btrfs_assertfail(expr, file, line) ({ \
- pr_err("assertion failed: %s, in %s:%d\n", (expr), (file), (line)); \
- BUG(); \
-})
+__printf(1, 2)
+static inline void verify_assert_printk_format(const char *fmt, ...)
+{
+	/* Stub to verify the assertion format string. */
+}
+
+/* Take the first token if any. */
+#define __FIRST_ARG(_, ...) _
+/*
+ * Skip the first token and return the rest; if it's empty, the comma is dropped.
+ * Since ##__VA_ARGS__ cannot appear at the beginning of a macro, __VA_OPT__ is
+ * needed; it is supported since GCC 8 and Clang 12.
+ */
+#define __REST_ARGS(_, ...) __VA_OPT__(,) __VA_ARGS__
+
+#if defined(CONFIG_CC_IS_CLANG) || GCC_VERSION >= 80000
+/*
+ * Assertion with optional printk() format.
+ *
+ * Accepted syntax:
+ * ASSERT(condition);
+ * ASSERT(condition, "string");
+ * ASSERT(condition, "variable=%d", variable);
+ *
+ * How it works:
+ * - if there's no format string, ""[0] evaluates at compile time to 0 and the
+ * true branch is executed
+ * - any non-empty format string with the "" prefix evaluates to != 0 at
+ * compile time and the false branch is executed
+ * - stringified condition is printed as %s so we don't accidentally mix format
+ * strings (the % operator)
+ * - there can be only one printk() call, so the format strings and arguments are
+ * spliced together:
+ * DEFAULT_FMT [USER_FMT], DEFAULT_ARGS [, USER_ARGS]
+ * - the comma between DEFAULT_ARGS and USER_ARGS is handled by the preprocessor
+ *   (requires __VA_OPT__ support)
+ * - otherwise we would have to use __VA_OPT__(,) __VA_ARGS__ for the 2nd and
+ *   later arguments
+ */
+#define ASSERT(cond, args...) \
+do { \
+ verify_assert_printk_format("check the format string" args); \
+ if (!likely(cond)) { \
+ if (("" __FIRST_ARG(args) [0]) == 0) { \
+ pr_err("assertion failed: %s :: %ld, in %s:%d\n", \
+ #cond, (long)(cond), __FILE__, __LINE__); \
+ } else { \
+ pr_err("assertion failed: %s :: %ld, in %s:%d (" __FIRST_ARG(args) ")\n", \
+ #cond, (long)(cond), __FILE__, __LINE__ __REST_ARGS(args)); \
+ } \
+ BUG(); \
+ } \
+} while (0)
+
+#else
+
+/* For GCC < 8.x only the simple output. */
+
+#define ASSERT(cond, args...) \
+do { \
+ verify_assert_printk_format("check the format string" args); \
+ if (!likely(cond)) { \
+ pr_err("assertion failed: %s :: %ld, in %s:%d\n", \
+ #cond, (long)(cond), __FILE__, __LINE__); \
+ BUG(); \
+ } \
+} while (0)
+
+#endif
+
+#else
+/* Compile check the @cond expression but don't generate any code. */
+#define ASSERT(cond, args...) BUILD_BUG_ON_INVALID(cond)
+#endif
-#define ASSERT(expr) \
- (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__))
+#ifdef CONFIG_BTRFS_DEBUG
+/* Verbose warning only under debug build. */
+#define DEBUG_WARN(args...) WARN(1, KERN_ERR args)
#else
-#define ASSERT(expr) (void)(expr)
+#define DEBUG_WARN(...) do {} while (0)
#endif
__printf(5, 6)
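
With the format-string support above, ASSERT() call sites can carry context that is printed before BUG(), while the plain single-argument form keeps working. Illustrative call sites (the variables are placeholders):

ASSERT(ret == 0);
ASSERT(ret == 0, "unexpected error");
ASSERT(cur < end, "cur=%llu end=%llu", cur, end);
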
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index 0d599fd847c9..12c5a9d6564f 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -7,8 +7,18 @@
#include <linux/bitmap.h>
#include <linux/sched.h>
#include <linux/wait.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
#include <linux/math64.h>
#include <linux/rbtree.h>
+#include <linux/bio.h>
+
+/*
+ * Convenience macros to define a pointer with the __free(kfree) and
+ * __free(kvfree) cleanup attributes and initialized to NULL.
+ */
+#define AUTO_KFREE(name) *name __free(kfree) = NULL
+#define AUTO_KVFREE(name) *name __free(kvfree) = NULL
/*
* Enumerate bits using enum autoincrement. Define the @name as the n-th bit.
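
AUTO_KFREE()/AUTO_KVFREE() added above wrap the kernel's scope-based __free() cleanup so every return path frees the buffer automatically. A minimal sketch of a consumer (the function and its arguments are hypothetical):

static int example_copy_in(const void __user *arg, size_t size)
{
	char AUTO_KFREE(buf);	/* expands to: char *buf __free(kfree) = NULL; */

	buf = kzalloc(size, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	if (copy_from_user(buf, arg, size))
		return -EFAULT;	/* buf is kfreed on this path too */
	/* ... consume buf ... */
	return 0;
}
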
@@ -18,6 +28,54 @@
name = (1U << __ ## name ## _BIT), \
__ ## name ## _SEQ = __ ## name ## _BIT
+static inline phys_addr_t bio_iter_phys(struct bio *bio, struct bvec_iter *iter)
+{
+ struct bio_vec bv = bio_iter_iovec(bio, *iter);
+
+ return bvec_phys(&bv);
+}
+
+/*
+ * Iterate bio using btrfs block size.
+ *
+ * This will handle large folio and highmem.
+ *
+ * @paddr: Physical memory address of each iteration
+ * @bio: The bio to iterate
+ * @iter: The bvec_iter (pointer) to use.
+ * @blocksize: The blocksize to iterate.
+ *
+ * This requires all folios in the bio to cover at least one block.
+ */
+#define btrfs_bio_for_each_block(paddr, bio, iter, blocksize) \
+ for (; (iter)->bi_size && \
+ (paddr = bio_iter_phys((bio), (iter)), 1); \
+ bio_advance_iter_single((bio), (iter), (blocksize)))
+
+/* Initialize a bvec_iter to the size of the specified bio. */
+static inline struct bvec_iter init_bvec_iter_for_bio(struct bio *bio)
+{
+ struct bio_vec *bvec;
+ u32 bio_size = 0;
+ int i;
+
+ bio_for_each_bvec_all(bvec, bio, i)
+ bio_size += bvec->bv_len;
+
+ return (struct bvec_iter) {
+ .bi_sector = 0,
+ .bi_size = bio_size,
+ .bi_idx = 0,
+ .bi_bvec_done = 0,
+ };
+}
+
+#define btrfs_bio_for_each_block_all(paddr, bio, blocksize) \
+ for (struct bvec_iter iter = init_bvec_iter_for_bio(bio); \
+ (iter).bi_size && \
+ (paddr = bio_iter_phys((bio), &(iter)), 1); \
+ bio_advance_iter_single((bio), &(iter), (blocksize)))
+
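
btrfs_bio_for_each_block_all() above sizes a private bvec_iter to the whole bio and then advances it one filesystem block at a time, yielding a physical address per block even across large folios and highmem. A hedged sketch of a consumer (the per-block work is elided):

/* Hypothetical consumer: visit each blocksize-sized chunk of @bio. */
static void example_visit_blocks(struct bio *bio, u32 blocksize)
{
	phys_addr_t paddr;

	btrfs_bio_for_each_block_all(paddr, bio, blocksize) {
		/* map the page backing @paddr and e.g. checksum one block */
	}
}
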
static inline void cond_wake_up(struct wait_queue_head *wq)
{
/*
@@ -119,28 +177,23 @@ static inline struct rb_node *rb_simple_search_first(const struct rb_root *root,
return ret;
}
-static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr,
- struct rb_node *node)
+static int rb_simple_node_bytenr_cmp(struct rb_node *new, const struct rb_node *existing)
{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct rb_simple_node *entry;
+ struct rb_simple_node *new_entry = rb_entry(new, struct rb_simple_node, rb_node);
+ struct rb_simple_node *existing_entry = rb_entry(existing, struct rb_simple_node, rb_node);
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct rb_simple_node, rb_node);
+ if (new_entry->bytenr < existing_entry->bytenr)
+ return -1;
+ else if (new_entry->bytenr > existing_entry->bytenr)
+ return 1;
- if (bytenr < entry->bytenr)
- p = &(*p)->rb_left;
- else if (bytenr > entry->bytenr)
- p = &(*p)->rb_right;
- else
- return parent;
- }
+ return 0;
+}
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
- return NULL;
+static inline struct rb_node *rb_simple_insert(struct rb_root *root,
+ struct rb_simple_node *simple_node)
+{
+ return rb_find_add(&simple_node->rb_node, root, rb_simple_node_bytenr_cmp);
}
static inline bool bitmap_test_range_all_set(const unsigned long *addr,
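
rb_simple_insert() now delegates to the rbtree library's rb_find_add() with a three-way bytenr comparator instead of open-coding the descent. As before, a non-NULL return is the already-existing node with the same bytenr. A hedged sketch of a caller:

/* Hypothetical caller: returns true when @sn was newly inserted. */
static bool example_insert(struct rb_root *root, struct rb_simple_node *sn)
{
	return rb_simple_insert(root, sn) == NULL;
}
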
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 03c945711003..5df02c707aee 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -153,25 +153,30 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
struct btrfs_ordered_extent *entry;
int ret;
u64 qgroup_rsv = 0;
+ const bool is_nocow = (flags &
+ ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC)));
- if (flags &
- ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
- /* For nocow write, we can release the qgroup rsv right now */
+ /*
+ * For a NOCOW write we can free the qgroup reserve right now. For a COW
+ * one we transfer the reserved space from the inode's iotree into the
+ * ordered extent by calling btrfs_qgroup_release_data() and tracking
+ * the qgroup reserved amount in the ordered extent, so that later after
+ * completing the ordered extent, when running the data delayed ref it
+ * creates, we free the reserved data with btrfs_qgroup_free_refroot().
+ */
+ if (is_nocow)
ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv);
- if (ret < 0)
- return ERR_PTR(ret);
- } else {
- /*
- * The ordered extent has reserved qgroup space, release now
- * and pass the reserved number for qgroup_record to free.
- */
+ else
ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv);
- if (ret < 0)
- return ERR_PTR(ret);
- }
+
+ if (ret < 0)
+ return ERR_PTR(ret);
+
entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
- if (!entry)
- return ERR_PTR(-ENOMEM);
+ if (!entry) {
+ entry = ERR_PTR(-ENOMEM);
+ goto out;
+ }
entry->file_offset = file_offset;
entry->num_bytes = num_bytes;
@@ -180,7 +185,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
entry->disk_num_bytes = disk_num_bytes;
entry->offset = offset;
entry->bytes_left = num_bytes;
- entry->inode = BTRFS_I(igrab(&inode->vfs_inode));
+ if (WARN_ON_ONCE(!igrab(&inode->vfs_inode))) {
+ kmem_cache_free(btrfs_ordered_extent_cache, entry);
+ entry = ERR_PTR(-ESTALE);
+ goto out;
+ }
+ entry->inode = inode;
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
entry->qgroup_rsv = qgroup_rsv;
@@ -203,6 +213,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
btrfs_mod_outstanding_extents(inode, 1);
spin_unlock(&inode->lock);
+out:
+ if (IS_ERR(entry) && !is_nocow)
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
+ btrfs_root_id(inode->root),
+ qgroup_rsv, BTRFS_QGROUP_RSV_DATA);
+
return entry;
}
@@ -221,14 +237,14 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
/* One ref for the tree. */
refcount_inc(&entry->refs);
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
node = tree_insert(&inode->ordered_tree, entry->file_offset,
&entry->rb_node);
if (unlikely(node))
btrfs_panic(fs_info, -EEXIST,
"inconsistency in ordered tree at offset %llu",
entry->file_offset);
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
spin_lock(&root->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
@@ -253,7 +269,7 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
* @disk_bytenr: Offset of extent on disk.
* @disk_num_bytes: Size of extent on disk.
* @offset: Offset into unencoded data where file data starts.
- * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*).
+ * @flags: Flags specifying type of extent (1U << BTRFS_ORDERED_*).
* @compress_type: Compression algorithm used for data.
*
* Most of these parameters correspond to &struct btrfs_file_extent_item. The
@@ -312,9 +328,9 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
{
struct btrfs_inode *inode = entry->inode;
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
list_add_tail(&sum->list, &entry->list);
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
}
void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered)
@@ -343,7 +359,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
if (folio) {
ASSERT(folio->mapping);
ASSERT(folio_pos(folio) <= file_offset);
- ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio));
+ ASSERT(file_offset + len <= folio_next_pos(folio));
/*
* Ordered flag indicates whether we still have
@@ -401,15 +417,14 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
bool uptodate)
{
struct btrfs_inode *inode = ordered->inode;
- unsigned long flags;
bool ret;
trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);
- spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+ spin_lock(&inode->ordered_tree_lock);
ret = can_finish_ordered_extent(ordered, folio, file_offset, len,
uptodate);
- spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+ spin_unlock(&inode->ordered_tree_lock);
/*
* If this is a COW write it means we created new extent maps for the
@@ -465,18 +480,16 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
{
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- unsigned long flags;
u64 cur = file_offset;
+ const u64 end = file_offset + num_bytes;
- trace_btrfs_writepage_end_io_hook(inode, file_offset,
- file_offset + num_bytes - 1,
- uptodate);
+ trace_btrfs_writepage_end_io_hook(inode, file_offset, end - 1, uptodate);
- spin_lock_irqsave(&inode->ordered_tree_lock, flags);
- while (cur < file_offset + num_bytes) {
+ spin_lock(&inode->ordered_tree_lock);
+ while (cur < end) {
u64 entry_end;
- u64 end;
- u32 len;
+ u64 this_end;
+ u64 len;
node = ordered_tree_search(inode, cur);
/* No ordered extents at all */
@@ -519,19 +532,18 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
* |
* cur
*/
- end = min(entry->file_offset + entry->num_bytes,
- file_offset + num_bytes) - 1;
- ASSERT(end + 1 - cur < U32_MAX);
- len = end + 1 - cur;
+ this_end = min(entry_end, end);
+ len = this_end - cur;
+ ASSERT(len < U32_MAX);
if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) {
- spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+ spin_unlock(&inode->ordered_tree_lock);
btrfs_queue_ordered_fn(entry);
- spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+ spin_lock(&inode->ordered_tree_lock);
}
cur += len;
}
- spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+ spin_unlock(&inode->ordered_tree_lock);
}
/*
@@ -557,10 +569,9 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
{
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- unsigned long flags;
bool finished = false;
- spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+ spin_lock(&inode->ordered_tree_lock);
if (cached && *cached) {
entry = *cached;
goto have_entry;
@@ -597,7 +608,7 @@ out:
refcount_inc(&entry->refs);
trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
}
- spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+ spin_unlock(&inode->ordered_tree_lock);
return finished;
}
@@ -607,23 +618,18 @@ out:
*/
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
- struct list_head *cur;
- struct btrfs_ordered_sum *sum;
-
trace_btrfs_ordered_extent_put(entry->inode, entry);
if (refcount_dec_and_test(&entry->refs)) {
+ struct btrfs_ordered_sum *sum;
+ struct btrfs_ordered_sum *tmp;
+
ASSERT(list_empty(&entry->root_extent_list));
ASSERT(list_empty(&entry->log_list));
ASSERT(RB_EMPTY_NODE(&entry->rb_node));
- if (entry->inode)
- btrfs_add_delayed_iput(entry->inode);
- while (!list_empty(&entry->list)) {
- cur = entry->list.next;
- sum = list_entry(cur, struct btrfs_ordered_sum, list);
- list_del(&sum->list);
+ btrfs_add_delayed_iput(entry->inode);
+ list_for_each_entry_safe(sum, tmp, &entry->list, list)
kvfree(sum);
- }
kmem_cache_free(btrfs_ordered_extent_cache, entry);
}
}
@@ -667,7 +673,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
fs_info->delalloc_batch);
- spin_lock_irq(&btrfs_inode->ordered_tree_lock);
+ spin_lock(&btrfs_inode->ordered_tree_lock);
node = &entry->rb_node;
rb_erase(node, &btrfs_inode->ordered_tree);
RB_CLEAR_NODE(node);
@@ -675,7 +681,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
btrfs_inode->ordered_tree_last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
- spin_unlock_irq(&btrfs_inode->ordered_tree_lock);
+ spin_unlock(&btrfs_inode->ordered_tree_lock);
/*
* The current running transaction is waiting on us, we need to let it
@@ -960,9 +966,8 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
{
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- unsigned long flags;
- spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+ spin_lock(&inode->ordered_tree_lock);
node = ordered_tree_search(inode, file_offset);
if (!node)
goto out;
@@ -975,7 +980,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
trace_btrfs_ordered_extent_lookup(inode, entry);
}
out:
- spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+ spin_unlock(&inode->ordered_tree_lock);
return entry;
}
@@ -988,7 +993,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
node = ordered_tree_search(inode, file_offset);
if (!node) {
node = ordered_tree_search(inode, file_offset + len);
@@ -1015,7 +1020,7 @@ out:
refcount_inc(&entry->refs);
trace_btrfs_ordered_extent_lookup_range(inode, entry);
}
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
return entry;
}
@@ -1030,7 +1035,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
btrfs_assert_inode_locked(inode);
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) {
struct btrfs_ordered_extent *ordered;
@@ -1044,7 +1049,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
refcount_inc(&ordered->refs);
trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
}
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
}
/*
@@ -1057,7 +1062,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
node = ordered_tree_search(inode, file_offset);
if (!node)
goto out;
@@ -1066,7 +1071,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
refcount_inc(&entry->refs);
trace_btrfs_ordered_extent_lookup_first(inode, entry);
out:
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
return entry;
}
@@ -1088,7 +1093,7 @@ struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
struct rb_node *next;
struct btrfs_ordered_extent *entry = NULL;
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
node = inode->ordered_tree.rb_node;
/*
* Here we don't want to use tree_search() which will use tree->last
@@ -1143,7 +1148,7 @@ out:
trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
}
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
return entry;
}
@@ -1173,7 +1178,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
cachedp = cached_state;
while (1) {
- lock_extent(&inode->io_tree, start, end, cachedp);
+ btrfs_lock_extent(&inode->io_tree, start, end, cachedp);
ordered = btrfs_lookup_ordered_range(inode, start,
end - start + 1);
if (!ordered) {
@@ -1186,7 +1191,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
refcount_dec(&cache->refs);
break;
}
- unlock_extent(&inode->io_tree, start, end, cachedp);
+ btrfs_unlock_extent(&inode->io_tree, start, end, cachedp);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
}
@@ -1204,7 +1209,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
{
struct btrfs_ordered_extent *ordered;
- if (!try_lock_extent(&inode->io_tree, start, end, cached_state))
+ if (!btrfs_try_lock_extent(&inode->io_tree, start, end, cached_state))
return false;
ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
@@ -1212,7 +1217,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
return true;
btrfs_put_ordered_extent(ordered);
- unlock_extent(&inode->io_tree, start, end, cached_state);
+ btrfs_unlock_extent(&inode->io_tree, start, end, cached_state);
return false;
}
@@ -1275,9 +1280,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
/*
* Take the root's ordered_extent_lock to avoid a race with
* btrfs_wait_ordered_extents() when updating the disk_bytenr and
- * disk_num_bytes fields of the ordered extent below. And we disable
- * IRQs because the inode's ordered_tree_lock is used in IRQ context
- * elsewhere.
+ * disk_num_bytes fields of the ordered extent below.
*
* There's no concern about a previous caller of
* btrfs_wait_ordered_extents() getting the trimmed ordered extent
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fc821aa446f0..f189bf09ce6a 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -6,12 +6,19 @@
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
+#include "file-item.h"
#include "print-tree.h"
#include "accessors.h"
#include "tree-checker.h"
#include "volumes.h"
#include "raid-stripe-tree.h"
+/*
+ * Large enough buffer size for the stringification of any key type yet short
+ * enough to use the stack and avoid allocations.
+ */
+#define KEY_TYPE_BUF_SIZE 32
+
struct root_name_map {
u64 id;
const char *name;
@@ -124,7 +131,7 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type
struct btrfs_tree_block_info *info;
info = (struct btrfs_tree_block_info *)(ei + 1);
btrfs_tree_block_key(eb, info, &key);
- pr_info("\t\ttree block key (%llu %u %llu) level %d\n",
+ pr_info("\t\ttree block key " BTRFS_KEY_FMT " level %d\n",
btrfs_disk_key_objectid(&key), key.type,
btrfs_disk_key_offset(&key),
btrfs_tree_block_level(eb, info));
@@ -190,7 +197,7 @@ static void print_uuid_item(const struct extent_buffer *l, unsigned long offset,
u32 item_size)
{
if (!IS_ALIGNED(item_size, sizeof(u64))) {
- pr_warn("BTRFS: uuid item with illegal size %lu!\n",
+ btrfs_warn(l->fs_info, "uuid item with illegal size %lu",
(unsigned long)item_size);
return;
}
@@ -223,25 +230,212 @@ static void print_eb_refs_lock(const struct extent_buffer *eb)
{
#ifdef CONFIG_BTRFS_DEBUG
btrfs_info(eb->fs_info, "refs %u lock_owner %u current %u",
- atomic_read(&eb->refs), eb->lock_owner, current->pid);
+ refcount_read(&eb->refs), eb->lock_owner, current->pid);
#endif
}
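print_eb_refs_lock() now reads eb->refs with refcount_read(), reflecting a conversion of the extent-buffer reference count from atomic_t to refcount_t elsewhere in this series (the refcount_inc() hunks in qgroup.c below belong to the same change). Unlike atomic_t, refcount_t saturates instead of wrapping and WARNs on suspicious transitions. A standalone sketch of the API, with illustrative names:

    #include <linux/refcount.h>

    struct demo_obj {
        refcount_t refs;
    };

    static void demo_obj_init(struct demo_obj *obj)
    {
        refcount_set(&obj->refs, 1);    /* one reference for the creator */
    }

    static void demo_obj_get(struct demo_obj *obj)
    {
        refcount_inc(&obj->refs);       /* WARNs and saturates on overflow */
    }

    static bool demo_obj_put(struct demo_obj *obj)
    {
        /* True when the last reference dropped and obj may be freed. */
        return refcount_dec_and_test(&obj->refs);
    }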
+static void print_timespec(const struct extent_buffer *eb,
+ struct btrfs_timespec *timespec,
+ const char *prefix, const char *suffix)
+{
+ const u64 secs = btrfs_timespec_sec(eb, timespec);
+ const u32 nsecs = btrfs_timespec_nsec(eb, timespec);
+
+ pr_info("%s%llu.%u%s", prefix, secs, nsecs, suffix);
+}
+
+static void print_inode_item(const struct extent_buffer *eb, int i)
+{
+ struct btrfs_inode_item *ii = btrfs_item_ptr(eb, i, struct btrfs_inode_item);
+
+ pr_info("\t\tinode generation %llu transid %llu size %llu nbytes %llu\n",
+ btrfs_inode_generation(eb, ii), btrfs_inode_transid(eb, ii),
+ btrfs_inode_size(eb, ii), btrfs_inode_nbytes(eb, ii));
+ pr_info("\t\tblock group %llu mode %o links %u uid %u gid %u\n",
+ btrfs_inode_block_group(eb, ii), btrfs_inode_mode(eb, ii),
+ btrfs_inode_nlink(eb, ii), btrfs_inode_uid(eb, ii),
+ btrfs_inode_gid(eb, ii));
+ pr_info("\t\trdev %llu sequence %llu flags 0x%llx\n",
+ btrfs_inode_rdev(eb, ii), btrfs_inode_sequence(eb, ii),
+ btrfs_inode_flags(eb, ii));
+ print_timespec(eb, &ii->atime, "\t\tatime ", "\n");
+ print_timespec(eb, &ii->ctime, "\t\tctime ", "\n");
+ print_timespec(eb, &ii->mtime, "\t\tmtime ", "\n");
+ print_timespec(eb, &ii->otime, "\t\totime ", "\n");
+}
+
+static void print_dir_item(const struct extent_buffer *eb, int i)
+{
+ const u32 size = btrfs_item_size(eb, i);
+ struct btrfs_dir_item *di = btrfs_item_ptr(eb, i, struct btrfs_dir_item);
+ u32 cur = 0;
+
+ while (cur < size) {
+ const u32 name_len = btrfs_dir_name_len(eb, di);
+ const u32 data_len = btrfs_dir_data_len(eb, di);
+ const u32 len = sizeof(*di) + name_len + data_len;
+ struct btrfs_key location;
+
+ btrfs_dir_item_key_to_cpu(eb, di, &location);
+ pr_info("\t\tlocation key " BTRFS_KEY_FMT " type %d\n",
+ BTRFS_KEY_FMT_VALUE(&location), btrfs_dir_ftype(eb, di));
+ pr_info("\t\ttransid %llu data_len %u name_len %u\n",
+ btrfs_dir_transid(eb, di), data_len, name_len);
+ di = (struct btrfs_dir_item *)((char *)di + len);
+ cur += len;
+ }
+}
+
+static void print_inode_ref_item(const struct extent_buffer *eb, int i)
+{
+ const u32 size = btrfs_item_size(eb, i);
+ struct btrfs_inode_ref *ref = btrfs_item_ptr(eb, i, struct btrfs_inode_ref);
+ u32 cur = 0;
+
+ while (cur < size) {
+ const u64 index = btrfs_inode_ref_index(eb, ref);
+ const u32 name_len = btrfs_inode_ref_name_len(eb, ref);
+ const u32 len = sizeof(*ref) + name_len;
+
+ pr_info("\t\tindex %llu name_len %u\n", index, name_len);
+ ref = (struct btrfs_inode_ref *)((char *)ref + len);
+ cur += len;
+ }
+}
+
+static void print_inode_extref_item(const struct extent_buffer *eb, int i)
+{
+ const u32 size = btrfs_item_size(eb, i);
+ struct btrfs_inode_extref *extref;
+ u32 cur = 0;
+
+ extref = btrfs_item_ptr(eb, i, struct btrfs_inode_extref);
+ while (cur < size) {
+ const u64 index = btrfs_inode_extref_index(eb, extref);
+ const u32 name_len = btrfs_inode_extref_name_len(eb, extref);
+ const u64 parent = btrfs_inode_extref_parent(eb, extref);
+ const u32 len = sizeof(*extref) + name_len;
+
+ pr_info("\t\tindex %llu parent %llu name_len %u\n",
+ index, parent, name_len);
+ extref = (struct btrfs_inode_extref *)((char *)extref + len);
+ cur += len;
+ }
+}
+
+static void print_dir_log_index_item(const struct extent_buffer *eb, int i)
+{
+ struct btrfs_dir_log_item *dlog;
+
+ dlog = btrfs_item_ptr(eb, i, struct btrfs_dir_log_item);
+ pr_info("\t\tdir log end %llu\n", btrfs_dir_log_end(eb, dlog));
+}
+
+static void print_extent_csum(const struct extent_buffer *eb, int i)
+{
+ const struct btrfs_fs_info *fs_info = eb->fs_info;
+ const u32 size = btrfs_item_size(eb, i);
+ const u32 csum_bytes = (size / fs_info->csum_size) * fs_info->sectorsize;
+ struct btrfs_key key;
+
+ btrfs_item_key_to_cpu(eb, &key, i);
+ pr_info("\t\trange start %llu end %llu length %u\n",
+ key.offset, key.offset + csum_bytes, csum_bytes);
+}
+
+static void print_file_extent_item(const struct extent_buffer *eb, int i)
+{
+ struct btrfs_file_extent_item *fi;
+
+ fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+ pr_info("\t\tgeneration %llu type %hhu\n",
+ btrfs_file_extent_generation(eb, fi),
+ btrfs_file_extent_type(eb, fi));
+
+ if (btrfs_file_extent_type(eb, fi) == BTRFS_FILE_EXTENT_INLINE) {
+ pr_info("\t\tinline extent data size %u ram_bytes %llu compression %hhu\n",
+ btrfs_file_extent_inline_item_len(eb, i),
+ btrfs_file_extent_ram_bytes(eb, fi),
+ btrfs_file_extent_compression(eb, fi));
+ return;
+ }
+
+ pr_info("\t\textent data disk bytenr %llu nr %llu\n",
+ btrfs_file_extent_disk_bytenr(eb, fi),
+ btrfs_file_extent_disk_num_bytes(eb, fi));
+ pr_info("\t\textent data offset %llu nr %llu ram %llu\n",
+ btrfs_file_extent_offset(eb, fi),
+ btrfs_file_extent_num_bytes(eb, fi),
+ btrfs_file_extent_ram_bytes(eb, fi));
+ pr_info("\t\textent compression %hhu\n",
+ btrfs_file_extent_compression(eb, fi));
+}
+
+static void key_type_string(const struct btrfs_key *key, char *buf, int buf_size)
+{
+ static const char *key_to_str[256] = {
+ [BTRFS_INODE_ITEM_KEY] = "INODE_ITEM",
+ [BTRFS_INODE_REF_KEY] = "INODE_REF",
+ [BTRFS_INODE_EXTREF_KEY] = "INODE_EXTREF",
+ [BTRFS_DIR_ITEM_KEY] = "DIR_ITEM",
+ [BTRFS_DIR_INDEX_KEY] = "DIR_INDEX",
+ [BTRFS_DIR_LOG_ITEM_KEY] = "DIR_LOG_ITEM",
+ [BTRFS_DIR_LOG_INDEX_KEY] = "DIR_LOG_INDEX",
+ [BTRFS_XATTR_ITEM_KEY] = "XATTR_ITEM",
+ [BTRFS_VERITY_DESC_ITEM_KEY] = "VERITY_DESC_ITEM",
+ [BTRFS_VERITY_MERKLE_ITEM_KEY] = "VERITY_MERKLE_ITEM",
+ [BTRFS_ORPHAN_ITEM_KEY] = "ORPHAN_ITEM",
+ [BTRFS_ROOT_ITEM_KEY] = "ROOT_ITEM",
+ [BTRFS_ROOT_REF_KEY] = "ROOT_REF",
+ [BTRFS_ROOT_BACKREF_KEY] = "ROOT_BACKREF",
+ [BTRFS_EXTENT_ITEM_KEY] = "EXTENT_ITEM",
+ [BTRFS_METADATA_ITEM_KEY] = "METADATA_ITEM",
+ [BTRFS_TREE_BLOCK_REF_KEY] = "TREE_BLOCK_REF",
+ [BTRFS_SHARED_BLOCK_REF_KEY] = "SHARED_BLOCK_REF",
+ [BTRFS_EXTENT_DATA_REF_KEY] = "EXTENT_DATA_REF",
+ [BTRFS_SHARED_DATA_REF_KEY] = "SHARED_DATA_REF",
+ [BTRFS_EXTENT_OWNER_REF_KEY] = "EXTENT_OWNER_REF",
+ [BTRFS_EXTENT_CSUM_KEY] = "EXTENT_CSUM",
+ [BTRFS_EXTENT_DATA_KEY] = "EXTENT_DATA",
+ [BTRFS_BLOCK_GROUP_ITEM_KEY] = "BLOCK_GROUP_ITEM",
+ [BTRFS_FREE_SPACE_INFO_KEY] = "FREE_SPACE_INFO",
+ [BTRFS_FREE_SPACE_EXTENT_KEY] = "FREE_SPACE_EXTENT",
+ [BTRFS_FREE_SPACE_BITMAP_KEY] = "FREE_SPACE_BITMAP",
+ [BTRFS_CHUNK_ITEM_KEY] = "CHUNK_ITEM",
+ [BTRFS_DEV_ITEM_KEY] = "DEV_ITEM",
+ [BTRFS_DEV_EXTENT_KEY] = "DEV_EXTENT",
+ [BTRFS_TEMPORARY_ITEM_KEY] = "TEMPORARY_ITEM",
+ [BTRFS_DEV_REPLACE_KEY] = "DEV_REPLACE",
+ [BTRFS_STRING_ITEM_KEY] = "STRING_ITEM",
+ [BTRFS_QGROUP_STATUS_KEY] = "QGROUP_STATUS",
+ [BTRFS_QGROUP_RELATION_KEY] = "QGROUP_RELATION",
+ [BTRFS_QGROUP_INFO_KEY] = "QGROUP_INFO",
+ [BTRFS_QGROUP_LIMIT_KEY] = "QGROUP_LIMIT",
+ [BTRFS_PERSISTENT_ITEM_KEY] = "PERSISTENT_ITEM",
+ [BTRFS_UUID_KEY_SUBVOL] = "UUID_KEY_SUBVOL",
+ [BTRFS_UUID_KEY_RECEIVED_SUBVOL] = "UUID_KEY_RECEIVED_SUBVOL",
+ [BTRFS_RAID_STRIPE_KEY] = "RAID_STRIPE",
+ };
+
+ if (key->type == 0 && key->objectid == BTRFS_FREE_SPACE_OBJECTID)
+ scnprintf(buf, buf_size, "UNTYPED");
+ else if (key_to_str[key->type])
+ scnprintf(buf, buf_size, "%s", key_to_str[key->type]);
+ else
+ scnprintf(buf, buf_size, "UNKNOWN.%d", key->type);
+}
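The BTRFS_KEY_FMT / BTRFS_KEY_FMT_VALUE macros used by the new printing code are introduced elsewhere in the series and are not part of this diff. Judging from the argument lists above, a plausible shape would be (an assumption, not the series' actual definition):

    /* Assumed definitions; the real ones live in another patch. */
    #define BTRFS_KEY_FMT            "(%llu %u %llu)"
    #define BTRFS_KEY_FMT_VALUE(key) \
            (key)->objectid, (key)->type, (key)->offset

With that, key_type_string() and a 32-byte stack buffer (KEY_TYPE_BUF_SIZE) can render any known key type by name, falling back to "UNKNOWN.%d" for unrecognized types and to "UNTYPED" for the free-space-cache special case.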
+
void btrfs_print_leaf(const struct extent_buffer *l)
{
struct btrfs_fs_info *fs_info;
int i;
u32 type, nr;
struct btrfs_root_item *ri;
- struct btrfs_dir_item *di;
- struct btrfs_inode_item *ii;
struct btrfs_block_group_item *bi;
- struct btrfs_file_extent_item *fi;
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
struct btrfs_dev_extent *dev_extent;
struct btrfs_key key;
- struct btrfs_key found_key;
if (!l)
return;
@@ -255,25 +449,35 @@ void btrfs_print_leaf(const struct extent_buffer *l)
btrfs_leaf_free_space(l), btrfs_header_owner(l));
print_eb_refs_lock(l);
for (i = 0 ; i < nr ; i++) {
+ char key_buf[KEY_TYPE_BUF_SIZE];
+
btrfs_item_key_to_cpu(l, &key, i);
type = key.type;
- pr_info("\titem %d key (%llu %u %llu) itemoff %d itemsize %d\n",
- i, key.objectid, type, key.offset,
+ key_type_string(&key, key_buf, KEY_TYPE_BUF_SIZE);
+
+ pr_info("\titem %d key (%llu %s %llu) itemoff %d itemsize %d\n",
+ i, key.objectid, key_buf, key.offset,
btrfs_item_offset(l, i), btrfs_item_size(l, i));
switch (type) {
case BTRFS_INODE_ITEM_KEY:
- ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
- pr_info("\t\tinode generation %llu size %llu mode %o\n",
- btrfs_inode_generation(l, ii),
- btrfs_inode_size(l, ii),
- btrfs_inode_mode(l, ii));
+ print_inode_item(l, i);
+ break;
+ case BTRFS_INODE_REF_KEY:
+ print_inode_ref_item(l, i);
+ break;
+ case BTRFS_INODE_EXTREF_KEY:
+ print_inode_extref_item(l, i);
break;
case BTRFS_DIR_ITEM_KEY:
- di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
- btrfs_dir_item_key_to_cpu(l, di, &found_key);
- pr_info("\t\tdir oid %llu flags %u\n",
- found_key.objectid,
- btrfs_dir_flags(l, di));
+ case BTRFS_DIR_INDEX_KEY:
+ case BTRFS_XATTR_ITEM_KEY:
+ print_dir_item(l, i);
+ break;
+ case BTRFS_DIR_LOG_INDEX_KEY:
+ print_dir_log_index_item(l, i);
+ break;
+ case BTRFS_EXTENT_CSUM_KEY:
+ print_extent_csum(l, i);
break;
case BTRFS_ROOT_ITEM_KEY:
ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
@@ -303,24 +507,7 @@ void btrfs_print_leaf(const struct extent_buffer *l)
btrfs_shared_data_ref_count(l, sref));
break;
case BTRFS_EXTENT_DATA_KEY:
- fi = btrfs_item_ptr(l, i,
- struct btrfs_file_extent_item);
- pr_info("\t\tgeneration %llu type %hhu\n",
- btrfs_file_extent_generation(l, fi),
- btrfs_file_extent_type(l, fi));
- if (btrfs_file_extent_type(l, fi) ==
- BTRFS_FILE_EXTENT_INLINE) {
- pr_info("\t\tinline extent data size %llu\n",
- btrfs_file_extent_ram_bytes(l, fi));
- break;
- }
- pr_info("\t\textent data disk bytenr %llu nr %llu\n",
- btrfs_file_extent_disk_bytenr(l, fi),
- btrfs_file_extent_disk_num_bytes(l, fi));
- pr_info("\t\textent data offset %llu nr %llu ram %llu\n",
- btrfs_file_extent_offset(l, fi),
- btrfs_file_extent_num_bytes(l, fi),
- btrfs_file_extent_ram_bytes(l, fi));
+ print_file_extent_item(l, i);
break;
case BTRFS_BLOCK_GROUP_ITEM_KEY:
bi = btrfs_item_ptr(l, i,
@@ -410,10 +597,9 @@ void btrfs_print_tree(const struct extent_buffer *c, bool follow)
print_eb_refs_lock(c);
for (i = 0; i < nr; i++) {
btrfs_node_key_to_cpu(c, &key, i);
- pr_info("\tkey %d (%llu %u %llu) block %llu gen %llu\n",
- i, key.objectid, key.type, key.offset,
- btrfs_node_blockptr(c, i),
- btrfs_node_ptr_generation(c, i));
+ pr_info("\tkey %d " BTRFS_KEY_FMT " block %llu gen %llu\n",
+ i, BTRFS_KEY_FMT_VALUE(&key), btrfs_node_blockptr(c, i),
+ btrfs_node_ptr_generation(c, i));
}
if (!follow)
return;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index d6fa36674270..9e2b53e90dcb 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -83,7 +83,7 @@ static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup, u64 num_bytes,
enum btrfs_qgroup_rsv_type type)
{
- trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
+ trace_btrfs_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
qgroup->rsv.values[type] += num_bytes;
}
@@ -91,7 +91,7 @@ static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup, u64 num_bytes,
enum btrfs_qgroup_rsv_type type)
{
- trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
+ trace_btrfs_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
if (qgroup->rsv.values[type] >= num_bytes) {
qgroup->rsv.values[type] -= num_bytes;
return;
@@ -160,23 +160,34 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
int init_flags);
static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
+static int btrfs_qgroup_qgroupid_key_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *qgroupid = key;
+ const struct btrfs_qgroup *qgroup = rb_entry(node, struct btrfs_qgroup, node);
+
+ if (qgroup->qgroupid < *qgroupid)
+ return -1;
+ else if (qgroup->qgroupid > *qgroupid)
+ return 1;
+
+ return 0;
+}
+
/* must be called with qgroup_ioctl_lock held */
static struct btrfs_qgroup *find_qgroup_rb(const struct btrfs_fs_info *fs_info,
u64 qgroupid)
{
- struct rb_node *n = fs_info->qgroup_tree.rb_node;
- struct btrfs_qgroup *qgroup;
+ struct rb_node *node;
- while (n) {
- qgroup = rb_entry(n, struct btrfs_qgroup, node);
- if (qgroup->qgroupid < qgroupid)
- n = n->rb_left;
- else if (qgroup->qgroupid > qgroupid)
- n = n->rb_right;
- else
- return qgroup;
- }
- return NULL;
+ node = rb_find(&qgroupid, &fs_info->qgroup_tree, btrfs_qgroup_qgroupid_key_cmp);
+ return rb_entry_safe(node, struct btrfs_qgroup, node);
+}
+
+static int btrfs_qgroup_qgroupid_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct btrfs_qgroup *new_qgroup = rb_entry(new, struct btrfs_qgroup, node);
+
+ return btrfs_qgroup_qgroupid_key_cmp(&new_qgroup->qgroupid, existing);
}
/*
@@ -191,39 +202,25 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *prealloc,
u64 qgroupid)
{
- struct rb_node **p = &fs_info->qgroup_tree.rb_node;
- struct rb_node *parent = NULL;
- struct btrfs_qgroup *qgroup;
+ struct rb_node *node;
/* Caller must have pre-allocated @prealloc. */
ASSERT(prealloc);
- while (*p) {
- parent = *p;
- qgroup = rb_entry(parent, struct btrfs_qgroup, node);
-
- if (qgroup->qgroupid < qgroupid) {
- p = &(*p)->rb_left;
- } else if (qgroup->qgroupid > qgroupid) {
- p = &(*p)->rb_right;
- } else {
- kfree(prealloc);
- return qgroup;
- }
+ prealloc->qgroupid = qgroupid;
+ node = rb_find_add(&prealloc->node, &fs_info->qgroup_tree, btrfs_qgroup_qgroupid_cmp);
+ if (node) {
+ kfree(prealloc);
+ return rb_entry(node, struct btrfs_qgroup, node);
}
- qgroup = prealloc;
- qgroup->qgroupid = qgroupid;
- INIT_LIST_HEAD(&qgroup->groups);
- INIT_LIST_HEAD(&qgroup->members);
- INIT_LIST_HEAD(&qgroup->dirty);
- INIT_LIST_HEAD(&qgroup->iterator);
- INIT_LIST_HEAD(&qgroup->nested_iterator);
-
- rb_link_node(&qgroup->node, parent, p);
- rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
+ INIT_LIST_HEAD(&prealloc->groups);
+ INIT_LIST_HEAD(&prealloc->members);
+ INIT_LIST_HEAD(&prealloc->dirty);
+ INIT_LIST_HEAD(&prealloc->iterator);
+ INIT_LIST_HEAD(&prealloc->nested_iterator);
- return qgroup;
+ return prealloc;
}
static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
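Both the lookup and the insertion above move to the generic rb_find()/rb_find_add() helpers from <linux/rbtree.h>: a key comparator drives rb_find(), and a node comparator (here delegating to the key comparator) drives rb_find_add(), which returns the colliding node or NULL after linking the new one. Note that btrfs_qgroup_qgroupid_key_cmp() keeps the comparison direction of the open-coded walk it replaces, so the existing tree layout remains valid. A self-contained sketch of the pattern, with illustrative names:

    #include <linux/rbtree.h>
    #include <linux/types.h>

    struct demo_item {
        u64 id;
        struct rb_node node;
    };

    /* cmp(key, node): negative descends left, positive right, zero matches. */
    static int demo_key_cmp(const void *key, const struct rb_node *node)
    {
        const u64 *id = key;
        const struct demo_item *item = rb_entry(node, struct demo_item, node);

        if (*id < item->id)
            return -1;
        if (*id > item->id)
            return 1;
        return 0;
    }

    static int demo_node_cmp(struct rb_node *new, const struct rb_node *existing)
    {
        const struct demo_item *item = rb_entry(new, struct demo_item, node);

        return demo_key_cmp(&item->id, existing);
    }

    static struct demo_item *demo_lookup(struct rb_root *root, u64 id)
    {
        return rb_entry_safe(rb_find(&id, root, demo_key_cmp),
                             struct demo_item, node);
    }

    /* Returns the already-present item on collision, NULL after inserting. */
    static struct demo_item *demo_insert(struct rb_root *root,
                                         struct demo_item *item)
    {
        return rb_entry_safe(rb_find_add(&item->node, root, demo_node_cmp),
                             struct demo_item, node);
    }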
@@ -349,13 +346,27 @@ int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid
}
#endif
-static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info)
+__printf(2, 3)
+static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
+ const u64 old_flags = fs_info->qgroup_flags;
+
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
return;
fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT |
BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
+ if (!(old_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_warn_rl(fs_info, "qgroup marked inconsistent, %pV", &vaf);
+ va_end(args);
+ }
}
static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info,
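qgroup_mark_inconsistent() now takes a printf-style reason, forwarded through struct va_format and printk's %pV extension so the message is expanded once inside btrfs_warn_rl() without an intermediate buffer. A minimal sketch of the idiom (demo_warn is illustrative):

    #include <linux/kernel.h>
    #include <linux/printk.h>

    __printf(1, 2)
    static void demo_warn(const char *fmt, ...)
    {
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        /* %pV expands the wrapped format string and arguments in place. */
        pr_warn("demo: %pV\n", &vaf);
        va_end(args);
    }

The rate-limited btrfs_warn_rl() together with the old_flags check means only the first transition into the inconsistent state is reported.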
@@ -386,12 +397,6 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
if (!fs_info->quota_root)
return 0;
- fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
- if (!fs_info->qgroup_ulist) {
- ret = -ENOMEM;
- goto out;
- }
-
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -434,13 +439,10 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
goto out;
}
fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr);
- if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) {
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE)
qgroup_read_enable_gen(fs_info, l, slot, ptr);
- } else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) {
- qgroup_mark_inconsistent(fs_info);
- btrfs_err(fs_info,
- "qgroup generation mismatch, marked as inconsistent");
- }
+ else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation)
+ qgroup_mark_inconsistent(fs_info, "qgroup generation mismatch");
rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
goto next1;
}
@@ -451,10 +453,8 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
qgroup = find_qgroup_rb(fs_info, found_key.offset);
if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
- (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
- btrfs_err(fs_info, "inconsistent qgroup config");
- qgroup_mark_inconsistent(fs_info);
- }
+ (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY))
+ qgroup_mark_inconsistent(fs_info, "inconsistent qgroup config");
if (!qgroup) {
struct btrfs_qgroup *prealloc;
struct btrfs_root *tree_root = fs_info->tree_root;
@@ -476,7 +476,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
* during mount before we start doing things like creating
* subvolumes.
*/
- if (is_fstree(qgroup->qgroupid) &&
+ if (btrfs_is_fstree(qgroup->qgroupid) &&
qgroup->qgroupid > tree_root->free_objectid)
/*
* Don't need to check against BTRFS_LAST_FREE_OBJECTID,
@@ -581,8 +581,6 @@ out:
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
} else {
- ulist_free(fs_info->qgroup_ulist);
- fs_info->qgroup_ulist = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
btrfs_sysfs_del_qgroups(fs_info);
}
@@ -630,29 +628,30 @@ bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info)
/*
* This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
- * first two are in single-threaded paths.And for the third one, we have set
- * quota_root to be null with qgroup_lock held before, so it is safe to clean
- * up the in-memory structures without qgroup_lock held.
+ * the first two of which are single-threaded paths.
*/
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
{
struct rb_node *n;
struct btrfs_qgroup *qgroup;
+ /*
+ * btrfs_quota_disable() can be called concurrently with
+ * btrfs_qgroup_rescan() -> qgroup_rescan_zero_tracking(), so take the
+ * lock.
+ */
+ spin_lock(&fs_info->qgroup_lock);
while ((n = rb_first(&fs_info->qgroup_tree))) {
qgroup = rb_entry(n, struct btrfs_qgroup, node);
rb_erase(n, &fs_info->qgroup_tree);
__del_qgroup_rb(qgroup);
+ spin_unlock(&fs_info->qgroup_lock);
btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
kfree(qgroup);
+ spin_lock(&fs_info->qgroup_lock);
}
- /*
- * We call btrfs_free_qgroup_config() when unmounting
- * filesystem and disabling quota, so we set qgroup_ulist
- * to be null here to avoid double free.
- */
- ulist_free(fs_info->qgroup_ulist);
- fs_info->qgroup_ulist = NULL;
+ spin_unlock(&fs_info->qgroup_lock);
+
btrfs_sysfs_del_qgroups(fs_info);
}
@@ -661,7 +660,7 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
{
int ret;
struct btrfs_root *quota_root = trans->fs_info->quota_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
path = btrfs_alloc_path();
@@ -673,7 +672,6 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
key.offset = dst;
ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
- btrfs_free_path(path);
return ret;
}
@@ -682,7 +680,7 @@ static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
{
int ret;
struct btrfs_root *quota_root = trans->fs_info->quota_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
path = btrfs_alloc_path();
@@ -695,24 +693,19 @@ static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
if (ret < 0)
- goto out;
+ return ret;
- if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
+ if (ret > 0)
+ return -ENOENT;
- ret = btrfs_del_item(trans, quota_root, path);
-out:
- btrfs_free_path(path);
- return ret;
+ return btrfs_del_item(trans, quota_root, path);
}
static int add_qgroup_item(struct btrfs_trans_handle *trans,
struct btrfs_root *quota_root, u64 qgroupid)
{
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_qgroup_info_item *qgroup_info;
struct btrfs_qgroup_limit_item *qgroup_limit;
struct extent_buffer *leaf;
@@ -738,7 +731,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
sizeof(*qgroup_info));
if (ret && ret != -EEXIST)
- goto out;
+ return ret;
leaf = path->nodes[0];
qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
@@ -755,7 +748,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
sizeof(*qgroup_limit));
if (ret && ret != -EEXIST)
- goto out;
+ return ret;
leaf = path->nodes[0];
qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
@@ -766,17 +759,14 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
{
int ret;
struct btrfs_root *quota_root = trans->fs_info->quota_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
path = btrfs_alloc_path();
@@ -788,33 +778,27 @@ static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
key.offset = qgroupid;
ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
if (ret < 0)
- goto out;
+ return ret;
- if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
+ if (ret > 0)
+ return -ENOENT;
ret = btrfs_del_item(trans, quota_root, path);
if (ret)
- goto out;
+ return ret;
btrfs_release_path(path);
key.type = BTRFS_QGROUP_LIMIT_KEY;
ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
if (ret < 0)
- goto out;
+ return ret;
- if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
+ if (ret > 0)
+ return -ENOENT;
ret = btrfs_del_item(trans, quota_root, path);
-out:
- btrfs_free_path(path);
return ret;
}
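The goto out / btrfs_free_path() pairs in these helpers give way to BTRFS_PATH_AUTO_FREE(), which ties the release to the compiler's cleanup attribute through the kernel's <linux/cleanup.h> machinery, letting each error path return directly. The macro's definition is not part of this diff; a plausible shape, plus a generic sketch of the mechanism (DEFINE_FREE and __free are real cleanup.h primitives, the demo_* names are illustrative):

    #include <linux/cleanup.h>
    #include <linux/slab.h>

    /*
     * Assumed shape of the btrfs helper (defined elsewhere in the tree):
     *
     *   #define BTRFS_PATH_AUTO_FREE(path_name) \
     *       struct btrfs_path *path_name __free(btrfs_free_path) = NULL
     */

    /* Generic mechanism: teach cleanup.h how to dispose of a pointer. */
    DEFINE_FREE(demo_kfree, void *, if (_T) kfree(_T))

    static int demo_use(void)
    {
        void *buf __free(demo_kfree) = kzalloc(64, GFP_KERNEL);

        if (!buf)
            return -ENOMEM;     /* nothing allocated, nothing to free */

        /* ... use buf; every return path releases it automatically ... */
        return 0;               /* buf is kfree()d here by the cleanup hook */
    }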
@@ -822,7 +806,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
struct btrfs_qgroup *qgroup)
{
struct btrfs_root *quota_root = trans->fs_info->quota_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *l;
struct btrfs_qgroup_limit_item *qgroup_limit;
@@ -842,7 +826,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
ret = -ENOENT;
if (ret)
- goto out;
+ return ret;
l = path->nodes[0];
slot = path->slots[0];
@@ -852,8 +836,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -862,7 +845,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *quota_root = fs_info->quota_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *l;
struct btrfs_qgroup_info_item *qgroup_info;
@@ -885,7 +868,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
ret = -ENOENT;
if (ret)
- goto out;
+ return ret;
l = path->nodes[0];
slot = path->slots[0];
@@ -895,8 +878,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -904,7 +886,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *quota_root = fs_info->quota_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *l;
struct btrfs_qgroup_status_item *ptr;
@@ -924,7 +906,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
ret = -ENOENT;
if (ret)
- goto out;
+ return ret;
l = path->nodes[0];
slot = path->slots[0];
@@ -934,8 +916,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
btrfs_set_qgroup_status_rescan(l, ptr,
fs_info->qgroup_rescan_progress.objectid);
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -945,7 +926,7 @@ out:
static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *leaf = NULL;
int ret;
@@ -962,7 +943,7 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
while (1) {
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
- goto out;
+ return ret;
leaf = path->nodes[0];
nr = btrfs_header_nritems(leaf);
if (!nr)
@@ -975,14 +956,12 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
path->slots[0] = 0;
ret = btrfs_del_items(trans, root, path, 0, nr);
if (ret)
- goto out;
+ return ret;
btrfs_release_path(path);
}
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+
+ return 0;
}
int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
@@ -998,7 +977,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup = NULL;
struct btrfs_qgroup *prealloc = NULL;
struct btrfs_trans_handle *trans = NULL;
- struct ulist *ulist = NULL;
const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA);
int ret = 0;
int slot;
@@ -1021,12 +999,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
if (fs_info->quota_root)
goto out;
- ulist = ulist_alloc(GFP_KERNEL);
- if (!ulist) {
- ret = -ENOMEM;
- goto out;
- }
-
ret = btrfs_sysfs_add_qgroups(fs_info);
if (ret < 0)
goto out;
@@ -1066,9 +1038,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
if (fs_info->quota_root)
goto out;
- fs_info->qgroup_ulist = ulist;
- ulist = NULL;
-
/*
* initially create the quota tree
*/
@@ -1080,7 +1049,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
}
path = btrfs_alloc_path();
- if (!path) {
+ if (unlikely(!path)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out_free_root;
@@ -1092,7 +1061,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
sizeof(*ptr));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1122,7 +1091,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
if (ret > 0)
goto out_add_root;
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1140,7 +1109,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
/* We should not have a stray @prealloc pointer. */
ASSERT(prealloc == NULL);
prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
- if (!prealloc) {
+ if (unlikely(!prealloc)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out_free_path;
@@ -1148,26 +1117,21 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
ret = add_qgroup_item(trans, quota_root,
found_key.offset);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
prealloc = NULL;
- if (IS_ERR(qgroup)) {
- ret = PTR_ERR(qgroup);
- btrfs_abort_transaction(trans, ret);
- goto out_free_path;
- }
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
ret = btrfs_search_slot_for_read(tree_root, &found_key,
path, 1, 0);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1181,7 +1145,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
}
}
ret = btrfs_next_item(tree_root, path);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1192,7 +1156,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
out_add_root:
btrfs_release_path(path);
ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1206,7 +1170,7 @@ out_add_root:
qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID);
prealloc = NULL;
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1272,18 +1236,21 @@ out_free_root:
if (ret)
btrfs_put_root(quota_root);
out:
- if (ret) {
- ulist_free(fs_info->qgroup_ulist);
- fs_info->qgroup_ulist = NULL;
+ if (ret)
btrfs_sysfs_del_qgroups(fs_info);
- }
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (ret && trans)
btrfs_end_transaction(trans);
else if (trans)
ret = btrfs_end_transaction(trans);
- ulist_free(ulist);
- kfree(prealloc);
+
+ /*
+	 * At this point we either failed to allocate prealloc, or we
+	 * succeeded and passed ownership of it to add_qgroup_rb(). In any
+ * case, this needs to be NULL or there is something wrong.
+ */
+ ASSERT(prealloc == NULL);
+
return ret;
}
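The btrfs_quota_enable() error paths that end in btrfs_abort_transaction() gain unlikely() annotations, telling the compiler the failure branch is cold so the fast path stays linear. The idiom in isolation:

    #include <linux/compiler.h>
    #include <linux/errno.h>

    static int demo_process(const void *p)
    {
        /* Failure is expected to be rare: keep it off the hot path. */
        if (unlikely(!p))
            return -EINVAL;

        return 0;
    }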
@@ -1354,11 +1321,14 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
/*
* We have nothing held here and no trans handle, just return the error
- * if there is one.
+	 * if there is one, and set the quota enabled bit back since we didn't
+	 * actually disable quotas.
*/
ret = flush_reservations(fs_info);
- if (ret)
+ if (ret) {
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
return ret;
+ }
/*
* 1 For the root item
@@ -1393,13 +1363,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
btrfs_free_qgroup_config(fs_info);
ret = btrfs_clean_quota_tree(trans, quota_root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_del_root(trans, &quota_root->root_key);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1470,9 +1440,9 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
struct btrfs_qgroup *src, int sign)
{
struct btrfs_qgroup *qgroup;
- struct btrfs_qgroup *cur;
LIST_HEAD(qgroup_list);
u64 num_bytes = src->excl;
+ u64 num_bytes_cmpr = src->excl_cmpr;
int ret = 0;
qgroup = find_qgroup_rb(fs_info, ref_root);
@@ -1480,15 +1450,16 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
goto out;
qgroup_iterator_add(&qgroup_list, qgroup);
- list_for_each_entry(cur, &qgroup_list, iterator) {
+ list_for_each_entry(qgroup, &qgroup_list, iterator) {
struct btrfs_qgroup_list *glist;
qgroup->rfer += sign * num_bytes;
- qgroup->rfer_cmpr += sign * num_bytes;
+ qgroup->rfer_cmpr += sign * num_bytes_cmpr;
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
+ WARN_ON(sign < 0 && qgroup->excl_cmpr < num_bytes_cmpr);
qgroup->excl += sign * num_bytes;
- qgroup->excl_cmpr += sign * num_bytes;
+ qgroup->excl_cmpr += sign * num_bytes_cmpr;
if (sign > 0)
qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
@@ -1555,8 +1526,10 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst
ASSERT(prealloc);
/* Check the level of src and dst first */
- if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
+ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) {
+ kfree(prealloc);
return -EINVAL;
+ }
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root) {
@@ -1679,9 +1652,6 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
struct btrfs_qgroup *prealloc = NULL;
int ret = 0;
- if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
- return 0;
-
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root) {
ret = -ENOTCONN;
@@ -1712,7 +1682,12 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
- kfree(prealloc);
+ /*
+	 * At this point we either failed to allocate prealloc, or we
+	 * succeeded and passed ownership of it to add_qgroup_rb(). In any
+ * case, this needs to be NULL or there is something wrong.
+ */
+ ASSERT(prealloc == NULL);
return ret;
}
@@ -1724,8 +1699,7 @@ out:
static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup)
{
struct btrfs_key key;
- struct btrfs_path *path;
- int ret;
+ BTRFS_PATH_AUTO_FREE(path);
/*
* Squota would never be inconsistent, but there can still be case
@@ -1758,13 +1732,11 @@ static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup
if (!path)
return -ENOMEM;
- ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL);
- btrfs_free_path(path);
/*
	 * The return value of btrfs_find_root() exactly matches our definition
	 * of this function's return value, thus it can be returned directly.
*/
- return ret;
+ return btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL);
}
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
@@ -1823,7 +1795,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) {
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ DEBUG_WARN();
btrfs_warn_rl(fs_info,
"to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu",
btrfs_qgroup_level(qgroup->qgroupid),
@@ -1843,14 +1815,13 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
if (qgroup->rfer || qgroup->excl ||
qgroup->rfer_cmpr || qgroup->excl_cmpr) {
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
- btrfs_warn_rl(fs_info,
-"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
- btrfs_qgroup_level(qgroup->qgroupid),
- btrfs_qgroup_subvolid(qgroup->qgroupid),
- qgroup->rfer, qgroup->rfer_cmpr,
- qgroup->excl, qgroup->excl_cmpr);
- qgroup_mark_inconsistent(fs_info);
+ DEBUG_WARN();
+ qgroup_mark_inconsistent(fs_info,
+ "to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
+ btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid),
+ qgroup->rfer, qgroup->rfer_cmpr,
+ qgroup->excl, qgroup->excl_cmpr);
}
}
del_qgroup_rb(fs_info, qgroupid);
@@ -1873,7 +1844,8 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su
struct btrfs_trans_handle *trans;
int ret;
- if (!is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) || !fs_info->quota_root)
+ if (!btrfs_is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) ||
+ !fs_info->quota_root)
return 0;
/*
@@ -1968,11 +1940,8 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
spin_unlock(&fs_info->qgroup_lock);
ret = update_qgroup_limit_item(trans, qgroup);
- if (ret) {
- qgroup_mark_inconsistent(fs_info);
- btrfs_info(fs_info, "unable to update quota limit for %llu",
- qgroupid);
- }
+ if (ret)
+ qgroup_mark_inconsistent(fs_info, "qgroup item update error %d", ret);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
@@ -2027,7 +1996,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
ret = __xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC);
xa_unlock(&delayed_refs->dirty_extents);
if (xa_is_err(ret)) {
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info, "xarray insert error: %d", xa_err(ret));
return xa_err(ret);
}
@@ -2094,10 +2063,8 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
ret = btrfs_find_all_roots(&ctx, true);
if (ret < 0) {
- qgroup_mark_inconsistent(fs_info);
- btrfs_warn(fs_info,
-"error accounting new delayed refs extent (err code: %d), quota inconsistent",
- ret);
+ qgroup_mark_inconsistent(fs_info,
+ "error accounting new delayed refs extent: %d", ret);
return 0;
}
@@ -2318,7 +2285,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
bool trace_leaf)
{
struct btrfs_key key;
- struct btrfs_path *src_path;
+ BTRFS_PATH_AUTO_FREE(src_path);
struct btrfs_fs_info *fs_info = trans->fs_info;
u32 nodesize = fs_info->nodesize;
int cur_level = root_level;
@@ -2330,10 +2297,8 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
return -EINVAL;
src_path = btrfs_alloc_path();
- if (!src_path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!src_path)
+ return -ENOMEM;
if (dst_level)
btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
@@ -2341,7 +2306,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
/* For src_path */
- atomic_inc(&src_eb->refs);
+ refcount_inc(&src_eb->refs);
src_path->nodes[root_level] = src_eb;
src_path->slots[root_level] = dst_path->slots[root_level];
src_path->locks[root_level] = 0;
@@ -2359,10 +2324,8 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
parent_slot = src_path->slots[cur_level + 1];
eb = btrfs_read_node_slot(eb, parent_slot);
- if (IS_ERR(eb)) {
- ret = PTR_ERR(eb);
- goto out;
- }
+ if (IS_ERR(eb))
+ return PTR_ERR(eb);
src_path->nodes[cur_level] = eb;
@@ -2383,10 +2346,8 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
&src_key, src_path->slots[cur_level]);
}
/* Content mismatch, something went wrong */
- if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
- ret = -ENOENT;
- goto out;
- }
+ if (btrfs_comp_cpu_keys(&dst_key, &src_key))
+ return -ENOENT;
cur_level--;
}
@@ -2397,21 +2358,20 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
nodesize);
if (ret < 0)
- goto out;
+ return ret;
ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start,
nodesize);
if (ret < 0)
- goto out;
+ return ret;
/* Record leaf file extents */
if (dst_level == 0 && trace_leaf) {
ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
if (ret < 0)
- goto out;
+ return ret;
ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
}
-out:
- btrfs_free_path(src_path);
+
return ret;
}
@@ -2450,9 +2410,9 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
int i;
/* Level sanity check */
- if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
- root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
- root_level < cur_level) {
+ if (unlikely(cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
+ root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
+ root_level < cur_level)) {
btrfs_err_rl(fs_info,
"%s: bad levels, cur_level=%d root_level=%d",
__func__, cur_level, root_level);
@@ -2468,7 +2428,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
* dst_path->nodes[root_level] must be initialized before
* calling this function.
*/
- if (cur_level == root_level) {
+ if (unlikely(cur_level == root_level)) {
btrfs_err_rl(fs_info,
"%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
__func__, root_level, root_level, cur_level);
@@ -2554,7 +2514,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
return 0;
/* Wrong parameter order */
- if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
+ if (unlikely(btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb))) {
btrfs_err_rl(fs_info,
"%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
btrfs_header_generation(src_eb),
@@ -2562,7 +2522,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
return -EUCLEAN;
}
- if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
+ if (unlikely(!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb))) {
ret = -EIO;
goto out;
}
@@ -2574,7 +2534,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
goto out;
}
/* For dst_path */
- atomic_inc(&dst_eb->refs);
+ refcount_inc(&dst_eb->refs);
dst_path->nodes[level] = dst_eb;
dst_path->slots[level] = 0;
dst_path->locks[level] = 0;
@@ -2589,7 +2549,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(dst_path);
if (ret < 0)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info, "%s error: %d", __func__, ret);
return ret;
}
@@ -2612,7 +2572,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
int level;
u8 drop_subptree_thres;
struct extent_buffer *eb = root_eb;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
ASSERT(0 <= root_level && root_level < BTRFS_MAX_LEVEL);
ASSERT(root_eb != NULL);
@@ -2633,7 +2593,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
* mark qgroup inconsistent.
*/
if (root_level >= drop_subptree_thres) {
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info, "subtree level reached threshold");
return 0;
}
@@ -2645,12 +2605,12 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
ret = btrfs_read_extent_buffer(root_eb, &check);
if (ret)
- goto out;
+ return ret;
}
if (root_level == 0) {
ret = btrfs_qgroup_trace_leaf_items(trans, root_eb);
- goto out;
+ return ret;
}
path = btrfs_alloc_path();
@@ -2666,7 +2626,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
* walk back up the tree (adjusting slot pointers as we go)
* and restart the search process.
*/
- atomic_inc(&root_eb->refs); /* For path */
+ refcount_inc(&root_eb->refs); /* For path */
path->nodes[root_level] = root_eb;
path->slots[root_level] = 0;
path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
@@ -2686,10 +2646,8 @@ walk_down:
child_bytenr = btrfs_node_blockptr(eb, parent_slot);
eb = btrfs_read_node_slot(eb, parent_slot);
- if (IS_ERR(eb)) {
- ret = PTR_ERR(eb);
- goto out;
- }
+ if (IS_ERR(eb))
+ return PTR_ERR(eb);
path->nodes[level] = eb;
path->slots[level] = 0;
@@ -2700,14 +2658,14 @@ walk_down:
ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
fs_info->nodesize);
if (ret)
- goto out;
+ return ret;
}
if (level == 0) {
ret = btrfs_qgroup_trace_leaf_items(trans,
path->nodes[level]);
if (ret)
- goto out;
+ return ret;
/* Nonzero return here means we completed our search */
ret = adjust_slots_upwards(path, root_level);
@@ -2721,11 +2679,7 @@ walk_down:
level--;
}
- ret = 0;
-out:
- btrfs_free_path(path);
-
- return ret;
+ return 0;
}
static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup)
@@ -2753,7 +2707,7 @@ static void qgroup_iterator_nested_clean(struct list_head *head)
*/
static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
struct ulist *roots, struct list_head *qgroups,
- u64 seq, int update_old)
+ u64 seq, bool update_old)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -2837,8 +2791,8 @@ static void qgroup_update_counters(struct btrfs_fs_info *fs_info,
cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
- trace_qgroup_update_counters(fs_info, qg, cur_old_count,
- cur_new_count);
+ trace_btrfs_qgroup_update_counters(fs_info, qg, cur_old_count,
+ cur_new_count);
/* Rfer update part */
if (cur_old_count == 0 && cur_new_count > 0) {
@@ -2932,7 +2886,7 @@ static int maybe_fs_roots(struct ulist *roots)
* trees.
* If it contains a non-fs tree, it won't be shared with fs/subvol trees.
*/
- return is_fstree(unode->val);
+ return btrfs_is_fstree(unode->val);
}
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
@@ -3100,8 +3054,7 @@ cleanup:
kfree(record);
}
- trace_qgroup_num_dirty_extents(fs_info, trans->transid,
- num_dirty_extents);
+ trace_btrfs_qgroup_num_dirty_extents(fs_info, trans->transid, num_dirty_extents);
return ret;
}
@@ -3134,10 +3087,12 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
spin_unlock(&fs_info->qgroup_lock);
ret = update_qgroup_info_item(trans, qgroup);
if (ret)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info,
+ "qgroup info item update error %d", ret);
ret = update_qgroup_limit_item(trans, qgroup);
if (ret)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info,
+ "qgroup limit item update error %d", ret);
spin_lock(&fs_info->qgroup_lock);
}
if (btrfs_qgroup_enabled(fs_info))
@@ -3148,7 +3103,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
ret = update_qgroup_status_item(trans);
if (ret)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info,
+ "qgroup status item update error %d", ret);
return ret;
}
@@ -3323,13 +3279,16 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
struct btrfs_root *quota_root;
struct btrfs_qgroup *srcgroup;
struct btrfs_qgroup *dstgroup;
- struct btrfs_qgroup *prealloc;
+ struct btrfs_qgroup *prealloc = NULL;
struct btrfs_qgroup_list **qlist_prealloc = NULL;
bool free_inherit = false;
bool need_rescan = false;
u32 level_size = 0;
u64 nums;
+ if (!btrfs_qgroup_enabled(fs_info))
+ return 0;
+
prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
if (!prealloc)
return -ENOMEM;
@@ -3353,8 +3312,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
if (!committing)
mutex_lock(&fs_info->qgroup_ioctl_lock);
- if (!btrfs_qgroup_enabled(fs_info))
- goto out;
quota_root = fs_info->quota_root;
if (!quota_root) {
@@ -3555,7 +3512,7 @@ out:
if (!committing)
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (need_rescan)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info, "qgroup inherit needs a rescan");
if (qlist_prealloc) {
for (int i = 0; i < inherit->num_qgroups; i++)
kfree(qlist_prealloc[i]);
@@ -3563,7 +3520,14 @@ out:
}
if (free_inherit)
kfree(inherit);
- kfree(prealloc);
+
+ /*
+	 * At this point we either failed to allocate prealloc, or we
+	 * succeeded and passed ownership of it to add_qgroup_rb(). In any
+ * case, this needs to be NULL or there is something wrong.
+ */
+ ASSERT(prealloc == NULL);
+
return ret;
}
@@ -3589,7 +3553,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
int ret = 0;
LIST_HEAD(qgroup_list);
- if (!is_fstree(ref_root))
+ if (!btrfs_is_fstree(ref_root))
return 0;
if (num_bytes == 0)
@@ -3649,7 +3613,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup;
LIST_HEAD(qgroup_list);
- if (!is_fstree(ref_root))
+ if (!btrfs_is_fstree(ref_root))
return;
if (num_bytes == 0)
@@ -3731,10 +3695,8 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
path, 1, 0);
btrfs_debug(fs_info,
- "current progress key (%llu %u %llu), search_slot ret %d",
- fs_info->qgroup_rescan_progress.objectid,
- fs_info->qgroup_rescan_progress.type,
- fs_info->qgroup_rescan_progress.offset, ret);
+ "current progress key " BTRFS_KEY_FMT ", search_slot ret %d",
+ BTRFS_KEY_FMT_VALUE(&fs_info->qgroup_rescan_progress), ret);
if (ret) {
/*
@@ -3836,8 +3798,8 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
* Rescan should only search for commit root, and any later difference
* should be recorded by qgroup
*/
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
while (!ret && !(stopped = rescan_should_stop(fs_info))) {
trans = btrfs_start_transaction(fs_info->fs_root, 0);
@@ -4037,12 +3999,21 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
qgroup_rescan_zero_tracking(fs_info);
mutex_lock(&fs_info->qgroup_rescan_lock);
- fs_info->qgroup_rescan_running = true;
- btrfs_queue_work(fs_info->qgroup_rescan_workers,
- &fs_info->qgroup_rescan_work);
+ /*
+	 * The rescan worker is only for full accounting qgroups; check that it
+	 * is enabled, as queueing the worker is pointless otherwise. A concurrent quota
+ * disable may also have just cleared BTRFS_FS_QUOTA_ENABLED.
+ */
+ if (btrfs_qgroup_full_accounting(fs_info)) {
+ fs_info->qgroup_rescan_running = true;
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+ } else {
+ ret = -ENOTCONN;
+ }
mutex_unlock(&fs_info->qgroup_rescan_lock);
- return 0;
+ return ret;
}
int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
@@ -4129,8 +4100,8 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode,
* Now the entry is in [start, start + len), revert the
* EXTENT_QGROUP_RESERVED bit.
*/
- clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
- entry_end, EXTENT_QGROUP_RESERVED);
+ clear_ret = btrfs_clear_extent_bit(&inode->io_tree, entry_start, entry_end,
+ EXTENT_QGROUP_RESERVED, NULL);
if (!ret && clear_ret < 0)
ret = clear_ret;
@@ -4217,7 +4188,7 @@ static int qgroup_reserve_data(struct btrfs_inode *inode,
int ret;
if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(btrfs_root_id(root)) || len == 0)
+ !btrfs_is_fstree(btrfs_root_id(root)) || len == 0)
return 0;
/* @reserved parameter is mandatory for qgroup */
@@ -4232,8 +4203,9 @@ static int qgroup_reserve_data(struct btrfs_inode *inode,
reserved = *reserved_ret;
/* Record already reserved space */
orig_reserved = reserved->bytes_changed;
- ret = set_record_extent_bits(&inode->io_tree, start,
- start + len -1, EXTENT_QGROUP_RESERVED, reserved);
+ ret = btrfs_set_record_extent_bits(&inode->io_tree, start,
+ start + len - 1, EXTENT_QGROUP_RESERVED,
+ reserved);
/* Newly reserved space */
to_reserve = reserved->bytes_changed - orig_reserved;
@@ -4326,9 +4298,10 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode,
* EXTENT_QGROUP_RESERVED, we won't double free.
* So not need to rush.
*/
- ret = clear_record_extent_bits(&inode->io_tree, free_start,
- free_start + free_len - 1,
- EXTENT_QGROUP_RESERVED, &changeset);
+ ret = btrfs_clear_record_extent_bits(&inode->io_tree, free_start,
+ free_start + free_len - 1,
+ EXTENT_QGROUP_RESERVED,
+ &changeset);
if (ret < 0)
goto out;
freed += changeset.bytes_changed;
@@ -4352,9 +4325,9 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
int ret;
if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
- return clear_record_extent_bits(&inode->io_tree, start,
- start + len - 1,
- EXTENT_QGROUP_RESERVED, NULL);
+ return btrfs_clear_record_extent_bits(&inode->io_tree, start,
+ start + len - 1,
+ EXTENT_QGROUP_RESERVED, NULL);
}
/* In release case, we shouldn't have @reserved */
@@ -4362,8 +4335,8 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
if (free && reserved)
return qgroup_free_reserved_data(inode, reserved, start, len, released);
extent_changeset_init(&changeset);
- ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1,
- EXTENT_QGROUP_RESERVED, &changeset);
+ ret = btrfs_clear_record_extent_bits(&inode->io_tree, start, start + len - 1,
+ EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
goto out;
@@ -4468,11 +4441,11 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
int ret;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(btrfs_root_id(root)) || num_bytes == 0)
+ !btrfs_is_fstree(btrfs_root_id(root)) || num_bytes == 0)
return 0;
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
- trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
+ trace_btrfs_qgroup_meta_reserve(root, (s64)num_bytes, type);
ret = qgroup_reserve(root, num_bytes, enforce, type);
if (ret < 0)
return ret;
@@ -4513,11 +4486,11 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
struct btrfs_fs_info *fs_info = root->fs_info;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(btrfs_root_id(root)))
+ !btrfs_is_fstree(btrfs_root_id(root)))
return;
/* TODO: Update trace point to handle such free */
- trace_qgroup_meta_free_all_pertrans(root);
+ trace_btrfs_qgroup_meta_free_all_pertrans(root);
/* Special value -1 means to free all reserved space */
btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), (u64)-1,
BTRFS_QGROUP_RSV_META_PERTRANS);
@@ -4529,7 +4502,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
struct btrfs_fs_info *fs_info = root->fs_info;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(btrfs_root_id(root)))
+ !btrfs_is_fstree(btrfs_root_id(root)))
return;
/*
@@ -4539,7 +4512,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
*/
num_bytes = sub_root_meta_rsv(root, num_bytes, type);
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
- trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
+ trace_btrfs_qgroup_meta_reserve(root, -(s64)num_bytes, type);
btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type);
}
@@ -4588,12 +4561,12 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
struct btrfs_fs_info *fs_info = root->fs_info;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(btrfs_root_id(root)))
+ !btrfs_is_fstree(btrfs_root_id(root)))
return;
/* Same as btrfs_qgroup_free_meta_prealloc() */
num_bytes = sub_root_meta_rsv(root, num_bytes,
BTRFS_QGROUP_RSV_META_PREALLOC);
- trace_qgroup_meta_convert(root, num_bytes);
+ trace_btrfs_qgroup_meta_convert(root, num_bytes);
qgroup_convert_meta(fs_info, btrfs_root_id(root), num_bytes);
if (!sb_rdonly(fs_info->sb))
add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS);
@@ -4611,8 +4584,8 @@ void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
int ret;
extent_changeset_init(&changeset);
- ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
- EXTENT_QGROUP_RESERVED, &changeset);
+ ret = btrfs_clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
+ EXTENT_QGROUP_RESERVED, &changeset);
WARN_ON(ret < 0);
if (WARN_ON(changeset.bytes_changed)) {
@@ -4672,6 +4645,28 @@ out:
spin_unlock(&swapped_blocks->lock);
}
+static int qgroup_swapped_block_bytenr_key_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *bytenr = key;
+ const struct btrfs_qgroup_swapped_block *block = rb_entry(node,
+ struct btrfs_qgroup_swapped_block, node);
+
+ if (block->subvol_bytenr < *bytenr)
+ return -1;
+ else if (block->subvol_bytenr > *bytenr)
+ return 1;
+
+ return 0;
+}
+
+static int qgroup_swapped_block_bytenr_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct btrfs_qgroup_swapped_block *new_block = rb_entry(new,
+ struct btrfs_qgroup_swapped_block, node);
+
+ return qgroup_swapped_block_bytenr_key_cmp(&new_block->subvol_bytenr, existing);
+}
+
/*
* Add subtree roots record into @subvol_root.
*
@@ -4691,16 +4686,15 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
struct btrfs_fs_info *fs_info = subvol_root->fs_info;
struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
struct btrfs_qgroup_swapped_block *block;
- struct rb_node **cur;
- struct rb_node *parent = NULL;
+ struct rb_node *node;
int level = btrfs_header_level(subvol_parent) - 1;
int ret = 0;
if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
- if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
- btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
+ if (unlikely(btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot))) {
btrfs_err_rl(fs_info,
"%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
__func__,
@@ -4741,46 +4735,32 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
/* Insert @block into @blocks */
spin_lock(&blocks->lock);
- cur = &blocks->blocks[level].rb_node;
- while (*cur) {
+ node = rb_find_add(&block->node, &blocks->blocks[level], qgroup_swapped_block_bytenr_cmp);
+ if (node) {
struct btrfs_qgroup_swapped_block *entry;
- parent = *cur;
- entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
- node);
+ entry = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
- if (entry->subvol_bytenr < block->subvol_bytenr) {
- cur = &(*cur)->rb_left;
- } else if (entry->subvol_bytenr > block->subvol_bytenr) {
- cur = &(*cur)->rb_right;
- } else {
- if (entry->subvol_generation !=
- block->subvol_generation ||
- entry->reloc_bytenr != block->reloc_bytenr ||
- entry->reloc_generation !=
- block->reloc_generation) {
- /*
- * Duplicated but mismatch entry found.
- * Shouldn't happen.
- *
- * Marking qgroup inconsistent should be enough
- * for end users.
- */
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
- ret = -EEXIST;
- }
- kfree(block);
- goto out_unlock;
+ if (entry->subvol_generation != block->subvol_generation ||
+ entry->reloc_bytenr != block->reloc_bytenr ||
+ entry->reloc_generation != block->reloc_generation) {
+ /*
+			 * Duplicated but mismatched entry found. Shouldn't happen.
+ * Marking qgroup inconsistent should be enough for end
+ * users.
+ */
+ DEBUG_WARN("duplicated but mismatched entry found");
+ ret = -EEXIST;
}
+ kfree(block);
+ goto out_unlock;
}
- rb_link_node(&block->node, parent, cur);
- rb_insert_color(&block->node, &blocks->blocks[level]);
blocks->swapped = true;
out_unlock:
spin_unlock(&blocks->lock);
out:
if (ret < 0)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info, "%s error: %d", __func__, ret);
return ret;
}
@@ -4797,10 +4777,9 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_tree_parent_check check = { 0 };
struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
- struct btrfs_qgroup_swapped_block *block;
+ struct btrfs_qgroup_swapped_block AUTO_KFREE(block);
struct extent_buffer *reloc_eb = NULL;
struct rb_node *node;
- bool found = false;
bool swapped = false;
int level = btrfs_header_level(subvol_eb);
int ret = 0;
@@ -4808,7 +4787,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
- if (!is_fstree(btrfs_root_id(root)) || !root->reloc_root)
+ if (!btrfs_is_fstree(btrfs_root_id(root)) || !root->reloc_root)
return 0;
spin_lock(&blocks->lock);
@@ -4816,23 +4795,14 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
spin_unlock(&blocks->lock);
return 0;
}
- node = blocks->blocks[level].rb_node;
-
- while (node) {
- block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
- if (block->subvol_bytenr < subvol_eb->start) {
- node = node->rb_left;
- } else if (block->subvol_bytenr > subvol_eb->start) {
- node = node->rb_right;
- } else {
- found = true;
- break;
- }
- }
- if (!found) {
+ node = rb_find(&subvol_eb->start, &blocks->blocks[level],
+ qgroup_swapped_block_bytenr_key_cmp);
+ if (!node) {
spin_unlock(&blocks->lock);
goto out;
}
+ block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
+
/* Found one, remove it from @blocks first and update blocks->swapped */
rb_erase(&block->node, &blocks->blocks[level]);
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
@@ -4856,7 +4826,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
reloc_eb = NULL;
goto free_out;
}
- if (!extent_buffer_uptodate(reloc_eb)) {
+ if (unlikely(!extent_buffer_uptodate(reloc_eb))) {
ret = -EIO;
goto free_out;
}
@@ -4864,14 +4834,12 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
block->last_snapshot, block->trace_leaf);
free_out:
- kfree(block);
free_extent_buffer(reloc_eb);
out:
if (ret < 0) {
- btrfs_err_rl(fs_info,
- "failed to account subtree at bytenr %llu: %d",
- subvol_eb->start, ret);
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info,
+ "failed to account subtree at bytenr %llu: %d",
+ subvol_eb->start, ret);
}
return ret;
}
@@ -4902,7 +4870,7 @@ int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
return 0;
- if (!is_fstree(root))
+ if (!btrfs_is_fstree(root))
return 0;
/* If the extent predates enabling quotas, don't count it. */
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 1834011ccc49..2987cb7c686e 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -19,7 +19,7 @@ static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
u64 newlen, u64 frontpad)
{
struct btrfs_root *stripe_root = trans->fs_info->stripe_root;
- struct btrfs_stripe_extent *extent, *newitem;
+ struct btrfs_stripe_extent *extent, AUTO_KFREE(newitem);
struct extent_buffer *leaf;
int slot;
size_t item_size;
@@ -53,21 +53,17 @@ static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
ret = btrfs_del_item(trans, stripe_root, path);
if (ret)
- goto out;
+ return ret;
btrfs_release_path(path);
- ret = btrfs_insert_item(trans, stripe_root, &newkey, newitem, item_size);
-
-out:
- kfree(newitem);
- return ret;
+ return btrfs_insert_item(trans, stripe_root, &newkey, newitem, item_size);
}
int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *stripe_root = fs_info->stripe_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *leaf;
u64 found_start;
@@ -260,7 +256,6 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
btrfs_release_path(path);
}
- btrfs_free_path(path);
return ret;
}
@@ -269,7 +264,7 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans,
struct btrfs_stripe_extent *stripe_extent,
const size_t item_size)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
int ret;
int slot;
@@ -288,7 +283,6 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans,
write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot),
item_size);
- btrfs_free_path(path);
return ret;
}
@@ -301,12 +295,12 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
struct btrfs_key stripe_key;
struct btrfs_root *stripe_root = fs_info->stripe_root;
const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type);
- struct btrfs_stripe_extent *stripe_extent;
+ struct btrfs_stripe_extent AUTO_KFREE(stripe_extent);
const size_t item_size = struct_size(stripe_extent, strides, num_stripes);
int ret;
stripe_extent = kzalloc(item_size, GFP_NOFS);
- if (!stripe_extent) {
+	if (unlikely(!stripe_extent)) {
btrfs_abort_transaction(trans, -ENOMEM);
btrfs_end_transaction(trans);
return -ENOMEM;
@@ -329,13 +323,14 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
item_size);
- if (ret == -EEXIST)
+ if (ret == -EEXIST) {
ret = update_raid_extent_item(trans, &stripe_key, stripe_extent,
item_size);
- if (ret)
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ } else if (ret) {
btrfs_abort_transaction(trans, ret);
-
- kfree(stripe_extent);
+ }
return ret;
}
@@ -373,7 +368,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
struct btrfs_stripe_extent *stripe_extent;
struct btrfs_key stripe_key;
struct btrfs_key found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
const u64 end = logical + *length;
int num_stripes;
@@ -393,13 +388,13 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
return -ENOMEM;
if (stripe->rst_search_commit_root) {
- path->skip_locking = 1;
- path->search_commit_root = 1;
+ path->skip_locking = true;
+ path->search_commit_root = true;
}
ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0);
if (ret < 0)
- goto free_path;
+ return ret;
if (ret) {
if (path->slots[0] != 0)
path->slots[0]--;
@@ -456,8 +451,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
trace_btrfs_get_raid_extent_offset(fs_info, logical, *length,
stripe->physical, devid);
- ret = 0;
- goto free_path;
+ return 0;
}
/* If we're here, we haven't found the requested devid in the stripe. */
@@ -471,8 +465,6 @@ out:
logical, logical + *length, stripe->dev->devid,
btrfs_bg_type_to_raid_name(map_type));
}
-free_path:
- btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index cdd373c27784..f38d8305e46d 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -66,10 +66,10 @@ static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info,
dump_bioc(fs_info, rbio->bioc);
btrfs_crit(fs_info,
-"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u scrubp=%u dbitmap=0x%lx",
+"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u sector_nsteps=%u scrubp=%u dbitmap=0x%lx",
rbio->flags, rbio->nr_sectors, rbio->nr_data,
rbio->real_stripes, rbio->stripe_nsectors,
- rbio->scrubp, rbio->dbitmap);
+ rbio->sector_nsteps, rbio->scrubp, rbio->dbitmap);
}
#define ASSERT_RBIO(expr, rbio) \
@@ -134,15 +134,10 @@ struct btrfs_stripe_hash_table {
};
/*
- * A bvec like structure to present a sector inside a page.
- *
- * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
+ * The all-ones value may still map to a valid PFN, but our paddrs are
+ * always block size aligned, so a -1 paddr can never be a valid one.
*/
-struct sector_ptr {
- struct page *page;
- unsigned int pgoff:24;
- unsigned int uptodate:8;
-};
+#define INVALID_PADDR (~(phys_addr_t)0)
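Aside: an all-ones physical address works as a sentinel precisely because valid paddrs in this scheme are block size aligned. A tiny userspace sketch of the idea, with phys_addr_t and the block size as stand-ins:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t phys_addr_t;           /* stand-in for the kernel type */
#define INVALID_PADDR (~(phys_addr_t)0) /* all ones */

int main(void)
{
	const uint32_t blocksize = 4096;    /* hypothetical block size */
	phys_addr_t paddr = INVALID_PADDR;

	/* All-ones is never block size aligned, so it cannot be valid. */
	assert(INVALID_PADDR % blocksize != 0);

	if (paddr == INVALID_PADDR)
		printf("slot not yet indexed\n");
	return 0;
}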
static void rmw_rbio_work(struct work_struct *work);
static void rmw_rbio_work_locked(struct work_struct *work);
@@ -156,8 +151,9 @@ static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
{
	bitmap_free(rbio->error_bitmap);
+	bitmap_free(rbio->stripe_uptodate_bitmap);
kfree(rbio->stripe_pages);
- kfree(rbio->bio_sectors);
- kfree(rbio->stripe_sectors);
+ kfree(rbio->bio_paddrs);
+ kfree(rbio->stripe_paddrs);
kfree(rbio->finish_pointers);
}
@@ -200,8 +195,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
struct btrfs_stripe_hash_table *x;
struct btrfs_stripe_hash *cur;
struct btrfs_stripe_hash *h;
- int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
- int i;
+ unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS;
if (info->stripe_hash_table)
return 0;
@@ -222,7 +216,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
h = table->table;
- for (i = 0; i < num_entries; i++) {
+ for (unsigned int i = 0; i < num_entries; i++) {
cur = h + i;
INIT_LIST_HEAD(&cur->hash_list);
spin_lock_init(&cur->lock);
@@ -233,6 +227,24 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
return 0;
}
+static void memcpy_from_bio_to_stripe(struct btrfs_raid_bio *rbio, unsigned int sector_nr)
+{
+ const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
+
+ ASSERT(sector_nr < rbio->nr_sectors);
+ for (int i = 0; i < rbio->sector_nsteps; i++) {
+ unsigned int index = sector_nr * rbio->sector_nsteps + i;
+ phys_addr_t dst = rbio->stripe_paddrs[index];
+ phys_addr_t src = rbio->bio_paddrs[index];
+
+ ASSERT(dst != INVALID_PADDR);
+ ASSERT(src != INVALID_PADDR);
+
+ memcpy_page(phys_to_page(dst), offset_in_page(dst),
+ phys_to_page(src), offset_in_page(src), step);
+ }
+}
+
/*
* caching an rbio means to copy anything from the
* bio_sectors array into the stripe_pages array. We
@@ -253,24 +265,19 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
for (i = 0; i < rbio->nr_sectors; i++) {
/* Some range not covered by bio (partial write), skip it */
- if (!rbio->bio_sectors[i].page) {
+ if (rbio->bio_paddrs[i * rbio->sector_nsteps] == INVALID_PADDR) {
/*
* Even if the sector is not covered by bio, if it is
* a data sector it should still be uptodate as it is
* read from disk.
*/
if (i < rbio->nr_data * rbio->stripe_nsectors)
- ASSERT(rbio->stripe_sectors[i].uptodate);
+ ASSERT(test_bit(i, rbio->stripe_uptodate_bitmap));
continue;
}
- ASSERT(rbio->stripe_sectors[i].page);
- memcpy_page(rbio->stripe_sectors[i].page,
- rbio->stripe_sectors[i].pgoff,
- rbio->bio_sectors[i].page,
- rbio->bio_sectors[i].pgoff,
- rbio->bioc->fs_info->sectorsize);
- rbio->stripe_sectors[i].uptodate = 1;
+ memcpy_from_bio_to_stripe(rbio, i);
+ set_bit(i, rbio->stripe_uptodate_bitmap);
}
set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}
@@ -293,19 +300,48 @@ static int rbio_bucket(struct btrfs_raid_bio *rbio)
return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}
-static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
- unsigned int page_nr)
+/* Get the sector number of the first sector covered by @page_nr. */
+static u32 page_nr_to_sector_nr(struct btrfs_raid_bio *rbio, unsigned int page_nr)
{
- const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- const u32 sectors_per_page = PAGE_SIZE / sectorsize;
+ u32 sector_nr;
+
+ ASSERT(page_nr < rbio->nr_pages);
+
+ sector_nr = (page_nr << PAGE_SHIFT) >> rbio->bioc->fs_info->sectorsize_bits;
+ ASSERT(sector_nr < rbio->nr_sectors);
+ return sector_nr;
+}
+
+/*
+ * Get the number of sectors covered by @page_nr.
+ *
+ * For bs > ps cases, the result will always be 1.
+ * For bs <= ps cases, the result will be ps / bs.
+ */
+static u32 page_nr_to_num_sectors(struct btrfs_raid_bio *rbio, unsigned int page_nr)
+{
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ u32 nr_sectors;
+
+ ASSERT(page_nr < rbio->nr_pages);
+
+ nr_sectors = round_up(PAGE_SIZE, fs_info->sectorsize) >> fs_info->sectorsize_bits;
+ ASSERT(nr_sectors > 0);
+ return nr_sectors;
+}
+
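Aside: a quick check of the round_up() arithmetic above, as a standalone userspace sketch (block sizes are hypothetical; 4K pages assumed):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096u

static uint32_t round_up_u32(uint32_t x, uint32_t a)
{
	return (x + a - 1) / a * a;
}

int main(void)
{
	/* Hypothetical block sizes on a 4K-page machine. */
	const uint32_t sizes[] = { 2048, 4096, 16384 };

	for (int i = 0; i < 3; i++) {
		uint32_t bs = sizes[i];
		/* Sectors per page: ps/bs for bs <= ps, 1 for bs > ps. */
		uint32_t nr = round_up_u32(PAGE_SIZE, bs) / bs;

		printf("blocksize=%u -> %u sector(s) per page\n", bs, nr);
	}
	return 0; /* prints 2, 1, 1 */
}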
+static __maybe_unused bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
+ unsigned int page_nr)
+{
+ const u32 sector_nr = page_nr_to_sector_nr(rbio, page_nr);
+ const u32 nr_bits = page_nr_to_num_sectors(rbio, page_nr);
int i;
ASSERT(page_nr < rbio->nr_pages);
+	ASSERT(sector_nr + nr_bits <= rbio->nr_sectors);
- for (i = sectors_per_page * page_nr;
- i < sectors_per_page * page_nr + sectors_per_page;
- i++) {
- if (!rbio->stripe_sectors[i].uptodate)
+ for (i = sector_nr; i < sector_nr + nr_bits; i++) {
+ if (!test_bit(i, rbio->stripe_uptodate_bitmap))
return false;
}
return true;
@@ -318,41 +354,44 @@ static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
*/
static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
{
- const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
u32 offset;
int i;
- for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
+ for (i = 0, offset = 0; i < rbio->nr_sectors * rbio->sector_nsteps;
+ i++, offset += step) {
int page_index = offset >> PAGE_SHIFT;
ASSERT(page_index < rbio->nr_pages);
- rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
- rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
+ if (!rbio->stripe_pages[page_index])
+ continue;
+
+ rbio->stripe_paddrs[i] = page_to_phys(rbio->stripe_pages[page_index]) +
+ offset_in_page(offset);
}
}
static void steal_rbio_page(struct btrfs_raid_bio *src,
struct btrfs_raid_bio *dest, int page_nr)
{
- const u32 sectorsize = src->bioc->fs_info->sectorsize;
- const u32 sectors_per_page = PAGE_SIZE / sectorsize;
- int i;
+ const u32 sector_nr = page_nr_to_sector_nr(src, page_nr);
+ const u32 nr_bits = page_nr_to_num_sectors(src, page_nr);
+
+ ASSERT(page_nr < src->nr_pages);
+	ASSERT(sector_nr + nr_bits <= src->nr_sectors);
if (dest->stripe_pages[page_nr])
__free_page(dest->stripe_pages[page_nr]);
dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
src->stripe_pages[page_nr] = NULL;
- /* Also update the sector->uptodate bits. */
- for (i = sectors_per_page * page_nr;
- i < sectors_per_page * page_nr + sectors_per_page; i++)
- dest->stripe_sectors[i].uptodate = true;
+ /* Also update the stripe_uptodate_bitmap bits. */
+ bitmap_set(dest->stripe_uptodate_bitmap, sector_nr, nr_bits);
}
static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
{
- const int sector_nr = (page_nr << PAGE_SHIFT) >>
- rbio->bioc->fs_info->sectorsize_bits;
+ const int sector_nr = page_nr_to_sector_nr(rbio, page_nr);
/*
* We have ensured PAGE_SIZE is aligned with sectorsize, thus
@@ -507,9 +546,8 @@ static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
spin_lock(&table->cache_lock);
while (!list_empty(&table->stripe_cache)) {
- rbio = list_entry(table->stripe_cache.next,
- struct btrfs_raid_bio,
- stripe_cache);
+ rbio = list_first_entry(&table->stripe_cache,
+ struct btrfs_raid_bio, stripe_cache);
__remove_rbio_from_cache(rbio);
}
spin_unlock(&table->cache_lock);
@@ -567,9 +605,9 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
if (table->cache_size > RBIO_CACHE_SIZE) {
struct btrfs_raid_bio *found;
- found = list_entry(table->stripe_cache.prev,
- struct btrfs_raid_bio,
- stripe_cache);
+ found = list_last_entry(&table->stripe_cache,
+ struct btrfs_raid_bio,
+ stripe_cache);
if (found != rbio)
__remove_rbio_from_cache(found);
@@ -667,39 +705,62 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
return 1;
}
-static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
- unsigned int stripe_nr,
- unsigned int sector_nr)
+/* Return the sector index for @stripe_nr and @sector_nr. */
+static unsigned int rbio_sector_index(const struct btrfs_raid_bio *rbio,
+ unsigned int stripe_nr,
+ unsigned int sector_nr)
{
+ unsigned int ret;
+
ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr);
ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr);
- return stripe_nr * rbio->stripe_nsectors + sector_nr;
+ ret = stripe_nr * rbio->stripe_nsectors + sector_nr;
+ ASSERT(ret < rbio->nr_sectors);
+ return ret;
+}
+
+/* Return the paddr array index for @stripe_nr, @sector_nr and @step_nr. */
+static unsigned int rbio_paddr_index(const struct btrfs_raid_bio *rbio,
+ unsigned int stripe_nr,
+ unsigned int sector_nr,
+ unsigned int step_nr)
+{
+ unsigned int ret;
+
+ ASSERT_RBIO_SECTOR(step_nr < rbio->sector_nsteps, rbio, step_nr);
+
+ ret = rbio_sector_index(rbio, stripe_nr, sector_nr) * rbio->sector_nsteps + step_nr;
+ ASSERT(ret < rbio->nr_sectors * rbio->sector_nsteps);
+ return ret;
}
-/* Return a sector from rbio->stripe_sectors, not from the bio list */
-static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
- unsigned int stripe_nr,
- unsigned int sector_nr)
+static phys_addr_t rbio_stripe_paddr(const struct btrfs_raid_bio *rbio,
+ unsigned int stripe_nr, unsigned int sector_nr,
+ unsigned int step_nr)
{
- return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
- sector_nr)];
+ return rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr)];
}
-/* Grab a sector inside P stripe */
-static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
- unsigned int sector_nr)
+static phys_addr_t rbio_pstripe_paddr(const struct btrfs_raid_bio *rbio,
+ unsigned int sector_nr, unsigned int step_nr)
{
- return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
+ return rbio_stripe_paddr(rbio, rbio->nr_data, sector_nr, step_nr);
}
-/* Grab a sector inside Q stripe, return NULL if not RAID6 */
-static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
- unsigned int sector_nr)
+static phys_addr_t rbio_qstripe_paddr(const struct btrfs_raid_bio *rbio,
+ unsigned int sector_nr, unsigned int step_nr)
{
if (rbio->nr_data + 1 == rbio->real_stripes)
- return NULL;
- return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
+ return INVALID_PADDR;
+ return rbio_stripe_paddr(rbio, rbio->nr_data + 1, sector_nr, step_nr);
+}
+
+/* Return a paddr pointer into the rbio::stripe_paddrs[] for the specified sector. */
+static phys_addr_t *rbio_stripe_paddrs(const struct btrfs_raid_bio *rbio,
+ unsigned int stripe_nr, unsigned int sector_nr)
+{
+ return &rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, 0)];
}
/*
@@ -882,14 +943,14 @@ done_nolock:
remove_rbio_from_cache(rbio);
}
-static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
+static void rbio_endio_bio_list(struct bio *cur, blk_status_t status)
{
struct bio *next;
while (cur) {
next = cur->bi_next;
cur->bi_next = NULL;
- cur->bi_status = err;
+ cur->bi_status = status;
bio_endio(cur);
cur = next;
}
@@ -899,7 +960,7 @@ static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
* this frees the rbio and runs through all the bios in the
* bio_list and calls end_io on them
*/
-static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
+static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status)
{
struct bio *cur = bio_list_get(&rbio->bio_list);
struct bio *extra;
@@ -928,13 +989,13 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
extra = bio_list_get(&rbio->bio_list);
free_raid_bio(rbio);
- rbio_endio_bio_list(cur, err);
+ rbio_endio_bio_list(cur, status);
if (extra)
- rbio_endio_bio_list(extra, err);
+ rbio_endio_bio_list(extra, status);
}
/*
- * Get a sector pointer specified by its @stripe_nr and @sector_nr.
+ * Get paddr pointer for the sector specified by its @stripe_nr and @sector_nr.
*
* @rbio: The raid bio
* @stripe_nr: Stripe number, valid range [0, real_stripe)
@@ -944,34 +1005,52 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
*
* The read/modify/write code wants to reuse the original bio page as much
* as possible, and only use stripe_sectors as fallback.
+ *
+ * Return NULL if bio_list_only is set but the specified sector has no
+ * coresponding bio.
*/
-static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
- int stripe_nr, int sector_nr,
- bool bio_list_only)
+static phys_addr_t *sector_paddrs_in_rbio(struct btrfs_raid_bio *rbio,
+ int stripe_nr, int sector_nr,
+ bool bio_list_only)
{
- struct sector_ptr *sector;
- int index;
+ phys_addr_t *ret = NULL;
+ const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, 0);
- ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes,
- rbio, stripe_nr);
- ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
- rbio, sector_nr);
+ ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps);
- index = stripe_nr * rbio->stripe_nsectors + sector_nr;
- ASSERT(index >= 0 && index < rbio->nr_sectors);
-
- spin_lock(&rbio->bio_list_lock);
- sector = &rbio->bio_sectors[index];
- if (sector->page || bio_list_only) {
- /* Don't return sector without a valid page pointer */
- if (!sector->page)
- sector = NULL;
- spin_unlock(&rbio->bio_list_lock);
- return sector;
+ scoped_guard(spinlock, &rbio->bio_list_lock) {
+ if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) {
+ /* Don't return sector without a valid page pointer */
+ if (rbio->bio_paddrs[index] != INVALID_PADDR)
+ ret = &rbio->bio_paddrs[index];
+ return ret;
+ }
}
- spin_unlock(&rbio->bio_list_lock);
+ return &rbio->stripe_paddrs[index];
+}
- return &rbio->stripe_sectors[index];
+/*
+ * Similar to sector_paddrs_in_rbio(), but returns the paddr of a single
+ * step, with extra consideration for bs > ps cases, where a fs block can
+ * span multiple steps.
+ */
+static phys_addr_t sector_paddr_in_rbio(struct btrfs_raid_bio *rbio,
+ int stripe_nr, int sector_nr, int step_nr,
+ bool bio_list_only)
+{
+ phys_addr_t ret = INVALID_PADDR;
+ const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr);
+
+ ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps);
+
+ scoped_guard(spinlock, &rbio->bio_list_lock) {
+ if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) {
+ /* Don't return sector without a valid page pointer */
+ if (rbio->bio_paddrs[index] != INVALID_PADDR)
+ ret = rbio->bio_paddrs[index];
+ return ret;
+ }
+ }
+ return rbio->stripe_paddrs[index];
}
/*
@@ -987,10 +1066,16 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
const unsigned int stripe_nsectors =
BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
const unsigned int num_sectors = stripe_nsectors * real_stripes;
+ const unsigned int step = min(fs_info->sectorsize, PAGE_SIZE);
+ const unsigned int sector_nsteps = fs_info->sectorsize / step;
struct btrfs_raid_bio *rbio;
- /* PAGE_SIZE must also be aligned to sectorsize for subpage support */
- ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
+ /*
+ * For bs <= ps cases, ps must be aligned to bs.
+ * For bs > ps cases, bs must be aligned to ps.
+ */
+ ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize) ||
+ IS_ALIGNED(fs_info->sectorsize, PAGE_SIZE));
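Aside: the step/sector_nsteps split means a fs block is walked in min(bs, ps) sized units: one step when bs <= ps, bs/ps steps when bs > ps. A standalone sketch of the arithmetic (sizes hypothetical):

#include <stdio.h>

#define PAGE_SIZE 4096u
#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	/* Hypothetical block sizes: bs <= ps and bs > ps. */
	const unsigned int sizes[] = { 4096, 16384 };

	for (int i = 0; i < 2; i++) {
		unsigned int bs = sizes[i];
		unsigned int step = MIN(bs, PAGE_SIZE);
		unsigned int nsteps = bs / step;

		printf("bs=%u: step=%u nsteps=%u\n", bs, step, nsteps);
	}
	return 0; /* bs=4096: step=4096 nsteps=1; bs=16384: step=4096 nsteps=4 */
}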
/*
* Our current stripe len should be fixed to 64k thus stripe_nsectors
* (at most 16) should be no larger than BITS_PER_LONG.
@@ -1009,19 +1094,22 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
return ERR_PTR(-ENOMEM);
rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
GFP_NOFS);
- rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
- GFP_NOFS);
- rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
- GFP_NOFS);
+ rbio->bio_paddrs = kcalloc(num_sectors * sector_nsteps, sizeof(phys_addr_t), GFP_NOFS);
+ rbio->stripe_paddrs = kcalloc(num_sectors * sector_nsteps, sizeof(phys_addr_t), GFP_NOFS);
rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
+ rbio->stripe_uptodate_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
- if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
- !rbio->finish_pointers || !rbio->error_bitmap) {
+ if (!rbio->stripe_pages || !rbio->bio_paddrs || !rbio->stripe_paddrs ||
+ !rbio->finish_pointers || !rbio->error_bitmap || !rbio->stripe_uptodate_bitmap) {
free_raid_bio_pointers(rbio);
kfree(rbio);
return ERR_PTR(-ENOMEM);
}
+ for (int i = 0; i < num_sectors * sector_nsteps; i++) {
+ rbio->stripe_paddrs[i] = INVALID_PADDR;
+ rbio->bio_paddrs[i] = INVALID_PADDR;
+ }
bio_list_init(&rbio->bio_list);
init_waitqueue_head(&rbio->io_wait);
@@ -1036,6 +1124,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
rbio->real_stripes = real_stripes;
rbio->stripe_npages = stripe_npages;
rbio->stripe_nsectors = stripe_nsectors;
+ rbio->sector_nsteps = sector_nsteps;
refcount_set(&rbio->refs, 1);
atomic_set(&rbio->stripes_pending, 0);
@@ -1080,8 +1169,8 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
* @faila and @failb will also be updated to the first and second stripe
* number of the errors.
*/
-static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
- int *faila, int *failb)
+static int get_rbio_vertical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
+ int *faila, int *failb)
{
int stripe_nr;
int found_errors = 0;
@@ -1113,20 +1202,41 @@ static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
return found_errors;
}
+static int bio_add_paddrs(struct bio *bio, phys_addr_t *paddrs, unsigned int nr_steps,
+ unsigned int step)
+{
+ int added = 0;
+ int ret;
+
+ for (int i = 0; i < nr_steps; i++) {
+ ret = bio_add_page(bio, phys_to_page(paddrs[i]), step,
+ offset_in_page(paddrs[i]));
+ if (ret != step)
+ goto revert;
+ added += ret;
+ }
+ return added;
+revert:
+ /*
+	 * We don't need to revert the bvecs; the bio will be submitted
+	 * immediately, and as long as the size is reduced the extra bvecs
+	 * will not be accessed.
+ */
+ bio->bi_iter.bi_size -= added;
+ return 0;
+}
+
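Aside: bio_add_paddrs() makes the append all-or-nothing by shrinking the visible size instead of unwinding each bvec, which is safe only because the bio is submitted right away. The same rollback shape in a self-contained userspace sketch (the buffer and its cap are stand-ins for the bio and its device limit):

#include <stdio.h>
#include <string.h>

struct buf { char data[16]; unsigned int len; unsigned int cap; };

static int buf_add(struct buf *b, const char *src, unsigned int n)
{
	if (b->len + n > b->cap)
		return 0;               /* like bio_add_page() refusing */
	memcpy(b->data + b->len, src, n);
	b->len += n;
	return n;
}

static int buf_add_all(struct buf *b, const char *src, unsigned int nsteps,
		       unsigned int step)
{
	unsigned int added = 0;

	for (unsigned int i = 0; i < nsteps; i++) {
		int ret = buf_add(b, src + i * step, step);

		if (ret != (int)step) {
			b->len -= added;  /* roll back the visible size only */
			return 0;
		}
		added += ret;
	}
	return added;
}

int main(void)
{
	struct buf b = { .len = 0, .cap = 8 };

	printf("added=%d\n", buf_add_all(&b, "abcdabcdabcd", 3, 4));
	printf("len after rollback=%u\n", b.len); /* 0: all-or-nothing */
	return 0;
}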
/*
* Add a single sector @sector into our list of bios for IO.
*
* Return 0 if everything went well.
- * Return <0 for error.
+ * Return <0 for error, and no byte will be added to @rbio.
*/
-static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
- struct bio_list *bio_list,
- struct sector_ptr *sector,
- unsigned int stripe_nr,
- unsigned int sector_nr,
- enum req_op op)
+static int rbio_add_io_paddrs(struct btrfs_raid_bio *rbio, struct bio_list *bio_list,
+ phys_addr_t *paddrs, unsigned int stripe_nr,
+ unsigned int sector_nr, enum req_op op)
{
const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ const u32 step = min(sectorsize, PAGE_SIZE);
struct bio *last = bio_list->tail;
int ret;
struct bio *bio;
@@ -1142,7 +1252,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
rbio, stripe_nr);
ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
rbio, sector_nr);
- ASSERT(sector->page);
+ ASSERT(paddrs != NULL);
stripe = &rbio->bioc->stripes[stripe_nr];
disk_start = stripe->physical + sector_nr * sectorsize;
@@ -1155,9 +1265,9 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
rbio->error_bitmap);
/* Check if we have reached tolerance early. */
- found_errors = get_rbio_veritical_errors(rbio, sector_nr,
- NULL, NULL);
- if (found_errors > rbio->bioc->max_errors)
+ found_errors = get_rbio_vertical_errors(rbio, sector_nr,
+ NULL, NULL);
+ if (unlikely(found_errors > rbio->bioc->max_errors))
return -EIO;
return 0;
}
@@ -1173,8 +1283,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
*/
if (last_end == disk_start && !last->bi_status &&
last->bi_bdev == stripe->dev->bdev) {
- ret = bio_add_page(last, sector->page, sectorsize,
- sector->pgoff);
+ ret = bio_add_paddrs(last, paddrs, rbio->sector_nsteps, step);
if (ret == sectorsize)
return 0;
}
@@ -1187,31 +1296,27 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
bio->bi_private = rbio;
- __bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
+ ret = bio_add_paddrs(bio, paddrs, rbio->sector_nsteps, step);
+ ASSERT(ret == sectorsize);
bio_list_add(bio_list, bio);
return 0;
}
static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
{
- const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- struct bio_vec bvec;
- struct bvec_iter iter;
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
+ const u32 step_bits = min(fs_info->sectorsize_bits, PAGE_SHIFT);
+ struct bvec_iter iter = bio->bi_iter;
+ phys_addr_t paddr;
u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
rbio->bioc->full_stripe_logical;
- bio_for_each_segment(bvec, bio, iter) {
- u32 bvec_offset;
+ btrfs_bio_for_each_block(paddr, bio, &iter, step) {
+ unsigned int index = (offset >> step_bits);
- for (bvec_offset = 0; bvec_offset < bvec.bv_len;
- bvec_offset += sectorsize, offset += sectorsize) {
- int index = offset / sectorsize;
- struct sector_ptr *sector = &rbio->bio_sectors[index];
-
- sector->page = bvec.bv_page;
- sector->pgoff = bvec.bv_offset + bvec_offset;
- ASSERT(sector->pgoff < PAGE_SIZE);
- }
+ rbio->bio_paddrs[index] = paddr;
+ offset += step;
}
}
@@ -1289,49 +1394,64 @@ static void assert_rbio(struct btrfs_raid_bio *rbio)
ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio);
}
-/* Generate PQ for one vertical stripe. */
-static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
+static inline void *kmap_local_paddr(phys_addr_t paddr)
+{
+	/* The paddr must have a page mapped to it. */
+ ASSERT(paddr != INVALID_PADDR);
+
+ return kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr);
+}
+
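Aside: kmap_local_paddr() just splits the physical address into its page and in-page offset. The split itself is plain shift-and-mask, sketched standalone below (the address is hypothetical; 4K pages assumed):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1u << PAGE_SHIFT)

int main(void)
{
	/* Hypothetical physical address inside the third page. */
	uint64_t paddr = 3 * PAGE_SIZE + 0x123;

	uint64_t pfn = paddr >> PAGE_SHIFT;      /* cf. phys_to_page() */
	uint32_t off = paddr & (PAGE_SIZE - 1);  /* cf. offset_in_page() */

	printf("pfn=%llu offset=0x%x\n", (unsigned long long)pfn, off);
	return 0; /* pfn=3 offset=0x123 */
}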
+static void generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int sector_nr,
+ unsigned int step_nr)
{
void **pointers = rbio->finish_pointers;
- const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- struct sector_ptr *sector;
+ const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
int stripe;
const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
/* First collect one sector from each data stripe */
- for (stripe = 0; stripe < rbio->nr_data; stripe++) {
- sector = sector_in_rbio(rbio, stripe, sectornr, 0);
- pointers[stripe] = kmap_local_page(sector->page) +
- sector->pgoff;
- }
+ for (stripe = 0; stripe < rbio->nr_data; stripe++)
+ pointers[stripe] = kmap_local_paddr(
+ sector_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0));
/* Then add the parity stripe */
- sector = rbio_pstripe_sector(rbio, sectornr);
- sector->uptodate = 1;
- pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
+ pointers[stripe++] = kmap_local_paddr(rbio_pstripe_paddr(rbio, sector_nr, step_nr));
if (has_qstripe) {
/*
* RAID6, add the qstripe and call the library function
* to fill in our p/q
*/
- sector = rbio_qstripe_sector(rbio, sectornr);
- sector->uptodate = 1;
- pointers[stripe++] = kmap_local_page(sector->page) +
- sector->pgoff;
+ pointers[stripe++] = kmap_local_paddr(
+ rbio_qstripe_paddr(rbio, sector_nr, step_nr));
assert_rbio(rbio);
- raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
- pointers);
+ raid6_call.gen_syndrome(rbio->real_stripes, step, pointers);
} else {
/* raid5 */
- memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
- run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
+ memcpy(pointers[rbio->nr_data], pointers[0], step);
+ run_xor(pointers + 1, rbio->nr_data - 1, step);
}
for (stripe = stripe - 1; stripe >= 0; stripe--)
kunmap_local(pointers[stripe]);
}
+/* Generate PQ for one vertical stripe. */
+static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
+{
+ const bool has_qstripe = (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6);
+
+ for (int i = 0; i < rbio->sector_nsteps; i++)
+ generate_pq_vertical_step(rbio, sectornr, i);
+
+ set_bit(rbio_sector_index(rbio, rbio->nr_data, sectornr),
+ rbio->stripe_uptodate_bitmap);
+ if (has_qstripe)
+ set_bit(rbio_sector_index(rbio, rbio->nr_data + 1, sectornr),
+ rbio->stripe_uptodate_bitmap);
+}
+
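Aside: for RAID5 the parity step is a copy of the first data stripe XORed with the rest, which is what the memcpy() + run_xor() pair above does per step; any one lost data stripe is then recovered by XORing parity with the survivors. A self-contained sketch with toy buffers:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define STEP 8 /* tiny stand-in for min(blocksize, PAGE_SIZE) */

int main(void)
{
	uint8_t d0[STEP] = "AAAAAAA", d1[STEP] = "BBBBBBB", p[STEP];

	/* RAID5: parity starts as a copy of the first data stripe... */
	memcpy(p, d0, STEP);
	/* ...then XORs in the remaining data stripes (cf. run_xor()). */
	for (int i = 0; i < STEP; i++)
		p[i] ^= d1[i];

	/* Losing d1 recovers it as p ^ d0. */
	for (int i = 0; i < STEP; i++)
		printf("%02x", (unsigned)(p[i] ^ d0[i]));
	printf("\n"); /* the bytes of d1 */
	return 0;
}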
static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
struct bio_list *bio_list)
{
@@ -1358,7 +1478,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
*/
for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
total_sector_nr++) {
- struct sector_ptr *sector;
+ phys_addr_t *paddrs;
stripe = total_sector_nr / rbio->stripe_nsectors;
sectornr = total_sector_nr % rbio->stripe_nsectors;
@@ -1368,14 +1488,14 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
continue;
if (stripe < rbio->nr_data) {
- sector = sector_in_rbio(rbio, stripe, sectornr, 1);
- if (!sector)
+ paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1);
+ if (paddrs == NULL)
continue;
} else {
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
}
- ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
+ ret = rbio_add_io_paddrs(rbio, bio_list, paddrs, stripe,
sectornr, REQ_OP_WRITE);
if (ret)
goto error;
@@ -1393,7 +1513,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
total_sector_nr++) {
- struct sector_ptr *sector;
+ phys_addr_t *paddrs;
stripe = total_sector_nr / rbio->stripe_nsectors;
sectornr = total_sector_nr % rbio->stripe_nsectors;
@@ -1418,14 +1538,14 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
continue;
if (stripe < rbio->nr_data) {
- sector = sector_in_rbio(rbio, stripe, sectornr, 1);
- if (!sector)
+ paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1);
+ if (paddrs == NULL)
continue;
} else {
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
}
- ret = rbio_add_io_sector(rbio, bio_list, sector,
+ ret = rbio_add_io_paddrs(rbio, bio_list, paddrs,
rbio->real_stripes,
sectornr, REQ_OP_WRITE);
if (ret)
@@ -1473,22 +1593,17 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
}
/*
- * For subpage case, we can no longer set page Up-to-date directly for
- * stripe_pages[], thus we need to locate the sector.
+ * Return the sector number whose first step matches @paddr inside the
+ * rbio->stripe_paddrs[] array.
+ *
+ * Return -1 if not found.
*/
-static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
- struct page *page,
- unsigned int pgoff)
+static int find_stripe_sector_nr(struct btrfs_raid_bio *rbio, phys_addr_t paddr)
{
- int i;
-
- for (i = 0; i < rbio->nr_sectors; i++) {
- struct sector_ptr *sector = &rbio->stripe_sectors[i];
-
- if (sector->page == page && sector->pgoff == pgoff)
- return sector;
+ for (int i = 0; i < rbio->nr_sectors; i++) {
+ if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == paddr)
+ return i;
}
- return NULL;
+ return -1;
}
/*
@@ -1498,38 +1613,34 @@ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
{
const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ const u32 step = min(sectorsize, PAGE_SIZE);
+ u32 offset = 0;
+ phys_addr_t paddr;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct sector_ptr *sector;
- int pgoff;
+ btrfs_bio_for_each_block_all(paddr, bio, step) {
+ /* Hitting the first step of a sector. */
+ if (IS_ALIGNED(offset, sectorsize)) {
+ int sector_nr = find_stripe_sector_nr(rbio, paddr);
- for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
- pgoff += sectorsize) {
- sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
- ASSERT(sector);
- if (sector)
- sector->uptodate = 1;
+ ASSERT(sector_nr >= 0);
+ if (sector_nr >= 0)
+ set_bit(sector_nr, rbio->stripe_uptodate_bitmap);
}
+ offset += step;
}
}
static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
{
- struct bio_vec *bv = bio_first_bvec_all(bio);
+ phys_addr_t bvec_paddr = bvec_phys(bio_first_bvec_all(bio));
int i;
for (i = 0; i < rbio->nr_sectors; i++) {
- struct sector_ptr *sector;
-
- sector = &rbio->stripe_sectors[i];
- if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
+ if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == bvec_paddr)
break;
- sector = &rbio->bio_sectors[i];
- if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
+ if (rbio->bio_paddrs[i * rbio->sector_nsteps] == bvec_paddr)
break;
}
ASSERT(i < rbio->nr_sectors);
@@ -1562,9 +1673,12 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
struct bio *bio)
{
struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
+ const u32 nr_steps = rbio->sector_nsteps;
int total_sector_nr = get_bio_sector_nr(rbio, bio);
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ u32 offset = 0;
+ phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
+ phys_addr_t paddr;
/* No data csum for the whole stripe, no need to verify. */
if (!rbio->csum_bitmap || !rbio->csum_buf)
@@ -1574,26 +1688,26 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
return;
- bio_for_each_segment_all(bvec, bio, iter_all) {
- int bv_offset;
+ btrfs_bio_for_each_block_all(paddr, bio, step) {
+ u8 csum_buf[BTRFS_CSUM_SIZE];
+ u8 *expected_csum;
- for (bv_offset = bvec->bv_offset;
- bv_offset < bvec->bv_offset + bvec->bv_len;
- bv_offset += fs_info->sectorsize, total_sector_nr++) {
- u8 csum_buf[BTRFS_CSUM_SIZE];
- u8 *expected_csum = rbio->csum_buf +
- total_sector_nr * fs_info->csum_size;
- int ret;
+ paddrs[(offset / step) % nr_steps] = paddr;
+ offset += step;
- /* No csum for this sector, skip to the next sector. */
- if (!test_bit(total_sector_nr, rbio->csum_bitmap))
- continue;
+ /* Not yet covering the full fs block, continue to the next step. */
+ if (!IS_ALIGNED(offset, fs_info->sectorsize))
+ continue;
- ret = btrfs_check_sector_csum(fs_info, bvec->bv_page,
- bv_offset, csum_buf, expected_csum);
- if (ret < 0)
- set_bit(total_sector_nr, rbio->error_bitmap);
- }
+ /* No csum for this sector, skip to the next sector. */
+ if (!test_bit(total_sector_nr, rbio->csum_bitmap))
+ continue;
+
+ expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size;
+ btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf);
+ if (unlikely(memcmp(csum_buf, expected_csum, fs_info->csum_size) != 0))
+ set_bit(total_sector_nr, rbio->error_bitmap);
+ total_sector_nr++;
}
}
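Aside: with bs > ps one checksum covers several steps, so the loop above stages step paddrs until the offset reaches a block boundary and only then verifies. The staging rhythm, as a standalone userspace sketch with hypothetical geometry:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical geometry: 16K blocks walked in 4K steps. */
	const uint32_t blocksize = 16384, step = 4096;
	const uint32_t nr_steps = blocksize / step;
	uint32_t offset = 0, block_nr = 0;
	uint32_t staged[4]; /* stand-in for the per-block paddrs[] staging */

	for (int i = 0; i < 8; i++) { /* 8 steps = 2 full blocks */
		staged[(offset / step) % nr_steps] = i;
		offset += step;

		/* Only act once the staged steps cover a whole block. */
		if (offset % blocksize != 0)
			continue;
		printf("block %u complete: steps %u..%u\n", block_nr,
		       staged[0], staged[nr_steps - 1]);
		block_nr++;
	}
	return 0;
}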
@@ -1689,8 +1803,8 @@ static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
list_sort(NULL, &plug->rbio_list, plug_cmp);
while (!list_empty(&plug->rbio_list)) {
- cur = list_entry(plug->rbio_list.next,
- struct btrfs_raid_bio, plug_list);
+ cur = list_first_entry(&plug->rbio_list,
+ struct btrfs_raid_bio, plug_list);
list_del_init(&cur->plug_list);
if (rbio_is_full(cur)) {
@@ -1788,10 +1902,9 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio,
int stripe_nr, int sector_nr)
{
struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
- struct sector_ptr *sector;
+ phys_addr_t *paddrs;
u8 csum_buf[BTRFS_CSUM_SIZE];
u8 *csum_expected;
- int ret;
if (!rbio->csum_bitmap || !rbio->csum_buf)
return 0;
@@ -1804,57 +1917,32 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio,
* bio list if possible.
*/
if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
- sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
+ paddrs = sector_paddrs_in_rbio(rbio, stripe_nr, sector_nr, 0);
} else {
- sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
+ paddrs = rbio_stripe_paddrs(rbio, stripe_nr, sector_nr);
}
- ASSERT(sector->page);
-
csum_expected = rbio->csum_buf +
(stripe_nr * rbio->stripe_nsectors + sector_nr) *
fs_info->csum_size;
- ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff,
- csum_buf, csum_expected);
- return ret;
+ btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf);
+ if (unlikely(memcmp(csum_buf, csum_expected, fs_info->csum_size) != 0))
+ return -EIO;
+ return 0;
}
-/*
- * Recover a vertical stripe specified by @sector_nr.
- * @*pointers are the pre-allocated pointers by the caller, so we don't
- * need to allocate/free the pointers again and again.
- */
-static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
- void **pointers, void **unmap_array)
+static void recover_vertical_step(struct btrfs_raid_bio *rbio,
+ unsigned int sector_nr,
+ unsigned int step_nr,
+ int faila, int failb,
+ void **pointers, void **unmap_array)
{
struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
- struct sector_ptr *sector;
- const u32 sectorsize = fs_info->sectorsize;
- int found_errors;
- int faila;
- int failb;
+ const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
int stripe_nr;
- int ret = 0;
- /*
- * Now we just use bitmap to mark the horizontal stripes in
- * which we have data when doing parity scrub.
- */
- if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
- !test_bit(sector_nr, &rbio->dbitmap))
- return 0;
-
- found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
- &failb);
- /*
- * No errors in the vertical stripe, skip it. Can happen for recovery
- * which only part of a stripe failed csum check.
- */
- if (!found_errors)
- return 0;
-
- if (found_errors > rbio->bioc->max_errors)
- return -EIO;
+ ASSERT(step_nr < rbio->sector_nsteps);
+ ASSERT(sector_nr < rbio->stripe_nsectors);
/*
* Setup our array of pointers with sectors from each stripe
@@ -1863,18 +1951,18 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
* pointer order.
*/
for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
+ phys_addr_t paddr;
+
/*
* If we're rebuilding a read, we have to use pages from the
* bio list if possible.
*/
if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
- sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
+ paddr = sector_paddr_in_rbio(rbio, stripe_nr, sector_nr, step_nr, 0);
} else {
- sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
+ paddr = rbio_stripe_paddr(rbio, stripe_nr, sector_nr, step_nr);
}
- ASSERT(sector->page);
- pointers[stripe_nr] = kmap_local_page(sector->page) +
- sector->pgoff;
+ pointers[stripe_nr] = kmap_local_paddr(paddr);
unmap_array[stripe_nr] = pointers[stripe_nr];
}
@@ -1920,10 +2008,10 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
}
if (failb == rbio->real_stripes - 2) {
- raid6_datap_recov(rbio->real_stripes, sectorsize,
+ raid6_datap_recov(rbio->real_stripes, step,
faila, pointers);
} else {
- raid6_2data_recov(rbio->real_stripes, sectorsize,
+ raid6_2data_recov(rbio->real_stripes, step,
faila, failb, pointers);
}
} else {
@@ -1933,7 +2021,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
ASSERT(failb == -1);
pstripe:
/* Copy parity block into failed block to start with */
- memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
+ memcpy(pointers[faila], pointers[rbio->nr_data], step);
/* Rearrange the pointer array */
p = pointers[faila];
@@ -1943,40 +2031,66 @@ pstripe:
pointers[rbio->nr_data - 1] = p;
/* Xor in the rest */
- run_xor(pointers, rbio->nr_data - 1, sectorsize);
-
+ run_xor(pointers, rbio->nr_data - 1, step);
}
+cleanup:
+ for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
+ kunmap_local(unmap_array[stripe_nr]);
+}
+
+/*
+ * Recover a vertical stripe specified by @sector_nr.
+ * @*pointers are the pre-allocated pointers by the caller, so we don't
+ * need to allocate/free the pointers again and again.
+ */
+static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
+ void **pointers, void **unmap_array)
+{
+ int found_errors;
+ int faila;
+ int failb;
+ int ret = 0;
+
/*
- * No matter if this is a RMW or recovery, we should have all
- * failed sectors repaired in the vertical stripe, thus they are now
- * uptodate.
- * Especially if we determine to cache the rbio, we need to
- * have at least all data sectors uptodate.
- *
- * If possible, also check if the repaired sector matches its data
- * checksum.
+ * Now we just use bitmap to mark the horizontal stripes in
+ * which we have data when doing parity scrub.
*/
+ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
+ !test_bit(sector_nr, &rbio->dbitmap))
+ return 0;
+
+ found_errors = get_rbio_vertical_errors(rbio, sector_nr, &faila,
+ &failb);
+ /*
+ * No errors in the vertical stripe, skip it. Can happen for recovery
+ * which only part of a stripe failed csum check.
+ */
+ if (!found_errors)
+ return 0;
+
+ if (unlikely(found_errors > rbio->bioc->max_errors))
+ return -EIO;
+
+ for (int i = 0; i < rbio->sector_nsteps; i++)
+ recover_vertical_step(rbio, sector_nr, i, faila, failb,
+ pointers, unmap_array);
if (faila >= 0) {
ret = verify_one_sector(rbio, faila, sector_nr);
if (ret < 0)
- goto cleanup;
+ return ret;
- sector = rbio_stripe_sector(rbio, faila, sector_nr);
- sector->uptodate = 1;
+ set_bit(rbio_sector_index(rbio, faila, sector_nr),
+ rbio->stripe_uptodate_bitmap);
}
if (failb >= 0) {
ret = verify_one_sector(rbio, failb, sector_nr);
if (ret < 0)
- goto cleanup;
+ return ret;
- sector = rbio_stripe_sector(rbio, failb, sector_nr);
- sector->uptodate = 1;
+ set_bit(rbio_sector_index(rbio, failb, sector_nr),
+ rbio->stripe_uptodate_bitmap);
}
-
-cleanup:
- for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
- kunmap_local(unmap_array[stripe_nr]);
return ret;
}
@@ -2051,7 +2165,7 @@ static void recover_rbio(struct btrfs_raid_bio *rbio)
total_sector_nr++) {
int stripe = total_sector_nr / rbio->stripe_nsectors;
int sectornr = total_sector_nr % rbio->stripe_nsectors;
- struct sector_ptr *sector;
+ phys_addr_t *paddrs;
/*
* Skip the range which has error. It can be a range which is
@@ -2068,8 +2182,8 @@ static void recover_rbio(struct btrfs_raid_bio *rbio)
continue;
}
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
- ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
+ ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe,
sectornr, REQ_OP_READ);
if (ret < 0) {
bio_list_put(&bio_list);
@@ -2114,7 +2228,7 @@ static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_n
int faila;
int failb;
- found_errors = get_rbio_veritical_errors(rbio, sector_nr,
+ found_errors = get_rbio_vertical_errors(rbio, sector_nr,
&faila, &failb);
/* This vertical stripe doesn't have errors. */
if (!found_errors)
@@ -2258,13 +2372,13 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
*/
for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
total_sector_nr++) {
- struct sector_ptr *sector;
int stripe = total_sector_nr / rbio->stripe_nsectors;
int sectornr = total_sector_nr % rbio->stripe_nsectors;
+ phys_addr_t *paddrs;
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
- ret = rbio_add_io_sector(rbio, &bio_list, sector,
- stripe, sectornr, REQ_OP_READ);
+ paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
+ ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe,
+ sectornr, REQ_OP_READ);
if (ret) {
bio_list_put(&bio_list);
return ret;
@@ -2282,9 +2396,8 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
static void raid_wait_write_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
- blk_status_t err = bio->bi_status;
- if (err)
+ if (bio->bi_status)
rbio_update_error_bitmap(rbio, bio);
bio_put(bio);
if (atomic_dec_and_test(&rbio->stripes_pending))
@@ -2319,14 +2432,15 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
int i;
for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
- struct sector_ptr *sector = &rbio->stripe_sectors[i];
+ phys_addr_t paddr = rbio->stripe_paddrs[i * rbio->sector_nsteps];
/*
* We have a sector which doesn't have page nor uptodate,
* thus this rbio can not be cached one, as cached one must
* have all its data sectors present and uptodate.
*/
- if (!sector->page || !sector->uptodate)
+ if (paddr == INVALID_PADDR ||
+ !test_bit(i, rbio->stripe_uptodate_bitmap))
return true;
}
return false;
@@ -2407,8 +2521,8 @@ static void rmw_rbio(struct btrfs_raid_bio *rbio)
for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
int found_errors;
- found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
- if (found_errors > rbio->bioc->max_errors) {
+ found_errors = get_rbio_vertical_errors(rbio, sectornr, NULL, NULL);
+ if (unlikely(found_errors > rbio->bioc->max_errors)) {
ret = -EIO;
break;
}
@@ -2478,46 +2592,121 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
return rbio;
}
+static int alloc_rbio_sector_pages(struct btrfs_raid_bio *rbio,
+ int sector_nr)
+{
+ const u32 step = min(PAGE_SIZE, rbio->bioc->fs_info->sectorsize);
+ const u32 base = sector_nr * rbio->sector_nsteps;
+
+ for (int i = base; i < base + rbio->sector_nsteps; i++) {
+ const unsigned int page_index = (i * step) >> PAGE_SHIFT;
+ struct page *page;
+
+ if (rbio->stripe_pages[page_index])
+ continue;
+ page = alloc_page(GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+ rbio->stripe_pages[page_index] = page;
+ }
+ return 0;
+}
+
/*
* We just scrub the parity that we have correct data on the same horizontal,
* so we needn't allocate all pages for all the stripes.
*/
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
- const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
int total_sector_nr;
for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
total_sector_nr++) {
- struct page *page;
int sectornr = total_sector_nr % rbio->stripe_nsectors;
- int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
+ int ret;
if (!test_bit(sectornr, &rbio->dbitmap))
continue;
- if (rbio->stripe_pages[index])
- continue;
- page = alloc_page(GFP_NOFS);
- if (!page)
- return -ENOMEM;
- rbio->stripe_pages[index] = page;
+ ret = alloc_rbio_sector_pages(rbio, total_sector_nr);
+ if (ret < 0)
+ return ret;
}
index_stripe_sectors(rbio);
return 0;
}
+/* Return true if the content of the step matches the calculated one. */
+static bool verify_one_parity_step(struct btrfs_raid_bio *rbio,
+ void *pointers[], unsigned int sector_nr,
+ unsigned int step_nr)
+{
+ const unsigned int nr_data = rbio->nr_data;
+ const bool has_qstripe = (rbio->real_stripes - rbio->nr_data == 2);
+ const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
+ void *parity;
+ bool ret = false;
+
+ ASSERT(step_nr < rbio->sector_nsteps);
+
+ /* First collect one page from each data stripe. */
+ for (int stripe = 0; stripe < nr_data; stripe++)
+ pointers[stripe] = kmap_local_paddr(
+ sector_paddr_in_rbio(rbio, stripe, sector_nr,
+ step_nr, 0));
+
+ if (has_qstripe) {
+ assert_rbio(rbio);
+ /* RAID6, call the library function to fill in our P/Q. */
+ raid6_call.gen_syndrome(rbio->real_stripes, step, pointers);
+ } else {
+ /* RAID5. */
+ memcpy(pointers[nr_data], pointers[0], step);
+ run_xor(pointers + 1, nr_data - 1, step);
+ }
+
+ /* Check scrubbing parity and repair it. */
+ parity = kmap_local_paddr(rbio_stripe_paddr(rbio, rbio->scrubp, sector_nr, step_nr));
+ if (memcmp(parity, pointers[rbio->scrubp], step) != 0)
+ memcpy(parity, pointers[rbio->scrubp], step);
+ else
+ ret = true;
+ kunmap_local(parity);
+
+ for (int stripe = nr_data - 1; stripe >= 0; stripe--)
+ kunmap_local(pointers[stripe]);
+ return ret;
+}
+
+/*
+ * The @pointers array should have the P/Q parity already mapped.
+ */
+static void verify_one_parity_sector(struct btrfs_raid_bio *rbio,
+ void *pointers[], unsigned int sector_nr)
+{
+ bool found_error = false;
+
+ for (int step_nr = 0; step_nr < rbio->sector_nsteps; step_nr++) {
+ bool match;
+
+ match = verify_one_parity_step(rbio, pointers, sector_nr, step_nr);
+ if (!match)
+ found_error = true;
+ }
+ if (!found_error)
+ bitmap_clear(&rbio->dbitmap, sector_nr, 1);
+}
+
static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
{
struct btrfs_io_context *bioc = rbio->bioc;
- const u32 sectorsize = bioc->fs_info->sectorsize;
void **pointers = rbio->finish_pointers;
unsigned long *pbitmap = &rbio->finish_pbitmap;
int nr_data = rbio->nr_data;
- int stripe;
int sectornr;
bool has_qstripe;
- struct sector_ptr p_sector = { 0 };
- struct sector_ptr q_sector = { 0 };
+ struct page *page;
+ phys_addr_t p_paddr = INVALID_PADDR;
+ phys_addr_t q_paddr = INVALID_PADDR;
struct bio_list bio_list;
int is_replace = 0;
int ret;
@@ -2547,73 +2736,39 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
*/
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
- p_sector.page = alloc_page(GFP_NOFS);
- if (!p_sector.page)
+ page = alloc_page(GFP_NOFS);
+ if (!page)
return -ENOMEM;
- p_sector.pgoff = 0;
- p_sector.uptodate = 1;
+ p_paddr = page_to_phys(page);
+ page = NULL;
+ pointers[nr_data] = kmap_local_paddr(p_paddr);
if (has_qstripe) {
/* RAID6, allocate and map temp space for the Q stripe */
- q_sector.page = alloc_page(GFP_NOFS);
- if (!q_sector.page) {
- __free_page(p_sector.page);
- p_sector.page = NULL;
+ page = alloc_page(GFP_NOFS);
+ if (!page) {
+ __free_page(phys_to_page(p_paddr));
+ p_paddr = INVALID_PADDR;
return -ENOMEM;
}
- q_sector.pgoff = 0;
- q_sector.uptodate = 1;
- pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
+ q_paddr = page_to_phys(page);
+ page = NULL;
+ pointers[rbio->real_stripes - 1] = kmap_local_paddr(q_paddr);
}
bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
/* Map the parity stripe just once */
- pointers[nr_data] = kmap_local_page(p_sector.page);
-
- for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
- struct sector_ptr *sector;
- void *parity;
-
- /* first collect one page from each data stripe */
- for (stripe = 0; stripe < nr_data; stripe++) {
- sector = sector_in_rbio(rbio, stripe, sectornr, 0);
- pointers[stripe] = kmap_local_page(sector->page) +
- sector->pgoff;
- }
-
- if (has_qstripe) {
- assert_rbio(rbio);
- /* RAID6, call the library function to fill in our P/Q */
- raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
- pointers);
- } else {
- /* raid5 */
- memcpy(pointers[nr_data], pointers[0], sectorsize);
- run_xor(pointers + 1, nr_data - 1, sectorsize);
- }
- /* Check scrubbing parity and repair it */
- sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
- parity = kmap_local_page(sector->page) + sector->pgoff;
- if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
- memcpy(parity, pointers[rbio->scrubp], sectorsize);
- else
- /* Parity is right, needn't writeback */
- bitmap_clear(&rbio->dbitmap, sectornr, 1);
- kunmap_local(parity);
-
- for (stripe = nr_data - 1; stripe >= 0; stripe--)
- kunmap_local(pointers[stripe]);
- }
+ for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors)
+ verify_one_parity_sector(rbio, pointers, sectornr);
kunmap_local(pointers[nr_data]);
- __free_page(p_sector.page);
- p_sector.page = NULL;
- if (q_sector.page) {
- kunmap_local(pointers[rbio->real_stripes - 1]);
- __free_page(q_sector.page);
- q_sector.page = NULL;
+ __free_page(phys_to_page(p_paddr));
+ p_paddr = INVALID_PADDR;
+	if (q_paddr != INVALID_PADDR) {
+		kunmap_local(pointers[rbio->real_stripes - 1]);
+		__free_page(phys_to_page(q_paddr));
+		q_paddr = INVALID_PADDR;
}
/*
@@ -2622,10 +2777,10 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
* everything else.
*/
for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
- struct sector_ptr *sector;
+ phys_addr_t *paddrs;
- sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
- ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
+ paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr);
+ ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->scrubp,
sectornr, REQ_OP_WRITE);
if (ret)
goto cleanup;
@@ -2640,11 +2795,10 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
*/
ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio);
for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
- struct sector_ptr *sector;
+ phys_addr_t *paddrs;
- sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
- ret = rbio_add_io_sector(rbio, &bio_list, sector,
- rbio->real_stripes,
+ paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr);
+ ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->real_stripes,
sectornr, REQ_OP_WRITE);
if (ret)
goto cleanup;
@@ -2692,9 +2846,9 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
int failb;
int found_errors;
- found_errors = get_rbio_veritical_errors(rbio, sector_nr,
+ found_errors = get_rbio_vertical_errors(rbio, sector_nr,
&faila, &failb);
- if (found_errors > rbio->bioc->max_errors) {
+ if (unlikely(found_errors > rbio->bioc->max_errors)) {
ret = -EIO;
goto out;
}
@@ -2718,7 +2872,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
* data, so the capability of the repair is declined. (In the
* case of RAID5, we can not repair anything.)
*/
- if (dfail > rbio->bioc->max_errors - 1) {
+ if (unlikely(dfail > rbio->bioc->max_errors - 1)) {
ret = -EIO;
goto out;
}
@@ -2735,7 +2889,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
* scrubbing parity, luckily, use the other one to repair the
* data, or we can not repair the data stripe.
*/
- if (failp != rbio->scrubp) {
+ if (unlikely(failp != rbio->scrubp)) {
ret = -EIO;
goto out;
}
@@ -2761,7 +2915,7 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
total_sector_nr++) {
int sectornr = total_sector_nr % rbio->stripe_nsectors;
int stripe = total_sector_nr / rbio->stripe_nsectors;
- struct sector_ptr *sector;
+ phys_addr_t *paddrs;
/* No data in the vertical stripe, no need to read. */
if (!test_bit(sectornr, &rbio->dbitmap))
@@ -2769,22 +2923,23 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
/*
* We want to find all the sectors missing from the rbio and
- * read them from the disk. If sector_in_rbio() finds a sector
+ * read them from the disk. If sector_paddrs_in_rbio() finds a sector
* in the bio list we don't need to read it off the stripe.
*/
- sector = sector_in_rbio(rbio, stripe, sectornr, 1);
- if (sector)
+ paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1);
+ if (paddrs)
continue;
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
/*
* The bio cache may have handed us an uptodate sector. If so,
* use it.
*/
- if (sector->uptodate)
+ if (test_bit(rbio_sector_index(rbio, stripe, sectornr),
+ rbio->stripe_uptodate_bitmap))
continue;
- ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe,
sectornr, REQ_OP_READ);
if (ret) {
bio_list_put(&bio_list);
@@ -2825,8 +2980,8 @@ static void scrub_rbio(struct btrfs_raid_bio *rbio)
for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
int found_errors;
- found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
- if (found_errors > rbio->bioc->max_errors) {
+ found_errors = get_rbio_vertical_errors(rbio, sector_nr, NULL, NULL);
+ if (unlikely(found_errors > rbio->bioc->max_errors)) {
ret = -EIO;
break;
}
@@ -2850,17 +3005,17 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
* This is for scrub call sites where we already have correct data contents.
* This allows us to avoid reading data stripes again.
*
- * Unfortunately here we have to do page copy, other than reusing the pages.
+ * Unfortunately here we have to do a folio copy, instead of reusing the pages.
* This is due to the fact rbio has its own page management for its cache.
*/
-void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
- struct page **data_pages, u64 data_logical)
+void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio,
+ struct folio **data_folios, u64 data_logical)
{
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
const u64 offset_in_full_stripe = data_logical -
rbio->bioc->full_stripe_logical;
- const int page_index = offset_in_full_stripe >> PAGE_SHIFT;
- const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- const u32 sectors_per_page = PAGE_SIZE / sectorsize;
+ unsigned int findex = 0;
+ unsigned int foffset = 0;
int ret;
/*
@@ -2879,14 +3034,24 @@ void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN));
ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT));
- for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) {
- struct page *dst = rbio->stripe_pages[page_nr + page_index];
- struct page *src = data_pages[page_nr];
-
- memcpy_page(dst, 0, src, 0, PAGE_SIZE);
- for (int sector_nr = sectors_per_page * page_index;
- sector_nr < sectors_per_page * (page_index + 1);
- sector_nr++)
- rbio->stripe_sectors[sector_nr].uptodate = true;
+ for (unsigned int cur_off = offset_in_full_stripe;
+ cur_off < offset_in_full_stripe + BTRFS_STRIPE_LEN;
+ cur_off += PAGE_SIZE) {
+ const unsigned int pindex = cur_off >> PAGE_SHIFT;
+ void *kaddr;
+
+ kaddr = kmap_local_page(rbio->stripe_pages[pindex]);
+ memcpy_from_folio(kaddr, data_folios[findex], foffset, PAGE_SIZE);
+ kunmap_local(kaddr);
+
+ foffset += PAGE_SIZE;
+ ASSERT(foffset <= folio_size(data_folios[findex]));
+ if (foffset == folio_size(data_folios[findex])) {
+ findex++;
+ foffset = 0;
+ }
}
+ bitmap_set(rbio->stripe_uptodate_bitmap,
+ offset_in_full_stripe >> fs_info->sectorsize_bits,
+ BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
}
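/*
 * Editor's sketch (not part of the patch): the findex/foffset cursor used
 * in raid56_parity_cache_data_folios() above, generalized. It copies @len
 * bytes from an array of (possibly large) folios into PAGE_SIZE chunks,
 * advancing to the next folio whenever the current one is exhausted. The
 * function name and parameters are illustrative only.
 */
#include <linux/highmem.h>

static void copy_from_folio_array_sketch(struct page **dst,
					 struct folio **folios, size_t len)
{
	unsigned int findex = 0;	/* current source folio */
	size_t foffset = 0;		/* offset inside that folio */

	for (size_t copied = 0; copied < len; copied += PAGE_SIZE) {
		void *kaddr = kmap_local_page(dst[copied >> PAGE_SHIFT]);

		memcpy_from_folio(kaddr, folios[findex], foffset, PAGE_SIZE);
		kunmap_local(kaddr);

		/* Move to the next folio once this one is fully consumed. */
		foffset += PAGE_SIZE;
		if (foffset == folio_size(folios[findex])) {
			findex++;
			foffset = 0;
		}
	}
}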
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 0d7b4c2fb6ae..1f463ecf7e41 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -16,7 +16,6 @@
#include "volumes.h"
struct page;
-struct sector_ptr;
struct btrfs_fs_info;
enum btrfs_rbio_ops {
@@ -25,6 +24,84 @@ enum btrfs_rbio_ops {
BTRFS_RBIO_PARITY_SCRUB,
};
+/*
+ * Overview of btrfs_raid_bio.
+ *
+ * One btrfs_raid_bio represents a full stripe of RAID56, including both data
+ * and P/Q stripes. For now, each data and P/Q stripe is of a fixed length (64K).
+ *
+ * One btrfs_raid_bio can have one or more bios from the higher layer, covering
+ * part or all of the data stripes.
+ *
+ * [PAGES FROM HIGHER LAYER BIOS]
+ * Higher layer bios are in the btrfs_raid_bio::bio_list.
+ *
+ * Pages from the bio_list are represented like the following:
+ *
+ * bio_list: |<- Bio 1 ->| |<- Bio 2 ->| ...
+ * bio_paddrs: [0] [1] [2] [3] [4] [5] ...
+ *
+ * If there is a bio covering a sector (one btrfs fs block), the corresponding
+ * entry in btrfs_raid_bio::bio_paddrs[] will hold the physical address
+ * (including the offset inside the page) of that sector's data in the bio.
+ *
+ * If there is no bio covering a sector, then btrfs_raid_bio::bio_paddrs[i] will
+ * be INVALID_PADDR.
+ *
+ * Each entry in bio_paddrs[] covers one step (aka, min(sectorsize, PAGE_SIZE)).
+ *
+ * [PAGES FOR INTERNAL USAGES]
+ * Pages not covered by any bio or belonging to P/Q stripes are stored in
+ * btrfs_raid_bio::stripe_pages[] and stripe_paddrs[], like the following:
+ *
+ * stripe_pages: |<- Page 0 ->|<- Page 1 ->| ...
+ * stripe_paddrs: [0] [1] [2] [3] [4] ...
+ *
+ * stripe_pages[] array stores all the pages covering the full stripe, including
+ * data and P/Q pages.
+ * stripe_pages[0] is the first page of the first data stripe.
+ * stripe_pages[BTRFS_STRIPE_LEN / PAGE_SIZE] is the first page of the second
+ * data stripe.
+ *
+ * Some pointers inside stripe_pages[] can be NULL, e.g. for a full stripe write
+ * (the bio covers all data stripes) there is no need to allocate pages for
+ * data stripes (we can grab them from bio_paddrs[] instead).
+ *
+ * If the corresponding page of stripe_paddrs[i] is not allocated, the value of
+ * stripe_paddrs[i] will be INVALID_PADDR.
+ *
+ * Each entry in stripe_paddrs[] also covers one step.
+ *
+ * [LOCATING A SECTOR]
+ * To locate a sector for IO, we need the following info:
+ *
+ * - stripe_nr
+ * Starts from 0 (representing the first data stripe), ends at
+ * @nr_data (RAID5, P stripe) or @nr_data + 1 (RAID6, Q stripe).
+ *
+ * - sector_nr
+ * Starts from 0 (representing the first sector of the stripe), ends
+ * at BTRFS_STRIPE_LEN / sectorsize - 1.
+ *
+ * - step_nr
+ * A step is min(sectorsize, PAGE_SIZE).
+ *
+ * Starts from 0 (representing the first step of the sector), ends
+ * at @sector_nsteps - 1.
+ *
+ * Most call sites do not need to bother with this parameter.
+ * It is only needed for bs > ps support, and only for vertical stripe
+ * related work (e.g. RMW/recover).
+ *
+ * - from which array
+ * Whether to grab from stripe_paddrs[] (aka, the internal pages) or from
+ * bio_paddrs[] (aka, the higher layer bios).
+ *
+ * For IO, a physical address is returned, so that we can extract the page and
+ * the offset inside the page for the IO.
+ * The special value INVALID_PADDR indicates an invalid physical address,
+ * normally meaning there is no page allocated for the specified sector.
+ */
struct btrfs_raid_bio {
struct btrfs_io_context *bioc;
@@ -82,6 +159,14 @@ struct btrfs_raid_bio {
/* How many sectors there are for each stripe */
u8 stripe_nsectors;
+ /*
+ * How many steps there are for one sector.
+ *
+ * For bs > ps cases, it's sectorsize / PAGE_SIZE.
+ * For bs <= ps cases, it's always 1.
+ */
+ u8 sector_nsteps;
+
/* Stripe number that we're scrubbing */
u8 scrubp;
@@ -116,13 +201,13 @@ struct btrfs_raid_bio {
struct page **stripe_pages;
/* Pointers to the sectors in the bio_list, for faster lookup */
- struct sector_ptr *bio_sectors;
+ phys_addr_t *bio_paddrs;
- /*
- * For subpage support, we need to map each sector to above
- * stripe_pages.
- */
- struct sector_ptr *stripe_sectors;
+ /* Pointers to the sectors in the stripe_pages[]. */
+ phys_addr_t *stripe_paddrs;
+
+ /* Each set bit means the corresponding sector in stripe_paddrs[] is uptodate. */
+ unsigned long *stripe_uptodate_bitmap;
/* Allocated with real_stripes-many pointers for finish_*() calls */
void **finish_pointers;
@@ -131,10 +216,6 @@ struct btrfs_raid_bio {
* The bitmap recording where IO errors happened.
* Each bit is corresponding to one sector in either bio_sectors[] or
* stripe_sectors[] array.
- *
- * The reason we don't use another bit in sector_ptr is, we have two
- * arrays of sectors, and a lot of IO can use sectors in both arrays.
- * Thus making it much harder to iterate.
*/
unsigned long *error_bitmap;
@@ -201,8 +282,8 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
-void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
- struct page **data_pages, u64 data_logical);
+void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio,
+ struct folio **data_folios, u64 data_logical);
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
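/*
 * Editor's sketch (not part of the patch): the index math implied by the
 * [LOCATING A SECTOR] overview above. Sectors are numbered stripe by
 * stripe, and each sector owns @sector_nsteps consecutive entries in the
 * paddr arrays. The series uses helpers with these names
 * (rbio_sector_index(), rbio_stripe_paddrs()); the bodies below are
 * illustrative reconstructions, not the in-tree definitions.
 */
static unsigned int rbio_sector_index_sketch(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	return stripe_nr * rbio->stripe_nsectors + sector_nr;
}

static phys_addr_t *rbio_stripe_paddrs_sketch(struct btrfs_raid_bio *rbio,
					      unsigned int stripe_nr,
					      unsigned int sector_nr)
{
	unsigned int index = rbio_sector_index_sketch(rbio, stripe_nr, sector_nr);

	/* One entry per step; a sector spans sector_nsteps entries. */
	return &rbio->stripe_paddrs[index * rbio->sector_nsteps];
}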
diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h
deleted file mode 100644
index 1c2d7cb1fe6f..000000000000
--- a/fs/btrfs/rcu-string.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2012 Red Hat. All rights reserved.
- */
-
-#ifndef BTRFS_RCU_STRING_H
-#define BTRFS_RCU_STRING_H
-
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/rcupdate.h>
-#include <linux/printk.h>
-
-struct rcu_string {
- struct rcu_head rcu;
- char str[];
-};
-
-static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask)
-{
- size_t len = strlen(src) + 1;
- struct rcu_string *ret = kzalloc(sizeof(struct rcu_string) +
- (len * sizeof(char)), mask);
- if (!ret)
- return ret;
- /* Warn if the source got unexpectedly truncated. */
- if (WARN_ON(strscpy(ret->str, src, len) < 0)) {
- kfree(ret);
- return NULL;
- }
- return ret;
-}
-
-static inline void rcu_string_free(struct rcu_string *str)
-{
- if (str)
- kfree_rcu(str, rcu);
-}
-
-#define printk_in_rcu(fmt, ...) do { \
- rcu_read_lock(); \
- printk(fmt, __VA_ARGS__); \
- rcu_read_unlock(); \
-} while (0)
-
-#define printk_ratelimited_in_rcu(fmt, ...) do { \
- rcu_read_lock(); \
- printk_ratelimited(fmt, __VA_ARGS__); \
- rcu_read_unlock(); \
-} while (0)
-
-#define rcu_str_deref(rcu_str) ({ \
- struct rcu_string *__str = rcu_dereference(rcu_str); \
- __str->str; \
-})
-
-#endif
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 2928abf7eb82..e9224145d754 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -75,69 +75,70 @@ struct block_entry {
struct list_head actions;
};
+static int block_entry_bytenr_key_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *bytenr = key;
+ const struct block_entry *entry = rb_entry(node, struct block_entry, node);
+
+ if (entry->bytenr < *bytenr)
+ return 1;
+ else if (entry->bytenr > *bytenr)
+ return -1;
+
+ return 0;
+}
+
+static int block_entry_bytenr_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct block_entry *new_entry = rb_entry(new, struct block_entry, node);
+
+ return block_entry_bytenr_key_cmp(&new_entry->bytenr, existing);
+}
+
static struct block_entry *insert_block_entry(struct rb_root *root,
struct block_entry *be)
{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent_node = NULL;
- struct block_entry *entry;
-
- while (*p) {
- parent_node = *p;
- entry = rb_entry(parent_node, struct block_entry, node);
- if (entry->bytenr > be->bytenr)
- p = &(*p)->rb_left;
- else if (entry->bytenr < be->bytenr)
- p = &(*p)->rb_right;
- else
- return entry;
- }
+ struct rb_node *node;
- rb_link_node(&be->node, parent_node, p);
- rb_insert_color(&be->node, root);
- return NULL;
+ node = rb_find_add(&be->node, root, block_entry_bytenr_cmp);
+ return rb_entry_safe(node, struct block_entry, node);
}
static struct block_entry *lookup_block_entry(struct rb_root *root, u64 bytenr)
{
- struct rb_node *n;
- struct block_entry *entry = NULL;
+ struct rb_node *node;
- n = root->rb_node;
- while (n) {
- entry = rb_entry(n, struct block_entry, node);
- if (entry->bytenr < bytenr)
- n = n->rb_right;
- else if (entry->bytenr > bytenr)
- n = n->rb_left;
- else
- return entry;
- }
- return NULL;
+ node = rb_find(&bytenr, root, block_entry_bytenr_key_cmp);
+ return rb_entry_safe(node, struct block_entry, node);
+}
+
+static int root_entry_root_objectid_key_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *objectid = key;
+ const struct root_entry *entry = rb_entry(node, struct root_entry, node);
+
+ if (entry->root_objectid < *objectid)
+ return 1;
+ else if (entry->root_objectid > *objectid)
+ return -1;
+
+ return 0;
+}
+
+static int root_entry_root_objectid_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct root_entry *new_entry = rb_entry(new, struct root_entry, node);
+
+ return root_entry_root_objectid_key_cmp(&new_entry->root_objectid, existing);
}
static struct root_entry *insert_root_entry(struct rb_root *root,
struct root_entry *re)
{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent_node = NULL;
- struct root_entry *entry;
-
- while (*p) {
- parent_node = *p;
- entry = rb_entry(parent_node, struct root_entry, node);
- if (entry->root_objectid > re->root_objectid)
- p = &(*p)->rb_left;
- else if (entry->root_objectid < re->root_objectid)
- p = &(*p)->rb_right;
- else
- return entry;
- }
-
- rb_link_node(&re->node, parent_node, p);
- rb_insert_color(&re->node, root);
- return NULL;
+ struct rb_node *node;
+ node = rb_find_add(&re->node, root, root_entry_root_objectid_cmp);
+ return rb_entry_safe(node, struct root_entry, node);
}
static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2)
@@ -161,48 +162,29 @@ static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2)
return 0;
}
+static int ref_entry_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ struct ref_entry *new_entry = rb_entry(new, struct ref_entry, node);
+ struct ref_entry *existing_entry = rb_entry(existing, struct ref_entry, node);
+
+ return comp_refs(new_entry, existing_entry);
+}
+
static struct ref_entry *insert_ref_entry(struct rb_root *root,
struct ref_entry *ref)
{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent_node = NULL;
- struct ref_entry *entry;
- int cmp;
-
- while (*p) {
- parent_node = *p;
- entry = rb_entry(parent_node, struct ref_entry, node);
- cmp = comp_refs(entry, ref);
- if (cmp > 0)
- p = &(*p)->rb_left;
- else if (cmp < 0)
- p = &(*p)->rb_right;
- else
- return entry;
- }
-
- rb_link_node(&ref->node, parent_node, p);
- rb_insert_color(&ref->node, root);
- return NULL;
+ struct rb_node *node;
+ node = rb_find_add(&ref->node, root, ref_entry_cmp);
+ return rb_entry_safe(node, struct ref_entry, node);
}
static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid)
{
- struct rb_node *n;
- struct root_entry *entry = NULL;
+ struct rb_node *node;
- n = root->rb_node;
- while (n) {
- entry = rb_entry(n, struct root_entry, node);
- if (entry->root_objectid < objectid)
- n = n->rb_right;
- else if (entry->root_objectid > objectid)
- n = n->rb_left;
- else
- return entry;
- }
- return NULL;
+ node = rb_find(&objectid, root, root_entry_root_objectid_key_cmp);
+ return rb_entry_safe(node, struct root_entry, node);
}
#ifdef CONFIG_STACKTRACE
@@ -668,7 +650,7 @@ static void dump_block_entry(struct btrfs_fs_info *fs_info,
* our sanity checks pass as they are no longer needed.
*/
int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *generic_ref)
+ const struct btrfs_ref *generic_ref)
{
struct ref_entry *ref = NULL, *exist;
struct ref_action *ra = NULL;
@@ -989,7 +971,7 @@ void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *extent_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *eb;
int tree_block_level = 0;
u64 bytenr = 0, num_bytes = 0;
@@ -998,11 +980,18 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
if (!btrfs_test_opt(fs_info, REF_VERIFY))
return 0;
+ extent_root = btrfs_extent_root(fs_info, 0);
+ /* The extent root can be NULL if the extent tree was damaged and ignored (rescue=ignorebadroots). */
+ if (!extent_root) {
+ btrfs_warn(fs_info, "ref-verify: extent tree not available, disabling");
+ btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+ return 0;
+ }
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- extent_root = btrfs_extent_root(fs_info, 0);
eb = btrfs_read_lock_root_node(extent_root);
level = btrfs_header_level(eb);
path->nodes[level] = eb;
@@ -1032,6 +1021,5 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
btrfs_free_ref_cache(fs_info);
btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
}
- btrfs_free_path(path);
return ret;
}
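/*
 * Editor's sketch (not part of the patch): the rb_find_add()/rb_find()
 * pattern the rewrite above converts to, shown with an illustrative
 * u64-keyed entry type. rb_find_add() returns the already-present node on
 * collision (without inserting) and NULL when the new node was inserted;
 * the key comparator returns >0 when the key sorts after the node and <0
 * when it sorts before, matching the comparators above.
 */
#include <linux/rbtree.h>
#include <linux/types.h>

struct demo_entry {
	struct rb_node node;
	u64 key;
};

static int demo_key_cmp(const void *key, const struct rb_node *node)
{
	const u64 *k = key;
	const struct demo_entry *entry = rb_entry(node, struct demo_entry, node);

	if (entry->key < *k)
		return 1;	/* key sorts after this node: go right */
	if (entry->key > *k)
		return -1;	/* key sorts before this node: go left */
	return 0;
}

static int demo_cmp(struct rb_node *new, const struct rb_node *existing)
{
	const struct demo_entry *new_entry = rb_entry(new, struct demo_entry, node);

	return demo_key_cmp(&new_entry->key, existing);
}

static struct demo_entry *demo_insert(struct rb_root *root, struct demo_entry *e)
{
	/* Returns the colliding entry, or NULL if @e was inserted. */
	return rb_entry_safe(rb_find_add(&e->node, root, demo_cmp),
			     struct demo_entry, node);
}

static struct demo_entry *demo_lookup(struct rb_root *root, u64 key)
{
	return rb_entry_safe(rb_find(&key, root, demo_key_cmp),
			     struct demo_entry, node);
}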
diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h
index 3511e1a5c96b..1ce544d53cc5 100644
--- a/fs/btrfs/ref-verify.h
+++ b/fs/btrfs/ref-verify.h
@@ -12,14 +12,14 @@
struct btrfs_fs_info;
struct btrfs_ref;
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+#ifdef CONFIG_BTRFS_DEBUG
#include <linux/spinlock.h>
int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info);
void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info);
int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *generic_ref);
+ const struct btrfs_ref *generic_ref);
void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
u64 len);
@@ -39,7 +39,7 @@ static inline void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info)
}
static inline int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *generic_ref)
+ const struct btrfs_ref *generic_ref)
{
return 0;
}
@@ -53,6 +53,6 @@ static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info)
{
}
-#endif /* CONFIG_BTRFS_FS_REF_VERIFY */
+#endif /* CONFIG_BTRFS_DEBUG */
#endif
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 15c296cb4dac..b5fe95baf92e 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/blkdev.h>
+#include <linux/fscrypt.h>
#include <linux/iversion.h>
#include "ctree.h"
#include "fs.h"
@@ -23,7 +24,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
u64 endoff,
const u64 destoff,
const u64 olen,
- int no_time_update)
+ bool no_time_update)
{
int ret;
@@ -43,14 +44,12 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
}
ret = btrfs_update_inode(trans, BTRFS_I(inode));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
- goto out;
+ return ret;
}
- ret = btrfs_end_transaction(trans);
-out:
- return ret;
+ return btrfs_end_transaction(trans);
}
static int copy_inline_to_page(struct btrfs_inode *inode,
@@ -87,7 +86,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
btrfs_alloc_write_mask(mapping));
if (IS_ERR(folio)) {
- ret = -ENOMEM;
+ ret = PTR_ERR(folio);
goto out_unlock;
}
@@ -95,9 +94,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
if (ret < 0)
goto out_unlock;
- clear_extent_bit(&inode->io_tree, file_offset, range_end,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- NULL);
+ btrfs_clear_extent_bit(&inode->io_tree, file_offset, range_end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, NULL);
ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
if (ret)
goto out_unlock;
@@ -271,11 +269,15 @@ copy_inline_extent:
drop_args.end = aligned_end;
drop_args.drop_cache = true;
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
- if (ret)
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
- if (ret)
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
write_extent_buffer(path->nodes[0], inline_data,
btrfs_item_ptr_offset(path->nodes[0],
@@ -284,6 +286,8 @@ copy_inline_extent:
btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found);
btrfs_set_inode_full_sync(inode);
ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end);
+ if (unlikely(ret))
+ btrfs_abort_transaction(trans, ret);
out:
if (!ret && !trans) {
/*
@@ -298,10 +302,8 @@ out:
trans = NULL;
}
}
- if (ret && trans) {
- btrfs_abort_transaction(trans, ret);
+ if (ret && trans)
btrfs_end_transaction(trans);
- }
if (!ret)
*trans_out = trans;
@@ -336,13 +338,13 @@ copy_to_page:
*/
static int btrfs_clone(struct inode *src, struct inode *inode,
const u64 off, const u64 olen, const u64 olen_aligned,
- const u64 destoff, int no_time_update)
+ const u64 destoff, bool no_time_update)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_trans_handle *trans;
- char *buf = NULL;
+ char AUTO_KVFREE(buf);
struct btrfs_key key;
u32 nritems;
int slot;
@@ -357,10 +359,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
return ret;
path = btrfs_alloc_path();
- if (!path) {
- kvfree(buf);
+ if (!path)
return ret;
- }
path->reada = READA_FORWARD;
/* Clone data */
@@ -610,8 +610,6 @@ process_slot:
}
out:
- btrfs_free_path(path);
- kvfree(buf);
clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);
return ret;
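/*
 * Editor's sketch (not part of the patch): the scope-based cleanup the
 * btrfs_clone() conversion above relies on. Assuming AUTO_KVFREE() and
 * BTRFS_PATH_AUTO_FREE() wrap the generic __free() cleanup attribute from
 * <linux/cleanup.h>, every return path frees the buffer automatically, so
 * the explicit kvfree()/btrfs_free_path() calls and the "out:" label can
 * go away. The function below is illustrative only.
 */
#include <linux/cleanup.h>
#include <linux/slab.h>
#include <linux/string.h>

static int demo_auto_free(size_t len)
{
	char *buf __free(kvfree) = kvmalloc(len, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;
	if (len < 16)
		return -EINVAL;	/* buf is kvfree()d on this path too */

	memset(buf, 0, len);
	return 0;		/* ... and on the success path */
}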
@@ -646,10 +644,10 @@ static int btrfs_extent_same_range(struct btrfs_inode *src, u64 loff, u64 len,
* because we have already locked the inode's i_mmap_lock in exclusive
* mode.
*/
- lock_extent(&dst->io_tree, dst_loff, end, &cached_state);
+ btrfs_lock_extent(&dst->io_tree, dst_loff, end, &cached_state);
ret = btrfs_clone(&src->vfs_inode, &dst->vfs_inode, loff, len,
ALIGN(len, bs), dst_loff, 1);
- unlock_extent(&dst->io_tree, dst_loff, end, &cached_state);
+ btrfs_unlock_extent(&dst->io_tree, dst_loff, end, &cached_state);
btrfs_btree_balance_dirty(fs_info);
@@ -749,9 +747,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
* mode.
*/
end = destoff + len - 1;
- lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
+ btrfs_lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
- unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
/*
* We may have copied an inline extent into a page of the destination
@@ -792,6 +790,10 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
ASSERT(inode_in->vfs_inode.i_sb == inode_out->vfs_inode.i_sb);
}
+ /* Can only reflink encrypted files if both files are encrypted. */
+ if (IS_ENCRYPTED(&inode_in->vfs_inode) != IS_ENCRYPTED(&inode_out->vfs_inode))
+ return -EINVAL;
+
/* Don't make the dst file partly checksummed */
if ((inode_in->flags & BTRFS_INODE_NODATASUM) !=
(inode_out->flags & BTRFS_INODE_NODATASUM)) {
@@ -868,6 +870,9 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
bool same_inode = dst_inode == src_inode;
int ret;
+ if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(src_file)))))
+ return -EIO;
+
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index f948f4f6431c..5bfefc3e9c06 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -90,10 +90,15 @@
* map address of tree root to tree
*/
struct mapping_node {
- struct {
- struct rb_node rb_node;
- u64 bytenr;
- }; /* Use rb_simle_node for search/insert */
+ union {
+ /* Use rb_simple_node for search/insert */
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ };
+
+ struct rb_simple_node simple_node;
+ };
void *data;
};
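/*
 * Editor's sketch (not part of the patch): the union overlays the existing
 * anonymous struct with btrfs' struct rb_simple_node, defined in misc.h as
 * { struct rb_node rb_node; u64 bytenr; }. Existing code keeps using
 * node->bytenr and node->rb_node unchanged, while rb_simple_insert() (which
 * this series changes to take a single typed argument) gets
 * &node->simple_node. This is only safe because both union members have
 * identical layout.
 */
struct demo_mapping {
	union {
		/* Existing field names keep working... */
		struct {
			struct rb_node rb_node;
			u64 bytenr;
		};
		/* ...while this view feeds rb_simple_insert() directly. */
		struct rb_simple_node simple_node;
	};
	void *data;
};

/* Illustrative call site, mirroring __add_reloc_root() below: */
static struct rb_node *demo_simple_insert(struct rb_root *root,
					  struct demo_mapping *node)
{
	return rb_simple_insert(root, &node->simple_node);
}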
@@ -106,10 +111,15 @@ struct mapping_tree {
* present a tree block to process
*/
struct tree_block {
- struct {
- struct rb_node rb_node;
- u64 bytenr;
- }; /* Use rb_simple_node for search/insert */
+ union {
+ /* Use rb_simple_node for search/insert */
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ };
+
+ struct rb_simple_node simple_node;
+ };
u64 owner;
struct btrfs_key key;
u8 level;
@@ -178,8 +188,9 @@ static void mark_block_processed(struct reloc_control *rc,
in_range(node->bytenr, rc->block_group->start,
rc->block_group->length)) {
blocksize = rc->extent_root->fs_info->nodesize;
- set_extent_bit(&rc->processed_blocks, node->bytenr,
- node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL);
+ btrfs_set_extent_bit(&rc->processed_blocks, node->bytenr,
+ node->bytenr + blocksize - 1, EXTENT_DIRTY,
+ NULL);
}
node->processed = 1;
}
@@ -195,8 +206,8 @@ static struct btrfs_backref_node *walk_up_backref(
int idx = *index;
while (!list_empty(&node->upper)) {
- edge = list_entry(node->upper.next,
- struct btrfs_backref_edge, list[LOWER]);
+ edge = list_first_entry(&node->upper, struct btrfs_backref_edge,
+ list[LOWER]);
edges[idx++] = edge;
node = edge->node[UPPER];
}
@@ -222,8 +233,8 @@ static struct btrfs_backref_node *walk_down_backref(
idx--;
continue;
}
- edge = list_entry(edge->list[LOWER].next,
- struct btrfs_backref_edge, list[LOWER]);
+ edge = list_first_entry(&edge->list[LOWER], struct btrfs_backref_edge,
+ list[LOWER]);
edges[idx - 1] = edge;
*index = idx;
return edge->node[UPPER];
@@ -347,8 +358,8 @@ static bool handle_useless_nodes(struct reloc_control *rc,
struct btrfs_backref_edge *edge;
struct btrfs_backref_node *lower;
- edge = list_entry(cur->lower.next,
- struct btrfs_backref_edge, list[UPPER]);
+ edge = list_first_entry(&cur->lower, struct btrfs_backref_edge,
+ list[UPPER]);
list_del(&edge->list[UPPER]);
list_del(&edge->list[LOWER]);
lower = edge->node[LOWER];
@@ -479,8 +490,7 @@ static int __add_reloc_root(struct btrfs_root *root)
node->data = root;
spin_lock(&rc->reloc_root_tree.lock);
- rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
- node->bytenr, &node->rb_node);
+ rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node);
spin_unlock(&rc->reloc_root_tree.lock);
if (rb_node) {
btrfs_err(fs_info,
@@ -501,7 +511,7 @@ static void __del_reloc_root(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *rb_node;
- struct mapping_node *node = NULL;
+ struct mapping_node AUTO_KFREE(node);
struct reloc_control *rc = fs_info->reloc_ctl;
bool put_ref = false;
@@ -534,7 +544,6 @@ static void __del_reloc_root(struct btrfs_root *root)
spin_unlock(&fs_info->trans_lock);
if (put_ref)
btrfs_put_root(root);
- kfree(node);
}
/*
@@ -563,8 +572,7 @@ static int __update_reloc_root(struct btrfs_root *root)
spin_lock(&rc->reloc_root_tree.lock);
node->bytenr = root->node->start;
- rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
- node->bytenr, &node->rb_node);
+ rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node);
spin_unlock(&rc->reloc_root_tree.lock);
if (rb_node)
btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
@@ -577,10 +585,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *reloc_root;
struct extent_buffer *eb;
- struct btrfs_root_item *root_item;
+ struct btrfs_root_item AUTO_KFREE(root_item);
struct btrfs_key root_key;
int ret = 0;
- bool must_abort = false;
root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
if (!root_item)
@@ -593,11 +600,29 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
if (btrfs_root_id(root) == objectid) {
u64 commit_root_gen;
+ /*
+ * Relocation will wait for cleaner thread, and any half-dropped
+ * subvolume will be fully cleaned up at mount time.
+ * So here we shouldn't hit a subvolume with non-zero drop_progress.
+ *
+ * If this isn't the case, error out since it can make us attempt to
+ * drop references for extents that were already dropped before.
+ */
+ if (unlikely(btrfs_disk_key_objectid(&root->root_item.drop_progress))) {
+ struct btrfs_key cpu_key;
+
+ btrfs_disk_key_to_cpu(&cpu_key, &root->root_item.drop_progress);
+ btrfs_err(fs_info,
+ "cannot relocate partially dropped subvolume %llu, drop progress key " BTRFS_KEY_FMT,
+ objectid, BTRFS_KEY_FMT_VALUE(&cpu_key));
+ return ERR_PTR(-EUCLEAN);
+ }
+
/* called by btrfs_init_reloc_root */
ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
BTRFS_TREE_RELOC_OBJECTID);
if (ret)
- goto fail;
+ return ERR_PTR(ret);
/*
* Set the last_snapshot field to the generation of the commit
@@ -620,14 +645,13 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
ret = btrfs_copy_root(trans, root, root->node, &eb,
BTRFS_TREE_RELOC_OBJECTID);
if (ret)
- goto fail;
+ return ERR_PTR(ret);
}
/*
* We have changed references at this point, we must abort the
- * transaction if anything fails.
+ * transaction if anything fails (i.e. 'goto abort').
*/
- must_abort = true;
memcpy(root_item, &root->root_item, sizeof(*root_item));
btrfs_set_root_bytenr(root_item, eb->start);
@@ -647,9 +671,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
ret = btrfs_insert_root(trans, fs_info->tree_root,
&root_key, root_item);
if (ret)
- goto fail;
-
- kfree(root_item);
+ goto abort;
reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
if (IS_ERR(reloc_root)) {
@@ -659,11 +681,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
btrfs_set_root_last_trans(reloc_root, trans->transid);
return reloc_root;
-fail:
- kfree(root_item);
+
abort:
- if (must_abort)
- btrfs_abort_transaction(trans, ret);
+ btrfs_abort_transaction(trans, ret);
return ERR_PTR(ret);
}
@@ -793,7 +813,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
u64 bytenr, u64 num_bytes)
{
struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
int ret;
@@ -806,11 +826,9 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
ret = btrfs_lookup_file_extent(NULL, root, path,
btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
if (ret < 0)
- goto out;
- if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
+ return ret;
+ if (ret > 0)
+ return -ENOENT;
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -821,16 +839,11 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
btrfs_file_extent_encryption(leaf, fi) ||
btrfs_file_extent_other_encoding(leaf, fi));
- if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
- ret = -EINVAL;
- goto out;
- }
+ if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi))
+ return -EINVAL;
*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
/*
@@ -910,16 +923,16 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
/* Take mmap lock to serialize with reflinks. */
if (!down_read_trylock(&inode->i_mmap_lock))
continue;
- ret = try_lock_extent(&inode->io_tree, key.offset,
- end, &cached_state);
+ ret = btrfs_try_lock_extent(&inode->io_tree, key.offset,
+ end, &cached_state);
if (!ret) {
up_read(&inode->i_mmap_lock);
continue;
}
btrfs_drop_extent_map_range(inode, key.offset, end, true);
- unlock_extent(&inode->io_tree, key.offset, end,
- &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, key.offset, end,
+ &cached_state);
up_read(&inode->i_mmap_lock);
}
}
@@ -946,7 +959,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
btrfs_init_data_ref(&ref, key.objectid, key.offset,
btrfs_root_id(root), false);
ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -960,7 +973,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
btrfs_init_data_ref(&ref, key.objectid, key.offset,
btrfs_root_id(root), false);
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1171,7 +1184,7 @@ again:
ref.ref_root = btrfs_root_id(src);
btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1184,7 +1197,7 @@ again:
ref.ref_root = btrfs_root_id(dest);
btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1198,7 +1211,7 @@ again:
ref.ref_root = btrfs_root_id(src);
btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1212,7 +1225,7 @@ again:
ref.ref_root = btrfs_root_id(dest);
btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1378,9 +1391,9 @@ static int invalidate_extent_cache(struct btrfs_root *root,
}
/* the lock_extent waits for read_folio to complete */
- lock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, start, end, &cached_state);
btrfs_drop_extent_map_range(inode, start, end, true);
- unlock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
}
return 0;
}
@@ -1462,7 +1475,7 @@ static int clean_dirty_subvols(struct reloc_control *rc)
* ->reloc_root. If it fails however we must
* drop the ref ourselves.
*/
- ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
+ ret2 = btrfs_drop_snapshot(reloc_root, false, true);
if (ret2 < 0) {
btrfs_put_root(reloc_root);
if (!ret)
@@ -1472,7 +1485,7 @@ static int clean_dirty_subvols(struct reloc_control *rc)
btrfs_put_root(root);
} else {
/* Orphan reloc tree, just clean it up */
- ret2 = btrfs_drop_snapshot(root, 0, 1);
+ ret2 = btrfs_drop_snapshot(root, false, true);
if (ret2 < 0) {
btrfs_put_root(root);
if (!ret)
@@ -1515,7 +1528,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_root_level(root_item);
- atomic_inc(&reloc_root->node->refs);
+ refcount_inc(&reloc_root->node->refs);
path->nodes[level] = reloc_root->node;
path->slots[level] = 0;
} else {
@@ -1697,8 +1710,8 @@ again:
rc->merge_reloc_tree = true;
while (!list_empty(&rc->reloc_roots)) {
- reloc_root = list_entry(rc->reloc_roots.next,
- struct btrfs_root, root_list);
+ reloc_root = list_first_entry(&rc->reloc_roots,
+ struct btrfs_root, root_list);
list_del_init(&reloc_root->root_list);
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
@@ -1763,7 +1776,7 @@ again:
list_add(&reloc_root->root_list, &reloc_roots);
btrfs_put_root(root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
if (!err)
err = ret;
@@ -1813,8 +1826,7 @@ again:
while (!list_empty(&reloc_roots)) {
found = 1;
- reloc_root = list_entry(reloc_roots.next,
- struct btrfs_root, root_list);
+ reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list);
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
false);
@@ -1930,11 +1942,11 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
* reloc root without a corresponding root this could return ENOENT.
*/
if (IS_ERR(root)) {
- ASSERT(0);
+ DEBUG_WARN("error %ld reading root for reloc root", PTR_ERR(root));
return PTR_ERR(root);
}
- if (root->reloc_root != reloc_root) {
- ASSERT(0);
+ if (unlikely(root->reloc_root != reloc_root)) {
+ DEBUG_WARN("unexpected reloc root found");
btrfs_err(fs_info,
"root %llu has two reloc roots associated with it",
reloc_root->root_key.offset);
@@ -2004,7 +2016,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
if (!root)
return ERR_PTR(-ENOENT);
- if (next->new_bytenr) {
+ if (unlikely(next->new_bytenr)) {
/*
* We just created the reloc root, so we shouldn't have
* ->new_bytenr set yet. If it is then we have multiple roots
@@ -2063,7 +2075,7 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
* This can occur if we have incomplete extent refs leading all
* the way up a particular path, in this case return -EUCLEAN.
*/
- if (!root)
+ if (unlikely(!root))
return ERR_PTR(-EUCLEAN);
/* No other choice for non-shareable tree */
@@ -2109,8 +2121,8 @@ static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc,
if (list_empty(&next->upper))
break;
- edge = list_entry(next->upper.next,
- struct btrfs_backref_edge, list[LOWER]);
+ edge = list_first_entry(&next->upper, struct btrfs_backref_edge,
+ list[LOWER]);
edges[index++] = edge;
next = edge->node[UPPER];
}
@@ -2250,7 +2262,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(upper->eb, slot);
if (lowest) {
- if (bytenr != node->bytenr) {
+ if (unlikely(bytenr != node->bytenr)) {
btrfs_err(root->fs_info,
"lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
bytenr, node->bytenr, slot,
@@ -2305,7 +2317,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (!ret)
ret = btrfs_drop_subtree(trans, root, eb,
upper->eb);
- if (ret)
+ if (unlikely(ret))
btrfs_abort_transaction(trans, ret);
}
next:
@@ -2356,8 +2368,8 @@ static int finish_pending_nodes(struct btrfs_trans_handle *trans,
for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
while (!list_empty(&cache->pending[level])) {
- node = list_entry(cache->pending[level].next,
- struct btrfs_backref_node, list);
+ node = list_first_entry(&cache->pending[level],
+ struct btrfs_backref_node, list);
list_move_tail(&node->list, &list);
BUG_ON(!node->pending);
@@ -2395,8 +2407,8 @@ static void update_processed_blocks(struct reloc_control *rc,
if (list_empty(&next->upper))
break;
- edge = list_entry(next->upper.next,
- struct btrfs_backref_edge, list[LOWER]);
+ edge = list_first_entry(&next->upper, struct btrfs_backref_edge,
+ list[LOWER]);
edges[index++] = edge;
next = edge->node[UPPER];
}
@@ -2408,8 +2420,8 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
{
u32 blocksize = rc->extent_root->fs_info->nodesize;
- if (test_range_bit(&rc->processed_blocks, bytenr,
- bytenr + blocksize - 1, EXTENT_DIRTY, NULL))
+ if (btrfs_test_range_bit(&rc->processed_blocks, bytenr,
+ bytenr + blocksize - 1, EXTENT_DIRTY, NULL))
return 1;
return 0;
}
@@ -2427,7 +2439,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
eb = read_tree_block(fs_info, block->bytenr, &check);
if (IS_ERR(eb))
return PTR_ERR(eb);
- if (!extent_buffer_uptodate(eb)) {
+ if (unlikely(!extent_buffer_uptodate(eb))) {
free_extent_buffer(eb);
return -EIO;
}
@@ -2492,7 +2504,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
* normal user in the case of corruption.
*/
ASSERT(node->new_bytenr == 0);
- if (node->new_bytenr) {
+ if (unlikely(node->new_bytenr)) {
btrfs_err(root->fs_info,
"bytenr %llu has improper references to it",
node->bytenr);
@@ -2617,7 +2629,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
* tree.
*/
if (block->owner &&
- (!is_fstree(block->owner) ||
+ (!btrfs_is_fstree(block->owner) ||
block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) {
ret = relocate_cowonly_block(trans, rc, block, path);
if (ret)
@@ -2658,69 +2670,24 @@ static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control
u64 num_bytes;
int nr;
int ret = 0;
- u64 i_size = i_size_read(&inode->vfs_inode);
u64 prealloc_start = cluster->start - offset;
u64 prealloc_end = cluster->end - offset;
u64 cur_offset = prealloc_start;
/*
- * For subpage case, previous i_size may not be aligned to PAGE_SIZE.
- * This means the range [i_size, PAGE_END + 1) is filled with zeros by
- * btrfs_do_readpage() call of previously relocated file cluster.
+ * For blocksize < folio size case (either bs < page size or large folios),
+ * beyond i_size, all blocks are filled with zero.
*
- * If the current cluster starts in the above range, btrfs_do_readpage()
+ * If the current cluster covers the above range, btrfs_do_readpage()
* will skip the read, and relocate_one_folio() will later writeback
* the padding zeros as new data, causing data corruption.
*
- * Here we have to manually invalidate the range (i_size, PAGE_END + 1).
+ * Here we have to invalidate the cache covering our cluster.
*/
- if (!PAGE_ALIGNED(i_size)) {
- struct address_space *mapping = inode->vfs_inode.i_mapping;
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- const u32 sectorsize = fs_info->sectorsize;
- struct folio *folio;
-
- ASSERT(sectorsize < PAGE_SIZE);
- ASSERT(IS_ALIGNED(i_size, sectorsize));
-
- /*
- * Subpage can't handle page with DIRTY but without UPTODATE
- * bit as it can lead to the following deadlock:
- *
- * btrfs_read_folio()
- * | Page already *locked*
- * |- btrfs_lock_and_flush_ordered_range()
- * |- btrfs_start_ordered_extent()
- * |- extent_write_cache_pages()
- * |- lock_page()
- * We try to lock the page we already hold.
- *
- * Here we just writeback the whole data reloc inode, so that
- * we will be ensured to have no dirty range in the page, and
- * are safe to clear the uptodate bits.
- *
- * This shouldn't cause too much overhead, as we need to write
- * the data back anyway.
- */
- ret = filemap_write_and_wait(mapping);
- if (ret < 0)
- return ret;
-
- clear_extent_bits(&inode->io_tree, i_size,
- round_up(i_size, PAGE_SIZE) - 1,
- EXTENT_UPTODATE);
- folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT);
- /*
- * If page is freed we don't need to do anything then, as we
- * will re-read the whole page anyway.
- */
- if (!IS_ERR(folio)) {
- btrfs_subpage_clear_uptodate(fs_info, folio, i_size,
- round_up(i_size, PAGE_SIZE) - i_size);
- folio_unlock(folio);
- folio_put(folio);
- }
- }
+ ret = filemap_invalidate_inode(&inode->vfs_inode, true, prealloc_start,
+ prealloc_end);
+ if (ret < 0)
+ return ret;
BUG_ON(cluster->start != cluster->boundary[0]);
ret = btrfs_alloc_data_chunk_ondemand(inode,
@@ -2738,21 +2705,21 @@ static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control
else
end = cluster->end - offset;
- lock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, start, end, &cached_state);
num_bytes = end + 1 - start;
ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
num_bytes, num_bytes,
end + 1, &alloc_hint);
cur_offset = end + 1;
- unlock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
if (ret)
break;
}
btrfs_inode_unlock(inode, 0);
if (cur_offset < prealloc_end)
- btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
- prealloc_end + 1 - cur_offset);
+ btrfs_free_reserved_data_space_noquota(inode,
+ prealloc_end + 1 - cur_offset);
return ret;
}
@@ -2766,7 +2733,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_contr
u64 end = rc->cluster.end - offset;
int ret = 0;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em)
return -ENOMEM;
@@ -2777,10 +2744,10 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_contr
em->ram_bytes = em->len;
em->flags |= EXTENT_FLAG_PINNED;
- lock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, start, end, &cached_state);
ret = btrfs_replace_extent_map_range(inode, em, false);
- unlock_extent(&inode->io_tree, start, end, &cached_state);
- free_extent_map(em);
+ btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_free_extent_map(em);
return ret;
}
@@ -2809,13 +2776,15 @@ static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster,
static int relocate_one_folio(struct reloc_control *rc,
struct file_ra_state *ra,
- int *cluster_nr, unsigned long index)
+ int *cluster_nr, u64 *file_offset_ret)
{
const struct file_extent_cluster *cluster = &rc->cluster;
struct inode *inode = rc->data_inode;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ const u64 orig_file_offset = *file_offset_ret;
u64 offset = BTRFS_I(inode)->reloc_block_group_start;
- const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
+ const pgoff_t last_index = (cluster->end - offset) >> PAGE_SHIFT;
+ const pgoff_t index = orig_file_offset >> PAGE_SHIFT;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
struct folio *folio;
u64 folio_start;
@@ -2848,8 +2817,6 @@ again:
return PTR_ERR(folio);
}
- WARN_ON(folio_order(folio));
-
if (folio_test_readahead(folio) && !use_rst)
page_cache_async_readahead(inode->i_mapping, ra, NULL,
folio, last_index + 1 - index);
@@ -2857,7 +2824,7 @@ again:
if (!folio_test_uptodate(folio)) {
btrfs_read_folio(NULL, folio);
folio_lock(folio);
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
ret = -EIO;
goto release_folio;
}
@@ -2878,7 +2845,7 @@ again:
goto release_folio;
folio_start = folio_pos(folio);
- folio_end = folio_start + PAGE_SIZE - 1;
+ folio_end = folio_start + folio_size(folio) - 1;
/*
* Start from the cluster, as for subpage case, the cluster can start
@@ -2902,15 +2869,15 @@ again:
goto release_folio;
/* Mark the range delalloc and dirty for later writeback */
- lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
- &cached_state);
+ btrfs_lock_extent(&BTRFS_I(inode)->io_tree, clamped_start,
+ clamped_end, &cached_state);
ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
clamped_end, 0, &cached_state);
if (ret) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree,
- clamped_start, clamped_end,
- EXTENT_LOCKED | EXTENT_BOUNDARY,
- &cached_state);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ clamped_start, clamped_end,
+ EXTENT_LOCKED | EXTENT_BOUNDARY,
+ &cached_state);
btrfs_delalloc_release_metadata(BTRFS_I(inode),
clamped_len, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
@@ -2926,18 +2893,19 @@ again:
* EXTENT_BOUNDARY bit prevents current extent from being merged
* with previous extent.
*/
- if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) {
+ if (in_range(cluster->boundary[*cluster_nr] - offset,
+ folio_start, folio_size(folio))) {
u64 boundary_start = cluster->boundary[*cluster_nr] -
offset;
u64 boundary_end = boundary_start +
fs_info->sectorsize - 1;
- set_extent_bit(&BTRFS_I(inode)->io_tree,
- boundary_start, boundary_end,
- EXTENT_BOUNDARY, NULL);
+ btrfs_set_extent_bit(&BTRFS_I(inode)->io_tree,
+ boundary_start, boundary_end,
+ EXTENT_BOUNDARY, NULL);
}
- unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
- &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
+ &cached_state);
btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
cur += clamped_len;
@@ -2956,6 +2924,7 @@ again:
btrfs_throttle(fs_info);
if (btrfs_should_cancel_balance(fs_info))
ret = -ECANCELED;
+ *file_offset_ret = folio_end + 1;
return ret;
release_folio:
@@ -2969,9 +2938,8 @@ static int relocate_file_extent_cluster(struct reloc_control *rc)
struct inode *inode = rc->data_inode;
const struct file_extent_cluster *cluster = &rc->cluster;
u64 offset = BTRFS_I(inode)->reloc_block_group_start;
- unsigned long index;
- unsigned long last_index;
- struct file_ra_state *ra;
+ u64 cur_file_offset = cluster->start - offset;
+ struct file_ra_state AUTO_KFREE(ra);
int cluster_nr = 0;
int ret = 0;
@@ -2984,22 +2952,21 @@ static int relocate_file_extent_cluster(struct reloc_control *rc)
ret = prealloc_file_extent_cluster(rc);
if (ret)
- goto out;
+ return ret;
file_ra_state_init(ra, inode->i_mapping);
ret = setup_relocation_extent_mapping(rc);
if (ret)
- goto out;
+ return ret;
- last_index = (cluster->end - offset) >> PAGE_SHIFT;
- for (index = (cluster->start - offset) >> PAGE_SHIFT;
- index <= last_index && !ret; index++)
- ret = relocate_one_folio(rc, ra, &cluster_nr, index);
+ while (cur_file_offset < cluster->end - offset) {
+ ret = relocate_one_folio(rc, ra, &cluster_nr, &cur_file_offset);
+ if (ret)
+ break;
+ }
if (ret == 0)
WARN_ON(cluster_nr != cluster->nr);
-out:
- kfree(ra);
return ret;
}
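/*
 * Editor's sketch (not part of the patch): the folio-size-agnostic loop
 * shape introduced above. Instead of iterating fixed page indexes, the
 * callee reports how far it got (one past the folio it processed) and the
 * caller resumes from there, which works for any folio order. The names
 * below are illustrative only.
 */
static int walk_range_by_folio_sketch(u64 start, u64 end /* exclusive */,
				      int (*process)(u64 *file_offset))
{
	u64 cur = start;
	int ret = 0;

	while (cur < end) {
		/* @process advances @cur to the first byte it did not cover. */
		ret = process(&cur);
		if (ret)
			break;
	}
	return ret;
}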
@@ -3158,7 +3125,7 @@ static int add_tree_block(struct reloc_control *rc,
block->key_ready = false;
block->owner = owner;
- rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
+ rb_node = rb_simple_insert(blocks, &block->simple_node);
if (rb_node)
btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr,
-EEXIST);
@@ -3174,7 +3141,7 @@ static int __add_tree_block(struct reloc_control *rc,
struct rb_root *blocks)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int ret;
bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
@@ -3198,11 +3165,11 @@ again:
key.offset = blocksize;
}
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0 && skinny) {
if (path->slots[0]) {
@@ -3229,14 +3196,10 @@ again:
"tree block extent item (%llu) is not found in extent tree",
bytenr);
WARN_ON(1);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
- ret = add_tree_block(rc, &key, path, blocks);
-out:
- btrfs_free_path(path);
- return ret;
+ return add_tree_block(rc, &key, path, blocks);
}
static int delete_block_group_cache(struct btrfs_block_group *block_group,
@@ -3395,8 +3358,8 @@ int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = 0;
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
0, 0);
if (ret < 0)
@@ -3435,9 +3398,9 @@ next:
goto next;
}
- block_found = find_first_extent_bit(&rc->processed_blocks,
- key.objectid, &start, &end,
- EXTENT_DIRTY, NULL);
+ block_found = btrfs_find_first_extent_bit(&rc->processed_blocks,
+ key.objectid, &start, &end,
+ EXTENT_DIRTY, NULL);
if (block_found && start <= key.objectid) {
btrfs_release_path(path);
@@ -3526,7 +3489,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
struct rb_root blocks = RB_ROOT;
struct btrfs_key key;
struct btrfs_trans_handle *trans = NULL;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_extent_item *ei;
u64 flags;
int ret;
@@ -3646,7 +3609,7 @@ restart:
}
btrfs_release_path(path);
- clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
+ btrfs_clear_extent_bit(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, NULL);
if (trans) {
btrfs_end_transaction_throttle(trans);
@@ -3695,14 +3658,13 @@ out_free:
if (ret < 0 && !err)
err = ret;
btrfs_free_block_rsv(fs_info, rc->block_rsv);
- btrfs_free_path(path);
return err;
}
static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 objectid)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_inode_item *item;
struct extent_buffer *leaf;
int ret;
@@ -3713,7 +3675,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_inode(trans, root, path, objectid);
if (ret)
- goto out;
+ return ret;
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
@@ -3723,15 +3685,13 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
BTRFS_INODE_PREALLOC);
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
static void delete_orphan_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 objectid)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int ret = 0;
@@ -3754,7 +3714,6 @@ static void delete_orphan_inode(struct btrfs_trans_handle *trans,
out:
if (ret)
btrfs_abort_transaction(trans, ret);
- btrfs_free_path(path);
}
/*
@@ -3803,7 +3762,7 @@ out:
if (ret) {
if (inode)
iput(&inode->vfs_inode);
- inode = ERR_PTR(ret);
+ return ERR_PTR(ret);
}
return &inode->vfs_inode;
}
@@ -3811,6 +3770,7 @@ out:
/*
* Mark start of chunk relocation that is cancellable. Check if the cancellation
* has been requested meanwhile and don't start in that case.
+ * NOTE: if this returns an error, reloc_chunk_end() must not be called.
*
* Return:
* 0 success
@@ -3827,10 +3787,8 @@ static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
if (atomic_read(&fs_info->reloc_cancel_req) > 0) {
btrfs_info(fs_info, "chunk relocation canceled on start");
- /*
- * On cancel, clear all requests but let the caller mark
- * the end after cleanup operations.
- */
+ /* On cancel, clear all requests. */
+ clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags);
atomic_set(&fs_info->reloc_cancel_req, 0);
return -ECANCELED;
}
@@ -3839,9 +3797,11 @@ static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
/*
* Mark end of chunk relocation that is cancellable and wake any waiters.
+ * NOTE: call only if a previous call to reloc_chunk_start() succeeded.
*/
static void reloc_chunk_end(struct btrfs_fs_info *fs_info)
{
+ ASSERT(test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags));
/* Requested after start, clear bit first so any waiters can continue */
if (atomic_read(&fs_info->reloc_cancel_req) > 0)
btrfs_info(fs_info, "chunk relocation canceled during operation");
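/*
 * Editor's sketch (not part of the patch): the start/end pairing contract
 * documented above. A failed reloc_chunk_start() now clears
 * BTRFS_FS_RELOC_RUNNING itself, so the caller must skip reloc_chunk_end()
 * on that path. do_relocation_work() is a hypothetical placeholder.
 */
static int demo_relocate_chunk(struct btrfs_fs_info *fs_info)
{
	int ret;

	ret = reloc_chunk_start(fs_info);
	if (ret < 0)
		return ret;	/* must NOT call reloc_chunk_end() here */

	ret = do_relocation_work(fs_info);	/* hypothetical */

	reloc_chunk_end(fs_info);	/* only after a successful start */
	return ret;
}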
@@ -3862,7 +3822,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
btrfs_backref_init_cache(fs_info, &rc->backref_cache, true);
rc->reloc_root_tree.rb_root = RB_ROOT;
spin_lock_init(&rc->reloc_root_tree.lock);
- extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS);
+ btrfs_extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS);
return rc;
}
@@ -3883,7 +3843,7 @@ static void free_reloc_control(struct reloc_control *rc)
*/
static void describe_relocation(struct btrfs_block_group *block_group)
{
- char buf[128] = {'\0'};
+ char buf[128] = "NONE";
btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
@@ -3903,7 +3863,8 @@ static const char *stage_to_string(enum reloc_stage stage)
/*
* function to relocate all extents in a block group.
*/
-int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
+int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
+ bool verbose)
{
struct btrfs_block_group *bg;
struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
@@ -3911,8 +3872,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
struct inode *inode;
struct btrfs_path *path;
int ret;
- int rw = 0;
- int err = 0;
+ bool bg_is_ro = false;
/*
* This only gets set if we had a half-deleted snapshot on mount. We
@@ -3954,24 +3914,20 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
}
ret = reloc_chunk_start(fs_info);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out_put_bg;
- }
rc->extent_root = extent_root;
rc->block_group = bg;
ret = btrfs_inc_block_group_ro(rc->block_group, true);
- if (ret) {
- err = ret;
+ if (ret)
goto out;
- }
- rw = 1;
+ bg_is_ro = true;
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
@@ -3983,19 +3939,18 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
else
ret = PTR_ERR(inode);
- if (ret && ret != -ENOENT) {
- err = ret;
+ if (ret && ret != -ENOENT)
goto out;
- }
rc->data_inode = create_reloc_inode(rc->block_group);
if (IS_ERR(rc->data_inode)) {
- err = PTR_ERR(rc->data_inode);
+ ret = PTR_ERR(rc->data_inode);
rc->data_inode = NULL;
goto out;
}
- describe_relocation(rc->block_group);
+ if (verbose)
+ describe_relocation(rc->block_group);
btrfs_wait_block_group_reservations(rc->block_group);
btrfs_wait_nocow_writers(rc->block_group);
@@ -4010,8 +3965,6 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
mutex_lock(&fs_info->cleaner_mutex);
ret = relocate_block_group(rc);
mutex_unlock(&fs_info->cleaner_mutex);
- if (ret < 0)
- err = ret;
finishes_stage = rc->stage;
/*
@@ -4024,37 +3977,41 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
* out of the loop if we hit an error.
*/
if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
- ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0,
- (u64)-1);
- if (ret)
- err = ret;
+ int wb_ret;
+
+ wb_ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0,
+ (u64)-1);
+ if (wb_ret && ret == 0)
+ ret = wb_ret;
invalidate_mapping_pages(rc->data_inode->i_mapping,
0, -1);
rc->stage = UPDATE_DATA_PTRS;
}
- if (err < 0)
+ if (ret < 0)
goto out;
if (rc->extents_found == 0)
break;
- btrfs_info(fs_info, "found %llu extents, stage: %s",
- rc->extents_found, stage_to_string(finishes_stage));
+ if (verbose)
+ btrfs_info(fs_info, "found %llu extents, stage: %s",
+ rc->extents_found,
+ stage_to_string(finishes_stage));
}
WARN_ON(rc->block_group->pinned > 0);
WARN_ON(rc->block_group->reserved > 0);
WARN_ON(rc->block_group->used > 0);
out:
- if (err && rw)
+ if (ret && bg_is_ro)
btrfs_dec_block_group_ro(rc->block_group);
iput(rc->data_inode);
+ reloc_chunk_end(fs_info);
out_put_bg:
btrfs_put_block_group(bg);
- reloc_chunk_end(fs_info);
free_reloc_control(rc);
- return err;
+ return ret;
}
static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
@@ -4185,8 +4142,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
rc->merge_reloc_tree = true;
while (!list_empty(&reloc_roots)) {
- reloc_root = list_entry(reloc_roots.next,
- struct btrfs_root, root_list);
+ reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list);
list_del(&reloc_root->root_list);
if (btrfs_root_refs(&reloc_root->root_item) == 0) {
@@ -4236,8 +4192,8 @@ out_clean:
ret = ret2;
out_unset:
unset_reloc_control(rc);
-out_end:
reloc_chunk_end(fs_info);
+out_end:
free_reloc_control(rc);
out:
free_reloc_roots(&reloc_roots);
@@ -4279,7 +4235,7 @@ int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
while (!list_empty(&list)) {
struct btrfs_ordered_sum *sums =
- list_entry(list.next, struct btrfs_ordered_sum, list);
+ list_first_entry(&list, struct btrfs_ordered_sum, list);
list_del_init(&sums->list);
@@ -4343,7 +4299,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
}
btrfs_backref_drop_node_buffer(node);
- atomic_inc(&cow->refs);
+ refcount_inc(&cow->refs);
node->eb = cow;
node->new_bytenr = cow->start;
diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
index 788c86d8633a..5c36b3f84b57 100644
--- a/fs/btrfs/relocation.h
+++ b/fs/btrfs/relocation.h
@@ -12,7 +12,8 @@ struct btrfs_trans_handle;
struct btrfs_ordered_extent;
struct btrfs_pending_snapshot;
-int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start);
+int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
+ bool verbose);
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root);
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
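The new verbose flag lets automated relocation run quietly while user-initiated balance keeps its log messages. A hedged sketch of the two kinds of call sites (the caller contexts named here are assumptions for illustration):

	/* balance ioctl path: keep the informational messages */
	ret = btrfs_relocate_block_group(fs_info, chunk_offset, true);

	/* automated reclaim path: relocate without log noise */
	ret = btrfs_relocate_block_group(fs_info, bg->start, false);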
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index e22e6b06927a..6a7e297ab0a7 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -85,7 +85,7 @@ int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
* Key with offset -1 found, there would have to exist a root
* with such id, but this is out of the valid range.
*/
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -130,7 +130,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
*item)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *l;
int ret;
int slot;
@@ -143,15 +143,15 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
if (ret < 0)
- goto out;
+ return ret;
- if (ret > 0) {
+ if (unlikely(ret > 0)) {
btrfs_crit(fs_info,
- "unable to find root key (%llu %u %llu) in tree %llu",
- key->objectid, key->type, key->offset, btrfs_root_id(root));
+ "unable to find root key " BTRFS_KEY_FMT " in tree %llu",
+ BTRFS_KEY_FMT_VALUE(key), btrfs_root_id(root));
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
l = path->nodes[0];
@@ -168,22 +168,22 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_release_path(path);
ret = btrfs_search_slot(trans, root, key, path,
-1, 1);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
ret = btrfs_del_item(trans, root, path);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, root, path,
key, sizeof(*item));
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
l = path->nodes[0];
slot = path->slots[0];
@@ -197,8 +197,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
write_extent_buffer(l, item, ptr, sizeof(*item));
-out:
- btrfs_free_path(path);
return ret;
}
@@ -216,7 +214,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *tree_root = fs_info->tree_root;
struct extent_buffer *leaf;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_root *root;
int err = 0;
@@ -309,7 +307,6 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
btrfs_put_root(root);
}
- btrfs_free_path(path);
return err;
}
@@ -318,7 +315,7 @@ int btrfs_del_root(struct btrfs_trans_handle *trans,
const struct btrfs_key *key)
{
struct btrfs_root *root = trans->fs_info->tree_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
path = btrfs_alloc_path();
@@ -326,17 +323,12 @@ int btrfs_del_root(struct btrfs_trans_handle *trans,
return -ENOMEM;
ret = btrfs_search_slot(trans, root, key, path, -1, 1);
if (ret < 0)
- goto out;
- if (ret != 0) {
+ return ret;
+ if (unlikely(ret > 0))
/* The root must exist but we did not find it by the key. */
- ret = -EUCLEAN;
- goto out;
- }
+ return -EUCLEAN;
- ret = btrfs_del_item(trans, root, path);
-out:
- btrfs_free_path(path);
- return ret;
+ return btrfs_del_item(trans, root, path);
}
int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
@@ -344,7 +336,7 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
const struct fscrypt_str *name)
{
struct btrfs_root *tree_root = trans->fs_info->tree_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -361,7 +353,7 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
if (ret < 0) {
- goto out;
+ return ret;
} else if (ret == 0) {
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -369,18 +361,16 @@ again:
ptr = (unsigned long)(ref + 1);
if ((btrfs_root_ref_dirid(leaf, ref) != dirid) ||
(btrfs_root_ref_name_len(leaf, ref) != name->len) ||
- memcmp_extent_buffer(leaf, name->name, ptr, name->len)) {
- ret = -ENOENT;
- goto out;
- }
+ memcmp_extent_buffer(leaf, name->name, ptr, name->len))
+ return -ENOENT;
+
*sequence = btrfs_root_ref_sequence(leaf, ref);
ret = btrfs_del_item(trans, tree_root, path);
if (ret)
- goto out;
+ return ret;
} else {
- ret = -ENOENT;
- goto out;
+ return -ENOENT;
}
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
@@ -391,8 +381,6 @@ again:
goto again;
}
-out:
- btrfs_free_path(path);
return ret;
}
@@ -418,7 +406,7 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
struct btrfs_root *tree_root = trans->fs_info->tree_root;
struct btrfs_key key;
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
unsigned long ptr;
@@ -433,9 +421,8 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
again:
ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
sizeof(*ref) + name->len);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
- btrfs_free_path(path);
return ret;
}
@@ -455,7 +442,6 @@ again:
goto again;
}
- btrfs_free_path(path);
return 0;
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 2c5edcee9450..a40ee41f42c6 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -66,8 +66,6 @@ struct scrub_ctx;
/* Represent one sector and its needed info to verify the content. */
struct scrub_sector_verification {
- bool is_metadata;
-
union {
/*
* Csum pointer for data csum verification. Should point to a
@@ -100,7 +98,39 @@ enum scrub_stripe_flags {
SCRUB_STRIPE_FLAG_NO_REPORT,
};
-#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE)
+/*
+ * We have multiple bitmaps for one scrub_stripe.
+ * However, each bitmap has at most (BTRFS_STRIPE_LEN / blocksize) bits,
+ * which is normally 16 and much smaller than BITS_PER_LONG (32 or 64).
+ *
+ * So to reduce memory usage for each scrub_stripe, we pack those bitmaps
+ * into a larger one.
+ *
+ * This enum records where each sub-bitmap is inside the larger one.
+ * Each sub-bitmap starts at bit (scrub_bitmap_nr_##name * nr_sectors).
+ */
+enum {
+ /* Which blocks are covered by extent items. */
+ scrub_bitmap_nr_has_extent = 0,
+
+ /* Which blocks are metadata. */
+ scrub_bitmap_nr_is_metadata,
+
+ /*
+ * Which blocks have errors, including IO, csum, and metadata
+ * errors.
+ * This sub-bitmap is the OR result of the next few error-related
+ * sub-bitmaps.
+ */
+ scrub_bitmap_nr_error,
+ scrub_bitmap_nr_io_error,
+ scrub_bitmap_nr_csum_error,
+ scrub_bitmap_nr_meta_error,
+ scrub_bitmap_nr_meta_gen_error,
+ scrub_bitmap_nr_last,
+};
+
+#define SCRUB_STRIPE_MAX_FOLIOS (BTRFS_STRIPE_LEN / PAGE_SIZE)
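To make the packed layout concrete, here is a worked example assuming BTRFS_STRIPE_LEN == 64K and a 4K block size, so nr_sectors == 16 (these numbers are illustrative, not fixed):

/*
 * Sub-bitmap layout inside stripe->bitmaps[] for nr_sectors == 16:
 *
 *   has_extent   bits  0..15   (scrub_bitmap_nr_has_extent  == 0)
 *   is_metadata  bits 16..31   (scrub_bitmap_nr_is_metadata == 1)
 *   error        bits 32..47   (scrub_bitmap_nr_error       == 2)
 *   io_error     bits 48..63   (scrub_bitmap_nr_io_error    == 3)
 *   ...
 *
 * So the io_error bit of block 5 sits at:
 *   scrub_bitmap_nr_io_error * nr_sectors + 5 == 3 * 16 + 5 == 53
 */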
/*
* Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
@@ -109,7 +139,7 @@ struct scrub_stripe {
struct scrub_ctx *sctx;
struct btrfs_block_group *bg;
- struct page *pages[SCRUB_STRIPE_PAGES];
+ struct folio *folios[SCRUB_STRIPE_MAX_FOLIOS];
struct scrub_sector_verification *sectors;
struct btrfs_device *dev;
@@ -138,36 +168,15 @@ struct scrub_stripe {
*/
unsigned long state;
- /* Indicate which sectors are covered by extent items. */
- unsigned long extent_sector_bitmap;
+ /* The large bitmap contains all the sub-bitmaps. */
+ unsigned long bitmaps[BITS_TO_LONGS(scrub_bitmap_nr_last *
+ (BTRFS_STRIPE_LEN / BTRFS_MIN_BLOCKSIZE))];
/*
- * The errors hit during the initial read of the stripe.
- *
- * Would be utilized for error reporting and repair.
- *
- * The remaining init_nr_* records the number of errors hit, only used
- * by error reporting.
+ * For writeback (repair or replace) error reporting.
+ * This one is protected by a spinlock, thus it cannot be packed into
+ * the larger bitmap.
*/
- unsigned long init_error_bitmap;
- unsigned int init_nr_io_errors;
- unsigned int init_nr_csum_errors;
- unsigned int init_nr_meta_errors;
-
- /*
- * The following error bitmaps are all for the current status.
- * Every time we submit a new read, these bitmaps may be updated.
- *
- * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap;
- *
- * IO and csum errors can happen for both metadata and data.
- */
- unsigned long error_bitmap;
- unsigned long io_error_bitmap;
- unsigned long csum_error_bitmap;
- unsigned long meta_error_bitmap;
-
- /* For writeback (repair or replace) error reporting. */
unsigned long write_error_bitmap;
/* Writeback can be concurrent, thus we need to protect the bitmap. */
@@ -197,7 +206,7 @@ struct scrub_ctx {
ktime_t throttle_deadline;
u64 throttle_sent;
- int is_dev_replace;
+ bool is_dev_replace;
u64 write_pointer;
struct mutex wr_lock;
@@ -219,6 +228,90 @@ struct scrub_ctx {
refcount_t refs;
};
+#define scrub_calc_start_bit(stripe, name, block_nr) \
+({ \
+ unsigned int __start_bit; \
+ \
+ ASSERT(block_nr < stripe->nr_sectors, \
+ "nr_sectors=%u block_nr=%u", stripe->nr_sectors, block_nr); \
+ __start_bit = scrub_bitmap_nr_##name * stripe->nr_sectors + block_nr; \
+ __start_bit; \
+})
+
+#define IMPLEMENT_SCRUB_BITMAP_OPS(name) \
+static inline void scrub_bitmap_set_##name(struct scrub_stripe *stripe, \
+ unsigned int block_nr, \
+ unsigned int nr_blocks) \
+{ \
+ const unsigned int start_bit = scrub_calc_start_bit(stripe, \
+ name, block_nr); \
+ \
+ bitmap_set(stripe->bitmaps, start_bit, nr_blocks); \
+} \
+static inline void scrub_bitmap_clear_##name(struct scrub_stripe *stripe, \
+ unsigned int block_nr, \
+ unsigned int nr_blocks) \
+{ \
+ const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
+ block_nr); \
+ \
+ bitmap_clear(stripe->bitmaps, start_bit, nr_blocks); \
+} \
+static inline bool scrub_bitmap_test_bit_##name(struct scrub_stripe *stripe, \
+ unsigned int block_nr) \
+{ \
+ const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
+ block_nr); \
+ \
+ return test_bit(start_bit, stripe->bitmaps); \
+} \
+static inline void scrub_bitmap_set_bit_##name(struct scrub_stripe *stripe, \
+ unsigned int block_nr) \
+{ \
+ const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
+ block_nr); \
+ \
+ set_bit(start_bit, stripe->bitmaps); \
+} \
+static inline void scrub_bitmap_clear_bit_##name(struct scrub_stripe *stripe, \
+ unsigned int block_nr) \
+{ \
+ const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
+ block_nr); \
+ \
+ clear_bit(start_bit, stripe->bitmaps); \
+} \
+static inline unsigned long scrub_bitmap_read_##name(struct scrub_stripe *stripe) \
+{ \
+ const unsigned int nr_blocks = stripe->nr_sectors; \
+ \
+ ASSERT(nr_blocks > 0 && nr_blocks <= BITS_PER_LONG, \
+ "nr_blocks=%u BITS_PER_LONG=%u", \
+ nr_blocks, BITS_PER_LONG); \
+ \
+ return bitmap_read(stripe->bitmaps, nr_blocks * scrub_bitmap_nr_##name, \
+ stripe->nr_sectors); \
+} \
+static inline bool scrub_bitmap_empty_##name(struct scrub_stripe *stripe) \
+{ \
+ unsigned long bitmap = scrub_bitmap_read_##name(stripe); \
+ \
+ return bitmap_empty(&bitmap, stripe->nr_sectors); \
+} \
+static inline unsigned int scrub_bitmap_weight_##name(struct scrub_stripe *stripe) \
+{ \
+ unsigned long bitmap = scrub_bitmap_read_##name(stripe); \
+ \
+ return bitmap_weight(&bitmap, stripe->nr_sectors); \
+}
+IMPLEMENT_SCRUB_BITMAP_OPS(has_extent);
+IMPLEMENT_SCRUB_BITMAP_OPS(is_metadata);
+IMPLEMENT_SCRUB_BITMAP_OPS(error);
+IMPLEMENT_SCRUB_BITMAP_OPS(io_error);
+IMPLEMENT_SCRUB_BITMAP_OPS(csum_error);
+IMPLEMENT_SCRUB_BITMAP_OPS(meta_error);
+IMPLEMENT_SCRUB_BITMAP_OPS(meta_gen_error);
+
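For reference, IMPLEMENT_SCRUB_BITMAP_OPS(io_error) hand-expands to helpers of the following shape (a trimmed sketch showing two of the seven generated functions, with the ASSERT checks omitted):

static inline void scrub_bitmap_set_bit_io_error(struct scrub_stripe *stripe,
						 unsigned int block_nr)
{
	/* The io_error sub-bitmap starts at scrub_bitmap_nr_io_error * nr_sectors. */
	set_bit(scrub_bitmap_nr_io_error * stripe->nr_sectors + block_nr,
		stripe->bitmaps);
}

static inline unsigned long scrub_bitmap_read_io_error(struct scrub_stripe *stripe)
{
	/* Read the whole sub-bitmap into one unsigned long. */
	return bitmap_read(stripe->bitmaps,
			   scrub_bitmap_nr_io_error * stripe->nr_sectors,
			   stripe->nr_sectors);
}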
struct scrub_warning {
struct btrfs_path *path;
u64 extent_item_size;
@@ -228,15 +321,28 @@ struct scrub_warning {
struct btrfs_device *dev;
};
+struct scrub_error_records {
+ /*
+ * Bitmap recording which blocks hit errors (IO/csum/...) during the
+ * initial read.
+ */
+ unsigned long init_error_bitmap;
+
+ unsigned int nr_io_errors;
+ unsigned int nr_csum_errors;
+ unsigned int nr_meta_errors;
+ unsigned int nr_meta_gen_errors;
+};
+
static void release_scrub_stripe(struct scrub_stripe *stripe)
{
if (!stripe)
return;
- for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) {
- if (stripe->pages[i])
- __free_page(stripe->pages[i]);
- stripe->pages[i] = NULL;
+ for (int i = 0; i < SCRUB_STRIPE_MAX_FOLIOS; i++) {
+ if (stripe->folios[i])
+ folio_put(stripe->folios[i]);
+ stripe->folios[i] = NULL;
}
kfree(stripe->sectors);
kfree(stripe->csums);
@@ -249,6 +355,7 @@ static void release_scrub_stripe(struct scrub_stripe *stripe)
static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
struct scrub_stripe *stripe)
{
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
int ret;
memset(stripe, 0, sizeof(*stripe));
@@ -261,7 +368,9 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
atomic_set(&stripe->pending_io, 0);
spin_lock_init(&stripe->write_error_lock);
- ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, false);
+ ASSERT(BTRFS_STRIPE_LEN >> min_folio_shift <= SCRUB_STRIPE_MAX_FOLIOS);
+ ret = btrfs_alloc_folio_array(BTRFS_STRIPE_LEN >> min_folio_shift,
+ fs_info->block_min_order, stripe->folios);
if (ret < 0)
goto error;
@@ -340,7 +449,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx)
}
static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
- struct btrfs_fs_info *fs_info, int is_dev_replace)
+ struct btrfs_fs_info *fs_info, bool is_dev_replace)
{
struct scrub_ctx *sctx;
int i;
@@ -354,10 +463,10 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
refcount_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
sctx->fs_info = fs_info;
- sctx->extent_path.search_commit_root = 1;
- sctx->extent_path.skip_locking = 1;
- sctx->csum_path.search_commit_root = 1;
- sctx->csum_path.skip_locking = 1;
+ sctx->extent_path.search_commit_root = true;
+ sctx->extent_path.skip_locking = true;
+ sctx->csum_path.search_commit_root = true;
+ sctx->csum_path.skip_locking = true;
for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) {
int ret;
@@ -396,7 +505,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
struct btrfs_inode_item *inode_item;
struct scrub_warning *swarn = warn_ctx;
struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
- struct inode_fs_paths *ipath = NULL;
+ struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL;
struct btrfs_root *local_root;
struct btrfs_key key;
@@ -450,8 +559,8 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
* hold all of the paths here
*/
for (i = 0; i < ipath->fspath->elem_cnt; ++i)
- btrfs_warn_in_rcu(fs_info,
-"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
+ btrfs_warn(fs_info,
+"scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu length %u links %u (path: %s)",
swarn->errstr, swarn->logical,
btrfs_dev_name(swarn->dev),
swarn->physical,
@@ -460,18 +569,16 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
(char *)(unsigned long)ipath->fspath->val[i]);
btrfs_put_root(local_root);
- free_ipath(ipath);
return 0;
err:
- btrfs_warn_in_rcu(fs_info,
- "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
+ btrfs_warn(fs_info,
+ "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu: path resolving failed with ret=%d",
swarn->errstr, swarn->logical,
btrfs_dev_name(swarn->dev),
swarn->physical,
root, inum, offset, ret);
- free_ipath(ipath);
return 0;
}
@@ -479,7 +586,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
bool is_super, u64 logical, u64 physical)
{
struct btrfs_fs_info *fs_info = dev->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key found_key;
struct extent_buffer *eb;
struct btrfs_extent_item *ei;
@@ -490,7 +597,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
/* Super block error, no need to search extent tree. */
if (is_super) {
- btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
+ btrfs_warn(fs_info, "scrub: %s on device %s, physical %llu",
errstr, btrfs_dev_name(dev), physical);
return;
}
@@ -506,7 +613,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
&flags);
if (ret < 0)
- goto out;
+ return;
swarn.extent_item_size = found_key.offset;
@@ -525,14 +632,14 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
&ref_level);
if (ret < 0) {
btrfs_warn(fs_info,
- "failed to resolve tree backref for logical %llu: %d",
- swarn.logical, ret);
+ "scrub: failed to resolve tree backref for logical %llu: %d",
+ swarn.logical, ret);
break;
}
if (ret > 0)
break;
- btrfs_warn_in_rcu(fs_info,
-"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
+ btrfs_warn(fs_info,
+"scrub: %s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
errstr, swarn.logical, btrfs_dev_name(dev),
swarn.physical, (ref_level ? "node" : "leaf"),
ref_level, ref_root);
@@ -552,9 +659,6 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
}
-
-out:
- btrfs_free_path(path);
}
static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
@@ -579,20 +683,32 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
return ret;
}
-static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr)
+static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr)
{
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
- int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
+ u32 offset = (sector_nr << fs_info->sectorsize_bits);
+ const struct folio *folio = stripe->folios[offset >> min_folio_shift];
- return stripe->pages[page_index];
+ /* stripe->folios[] is allocated by us and no highmem is allowed. */
+ ASSERT(folio);
+ ASSERT(!folio_test_highmem(folio));
+ return folio_address(folio) + offset_in_folio(folio, offset);
}
-static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe,
- int sector_nr)
+static phys_addr_t scrub_stripe_get_paddr(struct scrub_stripe *stripe, int sector_nr)
{
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
+ u32 offset = (sector_nr << fs_info->sectorsize_bits);
+ const struct folio *folio = stripe->folios[offset >> min_folio_shift];
- return offset_in_page(sector_nr << fs_info->sectorsize_bits);
+ /* stripe->folios[] is allocated by us and no highmem is allowed. */
+ ASSERT(folio);
+ ASSERT(!folio_test_highmem(folio));
+ /* And the range must be contained inside the folio. */
+ ASSERT(offset_in_folio(folio, offset) + fs_info->sectorsize <= folio_size(folio));
+ return page_to_phys(folio_page(folio, 0)) + offset_in_folio(folio, offset);
}
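A quick worked example of the address math above, with illustrative values: PAGE_SHIFT == 12 and block_min_order == 2 give min_folio_shift == 14 (16K folios); with sectorsize_bits == 12 and sector_nr == 5, offset == 0x5000, so the sector lives in folio 0x5000 >> 14 == 1 at in-folio offset 0x5000 & 0x3fff == 0x1000.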
static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
@@ -600,46 +716,44 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits);
- const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr);
- const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr);
+ void *first_kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
+ struct btrfs_header *header = first_kaddr;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 on_disk_csum[BTRFS_CSUM_SIZE];
u8 calculated_csum[BTRFS_CSUM_SIZE];
- struct btrfs_header *header;
/*
* Here we don't have a good way to attach the pages (and subpages)
* to a dummy extent buffer, thus we have to directly grab the members
* from pages.
*/
- header = (struct btrfs_header *)(page_address(first_page) + first_off);
memcpy(on_disk_csum, header->csum, fs_info->csum_size);
if (logical != btrfs_stack_header_bytenr(header)) {
- bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
- bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad bytenr, has %llu want %llu",
+ "scrub: tree block %llu mirror %u has bad bytenr, has %llu want %llu",
logical, stripe->mirror_num,
btrfs_stack_header_bytenr(header), logical);
return;
}
if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid,
BTRFS_FSID_SIZE) != 0) {
- bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
- bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad fsid, has %pU want %pU",
+ "scrub: tree block %llu mirror %u has bad fsid, has %pU want %pU",
logical, stripe->mirror_num,
header->fsid, fs_info->fs_devices->fsid);
return;
}
if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE) != 0) {
- bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
- bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
+ "scrub: tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
logical, stripe->mirror_num,
header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
return;
@@ -648,42 +762,40 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
/* Now check tree block csum. */
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
- crypto_shash_update(shash, page_address(first_page) + first_off +
- BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE);
+ crypto_shash_update(shash, first_kaddr + BTRFS_CSUM_SIZE,
+ fs_info->sectorsize - BTRFS_CSUM_SIZE);
for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
- struct page *page = scrub_stripe_get_page(stripe, i);
- unsigned int page_off = scrub_stripe_get_page_offset(stripe, i);
-
- crypto_shash_update(shash, page_address(page) + page_off,
+ crypto_shash_update(shash, scrub_stripe_get_kaddr(stripe, i),
fs_info->sectorsize);
}
crypto_shash_final(shash, calculated_csum);
if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
- bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
- bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
+"scrub: tree block %llu mirror %u has bad csum, has " BTRFS_CSUM_FMT " want " BTRFS_CSUM_FMT,
logical, stripe->mirror_num,
- CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
- CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
+ BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
+ BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
return;
}
if (stripe->sectors[sector_nr].generation !=
btrfs_stack_header_generation(header)) {
- bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
- bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad generation, has %llu want %llu",
+ "scrub: tree block %llu mirror %u has bad generation, has %llu want %llu",
logical, stripe->mirror_num,
btrfs_stack_header_generation(header),
stripe->sectors[sector_nr].generation);
return;
}
- bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree);
- bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
- bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+ scrub_bitmap_clear_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_clear_csum_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_clear_meta_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_clear_meta_gen_error(stripe, sector_nr, sectors_per_tree);
}
static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
@@ -691,23 +803,22 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct scrub_sector_verification *sector = &stripe->sectors[sector_nr];
const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
- struct page *page = scrub_stripe_get_page(stripe, sector_nr);
- unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
+ phys_addr_t paddr = scrub_stripe_get_paddr(stripe, sector_nr);
u8 csum_buf[BTRFS_CSUM_SIZE];
int ret;
ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors);
/* Sector not utilized, skip it. */
- if (!test_bit(sector_nr, &stripe->extent_sector_bitmap))
+ if (!scrub_bitmap_test_bit_has_extent(stripe, sector_nr))
return;
/* IO error, no need to check. */
- if (test_bit(sector_nr, &stripe->io_error_bitmap))
+ if (scrub_bitmap_test_bit_io_error(stripe, sector_nr))
return;
/* Metadata, verify the full tree block. */
- if (sector->is_metadata) {
+ if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) {
/*
* Check if the tree block crosses the stripe boundary. If
* crossed the boundary, we cannot verify it but only give a
@@ -718,7 +829,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
*/
if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) {
btrfs_warn_rl(fs_info,
- "tree block at %llu crosses stripe boundary %llu",
+ "scrub: tree block at %llu crosses stripe boundary %llu",
stripe->logical +
(sector_nr << fs_info->sectorsize_bits),
stripe->logical);
@@ -733,17 +844,17 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
* cases without csum, we have no other choice but to trust it.
*/
if (!sector->csum) {
- clear_bit(sector_nr, &stripe->error_bitmap);
+ scrub_bitmap_clear_bit_error(stripe, sector_nr);
return;
}
- ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum);
+ ret = btrfs_check_block_csum(fs_info, paddr, csum_buf, sector->csum);
if (ret < 0) {
- set_bit(sector_nr, &stripe->csum_error_bitmap);
- set_bit(sector_nr, &stripe->error_bitmap);
+ scrub_bitmap_set_bit_csum_error(stripe, sector_nr);
+ scrub_bitmap_set_bit_error(stripe, sector_nr);
} else {
- clear_bit(sector_nr, &stripe->csum_error_bitmap);
- clear_bit(sector_nr, &stripe->error_bitmap);
+ scrub_bitmap_clear_bit_csum_error(stripe, sector_nr);
+ scrub_bitmap_clear_bit_error(stripe, sector_nr);
}
}
@@ -756,7 +867,7 @@ static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long b
for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) {
scrub_verify_one_sector(stripe, sector_nr);
- if (stripe->sectors[sector_nr].is_metadata)
+ if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr))
sector_nr += sectors_per_tree - 1;
}
}
@@ -766,8 +877,7 @@ static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first
int i;
for (i = 0; i < stripe->nr_sectors; i++) {
- if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page &&
- scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset)
+ if (scrub_stripe_get_kaddr(stripe, i) == bvec_virt(first_bvec))
break;
}
ASSERT(i < stripe->nr_sectors);
@@ -795,13 +905,13 @@ static void scrub_repair_read_endio(struct btrfs_bio *bbio)
bio_size += bvec->bv_len;
if (bbio->bio.bi_status) {
- bitmap_set(&stripe->io_error_bitmap, sector_nr,
- bio_size >> fs_info->sectorsize_bits);
- bitmap_set(&stripe->error_bitmap, sector_nr,
- bio_size >> fs_info->sectorsize_bits);
+ scrub_bitmap_set_io_error(stripe, sector_nr,
+ bio_size >> fs_info->sectorsize_bits);
+ scrub_bitmap_set_error(stripe, sector_nr,
+ bio_size >> fs_info->sectorsize_bits);
} else {
- bitmap_clear(&stripe->io_error_bitmap, sector_nr,
- bio_size >> fs_info->sectorsize_bits);
+ scrub_bitmap_clear_io_error(stripe, sector_nr,
+ bio_size >> fs_info->sectorsize_bits);
}
bio_put(&bbio->bio);
if (atomic_dec_and_test(&stripe->pending_io))
@@ -814,27 +924,55 @@ static int calc_next_mirror(int mirror, int num_copies)
return (mirror + 1 > num_copies) ? 1 : mirror + 1;
}
+static void scrub_bio_add_sector(struct btrfs_bio *bbio, struct scrub_stripe *stripe,
+ int sector_nr)
+{
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+ void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
+ int ret;
+
+ ret = bio_add_page(&bbio->bio, virt_to_page(kaddr), fs_info->sectorsize,
+ offset_in_page(kaddr));
+ /*
+ * The caller should ensure the bbio has enough size.
+ * We cannot use __bio_add_page(), which does not do any merging.
+ *
+ * Meanwhile scrub_submit_initial_read() fully relies on the merging
+ * to create the minimal number of bio vectors, for the fs block size <
+ * page size cases.
+ */
+ ASSERT(ret == fs_info->sectorsize);
+}
+
+static struct btrfs_bio *alloc_scrub_bbio(struct btrfs_fs_info *fs_info,
+ unsigned int nr_vecs, blk_opf_t opf,
+ u64 logical,
+ btrfs_bio_end_io_t end_io, void *private)
+{
+ struct btrfs_bio *bbio;
+
+ bbio = btrfs_bio_alloc(nr_vecs, opf, BTRFS_I(fs_info->btree_inode),
+ logical, end_io, private);
+ bbio->is_scrub = true;
+ bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
+ return bbio;
+}
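Together these two helpers shrink each submit site to the same small pattern; a sketch of the resulting shape, as used by the repair-read and write paths below:

	if (!bbio)
		bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ,
					stripe->logical + (i << fs_info->sectorsize_bits),
					scrub_repair_read_endio, stripe);
	scrub_bio_add_sector(bbio, stripe, i);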
+
static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
int mirror, int blocksize, bool wait)
{
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct btrfs_bio *bbio = NULL;
- const unsigned long old_error_bitmap = stripe->error_bitmap;
+ const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);
int i;
- ASSERT(stripe->mirror_num >= 1);
- ASSERT(atomic_read(&stripe->pending_io) == 0);
+ ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num);
+ ASSERT(atomic_read(&stripe->pending_io) == 0,
+ "atomic_read(&stripe->pending_io)=%d", atomic_read(&stripe->pending_io));
for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) {
- struct page *page;
- int pgoff;
- int ret;
-
- page = scrub_stripe_get_page(stripe, i);
- pgoff = scrub_stripe_get_page_offset(stripe, i);
-
/* The current sector cannot be merged, submit the bio. */
- if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->error_bitmap)) ||
+ if (bbio && ((i > 0 && !test_bit(i - 1, &old_error_bitmap)) ||
bbio->bio.bi_iter.bi_size >= blocksize)) {
ASSERT(bbio->bio.bi_iter.bi_size);
atomic_inc(&stripe->pending_io);
@@ -844,15 +982,12 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
bbio = NULL;
}
- if (!bbio) {
- bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
- fs_info, scrub_repair_read_endio, stripe);
- bbio->bio.bi_iter.bi_sector = (stripe->logical +
- (i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT;
- }
+ if (!bbio)
+ bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ,
+ stripe->logical + (i << fs_info->sectorsize_bits),
+ scrub_repair_read_endio, stripe);
- ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
- ASSERT(ret == fs_info->sectorsize);
+ scrub_bio_add_sector(bbio, stripe, i);
}
if (bbio) {
ASSERT(bbio->bio.bi_iter.bi_size);
@@ -864,12 +999,15 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
}
static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
- struct scrub_stripe *stripe)
+ struct scrub_stripe *stripe,
+ const struct scrub_error_records *errors)
{
static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_device *dev = NULL;
+ const unsigned long extent_bitmap = scrub_bitmap_read_has_extent(stripe);
+ const unsigned long error_bitmap = scrub_bitmap_read_error(stripe);
u64 physical = 0;
int nr_data_sectors = 0;
int nr_meta_sectors = 0;
@@ -886,14 +1024,14 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
* Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
* and thus needs no dev/physical, error reporting still needs them.
*/
- if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) {
+ if (!bitmap_empty(&errors->init_error_bitmap, stripe->nr_sectors)) {
u64 mapped_len = fs_info->sectorsize;
struct btrfs_io_context *bioc = NULL;
int stripe_index = stripe->mirror_num - 1;
int ret;
/* For scrub, our mirror_num should always start at 1. */
- ASSERT(stripe->mirror_num >= 1);
+ ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num);
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
stripe->logical, &mapped_len, &bioc,
NULL, NULL);
@@ -909,10 +1047,10 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
}
skip:
- for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
+ for_each_set_bit(sector_nr, &extent_bitmap, stripe->nr_sectors) {
bool repaired = false;
- if (stripe->sectors[sector_nr].is_metadata) {
+ if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) {
nr_meta_sectors++;
} else {
nr_data_sectors++;
@@ -920,14 +1058,14 @@ skip:
nr_nodatacsum_sectors++;
}
- if (test_bit(sector_nr, &stripe->init_error_bitmap) &&
- !test_bit(sector_nr, &stripe->error_bitmap)) {
+ if (test_bit(sector_nr, &errors->init_error_bitmap) &&
+ !test_bit(sector_nr, &error_bitmap)) {
nr_repaired_sectors++;
repaired = true;
}
/* Good sector from the beginning, nothing needs to be done. */
- if (!test_bit(sector_nr, &stripe->init_error_bitmap))
+ if (!test_bit(sector_nr, &errors->init_error_bitmap))
continue;
/*
@@ -936,13 +1074,13 @@ skip:
*/
if (repaired) {
if (dev) {
- btrfs_err_rl_in_rcu(fs_info,
- "fixed up error at logical %llu on dev %s physical %llu",
+ btrfs_err_rl(fs_info,
+ "scrub: fixed up error at logical %llu on dev %s physical %llu",
stripe->logical, btrfs_dev_name(dev),
physical);
} else {
- btrfs_err_rl_in_rcu(fs_info,
- "fixed up error at logical %llu on mirror %u",
+ btrfs_err_rl(fs_info,
+ "scrub: fixed up error at logical %llu on mirror %u",
stripe->logical, stripe->mirror_num);
}
continue;
@@ -950,41 +1088,56 @@ skip:
/* The remaining are all for unrepaired. */
if (dev) {
- btrfs_err_rl_in_rcu(fs_info,
- "unable to fixup (regular) error at logical %llu on dev %s physical %llu",
+ btrfs_err_rl(fs_info,
+"scrub: unable to fixup (regular) error at logical %llu on dev %s physical %llu",
stripe->logical, btrfs_dev_name(dev),
physical);
} else {
- btrfs_err_rl_in_rcu(fs_info,
- "unable to fixup (regular) error at logical %llu on mirror %u",
+ btrfs_err_rl(fs_info,
+ "scrub: unable to fixup (regular) error at logical %llu on mirror %u",
stripe->logical, stripe->mirror_num);
}
- if (test_bit(sector_nr, &stripe->io_error_bitmap))
+ if (scrub_bitmap_test_bit_io_error(stripe, sector_nr))
if (__ratelimit(&rs) && dev)
scrub_print_common_warning("i/o error", dev, false,
stripe->logical, physical);
- if (test_bit(sector_nr, &stripe->csum_error_bitmap))
+ if (scrub_bitmap_test_bit_csum_error(stripe, sector_nr))
if (__ratelimit(&rs) && dev)
scrub_print_common_warning("checksum error", dev, false,
stripe->logical, physical);
- if (test_bit(sector_nr, &stripe->meta_error_bitmap))
+ if (scrub_bitmap_test_bit_meta_error(stripe, sector_nr))
if (__ratelimit(&rs) && dev)
scrub_print_common_warning("header error", dev, false,
stripe->logical, physical);
+ if (scrub_bitmap_test_bit_meta_gen_error(stripe, sector_nr))
+ if (__ratelimit(&rs) && dev)
+ scrub_print_common_warning("generation error", dev, false,
+ stripe->logical, physical);
}
+ /* Update the device stats. */
+ for (int i = 0; i < errors->nr_io_errors; i++)
+ btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS);
+ for (int i = 0; i < errors->nr_csum_errors; i++)
+ btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ /* Generation mismatch errors are counted per metadata tree block, not per block. */
+ for (int i = 0; i < errors->nr_meta_gen_errors;
+ i += (fs_info->nodesize >> fs_info->sectorsize_bits))
+ btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_GENERATION_ERRS);
+
spin_lock(&sctx->stat_lock);
sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits;
sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits;
sctx->stat.no_csum += nr_nodatacsum_sectors;
- sctx->stat.read_errors += stripe->init_nr_io_errors;
- sctx->stat.csum_errors += stripe->init_nr_csum_errors;
- sctx->stat.verify_errors += stripe->init_nr_meta_errors;
+ sctx->stat.read_errors += errors->nr_io_errors;
+ sctx->stat.csum_errors += errors->nr_csum_errors;
+ sctx->stat.verify_errors += errors->nr_meta_errors +
+ errors->nr_meta_gen_errors;
sctx->stat.uncorrectable_errors +=
- bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors);
+ bitmap_weight(&error_bitmap, stripe->nr_sectors);
sctx->stat.corrected_errors += nr_repaired_sectors;
spin_unlock(&sctx->stat_lock);
}
@@ -1010,26 +1163,26 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work);
struct scrub_ctx *sctx = stripe->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct scrub_error_records errors = { 0 };
int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
stripe->bg->length);
unsigned long repaired;
+ unsigned long error;
int mirror;
int i;
- ASSERT(stripe->mirror_num > 0);
+ ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num);
wait_scrub_stripe_io(stripe);
- scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap);
+ scrub_verify_one_stripe(stripe, scrub_bitmap_read_has_extent(stripe));
/* Save the initial failed bitmap for later repair and report usage. */
- stripe->init_error_bitmap = stripe->error_bitmap;
- stripe->init_nr_io_errors = bitmap_weight(&stripe->io_error_bitmap,
- stripe->nr_sectors);
- stripe->init_nr_csum_errors = bitmap_weight(&stripe->csum_error_bitmap,
- stripe->nr_sectors);
- stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap,
- stripe->nr_sectors);
-
- if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors))
+ errors.init_error_bitmap = scrub_bitmap_read_error(stripe);
+ errors.nr_io_errors = scrub_bitmap_weight_io_error(stripe);
+ errors.nr_csum_errors = scrub_bitmap_weight_csum_error(stripe);
+ errors.nr_meta_errors = scrub_bitmap_weight_meta_error(stripe);
+ errors.nr_meta_gen_errors = scrub_bitmap_weight_meta_gen_error(stripe);
+
+ if (bitmap_empty(&errors.init_error_bitmap, stripe->nr_sectors))
goto out;
/*
@@ -1041,13 +1194,13 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
for (mirror = calc_next_mirror(stripe->mirror_num, num_copies);
mirror != stripe->mirror_num;
mirror = calc_next_mirror(mirror, num_copies)) {
- const unsigned long old_error_bitmap = stripe->error_bitmap;
+ const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);
scrub_stripe_submit_repair_read(stripe, mirror,
BTRFS_STRIPE_LEN, false);
wait_scrub_stripe_io(stripe);
scrub_verify_one_stripe(stripe, old_error_bitmap);
- if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
+ if (scrub_bitmap_empty_error(stripe))
goto out;
}
@@ -1065,21 +1218,22 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
for (i = 0, mirror = stripe->mirror_num;
i < num_copies;
i++, mirror = calc_next_mirror(mirror, num_copies)) {
- const unsigned long old_error_bitmap = stripe->error_bitmap;
+ const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);
scrub_stripe_submit_repair_read(stripe, mirror,
fs_info->sectorsize, true);
wait_scrub_stripe_io(stripe);
scrub_verify_one_stripe(stripe, old_error_bitmap);
- if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
+ if (scrub_bitmap_empty_error(stripe))
goto out;
}
out:
+ error = scrub_bitmap_read_error(stripe);
/*
* Submit the repaired sectors. For the zoned case, we cannot do the
* repair in place, but queue the bg to be relocated.
*/
- bitmap_andnot(&repaired, &stripe->init_error_bitmap, &stripe->error_bitmap,
+ bitmap_andnot(&repaired, &errors.init_error_bitmap, &error,
stripe->nr_sectors);
if (!sctx->readonly && !bitmap_empty(&repaired, stripe->nr_sectors)) {
if (btrfs_is_zoned(fs_info)) {
@@ -1090,7 +1244,7 @@ out:
}
}
- scrub_stripe_report_errors(sctx, stripe);
+ scrub_stripe_report_errors(sctx, stripe, &errors);
set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state);
wake_up(&stripe->repair_wait);
}
@@ -1110,10 +1264,10 @@ static void scrub_read_endio(struct btrfs_bio *bbio)
num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;
if (bbio->bio.bi_status) {
- bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors);
- bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors);
+ scrub_bitmap_set_io_error(stripe, sector_nr, num_sectors);
+ scrub_bitmap_set_error(stripe, sector_nr, num_sectors);
} else {
- bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors);
+ scrub_bitmap_clear_io_error(stripe, sector_nr, num_sectors);
}
bio_put(&bbio->bio);
if (atomic_dec_and_test(&stripe->pending_io)) {
@@ -1142,6 +1296,9 @@ static void scrub_write_endio(struct btrfs_bio *bbio)
bitmap_set(&stripe->write_error_bitmap, sector_nr,
bio_size >> fs_info->sectorsize_bits);
spin_unlock_irqrestore(&stripe->write_error_lock, flags);
+ for (i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++)
+ btrfs_dev_stat_inc_and_print(stripe->dev,
+ BTRFS_DEV_STAT_WRITE_ERRS);
}
bio_put(&bbio->bio);
@@ -1199,27 +1356,19 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str
int sector_nr;
for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) {
- struct page *page = scrub_stripe_get_page(stripe, sector_nr);
- unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
- int ret;
-
/* We should only writeback sectors covered by an extent. */
- ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap));
+ ASSERT(scrub_bitmap_test_bit_has_extent(stripe, sector_nr));
/* Cannot merge with previous sector, submit the current one. */
if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) {
scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
bbio = NULL;
}
- if (!bbio) {
- bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_WRITE,
- fs_info, scrub_write_endio, stripe);
- bbio->bio.bi_iter.bi_sector = (stripe->logical +
- (sector_nr << fs_info->sectorsize_bits)) >>
- SECTOR_SHIFT;
- }
- ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
- ASSERT(ret == fs_info->sectorsize);
+ if (!bbio)
+ bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_WRITE,
+ stripe->logical + (sector_nr << fs_info->sectorsize_bits),
+ scrub_write_endio, stripe);
+ scrub_bio_add_sector(bbio, stripe, sector_nr);
}
if (bbio)
scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
@@ -1246,8 +1395,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d
* Slice is divided into intervals when the IO is submitted, adjust by
* bwlimit and maximum of 64 intervals.
*/
- div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
- div = min_t(u32, 64, div);
+ div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64);
/* Start new epoch, set deadline */
now = ktime_get();
@@ -1339,7 +1487,7 @@ static int compare_extent_item_range(struct btrfs_path *path,
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
- key.type == BTRFS_METADATA_ITEM_KEY);
+ key.type == BTRFS_METADATA_ITEM_KEY, "key.type=%u", key.type);
if (key.type == BTRFS_METADATA_ITEM_KEY)
len = fs_info->nodesize;
else
@@ -1390,7 +1538,7 @@ static int find_first_extent_item(struct btrfs_root *extent_root,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
@@ -1444,7 +1592,7 @@ static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
- key.type == BTRFS_EXTENT_ITEM_KEY);
+ key.type == BTRFS_EXTENT_ITEM_KEY, "key.type=%u", key.type);
*extent_start_ret = key.objectid;
if (key.type == BTRFS_METADATA_ITEM_KEY)
*size_ret = path->nodes[0]->fs_info->nodesize;
@@ -1470,8 +1618,7 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
physical,
sctx->write_pointer);
if (ret)
- btrfs_err(fs_info,
- "zoned: failed to recover write pointer");
+ btrfs_err(fs_info, "scrub: zoned: failed to recover write pointer");
}
mutex_unlock(&sctx->wr_lock);
btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
@@ -1493,9 +1640,9 @@ static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
struct scrub_sector_verification *sector =
&stripe->sectors[nr_sector];
- set_bit(nr_sector, &stripe->extent_sector_bitmap);
+ scrub_bitmap_set_bit_has_extent(stripe, nr_sector);
if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- sector->is_metadata = true;
+ scrub_bitmap_set_bit_is_metadata(stripe, nr_sector);
sector->generation = extent_gen;
}
}
@@ -1503,15 +1650,8 @@ static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
{
- stripe->extent_sector_bitmap = 0;
- stripe->init_error_bitmap = 0;
- stripe->init_nr_io_errors = 0;
- stripe->init_nr_csum_errors = 0;
- stripe->init_nr_meta_errors = 0;
- stripe->error_bitmap = 0;
- stripe->io_error_bitmap = 0;
- stripe->csum_error_bitmap = 0;
- stripe->meta_error_bitmap = 0;
+ ASSERT(stripe->nr_sectors);
+ bitmap_zero(stripe->bitmaps, scrub_bitmap_nr_last * stripe->nr_sectors);
}
/*
@@ -1541,8 +1681,8 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
u64 extent_gen;
int ret;
- if (unlikely(!extent_root)) {
- btrfs_err(fs_info, "no valid extent root for scrub");
+ if (unlikely(!extent_root || !csum_root)) {
+ btrfs_err(fs_info, "scrub: no valid extent or csum root found");
return -EUCLEAN;
}
memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
@@ -1550,7 +1690,9 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
scrub_stripe_reset_bitmaps(stripe);
/* The range must be inside the bg. */
- ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
+ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length,
+ "bg->start=%llu logical_start=%llu logical_end=%llu end=%llu",
+ bg->start, logical_start, logical_end, bg->start + bg->length);
ret = find_first_extent_item(extent_root, extent_path, logical_start,
logical_len);
@@ -1646,7 +1788,6 @@ static void scrub_reset_stripe(struct scrub_stripe *stripe)
stripe->state = 0;
for (int i = 0; i < stripe->nr_sectors; i++) {
- stripe->sectors[i].is_metadata = false;
stripe->sectors[i].csum = NULL;
stripe->sectors[i].generation = 0;
}
@@ -1665,24 +1806,21 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct btrfs_bio *bbio = NULL;
unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits;
+ const unsigned long has_extent = scrub_bitmap_read_has_extent(stripe);
u64 stripe_len = BTRFS_STRIPE_LEN;
int mirror = stripe->mirror_num;
int i;
atomic_inc(&stripe->pending_io);
- for_each_set_bit(i, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
- struct page *page = scrub_stripe_get_page(stripe, i);
- unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i);
-
+ for_each_set_bit(i, &has_extent, stripe->nr_sectors) {
/* We're beyond the chunk boundary, no need to read anymore. */
if (i >= nr_sectors)
break;
/* The current sector cannot be merged, submit the bio. */
if (bbio &&
- ((i > 0 &&
- !test_bit(i - 1, &stripe->extent_sector_bitmap)) ||
+ ((i > 0 && !test_bit(i - 1, &has_extent)) ||
bbio->bio.bi_iter.bi_size >= stripe_len)) {
ASSERT(bbio->bio.bi_iter.bi_size);
atomic_inc(&stripe->pending_io);
@@ -1695,7 +1833,7 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
struct btrfs_io_context *bioc = NULL;
const u64 logical = stripe->logical +
(i << fs_info->sectorsize_bits);
- int err;
+ int ret;
io_stripe.rst_search_commit_root = true;
stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits;
@@ -1703,11 +1841,11 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
* For RST cases, we need to manually split the bbio to
* follow the RST boundary.
*/
- err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
&stripe_len, &bioc, &io_stripe, &mirror);
btrfs_put_bioc(bioc);
- if (err < 0) {
- if (err != -ENODATA) {
+ if (ret < 0) {
+ if (ret != -ENODATA) {
/*
* Earlier btrfs_get_raid_extent_offset()
* returned -ENODATA, which means there's
@@ -1716,18 +1854,17 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
* the extent tree, then it's a preallocated
* extent and not an error.
*/
- set_bit(i, &stripe->io_error_bitmap);
- set_bit(i, &stripe->error_bitmap);
+ scrub_bitmap_set_bit_io_error(stripe, i);
+ scrub_bitmap_set_bit_error(stripe, i);
}
continue;
}
- bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
- fs_info, scrub_read_endio, stripe);
- bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
+ bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ,
+ logical, scrub_read_endio, stripe);
}
- __bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
+ scrub_bio_add_sector(bbio, stripe, i);
}
if (bbio) {
@@ -1748,6 +1885,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_bio *bbio;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits;
int mirror = stripe->mirror_num;
@@ -1760,20 +1898,11 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
return;
}
- bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
- scrub_read_endio, stripe);
-
- bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
+ bbio = alloc_scrub_bbio(fs_info, BTRFS_STRIPE_LEN >> min_folio_shift, REQ_OP_READ,
+ stripe->logical, scrub_read_endio, stripe);
/* Read the whole range inside the chunk boundary. */
- for (unsigned int cur = 0; cur < nr_sectors; cur++) {
- struct page *page = scrub_stripe_get_page(stripe, cur);
- unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur);
- int ret;
-
- ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
- /* We should have allocated enough bio vectors. */
- ASSERT(ret == fs_info->sectorsize);
- }
+ for (unsigned int cur = 0; cur < nr_sectors; cur++)
+ scrub_bio_add_sector(bbio, stripe, cur);
atomic_inc(&stripe->pending_io);
/*
@@ -1794,14 +1923,15 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
{
+ const unsigned long error = scrub_bitmap_read_error(stripe);
int i;
- for_each_set_bit(i, &stripe->error_bitmap, stripe->nr_sectors) {
- if (stripe->sectors[i].is_metadata) {
+ for_each_set_bit(i, &error, stripe->nr_sectors) {
+ if (scrub_bitmap_test_bit_is_metadata(stripe, i)) {
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
btrfs_err(fs_info,
- "stripe %llu has unrepaired metadata sector at %llu",
+ "scrub: stripe %llu has unrepaired metadata sector at logical %llu",
stripe->logical,
stripe->logical + (i << fs_info->sectorsize_bits));
return true;
@@ -1865,20 +1995,23 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
* metadata, we should immediately abort.
*/
for (int i = 0; i < nr_stripes; i++) {
- if (stripe_has_metadata_error(&sctx->stripes[i])) {
+ if (unlikely(stripe_has_metadata_error(&sctx->stripes[i]))) {
ret = -EIO;
goto out;
}
}
for (int i = 0; i < nr_stripes; i++) {
unsigned long good;
+ unsigned long has_extent;
+ unsigned long error;
stripe = &sctx->stripes[i];
ASSERT(stripe->dev == fs_info->dev_replace.srcdev);
- bitmap_andnot(&good, &stripe->extent_sector_bitmap,
- &stripe->error_bitmap, stripe->nr_sectors);
+ has_extent = scrub_bitmap_read_has_extent(stripe);
+ error = scrub_bitmap_read_error(stripe);
+ bitmap_andnot(&good, &has_extent, &error, stripe->nr_sectors);
scrub_write_sectors(sctx, stripe, good, true);
}
}
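
The helpers above materialize the per-stripe state as plain unsigned long bitmaps, so selecting the sectors worth copying is ordinary bit arithmetic. A minimal userspace sketch of what bitmap_andnot(&good, &has_extent, &error, nr_sectors) computes for a stripe that fits in one word (the bit patterns are made up for illustration):

#include <stdio.h>

int main(void)
{
	/* One bit per sector; a 64 KiB stripe of 4 KiB sectors is 16 bits. */
	unsigned long has_extent = 0xffUL;	/* sectors 0-7 carry data */
	unsigned long error      = 0x24UL;	/* sectors 2 and 5 failed */

	/* bitmap_andnot(&good, &has_extent, &error, n): good = has_extent & ~error */
	unsigned long good = has_extent & ~error;

	printf("good sectors: %#lx\n", good);	/* 0xdb: sectors 0,1,3,4,6,7 */
	return 0;
}

The kernel helper generalizes the same operation to bitmaps longer than one word.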
@@ -1944,37 +2077,135 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
return 0;
}
+/*
+ * Return 0 if we should not cancel the scrub.
+ * Return <0 if we need to cancel the scrub; the returned value indicates
+ * the reason:
+ * - -ECANCELED - Being explicitly canceled through ioctl.
+ * - -EINTR - Being interrupted by signal or fs/process freezing.
+ */
+static int should_cancel_scrub(const struct scrub_ctx *sctx)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+
+ if (atomic_read(&fs_info->scrub_cancel_req) ||
+ atomic_read(&sctx->cancel_req))
+ return -ECANCELED;
+
+ /*
+ * The user (e.g. the fsfreeze command) or power management (PM)
+ * suspend/hibernate can freeze the fs, and PM suspend/hibernate will
+ * also freeze all user processes.
+ *
+ * A user process can only be frozen when it is in user space, thus we
+ * have to cancel the scrub run so that the process can return to user
+ * space.
+ *
+ * Furthermore, we have to check both filesystem and process freezing,
+ * as PM can be configured to freeze the filesystems before processes.
+ *
+ * If we only check fs freezing, then suspend without fs freezing
+ * will time out, as the process is still in kernel space.
+ *
+ * If we only check process freezing, then suspend with fs freezing
+ * will time out, as the running scrub will prevent the fs from being frozen.
+ */
+ if (fs_info->sb->s_writers.frozen > SB_UNFROZEN ||
+ freezing(current) || signal_pending(current))
+ return -EINTR;
+ return 0;
+}
+
+static int scrub_raid56_cached_parity(struct scrub_ctx *sctx,
+ struct btrfs_device *scrub_dev,
+ struct btrfs_chunk_map *map,
+ u64 full_stripe_start,
+ unsigned long *extent_bitmap)
+{
+ DECLARE_COMPLETION_ONSTACK(io_done);
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_io_context *bioc = NULL;
+ struct btrfs_raid_bio *rbio;
+ struct bio bio;
+ const int data_stripes = nr_data_stripes(map);
+ u64 length = btrfs_stripe_nr_to_offset(data_stripes);
+ int ret;
+
+ bio_init(&bio, NULL, NULL, 0, REQ_OP_READ);
+ bio.bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT;
+ bio.bi_private = &io_done;
+ bio.bi_end_io = raid56_scrub_wait_endio;
+
+ btrfs_bio_counter_inc_blocked(fs_info);
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
+ &length, &bioc, NULL, NULL);
+ if (ret < 0)
+ goto out;
+ /* For a RAID56 write there must be a @bioc allocated. */
+ ASSERT(bioc);
+ rbio = raid56_parity_alloc_scrub_rbio(&bio, bioc, scrub_dev, extent_bitmap,
+ BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
+ btrfs_put_bioc(bioc);
+ if (!rbio) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ /* Use the recovered stripes as cache to avoid reading them from disk again. */
+ for (int i = 0; i < data_stripes; i++) {
+ struct scrub_stripe *stripe = &sctx->raid56_data_stripes[i];
+
+ raid56_parity_cache_data_folios(rbio, stripe->folios,
+ full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT));
+ }
+ raid56_parity_submit_scrub_rbio(rbio);
+ wait_for_completion_io(&io_done);
+ ret = blk_status_to_errno(bio.bi_status);
+out:
+ btrfs_bio_counter_dec(fs_info);
+ bio_uninit(&bio);
+ return ret;
+}
+
static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
u64 full_stripe_start)
{
- DECLARE_COMPLETION_ONSTACK(io_done);
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct btrfs_raid_bio *rbio;
- struct btrfs_io_context *bioc = NULL;
struct btrfs_path extent_path = { 0 };
struct btrfs_path csum_path = { 0 };
- struct bio *bio;
struct scrub_stripe *stripe;
bool all_empty = true;
const int data_stripes = nr_data_stripes(map);
unsigned long extent_bitmap = 0;
- u64 length = btrfs_stripe_nr_to_offset(data_stripes);
int ret;
ASSERT(sctx->raid56_data_stripes);
+ ret = should_cancel_scrub(sctx);
+ if (ret < 0)
+ return ret;
+
+ if (atomic_read(&fs_info->scrub_pause_req))
+ scrub_blocked_if_needed(fs_info);
+
+ spin_lock(&bg->lock);
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
+ spin_unlock(&bg->lock);
+ return 0;
+ }
+ spin_unlock(&bg->lock);
+
/*
* For data stripe search, we cannot reuse the same extent/csum paths,
* as the data stripe bytenr may be smaller than the previous extent's. Thus
* we have to use our own extent/csum paths.
*/
- extent_path.search_commit_root = 1;
- extent_path.skip_locking = 1;
- csum_path.search_commit_root = 1;
- csum_path.skip_locking = 1;
+ extent_path.search_commit_root = true;
+ extent_path.skip_locking = true;
+ csum_path.search_commit_root = true;
+ csum_path.skip_locking = true;
for (int i = 0; i < data_stripes; i++) {
int stripe_index;
@@ -2012,7 +2243,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
/* Check if all data stripes are empty. */
for (int i = 0; i < data_stripes; i++) {
stripe = &sctx->raid56_data_stripes[i];
- if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) {
+ if (!scrub_bitmap_empty_has_extent(stripe)) {
all_empty = false;
break;
}
@@ -2044,65 +2275,36 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
*/
for (int i = 0; i < data_stripes; i++) {
unsigned long error;
+ unsigned long has_extent;
stripe = &sctx->raid56_data_stripes[i];
+ error = scrub_bitmap_read_error(stripe);
+ has_extent = scrub_bitmap_read_has_extent(stripe);
+
/*
* We should only check the errors where there is an extent, as we
* may hit an empty data stripe while it's missing.
*/
- bitmap_and(&error, &stripe->error_bitmap,
- &stripe->extent_sector_bitmap, stripe->nr_sectors);
- if (!bitmap_empty(&error, stripe->nr_sectors)) {
+ bitmap_and(&error, &error, &has_extent, stripe->nr_sectors);
+ if (unlikely(!bitmap_empty(&error, stripe->nr_sectors))) {
btrfs_err(fs_info,
-"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
+"scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
full_stripe_start, i, stripe->nr_sectors,
&error);
ret = -EIO;
goto out;
}
- bitmap_or(&extent_bitmap, &extent_bitmap,
- &stripe->extent_sector_bitmap, stripe->nr_sectors);
+ bitmap_or(&extent_bitmap, &extent_bitmap, &has_extent,
+ stripe->nr_sectors);
}
/* Now we can check and regenerate the P/Q stripe. */
- bio = bio_alloc(NULL, 1, REQ_OP_READ, GFP_NOFS);
- bio->bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT;
- bio->bi_private = &io_done;
- bio->bi_end_io = raid56_scrub_wait_endio;
-
- btrfs_bio_counter_inc_blocked(fs_info);
- ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
- &length, &bioc, NULL, NULL);
- if (ret < 0) {
- btrfs_put_bioc(bioc);
- btrfs_bio_counter_dec(fs_info);
- goto out;
- }
- rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev, &extent_bitmap,
- BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
- btrfs_put_bioc(bioc);
- if (!rbio) {
- ret = -ENOMEM;
- btrfs_bio_counter_dec(fs_info);
- goto out;
- }
- /* Use the recovered stripes as cache to avoid read them from disk again. */
- for (int i = 0; i < data_stripes; i++) {
- stripe = &sctx->raid56_data_stripes[i];
-
- raid56_parity_cache_data_pages(rbio, stripe->pages,
- full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT));
- }
- raid56_parity_submit_scrub_rbio(rbio);
- wait_for_completion_io(&io_done);
- ret = blk_status_to_errno(bio->bi_status);
- bio_put(bio);
- btrfs_bio_counter_dec(fs_info);
-
+ ret = scrub_raid56_cached_parity(sctx, scrub_dev, map, full_stripe_start,
+ &extent_bitmap);
+out:
btrfs_release_path(&extent_path);
btrfs_release_path(&csum_path);
-out:
return ret;
}
@@ -2133,18 +2335,13 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
u64 found_logical = U64_MAX;
u64 cur_physical = physical + cur_logical - logical_start;
- /* Canceled? */
- if (atomic_read(&fs_info->scrub_cancel_req) ||
- atomic_read(&sctx->cancel_req)) {
- ret = -ECANCELED;
+ ret = should_cancel_scrub(sctx);
+ if (ret < 0)
break;
- }
- /* Paused? */
- if (atomic_read(&fs_info->scrub_pause_req)) {
- /* Push queued extents */
+
+ if (atomic_read(&fs_info->scrub_pause_req))
scrub_blocked_if_needed(fs_info);
- }
- /* Block group removed? */
+
spin_lock(&bg->lock);
if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
spin_unlock(&bg->lock);
@@ -2399,8 +2596,6 @@ out:
}
if (sctx->is_dev_replace && ret >= 0) {
- int ret2;
-
ret2 = sync_write_pointer_for_zoned(sctx,
chunk_logical + offset,
map->stripes[stripe_index].physical,
@@ -2475,7 +2670,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev, u64 start, u64 end)
{
struct btrfs_dev_extent *dev_extent = NULL;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_root *root = fs_info->dev_root;
u64 chunk_offset;
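
BTRFS_PATH_AUTO_FREE() above (and AUTO_KFREE() later in this series) declare variables that are released automatically when they leave scope, which is what lets the error paths drop their explicit btrfs_free_path()/kfree() calls. Both are btrfs wrappers around the compiler's cleanup attribute; a minimal userspace sketch of the same idiom (the AUTO_FREE/freep names are illustrative, not the kernel's):

#include <stdio.h>
#include <stdlib.h>

static void freep(void *p)
{
	free(*(void **)p);	/* runs when the annotated variable leaves scope */
}

#define AUTO_FREE __attribute__((cleanup(freep)))

static int demo(void)
{
	AUTO_FREE char *buf = malloc(64);

	if (!buf)
		return -1;
	snprintf(buf, 64, "freed on every return path");
	puts(buf);
	return 0;	/* no explicit free(), even on early returns */
}

int main(void)
{
	return demo();
}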
@@ -2493,8 +2688,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
return -ENOMEM;
path->reada = READA_FORWARD;
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
key.objectid = scrub_dev->devid;
key.type = BTRFS_DEV_EXTENT_KEY;
@@ -2677,14 +2872,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
ro_set = 0;
} else if (ret == -ETXTBSY) {
btrfs_warn(fs_info,
- "skipping scrub of block group %llu due to active swapfile",
+ "scrub: skipping scrub of block group %llu due to active swapfile",
cache->start);
scrub_pause_off(fs_info);
ret = 0;
goto skip_unfreeze;
} else {
- btrfs_warn(fs_info,
- "failed setting block group ro: %d", ret);
+ btrfs_warn(fs_info, "scrub: failed setting block group ro: %d",
+ ret);
btrfs_unfreeze_block_group(cache);
btrfs_put_block_group(cache);
scrub_pause_off(fs_info);
@@ -2747,8 +2942,8 @@ skip_unfreeze:
btrfs_put_block_group(cache);
if (ret)
break;
- if (sctx->is_dev_replace &&
- atomic64_read(&dev_replace->num_write_errors) > 0) {
+ if (unlikely(sctx->is_dev_replace &&
+ atomic64_read(&dev_replace->num_write_errors) > 0)) {
ret = -EIO;
break;
}
@@ -2761,8 +2956,6 @@ skip:
btrfs_release_path(path);
}
- btrfs_free_path(path);
-
return ret;
}
@@ -2770,29 +2963,23 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
struct page *page, u64 physical, u64 generation)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct bio_vec bvec;
- struct bio bio;
struct btrfs_super_block *sb = page_address(page);
int ret;
- bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ);
- bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT;
- __bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0);
- ret = submit_bio_wait(&bio);
- bio_uninit(&bio);
-
+ ret = bdev_rw_virt(dev->bdev, physical >> SECTOR_SHIFT, sb,
+ BTRFS_SUPER_INFO_SIZE, REQ_OP_READ);
if (ret < 0)
return ret;
ret = btrfs_check_super_csum(fs_info, sb);
- if (ret != 0) {
+ if (unlikely(ret != 0)) {
btrfs_err_rl(fs_info,
- "super block at physical %llu devid %llu has bad csum",
+ "scrub: super block at physical %llu devid %llu has bad csum",
physical, dev->devid);
return -EIO;
}
- if (btrfs_super_generation(sb) != generation) {
+ if (unlikely(btrfs_super_generation(sb) != generation)) {
btrfs_err_rl(fs_info,
-"super block at physical %llu devid %llu has bad generation %llu expect %llu",
+"scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu",
physical, dev->devid,
btrfs_super_generation(sb), generation);
return -EUCLEAN;
@@ -2908,7 +3095,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info)
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
- int readonly, int is_dev_replace)
+ bool readonly, bool is_dev_replace)
{
struct btrfs_dev_lookup_args args = { .devid = devid };
struct scrub_ctx *sctx;
@@ -2917,6 +3104,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
unsigned int nofs_flag;
bool need_commit = false;
+ /* Set the basic fallback @last_physical before we have allocated a sctx. */
+ if (progress)
+ progress->last_physical = start;
+
if (btrfs_fs_closing(fs_info))
return -EAGAIN;
@@ -2935,6 +3126,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
sctx = scrub_setup_ctx(fs_info, is_dev_replace);
if (IS_ERR(sctx))
return PTR_ERR(sctx);
+ sctx->stat.last_physical = start;
ret = scrub_workers_get(fs_info);
if (ret)
@@ -2952,16 +3144,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (!is_dev_replace && !readonly &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- btrfs_err_in_rcu(fs_info,
- "scrub on devid %llu: filesystem on %s is not writable",
+ btrfs_err(fs_info,
+ "scrub: devid %llu: filesystem on %s is not writable",
devid, btrfs_dev_name(dev));
ret = -EROFS;
goto out;
}
mutex_lock(&fs_info->scrub_lock);
- if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
- test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
+ if (unlikely(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state))) {
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -EIO;
diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h
index f0df597b75c7..aa68b6ebaf55 100644
--- a/fs/btrfs/scrub.h
+++ b/fs/btrfs/scrub.h
@@ -11,7 +11,7 @@ struct btrfs_scrub_progress;
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
- int readonly, int is_dev_replace);
+ bool readonly, bool is_dev_replace);
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info);
void btrfs_scrub_continue(struct btrfs_fs_info *fs_info);
int btrfs_scrub_cancel(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 0c8c58c4f29b..2522faa97478 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4,6 +4,7 @@
*/
#include <linux/bsearch.h>
+#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/sort.h>
@@ -46,28 +47,30 @@
* It allows fast adding of path elements on the right side (normal path) and
* fast adding to the left side (reversed path). A reversed path can also be
* unreversed if needed.
+ *
+ * The definition of struct fs_path relies on -fms-extensions to allow
+ * including a tagged struct as an anonymous member.
*/
+struct __fs_path {
+ char *start;
+ char *end;
+
+ char *buf;
+ unsigned short buf_len:15;
+ unsigned short reversed:1;
+};
+static_assert(sizeof(struct __fs_path) < 256);
struct fs_path {
- union {
- struct {
- char *start;
- char *end;
-
- char *buf;
- unsigned short buf_len:15;
- unsigned short reversed:1;
- char inline_buf[];
- };
- /*
- * Average path length does not exceed 200 bytes, we'll have
- * better packing in the slab and higher chance to satisfy
- * an allocation later during send.
- */
- char pad[256];
- };
+ struct __fs_path;
+ /*
+ * Average path length does not exceed 200 bytes, so we'll have
+ * better packing in the slab and a higher chance of satisfying
+ * an allocation later during send.
+ */
+ char inline_buf[256 - sizeof(struct __fs_path)];
};
#define FS_PATH_INLINE_SIZE \
- (sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
+ sizeof_field(struct fs_path, inline_buf)
/* reused for each extent */
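
Two details of the new layout are worth spelling out: -fms-extensions allows embedding a tagged struct as an anonymous member (so p->start keeps working on a struct fs_path), and sizing inline_buf as 256 - sizeof(struct __fs_path) preserves the slab-friendly 256-byte object without the old union-with-pad trick. A standalone sketch that compiles with gcc -fms-extensions and checks the arithmetic; the concrete sizes assume a typical LP64 ABI:

/* gcc -fms-extensions -std=c11 -c fs_path_demo.c */
#include <assert.h>

#define sizeof_field(TYPE, MEMBER) sizeof(((TYPE *)0)->MEMBER)

struct __fs_path {
	char *start;
	char *end;
	char *buf;
	unsigned short buf_len:15;
	unsigned short reversed:1;
};

struct fs_path {
	struct __fs_path;	/* anonymous tagged member (-fms-extensions) */
	char inline_buf[256 - sizeof(struct __fs_path)];
};

/* On LP64: three pointers plus a padded unsigned short take 32 bytes, so
 * inline_buf gets 224 bytes and the whole struct stays exactly 256 bytes. */
static_assert(sizeof(struct __fs_path) == 32, "LP64 assumption");
static_assert(sizeof(struct fs_path) == 256, "slab-friendly size");
static_assert(sizeof_field(struct fs_path, inline_buf) == 224, "inline size");

int main(void)
{
	struct fs_path p;

	p.start = p.inline_buf;	/* __fs_path members are accessed directly */
	return p.start != p.inline_buf;
}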
@@ -177,7 +180,6 @@ struct send_ctx {
u64 cur_inode_rdev;
u64 cur_inode_last_extent;
u64 cur_inode_next_write_offset;
- struct fs_path cur_inode_path;
bool cur_inode_new;
bool cur_inode_new_gen;
bool cur_inode_deleted;
@@ -304,6 +306,8 @@ struct send_ctx {
struct btrfs_lru_cache dir_created_cache;
struct btrfs_lru_cache dir_utimes_cache;
+
+ struct fs_path cur_inode_path;
};
struct pending_dir_move {
@@ -383,11 +387,11 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx,
result_string = "updated";
break;
case BTRFS_COMPARE_TREE_SAME:
- ASSERT(0);
+ DEBUG_WARN("no change between trees");
result_string = "unchanged";
break;
default:
- ASSERT(0);
+ DEBUG_WARN("unexpected comparison result %d", result);
result_string = "unexpected";
}
@@ -630,9 +634,9 @@ static struct btrfs_path *alloc_path_for_send(void)
path = btrfs_alloc_path();
if (!path)
return NULL;
- path->search_commit_root = 1;
- path->skip_locking = 1;
- path->need_commit_sem = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
+ path->need_commit_sem = true;
return path;
}
@@ -645,7 +649,7 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
ret = kernel_write(filp, buf + pos, len - pos, off);
if (ret < 0)
return ret;
- if (ret == 0)
+ if (unlikely(ret == 0))
return -EIO;
pos += ret;
}
@@ -758,7 +762,7 @@ static int send_header(struct send_ctx *sctx)
{
struct btrfs_stream_header hdr;
- strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
+ strscpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
hdr.version = cpu_to_le32(sctx->proto);
return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
&sctx->send_off);
@@ -816,11 +820,8 @@ static int send_cmd(struct send_ctx *sctx)
static int send_rename(struct send_ctx *sctx,
struct fs_path *from, struct fs_path *to)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret;
- btrfs_debug(fs_info, "send_rename %s -> %s", from->start, to->start);
-
ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME);
if (ret < 0)
return ret;
@@ -840,11 +841,8 @@ tlv_put_failure:
static int send_link(struct send_ctx *sctx,
struct fs_path *path, struct fs_path *lnk)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret;
- btrfs_debug(fs_info, "send_link %s -> %s", path->start, lnk->start);
-
ret = begin_cmd(sctx, BTRFS_SEND_C_LINK);
if (ret < 0)
return ret;
@@ -863,11 +861,8 @@ tlv_put_failure:
*/
static int send_unlink(struct send_ctx *sctx, struct fs_path *path)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret;
- btrfs_debug(fs_info, "send_unlink %s", path->start);
-
ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK);
if (ret < 0)
return ret;
@@ -885,11 +880,8 @@ tlv_put_failure:
*/
static int send_rmdir(struct send_ctx *sctx, struct fs_path *path)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret;
- btrfs_debug(fs_info, "send_rmdir %s", path->start);
-
ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR);
if (ret < 0)
return ret;
@@ -920,7 +912,7 @@ static int get_inode_info(struct btrfs_root *root, u64 ino,
struct btrfs_inode_info *info)
{
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_inode_item *ii;
struct btrfs_key key;
@@ -935,11 +927,11 @@ static int get_inode_info(struct btrfs_root *root, u64 ino,
if (ret) {
if (ret > 0)
ret = -ENOENT;
- goto out;
+ return ret;
}
if (!info)
- goto out;
+ return 0;
ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
@@ -956,9 +948,7 @@ static int get_inode_info(struct btrfs_root *root, u64 ino,
*/
info->fileattr = btrfs_inode_flags(path->nodes[0], ii);
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen)
@@ -984,13 +974,13 @@ typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx);
* path must point to the INODE_REF or INODE_EXTREF when called.
*/
static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *found_key, int resolve,
+ struct btrfs_key *found_key, bool resolve,
iterate_inode_ref_t iterate, void *ctx)
{
struct extent_buffer *eb = path->nodes[0];
struct btrfs_inode_ref *iref;
struct btrfs_inode_extref *extref;
- struct btrfs_path *tmp_path;
+ BTRFS_PATH_AUTO_FREE(tmp_path);
struct fs_path *p;
u32 cur = 0;
u32 total;
@@ -1064,10 +1054,8 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
}
if (unlikely(start < p->buf)) {
btrfs_err(root->fs_info,
- "send: path ref buffer underflow for key (%llu %u %llu)",
- found_key->objectid,
- found_key->type,
- found_key->offset);
+ "send: path ref buffer underflow for key " BTRFS_KEY_FMT,
+ BTRFS_KEY_FMT_VALUE(found_key));
ret = -EINVAL;
goto out;
}
@@ -1087,7 +1075,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
}
out:
- btrfs_free_path(tmp_path);
fs_path_free(p);
return ret;
}
@@ -1148,12 +1135,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) {
- if (name_len > XATTR_NAME_MAX) {
+ if (unlikely(name_len > XATTR_NAME_MAX)) {
ret = -ENAMETOOLONG;
goto out;
}
- if (name_len + data_len >
- BTRFS_MAX_XATTR_SIZE(root->fs_info)) {
+ if (unlikely(name_len + data_len >
+ BTRFS_MAX_XATTR_SIZE(root->fs_info))) {
ret = -E2BIG;
goto out;
}
@@ -1161,7 +1148,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
/*
* Path too long
*/
- if (name_len + data_len > PATH_MAX) {
+ if (unlikely(name_len + data_len > PATH_MAX)) {
ret = -ENAMETOOLONG;
goto out;
}
@@ -1235,7 +1222,7 @@ static int get_inode_path(struct btrfs_root *root,
{
int ret;
struct btrfs_key key, found_key;
- struct btrfs_path *p;
+ BTRFS_PATH_AUTO_FREE(p);
p = alloc_path_for_send();
if (!p)
@@ -1249,28 +1236,20 @@ static int get_inode_path(struct btrfs_root *root,
ret = btrfs_search_slot_for_read(root, &key, p, 1, 0);
if (ret < 0)
- goto out;
- if (ret) {
- ret = 1;
- goto out;
- }
+ return ret;
+ if (ret)
+ return 1;
+
btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]);
if (found_key.objectid != ino ||
(found_key.type != BTRFS_INODE_REF_KEY &&
- found_key.type != BTRFS_INODE_EXTREF_KEY)) {
- ret = -ENOENT;
- goto out;
- }
+ found_key.type != BTRFS_INODE_EXTREF_KEY))
+ return -ENOENT;
- ret = iterate_inode_ref(root, p, &found_key, 1,
- __copy_first_ref, path);
+ ret = iterate_inode_ref(root, p, &found_key, true, __copy_first_ref, path);
if (ret < 0)
- goto out;
- ret = 0;
-
-out:
- btrfs_free_path(p);
- return ret;
+ return ret;
+ return 0;
}
struct backref_ctx {
@@ -1400,7 +1379,7 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
struct backref_ctx *bctx = ctx;
struct send_ctx *sctx = bctx->sctx;
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
- const u64 key = leaf_bytenr >> fs_info->sectorsize_bits;
+ const u64 key = leaf_bytenr >> fs_info->nodesize_bits;
struct btrfs_lru_cache_entry *raw_entry;
struct backref_cache_entry *entry;
@@ -1455,7 +1434,7 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
if (!new_entry)
return;
- new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits;
+ new_entry->entry.key = leaf_bytenr >> fs_info->nodesize_bits;
new_entry->entry.gen = 0;
new_entry->num_roots = 0;
ULIST_ITER_INIT(&uiter);
@@ -1573,7 +1552,6 @@ static int find_extent_clone(struct send_ctx *sctx,
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret;
int extent_type;
- u64 logical;
u64 disk_byte;
u64 num_bytes;
struct btrfs_file_extent_item *fi;
@@ -1604,7 +1582,6 @@ static int find_extent_clone(struct send_ctx *sctx,
compressed = btrfs_file_extent_compression(eb, fi);
num_bytes = btrfs_file_extent_num_bytes(eb, fi);
- logical = disk_byte + btrfs_file_extent_offset(eb, fi);
/*
* Setup the clone roots.
@@ -1686,14 +1663,8 @@ static int find_extent_clone(struct send_ctx *sctx,
}
up_read(&fs_info->commit_root_sem);
- btrfs_debug(fs_info,
- "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
- data_offset, ino, num_bytes, logical);
-
- if (!backref_ctx.found) {
- btrfs_debug(fs_info, "no clones found");
+ if (!backref_ctx.found)
return -ENOENT;
- }
cur_clone_root = NULL;
for (i = 0; i < sctx->clone_roots_cnt; i++) {
@@ -1735,7 +1706,7 @@ static int read_symlink(struct btrfs_root *root,
struct fs_path *dest)
{
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_file_extent_item *ei;
u8 type;
@@ -1752,21 +1723,20 @@ static int read_symlink(struct btrfs_root *root,
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
- if (ret) {
+ return ret;
+ if (unlikely(ret)) {
/*
* An empty symlink inode. Can happen in rare error paths when
* creating a symlink (transaction committed before the inode
* eviction handler removed the symlink inode items and a crash
- * happened in between or the subvol was snapshoted in between).
+ * happened in between or the subvol was snapshotted in between).
* Print an informative message to dmesg/syslog so that the user
* can delete the symlink.
*/
btrfs_err(root->fs_info,
"Found empty symlink inode %llu at root %llu",
ino, btrfs_root_id(root));
- ret = -EIO;
- goto out;
+ return -EIO;
}
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -1777,7 +1747,7 @@ static int read_symlink(struct btrfs_root *root,
btrfs_crit(root->fs_info,
"send: found symlink extent that is not inline, ino %llu root %llu extent type %d",
ino, btrfs_root_id(root), type);
- goto out;
+ return ret;
}
compression = btrfs_file_extent_compression(path->nodes[0], ei);
if (unlikely(compression != BTRFS_COMPRESS_NONE)) {
@@ -1785,17 +1755,13 @@ static int read_symlink(struct btrfs_root *root,
btrfs_crit(root->fs_info,
"send: found symlink extent with compression, ino %llu root %llu compression type %d",
ino, btrfs_root_id(root), compression);
- goto out;
+ return ret;
}
off = btrfs_file_extent_inline_start(ei);
len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
- ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
-
-out:
- btrfs_free_path(path);
- return ret;
+ return fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
}
/*
@@ -1806,8 +1772,7 @@ static int gen_unique_name(struct send_ctx *sctx,
u64 ino, u64 gen,
struct fs_path *dest)
{
- int ret = 0;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dir_item *di;
char tmp[64];
int len;
@@ -1824,16 +1789,15 @@ static int gen_unique_name(struct send_ctx *sctx,
ino, gen, idx);
ASSERT(len < sizeof(tmp));
tmp_name.name = tmp;
- tmp_name.len = strlen(tmp);
+ tmp_name.len = len;
di = btrfs_lookup_dir_item(NULL, sctx->send_root,
path, BTRFS_FIRST_FREE_OBJECTID,
&tmp_name, 0);
btrfs_release_path(path);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
- }
+ if (IS_ERR(di))
+ return PTR_ERR(di);
+
if (di) {
/* not unique, try again */
idx++;
@@ -1842,7 +1806,6 @@ static int gen_unique_name(struct send_ctx *sctx,
if (!sctx->parent_root) {
/* unique */
- ret = 0;
break;
}
@@ -1850,10 +1813,9 @@ static int gen_unique_name(struct send_ctx *sctx,
path, BTRFS_FIRST_FREE_OBJECTID,
&tmp_name, 0);
btrfs_release_path(path);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
- }
+ if (IS_ERR(di))
+ return PTR_ERR(di);
+
if (di) {
/* not unique, try again */
idx++;
@@ -1863,11 +1825,7 @@ static int gen_unique_name(struct send_ctx *sctx,
break;
}
- ret = fs_path_add(dest, tmp, strlen(tmp));
-
-out:
- btrfs_free_path(path);
- return ret;
+ return fs_path_add(dest, tmp, len);
}
enum inode_state {
@@ -1979,7 +1937,7 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
int ret = 0;
struct btrfs_dir_item *di;
struct btrfs_key key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len);
path = alloc_path_for_send();
@@ -1987,19 +1945,15 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
return -ENOMEM;
di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0);
- if (IS_ERR_OR_NULL(di)) {
- ret = di ? PTR_ERR(di) : -ENOENT;
- goto out;
- }
+ if (IS_ERR_OR_NULL(di))
+ return di ? PTR_ERR(di) : -ENOENT;
+
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
- if (key.type == BTRFS_ROOT_ITEM_KEY) {
- ret = -ENOENT;
- goto out;
- }
+ if (key.type == BTRFS_ROOT_ITEM_KEY)
+ return -ENOENT;
+
*found_inode = key.objectid;
-out:
- btrfs_free_path(path);
return ret;
}
@@ -2013,7 +1967,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int len;
u64 parent_dir;
@@ -2027,16 +1981,14 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
if (ret < 0)
- goto out;
+ return ret;
if (!ret)
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
if (ret || found_key.objectid != ino ||
(found_key.type != BTRFS_INODE_REF_KEY &&
- found_key.type != BTRFS_INODE_EXTREF_KEY)) {
- ret = -ENOENT;
- goto out;
- }
+ found_key.type != BTRFS_INODE_EXTREF_KEY))
+ return -ENOENT;
if (found_key.type == BTRFS_INODE_REF_KEY) {
struct btrfs_inode_ref *iref;
@@ -2057,19 +2009,17 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref);
}
if (ret < 0)
- goto out;
+ return ret;
btrfs_release_path(path);
if (dir_gen) {
ret = get_inode_gen(root, parent_dir, dir_gen);
if (ret < 0)
- goto out;
+ return ret;
}
*dir = parent_dir;
-out:
- btrfs_free_path(path);
return ret;
}
@@ -2505,11 +2455,11 @@ static int send_subvol_begin(struct send_ctx *sctx)
int ret;
struct btrfs_root *send_root = sctx->send_root;
struct btrfs_root *parent_root = sctx->parent_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
- char *name = NULL;
+ char AUTO_KFREE(name);
int namelen;
path = btrfs_alloc_path();
@@ -2517,10 +2467,8 @@ static int send_subvol_begin(struct send_ctx *sctx)
return -ENOMEM;
name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
- if (!name) {
- btrfs_free_path(path);
+ if (!name)
return -ENOMEM;
- }
key.objectid = btrfs_root_id(send_root);
key.type = BTRFS_ROOT_BACKREF_KEY;
@@ -2529,18 +2477,15 @@ static int send_subvol_begin(struct send_ctx *sctx)
ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root,
&key, path, 1, 0);
if (ret < 0)
- goto out;
- if (ret) {
- ret = -ENOENT;
- goto out;
- }
+ return ret;
+ if (ret)
+ return -ENOENT;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.type != BTRFS_ROOT_BACKREF_KEY ||
key.objectid != btrfs_root_id(send_root)) {
- ret = -ENOENT;
- goto out;
+ return -ENOENT;
}
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
namelen = btrfs_root_ref_name_len(leaf, ref);
@@ -2550,11 +2495,11 @@ static int send_subvol_begin(struct send_ctx *sctx)
if (parent_root) {
ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
if (ret < 0)
- goto out;
+ return ret;
} else {
ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL);
if (ret < 0)
- goto out;
+ return ret;
}
TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
@@ -2582,9 +2527,6 @@ static int send_subvol_begin(struct send_ctx *sctx)
ret = send_cmd(sctx);
tlv_put_failure:
-out:
- btrfs_free_path(path);
- kfree(name);
return ret;
}
@@ -2631,12 +2573,9 @@ static void free_path_for_command(const struct send_ctx *sctx, struct fs_path *p
static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
- btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size);
-
p = get_path_for_command(sctx, ino, gen);
if (IS_ERR(p))
return PTR_ERR(p);
@@ -2658,12 +2597,9 @@ out:
static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
- btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode);
-
p = get_path_for_command(sctx, ino, gen);
if (IS_ERR(p))
return PTR_ERR(p);
@@ -2685,15 +2621,12 @@ out:
static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
if (sctx->proto < 2)
return 0;
- btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr);
-
p = get_path_for_command(sctx, ino, gen);
if (IS_ERR(p))
return PTR_ERR(p);
@@ -2715,13 +2648,9 @@ out:
static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
- btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu",
- ino, uid, gid);
-
p = get_path_for_command(sctx, ino, gen);
if (IS_ERR(p))
return PTR_ERR(p);
@@ -2744,17 +2673,14 @@ out:
static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p = NULL;
struct btrfs_inode_item *ii;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *eb;
struct btrfs_key key;
int slot;
- btrfs_debug(fs_info, "send_utimes %llu", ino);
-
p = get_path_for_command(sctx, ino, gen);
if (IS_ERR(p))
return PTR_ERR(p);
@@ -2794,7 +2720,6 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
tlv_put_failure:
out:
free_path_for_command(sctx, p);
- btrfs_free_path(path);
return ret;
}
@@ -2804,7 +2729,7 @@ out:
* processing an inode that is a directory and it just got renamed, and existing
* entries in the cache may refer to inodes that have the directory in their
* full path - in which case we would generate outdated paths (pre-rename)
- * for the inodes that the cache entries point to. Instead of prunning the
+ * for the inodes that the cache entries point to. Instead of pruning the
* cache when inserting, do it after we finish processing each inode at
* finish_inode_if_needed().
*/
@@ -2861,7 +2786,6 @@ static int trim_dir_utimes_cache(struct send_ctx *sctx)
*/
static int send_create_inode(struct send_ctx *sctx, u64 ino)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
int cmd;
@@ -2870,8 +2794,6 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
u64 mode;
u64 rdev;
- btrfs_debug(fs_info, "send_create_inode %llu", ino);
-
p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -2968,7 +2890,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
{
int ret = 0;
int iter_ret = 0;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_key di_key;
@@ -3008,7 +2930,6 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
if (iter_ret < 0)
ret = iter_ret;
- btrfs_free_path(path);
return ret;
}
@@ -3098,7 +3019,7 @@ static void __free_recorded_refs(struct list_head *head)
struct recorded_ref *cur;
while (!list_empty(head)) {
- cur = list_entry(head->next, struct recorded_ref, list);
+ cur = list_first_entry(head, struct recorded_ref, list);
recorded_ref_free(cur);
}
}
@@ -3788,7 +3709,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
struct recorded_ref *parent_ref,
const bool is_orphan)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key di_key;
struct btrfs_dir_item *di;
@@ -3809,19 +3730,15 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len);
ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = 0;
- goto out;
- }
+ if (ret < 0)
+ return ret;
+ if (ret > 0)
+ return 0;
di = btrfs_match_dir_item_name(path, parent_ref->name,
parent_ref->name_len);
- if (!di) {
- ret = 0;
- goto out;
- }
+ if (!di)
+ return 0;
/*
* di_key.objectid has the number of the inode that has a dentry in the
* parent directory with the same name that sctx->cur_ino is being
@@ -3831,26 +3748,22 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
* that it happens after that other inode is renamed.
*/
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key);
- if (di_key.type != BTRFS_INODE_ITEM_KEY) {
- ret = 0;
- goto out;
- }
+ if (di_key.type != BTRFS_INODE_ITEM_KEY)
+ return 0;
ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen);
if (ret < 0)
- goto out;
+ return ret;
ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
- goto out;
+ return ret;
}
/* Different inode, no need to delay the rename of sctx->cur_ino */
- if (right_gen != left_gen) {
- ret = 0;
- goto out;
- }
+ if (right_gen != left_gen)
+ return 0;
wdm = get_waiting_dir_move(sctx, di_key.objectid);
if (wdm && !wdm->orphanized) {
@@ -3864,8 +3777,6 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
if (!ret)
ret = 1;
}
-out:
- btrfs_free_path(path);
return ret;
}
@@ -3915,7 +3826,7 @@ static int is_ancestor(struct btrfs_root *root,
bool free_fs_path = false;
int ret = 0;
int iter_ret = 0;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
if (!fs_path) {
@@ -3983,7 +3894,6 @@ static int is_ancestor(struct btrfs_root *root,
ret = iter_ret;
out:
- btrfs_free_path(path);
if (free_fs_path)
fs_path_free(fs_path);
return ret;
@@ -4163,7 +4073,7 @@ static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
*/
static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
{
- char *name;
+ char AUTO_KFREE(name);
int ret;
name = kmemdup(ref->name, ref->name_len, GFP_KERNEL);
@@ -4173,17 +4083,58 @@ static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
fs_path_reset(ref->full_path);
ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path);
if (ret < 0)
- goto out;
+ return ret;
ret = fs_path_add(ref->full_path, name, ref->name_len);
if (ret < 0)
- goto out;
+ return ret;
/* Update the reference's base name pointer. */
set_ref_path(ref, ref->full_path);
-out:
- kfree(name);
- return ret;
+
+ return 0;
+}
+
+static int rbtree_check_dir_ref_comp(const void *k, const struct rb_node *node)
+{
+ const struct recorded_ref *data = k;
+ const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node);
+
+ if (data->dir > ref->dir)
+ return 1;
+ if (data->dir < ref->dir)
+ return -1;
+ if (data->dir_gen > ref->dir_gen)
+ return 1;
+ if (data->dir_gen < ref->dir_gen)
+ return -1;
+ return 0;
+}
+
+static bool rbtree_check_dir_ref_less(struct rb_node *node, const struct rb_node *parent)
+{
+ const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node);
+
+ return rbtree_check_dir_ref_comp(entry, parent) < 0;
+}
+
+static int record_check_dir_ref_in_tree(struct rb_root *root,
+ struct recorded_ref *ref, struct list_head *list)
+{
+ struct recorded_ref *tmp_ref;
+ int ret;
+
+ if (rb_find(ref, root, rbtree_check_dir_ref_comp))
+ return 0;
+
+ ret = dup_ref(ref, list);
+ if (ret < 0)
+ return ret;
+
+ tmp_ref = list_last_entry(list, struct recorded_ref, list);
+ rb_add(&tmp_ref->node, root, rbtree_check_dir_ref_less);
+ tmp_ref->root = root;
+ return 0;
}
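
record_check_dir_ref_in_tree() replaces the old pattern of rescanning check_dirs for duplicates with dedup-on-insert keyed by (dir, dir_gen), using the kernel's rb_find()/rb_add() comparator pair. A runnable userspace sketch of the same idea, with glibc's tsearch() standing in for the rbtree (keys and values are illustrative):

#include <search.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct dir_key {
	uint64_t dir;
	uint64_t dir_gen;
};

static int cmp(const void *a, const void *b)
{
	const struct dir_key *x = a, *y = b;

	if (x->dir != y->dir)
		return x->dir < y->dir ? -1 : 1;
	if (x->dir_gen != y->dir_gen)
		return x->dir_gen < y->dir_gen ? -1 : 1;
	return 0;
}

int main(void)
{
	static struct dir_key refs[] = { {5, 1}, {7, 1}, {5, 1}, {5, 2} };
	void *root = NULL;

	for (size_t i = 0; i < sizeof(refs) / sizeof(refs[0]); i++) {
		/* tsearch() inserts only if the key is absent. */
		struct dir_key **slot = tsearch(&refs[i], &root, cmp);

		if (*slot == &refs[i])	/* newly inserted: first sighting */
			printf("queue dir %llu gen %llu\n",
			       (unsigned long long)refs[i].dir,
			       (unsigned long long)refs[i].dir_gen);
	}
	return 0;
}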
static int rename_current_inode(struct send_ctx *sctx,
@@ -4213,19 +4164,17 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
struct recorded_ref *cur;
struct recorded_ref *cur2;
LIST_HEAD(check_dirs);
+ struct rb_root rbtree_check_dirs = RB_ROOT;
struct fs_path *valid_path = NULL;
u64 ow_inode = 0;
u64 ow_gen;
u64 ow_mode;
- u64 last_dir_ino_rm = 0;
bool did_overwrite = false;
bool is_orphan = false;
bool can_rename = true;
bool orphanized_dir = false;
bool orphanized_ancestor = false;
- btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino);
-
/*
* This should never happen as the root dir always has the same ref
* which is always '..'
@@ -4523,7 +4472,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
goto out;
}
}
- ret = dup_ref(cur, &check_dirs);
+ ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs);
if (ret < 0)
goto out;
}
@@ -4551,7 +4500,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
}
list_for_each_entry(cur, &sctx->deleted_refs, list) {
- ret = dup_ref(cur, &check_dirs);
+ ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs);
if (ret < 0)
goto out;
}
@@ -4560,9 +4509,8 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
/*
* We have a moved dir. Add the old parent to check_dirs
*/
- cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
- list);
- ret = dup_ref(cur, &check_dirs);
+ cur = list_first_entry(&sctx->deleted_refs, struct recorded_ref, list);
+ ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs);
if (ret < 0)
goto out;
} else if (!S_ISDIR(sctx->cur_inode_mode)) {
@@ -4596,7 +4544,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
if (is_current_inode_path(sctx, cur->full_path))
fs_path_reset(&sctx->cur_inode_path);
}
- ret = dup_ref(cur, &check_dirs);
+ ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs);
if (ret < 0)
goto out;
}
@@ -4639,8 +4587,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
- } else if (ret == inode_state_did_delete &&
- cur->dir != last_dir_ino_rm) {
+ } else if (ret == inode_state_did_delete) {
ret = can_rmdir(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
@@ -4652,7 +4599,6 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
ret = send_rmdir(sctx, valid_path);
if (ret < 0)
goto out;
- last_dir_ino_rm = cur->dir;
}
}
}
@@ -4670,7 +4616,6 @@ static int rbtree_ref_comp(const void *k, const struct rb_node *node)
{
const struct recorded_ref *data = k;
const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node);
- int result;
if (data->dir > ref->dir)
return 1;
@@ -4684,12 +4629,7 @@ static int rbtree_ref_comp(const void *k, const struct rb_node *node)
return 1;
if (data->name_len < ref->name_len)
return -1;
- result = strcmp(data->name, ref->name);
- if (result > 0)
- return 1;
- if (result < 0)
- return -1;
- return 0;
+ return strcmp(data->name, ref->name);
}
static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent)
@@ -4803,8 +4743,8 @@ static int record_new_ref(struct send_ctx *sctx)
{
int ret;
- ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
- sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key,
+ false, record_new_ref_if_needed, sctx);
if (ret < 0)
return ret;
@@ -4815,9 +4755,8 @@ static int record_deleted_ref(struct send_ctx *sctx)
{
int ret;
- ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
- sctx->cmp_key, 0, record_deleted_ref_if_needed,
- sctx);
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key,
+ false, record_deleted_ref_if_needed, sctx);
if (ret < 0)
return ret;
@@ -4828,12 +4767,12 @@ static int record_changed_ref(struct send_ctx *sctx)
{
int ret;
- ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
- sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key,
+ false, record_new_ref_if_needed, sctx);
if (ret < 0)
return ret;
- ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
- sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx);
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key,
+ false, record_deleted_ref_if_needed, sctx);
if (ret < 0)
return ret;
@@ -4850,7 +4789,7 @@ static int process_all_refs(struct send_ctx *sctx,
int ret = 0;
int iter_ret = 0;
struct btrfs_root *root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
iterate_inode_ref_t cb;
@@ -4869,8 +4808,7 @@ static int process_all_refs(struct send_ctx *sctx,
} else {
btrfs_err(sctx->send_root->fs_info,
"Wrong command %d in process_all_refs", cmd);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
key.objectid = sctx->cmp_key->objectid;
@@ -4882,15 +4820,14 @@ static int process_all_refs(struct send_ctx *sctx,
found_key.type != BTRFS_INODE_EXTREF_KEY))
break;
- ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
+ ret = iterate_inode_ref(root, path, &found_key, false, cb, sctx);
if (ret < 0)
- goto out;
+ return ret;
}
/* Catch error found during iteration */
- if (iter_ret < 0) {
- ret = iter_ret;
- goto out;
- }
+ if (iter_ret < 0)
+ return iter_ret;
+
btrfs_release_path(path);
/*
@@ -4898,10 +4835,7 @@ static int process_all_refs(struct send_ctx *sctx,
* re-creating this inode and will be rename'ing it into place once we
* rename the parent directory.
*/
- ret = process_recorded_refs(sctx, &pending_move);
-out:
- btrfs_free_path(path);
- return ret;
+ return process_recorded_refs(sctx, &pending_move);
}
static int send_set_xattr(struct send_ctx *sctx,
@@ -5010,6 +4944,7 @@ struct find_xattr_ctx {
int found_idx;
char *found_data;
int found_data_len;
+ bool copy_data;
};
static int __find_xattr(int num, struct btrfs_key *di_key, const char *name,
@@ -5021,9 +4956,11 @@ static int __find_xattr(int num, struct btrfs_key *di_key, const char *name,
strncmp(name, ctx->name, name_len) == 0) {
ctx->found_idx = num;
ctx->found_data_len = data_len;
- ctx->found_data = kmemdup(data, data_len, GFP_KERNEL);
- if (!ctx->found_data)
- return -ENOMEM;
+ if (ctx->copy_data) {
+ ctx->found_data = kmemdup(data, data_len, GFP_KERNEL);
+ if (!ctx->found_data)
+ return -ENOMEM;
+ }
return 1;
}
return 0;
@@ -5043,6 +4980,7 @@ static int find_xattr(struct btrfs_root *root,
ctx.found_idx = -1;
ctx.found_data = NULL;
ctx.found_data_len = 0;
+ ctx.copy_data = (data != NULL);
ret = iterate_dir_item(root, path, __find_xattr, &ctx);
if (ret < 0)
@@ -5054,7 +4992,7 @@ static int find_xattr(struct btrfs_root *root,
*data = ctx.found_data;
*data_len = ctx.found_data_len;
} else {
- kfree(ctx.found_data);
+ ASSERT(ctx.found_data == NULL);
}
return ctx.found_idx;
}
@@ -5067,8 +5005,8 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
{
int ret;
struct send_ctx *sctx = ctx;
- char *found_data = NULL;
- int found_data_len = 0;
+ char AUTO_KFREE(found_data);
+ int found_data_len = 0;
ret = find_xattr(sctx->parent_root, sctx->right_path,
sctx->cmp_key, name, name_len, &found_data,
@@ -5086,7 +5024,6 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
}
}
- kfree(found_data);
return ret;
}
@@ -5127,7 +5064,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
int ret = 0;
int iter_ret = 0;
struct btrfs_root *root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
@@ -5155,7 +5092,6 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
if (iter_ret < 0)
ret = iter_ret;
- btrfs_free_path(path);
return ret;
}
@@ -5198,7 +5134,7 @@ static int process_verity(struct send_ctx *sctx)
if (ret < 0)
goto iput;
- if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) {
+ if (unlikely(ret > FS_VERITY_MAX_DESCRIPTOR_SIZE)) {
ret = -EMSGSIZE;
goto iput;
}
@@ -5242,14 +5178,14 @@ static int put_data_header(struct send_ctx *sctx, u32 len)
* Since v2, the data attribute header doesn't include a length;
* the data implicitly runs to the end of the command.
*/
- if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len)
+ if (unlikely(sctx->send_max_size - sctx->send_size < sizeof(__le16) + len))
return -EOVERFLOW;
put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size);
sctx->send_size += sizeof(__le16);
} else {
struct btrfs_tlv_header *hdr;
- if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
+ if (unlikely(sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len))
return -EOVERFLOW;
hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
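
Both branches write the attribute type as little-endian 16 bits; they differ only in whether an explicit length follows (the v1 TLV header) or the data implicitly runs to the command's end (v2). A userspace sketch of the two encodings; the attribute id 21 is a placeholder, not the real protocol constant:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define A_DATA 21	/* placeholder id for illustration */

static size_t put_le16(uint8_t *dst, uint16_t v)
{
	dst[0] = v & 0xff;
	dst[1] = v >> 8;
	return 2;
}

int main(void)
{
	uint8_t v1_buf[64], v2_buf[64];
	const char payload[] = "hello";
	size_t v1 = 0, v2 = 0;

	/* v1: TLV header = type + explicit little-endian length. */
	v1 += put_le16(v1_buf + v1, A_DATA);
	v1 += put_le16(v1_buf + v1, sizeof(payload) - 1);
	memcpy(v1_buf + v1, payload, sizeof(payload) - 1);
	v1 += sizeof(payload) - 1;

	/* v2: type only; the data runs to the end of the command. */
	v2 += put_le16(v2_buf + v2, A_DATA);
	memcpy(v2_buf + v2, payload, sizeof(payload) - 1);
	v2 += sizeof(payload) - 1;

	printf("v1 uses %zu bytes, v2 uses %zu bytes\n", v1, v2);
	return 0;
}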
@@ -5263,10 +5199,9 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct folio *folio;
- pgoff_t index = offset >> PAGE_SHIFT;
- pgoff_t last_index;
- unsigned pg_offset = offset_in_page(offset);
+ u64 cur = offset;
+ const u64 end = offset + len;
+ const pgoff_t last_index = ((end - 1) >> PAGE_SHIFT);
struct address_space *mapping = sctx->cur_inode->i_mapping;
int ret;
@@ -5274,13 +5209,12 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
if (ret)
return ret;
- last_index = (offset + len - 1) >> PAGE_SHIFT;
-
- while (index <= last_index) {
- unsigned cur_len = min_t(unsigned, len,
- PAGE_SIZE - pg_offset);
+ while (cur < end) {
+ pgoff_t index = (cur >> PAGE_SHIFT);
+ unsigned int cur_len;
+ unsigned int pg_offset;
+ struct folio *folio;
-again:
folio = filemap_lock_folio(mapping, index);
if (IS_ERR(folio)) {
page_cache_sync_readahead(mapping,
@@ -5293,8 +5227,8 @@ again:
break;
}
}
-
- WARN_ON(folio_order(folio));
+ pg_offset = offset_in_folio(folio, cur);
+ cur_len = min_t(unsigned int, end - cur, folio_size(folio) - pg_offset);
if (folio_test_readahead(folio))
page_cache_async_readahead(mapping, &sctx->ra, NULL, folio,
@@ -5303,7 +5237,7 @@ again:
if (!folio_test_uptodate(folio)) {
btrfs_read_folio(NULL, folio);
folio_lock(folio);
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
folio_unlock(folio);
btrfs_err(fs_info,
"send: IO error at offset %llu for inode %llu root %llu",
@@ -5316,7 +5250,7 @@ again:
if (folio->mapping != mapping) {
folio_unlock(folio);
folio_put(folio);
- goto again;
+ continue;
}
}
@@ -5324,9 +5258,7 @@ again:
pg_offset, cur_len);
folio_unlock(folio);
folio_put(folio);
- index++;
- pg_offset = 0;
- len -= cur_len;
+ cur += cur_len;
sctx->send_size += cur_len;
}
@@ -5339,12 +5271,9 @@ again:
*/
static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
- btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
-
p = get_cur_inode_path(sctx);
if (IS_ERR(p))
return PTR_ERR(p);
@@ -5377,11 +5306,6 @@ static int send_clone(struct send_ctx *sctx,
struct fs_path *cur_inode_path;
u64 gen;
- btrfs_debug(sctx->send_root->fs_info,
- "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
- offset, len, btrfs_root_id(clone_root->root),
- clone_root->ino, clone_root->offset);
-
cur_inode_path = get_cur_inode_path(sctx);
if (IS_ERR(cur_inode_path))
return PTR_ERR(cur_inode_path);
@@ -5465,6 +5389,30 @@ tlv_put_failure:
return ret;
}
+static int send_fallocate(struct send_ctx *sctx, u32 mode, u64 offset, u64 len)
+{
+ struct fs_path *path;
+ int ret;
+
+ path = get_cur_inode_path(sctx);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_FALLOCATE);
+ if (ret < 0)
+ return ret;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_FALLOCATE_MODE, mode);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+ return ret;
+}
+
static int send_hole(struct send_ctx *sctx, u64 end)
{
struct fs_path *p = NULL;
@@ -5473,6 +5421,14 @@ static int send_hole(struct send_ctx *sctx, u64 end)
int ret = 0;
/*
+ * Starting with send stream v2 we have fallocate and can use it to
+ * punch holes instead of sending writes full of zeroes.
+ */
+ if (proto_cmd_ok(sctx, BTRFS_SEND_C_FALLOCATE))
+ return send_fallocate(sctx, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ offset, end - offset);
+
+ /*
* A hole that starts at EOF or beyond it. Since we do not yet support
* fallocate (for extent preallocation and hole punching), sending a
* write of zeroes starting at EOF or beyond would later require issuing
@@ -5629,8 +5585,8 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
* between the beginning of the command and the file data.
*/
data_offset = PAGE_ALIGN(sctx->send_size);
- if (data_offset > sctx->send_max_size ||
- sctx->send_max_size - data_offset < disk_num_bytes) {
+ if (unlikely(data_offset > sctx->send_max_size ||
+ sctx->send_max_size - data_offset < disk_num_bytes)) {
ret = -EOVERFLOW;
goto out;
}
@@ -5793,11 +5749,11 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
*/
static int send_capabilities(struct send_ctx *sctx)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dir_item *di;
struct extent_buffer *leaf;
unsigned long data_ptr;
- char *buf = NULL;
+ char AUTO_KFREE(buf);
int buf_len;
int ret = 0;
@@ -5809,29 +5765,23 @@ static int send_capabilities(struct send_ctx *sctx)
XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0);
if (!di) {
/* There is no xattr for this inode */
- goto out;
+ return 0;
} else if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
+ return PTR_ERR(di);
}
leaf = path->nodes[0];
buf_len = btrfs_dir_data_len(leaf, di);
buf = kmalloc(buf_len, GFP_KERNEL);
- if (!buf) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!buf)
+ return -ENOMEM;
data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di);
read_extent_buffer(leaf, buf, data_ptr, buf_len);
ret = send_set_xattr(sctx, XATTR_NAME_CAPS,
strlen(XATTR_NAME_CAPS), buf, buf_len);
-out:
- kfree(buf);
- btrfs_free_path(path);
return ret;
}
@@ -5839,7 +5789,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
struct clone_root *clone_root, const u64 disk_byte,
u64 data_offset, u64 offset, u64 len)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int ret;
struct btrfs_inode_info info;
@@ -5875,7 +5825,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
ret = get_inode_info(clone_root->root, clone_root->ino, &info);
btrfs_release_path(path);
if (ret < 0)
- goto out;
+ return ret;
clone_src_i_size = info.size;
/*
@@ -5905,7 +5855,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
key.offset = clone_root->offset;
ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0 && path->slots[0] > 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
if (key.objectid == clone_root->ino &&
@@ -5926,7 +5876,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(clone_root->root, path);
if (ret < 0)
- goto out;
+ return ret;
else if (ret > 0)
break;
continue;
@@ -5963,7 +5913,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
ret = send_extent_data(sctx, dst_path, offset,
hole_len);
if (ret < 0)
- goto out;
+ return ret;
len -= hole_len;
if (len == 0)
@@ -6034,7 +5984,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
ret = send_clone(sctx, offset, slen,
clone_root);
if (ret < 0)
- goto out;
+ return ret;
}
ret = send_extent_data(sctx, dst_path,
offset + slen,
@@ -6068,7 +6018,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
}
if (ret < 0)
- goto out;
+ return ret;
len -= clone_len;
if (len == 0)
@@ -6099,8 +6049,6 @@ next:
ret = send_extent_data(sctx, dst_path, offset, len);
else
ret = 0;
-out:
- btrfs_free_path(path);
return ret;
}
@@ -6189,7 +6137,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
{
int ret = 0;
struct btrfs_key key;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *eb;
int slot;
struct btrfs_key found_key;
@@ -6215,10 +6163,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
left_type = btrfs_file_extent_type(eb, ei);
- if (left_type != BTRFS_FILE_EXTENT_REG) {
- ret = 0;
- goto out;
- }
+ if (left_type != BTRFS_FILE_EXTENT_REG)
+ return 0;
+
left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
left_len = btrfs_file_extent_num_bytes(eb, ei);
left_offset = btrfs_file_extent_offset(eb, ei);
@@ -6250,11 +6197,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
key.offset = ekey->offset;
ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0);
if (ret < 0)
- goto out;
- if (ret) {
- ret = 0;
- goto out;
- }
+ return ret;
+ if (ret)
+ return 0;
/*
* Handle special case where the right side has no extents at all.
@@ -6263,11 +6208,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
slot = path->slots[0];
btrfs_item_key_to_cpu(eb, &found_key, slot);
if (found_key.objectid != key.objectid ||
- found_key.type != key.type) {
+ found_key.type != key.type)
/* If we're a hole then just pretend nothing changed */
- ret = (left_disknr) ? 0 : 1;
- goto out;
- }
+ return (left_disknr ? 0 : 1);
/*
* We're now on 2a, 2b or 7.
@@ -6277,10 +6220,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
right_type = btrfs_file_extent_type(eb, ei);
if (right_type != BTRFS_FILE_EXTENT_REG &&
- right_type != BTRFS_FILE_EXTENT_INLINE) {
- ret = 0;
- goto out;
- }
+ right_type != BTRFS_FILE_EXTENT_INLINE)
+ return 0;
if (right_type == BTRFS_FILE_EXTENT_INLINE) {
right_len = btrfs_file_extent_ram_bytes(eb, ei);
@@ -6293,11 +6234,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
* Are we at extent 8? If yes, we know the extent is changed.
* This may only happen on the first iteration.
*/
- if (found_key.offset + right_len <= ekey->offset) {
+ if (found_key.offset + right_len <= ekey->offset)
/* If we're a hole just pretend nothing changed */
- ret = (left_disknr) ? 0 : 1;
- goto out;
- }
+ return (left_disknr ? 0 : 1);
/*
* We just wanted to see if when we have an inline extent, what
@@ -6307,10 +6246,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
* compressed extent representing data with a size matching
* the page size (currently the same as sector size).
*/
- if (right_type == BTRFS_FILE_EXTENT_INLINE) {
- ret = 0;
- goto out;
- }
+ if (right_type == BTRFS_FILE_EXTENT_INLINE)
+ return 0;
right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
right_offset = btrfs_file_extent_offset(eb, ei);
@@ -6330,17 +6267,15 @@ static int is_extent_unchanged(struct send_ctx *sctx,
*/
if (left_disknr != right_disknr ||
left_offset_fixed != right_offset ||
- left_gen != right_gen) {
- ret = 0;
- goto out;
- }
+ left_gen != right_gen)
+ return 0;
/*
* Go to the next extent.
*/
ret = btrfs_next_item(sctx->parent_root, path);
if (ret < 0)
- goto out;
+ return ret;
if (!ret) {
eb = path->nodes[0];
slot = path->slots[0];
@@ -6351,10 +6286,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
key.offset += right_len;
break;
}
- if (found_key.offset != key.offset + right_len) {
- ret = 0;
- goto out;
- }
+ if (found_key.offset != key.offset + right_len)
+ return 0;
+
key = found_key;
}
@@ -6367,15 +6301,12 @@ static int is_extent_unchanged(struct send_ctx *sctx,
else
ret = 0;
-
-out:
- btrfs_free_path(path);
return ret;
}
static int get_last_extent(struct send_ctx *sctx, u64 offset)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = sctx->send_root;
struct btrfs_key key;
int ret;
@@ -6391,15 +6322,13 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset)
key.offset = offset;
ret = btrfs_search_slot_for_read(root, &key, path, 0, 1);
if (ret < 0)
- goto out;
+ return ret;
ret = 0;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
- goto out;
+ return ret;
sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
-out:
- btrfs_free_path(path);
return ret;
}
@@ -6407,7 +6336,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx,
const u64 start,
const u64 end)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_root *root = sctx->parent_root;
u64 search_start = start;
@@ -6422,7 +6351,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx,
key.offset = search_start;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0 && path->slots[0] > 0)
path->slots[0]--;
@@ -6435,8 +6364,8 @@ static int range_is_hole_in_parent(struct send_ctx *sctx,
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto out;
- else if (ret > 0)
+ return ret;
+ if (ret > 0)
break;
continue;
}
@@ -6458,15 +6387,11 @@ static int range_is_hole_in_parent(struct send_ctx *sctx,
search_start = extent_end;
goto next;
}
- ret = 0;
- goto out;
+ return 0;
next:
path->slots[0]++;
}
- ret = 1;
-out:
- btrfs_free_path(path);
- return ret;
+ return 1;
}
static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
@@ -6574,7 +6499,7 @@ static int process_all_extents(struct send_ctx *sctx)
int ret = 0;
int iter_ret = 0;
struct btrfs_root *root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
@@ -6601,11 +6526,10 @@ static int process_all_extents(struct send_ctx *sctx)
if (iter_ret < 0)
ret = iter_ret;
- btrfs_free_path(path);
return ret;
}
-static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
+static int process_recorded_refs_if_needed(struct send_ctx *sctx, bool at_end,
int *pending_move,
int *refs_processed)
{
@@ -6628,7 +6552,7 @@ out:
return ret;
}
-static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
+static int finish_inode_if_needed(struct send_ctx *sctx, bool at_end)
{
int ret = 0;
struct btrfs_inode_info info;
@@ -7063,7 +6987,7 @@ static int changed_ref(struct send_ctx *sctx,
{
int ret = 0;
- if (sctx->cur_ino != sctx->cmp_key->objectid) {
+ if (unlikely(sctx->cur_ino != sctx->cmp_key->objectid)) {
inconsistent_snapshot_error(sctx, result, "reference");
return -EIO;
}
@@ -7091,7 +7015,7 @@ static int changed_xattr(struct send_ctx *sctx,
{
int ret = 0;
- if (sctx->cur_ino != sctx->cmp_key->objectid) {
+ if (unlikely(sctx->cur_ino != sctx->cmp_key->objectid)) {
inconsistent_snapshot_error(sctx, result, "xattr");
return -EIO;
}
@@ -7331,11 +7255,11 @@ static int search_key_again(const struct send_ctx *sctx,
*/
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
ASSERT(ret <= 0);
- if (ret > 0) {
+ if (unlikely(ret > 0)) {
btrfs_print_tree(path->nodes[path->lowest_level], false);
btrfs_err(root->fs_info,
-"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
- key->objectid, key->type, key->offset,
+"send: key " BTRFS_KEY_FMT" not found in %s root %llu, lowest_level %d, slot %d",
+ BTRFS_KEY_FMT_VALUE(key),
(root == sctx->parent_root ? "parent" : "send"),
btrfs_root_id(root), path->lowest_level,
path->slots[path->lowest_level]);
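The removed format string above pins down exactly what the new macros must expand to, so a plausible pair of definitions is the following (hedged: the real ones live in the btrfs headers):

    #define BTRFS_KEY_FMT            "(%llu %u %llu)"
    #define BTRFS_KEY_FMT_VALUE(key) (key)->objectid, (key)->type, (key)->offset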
@@ -7351,7 +7275,7 @@ static int full_send_tree(struct send_ctx *sctx)
struct btrfs_root *send_root = sctx->send_root;
struct btrfs_key key;
struct btrfs_fs_info *fs_info = send_root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
path = alloc_path_for_send();
if (!path)
@@ -7368,7 +7292,7 @@ static int full_send_tree(struct send_ctx *sctx)
ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret)
goto out_finish;
@@ -7378,7 +7302,7 @@ static int full_send_tree(struct send_ctx *sctx)
ret = changed_cb(path, NULL, &key,
BTRFS_COMPARE_TREE_NEW, sctx);
if (ret < 0)
- goto out;
+ return ret;
down_read(&fs_info->commit_root_sem);
if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
@@ -7397,14 +7321,14 @@ static int full_send_tree(struct send_ctx *sctx)
btrfs_release_path(path);
ret = search_key_again(sctx, send_root, path, &key);
if (ret < 0)
- goto out;
+ return ret;
} else {
up_read(&fs_info->commit_root_sem);
}
ret = btrfs_next_item(send_root, path);
if (ret < 0)
- goto out;
+ return ret;
if (ret) {
ret = 0;
break;
@@ -7412,11 +7336,7 @@ static int full_send_tree(struct send_ctx *sctx)
}
out_finish:
- ret = finish_inode_if_needed(sctx, 1);
-
-out:
- btrfs_free_path(path);
- return ret;
+ return finish_inode_if_needed(sctx, 1);
}
static int replace_node_with_clone(struct btrfs_path *path, int level)
@@ -7671,8 +7591,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
struct btrfs_fs_info *fs_info = left_root->fs_info;
int ret;
int cmp;
- struct btrfs_path *left_path = NULL;
- struct btrfs_path *right_path = NULL;
+ BTRFS_PATH_AUTO_FREE(left_path);
+ BTRFS_PATH_AUTO_FREE(right_path);
struct btrfs_key left_key;
struct btrfs_key right_key;
char *tmp_buf = NULL;
@@ -7707,10 +7627,10 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
goto out;
}
- left_path->search_commit_root = 1;
- left_path->skip_locking = 1;
- right_path->search_commit_root = 1;
- right_path->skip_locking = 1;
+ left_path->search_commit_root = true;
+ left_path->skip_locking = true;
+ right_path->search_commit_root = true;
+ right_path->skip_locking = true;
/*
* Strategy: Go to the first items of both trees. Then do
@@ -7945,8 +7865,6 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
out_unlock:
up_read(&fs_info->commit_root_sem);
out:
- btrfs_free_path(left_path);
- btrfs_free_path(right_path);
kvfree(tmp_buf);
return ret;
}
@@ -8013,7 +7931,7 @@ static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
}
/*
- * Make sure any existing dellaloc is flushed for any root used by a send
+ * Make sure any existing delalloc is flushed for any root used by a send
* operation so that we do not miss any data and we do not race with writeback
* finishing and changing a tree while send is using the tree. This could
* happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index ff089e3e4103..6babbe333741 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -15,6 +15,7 @@
#include "accessors.h"
#include "extent-tree.h"
#include "zoned.h"
+#include "delayed-inode.h"
/*
* HOW DOES SPACE RESERVATION WORK
@@ -50,11 +51,11 @@
* num_bytes we want to reserve.
*
* ->reserve
- * space_info->bytes_may_reserve += num_bytes
+ * space_info->bytes_may_use += num_bytes
*
* ->extent allocation
* Call btrfs_add_reserved_bytes() which does
- * space_info->bytes_may_reserve -= num_bytes
+ * space_info->bytes_may_use -= num_bytes
* space_info->bytes_reserved += extent_bytes
*
* ->insert reference
@@ -67,7 +68,7 @@
* Assume we are unable to simply make the reservation because we do not have
* enough space
*
- * -> __reserve_bytes
+ * -> reserve_bytes
* create a reserve_ticket with ->bytes set to our reservation, add it to
* the tail of space_info->tickets, kick async flush thread
*
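To make the transitions in the comment block above concrete, here is the accounting in miniature. This is illustrative only: num_bytes and extent_bytes are assumed locals, and the real code goes through the btrfs_space_info_update_*() helpers and btrfs_add_reserved_bytes() rather than touching the counters directly.

    /* ->reserve: account a speculative reservation */
    spin_lock(&space_info->lock);
    space_info->bytes_may_use += num_bytes;
    spin_unlock(&space_info->lock);

    /* ->extent allocation: convert the reservation into a real extent */
    spin_lock(&space_info->lock);
    space_info->bytes_may_use -= num_bytes;
    space_info->bytes_reserved += extent_bytes;
    spin_unlock(&space_info->lock);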
@@ -172,15 +173,14 @@
* thing with or without extra unallocated space.
*/
-u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info,
- bool may_use_included)
-{
- ASSERT(s_info);
- return s_info->bytes_used + s_info->bytes_reserved +
- s_info->bytes_pinned + s_info->bytes_readonly +
- s_info->bytes_zone_unusable +
- (may_use_included ? s_info->bytes_may_use : 0);
-}
+struct reserve_ticket {
+ u64 bytes;
+ int error;
+ bool steal;
+ struct list_head list;
+ wait_queue_head_t wait;
+ spinlock_t lock;
+};
/*
* after adding space to the filesystem, we need to clear the full flags
@@ -192,7 +192,7 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
struct btrfs_space_info *found;
list_for_each_entry(found, head, list)
- found->full = 0;
+ found->full = false;
}
/*
@@ -211,7 +211,7 @@ static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
if (btrfs_is_zoned(fs_info))
return fs_info->zone_size;
- ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
+ ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK, "flags=%llu", flags);
if (flags & BTRFS_BLOCK_GROUP_DATA)
return BTRFS_MAX_DATA_CHUNK_SIZE;
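This hunk is one of many in the patch that now hand ASSERT() a format string plus arguments. A hedged sketch of such a macro follows; the real btrfs version also accepts a bare condition and compiles away on non-assert builds:

    #define ASSERT_FMT(cond, fmt, ...)                                  \
    do {                                                                \
            if (unlikely(!(cond)))                                      \
                    panic("assertion failed: " #cond ", " fmt,          \
                          ##__VA_ARGS__);                               \
    } while (0)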
@@ -234,19 +234,11 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
WRITE_ONCE(space_info->chunk_size, chunk_size);
}
-static int create_space_info(struct btrfs_fs_info *info, u64 flags)
+static void init_space_info(struct btrfs_fs_info *info,
+ struct btrfs_space_info *space_info, u64 flags)
{
-
- struct btrfs_space_info *space_info;
- int i;
- int ret;
-
- space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
- if (!space_info)
- return -ENOMEM;
-
space_info->fs_info = info;
- for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+ for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++)
INIT_LIST_HEAD(&space_info->block_groups[i]);
init_rwsem(&space_info->groups_sem);
spin_lock_init(&space_info->lock);
@@ -257,11 +249,67 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
INIT_LIST_HEAD(&space_info->priority_tickets);
space_info->clamp = 1;
btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags));
+ space_info->subgroup_id = BTRFS_SUB_GROUP_PRIMARY;
if (btrfs_is_zoned(info))
space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;
+}
+
+static int create_space_info_sub_group(struct btrfs_space_info *parent, u64 flags,
+ enum btrfs_space_info_sub_group id, int index)
+{
+ struct btrfs_fs_info *fs_info = parent->fs_info;
+ struct btrfs_space_info *sub_group;
+ int ret;
+
+ ASSERT(parent->subgroup_id == BTRFS_SUB_GROUP_PRIMARY,
+ "parent->subgroup_id=%d", parent->subgroup_id);
+ ASSERT(id != BTRFS_SUB_GROUP_PRIMARY, "id=%d", id);
+
+ sub_group = kzalloc(sizeof(*sub_group), GFP_NOFS);
+ if (!sub_group)
+ return -ENOMEM;
+
+ init_space_info(fs_info, sub_group, flags);
+ parent->sub_group[index] = sub_group;
+ sub_group->parent = parent;
+ sub_group->subgroup_id = id;
+
+ ret = btrfs_sysfs_add_space_info_type(sub_group);
+ if (ret) {
+ kfree(sub_group);
+ parent->sub_group[index] = NULL;
+ }
+ return ret;
+}
- ret = btrfs_sysfs_add_space_info_type(info, space_info);
+static int create_space_info(struct btrfs_fs_info *info, u64 flags)
+{
+
+ struct btrfs_space_info *space_info;
+ int ret = 0;
+
+ space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
+ if (!space_info)
+ return -ENOMEM;
+
+ init_space_info(info, space_info, flags);
+
+ if (btrfs_is_zoned(info)) {
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ ret = create_space_info_sub_group(space_info, flags,
+ BTRFS_SUB_GROUP_DATA_RELOC,
+ 0);
+ else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ ret = create_space_info_sub_group(space_info, flags,
+ BTRFS_SUB_GROUP_TREELOG,
+ 0);
+
+ if (ret)
+ return ret;
+ }
+
+ ret = btrfs_sysfs_add_space_info_type(space_info);
if (ret)
return ret;
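On zoned filesystems create_space_info() above hangs at most one sub-group off each primary space_info: DATA_RELOC under the data space_info, TREELOG under the metadata one. A hypothetical helper, not part of this patch, shows how a caller could route a reservation to the dedicated sub-group and fall back to the primary:

    static struct btrfs_space_info *
    pick_space_info(struct btrfs_space_info *primary, bool dedicated)
    {
            /* sub_group[0] is the only slot: BTRFS_SPACE_INFO_SUB_GROUP_MAX is 1. */
            if (dedicated && primary->sub_group[0])
                    return primary->sub_group[0];
            return primary;
    }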
@@ -312,31 +360,29 @@ out:
void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
struct btrfs_block_group *block_group)
{
- struct btrfs_space_info *found;
+ struct btrfs_space_info *space_info = block_group->space_info;
int factor, index;
factor = btrfs_bg_type_to_factor(block_group->flags);
- found = btrfs_find_space_info(info, block_group->flags);
- ASSERT(found);
- spin_lock(&found->lock);
- found->total_bytes += block_group->length;
- found->disk_total += block_group->length * factor;
- found->bytes_used += block_group->used;
- found->disk_used += block_group->used * factor;
- found->bytes_readonly += block_group->bytes_super;
- btrfs_space_info_update_bytes_zone_unusable(found, block_group->zone_unusable);
+ spin_lock(&space_info->lock);
+ space_info->total_bytes += block_group->length;
+ space_info->disk_total += block_group->length * factor;
+ space_info->bytes_used += block_group->used;
+ space_info->disk_used += block_group->used * factor;
+ space_info->bytes_readonly += block_group->bytes_super;
+ btrfs_space_info_update_bytes_zone_unusable(space_info, block_group->zone_unusable);
if (block_group->length > 0)
- found->full = 0;
- btrfs_try_granting_tickets(info, found);
- spin_unlock(&found->lock);
+ space_info->full = false;
+ btrfs_try_granting_tickets(space_info);
+ spin_unlock(&space_info->lock);
- block_group->space_info = found;
+ block_group->space_info = space_info;
index = btrfs_bg_flags_to_raid_index(block_group->flags);
- down_write(&found->groups_sem);
- list_add_tail(&block_group->list, &found->block_groups[index]);
- up_write(&found->groups_sem);
+ down_write(&space_info->groups_sem);
+ list_add_tail(&block_group->list, &space_info->block_groups[index]);
+ up_write(&space_info->groups_sem);
}
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
@@ -376,10 +422,10 @@ static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info)
return min_t(u64, data_chunk_size, SZ_1G);
}
-static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
- const struct btrfs_space_info *space_info,
- enum btrfs_reserve_flush_enum flush)
+static u64 calc_available_free_space(const struct btrfs_space_info *space_info,
+ enum btrfs_reserve_flush_enum flush)
{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
u64 profile;
u64 avail;
u64 data_chunk_size;
@@ -434,7 +480,7 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
/*
* On the zoned mode, we always allocate one zone as one chunk.
- * Returning non-zone size alingned bytes here will result in
+ * Returning non-zone size aligned bytes here will result in
* less pressure for the async metadata reclaim process, and it
* will over-commit too much leading to ENOSPC. Align down to the
* zone size to avoid that.
@@ -445,44 +491,77 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
return avail;
}
-int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
- const struct btrfs_space_info *space_info, u64 bytes,
- enum btrfs_reserve_flush_enum flush)
+static inline bool check_can_overcommit(const struct btrfs_space_info *space_info,
+ u64 space_info_used_bytes, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ const u64 avail = calc_available_free_space(space_info, flush);
+
+ return (space_info_used_bytes + bytes < space_info->total_bytes + avail);
+}
+
+static inline bool can_overcommit(const struct btrfs_space_info *space_info,
+ u64 space_info_used_bytes, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ /* Don't overcommit when in mixed mode. */
+ if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
+ return false;
+
+ return check_can_overcommit(space_info, space_info_used_bytes, bytes, flush);
+}
+
+bool btrfs_can_overcommit(const struct btrfs_space_info *space_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
{
- u64 avail;
u64 used;
/* Don't overcommit when in mixed mode */
if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
- return 0;
+ return false;
used = btrfs_space_info_used(space_info, true);
- avail = calc_available_free_space(fs_info, space_info, flush);
- if (used + bytes < space_info->total_bytes + avail)
- return 1;
- return 0;
+ return check_can_overcommit(space_info, used, bytes, flush);
}
static void remove_ticket(struct btrfs_space_info *space_info,
- struct reserve_ticket *ticket)
+ struct reserve_ticket *ticket, int error)
{
+ lockdep_assert_held(&space_info->lock);
+
if (!list_empty(&ticket->list)) {
list_del_init(&ticket->list);
- ASSERT(space_info->reclaim_size >= ticket->bytes);
+ ASSERT(space_info->reclaim_size >= ticket->bytes,
+ "space_info->reclaim_size=%llu ticket->bytes=%llu",
+ space_info->reclaim_size, ticket->bytes);
space_info->reclaim_size -= ticket->bytes;
}
+
+ spin_lock(&ticket->lock);
+ /*
+ * If we are called from a task waiting on the ticket, it may happen
+ * that before it sets an error on the ticket, a reclaim task was able
+ * to satisfy the ticket. In that case ignore the error.
+ */
+ if (error && ticket->bytes > 0)
+ ticket->error = error;
+ else
+ ticket->bytes = 0;
+
+ wake_up(&ticket->wait);
+ spin_unlock(&ticket->lock);
}
/*
* This is for space we already have accounted in space_info->bytes_may_use, so
* basically when we're returning space from block_rsv's.
*/
-void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info)
+void btrfs_try_granting_tickets(struct btrfs_space_info *space_info)
{
struct list_head *head;
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
+ u64 used = btrfs_space_info_used(space_info, true);
lockdep_assert_held(&space_info->lock);
@@ -490,19 +569,18 @@ void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
again:
while (!list_empty(head)) {
struct reserve_ticket *ticket;
- u64 used = btrfs_space_info_used(space_info, true);
+ u64 used_after;
ticket = list_first_entry(head, struct reserve_ticket, list);
+ used_after = used + ticket->bytes;
/* Check and see if our ticket can be satisfied now. */
- if ((used + ticket->bytes <= space_info->total_bytes) ||
- btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
- flush)) {
+ if (used_after <= space_info->total_bytes ||
+ can_overcommit(space_info, used, ticket->bytes, flush)) {
btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes);
- remove_ticket(space_info, ticket);
- ticket->bytes = 0;
+ remove_ticket(space_info, ticket, 0);
space_info->tickets_id++;
- wake_up(&ticket->wait);
+ used = used_after;
} else {
break;
}
@@ -549,15 +627,16 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
}
-static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info,
- const struct btrfs_space_info *info)
+static void __btrfs_dump_space_info(const struct btrfs_space_info *info)
{
+ const struct btrfs_fs_info *fs_info = info->fs_info;
const char *flag_str = space_info_flag_to_str(info);
lockdep_assert_held(&info->lock);
/* The free space could be negative in case of overcommit */
- btrfs_info(fs_info, "space_info %s has %lld free, is %sfull",
- flag_str,
+ btrfs_info(fs_info,
+ "space_info %s (sub-group id %d) has %lld free, is %sfull",
+ flag_str, info->subgroup_id,
(s64)(info->total_bytes - btrfs_space_info_used(info, true)),
info->full ? "" : "not ");
btrfs_info(fs_info,
@@ -567,16 +646,16 @@ static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info,
info->bytes_readonly, info->bytes_zone_unusable);
}
-void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *info, u64 bytes,
- int dump_block_groups)
+void btrfs_dump_space_info(struct btrfs_space_info *info, u64 bytes,
+ bool dump_block_groups)
{
+ struct btrfs_fs_info *fs_info = info->fs_info;
struct btrfs_block_group *cache;
u64 total_avail = 0;
int index = 0;
spin_lock(&info->lock);
- __btrfs_dump_space_info(fs_info, info);
+ __btrfs_dump_space_info(info);
dump_global_block_rsv(fs_info);
spin_unlock(&info->lock);
@@ -624,11 +703,11 @@ static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info,
/*
* shrink metadata reservation for delalloc
*/
-static void shrink_delalloc(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
+static void shrink_delalloc(struct btrfs_space_info *space_info,
u64 to_reclaim, bool wait_ordered,
bool for_preempt)
{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
u64 ordered_bytes;
@@ -755,10 +834,10 @@ skip_async:
* and may fail for various reasons. The caller is supposed to examine the
* state of @space_info to detect the outcome.
*/
-static void flush_space(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, u64 num_bytes,
- enum btrfs_flush_state state, bool for_preempt)
+static void flush_space(struct btrfs_space_info *space_info, u64 num_bytes,
+ enum btrfs_flush_state state, bool for_preempt)
{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
int nr;
@@ -787,7 +866,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
case FLUSH_DELALLOC_FULL:
if (state == FLUSH_DELALLOC_FULL)
num_bytes = U64_MAX;
- shrink_delalloc(fs_info, space_info, num_bytes,
+ shrink_delalloc(space_info, num_bytes,
state != FLUSH_DELALLOC, for_preempt);
break;
case FLUSH_DELAYED_REFS_NR:
@@ -812,7 +891,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
ret = PTR_ERR(trans);
break;
}
- ret = btrfs_chunk_alloc(trans,
+ ret = btrfs_chunk_alloc(trans, space_info,
btrfs_get_alloc_profile(fs_info, space_info->flags),
(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
CHUNK_ALLOC_FORCE);
@@ -854,8 +933,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
return;
}
-static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
- const struct btrfs_space_info *space_info)
+static u64 btrfs_calc_reclaim_metadata_size(const struct btrfs_space_info *space_info)
{
u64 used;
u64 avail;
@@ -863,8 +941,7 @@ static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
lockdep_assert_held(&space_info->lock);
- avail = calc_available_free_space(fs_info, space_info,
- BTRFS_RESERVE_FLUSH_ALL);
+ avail = calc_available_free_space(space_info, BTRFS_RESERVE_FLUSH_ALL);
used = btrfs_space_info_used(space_info, true);
/*
@@ -879,18 +956,25 @@ static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
return to_reclaim;
}
-static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
- const struct btrfs_space_info *space_info)
+static bool need_preemptive_reclaim(const struct btrfs_space_info *space_info)
{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv);
u64 ordered, delalloc;
u64 thresh;
u64 used;
- thresh = mult_perc(space_info->total_bytes, 90);
-
lockdep_assert_held(&space_info->lock);
+ /*
+ * We have tickets queued, bail so we don't compete with the async
+ * flushers.
+ */
+ if (space_info->reclaim_size)
+ return false;
+
+ thresh = mult_perc(space_info->total_bytes, 90);
+
/* If we're just plain full then async reclaim just slows us down. */
if ((space_info->bytes_used + space_info->bytes_reserved +
global_rsv_size) >= thresh)
@@ -911,13 +995,6 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
return false;
/*
- * We have tickets queued, bail so we don't compete with the async
- * flushers.
- */
- if (space_info->reclaim_size)
- return false;
-
- /*
* If we have over half of the free space occupied by reservations or
* pinned then we want to start flushing.
*
@@ -946,8 +1023,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
* much delalloc we need for the background flusher to kick in.
*/
- thresh = calc_available_free_space(fs_info, space_info,
- BTRFS_RESERVE_FLUSH_ALL);
+ thresh = calc_available_free_space(space_info, BTRFS_RESERVE_FLUSH_ALL);
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_readonly + global_rsv_size;
if (used < space_info->total_bytes)
@@ -991,13 +1067,15 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}
-static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
+static bool steal_from_global_rsv(struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 min_bytes;
+ lockdep_assert_held(&space_info->lock);
+
if (!ticket->steal)
return false;
@@ -1011,21 +1089,19 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
return false;
}
global_rsv->reserved -= ticket->bytes;
- remove_ticket(space_info, ticket);
- ticket->bytes = 0;
- wake_up(&ticket->wait);
- space_info->tickets_id++;
if (global_rsv->reserved < global_rsv->size)
- global_rsv->full = 0;
+ global_rsv->full = false;
spin_unlock(&global_rsv->lock);
+ remove_ticket(space_info, ticket, 0);
+ space_info->tickets_id++;
+
return true;
}
/*
* We've exhausted our flushing, start failing tickets.
*
- * @fs_info - fs_info for this fs
* @space_info - the space info we were flushing
*
* We call this when we've exhausted our flushing ability and haven't made
@@ -1038,77 +1114,66 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
* other tickets, or if it stumbles across a ticket that was smaller than the
* first ticket.
*/
-static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info)
+static bool maybe_fail_all_tickets(struct btrfs_space_info *space_info)
{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
struct reserve_ticket *ticket;
u64 tickets_id = space_info->tickets_id;
- const bool aborted = BTRFS_FS_ERROR(fs_info);
+ const int abort_error = BTRFS_FS_ERROR(fs_info);
trace_btrfs_fail_all_tickets(fs_info, space_info);
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
- __btrfs_dump_space_info(fs_info, space_info);
+ __btrfs_dump_space_info(space_info);
}
while (!list_empty(&space_info->tickets) &&
tickets_id == space_info->tickets_id) {
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
+ if (unlikely(abort_error)) {
+ remove_ticket(space_info, ticket, abort_error);
+ } else {
+ if (steal_from_global_rsv(space_info, ticket))
+ return true;
- if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket))
- return true;
-
- if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
- btrfs_info(fs_info, "failing ticket with %llu bytes",
- ticket->bytes);
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ btrfs_info(fs_info, "failing ticket with %llu bytes",
+ ticket->bytes);
- remove_ticket(space_info, ticket);
- if (aborted)
- ticket->error = -EIO;
- else
- ticket->error = -ENOSPC;
- wake_up(&ticket->wait);
+ remove_ticket(space_info, ticket, -ENOSPC);
- /*
- * We're just throwing tickets away, so more flushing may not
- * trip over btrfs_try_granting_tickets, so we need to call it
- * here to see if we can make progress with the next ticket in
- * the list.
- */
- if (!aborted)
- btrfs_try_granting_tickets(fs_info, space_info);
+ /*
+ * We're just throwing tickets away, so more flushing may
+ * not trip over btrfs_try_granting_tickets, so we need
+ * to call it here to see if we can make progress with
+ * the next ticket in the list.
+ */
+ btrfs_try_granting_tickets(space_info);
+ }
}
return (tickets_id != space_info->tickets_id);
}
-/*
- * This is for normal flushers, we can wait all goddamned day if we want to. We
- * will loop and continuously try to flush as long as we are making progress.
- * We count progress as clearing off tickets each time we have to loop.
- */
-static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
+static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info)
{
- struct btrfs_fs_info *fs_info;
- struct btrfs_space_info *space_info;
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
u64 to_reclaim;
enum btrfs_flush_state flush_state;
int commit_cycles = 0;
u64 last_tickets_id;
enum btrfs_flush_state final_state;
- fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
- space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
if (btrfs_is_zoned(fs_info))
final_state = RESET_ZONES;
else
final_state = COMMIT_TRANS;
spin_lock(&space_info->lock);
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(space_info);
if (!to_reclaim) {
- space_info->flush = 0;
+ space_info->flush = false;
spin_unlock(&space_info->lock);
return;
}
@@ -1117,15 +1182,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
flush_state = FLUSH_DELAYED_ITEMS_NR;
do {
- flush_space(fs_info, space_info, to_reclaim, flush_state, false);
+ flush_space(space_info, to_reclaim, flush_state, false);
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets)) {
- space_info->flush = 0;
+ space_info->flush = false;
spin_unlock(&space_info->lock);
return;
}
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
- space_info);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(space_info);
if (last_tickets_id == space_info->tickets_id) {
flush_state++;
} else {
@@ -1159,11 +1223,11 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
if (flush_state > final_state) {
commit_cycles++;
if (commit_cycles > 2) {
- if (maybe_fail_all_tickets(fs_info, space_info)) {
+ if (maybe_fail_all_tickets(space_info)) {
flush_state = FLUSH_DELAYED_ITEMS_NR;
commit_cycles--;
} else {
- space_info->flush = 0;
+ space_info->flush = false;
}
} else {
flush_state = FLUSH_DELAYED_ITEMS_NR;
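do_async_reclaim_metadata_space() above measures progress by snapshotting space_info->tickets_id: the remove_ticket() call sites bump the counter whenever a ticket is granted or failed, so an unchanged value after a flush round means nothing moved and the state machine escalates. Schematically, with last and flush_state as assumed locals:

    u64 last = space_info->tickets_id;

    flush_space(space_info, to_reclaim, flush_state, false);

    spin_lock(&space_info->lock);
    if (last != space_info->tickets_id)
            flush_state = FLUSH_DELAYED_ITEMS_NR;  /* progress: restart the states */
    else
            flush_state++;                         /* no progress: escalate */
    spin_unlock(&space_info->lock);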
@@ -1174,6 +1238,25 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
}
/*
+ * This is for normal flushers, it can wait as much time as needed. We will
+ * loop and continuously try to flush as long as we are making progress. We
+ * count progress as clearing off tickets each time we have to loop.
+ */
+static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
+ space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ do_async_reclaim_metadata_space(space_info);
+ for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) {
+ if (space_info->sub_group[i])
+ do_async_reclaim_metadata_space(space_info->sub_group[i]);
+ }
+}
+
+/*
* This handles pre-flushing of metadata space before we get to the point that
* we need to start blocking threads on tickets. The logic here is different
* from the other flush paths because it doesn't rely on tickets to tell us how
@@ -1200,14 +1283,15 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
trans_rsv = &fs_info->trans_block_rsv;
spin_lock(&space_info->lock);
- while (need_preemptive_reclaim(fs_info, space_info)) {
+ while (need_preemptive_reclaim(space_info)) {
enum btrfs_flush_state flush;
u64 delalloc_size = 0;
u64 to_reclaim, block_rsv_size;
const u64 global_rsv_size = btrfs_block_rsv_reserved(global_rsv);
+ const u64 bytes_may_use = space_info->bytes_may_use;
+ const u64 bytes_pinned = space_info->bytes_pinned;
- loops++;
-
+ spin_unlock(&space_info->lock);
/*
* We don't have a precise counter for the metadata being
* reserved for delalloc, so we'll approximate it by subtracting
@@ -1219,8 +1303,8 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
btrfs_block_rsv_reserved(delayed_block_rsv) +
btrfs_block_rsv_reserved(delayed_refs_rsv) +
btrfs_block_rsv_reserved(trans_rsv);
- if (block_rsv_size < space_info->bytes_may_use)
- delalloc_size = space_info->bytes_may_use - block_rsv_size;
+ if (block_rsv_size < bytes_may_use)
+ delalloc_size = bytes_may_use - block_rsv_size;
/*
* We don't want to include the global_rsv in our calculation,
@@ -1237,10 +1321,10 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
if (delalloc_size > block_rsv_size) {
to_reclaim = delalloc_size;
flush = FLUSH_DELALLOC;
- } else if (space_info->bytes_pinned >
+ } else if (bytes_pinned >
(btrfs_block_rsv_reserved(delayed_block_rsv) +
btrfs_block_rsv_reserved(delayed_refs_rsv))) {
- to_reclaim = space_info->bytes_pinned;
+ to_reclaim = bytes_pinned;
flush = COMMIT_TRANS;
} else if (btrfs_block_rsv_reserved(delayed_block_rsv) >
btrfs_block_rsv_reserved(delayed_refs_rsv)) {
@@ -1251,7 +1335,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
flush = FLUSH_DELAYED_REFS_NR;
}
- spin_unlock(&space_info->lock);
+ loops++;
/*
* We don't want to reclaim everything, just a portion, so scale
@@ -1261,7 +1345,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
to_reclaim >>= 2;
if (!to_reclaim)
to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1);
- flush_space(fs_info, space_info, to_reclaim, flush, true);
+ flush_space(space_info, to_reclaim, flush, true);
cond_resched();
spin_lock(&space_info->lock);
}
@@ -1318,19 +1402,15 @@ static const enum btrfs_flush_state data_flush_states[] = {
ALLOC_CHUNK_FORCE,
};
-static void btrfs_async_reclaim_data_space(struct work_struct *work)
+static void do_async_reclaim_data_space(struct btrfs_space_info *space_info)
{
- struct btrfs_fs_info *fs_info;
- struct btrfs_space_info *space_info;
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
u64 last_tickets_id;
enum btrfs_flush_state flush_state = 0;
- fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
- space_info = fs_info->data_sinfo;
-
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets)) {
- space_info->flush = 0;
+ space_info->flush = false;
spin_unlock(&space_info->lock);
return;
}
@@ -1338,27 +1418,27 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
spin_unlock(&space_info->lock);
while (!space_info->full) {
- flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
+ flush_space(space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets)) {
- space_info->flush = 0;
+ space_info->flush = false;
spin_unlock(&space_info->lock);
return;
}
/* Something happened, fail everything and bail. */
- if (BTRFS_FS_ERROR(fs_info))
+ if (unlikely(BTRFS_FS_ERROR(fs_info)))
goto aborted_fs;
last_tickets_id = space_info->tickets_id;
spin_unlock(&space_info->lock);
}
while (flush_state < ARRAY_SIZE(data_flush_states)) {
- flush_space(fs_info, space_info, U64_MAX,
+ flush_space(space_info, U64_MAX,
data_flush_states[flush_state], false);
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets)) {
- space_info->flush = 0;
+ space_info->flush = false;
spin_unlock(&space_info->lock);
return;
}
@@ -1372,16 +1452,16 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
if (flush_state >= ARRAY_SIZE(data_flush_states)) {
if (space_info->full) {
- if (maybe_fail_all_tickets(fs_info, space_info))
+ if (maybe_fail_all_tickets(space_info))
flush_state = 0;
else
- space_info->flush = 0;
+ space_info->flush = false;
} else {
flush_state = 0;
}
/* Something happened, fail everything and bail. */
- if (BTRFS_FS_ERROR(fs_info))
+ if (unlikely(BTRFS_FS_ERROR(fs_info)))
goto aborted_fs;
}
@@ -1390,11 +1470,24 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
return;
aborted_fs:
- maybe_fail_all_tickets(fs_info, space_info);
- space_info->flush = 0;
+ maybe_fail_all_tickets(space_info);
+ space_info->flush = false;
spin_unlock(&space_info->lock);
}
+static void btrfs_async_reclaim_data_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
+ space_info = fs_info->data_sinfo;
+ do_async_reclaim_data_space(space_info);
+ for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++)
+ if (space_info->sub_group[i])
+ do_async_reclaim_data_space(space_info->sub_group[i]);
+}
+
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
{
INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
@@ -1423,90 +1516,87 @@ static const enum btrfs_flush_state evict_flush_states[] = {
RESET_ZONES,
};
-static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- struct reserve_ticket *ticket,
- const enum btrfs_flush_state *states,
- int states_nr)
+static bool is_ticket_served(struct reserve_ticket *ticket)
{
+ bool ret;
+
+ spin_lock(&ticket->lock);
+ ret = (ticket->bytes == 0);
+ spin_unlock(&ticket->lock);
+
+ return ret;
+}
+
+static void priority_reclaim_metadata_space(struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket,
+ const enum btrfs_flush_state *states,
+ int states_nr)
+{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
u64 to_reclaim;
int flush_state = 0;
- spin_lock(&space_info->lock);
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
/*
* This is the priority reclaim path, so to_reclaim could be >0 still
* because we may have only satisfied the priority tickets and still
* left non priority tickets on the list. We would then have
* to_reclaim but ->bytes == 0.
*/
- if (ticket->bytes == 0) {
- spin_unlock(&space_info->lock);
+ if (is_ticket_served(ticket))
return;
- }
+
+ spin_lock(&space_info->lock);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(space_info);
+ spin_unlock(&space_info->lock);
while (flush_state < states_nr) {
- spin_unlock(&space_info->lock);
- flush_space(fs_info, space_info, to_reclaim, states[flush_state],
- false);
- flush_state++;
- spin_lock(&space_info->lock);
- if (ticket->bytes == 0) {
- spin_unlock(&space_info->lock);
+ flush_space(space_info, to_reclaim, states[flush_state], false);
+ if (is_ticket_served(ticket))
return;
- }
+ flush_state++;
}
+ spin_lock(&space_info->lock);
/*
* Attempt to steal from the global rsv if we can, except if the fs was
* turned into error mode due to a transaction abort when flushing space
* above, in that case fail with the abort error instead of returning
* success to the caller if we can steal from the global rsv - this is
- * just to have caller fail immeditelly instead of later when trying to
+ * just to have caller fail immediately instead of later when trying to
* modify the fs, making it easier to debug -ENOSPC problems.
*/
- if (BTRFS_FS_ERROR(fs_info)) {
- ticket->error = BTRFS_FS_ERROR(fs_info);
- remove_ticket(space_info, ticket);
- } else if (!steal_from_global_rsv(fs_info, space_info, ticket)) {
- ticket->error = -ENOSPC;
- remove_ticket(space_info, ticket);
- }
+ if (unlikely(BTRFS_FS_ERROR(fs_info)))
+ remove_ticket(space_info, ticket, BTRFS_FS_ERROR(fs_info));
+ else if (!steal_from_global_rsv(space_info, ticket))
+ remove_ticket(space_info, ticket, -ENOSPC);
/*
* We must run try_granting_tickets here because we could be a large
* ticket in front of a smaller ticket that can now be satisfied with
* the available space.
*/
- btrfs_try_granting_tickets(fs_info, space_info);
+ btrfs_try_granting_tickets(space_info);
spin_unlock(&space_info->lock);
}
-static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
+static void priority_reclaim_data_space(struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
{
- spin_lock(&space_info->lock);
-
/* We could have been granted before we got here. */
- if (ticket->bytes == 0) {
- spin_unlock(&space_info->lock);
+ if (is_ticket_served(ticket))
return;
- }
+ spin_lock(&space_info->lock);
while (!space_info->full) {
spin_unlock(&space_info->lock);
- flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
- spin_lock(&space_info->lock);
- if (ticket->bytes == 0) {
- spin_unlock(&space_info->lock);
+ flush_space(space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
+ if (is_ticket_served(ticket))
return;
- }
+ spin_lock(&space_info->lock);
}
- ticket->error = -ENOSPC;
- remove_ticket(space_info, ticket);
- btrfs_try_granting_tickets(fs_info, space_info);
+ remove_ticket(space_info, ticket, -ENOSPC);
+ btrfs_try_granting_tickets(space_info);
spin_unlock(&space_info->lock);
}
@@ -1515,11 +1605,13 @@ static void wait_reserve_ticket(struct btrfs_space_info *space_info,
{
DEFINE_WAIT(wait);
- int ret = 0;
- spin_lock(&space_info->lock);
+ spin_lock(&ticket->lock);
while (ticket->bytes > 0 && ticket->error == 0) {
+ int ret;
+
ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
+ spin_unlock(&ticket->lock);
if (ret) {
/*
* Delete us from the list. After we unlock the space
@@ -1529,24 +1621,23 @@ static void wait_reserve_ticket(struct btrfs_space_info *space_info,
* despite getting an error, resulting in a space leak
* (bytes_may_use counter of our space_info).
*/
- remove_ticket(space_info, ticket);
- ticket->error = -EINTR;
- break;
+ spin_lock(&space_info->lock);
+ remove_ticket(space_info, ticket, -EINTR);
+ spin_unlock(&space_info->lock);
+ return;
}
- spin_unlock(&space_info->lock);
schedule();
finish_wait(&ticket->wait, &wait);
- spin_lock(&space_info->lock);
+ spin_lock(&ticket->lock);
}
- spin_unlock(&space_info->lock);
+ spin_unlock(&ticket->lock);
}
/*
* Do the appropriate flushing and waiting for a ticket.
*
- * @fs_info: the filesystem
* @space_info: space info for the reservation
* @ticket: ticket for the reservation
* @start_ns: timestamp when the reservation started
@@ -1556,8 +1647,7 @@ static void wait_reserve_ticket(struct btrfs_space_info *space_info,
* This does the work of figuring out how to flush for the ticket, waiting for
* the reservation, and returning the appropriate error if there is one.
*/
-static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
+static int handle_reserve_ticket(struct btrfs_space_info *space_info,
struct reserve_ticket *ticket,
u64 start_ns, u64 orig_bytes,
enum btrfs_reserve_flush_enum flush)
@@ -1571,20 +1661,20 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
wait_reserve_ticket(space_info, ticket);
break;
case BTRFS_RESERVE_FLUSH_LIMIT:
- priority_reclaim_metadata_space(fs_info, space_info, ticket,
+ priority_reclaim_metadata_space(space_info, ticket,
priority_flush_states,
ARRAY_SIZE(priority_flush_states));
break;
case BTRFS_RESERVE_FLUSH_EVICT:
- priority_reclaim_metadata_space(fs_info, space_info, ticket,
+ priority_reclaim_metadata_space(space_info, ticket,
evict_flush_states,
ARRAY_SIZE(evict_flush_states));
break;
case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
- priority_reclaim_data_space(fs_info, space_info, ticket);
+ priority_reclaim_data_space(space_info, ticket);
break;
default:
- ASSERT(0);
+ ASSERT(0, "flush=%d", flush);
break;
}
@@ -1596,9 +1686,10 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
* releasing reserved space (if an error happens the expectation is that
* space wasn't reserved at all).
*/
- ASSERT(!(ticket->bytes == 0 && ticket->error));
- trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes,
- start_ns, flush, ticket->error);
+ ASSERT(!(ticket->bytes == 0 && ticket->error),
+ "ticket->bytes=%llu ticket->error=%d", ticket->bytes, ticket->error);
+ trace_btrfs_reserve_ticket(space_info->fs_info, space_info->flags,
+ orig_bytes, start_ns, flush, ticket->error);
return ret;
}
@@ -1612,9 +1703,9 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
(flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
}
-static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info)
+static inline void maybe_clamp_preempt(struct btrfs_space_info *space_info)
{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
@@ -1649,7 +1740,6 @@ static inline bool can_ticket(enum btrfs_reserve_flush_enum flush)
/*
* Try to reserve bytes from the block_rsv's space.
*
- * @fs_info: the filesystem
* @space_info: space info we want to allocate from
* @orig_bytes: number of bytes we want
* @flush: whether or not we can flush to make our reservation
@@ -1661,10 +1751,10 @@ static inline bool can_ticket(enum btrfs_reserve_flush_enum flush)
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
-static int __reserve_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush)
+static int reserve_bytes(struct btrfs_space_info *space_info, u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
struct work_struct *async_work;
struct reserve_ticket ticket;
u64 start_ns = 0;
@@ -1672,7 +1762,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
int ret = -ENOSPC;
bool pending_tickets;
- ASSERT(orig_bytes);
+ ASSERT(orig_bytes, "orig_bytes=%llu", orig_bytes);
/*
* If have a transaction handle (current->journal_info != NULL), then
* the flush method can not be neither BTRFS_RESERVE_FLUSH_ALL* nor
@@ -1681,9 +1771,9 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
*/
if (current->journal_info) {
/* One assert per line for easier debugging. */
- ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL);
- ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL);
- ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT);
+ ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL, "flush=%d", flush);
+ ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL, "flush=%d", flush);
+ ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT, "flush=%d", flush);
}
if (flush == BTRFS_RESERVE_FLUSH_DATA)
@@ -1711,7 +1801,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
*/
if (!pending_tickets &&
((used + orig_bytes <= space_info->total_bytes) ||
- btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
+ can_overcommit(space_info, used, orig_bytes, flush))) {
btrfs_space_info_update_bytes_may_use(space_info, orig_bytes);
ret = 0;
}
@@ -1722,7 +1812,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* left to allocate for the block.
*/
if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) {
- used = btrfs_space_info_used(space_info, false);
+ used -= space_info->bytes_may_use;
if (used + orig_bytes <= space_info->total_bytes) {
btrfs_space_info_update_bytes_may_use(space_info, orig_bytes);
ret = 0;
@@ -1741,6 +1831,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
ticket.error = 0;
space_info->reclaim_size += ticket.bytes;
init_waitqueue_head(&ticket.wait);
+ spin_lock_init(&ticket.lock);
ticket.steal = can_steal(flush);
if (trace_btrfs_reserve_ticket_enabled())
start_ns = ktime_get_ns();
@@ -1757,14 +1848,14 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* preemptive flushing in order to keep up with
* the workload.
*/
- maybe_clamp_preempt(fs_info, space_info);
+ maybe_clamp_preempt(space_info);
- space_info->flush = 1;
+ space_info->flush = true;
trace_btrfs_trigger_flush(fs_info,
space_info->flags,
orig_bytes, flush,
"enospc");
- queue_work(system_unbound_wq, async_work);
+ queue_work(system_dfl_wq, async_work);
}
} else {
list_add_tail(&ticket.list,
@@ -1778,10 +1869,10 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
*/
if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
!work_busy(&fs_info->preempt_reclaim_work) &&
- need_preemptive_reclaim(fs_info, space_info)) {
+ need_preemptive_reclaim(space_info)) {
trace_btrfs_trigger_flush(fs_info, space_info->flags,
orig_bytes, flush, "preempt");
- queue_work(system_unbound_wq,
+ queue_work(system_dfl_wq,
&fs_info->preempt_reclaim_work);
}
}
@@ -1789,14 +1880,12 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
if (!ret || !can_ticket(flush))
return ret;
- return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns,
- orig_bytes, flush);
+ return handle_reserve_ticket(space_info, &ticket, start_ns, orig_bytes, flush);
}
/*
* Try to reserve metadata bytes from the block_rsv's space.
*
- * @fs_info: the filesystem
* @space_info: the space_info we're allocating for
* @orig_bytes: number of bytes we want
* @flush: whether or not we can flush to make our reservation
@@ -1808,20 +1897,21 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
-int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
+int btrfs_reserve_metadata_bytes(struct btrfs_space_info *space_info,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush)
{
int ret;
- ret = __reserve_bytes(fs_info, space_info, orig_bytes, flush);
+ ret = reserve_bytes(space_info, orig_bytes, flush);
if (ret == -ENOSPC) {
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
+
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
space_info->flags, orig_bytes, 1);
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
- btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0);
+ btrfs_dump_space_info(space_info, orig_bytes, false);
}
return ret;
}
@@ -1829,30 +1919,32 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
/*
* Try to reserve data bytes for an allocation.
*
- * @fs_info: the filesystem
+ * @space_info: the space_info we're allocating for
* @bytes: number of bytes we need
* @flush: how we are allowed to flush
*
* This will reserve bytes from the data space info. If there is not enough
* space then we will attempt to flush space as specified by flush.
*/
-int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
{
- struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
int ret;
ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE ||
- flush == BTRFS_RESERVE_NO_FLUSH);
- ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
+ flush == BTRFS_RESERVE_NO_FLUSH, "flush=%d", flush);
+ ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA,
+ "current->journal_info=0x%lx flush=%d",
+ (unsigned long)current->journal_info, flush);
- ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
+ ret = reserve_bytes(space_info, bytes, flush);
if (ret == -ENOSPC) {
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
- data_sinfo->flags, bytes, 1);
+ space_info->flags, bytes, 1);
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
- btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);
+ btrfs_dump_space_info(space_info, bytes, false);
}
return ret;
}
@@ -1865,7 +1957,7 @@ __cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info)
btrfs_info(fs_info, "dumping space info:");
list_for_each_entry(space_info, &fs_info->space_info, list) {
spin_lock(&space_info->lock);
- __btrfs_dump_space_info(fs_info, space_info);
+ __btrfs_dump_space_info(space_info);
spin_unlock(&space_info->lock);
}
dump_global_block_rsv(fs_info);
@@ -1882,7 +1974,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
int factor;
/* It's df, we don't care if it's racy */
- if (list_empty(&sinfo->ro_bgs))
+ if (data_race(list_empty(&sinfo->ro_bgs)))
return 0;
spin_lock(&sinfo->lock);
@@ -1907,13 +1999,13 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
static u64 calc_pct_ratio(u64 x, u64 y)
{
- int err;
+ int ret;
if (!y)
return 0;
again:
- err = check_mul_overflow(100, x, &x);
- if (err)
+ ret = check_mul_overflow(100, x, &x);
+ if (ret)
goto lose_precision;
return div64_u64(x, y);
lose_precision:
@@ -2073,7 +2165,7 @@ void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool
}
}
-bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info)
+static bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info)
{
bool ret;
@@ -2121,7 +2213,7 @@ void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len)
global_rsv->reserved += to_add;
btrfs_space_info_update_bytes_may_use(space_info, to_add);
if (global_rsv->reserved >= global_rsv->size)
- global_rsv->full = 1;
+ global_rsv->full = true;
len -= to_add;
}
spin_unlock(&global_rsv->lock);
@@ -2129,5 +2221,5 @@ void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len)
grant:
/* Add to any tickets we may have. */
if (len)
- btrfs_try_granting_tickets(fs_info, space_info);
+ btrfs_try_granting_tickets(space_info);
}
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index a96efdb5e681..446c0614ad4a 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -98,8 +98,18 @@ enum btrfs_flush_state {
RESET_ZONES = 12,
};
+enum btrfs_space_info_sub_group {
+ BTRFS_SUB_GROUP_PRIMARY,
+ BTRFS_SUB_GROUP_DATA_RELOC,
+ BTRFS_SUB_GROUP_TREELOG,
+};
+
+#define BTRFS_SPACE_INFO_SUB_GROUP_MAX 1
struct btrfs_space_info {
struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *parent;
+ struct btrfs_space_info *sub_group[BTRFS_SPACE_INFO_SUB_GROUP_MAX];
+ int subgroup_id;
spinlock_t lock;
u64 total_bytes; /* total bytes in the space,
@@ -132,11 +142,11 @@ struct btrfs_space_info {
flushing. The value is >> clamp, so turns
out to be a 2^clamp divisor. */
- unsigned int full:1; /* indicates that we cannot allocate any more
+ bool full; /* indicates that we cannot allocate any more
chunks for this space */
- unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
+ bool chunk_alloc; /* set if we are allocating a chunk */
- unsigned int flush:1; /* set if we are trying to make space */
+ bool flush; /* set if we are trying to make space */
unsigned int force_alloc; /* set if we need to force a chunk
alloc for this space */
@@ -214,14 +224,6 @@ struct btrfs_space_info {
s64 reclaimable_bytes;
};
-struct reserve_ticket {
- u64 bytes;
- int error;
- bool steal;
- struct list_head list;
- wait_queue_head_t wait;
-};
-
static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_info)
{
return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
@@ -256,6 +258,17 @@ DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info");
DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned");
DECLARE_SPACE_INFO_UPDATE(bytes_zone_unusable, "zone_unusable");
+static inline u64 btrfs_space_info_used(const struct btrfs_space_info *s_info,
+ bool may_use_included)
+{
+ lockdep_assert_held(&s_info->lock);
+
+ return s_info->bytes_used + s_info->bytes_reserved +
+ s_info->bytes_pinned + s_info->bytes_readonly +
+ s_info->bytes_zone_unusable +
+ (may_use_included ? s_info->bytes_may_use : 0);
+}
+
int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
struct btrfs_block_group *block_group);
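btrfs_space_info_used() moves into the header as a static inline, trading the old ASSERT(s_info) and __pure annotation for a lockdep assertion on the space_info lock. A typical call pattern, mirroring the fast path of reserve_bytes() earlier in this patch (used and orig_bytes are assumed locals):

    spin_lock(&space_info->lock);
    used = btrfs_space_info_used(space_info, true);
    if (used + orig_bytes <= space_info->total_bytes)
            btrfs_space_info_update_bytes_may_use(space_info, orig_bytes);
    spin_unlock(&space_info->lock);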
@@ -263,21 +276,15 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
u64 chunk_size);
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
u64 flags);
-u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info,
- bool may_use_included);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
-void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *info, u64 bytes,
- int dump_block_groups);
-int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
+void btrfs_dump_space_info(struct btrfs_space_info *info, u64 bytes,
+ bool dump_block_groups);
+int btrfs_reserve_metadata_bytes(struct btrfs_space_info *space_info,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush);
-void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info);
-int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
- const struct btrfs_space_info *space_info, u64 bytes,
- enum btrfs_reserve_flush_enum flush);
+void btrfs_try_granting_tickets(struct btrfs_space_info *space_info);
+bool btrfs_can_overcommit(const struct btrfs_space_info *space_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush);
static inline void btrfs_space_info_free_bytes_may_use(
struct btrfs_space_info *space_info,
@@ -285,10 +292,10 @@ static inline void btrfs_space_info_free_bytes_may_use(
{
spin_lock(&space_info->lock);
btrfs_space_info_update_bytes_may_use(space_info, -num_bytes);
- btrfs_try_granting_tickets(space_info->fs_info, space_info);
+ btrfs_try_granting_tickets(space_info);
spin_unlock(&space_info->lock);
}
-int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush);
void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info);
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
@@ -296,7 +303,6 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes);
void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready);
-bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info);
int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info);
void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info);
void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len);
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 11dbd7be6a3b..f82e71f5d88b 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -49,7 +49,7 @@
* Implementation:
*
* - Common
- * Both metadata and data will use a new structure, btrfs_subpage, to
+ * Both metadata and data will use a new structure, btrfs_folio_state, to
* record the status of each sector inside a page. This provides the extra
* granularity needed.
*
@@ -63,13 +63,14 @@
* This means a slightly higher tree locking latency.
*/
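
As a rough user-space model of the packed layout described above (the names and geometry here are hypothetical, not the kernel's API): every per-sector state gets a fixed slot of blocks_per_folio bits inside one flat bitmap, and a byte offset maps to a bit index the same way subpage_calc_start_bit() computes it further down in this file.

#include <stdio.h>

/* Hypothetical geometry: 16K folio with 4K sectors -> 4 blocks per folio. */
enum { SECTOR_BITS = 12, BLOCKS_PER_FOLIO = 4 };
enum state_slot { SLOT_UPTODATE, SLOT_DIRTY, SLOT_WRITEBACK, SLOT_MAX };

/* One flat bitmap; slot N occupies bits [N * BLOCKS_PER_FOLIO, ...). */
static unsigned long bitmap;

/* Bit index of the sector at byte 'offset' within slot 'slot'. */
static unsigned int calc_start_bit(enum state_slot slot, unsigned long offset)
{
	return (offset >> SECTOR_BITS) + slot * BLOCKS_PER_FOLIO;
}

int main(void)
{
	/* Mark the sector at byte offset 8192 dirty: bit 1 * 4 + 2 = 6. */
	unsigned int bit = calc_start_bit(SLOT_DIRTY, 8192);

	bitmap |= 1UL << bit;
	printf("dirty sector at offset 8192 -> bit %u of %d total bits\n",
	       bit, SLOT_MAX * BLOCKS_PER_FOLIO);
	return 0;
}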
-int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
- struct folio *folio, enum btrfs_subpage_type type)
+int btrfs_attach_folio_state(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, enum btrfs_folio_type type)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
/* For metadata we don't support large folio yet. */
- ASSERT(!folio_test_large(folio));
+ if (type == BTRFS_SUBPAGE_METADATA)
+ ASSERT(!folio_test_large(folio));
/*
* We have cases like a dummy extent buffer page, which is not mapped
@@ -86,18 +87,18 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio))
return 0;
- subpage = btrfs_alloc_subpage(fs_info, folio_size(folio), type);
- if (IS_ERR(subpage))
- return PTR_ERR(subpage);
+ bfs = btrfs_alloc_folio_state(fs_info, folio_size(folio), type);
+ if (IS_ERR(bfs))
+ return PTR_ERR(bfs);
- folio_attach_private(folio, subpage);
+ folio_attach_private(folio, bfs);
return 0;
}
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio,
- enum btrfs_subpage_type type)
+void btrfs_detach_folio_state(const struct btrfs_fs_info *fs_info, struct folio *folio,
+ enum btrfs_folio_type type)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
/* Either not subpage, or the folio already has private attached. */
if (!folio_test_private(folio))
@@ -107,15 +108,15 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *fol
if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio))
return;
- subpage = folio_detach_private(folio);
- ASSERT(subpage);
- btrfs_free_subpage(subpage);
+ bfs = folio_detach_private(folio);
+ ASSERT(bfs);
+ btrfs_free_folio_state(bfs);
}
-struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- size_t fsize, enum btrfs_subpage_type type)
+struct btrfs_folio_state *btrfs_alloc_folio_state(const struct btrfs_fs_info *fs_info,
+ size_t fsize, enum btrfs_folio_type type)
{
- struct btrfs_subpage *ret;
+ struct btrfs_folio_state *ret;
unsigned int real_size;
ASSERT(fs_info->sectorsize < fsize);
@@ -135,11 +136,6 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
return ret;
}
-void btrfs_free_subpage(struct btrfs_subpage *subpage)
-{
- kfree(subpage);
-}
-
/*
* Increase the eb_refs of current subpage.
*
@@ -151,7 +147,7 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage)
*/
void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
if (!btrfs_meta_is_subpage(fs_info))
return;
@@ -159,13 +155,13 @@ void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *
ASSERT(folio_test_private(folio) && folio->mapping);
lockdep_assert_held(&folio->mapping->i_private_lock);
- subpage = folio_get_private(folio);
- atomic_inc(&subpage->eb_refs);
+ bfs = folio_get_private(folio);
+ atomic_inc(&bfs->eb_refs);
}
void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
if (!btrfs_meta_is_subpage(fs_info))
return;
@@ -173,39 +169,37 @@ void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *
ASSERT(folio_test_private(folio) && folio->mapping);
lockdep_assert_held(&folio->mapping->i_private_lock);
- subpage = folio_get_private(folio);
- ASSERT(atomic_read(&subpage->eb_refs));
- atomic_dec(&subpage->eb_refs);
+ bfs = folio_get_private(folio);
+ ASSERT(atomic_read(&bfs->eb_refs));
+ atomic_dec(&bfs->eb_refs);
}
static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- /* For subpage support, the folio must be single page. */
- ASSERT(folio_order(folio) == 0);
-
/* Basic checks */
ASSERT(folio_test_private(folio) && folio_get_private(folio));
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
- IS_ALIGNED(len, fs_info->sectorsize));
+ IS_ALIGNED(len, fs_info->sectorsize), "start=%llu len=%u", start, len);
/*
 * The range check only works for mapped pages; we can still have
 * unmapped pages like dummy extent buffer pages.
*/
if (folio->mapping)
ASSERT(folio_pos(folio) <= start &&
- start + len <= folio_pos(folio) + folio_size(folio));
+ start + len <= folio_next_pos(folio),
+ "start=%llu len=%u folio_pos=%llu folio_size=%zu",
+ start, len, folio_pos(folio), folio_size(folio));
}
#define subpage_calc_start_bit(fs_info, folio, name, start, len) \
({ \
unsigned int __start_bit; \
- const unsigned int blocks_per_folio = \
- btrfs_blocks_per_folio(fs_info, folio); \
+ const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \
\
btrfs_subpage_assert(fs_info, folio, start, len); \
- __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
- __start_bit += blocks_per_folio * btrfs_bitmap_nr_##name; \
+ __start_bit = offset_in_folio(folio, start) >> fs_info->sectorsize_bits; \
+ __start_bit += __bpf * btrfs_bitmap_nr_##name; \
__start_bit; \
})
@@ -223,14 +217,13 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
if (folio_pos(folio) >= orig_start + orig_len)
*len = 0;
else
- *len = min_t(u64, folio_pos(folio) + folio_size(folio),
- orig_start + orig_len) - *start;
+ *len = min_t(u64, folio_next_pos(folio), orig_start + orig_len) - *start;
}
static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
const int nbits = (len >> fs_info->sectorsize_bits);
unsigned long flags;
@@ -240,7 +233,7 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
btrfs_subpage_assert(fs_info, folio, start, len);
- spin_lock_irqsave(&subpage->lock, flags);
+ spin_lock_irqsave(&bfs->lock, flags);
/*
 * We have call sites passing @locked_page into
 * extent_clear_unlock_delalloc() for the compression path.
@@ -248,18 +241,20 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
* This @locked_page is locked by plain lock_page(), thus its
* subpage::locked is 0. Handle them in a special way.
*/
- if (atomic_read(&subpage->nr_locked) == 0) {
- spin_unlock_irqrestore(&subpage->lock, flags);
+ if (atomic_read(&bfs->nr_locked) == 0) {
+ spin_unlock_irqrestore(&bfs->lock, flags);
return true;
}
- for_each_set_bit_from(bit, subpage->bitmaps, start_bit + nbits) {
- clear_bit(bit, subpage->bitmaps);
+ for_each_set_bit_from(bit, bfs->bitmaps, start_bit + nbits) {
+ clear_bit(bit, bfs->bitmaps);
cleared++;
}
- ASSERT(atomic_read(&subpage->nr_locked) >= cleared);
- last = atomic_sub_and_test(cleared, &subpage->nr_locked);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ ASSERT(atomic_read(&bfs->nr_locked) >= cleared,
+ "atomic_read(&bfs->nr_locked)=%d cleared=%d",
+ atomic_read(&bfs->nr_locked), cleared);
+ last = atomic_sub_and_test(cleared, &bfs->nr_locked);
+ spin_unlock_irqrestore(&bfs->lock, flags);
return last;
}
@@ -282,7 +277,7 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
ASSERT(folio_test_locked(folio));
@@ -298,7 +293,7 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
* Since we own the page lock, no one else could touch subpage::locked
* and we are safe to do several atomic operations without spinlock.
*/
- if (atomic_read(&subpage->nr_locked) == 0) {
+ if (atomic_read(&bfs->nr_locked) == 0) {
/* No subpage lock, locked by plain lock_page(). */
folio_unlock(folio);
return;
@@ -312,7 +307,7 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
struct folio *folio, unsigned long bitmap)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
const int start_bit = blocks_per_folio * btrfs_bitmap_nr_locked;
unsigned long flags;
@@ -325,86 +320,84 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
return;
}
- if (atomic_read(&subpage->nr_locked) == 0) {
+ if (atomic_read(&bfs->nr_locked) == 0) {
/* No subpage lock, locked by plain lock_page(). */
folio_unlock(folio);
return;
}
- spin_lock_irqsave(&subpage->lock, flags);
+ spin_lock_irqsave(&bfs->lock, flags);
for_each_set_bit(bit, &bitmap, blocks_per_folio) {
- if (test_and_clear_bit(bit + start_bit, subpage->bitmaps))
+ if (test_and_clear_bit(bit + start_bit, bfs->bitmaps))
cleared++;
}
- ASSERT(atomic_read(&subpage->nr_locked) >= cleared);
- last = atomic_sub_and_test(cleared, &subpage->nr_locked);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ ASSERT(atomic_read(&bfs->nr_locked) >= cleared,
+ "atomic_read(&bfs->nr_locked)=%d cleared=%d",
+ atomic_read(&bfs->nr_locked), cleared);
+ last = atomic_sub_and_test(cleared, &bfs->nr_locked);
+ spin_unlock_irqrestore(&bfs->lock, flags);
if (last)
folio_unlock(folio);
}
#define subpage_test_bitmap_all_set(fs_info, folio, name) \
({ \
- struct btrfs_subpage *subpage = folio_get_private(folio); \
- const unsigned int blocks_per_folio = \
- btrfs_blocks_per_folio(fs_info, folio); \
+ struct btrfs_folio_state *__bfs = folio_get_private(folio); \
+ const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \
\
- bitmap_test_range_all_set(subpage->bitmaps, \
- blocks_per_folio * btrfs_bitmap_nr_##name, \
- blocks_per_folio); \
+ bitmap_test_range_all_set(__bfs->bitmaps, \
+ __bpf * btrfs_bitmap_nr_##name, __bpf); \
})
#define subpage_test_bitmap_all_zero(fs_info, folio, name) \
({ \
- struct btrfs_subpage *subpage = folio_get_private(folio); \
- const unsigned int blocks_per_folio = \
- btrfs_blocks_per_folio(fs_info, folio); \
+ struct btrfs_folio_state *__bfs = folio_get_private(folio); \
+ const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \
\
- bitmap_test_range_all_zero(subpage->bitmaps, \
- blocks_per_folio * btrfs_bitmap_nr_##name, \
- blocks_per_folio); \
+ bitmap_test_range_all_zero(__bfs->bitmaps, \
+ __bpf * btrfs_bitmap_nr_##name, __bpf); \
})
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
uptodate, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_set(fs_info, folio, uptodate))
folio_mark_uptodate(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
uptodate, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
folio_clear_uptodate(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
dirty, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_unlock_irqrestore(&bfs->lock, flags);
folio_mark_dirty(folio);
}
@@ -421,17 +414,17 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
dirty, start, len);
unsigned long flags;
bool last = false;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_zero(fs_info, folio, dirty))
last = true;
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
return last;
}
@@ -448,91 +441,100 @@ void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
writeback, start, len);
unsigned long flags;
+ bool keep_write;
+
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ /*
+ * Don't clear the TOWRITE tag when starting writeback on a still-dirty
+ * folio. Doing so can cause WB_SYNC_ALL writepages() to overlook it,
+	 * assume writeback is complete, and exit too early, violating sync
+	 * ordering guarantees.
+ */
+ keep_write = folio_test_dirty(folio);
if (!folio_test_writeback(folio))
- folio_start_writeback(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ __folio_start_writeback(folio, keep_write);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
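
For intuition, a small self-contained model of the hazard the comment above describes; the flags are simplified stand-ins for the page-cache TOWRITE tag, and the keep_write semantics are taken from the code above (plain folio_start_writeback() clears the tag, __folio_start_writeback(folio, true) keeps it):

#include <stdbool.h>
#include <stdio.h>

/* Toy flags standing in for the real page-cache state and TOWRITE tag. */
struct page { bool dirty, writeback, towrite; };

int main(void)
{
	struct page p = { .dirty = true };

	/* Step 1: a WB_SYNC_ALL pass tags every dirty page as TOWRITE,
	 * planning to visit all tagged pages afterwards. */
	p.towrite = true;

	/* Step 2: writeback starts on part of the page; the page stays
	 * dirty because more sectors remain. Clearing TOWRITE here (what
	 * plain folio_start_writeback() does) is the bug the code above
	 * avoids by passing keep_write=true. */
	p.writeback = true;
	p.towrite = false;

	/* Step 3: the sync walk visits only TOWRITE pages, so it skips
	 * this one and declares completion while dirty data remains. */
	if (p.dirty && !p.towrite)
		printf("sync would skip a still-dirty page\n");
	return 0;
}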
void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
writeback, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_zero(fs_info, folio, writeback)) {
ASSERT(folio_test_writeback(folio));
folio_end_writeback(folio);
}
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
ordered, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
folio_set_ordered(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
ordered, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_zero(fs_info, folio, ordered))
folio_clear_ordered(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
checked, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_set(fs_info, folio, checked))
folio_set_checked(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
checked, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
folio_clear_checked(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
/*
@@ -543,16 +545,16 @@ void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
- struct btrfs_subpage *subpage = folio_get_private(folio); \
+ struct btrfs_folio_state *bfs = folio_get_private(folio); \
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, \
name, start, len); \
unsigned long flags; \
bool ret; \
\
- spin_lock_irqsave(&subpage->lock, flags); \
- ret = bitmap_test_range_all_set(subpage->bitmaps, start_bit, \
+ spin_lock_irqsave(&bfs->lock, flags); \
+ ret = bitmap_test_range_all_set(bfs->bitmaps, start_bit, \
len >> fs_info->sectorsize_bits); \
- spin_unlock_irqrestore(&subpage->lock, flags); \
+ spin_unlock_irqrestore(&bfs->lock, flags); \
return ret; \
}
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
@@ -662,27 +664,23 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
#define GET_SUBPAGE_BITMAP(fs_info, folio, name, dst) \
{ \
- const unsigned int blocks_per_folio = \
- btrfs_blocks_per_folio(fs_info, folio); \
- const struct btrfs_subpage *subpage = folio_get_private(folio); \
+ const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \
+ const struct btrfs_folio_state *__bfs = folio_get_private(folio); \
\
- ASSERT(blocks_per_folio < BITS_PER_LONG); \
- *dst = bitmap_read(subpage->bitmaps, \
- blocks_per_folio * btrfs_bitmap_nr_##name, \
- blocks_per_folio); \
+ ASSERT(__bpf <= BITS_PER_LONG); \
+ *dst = bitmap_read(__bfs->bitmaps, \
+ __bpf * btrfs_bitmap_nr_##name, __bpf); \
}
#define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len) \
{ \
unsigned long bitmap; \
- const unsigned int blocks_per_folio = \
- btrfs_blocks_per_folio(fs_info, folio); \
+ const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \
\
GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \
btrfs_warn(fs_info, \
- "dumpping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \
- start, len, folio_pos(folio), \
- blocks_per_folio, &bitmap); \
+ "dumping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \
+ start, len, folio_pos(folio), __bpf, &bitmap); \
}
/*
@@ -692,7 +690,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
unsigned int start_bit;
unsigned int nbits;
unsigned long flags;
@@ -707,15 +705,15 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len);
nbits = len >> fs_info->sectorsize_bits;
- subpage = folio_get_private(folio);
- ASSERT(subpage);
- spin_lock_irqsave(&subpage->lock, flags);
- if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) {
+ bfs = folio_get_private(folio);
+ ASSERT(bfs);
+ spin_lock_irqsave(&bfs->lock, flags);
+ if (unlikely(!bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits))) {
SUBPAGE_DUMP_BITMAP(fs_info, folio, dirty, start, len);
- ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
+ ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits));
}
- ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
- spin_unlock_irqrestore(&subpage->lock, flags);
+ ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits));
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
/*
@@ -728,7 +726,7 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
unsigned long flags;
unsigned int start_bit;
unsigned int nbits;
@@ -738,19 +736,19 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio))
return;
- subpage = folio_get_private(folio);
+ bfs = folio_get_private(folio);
start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
nbits = len >> fs_info->sectorsize_bits;
- spin_lock_irqsave(&subpage->lock, flags);
+ spin_lock_irqsave(&bfs->lock, flags);
/* Target range should not yet be locked. */
- if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) {
+ if (unlikely(!bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits))) {
SUBPAGE_DUMP_BITMAP(fs_info, folio, locked, start, len);
- ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
+ ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits));
}
- bitmap_set(subpage->bitmaps, start_bit, nbits);
- ret = atomic_add_return(nbits, &subpage->nr_locked);
+ bitmap_set(bfs->bitmaps, start_bit, nbits);
+ ret = atomic_add_return(nbits, &bfs->nr_locked);
ASSERT(ret <= btrfs_blocks_per_folio(fs_info, folio));
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
/*
@@ -778,7 +776,7 @@ bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct ext
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
unsigned long uptodate_bitmap;
unsigned long dirty_bitmap;
@@ -790,18 +788,18 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
ASSERT(folio_test_private(folio) && folio_get_private(folio));
ASSERT(blocks_per_folio > 1);
- subpage = folio_get_private(folio);
+ bfs = folio_get_private(folio);
- spin_lock_irqsave(&subpage->lock, flags);
+ spin_lock_irqsave(&bfs->lock, flags);
GET_SUBPAGE_BITMAP(fs_info, folio, uptodate, &uptodate_bitmap);
GET_SUBPAGE_BITMAP(fs_info, folio, dirty, &dirty_bitmap);
GET_SUBPAGE_BITMAP(fs_info, folio, writeback, &writeback_bitmap);
GET_SUBPAGE_BITMAP(fs_info, folio, ordered, &ordered_bitmap);
GET_SUBPAGE_BITMAP(fs_info, folio, checked, &checked_bitmap);
GET_SUBPAGE_BITMAP(fs_info, folio, locked, &locked_bitmap);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
- dump_page(folio_page(folio, 0), "btrfs subpage dump");
+ dump_page(folio_page(folio, 0), "btrfs folio state dump");
btrfs_warn(fs_info,
"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
start, len, folio_pos(folio),
@@ -817,14 +815,14 @@ void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
struct folio *folio,
unsigned long *ret_bitmap)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
unsigned long flags;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
ASSERT(btrfs_blocks_per_folio(fs_info, folio) > 1);
- subpage = folio_get_private(folio);
+ bfs = folio_get_private(folio);
- spin_lock_irqsave(&subpage->lock, flags);
+ spin_lock_irqsave(&bfs->lock, flags);
GET_SUBPAGE_BITMAP(fs_info, folio, dirty, ret_bitmap);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 3042c5ea840a..d81a0ade559f 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -7,13 +7,12 @@
#include <linux/atomic.h>
#include <linux/sizes.h>
#include "btrfs_inode.h"
-#include "fs.h"
struct address_space;
struct folio;
/*
- * Extra info for subpapge bitmap.
+ * Extra info for subpage bitmap.
*
* For subpage we pack all uptodate/dirty/writeback/ordered bitmaps into
* one larger bitmap.
@@ -32,9 +31,31 @@ struct folio;
enum {
btrfs_bitmap_nr_uptodate = 0,
btrfs_bitmap_nr_dirty,
+
+ /*
+	 * This can be changed to atomic eventually, but that change relies
+	 * on the async delalloc range rework for the locked bitmap, as async
+	 * delalloc can unlock its range and mark blocks writeback at any
+	 * time.
+ */
btrfs_bitmap_nr_writeback,
+
+ /*
+ * The ordered and checked flags are for COW fixup, already marked
+ * deprecated, and will be removed eventually.
+ */
btrfs_bitmap_nr_ordered,
btrfs_bitmap_nr_checked,
+
+ /*
+	 * The locked bit is for the async delalloc range (compression);
+	 * currently an async extent is queued with the range locked until
+	 * the compression is done, so an async extent can unlock the range
+	 * at any time.
+	 *
+	 * This will need a rework of the async extent lifespan (mark writeback
+	 * and do compression) before deprecating this flag.
+ */
btrfs_bitmap_nr_locked,
btrfs_bitmap_nr_max
};
@@ -43,7 +64,7 @@ enum {
* Structure to trace status of each sector inside a page, attached to
* page::private for both data and metadata inodes.
*/
-struct btrfs_subpage {
+struct btrfs_folio_state {
/* Common members for both data and metadata pages */
spinlock_t lock;
union {
@@ -51,7 +72,7 @@ struct btrfs_subpage {
* Structures only used by metadata
*
* @eb_refs should only be operated under private_lock, as it
- * manages whether the subpage can be detached.
+ * manages whether the btrfs_folio_state can be detached.
*/
atomic_t eb_refs;
@@ -65,12 +86,11 @@ struct btrfs_subpage {
unsigned long bitmaps[];
};
-enum btrfs_subpage_type {
+enum btrfs_folio_type {
BTRFS_SUBPAGE_METADATA,
BTRFS_SUBPAGE_DATA,
};
-#if PAGE_SIZE > BTRFS_MIN_BLOCKSIZE
/*
* Subpage support for metadata is more complex, as we can have dummy extent
* buffers, where folios have no mapping to determine the owning inode.
@@ -91,29 +111,19 @@ static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info,
ASSERT(is_data_inode(BTRFS_I(folio->mapping->host)));
return fs_info->sectorsize < folio_size(folio);
}
-#else
-static inline bool btrfs_meta_is_subpage(const struct btrfs_fs_info *fs_info)
-{
- return false;
-}
-static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info,
- struct folio *folio)
-{
- if (folio->mapping && folio->mapping->host)
- ASSERT(is_data_inode(BTRFS_I(folio->mapping->host)));
- return false;
-}
-#endif
-int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
- struct folio *folio, enum btrfs_subpage_type type);
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio,
- enum btrfs_subpage_type type);
+int btrfs_attach_folio_state(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, enum btrfs_folio_type type);
+void btrfs_detach_folio_state(const struct btrfs_fs_info *fs_info, struct folio *folio,
+ enum btrfs_folio_type type);
/* Allocate additional data where page represents more than one sector */
-struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- size_t fsize, enum btrfs_subpage_type type);
-void btrfs_free_subpage(struct btrfs_subpage *subpage);
+struct btrfs_folio_state *btrfs_alloc_folio_state(const struct btrfs_fs_info *fs_info,
+ size_t fsize, enum btrfs_folio_type type);
+static inline void btrfs_free_folio_state(struct btrfs_folio_state *bfs)
+{
+ kfree(bfs);
+}
void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 40709e2a44fc..1999533b52be 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -88,6 +88,9 @@ struct btrfs_fs_context {
refcount_t refs;
};
+static void btrfs_emit_options(struct btrfs_fs_info *info,
+ struct btrfs_fs_context *old);
+
enum {
Opt_acl,
Opt_clear_cache,
@@ -125,15 +128,13 @@ enum {
/* Rescue options */
Opt_rescue,
Opt_usebackuproot,
- Opt_nologreplay,
/* Debugging options */
Opt_enospc_debug,
#ifdef CONFIG_BTRFS_DEBUG
Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
-#endif
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
Opt_ref_verify,
+ Opt_ref_tracker,
#endif
Opt_err,
};
@@ -246,8 +247,6 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = {
/* Rescue options. */
fsparam_enum("rescue", Opt_rescue, btrfs_parameter_rescue),
- /* Deprecated, with alias rescue=nologreplay */
- __fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL),
/* Deprecated, with alias rescue=usebackuproot */
__fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL),
/* For compatibility only, alias for "rescue=nologreplay". */
@@ -257,17 +256,85 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = {
fsparam_flag_no("enospc_debug", Opt_enospc_debug),
#ifdef CONFIG_BTRFS_DEBUG
fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment),
-#endif
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ fsparam_flag("ref_tracker", Opt_ref_tracker),
fsparam_flag("ref_verify", Opt_ref_verify),
#endif
{}
};
-/* No support for restricting writes to btrfs devices yet... */
-static inline blk_mode_t btrfs_open_mode(struct fs_context *fc)
+static bool btrfs_match_compress_type(const char *string, const char *type, bool may_have_level)
+{
+ const int len = strlen(type);
+
+ return (strncmp(string, type, len) == 0) &&
+ ((may_have_level && string[len] == ':') || string[len] == '\0');
+}
+
+static int btrfs_parse_compress(struct btrfs_fs_context *ctx,
+ const struct fs_parameter *param, int opt)
{
- return sb_open_mode(fc->sb_flags) & ~BLK_OPEN_RESTRICT_WRITES;
+ const char *string = param->string;
+ int ret;
+
+ /*
+	 * Provide the same semantics as older kernels that don't use fs
+	 * context: specifying the "compress" option clears "force-compress"
+	 * without the need to pass "compress-force=[no|none]" before
+	 * specifying "compress".
+ */
+ if (opt != Opt_compress_force && opt != Opt_compress_force_type)
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+
+ if (opt == Opt_compress || opt == Opt_compress_force) {
+ ctx->compress_type = BTRFS_COMPRESS_ZLIB;
+ ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (btrfs_match_compress_type(string, "zlib", true)) {
+ ctx->compress_type = BTRFS_COMPRESS_ZLIB;
+ ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, string + 4,
+ &ctx->compress_level);
+ if (ret < 0)
+ goto error;
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (btrfs_match_compress_type(string, "lzo", true)) {
+ ctx->compress_type = BTRFS_COMPRESS_LZO;
+ ret = btrfs_compress_str2level(BTRFS_COMPRESS_LZO, string + 3,
+ &ctx->compress_level);
+ if (ret < 0)
+ goto error;
+ if (string[3] == ':' && string[4])
+ btrfs_warn(NULL, "Compression level ignored for LZO");
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (btrfs_match_compress_type(string, "zstd", true)) {
+ ctx->compress_type = BTRFS_COMPRESS_ZSTD;
+ ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, string + 4,
+ &ctx->compress_level);
+ if (ret < 0)
+ goto error;
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (btrfs_match_compress_type(string, "no", false) ||
+ btrfs_match_compress_type(string, "none", false)) {
+ ctx->compress_level = 0;
+ ctx->compress_type = 0;
+ btrfs_clear_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+ } else {
+ ret = -EINVAL;
+ goto error;
+ }
+ return 0;
+error:
+ btrfs_err(NULL, "failed to parse compression option '%s'", string);
+ return ret;
}
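
A standalone sketch of the matching rule btrfs_match_compress_type() implements above: the type name must match exactly, optionally followed by ':<level>' when a level is allowed.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Same rule as btrfs_match_compress_type() above, in plain C. */
static bool match_compress_type(const char *s, const char *type, bool may_have_level)
{
	const size_t len = strlen(type);

	return strncmp(s, type, len) == 0 &&
	       ((may_have_level && s[len] == ':') || s[len] == '\0');
}

int main(void)
{
	printf("%d\n", match_compress_type("zstd:3", "zstd", true)); /* 1 */
	printf("%d\n", match_compress_type("zstdx", "zstd", true));  /* 0 */
	printf("%d\n", match_compress_type("zlib", "zlib", false));  /* 1 */
	return 0;
}

Note the design fix relative to the removed code later in this hunk: the old bare strncmp() calls accepted any string with the type as a prefix (e.g. "zlibfoo"), whereas the helper rejects anything but an exact name or a name plus a level suffix.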
static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
@@ -306,10 +373,9 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_device: {
struct btrfs_device *device;
- blk_mode_t mode = btrfs_open_mode(fc);
mutex_lock(&uuid_mutex);
- device = btrfs_scan_one_device(param->string, mode, false);
+ device = btrfs_scan_one_device(param->string, false);
mutex_unlock(&uuid_mutex);
if (IS_ERR(device))
return PTR_ERR(device);
@@ -339,53 +405,8 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
fallthrough;
case Opt_compress:
case Opt_compress_type:
- /*
- * Provide the same semantics as older kernels that don't use fs
- * context, specifying the "compress" option clears
- * "force-compress" without the need to pass
- * "compress-force=[no|none]" before specifying "compress".
- */
- if (opt != Opt_compress_force && opt != Opt_compress_force_type)
- btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
-
- if (opt == Opt_compress || opt == Opt_compress_force) {
- ctx->compress_type = BTRFS_COMPRESS_ZLIB;
- ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
- btrfs_set_opt(ctx->mount_opt, COMPRESS);
- btrfs_clear_opt(ctx->mount_opt, NODATACOW);
- btrfs_clear_opt(ctx->mount_opt, NODATASUM);
- } else if (strncmp(param->string, "zlib", 4) == 0) {
- ctx->compress_type = BTRFS_COMPRESS_ZLIB;
- ctx->compress_level =
- btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB,
- param->string + 4);
- btrfs_set_opt(ctx->mount_opt, COMPRESS);
- btrfs_clear_opt(ctx->mount_opt, NODATACOW);
- btrfs_clear_opt(ctx->mount_opt, NODATASUM);
- } else if (strncmp(param->string, "lzo", 3) == 0) {
- ctx->compress_type = BTRFS_COMPRESS_LZO;
- ctx->compress_level = 0;
- btrfs_set_opt(ctx->mount_opt, COMPRESS);
- btrfs_clear_opt(ctx->mount_opt, NODATACOW);
- btrfs_clear_opt(ctx->mount_opt, NODATASUM);
- } else if (strncmp(param->string, "zstd", 4) == 0) {
- ctx->compress_type = BTRFS_COMPRESS_ZSTD;
- ctx->compress_level =
- btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD,
- param->string + 4);
- btrfs_set_opt(ctx->mount_opt, COMPRESS);
- btrfs_clear_opt(ctx->mount_opt, NODATACOW);
- btrfs_clear_opt(ctx->mount_opt, NODATASUM);
- } else if (strncmp(param->string, "no", 2) == 0) {
- ctx->compress_level = 0;
- ctx->compress_type = 0;
- btrfs_clear_opt(ctx->mount_opt, COMPRESS);
- btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
- } else {
- btrfs_err(NULL, "unrecognized compression value %s",
- param->string);
+ if (btrfs_parse_compress(ctx, param, opt))
return -EINVAL;
- }
break;
case Opt_ssd:
if (result.negated) {
@@ -449,11 +470,6 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
else
btrfs_clear_opt(ctx->mount_opt, NOTREELOG);
break;
- case Opt_nologreplay:
- btrfs_warn(NULL,
- "'nologreplay' is deprecated, use 'rescue=nologreplay' instead");
- btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
- break;
case Opt_norecovery:
btrfs_info(NULL,
"'norecovery' is for compatibility only, recommended to use 'rescue=nologreplay'");
@@ -569,6 +585,10 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_commit_interval:
ctx->commit_interval = result.uint_32;
+ if (ctx->commit_interval > BTRFS_WARNING_COMMIT_INTERVAL) {
+ btrfs_warn(NULL, "excessive commit interval %u, use with care",
+ ctx->commit_interval);
+ }
if (ctx->commit_interval == 0)
ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
break;
@@ -624,11 +644,12 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
return -EINVAL;
}
break;
-#endif
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
case Opt_ref_verify:
btrfs_set_opt(ctx->mount_opt, REF_VERIFY);
break;
+ case Opt_ref_tracker:
+ btrfs_set_opt(ctx->mount_opt, REF_TRACKER);
+ break;
#endif
default:
btrfs_err(NULL, "unrecognized mount option '%s'", param->key);
@@ -693,12 +714,9 @@ bool btrfs_check_options(const struct btrfs_fs_info *info,
if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) {
if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
- btrfs_info(info, "disk space caching is enabled");
btrfs_warn(info,
"space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2");
}
- if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE))
- btrfs_info(info, "using free-space-tree");
}
return ret;
@@ -789,17 +807,15 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
struct btrfs_root_ref *root_ref;
struct btrfs_inode_ref *inode_ref;
struct btrfs_key key;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
char *name = NULL, *ptr;
u64 dirid;
int len;
int ret;
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto err;
- }
+ if (!path)
+ return ERR_PTR(-ENOMEM);
name = kmalloc(PATH_MAX, GFP_KERNEL);
if (!name) {
@@ -887,7 +903,6 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
fs_root = NULL;
}
- btrfs_free_path(path);
if (ptr == name + PATH_MAX - 1) {
name[0] = '/';
name[1] = '\0';
@@ -898,7 +913,6 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
err:
btrfs_put_root(fs_root);
- btrfs_free_path(path);
kfree(name);
return ERR_PTR(ret);
}
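
BTRFS_PATH_AUTO_FREE above (and the AUTO_KFREE annotation later in this diff) declares a pointer that is freed automatically when it goes out of scope, which is why the explicit btrfs_free_path() calls disappear. It is built on the compiler's cleanup attribute; a minimal user-space sketch of the mechanism (the macro name here is hypothetical):

#include <stdio.h>
#include <stdlib.h>

static void free_charp(char **p)
{
	free(*p);
}

/* Anything assigned to such a variable is freed when it leaves scope. */
#define AUTO_FREE_CHARP(name) \
	char *name __attribute__((cleanup(free_charp))) = NULL

int main(void)
{
	AUTO_FREE_CHARP(buf);

	buf = malloc(32);
	if (!buf)
		return 1;
	snprintf(buf, 32, "freed automatically");
	puts(buf);
	return 0;	/* free_charp(&buf) runs here */
}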
@@ -907,7 +921,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
{
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_dir_item *di;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key location;
struct fscrypt_str name = FSTR_INIT("default", 7);
u64 dir_id;
@@ -924,7 +938,6 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(NULL, root, path, dir_id, &name, 0);
-	if (IS_ERR(di)) {
-		btrfs_free_path(path);
+	if (IS_ERR(di))
 		return PTR_ERR(di);
-	}
if (!di) {
@@ -933,13 +946,11 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
* it's always been there, but don't freak out, just try and
* mount the top-level subvolume.
*/
- btrfs_free_path(path);
*objectid = BTRFS_FS_TREE_OBJECTID;
return 0;
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
- btrfs_free_path(path);
*objectid = location.objectid;
return 0;
}
@@ -949,12 +960,12 @@ static int btrfs_fill_super(struct super_block *sb,
{
struct btrfs_inode *inode;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- int err;
+ int ret;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_magic = BTRFS_SUPER_MAGIC;
sb->s_op = &btrfs_super_ops;
- sb->s_d_op = &btrfs_dentry_operations;
+ set_default_d_op(sb, &btrfs_dentry_operations);
sb->s_export_op = &btrfs_export_ops;
#ifdef CONFIG_FS_VERITY
sb->s_vop = &btrfs_verityops;
@@ -963,28 +974,30 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_time_gran = 1;
sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM;
- err = super_setup_bdi(sb);
- if (err) {
+ ret = super_setup_bdi(sb);
+ if (ret) {
btrfs_err(fs_info, "super_setup_bdi failed");
- return err;
+ return ret;
}
- err = open_ctree(sb, fs_devices);
- if (err) {
- btrfs_err(fs_info, "open_ctree failed: %d", err);
- return err;
+ ret = open_ctree(sb, fs_devices);
+ if (ret) {
+ btrfs_err(fs_info, "open_ctree failed: %d", ret);
+ return ret;
}
+ btrfs_emit_options(fs_info, NULL);
+
inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- btrfs_handle_fs_error(fs_info, err, NULL);
+ ret = PTR_ERR(inode);
+ btrfs_handle_fs_error(fs_info, ret, NULL);
goto fail_close;
}
sb->s_root = d_make_root(&inode->vfs_inode);
if (!sb->s_root) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto fail_close;
}
@@ -993,7 +1006,7 @@ static int btrfs_fill_super(struct super_block *sb,
fail_close:
close_ctree(fs_info);
- return err;
+ return ret;
}
int btrfs_sync_fs(struct super_block *sb, int wait)
@@ -1072,7 +1085,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_printf(seq, ",compress-force=%s", compress_type);
else
seq_printf(seq, ",compress=%s", compress_type);
- if (info->compress_level)
+ if (info->compress_level && info->compress_type != BTRFS_COMPRESS_LZO)
seq_printf(seq, ":%d", info->compress_level);
}
if (btrfs_test_opt(info, NOSSD))
@@ -1135,12 +1148,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
#endif
if (btrfs_test_opt(info, REF_VERIFY))
seq_puts(seq, ",ref_verify");
+ if (btrfs_test_opt(info, REF_TRACKER))
+ seq_puts(seq, ",ref_tracker");
seq_printf(seq, ",subvolid=%llu", btrfs_root_id(BTRFS_I(d_inode(dentry))->root));
subvol_name = btrfs_get_subvol_name_from_objectid(info,
btrfs_root_id(BTRFS_I(d_inode(dentry))->root));
if (!IS_ERR(subvol_name)) {
- seq_puts(seq, ",subvol=");
- seq_escape(seq, subvol_name, " \t\n\\");
+ seq_show_option(seq, "subvol", subvol_name);
kfree(subvol_name);
}
return 0;
@@ -1149,11 +1163,11 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
/*
* subvolumes are identified by ino 256
*/
-static inline int is_subvolume_inode(struct inode *inode)
+static inline bool is_subvolume_inode(struct inode *inode)
{
if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
- return 1;
- return 0;
+ return true;
+ return false;
}
static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
@@ -1262,7 +1276,7 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
/*
- * We need to cleanup all defragable inodes if the autodefragment is
+	 * We need to clean up all defraggable inodes if autodefrag is
 * disabled or the filesystem is read-only.
*/
if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
@@ -1433,7 +1447,7 @@ static void btrfs_emit_options(struct btrfs_fs_info *info,
{
btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts");
- btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
+ btrfs_info_if_set(info, old, NODATACOW, "setting nodatacow");
btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations");
btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme");
btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers");
@@ -1455,10 +1469,11 @@ static void btrfs_emit_options(struct btrfs_fs_info *info,
btrfs_info_if_set(info, old, IGNOREMETACSUMS, "ignoring meta csums");
btrfs_info_if_set(info, old, IGNORESUPERFLAGS, "ignoring unknown super block flags");
+ btrfs_info_if_unset(info, old, NODATASUM, "setting datasum");
btrfs_info_if_unset(info, old, NODATACOW, "setting datacow");
btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations");
btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme");
- btrfs_info_if_unset(info, old, NOBARRIER, "turning off barriers");
+ btrfs_info_if_unset(info, old, NOBARRIER, "turning on barriers");
btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log");
btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching");
btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree");
@@ -1595,7 +1610,7 @@ static inline void btrfs_descending_sort_devices(
static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
u64 *free_bytes)
{
- struct btrfs_device_info *devices_info;
+ struct btrfs_device_info AUTO_KFREE(devices_info);
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
u64 type;
@@ -1693,7 +1708,6 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
nr_devices--;
}
- kfree(devices_info);
*free_bytes = avail_space;
return 0;
}
@@ -1831,10 +1845,9 @@ static int btrfs_get_tree_super(struct fs_context *fc)
struct btrfs_fs_info *fs_info = fc->s_fs_info;
struct btrfs_fs_context *ctx = fc->fs_private;
struct btrfs_fs_devices *fs_devices = NULL;
- struct block_device *bdev;
struct btrfs_device *device;
struct super_block *sb;
- blk_mode_t mode = btrfs_open_mode(fc);
+ blk_mode_t mode = sb_open_mode(fc->sb_flags);
int ret;
btrfs_ctx_to_info(fs_info, ctx);
@@ -1844,47 +1857,58 @@ static int btrfs_get_tree_super(struct fs_context *fc)
* With 'true' passed to btrfs_scan_one_device() (mount time) we expect
* either a valid device or an error.
*/
- device = btrfs_scan_one_device(fc->source, mode, true);
+ device = btrfs_scan_one_device(fc->source, true);
ASSERT(device != NULL);
if (IS_ERR(device)) {
mutex_unlock(&uuid_mutex);
return PTR_ERR(device);
}
-
fs_devices = device->fs_devices;
+ /*
+	 * We cannot hold uuid_mutex while calling sget_fc(); it would lead
+	 * to a locking order reversal with s_umount.
+	 *
+	 * So here we increase the holding count of fs_devices, which ensures
+	 * the fs_devices itself won't be freed.
+ */
+ btrfs_fs_devices_inc_holding(fs_devices);
fs_info->fs_devices = fs_devices;
-
- ret = btrfs_open_devices(fs_devices, mode, &btrfs_fs_type);
mutex_unlock(&uuid_mutex);
- if (ret)
- return ret;
- if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
- ret = -EACCES;
- goto error;
- }
- bdev = fs_devices->latest_dev->bdev;
-
- /*
- * From now on the error handling is not straightforward.
- *
- * If successful, this will transfer the fs_info into the super block,
- * and fc->s_fs_info will be NULL. However if there's an existing
- * super, we'll still have fc->s_fs_info populated. If we error
- * completely out it'll be cleaned up when we drop the fs_context,
- * otherwise it's tied to the lifetime of the super_block.
- */
sb = sget_fc(fc, btrfs_fc_test_super, set_anon_super_fc);
if (IS_ERR(sb)) {
- ret = PTR_ERR(sb);
- goto error;
+ mutex_lock(&uuid_mutex);
+ btrfs_fs_devices_dec_holding(fs_devices);
+ /*
+ * Since the fs_devices is not opened, it can be freed at any
+	 * time after unlocking uuid_mutex. We need to avoid a double
+	 * free through put_fs_context()->btrfs_free_fs_info(), so here
+	 * we reset fs_info->fs_devices to NULL and let the regular
+	 * fs_devices reclaim path handle it.
+ *
+ * This applies to all later branches where no fs_devices is
+ * opened.
+ */
+ fs_info->fs_devices = NULL;
+ mutex_unlock(&uuid_mutex);
+ return PTR_ERR(sb);
}
- set_device_specific_options(fs_info);
-
if (sb->s_root) {
- btrfs_close_devices(fs_devices);
+ /*
+	 * Not the first mount of the fs, thus we got an existing super block.
+	 * Reuse the returned super block, fs_info and fs_devices.
+ *
+ * fc->s_fs_info is not touched and will be later freed by
+ * put_fs_context() through btrfs_free_fs_context().
+ */
+ ASSERT(fc->s_fs_info == fs_info);
+
+ mutex_lock(&uuid_mutex);
+ btrfs_fs_devices_dec_holding(fs_devices);
+ fs_info->fs_devices = NULL;
+ mutex_unlock(&uuid_mutex);
/*
* At this stage we may have RO flag mismatch between
* fc->sb_flags and sb->s_flags. Caller should detect such
@@ -1892,9 +1916,33 @@ static int btrfs_get_tree_super(struct fs_context *fc)
* needed.
*/
} else {
+ struct block_device *bdev;
+
+ /*
+	 * The first mount of the fs, thus a new superblock; fc->s_fs_info
+	 * must be NULL, and the ownership of our fs_info and fs_devices is
+ * transferred to the super block.
+ */
+ ASSERT(fc->s_fs_info == NULL);
+
+ mutex_lock(&uuid_mutex);
+ btrfs_fs_devices_dec_holding(fs_devices);
+ ret = btrfs_open_devices(fs_devices, mode, sb);
+ if (ret < 0)
+ fs_info->fs_devices = NULL;
+ mutex_unlock(&uuid_mutex);
+ if (ret < 0) {
+ deactivate_locked_super(sb);
+ return ret;
+ }
+ if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
+ deactivate_locked_super(sb);
+ return -EACCES;
+ }
+ set_device_specific_options(fs_info);
+ bdev = fs_devices->latest_dev->bdev;
snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id);
- btrfs_sb(sb)->bdev_holder = &btrfs_fs_type;
ret = btrfs_fill_super(sb, fs_devices);
if (ret) {
deactivate_locked_super(sb);
@@ -1906,10 +1954,6 @@ static int btrfs_get_tree_super(struct fs_context *fc)
fc->root = dget(sb->s_root);
return 0;
-
-error:
- btrfs_close_devices(fs_devices);
- return ret;
}
/*
@@ -1985,17 +2029,13 @@ error:
* btrfs or not, setting the whole super block RO. To make per-subvolume mounting
 * work with different options, we need to keep backward compatibility.
*/
-static int btrfs_reconfigure_for_mount(struct fs_context *fc, struct vfsmount *mnt)
+static int btrfs_reconfigure_for_mount(struct fs_context *fc)
{
int ret = 0;
- if (fc->sb_flags & SB_RDONLY)
- return ret;
-
- down_write(&mnt->mnt_sb->s_umount);
- if (!(fc->sb_flags & SB_RDONLY) && (mnt->mnt_sb->s_flags & SB_RDONLY))
+ if (!(fc->sb_flags & SB_RDONLY) && (fc->root->d_sb->s_flags & SB_RDONLY))
ret = btrfs_reconfigure(fc);
- up_write(&mnt->mnt_sb->s_umount);
+
return ret;
}
@@ -2023,7 +2063,13 @@ static int btrfs_get_tree_subvol(struct fs_context *fc)
fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
if (!fs_info->super_copy || !fs_info->super_for_commit) {
- btrfs_free_fs_info(fs_info);
+ /*
+	 * Don't call btrfs_free_fs_info() to free it, as it's only
+	 * partially initialized.
+ */
+ kfree(fs_info->super_copy);
+ kfree(fs_info->super_for_commit);
+ kvfree(fs_info);
return -ENOMEM;
}
btrfs_init_fs_info(fs_info);
@@ -2040,25 +2086,18 @@ static int btrfs_get_tree_subvol(struct fs_context *fc)
*/
dup_fc->s_fs_info = fs_info;
- /*
- * We'll do the security settings in our btrfs_get_tree_super() mount
- * loop, they were duplicated into dup_fc, we can drop the originals
- * here.
- */
- security_free_mnt_opts(&fc->security);
- fc->security = NULL;
+ ret = btrfs_get_tree_super(dup_fc);
+ if (ret)
+ goto error;
- mnt = fc_mount(dup_fc);
- if (IS_ERR(mnt)) {
- put_fs_context(dup_fc);
- return PTR_ERR(mnt);
- }
- ret = btrfs_reconfigure_for_mount(dup_fc, mnt);
+ ret = btrfs_reconfigure_for_mount(dup_fc);
+ up_write(&dup_fc->root->d_sb->s_umount);
+ if (ret)
+ goto error;
+ mnt = vfs_create_mount(dup_fc);
put_fs_context(dup_fc);
- if (ret) {
- mntput(mnt);
- return ret;
- }
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
/*
* This free's ->subvol_name, because if it isn't set we have to
@@ -2072,25 +2111,15 @@ static int btrfs_get_tree_subvol(struct fs_context *fc)
fc->root = dentry;
return 0;
+error:
+ put_fs_context(dup_fc);
+ return ret;
}
static int btrfs_get_tree(struct fs_context *fc)
{
- /*
- * Since we use mount_subtree to mount the default/specified subvol, we
- * have to do mounts in two steps.
- *
- * First pass through we call btrfs_get_tree_subvol(), this is just a
- * wrapper around fc_mount() to call back into here again, and this time
- * we'll call btrfs_get_tree_super(). This will do the open_ctree() and
- * everything to open the devices and file system. Then we return back
- * with a fully constructed vfsmount in btrfs_get_tree_subvol(), and
- * from there we can do our mount_subvol() call, which will lookup
- * whichever subvol we're mounting and setup this fc with the
- * appropriate dentry for the subvol.
- */
- if (fc->s_fs_info)
- return btrfs_get_tree_super(fc);
+ ASSERT(fc->s_fs_info == NULL);
+
return btrfs_get_tree_subvol(fc);
}
@@ -2222,7 +2251,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
* Scanning outside of mount can return NULL which would turn
* into 0 error code.
*/
- device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
+ device = btrfs_scan_one_device(vol->name, false);
ret = PTR_ERR_OR_ZERO(device);
mutex_unlock(&uuid_mutex);
break;
@@ -2240,13 +2269,10 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
* Scanning outside of mount can return NULL which would turn
* into 0 error code.
*/
- device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
+ device = btrfs_scan_one_device(vol->name, false);
if (IS_ERR_OR_NULL(device)) {
mutex_unlock(&uuid_mutex);
- if (IS_ERR(device))
- ret = PTR_ERR(device);
- else
- ret = 0;
+ ret = PTR_ERR_OR_ZERO(device);
break;
}
ret = !(device->fs_devices->num_devices ==
@@ -2293,20 +2319,20 @@ static int check_dev_super(struct btrfs_device *dev)
return 0;
/* Only need to check the primary super block. */
- sb = btrfs_read_dev_one_super(dev->bdev, 0, true);
+ sb = btrfs_read_disk_super(dev->bdev, 0, true);
if (IS_ERR(sb))
return PTR_ERR(sb);
/* Verify the checksum. */
csum_type = btrfs_super_csum_type(sb);
- if (csum_type != btrfs_super_csum_type(fs_info->super_copy)) {
+ if (unlikely(csum_type != btrfs_super_csum_type(fs_info->super_copy))) {
btrfs_err(fs_info, "csum type changed, has %u expect %u",
csum_type, btrfs_super_csum_type(fs_info->super_copy));
ret = -EUCLEAN;
goto out;
}
- if (btrfs_check_super_csum(fs_info, sb)) {
+ if (unlikely(btrfs_check_super_csum(fs_info, sb))) {
btrfs_err(fs_info, "csum for on-disk super block no longer matches");
ret = -EUCLEAN;
goto out;
@@ -2318,7 +2344,7 @@ static int check_dev_super(struct btrfs_device *dev)
goto out;
last_trans = btrfs_get_last_trans_committed(fs_info);
- if (btrfs_super_generation(sb) != last_trans) {
+ if (unlikely(btrfs_super_generation(sb) != last_trans)) {
btrfs_err(fs_info, "transid mismatch, has %llu expect %llu",
btrfs_super_generation(sb), last_trans);
ret = -EUCLEAN;
@@ -2399,6 +2425,66 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont
return 0;
}
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+static int btrfs_remove_bdev(struct super_block *sb, struct block_device *bdev)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_device *device;
+ struct btrfs_dev_lookup_args lookup_args = { .devt = bdev->bd_dev };
+ bool can_rw;
+
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ device = btrfs_find_device(fs_info->fs_devices, &lookup_args);
+ if (!device) {
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ /* Device not found, should not affect the running fs, just give a warning. */
+ btrfs_warn(fs_info, "unable to find btrfs device for block device '%pg'", bdev);
+ return 0;
+ }
+ /*
+ * The to-be-removed device is already missing?
+ *
+	 * That's weird, but no special handling is needed, so we can exit right now.
+ */
+ if (unlikely(test_and_set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))) {
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ btrfs_warn(fs_info, "btrfs device id %llu is already missing", device->devid);
+ return 0;
+ }
+
+ device->fs_devices->missing_devices++;
+ if (test_and_clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
+ list_del_init(&device->dev_alloc_list);
+ WARN_ON(device->fs_devices->rw_devices < 1);
+ device->fs_devices->rw_devices--;
+ }
+ can_rw = btrfs_check_rw_degradable(fs_info, device);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ /*
+ * Now device is considered missing, btrfs_device_name() won't give a
+ * meaningful result anymore, so only output the devid.
+ */
+ if (unlikely(!can_rw)) {
+ btrfs_crit(fs_info,
+ "btrfs device id %llu has gone missing, can not maintain read-write",
+ device->devid);
+ return -EIO;
+ }
+ btrfs_warn(fs_info,
+ "btrfs device id %llu has gone missing, continue as degraded",
+ device->devid);
+ btrfs_set_opt(fs_info->mount_opt, DEGRADED);
+ return 0;
+}
+
+static void btrfs_shutdown(struct super_block *sb)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+
+ btrfs_force_shutdown(fs_info);
+}
+#endif
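
These two hooks are wired into btrfs_super_ops just below. How the block layer ends up invoking ->remove_bdev is outside this patch; purely to illustrate the intended contract (every name below is hypothetical, not the real plumbing):

	/* Hypothetical notification path, for illustration only. */
	static void example_bdev_gone(struct super_block *sb,
				      struct block_device *bdev)
	{
		if (!sb->s_op->remove_bdev)
			return;
		/*
		 * A non-zero return means the fs cannot stay read-write
		 * (see btrfs_remove_bdev() above), so shut it down.
		 */
		if (sb->s_op->remove_bdev(sb, bdev) && sb->s_op->shutdown)
			sb->s_op->shutdown(sb);
	}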
+
static const struct super_operations btrfs_super_ops = {
.drop_inode = btrfs_drop_inode,
.evict_inode = btrfs_evict_inode,
@@ -2414,6 +2500,10 @@ static const struct super_operations btrfs_super_ops = {
.unfreeze_fs = btrfs_unfreeze,
.nr_cached_objects = btrfs_nr_cached_objects,
.free_cached_objects = btrfs_free_cached_objects,
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ .remove_bdev = btrfs_remove_bdev,
+ .shutdown = btrfs_shutdown,
+#endif
};
static const struct file_operations btrfs_ctl_fops = {
@@ -2455,9 +2545,6 @@ static int __init btrfs_print_mod_info(void)
#ifdef CONFIG_BTRFS_ASSERT
", assert=on"
#endif
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- ", ref-verify=on"
-#endif
#ifdef CONFIG_BLK_DEV_ZONED
", zoned=yes"
#else
@@ -2526,8 +2613,8 @@ static const struct init_sequence mod_init_seq[] = {
.init_func = btrfs_free_space_init,
.exit_func = btrfs_free_space_exit,
}, {
- .init_func = extent_state_init_cachep,
- .exit_func = extent_state_free_cachep,
+ .init_func = btrfs_extent_state_init_cachep,
+ .exit_func = btrfs_extent_state_free_cachep,
}, {
.init_func = extent_buffer_init_cachep,
.exit_func = extent_buffer_free_cachep,
@@ -2535,8 +2622,8 @@ static const struct init_sequence mod_init_seq[] = {
.init_func = btrfs_bioset_init,
.exit_func = btrfs_bioset_exit,
}, {
- .init_func = extent_map_init,
- .exit_func = extent_map_exit,
+ .init_func = btrfs_extent_map_init,
+ .exit_func = btrfs_extent_map_exit,
#ifdef CONFIG_BTRFS_EXPERIMENTAL
}, {
.init_func = btrfs_read_policy_init,
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index b9af74498b0c..1f64c132b387 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -10,6 +10,7 @@
#include <linux/completion.h>
#include <linux/bug.h>
#include <linux/list.h>
+#include <linux/string_choices.h>
#include <crypto/hash.h>
#include "messages.h"
#include "ctree.h"
@@ -25,6 +26,7 @@
#include "misc.h"
#include "fs.h"
#include "accessors.h"
+#include "zoned.h"
/*
* Structure name Path
@@ -160,8 +162,7 @@ static int can_modify_feature(struct btrfs_feature_attr *fa)
clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR;
break;
default:
- pr_warn("btrfs: sysfs: unknown feature set %d\n",
- fa->feature_set);
+ btrfs_warn(NULL, "sysfs: unknown feature set %d", fa->feature_set);
return 0;
}
@@ -410,13 +411,17 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
char *buf)
{
ssize_t ret = 0;
+ bool has_output = false;
- if (BTRFS_MIN_BLOCKSIZE != SZ_4K && BTRFS_MIN_BLOCKSIZE != PAGE_SIZE)
- ret += sysfs_emit_at(buf, ret, "%u ", BTRFS_MIN_BLOCKSIZE);
- if (PAGE_SIZE > SZ_4K)
- ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K);
- ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE);
-
+ for (u32 cur = BTRFS_MIN_BLOCKSIZE; cur <= BTRFS_MAX_BLOCKSIZE; cur *= 2) {
+ if (!btrfs_supported_blocksize(cur))
+ continue;
+ if (has_output)
+ ret += sysfs_emit_at(buf, ret, " ");
+ ret += sysfs_emit_at(buf, ret, "%u", cur);
+ has_output = true;
+ }
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
BTRFS_ATTR(static_feature, supported_sectorsizes,
@@ -1138,13 +1143,21 @@ static ssize_t btrfs_commit_stats_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ u64 now = ktime_get_ns();
+ u64 start_time = fs_info->commit_stats.critical_section_start_time;
+ u64 pending = 0;
+
+ if (start_time)
+ pending = now - start_time;
return sysfs_emit(buf,
"commits %llu\n"
+ "cur_commit_ms %llu\n"
"last_commit_ms %llu\n"
"max_commit_ms %llu\n"
"total_commit_ms %llu\n",
fs_info->commit_stats.commit_count,
+ div_u64(pending, NSEC_PER_MSEC),
div_u64(fs_info->commit_stats.last_commit_dur, NSEC_PER_MSEC),
div_u64(fs_info->commit_stats.max_commit_dur, NSEC_PER_MSEC),
div_u64(fs_info->commit_stats.total_commit_dur, NSEC_PER_MSEC));
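
The new cur_commit_ms row reports how long the currently running commit has spent in its critical section: critical_section_start_time is non-zero only while one is in flight. A simplified sketch of the producer side (assumed to live in fs/btrfs/transaction.c; details are approximate):

	/* Entering the critical section (sketch): */
	fs_info->commit_stats.critical_section_start_time = ktime_get_ns();

	/* ... transaction commit work ... */

	/* Leaving the critical section (sketch): */
	u64 interval = ktime_get_ns() -
		       fs_info->commit_stats.critical_section_start_time;

	fs_info->commit_stats.critical_section_start_time = 0;
	fs_info->commit_stats.commit_count++;
	fs_info->commit_stats.last_commit_dur = interval;
	fs_info->commit_stats.max_commit_dur =
		max(fs_info->commit_stats.max_commit_dur, interval);
	fs_info->commit_stats.total_commit_dur += interval;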
@@ -1176,6 +1189,56 @@ static ssize_t btrfs_commit_stats_store(struct kobject *kobj,
}
BTRFS_ATTR_RW(, commit_stats, btrfs_commit_stats_show, btrfs_commit_stats_store);
+static ssize_t btrfs_zoned_stats_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ struct btrfs_block_group *bg;
+ size_t ret = 0;
+
+ if (!btrfs_is_zoned(fs_info))
+ return ret;
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ret += sysfs_emit_at(buf, ret, "active block-groups: %zu\n",
+ list_count_nodes(&fs_info->zone_active_bgs));
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ mutex_lock(&fs_info->reclaim_bgs_lock);
+ spin_lock(&fs_info->unused_bgs_lock);
+ ret += sysfs_emit_at(buf, ret, "\treclaimable: %zu\n",
+ list_count_nodes(&fs_info->reclaim_bgs));
+ ret += sysfs_emit_at(buf, ret, "\tunused: %zu\n",
+ list_count_nodes(&fs_info->unused_bgs));
+ spin_unlock(&fs_info->unused_bgs_lock);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+
+ ret += sysfs_emit_at(buf, ret, "\tneed reclaim: %s\n",
+ str_true_false(btrfs_zoned_should_reclaim(fs_info)));
+
+ if (fs_info->data_reloc_bg)
+ ret += sysfs_emit_at(buf, ret,
+ "data relocation block-group: %llu\n",
+ fs_info->data_reloc_bg);
+ if (fs_info->treelog_bg)
+ ret += sysfs_emit_at(buf, ret,
+ "tree-log block-group: %llu\n",
+ fs_info->treelog_bg);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ret += sysfs_emit_at(buf, ret, "active zones:\n");
+ list_for_each_entry(bg, &fs_info->zone_active_bgs, active_bg_list) {
+ ret += sysfs_emit_at(buf, ret,
+ "\tstart: %llu, wp: %llu used: %llu, reserved: %llu, unusable: %llu\n",
+ bg->start, bg->alloc_offset, bg->used,
+ bg->reserved, bg->zone_unusable);
+ }
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ return ret;
+}
+BTRFS_ATTR(, zoned_stats, btrfs_zoned_stats_show);
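
Since the attribute is added to btrfs_attrs further down, the stats land at /sys/fs/btrfs/<UUID>/zoned_stats alongside commit_stats; on a non-zoned filesystem the handler returns 0 bytes, so the file simply reads as empty.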
+
static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
@@ -1202,7 +1265,7 @@ static ssize_t quota_override_store(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
unsigned long knob;
- int err;
+ int ret;
if (!fs_info)
return -EPERM;
@@ -1210,9 +1273,9 @@ static ssize_t quota_override_store(struct kobject *kobj,
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- err = kstrtoul(buf, 10, &knob);
- if (err)
- return err;
+ ret = kstrtoul(buf, 10, &knob);
+ if (ret)
+ return ret;
if (knob > 1)
return -EINVAL;
@@ -1588,6 +1651,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, bg_reclaim_threshold),
BTRFS_ATTR_PTR(, commit_stats),
BTRFS_ATTR_PTR(, temp_fsid),
+ BTRFS_ATTR_PTR(, zoned_stats),
#ifdef CONFIG_BTRFS_EXPERIMENTAL
BTRFS_ATTR_PTR(, offload_csum),
#endif
@@ -1930,16 +1994,35 @@ void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info)
kobject_put(&space_info->kobj);
}
-static const char *alloc_name(u64 flags)
+static const char *alloc_name(struct btrfs_space_info *space_info)
{
+ u64 flags = space_info->flags;
+
switch (flags) {
case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
return "mixed";
case BTRFS_BLOCK_GROUP_METADATA:
- return "metadata";
+ switch (space_info->subgroup_id) {
+ case BTRFS_SUB_GROUP_PRIMARY:
+ return "metadata";
+ case BTRFS_SUB_GROUP_TREELOG:
+ return "metadata-treelog";
+ default:
+ WARN_ON_ONCE(1);
+ return "metadata (unknown sub-group)";
+ }
case BTRFS_BLOCK_GROUP_DATA:
- return "data";
+ switch (space_info->subgroup_id) {
+ case BTRFS_SUB_GROUP_PRIMARY:
+ return "data";
+ case BTRFS_SUB_GROUP_DATA_RELOC:
+ return "data-reloc";
+ default:
+ WARN_ON_ONCE(1);
+ return "data (unknown sub-group)";
+ }
case BTRFS_BLOCK_GROUP_SYSTEM:
+ ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY);
return "system";
default:
WARN_ON(1);
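
The subgroup ids used above come from the space info code; conceptually the set looks like this (a sketch only, with members inferred from the usage above, the real enum presumably lives in fs/btrfs/space-info.h and may carry more entries):

	enum btrfs_space_info_sub_group {
		BTRFS_SUB_GROUP_PRIMARY,
		BTRFS_SUB_GROUP_TREELOG,
		BTRFS_SUB_GROUP_DATA_RELOC,
	};

With this, the dedicated treelog and data-relocation space infos get their own directories, metadata-treelog and data-reloc, under /sys/fs/btrfs/UUID/allocation/.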
@@ -1951,14 +2034,13 @@ static const char *alloc_name(u64 flags)
* Create a sysfs entry for a space info type at path
* /sys/fs/btrfs/UUID/allocation/TYPE
*/
-int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info)
+int btrfs_sysfs_add_space_info_type(struct btrfs_space_info *space_info)
{
int ret;
ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
- fs_info->space_info_kobj, "%s",
- alloc_name(space_info->flags));
+ space_info->fs_info->space_info_kobj, "%s",
+ alloc_name(space_info));
if (ret) {
kobject_put(&space_info->kobj);
return ret;
@@ -2220,7 +2302,7 @@ void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action)
ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
if (ret)
- pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
+ btrfs_warn(NULL, "sending event %d to kobject: '%s' (%p): failed",
action, kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
&disk_to_dev(bdev->bd_disk)->kobj);
}
@@ -2263,15 +2345,15 @@ static struct kset *btrfs_kset;
*/
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs)
{
- int error;
+ int ret;
init_completion(&fs_devs->kobj_unregister);
fs_devs->fsid_kobj.kset = btrfs_kset;
- error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL,
- "%pU", fs_devs->fsid);
- if (error) {
+ ret = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL,
+ "%pU", fs_devs->fsid);
+ if (ret) {
kobject_put(&fs_devs->fsid_kobj);
- return error;
+ return ret;
}
fs_devs->devices_kobj = kobject_create_and_add("devices",
@@ -2297,71 +2379,70 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs)
int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
{
- int error;
+ int ret;
struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
struct kobject *fsid_kobj = &fs_devs->fsid_kobj;
- error = btrfs_sysfs_add_fs_devices(fs_devs);
- if (error)
- return error;
+ ret = btrfs_sysfs_add_fs_devices(fs_devs);
+ if (ret)
+ return ret;
- error = sysfs_create_files(fsid_kobj, btrfs_attrs);
- if (error) {
+ ret = sysfs_create_files(fsid_kobj, btrfs_attrs);
+ if (ret) {
btrfs_sysfs_remove_fs_devices(fs_devs);
- return error;
+ return ret;
}
- error = sysfs_create_group(fsid_kobj,
- &btrfs_feature_attr_group);
- if (error)
+ ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group);
+ if (ret)
goto failure;
#ifdef CONFIG_BTRFS_DEBUG
fs_info->debug_kobj = kobject_create_and_add("debug", fsid_kobj);
if (!fs_info->debug_kobj) {
- error = -ENOMEM;
+ ret = -ENOMEM;
goto failure;
}
- error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
- if (error)
+ ret = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
+ if (ret)
goto failure;
#endif
/* Discard directory */
fs_info->discard_kobj = kobject_create_and_add("discard", fsid_kobj);
if (!fs_info->discard_kobj) {
- error = -ENOMEM;
+ ret = -ENOMEM;
goto failure;
}
- error = sysfs_create_files(fs_info->discard_kobj, discard_attrs);
- if (error)
+ ret = sysfs_create_files(fs_info->discard_kobj, discard_attrs);
+ if (ret)
goto failure;
- error = addrm_unknown_feature_attrs(fs_info, true);
- if (error)
+ ret = addrm_unknown_feature_attrs(fs_info, true);
+ if (ret)
goto failure;
- error = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi");
- if (error)
+ ret = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi");
+ if (ret)
goto failure;
fs_info->space_info_kobj = kobject_create_and_add("allocation",
fsid_kobj);
if (!fs_info->space_info_kobj) {
- error = -ENOMEM;
+ ret = -ENOMEM;
goto failure;
}
- error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs);
- if (error)
+ ret = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs);
+ if (ret)
goto failure;
return 0;
failure:
btrfs_sysfs_remove_mounted(fs_info);
- return error;
+ return ret;
}
static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj,
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 0f94ae923210..05498e5346c3 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -37,8 +37,7 @@ void __cold btrfs_exit_sysfs(void);
int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache);
-int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info);
+int btrfs_sysfs_add_space_info_type(struct btrfs_space_info *space_info);
void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info);
void btrfs_sysfs_update_devid(struct btrfs_device *device);
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 5eff8d7d2360..b576897d71cc 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -102,7 +102,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info)
if (!dev)
return ERR_PTR(-ENOMEM);
- extent_io_tree_init(fs_info, &dev->alloc_state, 0);
+ btrfs_extent_io_tree_init(fs_info, &dev->alloc_state, 0);
INIT_LIST_HEAD(&dev->dev_list);
list_add(&dev->dev_list, &fs_info->fs_devices->devices);
@@ -111,7 +111,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info)
static void btrfs_free_dummy_device(struct btrfs_device *dev)
{
- extent_io_tree_release(&dev->alloc_state);
+ btrfs_extent_io_tree_release(&dev->alloc_state);
kfree(dev);
}
@@ -157,9 +157,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
{
- struct radix_tree_iter iter;
- void **slot;
struct btrfs_device *dev, *tmp;
+ struct extent_buffer *eb;
+ unsigned long index;
if (!fs_info)
return;
@@ -169,25 +169,13 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
test_mnt->mnt_sb->s_fs_info = NULL;
- spin_lock(&fs_info->buffer_lock);
- radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
- struct extent_buffer *eb;
-
- eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
- if (!eb)
- continue;
- /* Shouldn't happen but that kind of thinking creates CVE's */
- if (radix_tree_exception(eb)) {
- if (radix_tree_deref_retry(eb))
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- slot = radix_tree_iter_resume(slot, &iter);
- spin_unlock(&fs_info->buffer_lock);
- free_extent_buffer_stale(eb);
- spin_lock(&fs_info->buffer_lock);
+ xa_lock_irq(&fs_info->buffer_tree);
+ xa_for_each(&fs_info->buffer_tree, index, eb) {
+ xa_unlock_irq(&fs_info->buffer_tree);
+ free_extent_buffer(eb);
+ xa_lock_irq(&fs_info->buffer_tree);
}
- spin_unlock(&fs_info->buffer_lock);
+ xa_unlock_irq(&fs_info->buffer_tree);
btrfs_mapping_tree_free(fs_info);
list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
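
The radix-tree walk removed above needed the deref/retry dance to cope with concurrent modification; the xa_for_each() conversion hides all of that and tolerates dropping the lock mid-loop, because each iteration restarts the lookup from the saved index. The generic shape of the pattern:

	#include <linux/xarray.h>

	unsigned long index;
	void *entry;

	xa_lock_irq(&xa);
	xa_for_each(&xa, index, entry) {
		xa_unlock_irq(&xa);
		/* Sleeping work is fine here; the walk resumes at 'index'. */
		consume(entry);
		xa_lock_irq(&xa);
	}
	xa_unlock_irq(&xa);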
diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c
index 265370e79a54..e2248acb906b 100644
--- a/fs/btrfs/tests/delayed-refs-tests.c
+++ b/fs/btrfs/tests/delayed-refs-tests.c
@@ -997,12 +997,12 @@ int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize)
ret = simple_tests(&trans);
if (!ret) {
- test_msg("running delayed refs merg tests on metadata refs");
+ test_msg("running delayed refs merge tests on metadata refs");
ret = merge_tests(&trans, BTRFS_REF_METADATA);
}
if (!ret) {
- test_msg("running delayed refs merg tests on data refs");
+ test_msg("running delayed refs merge tests on data refs");
ret = merge_tests(&trans, BTRFS_REF_DATA);
}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 74aca7180a5a..a0187d6163df 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -14,17 +14,17 @@
#include "../disk-io.h"
#include "../btrfs_inode.h"
-#define PROCESS_UNLOCK (1 << 0)
-#define PROCESS_RELEASE (1 << 1)
-#define PROCESS_TEST_LOCKED (1 << 2)
+#define PROCESS_UNLOCK (1U << 0)
+#define PROCESS_RELEASE (1U << 1)
+#define PROCESS_TEST_LOCKED (1U << 2)
static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
unsigned long flags)
{
int ret;
struct folio_batch fbatch;
- unsigned long index = start >> PAGE_SHIFT;
- unsigned long end_index = end >> PAGE_SHIFT;
+ pgoff_t index = start >> PAGE_SHIFT;
+ pgoff_t end_index = end >> PAGE_SHIFT;
int i;
int count = 0;
int loops = 0;
@@ -74,9 +74,9 @@ static void extent_flag_to_str(const struct extent_state *state, char *dest)
dest[0] = 0;
PRINT_ONE_FLAG(state, dest, cur, DIRTY);
- PRINT_ONE_FLAG(state, dest, cur, UPTODATE);
PRINT_ONE_FLAG(state, dest, cur, LOCKED);
- PRINT_ONE_FLAG(state, dest, cur, NEW);
+ PRINT_ONE_FLAG(state, dest, cur, DIRTY_LOG1);
+ PRINT_ONE_FLAG(state, dest, cur, DIRTY_LOG2);
PRINT_ONE_FLAG(state, dest, cur, DELALLOC);
PRINT_ONE_FLAG(state, dest, cur, DEFRAG);
PRINT_ONE_FLAG(state, dest, cur, BOUNDARY);
@@ -114,7 +114,6 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
struct extent_io_tree *tmp;
struct page *page;
struct page *locked_page = NULL;
- unsigned long index = 0;
/* In this test we need at least 2 file extents at its maximum size */
u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
u64 total_dirty = 2 * max_bytes;
@@ -150,14 +149,14 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
* Passing NULL as we don't have fs_info but tracepoints are not used
* at this point
*/
- extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST);
+ btrfs_extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST);
/*
* First go through and create and mark all of our pages dirty, we pin
* everything to make sure our pages don't get evicted and screw up our
* test.
*/
- for (index = 0; index < (total_dirty >> PAGE_SHIFT); index++) {
+ for (pgoff_t index = 0; index < (total_dirty >> PAGE_SHIFT); index++) {
page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
if (!page) {
test_err("failed to allocate test page");
@@ -177,7 +176,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
* |--- delalloc ---|
* |--- search ---|
*/
- set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL);
+ btrfs_set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL);
start = 0;
end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, page_folio(locked_page), &start,
@@ -191,7 +190,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
sectorsize - 1, start, end);
goto out_bits;
}
- unlock_extent(tmp, start, end, NULL);
+ btrfs_unlock_extent(tmp, start, end, NULL);
unlock_page(locked_page);
put_page(locked_page);
@@ -208,7 +207,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
test_err("couldn't find the locked page");
goto out_bits;
}
- set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL);
+ btrfs_set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL);
start = test_start;
end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, page_folio(locked_page), &start,
@@ -227,7 +226,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
test_err("there were unlocked pages in the range");
goto out_bits;
}
- unlock_extent(tmp, start, end, NULL);
+ btrfs_unlock_extent(tmp, start, end, NULL);
/* locked_page was unlocked above */
put_page(locked_page);
@@ -263,7 +262,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
*
* We are re-using our test_start from above since it works out well.
*/
- set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL);
+ btrfs_set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL);
start = test_start;
end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, page_folio(locked_page), &start,
@@ -282,7 +281,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
test_err("pages in range were not all locked");
goto out_bits;
}
- unlock_extent(tmp, start, end, NULL);
+ btrfs_unlock_extent(tmp, start, end, NULL);
/*
* Now to test where we run into a page that is no longer dirty in the
@@ -327,7 +326,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
out_bits:
if (ret)
dump_extent_io_tree(tmp);
- clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1);
+ btrfs_clear_extent_bit(tmp, 0, total_dirty - 1, (unsigned)-1, NULL);
out:
if (locked_page)
put_page(locked_page);
@@ -344,11 +343,11 @@ static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb)
unsigned long i;
for (i = 0; i < eb->len * BITS_PER_BYTE; i++) {
- int bit, bit1;
+ bool bit_set, bit1_set;
- bit = !!test_bit(i, bitmap);
- bit1 = !!extent_buffer_test_bit(eb, 0, i);
- if (bit1 != bit) {
+ bit_set = test_bit(i, bitmap);
+ bit1_set = extent_buffer_test_bit(eb, 0, i);
+ if (bit1_set != bit_set) {
u8 has;
u8 expect;
@@ -361,9 +360,9 @@ static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb)
return -EINVAL;
}
- bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE,
- i % BITS_PER_BYTE);
- if (bit1 != bit) {
+ bit1_set = extent_buffer_test_bit(eb, i / BITS_PER_BYTE,
+ i % BITS_PER_BYTE);
+ if (bit1_set != bit_set) {
u8 has;
u8 expect;
@@ -506,7 +505,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb)
static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
{
struct btrfs_fs_info *fs_info;
- unsigned long *bitmap = NULL;
+ unsigned long AUTO_KFREE(bitmap);
struct extent_buffer *eb = NULL;
int ret;
@@ -552,7 +551,6 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
ret = __test_eb_bitmaps(bitmap, eb);
out:
free_extent_buffer(eb);
- kfree(bitmap);
btrfs_free_dummy_fs_info(fs_info);
return ret;
}
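
AUTO_KFREE(bitmap) above replaces the manual kfree() at the out label with scope-based cleanup. Presumably it builds on the __free() attribute from <linux/cleanup.h>, along the lines of (the exact definition is an assumption, it is not shown in this diff):

	/* Assumed definition, modelled on the kfree cleanup helper. */
	#define AUTO_KFREE(name)	*name __free(kfree) = NULL

so that 'unsigned long AUTO_KFREE(bitmap);' declares an 'unsigned long *bitmap' that is kfree()d automatically when it goes out of scope.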
@@ -565,10 +563,10 @@ static int test_find_first_clear_extent_bit(void)
test_msg("running find_first_clear_extent_bit test");
- extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST);
+ btrfs_extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST);
/* Test correct handling of empty tree */
- find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED);
+ btrfs_find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED);
if (start != 0 || end != -1) {
test_err(
"error getting a range from completely empty tree: start %llu end %llu",
@@ -579,11 +577,11 @@ static int test_find_first_clear_extent_bit(void)
* Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between
* 4M-32M
*/
- set_extent_bit(&tree, SZ_1M, SZ_4M - 1,
- CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL);
+ btrfs_set_extent_bit(&tree, SZ_1M, SZ_4M - 1,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL);
- find_first_clear_extent_bit(&tree, SZ_512K, &start, &end,
- CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ btrfs_find_first_clear_extent_bit(&tree, SZ_512K, &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
if (start != 0 || end != SZ_1M - 1) {
test_err("error finding beginning range: start %llu end %llu",
@@ -592,14 +590,14 @@ static int test_find_first_clear_extent_bit(void)
}
/* Now add 32M-64M so that we have a hole between 4M-32M */
- set_extent_bit(&tree, SZ_32M, SZ_64M - 1,
- CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL);
+ btrfs_set_extent_bit(&tree, SZ_32M, SZ_64M - 1,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL);
/*
* Request first hole starting at 12M, we should get 4M-32M
*/
- find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end,
- CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ btrfs_find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
if (start != SZ_4M || end != SZ_32M - 1) {
test_err("error finding trimmed range: start %llu end %llu",
@@ -611,8 +609,8 @@ static int test_find_first_clear_extent_bit(void)
* Search in the middle of allocated range, should get the next one
* available, which happens to be unallocated -> 4M-32M
*/
- find_first_clear_extent_bit(&tree, SZ_2M, &start, &end,
- CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ btrfs_find_first_clear_extent_bit(&tree, SZ_2M, &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
if (start != SZ_4M || end != SZ_32M - 1) {
test_err("error finding next unalloc range: start %llu end %llu",
@@ -624,9 +622,9 @@ static int test_find_first_clear_extent_bit(void)
* Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag
* being unset in this range, we should get the entry in range 64M-72M
*/
- set_extent_bit(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED, NULL);
- find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end,
- CHUNK_TRIMMED);
+ btrfs_set_extent_bit(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED, NULL);
+ btrfs_find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end,
+ CHUNK_TRIMMED);
if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) {
test_err("error finding exact range: start %llu end %llu",
@@ -634,8 +632,8 @@ static int test_find_first_clear_extent_bit(void)
goto out;
}
- find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end,
- CHUNK_TRIMMED);
+ btrfs_find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end,
+ CHUNK_TRIMMED);
/*
* Search in the middle of set range whose immediate neighbour doesn't
@@ -651,7 +649,7 @@ static int test_find_first_clear_extent_bit(void)
* Search beyond any known range, shall return after last known range
* and end should be -1
*/
- find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED);
+ btrfs_find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED);
if (start != SZ_64M + SZ_8M || end != -1) {
test_err(
"error handling beyond end of range search: start %llu end %llu",
@@ -663,7 +661,7 @@ static int test_find_first_clear_extent_bit(void)
out:
if (ret)
dump_extent_io_tree(&tree);
- clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ btrfs_clear_extent_bit(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL);
return ret;
}
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 609bb6c9c087..0b9f25dd1a68 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -22,7 +22,7 @@ static int free_extent_map_tree(struct btrfs_inode *inode)
while (!RB_EMPTY_ROOT(&em_tree->root)) {
node = rb_first(&em_tree->root);
em = rb_entry(node, struct extent_map, rb_node);
- remove_extent_mapping(inode, em);
+ btrfs_remove_extent_mapping(inode, em);
#ifdef CONFIG_BTRFS_DEBUG
if (refcount_read(&em->refs) != 1) {
@@ -36,7 +36,7 @@ static int free_extent_map_tree(struct btrfs_inode *inode)
refcount_set(&em->refs, 1);
}
#endif
- free_extent_map(em);
+ btrfs_free_extent_map(em);
}
write_unlock(&em_tree->lock);
@@ -68,7 +68,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
int ret;
int ret2;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
@@ -87,10 +87,10 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
test_err("cannot add extent range [0, 16K)");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Add [16K, 20K) following [0, 16K) */
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -109,9 +109,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
test_err("cannot add extent range [16K, 20K)");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -137,7 +137,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
ret = -ENOENT;
goto out;
}
- if (em->start != 0 || extent_map_end(em) != SZ_16K ||
+ if (em->start != 0 || btrfs_extent_map_end(em) != SZ_16K ||
em->disk_bytenr != 0 || em->disk_num_bytes != SZ_16K) {
test_err(
"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu",
@@ -145,7 +145,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
em->disk_bytenr, em->disk_num_bytes);
ret = -EINVAL;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
out:
ret2 = free_extent_map_tree(inode);
if (ret == 0)
@@ -167,7 +167,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
int ret;
int ret2;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
@@ -186,10 +186,10 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
test_err("cannot add extent range [0, 1K)");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Add [4K, 8K) following [0, 1K) */
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -208,9 +208,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
test_err("cannot add extent range [4K, 8K)");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -235,14 +235,14 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
ret = -ENOENT;
goto out;
}
- if (em->start != 0 || extent_map_end(em) != SZ_1K ||
+ if (em->start != 0 || btrfs_extent_map_end(em) != SZ_1K ||
em->disk_bytenr != EXTENT_MAP_INLINE) {
test_err(
"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu",
ret, em->start, em->len, em->disk_bytenr);
ret = -EINVAL;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
out:
ret2 = free_extent_map_tree(inode);
if (ret == 0)
@@ -260,7 +260,7 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
int ret;
int ret2;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
@@ -279,9 +279,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
test_err("cannot add extent range [4K, 8K)");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -312,15 +312,15 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
* Since bytes within em are contiguous, em->block_start is identical to
* em->start.
*/
- if (start < em->start || start + len > extent_map_end(em) ||
- em->start != extent_map_block_start(em)) {
+ if (start < em->start || start + len > btrfs_extent_map_end(em) ||
+ em->start != btrfs_extent_map_block_start(em)) {
test_err(
"case3 [%llu %llu): ret %d em (start %llu len %llu disk_bytenr %llu block_len %llu)",
start, start + len, ret, em->start, em->len,
em->disk_bytenr, em->disk_num_bytes);
ret = -EINVAL;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
out:
ret2 = free_extent_map_tree(inode);
if (ret == 0)
@@ -369,7 +369,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
int ret;
int ret2;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
@@ -388,9 +388,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
test_err("cannot add extent range [0, 8K)");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -410,9 +410,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
test_err("cannot add extent range [8K, 32K)");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -438,14 +438,14 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
ret = -ENOENT;
goto out;
}
- if (start < em->start || start + len > extent_map_end(em)) {
+ if (start < em->start || start + len > btrfs_extent_map_end(em)) {
test_err(
"case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu)",
start, start + len, ret, em->start, em->len,
em->disk_bytenr, em->disk_num_bytes);
ret = -EINVAL;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
out:
ret2 = free_extent_map_tree(inode);
if (ret == 0)
@@ -498,7 +498,7 @@ static int add_compressed_extent(struct btrfs_inode *inode,
struct extent_map *em;
int ret;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
@@ -513,7 +513,7 @@ static int add_compressed_extent(struct btrfs_inode *inode,
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
if (ret < 0) {
test_err("cannot add extent map [%llu, %llu)", start, start + len);
return ret;
@@ -719,7 +719,7 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
if (ret)
goto out;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -751,7 +751,7 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
}
ret = 0;
out:
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ret2 = free_extent_map_tree(inode);
if (ret == 0)
ret = ret2;
@@ -773,7 +773,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
test_msg("Running btrfs_drop_extent_cache with pinned");
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
@@ -793,9 +793,9 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
test_err("couldn't add extent map");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -815,7 +815,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
test_err("couldn't add extent map");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/*
 * Drop [0, 36K). This should skip the [0, 4K) extent and then split the
@@ -826,7 +826,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
/* Make sure our extent maps look sane. */
ret = -EINVAL;
- em = lookup_extent_mapping(em_tree, 0, SZ_16K);
+ em = btrfs_lookup_extent_mapping(em_tree, 0, SZ_16K);
if (!em) {
test_err("didn't find an em at 0 as expected");
goto out;
@@ -842,10 +842,10 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, SZ_16K, SZ_16K);
+ em = btrfs_lookup_extent_mapping(em_tree, SZ_16K, SZ_16K);
read_unlock(&em_tree->lock);
if (em) {
test_err("found an em when we weren't expecting one");
@@ -853,7 +853,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
}
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, SZ_32K, SZ_16K);
+ em = btrfs_lookup_extent_mapping(em_tree, SZ_32K, SZ_16K);
read_unlock(&em_tree->lock);
if (!em) {
test_err("didn't find an em at 32K as expected");
@@ -870,16 +870,16 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
goto out;
}
- if (extent_map_block_start(em) != SZ_32K + SZ_4K) {
+ if (btrfs_extent_map_block_start(em) != SZ_32K + SZ_4K) {
test_err("em->block_start is %llu, expected 36K",
- extent_map_block_start(em));
+ btrfs_extent_map_block_start(em));
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, 48 * SZ_1K, (u64)-1);
+ em = btrfs_lookup_extent_mapping(em_tree, 48 * SZ_1K, (u64)-1);
read_unlock(&em_tree->lock);
if (em) {
test_err("found an unexpected em above 48K");
@@ -888,9 +888,9 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
ret = 0;
out:
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Unpin our extent to prevent warning when removing it below. */
- ret2 = unpin_extent_cache(inode, 0, SZ_16K, 0);
+ ret2 = btrfs_unpin_extent_cache(inode, 0, SZ_16K, 0);
if (ret == 0)
ret = ret2;
ret2 = free_extent_map_tree(inode);
@@ -913,7 +913,7 @@ static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
int ret;
int ret2;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
@@ -928,13 +928,13 @@ static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
if (ret < 0) {
test_err("couldn't add extent map for range [120K, 128K)");
goto out;
}
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -967,7 +967,7 @@ static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, SZ_1K * 140, SZ_4K);
write_unlock(&em_tree->lock);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
if (ret < 0) {
test_err("couldn't add extent map for range [108K, 144K)");
goto out;
@@ -1013,7 +1013,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
struct rmap_test_vector *test)
{
struct btrfs_chunk_map *map;
- u64 *logical = NULL;
+ u64 AUTO_KFREE(logical);
int i, out_ndaddrs, out_stripe_len;
int ret;
@@ -1046,7 +1046,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
if (ret) {
test_err("error adding chunk map to mapping tree");
btrfs_free_chunk_map(map);
- goto out_free;
+ return ret;
}
ret = btrfs_rmap_block(fs_info, map->start, btrfs_sb_offset(1),
@@ -1079,8 +1079,6 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
ret = 0;
out:
btrfs_remove_chunk_map(fs_info, map);
-out_free:
- kfree(logical);
return ret;
}
@@ -1095,7 +1093,7 @@ int btrfs_test_extent_map(void)
/*
 * Test that a chunk with 2 data stripes, one of which
 * intersects the physical address of the super block,
- * is correctly recognised.
+ * is correctly recognized.
*/
.raid_type = BTRFS_BLOCK_GROUP_RAID1,
.physical_start = SZ_64M - SZ_4M,
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index b61972046feb..c8822edd32e2 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -32,7 +32,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
unsigned int i;
int ret;
- info = search_free_space_info(trans, cache, path, 0);
+ info = btrfs_search_free_space_info(trans, cache, path, 0);
if (IS_ERR(info)) {
test_err("could not find free space info");
ret = PTR_ERR(info);
@@ -57,7 +57,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
goto invalid;
offset = key.objectid;
while (offset < key.objectid + key.offset) {
- bit = free_space_test_bit(cache, path, offset);
+ bit = btrfs_free_space_test_bit(cache, path, offset);
if (prev_bit == 0 && bit == 1) {
extent_start = offset;
} else if (prev_bit == 1 && bit == 0) {
@@ -115,7 +115,7 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans,
u32 flags;
int ret;
- info = search_free_space_info(trans, cache, path, 0);
+ info = btrfs_search_free_space_info(trans, cache, path, 0);
if (IS_ERR(info)) {
test_err("could not find free space info");
btrfs_release_path(path);
@@ -131,13 +131,13 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans,
/* Flip it to the other format and check that for good measure. */
if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
- ret = convert_free_space_to_extents(trans, cache, path);
+ ret = btrfs_convert_free_space_to_extents(trans, cache, path);
if (ret) {
test_err("could not convert to extents");
return ret;
}
} else {
- ret = convert_free_space_to_bitmaps(trans, cache, path);
+ ret = btrfs_convert_free_space_to_bitmaps(trans, cache, path);
if (ret) {
test_err("could not convert to bitmaps");
return ret;
@@ -170,9 +170,8 @@ static int test_remove_all(struct btrfs_trans_handle *trans,
const struct free_space_extent extents[] = {};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start,
- cache->length);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
@@ -193,8 +192,8 @@ static int test_remove_beginning(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start, alignment);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, alignment);
if (ret) {
test_err("could not remove free space");
return ret;
@@ -216,7 +215,7 @@ static int test_remove_end(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
cache->start + cache->length - alignment,
alignment);
if (ret) {
@@ -240,9 +239,9 @@ static int test_remove_middle(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start + alignment,
- alignment);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start + alignment,
+ alignment);
if (ret) {
test_err("could not remove free space");
return ret;
@@ -263,23 +262,22 @@ static int test_merge_left(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start, cache->length);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path, cache->start,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path, cache->start,
+ alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + alignment,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -300,24 +298,23 @@ static int test_merge_right(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start, cache->length);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + 2 * alignment,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + 2 * alignment,
+ alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + alignment,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -338,29 +335,29 @@ static int test_merge_both(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start, cache->length);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path, cache->start,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path, cache->start,
+ alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + 2 * alignment, alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + 2 * alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + alignment, alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -383,29 +380,29 @@ static int test_merge_none(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start, cache->length);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path, cache->start,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path, cache->start,
+ alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + 4 * alignment, alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + 4 * alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + 2 * alignment, alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + 2 * alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -483,14 +480,14 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
goto out;
}
- ret = add_block_group_free_space(&trans, cache);
+ ret = btrfs_add_block_group_free_space(&trans, cache);
if (ret) {
test_err("could not add block group free space");
goto out;
}
if (bitmaps) {
- ret = convert_free_space_to_bitmaps(&trans, cache, path);
+ ret = btrfs_convert_free_space_to_bitmaps(&trans, cache, path);
if (ret) {
test_err("could not convert block group to bitmaps");
goto out;
@@ -501,7 +498,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
if (ret)
goto out;
- ret = remove_block_group_free_space(&trans, cache);
+ ret = btrfs_remove_block_group_free_space(&trans, cache);
if (ret) {
test_err("could not remove block group free space");
goto out;
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 3ea3bc2225fe..a4c2b7748b95 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -268,7 +268,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("expected a hole, got %llu", em->disk_bytenr);
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
/*
@@ -314,7 +314,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
* this?
*/
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
@@ -336,7 +336,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Regular extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -363,7 +363,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* The next 3 are split extents */
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -389,10 +389,10 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- disk_bytenr = extent_map_block_start(em);
+ disk_bytenr = btrfs_extent_map_block_start(em);
orig_start = em->start;
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
@@ -414,7 +414,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
@@ -441,13 +441,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
disk_bytenr += (em->start - orig_start);
- if (extent_map_block_start(em) != disk_bytenr) {
+ if (btrfs_extent_map_block_start(em) != disk_bytenr) {
test_err("wrong block start, want %llu, have %llu",
- disk_bytenr, extent_map_block_start(em));
+ disk_bytenr, btrfs_extent_map_block_start(em));
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Prealloc extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -475,7 +475,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* The next 3 are a half written prealloc extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -502,10 +502,10 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- disk_bytenr = extent_map_block_start(em);
+ disk_bytenr = btrfs_extent_map_block_start(em);
orig_start = em->start;
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
@@ -531,13 +531,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em->start - orig_start, em->offset);
goto out;
}
- if (extent_map_block_start(em) != disk_bytenr + em->offset) {
+ if (btrfs_extent_map_block_start(em) != disk_bytenr + em->offset) {
test_err("unexpected block start, wanted %llu, have %llu",
- disk_bytenr + em->offset, extent_map_block_start(em));
+ disk_bytenr + em->offset, btrfs_extent_map_block_start(em));
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
@@ -564,13 +564,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em->start, em->offset, orig_start);
goto out;
}
- if (extent_map_block_start(em) != disk_bytenr + em->offset) {
+ if (btrfs_extent_map_block_start(em) != disk_bytenr + em->offset) {
test_err("unexpected block start, wanted %llu, have %llu",
- disk_bytenr + em->offset, extent_map_block_start(em));
+ disk_bytenr + em->offset, btrfs_extent_map_block_start(em));
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Now for the compressed extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -597,13 +597,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
+ if (btrfs_extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
+ BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em));
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Split compressed extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -630,15 +630,15 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
+ if (btrfs_extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
+ BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em));
goto out;
}
- disk_bytenr = extent_map_block_start(em);
+ disk_bytenr = btrfs_extent_map_block_start(em);
orig_start = em->start;
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
@@ -664,16 +664,16 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (extent_map_block_start(em) != disk_bytenr) {
+ if (btrfs_extent_map_block_start(em) != disk_bytenr) {
test_err("block start does not match, want %llu got %llu",
- disk_bytenr, extent_map_block_start(em));
+ disk_bytenr, btrfs_extent_map_block_start(em));
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
@@ -692,13 +692,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em->start, em->offset, orig_start);
goto out;
}
- if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
+ if (btrfs_extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
+ BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em));
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* A hole between regular extents but no hole extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset + 6, sectorsize);
@@ -725,7 +725,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M);
if (IS_ERR(em)) {
@@ -757,7 +757,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
@@ -785,7 +785,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
ret = 0;
out:
if (!IS_ERR(em))
- free_extent_map(em);
+ btrfs_free_extent_map(em);
iput(inode);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
@@ -858,15 +858,16 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
em->flags);
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, sectorsize, 2 * sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (extent_map_block_start(em) != sectorsize) {
- test_err("expected a real extent, got %llu", extent_map_block_start(em));
+ if (btrfs_extent_map_block_start(em) != sectorsize) {
+ test_err("expected a real extent, got %llu",
+ btrfs_extent_map_block_start(em));
goto out;
}
if (em->start != sectorsize || em->len != sectorsize) {
@@ -883,7 +884,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
ret = 0;
out:
if (!IS_ERR(em))
- free_extent_map(em);
+ btrfs_free_extent_map(em);
iput(inode);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
@@ -949,11 +950,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* [BTRFS_MAX_EXTENT_SIZE/2][sectorsize HOLE][the rest] */
- ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
- BTRFS_MAX_EXTENT_SIZE >> 1,
- (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, NULL);
+ ret = btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ BTRFS_MAX_EXTENT_SIZE >> 1,
+ (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1017,11 +1017,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */
- ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
- BTRFS_MAX_EXTENT_SIZE + sectorsize,
- BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, NULL);
+ ret = btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ BTRFS_MAX_EXTENT_SIZE + sectorsize,
+ BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1052,9 +1051,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* Empty */
- ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, NULL);
+ ret = btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1068,9 +1066,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
ret = 0;
out:
if (ret)
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, NULL);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL);
iput(inode);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 3fc8dc3fd980..05cfda8af422 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -20,7 +20,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
struct btrfs_extent_item *item;
struct btrfs_extent_inline_ref *iref;
struct btrfs_tree_block_info *block_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key ins;
u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info);
@@ -41,7 +41,6 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
ret = btrfs_insert_empty_item(&trans, root, path, &ins, size);
if (ret) {
test_err("couldn't insert ref %d", ret);
- btrfs_free_path(path);
return ret;
}
@@ -61,7 +60,6 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY);
btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
}
- btrfs_free_path(path);
return 0;
}
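
The BTRFS_PATH_AUTO_FREE() conversions above lean on the compiler's cleanup attribute so early returns no longer need an explicit btrfs_free_path(). A minimal userspace sketch of that pattern, assuming GCC or Clang; the struct and macro names below are illustrative stand-ins, not the kernel's:

#include <stdlib.h>
#include <stdio.h>

/* Illustrative stand-ins for btrfs_alloc_path()/btrfs_free_path(). */
struct path { int depth; };

static void path_free(struct path **p)
{
        free(*p);       /* free(NULL) is a no-op, like btrfs_free_path() */
        *p = NULL;
}

/* Declare a path pointer that is freed automatically on scope exit. */
#define PATH_AUTO_FREE(name) \
        struct path *name __attribute__((cleanup(path_free))) = NULL

static int do_search(void)
{
        PATH_AUTO_FREE(path);

        path = malloc(sizeof(*path));
        if (!path)
                return -1;
        path->depth = 1;
        /* Any early return from here on frees the path automatically. */
        return 0;
}

int main(void)
{
        printf("do_search() = %d\n", do_search());
        return 0;
}
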
@@ -70,7 +68,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
{
struct btrfs_trans_handle trans;
struct btrfs_extent_item *item;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
u64 refs;
int ret;
@@ -90,7 +88,6 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
if (ret) {
test_err("couldn't find extent ref");
- btrfs_free_path(path);
return ret;
}
@@ -112,7 +109,6 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
ret = btrfs_insert_empty_item(&trans, root, path, &key, 0);
if (ret)
test_err("failed to insert backref");
- btrfs_free_path(path);
return ret;
}
@@ -121,7 +117,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
{
struct btrfs_trans_handle trans;
struct btrfs_key key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
btrfs_init_dummy_trans(&trans, NULL);
@@ -139,11 +135,9 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
if (ret) {
test_err("didn't find our key %d", ret);
- btrfs_free_path(path);
return ret;
}
btrfs_del_item(&trans, root, path);
- btrfs_free_path(path);
return 0;
}
@@ -152,7 +146,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
{
struct btrfs_trans_handle trans;
struct btrfs_extent_item *item;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
u64 refs;
int ret;
@@ -172,7 +166,6 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
if (ret) {
test_err("couldn't find extent ref");
- btrfs_free_path(path);
return ret;
}
@@ -198,7 +191,6 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
return ret;
}
btrfs_del_item(&trans, root, path);
- btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f26a394a9ec5..05ee4391c83a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -32,6 +32,8 @@
#include "ioctl.h"
#include "relocation.h"
#include "scrub.h"
+#include "ordered-data.h"
+#include "delayed-inode.h"
static struct kmem_cache *btrfs_trans_handle_cachep;
@@ -103,7 +105,7 @@ static struct kmem_cache *btrfs_trans_handle_cachep;
* | attached to transid N+1. |
* | |
* | To next stage: |
- * | Until all tree blocks are super blocks are |
+ * | Until all tree blocks and super blocks are |
* | written to block devices |
* V |
* Transaction N [[TRANS_STATE_COMPLETED]] V
@@ -138,7 +140,6 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
void btrfs_put_transaction(struct btrfs_transaction *transaction)
{
- WARN_ON(refcount_read(&transaction->use_count) == 0);
if (refcount_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
WARN_ON(!xa_empty(&transaction->delayed_refs.head_refs));
@@ -185,7 +186,8 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
* At this point no one can be using this transaction to modify any tree
* and no one can start another transaction to modify any tree either.
*/
- ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING);
+ ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING,
+ "cur_trans->state=%d", cur_trans->state);
down_write(&fs_info->commit_root_sem);
@@ -197,7 +199,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
list_del_init(&root->dirty_list);
free_extent_buffer(root->commit_root);
root->commit_root = btrfs_root_node(root);
- extent_io_tree_release(&root->dirty_log_pages);
+ btrfs_extent_io_tree_release(&root->dirty_log_pages);
btrfs_qgroup_clean_swapped_blocks(root);
}
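
Several hunks in this patch extend bare ASSERT() calls with a printf-style message so the offending value lands in the crash log. A self-contained sketch of such a variadic assert macro, assuming GCC/Clang support for ##__VA_ARGS__; the macro name is illustrative:

#include <stdio.h>
#include <stdlib.h>

/* Assert that also prints the offending value(s) before aborting. */
#define ASSERT_MSG(cond, fmt, ...)                                          \
        do {                                                                \
                if (!(cond)) {                                              \
                        fprintf(stderr, "assertion failed: %s (" fmt ")\n", \
                                #cond, ##__VA_ARGS__);                      \
                        abort();                                            \
                }                                                           \
        } while (0)

int main(void)
{
        int state = 3;

        ASSERT_MSG(state == 3, "state=%d", state);      /* passes silently */
        printf("state ok\n");
        return 0;
}
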
@@ -383,10 +385,10 @@ loop:
INIT_LIST_HEAD(&cur_trans->deleted_bgs);
spin_lock_init(&cur_trans->dropped_roots_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
- extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
- IO_TREE_TRANS_DIRTY_PAGES);
- extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
- IO_TREE_FS_PINNED_EXTENTS);
+ btrfs_extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
+ IO_TREE_TRANS_DIRTY_PAGES);
+ btrfs_extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
+ IO_TREE_FS_PINNED_EXTENTS);
btrfs_set_fs_generation(fs_info, fs_info->generation + 1);
cur_trans->transid = fs_info->generation;
fs_info->running_transaction = cur_trans;
@@ -404,7 +406,7 @@ loop:
*/
static int record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- int force)
+ bool force)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
@@ -538,15 +540,15 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info)
}
}
-static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type)
+static bool may_wait_transaction(struct btrfs_fs_info *fs_info, int type)
{
if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
- return 0;
+ return false;
if (type == TRANS_START)
- return 1;
+ return true;
- return 0;
+ return false;
}
static inline bool need_reserve_reloc_root(struct btrfs_root *root)
@@ -575,7 +577,7 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
* We want to reserve all the bytes we may need all at once, so we only
* do 1 enospc flushing cycle per transaction start.
*/
- ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(si, bytes, flush);
/*
* If we are an emergency flush, which can steal from the global block
@@ -585,7 +587,7 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
bytes -= *delayed_refs_bytes;
*delayed_refs_bytes = 0;
- ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(si, bytes, flush);
}
return ret;
@@ -761,9 +763,10 @@ got_it:
* value here.
*/
if (do_chunk_alloc && num_bytes) {
- u64 flags = h->block_rsv->space_info->flags;
+ struct btrfs_space_info *space_info = h->block_rsv->space_info;
+ u64 flags = space_info->flags;
- btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags),
+ btrfs_chunk_alloc(h, space_info, btrfs_get_alloc_profile(fs_info, flags),
CHUNK_ALLOC_NO_FORCE);
}
@@ -1023,13 +1026,18 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
if (!trans->block_rsv) {
- ASSERT(!trans->bytes_reserved);
- ASSERT(!trans->delayed_refs_bytes_reserved);
+ ASSERT(trans->bytes_reserved == 0,
+ "trans->bytes_reserved=%llu", trans->bytes_reserved);
+ ASSERT(trans->delayed_refs_bytes_reserved == 0,
+ "trans->delayed_refs_bytes_reserved=%llu",
+ trans->delayed_refs_bytes_reserved);
return;
}
if (!trans->bytes_reserved) {
- ASSERT(!trans->delayed_refs_bytes_reserved);
+ ASSERT(trans->delayed_refs_bytes_reserved == 0,
+ "trans->delayed_refs_bytes_reserved=%llu",
+ trans->delayed_refs_bytes_reserved);
return;
}
@@ -1128,13 +1136,13 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
u64 start = 0;
u64 end;
- while (find_first_extent_bit(dirty_pages, start, &start, &end,
- mark, &cached_state)) {
+ while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark, &cached_state)) {
bool wait_writeback = false;
- ret = convert_extent_bit(dirty_pages, start, end,
- EXTENT_NEED_WAIT,
- mark, &cached_state);
+ ret = btrfs_convert_extent_bit(dirty_pages, start, end,
+ EXTENT_NEED_WAIT,
+ mark, &cached_state);
/*
* convert_extent_bit can return -ENOMEM, which is most of the
* time a temporary error. So when it happens, ignore the error
@@ -1155,8 +1163,8 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
if (!ret)
ret = filemap_fdatawrite_range(mapping, start, end);
if (!ret && wait_writeback)
- ret = filemap_fdatawait_range(mapping, start, end);
- free_extent_state(cached_state);
+ btrfs_btree_wait_writeback_range(fs_info, start, end);
+ btrfs_free_extent_state(cached_state);
if (ret)
break;
cached_state = NULL;
@@ -1175,14 +1183,13 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages)
{
- struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
int ret = 0;
- while (find_first_extent_bit(dirty_pages, start, &start, &end,
- EXTENT_NEED_WAIT, &cached_state)) {
+ while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end,
+ EXTENT_NEED_WAIT, &cached_state)) {
/*
* Ignore -ENOMEM errors returned by clear_extent_bit().
* When committing the transaction, we'll remove any entries
@@ -1191,13 +1198,13 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
* concurrently - we do it only at transaction commit time when
* it's safe to do it (through extent_io_tree_release()).
*/
- ret = clear_extent_bit(dirty_pages, start, end,
- EXTENT_NEED_WAIT, &cached_state);
+ ret = btrfs_clear_extent_bit(dirty_pages, start, end,
+ EXTENT_NEED_WAIT, &cached_state);
if (ret == -ENOMEM)
ret = 0;
if (!ret)
- ret = filemap_fdatawait_range(mapping, start, end);
- free_extent_state(cached_state);
+ btrfs_btree_wait_writeback_range(fs_info, start, end);
+ btrfs_free_extent_state(cached_state);
if (ret)
break;
cached_state = NULL;
@@ -1211,15 +1218,15 @@ static int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages)
{
bool errors = false;
- int err;
+ int ret;
- err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
+ ret = __btrfs_wait_marked_extents(fs_info, dirty_pages);
if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags))
errors = true;
- if (errors && !err)
- err = -EIO;
- return err;
+ if (errors && !ret)
+ ret = -EIO;
+ return ret;
}
int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
@@ -1227,22 +1234,23 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
struct btrfs_fs_info *fs_info = log_root->fs_info;
struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages;
bool errors = false;
- int err;
+ int ret;
- ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID);
+ ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID,
+ "root_id(log_root)=%llu", btrfs_root_id(log_root));
- err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
- if ((mark & EXTENT_DIRTY) &&
+ ret = __btrfs_wait_marked_extents(fs_info, dirty_pages);
+ if ((mark & EXTENT_DIRTY_LOG1) &&
test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags))
errors = true;
- if ((mark & EXTENT_NEW) &&
+ if ((mark & EXTENT_DIRTY_LOG2) &&
test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags))
errors = true;
- if (errors && !err)
- err = -EIO;
- return err;
+ if (errors && !ret)
+ ret = -EIO;
+ return ret;
}
/*
@@ -1265,7 +1273,7 @@ static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans)
blk_finish_plug(&plug);
ret2 = btrfs_wait_extents(fs_info, dirty_pages);
- extent_io_tree_release(&trans->transaction->dirty_pages);
+ btrfs_extent_io_tree_release(&trans->transaction->dirty_pages);
if (ret)
return ret;
@@ -1327,7 +1335,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
struct list_head *io_bgs = &trans->transaction->io_bgs;
- struct list_head *next;
struct extent_buffer *eb;
int ret;
@@ -1335,7 +1342,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
* At this point no one can be using this transaction to modify any tree
* and no one can start another transaction to modify any tree either.
*/
- ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
+ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING,
+ "trans->transaction->state=%d", trans->transaction->state);
eb = btrfs_lock_root_node(fs_info->tree_root);
ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
@@ -1363,13 +1371,13 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
again:
while (!list_empty(&fs_info->dirty_cowonly_roots)) {
struct btrfs_root *root;
- next = fs_info->dirty_cowonly_roots.next;
- list_del_init(next);
- root = list_entry(next, struct btrfs_root, dirty_list);
+
+ root = list_first_entry(&fs_info->dirty_cowonly_roots,
+ struct btrfs_root, dirty_list);
clear_bit(BTRFS_ROOT_DIRTY, &root->state);
+ list_move_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
- list_add_tail(&root->dirty_list,
- &trans->transaction->switch_commits);
ret = update_cowonly_root(trans, root);
if (ret)
return ret;
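
Replacing the open-coded next/list_del_init/list_entry sequence with list_first_entry() and list_move_tail() collapses unlink-plus-append into one primitive. A small userspace sketch of the same intrusive-list idiom, with minimal re-implementations rather than the kernel headers:

#include <stddef.h>
#include <stdio.h>

struct list_head {
        struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name)    { &(name), &(name) }
#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))
#define list_first_entry(head, type, member) \
        container_of((head)->next, type, member)

static void list_del(struct list_head *e)
{
        e->prev->next = e->next;
        e->next->prev = e->prev;
}

static void list_add_tail(struct list_head *e, struct list_head *head)
{
        e->prev = head->prev;
        e->next = head;
        head->prev->next = e;
        head->prev = e;
}

/* Unlink from one list and append to another in a single call. */
static void list_move_tail(struct list_head *e, struct list_head *head)
{
        list_del(e);
        list_add_tail(e, head);
}

struct root {
        int id;
        struct list_head dirty_list;
};

int main(void)
{
        struct list_head dirty = LIST_HEAD_INIT(dirty);
        struct list_head commits = LIST_HEAD_INIT(commits);
        struct root r = { .id = 5 };

        list_add_tail(&r.dirty_list, &dirty);

        while (dirty.next != &dirty) {
                struct root *cur = list_first_entry(&dirty, struct root, dirty_list);

                list_move_tail(&cur->dirty_list, &commits);
                printf("moved root %d\n", cur->id);
        }
        return 0;
}
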
@@ -1469,7 +1477,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
* At this point no one can be using this transaction to modify any tree
* and no one can start another transaction to modify any tree either.
*/
- ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
+ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING,
+ "trans->transaction->state=%d", trans->transaction->state);
spin_lock(&fs_info->fs_roots_radix_lock);
while (1) {
@@ -1487,9 +1496,15 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
* At this point we can neither have tasks logging inodes
* from a root nor trying to commit a log tree.
*/
- ASSERT(atomic_read(&root->log_writers) == 0);
- ASSERT(atomic_read(&root->log_commit[0]) == 0);
- ASSERT(atomic_read(&root->log_commit[1]) == 0);
+ ASSERT(atomic_read(&root->log_writers) == 0,
+ "atomic_read(&root->log_writers)=%d",
+ atomic_read(&root->log_writers));
+ ASSERT(atomic_read(&root->log_commit[0]) == 0,
+ "atomic_read(&root->log_commit[0])=%d",
+ atomic_read(&root->log_commit[0]));
+ ASSERT(atomic_read(&root->log_commit[1]) == 0,
+ "atomic_read(&root->log_commit[1])=%d",
+ atomic_read(&root->log_commit[1]));
radix_tree_tag_clear(&fs_info->fs_roots_radix,
(unsigned long)btrfs_root_id(root),
@@ -1570,7 +1585,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
* qgroup counters could end up wrong.
*/
ret = btrfs_run_delayed_refs(trans, U64_MAX);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
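
The unlikely() annotations added throughout these error paths are branch-prediction hints built on __builtin_expect(), telling the compiler to lay out the failure branch as cold code. A tiny sketch, assuming GCC or Clang:

#include <stdio.h>

/* Branch hints via __builtin_expect(): mark error paths as cold. */
#define likely(x)       __builtin_expect(!!(x), 1)
#define unlikely(x)     __builtin_expect(!!(x), 0)

static int do_step(int input)
{
        if (unlikely(input < 0))        /* cold error path */
                return -1;
        return 0;                       /* hot success path */
}

int main(void)
{
        printf("%d %d\n", do_step(42), do_step(-1));
        return 0;
}
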
@@ -1642,7 +1657,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *parent_root;
struct btrfs_block_rsv *rsv;
struct btrfs_inode *parent_inode = pending->dir;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dir_item *dir_item;
struct extent_buffer *tmp;
struct extent_buffer *old;
@@ -1695,10 +1710,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto clear_skip_qgroup;
}
- key.objectid = objectid;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
-
rsv = trans->block_rsv;
trans->block_rsv = &pending->block_rsv;
trans->bytes_reserved = trans->block_rsv->reserved;
@@ -1715,7 +1726,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* insert the directory item
*/
ret = btrfs_set_inode_index(parent_inode, &index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1736,8 +1747,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_create_qgroup(trans, objectid);
if (ret && ret != -EEXIST) {
- btrfs_abort_transaction(trans, ret);
- goto fail;
+ if (unlikely(ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info))) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
}
/*
@@ -1747,13 +1760,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* snapshot
*/
ret = btrfs_run_delayed_items(trans);
- if (ret) { /* Transaction aborted */
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = record_root_in_trans(trans, root, 0);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1788,7 +1801,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
old = btrfs_lock_root_node(root);
ret = btrfs_cow_block(trans, root, old, NULL, 0, &old,
BTRFS_NESTING_COW);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_tree_unlock(old);
free_extent_buffer(old);
btrfs_abort_transaction(trans, ret);
@@ -1799,21 +1812,23 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
/* clean up in any case */
btrfs_tree_unlock(old);
free_extent_buffer(old);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
/* see comments in should_cow_block() */
set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
- smp_wmb();
+ smp_mb__after_atomic();
btrfs_set_root_node(new_root_item, tmp);
/* record when the snapshot was created in key.offset */
+ key.objectid = objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = trans->transid;
ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
btrfs_tree_unlock(tmp);
free_extent_buffer(tmp);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1825,7 +1840,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_root_id(parent_root),
btrfs_ino(parent_inode), index,
&fname.disk_name);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1840,7 +1855,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
ret = btrfs_reloc_post_snapshot(trans, pending);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1863,7 +1878,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_insert_dir_item(trans, &fname.disk_name,
parent_inode, &key, BTRFS_FT_DIR,
index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1873,14 +1888,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
inode_set_mtime_to_ts(&parent_inode->vfs_inode,
inode_set_ctime_current(&parent_inode->vfs_inode));
ret = btrfs_update_inode_fallback(trans, parent_inode);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = btrfs_uuid_tree_add(trans, new_root_item->uuid,
BTRFS_UUID_KEY_SUBVOL,
objectid);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1888,7 +1903,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
objectid);
- if (ret && ret != -EEXIST) {
+ if (unlikely(ret && ret != -EEXIST)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1906,7 +1921,6 @@ free_fname:
free_pending:
kfree(new_root_item);
pending->root_item = NULL;
- btrfs_free_path(path);
pending->path = NULL;
return ret;
@@ -2159,18 +2173,25 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans)
return;
lockdep_assert_held(&trans->fs_info->trans_lock);
- ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP);
+ ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP,
+ "cur_trans->state=%d", cur_trans->state);
list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots);
}
-static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval)
+static void update_commit_stats(struct btrfs_fs_info *fs_info)
{
+ ktime_t now = ktime_get_ns();
+ ktime_t interval = now - fs_info->commit_stats.critical_section_start_time;
+
+ ASSERT(fs_info->commit_stats.critical_section_start_time);
+
fs_info->commit_stats.commit_count++;
fs_info->commit_stats.last_commit_dur = interval;
fs_info->commit_stats.max_commit_dur =
max_t(u64, fs_info->commit_stats.max_commit_dur, interval);
fs_info->commit_stats.total_commit_dur += interval;
+ fs_info->commit_stats.critical_section_start_time = 0;
}
int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
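
The commit-duration rework above stores the critical-section start timestamp in the stats struct and takes the end timestamp inside update_commit_stats(), so callers can no longer compute the interval incorrectly or forget to pass it. A userspace sketch of that pattern using CLOCK_MONOTONIC; the field and helper names are illustrative:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct commit_stats {
        uint64_t commit_count;
        uint64_t last_dur_ns;
        uint64_t max_dur_ns;
        uint64_t total_dur_ns;
        uint64_t critical_section_start_ns;     /* 0 = not timing */
};

static uint64_t now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
}

static void critical_section_enter(struct commit_stats *s)
{
        s->critical_section_start_ns = now_ns();
}

/* The interval is computed here, in one place, and the start time cleared. */
static void update_commit_stats(struct commit_stats *s)
{
        uint64_t interval = now_ns() - s->critical_section_start_ns;

        s->commit_count++;
        s->last_dur_ns = interval;
        if (interval > s->max_dur_ns)
                s->max_dur_ns = interval;
        s->total_dur_ns += interval;
        s->critical_section_start_ns = 0;
}

int main(void)
{
        struct commit_stats stats = { 0 };

        critical_section_enter(&stats);
        update_commit_stats(&stats);
        printf("commits=%llu last=%lluns\n",
               (unsigned long long)stats.commit_count,
               (unsigned long long)stats.last_dur_ns);
        return 0;
}
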
@@ -2179,10 +2200,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_transaction *prev_trans = NULL;
int ret;
- ktime_t start_time;
- ktime_t interval;
- ASSERT(refcount_read(&trans->use_count) == 1);
+ ASSERT(refcount_read(&trans->use_count) == 1,
+ "refcount_read(&trans->use_count)=%d", refcount_read(&trans->use_count));
btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags);
@@ -2271,14 +2291,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wake_up(&fs_info->transaction_blocked_wait);
btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
- if (cur_trans->list.prev != &fs_info->trans_list) {
+ if (!list_is_first(&cur_trans->list, &fs_info->trans_list)) {
enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
if (trans->in_fsync)
want_state = TRANS_STATE_SUPER_COMMITTED;
- prev_trans = list_entry(cur_trans->list.prev,
- struct btrfs_transaction, list);
+ prev_trans = list_prev_entry(cur_trans, list);
if (prev_trans->state < want_state) {
refcount_inc(&prev_trans->use_count);
spin_unlock(&fs_info->trans_lock);
@@ -2314,8 +2333,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* Get the time spent on the work done by the commit thread and not
* the time spent waiting on a previous commit
*/
- start_time = ktime_get_ns();
-
+ fs_info->commit_stats.critical_section_start_time = ktime_get_ns();
extwriter_counter_dec(cur_trans, trans->type);
ret = btrfs_start_delalloc_flush(fs_info);
@@ -2420,7 +2438,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* them.
*
* We needn't worry that this operation will corrupt the snapshots,
- * because all the tree which are snapshoted will be forced to COW
+ * because all the trees that are snapshotted will be forced to COW

* the nodes and leaves.
*/
ret = btrfs_run_delayed_items(trans);
@@ -2547,6 +2565,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto scrub_continue;
+ update_commit_stats(fs_info);
/*
* We needn't acquire the lock here because there is no other task
* which can change it.
@@ -2555,7 +2574,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wake_up(&cur_trans->commit_wait);
btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
- btrfs_finish_extent_commit(trans);
+ ret = btrfs_finish_extent_commit(trans);
+ if (ret)
+ goto scrub_continue;
if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
btrfs_clear_space_info_full(fs_info);
@@ -2581,8 +2602,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
trace_btrfs_transaction_commit(fs_info);
- interval = ktime_get_ns() - start_time;
-
btrfs_scrub_continue(fs_info);
if (current->journal_info == trans)
@@ -2590,8 +2609,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
kmem_cache_free(btrfs_trans_handle_cachep, trans);
- update_commit_stats(fs_info, interval);
-
return ret;
unlock_reloc:
@@ -2655,9 +2672,9 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
if (btrfs_header_backref_rev(root->node) <
BTRFS_MIXED_BACKREF_REV)
- ret = btrfs_drop_snapshot(root, 0, 0);
+ ret = btrfs_drop_snapshot(root, false, false);
else
- ret = btrfs_drop_snapshot(root, 1, 0);
+ ret = btrfs_drop_snapshot(root, true, false);
btrfs_put_root(root);
return (ret < 0) ? 0 : 1;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 9f7c777af635..18ef069197e5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -14,10 +14,6 @@
#include <linux/wait.h>
#include "btrfs_inode.h"
#include "delayed-ref.h"
-#include "extent-io-tree.h"
-#include "block-rsv.h"
-#include "messages.h"
-#include "misc.h"
struct dentry;
struct inode;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 43979891f7c8..c21c21adf61e 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -183,15 +183,16 @@ static bool check_prev_ino(struct extent_buffer *leaf,
/* Only these key->types needs to be checked */
ASSERT(key->type == BTRFS_XATTR_ITEM_KEY ||
key->type == BTRFS_INODE_REF_KEY ||
+ key->type == BTRFS_INODE_EXTREF_KEY ||
key->type == BTRFS_DIR_INDEX_KEY ||
key->type == BTRFS_DIR_ITEM_KEY ||
- key->type == BTRFS_EXTENT_DATA_KEY);
+ key->type == BTRFS_EXTENT_DATA_KEY, "key->type=%u", key->type);
/*
* Only subvolume trees along with their reloc trees need this check.
* Things like log tree doesn't follow this ino requirement.
*/
- if (!is_fstree(btrfs_header_owner(leaf)))
+ if (!btrfs_is_fstree(btrfs_header_owner(leaf)))
return true;
if (key->objectid == prev_key->objectid)
@@ -475,7 +476,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
* to be COWed to be relocated.
*/
if (unlikely(is_root_item && key->objectid == BTRFS_TREE_RELOC_OBJECTID &&
- !is_fstree(key->offset))) {
+ !btrfs_is_fstree(key->offset))) {
generic_err(leaf, slot,
"invalid reloc tree for root %lld, root id is not a subvolume tree",
key->offset);
@@ -493,7 +494,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
}
/* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */
- if (unlikely(!is_fstree(key->objectid) && !is_root_item)) {
+ if (unlikely(!btrfs_is_fstree(key->objectid) && !is_root_item)) {
dir_item_err(leaf, slot,
"invalid location key objectid, have %llu expect [%llu, %llu]",
key->objectid, BTRFS_FIRST_FREE_OBJECTID,
@@ -1209,7 +1210,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
/*
* For legacy root item, the members starting at generation_v2 will be
* all filled with 0.
- * And since we allow geneartion_v2 as 0, it will still pass the check.
+ * And since we allow generation_v2 as 0, it will still pass the check.
*/
read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot),
btrfs_item_size(leaf, slot));
@@ -1311,7 +1312,7 @@ static bool is_valid_dref_root(u64 rootid)
* - tree root
* For v1 space cache
*/
- return is_fstree(rootid) || rootid == BTRFS_DATA_RELOC_TREE_OBJECTID ||
+ return btrfs_is_fstree(rootid) || rootid == BTRFS_DATA_RELOC_TREE_OBJECTID ||
rootid == BTRFS_ROOT_TREE_OBJECTID;
}
@@ -1571,7 +1572,7 @@ static int check_extent_item(struct extent_buffer *leaf,
inline_type);
return -EUCLEAN;
}
- if (inline_type < last_type) {
+ if (unlikely(inline_type < last_type)) {
extent_err(leaf, slot,
"inline ref out-of-order: has type %u, prev type %u",
inline_type, last_type);
@@ -1580,7 +1581,7 @@ static int check_extent_item(struct extent_buffer *leaf,
/* Type changed, allow the sequence starts from U64_MAX again. */
if (inline_type > last_type)
last_seq = U64_MAX;
- if (seq > last_seq) {
+ if (unlikely(seq > last_seq)) {
extent_err(leaf, slot,
"inline ref out-of-order: has type %u offset %llu seq 0x%llx, prev type %u seq 0x%llx",
inline_type, inline_offset, seq,
@@ -1617,10 +1618,9 @@ static int check_extent_item(struct extent_buffer *leaf,
if (unlikely(prev_end > key->objectid)) {
extent_err(leaf, slot,
- "previous extent [%llu %u %llu] overlaps current extent [%llu %u %llu]",
- prev_key->objectid, prev_key->type,
- prev_key->offset, key->objectid, key->type,
- key->offset);
+ "previous extent " BTRFS_KEY_FMT " overlaps current extent " BTRFS_KEY_FMT,
+ BTRFS_KEY_FMT_VALUE(prev_key),
+ BTRFS_KEY_FMT_VALUE(key));
return -EUCLEAN;
}
}
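
BTRFS_KEY_FMT and BTRFS_KEY_FMT_VALUE used above are a format-string/argument macro pair: the string-literal half concatenates into the format, the value half expands to the matching arguments, so the compiler's printf checking still sees matching specifiers. A minimal sketch of the idiom with illustrative names:

#include <stdint.h>
#include <stdio.h>

struct key {
        uint64_t objectid;
        uint8_t type;
        uint64_t offset;
};

/* The two macros must always be used together. */
#define KEY_FMT         "(%llu %u %llu)"
#define KEY_FMT_VALUE(k)                                        \
        (unsigned long long)(k)->objectid, (unsigned)(k)->type, \
        (unsigned long long)(k)->offset

int main(void)
{
        const struct key prev = { 257, 108, 0 };
        const struct key cur = { 257, 96, 4096 };

        printf("bad key order, prev " KEY_FMT " current " KEY_FMT "\n",
               KEY_FMT_VALUE(&prev), KEY_FMT_VALUE(&cur));
        return 0;
}
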
@@ -1756,10 +1756,10 @@ static int check_inode_ref(struct extent_buffer *leaf,
while (ptr < end) {
u16 namelen;
- if (unlikely(ptr + sizeof(iref) > end)) {
+ if (unlikely(ptr + sizeof(*iref) > end)) {
inode_ref_err(leaf, slot,
"inode ref overflow, ptr %lu end %lu inode_ref_size %zu",
- ptr, end, sizeof(iref));
+ ptr, end, sizeof(*iref));
return -EUCLEAN;
}
@@ -1782,6 +1782,39 @@ static int check_inode_ref(struct extent_buffer *leaf,
return 0;
}
+static int check_inode_extref(struct extent_buffer *leaf,
+ struct btrfs_key *key, struct btrfs_key *prev_key,
+ int slot)
+{
+ unsigned long ptr = btrfs_item_ptr_offset(leaf, slot);
+ unsigned long end = ptr + btrfs_item_size(leaf, slot);
+
+ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
+ return -EUCLEAN;
+
+ while (ptr < end) {
+ struct btrfs_inode_extref *extref = (struct btrfs_inode_extref *)ptr;
+ u16 namelen;
+
+ if (unlikely(ptr + sizeof(*extref) > end)) {
+ inode_ref_err(leaf, slot,
+ "inode extref overflow, ptr %lu end %lu inode_extref size %zu",
+ ptr, end, sizeof(*extref));
+ return -EUCLEAN;
+ }
+
+ namelen = btrfs_inode_extref_name_len(leaf, extref);
+ if (unlikely(ptr + sizeof(*extref) + namelen > end)) {
+ inode_ref_err(leaf, slot,
+ "inode extref overflow, ptr %lu end %lu namelen %u",
+ ptr, end, namelen);
+ return -EUCLEAN;
+ }
+ ptr += sizeof(*extref) + namelen;
+ }
+ return 0;
+}
+
static int check_raid_stripe_extent(const struct extent_buffer *leaf,
const struct btrfs_key *key, int slot)
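
The new check_inode_extref() is the classic bounds-checked walk over variable-length records: verify the fixed header fits, then verify header plus payload fits, before advancing. A portable userspace sketch of that loop; the record layout here is illustrative, not the btrfs_inode_extref layout:

#include <stdint.h>
#include <stdio.h>

/* A variable-length record: a one-byte header followed by `namelen` bytes. */
struct rec {
        uint8_t namelen;
};

/* Walk records in buf[0..size), rejecting any that overflow the buffer. */
static int check_records(const uint8_t *buf, size_t size)
{
        size_t off = 0;

        while (off < size) {
                uint8_t namelen;

                if (off + sizeof(struct rec) > size)    /* header overflows */
                        return -1;
                namelen = buf[off];
                if (off + sizeof(struct rec) + namelen > size)  /* name overflows */
                        return -1;
                off += sizeof(struct rec) + namelen;
        }
        return 0;
}

int main(void)
{
        const uint8_t buf[] = { 2, 'h', 'i', 0, 0, 0 };

        printf("%s\n", check_records(buf, sizeof(buf)) ? "corrupt" : "ok");
        return 0;
}
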
{
@@ -1893,6 +1926,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf,
case BTRFS_INODE_REF_KEY:
ret = check_inode_ref(leaf, key, prev_key, slot);
break;
+ case BTRFS_INODE_EXTREF_KEY:
+ ret = check_inode_extref(leaf, key, prev_key, slot);
+ break;
case BTRFS_BLOCK_GROUP_ITEM_KEY:
ret = check_block_group_item(leaf, key, slot);
break;
@@ -1929,7 +1965,7 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf,
break;
}
- if (ret)
+ if (unlikely(ret))
return BTRFS_TREE_BLOCK_INVALID_ITEM;
return BTRFS_TREE_BLOCK_CLEAN;
}
@@ -2023,10 +2059,9 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf)
/* Make sure the keys are in the right order */
if (unlikely(btrfs_comp_cpu_keys(&prev_key, &key) >= 0)) {
generic_err(leaf, slot,
- "bad key order, prev (%llu %u %llu) current (%llu %u %llu)",
- prev_key.objectid, prev_key.type,
- prev_key.offset, key.objectid, key.type,
- key.offset);
+ "bad key order, prev " BTRFS_KEY_FMT " current " BTRFS_KEY_FMT,
+ BTRFS_KEY_FMT_VALUE(&prev_key),
+ BTRFS_KEY_FMT_VALUE(&key));
return BTRFS_TREE_BLOCK_BAD_KEY_ORDER;
}
@@ -2144,10 +2179,9 @@ enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node)
if (unlikely(btrfs_comp_cpu_keys(&key, &next_key) >= 0)) {
generic_err(node, slot,
- "bad key order, current (%llu %u %llu) next (%llu %u %llu)",
- key.objectid, key.type, key.offset,
- next_key.objectid, next_key.type,
- next_key.offset);
+ "bad key order, current " BTRFS_KEY_FMT " next " BTRFS_KEY_FMT,
+ BTRFS_KEY_FMT_VALUE(&key),
+ BTRFS_KEY_FMT_VALUE(&next_key));
return BTRFS_TREE_BLOCK_BAD_KEY_ORDER;
}
}
@@ -2167,7 +2201,7 @@ ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO);
int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
{
- const bool is_subvol = is_fstree(root_owner);
+ const bool is_subvol = btrfs_is_fstree(root_owner);
const u64 eb_owner = btrfs_header_owner(eb);
/*
@@ -2209,7 +2243,7 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
* For subvolume trees, owners can mismatch, but they should all belong
* to subvolume trees.
*/
- if (unlikely(is_subvol != is_fstree(eb_owner))) {
+ if (unlikely(is_subvol != btrfs_is_fstree(eb_owner))) {
btrfs_crit(eb->fs_info,
"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect [%llu, %llu]",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
@@ -2229,13 +2263,12 @@ int btrfs_verify_level_key(struct extent_buffer *eb,
int ret;
found_level = btrfs_header_level(eb);
- if (found_level != check->level) {
- WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
- KERN_ERR "BTRFS: tree level check failed\n");
+ if (unlikely(found_level != check->level)) {
+ DEBUG_WARN();
btrfs_err(fs_info,
"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
eb->start, check->level, found_level);
- return -EIO;
+ return -EUCLEAN;
}
if (!check->has_first_key)
@@ -2251,11 +2284,11 @@ int btrfs_verify_level_key(struct extent_buffer *eb,
return 0;
/* We have @first_key, so this @eb must have at least one item */
- if (btrfs_header_nritems(eb) == 0) {
+ if (unlikely(btrfs_header_nritems(eb) == 0)) {
btrfs_err(fs_info,
"invalid tree nritems, bytenr=%llu nritems=0 expect >0",
eb->start);
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ DEBUG_WARN();
return -EUCLEAN;
}
@@ -2263,11 +2296,10 @@ int btrfs_verify_level_key(struct extent_buffer *eb,
btrfs_node_key_to_cpu(eb, &found_key, 0);
else
btrfs_item_key_to_cpu(eb, &found_key, 0);
- ret = btrfs_comp_cpu_keys(&check->first_key, &found_key);
- if (ret) {
- WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
- KERN_ERR "BTRFS: tree first key check failed\n");
+ ret = btrfs_comp_cpu_keys(&check->first_key, &found_key);
+ if (unlikely(ret)) {
+ DEBUG_WARN();
btrfs_err(fs_info,
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
eb->start, check->transid, check->first_key.objectid,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 90dc094cfa5e..fff37c8d96a4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -27,7 +27,9 @@
#include "file-item.h"
#include "file.h"
#include "orphan.h"
+#include "print-tree.h"
#include "tree-checker.h"
+#include "delayed-inode.h"
#define MAX_CONFLICT_INODES 10
@@ -101,18 +103,135 @@ enum {
LOG_WALK_REPLAY_ALL,
};
+/*
+ * The walk control struct is used to pass state down the chain when processing
+ * the log tree. The stage field tells us which part of the log tree processing
+ * we are currently doing.
+ */
+struct walk_control {
+ /*
+ * Signal that we are freeing the metadata extents of a log tree.
+ * This is used at transaction commit time while freeing a log tree.
+ */
+ bool free;
+
+ /*
+ * Signal that we are pinning the metadata extents of a log tree and the
+ * data extents its leaves point to (if using mixed block groups).
+ * This happens in the first stage of log replay to ensure that during
+ * replay, while we are modifying subvolume trees, we don't overwrite
+ * the metadata extents of log trees.
+ */
+ bool pin;
+
+ /* What stage of the replay code we're currently in. */
+ int stage;
+
+ /*
+ * Ignore any items from the inode currently being processed. Needs
+ * to be set every time we find a BTRFS_INODE_ITEM_KEY.
+ */
+ bool ignore_cur_inode;
+
+ /*
+ * The root we are currently replaying to. This is NULL for the replay
+ * stage LOG_WALK_PIN_ONLY.
+ */
+ struct btrfs_root *root;
+
+ /* The log tree we are currently processing (not NULL for any stage). */
+ struct btrfs_root *log;
+
+ /* The transaction handle used for replaying all log trees. */
+ struct btrfs_trans_handle *trans;
+
+ /*
+ * The function that gets used to process blocks we find in the tree.
+ * Note the extent_buffer might not be up to date when it is passed in,
+ * and it must be checked or read if you need the data inside it.
+ */
+ int (*process_func)(struct extent_buffer *eb,
+ struct walk_control *wc, u64 gen, int level);
+
+ /*
+ * The following are used only when stage is >= LOG_WALK_REPLAY_INODES
+ * and by the replay_one_buffer() callback.
+ */
+
+ /* The current log leaf being processed. */
+ struct extent_buffer *log_leaf;
+ /* The key being processed of the current log leaf. */
+ struct btrfs_key log_key;
+ /* The slot being processed of the current log leaf. */
+ int log_slot;
+
+ /* A path used for searches and modifications to subvolume trees. */
+ struct btrfs_path *subvol_path;
+};
+
+static void do_abort_log_replay(struct walk_control *wc, const char *function,
+ unsigned int line, int error, const char *fmt, ...)
+{
+ struct btrfs_fs_info *fs_info = wc->trans->fs_info;
+ struct va_format vaf;
+ va_list args;
+
+ /*
+ * Do nothing if we already aborted, to avoid dumping leaves again which
+ * can be verbose. Furthermore, only the first call is useful since it
+ * is where we have a problem. Note that we do not use the flag
+ * BTRFS_FS_STATE_TRANS_ABORTED because log replay calls functions that
+ * are outside of tree-log.c that can abort transactions (such as
+ * btrfs_add_link() for example), so if that happens we still want to
+ * dump all log replay specific information below.
+ */
+ if (test_and_set_bit(BTRFS_FS_STATE_LOG_REPLAY_ABORTED, &fs_info->fs_state))
+ return;
+
+ btrfs_abort_transaction(wc->trans, error);
+
+ if (wc->subvol_path->nodes[0]) {
+ btrfs_crit(fs_info,
+ "subvolume (root %llu) leaf currently being processed:",
+ btrfs_root_id(wc->root));
+ btrfs_print_leaf(wc->subvol_path->nodes[0]);
+ }
+
+ if (wc->log_leaf) {
+ btrfs_crit(fs_info,
+"log tree (for root %llu) leaf currently being processed (slot %d key " BTRFS_KEY_FMT "):",
+ btrfs_root_id(wc->root), wc->log_slot,
+ BTRFS_KEY_FMT_VALUE(&wc->log_key));
+ btrfs_print_leaf(wc->log_leaf);
+ }
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(fs_info,
+ "log replay failed in %s:%u for root %llu, stage %d, with error %d: %pV",
+ function, line, btrfs_root_id(wc->root), wc->stage, error, &vaf);
+
+ va_end(args);
+}
+
+/*
+ * Use this for aborting a transaction during log replay while we are down the
+ * call chain of replay_one_buffer(), so that we get a lot more useful
+ * information for debugging issues when compared to a plain call to
+ * btrfs_abort_transaction().
+ */
+#define btrfs_abort_log_replay(wc, error, fmt, args...) \
+ do_abort_log_replay((wc), __func__, __LINE__, (error), fmt, ##args)
+
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
int inode_only,
struct btrfs_log_ctx *ctx);
-static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, u64 objectid);
-static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
- u64 dirid, int del_all);
+static int link_to_fixup_dir(struct walk_control *wc, u64 objectid);
+static noinline int replay_dir_deletes(struct walk_control *wc,
+ u64 dirid, bool del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);
/*
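
The btrfs_abort_log_replay() macro added above wraps a varargs helper so that __func__ and __LINE__ are captured at the call site rather than inside the helper. A compact sketch of that wrapper-macro idiom, assuming GCC/Clang named variadic macros; the names are illustrative:

#include <stdarg.h>
#include <stdio.h>

static void do_abort(const char *function, unsigned int line, int error,
                     const char *fmt, ...)
{
        va_list args;

        fprintf(stderr, "replay failed in %s:%u with error %d: ",
                function, line, error);
        va_start(args, fmt);
        vfprintf(stderr, fmt, args);
        va_end(args);
        fputc('\n', stderr);
}

/* The macro captures the *call site*, so every abort names its origin. */
#define abort_replay(error, fmt, args...) \
        do_abort(__func__, __LINE__, (error), fmt, ##args)

int main(void)
{
        abort_replay(-5, "inode %llu missing", 257ULL);
        return 0;
}
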
@@ -143,6 +262,9 @@ static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *r
unsigned int nofs_flag;
struct btrfs_inode *inode;
+ /* Only meant to be called for subvolume roots and not for log roots. */
+ ASSERT(btrfs_is_fstree(btrfs_root_id(root)), "root_id=%llu", btrfs_root_id(root));
+
/*
* We're holding a transaction handle whether we are logging or
* replaying a log tree, so we must make sure NOFS semantics apply
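
The NOFS requirement referenced in this comment is met in the kernel with a scoped save/restore of a per-task allocation flag (memalloc_nofs_save()/memalloc_nofs_restore()). A userspace mock of that save/restore shape only; the flag and helpers below are stand-ins, not the real API:

#include <stdio.h>

/* Per-thread allocation context; a stand-in for PF_MEMALLOC_NOFS. */
static __thread unsigned int alloc_flags;
#define ALLOC_NOFS 0x1u

static unsigned int nofs_save(void)
{
        unsigned int old = alloc_flags;

        alloc_flags |= ALLOC_NOFS;
        return old;
}

static void nofs_restore(unsigned int old)
{
        alloc_flags = old;
}

static void allocate(void)
{
        printf("allocating %s filesystem reclaim\n",
               (alloc_flags & ALLOC_NOFS) ? "without" : "with");
}

int main(void)
{
        unsigned int cookie = nofs_save();      /* enter NOFS scope */

        allocate();
        nofs_restore(cookie);                   /* leave NOFS scope */
        allocate();
        return 0;
}
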
@@ -297,54 +419,13 @@ void btrfs_end_log_trans(struct btrfs_root *root)
}
/*
- * the walk control struct is used to pass state down the chain when
- * processing the log tree. The stage field tells us which part
- * of the log tree processing we are currently doing. The others
- * are state fields used for that specific part
- */
-struct walk_control {
- /* should we free the extent on disk when done? This is used
- * at transaction commit time while freeing a log tree
- */
- int free;
-
- /* pin only walk, we record which extents on disk belong to the
- * log trees
- */
- int pin;
-
- /* what stage of the replay code we're currently in */
- int stage;
-
- /*
- * Ignore any items from the inode currently being processed. Needs
- * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
- * the LOG_WALK_REPLAY_INODES stage.
- */
- bool ignore_cur_inode;
-
- /* the root we are currently replaying */
- struct btrfs_root *replay_dest;
-
- /* the trans handle for the current replay */
- struct btrfs_trans_handle *trans;
-
- /* the function that gets used to process blocks we find in the
- * tree. Note the extent_buffer might not be up to date when it is
- * passed in, and it must be checked or read if you need the data
- * inside it
- */
- int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
- struct walk_control *wc, u64 gen, int level);
-};
-
-/*
* process_func used to pin down extents, write them or wait on them
*/
-static int process_one_buffer(struct btrfs_root *log,
- struct extent_buffer *eb,
+static int process_one_buffer(struct extent_buffer *eb,
struct walk_control *wc, u64 gen, int level)
{
+ struct btrfs_root *log = wc->log;
+ struct btrfs_trans_handle *trans = wc->trans;
struct btrfs_fs_info *fs_info = log->fs_info;
int ret = 0;
@@ -359,25 +440,36 @@ static int process_one_buffer(struct btrfs_root *log,
};
ret = btrfs_read_extent_buffer(eb, &check);
- if (ret)
+ if (unlikely(ret)) {
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
return ret;
+ }
}
if (wc->pin) {
- ret = btrfs_pin_extent_for_log_replay(wc->trans, eb);
- if (ret)
+ ASSERT(trans != NULL);
+ ret = btrfs_pin_extent_for_log_replay(trans, eb);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
return ret;
+ }
- if (btrfs_buffer_uptodate(eb, gen, 0) &&
- btrfs_header_level(eb) == 0)
+ if (btrfs_buffer_uptodate(eb, gen, false) && level == 0) {
ret = btrfs_exclude_logged_extents(eb);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
}
return ret;
}
/*
- * Item overwrite used by log replay. The given eb, slot and key all refer to
- * the source data we are copying out.
+ * Item overwrite used by log replay. The given log tree leaf, slot and key
+ * from the walk_control structure all refer to the source data we are copying
+ * out.
*
* The given root is for the tree we are copying into, and path is a scratch
* path for use in this function (it should be released on entry and will be
@@ -389,12 +481,10 @@ static int process_one_buffer(struct btrfs_root *log,
*
* If the key isn't in the destination yet, a new item is inserted.
*/
-static int overwrite_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static int overwrite_item(struct walk_control *wc)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
int ret;
u32 item_size;
u64 saved_i_size = 0;
@@ -403,7 +493,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
unsigned long dst_ptr;
struct extent_buffer *dst_eb;
int dst_slot;
- bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
+ const bool is_inode_item = (wc->log_key.type == BTRFS_INODE_ITEM_KEY);
/*
* This is only used during log replay, so the root is always from a
@@ -412,18 +502,23 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
* the leaf before writing into the log tree. See the comments at
* copy_items() for more details.
*/
- ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
+ ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID, "root_id=%llu", btrfs_root_id(root));
- item_size = btrfs_item_size(eb, slot);
- src_ptr = btrfs_item_ptr_offset(eb, slot);
+ item_size = btrfs_item_size(wc->log_leaf, wc->log_slot);
+ src_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);
/* Look for the key in the destination tree. */
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
+ ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu",
+ BTRFS_KEY_FMT_VALUE(&wc->log_key),
+ btrfs_root_id(root));
return ret;
+ }
- dst_eb = path->nodes[0];
- dst_slot = path->slots[0];
+ dst_eb = wc->subvol_path->nodes[0];
+ dst_slot = wc->subvol_path->slots[0];
if (ret == 0) {
char *src_copy;
@@ -433,16 +528,17 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
goto insert;
if (item_size == 0) {
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
src_copy = kmalloc(item_size, GFP_NOFS);
if (!src_copy) {
- btrfs_release_path(path);
+ btrfs_abort_log_replay(wc, -ENOMEM,
+ "failed to allocate memory for log leaf item");
return -ENOMEM;
}
- read_extent_buffer(eb, src_copy, src_ptr, item_size);
+ read_extent_buffer(wc->log_leaf, src_copy, src_ptr, item_size);
dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size);
@@ -454,7 +550,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
* sync
*/
if (ret == 0) {
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
@@ -462,7 +558,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
* We need to load the old nbytes into the inode so when we
* replay the extents we've logged we get the right nbytes.
*/
- if (inode_item) {
+ if (is_inode_item) {
struct btrfs_inode_item *item;
u64 nbytes;
u32 mode;
@@ -470,20 +566,20 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
item = btrfs_item_ptr(dst_eb, dst_slot,
struct btrfs_inode_item);
nbytes = btrfs_inode_nbytes(dst_eb, item);
- item = btrfs_item_ptr(eb, slot,
+ item = btrfs_item_ptr(wc->log_leaf, wc->log_slot,
struct btrfs_inode_item);
- btrfs_set_inode_nbytes(eb, item, nbytes);
+ btrfs_set_inode_nbytes(wc->log_leaf, item, nbytes);
/*
* If this is a directory we need to reset the i_size to
* 0 so that we can set it up properly when replaying
* the rest of the items in this log.
*/
- mode = btrfs_inode_mode(eb, item);
+ mode = btrfs_inode_mode(wc->log_leaf, item);
if (S_ISDIR(mode))
- btrfs_set_inode_size(eb, item, 0);
+ btrfs_set_inode_size(wc->log_leaf, item, 0);
}
- } else if (inode_item) {
+ } else if (is_inode_item) {
struct btrfs_inode_item *item;
u32 mode;
@@ -491,38 +587,40 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
* New inode, set nbytes to 0 so that the nbytes comes out
* properly when we replay the extents.
*/
- item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
- btrfs_set_inode_nbytes(eb, item, 0);
+ item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_inode_item);
+ btrfs_set_inode_nbytes(wc->log_leaf, item, 0);
/*
* If this is a directory we need to reset the i_size to 0 so
* that we can set it up properly when replaying the rest of
* the items in this log.
*/
- mode = btrfs_inode_mode(eb, item);
+ mode = btrfs_inode_mode(wc->log_leaf, item);
if (S_ISDIR(mode))
- btrfs_set_inode_size(eb, item, 0);
+ btrfs_set_inode_size(wc->log_leaf, item, 0);
}
insert:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* try to insert the key into the destination tree */
- path->skip_release_on_error = 1;
- ret = btrfs_insert_empty_item(trans, root, path,
- key, item_size);
- path->skip_release_on_error = 0;
+ wc->subvol_path->skip_release_on_error = true;
+ ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &wc->log_key, item_size);
+ wc->subvol_path->skip_release_on_error = false;
- dst_eb = path->nodes[0];
- dst_slot = path->slots[0];
+ dst_eb = wc->subvol_path->nodes[0];
+ dst_slot = wc->subvol_path->slots[0];
/* make sure any existing item is the correct size */
if (ret == -EEXIST || ret == -EOVERFLOW) {
const u32 found_size = btrfs_item_size(dst_eb, dst_slot);
if (found_size > item_size)
- btrfs_truncate_item(trans, path, item_size, 1);
+ btrfs_truncate_item(trans, wc->subvol_path, item_size, 1);
else if (found_size < item_size)
- btrfs_extend_item(trans, path, item_size - found_size);
+ btrfs_extend_item(trans, wc->subvol_path, item_size - found_size);
} else if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert item for key " BTRFS_KEY_FMT,
+ BTRFS_KEY_FMT_VALUE(&wc->log_key));
return ret;
}
dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
@@ -536,15 +634,15 @@ insert:
* state of the tree found in the subvolume, and i_size is modified
* as it goes
*/
- if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
+ if (is_inode_item && ret == -EEXIST) {
struct btrfs_inode_item *src_item;
struct btrfs_inode_item *dst_item;
src_item = (struct btrfs_inode_item *)src_ptr;
dst_item = (struct btrfs_inode_item *)dst_ptr;
- if (btrfs_inode_generation(eb, src_item) == 0) {
- const u64 ino_size = btrfs_inode_size(eb, src_item);
+ if (btrfs_inode_generation(wc->log_leaf, src_item) == 0) {
+ const u64 ino_size = btrfs_inode_size(wc->log_leaf, src_item);
/*
* For regular files an ino_size == 0 is used only when
@@ -553,21 +651,21 @@ insert:
* case don't set the size of the inode in the fs/subvol
* tree, otherwise we would be throwing valid data away.
*/
- if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
+ if (S_ISREG(btrfs_inode_mode(wc->log_leaf, src_item)) &&
S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
ino_size != 0)
btrfs_set_inode_size(dst_eb, dst_item, ino_size);
goto no_copy;
}
- if (S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
+ if (S_ISDIR(btrfs_inode_mode(wc->log_leaf, src_item)) &&
S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) {
save_old_i_size = 1;
saved_i_size = btrfs_inode_size(dst_eb, dst_item);
}
}
- copy_extent_buffer(dst_eb, eb, dst_ptr, src_ptr, item_size);
+ copy_extent_buffer(dst_eb, wc->log_leaf, dst_ptr, src_ptr, item_size);
if (save_old_i_size) {
struct btrfs_inode_item *dst_item;
@@ -577,7 +675,7 @@ insert:
}
/* make sure the generation is filled in */
- if (key->type == BTRFS_INODE_ITEM_KEY) {
+ if (is_inode_item) {
struct btrfs_inode_item *dst_item;
dst_item = (struct btrfs_inode_item *)dst_ptr;
@@ -585,7 +683,7 @@ insert:
btrfs_set_inode_generation(dst_eb, dst_item, trans->transid);
}
no_copy:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
@@ -604,21 +702,6 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
return 0;
}
-/*
- * simple helper to read an inode off the disk from a given root
- * This can only be called for subvolume roots and not for the log
- */
-static noinline struct btrfs_inode *read_one_inode(struct btrfs_root *root,
- u64 objectid)
-{
- struct btrfs_inode *inode;
-
- inode = btrfs_iget_logging(objectid, root);
- if (IS_ERR(inode))
- return NULL;
- return inode;
-}
-
/* replays a single extent in 'eb' at 'slot' with 'key' into the
* subvolume 'root'. path is released on entry and should be released
* on exit.
@@ -631,51 +714,53 @@ static noinline struct btrfs_inode *read_one_inode(struct btrfs_root *root,
* The extent is inserted into the file, dropping any existing extents
* from the file that overlap the new one.
*/
-static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static noinline int replay_one_extent(struct walk_control *wc)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_fs_info *fs_info = root->fs_info;
int found_type;
u64 extent_end;
- u64 start = key->offset;
+ const u64 start = wc->log_key.offset;
u64 nbytes = 0;
+ u64 csum_start;
+ u64 csum_end;
+ LIST_HEAD(ordered_sums);
+ u64 offset;
+ unsigned long dest_offset;
+ struct btrfs_key ins;
struct btrfs_file_extent_item *item;
struct btrfs_inode *inode = NULL;
- unsigned long size;
int ret = 0;
- item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
- found_type = btrfs_file_extent_type(eb, item);
+ item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(wc->log_leaf, item);
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
- nbytes = btrfs_file_extent_num_bytes(eb, item);
- extent_end = start + nbytes;
-
- /*
- * We don't add to the inodes nbytes if we are prealloc or a
- * hole.
- */
- if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
- nbytes = 0;
+ extent_end = start + btrfs_file_extent_num_bytes(wc->log_leaf, item);
+ /* Holes don't take up space. */
+ if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) != 0)
+ nbytes = btrfs_file_extent_num_bytes(wc->log_leaf, item);
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- size = btrfs_file_extent_ram_bytes(eb, item);
- nbytes = btrfs_file_extent_ram_bytes(eb, item);
- extent_end = ALIGN(start + size,
- fs_info->sectorsize);
+ nbytes = btrfs_file_extent_ram_bytes(wc->log_leaf, item);
+ extent_end = ALIGN(start + nbytes, fs_info->sectorsize);
} else {
- ret = 0;
- goto out;
+ btrfs_abort_log_replay(wc, -EUCLEAN,
+ "unexpected extent type=%d root=%llu inode=%llu offset=%llu",
+ found_type, btrfs_root_id(root),
+ wc->log_key.objectid, wc->log_key.offset);
+ return -EUCLEAN;
}
- inode = read_one_inode(root, key->objectid);
- if (!inode) {
- ret = -EIO;
- goto out;
+ inode = btrfs_iget_logging(wc->log_key.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get inode %llu for root %llu",
+ wc->log_key.objectid, btrfs_root_id(root));
+ return ret;
}
/*
@@ -683,240 +768,300 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* file. This must be done before the btrfs_drop_extents run
* so we don't try to drop this extent.
*/
- ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0);
+ ret = btrfs_lookup_file_extent(trans, root, wc->subvol_path,
+ btrfs_ino(inode), start, 0);
if (ret == 0 &&
(found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
struct btrfs_file_extent_item existing;
unsigned long ptr;
- ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
- read_extent_buffer(path->nodes[0], &existing, ptr, sizeof(existing));
+ ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
+ read_extent_buffer(leaf, &existing, ptr, sizeof(existing));
/*
* we already have a pointer to this exact extent,
* we don't have to do anything
*/
- if (memcmp_extent_buffer(eb, &existing, (unsigned long)item,
+ if (memcmp_extent_buffer(wc->log_leaf, &existing, (unsigned long)item,
sizeof(existing)) == 0) {
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
goto out;
}
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* drop any overlapping extents */
drop_args.start = start;
drop_args.end = extent_end;
drop_args.drop_cache = true;
+ drop_args.path = wc->subvol_path;
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to drop extents for inode %llu range [%llu, %llu) root %llu",
+ wc->log_key.objectid, start, extent_end,
+ btrfs_root_id(root));
goto out;
+ }
- if (found_type == BTRFS_FILE_EXTENT_REG ||
- found_type == BTRFS_FILE_EXTENT_PREALLOC) {
- u64 offset;
- unsigned long dest_offset;
- struct btrfs_key ins;
-
- if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
- btrfs_fs_incompat(fs_info, NO_HOLES))
- goto update_inode;
-
- ret = btrfs_insert_empty_item(trans, root, path, key,
- sizeof(*item));
+ if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ /* inline extents are easy, we just overwrite them */
+ ret = overwrite_item(wc);
if (ret)
goto out;
- dest_offset = btrfs_item_ptr_offset(path->nodes[0],
- path->slots[0]);
- copy_extent_buffer(path->nodes[0], eb, dest_offset,
- (unsigned long)item, sizeof(*item));
+ goto update_inode;
+ }
- ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
- ins.type = BTRFS_EXTENT_ITEM_KEY;
- ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
- offset = key->offset - btrfs_file_extent_offset(eb, item);
+ /*
+ * If not an inline extent, it can only be a regular or prealloc one.
+ * We have checked that above and returned -EUCLEAN if not.
+ */
- /*
- * Manually record dirty extent, as here we did a shallow
- * file extent item copy and skip normal backref update,
- * but modifying extent tree all by ourselves.
- * So need to manually record dirty extent for qgroup,
- * as the owner of the file extent changed from log tree
- * (doesn't affect qgroup) to fs/file tree(affects qgroup)
- */
- ret = btrfs_qgroup_trace_extent(trans,
- btrfs_file_extent_disk_bytenr(eb, item),
- btrfs_file_extent_disk_num_bytes(eb, item));
- if (ret < 0)
- goto out;
+	/* A hole with the NO_HOLES feature enabled, nothing else to do. */
+ if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0 &&
+ btrfs_fs_incompat(fs_info, NO_HOLES))
+ goto update_inode;
- if (ins.objectid > 0) {
- u64 csum_start;
- u64 csum_end;
- LIST_HEAD(ordered_sums);
+ ret = btrfs_insert_empty_item(trans, root, wc->subvol_path,
+ &wc->log_key, sizeof(*item));
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert item with key " BTRFS_KEY_FMT " root %llu",
+ BTRFS_KEY_FMT_VALUE(&wc->log_key),
+ btrfs_root_id(root));
+ goto out;
+ }
+ dest_offset = btrfs_item_ptr_offset(wc->subvol_path->nodes[0],
+ wc->subvol_path->slots[0]);
+ copy_extent_buffer(wc->subvol_path->nodes[0], wc->log_leaf, dest_offset,
+ (unsigned long)item, sizeof(*item));
- /*
- * is this extent already allocated in the extent
- * allocation tree? If so, just add a reference
- */
- ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
- ins.offset);
- if (ret < 0) {
- goto out;
- } else if (ret == 0) {
- struct btrfs_ref ref = {
- .action = BTRFS_ADD_DELAYED_REF,
- .bytenr = ins.objectid,
- .num_bytes = ins.offset,
- .owning_root = btrfs_root_id(root),
- .ref_root = btrfs_root_id(root),
- };
- btrfs_init_data_ref(&ref, key->objectid, offset,
- 0, false);
- ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret)
- goto out;
- } else {
- /*
- * insert the extent pointer in the extent
- * allocation tree
- */
- ret = btrfs_alloc_logged_file_extent(trans,
- btrfs_root_id(root),
- key->objectid, offset, &ins);
- if (ret)
- goto out;
- }
- btrfs_release_path(path);
+ /*
+ * We have an explicit hole and NO_HOLES is not enabled. We have added
+ * the hole file extent item to the subvolume tree, so we don't have
+ * anything else to do other than update the file extent item range and
+ * update the inode item.
+ */
+ if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0) {
+ btrfs_release_path(wc->subvol_path);
+ goto update_inode;
+ }
- if (btrfs_file_extent_compression(eb, item)) {
- csum_start = ins.objectid;
- csum_end = csum_start + ins.offset;
- } else {
- csum_start = ins.objectid +
- btrfs_file_extent_offset(eb, item);
- csum_end = csum_start +
- btrfs_file_extent_num_bytes(eb, item);
- }
+ ins.objectid = btrfs_file_extent_disk_bytenr(wc->log_leaf, item);
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ins.offset = btrfs_file_extent_disk_num_bytes(wc->log_leaf, item);
+ offset = wc->log_key.offset - btrfs_file_extent_offset(wc->log_leaf, item);
- ret = btrfs_lookup_csums_list(root->log_root,
- csum_start, csum_end - 1,
- &ordered_sums, false);
- if (ret < 0)
- goto out;
- ret = 0;
- /*
- * Now delete all existing cums in the csum root that
- * cover our range. We do this because we can have an
- * extent that is completely referenced by one file
- * extent item and partially referenced by another
- * file extent item (like after using the clone or
- * extent_same ioctls). In this case if we end up doing
- * the replay of the one that partially references the
- * extent first, and we do not do the csum deletion
- * below, we can get 2 csum items in the csum tree that
- * overlap each other. For example, imagine our log has
- * the two following file extent items:
- *
- * key (257 EXTENT_DATA 409600)
- * extent data disk byte 12845056 nr 102400
- * extent data offset 20480 nr 20480 ram 102400
- *
- * key (257 EXTENT_DATA 819200)
- * extent data disk byte 12845056 nr 102400
- * extent data offset 0 nr 102400 ram 102400
- *
- * Where the second one fully references the 100K extent
- * that starts at disk byte 12845056, and the log tree
- * has a single csum item that covers the entire range
- * of the extent:
- *
- * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
- *
- * After the first file extent item is replayed, the
- * csum tree gets the following csum item:
- *
- * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
- *
- * Which covers the 20K sub-range starting at offset 20K
- * of our extent. Now when we replay the second file
- * extent item, if we do not delete existing csum items
- * that cover any of its blocks, we end up getting two
- * csum items in our csum tree that overlap each other:
- *
- * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
- * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
- *
- * Which is a problem, because after this anyone trying
- * to lookup up for the checksum of any block of our
- * extent starting at an offset of 40K or higher, will
- * end up looking at the second csum item only, which
- * does not contain the checksum for any block starting
- * at offset 40K or higher of our extent.
- */
- while (!list_empty(&ordered_sums)) {
- struct btrfs_ordered_sum *sums;
- struct btrfs_root *csum_root;
-
- sums = list_entry(ordered_sums.next,
- struct btrfs_ordered_sum,
- list);
- csum_root = btrfs_csum_root(fs_info,
- sums->logical);
- if (!ret)
- ret = btrfs_del_csums(trans, csum_root,
- sums->logical,
- sums->len);
- if (!ret)
- ret = btrfs_csum_file_blocks(trans,
- csum_root,
- sums);
- list_del(&sums->list);
- kfree(sums);
- }
- if (ret)
- goto out;
- } else {
- btrfs_release_path(path);
+ /*
+	 * Manually record the dirty extent, as here we did a shallow file
+	 * extent item copy and skipped the normal backref update while
+	 * modifying the extent tree all by ourselves. So we need to manually
+	 * record the dirty extent for qgroup, as the owner of the file extent
+	 * changed from the log tree (doesn't affect qgroup) to the fs/file
+	 * tree (affects qgroup).
+ */
+ ret = btrfs_qgroup_trace_extent(trans, ins.objectid, ins.offset);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to trace extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
+ ins.objectid, ins.offset,
+ wc->log_key.objectid, btrfs_root_id(root));
+ goto out;
+ }
+
+ /*
+ * Is this extent already allocated in the extent tree?
+ * If so, just add a reference.
+ */
+ ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to lookup data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
+ ins.objectid, ins.offset,
+ wc->log_key.objectid, btrfs_root_id(root));
+ goto out;
+ } else if (ret == 0) {
+ struct btrfs_ref ref = {
+ .action = BTRFS_ADD_DELAYED_REF,
+ .bytenr = ins.objectid,
+ .num_bytes = ins.offset,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
+
+ btrfs_init_data_ref(&ref, wc->log_key.objectid, offset, 0, false);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to increment data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
+ ins.objectid, ins.offset,
+ wc->log_key.objectid,
+ btrfs_root_id(root));
+ goto out;
}
- } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- /* inline extents are easy, we just overwrite them */
- ret = overwrite_item(trans, root, path, eb, slot, key);
- if (ret)
+ } else {
+ /* Insert the extent pointer in the extent tree. */
+ ret = btrfs_alloc_logged_file_extent(trans, btrfs_root_id(root),
+ wc->log_key.objectid, offset, &ins);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to allocate logged data extent for bytenr %llu disk_num_bytes %llu offset %llu inode %llu root %llu",
+ ins.objectid, ins.offset, offset,
+ wc->log_key.objectid, btrfs_root_id(root));
goto out;
+ }
}
- ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start);
+ btrfs_release_path(wc->subvol_path);
+
+ if (btrfs_file_extent_compression(wc->log_leaf, item)) {
+ csum_start = ins.objectid;
+ csum_end = csum_start + ins.offset;
+ } else {
+ csum_start = ins.objectid + btrfs_file_extent_offset(wc->log_leaf, item);
+ csum_end = csum_start + btrfs_file_extent_num_bytes(wc->log_leaf, item);
+ }
+
+ ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1,
+ &ordered_sums, false);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+				       "failed to lookup csums for range [%llu, %llu) inode %llu root %llu",
+ csum_start, csum_end, wc->log_key.objectid,
+ btrfs_root_id(root));
+ goto out;
+ }
+ ret = 0;
+ /*
+	 * Now delete all existing csums in the csum root that cover our range.
+ * We do this because we can have an extent that is completely
+ * referenced by one file extent item and partially referenced by
+ * another file extent item (like after using the clone or extent_same
+ * ioctls). In this case if we end up doing the replay of the one that
+ * partially references the extent first, and we do not do the csum
+ * deletion below, we can get 2 csum items in the csum tree that overlap
+ * each other. For example, imagine our log has the two following file
+ * extent items:
+ *
+ * key (257 EXTENT_DATA 409600)
+ * extent data disk byte 12845056 nr 102400
+ * extent data offset 20480 nr 20480 ram 102400
+ *
+ * key (257 EXTENT_DATA 819200)
+ * extent data disk byte 12845056 nr 102400
+ * extent data offset 0 nr 102400 ram 102400
+ *
+ * Where the second one fully references the 100K extent that starts at
+ * disk byte 12845056, and the log tree has a single csum item that
+ * covers the entire range of the extent:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+ *
+ * After the first file extent item is replayed, the csum tree gets the
+ * following csum item:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+ *
+ * Which covers the 20K sub-range starting at offset 20K of our extent.
+ * Now when we replay the second file extent item, if we do not delete
+ * existing csum items that cover any of its blocks, we end up getting
+ * two csum items in our csum tree that overlap each other:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+ * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+ *
+	 * Which is a problem, because after this anyone trying to look up
+ * the checksum of any block of our extent starting at an offset of 40K
+ * or higher, will end up looking at the second csum item only, which
+ * does not contain the checksum for any block starting at offset 40K or
+ * higher of our extent.
+ */
+ while (!list_empty(&ordered_sums)) {
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_root *csum_root;
+
+ sums = list_first_entry(&ordered_sums, struct btrfs_ordered_sum, list);
+ csum_root = btrfs_csum_root(fs_info, sums->logical);
+ if (!ret) {
+ ret = btrfs_del_csums(trans, csum_root, sums->logical,
+ sums->len);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete csums for range [%llu, %llu) inode %llu root %llu",
+ sums->logical,
+ sums->logical + sums->len,
+ wc->log_key.objectid,
+ btrfs_root_id(root));
+ }
+ if (!ret) {
+ ret = btrfs_csum_file_blocks(trans, csum_root, sums);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to add csums for range [%llu, %llu) inode %llu root %llu",
+ sums->logical,
+ sums->logical + sums->len,
+ wc->log_key.objectid,
+ btrfs_root_id(root));
+ }
+ list_del(&sums->list);
+ kfree(sums);
+ }
if (ret)
goto out;
update_inode:
+ ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to set file extent range [%llu, %llu) inode %llu root %llu",
+ start, extent_end, wc->log_key.objectid,
+ btrfs_root_id(root));
+ goto out;
+ }
+
btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found);
ret = btrfs_update_inode(trans, inode);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ wc->log_key.objectid, btrfs_root_id(root));
out:
iput(&inode->vfs_inode);
return ret;
}
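
The overlap scenario described in the long comment above is pure arithmetic, and checking the numbers makes the deletion requirement concrete. Below is a minimal standalone sketch (plain userspace C, not btrfs code) that reproduces the ranges from the example; note that 12845056 + 20480 = 12865536, the exact key the comment shows for the narrow csum item.

	#include <stdio.h>

	int main(void)
	{
		/* The 100K extent at disk byte 12845056 from the comment above. */
		const unsigned long long disk_bytenr = 12845056ULL;
		const unsigned long long disk_num_bytes = 102400ULL;

		/* Csum range for the first (partial) file extent item:
		 * extent data offset 20480 nr 20480, not compressed. */
		const unsigned long long csum1_start = disk_bytenr + 20480;
		const unsigned long long csum1_end = csum1_start + 20480;

		/* Csum range for the second (full) file extent item:
		 * extent data offset 0 nr 102400. */
		const unsigned long long csum2_start = disk_bytenr;
		const unsigned long long csum2_end = csum2_start + disk_num_bytes;

		printf("csum item 1: [%llu, %llu)\n", csum1_start, csum1_end);
		printf("csum item 2: [%llu, %llu)\n", csum2_start, csum2_end);

		/* Without the deletion pass both items coexist and overlap, so
		 * a lookup at extent offset 40K or higher can land on the
		 * narrow item, which lacks those checksums. */
		if (csum1_start < csum2_end && csum2_start < csum1_end)
			printf("overlap: delete old csums before inserting new ones\n");
		return 0;
	}
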
-static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
+static int unlink_inode_for_log_replay(struct walk_control *wc,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
const struct fscrypt_str *name)
{
+ struct btrfs_trans_handle *trans = wc->trans;
int ret;
ret = btrfs_unlink_inode(trans, dir, inode, name);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to unlink inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir), name->len,
+ name->name, btrfs_root_id(inode->root));
return ret;
+ }
/*
* Whenever we need to check if a name exists or not, we check the
* fs/subvolume tree. So after an unlink we must run delayed items, so
* that future checks for a name during log replay see that the name
	 * does not exist anymore.
*/
- return btrfs_run_delayed_items(trans);
+ ret = btrfs_run_delayed_items(trans);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+"failed to run delayed items current inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir), name->len,
+ name->name, btrfs_root_id(inode->root));
+
+ return ret;
}
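
Most of the churn in this hunk and the ones that follow is a single mechanical conversion: helpers that used to take trans, root, path, eb, slot and key now receive one struct walk_control. A minimal standalone sketch of that parameter-object pattern, with illustrative field names rather than the real btrfs struct:

	#include <stdio.h>

	/* Illustrative context only; the real struct walk_control also
	 * carries the transaction, roots, paths and the current log
	 * leaf/slot/key. */
	struct walk_ctx {
		int log_slot;
		unsigned long long log_key_objectid;
	};

	static int replay_item(struct walk_ctx *wc)
	{
		/* Callees read shared replay state from the context instead
		 * of six positional arguments. */
		printf("replaying slot %d for inode %llu\n",
		       wc->log_slot, wc->log_key_objectid);
		return 0;
	}

	int main(void)
	{
		struct walk_ctx wc = { .log_slot = 0, .log_key_objectid = 257 };

		/* Adding new replay state later touches the struct, not every
		 * function signature in the call chain. */
		return replay_item(&wc);
	}
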
/*
@@ -927,41 +1072,48 @@ static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
* This is a helper function to do the unlink of a specific directory
* item
*/
-static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
+static noinline int drop_one_dir_item(struct walk_control *wc,
struct btrfs_inode *dir,
struct btrfs_dir_item *di)
{
struct btrfs_root *root = dir->root;
struct btrfs_inode *inode;
struct fscrypt_str name;
- struct extent_buffer *leaf;
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
struct btrfs_key location;
int ret;
- leaf = path->nodes[0];
-
btrfs_dir_item_key_to_cpu(leaf, di, &location);
ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
- if (ret)
- return -ENOMEM;
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for dir %llu root %llu",
+ btrfs_ino(dir), btrfs_root_id(root));
+ return ret;
+ }
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
- inode = read_one_inode(root, location.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(location.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to open inode %llu parent dir %llu name %.*s root %llu",
+ location.objectid, btrfs_ino(dir),
+ name.len, name.name, btrfs_root_id(root));
+ inode = NULL;
goto out;
}
- ret = link_to_fixup_dir(trans, root, path, location.objectid);
+ ret = link_to_fixup_dir(wc, location.objectid);
if (ret)
goto out;
- ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
out:
kfree(name.name);
- iput(&inode->vfs_inode);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
@@ -1024,7 +1176,7 @@ static noinline int backref_in_log(struct btrfs_root *log,
u64 ref_objectid,
const struct fscrypt_str *name)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
path = btrfs_alloc_path();
@@ -1032,12 +1184,10 @@ static noinline int backref_in_log(struct btrfs_root *log,
return -ENOMEM;
ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
- if (ret < 0) {
- goto out;
- } else if (ret == 1) {
- ret = 0;
- goto out;
- }
+ if (ret < 0)
+ return ret;
+ if (ret == 1)
+ return 0;
if (key->type == BTRFS_INODE_EXTREF_KEY)
ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
@@ -1046,172 +1196,224 @@ static noinline int backref_in_log(struct btrfs_root *log,
else
ret = !!btrfs_find_name_in_backref(path->nodes[0],
path->slots[0], name);
-out:
- btrfs_free_path(path);
return ret;
}
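
The backref_in_log() cleanup above relies on BTRFS_PATH_AUTO_FREE, which frees the path automatically when it goes out of scope and lets the out: label disappear. A standalone sketch of the underlying GCC/Clang cleanup-attribute pattern (the helper names here are illustrative, not the kernel's):

	#include <stdio.h>
	#include <stdlib.h>

	static void free_bufp(char **p)
	{
		free(*p);
		printf("buffer freed on scope exit\n");
	}

	static int search(int key)
	{
		char *buf __attribute__((cleanup(free_bufp))) = calloc(1, 64);

		if (!buf)
			return -1;
		/* Early returns need no goto-out ladder: the cleanup handler
		 * runs on every exit path from this scope. */
		if (key < 0)
			return -1;
		return 0;
	}

	int main(void)
	{
		return search(42) ? 1 : 0;
	}
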
-static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_root *log_root,
+static int unlink_refs_not_in_log(struct walk_control *wc,
+ struct btrfs_key *search_key,
struct btrfs_inode *dir,
- struct btrfs_inode *inode,
- u64 inode_objectid, u64 parent_objectid,
- u64 ref_index, struct fscrypt_str *name)
+ struct btrfs_inode *inode)
{
- int ret;
- struct extent_buffer *leaf;
- struct btrfs_dir_item *di;
- struct btrfs_key search_key;
- struct btrfs_inode_extref *extref;
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
+ unsigned long ptr;
+ unsigned long ptr_end;
-again:
- /* Search old style refs */
- search_key.objectid = inode_objectid;
- search_key.type = BTRFS_INODE_REF_KEY;
- search_key.offset = parent_objectid;
- ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
- if (ret == 0) {
+ /*
+ * Check all the names in this back reference to see if they are in the
+	 * log. If so, we allow them to stay; otherwise they must be unlinked as
+ * a conflict.
+ */
+ ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
+ ptr_end = ptr + btrfs_item_size(leaf, wc->subvol_path->slots[0]);
+ while (ptr < ptr_end) {
+ struct fscrypt_str victim_name;
struct btrfs_inode_ref *victim_ref;
- unsigned long ptr;
- unsigned long ptr_end;
-
- leaf = path->nodes[0];
-
- /* are we trying to overwrite a back ref for the root directory
- * if so, just jump out, we're done
- */
- if (search_key.objectid == search_key.offset)
- return 1;
-
- /* check all the names in this back reference to see
- * if they are in the log. if so, we allow them to stay
- * otherwise they must be unlinked as a conflict
- */
- ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
- ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
- while (ptr < ptr_end) {
- struct fscrypt_str victim_name;
+ int ret;
- victim_ref = (struct btrfs_inode_ref *)ptr;
- ret = read_alloc_one_name(leaf, (victim_ref + 1),
- btrfs_inode_ref_name_len(leaf, victim_ref),
- &victim_name);
- if (ret)
- return ret;
+ victim_ref = (struct btrfs_inode_ref *)ptr;
+ ret = read_alloc_one_name(leaf, (victim_ref + 1),
+ btrfs_inode_ref_name_len(leaf, victim_ref),
+ &victim_name);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for inode %llu parent dir %llu root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ btrfs_root_id(inode->root));
+ return ret;
+ }
- ret = backref_in_log(log_root, &search_key,
- parent_objectid, &victim_name);
+ ret = backref_in_log(wc->log, search_key, btrfs_ino(dir), &victim_name);
+ if (ret) {
if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ victim_name.len, victim_name.name,
+ btrfs_root_id(inode->root));
kfree(victim_name.name);
return ret;
- } else if (!ret) {
- inc_nlink(&inode->vfs_inode);
- btrfs_release_path(path);
-
- ret = unlink_inode_for_log_replay(trans, dir, inode,
- &victim_name);
- kfree(victim_name.name);
- if (ret)
- return ret;
- goto again;
}
kfree(victim_name.name);
-
ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
+ continue;
}
- }
- btrfs_release_path(path);
- /* Same search but for extended refs */
- extref = btrfs_lookup_inode_extref(NULL, root, path, name,
- inode_objectid, parent_objectid, 0,
- 0);
- if (IS_ERR(extref)) {
- return PTR_ERR(extref);
- } else if (extref) {
- u32 item_size;
- u32 cur_offset = 0;
- unsigned long base;
- struct btrfs_inode *victim_parent;
+ inc_nlink(&inode->vfs_inode);
+ btrfs_release_path(wc->subvol_path);
- leaf = path->nodes[0];
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name);
+ kfree(victim_name.name);
+ if (ret)
+ return ret;
+ return -EAGAIN;
+ }
- item_size = btrfs_item_size(leaf, path->slots[0]);
- base = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ return 0;
+}
- while (cur_offset < item_size) {
- struct fscrypt_str victim_name;
+static int unlink_extrefs_not_in_log(struct walk_control *wc,
+ struct btrfs_key *search_key,
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode)
+{
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
+ const unsigned long base = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
+ const u32 item_size = btrfs_item_size(leaf, wc->subvol_path->slots[0]);
+ u32 cur_offset = 0;
- extref = (struct btrfs_inode_extref *)(base + cur_offset);
+ while (cur_offset < item_size) {
+ struct btrfs_root *log_root = wc->log;
+ struct btrfs_inode_extref *extref;
+ struct fscrypt_str victim_name;
+ int ret;
- if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
- goto next;
+ extref = (struct btrfs_inode_extref *)(base + cur_offset);
+ victim_name.len = btrfs_inode_extref_name_len(leaf, extref);
- ret = read_alloc_one_name(leaf, &extref->name,
- btrfs_inode_extref_name_len(leaf, extref),
- &victim_name);
- if (ret)
- return ret;
+ if (btrfs_inode_extref_parent(leaf, extref) != btrfs_ino(dir))
+ goto next;
+
+ ret = read_alloc_one_name(leaf, &extref->name, victim_name.len,
+ &victim_name);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for inode %llu parent dir %llu root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ btrfs_root_id(inode->root));
+ return ret;
+ }
- search_key.objectid = inode_objectid;
- search_key.type = BTRFS_INODE_EXTREF_KEY;
- search_key.offset = btrfs_extref_hash(parent_objectid,
- victim_name.name,
- victim_name.len);
- ret = backref_in_log(log_root, &search_key,
- parent_objectid, &victim_name);
+ search_key->objectid = btrfs_ino(inode);
+ search_key->type = BTRFS_INODE_EXTREF_KEY;
+ search_key->offset = btrfs_extref_hash(btrfs_ino(dir),
+ victim_name.name,
+ victim_name.len);
+ ret = backref_in_log(log_root, search_key, btrfs_ino(dir), &victim_name);
+ if (ret) {
if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ victim_name.len, victim_name.name,
+ btrfs_root_id(inode->root));
kfree(victim_name.name);
return ret;
- } else if (!ret) {
- ret = -ENOENT;
- victim_parent = read_one_inode(root,
- parent_objectid);
- if (victim_parent) {
- inc_nlink(&inode->vfs_inode);
- btrfs_release_path(path);
-
- ret = unlink_inode_for_log_replay(trans,
- victim_parent,
- inode, &victim_name);
- }
- iput(&victim_parent->vfs_inode);
- kfree(victim_name.name);
- if (ret)
- return ret;
- goto again;
}
kfree(victim_name.name);
next:
cur_offset += victim_name.len + sizeof(*extref);
+ continue;
}
+
+ inc_nlink(&inode->vfs_inode);
+ btrfs_release_path(wc->subvol_path);
+
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name);
+ kfree(victim_name.name);
+ if (ret)
+ return ret;
+ return -EAGAIN;
}
- btrfs_release_path(path);
+
+ return 0;
+}
+
+static inline int __add_inode_ref(struct walk_control *wc,
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode,
+ u64 ref_index, struct fscrypt_str *name)
+{
+ int ret;
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
+ struct btrfs_dir_item *di;
+ struct btrfs_key search_key;
+ struct btrfs_inode_extref *extref;
+
+again:
+ /* Search old style refs */
+ search_key.objectid = btrfs_ino(inode);
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = btrfs_ino(dir);
+ ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu",
+ BTRFS_KEY_FMT_VALUE(&search_key),
+ btrfs_root_id(root));
+ return ret;
+ } else if (ret == 0) {
+ /*
+ * Are we trying to overwrite a back ref for the root directory?
+ * If so, we're done.
+ */
+ if (search_key.objectid == search_key.offset)
+ return 1;
+
+ ret = unlink_refs_not_in_log(wc, &search_key, dir, inode);
+ if (ret == -EAGAIN)
+ goto again;
+ else if (ret)
+ return ret;
+ }
+ btrfs_release_path(wc->subvol_path);
+
+ /* Same search but for extended refs */
+ extref = btrfs_lookup_inode_extref(root, wc->subvol_path, name,
+ btrfs_ino(inode), btrfs_ino(dir));
+ if (IS_ERR(extref)) {
+ return PTR_ERR(extref);
+ } else if (extref) {
+ ret = unlink_extrefs_not_in_log(wc, &search_key, dir, inode);
+ if (ret == -EAGAIN)
+ goto again;
+ else if (ret)
+ return ret;
+ }
+ btrfs_release_path(wc->subvol_path);
/* look for a conflicting sequence number */
- di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
+ di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path, btrfs_ino(dir),
ref_index, name, 0);
if (IS_ERR(di)) {
- return PTR_ERR(di);
+ ret = PTR_ERR(di);
+ btrfs_abort_log_replay(wc, ret,
+"failed to lookup dir index item for dir %llu ref_index %llu name %.*s root %llu",
+ btrfs_ino(dir), ref_index, name->len,
+ name->name, btrfs_root_id(root));
+ return ret;
} else if (di) {
- ret = drop_one_dir_item(trans, path, dir, di);
+ ret = drop_one_dir_item(wc, dir, di);
if (ret)
return ret;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* look for a conflicting name */
- di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0);
+ di = btrfs_lookup_dir_item(trans, root, wc->subvol_path, btrfs_ino(dir), name, 0);
if (IS_ERR(di)) {
- return PTR_ERR(di);
+ ret = PTR_ERR(di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir item for dir %llu name %.*s root %llu",
+ btrfs_ino(dir), name->len, name->name,
+ btrfs_root_id(root));
+ return ret;
} else if (di) {
- ret = drop_one_dir_item(trans, path, dir, di);
+ ret = drop_one_dir_item(wc, dir, di);
if (ret)
return ret;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
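
The two unlink helpers return -EAGAIN once they have unlinked a conflicting name, because the unlink released the path and mutated the tree, and __add_inode_ref() responds by restarting the whole search (goto again). A standalone sketch of that retry contract, using a stand-in helper rather than the btrfs functions:

	#include <errno.h>
	#include <stdio.h>

	static int conflicts = 3;	/* stand-in for conflicting names on disk */

	static int unlink_one_conflict(void)
	{
		if (conflicts > 0) {
			conflicts--;
			/* The tree changed and the path was released: the
			 * caller must not keep iterating over a stale leaf. */
			return -EAGAIN;
		}
		return 0;
	}

	int main(void)
	{
		int ret;

	again:
		ret = unlink_one_conflict();
		if (ret == -EAGAIN)
			goto again;

		printf("all conflicts resolved, ret=%d\n", ret);
		return ret;
	}
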
@@ -1264,63 +1466,79 @@ static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
* proper unlink of that name (that is, remove its entry from the inode
* reference item and both dir index keys).
*/
-static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_inode *inode,
- struct extent_buffer *log_eb,
- int log_slot,
- struct btrfs_key *key)
+static int unlink_old_inode_refs(struct walk_control *wc, struct btrfs_inode *inode)
{
+ struct btrfs_root *root = wc->root;
int ret;
unsigned long ref_ptr;
unsigned long ref_end;
struct extent_buffer *eb;
again:
- btrfs_release_path(path);
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ btrfs_release_path(wc->subvol_path);
+ ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0);
if (ret > 0) {
ret = 0;
goto out;
}
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu",
+ BTRFS_KEY_FMT_VALUE(&wc->log_key),
+ btrfs_root_id(root));
goto out;
+ }
- eb = path->nodes[0];
- ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
- ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
+ eb = wc->subvol_path->nodes[0];
+ ref_ptr = btrfs_item_ptr_offset(eb, wc->subvol_path->slots[0]);
+ ref_end = ref_ptr + btrfs_item_size(eb, wc->subvol_path->slots[0]);
while (ref_ptr < ref_end) {
struct fscrypt_str name;
u64 parent_id;
- if (key->type == BTRFS_INODE_EXTREF_KEY) {
+ if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) {
ret = extref_get_fields(eb, ref_ptr, &name,
NULL, &parent_id);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get extref details for inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
+ goto out;
+ }
} else {
- parent_id = key->offset;
+ parent_id = wc->log_key.offset;
ret = ref_get_fields(eb, ref_ptr, &name, NULL);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get ref details for inode %llu parent_id %llu root %llu",
+ btrfs_ino(inode), parent_id,
+ btrfs_root_id(root));
+ goto out;
+ }
}
- if (ret)
- goto out;
- if (key->type == BTRFS_INODE_EXTREF_KEY)
- ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
+ if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY)
+ ret = !!btrfs_find_name_in_ext_backref(wc->log_leaf, wc->log_slot,
parent_id, &name);
else
- ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);
+ ret = !!btrfs_find_name_in_backref(wc->log_leaf, wc->log_slot,
+ &name);
if (!ret) {
struct btrfs_inode *dir;
- btrfs_release_path(path);
- dir = read_one_inode(root, parent_id);
- if (!dir) {
- ret = -ENOENT;
+ btrfs_release_path(wc->subvol_path);
+ dir = btrfs_iget_logging(parent_id, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
kfree(name.name);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ parent_id, btrfs_root_id(root));
goto out;
}
- ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
kfree(name.name);
iput(&dir->vfs_inode);
if (ret)
@@ -1330,57 +1548,51 @@ again:
kfree(name.name);
ref_ptr += name.len;
- if (key->type == BTRFS_INODE_EXTREF_KEY)
+ if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY)
ref_ptr += sizeof(struct btrfs_inode_extref);
else
ref_ptr += sizeof(struct btrfs_inode_ref);
}
ret = 0;
out:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
/*
- * replay one inode back reference item found in the log tree.
- * eb, slot and key refer to the buffer and key found in the log tree.
- * root is the destination we are replaying into, and path is for temp
- * use by this function. (it should be released on return).
+ * Replay one inode back reference item found in the log tree.
+ * Path is for temporary use by this function (it should be released on return).
*/
-static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static noinline int add_inode_ref(struct walk_control *wc)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct btrfs_inode *dir = NULL;
struct btrfs_inode *inode = NULL;
unsigned long ref_ptr;
unsigned long ref_end;
struct fscrypt_str name = { 0 };
int ret;
- int log_ref_ver = 0;
+ const bool is_extref_item = (wc->log_key.type == BTRFS_INODE_EXTREF_KEY);
u64 parent_objectid;
u64 inode_objectid;
u64 ref_index = 0;
int ref_struct_size;
- ref_ptr = btrfs_item_ptr_offset(eb, slot);
- ref_end = ref_ptr + btrfs_item_size(eb, slot);
+ ref_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);
+ ref_end = ref_ptr + btrfs_item_size(wc->log_leaf, wc->log_slot);
- if (key->type == BTRFS_INODE_EXTREF_KEY) {
+ if (is_extref_item) {
struct btrfs_inode_extref *r;
ref_struct_size = sizeof(struct btrfs_inode_extref);
- log_ref_ver = 1;
r = (struct btrfs_inode_extref *)ref_ptr;
- parent_objectid = btrfs_inode_extref_parent(eb, r);
+ parent_objectid = btrfs_inode_extref_parent(wc->log_leaf, r);
} else {
ref_struct_size = sizeof(struct btrfs_inode_ref);
- parent_objectid = key->offset;
+ parent_objectid = wc->log_key.offset;
}
- inode_objectid = key->objectid;
+ inode_objectid = wc->log_key.objectid;
/*
* it is possible that we didn't log all the parent directories
@@ -1388,41 +1600,93 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* copy the back ref in. The link count fixup code will take
* care of the rest
*/
- dir = read_one_inode(root, parent_objectid);
- if (!dir) {
- ret = -ENOENT;
+ dir = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ if (ret == -ENOENT)
+ ret = 0;
+ else
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ parent_objectid, btrfs_root_id(root));
+ dir = NULL;
goto out;
}
- inode = read_one_inode(root, inode_objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(inode_objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ inode_objectid, btrfs_root_id(root));
+ inode = NULL;
goto out;
}
while (ref_ptr < ref_end) {
- if (log_ref_ver) {
- ret = extref_get_fields(eb, ref_ptr, &name,
+ if (is_extref_item) {
+ ret = extref_get_fields(wc->log_leaf, ref_ptr, &name,
&ref_index, &parent_objectid);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get extref details for inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
+ goto out;
+ }
/*
* parent object can change from one array
* item to another.
*/
- if (!dir)
- dir = read_one_inode(root, parent_objectid);
if (!dir) {
- ret = -ENOENT;
- goto out;
+ dir = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ dir = NULL;
+ /*
+					 * A new parent dir may not have been
+ * logged and not exist in the subvolume
+ * tree, see the comment above before
+ * the loop when getting the first
+ * parent dir.
+ */
+ if (ret == -ENOENT) {
+ /*
+ * The next extref may refer to
+ * another parent dir that
+ * exists, so continue.
+ */
+ ret = 0;
+ goto next;
+ } else {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ parent_objectid,
+ btrfs_root_id(root));
+ }
+ goto out;
+ }
}
} else {
- ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
+ ret = ref_get_fields(wc->log_leaf, ref_ptr, &name, &ref_index);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get ref details for inode %llu parent_objectid %llu root %llu",
+ btrfs_ino(inode),
+ parent_objectid,
+ btrfs_root_id(root));
+ goto out;
+ }
}
- if (ret)
- goto out;
- ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
- ref_index, &name);
+ ret = inode_in_dir(root, wc->subvol_path, btrfs_ino(dir),
+ btrfs_ino(inode), ref_index, &name);
if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if inode %llu is in dir %llu ref_index %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ ref_index, name.len, name.name,
+ btrfs_root_id(root));
goto out;
} else if (ret == 0) {
/*
@@ -1432,9 +1696,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* overwrite any existing back reference, and we don't
* want to create dangling pointers in the directory.
*/
- ret = __add_inode_ref(trans, root, path, log, dir, inode,
- inode_objectid, parent_objectid,
- ref_index, &name);
+ ret = __add_inode_ref(wc, dir, inode, ref_index, &name);
if (ret) {
if (ret == 1)
ret = 0;
@@ -1443,19 +1705,32 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
/* insert our name */
ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to add link for inode %llu in dir %llu ref_index %llu name %.*s root %llu",
+ btrfs_ino(inode),
+ btrfs_ino(dir), ref_index,
+ name.len, name.name,
+ btrfs_root_id(root));
goto out;
+ }
ret = btrfs_update_inode(trans, inode);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
goto out;
+ }
}
/* Else, ret == 1, we already have a perfect match, we're done. */
+next:
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
kfree(name.name);
name.name = NULL;
- if (log_ref_ver) {
+ if (is_extref_item && dir) {
iput(&dir->vfs_inode);
dir = NULL;
}
@@ -1469,14 +1744,14 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* dir index entries exist for a name but there is no inode reference
* item with the same name.
*/
- ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key);
+ ret = unlink_old_inode_refs(wc, inode);
if (ret)
goto out;
/* finally write the back reference in the inode */
- ret = overwrite_item(trans, root, path, eb, slot, key);
+ ret = overwrite_item(wc);
out:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
kfree(name.name);
if (dir)
iput(&dir->vfs_inode);
@@ -1594,26 +1869,22 @@ process_slot:
* number of back refs found. If it goes down to zero, the iput
* will free the inode.
*/
-static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
+static noinline int fixup_inode_link_count(struct walk_control *wc,
struct btrfs_inode *inode)
{
+ struct btrfs_trans_handle *trans = wc->trans;
struct btrfs_root *root = inode->root;
- struct btrfs_path *path;
int ret;
u64 nlink = 0;
const u64 ino = btrfs_ino(inode);
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- ret = count_inode_refs(inode, path);
+ ret = count_inode_refs(inode, wc->subvol_path);
if (ret < 0)
goto out;
nlink = ret;
- ret = count_inode_extrefs(inode, path);
+ ret = count_inode_extrefs(inode, wc->subvol_path);
if (ret < 0)
goto out;
@@ -1632,8 +1903,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (inode->vfs_inode.i_nlink == 0) {
if (S_ISDIR(inode->vfs_inode.i_mode)) {
- ret = replay_dir_deletes(trans, root, NULL, path,
- ino, 1);
+ ret = replay_dir_deletes(wc, ino, true);
if (ret)
goto out;
}
@@ -1643,13 +1913,11 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
}
out:
- btrfs_free_path(path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
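
The recomputed link count is simply the number of names still referencing the inode, old-style INODE_REF entries plus INODE_EXTREF entries, and a result of zero routes the inode through the orphan handling above. A trivial standalone sketch of the accounting (the counts are made up for illustration):

	#include <stdio.h>

	int main(void)
	{
		const unsigned int refs = 2;	/* names found as INODE_REF items */
		const unsigned int extrefs = 1;	/* names found as INODE_EXTREF items */
		const unsigned int nlink = refs + extrefs;

		if (nlink == 0)
			printf("no names left: treat the inode as an orphan\n");
		else
			printf("set i_nlink to %u\n", nlink);
		return 0;
	}
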
-static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path)
+static noinline int fixup_inode_link_counts(struct walk_control *wc)
{
int ret;
struct btrfs_key key;
@@ -1658,48 +1926,50 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = (u64)-1;
while (1) {
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct btrfs_inode *inode;
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ ret = btrfs_search_slot(trans, root, &key, wc->subvol_path, -1, 1);
if (ret < 0)
break;
if (ret == 1) {
ret = 0;
- if (path->slots[0] == 0)
+ if (wc->subvol_path->slots[0] == 0)
break;
- path->slots[0]--;
+ wc->subvol_path->slots[0]--;
}
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, wc->subvol_path->slots[0]);
if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
key.type != BTRFS_ORPHAN_ITEM_KEY)
break;
- ret = btrfs_del_item(trans, root, path);
+ ret = btrfs_del_item(trans, root, wc->subvol_path);
if (ret)
break;
- btrfs_release_path(path);
- inode = read_one_inode(root, key.offset);
- if (!inode) {
- ret = -EIO;
+ btrfs_release_path(wc->subvol_path);
+ inode = btrfs_iget_logging(key.offset, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
break;
}
- ret = fixup_inode_link_count(trans, inode);
+ ret = fixup_inode_link_count(wc, inode);
iput(&inode->vfs_inode);
if (ret)
break;
/*
* fixup on a directory may create new entries,
- * make sure we always look for the highset possible
+ * make sure we always look for the highest possible
* offset
*/
key.offset = (u64)-1;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
@@ -1709,36 +1979,48 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
* count when replay is done. The link count is incremented here
* so the inode won't go away until we check it
*/
-static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u64 objectid)
+static noinline int link_to_fixup_dir(struct walk_control *wc, u64 objectid)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct btrfs_key key;
int ret = 0;
struct btrfs_inode *inode;
struct inode *vfs_inode;
- inode = read_one_inode(root, objectid);
- if (!inode)
- return -EIO;
+ inode = btrfs_iget_logging(objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ objectid, btrfs_root_id(root));
+ return ret;
+ }
vfs_inode = &inode->vfs_inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = objectid;
- ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &key, 0);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
if (ret == 0) {
if (!vfs_inode->i_nlink)
set_nlink(vfs_inode, 1);
else
inc_nlink(vfs_inode);
ret = btrfs_update_inode(trans, inode);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ objectid, btrfs_root_id(root));
} else if (ret == -EEXIST) {
ret = 0;
+ } else {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert fixup item for inode %llu root %llu",
+ objectid, btrfs_root_id(root));
}
iput(vfs_inode);
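
Note the -EEXIST handling above: inserting the fixup item a second time is treated as success, since log replay can legitimately process the same items again after an interrupted replay and must stay idempotent. A standalone sketch of that pattern (hypothetical insert helper, not the btrfs API):

	#include <errno.h>
	#include <stdio.h>

	static int inserted;	/* stand-in for the fixup item already existing */

	static int insert_fixup_item(void)
	{
		if (inserted)
			return -EEXIST;
		inserted = 1;
		return 0;
	}

	int main(void)
	{
		for (int pass = 1; pass <= 2; pass++) {
			int ret = insert_fixup_item();

			/* Replaying the same log items twice must not fail. */
			if (ret == -EEXIST)
				ret = 0;
			printf("pass %d: ret=%d\n", pass, ret);
		}
		return 0;
	}
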
@@ -1760,14 +2042,14 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir;
int ret;
- inode = read_one_inode(root, location->objectid);
- if (!inode)
- return -ENOENT;
+ inode = btrfs_iget_logging(location->objectid, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
- dir = read_one_inode(root, dirid);
- if (!dir) {
+ dir = btrfs_iget_logging(dirid, root);
+ if (IS_ERR(dir)) {
iput(&inode->vfs_inode);
- return -EIO;
+ return PTR_ERR(dir);
}
ret = btrfs_add_link(trans, dir, inode, name, 1, index);
@@ -1779,9 +2061,8 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
return ret;
}
-static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
+static int delete_conflicting_dir_entry(struct walk_control *wc,
struct btrfs_inode *dir,
- struct btrfs_path *path,
struct btrfs_dir_item *dst_di,
const struct btrfs_key *log_key,
u8 log_flags,
@@ -1789,12 +2070,12 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
{
struct btrfs_key found_key;
- btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
+ btrfs_dir_item_key_to_cpu(wc->subvol_path->nodes[0], dst_di, &found_key);
/* The existing dentry points to the same inode, don't delete it. */
if (found_key.objectid == log_key->objectid &&
found_key.type == log_key->type &&
found_key.offset == log_key->offset &&
- btrfs_dir_flags(path->nodes[0], dst_di) == log_flags)
+ btrfs_dir_flags(wc->subvol_path->nodes[0], dst_di) == log_flags)
return 1;
/*
@@ -1804,7 +2085,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
if (!exists)
return 0;
- return drop_one_dir_item(trans, path, dir, dst_di);
+ return drop_one_dir_item(wc, dir, dst_di);
}
/*
@@ -1823,13 +2104,10 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
* Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
* non-existing inode) and 1 if the name was replayed.
*/
-static noinline int replay_one_name(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb,
- struct btrfs_dir_item *di,
- struct btrfs_key *key)
+static noinline int replay_one_name(struct walk_control *wc, struct btrfs_dir_item *di)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct fscrypt_str name = { 0 };
struct btrfs_dir_item *dir_dst_di;
struct btrfs_dir_item *index_dst_di;
@@ -1844,53 +2122,85 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
bool update_size = true;
bool name_added = false;
- dir = read_one_inode(root, key->objectid);
- if (!dir)
- return -EIO;
+ dir = btrfs_iget_logging(wc->log_key.objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ wc->log_key.objectid, btrfs_root_id(root));
+ return ret;
+ }
- ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
- if (ret)
+ ret = read_alloc_one_name(wc->log_leaf, di + 1,
+ btrfs_dir_name_len(wc->log_leaf, di), &name);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for dir %llu root %llu",
+ btrfs_ino(dir), btrfs_root_id(root));
goto out;
+ }
- log_flags = btrfs_dir_flags(eb, di);
- btrfs_dir_item_key_to_cpu(eb, di, &log_key);
- ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
- btrfs_release_path(path);
- if (ret < 0)
+ log_flags = btrfs_dir_flags(wc->log_leaf, di);
+ btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &log_key);
+ ret = btrfs_lookup_inode(trans, root, wc->subvol_path, &log_key, 0);
+ btrfs_release_path(wc->subvol_path);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ log_key.objectid, btrfs_root_id(root));
goto out;
+ }
exists = (ret == 0);
ret = 0;
- dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
- &name, 1);
+ dir_dst_di = btrfs_lookup_dir_item(trans, root, wc->subvol_path,
+ wc->log_key.objectid, &name, 1);
if (IS_ERR(dir_dst_di)) {
ret = PTR_ERR(dir_dst_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir item for dir %llu name %.*s root %llu",
+ wc->log_key.objectid, name.len, name.name,
+ btrfs_root_id(root));
goto out;
} else if (dir_dst_di) {
- ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di,
+ ret = delete_conflicting_dir_entry(wc, dir, dir_dst_di,
&log_key, log_flags, exists);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete conflicting entry for dir %llu name %.*s root %llu",
+ btrfs_ino(dir), name.len, name.name,
+ btrfs_root_id(root));
goto out;
+ }
dir_dst_matches = (ret == 1);
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
- index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
- key->objectid, key->offset,
- &name, 1);
+ index_dst_di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path,
+ wc->log_key.objectid,
+ wc->log_key.offset, &name, 1);
if (IS_ERR(index_dst_di)) {
ret = PTR_ERR(index_dst_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir index item for dir %llu name %.*s root %llu",
+ wc->log_key.objectid, name.len, name.name,
+ btrfs_root_id(root));
goto out;
} else if (index_dst_di) {
- ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di,
+ ret = delete_conflicting_dir_entry(wc, dir, index_dst_di,
&log_key, log_flags, exists);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete conflicting entry for dir %llu name %.*s root %llu",
+ btrfs_ino(dir), name.len, name.name,
+ btrfs_root_id(root));
goto out;
+ }
index_dst_matches = (ret == 1);
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
if (dir_dst_matches && index_dst_matches) {
ret = 0;
@@ -1904,9 +2214,13 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
*/
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_REF_KEY;
- search_key.offset = key->objectid;
+ search_key.offset = wc->log_key.objectid;
ret = backref_in_log(root->log_root, &search_key, 0, &name);
if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if ref item is logged for inode %llu dir %llu name %.*s root %llu",
+ search_key.objectid, btrfs_ino(dir),
+ name.len, name.name, btrfs_root_id(root));
goto out;
} else if (ret) {
/* The dentry will be added later. */
@@ -1917,9 +2231,13 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_EXTREF_KEY;
- search_key.offset = key->objectid;
- ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
+ search_key.offset = btrfs_extref_hash(wc->log_key.objectid, name.name, name.len);
+ ret = backref_in_log(root->log_root, &search_key, wc->log_key.objectid, &name);
if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if extref item is logged for inode %llu dir %llu name %.*s root %llu",
+ search_key.objectid, btrfs_ino(dir),
+ name.len, name.name, btrfs_root_id(root));
goto out;
} else if (ret) {
/* The dentry will be added later. */
@@ -1927,11 +2245,15 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
update_size = false;
goto out;
}
- btrfs_release_path(path);
- ret = insert_one_name(trans, root, key->objectid, key->offset,
+ ret = insert_one_name(trans, root, wc->log_key.objectid, wc->log_key.offset,
&name, &log_key);
- if (ret && ret != -ENOENT && ret != -EEXIST)
+ if (ret && ret != -ENOENT && ret != -EEXIST) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert name %.*s for inode %llu dir %llu root %llu",
+ name.len, name.name, log_key.objectid,
+ btrfs_ino(dir), btrfs_root_id(root));
goto out;
+ }
if (!ret)
name_added = true;
update_size = false;
@@ -1941,6 +2263,10 @@ out:
if (!ret && update_size) {
btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2);
ret = btrfs_update_inode(trans, dir);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update dir inode %llu root %llu",
+ btrfs_ino(dir), btrfs_root_id(root));
}
kfree(name.name);
iput(&dir->vfs_inode);
@@ -1950,20 +2276,17 @@ out:
}
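
One behavioral fix hidden in the hunk above: the INODE_EXTREF check now sets search_key.offset to btrfs_extref_hash() of the parent directory and name instead of the plain parent objectid, since extref keys are keyed by a name hash. A standalone sketch of building such a lookup key; the hash here is a stub (in btrfs it is, to the best of our knowledge, a crc32c seeded with the parent inode number) and the key type value is illustrative:

	#include <stdio.h>
	#include <string.h>

	struct key {
		unsigned long long objectid;	/* inode number */
		unsigned int type;		/* key type */
		unsigned long long offset;	/* hash(parent, name), not the parent ino */
	};

	/* Stub hash, for illustration only. */
	static unsigned long long extref_hash(unsigned long long parent,
					      const char *name, size_t len)
	{
		unsigned long long h = parent;

		for (size_t i = 0; i < len; i++)
			h = h * 31 + (unsigned char)name[i];
		return h;
	}

	int main(void)
	{
		const char *name = "foo";
		const struct key k = {
			.objectid = 257,	/* child inode */
			.type = 13,		/* illustrative INODE_EXTREF value */
			.offset = extref_hash(256, name, strlen(name)),
		};

		printf("extref lookup key (%llu %u %llu)\n",
		       k.objectid, k.type, k.offset);
		return 0;
	}
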
/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
-static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static noinline int replay_one_dir_item(struct walk_control *wc)
{
int ret;
struct btrfs_dir_item *di;
/* We only log dir index keys, which only contain a single dir item. */
- ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
+ ASSERT(wc->log_key.type == BTRFS_DIR_INDEX_KEY,
+ "wc->log_key.type=%u", wc->log_key.type);
- di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
- ret = replay_one_name(trans, root, path, eb, di, key);
+ di = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_dir_item);
+ ret = replay_one_name(wc, di);
if (ret < 0)
return ret;
@@ -1993,17 +2316,11 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
	 * to ever delete the parent directory as it would result in stale
* dentries that can never be deleted.
*/
- if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) {
- struct btrfs_path *fixup_path;
+ if (ret == 1 && btrfs_dir_ftype(wc->log_leaf, di) != BTRFS_FT_DIR) {
struct btrfs_key di_key;
- fixup_path = btrfs_alloc_path();
- if (!fixup_path)
- return -ENOMEM;
-
- btrfs_dir_item_key_to_cpu(eb, di, &di_key);
- ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
- btrfs_free_path(fixup_path);
+ btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &di_key);
+ ret = link_to_fixup_dir(wc, di_key.objectid);
}
return ret;
@@ -2096,13 +2413,13 @@ out:
* item is not in the log, the item is removed and the inode it points
* to is unlinked
*/
-static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *log,
- struct btrfs_path *path,
+static noinline int check_item_in_log(struct walk_control *wc,
struct btrfs_path *log_path,
struct btrfs_inode *dir,
- struct btrfs_key *dir_key)
+ struct btrfs_key *dir_key,
+ bool force_remove)
{
+ struct btrfs_trans_handle *trans = wc->trans;
struct btrfs_root *root = dir->root;
int ret;
struct extent_buffer *eb;
@@ -2118,23 +2435,33 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
* we need to do is process the dir index keys, we (and our caller) can
* safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
*/
- ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
+ ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY, "dir_key->type=%u", dir_key->type);
- eb = path->nodes[0];
- slot = path->slots[0];
+ eb = wc->subvol_path->nodes[0];
+ slot = wc->subvol_path->slots[0];
di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for dir %llu index %llu root %llu",
+ btrfs_ino(dir), dir_key->offset,
+ btrfs_root_id(root));
goto out;
+ }
- if (log) {
+ if (!force_remove) {
struct btrfs_dir_item *log_di;
- log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
+ log_di = btrfs_lookup_dir_index_item(trans, wc->log, log_path,
dir_key->objectid,
dir_key->offset, &name, 0);
if (IS_ERR(log_di)) {
ret = PTR_ERR(log_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir index item for dir %llu index %llu name %.*s root %llu",
+ btrfs_ino(dir), dir_key->offset,
+ name.len, name.name,
+ btrfs_root_id(root));
goto out;
} else if (log_di) {
/* The dentry exists in the log, we have nothing to do. */
@@ -2144,27 +2471,31 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
}
btrfs_dir_item_key_to_cpu(eb, di, &location);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
btrfs_release_path(log_path);
- inode = read_one_inode(root, location.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(location.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ location.objectid, btrfs_root_id(root));
goto out;
}
- ret = link_to_fixup_dir(trans, root, path, location.objectid);
+ ret = link_to_fixup_dir(wc, location.objectid);
if (ret)
goto out;
inc_nlink(&inode->vfs_inode);
- ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
/*
* Unlike dir item keys, dir index keys can only have one name (entry) in
* them, as there are no key collisions since each key has a unique offset
* (an index number), so we're done.
*/
out:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
btrfs_release_path(log_path);
kfree(name.name);
if (inode)
@@ -2172,59 +2503,67 @@ out:
return ret;
}
-static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
- const u64 ino)
+static int replay_xattr_deletes(struct walk_control *wc)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
+ struct btrfs_root *log = wc->log;
struct btrfs_key search_key;
- struct btrfs_path *log_path;
- int i;
+ BTRFS_PATH_AUTO_FREE(log_path);
+ const u64 ino = wc->log_key.objectid;
int nritems;
int ret;
log_path = btrfs_alloc_path();
- if (!log_path)
+ if (!log_path) {
+ btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
return -ENOMEM;
+ }
search_key.objectid = ino;
search_key.type = BTRFS_XATTR_ITEM_KEY;
search_key.offset = 0;
again:
- ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
- if (ret < 0)
+ ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search xattrs for inode %llu root %llu",
+ ino, btrfs_root_id(root));
goto out;
+ }
process_leaf:
- nritems = btrfs_header_nritems(path->nodes[0]);
- for (i = path->slots[0]; i < nritems; i++) {
+ nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]);
+ for (int i = wc->subvol_path->slots[0]; i < nritems; i++) {
struct btrfs_key key;
struct btrfs_dir_item *di;
struct btrfs_dir_item *log_di;
u32 total_size;
u32 cur;
- btrfs_item_key_to_cpu(path->nodes[0], &key, i);
+ btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, i);
if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
ret = 0;
goto out;
}
- di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
- total_size = btrfs_item_size(path->nodes[0], i);
+ di = btrfs_item_ptr(wc->subvol_path->nodes[0], i, struct btrfs_dir_item);
+ total_size = btrfs_item_size(wc->subvol_path->nodes[0], i);
cur = 0;
while (cur < total_size) {
- u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
- u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
+ u16 name_len = btrfs_dir_name_len(wc->subvol_path->nodes[0], di);
+ u16 data_len = btrfs_dir_data_len(wc->subvol_path->nodes[0], di);
u32 this_len = sizeof(*di) + name_len + data_len;
char *name;
name = kmalloc(name_len, GFP_NOFS);
if (!name) {
ret = -ENOMEM;
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate memory for name of length %u",
+ name_len);
goto out;
}
- read_extent_buffer(path->nodes[0], name,
+ read_extent_buffer(wc->subvol_path->nodes[0], name,
(unsigned long)(di + 1), name_len);
log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
@@ -2232,40 +2571,59 @@ process_leaf:
btrfs_release_path(log_path);
if (!log_di) {
/* Doesn't exist in log tree, so delete it. */
- btrfs_release_path(path);
- di = btrfs_lookup_xattr(trans, root, path, ino,
+ btrfs_release_path(wc->subvol_path);
+ di = btrfs_lookup_xattr(trans, root, wc->subvol_path, ino,
name, name_len, -1);
- kfree(name);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup xattr with name %.*s for inode %llu root %llu",
+ name_len, name, ino,
+ btrfs_root_id(root));
+ kfree(name);
goto out;
}
ASSERT(di);
ret = btrfs_delete_one_dir_name(trans, root,
- path, di);
- if (ret)
+ wc->subvol_path, di);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete xattr with name %.*s for inode %llu root %llu",
+ name_len, name, ino,
+ btrfs_root_id(root));
+ kfree(name);
goto out;
- btrfs_release_path(path);
+ }
+ btrfs_release_path(wc->subvol_path);
+ kfree(name);
search_key = key;
goto again;
}
- kfree(name);
if (IS_ERR(log_di)) {
ret = PTR_ERR(log_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup xattr in log tree with name %.*s for inode %llu root %llu",
+ name_len, name, ino,
+ btrfs_root_id(root));
+ kfree(name);
goto out;
}
+ kfree(name);
cur += this_len;
di = (struct btrfs_dir_item *)((char *)di + this_len);
}
}
- ret = btrfs_next_leaf(root, path);
+ ret = btrfs_next_leaf(root, wc->subvol_path);
if (ret > 0)
ret = 0;
else if (ret == 0)
goto process_leaf;
+ else
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get next leaf in subvolume root %llu",
+ btrfs_root_id(root));
out:
- btrfs_free_path(log_path);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
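
The BTRFS_PATH_AUTO_FREE() declarations above remove the need for an
explicit btrfs_free_path() on every exit path; presumably they sit on top
of the kernel's scope-based cleanup machinery (__free() from
<linux/cleanup.h>). A minimal userspace sketch of the same idea, using only
the GCC/Clang cleanup attribute; struct path, PATH_AUTO_FREE and demo() are
illustrative names, not kernel APIs:

#include <stdlib.h>

struct path { int slots[8]; };

static void auto_path_free(struct path **p)
{
	free(*p);	/* free(NULL) is a no-op, so early returns are safe */
}

/* Declare a path pointer that is freed when it goes out of scope. */
#define PATH_AUTO_FREE(name) \
	struct path *name __attribute__((cleanup(auto_path_free))) = NULL

static int demo(void)
{
	PATH_AUTO_FREE(path);

	path = calloc(1, sizeof(*path));
	if (!path)
		return -1;	/* nothing allocated, cleanup frees NULL */
	/* ... use path; every return from here on frees it automatically ... */
	return 0;
}

int main(void)
{
	return demo();
}
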
@@ -2280,34 +2638,41 @@ out:
* Anything we don't find in the log is unlinked and removed from the
* directory.
*/
-static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
- u64 dirid, int del_all)
+static noinline int replay_dir_deletes(struct walk_control *wc,
+ u64 dirid, bool del_all)
{
+ struct btrfs_root *root = wc->root;
+ struct btrfs_root *log = (del_all ? NULL : wc->log);
u64 range_start;
u64 range_end;
int ret = 0;
struct btrfs_key dir_key;
struct btrfs_key found_key;
- struct btrfs_path *log_path;
+ BTRFS_PATH_AUTO_FREE(log_path);
struct btrfs_inode *dir;
dir_key.objectid = dirid;
dir_key.type = BTRFS_DIR_INDEX_KEY;
log_path = btrfs_alloc_path();
- if (!log_path)
+ if (!log_path) {
+ btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
return -ENOMEM;
+ }
- dir = read_one_inode(root, dirid);
- /* it isn't an error if the inode isn't there, that can happen
- * because we replay the deletes before we copy in the inode item
- * from the log
+ dir = btrfs_iget_logging(dirid, root);
+ /*
+ * It isn't an error if the inode isn't there, that can happen because
+ * we replay the deletes before we copy in the inode item from the log.
*/
- if (!dir) {
- btrfs_free_path(log_path);
- return 0;
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ if (ret == -ENOENT)
+ ret = 0;
+ else
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ dirid, btrfs_root_id(root));
+ return ret;
}
range_start = 0;
@@ -2316,32 +2681,45 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
if (del_all)
range_end = (u64)-1;
else {
- ret = find_dir_range(log, path, dirid,
+ ret = find_dir_range(log, wc->subvol_path, dirid,
&range_start, &range_end);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to find range for dir %llu in log tree root %llu",
+ dirid, btrfs_root_id(root));
goto out;
- else if (ret > 0)
+ } else if (ret > 0) {
break;
+ }
}
dir_key.offset = range_start;
while (1) {
int nritems;
- ret = btrfs_search_slot(NULL, root, &dir_key, path,
- 0, 0);
- if (ret < 0)
+ ret = btrfs_search_slot(NULL, root, &dir_key,
+ wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search root %llu for key " BTRFS_KEY_FMT,
+ btrfs_root_id(root),
+ BTRFS_KEY_FMT_VALUE(&dir_key));
goto out;
+ }
- nritems = btrfs_header_nritems(path->nodes[0]);
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 1)
+ nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]);
+ if (wc->subvol_path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, wc->subvol_path);
+ if (ret == 1) {
break;
- else if (ret < 0)
+ } else if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get next leaf in subvolume root %llu",
+ btrfs_root_id(root));
goto out;
+ }
}
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
+ btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &found_key,
+ wc->subvol_path->slots[0]);
if (found_key.objectid != dirid ||
found_key.type != dir_key.type) {
ret = 0;
@@ -2351,24 +2729,21 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
if (found_key.offset > range_end)
break;
- ret = check_item_in_log(trans, log, path,
- log_path, dir,
- &found_key);
+ ret = check_item_in_log(wc, log_path, dir, &found_key, del_all);
if (ret)
goto out;
if (found_key.offset == (u64)-1)
break;
dir_key.offset = found_key.offset + 1;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
if (range_end == (u64)-1)
break;
range_start = range_end + 1;
}
ret = 0;
out:
- btrfs_release_path(path);
- btrfs_free_path(log_path);
+ btrfs_release_path(wc->subvol_path);
iput(&dir->vfs_inode);
return ret;
}
@@ -2384,7 +2759,7 @@ out:
* only in the log (references come from either directory items or inode
* back refs).
*/
-static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
+static int replay_one_buffer(struct extent_buffer *eb,
struct walk_control *wc, u64 gen, int level)
{
int nritems;
@@ -2392,44 +2767,62 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
.transid = gen,
.level = level
};
- struct btrfs_path *path;
- struct btrfs_root *root = wc->replay_dest;
- struct btrfs_key key;
- int i;
+ struct btrfs_root *root = wc->root;
+ struct btrfs_trans_handle *trans = wc->trans;
int ret;
- ret = btrfs_read_extent_buffer(eb, &check);
- if (ret)
- return ret;
-
- level = btrfs_header_level(eb);
-
if (level != 0)
return 0;
- path = btrfs_alloc_path();
- if (!path)
+ /*
+	 * Set to NULL since the leaf was not read yet, so if we abort log
+	 * replay on error there is no valid log tree leaf to dump.
+ */
+ wc->log_leaf = NULL;
+ ret = btrfs_read_extent_buffer(eb, &check);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to read log tree leaf %llu for root %llu",
+ eb->start, btrfs_root_id(root));
+ return ret;
+ }
+
+ ASSERT(wc->subvol_path == NULL);
+ wc->subvol_path = btrfs_alloc_path();
+ if (!wc->subvol_path) {
+ btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
return -ENOMEM;
+ }
+
+ wc->log_leaf = eb;
nritems = btrfs_header_nritems(eb);
- for (i = 0; i < nritems; i++) {
- btrfs_item_key_to_cpu(eb, &key, i);
+ for (wc->log_slot = 0; wc->log_slot < nritems; wc->log_slot++) {
+ struct btrfs_inode_item *inode_item;
- /* inode keys are done during the first stage */
- if (key.type == BTRFS_INODE_ITEM_KEY &&
- wc->stage == LOG_WALK_REPLAY_INODES) {
- struct btrfs_inode_item *inode_item;
- u32 mode;
+ btrfs_item_key_to_cpu(eb, &wc->log_key, wc->log_slot);
- inode_item = btrfs_item_ptr(eb, i,
- struct btrfs_inode_item);
+ if (wc->log_key.type == BTRFS_INODE_ITEM_KEY) {
+ inode_item = btrfs_item_ptr(eb, wc->log_slot,
+ struct btrfs_inode_item);
/*
- * If we have a tmpfile (O_TMPFILE) that got fsync'ed
- * and never got linked before the fsync, skip it, as
- * replaying it is pointless since it would be deleted
- * later. We skip logging tmpfiles, but it's always
- * possible we are replaying a log created with a kernel
- * that used to log tmpfiles.
+ * An inode with no links is either:
+ *
+ * 1) A tmpfile (O_TMPFILE) that got fsync'ed and never
+ * got linked before the fsync, skip it, as replaying
+ * it is pointless since it would be deleted later.
+ * We skip logging tmpfiles, but it's always possible
+ * we are replaying a log created with a kernel that
+ * used to log tmpfiles;
+ *
+ * 2) A non-tmpfile which got its last link deleted
+ * while holding an open fd on it and later got
+ * fsynced through that fd. We always log the
+ * parent inodes when inode->last_unlink_trans is
+ * set to the current transaction, so ignore all the
+ * inode items for this inode. We will delete the
+ * inode when processing the parent directory with
+ * replay_dir_deletes().
*/
if (btrfs_inode_nlink(eb, inode_item) == 0) {
wc->ignore_cur_inode = true;
@@ -2437,19 +2830,23 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
} else {
wc->ignore_cur_inode = false;
}
- ret = replay_xattr_deletes(wc->trans, root, log,
- path, key.objectid);
+ }
+
+ /* Inode keys are done during the first stage. */
+ if (wc->log_key.type == BTRFS_INODE_ITEM_KEY &&
+ wc->stage == LOG_WALK_REPLAY_INODES) {
+ u32 mode;
+
+ ret = replay_xattr_deletes(wc);
if (ret)
break;
mode = btrfs_inode_mode(eb, inode_item);
if (S_ISDIR(mode)) {
- ret = replay_dir_deletes(wc->trans,
- root, log, path, key.objectid, 0);
+ ret = replay_dir_deletes(wc, wc->log_key.objectid, false);
if (ret)
break;
}
- ret = overwrite_item(wc->trans, root, path,
- eb, i, &key);
+ ret = overwrite_item(wc);
if (ret)
break;
@@ -2466,9 +2863,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
struct btrfs_inode *inode;
u64 from;
- inode = read_one_inode(root, key.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(wc->log_key.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ wc->log_key.objectid,
+ btrfs_root_id(root));
break;
}
from = ALIGN(i_size_read(&inode->vfs_inode),
@@ -2476,21 +2877,31 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
drop_args.start = from;
drop_args.end = (u64)-1;
drop_args.drop_cache = true;
- ret = btrfs_drop_extents(wc->trans, root, inode,
- &drop_args);
- if (!ret) {
+ drop_args.path = wc->subvol_path;
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to drop extents for inode %llu root %llu offset %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root),
+ from);
+ } else {
inode_sub_bytes(&inode->vfs_inode,
drop_args.bytes_found);
/* Update the inode's nbytes. */
- ret = btrfs_update_inode(wc->trans, inode);
+ ret = btrfs_update_inode(trans, inode);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
}
iput(&inode->vfs_inode);
if (ret)
break;
}
- ret = link_to_fixup_dir(wc->trans, root,
- path, key.objectid);
+ ret = link_to_fixup_dir(wc, wc->log_key.objectid);
if (ret)
break;
}
@@ -2498,10 +2909,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
if (wc->ignore_cur_inode)
continue;
- if (key.type == BTRFS_DIR_INDEX_KEY &&
+ if (wc->log_key.type == BTRFS_DIR_INDEX_KEY &&
wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
- ret = replay_one_dir_item(wc->trans, root, path,
- eb, i, &key);
+ ret = replay_one_dir_item(wc);
if (ret)
break;
}
@@ -2510,21 +2920,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
continue;
/* these keys are simply copied */
- if (key.type == BTRFS_XATTR_ITEM_KEY) {
- ret = overwrite_item(wc->trans, root, path,
- eb, i, &key);
+ if (wc->log_key.type == BTRFS_XATTR_ITEM_KEY) {
+ ret = overwrite_item(wc);
if (ret)
break;
- } else if (key.type == BTRFS_INODE_REF_KEY ||
- key.type == BTRFS_INODE_EXTREF_KEY) {
- ret = add_inode_ref(wc->trans, root, log, path,
- eb, i, &key);
- if (ret && ret != -ENOENT)
+ } else if (wc->log_key.type == BTRFS_INODE_REF_KEY ||
+ wc->log_key.type == BTRFS_INODE_EXTREF_KEY) {
+ ret = add_inode_ref(wc);
+ if (ret)
break;
- ret = 0;
- } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
- ret = replay_one_extent(wc->trans, root, path,
- eb, i, &key);
+ } else if (wc->log_key.type == BTRFS_EXTENT_DATA_KEY) {
+ ret = replay_one_extent(wc);
if (ret)
break;
}
@@ -2535,37 +2941,16 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
* older kernel with such keys, ignore them.
*/
}
- btrfs_free_path(path);
+ btrfs_free_path(wc->subvol_path);
+ wc->subvol_path = NULL;
return ret;
}
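
The loop above visits the same log tree leaf once per replay stage and only
acts on the key types that belong to the current stage. A condensed
userspace sketch of that dispatch; it reuses the kernel's stage names but
the key-type enum and stage_handles() are illustrative simplifications:

enum stage {
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};

enum key_type { INODE_ITEM, DIR_INDEX, XATTR_ITEM, INODE_REF, EXTENT_DATA };

/* Returns 1 when the current stage is responsible for this key type. */
static int stage_handles(enum stage stage, enum key_type type)
{
	switch (stage) {
	case LOG_WALK_REPLAY_INODES:
		return type == INODE_ITEM;
	case LOG_WALK_REPLAY_DIR_INDEX:
		return type == DIR_INDEX;
	case LOG_WALK_REPLAY_ALL:
		return type == XATTR_ITEM || type == INODE_REF ||
		       type == EXTENT_DATA;
	}
	return 0;
}
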
-/*
- * Correctly adjust the reserved bytes occupied by a log tree extent buffer
- */
-static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
-{
- struct btrfs_block_group *cache;
-
- cache = btrfs_lookup_block_group(fs_info, start);
- if (!cache) {
- btrfs_err(fs_info, "unable to find block group for %llu", start);
- return;
- }
-
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- cache->reserved -= fs_info->nodesize;
- cache->space_info->bytes_reserved -= fs_info->nodesize;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
-
- btrfs_put_block_group(cache);
-}
-
static int clean_log_buffer(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
- int ret;
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct btrfs_block_group *bg;
btrfs_tree_lock(eb);
btrfs_clear_buffer_dirty(trans, eb);
@@ -2573,22 +2958,38 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(eb);
if (trans) {
+ int ret;
+
ret = btrfs_pin_reserved_extent(trans, eb);
if (ret)
- return ret;
- } else {
- unaccount_log_buffer(eb->fs_info, eb->start);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
}
+ bg = btrfs_lookup_block_group(fs_info, eb->start);
+ if (!bg) {
+ btrfs_err(fs_info, "unable to find block group for %llu", eb->start);
+ btrfs_handle_fs_error(fs_info, -ENOENT, NULL);
+ return -ENOENT;
+ }
+
+ spin_lock(&bg->space_info->lock);
+ spin_lock(&bg->lock);
+ bg->reserved -= fs_info->nodesize;
+ bg->space_info->bytes_reserved -= fs_info->nodesize;
+ spin_unlock(&bg->lock);
+ spin_unlock(&bg->space_info->lock);
+
+ btrfs_put_block_group(bg);
+
return 0;
}
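
The unaccounting that used to live in unaccount_log_buffer() is now inlined
above, and both counters move together under the space-info lock (outer)
and then the block-group lock (inner). A userspace sketch of that lock
ordering, with pthread mutexes standing in for the kernel spinlocks and
simplified struct layouts:

#include <pthread.h>
#include <stdint.h>

struct space_info {
	pthread_mutex_t lock;
	uint64_t bytes_reserved;
};

struct block_group {
	pthread_mutex_t lock;
	uint64_t reserved;
	struct space_info *space_info;
};

static void unaccount(struct block_group *bg, uint64_t nodesize)
{
	pthread_mutex_lock(&bg->space_info->lock);	/* outer lock first */
	pthread_mutex_lock(&bg->lock);			/* then inner lock */
	bg->reserved -= nodesize;
	bg->space_info->bytes_reserved -= nodesize;
	pthread_mutex_unlock(&bg->lock);
	pthread_mutex_unlock(&bg->space_info->lock);
}
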
-static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, int *level,
- struct walk_control *wc)
+static noinline int walk_down_log_tree(struct btrfs_path *path, int *level,
+ struct walk_control *wc)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_fs_info *fs_info = wc->log->fs_info;
u64 bytenr;
u64 ptr_gen;
struct extent_buffer *next;
@@ -2616,12 +3017,17 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
next = btrfs_find_create_tree_block(fs_info, bytenr,
btrfs_header_owner(cur),
*level - 1);
- if (IS_ERR(next))
- return PTR_ERR(next);
+ if (IS_ERR(next)) {
+ ret = PTR_ERR(next);
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
+ return ret;
+ }
if (*level == 1) {
- ret = wc->process_func(root, next, wc, ptr_gen,
- *level - 1);
+ ret = wc->process_func(next, wc, ptr_gen, *level - 1);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -2632,6 +3038,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
ret = btrfs_read_extent_buffer(next, &check);
if (ret) {
free_extent_buffer(next);
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
return ret;
}
@@ -2647,6 +3057,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
ret = btrfs_read_extent_buffer(next, &check);
if (ret) {
free_extent_buffer(next);
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
return ret;
}
@@ -2663,10 +3077,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
return 0;
}
-static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, int *level,
- struct walk_control *wc)
+static noinline int walk_up_log_tree(struct btrfs_path *path, int *level,
+ struct walk_control *wc)
{
int i;
int slot;
@@ -2680,14 +3092,14 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(*level == 0);
return 0;
} else {
- ret = wc->process_func(root, path->nodes[*level], wc,
+ ret = wc->process_func(path->nodes[*level], wc,
btrfs_header_generation(path->nodes[*level]),
*level);
if (ret)
return ret;
if (wc->free) {
- ret = clean_log_buffer(trans, path->nodes[*level]);
+ ret = clean_log_buffer(wc->trans, path->nodes[*level]);
if (ret)
return ret;
}
@@ -2704,13 +3116,13 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
* the tree freeing any blocks that have a ref count of zero after being
* decremented.
*/
-static int walk_log_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *log, struct walk_control *wc)
+static int walk_log_tree(struct walk_control *wc)
{
+ struct btrfs_root *log = wc->log;
int ret = 0;
int wret;
int level;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int orig_level;
path = btrfs_alloc_path();
@@ -2720,40 +3132,34 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
level = btrfs_header_level(log->node);
orig_level = level;
path->nodes[level] = log->node;
- atomic_inc(&log->node->refs);
+ refcount_inc(&log->node->refs);
path->slots[level] = 0;
while (1) {
- wret = walk_down_log_tree(trans, log, path, &level, wc);
+ wret = walk_down_log_tree(path, &level, wc);
if (wret > 0)
break;
- if (wret < 0) {
- ret = wret;
- goto out;
- }
+ if (wret < 0)
+ return wret;
- wret = walk_up_log_tree(trans, log, path, &level, wc);
+ wret = walk_up_log_tree(path, &level, wc);
if (wret > 0)
break;
- if (wret < 0) {
- ret = wret;
- goto out;
- }
+ if (wret < 0)
+ return wret;
}
/* was the root node processed? if not, catch it here */
if (path->nodes[orig_level]) {
- ret = wc->process_func(log, path->nodes[orig_level], wc,
+ ret = wc->process_func(path->nodes[orig_level], wc,
btrfs_header_generation(path->nodes[orig_level]),
orig_level);
if (ret)
- goto out;
+ return ret;
if (wc->free)
- ret = clean_log_buffer(trans, path->nodes[orig_level]);
+ ret = clean_log_buffer(wc->trans, path->nodes[orig_level]);
}
-out:
- btrfs_free_path(path);
return ret;
}
@@ -2932,7 +3338,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&root->log_mutex);
return ctx->log_ret;
}
- ASSERT(log_transid == root->log_transid);
+ ASSERT(log_transid == root->log_transid,
+ "log_transid=%d root->log_transid=%d", log_transid, root->log_transid);
atomic_set(&root->log_commit[index1], 1);
/* wait for previous tree log sync to complete */
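
The two-argument ASSERT() form used here and throughout the rest of the
series prints the offending values, not just the failed expression. A rough
userspace model of that shape; the macro below is illustrative and not the
kernel's implementation (it needs GCC/Clang for ##__VA_ARGS__):

#include <stdio.h>
#include <stdlib.h>

#define ASSERT(cond, fmt, ...)						\
	do {								\
		if (!(cond)) {						\
			fprintf(stderr, "assertion failed: %s: " fmt "\n", \
				#cond, ##__VA_ARGS__);			\
			abort();					\
		}							\
	} while (0)

int main(void)
{
	int log_transid = 4, root_log_transid = 5;

	/* Aborts and reports both values, which a bare ASSERT(cond) cannot. */
	ASSERT(log_transid == root_log_transid,
	       "log_transid=%d root_log_transid=%d",
	       log_transid, root_log_transid);
	return 0;
}
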
@@ -2961,9 +3368,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
if (log_transid % 2 == 0)
- mark = EXTENT_DIRTY;
+ mark = EXTENT_DIRTY_LOG1;
else
- mark = EXTENT_NEW;
+ mark = EXTENT_DIRTY_LOG2;
/* we start IO on all the marked extents here, but we don't actually
* wait for them until later.
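
The rename from EXTENT_DIRTY/EXTENT_NEW to EXTENT_DIRTY_LOG1/2 keeps the
parity scheme visible in the branch above: consecutive log transactions tag
their dirty extents with alternating bits, so flushing or cleaning one log
transaction never touches the other's pages, while "wait for everything"
callers use the union of both bits. A tiny sketch with illustrative names:

#include <assert.h>

enum { DIRTY_LOG1 = 1 << 0, DIRTY_LOG2 = 1 << 1 };

static int mark_for(int log_transid)
{
	return (log_transid % 2 == 0) ? DIRTY_LOG1 : DIRTY_LOG2;
}

int main(void)
{
	/* Two consecutive log transactions always get disjoint marks. */
	assert(mark_for(6) != mark_for(7));
	/* Waiting on both in-flight logs uses the union of the two bits. */
	assert((mark_for(6) | mark_for(7)) == (DIRTY_LOG1 | DIRTY_LOG2));
	return 0;
}
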
@@ -3072,7 +3479,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = root_log_ctx.log_ret;
goto out;
}
- ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
+ ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid,
+ "root_log_ctx.log_transid=%d log_root_tree->log_transid=%d",
+ root_log_ctx.log_transid, log_root_tree->log_transid);
atomic_set(&log_root_tree->log_commit[index2], 1);
if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
@@ -3094,7 +3503,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_write_marked_extents(fs_info,
&log_root_tree->dirty_log_pages,
- EXTENT_DIRTY | EXTENT_NEW);
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
blk_finish_plug(&plug);
/*
* As described above, -EAGAIN indicates a hole in the extents. We
@@ -3114,7 +3523,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_wait_tree_log_extents(log, mark);
if (!ret)
ret = btrfs_wait_tree_log_extents(log_root_tree,
- EXTENT_NEW | EXTENT_DIRTY);
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
if (ret) {
btrfs_set_log_full_commit(trans);
mutex_unlock(&log_root_tree->log_mutex);
@@ -3162,7 +3571,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
ret = write_all_supers(fs_info, 1);
mutex_unlock(&fs_info->tree_log_mutex);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
goto out_wake_log_root;
@@ -3176,7 +3585,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* someone else already started it. We use <= and not < because the
* first log transaction has an ID of 0.
*/
- ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid);
+ ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid,
+ "last_log_commit(root)=%d log_transid=%d",
+ btrfs_get_root_last_log_commit(root), log_transid);
btrfs_set_root_last_log_commit(root, log_transid);
out_wake_log_root:
@@ -3214,12 +3625,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
{
int ret;
struct walk_control wc = {
- .free = 1,
- .process_func = process_one_buffer
+ .free = true,
+ .process_func = process_one_buffer,
+ .log = log,
+ .trans = trans,
};
if (log->node) {
- ret = walk_log_tree(trans, log, &wc);
+ ret = walk_log_tree(&wc);
if (ret) {
/*
* We weren't able to traverse the entire log tree, the
@@ -3240,9 +3653,9 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
*/
btrfs_write_marked_extents(log->fs_info,
&log->dirty_log_pages,
- EXTENT_DIRTY | EXTENT_NEW);
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
btrfs_wait_tree_log_extents(log,
- EXTENT_DIRTY | EXTENT_NEW);
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
if (trans)
btrfs_abort_transaction(trans, ret);
@@ -3251,8 +3664,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
}
}
- extent_io_tree_release(&log->dirty_log_pages);
- extent_io_tree_release(&log->log_csum_range);
+ btrfs_extent_io_tree_release(&log->dirty_log_pages);
+ btrfs_extent_io_tree_release(&log->log_csum_range);
btrfs_put_root(log);
}
@@ -3282,6 +3695,31 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
return 0;
}
+static bool mark_inode_as_not_logged(const struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
+{
+ bool ret = false;
+
+ /*
+	 * Do this only if ->logged_trans is still 0, to prevent races with
+	 * concurrent logging: inode_logged() may find the inode not logged,
+	 * a concurrent task then logs it and sets ->logged_trans to
+	 * trans->transid, and we would overwrite that with a value less than
+	 * trans->transid. As a consequence, subsequent rename, unlink and
+	 * link operations could end up not logging new names and not
+	 * removing old names from the log.
+ */
+ spin_lock(&inode->lock);
+ if (inode->logged_trans == 0)
+ inode->logged_trans = trans->transid - 1;
+ else if (inode->logged_trans == trans->transid)
+ ret = true;
+ spin_unlock(&inode->lock);
+
+ return ret;
+}
+
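
A userspace model of the rule mark_inode_as_not_logged() enforces; an
unconditional store here could overwrite the trans->transid value that a
concurrent logger just set. A pthread mutex stands in for the inode
spinlock and struct inode_model is an illustrative stand-in:

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

struct inode_model {
	pthread_mutex_t lock;
	uint64_t logged_trans;	/* 0 means "unknown" */
};

/* Returns true if a concurrent logger won the race. */
static bool mark_not_logged(struct inode_model *ino, uint64_t transid)
{
	bool logged = false;

	pthread_mutex_lock(&ino->lock);
	if (ino->logged_trans == 0)
		ino->logged_trans = transid - 1;	/* record "not logged" */
	else if (ino->logged_trans == transid)
		logged = true;			/* a logger got there first */
	pthread_mutex_unlock(&ino->lock);

	return logged;
}

int main(void)
{
	struct inode_model ino = { PTHREAD_MUTEX_INITIALIZER, 0 };

	return mark_not_logged(&ino, 10) ? 1 : 0;	/* 0: nobody raced us */
}
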
/*
* Check if an inode was logged in the current transaction. This correctly deals
* with the case where the inode was logged but has a logged_trans of 0, which
@@ -3299,15 +3737,32 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ret;
- if (inode->logged_trans == trans->transid)
+ /*
+ * Quick lockless call, since once ->logged_trans is set to the current
+ * transaction, we never set it to a lower value anywhere else.
+ */
+ if (data_race(inode->logged_trans) == trans->transid)
return 1;
/*
- * If logged_trans is not 0, then we know the inode logged was not logged
- * in this transaction, so we can return false right away.
+ * If logged_trans is not 0 and not trans->transid, then we know the
+ * inode was not logged in this transaction, so we can return false
+ * right away. We take the lock to avoid a race caused by load/store
+ * tearing with a concurrent btrfs_log_inode() call or a concurrent task
+	 * in this function further below - a store of trans->transid into
+	 * ->logged_trans can be torn into two 32-bit stores for example, in
+	 * which case we could see a positive value that is not trans->transid
+	 * and assume the inode was not logged when it was.
*/
- if (inode->logged_trans > 0)
+ spin_lock(&inode->lock);
+ if (inode->logged_trans == trans->transid) {
+ spin_unlock(&inode->lock);
+ return 1;
+ } else if (inode->logged_trans > 0) {
+ spin_unlock(&inode->lock);
return 0;
+ }
+ spin_unlock(&inode->lock);
/*
* If no log tree was created for this root in this transaction, then
@@ -3316,10 +3771,8 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
* transaction's ID, to avoid the search below in a future call in case
* a log tree gets created after this.
*/
- if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
- inode->logged_trans = trans->transid - 1;
- return 0;
- }
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
+ return mark_inode_as_not_logged(trans, inode);
/*
* We have a log tree and the inode's logged_trans is 0. We can't tell
@@ -3373,29 +3826,17 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
	 * Set logged_trans to a value greater than 0 and less than the
* current transaction to avoid doing the search in future calls.
*/
- inode->logged_trans = trans->transid - 1;
- return 0;
+ return mark_inode_as_not_logged(trans, inode);
}
/*
* The inode was previously logged and then evicted, set logged_trans to
- * the current transacion's ID, to avoid future tree searches as long as
+ * the current transaction's ID, to avoid future tree searches as long as
* the inode is not evicted again.
*/
+ spin_lock(&inode->lock);
inode->logged_trans = trans->transid;
-
- /*
- * If it's a directory, then we must set last_dir_index_offset to the
- * maximum possible value, so that the next attempt to log the inode does
- * not skip checking if dir index keys found in modified subvolume tree
- * leaves have been logged before, otherwise it would result in attempts
- * to insert duplicate dir index keys in the log tree. This must be done
- * because last_dir_index_offset is an in-memory only field, not persisted
- * in the inode item or any other on-disk structure, so its value is lost
- * once the inode is evicted.
- */
- if (S_ISDIR(inode->vfs_inode.i_mode))
- inode->last_dir_index_offset = (u64)-1;
+ spin_unlock(&inode->lock);
return 1;
}
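
The final shape of the inode_logged() checks, modelled in userspace with
C11 atomics: the unsynchronized fast path is valid only because
->logged_trans is never lowered once it reaches the current transaction id,
and every other outcome takes the lock to rule out torn reads. The atomic
load below stands in for the kernel's data_race() annotation; everything
else is an illustrative simplification:

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>

struct inode_model {
	pthread_mutex_t lock;
	_Atomic uint64_t logged_trans;
};

/* 1 = logged in this transaction, 0 = not logged, -1 = must search log. */
static int was_logged(struct inode_model *ino, uint64_t transid)
{
	int ret;

	/* Lockless fast path: a match is stable and never undone. */
	if (atomic_load_explicit(&ino->logged_trans,
				 memory_order_relaxed) == transid)
		return 1;

	/* Slow path under the lock, safe against tearing and races. */
	pthread_mutex_lock(&ino->lock);
	if (ino->logged_trans == transid)
		ret = 1;
	else if (ino->logged_trans > 0)
		ret = 0;
	else
		ret = -1;
	pthread_mutex_unlock(&ino->lock);
	return ret;
}
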
@@ -3432,7 +3873,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans,
* inode item because on log replay we update the field to reflect
* all existing entries in the directory (see overwrite_item()).
*/
- return btrfs_delete_one_dir_name(trans, log, path, di);
+ return btrfs_del_item(trans, log, path);
}
/*
@@ -3457,37 +3898,36 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans,
* or the entire directory.
*/
void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
const struct fscrypt_str *name,
struct btrfs_inode *dir, u64 index)
{
- struct btrfs_path *path;
+ struct btrfs_root *root = dir->root;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
ret = inode_logged(trans, dir, NULL);
if (ret == 0)
return;
- else if (ret < 0) {
+ if (ret < 0) {
+ btrfs_set_log_full_commit(trans);
+ return;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
btrfs_set_log_full_commit(trans);
return;
}
ret = join_running_log_trans(root);
- if (ret)
+ ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret);
+ if (WARN_ON(ret))
return;
mutex_lock(&dir->log_mutex);
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out_unlock;
- }
-
ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
name, index);
- btrfs_free_path(path);
-out_unlock:
mutex_unlock(&dir->log_mutex);
if (ret < 0)
btrfs_set_log_full_commit(trans);
@@ -3496,12 +3936,11 @@ out_unlock:
/* see comments for btrfs_del_dir_entries_in_log */
void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
const struct fscrypt_str *name,
- struct btrfs_inode *inode, u64 dirid)
+ struct btrfs_inode *inode,
+ struct btrfs_inode *dir)
{
- struct btrfs_root *log;
- u64 index;
+ struct btrfs_root *root = dir->root;
int ret;
ret = inode_logged(trans, inode, NULL);
@@ -3513,13 +3952,13 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
}
ret = join_running_log_trans(root);
- if (ret)
+ ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret);
+ if (WARN_ON(ret))
return;
- log = root->log_root;
mutex_lock(&inode->log_mutex);
- ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode),
- dirid, &index);
+ ret = btrfs_del_inode_ref(trans, root->log_root, name, btrfs_ino(inode),
+ btrfs_ino(dir), NULL);
mutex_unlock(&inode->log_mutex);
if (ret < 0 && ret != -ENOENT)
btrfs_set_log_full_commit(trans);
@@ -3581,7 +4020,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
int count)
{
struct btrfs_root *log = inode->root->log_root;
- char *ins_data = NULL;
+ char AUTO_KFREE(ins_data);
struct btrfs_item_batch batch;
struct extent_buffer *dst;
unsigned long src_offset;
@@ -3592,7 +4031,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
int ret;
int i;
- ASSERT(count > 0);
+ ASSERT(count > 0, "count=%d", count);
batch.nr = count;
if (count == 1) {
@@ -3605,8 +4044,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
struct btrfs_key *ins_keys;
u32 *ins_sizes;
- ins_data = kmalloc(count * sizeof(u32) +
- count * sizeof(struct btrfs_key), GFP_NOFS);
+ ins_data = kmalloc_array(count, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
if (!ins_data)
return -ENOMEM;
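
kmalloc_array() performs the count * element-size multiplication itself and
fails on overflow, which the open-coded kmalloc(count * a + count * b) form
did not. A userspace sketch of the packed batch buffer, with the keys
placed first so the second cast stays naturally aligned (the layout order
and names are illustrative):

#include <stdint.h>
#include <stdlib.h>

struct key { uint64_t objectid; uint8_t type; uint64_t offset; };

/* One overflow-checked allocation holds count keys then count sizes. */
static char *alloc_batch(size_t count, struct key **keys, uint32_t **sizes)
{
	char *data = calloc(count, sizeof(struct key) + sizeof(uint32_t));

	if (!data)
		return NULL;
	*keys = (struct key *)data;
	*sizes = (uint32_t *)(data + count * sizeof(struct key));
	return data;
}
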
@@ -3627,7 +4065,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
if (ret)
- goto out;
+ return ret;
dst = dst_path->nodes[0];
/*
@@ -3646,7 +4084,9 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
btrfs_release_path(dst_path);
last_index = batch.keys[count - 1].offset;
- ASSERT(last_index > inode->last_dir_index_offset);
+ ASSERT(last_index > inode->last_dir_index_offset,
+ "last_index=%llu inode->last_dir_index_offset=%llu",
+ last_index, inode->last_dir_index_offset);
/*
* If for some unexpected reason the last item's index is not greater
@@ -3659,8 +4099,6 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
if (btrfs_get_first_dir_index_to_log(inode) == 0)
btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset);
-out:
- kfree(ins_data);
return ret;
}
@@ -3684,7 +4122,7 @@ static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx)
* Add extra ref to scratch eb so that it is not freed when callers
* release the path, so we can reuse it later if needed.
*/
- atomic_inc(&ctx->scratch_eb->refs);
+ refcount_inc(&ctx->scratch_eb->refs);
return 0;
}
@@ -3719,7 +4157,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
for (int i = path->slots[0]; i < nritems; i++) {
struct btrfs_dir_item *di;
struct btrfs_key key;
- int ret;
btrfs_item_key_to_cpu(src, &key, i);
@@ -3789,8 +4226,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
}
if (batch_size > 0) {
- int ret;
-
ret = flush_dir_items_batch(trans, inode, src, dst_path,
batch_start, batch_size);
if (ret < 0)
@@ -3975,7 +4410,9 @@ done:
* change in the current transaction), then we don't need to log
* a range, last_old_dentry_offset is == to last_offset.
*/
- ASSERT(last_old_dentry_offset <= last_offset);
+ ASSERT(last_old_dentry_offset <= last_offset,
+ "last_old_dentry_offset=%llu last_offset=%llu",
+ last_old_dentry_offset, last_offset);
if (last_old_dentry_offset < last_offset)
ret = insert_dir_log_key(trans, log, path, ino,
last_old_dentry_offset + 1,
@@ -3987,7 +4424,7 @@ done:
/*
* If the inode was logged before and it was evicted, then its
- * last_dir_index_offset is (u64)-1, so we don't the value of the last index
+ * last_dir_index_offset is 0, so we don't know the value of the last index
* key offset. If that's the case, search for it and update the inode. This
* is to avoid lookups in the log tree every time we try to insert a dir index
* key from a leaf changed in the current transaction, and to allow us to always
@@ -4003,7 +4440,7 @@ static int update_last_dir_index_offset(struct btrfs_inode *inode,
lockdep_assert_held(&inode->log_mutex);
- if (inode->last_dir_index_offset != (u64)-1)
+ if (inode->last_dir_index_offset != 0)
return 0;
if (!ctx->logged_before) {
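
Using 0 rather than (u64)-1 as the "unknown" marker is safe because real
directory index offsets start at BTRFS_DIR_START_INDEX (2 in current
kernels, an assumption worth checking against btrfs_tree.h), so 0 can never
collide with a logged index. A minimal sketch of the sentinel semantics
with illustrative names:

#include <stdbool.h>
#include <stdint.h>

#define DIR_START_INDEX 2	/* assumed to mirror BTRFS_DIR_START_INDEX */

struct dir_log_state {
	uint64_t last_dir_index_offset;	/* 0 == not known yet */
};

static bool needs_index_lookup(const struct dir_log_state *d)
{
	return d->last_dir_index_offset == 0;
}

static void record_index(struct dir_log_state *d, uint64_t index)
{
	/* Real indexes are always >= DIR_START_INDEX, never 0. */
	if (index >= DIR_START_INDEX && index > d->last_dir_index_offset)
		d->last_dir_index_offset = index;
}
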
@@ -4169,47 +4606,40 @@ static int truncate_inode_items(struct btrfs_trans_handle *trans,
static void fill_inode_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf,
struct btrfs_inode_item *item,
- struct inode *inode, int log_inode_only,
+ struct inode *inode, bool log_inode_only,
u64 logged_isize)
{
- struct btrfs_map_token token;
u64 flags;
- btrfs_init_map_token(&token, leaf);
-
if (log_inode_only) {
 		/* set the generation to zero so the recovery code
 		 * can tell the difference between a logging
* just to say 'this inode exists' and a logging
* to say 'update this inode with these values'
*/
- btrfs_set_token_inode_generation(&token, item, 0);
- btrfs_set_token_inode_size(&token, item, logged_isize);
+ btrfs_set_inode_generation(leaf, item, 0);
+ btrfs_set_inode_size(leaf, item, logged_isize);
} else {
- btrfs_set_token_inode_generation(&token, item,
- BTRFS_I(inode)->generation);
- btrfs_set_token_inode_size(&token, item, inode->i_size);
+ btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
+ btrfs_set_inode_size(leaf, item, inode->i_size);
}
- btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
- btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
- btrfs_set_token_inode_mode(&token, item, inode->i_mode);
- btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
+ btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
+ btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
+ btrfs_set_inode_mode(leaf, item, inode->i_mode);
+ btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+
+ btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode));
- btrfs_set_token_timespec_sec(&token, &item->atime,
- inode_get_atime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->atime,
- inode_get_atime_nsec(inode));
+ btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode));
- btrfs_set_token_timespec_sec(&token, &item->mtime,
- inode_get_mtime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->mtime,
- inode_get_mtime_nsec(inode));
+ btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode));
- btrfs_set_token_timespec_sec(&token, &item->ctime,
- inode_get_ctime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->ctime,
- inode_get_ctime_nsec(inode));
+ btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec);
+ btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec);
/*
* We do not need to set the nbytes field, in fact during a fast fsync
@@ -4220,13 +4650,13 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
* inode item in subvolume tree as needed (see overwrite_item()).
*/
- btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
- btrfs_set_token_inode_transid(&token, item, trans->transid);
- btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
+ btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode));
+ btrfs_set_inode_transid(leaf, item, trans->transid);
+ btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
BTRFS_I(inode)->ro_flags);
- btrfs_set_token_inode_flags(&token, item, flags);
- btrfs_set_token_inode_block_group(&token, item, 0);
+ btrfs_set_inode_flags(leaf, item, flags);
+ btrfs_set_inode_block_group(leaf, item, 0);
}
static int log_inode_item(struct btrfs_trans_handle *trans,
@@ -4272,7 +4702,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
- 0, 0);
+ false, 0);
btrfs_release_path(path);
return 0;
}
@@ -4300,8 +4730,8 @@ static int log_csums(struct btrfs_trans_handle *trans,
* file which happens to refer to the same extent as well. Such races
* can leave checksum items in the log with overlapping ranges.
*/
- ret = lock_extent(&log_root->log_csum_range, sums->logical, lock_end,
- &cached_state);
+ ret = btrfs_lock_extent(&log_root->log_csum_range, sums->logical, lock_end,
+ &cached_state);
if (ret)
return ret;
/*
@@ -4317,8 +4747,8 @@ static int log_csums(struct btrfs_trans_handle *trans,
if (!ret)
ret = btrfs_csum_file_blocks(trans, log_root, sums);
- unlock_extent(&log_root->log_csum_range, sums->logical, lock_end,
- &cached_state);
+ btrfs_unlock_extent(&log_root->log_csum_range, sums->logical, lock_end,
+ &cached_state);
return ret;
}
@@ -4337,7 +4767,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_key *ins_keys;
u32 *ins_sizes;
struct btrfs_item_batch batch;
- char *ins_data;
+ char AUTO_KFREE(ins_data);
int dst_index;
const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
const u64 i_size = i_size_read(&inode->vfs_inode);
@@ -4376,8 +4806,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
src = src_path->nodes[0];
- ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
- nr * sizeof(u32), GFP_NOFS);
+ ins_data = kmalloc_array(nr, sizeof(struct btrfs_key) + sizeof(u32), GFP_NOFS);
if (!ins_data)
return -ENOMEM;
@@ -4466,7 +4895,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
disk_bytenr + extent_num_bytes - 1,
&ordered_sums, false);
if (ret < 0)
- goto out;
+ return ret;
ret = 0;
list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
@@ -4476,7 +4905,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
kfree(sums);
}
if (ret)
- goto out;
+ return ret;
add_to_batch:
ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
@@ -4490,11 +4919,11 @@ add_to_batch:
* so we don't need to do anything.
*/
if (batch.nr == 0)
- goto out;
+ return 0;
ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
if (ret)
- goto out;
+ return ret;
dst_index = 0;
for (int i = 0; i < nr; i++) {
@@ -4547,8 +4976,6 @@ copy_item:
}
btrfs_release_path(dst_path);
-out:
- kfree(ins_data);
return ret;
}
@@ -4648,7 +5075,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
return 0;
/* If we're compressed we have to save the entire range of csums. */
- if (extent_map_is_compressed(em)) {
+ if (btrfs_extent_map_is_compressed(em)) {
csum_offset = 0;
csum_len = em->disk_num_bytes;
} else {
@@ -4657,7 +5084,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
}
/* block start is already adjusted for the file extent offset. */
- block_start = extent_map_block_start(em);
+ block_start = btrfs_extent_map_block_start(em);
csum_root = btrfs_csum_root(trans->fs_info, block_start);
ret = btrfs_lookup_csums_list(csum_root, block_start + csum_offset,
block_start + csum_offset + csum_len - 1,
@@ -4667,9 +5094,9 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
ret = 0;
while (!list_empty(&ordered_sums)) {
- struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
- struct btrfs_ordered_sum,
- list);
+ struct btrfs_ordered_sum *sums = list_first_entry(&ordered_sums,
+ struct btrfs_ordered_sum,
+ list);
if (!ret)
ret = log_csums(trans, inode, log_root, sums);
list_del(&sums->list);
@@ -4692,7 +5119,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_key key;
enum btrfs_compression_type compress_type;
u64 extent_offset = em->offset;
- u64 block_start = extent_map_block_start(em);
+ u64 block_start = btrfs_extent_map_block_start(em);
u64 block_len;
int ret;
@@ -4703,7 +5130,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
block_len = em->disk_num_bytes;
- compress_type = extent_map_compression(em);
+ compress_type = btrfs_extent_map_compression(em);
if (compress_type != BTRFS_COMPRESS_NONE) {
btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start);
btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
@@ -4778,7 +5205,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
struct btrfs_key key;
const u64 i_size = i_size_read(&inode->vfs_inode);
const u64 ino = btrfs_ino(inode);
- struct btrfs_path *dst_path = NULL;
+ BTRFS_PATH_AUTO_FREE(dst_path);
bool dropped_extents = false;
u64 truncate_offset = i_size;
struct extent_buffer *leaf;
@@ -4896,7 +5323,6 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
start_slot, ins_nr, 1, 0, ctx);
out:
btrfs_release_path(path);
- btrfs_free_path(dst_path);
return ret;
}
@@ -4947,7 +5373,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
list_sort(NULL, &extents, extent_cmp);
process:
while (!list_empty(&extents)) {
- em = list_entry(extents.next, struct extent_map, list);
+ em = list_first_entry(&extents, struct extent_map, list);
list_del_init(&em->list);
@@ -4956,8 +5382,8 @@ process:
* private list.
*/
if (ret) {
- clear_em_logging(inode, em);
- free_extent_map(em);
+ btrfs_clear_em_logging(inode, em);
+ btrfs_free_extent_map(em);
continue;
}
@@ -4965,8 +5391,8 @@ process:
ret = log_one_extent(trans, inode, em, path, ctx);
write_lock(&tree->lock);
- clear_em_logging(inode, em);
- free_extent_map(em);
+ btrfs_clear_em_logging(inode, em);
+ btrfs_free_extent_map(em);
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
@@ -4988,12 +5414,12 @@ process:
set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
- spin_lock_irq(&inode->ordered_tree_lock);
+ spin_lock(&inode->ordered_tree_lock);
if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
atomic_inc(&trans->transaction->pending_ordered);
}
- spin_unlock_irq(&inode->ordered_tree_lock);
+ spin_unlock(&inode->ordered_tree_lock);
}
btrfs_put_ordered_extent(ordered);
}
@@ -5268,9 +5694,8 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
struct btrfs_inode *inode,
u64 *other_ino, u64 *other_parent)
{
- int ret;
- struct btrfs_path *search_path;
- char *name = NULL;
+ BTRFS_PATH_AUTO_FREE(search_path);
+ char AUTO_KFREE(name);
u32 name_len = 0;
u32 item_size = btrfs_item_size(eb, slot);
u32 cur_offset = 0;
@@ -5279,8 +5704,8 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
search_path = btrfs_alloc_path();
if (!search_path)
return -ENOMEM;
- search_path->search_commit_root = 1;
- search_path->skip_locking = 1;
+ search_path->search_commit_root = true;
+ search_path->skip_locking = true;
while (cur_offset < item_size) {
u64 parent;
@@ -5313,10 +5738,8 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
char *new_name;
new_name = krealloc(name, this_name_len, GFP_NOFS);
- if (!new_name) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!new_name)
+ return -ENOMEM;
name_len = this_name_len;
name = new_name;
}
@@ -5334,29 +5757,24 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
di, &di_key);
if (di_key.type == BTRFS_INODE_ITEM_KEY) {
if (di_key.objectid != key->objectid) {
- ret = 1;
*other_ino = di_key.objectid;
*other_parent = parent;
+ return 1;
} else {
- ret = 0;
+ return 0;
}
} else {
- ret = -EAGAIN;
+ return -EAGAIN;
}
- goto out;
} else if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
+ return PTR_ERR(di);
}
btrfs_release_path(search_path);
cur_offset += this_len;
}
- ret = 0;
-out:
- btrfs_free_path(search_path);
- kfree(name);
- return ret;
+
+ return 0;
}
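
The name buffer in this function is grown with krealloc() and, with
AUTO_KFREE, freed automatically on every return, including the
realloc-failure path where the old buffer must stay alive long enough to be
freed. A userspace sketch of that wrinkle; the AUTO_FREE macro and
copy_names() are illustrative, not the kernel's definitions:

#include <stdlib.h>
#include <string.h>

static void auto_free(char **p)
{
	free(*p);
}

#define AUTO_FREE __attribute__((cleanup(auto_free)))

static int copy_names(const char *const *names, int n)
{
	char *buf AUTO_FREE = NULL;
	size_t cap = 0;

	for (int i = 0; i < n; i++) {
		size_t len = strlen(names[i]) + 1;

		if (len > cap) {
			char *tmp = realloc(buf, len);

			if (!tmp)
				return -1; /* old buf still freed by cleanup */
			buf = tmp;
			cap = len;
		}
		memcpy(buf, names[i], len);
		/* ... compare buf against each candidate ref ... */
	}
	return 0;
}
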
/*
@@ -5404,7 +5822,7 @@ struct btrfs_dir_list {
* See process_dir_items_leaf() for details about why it is needed.
* This is a recursive operation - if an existing dentry corresponds to a
* directory, that directory's new entries are logged too (same behaviour as
- * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
+ * ext3/4, xfs, f2fs, nilfs2). Note that when logging the inodes
* the dentries point to we do not acquire their VFS lock, otherwise lockdep
* complains about the following circular lock dependency / possible deadlock:
*
@@ -5606,8 +6024,8 @@ static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (WARN_ON_ONCE(ret > 0)) {
@@ -5627,8 +6045,8 @@ static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
}
btrfs_release_path(path);
- path->search_commit_root = 0;
- path->skip_locking = 0;
+ path->search_commit_root = false;
+ path->skip_locking = false;
return ret;
}
@@ -6082,8 +6500,7 @@ static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
if (!first)
return 0;
- ins_data = kmalloc(max_batch_size * sizeof(u32) +
- max_batch_size * sizeof(struct btrfs_key), GFP_NOFS);
+ ins_data = kmalloc_array(max_batch_size, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
if (!ins_data)
return -ENOMEM;
ins_sizes = (u32 *)ins_data;
@@ -6119,7 +6536,7 @@ static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
curr = list_next_entry(curr, log_list);
}
- ASSERT(batch.nr >= 1);
+ ASSERT(batch.nr >= 1, "batch.nr=%d", batch.nr);
ret = insert_delayed_items_batch(trans, log, path, &batch, first);
curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
@@ -6163,7 +6580,9 @@ static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
}
last_dir_index = curr->index;
- ASSERT(last_dir_index >= first_dir_index);
+ ASSERT(last_dir_index >= first_dir_index,
+ "last_dir_index=%llu first_dir_index=%llu",
+ last_dir_index, first_dir_index);
ret = insert_dir_log_key(trans, inode->root->log_root, path,
ino, first_dir_index, last_dir_index);
@@ -6257,7 +6676,9 @@ static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
goto next_batch;
last_dir_index = last->index;
- ASSERT(last_dir_index >= first_dir_index);
+ ASSERT(last_dir_index >= first_dir_index,
+ "last_dir_index=%llu first_dir_index=%llu",
+ last_dir_index, first_dir_index);
/*
* If this range starts right after where the previous one ends,
* then we want to reuse the previous range item and change its
@@ -6324,7 +6745,8 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
*/
lockdep_assert_not_held(&inode->log_mutex);
- ASSERT(!ctx->logging_new_delayed_dentries);
+ ASSERT(!ctx->logging_new_delayed_dentries,
+ "ctx->logging_new_delayed_dentries=%d", ctx->logging_new_delayed_dentries);
ctx->logging_new_delayed_dentries = true;
list_for_each_entry(item, delayed_ins_list, log_list) {
@@ -6583,6 +7005,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
btrfs_log_get_delayed_items(inode, &delayed_ins_list,
&delayed_del_list);
+ /*
+ * If we are fsyncing a file with 0 hard links, then commit the delayed
+ * inode because the last inode ref (or extref) item may still be in the
+ * subvolume tree and if we log it the file will still exist after a log
+ * replay. So commit the delayed inode to delete that last ref and we
+ * skip logging it.
+ */
+ if (inode->vfs_inode.i_nlink == 0) {
+ ret = btrfs_commit_inode_delayed_inode(inode);
+ if (ret)
+ goto out_unlock;
+ }
+
ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
path, dst_path, logged_isize,
inode_only, ctx,
@@ -6685,7 +7120,7 @@ log_extents:
* a power failure unless the log was synced as part of an fsync
* against any other unrelated inode.
*/
- if (inode_only != LOG_INODE_EXISTS)
+ if (!ctx->logging_new_name && inode_only != LOG_INODE_EXISTS)
inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
@@ -6724,7 +7159,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_root *root = inode->root;
const u64 ino = btrfs_ino(inode);
@@ -6732,15 +7167,15 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->skip_locking = 1;
- path->search_commit_root = 1;
+ path->skip_locking = true;
+ path->search_commit_root = true;
key.objectid = ino;
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
while (true) {
struct extent_buffer *leaf = path->nodes[0];
@@ -6752,8 +7187,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto out;
- else if (ret > 0)
+ return ret;
+ if (ret > 0)
break;
continue;
}
@@ -6766,28 +7201,24 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
item_size = btrfs_item_size(leaf, slot);
ptr = btrfs_item_ptr_offset(leaf, slot);
while (cur_offset < item_size) {
- struct btrfs_key inode_key;
+ u64 dir_id;
struct btrfs_inode *dir_inode;
- inode_key.type = BTRFS_INODE_ITEM_KEY;
- inode_key.offset = 0;
-
if (key.type == BTRFS_INODE_EXTREF_KEY) {
struct btrfs_inode_extref *extref;
extref = (struct btrfs_inode_extref *)
(ptr + cur_offset);
- inode_key.objectid = btrfs_inode_extref_parent(
- leaf, extref);
+ dir_id = btrfs_inode_extref_parent(leaf, extref);
cur_offset += sizeof(*extref);
cur_offset += btrfs_inode_extref_name_len(leaf,
extref);
} else {
- inode_key.objectid = key.offset;
+ dir_id = key.offset;
cur_offset = item_size;
}
- dir_inode = btrfs_iget_logging(inode_key.objectid, root);
+ dir_inode = btrfs_iget_logging(dir_id, root);
/*
* If the parent inode was deleted, return an error to
* fallback to a transaction commit. This is to prevent
@@ -6811,10 +7242,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
* at both parents and the old parent B would still
* exist.
*/
- if (IS_ERR(dir_inode)) {
- ret = PTR_ERR(dir_inode);
- goto out;
- }
+ if (IS_ERR(dir_inode))
+ return PTR_ERR(dir_inode);
if (!need_log_inode(trans, dir_inode)) {
btrfs_add_delayed_iput(dir_inode);
@@ -6827,14 +7256,11 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
ret = log_new_dir_dentries(trans, dir_inode, ctx);
btrfs_add_delayed_iput(dir_inode);
if (ret)
- goto out;
+ return ret;
}
path->slots[0]++;
}
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
static int log_new_ancestors(struct btrfs_trans_handle *trans,
@@ -6945,7 +7371,7 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = inode->root;
const u64 ino = btrfs_ino(inode);
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key search_key;
int ret;
@@ -6966,7 +7392,7 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
again:
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret == 0)
path->slots[0]++;
@@ -6978,8 +7404,8 @@ again:
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto out;
- else if (ret > 0)
+ return ret;
+ if (ret > 0)
break;
continue;
}
@@ -6996,10 +7422,8 @@ again:
* this loop, etc). So just return some error to fallback to
* a transaction commit.
*/
- if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
- ret = -EMLINK;
- goto out;
- }
+ if (found_key.type == BTRFS_INODE_EXTREF_KEY)
+ return -EMLINK;
/*
* Logging ancestors needs to do more searches on the fs/subvol
@@ -7011,14 +7435,11 @@ again:
ret = log_new_ancestors(trans, root, path, ctx);
if (ret)
- goto out;
+ return ret;
btrfs_release_path(path);
goto again;
}
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
/*
@@ -7051,14 +7472,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (btrfs_root_generation(&root->root_item) == trans->transid)
return BTRFS_LOG_FORCE_COMMIT;
- /*
- * Skip already logged inodes or inodes corresponding to tmpfiles
- * (since logging them is pointless, a link count of 0 means they
- * will never be accessible).
- */
- if ((btrfs_inode_in_log(inode, trans->transid) &&
- list_empty(&ctx->ordered_extents)) ||
- inode->vfs_inode.i_nlink == 0)
+	/* Skip inodes that were already logged and have no new ordered extents. */
+ if (btrfs_inode_in_log(inode, trans->transid) &&
+ list_empty(&ctx->ordered_extents))
return BTRFS_NO_LOG_SYNC;
ret = start_log_trans(trans, root, ctx);
@@ -7184,8 +7600,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
struct btrfs_key key;
- struct btrfs_key found_key;
- struct btrfs_root *log;
struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
struct walk_control wc = {
.process_func = process_one_buffer,
@@ -7205,10 +7619,12 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
}
wc.trans = trans;
- wc.pin = 1;
+ wc.pin = true;
+ wc.log = log_root_tree;
- ret = walk_log_tree(trans, log_root_tree, &wc);
- if (ret) {
+ ret = walk_log_tree(&wc);
+ wc.log = NULL;
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -7219,9 +7635,11 @@ again:
key.offset = (u64)-1;
while (1) {
+ struct btrfs_key found_key;
+
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -7236,17 +7654,22 @@ again:
if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
break;
- log = btrfs_read_tree_root(log_root_tree, &found_key);
- if (IS_ERR(log)) {
- ret = PTR_ERR(log);
+ wc.log = btrfs_read_tree_root(log_root_tree, &found_key);
+ if (IS_ERR(wc.log)) {
+ ret = PTR_ERR(wc.log);
+ wc.log = NULL;
btrfs_abort_transaction(trans, ret);
goto error;
}
- wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
- true);
- if (IS_ERR(wc.replay_dest)) {
- ret = PTR_ERR(wc.replay_dest);
+ wc.root = btrfs_get_fs_root(fs_info, found_key.offset, true);
+ if (IS_ERR(wc.root)) {
+ ret = PTR_ERR(wc.root);
+ wc.root = NULL;
+ if (unlikely(ret != -ENOENT)) {
+ btrfs_abort_transaction(trans, ret);
+ goto error;
+ }
/*
* We didn't find the subvol, likely because it was
@@ -7259,36 +7682,37 @@ again:
* block from being modified, and we'll just bail for
* each subsequent pass.
*/
- if (ret == -ENOENT)
- ret = btrfs_pin_extent_for_log_replay(trans, log->node);
- btrfs_put_root(log);
+ ret = btrfs_pin_extent_for_log_replay(trans, wc.log->node);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto error;
+ }
+ goto next;
+ }
- if (!ret)
- goto next;
+ wc.root->log_root = wc.log;
+ ret = btrfs_record_root_in_trans(trans, wc.root);
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
- goto error;
+ goto next;
}
- wc.replay_dest->log_root = log;
- ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
- if (ret)
- /* The loop needs to continue due to the root refs */
+ ret = walk_log_tree(&wc);
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
- else
- ret = walk_log_tree(trans, log, &wc);
-
- if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
- ret = fixup_inode_link_counts(trans, wc.replay_dest,
- path);
- if (ret)
- btrfs_abort_transaction(trans, ret);
+ goto next;
}
- if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
- struct btrfs_root *root = wc.replay_dest;
-
- btrfs_release_path(path);
+ if (wc.stage == LOG_WALK_REPLAY_ALL) {
+ struct btrfs_root *root = wc.root;
+ wc.subvol_path = path;
+ ret = fixup_inode_link_counts(&wc);
+ wc.subvol_path = NULL;
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto next;
+ }
/*
* We have just replayed everything, and the highest
* objectid of fs roots probably has changed in case
@@ -7298,17 +7722,21 @@ again:
* could only happen during mount.
*/
ret = btrfs_init_root_free_objectid(root);
- if (ret)
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
+ goto next;
+ }
}
-
- wc.replay_dest->log_root = NULL;
- btrfs_put_root(wc.replay_dest);
- btrfs_put_root(log);
+next:
+ if (wc.root) {
+ wc.root->log_root = NULL;
+ btrfs_put_root(wc.root);
+ }
+ btrfs_put_root(wc.log);
+ wc.log = NULL;
if (ret)
goto error;
-next:
if (found_key.offset == 0)
break;
key.offset = found_key.offset - 1;
@@ -7317,7 +7745,7 @@ next:
/* step one is to pin it all, step two is to replay just inodes */
if (wc.pin) {
- wc.pin = 0;
+ wc.pin = false;
wc.process_func = replay_one_buffer;
wc.stage = LOG_WALK_REPLAY_INODES;
goto again;
@@ -7335,14 +7763,13 @@ next:
if (ret)
return ret;
- log_root_tree->log_root = NULL;
clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
- btrfs_put_root(log_root_tree);
return 0;
error:
if (wc.trans)
btrfs_end_transaction(wc.trans);
+ btrfs_put_root(wc.log);
clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
btrfs_free_path(path);
return ret;
@@ -7439,6 +7866,8 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
* full log sync.
* Also we don't need to worry with renames, since btrfs_rename() marks the log
* for full commit when renaming a subvolume.
+ *
+ * Must be called before creating the subvolume entry in its parent directory.
*/
void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
struct btrfs_inode *dir)
@@ -7475,6 +7904,12 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
bool log_pinned = false;
int ret;
+ /* The inode has a new name (ref/extref), so make sure we log it. */
+ set_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
+
+ btrfs_init_log_ctx(&ctx, inode);
+ ctx.logging_new_name = true;
+
/*
* this will force the logging code to walk the dentry chain
* up for the file
@@ -7506,6 +7941,13 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
ret = 0;
/*
+ * Now that we know we need to update the log, allocate the scratch eb
+ * for the context before joining a log transaction below, as this can
+ * take time and therefore we could delay log commits from other tasks.
+ */
+ btrfs_init_log_ctx_scratch_eb(&ctx);
+
+ /*
* If we are doing a rename (old_dir is not NULL) from a directory that
* was previously logged, make sure that on log replay we get the old
* dir entry deleted. This is needed because we will also log the new
@@ -7517,12 +7959,21 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct fscrypt_name fname;
- ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);
+ ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX,
+ "old_dir_index=%llu", old_dir_index);
ret = fscrypt_setup_filename(&old_dir->vfs_inode,
&old_dentry->d_name, 0, &fname);
if (ret)
goto out;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ fscrypt_free_filename(&fname);
+ goto out;
+ }
+
/*
* We have two inodes to update in the log, the old directory and
* the inode that got renamed, so we must pin the log to prevent
@@ -7536,19 +7987,13 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* mark the log for a full commit.
*/
if (WARN_ON_ONCE(ret < 0)) {
+ btrfs_free_path(path);
fscrypt_free_filename(&fname);
goto out;
}
log_pinned = true;
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- fscrypt_free_filename(&fname);
- goto out;
- }
-
/*
* Other concurrent task might be logging the old directory,
* as it can be triggered when logging other inode that had or
@@ -7580,9 +8025,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
goto out;
}
- btrfs_init_log_ctx(&ctx, inode);
- ctx.logging_new_name = true;
- btrfs_init_log_ctx_scratch_eb(&ctx);
/*
* We don't care about the return value. If we fail to log the new name
* then we know the next attempt to sync the log will fallback to a full
@@ -7591,7 +8033,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* inconsistent state after a rename operation.
*/
btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
- free_extent_buffer(ctx.scratch_eb);
ASSERT(list_empty(&ctx.conflict_inodes));
out:
/*
@@ -7604,5 +8045,6 @@ out:
btrfs_set_log_full_commit(trans);
if (log_pinned)
btrfs_end_log_trans(root);
+ free_extent_buffer(ctx.scratch_eb);
}
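Note on the btrfs_log_new_name() reordering above: the log ctx is now initialized before any early exit, the scratch extent buffer is allocated before joining the log transaction (so a slow allocation cannot stall other log committers), and the buffer is freed on the single out path so every return releases it. A minimal userspace sketch of that shape, with hypothetical names standing in for the btrfs helpers:

	#include <stdlib.h>

	/* Hypothetical stand-ins for the scratch eb and the log transaction. */
	struct scratch { char buf[4096]; };
	static struct scratch *alloc_scratch(void) { return malloc(sizeof(struct scratch)); }
	static int join_log_transaction(void) { return 0; }
	static void leave_log_transaction(void) { }

	static int log_new_name(void)
	{
		struct scratch *eb;
		int ret;

		eb = alloc_scratch();		/* may be slow; done before joining */
		if (!eb)
			return -1;

		ret = join_log_transaction();	/* other tasks may be waiting on us */
		if (ret)
			goto out;
		/* ... log the new name using eb ... */
		leave_log_transaction();
	out:
		free(eb);			/* one exit path frees it on success and error */
		return ret;
	}

	int main(void) { return log_new_name(); }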
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index dc313e6bb2fa..41e47fda036d 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -8,8 +8,7 @@
#include <linux/list.h>
#include <linux/fs.h>
-#include "messages.h"
-#include "ctree.h"
+#include <linux/fscrypt.h>
#include "transaction.h"
struct inode;
@@ -80,13 +79,12 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
struct btrfs_log_ctx *ctx);
void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
const struct fscrypt_str *name,
struct btrfs_inode *dir, u64 index);
void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
const struct fscrypt_str *name,
- struct btrfs_inode *inode, u64 dirid);
+ struct btrfs_inode *inode,
+ struct btrfs_inode *dir);
void btrfs_end_log_trans(struct btrfs_root *root);
void btrfs_pin_log_trans(struct btrfs_root *root);
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c
index 1ac2678fc4ca..9e8cb3b7c064 100644
--- a/fs/btrfs/tree-mod-log.c
+++ b/fs/btrfs/tree-mod-log.c
@@ -27,18 +27,29 @@ struct tree_mod_elem {
/* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
u64 generation;
- /* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
- struct btrfs_disk_key key;
- u64 blockptr;
-
- /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
- struct {
- int dst_slot;
- int nr_items;
- } move;
-
- /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
- struct tree_mod_root old_root;
+ union {
+ /*
+ * This is used for the following op types:
+ *
+ * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING
+ * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING
+ * BTRFS_MOD_LOG_KEY_REMOVE
+ * BTRFS_MOD_LOG_KEY_REPLACE
+ */
+ struct {
+ struct btrfs_disk_key key;
+ u64 blockptr;
+ } slot_change;
+
+ /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
+ struct {
+ int dst_slot;
+ int nr_items;
+ } move;
+
+ /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
+ struct tree_mod_root old_root;
+ };
};
/*
@@ -164,6 +175,30 @@ static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
return 0;
}
+static inline bool skip_eb_logging(const struct extent_buffer *eb)
+{
+ const u64 owner = btrfs_header_owner(eb);
+
+ if (btrfs_header_level(eb) == 0)
+ return true;
+
+ /*
+ * Tree mod logging exists so that there's a consistent view of the
+ * extents and backrefs of inodes even if, while a task is iterating over
+ * them, other tasks are modifying subvolume trees and the extent tree
+ * (including running delayed refs). So we only need to log extent
+ * buffers from the extent tree and subvolume trees.
+ */
+
+ if (owner == BTRFS_EXTENT_TREE_OBJECTID)
+ return false;
+
+ if (btrfs_is_fstree(owner))
+ return false;
+
+ return true;
+}
+
/*
* Determines if logging can be omitted. Returns true if it can. Otherwise, it
* returns false with the tree_mod_log_lock acquired. The caller must hold
@@ -174,7 +209,7 @@ static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, const struct extent
{
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
return true;
- if (eb && btrfs_header_level(eb) == 0)
+ if (eb && skip_eb_logging(eb))
return true;
write_lock(&fs_info->tree_mod_log_lock);
@@ -192,7 +227,7 @@ static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
{
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
return false;
- if (eb && btrfs_header_level(eb) == 0)
+ if (eb && skip_eb_logging(eb))
return false;
return true;
@@ -204,15 +239,17 @@ static struct tree_mod_elem *alloc_tree_mod_elem(const struct extent_buffer *eb,
{
struct tree_mod_elem *tm;
+ /* Can't be one of these types, due to the union in struct tree_mod_elem. */
+ ASSERT(op != BTRFS_MOD_LOG_MOVE_KEYS);
+ ASSERT(op != BTRFS_MOD_LOG_ROOT_REPLACE);
+
tm = kzalloc(sizeof(*tm), GFP_NOFS);
if (!tm)
return NULL;
tm->logical = eb->start;
- if (op != BTRFS_MOD_LOG_KEY_ADD) {
- btrfs_node_key(eb, &tm->key, slot);
- tm->blockptr = btrfs_node_blockptr(eb, slot);
- }
+ btrfs_node_key(eb, &tm->slot_change.key, slot);
+ tm->slot_change.blockptr = btrfs_node_blockptr(eb, slot);
tm->op = op;
tm->slot = slot;
tm->generation = btrfs_node_ptr_generation(eb, slot);
@@ -830,8 +867,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
fallthrough;
case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
case BTRFS_MOD_LOG_KEY_REMOVE:
- btrfs_set_node_key(eb, &tm->key, tm->slot);
- btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr);
btrfs_set_node_ptr_generation(eb, tm->slot,
tm->generation);
n++;
@@ -840,8 +877,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
break;
case BTRFS_MOD_LOG_KEY_REPLACE:
BUG_ON(tm->slot >= n);
- btrfs_set_node_key(eb, &tm->key, tm->slot);
- btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr);
btrfs_set_node_ptr_generation(eb, tm->slot,
tm->generation);
break;
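Note on the tree_mod_elem union above: the three payloads belong to mutually exclusive op types, so overlapping them shrinks every element allocated by alloc_tree_mod_elem(). A standalone sketch of the size effect, using simplified hypothetical types rather than the real btrfs structures:

	#include <stdio.h>

	struct key { unsigned long long objectid, offset; unsigned char type; };

	/* Old shape: payloads for all op types laid out side by side. */
	struct elem_old {
		struct key key;
		unsigned long long blockptr;
		struct { int dst_slot, nr_items; } move;
		struct { unsigned long long logical; unsigned char level; } old_root;
	};

	/* New shape: only one payload is live at a time, so they share storage. */
	struct elem_new {
		union {
			struct {
				struct key key;
				unsigned long long blockptr;
			} slot_change;
			struct { int dst_slot, nr_items; } move;
			struct { unsigned long long logical; unsigned char level; } old_root;
		};
	};

	int main(void)
	{
		/* The union is only as large as its biggest member. */
		printf("old=%zu new=%zu\n", sizeof(struct elem_old), sizeof(struct elem_new));
		return 0;
	}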
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index fc59b57257d6..7e16a253fb35 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -129,21 +129,25 @@ void ulist_free(struct ulist *ulist)
kfree(ulist);
}
+static int ulist_node_val_key_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *val = key;
+ const struct ulist_node *unode = rb_entry(node, struct ulist_node, rb_node);
+
+ if (unode->val < *val)
+ return 1;
+ else if (unode->val > *val)
+ return -1;
+
+ return 0;
+}
+
static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val)
{
- struct rb_node *n = ulist->root.rb_node;
- struct ulist_node *u = NULL;
-
- while (n) {
- u = rb_entry(n, struct ulist_node, rb_node);
- if (u->val < val)
- n = n->rb_right;
- else if (u->val > val)
- n = n->rb_left;
- else
- return u;
- }
- return NULL;
+ struct rb_node *node;
+
+ node = rb_find(&val, &ulist->root, ulist_node_val_key_cmp);
+ return rb_entry_safe(node, struct ulist_node, rb_node);
}
static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node)
@@ -155,25 +159,20 @@ static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node)
ulist->nnodes--;
}
+static int ulist_node_val_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct ulist_node *unode = rb_entry(new, struct ulist_node, rb_node);
+
+ return ulist_node_val_key_cmp(&unode->val, existing);
+}
+
static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins)
{
- struct rb_node **p = &ulist->root.rb_node;
- struct rb_node *parent = NULL;
- struct ulist_node *cur = NULL;
-
- while (*p) {
- parent = *p;
- cur = rb_entry(parent, struct ulist_node, rb_node);
-
- if (cur->val < ins->val)
- p = &(*p)->rb_right;
- else if (cur->val > ins->val)
- p = &(*p)->rb_left;
- else
- return -EEXIST;
- }
- rb_link_node(&ins->rb_node, parent, p);
- rb_insert_color(&ins->rb_node, &ulist->root);
+ struct rb_node *node;
+
+ node = rb_find_add(&ins->rb_node, &ulist->root, ulist_node_val_cmp);
+ if (node)
+ return -EEXIST;
return 0;
}
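Note on the rb_find()/rb_find_add() conversion above: both helpers take a three-way comparator whose sign convention is "key compared to node", so a positive result descends right and a negative result descends left. A sketch of that contract with a hypothetical my_item type (kernel style, not a standalone program):

	static int key_cmp(const void *key, const struct rb_node *node)
	{
		const u64 *val = key;
		const struct my_item *item = rb_entry(node, struct my_item, rb_node);

		if (item->val < *val)
			return 1;	/* key sorts after this node: go right */
		if (item->val > *val)
			return -1;	/* key sorts before this node: go left */
		return 0;
	}

rb_find() returns the matching node or NULL; rb_find_add() links and recolors the new node and returns NULL, or returns the existing node on a collision, which is why ulist_rbtree_insert() maps a non-NULL result to -EEXIST.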
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 17b5e81123a1..e3a1310fa7d5 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -27,32 +27,26 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, const u8 *uuid,
u8 type, u64 subid)
{
int ret;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *eb;
int slot;
u32 item_size;
unsigned long offset;
struct btrfs_key key;
- if (WARN_ON_ONCE(!uuid_root)) {
- ret = -ENOENT;
- goto out;
- }
+ if (WARN_ON_ONCE(!uuid_root))
+ return -ENOENT;
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
btrfs_uuid_to_key(uuid, type, &key);
ret = btrfs_search_slot(NULL, uuid_root, &key, path, 0, 0);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
+ if (ret < 0)
+ return ret;
+ if (ret > 0)
+ return -ENOENT;
eb = path->nodes[0];
slot = path->slots[0];
@@ -64,7 +58,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, const u8 *uuid,
btrfs_warn(uuid_root->fs_info,
"uuid item with illegal size %lu!",
(unsigned long)item_size);
- goto out;
+ return ret;
}
while (item_size) {
__le64 data;
@@ -78,8 +72,6 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, const u8 *uuid,
item_size -= sizeof(data);
}
-out:
- btrfs_free_path(path);
return ret;
}
@@ -89,7 +81,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *uuid_root = fs_info->uuid_root;
int ret;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *eb;
int slot;
@@ -100,18 +92,14 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ
if (ret != -ENOENT)
return ret;
- if (WARN_ON_ONCE(!uuid_root)) {
- ret = -EINVAL;
- goto out;
- }
+ if (WARN_ON_ONCE(!uuid_root))
+ return -EINVAL;
btrfs_uuid_to_key(uuid, type, &key);
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
ret = btrfs_insert_empty_item(trans, uuid_root, path, &key,
sizeof(subid_le));
@@ -134,15 +122,12 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ
btrfs_warn(fs_info,
"insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!",
ret, key.objectid, key.offset, type);
- goto out;
+ return ret;
}
- ret = 0;
subid_le = cpu_to_le64(subid_cpu);
write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le));
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type,
@@ -151,7 +136,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *uuid_root = fs_info->uuid_root;
int ret;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *eb;
int slot;
@@ -161,29 +146,23 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8
unsigned long move_src;
unsigned long move_len;
- if (WARN_ON_ONCE(!uuid_root)) {
- ret = -EINVAL;
- goto out;
- }
+ if (WARN_ON_ONCE(!uuid_root))
+ return -EINVAL;
btrfs_uuid_to_key(uuid, type, &key);
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1);
if (ret < 0) {
btrfs_warn(fs_info, "error %d while searching for uuid item!",
ret);
- goto out;
- }
- if (ret > 0) {
- ret = -ENOENT;
- goto out;
+ return ret;
}
+ if (ret > 0)
+ return -ENOENT;
eb = path->nodes[0];
slot = path->slots[0];
@@ -192,8 +171,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8
if (!IS_ALIGNED(item_size, sizeof(u64))) {
btrfs_warn(fs_info, "uuid item with illegal size %lu!",
(unsigned long)item_size);
- ret = -ENOENT;
- goto out;
+ return -ENOENT;
}
while (item_size) {
__le64 read_subid;
@@ -205,16 +183,12 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8
item_size -= sizeof(read_subid);
}
- if (!item_size) {
- ret = -ENOENT;
- goto out;
- }
+ if (!item_size)
+ return -ENOENT;
item_size = btrfs_item_size(eb, slot);
- if (item_size == sizeof(subid)) {
- ret = btrfs_del_item(trans, uuid_root, path);
- goto out;
- }
+ if (item_size == sizeof(subid))
+ return btrfs_del_item(trans, uuid_root, path);
move_dst = offset;
move_src = offset + sizeof(subid);
@@ -222,9 +196,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8
memmove_extent_buffer(eb, move_dst, move_src, move_len);
btrfs_truncate_item(trans, path, item_size - sizeof(subid), 1);
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type,
@@ -293,7 +265,7 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->uuid_root;
struct btrfs_key key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret = 0;
struct extent_buffer *leaf;
int slot;
@@ -301,10 +273,8 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info)
unsigned long offset;
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
key.objectid = 0;
key.type = 0;
@@ -312,17 +282,15 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info)
again_search_slot:
ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
- if (ret) {
- if (ret > 0)
- ret = 0;
- goto out;
- }
+ if (ret < 0)
+ return ret;
+ if (ret > 0)
+ return 0;
while (1) {
- if (btrfs_fs_closing(fs_info)) {
- ret = -EINTR;
- goto out;
- }
+ if (btrfs_fs_closing(fs_info))
+ return -EINTR;
+
cond_resched();
leaf = path->nodes[0];
slot = path->slots[0];
@@ -353,7 +321,7 @@ again_search_slot:
ret = btrfs_check_uuid_tree_entry(fs_info, uuid,
key.type, subid_cpu);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0) {
btrfs_release_path(path);
ret = btrfs_uuid_iter_rem(root, uuid, key.type,
@@ -369,7 +337,7 @@ again_search_slot:
goto again_search_slot;
}
if (ret < 0 && ret != -ENOENT)
- goto out;
+ return ret;
key.offset++;
goto again_search_slot;
}
@@ -386,8 +354,6 @@ skip:
break;
}
-out:
- btrfs_free_path(path);
return ret;
}
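Note on the BTRFS_PATH_AUTO_FREE conversions above: the path is released when it goes out of scope, which is what lets every "goto out" collapse into a direct return. Presumably this builds on the compiler's cleanup attribute; a runnable userspace sketch of that mechanism, with hypothetical names:

	#include <stdlib.h>

	struct path { int dummy; };

	static void free_path(struct path **p)
	{
		free(*p);		/* free(NULL) is a no-op, like btrfs_free_path() */
	}

	#define PATH_AUTO_FREE(name) \
		struct path *name __attribute__((cleanup(free_path))) = NULL

	static int lookup_something(void)
	{
		PATH_AUTO_FREE(path);

		path = malloc(sizeof(*path));
		if (!path)
			return -1;
		/* ... search using path; any return below frees it automatically ... */
		return 0;
	}

	int main(void) { return lookup_something(); }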
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index b7a96a005487..a2ac3fb68bc8 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -109,7 +109,7 @@ static int drop_verity_items(struct btrfs_inode *inode, u8 key_type)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = inode->root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int count = 0;
int ret;
@@ -121,10 +121,8 @@ static int drop_verity_items(struct btrfs_inode *inode, u8 key_type)
while (1) {
/* 1 for the item being dropped */
trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out;
- }
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
/*
* Walk backwards through all the items until we find one that
@@ -143,7 +141,7 @@ static int drop_verity_items(struct btrfs_inode *inode, u8 key_type)
path->slots[0]--;
} else if (ret < 0) {
btrfs_end_transaction(trans);
- goto out;
+ return ret;
}
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -161,17 +159,14 @@ static int drop_verity_items(struct btrfs_inode *inode, u8 key_type)
ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
if (ret) {
btrfs_end_transaction(trans);
- goto out;
+ return ret;
}
count++;
btrfs_release_path(path);
btrfs_end_transaction(trans);
}
- ret = count;
btrfs_end_transaction(trans);
-out:
- btrfs_free_path(path);
- return ret;
+ return count;
}
/*
@@ -217,7 +212,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
const char *src, u64 len)
{
struct btrfs_trans_handle *trans;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -233,10 +228,8 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
while (len > 0) {
/* 1 for the new item being inserted */
trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- break;
- }
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
key.objectid = btrfs_ino(inode);
key.type = key_type;
@@ -267,7 +260,6 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
btrfs_end_transaction(trans);
}
- btrfs_free_path(path);
return ret;
}
@@ -296,7 +288,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
char *dest, u64 len, struct folio *dest_folio)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -404,7 +396,6 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
}
}
out:
- btrfs_free_path(path);
if (!ret)
ret = copied;
return ret;
@@ -487,12 +478,12 @@ static int rollback_verity(struct btrfs_inode *inode)
inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
btrfs_sync_inode_flags_to_i_flags(inode);
ret = btrfs_update_inode(trans, inode);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = del_orphan(trans, inode);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -587,6 +578,9 @@ static int btrfs_begin_enable_verity(struct file *filp)
btrfs_assert_inode_locked(inode);
+ if (IS_ENCRYPTED(&inode->vfs_inode))
+ return -EOPNOTSUPP;
+
if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags))
return -EBUSY;
@@ -676,11 +670,11 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
if (ret < 0)
return ret;
- if (item.reserved[0] != 0 || item.reserved[1] != 0)
+ if (unlikely(item.reserved[0] != 0 || item.reserved[1] != 0))
return -EUCLEAN;
true_size = btrfs_stack_verity_descriptor_size(&item);
- if (true_size > INT_MAX)
+ if (unlikely(true_size > INT_MAX))
return -EUCLEAN;
if (buf_size == 0)
@@ -742,7 +736,7 @@ again:
}
folio = filemap_alloc_folio(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS),
- 0);
+ 0, NULL);
if (!folio)
return ERR_PTR(-ENOMEM);
@@ -802,6 +796,8 @@ static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf,
}
const struct fsverity_operations btrfs_verityops = {
+ .inode_info_offs = (int)offsetof(struct btrfs_inode, i_verity_info) -
+ (int)offsetof(struct btrfs_inode, vfs_inode),
.begin_enable_verity = btrfs_begin_enable_verity,
.end_enable_verity = btrfs_end_enable_verity,
.get_verity_descriptor = btrfs_get_verity_descriptor,
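Note on ->inode_info_offs above: recording the signed distance between the embedded VFS inode and the filesystem's verity-info pointer presumably lets generic fs-verity code reach that pointer with plain pointer arithmetic instead of a per-fs callback. A runnable sketch of the assumed offset math, with simplified stand-in types:

	#include <stdio.h>
	#include <stddef.h>

	struct inode { int dummy; };		/* stand-in for the VFS inode */
	struct fs_inode {			/* stand-in for struct btrfs_inode */
		void *i_verity_info;
		struct inode vfs_inode;
	};

	int main(void)
	{
		struct fs_inode bi = { .i_verity_info = (void *)0x1234 };
		struct inode *inode = &bi.vfs_inode;	/* what generic code holds */

		/* Same computation as the initializer above, on the stand-in type. */
		int offs = (int)offsetof(struct fs_inode, i_verity_info) -
			   (int)offsetof(struct fs_inode, vfs_inode);

		void **info = (void **)((char *)inode + offs);
		printf("recovered %p\n", *info);	/* prints 0x1234 */
		return 0;
	}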
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c8c21c55be53..ae1742a35e76 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -18,7 +18,6 @@
#include "transaction.h"
#include "volumes.h"
#include "raid56.h"
-#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
@@ -214,10 +213,8 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
u64 flags = bg_flags;
u32 size_bp = size_buf;
- if (!flags) {
- strcpy(bp, "NONE");
+ if (!flags)
return;
- }
#define DESCRIBE_FLAG(flag, desc) \
do { \
@@ -403,8 +400,12 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
static void btrfs_free_device(struct btrfs_device *device)
{
WARN_ON(!list_empty(&device->post_commit_list));
- rcu_string_free(device->name);
- extent_io_tree_release(&device->alloc_state);
+ /*
+ * No need for kfree_rcu() or RCU locking here, as nothing else is
+ * reading the device name.
+ */
+ kfree(rcu_dereference_raw(device->name));
+ btrfs_extent_io_tree_release(&device->alloc_state);
btrfs_destroy_dev_zone_info(device);
kfree(device);
}
@@ -414,9 +415,10 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
struct btrfs_device *device;
WARN_ON(fs_devices->opened);
+ WARN_ON(fs_devices->holding);
while (!list_empty(&fs_devices->devices)) {
- device = list_entry(fs_devices->devices.next,
- struct btrfs_device, dev_list);
+ device = list_first_entry(&fs_devices->devices,
+ struct btrfs_device, dev_list);
list_del(&device->dev_list);
btrfs_free_device(device);
}
@@ -428,8 +430,8 @@ void __exit btrfs_cleanup_fs_uuids(void)
struct btrfs_fs_devices *fs_devices;
while (!list_empty(&fs_uuids)) {
- fs_devices = list_entry(fs_uuids.next,
- struct btrfs_fs_devices, fs_list);
+ fs_devices = list_first_entry(&fs_uuids, struct btrfs_fs_devices,
+ fs_list);
list_del(&fs_devices->fs_list);
free_fs_devices(fs_devices);
}
@@ -473,7 +475,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
struct block_device *bdev;
int ret;
- *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL);
+ *bdev_file = bdev_file_open_by_path(device_path, flags, holder, &fs_holder_ops);
if (IS_ERR(*bdev_file)) {
ret = PTR_ERR(*bdev_file);
@@ -488,15 +490,15 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
if (holder) {
ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE);
if (ret) {
- fput(*bdev_file);
+ bdev_fput(*bdev_file);
goto error;
}
}
invalidate_bdev(bdev);
- *disk_super = btrfs_read_dev_super(bdev);
+ *disk_super = btrfs_read_disk_super(bdev, 0, false);
if (IS_ERR(*disk_super)) {
ret = PTR_ERR(*disk_super);
- fput(*bdev_file);
+ bdev_fput(*bdev_file);
goto error;
}
@@ -541,7 +543,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
continue;
if (devt && devt != device->devt)
continue;
- if (fs_devices->opened) {
+ if (fs_devices->opened || fs_devices->holding) {
if (devt)
ret = -EBUSY;
break;
@@ -657,7 +659,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
if (!device->name)
return -EINVAL;
- ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
+ ret = btrfs_get_bdev_and_sb(rcu_dereference_raw(device->name), flags, holder, 1,
&bdev_file, &disk_super);
if (ret)
return ret;
@@ -674,8 +676,8 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
if (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
- pr_err(
- "BTRFS: Invalid seeding and uuid-changed device detected\n");
+ btrfs_err(NULL,
+ "invalid seeding and uuid-changed device detected");
goto error_free_page;
}
@@ -701,7 +703,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
if (device->devt != device->bdev->bd_dev) {
btrfs_warn(NULL,
"device %s maj:min changed from %d:%d to %d:%d",
- device->name->str, MAJOR(device->devt),
+ rcu_dereference_raw(device->name), MAJOR(device->devt),
MINOR(device->devt), MAJOR(device->bdev->bd_dev),
MINOR(device->bdev->bd_dev));
@@ -720,7 +722,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
error_free_page:
btrfs_release_disk_super(disk_super);
- fput(bdev_file);
+ bdev_fput(bdev_file);
return -EINVAL;
}
@@ -733,87 +735,11 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb)
return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
}
-/*
- * We can have very weird soft links passed in.
- * One example is "/proc/self/fd/<fd>", which can be a soft link to
- * a block device.
- *
- * But it's never a good idea to use those weird names.
- * Here we check if the path (not following symlinks) is a good one inside
- * "/dev/".
- */
-static bool is_good_dev_path(const char *dev_path)
-{
- struct path path = { .mnt = NULL, .dentry = NULL };
- char *path_buf = NULL;
- char *resolved_path;
- bool is_good = false;
- int ret;
-
- if (!dev_path)
- goto out;
-
- path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!path_buf)
- goto out;
-
- /*
- * Do not follow soft link, just check if the original path is inside
- * "/dev/".
- */
- ret = kern_path(dev_path, 0, &path);
- if (ret)
- goto out;
- resolved_path = d_path(&path, path_buf, PATH_MAX);
- if (IS_ERR(resolved_path))
- goto out;
- if (strncmp(resolved_path, "/dev/", strlen("/dev/")))
- goto out;
- is_good = true;
-out:
- kfree(path_buf);
- path_put(&path);
- return is_good;
-}
-
-static int get_canonical_dev_path(const char *dev_path, char *canonical)
-{
- struct path path = { .mnt = NULL, .dentry = NULL };
- char *path_buf = NULL;
- char *resolved_path;
- int ret;
-
- if (!dev_path) {
- ret = -EINVAL;
- goto out;
- }
-
- path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!path_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = kern_path(dev_path, LOOKUP_FOLLOW, &path);
- if (ret)
- goto out;
- resolved_path = d_path(&path, path_buf, PATH_MAX);
- if (IS_ERR(resolved_path)) {
- ret = PTR_ERR(resolved_path);
- goto out;
- }
- ret = strscpy(canonical, resolved_path, PATH_MAX);
-out:
- kfree(path_buf);
- path_put(&path);
- return ret;
-}
-
static bool is_same_device(struct btrfs_device *device, const char *new_path)
{
struct path old = { .mnt = NULL, .dentry = NULL };
struct path new = { .mnt = NULL, .dentry = NULL };
- char *old_path = NULL;
+ char AUTO_KFREE(old_path);
bool is_same = false;
int ret;
@@ -825,7 +751,7 @@ static bool is_same_device(struct btrfs_device *device, const char *new_path)
goto out;
rcu_read_lock();
- ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX);
+ ret = strscpy(old_path, rcu_dereference(device->name), PATH_MAX);
rcu_read_unlock();
if (ret < 0)
goto out;
@@ -839,7 +765,6 @@ static bool is_same_device(struct btrfs_device *device, const char *new_path)
if (path_equal(&old, &new))
is_same = true;
out:
- kfree(old_path);
path_put(&old);
path_put(&new);
return is_same;
@@ -858,11 +783,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,
{
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices = NULL;
- struct rcu_string *name;
+ const char *name;
u64 found_transid = btrfs_super_generation(disk_super);
u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_t path_devt;
- int error;
+ int ret;
bool same_fsid_diff_dev = false;
bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
@@ -874,11 +799,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,
return ERR_PTR(-EAGAIN);
}
- error = lookup_bdev(path, &path_devt);
- if (error) {
+ ret = lookup_bdev(path, &path_devt);
+ if (ret) {
btrfs_err(NULL, "failed to lookup block device for path %s: %d",
- path, error);
- return ERR_PTR(error);
+ path, ret);
+ return ERR_PTR(ret);
}
fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev);
@@ -895,7 +820,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (same_fsid_diff_dev) {
generate_random_uuid(fs_devices->fsid);
fs_devices->temp_fsid = true;
- pr_info("BTRFS: device %s (%d:%d) using temp-fsid %pU\n",
+ btrfs_info(NULL, "device %s (%d:%d) using temp-fsid %pU",
path, MAJOR(path_devt), MINOR(path_devt),
fs_devices->fsid);
}
@@ -966,6 +891,8 @@ static noinline struct btrfs_device *device_list_add(const char *path,
current->comm, task_pid_nr(current));
} else if (!device->name || !is_same_device(device, path)) {
+ const char *old_name;
+
/*
* When FS is already mounted.
* 1. If you are here and if the device->name is NULL that
@@ -1019,27 +946,31 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (device->bdev) {
if (device->devt != path_devt) {
mutex_unlock(&fs_devices->device_list_mutex);
- btrfs_warn_in_rcu(NULL,
+ btrfs_warn(NULL,
"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
path, devid, found_transid,
current->comm,
task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
- btrfs_info_in_rcu(NULL,
+ btrfs_info(NULL,
"devid %llu device path %s changed to %s scanned by %s (%d)",
devid, btrfs_dev_name(device),
path, current->comm,
task_pid_nr(current));
}
- name = rcu_string_strdup(path, GFP_NOFS);
+ name = kstrdup(path, GFP_NOFS);
if (!name) {
mutex_unlock(&fs_devices->device_list_mutex);
return ERR_PTR(-ENOMEM);
}
- rcu_string_free(device->name);
+ rcu_read_lock();
+ old_name = rcu_dereference(device->name);
+ rcu_read_unlock();
rcu_assign_pointer(device->name, name);
+ kfree_rcu_mightsleep(old_name);
+
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
fs_devices->missing_devices--;
clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
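Note on the device->name handling above: with rcu_string gone, the name is a plain RCU-managed char pointer, so a rename publishes the new string and retires the old one only after a grace period. The assumed pattern, sketched in kernel style (not a standalone program):

	/* Reader side: */
	rcu_read_lock();
	name = rcu_dereference(device->name);
	/* ... use name without sleeping ... */
	rcu_read_unlock();

	/* Updater side, as in the hunk above: */
	rcu_assign_pointer(device->name, new_name);
	kfree_rcu_mightsleep(old_name);	/* kfree() only after all readers finish */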
@@ -1088,7 +1019,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
* uuid mutex so nothing we touch in here is going to disappear.
*/
if (orig_dev->name)
- dev_path = orig_dev->name->str;
+ dev_path = rcu_dereference_raw(orig_dev->name);
device = btrfs_alloc_device(NULL, &orig_dev->devid,
orig_dev->uuid, dev_path);
@@ -1146,7 +1077,7 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
continue;
if (device->bdev_file) {
- fput(device->bdev_file);
+ bdev_fput(device->bdev_file);
device->bdev = NULL;
device->bdev_file = NULL;
fs_devices->open_devices--;
@@ -1193,7 +1124,7 @@ static void btrfs_close_bdev(struct btrfs_device *device)
invalidate_bdev(device->bdev);
}
- fput(device->bdev_file);
+ bdev_fput(device->bdev_file);
}
static void btrfs_close_one_device(struct btrfs_device *device)
@@ -1225,7 +1156,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)
device->fs_info = NULL;
atomic_set(&device->dev_stats_ccnt, 0);
- extent_io_tree_release(&device->alloc_state);
+ btrfs_extent_io_tree_release(&device->alloc_state);
/*
* Reset the flush error record. We might have a transient flush error
@@ -1273,7 +1204,7 @@ void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
mutex_lock(&uuid_mutex);
close_fs_devices(fs_devices);
- if (!fs_devices->opened) {
+ if (!fs_devices->opened && !fs_devices->holding) {
list_splice_init(&fs_devices->seed_list, &list);
/*
@@ -1401,48 +1332,58 @@ void btrfs_release_disk_super(struct btrfs_super_block *super)
put_page(page);
}
-static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
- u64 bytenr, u64 bytenr_orig)
+struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
+ int copy_num, bool drop_cache)
{
- struct btrfs_super_block *disk_super;
+ struct btrfs_super_block *super;
struct page *page;
- void *p;
- pgoff_t index;
+ u64 bytenr, bytenr_orig;
+ struct address_space *mapping = bdev->bd_mapping;
+ int ret;
- /* make sure our super fits in the device */
- if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
- return ERR_PTR(-EINVAL);
+ bytenr_orig = btrfs_sb_offset(copy_num);
+ ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
+ if (ret < 0) {
+ if (ret == -ENOENT)
+ ret = -EINVAL;
+ return ERR_PTR(ret);
+ }
- /* make sure our super fits in the page */
- if (sizeof(*disk_super) > PAGE_SIZE)
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
return ERR_PTR(-EINVAL);
- /* make sure our super doesn't straddle pages on disk */
- index = bytenr >> PAGE_SHIFT;
- if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
- return ERR_PTR(-EINVAL);
+ if (drop_cache) {
+ /* This should only be called with the primary sb. */
+ ASSERT(copy_num == 0);
- /* pull in the page with our super */
- page = read_cache_page_gfp(bdev->bd_mapping, index, GFP_KERNEL);
+ /*
+ * Drop the page of the primary superblock so that a later read
+ * always comes from the device.
+ */
+ invalidate_inode_pages2_range(mapping, bytenr >> PAGE_SHIFT,
+ (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT);
+ }
+ page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
if (IS_ERR(page))
return ERR_CAST(page);
- p = page_address(page);
-
- /* align our pointer to the offset of the super block */
- disk_super = p + offset_in_page(bytenr);
-
- if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
- btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
- btrfs_release_disk_super(p);
+ super = page_address(page);
+ if (btrfs_super_magic(super) != BTRFS_MAGIC ||
+ btrfs_super_bytenr(super) != bytenr_orig) {
+ btrfs_release_disk_super(super);
return ERR_PTR(-EINVAL);
}
- if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
- disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
+ /*
+ * Make sure the last byte of the label is NUL terminated. We print the
+ * label with '%s', so if it is not NUL terminated we could read beyond
+ * the end of the label.
+ */
+ if (super->label[0] && super->label[BTRFS_LABEL_SIZE - 1])
+ super->label[BTRFS_LABEL_SIZE - 1] = 0;
- return disk_super;
+ return super;
}
int btrfs_forget_devices(dev_t devt)
@@ -1480,7 +1421,7 @@ static bool btrfs_skip_registration(struct btrfs_super_block *disk_super,
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (device->bdev && (device->bdev->bd_dev == devt) &&
- strcmp(device->name->str, path) != 0) {
+ strcmp(rcu_dereference_raw(device->name), path) != 0) {
mutex_unlock(&fs_devices->device_list_mutex);
/* Do not skip registration. */
@@ -1506,30 +1447,17 @@ static bool btrfs_skip_registration(struct btrfs_super_block *disk_super,
* the device or return an error. Multi-device and seeding devices are registered
* in both cases.
*/
-struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
+struct btrfs_device *btrfs_scan_one_device(const char *path,
bool mount_arg_dev)
{
struct btrfs_super_block *disk_super;
bool new_device_added = false;
struct btrfs_device *device = NULL;
struct file *bdev_file;
- char *canonical_path = NULL;
- u64 bytenr;
dev_t devt;
- int ret;
lockdep_assert_held(&uuid_mutex);
- if (!is_good_dev_path(path)) {
- canonical_path = kmalloc(PATH_MAX, GFP_KERNEL);
- if (canonical_path) {
- ret = get_canonical_dev_path(path, canonical_path);
- if (ret < 0) {
- kfree(canonical_path);
- canonical_path = NULL;
- }
- }
- }
/*
* Avoid an exclusive open here, as the systemd-udev may initiate the
* device scan which may race with the user's mount or mkfs command,
@@ -1540,24 +1468,11 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
* values temporarily, as the device paths of the fsid are the only
* required information for assembling the volume.
*/
- bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL);
+ bdev_file = bdev_file_open_by_path(path, BLK_OPEN_READ, NULL, NULL);
if (IS_ERR(bdev_file))
return ERR_CAST(bdev_file);
- /*
- * We would like to check all the super blocks, but doing so would
- * allow a mount to succeed after a mkfs from a different filesystem.
- * Currently, recovery from a bad primary btrfs superblock is done
- * using the userspace command 'btrfs check --super'.
- */
- ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr);
- if (ret) {
- device = ERR_PTR(ret);
- goto error_bdev_put;
- }
-
- disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr,
- btrfs_sb_offset(0));
+ disk_super = btrfs_read_disk_super(file_bdev(bdev_file), 0, false);
if (IS_ERR(disk_super)) {
device = ERR_CAST(disk_super);
goto error_bdev_put;
@@ -1565,7 +1480,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
devt = file_bdev(bdev_file)->bd_dev;
if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) {
- pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n",
+ btrfs_debug(NULL, "skip registering single non-seed device %s (%d:%d)",
path, MAJOR(devt), MINOR(devt));
btrfs_free_stale_devices(devt, NULL);
@@ -1574,8 +1489,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
goto free_disk_super;
}
- device = device_list_add(canonical_path ? : path, disk_super,
- &new_device_added);
+ device = device_list_add(path, disk_super, &new_device_added);
if (!IS_ERR(device) && new_device_added)
btrfs_free_stale_devices(device->devt, device);
@@ -1583,8 +1497,7 @@ free_disk_super:
btrfs_release_disk_super(disk_super);
error_bdev_put:
- fput(bdev_file);
- kfree(canonical_path);
+ bdev_fput(bdev_file);
return device;
}
@@ -1600,9 +1513,9 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
lockdep_assert_held(&device->fs_info->chunk_mutex);
- if (find_first_extent_bit(&device->alloc_state, *start,
- &physical_start, &physical_end,
- CHUNK_ALLOCATED, NULL)) {
+ if (btrfs_find_first_extent_bit(&device->alloc_state, *start,
+ &physical_start, &physical_end,
+ CHUNK_ALLOCATED, NULL)) {
if (in_range(physical_start, *start, len) ||
in_range(*start, physical_start,
@@ -1617,6 +1530,9 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
static u64 dev_extent_search_start(struct btrfs_device *device)
{
switch (device->fs_devices->chunk_alloc_policy) {
+ default:
+ btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy);
+ fallthrough;
case BTRFS_CHUNK_ALLOC_REGULAR:
return BTRFS_DEVICE_RANGE_RESERVED;
case BTRFS_CHUNK_ALLOC_ZONED:
@@ -1626,8 +1542,6 @@ static u64 dev_extent_search_start(struct btrfs_device *device)
* for superblock logging.
*/
return 0;
- default:
- BUG();
}
}
@@ -1640,7 +1554,8 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
int ret;
bool changed = false;
- ASSERT(IS_ALIGNED(*hole_start, zone_size));
+ ASSERT(IS_ALIGNED(*hole_start, zone_size),
+ "hole_start=%llu zone_size=%llu", *hole_start, zone_size);
while (*hole_size > 0) {
pos = btrfs_find_allocatable_zones(device, *hole_start,
@@ -1706,6 +1621,9 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
}
switch (device->fs_devices->chunk_alloc_policy) {
+ default:
+ btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy);
+ fallthrough;
case BTRFS_CHUNK_ALLOC_REGULAR:
/* No extra check */
break;
@@ -1720,8 +1638,6 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
continue;
}
break;
- default:
- BUG();
}
break;
@@ -1764,7 +1680,7 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_key key;
struct btrfs_dev_extent *dev_extent;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
u64 search_start;
u64 hole_size;
u64 max_hole_start;
@@ -1794,8 +1710,8 @@ again:
}
path->reada = READA_FORWARD;
- path->search_commit_root = 1;
- path->skip_locking = 1;
+ path->search_commit_root = true;
+ path->skip_locking = true;
key.objectid = device->devid;
key.type = BTRFS_DEV_EXTENT_KEY;
@@ -1891,9 +1807,10 @@ next:
else
ret = 0;
- ASSERT(max_hole_start + max_hole_size <= search_end);
+ ASSERT(max_hole_start + max_hole_size <= search_end,
+ "max_hole_start=%llu max_hole_size=%llu search_end=%llu",
+ max_hole_start, max_hole_size, search_end);
out:
- btrfs_free_path(path);
*start = max_hole_start;
if (len)
*len = max_hole_size;
@@ -1907,7 +1824,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
struct extent_buffer *leaf = NULL;
@@ -1926,7 +1843,7 @@ again:
ret = btrfs_previous_item(root, path, key.objectid,
BTRFS_DEV_EXTENT_KEY);
if (ret)
- goto out;
+ return ret;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -1941,7 +1858,7 @@ again:
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
} else {
- goto out;
+ return ret;
}
*dev_extent_len = btrfs_dev_extent_length(leaf, extent);
@@ -1949,8 +1866,6 @@ again:
ret = btrfs_del_item(trans, root, path);
if (ret == 0)
set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
-out:
- btrfs_free_path(path);
return ret;
}
@@ -1978,7 +1893,7 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
path = btrfs_alloc_path();
if (!path)
@@ -1990,13 +1905,12 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
if (ret < 0)
- goto error;
+ return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/* Corruption */
btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
- ret = -EUCLEAN;
- goto error;
+ return -EUCLEAN;
}
ret = btrfs_previous_item(fs_info->chunk_root, path,
@@ -2009,10 +1923,7 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
path->slots[0]);
*devid_ret = found_key.offset + 1;
}
- ret = 0;
-error:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
/*
@@ -2023,7 +1934,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dev_item *dev_item;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -2042,7 +1953,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
&key, sizeof(*dev_item));
btrfs_trans_release_chunk_metadata(trans);
if (ret)
- goto out;
+ return ret;
leaf = path->nodes[0];
dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
@@ -2068,10 +1979,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
ptr, BTRFS_FSID_SIZE);
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
/*
@@ -2083,14 +1991,11 @@ out:
static void update_dev_time(const char *device_path)
{
struct path path;
- int ret;
- ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
- if (ret)
- return;
-
- inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION);
- path_put(&path);
+ if (!kern_path(device_path, LOOKUP_FOLLOW, &path)) {
+ vfs_utimes(&path, NULL);
+ path_put(&path);
+ }
}
static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
@@ -2098,7 +2003,7 @@ static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = device->fs_info->chunk_root;
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
path = btrfs_alloc_path();
@@ -2112,16 +2017,12 @@ static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
btrfs_trans_release_chunk_metadata(trans);
- if (ret) {
- if (ret > 0)
- ret = -ENOENT;
- goto out;
- }
+ if (ret > 0)
+ return -ENOENT;
+ if (ret < 0)
+ return ret;
- ret = btrfs_del_item(trans, root, path);
-out:
- btrfs_free_path(path);
- return ret;
+ return btrfs_del_item(trans, root, path);
}
/*
@@ -2204,7 +2105,7 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
down_read(&fs_info->dev_replace.rwsem);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
- ASSERT(num_devices > 1);
+ ASSERT(num_devices > 1, "num_devices=%llu", num_devices);
num_devices--;
}
up_read(&fs_info->dev_replace.rwsem);
@@ -2220,7 +2121,7 @@ static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info,
const u64 bytenr = btrfs_sb_offset(copy_num);
int ret;
- disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr);
+ disk_super = btrfs_read_disk_super(bdev, copy_num, false);
if (IS_ERR(disk_super))
return;
@@ -2253,7 +2154,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_devic
btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
/* Update ctime/mtime for device path for libblkid */
- update_dev_time(device->name->str);
+ update_dev_time(rcu_dereference_raw(device->name));
}
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
@@ -2293,7 +2194,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
}
if (btrfs_pinned_by_swapfile(fs_info, device)) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"cannot remove device %s (devid %llu) due to active swapfile",
btrfs_dev_name(device), device->devid);
return -ETXTBSY;
@@ -2324,7 +2225,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
}
ret = btrfs_rm_dev_item(trans, device);
- if (ret) {
+ if (unlikely(ret)) {
/* Any error in dev item removal is critical */
btrfs_crit(fs_info,
"failed to remove device item for devid %llu: %d",
@@ -2383,7 +2284,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
* free the device.
*
* We cannot call btrfs_close_bdev() here because we're holding the sb
- * write lock, and fput() on the block device will pull in the
+ * write lock, and bdev_fput() on the block device will pull in the
* ->open_mutex on the block device and it's dependencies. Instead
* just flush the device and let the caller do the final bdev_release.
*/
@@ -2408,7 +2309,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
*/
if (cur_devices->num_devices == 0) {
list_del_init(&cur_devices->seed_list);
- ASSERT(cur_devices->opened == 1);
+ ASSERT(cur_devices->opened == 1, "opened=%d", cur_devices->opened);
cur_devices->opened--;
free_fs_devices(cur_devices);
}
@@ -2562,7 +2463,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
else
memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
btrfs_release_disk_super(disk_super);
- fput(bdev_file);
+ bdev_fput(bdev_file);
return 0;
}
@@ -2707,7 +2608,7 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->chunk_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_dev_item *dev_item;
struct btrfs_device *device;
@@ -2729,7 +2630,7 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
btrfs_trans_release_chunk_metadata(trans);
if (ret < 0)
- goto error;
+ return ret;
leaf = path->nodes[0];
next_slot:
@@ -2738,7 +2639,7 @@ next_slot:
if (ret > 0)
break;
if (ret < 0)
- goto error;
+ return ret;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
btrfs_release_path(path);
@@ -2769,10 +2670,7 @@ next_slot:
path->slots[0]++;
goto next_slot;
}
- ret = 0;
-error:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
@@ -2794,7 +2692,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
return -EROFS;
bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
- fs_info->bdev_holder, NULL);
+ fs_info->sb, &fs_holder_ops);
if (IS_ERR(bdev_file))
return PTR_ERR(bdev_file);
@@ -2803,6 +2701,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
goto error;
}
+ if (bdev_nr_bytes(file_bdev(bdev_file)) <= BTRFS_DEVICE_RANGE_RESERVED) {
+ ret = -EINVAL;
+ goto error;
+ }
+
if (fs_devices->seeding) {
seeding_dev = true;
down_write(&sb->s_umount);
@@ -2919,21 +2822,21 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
mutex_lock(&fs_info->chunk_mutex);
ret = init_first_rw_device(trans);
mutex_unlock(&fs_info->chunk_mutex);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
}
ret = btrfs_add_dev_item(trans, device);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
if (seeding_dev) {
ret = btrfs_finish_sprout(trans);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
@@ -3010,7 +2913,7 @@ error_free_zone:
error_free_device:
btrfs_free_device(device);
error:
- fput(bdev_file);
+ bdev_fput(bdev_file);
if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
@@ -3022,7 +2925,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = device->fs_info->chunk_root;
struct btrfs_dev_item *dev_item;
struct extent_buffer *leaf;
@@ -3038,12 +2941,10 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0)
- goto out;
+ return ret;
- if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
+ if (ret > 0)
+ return -ENOENT;
leaf = path->nodes[0];
dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
@@ -3057,8 +2958,6 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
btrfs_device_get_disk_total_bytes(device));
btrfs_set_device_bytes_used(leaf, dev_item,
btrfs_device_get_bytes_used(device));
-out:
- btrfs_free_path(path);
return ret;
}
@@ -3111,7 +3010,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->chunk_root;
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
path = btrfs_alloc_path();
@@ -3124,23 +3023,21 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
- goto out;
- else if (ret > 0) { /* Logic error or corruption */
+ return ret;
+ if (unlikely(ret > 0)) {
+ /* Logic error or corruption */
btrfs_err(fs_info, "failed to lookup chunk %llu when freeing",
chunk_offset);
btrfs_abort_transaction(trans, -ENOENT);
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
ret = btrfs_del_item(trans, root, path);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset);
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
-out:
- btrfs_free_path(path);
return ret;
}
@@ -3338,7 +3235,8 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
* user having built with ASSERT enabled, so if ASSERT doesn't
* do anything we still error out.
*/
- ASSERT(0);
+ DEBUG_WARN("errr %ld reading chunk map at offset %llu",
+ PTR_ERR(map), chunk_offset);
return PTR_ERR(map);
}
@@ -3358,7 +3256,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
ret = btrfs_free_dev_extent(trans, device,
map->stripes[i].physical,
&dev_extent_len);
- if (ret) {
+ if (unlikely(ret)) {
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3370,6 +3268,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
device->bytes_used - dev_extent_len);
atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
btrfs_clear_space_info_full(fs_info);
+
+ if (list_empty(&device->post_commit_list)) {
+ list_add_tail(&device->post_commit_list,
+ &trans->transaction->dev_update_list);
+ }
+
mutex_unlock(&fs_info->chunk_mutex);
}
}
@@ -3419,8 +3323,16 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
if (ret == -ENOSPC) {
const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
struct btrfs_block_group *sys_bg;
+ struct btrfs_space_info *space_info;
+
+ space_info = btrfs_find_space_info(fs_info, sys_flags);
+ if (unlikely(!space_info)) {
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
- sys_bg = btrfs_create_chunk(trans, sys_flags);
+ sys_bg = btrfs_create_chunk(trans, space_info, sys_flags);
if (IS_ERR(sys_bg)) {
ret = PTR_ERR(sys_bg);
btrfs_abort_transaction(trans, ret);
@@ -3428,17 +3340,17 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
}
ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = remove_chunk_item(trans, map, chunk_offset);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- } else if (ret) {
+ } else if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3447,7 +3359,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3463,7 +3375,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
btrfs_trans_release_chunk_metadata(trans);
ret = btrfs_remove_block_group(trans, map);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3478,7 +3390,8 @@ out:
return ret;
}
-int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ bool verbose)
{
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_trans_handle *trans;
@@ -3508,7 +3421,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
/* step one, relocate all the extents inside this chunk */
btrfs_scrub_pause(fs_info);
- ret = btrfs_relocate_block_group(fs_info, chunk_offset);
+ ret = btrfs_relocate_block_group(fs_info, chunk_offset, true);
btrfs_scrub_continue(fs_info);
if (ret) {
/*
@@ -3561,7 +3474,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *chunk_root = fs_info->chunk_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_chunk *chunk;
struct btrfs_key key;
@@ -3585,9 +3498,9 @@ again:
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0) {
mutex_unlock(&fs_info->reclaim_bgs_lock);
- goto error;
+ return ret;
}
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* On the first search we would find chunk tree with
* offset -1, which is not possible. On subsequent
@@ -3595,9 +3508,8 @@ again:
* offset (one less than the previous one, wrong
* alignment and size).
*/
- ret = -EUCLEAN;
mutex_unlock(&fs_info->reclaim_bgs_lock);
- goto error;
+ return -EUCLEAN;
}
ret = btrfs_previous_item(chunk_root, path, key.objectid,
@@ -3605,7 +3517,7 @@ again:
if (ret)
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret < 0)
- goto error;
+ return ret;
if (ret > 0)
break;
@@ -3618,7 +3530,8 @@ again:
btrfs_release_path(path);
if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
- ret = btrfs_relocate_chunk(fs_info, found_key.offset);
+ ret = btrfs_relocate_chunk(fs_info, found_key.offset,
+ true);
if (ret == -ENOSPC)
failed++;
else
@@ -3638,8 +3551,6 @@ again:
} else if (WARN_ON(failed && retried)) {
ret = -ENOSPC;
}
-error:
- btrfs_free_path(path);
return ret;
}
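BTRFS_PATH_AUTO_FREE() is what allows the error label and the explicit btrfs_free_path() call to go away: the path is freed automatically whenever it goes out of scope, on every return. A sketch of how such a macro is built on the kernel's <linux/cleanup.h> helpers (the exact btrfs definition may differ in detail):

#include <linux/cleanup.h>

/* Teach the cleanup machinery how to dispose of a btrfs_path. */
DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T))

/* Declare a NULL-initialized path freed on every function exit. */
#define BTRFS_PATH_AUTO_FREE(path_name)					\
	struct btrfs_path *path_name __free(btrfs_free_path) = NULL

With that in place, each goto that existed only to reach btrfs_free_path() can become a direct return, which is exactly the shape of the conversions in this file.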
@@ -3880,26 +3791,25 @@ static void reset_balance_state(struct btrfs_fs_info *fs_info)
* Balance filters. Return true if the chunk should be filtered out
* (should not be balanced).
*/
-static int chunk_profiles_filter(u64 chunk_type,
- struct btrfs_balance_args *bargs)
+static bool chunk_profiles_filter(u64 chunk_type, struct btrfs_balance_args *bargs)
{
chunk_type = chunk_to_extended(chunk_type) &
BTRFS_EXTENDED_PROFILE_MASK;
if (bargs->profiles & chunk_type)
- return 0;
+ return false;
- return 1;
+ return true;
}
-static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
- struct btrfs_balance_args *bargs)
+static bool chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
{
struct btrfs_block_group *cache;
u64 chunk_used;
u64 user_thresh_min;
u64 user_thresh_max;
- int ret = 1;
+ bool ret = true;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
chunk_used = cache->used;
@@ -3917,18 +3827,18 @@ static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_off
user_thresh_max = mult_perc(cache->length, bargs->usage_max);
if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
- ret = 0;
+ ret = false;
btrfs_put_block_group(cache);
return ret;
}
-static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
- u64 chunk_offset, struct btrfs_balance_args *bargs)
+static bool chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
{
struct btrfs_block_group *cache;
u64 chunk_used, user_thresh;
- int ret = 1;
+ bool ret = true;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
chunk_used = cache->used;
@@ -3941,15 +3851,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
user_thresh = mult_perc(cache->length, bargs->usage);
if (chunk_used < user_thresh)
- ret = 0;
+ ret = false;
btrfs_put_block_group(cache);
return ret;
}
-static int chunk_devid_filter(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk,
- struct btrfs_balance_args *bargs)
+static bool chunk_devid_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
{
struct btrfs_stripe *stripe;
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
@@ -3958,10 +3867,10 @@ static int chunk_devid_filter(struct extent_buffer *leaf,
for (i = 0; i < num_stripes; i++) {
stripe = btrfs_stripe_nr(chunk, i);
if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
- return 0;
+ return false;
}
- return 1;
+ return true;
}
static u64 calc_data_stripes(u64 type, int num_stripes)
@@ -3974,9 +3883,8 @@ static u64 calc_data_stripes(u64 type, int num_stripes)
}
/* [pstart, pend) */
-static int chunk_drange_filter(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk,
- struct btrfs_balance_args *bargs)
+static bool chunk_drange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
{
struct btrfs_stripe *stripe;
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
@@ -3987,7 +3895,7 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
int i;
if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
- return 0;
+ return false;
type = btrfs_chunk_type(leaf, chunk);
factor = calc_data_stripes(type, num_stripes);
@@ -4003,56 +3911,53 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
if (stripe_offset < bargs->pend &&
stripe_offset + stripe_length > bargs->pstart)
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/* [vstart, vend) */
-static int chunk_vrange_filter(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk,
- u64 chunk_offset,
- struct btrfs_balance_args *bargs)
+static bool chunk_vrange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
+ u64 chunk_offset, struct btrfs_balance_args *bargs)
{
if (chunk_offset < bargs->vend &&
chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
/* at least part of the chunk is inside this vrange */
- return 0;
+ return false;
- return 1;
+ return true;
}
-static int chunk_stripes_range_filter(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk,
- struct btrfs_balance_args *bargs)
+static bool chunk_stripes_range_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
{
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
if (bargs->stripes_min <= num_stripes
&& num_stripes <= bargs->stripes_max)
- return 0;
+ return false;
- return 1;
+ return true;
}
-static int chunk_soft_convert_filter(u64 chunk_type,
- struct btrfs_balance_args *bargs)
+static bool chunk_soft_convert_filter(u64 chunk_type, struct btrfs_balance_args *bargs)
{
if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
- return 0;
+ return false;
chunk_type = chunk_to_extended(chunk_type) &
BTRFS_EXTENDED_PROFILE_MASK;
if (bargs->target == chunk_type)
- return 1;
+ return true;
- return 0;
+ return false;
}
-static int should_balance_chunk(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk, u64 chunk_offset)
+static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
+ u64 chunk_offset)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
@@ -4062,7 +3967,7 @@ static int should_balance_chunk(struct extent_buffer *leaf,
/* type filter */
if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
(bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
- return 0;
+ return false;
}
if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
@@ -4075,46 +3980,46 @@ static int should_balance_chunk(struct extent_buffer *leaf,
/* profiles filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
chunk_profiles_filter(chunk_type, bargs)) {
- return 0;
+ return false;
}
/* usage filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
chunk_usage_filter(fs_info, chunk_offset, bargs)) {
- return 0;
+ return false;
} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
- return 0;
+ return false;
}
/* devid filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
chunk_devid_filter(leaf, chunk, bargs)) {
- return 0;
+ return false;
}
/* drange filter, makes sense only with devid filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
chunk_drange_filter(leaf, chunk, bargs)) {
- return 0;
+ return false;
}
/* vrange filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
- return 0;
+ return false;
}
/* stripes filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
chunk_stripes_range_filter(leaf, chunk, bargs)) {
- return 0;
+ return false;
}
/* soft profile changing mode */
if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
chunk_soft_convert_filter(chunk_type, bargs)) {
- return 0;
+ return false;
}
/*
@@ -4122,7 +4027,7 @@ static int should_balance_chunk(struct extent_buffer *leaf,
*/
if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
if (bargs->limit == 0)
- return 0;
+ return false;
else
bargs->limit--;
} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
@@ -4132,12 +4037,12 @@ static int should_balance_chunk(struct extent_buffer *leaf,
* about the count of all chunks that satisfy the filters.
*/
if (bargs->limit_max == 0)
- return 0;
+ return false;
else
bargs->limit_max--;
}
- return 1;
+ return true;
}
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
@@ -4146,7 +4051,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
struct btrfs_root *chunk_root = fs_info->chunk_root;
u64 chunk_type;
struct btrfs_chunk *chunk;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
struct extent_buffer *leaf;
@@ -4289,7 +4194,7 @@ again:
}
}
- ret = btrfs_relocate_chunk(fs_info, found_key.offset);
+ ret = btrfs_relocate_chunk(fs_info, found_key.offset, true);
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret == -ENOSPC) {
enospc_errors++;
@@ -4317,7 +4222,6 @@ loop:
goto again;
}
error:
- btrfs_free_path(path);
if (enospc_errors) {
btrfs_info(fs_info, "%d enospc errors during balance",
enospc_errors);
@@ -4334,7 +4238,7 @@ error:
* @flags: profile to validate
* @extended: if true @flags is treated as an extended profile
*/
-static int alloc_profile_is_valid(u64 flags, int extended)
+static int alloc_profile_is_valid(u64 flags, bool extended)
{
u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
BTRFS_BLOCK_GROUP_PROFILE_MASK);
@@ -4475,7 +4379,7 @@ static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
{
u32 size_buf = 1024;
char tmp_buf[192] = {'\0'};
- char *buf;
+ char AUTO_KFREE(buf);
char *bp;
u32 size_bp = size_buf;
int ret;
@@ -4523,12 +4427,10 @@ out_overflow:
btrfs_info(fs_info, "balance: %s %s",
(bctl->flags & BTRFS_BALANCE_RESUME) ?
"resume" : "start", buf);
-
- kfree(buf);
}
/*
- * Should be called with balance mutexe held
+ * Should be called with balance mutex held
*/
int btrfs_balance(struct btrfs_fs_info *fs_info,
struct btrfs_balance_control *bctl,
@@ -4725,12 +4627,12 @@ static int balance_kthread(void *data)
struct btrfs_fs_info *fs_info = data;
int ret = 0;
- sb_start_write(fs_info->sb);
+ guard(super_write)(fs_info->sb);
+
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl)
ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
mutex_unlock(&fs_info->balance_mutex);
- sb_end_write(fs_info->sb);
return ret;
}
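guard(super_write)(fs_info->sb) ties sb_start_write()/sb_end_write() to the enclosing scope, which is why the explicit sb_end_write() calls disappear, including the ones on early-return paths. Assuming the guard class is declared with cleanup.h's DEFINE_GUARD (a sketch, not necessarily the exact upstream declaration):

#include <linux/cleanup.h>
#include <linux/fs.h>

/* Scope-based freeze protection: taken at the guard() statement,
 * released automatically on every path out of the scope. */
DEFINE_GUARD(super_write, struct super_block *,
	     sb_start_write(_T), sb_end_write(_T))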
@@ -4752,7 +4654,8 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
}
spin_lock(&fs_info->super_lock);
- ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
+ ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED,
+ "exclusive_operation=%d", fs_info->exclusive_operation);
fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
spin_unlock(&fs_info->super_lock);
/*
@@ -4773,7 +4676,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
struct btrfs_balance_control *bctl;
struct btrfs_balance_item *item;
struct btrfs_disk_balance_args disk_bargs;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key key;
int ret;
@@ -4788,17 +4691,14 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0) { /* ret = -ENOENT; */
- ret = 0;
- goto out;
+ return 0;
}
bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
- if (!bctl) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!bctl)
+ return -ENOMEM;
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
@@ -4835,8 +4735,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
fs_info->balance_ctl = bctl;
spin_unlock(&fs_info->balance_lock);
mutex_unlock(&fs_info->balance_mutex);
-out:
- btrfs_free_path(path);
return ret;
}
@@ -5056,7 +4954,7 @@ again:
goto done;
}
- ret = btrfs_relocate_chunk(fs_info, chunk_offset);
+ ret = btrfs_relocate_chunk(fs_info, chunk_offset, true);
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret == -ENOSPC) {
failed++;
@@ -5088,8 +4986,8 @@ again:
mutex_lock(&fs_info->chunk_mutex);
/* Clear all state bits beyond the shrunk device size */
- clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
- CHUNK_STATE_MASK);
+ btrfs_clear_extent_bit(&device->alloc_state, new_size, (u64)-1,
+ CHUNK_STATE_MASK, NULL);
btrfs_device_set_disk_total_bytes(device, new_size);
if (list_empty(&device->post_commit_list))
@@ -5105,7 +5003,7 @@ again:
/* Now btrfs_update_device() will change the on-disk size. */
ret = btrfs_update_device(trans, device);
btrfs_trans_release_chunk_metadata(trans);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
} else {
@@ -5216,6 +5114,8 @@ struct alloc_chunk_ctl {
u64 stripe_size;
u64 chunk_size;
int ndevs;
+ /* Space_info the block group is going to belong to. */
+ struct btrfs_space_info *space_info;
};
static void init_alloc_chunk_ctl_policy_regular(
@@ -5289,14 +5189,15 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
ctl->ndevs = 0;
switch (fs_devices->chunk_alloc_policy) {
+ default:
+ btrfs_warn_unknown_chunk_allocation(fs_devices->chunk_alloc_policy);
+ fallthrough;
case BTRFS_CHUNK_ALLOC_REGULAR:
init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
break;
case BTRFS_CHUNK_ALLOC_ZONED:
init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
break;
- default:
- BUG();
}
}
@@ -5435,7 +5336,9 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
* It should hold because:
* dev_extent_min == dev_extent_want == zone_size * dev_stripes
*/
- ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
+ ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min,
+ "ndevs=%d max_avail=%llu dev_extent_min=%llu", ctl->ndevs,
+ devices_info[ctl->ndevs - 1].max_avail, ctl->dev_extent_min);
ctl->stripe_size = zone_size;
ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
@@ -5448,7 +5351,9 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
ctl->dev_stripes);
ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
- ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
+ ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size,
+ "stripe_size=%llu data_stripes=%d max_chunk_size=%llu",
+ ctl->stripe_size, data_stripes, ctl->max_chunk_size);
}
ctl->chunk_size = ctl->stripe_size * data_stripes;
@@ -5481,12 +5386,13 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
switch (fs_devices->chunk_alloc_policy) {
+ default:
+ btrfs_warn_unknown_chunk_allocation(fs_devices->chunk_alloc_policy);
+ fallthrough;
case BTRFS_CHUNK_ALLOC_REGULAR:
return decide_stripe_size_regular(ctl, devices_info);
case BTRFS_CHUNK_ALLOC_ZONED:
return decide_stripe_size_zoned(ctl, devices_info);
- default:
- BUG();
}
}
@@ -5496,9 +5402,9 @@ static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int
struct btrfs_io_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
- set_extent_bit(&device->alloc_state, stripe->physical,
- stripe->physical + map->stripe_size - 1,
- bits | EXTENT_NOWAIT, NULL);
+ btrfs_set_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + map->stripe_size - 1,
+ bits | EXTENT_NOWAIT, NULL);
}
}
@@ -5508,10 +5414,9 @@ static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned in
struct btrfs_io_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
- __clear_extent_bit(&device->alloc_state, stripe->physical,
- stripe->physical + map->stripe_size - 1,
- bits | EXTENT_NOWAIT,
- NULL, NULL);
+ btrfs_clear_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + map->stripe_size - 1,
+ bits | EXTENT_NOWAIT, NULL);
}
}
@@ -5618,7 +5523,8 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
return ERR_PTR(ret);
}
- block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size);
+ block_group = btrfs_make_block_group(trans, ctl->space_info, type, start,
+ ctl->chunk_size);
if (IS_ERR(block_group)) {
btrfs_remove_chunk_map(info, map);
return block_group;
@@ -5644,19 +5550,19 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
}
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
- u64 type)
+ struct btrfs_space_info *space_info,
+ u64 type)
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_fs_devices *fs_devices = info->fs_devices;
- struct btrfs_device_info *devices_info = NULL;
+ struct btrfs_device_info AUTO_KFREE(devices_info);
struct alloc_chunk_ctl ctl;
- struct btrfs_block_group *block_group;
int ret;
lockdep_assert_held(&info->chunk_mutex);
if (!alloc_profile_is_valid(type, 0)) {
- ASSERT(0);
+ DEBUG_WARN("invalid alloc profile for type %llu", type);
return ERR_PTR(-EINVAL);
}
@@ -5668,12 +5574,13 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(info, "invalid chunk type 0x%llx requested", type);
- ASSERT(0);
+ DEBUG_WARN();
return ERR_PTR(-EINVAL);
}
ctl.start = find_next_chunk(info);
ctl.type = type;
+ ctl.space_info = space_info;
init_alloc_chunk_ctl(fs_devices, &ctl);
devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
@@ -5682,22 +5589,14 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
return ERR_PTR(-ENOMEM);
ret = gather_device_info(fs_devices, &ctl, devices_info);
- if (ret < 0) {
- block_group = ERR_PTR(ret);
- goto out;
- }
+ if (ret < 0)
+ return ERR_PTR(ret);
ret = decide_stripe_size(fs_devices, &ctl, devices_info);
- if (ret < 0) {
- block_group = ERR_PTR(ret);
- goto out;
- }
-
- block_group = create_chunk(trans, &ctl, devices_info);
+ if (ret < 0)
+ return ERR_PTR(ret);
-out:
- kfree(devices_info);
- return block_group;
+ return create_chunk(trans, &ctl, devices_info);
}
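AUTO_KFREE() is the kmalloc/kfree counterpart of BTRFS_PATH_AUTO_FREE() above. Judging from the call sites, such as "struct btrfs_device_info AUTO_KFREE(devices_info);", the macro supplies the pointer declarator itself; a plausible definition, relying on the DEFINE_FREE(kfree, ...) that <linux/slab.h> already provides:

/* Expands to e.g. "*devices_info __free(kfree) = NULL", so the buffer
 * is kfree()d automatically when the variable leaves scope. */
#define AUTO_KFREE(name)	*name __free(kfree) = NULL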
/*
@@ -5755,7 +5654,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
item_size = btrfs_chunk_item_size(map->num_stripes);
chunk = kzalloc(item_size, GFP_NOFS);
- if (!chunk) {
+ if (unlikely(!chunk)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -5817,7 +5716,9 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
u64 alloc_profile;
struct btrfs_block_group *meta_bg;
+ struct btrfs_space_info *meta_space_info;
struct btrfs_block_group *sys_bg;
+ struct btrfs_space_info *sys_space_info;
/*
* When adding a new device for sprouting, the seed device is read-only
@@ -5841,12 +5742,22 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
*/
alloc_profile = btrfs_metadata_alloc_profile(fs_info);
- meta_bg = btrfs_create_chunk(trans, alloc_profile);
+ meta_space_info = btrfs_find_space_info(fs_info, alloc_profile);
+ if (!meta_space_info) {
+ DEBUG_WARN();
+ return -EINVAL;
+ }
+ meta_bg = btrfs_create_chunk(trans, meta_space_info, alloc_profile);
if (IS_ERR(meta_bg))
return PTR_ERR(meta_bg);
alloc_profile = btrfs_system_alloc_profile(fs_info);
- sys_bg = btrfs_create_chunk(trans, alloc_profile);
+ sys_space_info = btrfs_find_space_info(fs_info, alloc_profile);
+ if (!sys_space_info) {
+ DEBUG_WARN();
+ return -EINVAL;
+ }
+ sys_bg = btrfs_create_chunk(trans, sys_space_info, alloc_profile);
if (IS_ERR(sys_bg))
return PTR_ERR(sys_bg);
@@ -6046,7 +5957,7 @@ static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_s
static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct btrfs_chunk_map *map, int first,
- int dev_replace_is_ongoing)
+ bool dev_replace_is_ongoing)
{
const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy);
int i;
@@ -6055,8 +5966,8 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
int tolerance;
struct btrfs_device *srcdev;
- ASSERT((map->type &
- (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
+ ASSERT((map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)),
+ "type=%llu", map->type);
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
num_stripes = map->sub_stripes;
@@ -6118,12 +6029,7 @@ struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
{
struct btrfs_io_context *bioc;
- bioc = kzalloc(
- /* The size of btrfs_io_context */
- sizeof(struct btrfs_io_context) +
- /* Plus the variable array for the stripes */
- sizeof(struct btrfs_io_stripe) * (total_stripes),
- GFP_NOFS);
+ bioc = kzalloc(struct_size(bioc, stripes, total_stripes), GFP_NOFS);
if (!bioc)
return NULL;
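struct_size() from <linux/overflow.h> computes sizeof(*bioc) plus total_stripes trailing elements of the flexible array in one expression, and saturates to SIZE_MAX on overflow so the allocation fails cleanly instead of being undersized. The same pattern in isolation (example_ctx and alloc_example_ctx are illustrative names, not btrfs API):

#include <linux/overflow.h>
#include <linux/slab.h>

struct example_ctx {
	int num_stripes;
	struct btrfs_io_stripe stripes[];	/* flexible array member */
};

static struct example_ctx *alloc_example_ctx(u16 total_stripes)
{
	struct example_ctx *ctx;

	/* sizeof(*ctx) + total_stripes * sizeof(ctx->stripes[0]),
	 * overflow-checked. */
	ctx = kzalloc(struct_size(ctx, stripes, total_stripes), GFP_NOFS);
	return ctx;
}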
@@ -6357,7 +6263,7 @@ static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc,
}
/* We can only have at most 2 extra nr_stripes (for DUP). */
- ASSERT(nr_extra_stripes <= 2);
+ ASSERT(nr_extra_stripes <= 2, "nr_extra_stripes=%d", nr_extra_stripes);
/*
* For GET_READ_MIRRORS, we can only return at most 1 extra stripe for
* replace.
@@ -6368,7 +6274,8 @@ static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc,
struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1];
/* Only DUP can have two extra stripes. */
- ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP);
+ ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP,
+ "map_type=%llu", bioc->map_type);
/*
* Swap the last stripe stripes and reduce @nr_extra_stripes.
@@ -6395,7 +6302,8 @@ static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset,
*/
io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
- ASSERT(io_geom->stripe_offset < U32_MAX);
+ ASSERT(io_geom->stripe_offset < U32_MAX,
+ "stripe_offset=%llu", io_geom->stripe_offset);
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
unsigned long full_stripe_len =
@@ -6413,8 +6321,12 @@ static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset,
io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset(
rounddown(io_geom->stripe_nr, nr_data_stripes(map)));
- ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset);
- ASSERT(io_geom->raid56_full_stripe_start <= offset);
+ ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset,
+ "raid56_full_stripe_start=%llu full_stripe_len=%lu offset=%llu",
+ io_geom->raid56_full_stripe_start, full_stripe_len, offset);
+ ASSERT(io_geom->raid56_full_stripe_start <= offset,
+ "raid56_full_stripe_start=%llu offset=%llu",
+ io_geom->raid56_full_stripe_start, offset);
/*
* For writes to RAID56, allow to write a full stripe set, but
* no straddling of stripe sets.
@@ -6580,7 +6492,7 @@ static void map_blocks_raid56_read(struct btrfs_chunk_map *map,
{
int data_stripes = nr_data_stripes(map);
- ASSERT(io_geom->mirror_num <= 1);
+ ASSERT(io_geom->mirror_num <= 1, "mirror_num=%d", io_geom->mirror_num);
/* Just grab the data stripe directly. */
io_geom->stripe_index = io_geom->stripe_nr % data_stripes;
io_geom->stripe_nr /= data_stripes;
@@ -6648,7 +6560,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
int num_copies;
struct btrfs_io_context *bioc = NULL;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- int dev_replace_is_ongoing = 0;
+ bool dev_replace_is_ongoing = false;
u16 num_alloc_stripes;
u64 max_len;
@@ -6843,6 +6755,8 @@ static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
const struct btrfs_device *device)
{
+ if (args->devt)
+ return device->devt == args->devt;
if (args->missing) {
if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
!device->bdev)
@@ -6953,7 +6867,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
atomic_set(&dev->dev_stats_ccnt, 0);
btrfs_device_data_ordered_init(dev);
- extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE);
+ btrfs_extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE);
if (devid)
tmp = *devid;
@@ -6974,9 +6888,9 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
generate_random_uuid(dev->uuid);
if (path) {
- struct rcu_string *name;
+ const char *name;
- name = rcu_string_strdup(path, GFP_KERNEL);
+ name = kstrdup(path, GFP_KERNEL);
if (!name) {
btrfs_free_device(dev);
return ERR_PTR(-ENOMEM);
@@ -7225,7 +7139,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
if (IS_ERR(fs_devices))
return fs_devices;
- ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->bdev_holder);
+ ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->sb);
if (ret) {
free_fs_devices(fs_devices);
return ERR_PTR(ret);
@@ -7491,7 +7405,7 @@ static void readahead_tree_node_children(struct extent_buffer *node)
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->chunk_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key key;
struct btrfs_key found_key;
@@ -7522,7 +7436,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
/*
* Lockdep complains about possible circular locking dependency between
* a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
- * used for freeze procection of a fs (struct super_block.s_writers),
+ * used for freeze protection of a fs (struct super_block.s_writers),
* which we take when starting a transaction, and extent buffers of the
* chunk tree if we call read_one_dev() while holding a lock on an
* extent buffer of the chunk tree. Since we are mounting the filesystem
@@ -7530,7 +7444,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* chunk tree, to keep it simple, just skip locking on the chunk tree.
*/
ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
- path->skip_locking = 1;
+ path->skip_locking = true;
/*
* Read all device items, and then all the chunk items. All
@@ -7608,8 +7522,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
ret = 0;
error:
mutex_unlock(&uuid_mutex);
-
- btrfs_free_path(path);
return ret;
}
@@ -7709,7 +7621,7 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
int ret = 0;
path = btrfs_alloc_path();
@@ -7731,8 +7643,6 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
}
out:
mutex_unlock(&fs_devices->device_list_mutex);
-
- btrfs_free_path(path);
return ret;
}
@@ -7741,7 +7651,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *dev_root = fs_info->dev_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *eb;
struct btrfs_dev_stats_item *ptr;
@@ -7757,10 +7667,10 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
return -ENOMEM;
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"error %d while searching for dev_stats item for device %s",
ret, btrfs_dev_name(device));
- goto out;
+ return ret;
}
if (ret == 0 &&
@@ -7768,10 +7678,10 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
/* need to delete old one and insert a new one */
ret = btrfs_del_item(trans, dev_root, path);
if (ret != 0) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"delete too small dev_stats item for device %s failed %d",
btrfs_dev_name(device), ret);
- goto out;
+ return ret;
}
ret = 1;
}
@@ -7782,10 +7692,10 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, dev_root, path,
&key, sizeof(*ptr));
if (ret < 0) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"insert dev_stats item for device %s failed %d",
btrfs_dev_name(device), ret);
- goto out;
+ return ret;
}
}
@@ -7794,8 +7704,6 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
btrfs_set_dev_stats_value(eb, ptr, i,
btrfs_dev_stat_read(device, i));
-out:
- btrfs_free_path(path);
return ret;
}
@@ -7845,7 +7753,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
if (!dev->dev_stats_valid)
return;
- btrfs_err_rl_in_rcu(dev->fs_info,
+ btrfs_err_rl(dev->fs_info,
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
btrfs_dev_name(dev),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
@@ -7865,7 +7773,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
if (i == BTRFS_DEV_STAT_VALUES_MAX)
return; /* all values == 0, suppress message */
- btrfs_info_in_rcu(dev->fs_info,
+ btrfs_info(dev->fs_info,
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
btrfs_dev_name(dev),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
@@ -7925,7 +7833,7 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
struct btrfs_device *curr, *next;
- ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
+ ASSERT(trans->state == TRANS_STATE_COMMIT_DOING, "state=%d", trans->state);
if (list_empty(&trans->dev_update_list))
return;
@@ -7955,8 +7863,6 @@ int btrfs_bg_type_to_factor(u64 flags)
return btrfs_raid_array[index].ncopies;
}
-
-
static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
u64 chunk_offset, u64 devid,
u64 physical_offset, u64 physical_len)
@@ -7970,7 +7876,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
int i;
map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
- if (!map) {
+ if (unlikely(!map)) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
physical_offset, devid);
@@ -7979,7 +7885,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
stripe_len = btrfs_calc_stripe_length(map);
- if (physical_len != stripe_len) {
+ if (unlikely(physical_len != stripe_len)) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
physical_offset, devid, map->start, physical_len,
@@ -7989,7 +7895,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
/*
- * Very old mkfs.btrfs (before v4.1) will not respect the reserved
+ * Very old mkfs.btrfs (before v4.15) will not respect the reserved
* space. Although kernel can handle it without problem, better to warn
* the users.
*/
@@ -7999,8 +7905,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
devid, physical_offset, physical_len);
for (i = 0; i < map->num_stripes; i++) {
- if (map->stripes[i].dev->devid == devid &&
- map->stripes[i].physical == physical_offset) {
+ if (unlikely(map->stripes[i].dev->devid == devid &&
+ map->stripes[i].physical == physical_offset)) {
found = true;
if (map->verified_stripes >= map->num_stripes) {
btrfs_err(fs_info,
@@ -8013,7 +7919,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
break;
}
}
- if (!found) {
+ if (unlikely(!found)) {
btrfs_err(fs_info,
"dev extent physical offset %llu devid %llu has no corresponding chunk",
physical_offset, devid);
@@ -8022,13 +7928,13 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
/* Make sure no dev extent is beyond device boundary */
dev = btrfs_find_device(fs_info->fs_devices, &args);
- if (!dev) {
+ if (unlikely(!dev)) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
ret = -EUCLEAN;
goto out;
}
- if (physical_offset + physical_len > dev->disk_total_bytes) {
+ if (unlikely(physical_offset + physical_len > dev->disk_total_bytes)) {
btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
devid, physical_offset, physical_len,
@@ -8040,8 +7946,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
if (dev->zone_info) {
u64 zone_size = dev->zone_info->zone_size;
- if (!IS_ALIGNED(physical_offset, zone_size) ||
- !IS_ALIGNED(physical_len, zone_size)) {
+ if (unlikely(!IS_ALIGNED(physical_offset, zone_size) ||
+ !IS_ALIGNED(physical_len, zone_size))) {
btrfs_err(fs_info,
"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
devid, physical_offset, physical_len);
@@ -8065,7 +7971,7 @@ static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
struct btrfs_chunk_map *map;
map = rb_entry(node, struct btrfs_chunk_map, rb_node);
- if (map->num_stripes != map->verified_stripes) {
+ if (unlikely(map->num_stripes != map->verified_stripes)) {
btrfs_err(fs_info,
"chunk %llu has missing dev extent, have %d expect %d",
map->start, map->verified_stripes, map->num_stripes);
@@ -8087,7 +7993,7 @@ out:
*/
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_key key;
u64 prev_devid = 0;
@@ -8118,17 +8024,15 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
path->reada = READA_FORWARD;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto out;
+ return ret;
/* No dev extents at all? Not good */
- if (ret > 0) {
- ret = -EUCLEAN;
- goto out;
- }
+ if (unlikely(ret > 0))
+ return -EUCLEAN;
}
while (1) {
struct extent_buffer *leaf = path->nodes[0];
@@ -8150,24 +8054,23 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
physical_len = btrfs_dev_extent_length(leaf, dext);
/* Check if this dev extent overlaps with the previous one */
- if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
+ if (unlikely(devid == prev_devid && physical_offset < prev_dev_ext_end)) {
btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
devid, physical_offset, prev_dev_ext_end);
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
physical_offset, physical_len);
if (ret < 0)
- goto out;
+ return ret;
prev_devid = devid;
prev_dev_ext_end = physical_offset + physical_len;
ret = btrfs_next_item(root, path);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0) {
ret = 0;
break;
@@ -8175,10 +8078,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
}
/* Ensure all chunks have corresponding dev extents */
- ret = verify_chunk_dev_extent_mapping(fs_info);
-out:
- btrfs_free_path(path);
- return ret;
+ return verify_chunk_dev_extent_mapping(fs_info);
}
/*
@@ -8215,12 +8115,12 @@ static int relocating_repair_kthread(void *data)
target = cache->start;
btrfs_put_block_group(cache);
- sb_start_write(fs_info->sb);
+ guard(super_write)(fs_info->sb);
+
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
btrfs_info(fs_info,
"zoned: skip relocating block group %llu to repair: EBUSY",
target);
- sb_end_write(fs_info->sb);
return -EBUSY;
}
@@ -8241,14 +8141,13 @@ static int relocating_repair_kthread(void *data)
btrfs_info(fs_info,
"zoned: relocating block group %llu to repair IO failure",
target);
- ret = btrfs_relocate_chunk(fs_info, target);
+ ret = btrfs_relocate_chunk(fs_info, target, true);
out:
if (cache)
btrfs_put_block_group(cache);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_exclop_finish(fs_info);
- sb_end_write(fs_info->sb);
return ret;
}
@@ -8294,7 +8193,7 @@ static void map_raid56_repair_block(struct btrfs_io_context *bioc,
logical < stripe_start + BTRFS_STRIPE_LEN)
break;
}
- ASSERT(i < data_stripes);
+ ASSERT(i < data_stripes, "i=%d data_stripes=%d", i, data_stripes);
smap->dev = bioc->stripes[i].dev;
smap->physical = bioc->stripes[i].physical +
((logical - bioc->full_stripe_logical) &
@@ -8323,7 +8222,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
int mirror_ret = mirror_num;
int ret;
- ASSERT(mirror_num > 0);
+ ASSERT(mirror_num > 0, "mirror_num=%d", mirror_num);
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
&bioc, smap, &mirror_ret);
@@ -8331,7 +8230,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
return ret;
/* The map range should not cross stripe boundary. */
- ASSERT(map_length >= length);
+ ASSERT(map_length >= length, "map_length=%llu length=%u", map_length, length);
/* Already mapped to single stripe. */
if (!bioc)
@@ -8343,7 +8242,8 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
goto out;
}
- ASSERT(mirror_num <= bioc->num_stripes);
+ ASSERT(mirror_num <= bioc->num_stripes,
+ "mirror_num=%d num_stripes=%d", mirror_num, bioc->num_stripes);
smap->dev = bioc->stripes[mirror_num - 1].dev;
smap->physical = bioc->stripes[mirror_num - 1].physical;
out:
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index e247d551da67..34b854c1a303 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -21,7 +21,6 @@
#include <uapi/linux/btrfs.h>
#include <uapi/linux/btrfs_tree.h>
#include "messages.h"
-#include "rcu-string.h"
#include "extent-io-tree.h"
struct block_device;
@@ -35,7 +34,7 @@ struct btrfs_zoned_device_info;
#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G)
/*
- * Arbitratry maximum size of one discard request to limit potentially long time
+ * Arbitrary maximum size of one discard request to limit potentially long time
* spent in blkdev_issue_discard().
*/
#define BTRFS_MAX_DISCARD_CHUNK_SIZE (SZ_1G)
@@ -46,7 +45,7 @@ extern struct mutex uuid_mutex;
#define BTRFS_STRIPE_LEN_SHIFT (16)
#define BTRFS_STRIPE_LEN_MASK (BTRFS_STRIPE_LEN - 1)
-static_assert(const_ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT);
+static_assert(ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT);
/* Used by sanity check for btrfs_raid_types. */
#define const_ffs(n) (__builtin_ctzll(n) + 1)
@@ -59,8 +58,7 @@ static_assert(const_ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT);
*/
static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) <
const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0));
-static_assert(const_ilog2(BTRFS_BLOCK_GROUP_RAID0) >
- ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK));
+static_assert(ilog2(BTRFS_BLOCK_GROUP_RAID0) > ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK));
/* ilog2() can handle both constants and variables */
#define BTRFS_BG_FLAG_TO_INDEX(profile) \
@@ -114,7 +112,8 @@ struct btrfs_device {
struct btrfs_fs_devices *fs_devices;
struct btrfs_fs_info *fs_info;
- struct rcu_string __rcu *name;
+ /* Device path or NULL if missing. */
+ const char __rcu *name;
u64 generation;
@@ -422,6 +421,16 @@ struct btrfs_fs_devices {
/* Count fs-devices opened. */
int opened;
+ /*
+ * Counter of the processes that are holding this fs_devices but have
+ * not yet opened it.
+ * This is for mount handling, as we can only open the fs_devices
+ * after a super block is created. But we cannot take uuid_mutex
+ * during sget_fc(), thus we have to hold the fs_devices (meaning it
+ * cannot be released) until a super block is returned.
+ */
+ int holding;
+
/* Set when we find or add a device that doesn't have the nonrot flag set. */
bool rotating;
/* Devices support TRIM/discard commands. */
@@ -473,7 +482,6 @@ struct btrfs_io_stripe {
struct btrfs_device *dev;
/* Block mapping. */
u64 physical;
- u64 length;
bool rst_search_commit_root;
/* For the endio handler. */
struct btrfs_io_context *bioc;
@@ -486,7 +494,7 @@ struct btrfs_discard_stripe {
};
/*
- * Context for IO subsmission for device stripe.
+ * Context for IO submission for device stripe.
*
* - Track the unfinished mirrors for mirror based profiles
* Mirror based profiles are SINGLE/DUP/RAID1/RAID10.
@@ -653,6 +661,11 @@ struct btrfs_dev_lookup_args {
u64 devid;
u8 *uuid;
u8 *fsid;
+ /*
+ * If devt is specified, all other members will be ignored as it is
+ * enough to uniquely locate a device.
+ */
+ dev_t devt;
bool missing;
};
@@ -668,7 +681,7 @@ enum btrfs_map_op {
BTRFS_MAP_GET_READ_MIRRORS,
};
-static inline enum btrfs_map_op btrfs_op(struct bio *bio)
+static inline enum btrfs_map_op btrfs_op(const struct bio *bio)
{
switch (bio_op(bio)) {
case REQ_OP_WRITE:
@@ -715,12 +728,12 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
- u64 type);
+ struct btrfs_space_info *space_info,
+ u64 type);
void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
blk_mode_t flags, void *holder);
-struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
- bool mount_arg_dev);
+struct btrfs_device *btrfs_scan_one_device(const char *path, bool mount_arg_dev);
int btrfs_forget_devices(dev_t devt);
void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices);
@@ -754,7 +767,8 @@ void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf);
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
-int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset);
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ bool verbose);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset);
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
@@ -786,6 +800,8 @@ struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_inf
struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
u64 logical, u64 length);
void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map);
+struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
+ int copy_num, bool drop_cache);
void btrfs_release_disk_super(struct btrfs_super_block *super);
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
@@ -844,7 +860,26 @@ static inline const char *btrfs_dev_name(const struct btrfs_device *device)
if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
return "<missing disk>";
else
- return rcu_str_deref(device->name);
+ return rcu_dereference(device->name);
+}
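With the plain "const char __rcu *name", readers such as btrfs_dev_name() only need rcu_dereference(). The writer side is not shown in this excerpt; a hedged sketch of the update pattern it implies (example_rename_device is an illustrative name, and renames are assumed to be serialized externally, e.g. by device_list_mutex):

#include <linux/rcupdate.h>
#include <linux/slab.h>

static int example_rename_device(struct btrfs_device *device,
				 const char *path)
{
	const char *old_name;
	const char *new_name = kstrdup(path, GFP_KERNEL);

	if (!new_name)
		return -ENOMEM;

	old_name = rcu_dereference_protected(device->name, 1);
	rcu_assign_pointer(device->name, new_name);	/* publish */
	synchronize_rcu();		/* wait out current readers */
	kfree(old_name);
	return 0;
}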
+
+static inline void btrfs_warn_unknown_chunk_allocation(enum btrfs_chunk_allocation_policy pol)
+{
+ WARN_ONCE(1, "unknown allocation policy %d, fallback to regular", pol);
+}
+
+static inline void btrfs_fs_devices_inc_holding(struct btrfs_fs_devices *fs_devices)
+{
+ lockdep_assert_held(&uuid_mutex);
+ ASSERT(fs_devices->holding >= 0);
+ fs_devices->holding++;
+}
+
+static inline void btrfs_fs_devices_dec_holding(struct btrfs_fs_devices *fs_devices)
+{
+ lockdep_assert_held(&uuid_mutex);
+ ASSERT(fs_devices->holding > 0);
+ fs_devices->holding--;
}
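Per the comment on the holding field, a mount-time caller would bump the counter before dropping uuid_mutex for sget_fc() and drop it again afterwards. A hypothetical sketch (only the two helpers are real; the surrounding calls illustrate the intended flow):

mutex_lock(&uuid_mutex);
btrfs_fs_devices_inc_holding(fs_devices);
mutex_unlock(&uuid_mutex);

/* Safe: fs_devices cannot be released while "holding" is elevated,
 * even though uuid_mutex is not held across sget_fc(). */
sb = sget_fc(fc, btrfs_fc_test, set_anon_super_fc);

mutex_lock(&uuid_mutex);
btrfs_fs_devices_dec_holding(fs_devices);
mutex_unlock(&uuid_mutex);
/* ...then open the devices or tear down, depending on IS_ERR(sb)... */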
void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3e0edbcf73e1..ab55d10bd71f 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -29,9 +29,8 @@ int btrfs_getxattr(const struct inode *inode, const char *name,
{
struct btrfs_dir_item *di;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
- int ret = 0;
unsigned long data_ptr;
path = btrfs_alloc_path();
@@ -41,26 +40,19 @@ int btrfs_getxattr(const struct inode *inode, const char *name,
/* lookup the xattr by name */
di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)),
name, strlen(name), 0);
- if (!di) {
- ret = -ENODATA;
- goto out;
- } else if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
- }
+ if (!di)
+ return -ENODATA;
+ if (IS_ERR(di))
+ return PTR_ERR(di);
leaf = path->nodes[0];
/* if size is 0, that means we want the size of the attr */
- if (!size) {
- ret = btrfs_dir_data_len(leaf, di);
- goto out;
- }
+ if (!size)
+ return btrfs_dir_data_len(leaf, di);
/* now get the data out of our dir_item */
- if (btrfs_dir_data_len(leaf, di) > size) {
- ret = -ERANGE;
- goto out;
- }
+ if (btrfs_dir_data_len(leaf, di) > size)
+ return -ERANGE;
/*
* The way things are packed into the leaf is like this
@@ -73,11 +65,7 @@ int btrfs_getxattr(const struct inode *inode, const char *name,
btrfs_dir_name_len(leaf, di));
read_extent_buffer(leaf, buffer, data_ptr,
btrfs_dir_data_len(leaf, di));
- ret = btrfs_dir_data_len(leaf, di);
-
-out:
- btrfs_free_path(path);
- return ret;
+ return btrfs_dir_data_len(leaf, di);
}
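The size == 0 branch implements the standard getxattr size-query convention, so callers can use the usual two-call pattern. A hypothetical caller, inside some helper:

int len = btrfs_getxattr(inode, name, NULL, 0);	/* query the size */

if (len > 0) {
	char *value = kmalloc(len, GFP_KERNEL);

	if (!value)
		return -ENOMEM;
	/* -ERANGE is possible if the attr grew between the two calls. */
	len = btrfs_getxattr(inode, name, value, len);
	kfree(value);
}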
int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
@@ -85,7 +73,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
{
struct btrfs_dir_item *di = NULL;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
size_t name_len = strlen(name);
int ret = 0;
@@ -97,7 +85,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->skip_release_on_error = 1;
+ path->skip_release_on_error = true;
if (!value) {
di = btrfs_lookup_xattr(trans, root, path,
@@ -212,7 +200,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
*/
}
out:
- btrfs_free_path(path);
if (!ret) {
set_bit(BTRFS_INODE_COPY_EVERYTHING,
&BTRFS_I(inode)->runtime_flags);
@@ -278,7 +265,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
struct btrfs_key key;
struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int iter_ret = 0;
int ret = 0;
size_t total_size = 0, size_left = size;
@@ -354,8 +341,6 @@ next:
else
ret = total_size;
- btrfs_free_path(path);
-
return ret;
}
@@ -510,14 +495,15 @@ static int btrfs_initxattrs(struct inode *inode,
*/
nofs_flag = memalloc_nofs_save();
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
- name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
- strlen(xattr->name) + 1, GFP_KERNEL);
+ const size_t name_len = XATTR_SECURITY_PREFIX_LEN +
+ strlen(xattr->name) + 1;
+
+ name = kmalloc(name_len, GFP_KERNEL);
if (!name) {
ret = -ENOMEM;
break;
}
- strcpy(name, XATTR_SECURITY_PREFIX);
- strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+ scnprintf(name, name_len, "%s%s", XATTR_SECURITY_PREFIX, xattr->name);
if (strcmp(name, XATTR_NAME_CAPS) == 0)
clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
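scnprintf() folds the two strcpy() calls into a single bounded write. Unlike snprintf(), which returns the length the output would have had, scnprintf() returns what was actually stored, which is the value callers usually want. A self-contained illustration:

char buf[8];

/* "hello world" is 11 chars; buf holds 7 plus the terminating NUL. */
int a = snprintf(buf, sizeof(buf), "%s", "hello world");	/* a == 11 */
int b = scnprintf(buf, sizeof(buf), "%s", "hello world");	/* b == 7 */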
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 545f413d81fc..6caba8be7c84 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -34,11 +34,9 @@ struct workspace {
int level;
};
-static struct workspace_manager wsm;
-
-struct list_head *zlib_get_workspace(unsigned int level)
+struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level)
{
- struct list_head *ws = btrfs_get_workspace(BTRFS_COMPRESS_ZLIB, level);
+ struct list_head *ws = btrfs_get_workspace(fs_info, BTRFS_COMPRESS_ZLIB, level);
struct workspace *workspace = list_entry(ws, struct workspace, list);
workspace->level = level;
@@ -55,8 +53,25 @@ void zlib_free_workspace(struct list_head *ws)
kfree(workspace);
}
-struct list_head *zlib_alloc_workspace(unsigned int level)
+/*
+ * For s390 hardware acceleration, the buffer size should be at least
+ * ZLIB_DFLTCC_BUF_SIZE to achieve the best performance.
+ *
+ * But if the block size is larger than the page size, the folios are
+ * already large enough for the s390 hardware to work on directly.
+ */
+static bool need_special_buffer(struct btrfs_fs_info *fs_info)
+{
+ if (!zlib_deflate_dfltcc_enabled())
+ return false;
+ if (btrfs_min_folio_size(fs_info) >= ZLIB_DFLTCC_BUF_SIZE)
+ return false;
+ return true;
+}
+
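need_special_buffer() keys off btrfs_min_folio_size(). Given the "min_folio_shift = PAGE_SHIFT + fs_info->block_min_order" computation used later in this file, the helper plausibly reduces to the following (a sketch; block_min_order is assumed to be the minimum folio order forced by bs > ps):

/* Smallest folio the fs will see: PAGE_SIZE normally, larger when
 * the block size exceeds the page size. */
static inline u32 btrfs_min_folio_size(const struct btrfs_fs_info *fs_info)
{
	return PAGE_SIZE << fs_info->block_min_order;
}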
+struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned int level)
{
+ const u32 blocksize = fs_info->sectorsize;
struct workspace *workspace;
int workspacesize;
@@ -69,19 +84,15 @@ struct list_head *zlib_alloc_workspace(unsigned int level)
workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL | __GFP_NOWARN);
workspace->level = level;
workspace->buf = NULL;
- /*
- * In case of s390 zlib hardware support, allocate lager workspace
- * buffer. If allocator fails, fall back to a single page buffer.
- */
- if (zlib_deflate_dfltcc_enabled()) {
+ if (need_special_buffer(fs_info)) {
workspace->buf = kmalloc(ZLIB_DFLTCC_BUF_SIZE,
__GFP_NOMEMALLOC | __GFP_NORETRY |
__GFP_NOWARN | GFP_NOIO);
workspace->buf_size = ZLIB_DFLTCC_BUF_SIZE;
}
if (!workspace->buf) {
- workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
- workspace->buf_size = PAGE_SIZE;
+ workspace->buf = kmalloc(blocksize, GFP_KERNEL);
+ workspace->buf_size = blocksize;
}
if (!workspace->strm.workspace || !workspace->buf)
goto fail;
@@ -120,8 +131,6 @@ static int copy_data_into_buffer(struct address_space *mapping,
ret = btrfs_compress_filemap_get_folio(mapping, cur, &folio);
if (ret < 0)
return ret;
- /* No large folio support yet. */
- ASSERT(!folio_test_large(folio));
offset = offset_in_folio(folio, cur);
copy_length = min(folio_size(folio) - offset,
@@ -135,11 +144,15 @@ static int copy_data_into_buffer(struct address_space *mapping,
return 0;
}
-int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
+int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
int ret;
char *data_in = NULL;
char *cfolio_out;
@@ -148,7 +161,8 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
struct folio *out_folio = NULL;
unsigned long len = *total_out;
unsigned long nr_dest_folios = *out_folios;
- const unsigned long max_out = nr_dest_folios * PAGE_SIZE;
+ const unsigned long max_out = nr_dest_folios << min_folio_shift;
+ const u32 blocksize = fs_info->sectorsize;
const u64 orig_end = start + len;
*out_folios = 0;
@@ -157,9 +171,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = zlib_deflateInit(&workspace->strm, workspace->level);
if (unlikely(ret != Z_OK)) {
- struct btrfs_inode *inode = BTRFS_I(mapping->host);
-
- btrfs_err(inode->root->fs_info,
+ btrfs_err(fs_info,
"zlib compression init failed, error %d root %llu inode %llu offset %llu",
ret, btrfs_root_id(inode->root), btrfs_ino(inode), start);
ret = -EIO;
@@ -169,7 +181,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -181,7 +193,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
workspace->strm.next_in = workspace->buf;
workspace->strm.avail_in = 0;
workspace->strm.next_out = cfolio_out;
- workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_out = min_folio_size;
while (workspace->strm.total_in < len) {
/*
@@ -193,10 +205,11 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
unsigned int copy_length = min(bytes_left, workspace->buf_size);
/*
- * This can only happen when hardware zlib compression is
- * enabled.
+ * For s390 hardware-accelerated zlib, when our folio is smaller
+ * than copy_length, we need to fill the buffer so that we can
+ * take full advantage of hardware acceleration.
*/
- if (copy_length > PAGE_SIZE) {
+ if (need_special_buffer(fs_info)) {
ret = copy_data_into_buffer(mapping, workspace,
start, copy_length);
if (ret < 0)
@@ -205,7 +218,6 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
workspace->strm.next_in = workspace->buf;
workspace->strm.avail_in = copy_length;
} else {
- unsigned int pg_off;
unsigned int cur_len;
if (data_in) {
@@ -217,9 +229,9 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
start, &in_folio);
if (ret < 0)
goto out;
- pg_off = offset_in_page(start);
- cur_len = btrfs_calc_input_length(orig_end, start);
- data_in = kmap_local_folio(in_folio, pg_off);
+ cur_len = btrfs_calc_input_length(in_folio, orig_end, start);
+ data_in = kmap_local_folio(in_folio,
+ offset_in_folio(in_folio, start));
start += cur_len;
workspace->strm.next_in = data_in;
workspace->strm.avail_in = cur_len;
@@ -228,9 +240,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
if (unlikely(ret != Z_OK)) {
- struct btrfs_inode *inode = BTRFS_I(mapping->host);
-
- btrfs_warn(inode->root->fs_info,
+ btrfs_warn(fs_info,
"zlib compression failed, error %d root %llu inode %llu offset %llu",
ret, btrfs_root_id(inode->root), btrfs_ino(inode),
start);
@@ -240,7 +250,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
}
/* we're making it bigger, give up */
- if (workspace->strm.total_in > 8192 &&
+ if (workspace->strm.total_in > blocksize * 2 &&
workspace->strm.total_in <
workspace->strm.total_out) {
ret = -E2BIG;
@@ -255,7 +265,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = -E2BIG;
goto out;
}
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -263,7 +273,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
cfolio_out = folio_address(out_folio);
folios[nr_folios] = out_folio;
nr_folios++;
- workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_out = min_folio_size;
workspace->strm.next_out = cfolio_out;
}
/* we're all done */
@@ -281,7 +291,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = zlib_deflate(&workspace->strm, Z_FINISH);
if (ret == Z_STREAM_END)
break;
- if (ret != Z_OK && ret != Z_BUF_ERROR) {
+ if (unlikely(ret != Z_OK && ret != Z_BUF_ERROR)) {
zlib_deflateEnd(&workspace->strm);
ret = -EIO;
goto out;
@@ -291,7 +301,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = -E2BIG;
goto out;
}
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -299,7 +309,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
cfolio_out = folio_address(out_folio);
folios[nr_folios] = out_folio;
nr_folios++;
- workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_out = min_folio_size;
workspace->strm.next_out = cfolio_out;
}
}
@@ -325,20 +335,22 @@ out:
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
int ret = 0, ret2;
int wbits = MAX_WBITS;
char *data_in;
size_t total_out = 0;
unsigned long folio_in_index = 0;
size_t srclen = cb->compressed_len;
- unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
+ unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size);
unsigned long buf_start;
struct folio **folios_in = cb->compressed_folios;
data_in = kmap_local_folio(folios_in[folio_in_index], 0);
workspace->strm.next_in = data_in;
- workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
+ workspace->strm.avail_in = min_t(size_t, srclen, min_folio_size);
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
@@ -399,7 +411,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
data_in = kmap_local_folio(folios_in[folio_in_index], 0);
workspace->strm.next_in = data_in;
tmp = srclen - workspace->strm.total_in;
- workspace->strm.avail_in = min(tmp, PAGE_SIZE);
+ workspace->strm.avail_in = min(tmp, min_folio_size);
}
}
if (unlikely(ret != Z_STREAM_END)) {
@@ -487,8 +499,7 @@ out:
return ret;
}
-const struct btrfs_compress_op btrfs_zlib_compress = {
- .workspace_manager = &wsm,
+const struct btrfs_compress_levels btrfs_zlib_compress = {
.min_level = 1,
.max_level = 9,
.default_level = BTRFS_ZLIB_DEFAULT_LEVEL,
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index fb8b8b29c169..359a98e6de85 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -9,7 +9,6 @@
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
-#include "rcu-string.h"
#include "disk-io.h"
#include "block-group.h"
#include "dev-replace.h"
@@ -17,6 +16,8 @@
#include "fs.h"
#include "accessors.h"
#include "bio.h"
+#include "transaction.h"
+#include "sysfs.h"
/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES 4096
@@ -36,12 +37,15 @@
#define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G)
#define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G)
-#define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
-#define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
+#define BTRFS_SB_LOG_FIRST_SHIFT ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
+#define BTRFS_SB_LOG_SECOND_SHIFT ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES 2
+/* Default cap on the number of active zones when the device reports no limit. */
+#define BTRFS_DEFAULT_MAX_ACTIVE_ZONES 128
+
/*
* Minimum of active zones we need:
*
@@ -89,7 +93,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
sector_t sector;
for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
- ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
+ ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL,
+ "zones[%d].type=%d", i, zones[i].type);
empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
full[i] = sb_zone_is_full(&zones[i]);
}
@@ -162,14 +167,14 @@ static inline u32 sb_zone_number(int shift, int mirror)
{
u64 zone = U64_MAX;
- ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
+ ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX, "mirror=%d", mirror);
switch (mirror) {
case 0: zone = 0; break;
case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
}
- ASSERT(zone <= U32_MAX);
+ ASSERT(zone <= U32_MAX, "zone=%llu", zone);
return (u32)zone;
}
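
A worked example of the zone numbering above, assuming a 256 MiB zone size:

	/*
	 * shift == ilog2(SZ_256M) == 28 and
	 * BTRFS_SB_LOG_FIRST_SHIFT == ilog2(512 GiB) == 39, so mirror 1
	 * lands in zone 1ULL << (39 - 28) == 2048; at 256 MiB per zone
	 * that is byte offset 512 GiB, i.e. BTRFS_SB_LOG_FIRST_OFFSET.
	 */
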
@@ -236,7 +241,8 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
unsigned int i;
u32 zno;
- ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
+ ASSERT(IS_ALIGNED(pos, zinfo->zone_size),
+ "pos=%llu zinfo->zone_size=%llu", pos, zinfo->zone_size);
zno = pos >> zinfo->zone_size_shift;
/*
* We cannot report zones beyond the zone end. So, it is OK to
@@ -260,17 +266,17 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
}
}
- ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
- copy_zone_info_cb, zones);
+ ret = blkdev_report_zones_cached(device->bdev, pos >> SECTOR_SHIFT,
+ *nr_zones, copy_zone_info_cb, zones);
if (ret < 0) {
- btrfs_err_in_rcu(device->fs_info,
+ btrfs_err(device->fs_info,
"zoned: failed to read zone %llu on %s (devid %llu)",
- pos, rcu_str_deref(device->name),
+ pos, rcu_dereference(device->name),
device->devid);
return ret;
}
*nr_zones = ret;
- if (!ret)
+ if (unlikely(!ret))
return -EIO;
/* Populate cache */
@@ -311,7 +317,7 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
if (ret < 0)
return ret;
/* No dev extents at all? Not good */
- if (ret > 0)
+ if (unlikely(ret > 0))
return -EUCLEAN;
}
@@ -395,16 +401,16 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
/* We reject devices with a zone size larger than 8GB */
if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"zoned: %s: zone size %llu larger than supported maximum %llu",
- rcu_str_deref(device->name),
+ rcu_dereference(device->name),
zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
ret = -EINVAL;
goto out;
} else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"zoned: %s: zone size %llu smaller than supported minimum %u",
- rcu_str_deref(device->name),
+ rcu_dereference(device->name),
zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
ret = -EINVAL;
goto out;
@@ -416,11 +422,14 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
- max_active_zones = bdev_max_active_zones(bdev);
+ max_active_zones = min_not_zero(bdev_max_active_zones(bdev),
+ bdev_max_open_zones(bdev));
+ if (!max_active_zones && zone_info->nr_zones > BTRFS_DEFAULT_MAX_ACTIVE_ZONES)
+ max_active_zones = BTRFS_DEFAULT_MAX_ACTIVE_ZONES;
if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"zoned: %s: max active zones %u is too small, need at least %u active zones",
- rcu_str_deref(device->name), max_active_zones,
+ rcu_dereference(device->name), max_active_zones,
BTRFS_MIN_ACTIVE_ZONES);
ret = -EINVAL;
goto out;
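
The limit selection above reads as a small helper; pick_max_active_zones is an editorial name, not part of the patch:

	static u32 pick_max_active_zones(struct block_device *bdev, u32 nr_zones)
	{
		/* Prefer the smaller non-zero of the two device limits. */
		u32 limit = min_not_zero(bdev_max_active_zones(bdev),
					 bdev_max_open_zones(bdev));

		/* Cap an unlimited device so btrfs still tracks active zones. */
		if (!limit && nr_zones > BTRFS_DEFAULT_MAX_ACTIVE_ZONES)
			limit = BTRFS_DEFAULT_MAX_ACTIVE_ZONES;
		return limit;
	}
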
@@ -460,9 +469,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
zone_info->zone_cache = vcalloc(zone_info->nr_zones,
sizeof(struct blk_zone));
if (!zone_info->zone_cache) {
- btrfs_err_in_rcu(device->fs_info,
+ btrfs_err(device->fs_info,
"zoned: failed to allocate zone cache for %s",
- rcu_str_deref(device->name));
+ rcu_dereference(device->name));
ret = -ENOMEM;
goto out;
}
@@ -487,6 +496,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
case BLK_ZONE_COND_IMP_OPEN:
case BLK_ZONE_COND_EXP_OPEN:
case BLK_ZONE_COND_CLOSED:
+ case BLK_ZONE_COND_ACTIVE:
__set_bit(nreported, zone_info->active_zones);
nactive++;
break;
@@ -496,20 +506,25 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
}
- if (nreported != zone_info->nr_zones) {
- btrfs_err_in_rcu(device->fs_info,
+ if (unlikely(nreported != zone_info->nr_zones)) {
+ btrfs_err(device->fs_info,
"inconsistent number of zones on %s (%u/%u)",
- rcu_str_deref(device->name), nreported,
+ rcu_dereference(device->name), nreported,
zone_info->nr_zones);
ret = -EIO;
goto out;
}
if (max_active_zones) {
- if (nactive > max_active_zones) {
- btrfs_err_in_rcu(device->fs_info,
+ if (unlikely(nactive > max_active_zones)) {
+ if (bdev_max_active_zones(bdev) == 0) {
+ max_active_zones = 0;
+ zone_info->max_active_zones = 0;
+ goto validate;
+ }
+ btrfs_err(device->fs_info,
"zoned: %u active zones on %s exceeds max_active_zones %u",
- nactive, rcu_str_deref(device->name),
+ nactive, rcu_dereference(device->name),
max_active_zones);
ret = -EIO;
goto out;
@@ -519,6 +534,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
}
+validate:
/* Validate superblock log */
nr_zones = BTRFS_NR_SB_LOG_ZONES;
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
@@ -537,8 +553,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
if (ret)
goto out;
- if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
- btrfs_err_in_rcu(device->fs_info,
+ if (unlikely(nr_zones != BTRFS_NR_SB_LOG_ZONES)) {
+ btrfs_err(device->fs_info,
"zoned: failed to read super block log zone info at devid %llu zone %u",
device->devid, sb_zone);
ret = -EUCLEAN;
@@ -555,8 +571,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
ret = sb_write_pointer(device->bdev,
&zone_info->sb_zones[sb_pos], &sb_wp);
- if (ret != -ENOENT && ret) {
- btrfs_err_in_rcu(device->fs_info,
+ if (unlikely(ret != -ENOENT && ret)) {
+ btrfs_err(device->fs_info,
"zoned: super block log zone corrupted devid %llu zone %u",
device->devid, sb_zone);
ret = -EUCLEAN;
@@ -575,9 +591,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
emulated = "emulated ";
}
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"%s block device %s, %u %szones of %llu bytes",
- model, rcu_str_deref(device->name), zone_info->nr_zones,
+ model, rcu_dereference(device->name), zone_info->nr_zones,
emulated, zone_info->zone_size);
return 0;
@@ -883,12 +899,12 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
if (sb_zone + 1 >= nr_zones)
return -ENOENT;
- ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
- BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
- zones);
+ ret = blkdev_report_zones_cached(bdev, zone_start_sector(sb_zone, bdev),
+ BTRFS_NR_SB_LOG_ZONES,
+ copy_zone_info_cb, zones);
if (ret < 0)
return ret;
- if (ret != BTRFS_NR_SB_LOG_ZONES)
+ if (unlikely(ret != BTRFS_NR_SB_LOG_ZONES))
return -EIO;
return sb_log_location(bdev, zones, rw, bytenr_ret);
@@ -989,7 +1005,7 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
}
/* All the zones are FULL. Should not reach here. */
- ASSERT(0);
+ DEBUG_WARN("unexpected state, all zones full");
return -EIO;
}
@@ -1042,8 +1058,10 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
bool have_sb;
int i;
- ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
- ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
+ ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size),
+ "hole_start=%llu zinfo->zone_size=%llu", hole_start, zinfo->zone_size);
+ ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size),
+ "num_bytes=%llu zinfo->zone_size=%llu", num_bytes, zinfo->zone_size);
while (pos < hole_end) {
begin = pos >> shift;
@@ -1159,8 +1177,10 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
u64 pos;
int ret;
- ASSERT(IS_ALIGNED(start, zinfo->zone_size));
- ASSERT(IS_ALIGNED(size, zinfo->zone_size));
+ ASSERT(IS_ALIGNED(start, zinfo->zone_size),
+ "start=%llu, zinfo->zone_size=%llu", start, zinfo->zone_size);
+ ASSERT(IS_ALIGNED(size, zinfo->zone_size),
+ "size=%llu, zinfo->zone_size=%llu", size, zinfo->zone_size);
if (begin + nbits > zinfo->nr_zones)
return -ERANGE;
@@ -1182,10 +1202,10 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
continue;
/* Free regions should be empty */
- btrfs_warn_in_rcu(
+ btrfs_warn(
device->fs_info,
"zoned: resetting device %s (devid %llu) zone %llu for allocation",
- rcu_str_deref(device->name), device->devid, pos >> shift);
+ rcu_dereference(device->name), device->devid, pos >> shift);
WARN_ON_ONCE(1);
ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
@@ -1240,7 +1260,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
root = btrfs_extent_root(fs_info, key.objectid);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
/* We should not find the exact match */
- if (!ret)
+ if (unlikely(!ret))
ret = -EUCLEAN;
if (ret < 0)
return ret;
@@ -1261,8 +1281,8 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
else
length = fs_info->nodesize;
- if (!(found_key.objectid >= cache->start &&
- found_key.objectid + length <= cache->start + cache->length)) {
+ if (unlikely(!(found_key.objectid >= cache->start &&
+ found_key.objectid + length <= cache->start + cache->length))) {
return -EUCLEAN;
}
*offset_ret = found_key.objectid + length - cache->start;
@@ -1277,7 +1297,7 @@ struct zone_info {
static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
struct zone_info *info, unsigned long *active,
- struct btrfs_chunk_map *map)
+ struct btrfs_chunk_map *map, bool new)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
struct btrfs_device *device;
@@ -1304,9 +1324,12 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
if (!btrfs_dev_is_sequential(device, info->physical)) {
up_read(&dev_replace->rwsem);
info->alloc_offset = WP_CONVENTIONAL;
+ info->capacity = device->zone_info->zone_size;
return 0;
}
+ ASSERT(!new || btrfs_dev_is_empty_zone(device, info->physical));
+
/* This zone will be used for allocation, so mark this zone non-empty. */
btrfs_dev_clear_zone_empty(device, info->physical);
@@ -1319,6 +1342,18 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
* to determine the allocation offset within the zone.
*/
WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size));
+
+ if (new) {
+ sector_t capacity;
+
+ capacity = bdev_zone_capacity(device->bdev, info->physical >> SECTOR_SHIFT);
+ up_read(&dev_replace->rwsem);
+ info->alloc_offset = 0;
+ info->capacity = capacity << SECTOR_SHIFT;
+
+ return 0;
+ }
+
nofs_flag = memalloc_nofs_save();
ret = btrfs_get_dev_zone(device, info->physical, &zone);
memalloc_nofs_restore(nofs_flag);
@@ -1330,10 +1365,10 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
return 0;
}
- if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
- btrfs_err_in_rcu(fs_info,
+ if (unlikely(zone.type == BLK_ZONE_TYPE_CONVENTIONAL)) {
+ btrfs_err(fs_info,
"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
- zone.start << SECTOR_SHIFT, rcu_str_deref(device->name),
+ zone.start << SECTOR_SHIFT, rcu_dereference(device->name),
device->devid);
up_read(&dev_replace->rwsem);
return -EIO;
@@ -1344,10 +1379,10 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
switch (zone.cond) {
case BLK_ZONE_COND_OFFLINE:
case BLK_ZONE_COND_READONLY:
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"zoned: offline/readonly zone %llu on device %s (devid %llu)",
(info->physical >> device->zone_info->zone_size_shift),
- rcu_str_deref(device->name), device->devid);
+ rcu_dereference(device->name), device->devid);
info->alloc_offset = WP_MISSING_DEV;
break;
case BLK_ZONE_COND_EMPTY:
@@ -1372,7 +1407,7 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
struct zone_info *info,
unsigned long *active)
{
- if (info->alloc_offset == WP_MISSING_DEV) {
+ if (unlikely(info->alloc_offset == WP_MISSING_DEV)) {
btrfs_err(bg->fs_info,
"zoned: cannot recover write pointer for zone %llu",
info->physical);
@@ -1389,7 +1424,8 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
@@ -1400,26 +1436,33 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);
- if (zone_info[0].alloc_offset == WP_MISSING_DEV) {
+ if (unlikely(zone_info[0].alloc_offset == WP_MISSING_DEV)) {
btrfs_err(bg->fs_info,
"zoned: cannot recover write pointer for zone %llu",
zone_info[0].physical);
return -EIO;
}
- if (zone_info[1].alloc_offset == WP_MISSING_DEV) {
+ if (unlikely(zone_info[1].alloc_offset == WP_MISSING_DEV)) {
btrfs_err(bg->fs_info,
"zoned: cannot recover write pointer for zone %llu",
zone_info[1].physical);
return -EIO;
}
- if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) {
+
+ if (zone_info[0].alloc_offset == WP_CONVENTIONAL)
+ zone_info[0].alloc_offset = last_alloc;
+
+ if (zone_info[1].alloc_offset == WP_CONVENTIONAL)
+ zone_info[1].alloc_offset = last_alloc;
+
+ if (unlikely(zone_info[0].alloc_offset != zone_info[1].alloc_offset)) {
btrfs_err(bg->fs_info,
"zoned: write pointer offset mismatch of zones in DUP profile");
return -EIO;
}
if (test_bit(0, active) != test_bit(1, active)) {
- if (!btrfs_zone_activate(bg))
+ if (unlikely(!btrfs_zone_activate(bg)))
return -EIO;
} else if (test_bit(0, active)) {
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
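
Why the substitution matters: a conventional zone has no hardware write pointer, so WP_CONVENTIONAL stripes take the emulated offset derived from the extent tree (last_alloc). For example, if stripe 0 sits on a sequential zone and stripe 1 on a conventional one, stripe 1 inherits last_alloc, and the equality check above still catches a stripe 0 write pointer that drifted from the on-disk allocation state.
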
@@ -1432,7 +1475,8 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
int i;
@@ -1447,20 +1491,22 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);
for (i = 0; i < map->num_stripes; i++) {
- if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
- zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
- if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
- !btrfs_test_opt(fs_info, DEGRADED)) {
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ zone_info[i].alloc_offset = last_alloc;
+
+ if (unlikely((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
+ !btrfs_test_opt(fs_info, DEGRADED))) {
btrfs_err(fs_info,
"zoned: write pointer offset mismatch of zones in %s profile",
btrfs_bg_type_to_raid_name(map->type));
return -EIO;
}
if (test_bit(0, active) != test_bit(i, active)) {
- if (!btrfs_test_opt(fs_info, DEGRADED) &&
- !btrfs_zone_activate(bg)) {
+ if (unlikely(!btrfs_test_opt(fs_info, DEGRADED) &&
+ !btrfs_zone_activate(bg))) {
return -EIO;
}
} else {
@@ -1480,9 +1526,12 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
+ u64 stripe_nr = 0, stripe_offset = 0;
+ u32 stripe_index = 0;
if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
@@ -1490,13 +1539,30 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
return -EINVAL;
}
+ if (last_alloc) {
+ u32 factor = map->num_stripes;
+
+ stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT;
+ stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK;
+ stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+ }
+
for (int i = 0; i < map->num_stripes; i++) {
- if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
- zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
+ zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr);
+
+ if (stripe_index > i)
+ zone_info[i].alloc_offset += BTRFS_STRIPE_LEN;
+ else if (stripe_index == i)
+ zone_info[i].alloc_offset += stripe_offset;
+ }
+
if (test_bit(0, active) != test_bit(i, active)) {
- if (!btrfs_zone_activate(bg))
+ if (unlikely(!btrfs_zone_activate(bg)))
return -EIO;
} else {
if (test_bit(0, active))
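
A worked run of the stripe math above, assuming BTRFS_STRIPE_LEN == 64 KiB (shift 16), num_stripes == 2 and last_alloc == 320 KiB:

	/*
	 * stripe_nr = 320K >> 16 = 5, stripe_offset = 0;
	 * div_u64_rem(5, 2) -> stripe_nr = 2, stripe_index = 1.
	 *
	 * Stripe 0 (index < stripe_index):  2 * 64K + 64K = 192K
	 * Stripe 1 (index == stripe_index): 2 * 64K + 0K  = 128K
	 *
	 * 192K + 128K == 320K, matching data laid out round-robin in
	 * 64K chunks: chunks 0, 2, 4 on stripe 0; chunks 1, 3 on stripe 1.
	 */
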
@@ -1512,9 +1578,12 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
+ u64 stripe_nr = 0, stripe_offset = 0;
+ u32 stripe_index = 0;
if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
@@ -1522,19 +1591,35 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
return -EINVAL;
}
+ if (last_alloc) {
+ u32 factor = map->num_stripes / map->sub_stripes;
+
+ stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT;
+ stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK;
+ stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+ }
+
for (int i = 0; i < map->num_stripes; i++) {
- if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
- zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
if (test_bit(0, active) != test_bit(i, active)) {
- if (!btrfs_zone_activate(bg))
+ if (unlikely(!btrfs_zone_activate(bg)))
return -EIO;
} else {
if (test_bit(0, active))
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
}
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
+ zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr);
+
+ if (stripe_index > (i / map->sub_stripes))
+ zone_info[i].alloc_offset += BTRFS_STRIPE_LEN;
+ else if (stripe_index == (i / map->sub_stripes))
+ zone_info[i].alloc_offset += stripe_offset;
+ }
+
if ((i % map->sub_stripes) == 0) {
bg->zone_capacity += zone_info[i].capacity;
bg->alloc_offset += zone_info[i].alloc_offset;
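
The RAID10 variant divides by sub_stripes first, since mirrors within a sub-stripe hold identical data: with num_stripes == 4 and sub_stripes == 2, factor == 2, devices {0,1} and {2,3} form mirrored pairs, and the per-device offset is keyed on i / sub_stripes rather than i itself.
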
@@ -1550,7 +1635,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
struct btrfs_chunk_map *map;
u64 logical = cache->start;
u64 length = cache->length;
- struct zone_info *zone_info = NULL;
+ struct zone_info AUTO_KFREE(zone_info);
int ret;
int i;
unsigned long *active = NULL;
@@ -1562,7 +1647,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
return 0;
/* Sanity check */
- if (!IS_ALIGNED(length, fs_info->zone_size)) {
+ if (unlikely(!IS_ALIGNED(length, fs_info->zone_size))) {
btrfs_err(fs_info,
"zoned: block group %llu len %llu unaligned to zone size %llu",
logical, length, fs_info->zone_size);
@@ -1588,7 +1673,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
for (i = 0; i < map->num_stripes; i++) {
- ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map);
+ ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map, new);
if (ret)
goto out;
@@ -1602,8 +1687,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
if (num_conventional > 0) {
- /* Zone capacity is always zone size in emulation */
- cache->zone_capacity = cache->length;
ret = calculate_alloc_pointer(cache, &last_alloc, new);
if (ret) {
btrfs_err(fs_info,
@@ -1612,6 +1695,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
} else if (map->num_stripes == num_conventional) {
cache->alloc_offset = last_alloc;
+ cache->zone_capacity = cache->length;
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
goto out;
}
@@ -1623,18 +1707,22 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
break;
case BTRFS_BLOCK_GROUP_DUP:
- ret = btrfs_load_block_group_dup(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_dup(cache, map, zone_info, active,
+ last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID1:
case BTRFS_BLOCK_GROUP_RAID1C3:
case BTRFS_BLOCK_GROUP_RAID1C4:
- ret = btrfs_load_block_group_raid1(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_raid1(cache, map, zone_info,
+ active, last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID0:
- ret = btrfs_load_block_group_raid0(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_raid0(cache, map, zone_info,
+ active, last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID10:
- ret = btrfs_load_block_group_raid10(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_raid10(cache, map, zone_info,
+ active, last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID5:
case BTRFS_BLOCK_GROUP_RAID6:
@@ -1659,7 +1747,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
* stripe.
*/
cache->alloc_offset = cache->zone_capacity;
- ret = 0;
}
out:
@@ -1669,10 +1756,10 @@ out:
!fs_info->stripe_root) {
btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
btrfs_bg_type_to_raid_name(map->type));
- return -EINVAL;
+ ret = -EINVAL;
}
- if (cache->alloc_offset > cache->zone_capacity) {
+ if (unlikely(cache->alloc_offset > cache->zone_capacity)) {
btrfs_err(fs_info,
"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
cache->alloc_offset, cache->zone_capacity,
@@ -1702,7 +1789,6 @@ out:
cache->physical_map = NULL;
}
bitmap_free(active);
- kfree(zone_info);
return ret;
}
@@ -1729,14 +1815,14 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
struct btrfs_inode *inode = bbio->inode;
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_block_group *cache;
bool ret = false;
if (!btrfs_is_zoned(fs_info))
return false;
- if (!inode || !is_data_inode(inode))
+ if (!is_data_inode(inode))
return false;
if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
@@ -1784,12 +1870,12 @@ static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
ordered->disk_bytenr = logical;
write_lock(&em_tree->lock);
- em = search_extent_mapping(em_tree, ordered->file_offset,
- ordered->num_bytes);
+ em = btrfs_search_extent_mapping(em_tree, ordered->file_offset,
+ ordered->num_bytes);
/* The em should be a new COW extent, thus it should not have an offset. */
- ASSERT(em->offset == 0);
+ ASSERT(em->offset == 0, "em->offset=%llu", em->offset);
em->disk_bytenr = logical;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
write_unlock(&em_tree->lock);
}
@@ -1799,8 +1885,8 @@ static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
struct btrfs_ordered_extent *new;
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
- split_extent_map(ordered->inode, ordered->file_offset,
- ordered->num_bytes, len, logical))
+ btrfs_split_extent_map(ordered->inode, ordered->file_offset,
+ ordered->num_bytes, len, logical))
return false;
new = btrfs_split_ordered_extent(ordered, len);
@@ -2003,7 +2089,7 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
&mapped_length, &bioc, NULL, NULL);
- if (ret || !bioc || mapped_length < PAGE_SIZE) {
+ if (unlikely(ret || !bioc || mapped_length < PAGE_SIZE)) {
ret = -EIO;
goto out_put_bioc;
}
@@ -2061,7 +2147,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
if (physical_pos == wp)
return 0;
- if (physical_pos > wp)
+ if (unlikely(physical_pos > wp))
return -EUCLEAN;
length = wp - physical_pos;
@@ -2097,10 +2183,15 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
goto out_unlock;
}
- /* No space left */
- if (btrfs_zoned_bg_is_full(block_group)) {
- ret = false;
- goto out_unlock;
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) {
+ /* The caller should check if the block group is full. */
+ if (WARN_ON_ONCE(btrfs_zoned_bg_is_full(block_group))) {
+ ret = false;
+ goto out_unlock;
+ }
+ } else {
+ /* Since it is already written, it should have been active. */
+ WARN_ON_ONCE(block_group->meta_write_pointer != block_group->start);
}
for (i = 0; i < map->num_stripes; i++) {
@@ -2158,27 +2249,15 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
const u64 end = block_group->start + block_group->length;
- struct radix_tree_iter iter;
struct extent_buffer *eb;
- void __rcu **slot;
+ unsigned long index, start = (block_group->start >> fs_info->nodesize_bits);
rcu_read_lock();
- radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
- block_group->start >> fs_info->sectorsize_bits) {
- eb = radix_tree_deref_slot(slot);
- if (!eb)
- continue;
- if (radix_tree_deref_retry(eb)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
-
+ xa_for_each_start(&fs_info->buffer_tree, index, eb, start) {
if (eb->start < block_group->start)
continue;
if (eb->start >= end)
break;
-
- slot = radix_tree_iter_resume(slot, &iter);
rcu_read_unlock();
wait_on_extent_buffer_writeback(eb);
rcu_read_lock();
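
The buffer_tree walk above is the standard xarray pattern: each iteration is an independent lookup, so the RCU lock can be dropped around the sleeping wait with none of the radix-tree slot retry/resume bookkeeping. A generic editorial sketch, where process() stands in for the sleeping work:

	static void walk_from(struct xarray *xa, unsigned long first)
	{
		unsigned long idx;
		void *entry;

		rcu_read_lock();
		xa_for_each_start(xa, idx, entry, first) {
			rcu_read_unlock();
			/* May sleep; entry must stay valid by other means. */
			process(entry);
			rcu_read_lock();
		}
		rcu_read_unlock();
	}
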
@@ -2186,6 +2265,40 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
rcu_read_unlock();
}
+static int call_zone_finish(struct btrfs_block_group *block_group,
+ struct btrfs_io_stripe *stripe)
+{
+ struct btrfs_device *device = stripe->dev;
+ const u64 physical = stripe->physical;
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ int ret;
+
+ if (!device->bdev)
+ return 0;
+
+ if (zinfo->max_active_zones == 0)
+ return 0;
+
+ if (btrfs_dev_is_sequential(device, physical)) {
+ unsigned int nofs_flags;
+
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+ physical >> SECTOR_SHIFT,
+ zinfo->zone_size >> SECTOR_SHIFT);
+ memalloc_nofs_restore(nofs_flags);
+
+ if (ret)
+ return ret;
+ }
+
+ if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ zinfo->reserved_active_zones++;
+ btrfs_dev_clear_active_zone(device, physical);
+
+ return 0;
+}
+
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -2270,31 +2383,12 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
down_read(&dev_replace->rwsem);
map = block_group->physical_map;
for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_device *device = map->stripes[i].dev;
- const u64 physical = map->stripes[i].physical;
- struct btrfs_zoned_device_info *zinfo = device->zone_info;
- unsigned int nofs_flags;
-
- if (!device->bdev)
- continue;
-
- if (zinfo->max_active_zones == 0)
- continue;
-
- nofs_flags = memalloc_nofs_save();
- ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
- physical >> SECTOR_SHIFT,
- zinfo->zone_size >> SECTOR_SHIFT);
- memalloc_nofs_restore(nofs_flags);
+ ret = call_zone_finish(block_group, &map->stripes[i]);
if (ret) {
up_read(&dev_replace->rwsem);
return ret;
}
-
- if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
- zinfo->reserved_active_zones++;
- btrfs_dev_clear_active_zone(device, physical);
}
up_read(&dev_replace->rwsem);
@@ -2372,16 +2466,17 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
return ret;
}
-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
+int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
{
struct btrfs_block_group *block_group;
u64 min_alloc_bytes;
if (!btrfs_is_zoned(fs_info))
- return;
+ return 0;
block_group = btrfs_lookup_block_group(fs_info, logical);
- ASSERT(block_group);
+ if (WARN_ON_ONCE(!block_group))
+ return -ENOENT;
/* No MIXED_BG on zoned btrfs. */
if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
@@ -2398,16 +2493,21 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len
out:
btrfs_put_block_group(block_group);
+ return 0;
}
static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
{
+ int ret;
struct btrfs_block_group *bg =
container_of(work, struct btrfs_block_group, zone_finish_work);
wait_on_extent_buffer_writeback(bg->last_eb);
free_extent_buffer(bg->last_eb);
- btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
+ ret = do_zone_finish(bg, true);
+ if (ret)
+ btrfs_handle_fs_error(bg->fs_info, ret,
+ "Failed to finish block-group's zone");
btrfs_put_block_group(bg);
}
@@ -2426,10 +2526,10 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
/* For the work */
btrfs_get_block_group(bg);
- atomic_inc(&eb->refs);
+ refcount_inc(&eb->refs);
bg->last_eb = eb;
INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
- queue_work(system_unbound_wq, &bg->zone_finish_work);
+ queue_work(system_dfl_wq, &bg->zone_finish_work);
}
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
@@ -2442,6 +2542,106 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
spin_unlock(&fs_info->relocation_bg_lock);
}
+void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
+ struct btrfs_space_info *space_info = data_sinfo;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_block_group *bg;
+ struct list_head *bg_list;
+ u64 alloc_flags;
+ bool first = true;
+ bool did_chunk_alloc = false;
+ int index;
+ int ret;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ if (fs_info->data_reloc_bg)
+ return;
+
+ if (sb_rdonly(fs_info->sb))
+ return;
+
+ alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
+ index = btrfs_bg_flags_to_raid_index(alloc_flags);
+
+ /* Scan the data space_info to find empty block groups. Take the second one. */
+again:
+ bg_list = &space_info->block_groups[index];
+ list_for_each_entry(bg, bg_list, list) {
+ if (bg->alloc_offset != 0)
+ continue;
+
+ if (first) {
+ first = false;
+ continue;
+ }
+
+ if (space_info == data_sinfo) {
+ /* Migrate the block group to the data relocation space_info. */
+ struct btrfs_space_info *reloc_sinfo = data_sinfo->sub_group[0];
+ int factor;
+
+ ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC,
+ "reloc_sinfo->subgroup_id=%d", reloc_sinfo->subgroup_id);
+ factor = btrfs_bg_type_to_factor(bg->flags);
+
+ down_write(&space_info->groups_sem);
+ list_del_init(&bg->list);
+		/* The list cannot become empty: we deliberately took the second empty block group. */
+ ASSERT(!list_empty(&space_info->block_groups[index]));
+ up_write(&space_info->groups_sem);
+
+ spin_lock(&space_info->lock);
+ space_info->total_bytes -= bg->length;
+ space_info->disk_total -= bg->length * factor;
+ space_info->disk_total -= bg->zone_unusable;
+		/* No allocation has ever happened in this block group. */
+ ASSERT(bg->used == 0, "bg->used=%llu", bg->used);
+		/* No super block lives inside a block group on a zoned filesystem. */
+ ASSERT(bg->bytes_super == 0, "bg->bytes_super=%llu", bg->bytes_super);
+ spin_unlock(&space_info->lock);
+
+ bg->space_info = reloc_sinfo;
+ if (reloc_sinfo->block_group_kobjs[index] == NULL)
+ btrfs_sysfs_add_block_group_type(bg);
+
+ btrfs_add_bg_to_space_info(fs_info, bg);
+ }
+
+ fs_info->data_reloc_bg = bg->start;
+ set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags);
+ btrfs_zone_activate(bg);
+
+ return;
+ }
+
+ if (did_chunk_alloc)
+ return;
+
+ trans = btrfs_join_transaction(fs_info->tree_root);
+ if (IS_ERR(trans))
+ return;
+
+ /* Allocate new BG in the data relocation space_info. */
+ space_info = data_sinfo->sub_group[0];
+ ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC,
+ "space_info->subgroup_id=%d", space_info->subgroup_id);
+ ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
+ btrfs_end_transaction(trans);
+ if (ret == 1) {
+ /*
+ * We allocated a new block group in the data relocation space_info. We
+ * can take that one.
+ */
+ first = false;
+ did_chunk_alloc = true;
+ goto again;
+ }
+}
+
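
A note on the retry: by btrfs convention, btrfs_chunk_alloc() returns 1 when a chunk was actually allocated, 0 when no allocation was needed, and a negative errno on failure, so the goto-again rescan only runs once a fresh block group is known to exist in the relocation space_info.
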
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
@@ -2464,8 +2664,8 @@ bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
+ u64 total = btrfs_super_total_bytes(fs_info->super_copy);
u64 used = 0;
- u64 total = 0;
u64 factor;
ASSERT(btrfs_is_zoned(fs_info));
@@ -2478,7 +2678,6 @@ bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
if (!device->bdev)
continue;
- total += device->disk_total_bytes;
used += device->bytes_used;
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -2532,7 +2731,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
spin_lock(&block_group->lock);
if (block_group->reserved || block_group->alloc_offset == 0 ||
- (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
+ !(block_group->flags & BTRFS_BLOCK_GROUP_DATA) ||
test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
continue;
@@ -2559,10 +2758,9 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
return ret < 0 ? ret : 1;
}
-int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- bool do_finish)
+int btrfs_zoned_activate_one_bg(struct btrfs_space_info *space_info, bool do_finish)
{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
struct btrfs_block_group *bg;
int index;
@@ -2771,7 +2969,8 @@ int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num
* This holds because we currently reset fully used then freed
* block group.
*/
- ASSERT(reclaimed == bg->zone_capacity);
+ ASSERT(reclaimed == bg->zone_capacity,
+ "reclaimed=%llu bg->zone_capacity=%llu", reclaimed, bg->zone_capacity);
bg->free_space_ctl->free_space += reclaimed;
space_info->bytes_zone_unusable -= reclaimed;
spin_unlock(&bg->lock);
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 9672bf4c3335..5cefdeb08b7b 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -15,7 +15,6 @@
#include "disk-io.h"
#include "block-group.h"
#include "btrfs_inode.h"
-#include "fs.h"
struct block_device;
struct extent_buffer;
@@ -83,18 +82,18 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
bool btrfs_zone_activate(struct btrfs_block_group *block_group);
int btrfs_zone_finish(struct btrfs_block_group *block_group);
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags);
-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
+int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
u64 length);
void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
struct extent_buffer *eb);
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
+void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info);
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info);
void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
u64 length);
int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
-int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, bool do_finish);
+int btrfs_zoned_activate_one_bg(struct btrfs_space_info *space_info, bool do_finish);
void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info);
int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes);
#else /* CONFIG_BLK_DEV_ZONED */
@@ -233,14 +232,19 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
return true;
}
-static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length) { }
+static inline int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ return 0;
+}
static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
struct extent_buffer *eb) { }
static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
+static inline void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info) { }
+
static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
static inline bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
@@ -256,8 +260,7 @@ static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
return 1;
}
-static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
+static inline int btrfs_zoned_activate_one_bg(struct btrfs_space_info *space_info,
bool do_finish)
{
/* Consider all the block groups are active */
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index cd5f38d6fbaa..c9cddcfa337b 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -24,7 +24,7 @@
#include "super.h"
#define ZSTD_BTRFS_MAX_WINDOWLOG 17
-#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
+#define ZSTD_BTRFS_MAX_INPUT (1U << ZSTD_BTRFS_MAX_WINDOWLOG)
#define ZSTD_BTRFS_DEFAULT_LEVEL 3
#define ZSTD_BTRFS_MIN_LEVEL -15
#define ZSTD_BTRFS_MAX_LEVEL 15
@@ -77,7 +77,6 @@ struct workspace {
*/
struct zstd_workspace_manager {
- const struct btrfs_compress_op *ops;
spinlock_t lock;
struct list_head lru_list;
struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL];
@@ -86,8 +85,6 @@ struct zstd_workspace_manager {
struct timer_list timer;
};
-static struct zstd_workspace_manager wsm;
-
static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL];
static inline struct workspace *list_to_workspace(struct list_head *list)
@@ -112,19 +109,19 @@ static inline int clip_level(int level)
*/
static void zstd_reclaim_timer_fn(struct timer_list *timer)
{
+ struct zstd_workspace_manager *zwsm =
+ container_of(timer, struct zstd_workspace_manager, timer);
unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
struct list_head *pos, *next;
- ASSERT(timer == &wsm.timer);
-
- spin_lock(&wsm.lock);
+ spin_lock(&zwsm->lock);
- if (list_empty(&wsm.lru_list)) {
- spin_unlock(&wsm.lock);
+ if (list_empty(&zwsm->lru_list)) {
+ spin_unlock(&zwsm->lock);
return;
}
- list_for_each_prev_safe(pos, next, &wsm.lru_list) {
+ list_for_each_prev_safe(pos, next, &zwsm->lru_list) {
struct workspace *victim = container_of(pos, struct workspace,
lru_list);
int level;
@@ -141,15 +138,15 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
list_del(&victim->list);
zstd_free_workspace(&victim->list);
- if (list_empty(&wsm.idle_ws[level]))
- clear_bit(level, &wsm.active_map);
+ if (list_empty(&zwsm->idle_ws[level]))
+ clear_bit(level, &zwsm->active_map);
}
- if (!list_empty(&wsm.lru_list))
- mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
+ if (!list_empty(&zwsm->lru_list))
+ mod_timer(&zwsm->timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
- spin_unlock(&wsm.lock);
+ spin_unlock(&zwsm->lock);
}
/*
@@ -182,50 +179,56 @@ static void zstd_calc_ws_mem_sizes(void)
}
}
-void zstd_init_workspace_manager(void)
+int zstd_alloc_workspace_manager(struct btrfs_fs_info *fs_info)
{
+ struct zstd_workspace_manager *zwsm;
struct list_head *ws;
- int i;
+ ASSERT(fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] == NULL);
+ zwsm = kzalloc(sizeof(*zwsm), GFP_KERNEL);
+ if (!zwsm)
+ return -ENOMEM;
zstd_calc_ws_mem_sizes();
+ spin_lock_init(&zwsm->lock);
+ init_waitqueue_head(&zwsm->wait);
+ timer_setup(&zwsm->timer, zstd_reclaim_timer_fn, 0);
- wsm.ops = &btrfs_zstd_compress;
- spin_lock_init(&wsm.lock);
- init_waitqueue_head(&wsm.wait);
- timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0);
-
- INIT_LIST_HEAD(&wsm.lru_list);
- for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++)
- INIT_LIST_HEAD(&wsm.idle_ws[i]);
+ INIT_LIST_HEAD(&zwsm->lru_list);
+ for (int i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++)
+ INIT_LIST_HEAD(&zwsm->idle_ws[i]);
+ fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] = zwsm;
- ws = zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL);
+ ws = zstd_alloc_workspace(fs_info, ZSTD_BTRFS_MAX_LEVEL);
if (IS_ERR(ws)) {
- pr_warn(
- "BTRFS: cannot preallocate zstd compression workspace\n");
+ btrfs_warn(NULL, "cannot preallocate zstd compression workspace");
} else {
- set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map);
- list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]);
+ set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &zwsm->active_map);
+ list_add(ws, &zwsm->idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]);
}
+ return 0;
}
-void zstd_cleanup_workspace_manager(void)
+void zstd_free_workspace_manager(struct btrfs_fs_info *fs_info)
{
+ struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD];
struct workspace *workspace;
- int i;
- spin_lock_bh(&wsm.lock);
- for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
- while (!list_empty(&wsm.idle_ws[i])) {
- workspace = container_of(wsm.idle_ws[i].next,
+ if (!zwsm)
+ return;
+ fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] = NULL;
+ spin_lock_bh(&zwsm->lock);
+ for (int i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
+ while (!list_empty(&zwsm->idle_ws[i])) {
+ workspace = container_of(zwsm->idle_ws[i].next,
struct workspace, list);
list_del(&workspace->list);
list_del(&workspace->lru_list);
zstd_free_workspace(&workspace->list);
}
}
- spin_unlock_bh(&wsm.lock);
-
- del_timer_sync(&wsm.timer);
+ spin_unlock_bh(&zwsm->lock);
+ timer_delete_sync(&zwsm->timer);
+ kfree(zwsm);
}
/*
@@ -240,29 +243,31 @@ void zstd_cleanup_workspace_manager(void)
* offer the opportunity to reclaim the workspace in favor of allocating an
* appropriately sized one in the future.
*/
-static struct list_head *zstd_find_workspace(int level)
+static struct list_head *zstd_find_workspace(struct btrfs_fs_info *fs_info, int level)
{
+ struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD];
struct list_head *ws;
struct workspace *workspace;
int i = clip_level(level);
- spin_lock_bh(&wsm.lock);
- for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) {
- if (!list_empty(&wsm.idle_ws[i])) {
- ws = wsm.idle_ws[i].next;
+ ASSERT(zwsm);
+ spin_lock_bh(&zwsm->lock);
+ for_each_set_bit_from(i, &zwsm->active_map, ZSTD_BTRFS_MAX_LEVEL) {
+ if (!list_empty(&zwsm->idle_ws[i])) {
+ ws = zwsm->idle_ws[i].next;
workspace = list_to_workspace(ws);
list_del_init(ws);
/* keep its place if it's a lower level using this */
workspace->req_level = level;
if (clip_level(level) == workspace->level)
list_del(&workspace->lru_list);
- if (list_empty(&wsm.idle_ws[i]))
- clear_bit(i, &wsm.active_map);
- spin_unlock_bh(&wsm.lock);
+ if (list_empty(&zwsm->idle_ws[i]))
+ clear_bit(i, &zwsm->active_map);
+ spin_unlock_bh(&zwsm->lock);
return ws;
}
}
- spin_unlock_bh(&wsm.lock);
+ spin_unlock_bh(&zwsm->lock);
return NULL;
}
@@ -277,30 +282,33 @@ static struct list_head *zstd_find_workspace(int level)
* attempt to allocate a new workspace. If we fail to allocate one due to
* memory pressure, go to sleep waiting for the max level workspace to free up.
*/
-struct list_head *zstd_get_workspace(int level)
+struct list_head *zstd_get_workspace(struct btrfs_fs_info *fs_info, int level)
{
+ struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD];
struct list_head *ws;
unsigned int nofs_flag;
+ ASSERT(zwsm);
+
/* level == 0 means we can use any workspace */
if (!level)
level = 1;
again:
- ws = zstd_find_workspace(level);
+ ws = zstd_find_workspace(fs_info, level);
if (ws)
return ws;
nofs_flag = memalloc_nofs_save();
- ws = zstd_alloc_workspace(level);
+ ws = zstd_alloc_workspace(fs_info, level);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(ws)) {
DEFINE_WAIT(wait);
- prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(&zwsm->wait, &wait, TASK_UNINTERRUPTIBLE);
schedule();
- finish_wait(&wsm.wait, &wait);
+ finish_wait(&zwsm->wait, &wait);
goto again;
}
@@ -319,34 +327,36 @@ again:
* isn't set, it is also set here. Only the max level workspace tries and wakes
* up waiting workspaces.
*/
-void zstd_put_workspace(struct list_head *ws)
+void zstd_put_workspace(struct btrfs_fs_info *fs_info, struct list_head *ws)
{
+ struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD];
struct workspace *workspace = list_to_workspace(ws);
- spin_lock_bh(&wsm.lock);
+ ASSERT(zwsm);
+ spin_lock_bh(&zwsm->lock);
/* A node is only taken off the lru if we are the corresponding level */
if (clip_level(workspace->req_level) == workspace->level) {
/* Hide a max level workspace from reclaim */
- if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) {
+ if (list_empty(&zwsm->idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) {
INIT_LIST_HEAD(&workspace->lru_list);
} else {
workspace->last_used = jiffies;
- list_add(&workspace->lru_list, &wsm.lru_list);
- if (!timer_pending(&wsm.timer))
- mod_timer(&wsm.timer,
+ list_add(&workspace->lru_list, &zwsm->lru_list);
+ if (!timer_pending(&zwsm->timer))
+ mod_timer(&zwsm->timer,
jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
}
}
- set_bit(workspace->level, &wsm.active_map);
- list_add(&workspace->list, &wsm.idle_ws[workspace->level]);
+ set_bit(workspace->level, &zwsm->active_map);
+ list_add(&workspace->list, &zwsm->idle_ws[workspace->level]);
workspace->req_level = 0;
- spin_unlock_bh(&wsm.lock);
+ spin_unlock_bh(&zwsm->lock);
if (workspace->level == clip_level(ZSTD_BTRFS_MAX_LEVEL))
- cond_wake_up(&wsm.wait);
+ cond_wake_up(&zwsm->wait);
}
void zstd_free_workspace(struct list_head *ws)
@@ -358,8 +368,9 @@ void zstd_free_workspace(struct list_head *ws)
kfree(workspace);
}
-struct list_head *zstd_alloc_workspace(int level)
+struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level)
{
+ const u32 blocksize = fs_info->sectorsize;
struct workspace *workspace;
workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
@@ -372,7 +383,7 @@ struct list_head *zstd_alloc_workspace(int level)
workspace->req_level = level;
workspace->last_used = jiffies;
workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN);
- workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ workspace->buf = kmalloc(blocksize, GFP_KERNEL);
if (!workspace->mem || !workspace->buf)
goto fail;
@@ -385,11 +396,13 @@ fail:
return ERR_PTR(-ENOMEM);
}
-int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
+int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
zstd_cstream *stream;
int ret = 0;
int nr_folios = 0;
@@ -400,7 +413,9 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
unsigned long len = *total_out;
const unsigned long nr_dest_folios = *out_folios;
const u64 orig_end = start + len;
- unsigned long max_out = nr_dest_folios * PAGE_SIZE;
+ const u32 blocksize = fs_info->sectorsize;
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
+ unsigned long max_out = nr_dest_folios * min_folio_size;
unsigned int cur_len;
workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len);
@@ -412,9 +427,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
stream = zstd_init_cstream(&workspace->params, len, workspace->mem,
workspace->size);
if (unlikely(!stream)) {
- struct btrfs_inode *inode = BTRFS_I(mapping->host);
-
- btrfs_err(inode->root->fs_info,
+ btrfs_err(fs_info,
"zstd compression init level %d failed, root %llu inode %llu offset %llu",
workspace->req_level, btrfs_root_id(inode->root),
btrfs_ino(inode), start);
@@ -426,13 +439,13 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
if (ret < 0)
goto out;
- cur_len = btrfs_calc_input_length(orig_end, start);
- workspace->in_buf.src = kmap_local_folio(in_folio, offset_in_page(start));
+ cur_len = btrfs_calc_input_length(in_folio, orig_end, start);
+ workspace->in_buf.src = kmap_local_folio(in_folio, offset_in_folio(in_folio, start));
workspace->in_buf.pos = 0;
workspace->in_buf.size = cur_len;
/* Allocate and map in the output buffer */
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -440,7 +453,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
folios[nr_folios++] = out_folio;
workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
- workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+ workspace->out_buf.size = min_t(size_t, max_out, min_folio_size);
while (1) {
size_t ret2;
@@ -448,9 +461,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
ret2 = zstd_compress_stream(stream, &workspace->out_buf,
&workspace->in_buf);
if (unlikely(zstd_is_error(ret2))) {
- struct btrfs_inode *inode = BTRFS_I(mapping->host);
-
- btrfs_warn(inode->root->fs_info,
+ btrfs_warn(fs_info,
"zstd compression level %d failed, error %d root %llu inode %llu offset %llu",
workspace->req_level, zstd_get_error_code(ret2),
btrfs_root_id(inode->root), btrfs_ino(inode),
@@ -460,7 +471,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
}
/* Check to see if we are making it bigger */
- if (tot_in + workspace->in_buf.pos > 8192 &&
+ if (tot_in + workspace->in_buf.pos > blocksize * 2 &&
tot_in + workspace->in_buf.pos <
tot_out + workspace->out_buf.pos) {
ret = -E2BIG;
@@ -476,13 +487,13 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
/* Check if we need more output space */
if (workspace->out_buf.pos == workspace->out_buf.size) {
- tot_out += PAGE_SIZE;
- max_out -= PAGE_SIZE;
+ tot_out += min_folio_size;
+ max_out -= min_folio_size;
if (nr_folios == nr_dest_folios) {
ret = -E2BIG;
goto out;
}
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -490,8 +501,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
folios[nr_folios++] = out_folio;
workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
- workspace->out_buf.size = min_t(size_t, max_out,
- PAGE_SIZE);
+ workspace->out_buf.size = min_t(size_t, max_out, min_folio_size);
}
/* We've reached the end of the input */
@@ -511,9 +521,9 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
if (ret < 0)
goto out;
- cur_len = btrfs_calc_input_length(orig_end, start);
+ cur_len = btrfs_calc_input_length(in_folio, orig_end, start);
workspace->in_buf.src = kmap_local_folio(in_folio,
- offset_in_page(start));
+ offset_in_folio(in_folio, start));
workspace->in_buf.pos = 0;
workspace->in_buf.size = cur_len;
}
@@ -523,9 +533,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
ret2 = zstd_end_stream(stream, &workspace->out_buf);
if (unlikely(zstd_is_error(ret2))) {
- struct btrfs_inode *inode = BTRFS_I(mapping->host);
-
- btrfs_err(inode->root->fs_info,
+ btrfs_err(fs_info,
"zstd compression end level %d failed, error %d root %llu inode %llu offset %llu",
workspace->req_level, zstd_get_error_code(ret2),
btrfs_root_id(inode->root), btrfs_ino(inode),
@@ -543,13 +551,13 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
goto out;
}
- tot_out += PAGE_SIZE;
- max_out -= PAGE_SIZE;
+ tot_out += min_folio_size;
+ max_out -= min_folio_size;
if (nr_folios == nr_dest_folios) {
ret = -E2BIG;
goto out;
}
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -557,7 +565,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
folios[nr_folios++] = out_folio;
workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
- workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+ workspace->out_buf.size = min_t(size_t, max_out, min_folio_size);
}
if (tot_out >= tot_in) {
@@ -579,13 +587,16 @@ out:
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
struct workspace *workspace = list_entry(ws, struct workspace, list);
struct folio **folios_in = cb->compressed_folios;
size_t srclen = cb->compressed_len;
zstd_dstream *stream;
int ret = 0;
+ const u32 blocksize = fs_info->sectorsize;
+ const unsigned int min_folio_size = btrfs_min_folio_size(fs_info);
unsigned long folio_in_index = 0;
- unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
+ unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size);
unsigned long buf_start;
unsigned long total_out = 0;
@@ -603,11 +614,11 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0);
workspace->in_buf.pos = 0;
- workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
+ workspace->in_buf.size = min_t(size_t, srclen, min_folio_size);
workspace->out_buf.dst = workspace->buf;
workspace->out_buf.pos = 0;
- workspace->out_buf.size = PAGE_SIZE;
+ workspace->out_buf.size = blocksize;
while (1) {
size_t ret2;
@@ -643,16 +654,16 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (workspace->in_buf.pos == workspace->in_buf.size) {
kunmap_local(workspace->in_buf.src);
folio_in_index++;
- if (folio_in_index >= total_folios_in) {
+ if (unlikely(folio_in_index >= total_folios_in)) {
workspace->in_buf.src = NULL;
ret = -EIO;
goto done;
}
- srclen -= PAGE_SIZE;
+ srclen -= min_folio_size;
workspace->in_buf.src =
kmap_local_folio(folios_in[folio_in_index], 0);
workspace->in_buf.pos = 0;
- workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
+ workspace->in_buf.size = min_t(size_t, srclen, min_folio_size);
}
}
ret = 0;
@@ -719,9 +730,7 @@ finish:
return ret;
}
-const struct btrfs_compress_op btrfs_zstd_compress = {
- /* ZSTD uses own workspace manager */
- .workspace_manager = NULL,
+const struct btrfs_compress_levels btrfs_zstd_compress = {
.min_level = ZSTD_BTRFS_MIN_LEVEL,
.max_level = ZSTD_BTRFS_MAX_LEVEL,
.default_level = ZSTD_BTRFS_DEFAULT_LEVEL,
diff --git a/fs/buffer.c b/fs/buffer.c
index 194eacbefc95..838c0c571022 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -157,8 +157,8 @@ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
*/
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
- __end_buffer_read_notouch(bh, uptodate);
put_bh(bh);
+ __end_buffer_read_notouch(bh, uptodate);
}
EXPORT_SYMBOL(end_buffer_read_sync);
@@ -176,18 +176,8 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
}
EXPORT_SYMBOL(end_buffer_write_sync);
-/*
- * Various filesystems appear to want __find_get_block to be non-blocking.
- * But it's the page lock which protects the buffers. To get around this,
- * we get exclusion from try_to_free_buffers with the blockdev mapping's
- * i_private_lock.
- *
- * Hack idea: for the blockdev mapping, i_private_lock contention
- * may be quite high. This code could TryLock the page, and if that
- * succeeds, there is no need to take i_private_lock.
- */
static struct buffer_head *
-__find_get_block_slow(struct block_device *bdev, sector_t block)
+__find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic)
{
struct address_space *bd_mapping = bdev->bd_mapping;
const int blkbits = bd_mapping->host->i_blkbits;
@@ -204,10 +194,28 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
if (IS_ERR(folio))
goto out;
- spin_lock(&bd_mapping->i_private_lock);
+ /*
+	 * The folio lock protects the buffers. Callers that cannot block
+	 * fall back to serializing against try_to_free_buffers() via
+	 * the i_private_lock.
+ */
+ if (atomic)
+ spin_lock(&bd_mapping->i_private_lock);
+ else
+ folio_lock(folio);
+
head = folio_buffers(folio);
if (!head)
goto out_unlock;
+ /*
+	 * Upon a noref migration the folio lock serializes here, so only
+	 * atomic callers can legitimately observe BH_Migrate; they bail out.
+ */
+ if (test_bit_acquire(BH_Migrate, &head->b_state)) {
+ WARN_ON(!atomic);
+ goto out_unlock;
+ }
+
bh = head;
do {
if (!buffer_mapped(bh))
@@ -236,7 +244,10 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
1 << blkbits);
}
out_unlock:
- spin_unlock(&bd_mapping->i_private_lock);
+ if (atomic)
+ spin_unlock(&bd_mapping->i_private_lock);
+ else
+ folio_unlock(folio);
folio_put(folio);
out:
return ret;
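
In isolation, the locking split this hunk introduces looks like the sketch below; it assumes the rule established above, that atomic callers use the mapping's i_private_lock while sleepable callers take the folio lock (which is also what serializes against noref migration):

static struct buffer_head *buffers_of_folio(struct folio *folio,
					    spinlock_t *i_private_lock,
					    bool atomic)
{
	struct buffer_head *bh;

	if (atomic)
		spin_lock(i_private_lock);
	else
		folio_lock(folio);

	bh = folio_buffers(folio);	/* NULL if none attached */
	/* the real lookup grabs a reference on bh before unlocking */

	if (atomic)
		spin_unlock(i_private_lock);
	else
		folio_unlock(folio);
	return bh;
}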
@@ -286,7 +297,6 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
still_busy:
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
- return;
}
struct postprocess_bh_ctx {
@@ -411,7 +421,6 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
still_busy:
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
- return;
}
/*
@@ -602,9 +611,9 @@ int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
return err;
ret = sync_mapping_buffers(inode->i_mapping);
- if (!(inode->i_state & I_DIRTY_ALL))
+ if (!(inode_state_read_once(inode) & I_DIRTY_ALL))
goto out;
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+ if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC))
goto out;
err = sync_inode_metadata(inode, 1);
@@ -656,7 +665,9 @@ EXPORT_SYMBOL(generic_buffers_fsync);
void write_boundary_block(struct block_device *bdev,
sector_t bblock, unsigned blocksize)
{
- struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
+ struct buffer_head *bh;
+
+ bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize);
if (bh) {
if (buffer_dirty(bh))
write_dirty_buffer(bh, 0);
@@ -1109,27 +1120,26 @@ static struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
{
- /* Size must be multiple of hard sectorsize */
- if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
- (size < 512 || size > PAGE_SIZE))) {
- printk(KERN_ERR "getblk(): invalid block size %d requested\n",
- size);
- printk(KERN_ERR "logical block size: %d\n",
- bdev_logical_block_size(bdev));
+ bool blocking = gfpflags_allow_blocking(gfp);
- dump_stack();
+ if (WARN_ON_ONCE(!IS_ALIGNED(size, bdev_logical_block_size(bdev)))) {
+ printk(KERN_ERR "getblk(): block size %d not aligned to logical block size %d\n",
+ size, bdev_logical_block_size(bdev));
return NULL;
}
for (;;) {
struct buffer_head *bh;
- bh = __find_get_block(bdev, block, size);
- if (bh)
- return bh;
-
if (!grow_buffers(bdev, block, size, gfp))
return NULL;
+
+ if (blocking)
+ bh = __find_get_block_nonatomic(bdev, block, size);
+ else
+ bh = __find_get_block(bdev, block, size);
+ if (bh)
+ return bh;
}
}
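
The caller-side rule this series establishes, sketched as a helper: pick the lookup variant from the allocation context, since only blocking callers may take the folio lock.

static struct buffer_head *find_block_for_ctx(struct block_device *bdev,
					      sector_t block, unsigned size,
					      gfp_t gfp)
{
	if (gfpflags_allow_blocking(gfp))
		return __find_get_block_nonatomic(bdev, block, size);
	return __find_get_block(bdev, block, size);	/* atomic-safe */
}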
@@ -1207,10 +1217,8 @@ void mark_buffer_write_io_error(struct buffer_head *bh)
/* FIXME: do we need to set this in both places? */
if (bh->b_folio && bh->b_folio->mapping)
mapping_set_error(bh->b_folio->mapping, -EIO);
- if (bh->b_assoc_map) {
+ if (bh->b_assoc_map)
mapping_set_error(bh->b_assoc_map, -EIO);
- errseq_set(&bh->b_assoc_map->host->i_sb->s_wb_err, -EIO);
- }
}
EXPORT_SYMBOL(mark_buffer_write_io_error);
@@ -1386,16 +1394,18 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
/*
* Perform a pagecache lookup for the matching buffer. If it's there, refresh
* it in the LRU and mark it as accessed. If it is not present then return
- * NULL
+ * NULL. Callers in atomic context may also get NULL if the buffer is being
+ * migrated; in that case the folio is not marked accessed either.
*/
-struct buffer_head *
-__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
+static struct buffer_head *
+find_get_block_common(struct block_device *bdev, sector_t block,
+ unsigned size, bool atomic)
{
struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
if (bh == NULL) {
/* __find_get_block_slow will mark the page accessed */
- bh = __find_get_block_slow(bdev, block);
+ bh = __find_get_block_slow(bdev, block, atomic);
if (bh)
bh_lru_install(bh);
} else
@@ -1403,8 +1413,23 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
return bh;
}
+
+struct buffer_head *
+__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
+{
+ return find_get_block_common(bdev, block, size, true);
+}
EXPORT_SYMBOL(__find_get_block);
+/* Same as __find_get_block(), but for callers that are allowed to sleep. */
+struct buffer_head *
+__find_get_block_nonatomic(struct block_device *bdev, sector_t block,
+ unsigned size)
+{
+ return find_get_block_common(bdev, block, size, false);
+}
+EXPORT_SYMBOL(__find_get_block_nonatomic);
+
/**
* bdev_getblk - Get a buffer_head in a block device's buffer cache.
* @bdev: The block device.
@@ -1422,7 +1447,12 @@ EXPORT_SYMBOL(__find_get_block);
struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
{
- struct buffer_head *bh = __find_get_block(bdev, block, size);
+ struct buffer_head *bh;
+
+ if (gfpflags_allow_blocking(gfp))
+ bh = __find_get_block_nonatomic(bdev, block, size);
+ else
+ bh = __find_get_block(bdev, block, size);
might_alloc(gfp);
if (bh)
@@ -1578,8 +1608,8 @@ static void discard_buffer(struct buffer_head * bh)
bh->b_bdev = NULL;
b_state = READ_ONCE(bh->b_state);
do {
- } while (!try_cmpxchg(&bh->b_state, &b_state,
- b_state & ~BUFFER_FLAGS_DISCARD));
+ } while (!try_cmpxchg_relaxed(&bh->b_state, &b_state,
+ b_state & ~BUFFER_FLAGS_DISCARD));
unlock_buffer(bh);
}
@@ -1644,7 +1674,6 @@ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
filemap_release_folio(folio, 0);
out:
folio_clear_mappedtodisk(folio);
- return;
}
EXPORT_SYMBOL(block_invalidate_folio);
@@ -2166,7 +2195,7 @@ int __block_write_begin(struct folio *folio, loff_t pos, unsigned len,
}
EXPORT_SYMBOL(__block_write_begin);
-static void __block_commit_write(struct folio *folio, size_t from, size_t to)
+void block_commit_write(struct folio *folio, size_t from, size_t to)
{
size_t block_start, block_end;
bool partial = false;
@@ -2204,6 +2233,7 @@ static void __block_commit_write(struct folio *folio, size_t from, size_t to)
if (!partial)
folio_mark_uptodate(folio);
}
+EXPORT_SYMBOL(block_commit_write);
/*
* block_write_begin takes care of the basic task of block allocation and
@@ -2235,9 +2265,8 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
}
EXPORT_SYMBOL(block_write_begin);
-int block_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+int block_write_end(loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio)
{
size_t start = pos - folio_pos(folio);
@@ -2262,21 +2291,21 @@ int block_write_end(struct file *file, struct address_space *mapping,
flush_dcache_folio(folio);
/* This could be a short (even 0-length) commit */
- __block_commit_write(folio, start, start + copied);
+ block_commit_write(folio, start, start + copied);
return copied;
}
EXPORT_SYMBOL(block_write_end);
-int generic_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+int generic_write_end(const struct kiocb *iocb, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
loff_t old_size = inode->i_size;
bool i_size_changed = false;
- copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ copied = block_write_end(pos, len, copied, folio);
/*
* No need to use i_size_read() here, the i_size cannot change under us
@@ -2465,7 +2494,8 @@ out:
}
EXPORT_SYMBOL(generic_cont_expand_simple);
-static int cont_expand_zero(struct file *file, struct address_space *mapping,
+static int cont_expand_zero(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, loff_t *bytes)
{
struct inode *inode = mapping->host;
@@ -2489,12 +2519,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
}
len = PAGE_SIZE - zerofrom;
- err = aops->write_begin(file, mapping, curpos, len,
+ err = aops->write_begin(iocb, mapping, curpos, len,
&folio, &fsdata);
if (err)
goto out;
folio_zero_range(folio, offset_in_folio(folio, curpos), len);
- err = aops->write_end(file, mapping, curpos, len, len,
+ err = aops->write_end(iocb, mapping, curpos, len, len,
folio, fsdata);
if (err < 0)
goto out;
@@ -2522,12 +2552,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
}
len = offset - zerofrom;
- err = aops->write_begin(file, mapping, curpos, len,
+ err = aops->write_begin(iocb, mapping, curpos, len,
&folio, &fsdata);
if (err)
goto out;
folio_zero_range(folio, offset_in_folio(folio, curpos), len);
- err = aops->write_end(file, mapping, curpos, len, len,
+ err = aops->write_end(iocb, mapping, curpos, len, len,
folio, fsdata);
if (err < 0)
goto out;
@@ -2542,17 +2572,16 @@ out:
* For moronic filesystems that do not allow holes in file.
* We may have to extend the file.
*/
-int cont_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata,
- get_block_t *get_block, loff_t *bytes)
+int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping,
+ loff_t pos, unsigned len, struct folio **foliop,
+ void **fsdata, get_block_t *get_block, loff_t *bytes)
{
struct inode *inode = mapping->host;
unsigned int blocksize = i_blocksize(inode);
unsigned int zerofrom;
int err;
- err = cont_expand_zero(file, mapping, pos, bytes);
+ err = cont_expand_zero(iocb, mapping, pos, bytes);
if (err)
return err;
@@ -2566,13 +2595,6 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
}
EXPORT_SYMBOL(cont_write_begin);
-void block_commit_write(struct page *page, unsigned from, unsigned to)
-{
- struct folio *folio = page_folio(page);
- __block_commit_write(folio, from, to);
-}
-EXPORT_SYMBOL(block_commit_write);
-
/*
* block_page_mkwrite() is not allowed to change the file size as it gets
* called from a page fault handler when a page is first dirtied. Hence we must
@@ -2581,7 +2603,7 @@ EXPORT_SYMBOL(block_commit_write);
* holes and correct delalloc and unwritten extent mapping on filesystems that
* support these features.
*
- * We are not allowed to take the i_mutex here so we have to play games to
+ * We are not allowed to take the i_rwsem here so we have to play games to
* protect against truncate races as the page could now be beyond EOF. Because
* truncate writes the inode size before removing pages, once we have the
* page lock we can determine safely if the page is beyond EOF. If it is not
@@ -2618,7 +2640,7 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
if (unlikely(ret))
goto out_unlock;
- __block_commit_write(folio, 0, end);
+ block_commit_write(folio, 0, end);
folio_mark_dirty(folio);
folio_wait_stable(folio);
@@ -2701,7 +2723,7 @@ unlock:
EXPORT_SYMBOL(block_truncate_page);
/*
- * The generic ->writepage function for buffer-backed address_spaces
+ * The generic write folio function for buffer-backed address_spaces
*/
int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
void *get_block)
@@ -2710,7 +2732,7 @@ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
loff_t i_size = i_size_read(inode);
/* Is the folio fully inside i_size? */
- if (folio_pos(folio) + folio_size(folio) <= i_size)
+ if (folio_next_pos(folio) <= i_size)
return __block_write_full_folio(inode, folio, get_block, wbc);
/* Is the folio fully outside i_size? (truncate in progress) */
@@ -2721,7 +2743,7 @@ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
/*
* The folio straddles i_size. It must be zeroed out on each and every
- * writepage invocation because it may be mmapped. "A file is mapped
+ * writeback invocation because it may be mmapped. "A file is mapped
* in multiples of the page size. For a file that is not a multiple of
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
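
With the trimmed block_write_end() above, a filesystem ->write_end built on it reduces to the sketch below; i_size handling, which generic_write_end() still performs, is deliberately omitted, and example_write_end is hypothetical:

static int example_write_end(const struct kiocb *iocb,
			     struct address_space *mapping,
			     loff_t pos, unsigned len, unsigned copied,
			     struct folio *folio, void *fsdata)
{
	copied = block_write_end(pos, len, copied, folio);
	folio_unlock(folio);
	folio_put(folio);
	return copied;
}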
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 3e63cfe15874..a08250d244ea 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -9,6 +9,7 @@
#include <linux/mount.h>
#include <linux/xattr.h>
#include <linux/file.h>
+#include <linux/namei.h>
#include <linux/falloc.h>
#include <trace/events/fscache.h>
#include "internal.h"
@@ -428,11 +429,13 @@ static bool cachefiles_invalidate_cookie(struct fscache_cookie *cookie)
if (!old_tmpfile) {
struct cachefiles_volume *volume = object->volume;
struct dentry *fan = volume->fanout[(u8)cookie->key_hash];
+ struct dentry *obj;
- inode_lock_nested(d_inode(fan), I_MUTEX_PARENT);
- cachefiles_bury_object(volume->cache, object, fan,
- old_file->f_path.dentry,
- FSCACHE_OBJECT_INVALIDATED);
+ obj = start_removing_dentry(fan, old_file->f_path.dentry);
+ if (!IS_ERR(obj))
+ cachefiles_bury_object(volume->cache, object,
+ fan, obj,
+ FSCACHE_OBJECT_INVALIDATED);
}
fput(old_file);
}
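
The conversion pattern used throughout cachefiles here, sketched on its own: start_removing_dentry() (a helper from this series) replaces the open-coded inode_lock_nested()/dget() dance, returning the victim pinned with the parent locked, and end_removing() undoes both.

static int remove_child_sketch(struct dentry *parent, struct dentry *victim)
{
	struct dentry *de = start_removing_dentry(parent, victim);

	if (IS_ERR(de))
		return PTR_ERR(de);
	/* ... unlink or bury 'de' here; the parent inode stays locked ... */
	end_removing(de);	/* unlocks the parent, drops the pin */
	return 0;
}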
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 38c236e38cef..b62cd3e9a18e 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -71,7 +71,6 @@ struct cachefiles_object {
int debug_id;
spinlock_t lock;
refcount_t ref;
- u8 d_name_len; /* Length of filename */
enum cachefiles_content content_info:8; /* Info about content presence */
unsigned long flags;
#define CACHEFILES_OBJECT_USING_TMPFILE 0 /* Have an unlinked tmpfile */
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 92058ae43488..3e0576d9db1d 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -63,7 +63,7 @@ static void cachefiles_read_complete(struct kiocb *iocb, long ret)
ret = -ESTALE;
}
- ki->term_func(ki->term_func_priv, ret, ki->was_async);
+ ki->term_func(ki->term_func_priv, ret);
}
cachefiles_put_kiocb(ki);
@@ -188,7 +188,7 @@ in_progress:
presubmission_error:
if (term_func)
- term_func(term_func_priv, ret < 0 ? ret : skipped, false);
+ term_func(term_func_priv, ret < 0 ? ret : skipped);
return ret;
}
@@ -271,7 +271,7 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret)
atomic_long_sub(ki->b_writing, &object->volume->cache->b_writing);
set_bit(FSCACHE_COOKIE_HAVE_DATA, &object->cookie->flags);
if (ki->term_func)
- ki->term_func(ki->term_func_priv, ret, ki->was_async);
+ ki->term_func(ki->term_func_priv, ret);
cachefiles_put_kiocb(ki);
}
@@ -301,7 +301,7 @@ int __cachefiles_write(struct cachefiles_object *object,
ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL);
if (!ki) {
if (term_func)
- term_func(term_func_priv, -ENOMEM, false);
+ term_func(term_func_priv, -ENOMEM);
return -ENOMEM;
}
@@ -347,8 +347,6 @@ int __cachefiles_write(struct cachefiles_object *object,
default:
ki->was_async = false;
cachefiles_write_complete(&ki->iocb, ret);
- if (ret > 0)
- ret = 0;
break;
}
@@ -366,7 +364,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
{
if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) {
if (term_func)
- term_func(term_func_priv, -ENOBUFS, false);
+ term_func(term_func_priv, -ENOBUFS);
trace_netfs_sreq(term_func_priv, netfs_sreq_trace_cache_nowrite);
return -ENOBUFS;
}
@@ -665,7 +663,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq)
pre = CACHEFILES_DIO_BLOCK_SIZE - off;
if (pre >= len) {
fscache_count_dio_misfit();
- netfs_write_subrequest_terminated(subreq, len, false);
+ netfs_write_subrequest_terminated(subreq, len);
return;
}
subreq->transferred += pre;
@@ -691,7 +689,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq)
len -= post;
if (len == 0) {
fscache_count_dio_misfit();
- netfs_write_subrequest_terminated(subreq, post, false);
+ netfs_write_subrequest_terminated(subreq, post);
return;
}
iov_iter_truncate(&subreq->io_iter, len);
@@ -703,7 +701,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq)
&start, &len, len, true);
cachefiles_end_secure(cache, saved_cred);
if (ret < 0) {
- netfs_write_subrequest_terminated(subreq, ret, false);
+ netfs_write_subrequest_terminated(subreq, ret);
return;
}
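
The shape of the callback change in fs/cachefiles/io.c, as a sketch: the was_async flag is dropped, so a termination handler now receives only the byte count or a negative errno (example_term is hypothetical):

typedef void (*term_func_t)(void *priv, ssize_t transferred_or_error);

static void example_term(void *priv, ssize_t ret)
{
	if (ret < 0)
		pr_debug("cache op failed: %zd\n", ret);
}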
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
index bf935e25bdbe..aae86af48ed5 100644
--- a/fs/cachefiles/key.c
+++ b/fs/cachefiles/key.c
@@ -8,7 +8,7 @@
#include <linux/slab.h>
#include "internal.h"
-static const char cachefiles_charmap[64] =
+static const char cachefiles_charmap[64] __nonstring =
"0123456789" /* 0 - 9 */
"abcdefghijklmnopqrstuvwxyz" /* 10 - 35 */
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" /* 36 - 61 */
@@ -132,7 +132,6 @@ bool cachefiles_cook_key(struct cachefiles_object *object)
success:
name[len] = 0;
object->d_name = name;
- object->d_name_len = len;
_leave(" = %s", object->d_name);
return true;
}
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 83a60126de0f..e5ec90dccc27 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -93,12 +93,11 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
_enter(",,%s", dirname);
/* search the current directory for the element name */
- inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
retry:
ret = cachefiles_inject_read_error();
if (ret == 0)
- subdir = lookup_one_len(dirname, dir, strlen(dirname));
+ subdir = start_creating(&nop_mnt_idmap, dir, &QSTR(dirname));
else
subdir = ERR_PTR(ret);
trace_cachefiles_lookup(NULL, dir, subdir);
@@ -128,10 +127,13 @@ retry:
ret = security_path_mkdir(&path, subdir, 0700);
if (ret < 0)
goto mkdir_error;
- subdir = ERR_PTR(cachefiles_inject_write_error());
- if (!IS_ERR(subdir))
- subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700);
- ret = PTR_ERR(subdir);
+ ret = cachefiles_inject_write_error();
+ if (ret == 0) {
+ subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700, NULL);
+ } else {
+ end_creating(subdir);
+ subdir = ERR_PTR(ret);
+ }
if (IS_ERR(subdir)) {
trace_cachefiles_vfs_error(NULL, d_inode(dir), ret,
cachefiles_trace_mkdir_error);
@@ -140,7 +142,7 @@ retry:
trace_cachefiles_mkdir(dir, subdir);
if (unlikely(d_unhashed(subdir) || d_is_negative(subdir))) {
- dput(subdir);
+ end_creating(subdir);
goto retry;
}
ASSERT(d_backing_inode(subdir));
@@ -153,7 +155,7 @@ retry:
/* Tell rmdir() it's not allowed to delete the subdir */
inode_lock(d_inode(subdir));
- inode_unlock(d_inode(dir));
+ end_creating_keep(subdir);
if (!__cachefiles_mark_inode_in_use(NULL, d_inode(subdir))) {
pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n",
@@ -195,14 +197,11 @@ mark_error:
return ERR_PTR(-EBUSY);
mkdir_error:
- inode_unlock(d_inode(dir));
- if (!IS_ERR(subdir))
- dput(subdir);
+ end_creating(subdir);
pr_err("mkdir %s failed with error %d\n", dirname, ret);
return ERR_PTR(ret);
lookup_error:
- inode_unlock(d_inode(dir));
ret = PTR_ERR(subdir);
pr_err("Lookup %s failed with error %d\n", dirname, ret);
return ERR_PTR(ret);
@@ -262,6 +261,8 @@ static int cachefiles_unlink(struct cachefiles_cache *cache,
* - File backed objects are unlinked
* - Directory backed objects are stuffed into the graveyard for userspace to
* delete
+ * On entry, dir must be locked; it will be unlocked on exit.
+ * On entry, rep must hold at least two references; one is dropped on exit.
*/
int cachefiles_bury_object(struct cachefiles_cache *cache,
struct cachefiles_object *object,
@@ -277,27 +278,23 @@ int cachefiles_bury_object(struct cachefiles_cache *cache,
_enter(",'%pd','%pd'", dir, rep);
if (rep->d_parent != dir) {
- inode_unlock(d_inode(dir));
+ end_removing(rep);
_leave(" = -ESTALE");
return -ESTALE;
}
/* non-directories can just be unlinked */
if (!d_is_dir(rep)) {
- dget(rep); /* Stop the dentry being negated if it's only pinned
- * by a file struct.
- */
ret = cachefiles_unlink(cache, object, dir, rep, why);
- dput(rep);
+ end_removing(rep);
- inode_unlock(d_inode(dir));
_leave(" = %d", ret);
return ret;
}
/* directories have to be moved to the graveyard */
_debug("move stale object to graveyard");
- inode_unlock(d_inode(dir));
+ end_removing(rep);
try_again:
/* first step is to make up a grave dentry in the graveyard */
@@ -337,7 +334,7 @@ try_again:
return -EIO;
}
- grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer));
+ grave = lookup_one(&nop_mnt_idmap, &QSTR(nbuffer), cache->graveyard);
if (IS_ERR(grave)) {
unlock_rename(cache->graveyard, dir);
trace_cachefiles_vfs_error(object, d_inode(cache->graveyard),
@@ -386,11 +383,10 @@ try_again:
cachefiles_io_error(cache, "Rename security error %d", ret);
} else {
struct renamedata rd = {
- .old_mnt_idmap = &nop_mnt_idmap,
- .old_dir = d_inode(dir),
+ .mnt_idmap = &nop_mnt_idmap,
+ .old_parent = dir,
.old_dentry = rep,
- .new_mnt_idmap = &nop_mnt_idmap,
- .new_dir = d_inode(cache->graveyard),
+ .new_parent = cache->graveyard,
.new_dentry = grave,
};
trace_cachefiles_rename(object, d_inode(rep)->i_ino, why);
@@ -425,13 +421,12 @@ int cachefiles_delete_object(struct cachefiles_object *object,
_enter(",OBJ%x{%pD}", object->debug_id, object->file);
- /* Stop the dentry being negated if it's only pinned by a file struct. */
- dget(dentry);
-
- inode_lock_nested(d_backing_inode(fan), I_MUTEX_PARENT);
- ret = cachefiles_unlink(volume->cache, object, fan, dentry, why);
- inode_unlock(d_backing_inode(fan));
- dput(dentry);
+ dentry = start_removing_dentry(fan, dentry);
+ if (IS_ERR(dentry))
+ ret = PTR_ERR(dentry);
+ else
+ ret = cachefiles_unlink(volume->cache, object, fan, dentry, why);
+ end_removing(dentry);
return ret;
}
@@ -629,8 +624,8 @@ bool cachefiles_look_up_object(struct cachefiles_object *object)
/* Look up path "cache/vol/fanout/file". */
ret = cachefiles_inject_read_error();
if (ret == 0)
- dentry = lookup_positive_unlocked(object->d_name, fan,
- object->d_name_len);
+ dentry = lookup_one_positive_unlocked(&nop_mnt_idmap,
+ &QSTR(object->d_name), fan);
else
dentry = ERR_PTR(ret);
trace_cachefiles_lookup(object, fan, dentry);
@@ -644,9 +639,13 @@ bool cachefiles_look_up_object(struct cachefiles_object *object)
if (!d_is_reg(dentry)) {
pr_err("%pd is not a file\n", dentry);
- inode_lock_nested(d_inode(fan), I_MUTEX_PARENT);
- ret = cachefiles_bury_object(volume->cache, object, fan, dentry,
- FSCACHE_OBJECT_IS_WEIRD);
+ struct dentry *de = start_removing_dentry(fan, dentry);
+ if (IS_ERR(de))
+ ret = PTR_ERR(de);
+ else
+ ret = cachefiles_bury_object(volume->cache, object,
+ fan, de,
+ FSCACHE_OBJECT_IS_WEIRD);
dput(dentry);
if (ret < 0)
return false;
@@ -679,36 +678,41 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache,
_enter(",%pD", object->file);
- inode_lock_nested(d_inode(fan), I_MUTEX_PARENT);
ret = cachefiles_inject_read_error();
if (ret == 0)
- dentry = lookup_one_len(object->d_name, fan, object->d_name_len);
+ dentry = start_creating(&nop_mnt_idmap, fan, &QSTR(object->d_name));
else
dentry = ERR_PTR(ret);
if (IS_ERR(dentry)) {
trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry),
cachefiles_trace_lookup_error);
_debug("lookup fail %ld", PTR_ERR(dentry));
- goto out_unlock;
+ goto out;
}
- if (!d_is_negative(dentry)) {
+ /*
+ * This loop will only execute more than once if some other thread
+ * races to create the object we are trying to create.
+ */
+ while (!d_is_negative(dentry)) {
ret = cachefiles_unlink(volume->cache, object, fan, dentry,
FSCACHE_OBJECT_IS_STALE);
if (ret < 0)
- goto out_dput;
+ goto out_end;
+
+ end_creating(dentry);
- dput(dentry);
ret = cachefiles_inject_read_error();
if (ret == 0)
- dentry = lookup_one_len(object->d_name, fan, object->d_name_len);
+ dentry = start_creating(&nop_mnt_idmap, fan,
+ &QSTR(object->d_name));
else
dentry = ERR_PTR(ret);
if (IS_ERR(dentry)) {
trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry),
cachefiles_trace_lookup_error);
_debug("lookup fail %ld", PTR_ERR(dentry));
- goto out_unlock;
+ goto out;
}
}
@@ -729,10 +733,9 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache,
success = true;
}
-out_dput:
- dput(dentry);
-out_unlock:
- inode_unlock(d_inode(fan));
+out_end:
+ end_creating(dentry);
+out:
_leave(" = %u", success);
return success;
}
@@ -748,26 +751,20 @@ static struct dentry *cachefiles_lookup_for_cull(struct cachefiles_cache *cache,
struct dentry *victim;
int ret = -ENOENT;
- inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
+ victim = start_removing(&nop_mnt_idmap, dir, &QSTR(filename));
- victim = lookup_one_len(filename, dir, strlen(filename));
if (IS_ERR(victim))
goto lookup_error;
- if (d_is_negative(victim))
- goto lookup_put;
if (d_inode(victim)->i_flags & S_KERNEL_FILE)
goto lookup_busy;
return victim;
lookup_busy:
ret = -EBUSY;
-lookup_put:
- inode_unlock(d_inode(dir));
- dput(victim);
+ end_removing(victim);
return ERR_PTR(ret);
lookup_error:
- inode_unlock(d_inode(dir));
ret = PTR_ERR(victim);
if (ret == -ENOENT)
return ERR_PTR(-ESTALE); /* Probably got retired by the netfs */
@@ -815,18 +812,17 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
ret = cachefiles_bury_object(cache, NULL, dir, victim,
FSCACHE_OBJECT_WAS_CULLED);
+ dput(victim);
if (ret < 0)
goto error;
fscache_count_culled();
- dput(victim);
_leave(" = 0");
return 0;
error_unlock:
- inode_unlock(d_inode(dir));
+ end_removing(victim);
error:
- dput(victim);
if (ret == -ENOENT)
return -ESTALE; /* Probably got retired by the netfs */
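
The create-side counterpart of the pattern, sketched from the namei.c conversion above: start_creating() stands in for inode_lock_nested() + lookup_one_len() and returns the child with the parent held locked; end_creating() drops both, while end_creating_keep() keeps the dentry. A sketch against the helpers this series introduces:

static struct dentry *lookup_for_create_sketch(struct dentry *dir,
					       const char *name)
{
	struct dentry *child = start_creating(&nop_mnt_idmap, dir, &QSTR(name));

	if (IS_ERR(child))
		return child;		/* parent is not left locked */
	if (d_is_positive(child)) {
		end_creating(child);	/* unlock parent, drop child */
		return ERR_PTR(-EEXIST);
	}
	/* ... create the object; the parent inode is locked here ... */
	end_creating_keep(child);	/* unlock parent, keep the dentry */
	return child;
}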
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index d9bc67176128..a7ed86fa98bb 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -83,10 +83,8 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb,
trace_cachefiles_ondemand_fd_write(object, file_inode(file), pos, len);
ret = __cachefiles_write(object, file, pos, iter, NULL, NULL);
- if (!ret) {
- ret = len;
+ if (ret > 0)
kiocb->ki_pos += ret;
- }
out:
fput(file);
diff --git a/fs/cachefiles/volume.c b/fs/cachefiles/volume.c
index 781aac4ef274..90ba926f488e 100644
--- a/fs/cachefiles/volume.c
+++ b/fs/cachefiles/volume.c
@@ -7,6 +7,7 @@
#include <linux/fs.h>
#include <linux/slab.h>
+#include <linux/namei.h>
#include "internal.h"
#include <trace/events/fscache.h>
@@ -58,9 +59,11 @@ retry:
if (ret < 0) {
if (ret != -ESTALE)
goto error_dir;
- inode_lock_nested(d_inode(cache->store), I_MUTEX_PARENT);
- cachefiles_bury_object(cache, NULL, cache->store, vdentry,
- FSCACHE_VOLUME_IS_WEIRD);
+ vdentry = start_removing_dentry(cache->store, vdentry);
+ if (!IS_ERR(vdentry))
+ cachefiles_bury_object(cache, NULL, cache->store,
+ vdentry,
+ FSCACHE_VOLUME_IS_WEIRD);
cachefiles_put_directory(volume->dentry);
cond_resched();
goto retry;
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 7249d70e1a43..3e7def3d31c1 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -3,7 +3,7 @@ config CEPH_FS
tristate "Ceph distributed file system"
depends on INET
select CEPH_LIB
- select LIBCRC32C
+ select CRC32
select CRYPTO_AES
select CRYPTO
select NETFS_SUPPORT
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 29be367905a1..63b75d214210 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -238,6 +238,7 @@ static void finish_netfs_read(struct ceph_osd_request *req)
if (sparse && err > 0)
err = ceph_sparse_ext_map_end(op);
if (err < subreq->len &&
+ subreq->rreq->origin != NETFS_UNBUFFERED_READ &&
subreq->rreq->origin != NETFS_DIO_READ)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
if (IS_ENCRYPTED(inode) && err > 0) {
@@ -281,7 +282,8 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
size_t len;
int mode;
- if (rreq->origin != NETFS_DIO_READ)
+ if (rreq->origin != NETFS_UNBUFFERED_READ &&
+ rreq->origin != NETFS_DIO_READ)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
@@ -407,6 +409,15 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
struct page **pages;
size_t page_off;
+ /*
+	 * FIXME: io_iter.count needs to be corrected to the aligned
+	 * length. Otherwise, iov_iter_get_pages_alloc2() operates
+	 * with the initial unaligned length value, and as a result
+	 * ceph_msg_data_cursor_init() triggers a BUG_ON() when
+	 * msg->sparse_read_total > msg->data_length.
+ */
+ subreq->io_iter.count = len;
+
err = iov_iter_get_pages_alloc2(&subreq->io_iter, &pages, len, &page_off);
if (err < 0) {
doutc(cl, "%llx.%llx failed to allocate pages, %d\n",
@@ -539,7 +550,7 @@ static void ceph_set_page_fscache(struct page *page)
folio_start_private_2(page_folio(page)); /* [DEPRECATED] */
}
-static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async)
+static void ceph_fscache_write_terminated(void *priv, ssize_t error)
{
struct inode *inode = priv;
@@ -1034,11 +1045,7 @@ void ceph_init_writeback_ctl(struct address_space *mapping,
ceph_wbc->index = ceph_wbc->start_index;
ceph_wbc->end = -1;
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
- ceph_wbc->tag = PAGECACHE_TAG_TOWRITE;
- } else {
- ceph_wbc->tag = PAGECACHE_TAG_DIRTY;
- }
+ ceph_wbc->tag = wbc_to_tag(wbc);
ceph_wbc->op_idx = -1;
ceph_wbc->num_ops = 0;
@@ -1253,7 +1260,9 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping,
0,
gfp_flags);
if (IS_ERR(pages[index])) {
- if (PTR_ERR(pages[index]) == -EINVAL) {
+ int err = PTR_ERR(pages[index]);
+
+ if (err == -EINVAL) {
pr_err_client(cl, "inode->i_blkbits=%hhu\n",
inode->i_blkbits);
}
@@ -1262,7 +1271,7 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping,
BUG_ON(ceph_wbc->locked_pages == 0);
pages[index] = NULL;
- return PTR_ERR(pages[index]);
+ return err;
}
} else {
pages[index] = &folio->page;
@@ -1676,6 +1685,7 @@ get_more_pages:
process_folio_batch:
rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc);
+ ceph_shift_unused_folios_left(&ceph_wbc.fbatch);
if (rc)
goto release_folios;
@@ -1684,8 +1694,6 @@ process_folio_batch:
goto release_folios;
if (ceph_wbc.processed_in_fbatch) {
- ceph_shift_unused_folios_left(&ceph_wbc.fbatch);
-
if (folio_batch_count(&ceph_wbc.fbatch) == 0 &&
ceph_wbc.locked_pages < ceph_wbc.max_pages) {
doutc(cl, "reached end fbatch, trying for more\n");
@@ -1853,10 +1861,12 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned
* We are only allowed to write into/dirty the page if the page is
* clean, or already dirty within the same snap context.
*/
-static int ceph_write_begin(struct file *file, struct address_space *mapping,
+static int ceph_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
+ struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
int r;
@@ -1874,10 +1884,12 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
* we don't do anything in here that simple_write_end doesn't do
* except adjust dirty page accounting
*/
-static int ceph_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
+static int ceph_write_end(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos,
+ unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
{
+ struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_client *cl = ceph_inode_to_client(inode);
bool check_cap = false;
@@ -2319,13 +2331,13 @@ static const struct vm_operations_struct ceph_vmops = {
.page_mkwrite = ceph_page_mkwrite,
};
-int ceph_mmap(struct file *file, struct vm_area_struct *vma)
+int ceph_mmap_prepare(struct vm_area_desc *desc)
{
- struct address_space *mapping = file->f_mapping;
+ struct address_space *mapping = desc->file->f_mapping;
if (!mapping->a_ops->read_folio)
return -ENOEXEC;
- vma->vm_ops = &ceph_vmops;
+ desc->vm_ops = &ceph_vmops;
return 0;
}
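
Minimal shape of the ->mmap to ->mmap_prepare conversion above: the prepare hook runs before the VMA exists and fills in a struct vm_area_desc instead of poking a vma (example_vmops is a placeholder):

static const struct vm_operations_struct example_vmops;

static int example_mmap_prepare(struct vm_area_desc *desc)
{
	if (!desc->file->f_mapping->a_ops->read_folio)
		return -ENOEXEC;	/* need a pagecache-backed mapping */
	desc->vm_ops = &example_vmops;
	return 0;
}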
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 930fbd54d2c8..f678bab189d8 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -26,7 +26,7 @@ void ceph_fscache_register_inode_cookie(struct inode *inode)
return;
/* Only new inodes! */
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return;
WARN_ON_ONCE(ci->netfs.cache);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a8d8b56cf9d2..b1a8ff612c41 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4957,24 +4957,20 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
cl = ceph_inode_to_client(dir);
spin_lock(&dentry->d_lock);
if (ret && di->lease_session && di->lease_session->s_mds == mds) {
+ int len = dentry->d_name.len;
doutc(cl, "%p mds%d seq %d\n", dentry, mds,
(int)di->lease_seq);
rel->dname_seq = cpu_to_le32(di->lease_seq);
__ceph_mdsc_drop_dentry_lease(dentry);
+ memcpy(*p, dentry->d_name.name, len);
spin_unlock(&dentry->d_lock);
if (IS_ENCRYPTED(dir) && fscrypt_has_encryption_key(dir)) {
- int ret2 = ceph_encode_encrypted_fname(dir, dentry, *p);
-
- if (ret2 < 0)
- return ret2;
-
- rel->dname_len = cpu_to_le32(ret2);
- *p += ret2;
- } else {
- rel->dname_len = cpu_to_le32(dentry->d_name.len);
- memcpy(*p, dentry->d_name.name, dentry->d_name.len);
- *p += dentry->d_name.len;
+ len = ceph_encode_encrypted_dname(dir, *p, len);
+ if (len < 0)
+ return len;
}
+ rel->dname_len = cpu_to_le32(len);
+ *p += len;
} else {
spin_unlock(&dentry->d_lock);
}
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index 3b3c4d8d401e..0ea4db650f85 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -15,59 +15,6 @@
#include "mds_client.h"
#include "crypto.h"
-/*
- * The base64url encoding used by fscrypt includes the '_' character, which may
- * cause problems in snapshot names (which can not start with '_'). Thus, we
- * used the base64 encoding defined for IMAP mailbox names (RFC 3501) instead,
- * which replaces '-' and '_' by '+' and ','.
- */
-static const char base64_table[65] =
- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
-
-int ceph_base64_encode(const u8 *src, int srclen, char *dst)
-{
- u32 ac = 0;
- int bits = 0;
- int i;
- char *cp = dst;
-
- for (i = 0; i < srclen; i++) {
- ac = (ac << 8) | src[i];
- bits += 8;
- do {
- bits -= 6;
- *cp++ = base64_table[(ac >> bits) & 0x3f];
- } while (bits >= 6);
- }
- if (bits)
- *cp++ = base64_table[(ac << (6 - bits)) & 0x3f];
- return cp - dst;
-}
-
-int ceph_base64_decode(const char *src, int srclen, u8 *dst)
-{
- u32 ac = 0;
- int bits = 0;
- int i;
- u8 *bp = dst;
-
- for (i = 0; i < srclen; i++) {
- const char *p = strchr(base64_table, src[i]);
-
- if (p == NULL || src[i] == 0)
- return -1;
- ac = (ac << 6) | (p - base64_table);
- bits += 6;
- if (bits >= 8) {
- bits -= 8;
- *bp++ = (u8)(ac >> bits);
- }
- }
- if (ac & ((1 << bits) - 1))
- return -1;
- return bp - dst;
-}
-
static int ceph_crypt_get_context(struct inode *inode, void *ctx, size_t len)
{
struct ceph_inode_info *ci = ceph_inode(inode);
@@ -133,6 +80,8 @@ static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb)
}
static struct fscrypt_operations ceph_fscrypt_ops = {
+ .inode_info_offs = (int)offsetof(struct ceph_inode_info, i_crypt_info) -
+ (int)offsetof(struct ceph_inode_info, netfs.inode),
.needs_bounce_pages = 1,
.get_context = ceph_crypt_get_context,
.set_context = ceph_crypt_set_context,
@@ -215,35 +164,31 @@ static struct inode *parse_longname(const struct inode *parent,
struct ceph_client *cl = ceph_inode_to_client(parent);
struct inode *dir = NULL;
struct ceph_vino vino = { .snap = CEPH_NOSNAP };
- char *inode_number;
- char *name_end;
- int orig_len = *name_len;
+ char *name_end, *inode_number;
int ret = -EIO;
-
+ /* NUL-terminate */
+ char *str __free(kfree) = kmemdup_nul(name, *name_len, GFP_KERNEL);
+ if (!str)
+ return ERR_PTR(-ENOMEM);
/* Skip initial '_' */
- name++;
- name_end = strrchr(name, '_');
+ name = str + 1; /* don't advance str itself: it is freed via __free(kfree) */
+ name_end = strrchr(name, '_');
if (!name_end) {
- doutc(cl, "failed to parse long snapshot name: %s\n", name);
+ doutc(cl, "failed to parse long snapshot name: %s\n", str);
return ERR_PTR(-EIO);
}
*name_len = (name_end - name);
if (*name_len <= 0) {
pr_err_client(cl, "failed to parse long snapshot name\n");
return ERR_PTR(-EIO);
}
/* Get the inode number */
- inode_number = kmemdup_nul(name_end + 1,
- orig_len - *name_len - 2,
- GFP_KERNEL);
- if (!inode_number)
- return ERR_PTR(-ENOMEM);
+ inode_number = name_end + 1;
ret = kstrtou64(inode_number, 10, &vino.ino);
if (ret) {
- doutc(cl, "failed to parse inode number: %s\n", name);
- dir = ERR_PTR(ret);
- goto out;
+ doutc(cl, "failed to parse inode number: %s\n", str);
+ return ERR_PTR(ret);
}
/* And finally the inode */
@@ -254,42 +199,29 @@ static struct inode *parse_longname(const struct inode *parent,
if (IS_ERR(dir))
doutc(cl, "can't find inode %s (%s)\n", inode_number, name);
}
-
-out:
- kfree(inode_number);
return dir;
}
-int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
- char *buf)
+int ceph_encode_encrypted_dname(struct inode *parent, char *buf, int elen)
{
struct ceph_client *cl = ceph_inode_to_client(parent);
struct inode *dir = parent;
- struct qstr iname;
+ char *p = buf;
u32 len;
- int name_len;
- int elen;
+ int name_len = elen;
int ret;
u8 *cryptbuf = NULL;
- iname.name = d_name->name;
- name_len = d_name->len;
-
/* Handle the special case of snapshot names that start with '_' */
- if ((ceph_snap(dir) == CEPH_SNAPDIR) && (name_len > 0) &&
- (iname.name[0] == '_')) {
- dir = parse_longname(parent, iname.name, &name_len);
+ if (ceph_snap(dir) == CEPH_SNAPDIR && *p == '_') {
+ dir = parse_longname(parent, p, &name_len);
if (IS_ERR(dir))
return PTR_ERR(dir);
- iname.name++; /* skip initial '_' */
+ p++; /* skip initial '_' */
}
- iname.len = name_len;
- if (!fscrypt_has_encryption_key(dir)) {
- memcpy(buf, d_name->name, d_name->len);
- elen = d_name->len;
+ if (!fscrypt_has_encryption_key(dir))
goto out;
- }
/*
* Convert cleartext d_name to ciphertext. If result is longer than
@@ -297,7 +229,7 @@ int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
*
* See: fscrypt_setup_filename
*/
- if (!fscrypt_fname_encrypted_size(dir, iname.len, NAME_MAX, &len)) {
+ if (!fscrypt_fname_encrypted_size(dir, name_len, NAME_MAX, &len)) {
elen = -ENAMETOOLONG;
goto out;
}
@@ -310,7 +242,9 @@ int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
goto out;
}
- ret = fscrypt_fname_encrypt(dir, &iname, cryptbuf, len);
+ ret = fscrypt_fname_encrypt(dir,
+ &(struct qstr)QSTR_INIT(p, name_len),
+ cryptbuf, len);
if (ret) {
elen = ret;
goto out;
@@ -331,23 +265,18 @@ int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
}
/* base64 encode the encrypted name */
- elen = ceph_base64_encode(cryptbuf, len, buf);
- doutc(cl, "base64-encoded ciphertext name = %.*s\n", elen, buf);
+ elen = base64_encode(cryptbuf, len, p, false, BASE64_IMAP);
+ doutc(cl, "base64-encoded ciphertext name = %.*s\n", elen, p);
/* To understand the 240 limit, see CEPH_NOHASH_NAME_MAX comments */
WARN_ON(elen > 240);
- if ((elen > 0) && (dir != parent)) {
- char tmp_buf[NAME_MAX];
-
- elen = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%ld",
- elen, buf, dir->i_ino);
- memcpy(buf, tmp_buf, elen);
- }
+ if (dir != parent) /* leading '_' is already there; append "_<inum>" */
+ elen += 1 + sprintf(p + elen, "_%ld", dir->i_ino);
out:
kfree(cryptbuf);
if (dir != parent) {
- if ((dir->i_state & I_NEW))
+ if ((inode_state_read_once(dir) & I_NEW))
discard_new_inode(dir);
else
iput(dir);
@@ -355,14 +284,6 @@ out:
return elen;
}
-int ceph_encode_encrypted_fname(struct inode *parent, struct dentry *dentry,
- char *buf)
-{
- WARN_ON_ONCE(!fscrypt_has_encryption_key(parent));
-
- return ceph_encode_encrypted_dname(parent, &dentry->d_name, buf);
-}
-
/**
* ceph_fname_to_usr - convert a filename for userland presentation
* @fname: ceph_fname to be converted
@@ -438,7 +359,8 @@ int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname,
tname = &_tname;
}
- declen = ceph_base64_decode(name, name_len, tname->name);
+ declen = base64_decode(name, name_len,
+ tname->name, false, BASE64_IMAP);
if (declen <= 0) {
ret = -EIO;
goto out;
@@ -452,7 +374,7 @@ int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname,
ret = fscrypt_fname_disk_to_usr(dir, 0, 0, &iname, oname);
if (!ret && (dir != fname->dir)) {
- char tmp_buf[CEPH_BASE64_CHARS(NAME_MAX)];
+ char tmp_buf[BASE64_CHARS(NAME_MAX)];
name_len = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%ld",
oname->len, oname->name, dir->i_ino);
@@ -464,7 +386,7 @@ out:
fscrypt_fname_free_buffer(&_tname);
out_inode:
if (dir != fname->dir) {
- if ((dir->i_state & I_NEW))
+ if ((inode_state_read_once(dir) & I_NEW))
discard_new_inode(dir);
else
iput(dir);
@@ -516,15 +438,13 @@ int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode,
int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode,
struct page *page, unsigned int len,
- unsigned int offs, u64 lblk_num,
- gfp_t gfp_flags)
+ unsigned int offs, u64 lblk_num)
{
struct ceph_client *cl = ceph_inode_to_client(inode);
doutc(cl, "%p %llx.%llx len %u offs %u blk %llu\n", inode,
ceph_vinop(inode), len, offs, lblk_num);
- return fscrypt_encrypt_block_inplace(inode, page, len, offs, lblk_num,
- gfp_flags);
+ return fscrypt_encrypt_block_inplace(inode, page, len, offs, lblk_num);
}
/**
@@ -642,9 +562,8 @@ int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
* @page: pointer to page array
* @off: offset into the file that the data starts
* @len: max length to encrypt
- * @gfp: gfp flags to use for allocation
*
- * Decrypt an array of cleartext pages and return the amount of
+ * Encrypt an array of cleartext pages and return the amount of
* data encrypted. Any data in the page prior to the start of the
* first complete block in the read is ignored. Any incomplete
* crypto blocks at the end of the array are ignored.
@@ -652,7 +571,7 @@ int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
* Returns the length of the encrypted data or a negative errno.
*/
int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off,
- int len, gfp_t gfp)
+ int len)
{
int i, num_blocks;
u64 baseblk = off >> CEPH_FSCRYPT_BLOCK_SHIFT;
@@ -673,7 +592,7 @@ int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off,
fret = ceph_fscrypt_encrypt_block_inplace(inode, page[pgidx],
CEPH_FSCRYPT_BLOCK_SIZE, pgoffs,
- baseblk + i, gfp);
+ baseblk + i);
if (fret < 0) {
if (ret == 0)
ret = fret;
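
Usage sketch of the shared base64 helper the ceph code now relies on, with the IMAP alphabet ('+' and ',') and no padding, matching the converted call sites above; the <linux/base64.h> variant API is assumed from this series:

#include <linux/base64.h>

static int base64_roundtrip_sketch(const u8 *src, int srclen,
				   char *b64, u8 *out)
{
	int elen = base64_encode(src, srclen, b64, false, BASE64_IMAP);
	int dlen = base64_decode(b64, elen, out, false, BASE64_IMAP);

	return dlen == srclen ? 0 : -EIO;	/* decode must invert encode */
}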
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
index d0768239a1c9..b748e2060bc9 100644
--- a/fs/ceph/crypto.h
+++ b/fs/ceph/crypto.h
@@ -8,6 +8,7 @@
#include <crypto/sha2.h>
#include <linux/fscrypt.h>
+#include <linux/base64.h>
#define CEPH_FSCRYPT_BLOCK_SHIFT 12
#define CEPH_FSCRYPT_BLOCK_SIZE (_AC(1, UL) << CEPH_FSCRYPT_BLOCK_SHIFT)
@@ -89,11 +90,6 @@ static inline u32 ceph_fscrypt_auth_len(struct ceph_fscrypt_auth *fa)
*/
#define CEPH_NOHASH_NAME_MAX (180 - SHA256_DIGEST_SIZE)
-#define CEPH_BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3)
-
-int ceph_base64_encode(const u8 *src, int srclen, char *dst);
-int ceph_base64_decode(const char *src, int srclen, u8 *dst);
-
void ceph_fscrypt_set_ops(struct super_block *sb);
void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc);
@@ -102,10 +98,7 @@ int ceph_fscrypt_prepare_context(struct inode *dir, struct inode *inode,
struct ceph_acl_sec_ctx *as);
void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req,
struct ceph_acl_sec_ctx *as);
-int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
- char *buf);
-int ceph_encode_encrypted_fname(struct inode *parent, struct dentry *dentry,
- char *buf);
+int ceph_encode_encrypted_dname(struct inode *parent, char *buf, int len);
static inline int ceph_fname_alloc_buffer(struct inode *parent,
struct fscrypt_str *fname)
@@ -155,15 +148,14 @@ int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode,
unsigned int offs, u64 lblk_num);
int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode,
struct page *page, unsigned int len,
- unsigned int offs, u64 lblk_num,
- gfp_t gfp_flags);
+ unsigned int offs, u64 lblk_num);
int ceph_fscrypt_decrypt_pages(struct inode *inode, struct page **page,
u64 off, int len);
int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
u64 off, struct ceph_sparse_extent *map,
u32 ext_cnt);
int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off,
- int len, gfp_t gfp);
+ int len);
static inline struct page *ceph_fscrypt_pagecache_page(struct page *page)
{
@@ -194,17 +186,10 @@ static inline void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req,
{
}
-static inline int ceph_encode_encrypted_dname(struct inode *parent,
- struct qstr *d_name, char *buf)
-{
- memcpy(buf, d_name->name, d_name->len);
- return d_name->len;
-}
-
-static inline int ceph_encode_encrypted_fname(struct inode *parent,
- struct dentry *dentry, char *buf)
+static inline int ceph_encode_encrypted_dname(struct inode *parent, char *buf,
+ int len)
{
- return -EOPNOTSUPP;
+ return len;
}
static inline int ceph_fname_alloc_buffer(struct inode *parent,
@@ -246,8 +231,7 @@ static inline int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode,
static inline int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode,
struct page *page, unsigned int len,
- unsigned int offs, u64 lblk_num,
- gfp_t gfp_flags)
+ unsigned int offs, u64 lblk_num)
{
return 0;
}
@@ -269,7 +253,7 @@ static inline int ceph_fscrypt_decrypt_extents(struct inode *inode,
static inline int ceph_fscrypt_encrypt_pages(struct inode *inode,
struct page **page, u64 off,
- int len, gfp_t gfp)
+ int len)
{
return 0;
}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index fdd404fc8112..f3fe786b4143 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -55,8 +55,6 @@ static int mdsc_show(struct seq_file *s, void *p)
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct rb_node *rp;
- int pathlen = 0;
- u64 pathbase;
char *path;
mutex_lock(&mdsc->mutex);
@@ -81,8 +79,8 @@ static int mdsc_show(struct seq_file *s, void *p)
if (req->r_inode) {
seq_printf(s, " #%llx", ceph_ino(req->r_inode));
} else if (req->r_dentry) {
- path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
- &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);
if (IS_ERR(path))
path = NULL;
spin_lock(&req->r_dentry->d_lock);
@@ -91,7 +89,7 @@ static int mdsc_show(struct seq_file *s, void *p)
req->r_dentry,
path ? path : "");
spin_unlock(&req->r_dentry->d_lock);
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
} else if (req->r_path1) {
seq_printf(s, " #%llx/%s", req->r_ino1.ino,
req->r_path1);
@@ -100,8 +98,8 @@ static int mdsc_show(struct seq_file *s, void *p)
}
if (req->r_old_dentry) {
- path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &pathlen,
- &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &path_info, 0);
if (IS_ERR(path))
path = NULL;
spin_lock(&req->r_old_dentry->d_lock);
@@ -111,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p)
req->r_old_dentry,
path ? path : "");
spin_unlock(&req->r_old_dentry->d_lock);
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
} else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
if (req->r_ino2.ino)
seq_printf(s, " #%llx/%s", req->r_ino2.ino,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index a321aa6d0ed2..86d7aa594ea9 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -423,17 +423,16 @@ more:
req->r_inode_drop = CEPH_CAP_FILE_EXCL;
}
if (dfi->last_name) {
- struct qstr d_name = { .name = dfi->last_name,
- .len = strlen(dfi->last_name) };
+ int len = strlen(dfi->last_name);
req->r_path2 = kzalloc(NAME_MAX + 1, GFP_KERNEL);
if (!req->r_path2) {
ceph_mdsc_put_request(req);
return -ENOMEM;
}
+ memcpy(req->r_path2, dfi->last_name, len);
- err = ceph_encode_encrypted_dname(inode, &d_name,
- req->r_path2);
+ err = ceph_encode_encrypted_dname(inode, req->r_path2, len);
if (err < 0) {
ceph_mdsc_put_request(req);
return err;
@@ -999,13 +998,14 @@ static int prep_encrypted_symlink_target(struct ceph_mds_request *req,
if (err)
goto out;
- req->r_path2 = kmalloc(CEPH_BASE64_CHARS(osd_link.len) + 1, GFP_KERNEL);
+ req->r_path2 = kmalloc(BASE64_CHARS(osd_link.len) + 1, GFP_KERNEL);
if (!req->r_path2) {
err = -ENOMEM;
goto out;
}
- len = ceph_base64_encode(osd_link.name, osd_link.len, req->r_path2);
+ len = base64_encode(osd_link.name, osd_link.len,
+ req->r_path2, false, BASE64_IMAP);
req->r_path2[len] = '\0';
out:
fscrypt_fname_free_buffer(&osd_link);
@@ -1261,8 +1261,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
spin_unlock(&fsc->async_unlink_conflict_lock);
spin_lock(&dentry->d_lock);
- di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
- wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT);
+ clear_and_wake_up_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags);
spin_unlock(&dentry->d_lock);
synchronize_rcu();
@@ -1272,10 +1271,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
/* If op failed, mark everyone involved for errors */
if (result) {
- int pathlen = 0;
- u64 base = 0;
- char *path = ceph_mdsc_build_path(mdsc, dentry, &pathlen,
- &base, 0);
+ struct ceph_path_info path_info = {0};
+ char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
/* mark error on parent + clear complete */
mapping_set_error(req->r_parent->i_mapping, result);
@@ -1289,8 +1286,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
mapping_set_error(req->r_old_inode->i_mapping, result);
pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n",
- base, IS_ERR(path) ? "<<bad>>" : path, result);
- ceph_mdsc_free_path(path, pathlen);
+ path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path_info(&path_info);
}
out:
iput(req->r_old_inode);
@@ -1348,8 +1345,6 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
int err = -EROFS;
int op;
char *path;
- int pathlen;
- u64 pathbase;
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* rmdir .snap/foo is RMSNAP */
@@ -1368,14 +1363,15 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
if (!dn) {
try_async = false;
} else {
- path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0);
if (IS_ERR(path)) {
try_async = false;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dn);
/* For none EACCES cases will let the MDS do the mds auth check */
@@ -2160,7 +2156,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
" rfiles: %20lld\n"
" rsubdirs: %20lld\n"
"rbytes: %20lld\n"
- "rctime: %10lld.%09ld\n",
+ "rctime: %ptSp\n",
ci->i_files + ci->i_subdirs,
ci->i_files,
ci->i_subdirs,
@@ -2168,8 +2164,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
ci->i_rfiles,
ci->i_rsubdirs,
ci->i_rbytes,
- ci->i_rctime.tv_sec,
- ci->i_rctime.tv_nsec);
+ &ci->i_rctime);
}
if (*ppos >= dfi->dir_info_len)
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 150076ced937..b2f2af104679 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -33,12 +33,19 @@ struct ceph_nfs_snapfh {
u32 hash;
} __attribute__ ((packed));
+#define BYTES_PER_U32 (sizeof(u32))
+#define CEPH_FH_BASIC_SIZE \
+ (sizeof(struct ceph_nfs_fh) / BYTES_PER_U32)
+#define CEPH_FH_WITH_PARENT_SIZE \
+ (sizeof(struct ceph_nfs_confh) / BYTES_PER_U32)
+#define CEPH_FH_SNAPPED_INODE_SIZE \
+ (sizeof(struct ceph_nfs_snapfh) / BYTES_PER_U32)
+
static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len,
struct inode *parent_inode)
{
struct ceph_client *cl = ceph_inode_to_client(inode);
- static const int snap_handle_length =
- sizeof(struct ceph_nfs_snapfh) >> 2;
+ static const int snap_handle_length = CEPH_FH_SNAPPED_INODE_SIZE;
struct ceph_nfs_snapfh *sfh = (void *)rawfh;
u64 snapid = ceph_snap(inode);
int ret;
@@ -88,10 +95,8 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
struct inode *parent_inode)
{
struct ceph_client *cl = ceph_inode_to_client(inode);
- static const int handle_length =
- sizeof(struct ceph_nfs_fh) >> 2;
- static const int connected_handle_length =
- sizeof(struct ceph_nfs_confh) >> 2;
+ static const int handle_length = CEPH_FH_BASIC_SIZE;
+ static const int connected_handle_length = CEPH_FH_WITH_PARENT_SIZE;
int type;
if (ceph_snap(inode) != CEPH_NOSNAP)
@@ -308,7 +313,7 @@ static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
if (fh_type != FILEID_INO32_GEN &&
fh_type != FILEID_INO32_GEN_PARENT)
return NULL;
- if (fh_len < sizeof(*fh) / 4)
+ if (fh_len < sizeof(*fh) / BYTES_PER_U32)
return NULL;
doutc(fsc->client, "%llx\n", fh->ino);
@@ -427,7 +432,7 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
if (fh_type != FILEID_INO32_GEN_PARENT)
return NULL;
- if (fh_len < sizeof(*cfh) / 4)
+ if (fh_len < sizeof(*cfh) / BYTES_PER_U32)
return NULL;
doutc(fsc->client, "%llx\n", cfh->parent_ino);
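
The arithmetic the new CEPH_FH_* macros spell out: exportfs counts handle lengths in 32-bit words, so each size is sizeof(struct)/sizeof(u32). A standalone sketch with a hypothetical handle layout:

#include <stdint.h>

#define FH_WORDS(type)	(sizeof(type) / sizeof(uint32_t))

struct example_confh {
	uint64_t ino;
	uint64_t parent_ino;
} __attribute__((packed));

_Static_assert(FH_WORDS(struct example_confh) == 4,
	       "a 16-byte connectable handle is 4 u32 words");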
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 851d70200c6b..983390069f73 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -368,8 +368,6 @@ int ceph_open(struct inode *inode, struct file *file)
int flags, fmode, wanted;
struct dentry *dentry;
char *path;
- int pathlen;
- u64 pathbase;
bool do_sync = false;
int mask = MAY_READ;
@@ -399,14 +397,15 @@ int ceph_open(struct inode *inode, struct file *file)
if (!dentry) {
do_sync = true;
} else {
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
if (IS_ERR(path)) {
do_sync = true;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, mask);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dentry);
/* For none EACCES cases will let the MDS do the mds auth check */
@@ -580,8 +579,7 @@ static void wake_async_create_waiters(struct inode *inode,
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
- ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
- wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
+ clear_and_wake_up_bit(CEPH_ASYNC_CREATE_BIT, &ci->i_ceph_flags);
if (ci->i_ceph_flags & CEPH_I_ASYNC_CHECK_CAPS) {
ci->i_ceph_flags &= ~CEPH_I_ASYNC_CHECK_CAPS;
@@ -614,15 +612,13 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
mapping_set_error(req->r_parent->i_mapping, result);
if (result) {
- int pathlen = 0;
- u64 base = 0;
- char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
- &base, 0);
+ struct ceph_path_info path_info = {0};
+ char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);
pr_warn_client(cl,
"async create failure path=(%llx)%s result=%d!\n",
- base, IS_ERR(path) ? "<<bad>>" : path, result);
- ceph_mdsc_free_path(path, pathlen);
+ path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path_info(&path_info);
ceph_dir_clear_complete(req->r_parent);
if (!d_unhashed(dentry))
@@ -744,7 +740,7 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
vino.ino, ceph_ino(dir), dentry->d_name.name);
ceph_dir_clear_ordered(dir);
ceph_init_inode_acls(inode, as_ctx);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
/*
* If it's not I_NEW, then someone created this before
* we got here. Assume the server is aware of it at
@@ -765,8 +761,7 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
}
spin_lock(&dentry->d_lock);
- di->flags &= ~CEPH_DENTRY_ASYNC_CREATE;
- wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT);
+ clear_and_wake_up_bit(CEPH_DENTRY_ASYNC_CREATE_BIT, &di->flags);
spin_unlock(&dentry->d_lock);
return ret;
@@ -791,8 +786,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
int mask;
int err;
char *path;
- int pathlen;
- u64 pathbase;
doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n",
dir, ceph_vinop(dir), dentry, dentry,
@@ -814,7 +807,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
if (!dn) {
try_async = false;
} else {
- path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0);
if (IS_ERR(path)) {
try_async = false;
err = 0;
@@ -826,7 +820,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
mask |= MAY_WRITE;
err = ceph_mds_check_access(mdsc, path, mask);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dn);
	/* For non-EACCES cases, let the MDS do the auth check */
@@ -907,7 +901,7 @@ retry:
new_inode = NULL;
goto out_req;
}
- WARN_ON_ONCE(!(new_inode->i_state & I_NEW));
+ WARN_ON_ONCE(!(inode_state_read_once(new_inode) & I_NEW));
spin_lock(&dentry->d_lock);
di->flags |= CEPH_DENTRY_ASYNC_CREATE;
@@ -1992,8 +1986,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
if (IS_ENCRYPTED(inode)) {
ret = ceph_fscrypt_encrypt_pages(inode, pages,
- write_pos, write_len,
- GFP_KERNEL);
+ write_pos, write_len);
if (ret < 0) {
doutc(cl, "encryption failed with %d\n", ret);
ceph_release_page_vector(pages, num_pages);
@@ -2126,10 +2119,10 @@ again:
if (ceph_inode_is_shutdown(inode))
return -ESTALE;
- if (direct_lock)
- ceph_start_io_direct(inode);
- else
- ceph_start_io_read(inode);
+ ret = direct_lock ? ceph_start_io_direct(inode) :
+ ceph_start_io_read(inode);
+ if (ret)
+ return ret;
if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
want |= CEPH_CAP_FILE_CACHE;
@@ -2282,7 +2275,9 @@ static ssize_t ceph_splice_read(struct file *in, loff_t *ppos,
(fi->flags & CEPH_F_SYNC))
return copy_splice_read(in, ppos, pipe, len, flags);
- ceph_start_io_read(inode);
+ ret = ceph_start_io_read(inode);
+ if (ret)
+ return ret;
want = CEPH_CAP_FILE_CACHE;
if (fi->fmode & CEPH_FILE_MODE_LAZY)
@@ -2361,10 +2356,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
direct_lock = true;
retry_snap:
- if (direct_lock)
- ceph_start_io_direct(inode);
- else
- ceph_start_io_write(inode);
+ err = direct_lock ? ceph_start_io_direct(inode) :
+ ceph_start_io_write(inode);
+ if (err)
+ goto out_unlocked;
if (iocb->ki_flags & IOCB_APPEND) {
err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
@@ -2530,19 +2525,19 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
return generic_file_llseek(file, offset, whence);
}
-static inline void ceph_zero_partial_page(
- struct inode *inode, loff_t offset, unsigned size)
+static inline void ceph_zero_partial_page(struct inode *inode,
+ loff_t offset, size_t size)
{
- struct page *page;
- pgoff_t index = offset >> PAGE_SHIFT;
+ struct folio *folio;
- page = find_lock_page(inode->i_mapping, index);
- if (page) {
- wait_on_page_writeback(page);
- zero_user(page, offset & (PAGE_SIZE - 1), size);
- unlock_page(page);
- put_page(page);
- }
+ folio = filemap_lock_folio(inode->i_mapping, offset >> PAGE_SHIFT);
+ if (IS_ERR(folio))
+ return;
+
+ folio_wait_writeback(folio);
+ folio_zero_range(folio, offset_in_folio(folio, offset), size);
+ folio_unlock(folio);
+ folio_put(folio);
}
static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
@@ -2616,7 +2611,7 @@ static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
s32 stripe_unit = ci->i_layout.stripe_unit;
s32 stripe_count = ci->i_layout.stripe_count;
s32 object_size = ci->i_layout.object_size;
- u64 object_set_size = object_size * stripe_count;
+ u64 object_set_size = (u64) object_size * stripe_count;
u64 nearly, t;
/* round offset up to next period boundary */
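
The (u64) cast is the actual fix here: object_size and stripe_count are both s32, so the old expression multiplied in 32-bit arithmetic and only then widened the (possibly overflowed) result. A concrete case with illustrative values:

	s32 object_size  = 4 << 20;	/* 4 MiB */
	s32 stripe_count = 1024;

	u64 bad  = object_size * stripe_count;		/* 32-bit product: 2^32 overflows */
	u64 good = (u64)object_size * stripe_count;	/* 64-bit product: 4 GiB as intended */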
@@ -2883,7 +2878,7 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off
struct ceph_object_id src_oid, dst_oid;
struct ceph_osd_client *osdc;
struct ceph_osd_request *req;
- size_t bytes = 0;
+ ssize_t bytes = 0;
u64 src_objnum, src_objoff, dst_objnum, dst_objoff;
u32 src_objlen, dst_objlen;
u32 object_size = src_ci->i_layout.object_size;
@@ -2933,7 +2928,7 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off
"OSDs don't support copy-from2; disabling copy offload\n");
}
doutc(cl, "returned %d\n", ret);
- if (!bytes)
+ if (bytes <= 0)
bytes = ret;
goto out;
}
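
The two hunks work together: with an unsigned `size_t bytes`, stuffing a negative errno into it produces a huge positive value, and the old `!bytes` test could not tell "nothing copied yet" from a wrapped error. A sketch of the old failure mode:

	size_t bytes = 0;
	int ret = -EOPNOTSUPP;		/* e.g. OSDs lack copy-from2 */

	if (!bytes)
		bytes = ret;		/* unsigned: -95 wraps to 0xffff...ffa1 */
	return bytes;			/* caller sees a bogus byte count, not an error */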
@@ -3171,7 +3166,7 @@ const struct file_operations ceph_file_fops = {
.llseek = ceph_llseek,
.read_iter = ceph_read_iter,
.write_iter = ceph_write_iter,
- .mmap = ceph_mmap,
+ .mmap_prepare = ceph_mmap_prepare,
.fsync = ceph_fsync,
.lock = ceph_lock,
.setlease = simple_nosetlease,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 6ac2bd555e86..2966f88310e3 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -55,6 +55,52 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
return 0;
}
+/*
+ * Check if the parent inode matches the vino from directory reply info
+ */
+static inline bool ceph_vino_matches_parent(struct inode *parent,
+ struct ceph_vino vino)
+{
+ return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap;
+}
+
+/*
+ * Validate that the directory inode referenced by @req->r_parent matches the
+ * inode number and snapshot id contained in the reply's directory record. If
+ * they do not match – which can theoretically happen if the parent dentry was
+ * moved between the time the request was issued and the reply arrived – fall
+ * back to looking up the correct inode in the inode cache.
+ *
+ * A reference is *always* returned. Callers that receive a different inode
+ * than the original @parent are responsible for dropping the extra reference
+ * once the reply has been processed.
+ */
+static struct inode *ceph_get_reply_dir(struct super_block *sb,
+ struct inode *parent,
+ struct ceph_mds_reply_info_parsed *rinfo)
+{
+ struct ceph_vino vino;
+
+ if (unlikely(!rinfo->diri.in))
+ return parent; /* nothing to compare against */
+
+ /* If we didn't have a cached parent inode to begin with, just bail out. */
+ if (!parent)
+ return NULL;
+
+ vino.ino = le64_to_cpu(rinfo->diri.in->ino);
+ vino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+
+ if (likely(ceph_vino_matches_parent(parent, vino)))
+ return parent; /* matches – use the original reference */
+
+ /* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. */
+ WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n",
+ ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap);
+
+ return ceph_get_inode(sb, vino, NULL);
+}
+
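
The reference contract described in the comment translates into a fixed caller-side pattern, which ceph_fill_trace() below follows:

	parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo);
	if (IS_ERR(parent_dir))
		return PTR_ERR(parent_dir);
	/* ... use parent_dir everywhere the code used req->r_parent ... */
	if (!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent)
		iput(parent_dir);	/* drop the extra ref taken on a mismatch */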
/**
* ceph_new_inode - allocate a new inode in advance of an expected create
* @dir: parent directory for new inode
@@ -86,7 +132,7 @@ struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry,
goto out_err;
}
- inode->i_state = 0;
+ inode_state_assign_raw(inode, 0);
inode->i_mode = *mode;
err = ceph_security_init_secctx(dentry, *mode, as_ctx);
@@ -155,7 +201,7 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
doutc(cl, "on %llx=%llx.%llx got %p new %d\n",
ceph_present_inode(inode), ceph_vinop(inode), inode,
- !!(inode->i_state & I_NEW));
+ !!(inode_state_read_once(inode) & I_NEW));
return inode;
}
@@ -182,7 +228,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
goto err;
}
- if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) {
+ if (!(inode_state_read_once(inode) & I_NEW) && !S_ISDIR(inode->i_mode)) {
pr_warn_once_client(cl, "bad snapdir inode type (mode=0%o)\n",
inode->i_mode);
goto err;
@@ -215,7 +261,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
}
}
#endif
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
inode->i_op = &ceph_snapdir_iops;
inode->i_fop = &ceph_snapdir_fops;
ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
@@ -224,7 +270,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
return inode;
err:
- if ((inode->i_state & I_NEW))
+ if ((inode_state_read_once(inode) & I_NEW))
discard_new_inode(inode);
else
iput(inode);
@@ -665,6 +711,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_work_mask = 0;
memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
#ifdef CONFIG_FS_ENCRYPTION
+ ci->i_crypt_info = NULL;
ci->fscrypt_auth = NULL;
ci->fscrypt_auth_len = 0;
#endif
@@ -697,7 +744,7 @@ void ceph_evict_inode(struct inode *inode)
netfs_wait_for_outstanding_io(inode);
truncate_inode_pages_final(&inode->i_data);
- if (inode->i_state & I_PINNING_NETFS_WB)
+ if (inode_state_read_once(inode) & I_PINNING_NETFS_WB)
ceph_fscache_unuse_cookie(inode, true);
clear_inode(inode);
@@ -832,7 +879,9 @@ void ceph_fill_file_time(struct inode *inode, int issued,
{
struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct timespec64 iatime = inode_get_atime(inode);
struct timespec64 ictime = inode_get_ctime(inode);
+ struct timespec64 imtime = inode_get_mtime(inode);
int warn = 0;
if (issued & (CEPH_CAP_FILE_EXCL|
@@ -842,39 +891,26 @@ void ceph_fill_file_time(struct inode *inode, int issued,
CEPH_CAP_XATTR_EXCL)) {
if (ci->i_version == 0 ||
timespec64_compare(ctime, &ictime) > 0) {
- doutc(cl, "ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n",
- ictime.tv_sec, ictime.tv_nsec,
- ctime->tv_sec, ctime->tv_nsec);
+ doutc(cl, "ctime %ptSp -> %ptSp inc w/ cap\n", &ictime, ctime);
inode_set_ctime_to_ts(inode, *ctime);
}
if (ci->i_version == 0 ||
ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
/* the MDS did a utimes() */
- doutc(cl, "mtime %lld.%09ld -> %lld.%09ld tw %d -> %d\n",
- inode_get_mtime_sec(inode),
- inode_get_mtime_nsec(inode),
- mtime->tv_sec, mtime->tv_nsec,
- ci->i_time_warp_seq, (int)time_warp_seq);
+ doutc(cl, "mtime %ptSp -> %ptSp tw %d -> %d\n", &imtime, mtime,
+ ci->i_time_warp_seq, (int)time_warp_seq);
inode_set_mtime_to_ts(inode, *mtime);
inode_set_atime_to_ts(inode, *atime);
ci->i_time_warp_seq = time_warp_seq;
} else if (time_warp_seq == ci->i_time_warp_seq) {
- struct timespec64 ts;
-
/* nobody did utimes(); take the max */
- ts = inode_get_mtime(inode);
- if (timespec64_compare(mtime, &ts) > 0) {
- doutc(cl, "mtime %lld.%09ld -> %lld.%09ld inc\n",
- ts.tv_sec, ts.tv_nsec,
- mtime->tv_sec, mtime->tv_nsec);
+ if (timespec64_compare(mtime, &imtime) > 0) {
+ doutc(cl, "mtime %ptSp -> %ptSp inc\n", &imtime, mtime);
inode_set_mtime_to_ts(inode, *mtime);
}
- ts = inode_get_atime(inode);
- if (timespec64_compare(atime, &ts) > 0) {
- doutc(cl, "atime %lld.%09ld -> %lld.%09ld inc\n",
- ts.tv_sec, ts.tv_nsec,
- atime->tv_sec, atime->tv_nsec);
+ if (timespec64_compare(atime, &iatime) > 0) {
+ doutc(cl, "atime %ptSp -> %ptSp inc\n", &iatime, atime);
inode_set_atime_to_ts(inode, *atime);
}
} else if (issued & CEPH_CAP_FILE_EXCL) {
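
These conversions assume a printk extension where %ptSp consumes a struct timespec64 * and renders the same sec.nanoseconds form the removed format strings spelled out by hand; the specifier's behaviour is inferred from the usage here, not defined by this patch:

	struct timespec64 ts = { .tv_sec = 5, .tv_nsec = 42 };

	/* before: explicit fields */
	doutc(cl, "mtime %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);	/* "mtime 5.000000042" */
	/* after: one pointer argument, assuming %ptSp expands identically */
	doutc(cl, "mtime %ptSp\n", &ts);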
@@ -911,7 +947,7 @@ static int decode_encrypted_symlink(struct ceph_mds_client *mdsc,
if (!sym)
return -ENOMEM;
- declen = ceph_base64_decode(encsym, enclen, sym);
+ declen = base64_decode(encsym, enclen, sym, false, BASE64_IMAP);
if (declen < 0) {
pr_err_client(cl,
"can't decode symlink (%d). Content: %.*s\n",
@@ -966,7 +1002,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
le64_to_cpu(info->version), ci->i_version);
/* Once I_NEW is cleared, we can't change type or dev numbers */
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
inode->i_mode = mode;
} else {
if (inode_wrong_type(inode, mode)) {
@@ -1043,7 +1079,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
#ifdef CONFIG_FS_ENCRYPTION
if (iinfo->fscrypt_auth_len &&
- ((inode->i_state & I_NEW) || (ci->fscrypt_auth_len == 0))) {
+ ((inode_state_read_once(inode) & I_NEW) || (ci->fscrypt_auth_len == 0))) {
kfree(ci->fscrypt_auth);
ci->fscrypt_auth_len = iinfo->fscrypt_auth_len;
ci->fscrypt_auth = iinfo->fscrypt_auth;
@@ -1523,6 +1559,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
struct ceph_vino tvino, dvino;
struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
struct ceph_client *cl = fsc->client;
+ struct inode *parent_dir = NULL;
int err = 0;
doutc(cl, "%p is_dentry %d is_target %d\n", req,
@@ -1536,10 +1573,17 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
}
if (rinfo->head->is_dentry) {
- struct inode *dir = req->r_parent;
-
- if (dir) {
- err = ceph_fill_inode(dir, NULL, &rinfo->diri,
+	/*
+	 * r_parent may be stale when CEPH_MDS_R_PARENT_LOCKED is not set,
+	 * so look up the correct inode from the reply instead
+	 */
+ parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo);
+ if (unlikely(IS_ERR(parent_dir))) {
+ err = PTR_ERR(parent_dir);
+ goto done;
+ }
+ if (parent_dir) {
+ err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri,
rinfo->dirfrag, session, -1,
&req->r_caps_reservation);
if (err < 0)
@@ -1548,14 +1592,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
WARN_ON_ONCE(1);
}
- if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
+ if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
bool is_nokey = false;
struct qstr dname;
struct dentry *dn, *parent;
struct fscrypt_str oname = FSTR_INIT(NULL, 0);
- struct ceph_fname fname = { .dir = dir,
+ struct ceph_fname fname = { .dir = parent_dir,
.name = rinfo->dname,
.ctext = rinfo->altname,
.name_len = rinfo->dname_len,
@@ -1564,10 +1608,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
BUG_ON(!rinfo->head->is_target);
BUG_ON(req->r_dentry);
- parent = d_find_any_alias(dir);
+ parent = d_find_any_alias(parent_dir);
BUG_ON(!parent);
- err = ceph_fname_alloc_buffer(dir, &oname);
+ err = ceph_fname_alloc_buffer(parent_dir, &oname);
if (err < 0) {
dput(parent);
goto done;
@@ -1576,7 +1620,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey);
if (err < 0) {
dput(parent);
- ceph_fname_free_buffer(dir, &oname);
+ ceph_fname_free_buffer(parent_dir, &oname);
goto done;
}
dname.name = oname.name;
@@ -1595,7 +1639,7 @@ retry_lookup:
dname.len, dname.name, dn);
if (!dn) {
dput(parent);
- ceph_fname_free_buffer(dir, &oname);
+ ceph_fname_free_buffer(parent_dir, &oname);
err = -ENOMEM;
goto done;
}
@@ -1610,12 +1654,12 @@ retry_lookup:
ceph_snap(d_inode(dn)) != tvino.snap)) {
doutc(cl, " dn %p points to wrong inode %p\n",
dn, d_inode(dn));
- ceph_dir_clear_ordered(dir);
+ ceph_dir_clear_ordered(parent_dir);
d_delete(dn);
dput(dn);
goto retry_lookup;
}
- ceph_fname_free_buffer(dir, &oname);
+ ceph_fname_free_buffer(parent_dir, &oname);
req->r_dentry = dn;
dput(parent);
@@ -1637,13 +1681,13 @@ retry_lookup:
pr_err_client(cl, "badness %p %llx.%llx\n", in,
ceph_vinop(in));
req->r_target_inode = NULL;
- if (in->i_state & I_NEW)
+ if (inode_state_read_once(in) & I_NEW)
discard_new_inode(in);
else
iput(in);
goto done;
}
- if (in->i_state & I_NEW)
+ if (inode_state_read_once(in) & I_NEW)
unlock_new_inode(in);
}
@@ -1739,6 +1783,11 @@ retry_lookup:
goto done;
}
+ if (unlikely(!in)) {
+ err = -EINVAL;
+ goto done;
+ }
+
/* attach proper inode */
if (d_really_is_negative(dn)) {
ceph_dir_clear_ordered(dir);
@@ -1774,6 +1823,12 @@ retry_lookup:
doutc(cl, " linking snapped dir %p to dn %p\n", in,
req->r_dentry);
ceph_dir_clear_ordered(dir);
+
+ if (unlikely(!in)) {
+ err = -EINVAL;
+ goto done;
+ }
+
ihold(in);
err = splice_dentry(&req->r_dentry, in);
if (err < 0)
@@ -1794,6 +1849,9 @@ retry_lookup:
&dvino, ptvino);
}
done:
+ /* Drop extra ref from ceph_get_reply_dir() if it returned a new inode */
+ if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent))
+ iput(parent_dir);
doutc(cl, "done err=%d\n", err);
return err;
}
@@ -1829,11 +1887,11 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
pr_err_client(cl, "inode badness on %p got %d\n", in,
rc);
err = rc;
- if (in->i_state & I_NEW) {
+ if (inode_state_read_once(in) & I_NEW) {
ihold(in);
discard_new_inode(in);
}
- } else if (in->i_state & I_NEW) {
+ } else if (inode_state_read_once(in) & I_NEW) {
unlock_new_inode(in);
}
@@ -2045,7 +2103,7 @@ retry_lookup:
pr_err_client(cl, "badness on %p %llx.%llx\n", in,
ceph_vinop(in));
if (d_really_is_negative(dn)) {
- if (in->i_state & I_NEW) {
+ if (inode_state_read_once(in) & I_NEW) {
ihold(in);
discard_new_inode(in);
}
@@ -2055,7 +2113,7 @@ retry_lookup:
err = ret;
goto next_item;
}
- if (in->i_state & I_NEW)
+ if (inode_state_read_once(in) & I_NEW)
unlock_new_inode(in);
if (d_really_is_negative(dn)) {
@@ -2367,7 +2425,7 @@ static int fill_fscrypt_truncate(struct inode *inode,
/* Try to writeback the dirty pagecaches */
if (issued & (CEPH_CAP_FILE_BUFFER)) {
- loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - 1;
+ loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SIZE - 1;
ret = filemap_write_and_wait_range(inode->i_mapping,
orig_pos, lend);
@@ -2436,8 +2494,7 @@ static int fill_fscrypt_truncate(struct inode *inode,
/* encrypt the last block */
ret = ceph_fscrypt_encrypt_block_inplace(inode, page,
CEPH_FSCRYPT_BLOCK_SIZE,
- 0, block,
- GFP_KERNEL);
+ 0, block);
if (ret)
goto out;
}
@@ -2488,22 +2545,21 @@ int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
int truncate_retry = 20; /* The RMW will take around 50ms */
struct dentry *dentry;
char *path;
- int pathlen;
- u64 pathbase;
bool do_sync = false;
dentry = d_find_alias(inode);
if (!dentry) {
do_sync = true;
} else {
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
if (IS_ERR(path)) {
do_sync = true;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dentry);
	/* For non-EACCES cases, let the MDS do the auth check */
@@ -2636,10 +2692,8 @@ retry:
if (ia_valid & ATTR_ATIME) {
struct timespec64 atime = inode_get_atime(inode);
- doutc(cl, "%p %llx.%llx atime %lld.%09ld -> %lld.%09ld\n",
- inode, ceph_vinop(inode),
- atime.tv_sec, atime.tv_nsec,
- attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
+ doutc(cl, "%p %llx.%llx atime %ptSp -> %ptSp\n",
+ inode, ceph_vinop(inode), &atime, &attr->ia_atime);
if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) {
ci->i_time_warp_seq++;
inode_set_atime_to_ts(inode, attr->ia_atime);
@@ -2713,10 +2767,8 @@ retry:
if (ia_valid & ATTR_MTIME) {
struct timespec64 mtime = inode_get_mtime(inode);
- doutc(cl, "%p %llx.%llx mtime %lld.%09ld -> %lld.%09ld\n",
- inode, ceph_vinop(inode),
- mtime.tv_sec, mtime.tv_nsec,
- attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
+ doutc(cl, "%p %llx.%llx mtime %ptSp -> %ptSp\n",
+ inode, ceph_vinop(inode), &mtime, &attr->ia_mtime);
if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) {
ci->i_time_warp_seq++;
inode_set_mtime_to_ts(inode, attr->ia_mtime);
@@ -2737,13 +2789,11 @@ retry:
/* these do nothing */
if (ia_valid & ATTR_CTIME) {
+ struct timespec64 ictime = inode_get_ctime(inode);
bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
- doutc(cl, "%p %llx.%llx ctime %lld.%09ld -> %lld.%09ld (%s)\n",
- inode, ceph_vinop(inode),
- inode_get_ctime_sec(inode),
- inode_get_ctime_nsec(inode),
- attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
+ doutc(cl, "%p %llx.%llx ctime %ptSp -> %ptSp (%s)\n",
+ inode, ceph_vinop(inode), &ictime, &attr->ia_ctime,
only ? "ctime only" : "ignored");
if (only) {
/*
diff --git a/fs/ceph/io.c b/fs/ceph/io.c
index c456509b31c3..2d10f49c93a9 100644
--- a/fs/ceph/io.c
+++ b/fs/ceph/io.c
@@ -21,14 +21,23 @@
/* Call with exclusively locked inode->i_rwsem */
static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode)
{
+ bool is_odirect;
+
lockdep_assert_held_write(&inode->i_rwsem);
- if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) {
- spin_lock(&ci->i_ceph_lock);
- ci->i_ceph_flags &= ~CEPH_I_ODIRECT;
- spin_unlock(&ci->i_ceph_lock);
- inode_dio_wait(inode);
+ spin_lock(&ci->i_ceph_lock);
+ /* ensure that bit state is consistent */
+ smp_mb__before_atomic();
+ is_odirect = READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT;
+ if (is_odirect) {
+ clear_bit(CEPH_I_ODIRECT_BIT, &ci->i_ceph_flags);
+ /* ensure modified bit is visible */
+ smp_mb__after_atomic();
}
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (is_odirect)
+ inode_dio_wait(inode);
}
/**
@@ -47,20 +56,35 @@ static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode)
* Note that buffered writes and truncates both take a write lock on
* inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
*/
-void
-ceph_start_io_read(struct inode *inode)
+int ceph_start_io_read(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ bool is_odirect;
+ int err;
/* Be an optimist! */
- down_read(&inode->i_rwsem);
- if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT))
- return;
+ err = down_read_killable(&inode->i_rwsem);
+ if (err)
+ return err;
+
+ spin_lock(&ci->i_ceph_lock);
+ /* ensure that bit state is consistent */
+ smp_mb__before_atomic();
+ is_odirect = READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT;
+ spin_unlock(&ci->i_ceph_lock);
+ if (!is_odirect)
+ return 0;
up_read(&inode->i_rwsem);
+
/* Slow path.... */
- down_write(&inode->i_rwsem);
+ err = down_write_killable(&inode->i_rwsem);
+ if (err)
+ return err;
+
ceph_block_o_direct(ci, inode);
downgrade_write(&inode->i_rwsem);
+
+ return 0;
}
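
With the lock acquisitions now killable, every ceph_start_io_*() caller grows the same bail-out pattern (see the file.c hunks earlier in this series):

	ret = ceph_start_io_read(inode);
	if (ret)		/* typically -EINTR: fatal signal while blocked */
		return ret;
	/* ... perform the read ... */
	ceph_end_io_read(inode);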
/**
@@ -83,11 +107,12 @@ ceph_end_io_read(struct inode *inode)
* Declare that a buffered write operation is about to start, and ensure
* that we block all direct I/O.
*/
-void
-ceph_start_io_write(struct inode *inode)
+int ceph_start_io_write(struct inode *inode)
{
- down_write(&inode->i_rwsem);
- ceph_block_o_direct(ceph_inode(inode), inode);
+ int err = down_write_killable(&inode->i_rwsem);
+ if (!err)
+ ceph_block_o_direct(ceph_inode(inode), inode);
+ return err;
}
/**
@@ -106,12 +131,22 @@ ceph_end_io_write(struct inode *inode)
/* Call with exclusively locked inode->i_rwsem */
static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode)
{
+ bool is_odirect;
+
lockdep_assert_held_write(&inode->i_rwsem);
- if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) {
- spin_lock(&ci->i_ceph_lock);
- ci->i_ceph_flags |= CEPH_I_ODIRECT;
- spin_unlock(&ci->i_ceph_lock);
+ spin_lock(&ci->i_ceph_lock);
+ /* ensure that bit state is consistent */
+ smp_mb__before_atomic();
+ is_odirect = READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT;
+ if (!is_odirect) {
+ set_bit(CEPH_I_ODIRECT_BIT, &ci->i_ceph_flags);
+ /* ensure modified bit is visible */
+ smp_mb__after_atomic();
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (!is_odirect) {
/* FIXME: unmap_mapping_range? */
filemap_write_and_wait(inode->i_mapping);
}
@@ -133,20 +168,35 @@ static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode)
* Note that buffered writes and truncates both take a write lock on
* inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
*/
-void
-ceph_start_io_direct(struct inode *inode)
+int ceph_start_io_direct(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ bool is_odirect;
+ int err;
/* Be an optimist! */
- down_read(&inode->i_rwsem);
- if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)
- return;
+ err = down_read_killable(&inode->i_rwsem);
+ if (err)
+ return err;
+
+ spin_lock(&ci->i_ceph_lock);
+ /* ensure that bit state is consistent */
+ smp_mb__before_atomic();
+ is_odirect = READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT;
+ spin_unlock(&ci->i_ceph_lock);
+ if (is_odirect)
+ return 0;
up_read(&inode->i_rwsem);
+
/* Slow path.... */
- down_write(&inode->i_rwsem);
+ err = down_write_killable(&inode->i_rwsem);
+ if (err)
+ return err;
+
ceph_block_buffered(ci, inode);
downgrade_write(&inode->i_rwsem);
+
+ return 0;
}
/**
diff --git a/fs/ceph/io.h b/fs/ceph/io.h
index fa594cd77348..79029825e8b8 100644
--- a/fs/ceph/io.h
+++ b/fs/ceph/io.h
@@ -2,11 +2,13 @@
#ifndef _FS_CEPH_IO_H
#define _FS_CEPH_IO_H
-void ceph_start_io_read(struct inode *inode);
+#include <linux/compiler_attributes.h>
+
+int __must_check ceph_start_io_read(struct inode *inode);
void ceph_end_io_read(struct inode *inode);
-void ceph_start_io_write(struct inode *inode);
+int __must_check ceph_start_io_write(struct inode *inode);
void ceph_end_io_write(struct inode *inode);
-void ceph_start_io_direct(struct inode *inode);
+int __must_check ceph_start_io_direct(struct inode *inode);
void ceph_end_io_direct(struct inode *inode);
#endif /* FS_CEPH_IO_H */
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index e861de3c79b9..15cde055f3da 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -246,21 +246,28 @@ static long ceph_ioctl_lazyio(struct file *file)
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
struct ceph_client *cl = mdsc->fsc->client;
+ bool is_file_already_lazy = false;
+ spin_lock(&ci->i_ceph_lock);
if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
- spin_lock(&ci->i_ceph_lock);
fi->fmode |= CEPH_FILE_MODE_LAZY;
ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
__ceph_touch_fmode(ci, mdsc, fi->fmode);
- spin_unlock(&ci->i_ceph_lock);
+ } else {
+ is_file_already_lazy = true;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (is_file_already_lazy) {
+ doutc(cl, "file %p %p %llx.%llx already lazy\n", file, inode,
+ ceph_vinop(inode));
+ } else {
doutc(cl, "file %p %p %llx.%llx marked lazy\n", file, inode,
ceph_vinop(inode));
ceph_check_caps(ci, 0);
- } else {
- doutc(cl, "file %p %p %llx.%llx already lazy\n", file, inode,
- ceph_vinop(inode));
}
+
return 0;
}
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ebf4ac0055dd..dd764f9c64b9 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -221,7 +221,10 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
if (err && err != -ERESTARTSYS)
return err;
- wait_for_completion_killable(&req->r_safe_completion);
+ err = wait_for_completion_killable(&req->r_safe_completion);
+ if (err)
+ return err;
+
return 0;
}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 230e0c3f341f..1740047aef0f 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -979,14 +979,15 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
if (mds >= mdsc->max_sessions) {
int newmax = 1 << get_count_order(mds + 1);
struct ceph_mds_session **sa;
+ size_t ptr_size = sizeof(struct ceph_mds_session *);
doutc(cl, "realloc to %d\n", newmax);
- sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
+ sa = kcalloc(newmax, ptr_size, GFP_NOFS);
if (!sa)
goto fail_realloc;
if (mdsc->sessions) {
memcpy(sa, mdsc->sessions,
- mdsc->max_sessions * sizeof(void *));
+ mdsc->max_sessions * ptr_size);
kfree(mdsc->sessions);
}
mdsc->sessions = sa;
@@ -2221,7 +2222,7 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg)
int count;
dput(dentry);
d_prune_aliases(inode);
- count = atomic_read(&inode->i_count);
+ count = icount_read(inode);
if (count == 1)
(*remaining)--;
doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n",
@@ -2532,6 +2533,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
size_t size = sizeof(struct ceph_mds_reply_dir_entry);
unsigned int num_entries;
+ u64 bytes_count;
int order;
spin_lock(&ci->i_ceph_lock);
@@ -2540,7 +2542,11 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
num_entries = max(num_entries, 1U);
num_entries = min(num_entries, opt->max_readdir);
- order = get_order(size * num_entries);
+ bytes_count = (u64)size * num_entries;
+ if (unlikely(bytes_count > ULONG_MAX))
+ bytes_count = ULONG_MAX;
+
+ order = get_order((unsigned long)bytes_count);
while (order >= 0) {
rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
__GFP_NOWARN |
@@ -2550,7 +2556,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
break;
order--;
}
- if (!rinfo->dir_entries)
+ if (!rinfo->dir_entries || unlikely(order < 0))
return -ENOMEM;
num_entries = (PAGE_SIZE << order) / size;
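
The widening matters on 32-bit builds, where get_order() takes an unsigned long and the old `size * num_entries` was a 32-bit product that could wrap before get_order() ever saw it. The guarded computation, restated with its intent spelled out:

	u64 bytes_count = (u64)size * num_entries;	/* exact: cannot wrap in 64 bits */
	if (bytes_count > ULONG_MAX)			/* only reachable on 32-bit */
		bytes_count = ULONG_MAX;		/* forces the alloc loop to fail cleanly */
	order = get_order((unsigned long)bytes_count);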
@@ -2681,8 +2687,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
* ceph_mdsc_build_path - build a path string to a given dentry
* @mdsc: mds client
* @dentry: dentry to which path should be built
- * @plen: returned length of string
- * @pbase: returned base inode number
+ * @path_info: output path, length, base ino+snap, and freepath ownership flag
* @for_wire: is this path going to be sent to the MDS?
*
* Build a string that represents the path to the dentry. This is mostly called
@@ -2700,7 +2705,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
* foo/.snap/bar -> foo//bar
*/
char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
- int *plen, u64 *pbase, int for_wire)
+ struct ceph_path_info *path_info, int for_wire)
{
struct ceph_client *cl = mdsc->fsc->client;
struct dentry *cur;
@@ -2766,8 +2771,8 @@ retry:
}
if (fscrypt_has_encryption_key(d_inode(parent))) {
- len = ceph_encode_encrypted_fname(d_inode(parent),
- cur, buf);
+ len = ceph_encode_encrypted_dname(d_inode(parent),
+ buf, len);
if (len < 0) {
dput(parent);
dput(cur);
@@ -2810,16 +2815,28 @@ retry:
return ERR_PTR(-ENAMETOOLONG);
}
- *pbase = base;
- *plen = PATH_MAX - 1 - pos;
+ /* Initialize the output structure */
+ memset(path_info, 0, sizeof(*path_info));
+
+ path_info->vino.ino = base;
+ path_info->pathlen = PATH_MAX - 1 - pos;
+ path_info->path = path + pos;
+ path_info->freepath = true;
+
+ /* Set snap from dentry if available */
+ if (d_inode(dentry))
+ path_info->vino.snap = ceph_snap(d_inode(dentry));
+ else
+ path_info->vino.snap = CEPH_NOSNAP;
+
doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry),
- base, *plen, path + pos);
+ base, PATH_MAX - 1 - pos, path + pos);
return path + pos;
}
static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
- struct inode *dir, const char **ppath, int *ppathlen,
- u64 *pino, bool *pfreepath, bool parent_locked)
+ struct inode *dir, struct ceph_path_info *path_info,
+ bool parent_locked)
{
char *path;
@@ -2828,41 +2845,47 @@ static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry
dir = d_inode_rcu(dentry->d_parent);
if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP &&
!IS_ENCRYPTED(dir)) {
- *pino = ceph_ino(dir);
+ path_info->vino.ino = ceph_ino(dir);
+ path_info->vino.snap = ceph_snap(dir);
rcu_read_unlock();
- *ppath = dentry->d_name.name;
- *ppathlen = dentry->d_name.len;
+ path_info->path = dentry->d_name.name;
+ path_info->pathlen = dentry->d_name.len;
+ path_info->freepath = false;
return 0;
}
rcu_read_unlock();
- path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
+ path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
if (IS_ERR(path))
return PTR_ERR(path);
- *ppath = path;
- *pfreepath = true;
+ /*
+ * ceph_mdsc_build_path already fills path_info, including snap handling.
+ */
return 0;
}
-static int build_inode_path(struct inode *inode,
- const char **ppath, int *ppathlen, u64 *pino,
- bool *pfreepath)
+static int build_inode_path(struct inode *inode, struct ceph_path_info *path_info)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct dentry *dentry;
char *path;
if (ceph_snap(inode) == CEPH_NOSNAP) {
- *pino = ceph_ino(inode);
- *ppathlen = 0;
+ path_info->vino.ino = ceph_ino(inode);
+ path_info->vino.snap = ceph_snap(inode);
+ path_info->pathlen = 0;
+ path_info->freepath = false;
return 0;
}
dentry = d_find_alias(inode);
- path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
+ path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
dput(dentry);
if (IS_ERR(path))
return PTR_ERR(path);
- *ppath = path;
- *pfreepath = true;
+ /*
+ * ceph_mdsc_build_path already fills path_info, including snap from dentry.
+ * Override with inode's snap since that's what this function is for.
+ */
+ path_info->vino.snap = ceph_snap(inode);
return 0;
}
@@ -2872,26 +2895,32 @@ static int build_inode_path(struct inode *inode,
*/
static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode,
struct dentry *rdentry, struct inode *rdiri,
- const char *rpath, u64 rino, const char **ppath,
- int *pathlen, u64 *ino, bool *freepath,
+ const char *rpath, u64 rino,
+ struct ceph_path_info *path_info,
bool parent_locked)
{
struct ceph_client *cl = mdsc->fsc->client;
int r = 0;
+ /* Initialize the output structure */
+ memset(path_info, 0, sizeof(*path_info));
+
if (rinode) {
- r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
+ r = build_inode_path(rinode, path_info);
doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
ceph_snap(rinode));
} else if (rdentry) {
- r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino,
- freepath, parent_locked);
- doutc(cl, " dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath);
+ r = build_dentry_path(mdsc, rdentry, rdiri, path_info, parent_locked);
+ doutc(cl, " dentry %p %llx/%.*s\n", rdentry, path_info->vino.ino,
+ path_info->pathlen, path_info->path);
} else if (rpath || rino) {
- *ino = rino;
- *ppath = rpath;
- *pathlen = rpath ? strlen(rpath) : 0;
- doutc(cl, " path %.*s\n", *pathlen, rpath);
+ path_info->vino.ino = rino;
+ path_info->vino.snap = CEPH_NOSNAP;
+ path_info->path = rpath;
+ path_info->pathlen = rpath ? strlen(rpath) : 0;
+ path_info->freepath = false;
+
+ doutc(cl, " path %.*s\n", path_info->pathlen, rpath);
}
return r;
@@ -2968,11 +2997,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
struct ceph_mds_request_head_legacy *lhead;
- const char *path1 = NULL;
- const char *path2 = NULL;
- u64 ino1 = 0, ino2 = 0;
- int pathlen1 = 0, pathlen2 = 0;
- bool freepath1 = false, freepath2 = false;
+ struct ceph_path_info path_info1 = {0};
+ struct ceph_path_info path_info2 = {0};
struct dentry *old_dentry = NULL;
int len;
u16 releases;
@@ -2982,25 +3008,49 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
u16 request_head_version = mds_supported_head_version(session);
kuid_t caller_fsuid = req->r_cred->fsuid;
kgid_t caller_fsgid = req->r_cred->fsgid;
+ bool parent_locked = test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
- req->r_parent, req->r_path1, req->r_ino1.ino,
- &path1, &pathlen1, &ino1, &freepath1,
- test_bit(CEPH_MDS_R_PARENT_LOCKED,
- &req->r_req_flags));
+ req->r_parent, req->r_path1, req->r_ino1.ino,
+ &path_info1, parent_locked);
if (ret < 0) {
msg = ERR_PTR(ret);
goto out;
}
+ /*
+ * When the parent directory's i_rwsem is *not* locked, req->r_parent may
+ * have become stale (e.g. after a concurrent rename) between the time the
+ * dentry was looked up and now. If we detect that the stored r_parent
+ * does not match the inode number we just encoded for the request, switch
+ * to the correct inode so that the MDS receives a valid parent reference.
+ */
+ if (!parent_locked && req->r_parent && path_info1.vino.ino &&
+ ceph_ino(req->r_parent) != path_info1.vino.ino) {
+ struct inode *old_parent = req->r_parent;
+ struct inode *correct_dir = ceph_get_inode(mdsc->fsc->sb, path_info1.vino, NULL);
+ if (!IS_ERR(correct_dir)) {
+ WARN_ONCE(1, "ceph: r_parent mismatch (had %llx wanted %llx) - updating\n",
+ ceph_ino(old_parent), path_info1.vino.ino);
+ /*
+ * Transfer CEPH_CAP_PIN from the old parent to the new one.
+ * The pin was taken earlier in ceph_mdsc_submit_request().
+ */
+ ceph_put_cap_refs(ceph_inode(old_parent), CEPH_CAP_PIN);
+ iput(old_parent);
+ req->r_parent = correct_dir;
+ ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
+ }
+ }
+
/* If r_old_dentry is set, then assume that its parent is locked */
if (req->r_old_dentry &&
!(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
old_dentry = req->r_old_dentry;
ret = set_request_path_attr(mdsc, NULL, old_dentry,
- req->r_old_dentry_dir,
- req->r_path2, req->r_ino2.ino,
- &path2, &pathlen2, &ino2, &freepath2, true);
+ req->r_old_dentry_dir,
+ req->r_path2, req->r_ino2.ino,
+ &path_info2, true);
if (ret < 0) {
msg = ERR_PTR(ret);
goto out_free1;
@@ -3031,7 +3081,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
/* filepaths */
len += 2 * (1 + sizeof(u32) + sizeof(u64));
- len += pathlen1 + pathlen2;
+ len += path_info1.pathlen + path_info2.pathlen;
/* cap releases */
len += sizeof(struct ceph_mds_request_release) *
@@ -3039,9 +3089,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
!!req->r_old_inode_drop + !!req->r_old_dentry_drop);
if (req->r_dentry_drop)
- len += pathlen1;
+ len += path_info1.pathlen;
if (req->r_old_dentry_drop)
- len += pathlen2;
+ len += path_info2.pathlen;
/* MClientRequest tail */
@@ -3154,8 +3204,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
lhead->ino = cpu_to_le64(req->r_deleg_ino);
lhead->args = req->r_args;
- ceph_encode_filepath(&p, end, ino1, path1);
- ceph_encode_filepath(&p, end, ino2, path2);
+ ceph_encode_filepath(&p, end, path_info1.vino.ino, path_info1.path);
+ ceph_encode_filepath(&p, end, path_info2.vino.ino, path_info2.path);
/* make note of release offset, in case we need to replay */
req->r_request_release_offset = p - msg->front.iov_base;
@@ -3218,11 +3268,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
msg->hdr.data_off = cpu_to_le16(0);
out_free2:
- if (freepath2)
- ceph_mdsc_free_path((char *)path2, pathlen2);
+ ceph_mdsc_free_path_info(&path_info2);
out_free1:
- if (freepath1)
- ceph_mdsc_free_path((char *)path1, pathlen1);
+ ceph_mdsc_free_path_info(&path_info1);
out:
return msg;
out_err:
@@ -4579,24 +4627,20 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
struct ceph_pagelist *pagelist = recon_state->pagelist;
struct dentry *dentry;
struct ceph_cap *cap;
- char *path;
- int pathlen = 0, err;
- u64 pathbase;
+ struct ceph_path_info path_info = {0};
+ int err;
u64 snap_follows;
dentry = d_find_primary(inode);
if (dentry) {
/* set pathbase to parent dir when msg_version >= 2 */
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase,
+ char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info,
recon_state->msg_version >= 2);
dput(dentry);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out_err;
}
- } else {
- path = NULL;
- pathbase = 0;
}
spin_lock(&ci->i_ceph_lock);
@@ -4629,7 +4673,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v2.issued = cpu_to_le32(cap->issued);
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v2.pathbase = cpu_to_le64(pathbase);
+ rec.v2.pathbase = cpu_to_le64(path_info.vino.ino);
rec.v2.flock_len = (__force __le32)
((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
} else {
@@ -4644,7 +4688,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
ts = inode_get_atime(inode);
ceph_encode_timespec64(&rec.v1.atime, &ts);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v1.pathbase = cpu_to_le64(pathbase);
+ rec.v1.pathbase = cpu_to_le64(path_info.vino.ino);
}
if (list_empty(&ci->i_cap_snaps)) {
@@ -4706,7 +4750,7 @@ encode_again:
sizeof(struct ceph_filelock);
rec.v2.flock_len = cpu_to_le32(struct_len);
- struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
+ struct_len += sizeof(u32) + path_info.pathlen + sizeof(rec.v2);
if (struct_v >= 2)
struct_len += sizeof(u64); /* snap_follows */
@@ -4730,7 +4774,7 @@ encode_again:
ceph_pagelist_encode_8(pagelist, 1);
ceph_pagelist_encode_32(pagelist, struct_len);
}
- ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
ceph_locks_to_pagelist(flocks, pagelist,
num_fcntl_locks, num_flock_locks);
@@ -4741,17 +4785,17 @@ out_freeflocks:
} else {
err = ceph_pagelist_reserve(pagelist,
sizeof(u64) + sizeof(u32) +
- pathlen + sizeof(rec.v1));
+ path_info.pathlen + sizeof(rec.v1));
if (err)
goto out_err;
ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
- ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
}
out_err:
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
if (!err)
recon_state->nr_caps++;
return err;
@@ -5611,11 +5655,19 @@ static int ceph_mds_auth_match(struct ceph_mds_client *mdsc,
u32 caller_uid = from_kuid(&init_user_ns, cred->fsuid);
u32 caller_gid = from_kgid(&init_user_ns, cred->fsgid);
struct ceph_client *cl = mdsc->fsc->client;
+ const char *fs_name = mdsc->fsc->mount_options->mds_namespace;
const char *spath = mdsc->fsc->mount_options->server_path;
bool gid_matched = false;
u32 gid, tlen, len;
int i, j;
+ doutc(cl, "fsname check fs_name=%s match.fs_name=%s\n",
+ fs_name, auth->match.fs_name ? auth->match.fs_name : "");
+ if (auth->match.fs_name && strcmp(auth->match.fs_name, fs_name)) {
+ /* fsname mismatch, try next one */
+ return 0;
+ }
+
doutc(cl, "match.uid %lld\n", auth->match.uid);
if (auth->match.uid != MDS_AUTH_UID_ANY) {
if (auth->match.uid != caller_uid)
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 3e2a6fa7c19a..0428a5eaf28c 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -617,14 +617,24 @@ extern int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath,
extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
-static inline void ceph_mdsc_free_path(char *path, int len)
+/*
+ * Structure to group path-related output parameters for build_*_path functions
+ */
+struct ceph_path_info {
+ const char *path;
+ int pathlen;
+ struct ceph_vino vino;
+ bool freepath;
+};
+
+static inline void ceph_mdsc_free_path_info(const struct ceph_path_info *path_info)
{
- if (!IS_ERR_OR_NULL(path))
- __putname(path - (PATH_MAX - 1 - len));
+ if (path_info && path_info->freepath && !IS_ERR_OR_NULL(path_info->path))
+ __putname((char *)path_info->path - (PATH_MAX - 1 - path_info->pathlen));
}
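
The pointer arithmetic in the free helper mirrors how ceph_mdsc_build_path() lays out its result: the name is assembled right-to-left at the tail of a PATH_MAX __getname() buffer and the returned pointer is buf + pos, so the buffer start can always be recomputed from the length:

	/*
	 * buf                  buf + pos                     buf + PATH_MAX - 1
	 *  |<--- unused head --->|<-------- path text ------->|'\0'
	 *
	 * pathlen == PATH_MAX - 1 - pos, hence
	 * buf == path - (PATH_MAX - 1 - pathlen), which is what __putname() frees.
	 */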
extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc,
- struct dentry *dentry, int *plen, u64 *base,
+ struct dentry *dentry, struct ceph_path_info *path_info,
int for_wire);
extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 8109aba66e02..2c7b151a7c95 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -353,10 +353,22 @@ struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p,
__decode_and_drop_type(p, end, u8, bad_ext);
}
if (mdsmap_ev >= 8) {
+ u32 fsname_len;
/* enabled */
ceph_decode_8_safe(p, end, m->m_enabled, bad_ext);
/* fs_name */
- ceph_decode_skip_string(p, end, bad_ext);
+ ceph_decode_32_safe(p, end, fsname_len, bad_ext);
+
+ /* validate fsname against mds_namespace */
+ if (!namespace_equals(mdsc->fsc->mount_options, *p,
+ fsname_len)) {
+ pr_warn_client(cl, "fsname %*pE doesn't match mds_namespace %s\n",
+ (int)fsname_len, (char *)*p,
+ mdsc->fsc->mount_options->mds_namespace);
+ goto bad;
+ }
+ /* skip fsname after validation */
+ ceph_decode_skip_n(p, end, fsname_len, bad);
}
/* damaged */
if (mdsmap_ev >= 9) {
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f3951253e393..f6bf24b5c683 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -246,20 +246,6 @@ static void canonicalize_path(char *path)
path[j] = '\0';
}
-/*
- * Check if the mds namespace in ceph_mount_options matches
- * the passed in namespace string. First time match (when
- * ->mds_namespace is NULL) is treated specially, since
- * ->mds_namespace needs to be initialized by the caller.
- */
-static int namespace_equals(struct ceph_mount_options *fsopt,
- const char *namespace, size_t len)
-{
- return !(fsopt->mds_namespace &&
- (strlen(fsopt->mds_namespace) != len ||
- strncmp(fsopt->mds_namespace, namespace, len)));
-}
-
static int ceph_parse_old_source(const char *dev_name, const char *dev_name_end,
struct fs_context *fc)
{
@@ -862,7 +848,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
fsc->inode_wq = alloc_workqueue("ceph-inode", WQ_UNBOUND, 0);
if (!fsc->inode_wq)
goto fail_client;
- fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1);
+ fsc->cap_wq = alloc_workqueue("ceph-cap", WQ_PERCPU, 1);
if (!fsc->cap_wq)
goto fail_inode_wq;
@@ -1033,8 +1019,7 @@ void ceph_umount_begin(struct super_block *sb)
struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
doutc(fsc->client, "starting forced umount\n");
- if (!fsc)
- return;
+
fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
__ceph_umount_begin(fsc);
}
@@ -1043,7 +1028,7 @@ static const struct super_operations ceph_super_ops = {
.alloc_inode = ceph_alloc_inode,
.free_inode = ceph_free_inode,
.write_inode = ceph_write_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = ceph_evict_inode,
.sync_fs = ceph_sync_fs,
.put_super = ceph_put_super,
@@ -1164,7 +1149,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
const char *path = fsc->mount_options->server_path ?
fsc->mount_options->server_path + 1 : "";
- err = __ceph_open_session(fsc->client, started);
+ err = __ceph_open_session(fsc->client);
if (err < 0)
goto out;
@@ -1220,13 +1205,14 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc)
fsc->max_file_size = 1ULL << 40; /* temp value until we get mdsmap */
s->s_op = &ceph_super_ops;
- s->s_d_op = &ceph_dentry_ops;
+ set_default_d_op(s, &ceph_dentry_ops);
s->s_export_op = &ceph_export_ops;
s->s_time_gran = 1;
s->s_time_min = 0;
s->s_time_max = U32_MAX;
s->s_flags |= SB_NODIRATIME | SB_NOATIME;
+ s->s_magic = CEPH_SUPER_MAGIC;
ceph_fscrypt_set_ops(s);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index bb0db0cc8003..a1f781c46b41 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -104,6 +104,20 @@ struct ceph_mount_options {
struct fscrypt_dummy_policy dummy_enc_policy;
};
+/*
+ * Check if the mds namespace in ceph_mount_options matches
+ * the passed in namespace string. First time match (when
+ * ->mds_namespace is NULL) is treated specially, since
+ * ->mds_namespace needs to be initialized by the caller.
+ */
+static inline int namespace_equals(struct ceph_mount_options *fsopt,
+ const char *namespace, size_t len)
+{
+ return !(fsopt->mds_namespace &&
+ (strlen(fsopt->mds_namespace) != len ||
+ strncmp(fsopt->mds_namespace, namespace, len)));
+}
+
/* mount state */
enum {
CEPH_MOUNT_MOUNTING,
@@ -463,6 +477,7 @@ struct ceph_inode_info {
unsigned long i_work_mask;
#ifdef CONFIG_FS_ENCRYPTION
+ struct fscrypt_inode_info *i_crypt_info;
u32 fscrypt_auth_len;
u32 fscrypt_file_len;
u8 *fscrypt_auth;
@@ -638,7 +653,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
#define CEPH_I_FLUSH_SNAPS (1 << 8) /* need flush snapss */
#define CEPH_I_ERROR_WRITE (1 << 9) /* have seen write errors */
#define CEPH_I_ERROR_FILELOCK (1 << 10) /* have seen file lock errors */
-#define CEPH_I_ODIRECT (1 << 11) /* inode in direct I/O mode */
+#define CEPH_I_ODIRECT_BIT (11) /* inode in direct I/O mode */
+#define CEPH_I_ODIRECT (1 << CEPH_I_ODIRECT_BIT)
#define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */
#define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT)
#define CEPH_I_SHUTDOWN (1 << 13) /* inode is no longer usable */
@@ -1286,7 +1302,7 @@ extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
/* addr.c */
extern const struct address_space_operations ceph_aops;
extern const struct netfs_request_ops ceph_netfs_ops;
-extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
+int ceph_mmap_prepare(struct vm_area_desc *desc);
extern int ceph_uninline_data(struct file *file);
extern int ceph_pool_perm_check(struct inode *inode, int need);
extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 537165db4519..ad1f30bea175 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -249,8 +249,7 @@ static ssize_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
static ssize_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
size_t size)
{
- return ceph_fmt_xattr(val, size, "%lld.%09ld", ci->i_rctime.tv_sec,
- ci->i_rctime.tv_nsec);
+ return ceph_fmt_xattr(val, size, "%ptSp", &ci->i_rctime);
}
/* dir pin */
@@ -307,8 +306,7 @@ static bool ceph_vxattrcb_snap_btime_exists(struct ceph_inode_info *ci)
static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val,
size_t size)
{
- return ceph_fmt_xattr(val, size, "%lld.%09ld", ci->i_snap_btime.tv_sec,
- ci->i_snap_btime.tv_nsec);
+ return ceph_fmt_xattr(val, size, "%ptSp", &ci->i_snap_btime);
}
static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci,
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 62a3d2565c26..70bb0579b40c 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -70,7 +70,7 @@ retry:
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
cii = ITOC(inode);
/* we still need to set i_ino for things like stat(2) */
inode->i_ino = hash;
@@ -148,7 +148,7 @@ struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb)
/* we should never see newly created inodes because we intentionally
* fail in the initialization callback */
- BUG_ON(inode->i_state & I_NEW);
+ BUG_ON(inode_state_read_once(inode) & I_NEW);
return inode;
}
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index ab69d8f0cec2..ca9990017265 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -429,17 +429,9 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
cfi = coda_ftoc(coda_file);
host_file = cfi->cfi_container;
- if (host_file->f_op->iterate_shared) {
- struct inode *host_inode = file_inode(host_file);
- ret = -ENOENT;
- if (!IS_DEADDIR(host_inode)) {
- inode_lock_shared(host_inode);
- ret = host_file->f_op->iterate_shared(host_file, ctx);
- file_accessed(host_file);
- inode_unlock_shared(host_inode);
- }
+ ret = iterate_dir(host_file, ctx);
+ if (ret != -ENOTDIR)
return ret;
- }
/* Venus: we must read Venus dirents from a file */
return coda_venus_readdir(coda_file, ctx);
}
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 148856a582a9..a390b5d21196 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -160,7 +160,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
size_t count;
int ret;
- if (!host_file->f_op->mmap)
+ if (!can_mmap_file(host_file))
return -ENODEV;
if (WARN_ON(coda_file != vma->vm_file))
@@ -199,10 +199,10 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
spin_unlock(&cii->c_lock);
vma->vm_file = get_file(host_file);
- ret = call_mmap(vma->vm_file, vma);
+ ret = vfs_mmap(vma->vm_file, vma);
if (ret) {
- /* if call_mmap fails, our caller will put host_file so we
+ /* if vfs_mmap fails, our caller will put host_file so we
* should drop the reference to the coda_file that we got.
*/
fput(coda_file);
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 6896fce122e1..08450d006016 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -230,7 +230,7 @@ static int coda_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_blocksize_bits = 12;
sb->s_magic = CODA_SUPER_MAGIC;
sb->s_op = &coda_super_operations;
- sb->s_d_op = &coda_dentry_operations;
+ set_default_d_op(sb, &coda_dentry_operations);
sb->s_time_gran = 1;
sb->s_time_min = S64_MIN;
sb->s_time_max = S64_MAX;
diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig
index 272b64456999..1fcd761fe7be 100644
--- a/fs/configfs/Kconfig
+++ b/fs/configfs/Kconfig
@@ -1,7 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
config CONFIGFS_FS
tristate "Userspace-driven configuration filesystem"
- select SYSFS
help
configfs is a RAM-based filesystem that provides the converse
of sysfs's functionality. Where sysfs is a filesystem-based
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 5568cb74b322..ba95f636a5ab 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -67,7 +67,6 @@ static void configfs_d_iput(struct dentry * dentry,
const struct dentry_operations configfs_dentry_ops = {
.d_iput = configfs_d_iput,
- .d_delete = always_delete_dentry,
};
#ifdef CONFIG_LOCKDEP
@@ -401,8 +400,14 @@ static void remove_dir(struct dentry * d)
configfs_remove_dirent(d);
- if (d_really_is_positive(d))
- simple_rmdir(d_inode(parent),d);
+ if (d_really_is_positive(d)) {
+ if (likely(simple_empty(d))) {
+ __simple_rmdir(d_inode(parent),d);
+ dput(d);
+ } else {
+			pr_warn("remove_dir (%pd): attributes remain\n", d);
+ }
+ }
pr_debug(" o %pd removing done (%d)\n", d, d_count(d));
@@ -599,7 +604,7 @@ static void detach_attrs(struct config_item * item)
static int populate_attrs(struct config_item *item)
{
const struct config_item_type *t = item->ci_type;
- struct configfs_group_operations *ops;
+ const struct configfs_group_operations *ops;
struct configfs_attribute *attr;
struct configfs_bin_attribute *bin_attr;
int error = 0;
@@ -619,7 +624,7 @@ static int populate_attrs(struct config_item *item)
break;
}
}
- if (t->ct_bin_attrs) {
+ if (!error && t->ct_bin_attrs) {
for (i = 0; (bin_attr = t->ct_bin_attrs[i]) != NULL; i++) {
if (ops && ops->is_bin_visible && !ops->is_bin_visible(item, bin_attr, i))
continue;
@@ -970,7 +975,7 @@ static void configfs_dump_one(struct configfs_dirent *sd, int level)
{
pr_info("%*s\"%s\":\n", level, " ", configfs_get_name(sd));
-#define type_print(_type) if (sd->s_type & _type) pr_info("%*s %s\n", level, " ", #_type);
+#define type_print(_type) if (sd->s_type & _type) pr_info("%*s %s\n", level, " ", #_type)
type_print(CONFIGFS_ROOT);
type_print(CONFIGFS_DIR);
type_print(CONFIGFS_ITEM_ATTR);
@@ -1602,10 +1607,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
err = -ENOENT;
if (configfs_dirent_is_ready(parent_sd)) {
file->private_data = configfs_new_dirent(parent_sd, NULL, 0, NULL);
- if (IS_ERR(file->private_data))
- err = PTR_ERR(file->private_data);
- else
- err = 0;
+ err = PTR_ERR_OR_ZERO(file->private_data);
}
inode_unlock(d_inode(dentry));
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 0ad32150611e..affe4742bbb5 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -30,7 +30,7 @@ struct configfs_buffer {
size_t count;
loff_t pos;
char * page;
- struct configfs_item_operations * ops;
+ const struct configfs_item_operations *ops;
struct mutex mutex;
int needs_read_fill;
bool read_in_progress;
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 1d2e3a5738d1..bcda3372e141 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -211,7 +211,8 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
dget_dlock(dentry);
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
- simple_unlink(d_inode(parent), dentry);
+ __simple_unlink(d_inode(parent), dentry);
+ dput(dentry);
} else
spin_unlock(&dentry->d_lock);
}
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 254170a82aa3..c378b5cbf87d 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -66,7 +66,7 @@ int config_item_set_name(struct config_item *item, const char *fmt, ...)
name = kvasprintf(GFP_KERNEL, fmt, args);
va_end(args);
if (!name)
- return -EFAULT;
+ return -ENOMEM;
}
/* Free the old name, if necessary. */
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index c2d820063ec4..4929f3431189 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -36,7 +36,7 @@ static void configfs_free_inode(struct inode *inode)
static const struct super_operations configfs_ops = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.free_inode = configfs_free_inode,
};
@@ -92,7 +92,8 @@ static int configfs_fill_super(struct super_block *sb, struct fs_context *fc)
configfs_root_group.cg_item.ci_dentry = root;
root->d_fsdata = &configfs_root;
sb->s_root = root;
- sb->s_d_op = &configfs_dentry_ops; /* the rest get that */
+ set_default_d_op(sb, &configfs_dentry_ops); /* the rest get that */
+ sb->s_d_flags |= DCACHE_DONTCACHE;
return 0;
}
@@ -115,7 +116,7 @@ static struct file_system_type configfs_fs_type = {
.owner = THIS_MODULE,
.name = "configfs",
.init_fs_context = configfs_init_fs_context,
- .kill_sb = kill_litter_super,
+ .kill_sb = kill_anon_super,
};
MODULE_ALIAS_FS("configfs");
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 69133ec1fac2..f3f79c67add5 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -114,26 +114,21 @@ static int create_link(struct config_item *parent_item,
}
-static int get_target(const char *symname, struct path *path,
- struct config_item **target, struct super_block *sb)
+static int get_target(const char *symname, struct config_item **target,
+ struct super_block *sb)
{
+ struct path path __free(path_put) = {};
int ret;
- ret = kern_path(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, path);
- if (!ret) {
- if (path->dentry->d_sb == sb) {
- *target = configfs_get_config_item(path->dentry);
- if (!*target) {
- ret = -ENOENT;
- path_put(path);
- }
- } else {
- ret = -EPERM;
- path_put(path);
- }
- }
-
- return ret;
+ ret = kern_path(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path);
+ if (ret)
+ return ret;
+ if (path.dentry->d_sb != sb)
+ return -EPERM;
+ *target = configfs_get_config_item(path.dentry);
+ if (!*target)
+ return -ENOENT;
+ return 0;
}
@@ -141,7 +136,6 @@ int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *symname)
{
int ret;
- struct path path;
struct configfs_dirent *sd;
struct config_item *parent_item;
struct config_item *target_item = NULL;
@@ -188,7 +182,7 @@ int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
* AV, a thoroughly annoyed bastard.
*/
inode_unlock(dir);
- ret = get_target(symname, &path, &target_item, dentry->d_sb);
+ ret = get_target(symname, &target_item, dentry->d_sb);
inode_lock(dir);
if (ret)
goto out_put;
@@ -210,7 +204,6 @@ int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
}
config_item_put(target_item);
- path_put(&path);
out_put:
config_item_put(parent_item);
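The get_target() rewrite above relies on scope-based cleanup from <linux/cleanup.h>: declaring the path with __free(path_put) runs path_put() automatically when the variable goes out of scope, so each early return needs no explicit cleanup. A minimal sketch of the pattern (function name hypothetical):

    static int lookup_on_sb(const char *name, struct super_block *sb)
    {
            struct path path __free(path_put) = {};
            int ret;

            ret = kern_path(name, LOOKUP_FOLLOW, &path);
            if (ret)
                    return ret;     /* path still zeroed; path_put() is a no-op */
            if (path.dentry->d_sb != sb)
                    return -EPERM;  /* path_put(&path) runs automatically */
            return 0;               /* ...and here as well */
    }

The `= {}` initializer matters: it guarantees the cleanup handler sees a zeroed struct path even when kern_path() fails without filling it in.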
diff --git a/fs/coredump.c b/fs/coredump.c
index c33c177a701b..8feb9c1cf83d 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -43,6 +43,15 @@
#include <linux/timekeeping.h>
#include <linux/sysctl.h>
#include <linux/elf.h>
+#include <linux/pidfs.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <net/af_unix.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <uapi/linux/pidfd.h>
+#include <uapi/linux/un.h>
+#include <uapi/linux/coredump.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -60,6 +69,12 @@ static void free_vma_snapshot(struct coredump_params *cprm);
#define CORE_FILE_NOTE_SIZE_DEFAULT (4*1024*1024)
/* Define a reasonable max cap */
#define CORE_FILE_NOTE_SIZE_MAX (16*1024*1024)
+/*
+ * File descriptor number for the pidfd for the thread-group leader of
+ * the coredumping task installed into the usermode helper's file
+ * descriptor table.
+ */
+#define COREDUMP_PIDFD_NUMBER 3
static int core_uses_pid;
static unsigned int core_pipe_limit;
@@ -67,10 +82,22 @@ static unsigned int core_sort_vma;
static char core_pattern[CORENAME_MAX_SIZE] = "core";
static int core_name_size = CORENAME_MAX_SIZE;
unsigned int core_file_note_size_limit = CORE_FILE_NOTE_SIZE_DEFAULT;
+static atomic_t core_pipe_count = ATOMIC_INIT(0);
+
+enum coredump_type_t {
+ COREDUMP_FILE = 1,
+ COREDUMP_PIPE = 2,
+ COREDUMP_SOCK = 3,
+ COREDUMP_SOCK_REQ = 4,
+};
struct core_name {
char *corename;
int used, size;
+ unsigned int core_pipe_limit;
+ bool core_dumped;
+ enum coredump_type_t core_type;
+ u64 mask;
};
static int expand_corename(struct core_name *cn, int size)
@@ -201,35 +228,104 @@ put_exe_file:
return ret;
}
-/* format_corename will inspect the pattern parameter, and output a
- * name into corename, which must have space for at least
- * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
+/*
+ * coredump_parse will inspect the pattern parameter, and output a name
+ * into corename, which must have space for at least CORENAME_MAX_SIZE
+ * bytes plus one byte for the zero terminator.
*/
-static int format_corename(struct core_name *cn, struct coredump_params *cprm,
+static bool coredump_parse(struct core_name *cn, struct coredump_params *cprm,
size_t **argv, int *argc)
{
const struct cred *cred = current_cred();
const char *pat_ptr = core_pattern;
- int ispipe = (*pat_ptr == '|');
bool was_space = false;
int pid_in_pattern = 0;
int err = 0;
+ cn->mask = COREDUMP_KERNEL;
+ if (core_pipe_limit)
+ cn->mask |= COREDUMP_WAIT;
cn->used = 0;
cn->corename = NULL;
+ cn->core_pipe_limit = 0;
+ cn->core_dumped = false;
+ if (*pat_ptr == '|')
+ cn->core_type = COREDUMP_PIPE;
+ else if (*pat_ptr == '@')
+ cn->core_type = COREDUMP_SOCK;
+ else
+ cn->core_type = COREDUMP_FILE;
if (expand_corename(cn, core_name_size))
- return -ENOMEM;
+ return false;
cn->corename[0] = '\0';
- if (ispipe) {
+ switch (cn->core_type) {
+ case COREDUMP_PIPE: {
int argvs = sizeof(core_pattern) / 2;
(*argv) = kmalloc_array(argvs, sizeof(**argv), GFP_KERNEL);
if (!(*argv))
- return -ENOMEM;
+ return false;
(*argv)[(*argc)++] = 0;
++pat_ptr;
if (!(*pat_ptr))
- return -ENOMEM;
+ return false;
+ break;
+ }
+ case COREDUMP_SOCK: {
+ /* skip the @ */
+ pat_ptr++;
+ if (!(*pat_ptr))
+ return false;
+ if (*pat_ptr == '@') {
+ pat_ptr++;
+ if (!(*pat_ptr))
+ return false;
+
+ cn->core_type = COREDUMP_SOCK_REQ;
+ }
+
+ err = cn_printf(cn, "%s", pat_ptr);
+ if (err)
+ return false;
+
+ /* Require absolute paths. */
+ if (cn->corename[0] != '/')
+ return false;
+
+ /*
+		 * Ensure we can use spaces to indicate additional
+ * parameters in the future.
+ */
+ if (strchr(cn->corename, ' ')) {
+			coredump_report_failure("Coredump socket %s may not contain spaces", cn->corename);
+ return false;
+ }
+
+ /* Must not contain ".." in the path. */
+ if (name_contains_dotdot(cn->corename)) {
+			coredump_report_failure("Coredump socket %s may not contain '..'", cn->corename);
+ return false;
+ }
+
+ if (strlen(cn->corename) >= UNIX_PATH_MAX) {
+ coredump_report_failure("Coredump socket path %s too long", cn->corename);
+ return false;
+ }
+
+ /*
+ * Currently no need to parse any other options.
+		 * Relevant information can be retrieved by the receiver
+		 * from the peer pidfd obtained via SO_PEERPIDFD, or via
+		 * /proc/<pid>, using that pidfd to guard against pid
+		 * recycling when opening /proc/<pid>.
+ */
+ return true;
+ }
+ case COREDUMP_FILE:
+ break;
+ default:
+ WARN_ON_ONCE(true);
+ return false;
}
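For the socket cases above, core_pattern accepts two prefixes: a single '@' selects a plain AF_UNIX coredump socket (COREDUMP_SOCK), and '@@' selects the request/ack protocol (COREDUMP_SOCK_REQ); in both forms the remainder must be an absolute path with no spaces or "..". A hedged userspace-style sketch of the same classification (enum and function names hypothetical):

    enum core_type { CORE_FILE, CORE_PIPE, CORE_SOCK, CORE_SOCK_REQ, CORE_INVALID };

    static enum core_type classify_core_pattern(const char *pat)
    {
            if (pat[0] == '|')
                    return CORE_PIPE;
            if (pat[0] != '@')
                    return CORE_FILE;
            if (pat[1] == '@')
                    return pat[2] == '/' ? CORE_SOCK_REQ : CORE_INVALID;
            return pat[1] == '/' ? CORE_SOCK : CORE_INVALID;
    }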
/* Repeat as long as we have more pattern to process and more output
@@ -239,7 +335,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
* Split on spaces before doing template expansion so that
* %e and %E don't get split if they have spaces in them
*/
- if (ispipe) {
+ if (cn->core_type == COREDUMP_PIPE) {
if (isspace(*pat_ptr)) {
if (cn->used != 0)
was_space = true;
@@ -249,7 +345,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
was_space = false;
err = cn_printf(cn, "%c", '\0');
if (err)
- return err;
+ return false;
(*argv)[(*argc)++] = cn->used;
}
}
@@ -339,6 +435,27 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
case 'C':
err = cn_printf(cn, "%d", cprm->cpu);
break;
+ /* pidfd number */
+ case 'F': {
+ /*
+ * Installing a pidfd only makes sense if
+ * we actually spawn a usermode helper.
+ */
+ if (cn->core_type != COREDUMP_PIPE)
+ break;
+
+ /*
+ * Note that we'll install a pidfd for the
+ * thread-group leader. We know that task
+ * linkage hasn't been removed yet and even if
+ * this @current isn't the actual thread-group
+ * leader we know that the thread-group leader
+ * cannot be reaped until @current has exited.
+ */
+ cprm->pid = task_tgid(current);
+ err = cn_printf(cn, "%d", COREDUMP_PIDFD_NUMBER);
+ break;
+ }
default:
break;
}
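With the new %F specifier, a pipe helper spawned for a pattern such as "|/usr/local/bin/dumper %F" (path hypothetical) receives the core image on stdin and a pidfd for the crashing task's thread-group leader preinstalled at fd 3 (COREDUMP_PIDFD_NUMBER). A hedged sketch of the helper side:

    #include <poll.h>
    #include <unistd.h>

    int main(void)
    {
            /* COREDUMP_PIDFD_NUMBER, the value substituted for %F */
            struct pollfd pfd = { .fd = 3, .events = POLLIN };

            /* ... consume the core image from fd 0 here ... */

            /* A pidfd becomes readable once the task has exited, so the
             * helper can wait on the crashing process without racing
             * against pid reuse: */
            poll(&pfd, 1, -1);
            close(pfd.fd);
            return 0;
    }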
@@ -346,7 +463,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
}
if (err)
- return err;
+ return false;
}
out:
@@ -355,12 +472,10 @@ out:
* If core_pattern does not include a %p (as is the default)
* and core_uses_pid is set, then .%pid will be appended to
* the filename. Do not do this for piped commands. */
- if (!ispipe && !pid_in_pattern && core_uses_pid) {
- err = cn_printf(cn, ".%d", task_tgid_vnr(current));
- if (err)
- return err;
- }
- return ispipe;
+ if (cn->core_type == COREDUMP_FILE && !pid_in_pattern && core_uses_pid)
+ return cn_printf(cn, ".%d", task_tgid_vnr(current)) == 0;
+
+ return true;
}
static int zap_process(struct signal_struct *signal, int exit_code)
@@ -493,7 +608,7 @@ static void wait_for_dump_helpers(struct file *file)
}
/*
- * umh_pipe_setup
+ * umh_coredump_setup
* helper function to customize the process used
* to collect the core in userspace. Specifically
* it sets up a pipe and installs it as fd 0 (stdin)
@@ -503,11 +618,34 @@ static void wait_for_dump_helpers(struct file *file)
* is a special value that we use to trap recursive
* core dumps
*/
-static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+static int umh_coredump_setup(struct subprocess_info *info, struct cred *new)
{
struct file *files[2];
struct coredump_params *cp = (struct coredump_params *)info->data;
- int err = create_pipe_files(files, 0);
+ int err;
+
+ if (cp->pid) {
+ struct file *pidfs_file __free(fput) = NULL;
+
+ pidfs_file = pidfs_alloc_file(cp->pid, 0);
+ if (IS_ERR(pidfs_file))
+ return PTR_ERR(pidfs_file);
+
+ pidfs_coredump(cp);
+
+ /*
+	 * Usermode helpers are children of either
+ * system_dfl_wq or of kthreadd. So we know that
+ * we're starting off with a clean file descriptor
+ * table. So we should always be able to use
+ * COREDUMP_PIDFD_NUMBER as our file descriptor value.
+ */
+ err = replace_fd(COREDUMP_PIDFD_NUMBER, pidfs_file, 0);
+ if (err < 0)
+ return err;
+ }
+
+ err = create_pipe_files(files, 0);
if (err)
return err;
@@ -515,277 +653,552 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
err = replace_fd(0, files[0], 0);
fput(files[0]);
+ if (err < 0)
+ return err;
+
/* and disallow core files too */
current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
- return err;
+ return 0;
}
-void do_coredump(const kernel_siginfo_t *siginfo)
+#ifdef CONFIG_UNIX
+static bool coredump_sock_connect(struct core_name *cn, struct coredump_params *cprm)
{
- struct core_state core_state;
- struct core_name cn;
- struct mm_struct *mm = current->mm;
- struct linux_binfmt * binfmt;
- const struct cred *old_cred;
- struct cred *cred;
- int retval = 0;
- int ispipe;
- size_t *argv = NULL;
- int argc = 0;
- /* require nonrelative corefile path and be extra careful */
- bool need_suid_safe = false;
- bool core_dumped = false;
- static atomic_t core_dump_count = ATOMIC_INIT(0);
- struct coredump_params cprm = {
- .siginfo = siginfo,
- .limit = rlimit(RLIMIT_CORE),
- /*
- * We must use the same mm->flags while dumping core to avoid
- * inconsistency of bit flags, since this flag is not protected
- * by any locks.
- */
- .mm_flags = mm->flags,
- .vma_meta = NULL,
- .cpu = raw_smp_processor_id(),
+ struct file *file __free(fput) = NULL;
+ struct sockaddr_un addr = {
+ .sun_family = AF_UNIX,
};
+ ssize_t addr_len;
+ int retval;
+ struct socket *socket;
- audit_core_dumps(siginfo->si_signo);
+ addr_len = strscpy(addr.sun_path, cn->corename);
+ if (addr_len < 0)
+ return false;
+ addr_len += offsetof(struct sockaddr_un, sun_path) + 1;
+
+ /*
+ * It is possible that the userspace process which is supposed
+ * to handle the coredump and is listening on the AF_UNIX socket
+	 * itself coredumps. Userspace should just mark itself non-dumpable.
+ */
- binfmt = mm->binfmt;
- if (!binfmt || !binfmt->core_dump)
- goto fail;
- if (!__get_dumpable(cprm.mm_flags))
- goto fail;
+ retval = sock_create_kern(&init_net, AF_UNIX, SOCK_STREAM, 0, &socket);
+ if (retval < 0)
+ return false;
+
+ file = sock_alloc_file(socket, 0, NULL);
+ if (IS_ERR(file))
+ return false;
- cred = prepare_creds();
- if (!cred)
- goto fail;
/*
- * We cannot trust fsuid as being the "true" uid of the process
- * nor do we know its entire history. We only know it was tainted
- * so we dump it as root in mode 2, and only into a controlled
- * environment (pipe handler or fully qualified path).
+ * Set the thread-group leader pid which is used for the peer
+ * credentials during connect() below. Then immediately register
+ * it in pidfs...
+ */
+ cprm->pid = task_tgid(current);
+ retval = pidfs_register_pid(cprm->pid);
+ if (retval)
+ return false;
+
+ /*
+ * ... and set the coredump information so userspace has it
+ * available after connect()...
*/
- if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {
- /* Setuid core dump mode */
- cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */
- need_suid_safe = true;
+ pidfs_coredump(cprm);
+
+ retval = kernel_connect(socket, (struct sockaddr_unsized *)(&addr), addr_len,
+ O_NONBLOCK | SOCK_COREDUMP);
+
+ if (retval) {
+ if (retval == -EAGAIN)
+ coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path);
+ else
+ coredump_report_failure("Coredump socket connection %s failed %d", addr.sun_path, retval);
+ return false;
}
- retval = coredump_wait(siginfo->si_signo, &core_state);
- if (retval < 0)
- goto fail_creds;
+ /* ... and validate that @sk_peer_pid matches @cprm.pid. */
+ if (WARN_ON_ONCE(unix_peer(socket->sk)->sk_peer_pid != cprm->pid))
+ return false;
- old_cred = override_creds(cred);
+ cprm->limit = RLIM_INFINITY;
+ cprm->file = no_free_ptr(file);
- ispipe = format_corename(&cn, &cprm, &argv, &argc);
+ return true;
+}
- if (ispipe) {
- int argi;
- int dump_count;
- char **helper_argv;
- struct subprocess_info *sub_info;
+static inline bool coredump_sock_recv(struct file *file, struct coredump_ack *ack, size_t size, int flags)
+{
+ struct msghdr msg = {};
+ struct kvec iov = { .iov_base = ack, .iov_len = size };
+ ssize_t ret;
- if (ispipe < 0) {
- coredump_report_failure("format_corename failed, aborting core");
- goto fail_unlock;
- }
+ memset(ack, 0, size);
+ ret = kernel_recvmsg(sock_from_file(file), &msg, &iov, 1, size, flags);
+ return ret == size;
+}
- if (cprm.limit == 1) {
- /* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
- *
- * Normally core limits are irrelevant to pipes, since
- * we're not writing to the file system, but we use
- * cprm.limit of 1 here as a special value, this is a
- * consistent way to catch recursive crashes.
- * We can still crash if the core_pattern binary sets
- * RLIM_CORE = !1, but it runs as root, and can do
- * lots of stupid things.
- *
- * Note that we use task_tgid_vnr here to grab the pid
- * of the process group leader. That way we get the
- * right pid if a thread in a multi-threaded
- * core_pattern process dies.
- */
- coredump_report_failure("RLIMIT_CORE is set to 1, aborting core");
- goto fail_unlock;
- }
- cprm.limit = RLIM_INFINITY;
+static inline bool coredump_sock_send(struct file *file, struct coredump_req *req)
+{
+ struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };
+ struct kvec iov = { .iov_base = req, .iov_len = sizeof(*req) };
+ ssize_t ret;
- dump_count = atomic_inc_return(&core_dump_count);
- if (core_pipe_limit && (core_pipe_limit < dump_count)) {
- coredump_report_failure("over core_pipe_limit, skipping core dump");
- goto fail_dropcount;
- }
+ ret = kernel_sendmsg(sock_from_file(file), &msg, &iov, 1, sizeof(*req));
+ return ret == sizeof(*req);
+}
- helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv),
- GFP_KERNEL);
- if (!helper_argv) {
- coredump_report_failure("%s failed to allocate memory", __func__);
- goto fail_dropcount;
- }
- for (argi = 0; argi < argc; argi++)
- helper_argv[argi] = cn.corename + argv[argi];
- helper_argv[argi] = NULL;
-
- retval = -ENOMEM;
- sub_info = call_usermodehelper_setup(helper_argv[0],
- helper_argv, NULL, GFP_KERNEL,
- umh_pipe_setup, NULL, &cprm);
- if (sub_info)
- retval = call_usermodehelper_exec(sub_info,
- UMH_WAIT_EXEC);
-
- kfree(helper_argv);
- if (retval) {
- coredump_report_failure("|%s pipe failed", cn.corename);
- goto close_fail;
- }
- } else {
- struct mnt_idmap *idmap;
- struct inode *inode;
- int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW |
- O_LARGEFILE | O_EXCL;
-
- if (cprm.limit < binfmt->min_coredump)
- goto fail_unlock;
-
- if (need_suid_safe && cn.corename[0] != '/') {
- coredump_report_failure(
- "this process can only dump core to a fully qualified path, skipping core dump");
- goto fail_unlock;
- }
+static_assert(sizeof(enum coredump_mark) == sizeof(__u32));
- /*
- * Unlink the file if it exists unless this is a SUID
- * binary - in that case, we're running around with root
- * privs and don't want to unlink another user's coredump.
- */
- if (!need_suid_safe) {
- /*
- * If it doesn't exist, that's fine. If there's some
- * other problem, we'll catch it at the filp_open().
- */
- do_unlinkat(AT_FDCWD, getname_kernel(cn.corename));
- }
+static inline bool coredump_sock_mark(struct file *file, enum coredump_mark mark)
+{
+ struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };
+ struct kvec iov = { .iov_base = &mark, .iov_len = sizeof(mark) };
+ ssize_t ret;
+
+ ret = kernel_sendmsg(sock_from_file(file), &msg, &iov, 1, sizeof(mark));
+ return ret == sizeof(mark);
+}
+
+static inline void coredump_sock_wait(struct file *file)
+{
+ ssize_t n;
+
+ /*
+ * We use a simple read to wait for the coredump processing to
+ * finish. Either the socket is closed or we get sent unexpected
+ * data. In both cases, we're done.
+ */
+ n = __kernel_read(file, &(char){ 0 }, 1, NULL);
+ if (n > 0)
+ coredump_report_failure("Coredump socket had unexpected data");
+ else if (n < 0)
+ coredump_report_failure("Coredump socket failed");
+}
+
+static inline void coredump_sock_shutdown(struct file *file)
+{
+ struct socket *socket;
+
+ socket = sock_from_file(file);
+ if (!socket)
+ return;
+
+ /* Let userspace know we're done processing the coredump. */
+ kernel_sock_shutdown(socket, SHUT_WR);
+}
+
+static bool coredump_sock_request(struct core_name *cn, struct coredump_params *cprm)
+{
+ struct coredump_req req = {
+ .size = sizeof(struct coredump_req),
+ .mask = COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT,
+ .size_ack = sizeof(struct coredump_ack),
+ };
+ struct coredump_ack ack = {};
+ ssize_t usize;
+
+ if (cn->core_type != COREDUMP_SOCK_REQ)
+ return true;
+ /* Let userspace know what we support. */
+ if (!coredump_sock_send(cprm->file, &req))
+ return false;
+
+ /* Peek the size of the coredump_ack. */
+ if (!coredump_sock_recv(cprm->file, &ack, sizeof(ack.size),
+ MSG_PEEK | MSG_WAITALL))
+ return false;
+
+ /* Refuse unknown coredump_ack sizes. */
+ usize = ack.size;
+ if (usize < COREDUMP_ACK_SIZE_VER0) {
+ coredump_sock_mark(cprm->file, COREDUMP_MARK_MINSIZE);
+ return false;
+ }
+
+ if (usize > sizeof(ack)) {
+ coredump_sock_mark(cprm->file, COREDUMP_MARK_MAXSIZE);
+ return false;
+ }
+
+ /* Now retrieve the coredump_ack. */
+ if (!coredump_sock_recv(cprm->file, &ack, usize, MSG_WAITALL))
+ return false;
+ if (ack.size != usize)
+ return false;
+
+ /* Refuse unknown coredump_ack flags. */
+ if (ack.mask & ~req.mask) {
+ coredump_sock_mark(cprm->file, COREDUMP_MARK_UNSUPPORTED);
+ return false;
+ }
+
+ /* Refuse mutually exclusive options. */
+ if (hweight64(ack.mask & (COREDUMP_USERSPACE | COREDUMP_KERNEL |
+ COREDUMP_REJECT)) != 1) {
+ coredump_sock_mark(cprm->file, COREDUMP_MARK_CONFLICTING);
+ return false;
+ }
+
+ if (ack.spare) {
+ coredump_sock_mark(cprm->file, COREDUMP_MARK_UNSUPPORTED);
+ return false;
+ }
+
+ cn->mask = ack.mask;
+ return coredump_sock_mark(cprm->file, COREDUMP_MARK_REQACK);
+}
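A hedged userspace sketch of the other side of this handshake, assuming struct coredump_req / struct coredump_ack and the COREDUMP_* constants from the new <uapi/linux/coredump.h> included above (note the ack's leading field must be its size, matching the MSG_PEEK in coredump_sock_recv()); fd is an accepted connection on the "@@..." listener:

    #include <linux/coredump.h>
    #include <linux/types.h>
    #include <string.h>
    #include <sys/socket.h>

    static int handle_handshake(int fd)
    {
            struct coredump_req req;
            struct coredump_ack ack;
            __u32 mark = 0;

            if (recv(fd, &req, sizeof(req), MSG_WAITALL) != sizeof(req))
                    return -1;

            memset(&ack, 0, sizeof(ack));
            ack.size = sizeof(ack);
            /* Ask the kernel to write the dump and wait for us to finish. */
            ack.mask = COREDUMP_KERNEL | COREDUMP_WAIT;
            if (send(fd, &ack, sizeof(ack), MSG_NOSIGNAL) != sizeof(ack))
                    return -1;

            if (recv(fd, &mark, sizeof(mark), MSG_WAITALL) != sizeof(mark))
                    return -1;
            /* On COREDUMP_MARK_REQACK the core follows on this socket;
             * the kernel shuts down its write side when it is done. */
            return mark == COREDUMP_MARK_REQACK ? 0 : -1;
    }

A robust server would read req.size first and honor forward-compatible sizes; the sketch keeps it minimal.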
+
+static bool coredump_socket(struct core_name *cn, struct coredump_params *cprm)
+{
+ if (!coredump_sock_connect(cn, cprm))
+ return false;
+
+ return coredump_sock_request(cn, cprm);
+}
+#else
+static inline void coredump_sock_wait(struct file *file) { }
+static inline void coredump_sock_shutdown(struct file *file) { }
+static inline bool coredump_socket(struct core_name *cn, struct coredump_params *cprm) { return false; }
+#endif
+
+/* cprm->mm_flags contains a stable snapshot of dumpability flags. */
+static inline bool coredump_force_suid_safe(const struct coredump_params *cprm)
+{
+ /* Require nonrelative corefile path and be extra careful. */
+ return __get_dumpable(cprm->mm_flags) == SUID_DUMP_ROOT;
+}
+
+static bool coredump_file(struct core_name *cn, struct coredump_params *cprm,
+ const struct linux_binfmt *binfmt)
+{
+ struct mnt_idmap *idmap;
+ struct inode *inode;
+ struct file *file __free(fput) = NULL;
+ int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW | O_LARGEFILE | O_EXCL;
+
+ if (cprm->limit < binfmt->min_coredump)
+ return false;
+
+ if (coredump_force_suid_safe(cprm) && cn->corename[0] != '/') {
+ coredump_report_failure("this process can only dump core to a fully qualified path, skipping core dump");
+ return false;
+ }
+
+ /*
+ * Unlink the file if it exists unless this is a SUID
+ * binary - in that case, we're running around with root
+ * privs and don't want to unlink another user's coredump.
+ */
+ if (!coredump_force_suid_safe(cprm)) {
/*
- * There is a race between unlinking and creating the
- * file, but if that causes an EEXIST here, that's
- * fine - another process raced with us while creating
- * the corefile, and the other process won. To userspace,
- * what matters is that at least one of the two processes
- * writes its coredump successfully, not which one.
+ * If it doesn't exist, that's fine. If there's some
+ * other problem, we'll catch it at the filp_open().
*/
- if (need_suid_safe) {
- /*
- * Using user namespaces, normal user tasks can change
- * their current->fs->root to point to arbitrary
- * directories. Since the intention of the "only dump
- * with a fully qualified path" rule is to control where
- * coredumps may be placed using root privileges,
- * current->fs->root must not be used. Instead, use the
- * root directory of init_task.
- */
- struct path root;
-
- task_lock(&init_task);
- get_fs_root(init_task.fs, &root);
- task_unlock(&init_task);
- cprm.file = file_open_root(&root, cn.corename,
- open_flags, 0600);
- path_put(&root);
- } else {
- cprm.file = filp_open(cn.corename, open_flags, 0600);
- }
- if (IS_ERR(cprm.file))
- goto fail_unlock;
-
- inode = file_inode(cprm.file);
- if (inode->i_nlink > 1)
- goto close_fail;
- if (d_unhashed(cprm.file->f_path.dentry))
- goto close_fail;
+ do_unlinkat(AT_FDCWD, getname_kernel(cn->corename));
+ }
+
+ /*
+ * There is a race between unlinking and creating the
+ * file, but if that causes an EEXIST here, that's
+ * fine - another process raced with us while creating
+ * the corefile, and the other process won. To userspace,
+ * what matters is that at least one of the two processes
+ * writes its coredump successfully, not which one.
+ */
+ if (coredump_force_suid_safe(cprm)) {
/*
- * AK: actually i see no reason to not allow this for named
- * pipes etc, but keep the previous behaviour for now.
+ * Using user namespaces, normal user tasks can change
+ * their current->fs->root to point to arbitrary
+ * directories. Since the intention of the "only dump
+ * with a fully qualified path" rule is to control where
+ * coredumps may be placed using root privileges,
+ * current->fs->root must not be used. Instead, use the
+ * root directory of init_task.
*/
- if (!S_ISREG(inode->i_mode))
- goto close_fail;
- /*
- * Don't dump core if the filesystem changed owner or mode
- * of the file during file creation. This is an issue when
- * a process dumps core while its cwd is e.g. on a vfat
- * filesystem.
+ struct path root;
+
+ task_lock(&init_task);
+ get_fs_root(init_task.fs, &root);
+ task_unlock(&init_task);
+ file = file_open_root(&root, cn->corename, open_flags, 0600);
+ path_put(&root);
+ } else {
+ file = filp_open(cn->corename, open_flags, 0600);
+ }
+ if (IS_ERR(file))
+ return false;
+
+ inode = file_inode(file);
+ if (inode->i_nlink > 1)
+ return false;
+ if (d_unhashed(file->f_path.dentry))
+ return false;
+ /*
+	 * AK: actually I see no reason not to allow this for named
+ * pipes etc, but keep the previous behaviour for now.
+ */
+ if (!S_ISREG(inode->i_mode))
+ return false;
+ /*
+ * Don't dump core if the filesystem changed owner or mode
+ * of the file during file creation. This is an issue when
+ * a process dumps core while its cwd is e.g. on a vfat
+ * filesystem.
+ */
+ idmap = file_mnt_idmap(file);
+ if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) {
+ coredump_report_failure("Core dump to %s aborted: cannot preserve file owner", cn->corename);
+ return false;
+ }
+ if ((inode->i_mode & 0677) != 0600) {
+ coredump_report_failure("Core dump to %s aborted: cannot preserve file permissions", cn->corename);
+ return false;
+ }
+ if (!(file->f_mode & FMODE_CAN_WRITE))
+ return false;
+ if (do_truncate(idmap, file->f_path.dentry, 0, 0, file))
+ return false;
+
+ cprm->file = no_free_ptr(file);
+ return true;
+}
+
+static bool coredump_pipe(struct core_name *cn, struct coredump_params *cprm,
+ size_t *argv, int argc)
+{
+ int argi;
+ char **helper_argv __free(kfree) = NULL;
+ struct subprocess_info *sub_info;
+
+ if (cprm->limit == 1) {
+ /* See umh_coredump_setup() which sets RLIMIT_CORE = 1.
+ *
+ * Normally core limits are irrelevant to pipes, since
+ * we're not writing to the file system, but we use
+	 * cprm.limit of 1 here as a special value; this is a
+ * consistent way to catch recursive crashes.
+ * We can still crash if the core_pattern binary sets
+ * RLIM_CORE = !1, but it runs as root, and can do
+ * lots of stupid things.
+ *
+ * Note that we use task_tgid_vnr here to grab the pid
+ * of the process group leader. That way we get the
+ * right pid if a thread in a multi-threaded
+ * core_pattern process dies.
*/
- idmap = file_mnt_idmap(cprm.file);
- if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode),
- current_fsuid())) {
- coredump_report_failure("Core dump to %s aborted: "
- "cannot preserve file owner", cn.corename);
- goto close_fail;
- }
- if ((inode->i_mode & 0677) != 0600) {
- coredump_report_failure("Core dump to %s aborted: "
- "cannot preserve file permissions", cn.corename);
- goto close_fail;
- }
- if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
- goto close_fail;
- if (do_truncate(idmap, cprm.file->f_path.dentry,
- 0, 0, cprm.file))
- goto close_fail;
+ coredump_report_failure("RLIMIT_CORE is set to 1, aborting core");
+ return false;
}
+ cprm->limit = RLIM_INFINITY;
+
+ cn->core_pipe_limit = atomic_inc_return(&core_pipe_count);
+ if (core_pipe_limit && (core_pipe_limit < cn->core_pipe_limit)) {
+ coredump_report_failure("over core_pipe_limit, skipping core dump");
+ return false;
+ }
+
+ helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv), GFP_KERNEL);
+ if (!helper_argv) {
+ coredump_report_failure("%s failed to allocate memory", __func__);
+ return false;
+ }
+ for (argi = 0; argi < argc; argi++)
+ helper_argv[argi] = cn->corename + argv[argi];
+ helper_argv[argi] = NULL;
+
+ sub_info = call_usermodehelper_setup(helper_argv[0], helper_argv, NULL,
+ GFP_KERNEL, umh_coredump_setup,
+ NULL, cprm);
+ if (!sub_info)
+ return false;
+
+ if (call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC)) {
+ coredump_report_failure("|%s pipe failed", cn->corename);
+ return false;
+ }
+
+ /*
+ * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would
+ * have this set to NULL.
+ */
+ if (!cprm->file) {
+ coredump_report_failure("Core dump to |%s disabled", cn->corename);
+ return false;
+ }
+
+ return true;
+}
+
+static bool coredump_write(struct core_name *cn,
+ struct coredump_params *cprm,
+ const struct linux_binfmt *binfmt)
+{
+
+ if (dump_interrupted())
+ return true;
+
+ if (!dump_vma_snapshot(cprm))
+ return false;
+
+ file_start_write(cprm->file);
+ cn->core_dumped = binfmt->core_dump(cprm);
+ /*
+ * Ensures that file size is big enough to contain the current
+	 * file position. This prevents gdb from complaining about
+ * a truncated file if the last "write" to the file was
+ * dump_skip.
+ */
+ if (cprm->to_skip) {
+ cprm->to_skip--;
+ dump_emit(cprm, "", 1);
+ }
+ file_end_write(cprm->file);
+ free_vma_snapshot(cprm);
+ return true;
+}
+
+static void coredump_cleanup(struct core_name *cn, struct coredump_params *cprm)
+{
+ if (cprm->file)
+ filp_close(cprm->file, NULL);
+ if (cn->core_pipe_limit) {
+ VFS_WARN_ON_ONCE(cn->core_type != COREDUMP_PIPE);
+ atomic_dec(&core_pipe_count);
+ }
+ kfree(cn->corename);
+ coredump_finish(cn->core_dumped);
+}
+
+static inline bool coredump_skip(const struct coredump_params *cprm,
+ const struct linux_binfmt *binfmt)
+{
+ if (!binfmt)
+ return true;
+ if (!binfmt->core_dump)
+ return true;
+ if (!__get_dumpable(cprm->mm_flags))
+ return true;
+ return false;
+}
+
+static void do_coredump(struct core_name *cn, struct coredump_params *cprm,
+ size_t **argv, int *argc, const struct linux_binfmt *binfmt)
+{
+ if (!coredump_parse(cn, cprm, argv, argc)) {
+		coredump_report_failure("coredump_parse failed, aborting core");
+ return;
+ }
+
+ switch (cn->core_type) {
+ case COREDUMP_FILE:
+ if (!coredump_file(cn, cprm, binfmt))
+ return;
+ break;
+ case COREDUMP_PIPE:
+ if (!coredump_pipe(cn, cprm, *argv, *argc))
+ return;
+ break;
+ case COREDUMP_SOCK_REQ:
+ fallthrough;
+ case COREDUMP_SOCK:
+ if (!coredump_socket(cn, cprm))
+ return;
+ break;
+ default:
+ WARN_ON_ONCE(true);
+ return;
+ }
+
+ /* Don't even generate the coredump. */
+ if (cn->mask & COREDUMP_REJECT)
+ return;
/* get us an unshared descriptor table; almost always a no-op */
/* The cell spufs coredump code reads the file descriptor tables */
- retval = unshare_files();
- if (retval)
- goto close_fail;
- if (!dump_interrupted()) {
- /*
- * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would
- * have this set to NULL.
- */
- if (!cprm.file) {
- coredump_report_failure("Core dump to |%s disabled", cn.corename);
- goto close_fail;
+ if (unshare_files())
+ return;
+
+ if ((cn->mask & COREDUMP_KERNEL) && !coredump_write(cn, cprm, binfmt))
+ return;
+
+ coredump_sock_shutdown(cprm->file);
+
+ /* Let the parent know that a coredump was generated. */
+ if (cn->mask & COREDUMP_USERSPACE)
+ cn->core_dumped = true;
+
+ /*
+ * When core_pipe_limit is set we wait for the coredump server
+ * or usermodehelper to finish before exiting so it can e.g.,
+ * inspect /proc/<pid>.
+ */
+ if (cn->mask & COREDUMP_WAIT) {
+ switch (cn->core_type) {
+ case COREDUMP_PIPE:
+ wait_for_dump_helpers(cprm->file);
+ break;
+ case COREDUMP_SOCK_REQ:
+ fallthrough;
+ case COREDUMP_SOCK:
+ coredump_sock_wait(cprm->file);
+ break;
+ default:
+ break;
}
- if (!dump_vma_snapshot(&cprm))
- goto close_fail;
+ }
+}
- file_start_write(cprm.file);
- core_dumped = binfmt->core_dump(&cprm);
+void vfs_coredump(const kernel_siginfo_t *siginfo)
+{
+ size_t *argv __free(kfree) = NULL;
+ struct core_state core_state;
+ struct core_name cn;
+ const struct mm_struct *mm = current->mm;
+ const struct linux_binfmt *binfmt = mm->binfmt;
+ int argc = 0;
+ struct coredump_params cprm = {
+ .siginfo = siginfo,
+ .limit = rlimit(RLIMIT_CORE),
/*
- * Ensures that file size is big enough to contain the current
- * file postion. This prevents gdb from complaining about
- * a truncated file if the last "write" to the file was
- * dump_skip.
+ * We must use the same mm->flags while dumping core to avoid
+ * inconsistency of bit flags, since this flag is not protected
+ * by any locks.
+ *
+ * Note that we only care about MMF_DUMP* flags.
*/
- if (cprm.to_skip) {
- cprm.to_skip--;
- dump_emit(&cprm, "", 1);
- }
- file_end_write(cprm.file);
- free_vma_snapshot(&cprm);
- }
- if (ispipe && core_pipe_limit)
- wait_for_dump_helpers(cprm.file);
-close_fail:
- if (cprm.file)
- filp_close(cprm.file, NULL);
-fail_dropcount:
- if (ispipe)
- atomic_dec(&core_dump_count);
-fail_unlock:
- kfree(argv);
- kfree(cn.corename);
- coredump_finish(core_dumped);
- revert_creds(old_cred);
-fail_creds:
- put_cred(cred);
-fail:
+ .mm_flags = __mm_flags_get_dumpable(mm),
+ .vma_meta = NULL,
+ .cpu = raw_smp_processor_id(),
+ };
+
+ audit_core_dumps(siginfo->si_signo);
+
+ if (coredump_skip(&cprm, binfmt))
+ return;
+
+ CLASS(prepare_creds, cred)();
+ if (!cred)
+ return;
+ /*
+ * We cannot trust fsuid as being the "true" uid of the process
+ * nor do we know its entire history. We only know it was tainted
+ * so we dump it as root in mode 2, and only into a controlled
+ * environment (pipe handler or fully qualified path).
+ */
+ if (coredump_force_suid_safe(&cprm))
+ cred->fsuid = GLOBAL_ROOT_UID;
+
+ if (coredump_wait(siginfo->si_signo, &core_state) < 0)
+ return;
+
+ scoped_with_creds(cred)
+ do_coredump(&cn, &cprm, &argv, &argc, binfmt);
+ coredump_cleanup(&cn, &cprm);
return;
}
@@ -799,10 +1212,9 @@ static int __dump_emit(struct coredump_params *cprm, const void *addr, int nr)
struct file *file = cprm->file;
loff_t pos = file->f_pos;
ssize_t n;
+
if (cprm->written + nr > cprm->limit)
return 0;
-
-
if (dump_interrupted())
return 0;
n = __kernel_write(file, addr, nr, &pos);
@@ -819,20 +1231,21 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr)
{
static char zeroes[PAGE_SIZE];
struct file *file = cprm->file;
+
if (file->f_mode & FMODE_LSEEK) {
- if (dump_interrupted() ||
- vfs_llseek(file, nr, SEEK_CUR) < 0)
+ if (dump_interrupted() || vfs_llseek(file, nr, SEEK_CUR) < 0)
return 0;
cprm->pos += nr;
return 1;
- } else {
- while (nr > PAGE_SIZE) {
- if (!__dump_emit(cprm, zeroes, PAGE_SIZE))
- return 0;
- nr -= PAGE_SIZE;
- }
- return __dump_emit(cprm, zeroes, nr);
}
+
+ while (nr > PAGE_SIZE) {
+ if (!__dump_emit(cprm, zeroes, PAGE_SIZE))
+ return 0;
+ nr -= PAGE_SIZE;
+ }
+
+ return __dump_emit(cprm, zeroes, nr);
}
int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
@@ -1001,7 +1414,7 @@ EXPORT_SYMBOL(dump_align);
void validate_coredump_safety(void)
{
if (suid_dumpable == SUID_DUMP_ROOT &&
- core_pattern[0] != '/' && core_pattern[0] != '|') {
+ core_pattern[0] != '/' && core_pattern[0] != '|' && core_pattern[0] != '@') {
coredump_report_failure("Unsafe core_pattern used with fs.suid_dumpable=2: "
"pipe handler or fully qualified core dump path required. "
@@ -1009,18 +1422,78 @@ void validate_coredump_safety(void)
}
}
+static inline bool check_coredump_socket(void)
+{
+ const char *p;
+
+ if (core_pattern[0] != '@')
+ return true;
+
+ /*
+ * Coredump socket must be located in the initial mount
+ * namespace. Don't give the impression that anything else is
+ * supported right now.
+ */
+ if (current->nsproxy->mnt_ns != init_task.nsproxy->mnt_ns)
+ return false;
+
+ /* Must be an absolute path... */
+ if (core_pattern[1] != '/') {
+ /* ... or the socket request protocol... */
+ if (core_pattern[1] != '@')
+ return false;
+ /* ... and if so must be an absolute path. */
+ if (core_pattern[2] != '/')
+ return false;
+ p = &core_pattern[2];
+ } else {
+ p = &core_pattern[1];
+ }
+
+ /* The path obviously cannot exceed UNIX_PATH_MAX. */
+ if (strlen(p) >= UNIX_PATH_MAX)
+ return false;
+
+ /* Must not contain ".." in the path. */
+ if (name_contains_dotdot(core_pattern))
+ return false;
+
+ return true;
+}
+
static int proc_dostring_coredump(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- int error = proc_dostring(table, write, buffer, lenp, ppos);
+ int error;
+ ssize_t retval;
+ char old_core_pattern[CORENAME_MAX_SIZE];
+
+ if (!write)
+ return proc_dostring(table, write, buffer, lenp, ppos);
+
+ retval = strscpy(old_core_pattern, core_pattern, CORENAME_MAX_SIZE);
+
+ error = proc_dostring(table, write, buffer, lenp, ppos);
+ if (error)
+ return error;
+
+ if (!check_coredump_socket()) {
+ strscpy(core_pattern, old_core_pattern, retval + 1);
+ return -EINVAL;
+ }
- if (!error)
- validate_coredump_safety();
+ validate_coredump_safety();
return error;
}
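proc_dostring_coredump() above now snapshots the old pattern and restores it when the new value fails check_coredump_socket(), so an invalid socket pattern never becomes visible. A hedged userspace check, expected to fail with EINVAL (path hypothetical):

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/proc/sys/kernel/core_pattern", O_WRONLY);
            const char *bad = "@run/coredump.socket"; /* relative path: rejected */

            if (fd < 0)
                    return 1;
            if (write(fd, bad, strlen(bad)) < 0)
                    printf("rejected, old pattern kept: %s\n", strerror(errno));
            close(fd);
            return 0;
    }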
static const unsigned int core_file_note_size_min = CORE_FILE_NOTE_SIZE_DEFAULT;
static const unsigned int core_file_note_size_max = CORE_FILE_NOTE_SIZE_MAX;
+static char core_modes[] = {
+ "file\npipe"
+#ifdef CONFIG_UNIX
+ "\nsocket"
+#endif
+};
static const struct ctl_table coredump_sysctls[] = {
{
@@ -1064,6 +1537,13 @@ static const struct ctl_table coredump_sysctls[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
+ {
+ .procname = "core_modes",
+ .data = core_modes,
+ .maxlen = sizeof(core_modes) - 1,
+ .mode = 0444,
+ .proc_handler = proc_dostring,
+ },
};
static int __init init_fs_coredump_sysctls(void)
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index b84d1747a020..e54ebe402df7 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -17,7 +17,6 @@
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
-#include <linux/pfn_t.h>
#include <linux/ramfs.h>
#include <linux/init.h>
#include <linux/string.h>
@@ -96,7 +95,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
inode = iget_locked(sb, cramino(cramfs_inode, offset));
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
switch (cramfs_inode->mode & S_IFMT) {
@@ -117,9 +116,18 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
inode_nohighmem(inode);
inode->i_data.a_ops = &cramfs_aops;
break;
- default:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
init_special_inode(inode, cramfs_inode->mode,
old_decode_dev(cramfs_inode->size));
+ break;
+ default:
+ printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %lu.\n",
+ inode->i_mode, inode->i_ino);
+ iget_failed(inode);
+ return ERR_PTR(-EIO);
}
inode->i_mode = cramfs_inode->mode;
@@ -412,8 +420,8 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
for (i = 0; i < pages && !ret; i++) {
vm_fault_t vmf;
unsigned long off = i * PAGE_SIZE;
- pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV);
- vmf = vmf_insert_mixed(vma, vma->vm_start + off, pfn);
+ vmf = vmf_insert_mixed(vma, vma->vm_start + off,
+ PHYS_PFN(address + off));
if (vmf & VM_FAULT_ERROR)
ret = vm_fault_to_errno(vmf, 0);
}
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index b5dfb0aa405a..464b54610fd3 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -2,10 +2,9 @@
config FS_ENCRYPTION
bool "FS Encryption (Per-file encryption)"
select CRYPTO
- select CRYPTO_HASH
- select CRYPTO_HKDF
select CRYPTO_SKCIPHER
select CRYPTO_LIB_SHA256
+ select CRYPTO_LIB_SHA512
select KEYS
help
Enable encryption of files and directories. This
@@ -32,8 +31,6 @@ config FS_ENCRYPTION_ALGS
select CRYPTO_CBC
select CRYPTO_CTS
select CRYPTO_ECB
- select CRYPTO_HMAC
- select CRYPTO_SHA512
select CRYPTO_XTS
config FS_ENCRYPTION_INLINE_CRYPT
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 0ad8c30b8fa5..5f5599020e94 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -7,10 +7,12 @@
* Copyright (C) 2015, Motorola Mobility
*/
-#include <linux/pagemap.h>
-#include <linux/module.h>
#include <linux/bio.h>
+#include <linux/export.h>
+#include <linux/module.h>
#include <linux/namei.h>
+#include <linux/pagemap.h>
+
#include "fscrypt_private.h"
/**
@@ -111,7 +113,7 @@ out:
int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
sector_t pblk, unsigned int len)
{
- const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
const unsigned int du_bits = ci->ci_data_unit_bits;
const unsigned int du_size = 1U << du_bits;
const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits;
@@ -146,7 +148,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
*/
for (i = 0; i < nr_pages; i++) {
pages[i] = fscrypt_alloc_bounce_page(i == 0 ? GFP_NOFS :
- GFP_NOWAIT | __GFP_NOWARN);
+ GFP_NOWAIT);
if (!pages[i])
break;
}
@@ -165,8 +167,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
do {
err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, du_index,
ZERO_PAGE(0), pages[i],
- du_size, offset,
- GFP_NOFS);
+ du_size, offset);
if (err)
goto out;
du_index++;
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index b74b5937e695..07f9cbfe3ea4 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -20,12 +20,14 @@
* Special Publication 800-38E and IEEE P1619/D16.
*/
-#include <linux/pagemap.h>
+#include <crypto/skcipher.h>
+#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/module.h>
-#include <linux/scatterlist.h>
+#include <linux/pagemap.h>
#include <linux/ratelimit.h>
-#include <crypto/skcipher.h>
+#include <linux/scatterlist.h>
+
#include "fscrypt_private.h"
static unsigned int num_prealloc_crypto_pages = 32;
@@ -108,15 +110,13 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index,
int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
fscrypt_direction_t rw, u64 index,
struct page *src_page, struct page *dest_page,
- unsigned int len, unsigned int offs,
- gfp_t gfp_flags)
+ unsigned int len, unsigned int offs)
{
+ struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm;
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
union fscrypt_iv iv;
- struct skcipher_request *req = NULL;
- DECLARE_CRYPTO_WAIT(wait);
struct scatterlist dst, src;
- struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
- int res = 0;
+ int err;
if (WARN_ON_ONCE(len <= 0))
return -EINVAL;
@@ -125,31 +125,23 @@ int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
fscrypt_generate_iv(&iv, index, ci);
- req = skcipher_request_alloc(tfm, gfp_flags);
- if (!req)
- return -ENOMEM;
-
skcipher_request_set_callback(
req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- crypto_req_done, &wait);
-
+ NULL, NULL);
sg_init_table(&dst, 1);
sg_set_page(&dst, dest_page, len, offs);
sg_init_table(&src, 1);
sg_set_page(&src, src_page, len, offs);
skcipher_request_set_crypt(req, &src, &dst, len, &iv);
if (rw == FS_DECRYPT)
- res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
+ err = crypto_skcipher_decrypt(req);
else
- res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
- skcipher_request_free(req);
- if (res) {
+ err = crypto_skcipher_encrypt(req);
+ if (err)
fscrypt_err(ci->ci_inode,
"%scryption failed for data unit %llu: %d",
- (rw == FS_DECRYPT ? "De" : "En"), index, res);
- return res;
- }
- return 0;
+ (rw == FS_DECRYPT ? "De" : "En"), index, err);
+ return err;
}
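The conversion above moves fscrypt from heap-allocated async skcipher requests plus crypto_wait_req() to synchronous transforms with on-stack requests. The general pattern, as a hedged standalone sketch:

    static int encrypt_one(struct crypto_sync_skcipher *tfm,
                           struct scatterlist *src, struct scatterlist *dst,
                           unsigned int len, void *iv)
    {
            SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);

            /* Sync tfms never defer to an async callback, so none is set. */
            skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG |
                                          CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
            skcipher_request_set_crypt(req, src, dst, len, iv);
            return crypto_skcipher_encrypt(req);
    }

No skcipher_request_alloc()/free() and no DECLARE_CRYPTO_WAIT() are needed, which also removes the -ENOMEM failure mode from the hot path.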
/**
@@ -181,7 +173,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio,
size_t len, size_t offs, gfp_t gfp_flags)
{
const struct inode *inode = folio->mapping->host;
- const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
const unsigned int du_bits = ci->ci_data_unit_bits;
const unsigned int du_size = 1U << du_bits;
struct page *ciphertext_page;
@@ -204,7 +196,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio,
for (i = offs; i < offs + len; i += du_size, index++) {
err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, index,
&folio->page, ciphertext_page,
- du_size, i, gfp_flags);
+ du_size, i);
if (err) {
fscrypt_free_bounce_page(ciphertext_page);
return ERR_PTR(err);
@@ -225,7 +217,6 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
* @offs: Byte offset within @page at which the block to encrypt begins
* @lblk_num: Filesystem logical block number of the block, i.e. the 0-based
* number of the block within the file
- * @gfp_flags: Memory allocation flags
*
* Encrypt a possibly-compressed filesystem block that is located in an
* arbitrary page, not necessarily in the original pagecache page. The @inode
@@ -237,13 +228,13 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
*/
int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
unsigned int len, unsigned int offs,
- u64 lblk_num, gfp_t gfp_flags)
+ u64 lblk_num)
{
if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
return -EOPNOTSUPP;
- return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_ENCRYPT,
- lblk_num, page, page, len, offs,
- gfp_flags);
+ return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode),
+ FS_ENCRYPT, lblk_num, page, page, len,
+ offs);
}
EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
@@ -265,7 +256,7 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len,
size_t offs)
{
const struct inode *inode = folio->mapping->host;
- const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
const unsigned int du_bits = ci->ci_data_unit_bits;
const unsigned int du_size = 1U << du_bits;
u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) +
@@ -283,8 +274,7 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len,
struct page *page = folio_page(folio, i >> PAGE_SHIFT);
err = fscrypt_crypt_data_unit(ci, FS_DECRYPT, index, page,
- page, du_size, i & ~PAGE_MASK,
- GFP_NOFS);
+ page, du_size, i & ~PAGE_MASK);
if (err)
return err;
}
@@ -316,9 +306,9 @@ int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page,
{
if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
return -EOPNOTSUPP;
- return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_DECRYPT,
- lblk_num, page, page, len, offs,
- GFP_NOFS);
+ return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode),
+ FS_DECRYPT, lblk_num, page, page, len,
+ offs);
}
EXPORT_SYMBOL(fscrypt_decrypt_block_inplace);
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 010f9c0a4c2f..a9a4432d12ba 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -11,11 +11,13 @@
* This has not yet undergone a rigorous security audit.
*/
-#include <linux/namei.h>
-#include <linux/scatterlist.h>
-#include <crypto/hash.h>
#include <crypto/sha2.h>
#include <crypto/skcipher.h>
+#include <linux/export.h>
+#include <linux/namei.h>
+#include <linux/scatterlist.h>
+#include <linux/base64.h>
+
#include "fscrypt_private.h"
/*
@@ -70,7 +72,7 @@ struct fscrypt_nokey_name {
/* Encoded size of max-size no-key name */
#define FSCRYPT_NOKEY_NAME_MAX_ENCODED \
- FSCRYPT_BASE64URL_CHARS(FSCRYPT_NOKEY_NAME_MAX)
+ BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX)
static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
{
@@ -92,13 +94,12 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
u8 *out, unsigned int olen)
{
- struct skcipher_request *req = NULL;
- DECLARE_CRYPTO_WAIT(wait);
- const struct fscrypt_inode_info *ci = inode->i_crypt_info;
- struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
+ struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm;
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
union fscrypt_iv iv;
struct scatterlist sg;
- int res;
+ int err;
/*
* Copy the filename to the output buffer for encrypting in-place and
@@ -109,28 +110,17 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
memcpy(out, iname->name, iname->len);
memset(out + iname->len, 0, olen - iname->len);
- /* Initialize the IV */
fscrypt_generate_iv(&iv, 0, ci);
- /* Set up the encryption request */
- req = skcipher_request_alloc(tfm, GFP_NOFS);
- if (!req)
- return -ENOMEM;
- skcipher_request_set_callback(req,
- CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- crypto_req_done, &wait);
+ skcipher_request_set_callback(
+ req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ NULL, NULL);
sg_init_one(&sg, out, olen);
skcipher_request_set_crypt(req, &sg, &sg, olen, &iv);
-
- /* Do the encryption */
- res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
- skcipher_request_free(req);
- if (res < 0) {
- fscrypt_err(inode, "Filename encryption failed: %d", res);
- return res;
- }
-
- return 0;
+ err = crypto_skcipher_encrypt(req);
+ if (err)
+ fscrypt_err(inode, "Filename encryption failed: %d", err);
+ return err;
}
EXPORT_SYMBOL_GPL(fscrypt_fname_encrypt);
@@ -148,118 +138,31 @@ static int fname_decrypt(const struct inode *inode,
const struct fscrypt_str *iname,
struct fscrypt_str *oname)
{
- struct skcipher_request *req = NULL;
- DECLARE_CRYPTO_WAIT(wait);
- struct scatterlist src_sg, dst_sg;
- const struct fscrypt_inode_info *ci = inode->i_crypt_info;
- struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
+ struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm;
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
union fscrypt_iv iv;
- int res;
-
- /* Allocate request */
- req = skcipher_request_alloc(tfm, GFP_NOFS);
- if (!req)
- return -ENOMEM;
- skcipher_request_set_callback(req,
- CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- crypto_req_done, &wait);
+ struct scatterlist src_sg, dst_sg;
+ int err;
- /* Initialize IV */
fscrypt_generate_iv(&iv, 0, ci);
- /* Create decryption request */
+ skcipher_request_set_callback(
+ req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ NULL, NULL);
sg_init_one(&src_sg, iname->name, iname->len);
sg_init_one(&dst_sg, oname->name, oname->len);
skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, &iv);
- res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
- skcipher_request_free(req);
- if (res < 0) {
- fscrypt_err(inode, "Filename decryption failed: %d", res);
- return res;
+ err = crypto_skcipher_decrypt(req);
+ if (err) {
+ fscrypt_err(inode, "Filename decryption failed: %d", err);
+ return err;
}
oname->len = strnlen(oname->name, iname->len);
return 0;
}
-static const char base64url_table[65] =
- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
-
-#define FSCRYPT_BASE64URL_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3)
-
-/**
- * fscrypt_base64url_encode() - base64url-encode some binary data
- * @src: the binary data to encode
- * @srclen: the length of @src in bytes
- * @dst: (output) the base64url-encoded string. Not NUL-terminated.
- *
- * Encodes data using base64url encoding, i.e. the "Base 64 Encoding with URL
- * and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't used,
- * as it's unneeded and not required by the RFC. base64url is used instead of
- * base64 to avoid the '/' character, which isn't allowed in filenames.
- *
- * Return: the length of the resulting base64url-encoded string in bytes.
- * This will be equal to FSCRYPT_BASE64URL_CHARS(srclen).
- */
-static int fscrypt_base64url_encode(const u8 *src, int srclen, char *dst)
-{
- u32 ac = 0;
- int bits = 0;
- int i;
- char *cp = dst;
-
- for (i = 0; i < srclen; i++) {
- ac = (ac << 8) | src[i];
- bits += 8;
- do {
- bits -= 6;
- *cp++ = base64url_table[(ac >> bits) & 0x3f];
- } while (bits >= 6);
- }
- if (bits)
- *cp++ = base64url_table[(ac << (6 - bits)) & 0x3f];
- return cp - dst;
-}
-
-/**
- * fscrypt_base64url_decode() - base64url-decode a string
- * @src: the string to decode. Doesn't need to be NUL-terminated.
- * @srclen: the length of @src in bytes
- * @dst: (output) the decoded binary data
- *
- * Decodes a string using base64url encoding, i.e. the "Base 64 Encoding with
- * URL and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't
- * accepted, nor are non-encoding characters such as whitespace.
- *
- * This implementation hasn't been optimized for performance.
- *
- * Return: the length of the resulting decoded binary data in bytes,
- * or -1 if the string isn't a valid base64url string.
- */
-static int fscrypt_base64url_decode(const char *src, int srclen, u8 *dst)
-{
- u32 ac = 0;
- int bits = 0;
- int i;
- u8 *bp = dst;
-
- for (i = 0; i < srclen; i++) {
- const char *p = strchr(base64url_table, src[i]);
-
- if (p == NULL || src[i] == 0)
- return -1;
- ac = (ac << 6) | (p - base64url_table);
- bits += 6;
- if (bits >= 8) {
- bits -= 8;
- *bp++ = (u8)(ac >> bits);
- }
- }
- if (ac & ((1 << bits) - 1))
- return -1;
- return bp - dst;
-}
-
bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
u32 orig_len, u32 max_len,
u32 *encrypted_len_ret)
@@ -293,8 +196,9 @@ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
u32 max_len, u32 *encrypted_len_ret)
{
- return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy,
- orig_len, max_len,
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
+
+ return __fscrypt_fname_encrypted_size(&ci->ci_policy, orig_len, max_len,
encrypted_len_ret);
}
EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size);
@@ -406,8 +310,8 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode,
nokey_name.sha256);
size = FSCRYPT_NOKEY_NAME_MAX;
}
- oname->len = fscrypt_base64url_encode((const u8 *)&nokey_name, size,
- oname->name);
+ oname->len = base64_encode((const u8 *)&nokey_name, size,
+ oname->name, false, BASE64_URLSAFE);
return 0;
}
EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
@@ -486,8 +390,8 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
if (fname->crypto_buf.name == NULL)
return -ENOMEM;
- ret = fscrypt_base64url_decode(iname->name, iname->len,
- fname->crypto_buf.name);
+ ret = base64_decode(iname->name, iname->len,
+ fname->crypto_buf.name, false, BASE64_URLSAFE);
if (ret < (int)offsetof(struct fscrypt_nokey_name, bytes[1]) ||
(ret > offsetof(struct fscrypt_nokey_name, sha256) &&
ret != FSCRYPT_NOKEY_NAME_MAX)) {
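fscrypt's private base64url helpers are replaced by the generic <linux/base64.h> API; judging from the call sites above, the boolean selects '=' padding and the final argument picks the alphabet. A hedged round-trip sketch under those assumptions:

    u8 raw[] = "nokey name";
    char encoded[BASE64_CHARS(sizeof(raw))];
    u8 decoded[sizeof(raw)];
    int enc_len, dec_len;

    enc_len = base64_encode(raw, sizeof(raw), encoded, false, BASE64_URLSAFE);
    dec_len = base64_decode(encoded, enc_len, decoded, false, BASE64_URLSAFE);
    /* dec_len < 0 would indicate a malformed input string. */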
@@ -562,7 +466,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name);
*/
u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name)
{
- const struct fscrypt_inode_info *ci = dir->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(dir);
WARN_ON_ONCE(!ci->ci_dirhash_key_initialized);
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 8371e4e1f596..4e8e82a9ccf9 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -11,9 +11,10 @@
#ifndef _FSCRYPT_PRIVATE_H
#define _FSCRYPT_PRIVATE_H
+#include <crypto/sha2.h>
#include <linux/fscrypt.h>
+#include <linux/minmax.h>
#include <linux/siphash.h>
-#include <crypto/hash.h>
#include <linux/blk-crypto.h>
#define CONST_STRLEN(str) (sizeof(str) - 1)
@@ -27,6 +28,41 @@
*/
#define FSCRYPT_MIN_KEY_SIZE 16
+/* Maximum size of a raw fscrypt master key */
+#define FSCRYPT_MAX_RAW_KEY_SIZE 64
+
+/* Maximum size of a hardware-wrapped fscrypt master key */
+#define FSCRYPT_MAX_HW_WRAPPED_KEY_SIZE BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE
+
+/* Maximum size of an fscrypt master key across both key types */
+#define FSCRYPT_MAX_ANY_KEY_SIZE \
+ MAX(FSCRYPT_MAX_RAW_KEY_SIZE, FSCRYPT_MAX_HW_WRAPPED_KEY_SIZE)
+
+/*
+ * FSCRYPT_MAX_KEY_SIZE is defined in the UAPI header, but the addition of
+ * hardware-wrapped keys has made it misleading as it's only for raw keys.
+ * Don't use it in kernel code; use one of the above constants instead.
+ */
+#undef FSCRYPT_MAX_KEY_SIZE
+
+/*
+ * This mask is passed as the third argument to the crypto_alloc_*() functions
+ * to prevent fscrypt from using the Crypto API drivers for non-inline crypto
+ * engines. Those drivers have been problematic for fscrypt. fscrypt users
+ * have reported hangs and even incorrect en/decryption with these drivers.
+ * Since going to the driver, off CPU, and back again is really slow, such
+ * drivers can be over 50 times slower than the CPU-based code for fscrypt's
+ * workload. Even on platforms that lack AES instructions on the CPU, using the
+ * offloads has been shown to be slower, even staying with AES. (Of course,
+ * Adiantum is faster still, and is the recommended option on such platforms...)
+ *
+ * Note that fscrypt also supports inline crypto engines. Those don't use the
+ * Crypto API and work much better than the old-style (non-inline) engines.
+ */
+#define FSCRYPT_CRYPTOAPI_MASK \
+ (CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY | \
+ CRYPTO_ALG_KERN_DRIVER_ONLY)
+
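A hedged example of how such a mask takes effect at allocation time; the mask in the third argument tells the Crypto API which implementation properties to exclude:

    struct crypto_sync_skcipher *tfm;

    tfm = crypto_alloc_sync_skcipher("xts(aes)", 0, FSCRYPT_CRYPTOAPI_MASK);
    if (IS_ERR(tfm))
            return PTR_ERR(tfm);
    /* Only synchronous, CPU-based xts(aes) implementations qualify. */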
#define FSCRYPT_CONTEXT_V1 1
#define FSCRYPT_CONTEXT_V2 2
@@ -203,7 +239,7 @@ struct fscrypt_symlink_data {
* Normally only one of the fields will be non-NULL.
*/
struct fscrypt_prepared_key {
- struct crypto_skcipher *tfm;
+ struct crypto_sync_skcipher *tfm;
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
struct blk_crypto_key *blk_key;
#endif
@@ -213,8 +249,8 @@ struct fscrypt_prepared_key {
* fscrypt_inode_info - the "encryption key" for an inode
*
* When an encrypted file's key is made available, an instance of this struct is
- * allocated and stored in ->i_crypt_info. Once created, it remains until the
- * inode is evicted.
+ * allocated and a pointer to it is stored in the file's in-memory inode. Once
+ * created, it remains until the inode is evicted.
*/
struct fscrypt_inode_info {
@@ -301,8 +337,7 @@ int fscrypt_initialize(struct super_block *sb);
int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
fscrypt_direction_t rw, u64 index,
struct page *src_page, struct page *dest_page,
- unsigned int len, unsigned int offs,
- gfp_t gfp_flags);
+ unsigned int len, unsigned int offs);
struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags);
void __printf(3, 4) __cold
@@ -346,12 +381,8 @@ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
u32 *encrypted_len_ret);
/* hkdf.c */
-struct fscrypt_hkdf {
- struct crypto_shash *hmac_tfm;
-};
-
-int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
- unsigned int master_key_size);
+void fscrypt_init_hkdf(struct hmac_sha512_key *hkdf, const u8 *master_key,
+ unsigned int master_key_size);
/*
* The list of contexts in which fscrypt uses HKDF. These values are used as
@@ -360,23 +391,24 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
* outputs are unique and cryptographically isolated, i.e. knowledge of one
* output doesn't reveal another.
*/
-#define HKDF_CONTEXT_KEY_IDENTIFIER 1 /* info=<empty> */
+#define HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY 1 /* info=<empty> */
#define HKDF_CONTEXT_PER_FILE_ENC_KEY 2 /* info=file_nonce */
#define HKDF_CONTEXT_DIRECT_KEY 3 /* info=mode_num */
#define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 /* info=mode_num||fs_uuid */
#define HKDF_CONTEXT_DIRHASH_KEY 5 /* info=file_nonce */
#define HKDF_CONTEXT_IV_INO_LBLK_32_KEY 6 /* info=mode_num||fs_uuid */
#define HKDF_CONTEXT_INODE_HASH_KEY 7 /* info=<empty> */
+#define HKDF_CONTEXT_KEY_IDENTIFIER_FOR_HW_WRAPPED_KEY \
+ 8 /* info=<empty> */
-int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context,
- const u8 *info, unsigned int infolen,
- u8 *okm, unsigned int okmlen);
-
-void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf);
+void fscrypt_hkdf_expand(const struct hmac_sha512_key *hkdf, u8 context,
+ const u8 *info, unsigned int infolen,
+ u8 *okm, unsigned int okmlen);
/* inline_crypt.c */
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
-int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci);
+int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci,
+ bool is_hw_wrapped_key);
static inline bool
fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci)
@@ -385,12 +417,17 @@ fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci)
}
int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
- const u8 *raw_key,
+ const u8 *key_bytes, size_t key_size,
+ bool is_hw_wrapped,
const struct fscrypt_inode_info *ci);
void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
struct fscrypt_prepared_key *prep_key);
+int fscrypt_derive_sw_secret(struct super_block *sb,
+ const u8 *wrapped_key, size_t wrapped_key_size,
+ u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]);
+
/*
* Check whether the crypto transform or blk-crypto key has been allocated in
* @prep_key, depending on which encryption implementation the file will use.
@@ -414,7 +451,8 @@ fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
#else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */
-static inline int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci)
+static inline int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci,
+ bool is_hw_wrapped_key)
{
return 0;
}
@@ -427,7 +465,8 @@ fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci)
static inline int
fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
- const u8 *raw_key,
+ const u8 *key_bytes, size_t key_size,
+ bool is_hw_wrapped,
const struct fscrypt_inode_info *ci)
{
WARN_ON_ONCE(1);
@@ -440,6 +479,15 @@ fscrypt_destroy_inline_crypt_key(struct super_block *sb,
{
}
+static inline int
+fscrypt_derive_sw_secret(struct super_block *sb,
+ const u8 *wrapped_key, size_t wrapped_key_size,
+ u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE])
+{
+ fscrypt_warn(NULL, "kernel doesn't support hardware-wrapped keys");
+ return -EOPNOTSUPP;
+}
+
static inline bool
fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
const struct fscrypt_inode_info *ci)
@@ -456,20 +504,38 @@ fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
struct fscrypt_master_key_secret {
/*
- * For v2 policy keys: HKDF context keyed by this master key.
- * For v1 policy keys: not set (hkdf.hmac_tfm == NULL).
+ * The KDF with which subkeys of this key can be derived.
+ *
+ * For v1 policy keys, this isn't applicable and won't be set.
+ * Otherwise, this KDF will be keyed by this master key if
+ * ->is_hw_wrapped=false, or by the "software secret" that hardware
+ * derived from this master key if ->is_hw_wrapped=true.
*/
- struct fscrypt_hkdf hkdf;
+ struct hmac_sha512_key hkdf;
/*
- * Size of the raw key in bytes. This remains set even if ->raw was
+ * True if this key is a hardware-wrapped key; false if this key is a
+ * raw key (i.e. a "software key"). For v1 policy keys this will always
+ * be false, as v1 policy support is a legacy feature which doesn't
+ * support newer functionality such as hardware-wrapped keys.
+ */
+ bool is_hw_wrapped;
+
+ /*
+ * Size of the key in bytes. This remains set even if ->bytes was
* zeroized due to no longer being needed. I.e. we still remember the
* size of the key even if we don't need to remember the key itself.
*/
u32 size;
- /* For v1 policy keys: the raw key. Wiped for v2 policy keys. */
- u8 raw[FSCRYPT_MAX_KEY_SIZE];
+ /*
+ * The bytes of the key, when still needed. This can be either a raw
+ * key or a hardware-wrapped key, as indicated by ->is_hw_wrapped. In
+ * the case of a raw, v2 policy key, there is no need to remember the
+ * actual key separately from ->hkdf so this field will be zeroized as
+ * soon as ->hkdf is initialized.
+ */
+ u8 bytes[FSCRYPT_MAX_ANY_KEY_SIZE];
} __randomize_layout;
@@ -624,7 +690,7 @@ struct fscrypt_master_key *
fscrypt_find_master_key(struct super_block *sb,
const struct fscrypt_key_specifier *mk_spec);
-int fscrypt_get_test_dummy_key_identifier(
+void fscrypt_get_test_dummy_key_identifier(
u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]);
int fscrypt_add_test_dummy_key(struct super_block *sb,
@@ -660,8 +726,8 @@ void fscrypt_destroy_prepared_key(struct super_block *sb,
int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci,
const u8 *raw_key);
-int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
- const struct fscrypt_master_key *mk);
+void fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
+ const struct fscrypt_master_key *mk);
void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci,
const struct fscrypt_master_key *mk);
diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c
index 855a0f4b7318..706f56d0076e 100644
--- a/fs/crypto/hkdf.c
+++ b/fs/crypto/hkdf.c
@@ -1,14 +1,16 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * This is used to derive keys from the fscrypt master keys.
+ * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation
+ * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010):
+ * "Cryptographic Extraction and Key Derivation: The HKDF Scheme".
+ *
+ * This is used to derive keys from the fscrypt master keys (or from the
+ * "software secrets" which hardware derives from the fscrypt master keys, in
+ * the case that the fscrypt master keys are hardware-wrapped keys).
*
* Copyright 2019 Google LLC
*/
-#include <crypto/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/hkdf.h>
-
#include "fscrypt_private.h"
/*
@@ -22,7 +24,6 @@
* HKDF-SHA512 being much faster than HKDF-SHA256, as the longer digest size of
* SHA-512 causes HKDF-Expand to only need to do one iteration rather than two.
*/
-#define HKDF_HMAC_ALG "hmac(sha512)"
#define HKDF_HASHLEN SHA512_DIGEST_SIZE
/*
@@ -42,54 +43,24 @@
*/
/*
- * Compute HKDF-Extract using the given master key as the input keying material,
- * and prepare an HMAC transform object keyed by the resulting pseudorandom key.
- *
- * Afterwards, the keyed HMAC transform object can be used for HKDF-Expand many
- * times without having to recompute HKDF-Extract each time.
+ * Compute HKDF-Extract using 'master_key' as the input keying material, and
+ * prepare the resulting HMAC key in 'hkdf'. Afterwards, 'hkdf' can be used for
+ * HKDF-Expand many times without having to recompute HKDF-Extract each time.
*/
-int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
- unsigned int master_key_size)
+void fscrypt_init_hkdf(struct hmac_sha512_key *hkdf, const u8 *master_key,
+ unsigned int master_key_size)
{
- struct crypto_shash *hmac_tfm;
static const u8 default_salt[HKDF_HASHLEN];
u8 prk[HKDF_HASHLEN];
- int err;
-
- hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, 0);
- if (IS_ERR(hmac_tfm)) {
- fscrypt_err(NULL, "Error allocating " HKDF_HMAC_ALG ": %ld",
- PTR_ERR(hmac_tfm));
- return PTR_ERR(hmac_tfm);
- }
-
- if (WARN_ON_ONCE(crypto_shash_digestsize(hmac_tfm) != sizeof(prk))) {
- err = -EINVAL;
- goto err_free_tfm;
- }
-
- err = hkdf_extract(hmac_tfm, master_key, master_key_size,
- default_salt, HKDF_HASHLEN, prk);
- if (err)
- goto err_free_tfm;
-
- err = crypto_shash_setkey(hmac_tfm, prk, sizeof(prk));
- if (err)
- goto err_free_tfm;
- hkdf->hmac_tfm = hmac_tfm;
- goto out;
-
-err_free_tfm:
- crypto_free_shash(hmac_tfm);
-out:
+ hmac_sha512_usingrawkey(default_salt, sizeof(default_salt),
+ master_key, master_key_size, prk);
+ hmac_sha512_preparekey(hkdf, prk, sizeof(prk));
memzero_explicit(prk, sizeof(prk));
- return err;
}
/*
- * HKDF-Expand (RFC 5869 section 2.3). This expands the pseudorandom key, which
- * was already keyed into 'hkdf->hmac_tfm' by fscrypt_init_hkdf(), into 'okmlen'
+ * HKDF-Expand (RFC 5869 section 2.3). Expand the HMAC key 'hkdf' into 'okmlen'
* bytes of output keying material parameterized by the application-specific
* 'info' of length 'infolen' bytes, prefixed by "fscrypt\0" and the 'context'
* byte. This is thread-safe and may be called by multiple threads in parallel.
@@ -98,30 +69,32 @@ out:
* adds to its application-specific info strings to guarantee that it doesn't
* accidentally repeat an info string when using HKDF for different purposes.)
*/
-int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context,
- const u8 *info, unsigned int infolen,
- u8 *okm, unsigned int okmlen)
-{
- SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm);
- u8 *full_info;
- int err;
-
- full_info = kzalloc(infolen + 9, GFP_KERNEL);
- if (!full_info)
- return -ENOMEM;
- desc->tfm = hkdf->hmac_tfm;
-
- memcpy(full_info, "fscrypt\0", 8);
- full_info[8] = context;
- memcpy(full_info + 9, info, infolen);
-
- err = hkdf_expand(hkdf->hmac_tfm, full_info, infolen + 9,
- okm, okmlen);
- kfree_sensitive(full_info);
- return err;
-}
-
-void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf)
+void fscrypt_hkdf_expand(const struct hmac_sha512_key *hkdf, u8 context,
+ const u8 *info, unsigned int infolen,
+ u8 *okm, unsigned int okmlen)
{
- crypto_free_shash(hkdf->hmac_tfm);
+ struct hmac_sha512_ctx ctx;
+ u8 counter = 1;
+ u8 tmp[HKDF_HASHLEN];
+
+ WARN_ON_ONCE(okmlen > 255 * HKDF_HASHLEN);
+
+ for (unsigned int i = 0; i < okmlen; i += HKDF_HASHLEN) {
+ hmac_sha512_init(&ctx, hkdf);
+ if (i != 0)
+ hmac_sha512_update(&ctx, &okm[i - HKDF_HASHLEN],
+ HKDF_HASHLEN);
+ hmac_sha512_update(&ctx, "fscrypt\0", 8);
+ hmac_sha512_update(&ctx, &context, 1);
+ hmac_sha512_update(&ctx, info, infolen);
+ hmac_sha512_update(&ctx, &counter, 1);
+ if (okmlen - i < HKDF_HASHLEN) {
+ hmac_sha512_final(&ctx, tmp);
+ memcpy(&okm[i], tmp, okmlen - i);
+ memzero_explicit(tmp, sizeof(tmp));
+ } else {
+ hmac_sha512_final(&ctx, &okm[i]);
+ }
+ counter++;
+ }
}
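The loop above is the standard RFC 5869 feedback construction: with info' = "fscrypt\0" || context || info, it computes T(1) = HMAC-SHA512(PRK, info' || 0x01) and T(n) = HMAC-SHA512(PRK, T(n-1) || info' || n), concatenating the T(n) blocks into 'okm'. A hedged usage sketch, mirroring the keysetup.c callers later in this patch ('mk' and 'ci' stand in for the usual fscrypt structures):

	u8 derived_key[FSCRYPT_MAX_RAW_KEY_SIZE];

	/* Derive the per-file contents key from the prepared HKDF state. */
	fscrypt_hkdf_expand(&mk->mk_secret.hkdf, HKDF_CONTEXT_PER_FILE_ENC_KEY,
			    ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE,
			    derived_key, ci->ci_mode->keysize);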
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index d8d5049b8fe1..b97de0d1430f 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -5,6 +5,8 @@
* Encryption hooks for higher-level filesystem operations.
*/
+#include <linux/export.h>
+
#include "fscrypt_private.h"
/**
@@ -197,13 +199,13 @@ int fscrypt_prepare_setflags(struct inode *inode,
err = fscrypt_require_key(inode);
if (err)
return err;
- ci = inode->i_crypt_info;
+ ci = fscrypt_get_inode_info_raw(inode);
if (ci->ci_policy.version != FSCRYPT_POLICY_V2)
return -EINVAL;
mk = ci->ci_master_key;
down_read(&mk->mk_sem);
if (mk->mk_present)
- err = fscrypt_derive_dirhash_key(ci, mk);
+ fscrypt_derive_dirhash_key(ci, mk);
else
err = -ENOKEY;
up_read(&mk->mk_sem);
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 7fa53d30aec3..ed6e926226b5 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -15,6 +15,7 @@
#include <linux/blk-crypto.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
+#include <linux/export.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/uio.h>
@@ -89,7 +90,8 @@ static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode,
}
/* Enable inline encryption for this file if supported. */
-int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci)
+int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci,
+ bool is_hw_wrapped_key)
{
const struct inode *inode = ci->ci_inode;
struct super_block *sb = inode->i_sb;
@@ -130,7 +132,8 @@ int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci)
crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode;
crypto_cfg.data_unit_size = 1U << ci->ci_data_unit_bits;
crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci);
- crypto_cfg.key_type = BLK_CRYPTO_KEY_TYPE_RAW;
+ crypto_cfg.key_type = is_hw_wrapped_key ?
+ BLK_CRYPTO_KEY_TYPE_HW_WRAPPED : BLK_CRYPTO_KEY_TYPE_RAW;
devs = fscrypt_get_devices(sb, &num_devs);
if (IS_ERR(devs))
@@ -151,12 +154,15 @@ out_free_devs:
}
int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
- const u8 *raw_key,
+ const u8 *key_bytes, size_t key_size,
+ bool is_hw_wrapped,
const struct fscrypt_inode_info *ci)
{
const struct inode *inode = ci->ci_inode;
struct super_block *sb = inode->i_sb;
enum blk_crypto_mode_num crypto_mode = ci->ci_mode->blk_crypto_mode;
+ enum blk_crypto_key_type key_type = is_hw_wrapped ?
+ BLK_CRYPTO_KEY_TYPE_HW_WRAPPED : BLK_CRYPTO_KEY_TYPE_RAW;
struct blk_crypto_key *blk_key;
struct block_device **devs;
unsigned int num_devs;
@@ -167,9 +173,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
if (!blk_key)
return -ENOMEM;
- err = blk_crypto_init_key(blk_key, raw_key, ci->ci_mode->keysize,
- BLK_CRYPTO_KEY_TYPE_RAW, crypto_mode,
- fscrypt_get_dun_bytes(ci),
+ err = blk_crypto_init_key(blk_key, key_bytes, key_size, key_type,
+ crypto_mode, fscrypt_get_dun_bytes(ci),
1U << ci->ci_data_unit_bits);
if (err) {
fscrypt_err(inode, "error %d initializing blk-crypto key", err);
@@ -228,9 +233,37 @@ void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
kfree_sensitive(blk_key);
}
+/*
+ * Ask the inline encryption hardware to derive the software secret from a
+ * hardware-wrapped key. Returns -EOPNOTSUPP if hardware-wrapped keys aren't
+ * supported on this filesystem or hardware.
+ */
+int fscrypt_derive_sw_secret(struct super_block *sb,
+ const u8 *wrapped_key, size_t wrapped_key_size,
+ u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE])
+{
+ int err;
+
+ /* The filesystem must be mounted with -o inlinecrypt. */
+ if (!(sb->s_flags & SB_INLINECRYPT)) {
+ fscrypt_warn(NULL,
+ "%s: filesystem not mounted with inlinecrypt\n",
+ sb->s_id);
+ return -EOPNOTSUPP;
+ }
+
+ err = blk_crypto_derive_sw_secret(sb->s_bdev, wrapped_key,
+ wrapped_key_size, sw_secret);
+ if (err == -EOPNOTSUPP)
+ fscrypt_warn(NULL,
+ "%s: block device doesn't support hardware-wrapped keys\n",
+ sb->s_id);
+ return err;
+}
+
bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode)
{
- return inode->i_crypt_info->ci_inlinecrypt;
+ return fscrypt_get_inode_info_raw(inode)->ci_inlinecrypt;
}
EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto);
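For illustration, the intended call sequence when a hardware-wrapped key is added, mirroring the add_master_key() hunk in keyring.c below (error paths abbreviated):

	u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE];
	int err;

	/* Hardware unwraps the key and returns a software secret... */
	err = fscrypt_derive_sw_secret(sb, secret->bytes, secret->size,
				       sw_secret);
	if (err)
		return err;
	/* ...which keys the HKDF in place of the raw master key. */
	fscrypt_init_hkdf(&secret->hkdf, sw_secret, sizeof(sw_secret));
	memzero_explicit(sw_secret, sizeof(sw_secret));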
@@ -274,7 +307,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode,
if (!fscrypt_inode_uses_inline_crypto(inode))
return;
- ci = inode->i_crypt_info;
+ ci = fscrypt_get_inode_info_raw(inode);
fscrypt_generate_dun(ci, first_lblk, dun);
bio_crypt_set_ctx(bio, ci->ci_enc_key.blk_key, dun, gfp_mask);
@@ -300,8 +333,7 @@ static bool bh_get_inode_and_lblk_num(const struct buffer_head *bh,
inode = mapping->host;
*inode_ret = inode;
- *lblk_num_ret = ((u64)folio->index << (PAGE_SHIFT - inode->i_blkbits)) +
- (bh_offset(bh) >> inode->i_blkbits);
+ *lblk_num_ret = (folio_pos(folio) + bh_offset(bh)) >> inode->i_blkbits;
return true;
}
@@ -352,22 +384,24 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
u64 next_lblk)
{
const struct bio_crypt_ctx *bc = bio->bi_crypt_context;
+ const struct fscrypt_inode_info *ci;
u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
if (!!bc != fscrypt_inode_uses_inline_crypto(inode))
return false;
if (!bc)
return true;
+ ci = fscrypt_get_inode_info_raw(inode);
/*
* Comparing the key pointers is good enough, as all I/O for each key
* uses the same pointer. I.e., there's currently no need to support
* merging requests where the keys are the same but the pointers differ.
*/
- if (bc->bc_key != inode->i_crypt_info->ci_enc_key.blk_key)
+ if (bc->bc_key != ci->ci_enc_key.blk_key)
return false;
- fscrypt_generate_dun(inode->i_crypt_info, next_lblk, next_dun);
+ fscrypt_generate_dun(ci, next_lblk, next_dun);
return bio_crypt_dun_is_contiguous(bc, bio->bi_iter.bi_size, next_dun);
}
EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio);
@@ -469,7 +503,7 @@ u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks)
if (nr_blocks <= 1)
return nr_blocks;
- ci = inode->i_crypt_info;
+ ci = fscrypt_get_inode_info_raw(inode);
if (!(fscrypt_policy_flags(&ci->ci_policy) &
FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))
return nr_blocks;
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 787e9c8938ba..5e939ea3ac28 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -18,12 +18,13 @@
* information about these ioctls.
*/
-#include <linux/unaligned.h>
#include <crypto/skcipher.h>
+#include <linux/export.h>
#include <linux/key-type.h>
-#include <linux/random.h>
#include <linux/once.h>
+#include <linux/random.h>
#include <linux/seq_file.h>
+#include <linux/unaligned.h>
#include "fscrypt_private.h"
@@ -41,7 +42,6 @@ struct fscrypt_keyring {
static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret)
{
- fscrypt_destroy_hkdf(&secret->hkdf);
memzero_explicit(secret, sizeof(*secret));
}
@@ -149,11 +149,11 @@ static int fscrypt_user_key_instantiate(struct key *key,
struct key_preparsed_payload *prep)
{
/*
- * We just charge FSCRYPT_MAX_KEY_SIZE bytes to the user's key quota for
- * each key, regardless of the exact key size. The amount of memory
+ * We just charge FSCRYPT_MAX_RAW_KEY_SIZE bytes to the user's key quota
+ * for each key, regardless of the exact key size. The amount of memory
* actually used is greater than the size of the raw key anyway.
*/
- return key_payload_reserve(key, FSCRYPT_MAX_KEY_SIZE);
+ return key_payload_reserve(key, FSCRYPT_MAX_RAW_KEY_SIZE);
}
static void fscrypt_user_key_describe(const struct key *key, struct seq_file *m)
@@ -558,41 +558,79 @@ static int add_master_key(struct super_block *sb,
int err;
if (key_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) {
- err = fscrypt_init_hkdf(&secret->hkdf, secret->raw,
- secret->size);
- if (err)
- return err;
+ u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE];
+ u8 *kdf_key = secret->bytes;
+ unsigned int kdf_key_size = secret->size;
+ u8 keyid_kdf_ctx = HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY;
/*
- * Now that the HKDF context is initialized, the raw key is no
- * longer needed.
+ * For raw keys, the fscrypt master key is used directly as the
+ * fscrypt KDF key. For hardware-wrapped keys, we have to pass
+ * the master key to the hardware to derive the KDF key, which
+ * is then only used to derive non-file-contents subkeys.
*/
- memzero_explicit(secret->raw, secret->size);
+ if (secret->is_hw_wrapped) {
+ err = fscrypt_derive_sw_secret(sb, secret->bytes,
+ secret->size, sw_secret);
+ if (err)
+ return err;
+ kdf_key = sw_secret;
+ kdf_key_size = sizeof(sw_secret);
+ /*
+ * To avoid weird behavior if someone manages to
+ * determine sw_secret and add it as a raw key, ensure
+ * that hardware-wrapped keys and raw keys will have
+ * different key identifiers by deriving their key
+ * identifiers using different KDF contexts.
+ */
+ keyid_kdf_ctx =
+ HKDF_CONTEXT_KEY_IDENTIFIER_FOR_HW_WRAPPED_KEY;
+ }
+ fscrypt_init_hkdf(&secret->hkdf, kdf_key, kdf_key_size);
+ /*
+ * Now that the KDF context is initialized, the raw KDF key is
+ * no longer needed.
+ */
+ memzero_explicit(kdf_key, kdf_key_size);
/* Calculate the key identifier */
- err = fscrypt_hkdf_expand(&secret->hkdf,
- HKDF_CONTEXT_KEY_IDENTIFIER, NULL, 0,
- key_spec->u.identifier,
- FSCRYPT_KEY_IDENTIFIER_SIZE);
- if (err)
- return err;
+ fscrypt_hkdf_expand(&secret->hkdf, keyid_kdf_ctx, NULL, 0,
+ key_spec->u.identifier,
+ FSCRYPT_KEY_IDENTIFIER_SIZE);
}
return do_add_master_key(sb, secret, key_spec);
}
+/*
+ * Validate the size of an fscrypt master key being added. Note that this is
+ * just an initial check, as we don't know which ciphers will be used yet.
+ * There is a stricter size check later when the key is actually used by a file.
+ */
+static inline bool fscrypt_valid_key_size(size_t size, u32 add_key_flags)
+{
+ u32 max_size = (add_key_flags & FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED) ?
+ FSCRYPT_MAX_HW_WRAPPED_KEY_SIZE :
+ FSCRYPT_MAX_RAW_KEY_SIZE;
+
+ return size >= FSCRYPT_MIN_KEY_SIZE && size <= max_size;
+}
+
static int fscrypt_provisioning_key_preparse(struct key_preparsed_payload *prep)
{
const struct fscrypt_provisioning_key_payload *payload = prep->data;
- if (prep->datalen < sizeof(*payload) + FSCRYPT_MIN_KEY_SIZE ||
- prep->datalen > sizeof(*payload) + FSCRYPT_MAX_KEY_SIZE)
+ if (prep->datalen < sizeof(*payload))
+ return -EINVAL;
+
+ if (!fscrypt_valid_key_size(prep->datalen - sizeof(*payload),
+ payload->flags))
return -EINVAL;
if (payload->type != FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR &&
payload->type != FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER)
return -EINVAL;
- if (payload->__reserved)
+ if (payload->flags & ~FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED)
return -EINVAL;
prep->payload.data[0] = kmemdup(payload, prep->datalen, GFP_KERNEL);
@@ -636,21 +674,21 @@ static struct key_type key_type_fscrypt_provisioning = {
};
/*
- * Retrieve the raw key from the Linux keyring key specified by 'key_id', and
- * store it into 'secret'.
+ * Retrieve the key from the Linux keyring key specified by 'key_id', and store
+ * it into 'secret'.
*
- * The key must be of type "fscrypt-provisioning" and must have the field
- * fscrypt_provisioning_key_payload::type set to 'type', indicating that it's
- * only usable with fscrypt with the particular KDF version identified by
- * 'type'. We don't use the "logon" key type because there's no way to
- * completely restrict the use of such keys; they can be used by any kernel API
- * that accepts "logon" keys and doesn't require a specific service prefix.
+ * The key must be of type "fscrypt-provisioning" and must have the 'type' and
+ * 'flags' field of the payload set to the given values, indicating that the key
+ * is intended for use for the specified purpose. We don't use the "logon" key
+ * type because there's no way to completely restrict the use of such keys; they
+ * can be used by any kernel API that accepts "logon" keys and doesn't require a
+ * specific service prefix.
*
* The ability to specify the key via Linux keyring key is intended for cases
* where userspace needs to re-add keys after the filesystem is unmounted and
- * re-mounted. Most users should just provide the raw key directly instead.
+ * re-mounted. Most users should just provide the key directly instead.
*/
-static int get_keyring_key(u32 key_id, u32 type,
+static int get_keyring_key(u32 key_id, u32 type, u32 flags,
struct fscrypt_master_key_secret *secret)
{
key_ref_t ref;
@@ -667,12 +705,16 @@ static int get_keyring_key(u32 key_id, u32 type,
goto bad_key;
payload = key->payload.data[0];
- /* Don't allow fscrypt v1 keys to be used as v2 keys and vice versa. */
- if (payload->type != type)
+ /*
+ * Don't allow fscrypt v1 keys to be used as v2 keys and vice versa.
+ * Similarly, don't allow hardware-wrapped keys to be used as
+ * non-hardware-wrapped keys and vice versa.
+ */
+ if (payload->type != type || payload->flags != flags)
goto bad_key;
secret->size = key->datalen - sizeof(*payload);
- memcpy(secret->raw, payload->raw, secret->size);
+ memcpy(secret->bytes, payload->raw, secret->size);
err = 0;
goto out_put;
@@ -734,19 +776,28 @@ int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg)
return -EACCES;
memset(&secret, 0, sizeof(secret));
+
+ if (arg.flags) {
+ if (arg.flags & ~FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED)
+ return -EINVAL;
+ if (arg.key_spec.type != FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER)
+ return -EINVAL;
+ secret.is_hw_wrapped = true;
+ }
+
if (arg.key_id) {
if (arg.raw_size != 0)
return -EINVAL;
- err = get_keyring_key(arg.key_id, arg.key_spec.type, &secret);
+ err = get_keyring_key(arg.key_id, arg.key_spec.type, arg.flags,
+ &secret);
if (err)
goto out_wipe_secret;
} else {
- if (arg.raw_size < FSCRYPT_MIN_KEY_SIZE ||
- arg.raw_size > FSCRYPT_MAX_KEY_SIZE)
+ if (!fscrypt_valid_key_size(arg.raw_size, arg.flags))
return -EINVAL;
secret.size = arg.raw_size;
err = -EFAULT;
- if (copy_from_user(secret.raw, uarg->raw, secret.size))
+ if (copy_from_user(secret.bytes, uarg->raw, secret.size))
goto out_wipe_secret;
}
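From userspace, the new flag rides on the existing FS_IOC_ADD_ENCRYPTION_KEY ioctl. A hedged sketch ('mnt_fd', 'wrapped_key', and 'wrapped_key_size' are placeholders; the wrapped-key blob itself is platform-specific):

	struct fscrypt_add_key_arg *arg;

	arg = calloc(1, sizeof(*arg) + wrapped_key_size);
	arg->key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
	arg->flags = FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED;
	arg->raw_size = wrapped_key_size;
	memcpy(arg->raw, wrapped_key, wrapped_key_size);
	if (ioctl(mnt_fd, FS_IOC_ADD_ENCRYPTION_KEY, arg) != 0)
		err(1, "FS_IOC_ADD_ENCRYPTION_KEY");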
@@ -770,32 +821,26 @@ EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key);
static void
fscrypt_get_test_dummy_secret(struct fscrypt_master_key_secret *secret)
{
- static u8 test_key[FSCRYPT_MAX_KEY_SIZE];
+ static u8 test_key[FSCRYPT_MAX_RAW_KEY_SIZE];
- get_random_once(test_key, FSCRYPT_MAX_KEY_SIZE);
+ get_random_once(test_key, sizeof(test_key));
memset(secret, 0, sizeof(*secret));
- secret->size = FSCRYPT_MAX_KEY_SIZE;
- memcpy(secret->raw, test_key, FSCRYPT_MAX_KEY_SIZE);
+ secret->size = sizeof(test_key);
+ memcpy(secret->bytes, test_key, sizeof(test_key));
}
-int fscrypt_get_test_dummy_key_identifier(
+void fscrypt_get_test_dummy_key_identifier(
u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
{
struct fscrypt_master_key_secret secret;
- int err;
fscrypt_get_test_dummy_secret(&secret);
-
- err = fscrypt_init_hkdf(&secret.hkdf, secret.raw, secret.size);
- if (err)
- goto out;
- err = fscrypt_hkdf_expand(&secret.hkdf, HKDF_CONTEXT_KEY_IDENTIFIER,
- NULL, 0, key_identifier,
- FSCRYPT_KEY_IDENTIFIER_SIZE);
-out:
+ fscrypt_init_hkdf(&secret.hkdf, secret.bytes, secret.size);
+ fscrypt_hkdf_expand(&secret.hkdf,
+ HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY, NULL, 0,
+ key_identifier, FSCRYPT_KEY_IDENTIFIER_SIZE);
wipe_master_key_secret(&secret);
- return err;
}
/**
@@ -900,7 +945,7 @@ static void evict_dentries_for_decrypted_inodes(struct fscrypt_master_key *mk)
list_for_each_entry(ci, &mk->mk_decrypted_inodes, ci_master_key_link) {
inode = ci->ci_inode;
spin_lock(&inode->i_lock);
- if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
+ if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) {
spin_unlock(&inode->i_lock);
continue;
}
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index b4fe01ea4bd4..40fa05688d3a 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -9,6 +9,7 @@
*/
#include <crypto/skcipher.h>
+#include <linux/export.h>
#include <linux/random.h>
#include "fscrypt_private.h"
@@ -96,14 +97,15 @@ select_encryption_mode(const union fscrypt_policy *policy,
}
/* Create a symmetric cipher object for the given encryption mode and key */
-static struct crypto_skcipher *
+static struct crypto_sync_skcipher *
fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key,
const struct inode *inode)
{
- struct crypto_skcipher *tfm;
+ struct crypto_sync_skcipher *tfm;
int err;
- tfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0);
+ tfm = crypto_alloc_sync_skcipher(mode->cipher_str, 0,
+ FSCRYPT_CRYPTOAPI_MASK);
if (IS_ERR(tfm)) {
if (PTR_ERR(tfm) == -ENOENT) {
fscrypt_warn(inode,
@@ -123,21 +125,22 @@ fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key,
* first time a mode is used.
*/
pr_info("fscrypt: %s using implementation \"%s\"\n",
- mode->friendly_name, crypto_skcipher_driver_name(tfm));
+ mode->friendly_name,
+ crypto_skcipher_driver_name(&tfm->base));
}
- if (WARN_ON_ONCE(crypto_skcipher_ivsize(tfm) != mode->ivsize)) {
+ if (WARN_ON_ONCE(crypto_sync_skcipher_ivsize(tfm) != mode->ivsize)) {
err = -EINVAL;
goto err_free_tfm;
}
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
- err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize);
+ crypto_sync_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
+ err = crypto_sync_skcipher_setkey(tfm, raw_key, mode->keysize);
if (err)
goto err_free_tfm;
return tfm;
err_free_tfm:
- crypto_free_skcipher(tfm);
+ crypto_free_sync_skcipher(tfm);
return ERR_PTR(err);
}
@@ -150,10 +153,12 @@ err_free_tfm:
int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
const u8 *raw_key, const struct fscrypt_inode_info *ci)
{
- struct crypto_skcipher *tfm;
+ struct crypto_sync_skcipher *tfm;
if (fscrypt_using_inline_encryption(ci))
- return fscrypt_prepare_inline_crypt_key(prep_key, raw_key, ci);
+ return fscrypt_prepare_inline_crypt_key(prep_key, raw_key,
+ ci->ci_mode->keysize,
+ false, ci);
tfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key, ci->ci_inode);
if (IS_ERR(tfm))
@@ -172,7 +177,7 @@ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
void fscrypt_destroy_prepared_key(struct super_block *sb,
struct fscrypt_prepared_key *prep_key)
{
- crypto_free_skcipher(prep_key->tfm);
+ crypto_free_sync_skcipher(prep_key->tfm);
fscrypt_destroy_inline_crypt_key(sb, prep_key);
memzero_explicit(prep_key, sizeof(*prep_key));
}
@@ -195,14 +200,29 @@ static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci,
struct fscrypt_mode *mode = ci->ci_mode;
const u8 mode_num = mode - fscrypt_modes;
struct fscrypt_prepared_key *prep_key;
- u8 mode_key[FSCRYPT_MAX_KEY_SIZE];
+ u8 mode_key[FSCRYPT_MAX_RAW_KEY_SIZE];
u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)];
unsigned int hkdf_infolen = 0;
+ bool use_hw_wrapped_key = false;
int err;
if (WARN_ON_ONCE(mode_num > FSCRYPT_MODE_MAX))
return -EINVAL;
+ if (mk->mk_secret.is_hw_wrapped && S_ISREG(inode->i_mode)) {
+ /* Using a hardware-wrapped key for file contents encryption */
+ if (!fscrypt_using_inline_encryption(ci)) {
+ if (sb->s_flags & SB_INLINECRYPT)
+ fscrypt_warn(ci->ci_inode,
+ "Hardware-wrapped key required, but no suitable inline encryption capabilities are available");
+ else
+ fscrypt_warn(ci->ci_inode,
+ "Hardware-wrapped keys require inline encryption (-o inlinecrypt)");
+ return -EINVAL;
+ }
+ use_hw_wrapped_key = true;
+ }
+
prep_key = &keys[mode_num];
if (fscrypt_is_key_prepared(prep_key, ci)) {
ci->ci_enc_key = *prep_key;
@@ -214,6 +234,16 @@ static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci,
if (fscrypt_is_key_prepared(prep_key, ci))
goto done_unlock;
+ if (use_hw_wrapped_key) {
+ err = fscrypt_prepare_inline_crypt_key(prep_key,
+ mk->mk_secret.bytes,
+ mk->mk_secret.size, true,
+ ci);
+ if (err)
+ goto out_unlock;
+ goto done_unlock;
+ }
+
BUILD_BUG_ON(sizeof(mode_num) != 1);
BUILD_BUG_ON(sizeof(sb->s_uuid) != 16);
BUILD_BUG_ON(sizeof(hkdf_info) != 17);
@@ -223,11 +253,8 @@ static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci,
sizeof(sb->s_uuid));
hkdf_infolen += sizeof(sb->s_uuid);
}
- err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
- hkdf_context, hkdf_info, hkdf_infolen,
- mode_key, mode->keysize);
- if (err)
- goto out_unlock;
+ fscrypt_hkdf_expand(&mk->mk_secret.hkdf, hkdf_context, hkdf_info,
+ hkdf_infolen, mode_key, mode->keysize);
err = fscrypt_prepare_key(prep_key, mode_key, ci);
memzero_explicit(mode_key, mode->keysize);
if (err)
@@ -248,36 +275,25 @@ out_unlock:
* as a pair of 64-bit words. Therefore, on big endian CPUs we have to do an
* endianness swap in order to get the same results as on little endian CPUs.
*/
-static int fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk,
- u8 context, const u8 *info,
- unsigned int infolen, siphash_key_t *key)
+static void fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk,
+ u8 context, const u8 *info,
+ unsigned int infolen, siphash_key_t *key)
{
- int err;
-
- err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen,
- (u8 *)key, sizeof(*key));
- if (err)
- return err;
-
+ fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen,
+ (u8 *)key, sizeof(*key));
BUILD_BUG_ON(sizeof(*key) != 16);
BUILD_BUG_ON(ARRAY_SIZE(key->key) != 2);
le64_to_cpus(&key->key[0]);
le64_to_cpus(&key->key[1]);
- return 0;
}
-int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
- const struct fscrypt_master_key *mk)
+void fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
+ const struct fscrypt_master_key *mk)
{
- int err;
-
- err = fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY,
- ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE,
- &ci->ci_dirhash_key);
- if (err)
- return err;
+ fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY,
+ ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE,
+ &ci->ci_dirhash_key);
ci->ci_dirhash_key_initialized = true;
- return 0;
}
void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci,
@@ -308,17 +324,12 @@ static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_inode_info *ci,
if (mk->mk_ino_hash_key_initialized)
goto unlock;
- err = fscrypt_derive_siphash_key(mk,
- HKDF_CONTEXT_INODE_HASH_KEY,
- NULL, 0, &mk->mk_ino_hash_key);
- if (err)
- goto unlock;
+ fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_INODE_HASH_KEY,
+ NULL, 0, &mk->mk_ino_hash_key);
/* pairs with smp_load_acquire() above */
smp_store_release(&mk->mk_ino_hash_key_initialized, true);
unlock:
mutex_unlock(&fscrypt_mode_key_setup_mutex);
- if (err)
- return err;
}
/*
@@ -336,6 +347,14 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci,
{
int err;
+ if (mk->mk_secret.is_hw_wrapped &&
+ !(ci->ci_policy.v2.flags & (FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 |
+ FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))) {
+ fscrypt_warn(ci->ci_inode,
+ "Hardware-wrapped keys are only supported with IV_INO_LBLK policies");
+ return -EINVAL;
+ }
+
if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
/*
* DIRECT_KEY: instead of deriving per-file encryption keys, the
@@ -362,15 +381,12 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci,
FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) {
err = fscrypt_setup_iv_ino_lblk_32_key(ci, mk);
} else {
- u8 derived_key[FSCRYPT_MAX_KEY_SIZE];
-
- err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
- HKDF_CONTEXT_PER_FILE_ENC_KEY,
- ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE,
- derived_key, ci->ci_mode->keysize);
- if (err)
- return err;
+ u8 derived_key[FSCRYPT_MAX_RAW_KEY_SIZE];
+ fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
+ HKDF_CONTEXT_PER_FILE_ENC_KEY,
+ ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE,
+ derived_key, ci->ci_mode->keysize);
err = fscrypt_set_per_file_enc_key(ci, derived_key);
memzero_explicit(derived_key, ci->ci_mode->keysize);
}
@@ -378,11 +394,8 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci,
return err;
/* Derive a secret dirhash key for directories that need it. */
- if (need_dirhash_key) {
- err = fscrypt_derive_dirhash_key(ci, mk);
- if (err)
- return err;
- }
+ if (need_dirhash_key)
+ fscrypt_derive_dirhash_key(ci, mk);
return 0;
}
@@ -445,10 +458,6 @@ static int setup_file_encryption_key(struct fscrypt_inode_info *ci,
struct fscrypt_master_key *mk;
int err;
- err = fscrypt_select_encryption_impl(ci);
- if (err)
- return err;
-
err = fscrypt_policy_to_key_spec(&ci->ci_policy, &mk_spec);
if (err)
return err;
@@ -476,6 +485,10 @@ static int setup_file_encryption_key(struct fscrypt_inode_info *ci,
if (ci->ci_policy.version != FSCRYPT_POLICY_V1)
return -ENOKEY;
+ err = fscrypt_select_encryption_impl(ci, false);
+ if (err)
+ return err;
+
/*
* As a legacy fallback for v1 policies, search for the key in
* the current task's subscribed keyrings too. Don't move this
@@ -497,9 +510,21 @@ static int setup_file_encryption_key(struct fscrypt_inode_info *ci,
goto out_release_key;
}
+ err = fscrypt_select_encryption_impl(ci, mk->mk_secret.is_hw_wrapped);
+ if (err)
+ goto out_release_key;
+
switch (ci->ci_policy.version) {
case FSCRYPT_POLICY_V1:
- err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.raw);
+ if (WARN_ON_ONCE(mk->mk_secret.is_hw_wrapped)) {
+ /*
+ * This should never happen, as adding a v1 policy key
+ * that is hardware-wrapped isn't allowed.
+ */
+ err = -EINVAL;
+ goto out_release_key;
+ }
+ err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.bytes);
break;
case FSCRYPT_POLICY_V2:
err = fscrypt_setup_v2_file_key(ci, mk, need_dirhash_key);
@@ -592,15 +617,16 @@ fscrypt_setup_encryption_info(struct inode *inode,
goto out;
/*
- * For existing inodes, multiple tasks may race to set ->i_crypt_info.
- * So use cmpxchg_release(). This pairs with the smp_load_acquire() in
- * fscrypt_get_inode_info(). I.e., here we publish ->i_crypt_info with
- * a RELEASE barrier so that other tasks can ACQUIRE it.
+ * For existing inodes, multiple tasks may race to set the inode's
+ * fscrypt info pointer. So use cmpxchg_release(). This pairs with the
+ * smp_load_acquire() in fscrypt_get_inode_info(). I.e., publish the
+ * pointer with a RELEASE barrier so that other tasks can ACQUIRE it.
*/
- if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) {
+ if (cmpxchg_release(fscrypt_inode_info_addr(inode), NULL, crypt_info) ==
+ NULL) {
/*
- * We won the race and set ->i_crypt_info to our crypt_info.
- * Now link it into the master key's inode list.
+ * We won the race and set the inode's fscrypt info to our
+ * crypt_info. Now link it into the master key's inode list.
*/
if (mk) {
crypt_info->ci_master_key = mk;
@@ -631,13 +657,13 @@ out:
* %false unless the operation being performed is needed in
* order for files (or directories) to be deleted.
*
- * Set up ->i_crypt_info, if it hasn't already been done.
+ * Set up the inode's encryption key, if it hasn't already been done.
*
- * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe. So
+ * Note: unless the key setup was already done, this isn't %GFP_NOFS-safe. So
* generally this shouldn't be called from within a filesystem transaction.
*
- * Return: 0 if ->i_crypt_info was set or was already set, *or* if the
- * encryption key is unavailable. (Use fscrypt_has_encryption_key() to
+ * Return: 0 if the key is now set up, *or* if it couldn't be set up because the
+ * needed master key is absent. (Use fscrypt_has_encryption_key() to
* distinguish these cases.) Also can return another -errno code.
*/
int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported)
@@ -691,9 +717,9 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported)
* ->i_ino doesn't need to be set yet.
* @encrypt_ret: (output) set to %true if the new inode will be encrypted
*
- * If the directory is encrypted, set up its ->i_crypt_info in preparation for
+ * If the directory is encrypted, set up its encryption key in preparation for
* encrypting the name of the new file. Also, if the new inode will be
- * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true.
+ * encrypted, set up its encryption key too and set *encrypt_ret=true.
*
* This isn't %GFP_NOFS-safe, and therefore it should be called before starting
* any filesystem transaction to create the inode. For this reason, ->i_ino
@@ -702,8 +728,8 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported)
* This doesn't persist the new inode's encryption context. That still needs to
* be done later by calling fscrypt_set_context().
*
- * Return: 0 on success, -ENOKEY if the encryption key is missing, or another
- * -errno code
+ * Return: 0 on success, -ENOKEY if a key needs to be set up for @dir or @inode
+ * but the needed master key is absent, or another -errno code
*/
int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode,
bool *encrypt_ret)
@@ -750,8 +776,16 @@ EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode);
*/
void fscrypt_put_encryption_info(struct inode *inode)
{
- put_crypt_info(inode->i_crypt_info);
- inode->i_crypt_info = NULL;
+ /*
+ * Ideally we'd start with a lightweight IS_ENCRYPTED() check here
+ * before proceeding to retrieve and check the pointer. However, during
+ * inode creation, the fscrypt_inode_info is set before S_ENCRYPTED. If
+ * an error occurs, it needs to be cleaned up regardless.
+ */
+ struct fscrypt_inode_info **ci_addr = fscrypt_inode_info_addr(inode);
+
+ put_crypt_info(*ci_addr);
+ *ci_addr = NULL;
}
EXPORT_SYMBOL(fscrypt_put_encryption_info);
@@ -800,7 +834,7 @@ int fscrypt_drop_inode(struct inode *inode)
* userspace is still using the files, inodes can be dirtied between
* then and now. We mustn't lose any writes, so skip dirty inodes here.
*/
- if (inode->i_state & I_DIRTY_ALL)
+ if (inode_state_read(inode) & I_DIRTY_ALL)
return 0;
/*
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index cf3b58ec32cc..c4d05168522b 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -48,39 +48,30 @@ static int derive_key_aes(const u8 *master_key,
const u8 nonce[FSCRYPT_FILE_NONCE_SIZE],
u8 *derived_key, unsigned int derived_keysize)
{
- int res = 0;
- struct skcipher_request *req = NULL;
- DECLARE_CRYPTO_WAIT(wait);
- struct scatterlist src_sg, dst_sg;
- struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
-
- if (IS_ERR(tfm)) {
- res = PTR_ERR(tfm);
- tfm = NULL;
- goto out;
- }
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
- req = skcipher_request_alloc(tfm, GFP_KERNEL);
- if (!req) {
- res = -ENOMEM;
- goto out;
- }
- skcipher_request_set_callback(req,
- CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- crypto_req_done, &wait);
- res = crypto_skcipher_setkey(tfm, nonce, FSCRYPT_FILE_NONCE_SIZE);
- if (res < 0)
- goto out;
+ struct crypto_sync_skcipher *tfm;
+ int err;
- sg_init_one(&src_sg, master_key, derived_keysize);
- sg_init_one(&dst_sg, derived_key, derived_keysize);
- skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize,
- NULL);
- res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
-out:
- skcipher_request_free(req);
- crypto_free_skcipher(tfm);
- return res;
+ tfm = crypto_alloc_sync_skcipher("ecb(aes)", 0, FSCRYPT_CRYPTOAPI_MASK);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ err = crypto_sync_skcipher_setkey(tfm, nonce, FSCRYPT_FILE_NONCE_SIZE);
+ if (err == 0) {
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ struct scatterlist src_sg, dst_sg;
+
+ skcipher_request_set_callback(req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG |
+ CRYPTO_TFM_REQ_MAY_SLEEP,
+ NULL, NULL);
+ sg_init_one(&src_sg, master_key, derived_keysize);
+ sg_init_one(&dst_sg, derived_key, derived_keysize);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg,
+ derived_keysize, NULL);
+ err = crypto_skcipher_encrypt(req);
+ }
+ crypto_free_sync_skcipher(tfm);
+ return err;
}
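Behaviorally the rewrite is a no-op; what the code above computes can be restated as (assuming FSCRYPT_FILE_NONCE_SIZE is 16 bytes, i.e. an AES-128 key):

	/*
	 * derived_key = AES-128-ECB-Encrypt(key  = nonce,
	 *                                   data = master_key[0..derived_keysize))
	 *
	 * The per-file nonce is the AES key and the master key is the
	 * plaintext, one 16-byte block at a time with no chaining.
	 */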
/*
@@ -118,7 +109,7 @@ find_and_lock_process_key(const char *prefix,
payload = (const struct fscrypt_key *)ukp->data;
if (ukp->datalen != sizeof(struct fscrypt_key) ||
- payload->size < 1 || payload->size > FSCRYPT_MAX_KEY_SIZE) {
+ payload->size < 1 || payload->size > sizeof(payload->raw)) {
fscrypt_warn(NULL,
"key with description '%s' has invalid payload",
key->description);
@@ -149,7 +140,7 @@ struct fscrypt_direct_key {
const struct fscrypt_mode *dk_mode;
struct fscrypt_prepared_key dk_key;
u8 dk_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
- u8 dk_raw[FSCRYPT_MAX_KEY_SIZE];
+ u8 dk_raw[FSCRYPT_MAX_RAW_KEY_SIZE];
};
static void free_direct_key(struct fscrypt_direct_key *dk)
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 701259991277..bbb2f5ced988 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -10,11 +10,13 @@
* Modified by Eric Biggers, 2019 for v2 policy support.
*/
+#include <linux/export.h>
#include <linux/fs_context.h>
+#include <linux/mount.h>
#include <linux/random.h>
#include <linux/seq_file.h>
#include <linux/string.h>
-#include <linux/mount.h>
+
#include "fscrypt_private.h"
/**
@@ -725,7 +727,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
err = fscrypt_require_key(dir);
if (err)
return ERR_PTR(err);
- return &dir->i_crypt_info->ci_policy;
+ return &fscrypt_get_inode_info_raw(dir)->ci_policy;
}
return fscrypt_get_dummy_policy(dir->i_sb);
@@ -744,7 +746,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
*/
int fscrypt_context_for_new_inode(void *ctx, struct inode *inode)
{
- struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
BUILD_BUG_ON(sizeof(union fscrypt_context) !=
FSCRYPT_SET_CONTEXT_MAX_SIZE);
@@ -769,7 +771,7 @@ EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode);
*/
int fscrypt_set_context(struct inode *inode, void *fs_data)
{
- struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ struct fscrypt_inode_info *ci;
union fscrypt_context ctx;
int ctxsize;
@@ -781,6 +783,7 @@ int fscrypt_set_context(struct inode *inode, void *fs_data)
* This may be the first time the inode number is available, so do any
* delayed key setup that requires the inode number.
*/
+ ci = fscrypt_get_inode_info_raw(inode);
if (ci->ci_policy.version == FSCRYPT_POLICY_V2 &&
(ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))
fscrypt_hash_inode_number(ci, ci->ci_master_key);
@@ -824,10 +827,8 @@ int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param,
policy->version = FSCRYPT_POLICY_V2;
policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
- err = fscrypt_get_test_dummy_key_identifier(
+ fscrypt_get_test_dummy_key_identifier(
policy->v2.master_key_identifier);
- if (err)
- goto out;
} else {
err = -EINVAL;
goto out;
diff --git a/fs/d_path.c b/fs/d_path.c
index 5f4da5c8d5db..bb365511066b 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -241,9 +241,9 @@ static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
unsigned seq;
do {
- seq = read_seqcount_begin(&fs->seq);
+ seq = read_seqbegin(&fs->seq);
*root = fs->root;
- } while (read_seqcount_retry(&fs->seq, seq));
+ } while (read_seqretry(&fs->seq, seq));
}
/**
@@ -385,10 +385,10 @@ static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
unsigned seq;
do {
- seq = read_seqcount_begin(&fs->seq);
+ seq = read_seqbegin(&fs->seq);
*root = fs->root;
*pwd = fs->pwd;
- } while (read_seqcount_retry(&fs->seq, seq));
+ } while (read_seqretry(&fs->seq, seq));
}
/*
diff --git a/fs/dax.c b/fs/dax.c
index 7fd4cd9a51f2..289e6254aa30 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -20,12 +20,11 @@
#include <linux/sched/signal.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
-#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
#include <linux/rmap.h>
-#include <asm/pgalloc.h>
+#include <linux/pgalloc.h>
#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>
@@ -71,9 +70,14 @@ static unsigned long dax_to_pfn(void *entry)
return xa_to_value(entry) >> DAX_SHIFT;
}
-static void *dax_make_entry(pfn_t pfn, unsigned long flags)
+static struct folio *dax_to_folio(void *entry)
{
- return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
+ return page_folio(pfn_to_page(dax_to_pfn(entry)));
+}
+
+static void *dax_make_entry(unsigned long pfn, unsigned long flags)
+{
+ return xa_mk_value(flags | (pfn << DAX_SHIFT));
}
static bool dax_is_locked(void *entry)
@@ -206,7 +210,7 @@ static void dax_wake_entry(struct xa_state *xas, void *entry,
*
* Must be called with the i_pages lock held.
*/
-static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
+static void *get_next_unlocked_entry(struct xa_state *xas, unsigned int order)
{
void *entry;
struct wait_exceptional_entry_queue ewait;
@@ -236,6 +240,37 @@ static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
}
/*
+ * Wait for the given entry to become unlocked. Caller must hold the i_pages
+ * lock and call either put_unlocked_entry() if it did not lock the entry or
+ * dax_unlock_entry() if it did. Returns an unlocked entry if still present.
+ */
+static void *wait_entry_unlocked_exclusive(struct xa_state *xas, void *entry)
+{
+ struct wait_exceptional_entry_queue ewait;
+ wait_queue_head_t *wq;
+
+ init_wait(&ewait.wait);
+ ewait.wait.func = wake_exceptional_entry_func;
+
+ while (unlikely(dax_is_locked(entry))) {
+ wq = dax_entry_waitqueue(xas, entry, &ewait.key);
+ prepare_to_wait_exclusive(wq, &ewait.wait,
+ TASK_UNINTERRUPTIBLE);
+ xas_reset(xas);
+ xas_unlock_irq(xas);
+ schedule();
+ finish_wait(wq, &ewait.wait);
+ xas_lock_irq(xas);
+ entry = xas_load(xas);
+ }
+
+ if (xa_is_internal(entry))
+ return NULL;
+
+ return entry;
+}
+
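The iteration pattern the new helper is designed for, used as-is by the __dax_clear_dirty_range() and dax_delete_mapping_range() hunks below:

	xas_lock_irq(&xas);
	xas_for_each(&xas, entry, end_idx) {
		/* May drop and retake the lock; NULL if the entry went away. */
		entry = wait_entry_unlocked_exclusive(&xas, entry);
		if (!entry)
			continue;
		/* ... operate on the now-unlocked entry ... */
		put_unlocked_entry(&xas, entry, WAKE_NEXT);
	}
	xas_unlock_irq(&xas);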
+/*
* The only thing keeping the address space around is the i_pages lock
* (it's cycled in clear_inode() after removing the entries from i_pages)
* After we call xas_unlock_irq(), we cannot touch xas->xa.
@@ -250,7 +285,7 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)
wq = dax_entry_waitqueue(xas, entry, &ewait.key);
/*
- * Unlike get_unlocked_entry() there is no guarantee that this
+ * Unlike get_next_unlocked_entry() there is no guarantee that this
* path ever successfully retrieves an unlocked entry before an
* inode dies. Perform a non-exclusive wait in case this path
* never successfully performs its own wake up.
@@ -307,109 +342,151 @@ static unsigned long dax_entry_size(void *entry)
return PAGE_SIZE;
}
-static unsigned long dax_end_pfn(void *entry)
+/*
+ * A DAX folio is considered shared if it has no mapping set and ->share (which
+ * shares the ->index field) is non-zero. Note this may return false even if the
+ * page is shared between multiple files but has not yet actually been mapped
+ * into multiple address spaces.
+ */
+static inline bool dax_folio_is_shared(struct folio *folio)
{
- return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
+ return !folio->mapping && folio->share;
}
/*
- * Iterate through all mapped pfns represented by an entry, i.e. skip
- * 'empty' and 'zero' entries.
+ * When called from dax_insert_entry(), the shared flag indicates whether
+ * this entry is shared by multiple files. If the page has not previously
+ * been associated with any mappings, the ->mapping and ->index fields will
+ * be set. If it has already been associated with a mapping, the mapping
+ * will be cleared and the share count set. It's then up to reverse-map
+ * users like memory_failure() to call back into the filesystem to recover
+ * ->mapping and ->index information, for example by implementing
+ * dax_holder_operations.
*/
-#define for_each_mapped_pfn(entry, pfn) \
- for (pfn = dax_to_pfn(entry); \
- pfn < dax_end_pfn(entry); pfn++)
-
-static inline bool dax_page_is_shared(struct page *page)
+static void dax_folio_make_shared(struct folio *folio)
{
- return page->mapping == PAGE_MAPPING_DAX_SHARED;
+ /*
+ * folio is not currently shared so mark it as shared by clearing
+ * folio->mapping.
+ */
+ folio->mapping = NULL;
+
+ /*
+ * folio has previously been mapped into one address space so set the
+ * share count.
+ */
+ folio->share = 1;
}
-/*
- * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
- * refcount.
- */
-static inline void dax_page_share_get(struct page *page)
+static inline unsigned long dax_folio_put(struct folio *folio)
{
- if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
+ unsigned long ref;
+ int order, i;
+
+ if (!dax_folio_is_shared(folio))
+ ref = 0;
+ else
+ ref = --folio->share;
+
+ if (ref)
+ return ref;
+
+ folio->mapping = NULL;
+ order = folio_order(folio);
+ if (!order)
+ return 0;
+ folio_reset_order(folio);
+
+ for (i = 0; i < (1UL << order); i++) {
+ struct dev_pagemap *pgmap = page_pgmap(&folio->page);
+ struct page *page = folio_page(folio, i);
+ struct folio *new_folio = (struct folio *)page;
+
+ ClearPageHead(page);
+ clear_compound_head(page);
+
+ new_folio->mapping = NULL;
/*
- * Reset the index if the page was already mapped
- * regularly before.
+ * Reset pgmap which was over-written by
+ * prep_compound_page().
*/
- if (page->mapping)
- page->share = 1;
- page->mapping = PAGE_MAPPING_DAX_SHARED;
+ new_folio->pgmap = pgmap;
+ new_folio->share = 0;
+ WARN_ON_ONCE(folio_ref_count(new_folio));
}
- page->share++;
+
+ return ref;
}
-static inline unsigned long dax_page_share_put(struct page *page)
+static void dax_folio_init(void *entry)
{
- return --page->share;
+ struct folio *folio = dax_to_folio(entry);
+ int order = dax_entry_order(entry);
+
+ /*
+ * Folio should have been split back to order-0 pages in
+ * dax_folio_put() when they were removed from their
+ * final mapping.
+ */
+ WARN_ON_ONCE(folio_order(folio));
+
+ if (order > 0) {
+ prep_compound_page(&folio->page, order);
+ if (order > 1)
+ INIT_LIST_HEAD(&folio->_deferred_list);
+ WARN_ON_ONCE(folio_ref_count(folio));
+ }
}
-/*
- * When it is called in dax_insert_entry(), the shared flag will indicate that
- * whether this entry is shared by multiple files. If so, set the page->mapping
- * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
- */
static void dax_associate_entry(void *entry, struct address_space *mapping,
- struct vm_area_struct *vma, unsigned long address, bool shared)
+ struct vm_area_struct *vma,
+ unsigned long address, bool shared)
{
- unsigned long size = dax_entry_size(entry), pfn, index;
- int i = 0;
+ unsigned long size = dax_entry_size(entry), index;
+ struct folio *folio = dax_to_folio(entry);
- if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+ if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
return;
index = linear_page_index(vma, address & ~(size - 1));
- for_each_mapped_pfn(entry, pfn) {
- struct page *page = pfn_to_page(pfn);
+ if (shared && (folio->mapping || dax_folio_is_shared(folio))) {
+ if (folio->mapping)
+ dax_folio_make_shared(folio);
- if (shared) {
- dax_page_share_get(page);
- } else {
- WARN_ON_ONCE(page->mapping);
- page->mapping = mapping;
- page->index = index + i++;
- }
+ WARN_ON_ONCE(!folio->share);
+ WARN_ON_ONCE(dax_entry_order(entry) != folio_order(folio));
+ folio->share++;
+ } else {
+ WARN_ON_ONCE(folio->mapping);
+ dax_folio_init(entry);
+ folio = dax_to_folio(entry);
+ folio->mapping = mapping;
+ folio->index = index;
}
}
static void dax_disassociate_entry(void *entry, struct address_space *mapping,
- bool trunc)
+ bool trunc)
{
- unsigned long pfn;
+ struct folio *folio = dax_to_folio(entry);
- if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+ if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
return;
- for_each_mapped_pfn(entry, pfn) {
- struct page *page = pfn_to_page(pfn);
-
- WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
- if (dax_page_is_shared(page)) {
- /* keep the shared flag if this page is still shared */
- if (dax_page_share_put(page) > 0)
- continue;
- } else
- WARN_ON_ONCE(page->mapping && page->mapping != mapping);
- page->mapping = NULL;
- page->index = 0;
- }
+ dax_folio_put(folio);
}
static struct page *dax_busy_page(void *entry)
{
- unsigned long pfn;
+ struct folio *folio = dax_to_folio(entry);
- for_each_mapped_pfn(entry, pfn) {
- struct page *page = pfn_to_page(pfn);
+ if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
+ return NULL;
- if (page_ref_count(page) > 1)
- return page;
- }
- return NULL;
+ if (folio_ref_count(folio) - folio_mapcount(folio))
+ return &folio->page;
+ else
+ return NULL;
}
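A worked example of the new share accounting; a sketch of the state transitions, not kernel code:

	/*
	 * map into file A:  folio->mapping = A, folio->index set
	 * map into file B:  dax_folio_make_shared(): mapping = NULL, share = 1,
	 *                   then dax_associate_entry(): share++ -> share = 2
	 * one unmap:        dax_folio_put(): --share -> 1 (still shared)
	 * last unmap:       --share -> 0; mapping cleared, and any high-order
	 *                   folio is split back to order-0 pages
	 */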
/**
@@ -580,7 +657,7 @@ static void *grab_mapping_entry(struct xa_state *xas,
retry:
pmd_downgrade = false;
xas_lock_irq(xas);
- entry = get_unlocked_entry(xas, order);
+ entry = get_next_unlocked_entry(xas, order);
if (entry) {
if (dax_is_conflict(entry))
@@ -635,7 +712,7 @@ retry:
if (order > 0)
flags |= DAX_PMD;
- entry = dax_make_entry(pfn_to_pfn_t(0), flags);
+ entry = dax_make_entry(0, flags);
dax_lock_entry(xas, entry);
if (xas_error(xas))
goto out_unlock;
@@ -684,13 +761,7 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping,
pgoff_t end_idx;
XA_STATE(xas, &mapping->i_pages, start_idx);
- /*
- * In the 'limited' case get_user_pages() for dax is disabled.
- */
- if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
- return NULL;
-
- if (!dax_mapping(mapping) || !mapping_mapped(mapping))
+ if (!dax_mapping(mapping))
return NULL;
/* If end == LLONG_MAX, all pages from start to till end of file */
@@ -716,8 +787,7 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping,
xas_for_each(&xas, entry, end_idx) {
if (WARN_ON_ONCE(!xa_is_value(entry)))
continue;
- if (unlikely(dax_is_locked(entry)))
- entry = get_unlocked_entry(&xas, 0);
+ entry = wait_entry_unlocked_exclusive(&xas, entry);
if (entry)
page = dax_busy_page(entry);
put_unlocked_entry(&xas, entry, WAKE_NEXT);
@@ -743,14 +813,14 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
EXPORT_SYMBOL_GPL(dax_layout_busy_page);
static int __dax_invalidate_entry(struct address_space *mapping,
- pgoff_t index, bool trunc)
+ pgoff_t index, bool trunc)
{
XA_STATE(xas, &mapping->i_pages, index);
int ret = 0;
void *entry;
xas_lock_irq(&xas);
- entry = get_unlocked_entry(&xas, 0);
+ entry = get_next_unlocked_entry(&xas, 0);
if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
goto out;
if (!trunc &&
@@ -776,7 +846,9 @@ static int __dax_clear_dirty_range(struct address_space *mapping,
xas_lock_irq(&xas);
xas_for_each(&xas, entry, end) {
- entry = get_unlocked_entry(&xas, 0);
+ entry = wait_entry_unlocked_exclusive(&xas, entry);
+ if (!entry)
+ continue;
xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
put_unlocked_entry(&xas, entry, WAKE_NEXT);
@@ -813,6 +885,107 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
return ret;
}
+void dax_delete_mapping_range(struct address_space *mapping,
+ loff_t start, loff_t end)
+{
+ void *entry;
+ pgoff_t start_idx = start >> PAGE_SHIFT;
+ pgoff_t end_idx;
+ XA_STATE(xas, &mapping->i_pages, start_idx);
+
+	/* If end == LLONG_MAX, all pages from start till the end of the file */
+ if (end == LLONG_MAX)
+ end_idx = ULONG_MAX;
+ else
+ end_idx = end >> PAGE_SHIFT;
+
+ xas_lock_irq(&xas);
+ xas_for_each(&xas, entry, end_idx) {
+ if (!xa_is_value(entry))
+ continue;
+ entry = wait_entry_unlocked_exclusive(&xas, entry);
+ if (!entry)
+ continue;
+ dax_disassociate_entry(entry, mapping, true);
+ xas_store(&xas, NULL);
+ mapping->nrpages -= 1UL << dax_entry_order(entry);
+ put_unlocked_entry(&xas, entry, WAKE_ALL);
+ }
+ xas_unlock_irq(&xas);
+}
+EXPORT_SYMBOL_GPL(dax_delete_mapping_range);
+
+static int wait_page_idle(struct page *page,
+ void (cb)(struct inode *),
+ struct inode *inode)
+{
+ return ___wait_var_event(page, dax_page_is_idle(page),
+ TASK_INTERRUPTIBLE, 0, 0, cb(inode));
+}
+
+static void wait_page_idle_uninterruptible(struct page *page,
+ struct inode *inode)
+{
+ ___wait_var_event(page, dax_page_is_idle(page),
+ TASK_UNINTERRUPTIBLE, 0, 0, schedule());
+}
+
+/*
+ * Unmaps the inode and waits for any DMA to complete prior to deleting the
+ * DAX mapping entries for the range.
+ *
+ * For NOWAIT behavior, pass @cb as NULL to exit early on the first busy
+ * page found.
+ */
+int dax_break_layout(struct inode *inode, loff_t start, loff_t end,
+ void (cb)(struct inode *))
+{
+ struct page *page;
+ int error = 0;
+
+ if (!dax_mapping(inode->i_mapping))
+ return 0;
+
+ do {
+ page = dax_layout_busy_page_range(inode->i_mapping, start, end);
+ if (!page)
+ break;
+ if (!cb) {
+ error = -ERESTARTSYS;
+ break;
+ }
+
+ error = wait_page_idle(page, cb, inode);
+ } while (error == 0);
+
+ if (!page)
+ dax_delete_mapping_range(inode->i_mapping, start, end);
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(dax_break_layout);
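To make the calling convention concrete, here is a hedged sketch of how a
filesystem's truncate path might drive dax_break_layout(); the callback
name and the choice of lock are illustrative assumptions, not part of this
patch:

	/* Illustrative: drop the invalidate lock while waiting, then retake it. */
	static void myfs_wait_dax_page(struct inode *inode)
	{
		filemap_invalidate_unlock(inode->i_mapping);
		schedule();
		filemap_invalidate_lock(inode->i_mapping);
	}

	/* In the truncate path, with the invalidate lock held: */
	error = dax_break_layout(inode, 0, LLONG_MAX, myfs_wait_dax_page);
	if (error)
		return error;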
+
+void dax_break_layout_final(struct inode *inode)
+{
+ struct page *page;
+
+ if (!dax_mapping(inode->i_mapping))
+ return;
+
+ do {
+ page = dax_layout_busy_page_range(inode->i_mapping, 0,
+ LLONG_MAX);
+ if (!page)
+ break;
+
+ wait_page_idle_uninterruptible(page, inode);
+ } while (true);
+
+ if (!page)
+ dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX);
+}
+EXPORT_SYMBOL_GPL(dax_break_layout_final);
+
/*
* Invalidate DAX entry if it is clean.
*/
@@ -867,7 +1040,7 @@ static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
* appropriate.
*/
static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
- const struct iomap_iter *iter, void *entry, pfn_t pfn,
+ const struct iomap_iter *iter, void *entry, unsigned long pfn,
unsigned long flags)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
@@ -895,8 +1068,9 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
void *old;
dax_disassociate_entry(entry, mapping, false);
- dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
- shared);
+ dax_associate_entry(new_entry, mapping, vmf->vma,
+ vmf->address, shared);
+
/*
* Only swap our new entry into the page cache if the current
* entry is a zero page or an empty entry. If a normal PTE or
@@ -940,7 +1114,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
if (unlikely(dax_is_locked(entry))) {
void *old_entry = entry;
- entry = get_unlocked_entry(xas, 0);
+ entry = get_next_unlocked_entry(xas, 0);
/* Entry got punched out / reallocated? */
if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
@@ -1064,7 +1238,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
- size_t size, void **kaddr, pfn_t *pfnp)
+ size_t size, void **kaddr, unsigned long *pfnp)
{
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
int id, rc = 0;
@@ -1082,11 +1256,9 @@ static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
rc = -EINVAL;
if (PFN_PHYS(length) < size)
goto out;
- if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
- goto out;
- /* For larger pages we need devmap */
- if (length > 1 && !pfn_t_devmap(*pfnp))
+ if (*pfnp & (PHYS_PFN(size)-1))
goto out;
+
rc = 0;
out_check_addr:
@@ -1188,12 +1360,12 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
{
struct inode *inode = iter->inode;
unsigned long vaddr = vmf->address;
- pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
+ unsigned long pfn = my_zero_pfn(vaddr);
vm_fault_t ret;
*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
- ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
+ ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), false);
trace_dax_load_hole(inode, vmf, ret);
return ret;
}
@@ -1203,52 +1375,24 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
const struct iomap_iter *iter, void **entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
- unsigned long pmd_addr = vmf->address & PMD_MASK;
- struct vm_area_struct *vma = vmf->vma;
struct inode *inode = mapping->host;
- pgtable_t pgtable = NULL;
struct folio *zero_folio;
- spinlock_t *ptl;
- pmd_t pmd_entry;
- pfn_t pfn;
+ vm_fault_t ret;
zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);
- if (unlikely(!zero_folio))
- goto fallback;
-
- pfn = page_to_pfn_t(&zero_folio->page);
- *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
- DAX_PMD | DAX_ZERO_PAGE);
-
- if (arch_needs_pgtable_deposit()) {
- pgtable = pte_alloc_one(vma->vm_mm);
- if (!pgtable)
- return VM_FAULT_OOM;
- }
-
- ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
- if (!pmd_none(*(vmf->pmd))) {
- spin_unlock(ptl);
- goto fallback;
+ if (unlikely(!zero_folio)) {
+ trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry);
+ return VM_FAULT_FALLBACK;
}
- if (pgtable) {
- pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
- mm_inc_nr_ptes(vma->vm_mm);
- }
- pmd_entry = mk_pmd(&zero_folio->page, vmf->vma->vm_page_prot);
- pmd_entry = pmd_mkhuge(pmd_entry);
- set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
- spin_unlock(ptl);
- trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
- return VM_FAULT_NOPAGE;
+ *entry = dax_insert_entry(xas, vmf, iter, *entry, folio_pfn(zero_folio),
+ DAX_PMD | DAX_ZERO_PAGE);
-fallback:
- if (pgtable)
- pte_free(vma->vm_mm, pgtable);
- trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry);
- return VM_FAULT_FALLBACK;
+ ret = vmf_insert_folio_pmd(vmf, zero_folio, false);
+ if (ret == VM_FAULT_NOPAGE)
+ trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
+ return ret;
}
#else
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
@@ -1363,7 +1507,7 @@ static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
/* already zeroed? we're done. */
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
- return iomap_iter_advance(iter, &length);
+ return iomap_iter_advance(iter, length);
/*
* invalidate the pages whose sharing state is to be changed
@@ -1392,10 +1536,10 @@ static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
if (ret < 0)
return ret;
- ret = iomap_iter_advance(iter, &length);
+ ret = iomap_iter_advance(iter, length);
if (ret)
return ret;
- } while (length > 0);
+ } while ((length = iomap_length(iter)) > 0);
if (did_zero)
*did_zero = true;
@@ -1453,7 +1597,7 @@ static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) {
done = iov_iter_zero(min(length, end - pos), iter);
- return iomap_iter_advance(iomi, &done);
+ return iomap_iter_advance(iomi, done);
}
}
@@ -1537,12 +1681,12 @@ static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
map_len, iter);
- length = xfer;
- ret = iomap_iter_advance(iomi, &length);
+ ret = iomap_iter_advance(iomi, xfer);
if (!ret && xfer == 0)
ret = -EFAULT;
if (xfer < map_len)
break;
+ length = iomap_length(iomi);
}
dax_read_unlock(id);
@@ -1572,13 +1716,16 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
loff_t done = 0;
int ret;
+ if (WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC))
+ return -EIO;
+
if (!iomi.len)
return 0;
if (iov_iter_rw(iter) == WRITE) {
lockdep_assert_held_write(&iomi.inode->i_rwsem);
iomi.flags |= IOMAP_WRITE;
- } else {
+ } else if (!sb_rdonly(iomi.inode->i_sb)) {
lockdep_assert_held(&iomi.inode->i_rwsem);
}
@@ -1607,7 +1754,8 @@ static vm_fault_t dax_fault_return(int error)
* insertion for now and return the pfn so that caller can insert it after the
* fsync is done.
*/
-static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
+static vm_fault_t dax_fault_synchronous_pfnp(unsigned long *pfnp,
+ unsigned long pfn)
{
if (WARN_ON_ONCE(!pfnp))
return VM_FAULT_SIGBUS;
@@ -1655,7 +1803,7 @@ static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
* @pmd: distinguish whether it is a pmd fault
*/
static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
- const struct iomap_iter *iter, pfn_t *pfnp,
+ const struct iomap_iter *iter, unsigned long *pfnp,
struct xa_state *xas, void **entry, bool pmd)
{
const struct iomap *iomap = &iter->iomap;
@@ -1664,8 +1812,9 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
bool write = iter->flags & IOMAP_WRITE;
unsigned long entry_flags = pmd ? DAX_PMD : 0;
- int err = 0;
- pfn_t pfn;
+ struct folio *folio;
+ int ret, err = 0;
+ unsigned long pfn;
void *kaddr;
if (!pmd && vmf->cow_page)
@@ -1696,20 +1845,21 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
return dax_fault_return(err);
}
+ folio = dax_to_folio(*entry);
if (dax_fault_is_synchronous(iter, vmf->vma))
return dax_fault_synchronous_pfnp(pfnp, pfn);
- /* insert PMD pfn */
+ folio_ref_inc(folio);
if (pmd)
- return vmf_insert_pfn_pmd(vmf, pfn, write);
+ ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn), write);
+ else
+ ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), write);
+ folio_put(folio);
- /* insert PTE pfn */
- if (write)
- return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
- return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
+ return ret;
}
-static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, unsigned long *pfnp,
int *iomap_errp, const struct iomap_ops *ops)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
@@ -1750,7 +1900,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
* the PTE we need to set up. If so just return and the fault will be
* retried.
*/
- if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
+ if (pmd_trans_huge(*vmf->pmd)) {
ret = VM_FAULT_NOPAGE;
goto unlock_entry;
}
@@ -1769,10 +1919,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
ret |= VM_FAULT_MAJOR;
}
- if (!(ret & VM_FAULT_ERROR)) {
- u64 length = PAGE_SIZE;
- iter.status = iomap_iter_advance(&iter, &length);
- }
+ if (!(ret & VM_FAULT_ERROR))
+ iter.status = iomap_iter_advance(&iter, PAGE_SIZE);
}
if (iomap_errp)
@@ -1821,7 +1969,7 @@ static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
return false;
}
-static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp,
const struct iomap_ops *ops)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
@@ -1873,8 +2021,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* the PMD we need to set up. If so just return and the fault will be
* retried.
*/
- if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
- !pmd_devmap(*vmf->pmd)) {
+ if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd)) {
ret = 0;
goto unlock_entry;
}
@@ -1885,10 +2032,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
continue; /* actually breaks out of the loop */
ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
- if (ret != VM_FAULT_FALLBACK) {
- u64 length = PMD_SIZE;
- iter.status = iomap_iter_advance(&iter, &length);
- }
+ if (ret != VM_FAULT_FALLBACK)
+ iter.status = iomap_iter_advance(&iter, PMD_SIZE);
}
unlock_entry:
@@ -1903,7 +2048,7 @@ out:
return ret;
}
#else
-static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp,
const struct iomap_ops *ops)
{
return VM_FAULT_FALLBACK;
@@ -1924,7 +2069,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* successfully.
*/
vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
- pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
+ unsigned long *pfnp, int *iomap_errp,
+ const struct iomap_ops *ops)
{
if (order == 0)
return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
@@ -1944,16 +2090,17 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault);
* This function inserts a writeable PTE or PMD entry into the page tables
* for an mmaped DAX file. It also marks the page cache entry as dirty.
*/
-static vm_fault_t
-dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
+static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf,
+ unsigned long pfn, unsigned int order)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
+ struct folio *folio;
void *entry;
vm_fault_t ret;
xas_lock_irq(&xas);
- entry = get_unlocked_entry(&xas, order);
+ entry = get_next_unlocked_entry(&xas, order);
/* Did we race with someone splitting entry or so? */
if (!entry || dax_is_conflict(entry) ||
(order == 0 && !dax_is_pte_entry(entry))) {
@@ -1966,14 +2113,17 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
dax_lock_entry(&xas, entry);
xas_unlock_irq(&xas);
+ folio = pfn_folio(pfn);
+ folio_ref_inc(folio);
if (order == 0)
- ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
+ ret = vmf_insert_page_mkwrite(vmf, &folio->page, true);
#ifdef CONFIG_FS_DAX_PMD
else if (order == PMD_ORDER)
- ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
+ ret = vmf_insert_folio_pmd(vmf, folio, FAULT_FLAG_WRITE);
#endif
else
ret = VM_FAULT_FALLBACK;
+ folio_put(folio);
dax_unlock_entry(&xas, entry);
trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
return ret;
@@ -1990,7 +2140,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
* table entry.
*/
vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
- pfn_t pfn)
+ unsigned long pfn)
{
int err;
loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
@@ -2009,7 +2159,6 @@ static int dax_range_compare_iter(struct iomap_iter *it_src,
const struct iomap *smap = &it_src->iomap;
const struct iomap *dmap = &it_dest->iomap;
loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
- u64 dest_len;
void *saddr, *daddr;
int id, ret;
@@ -2042,10 +2191,9 @@ static int dax_range_compare_iter(struct iomap_iter *it_src,
dax_read_unlock(id);
advance:
- dest_len = len;
- ret = iomap_iter_advance(it_src, &len);
+ ret = iomap_iter_advance(it_src, len);
if (!ret)
- ret = iomap_iter_advance(it_dest, &dest_len);
+ ret = iomap_iter_advance(it_dest, len);
return ret;
out_unlock:
diff --git a/fs/dcache.c b/fs/dcache.c
index bd5aa136153a..dc2fff4811d1 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -74,10 +74,11 @@
* arbitrary, since it's serialized on rename_lock
*/
static int sysctl_vfs_cache_pressure __read_mostly = 100;
+static int sysctl_vfs_cache_pressure_denom __read_mostly = 100;
unsigned long vfs_pressure_ratio(unsigned long val)
{
- return mult_frac(val, sysctl_vfs_cache_pressure, 100);
+ return mult_frac(val, sysctl_vfs_cache_pressure, sysctl_vfs_cache_pressure_denom);
}
EXPORT_SYMBOL_GPL(vfs_pressure_ratio);
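Worked example: with vfs_cache_pressure = 50 and
vfs_cache_pressure_denom = 200, a shrinker that would otherwise scan 1000
dentries is scaled to 1000 * 50 / 200 = 250, a quarter of the default
pressure. mult_frac() divides before multiplying to limit intermediate
overflow; a simplified sketch of its definition (the real kernel macro
also caches its arguments to avoid double evaluation):

	#define mult_frac(x, n, d)			\
	({						\
		typeof(x) q = (x) / (d);		\
		typeof(x) r = (x) % (d);		\
		q * (n) + r * (n) / (d);		\
	})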
@@ -85,7 +86,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
EXPORT_SYMBOL(rename_lock);
-static struct kmem_cache *dentry_cache __ro_after_init;
+static struct kmem_cache *__dentry_cache __ro_after_init;
+#define dentry_cache runtime_const_ptr(__dentry_cache)
const struct qstr empty_name = QSTR_INIT("", 0);
EXPORT_SYMBOL(empty_name);
@@ -225,6 +227,14 @@ static const struct ctl_table vm_dcache_sysctls[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
+ {
+ .procname = "vfs_cache_pressure_denom",
+ .data = &sysctl_vfs_cache_pressure_denom,
+ .maxlen = sizeof(sysctl_vfs_cache_pressure_denom),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE_HUNDRED,
+ },
};
static int __init init_fs_dcache_sysctls(void)
@@ -785,7 +795,7 @@ void d_mark_dontcache(struct inode *inode)
de->d_flags |= DCACHE_DONTCACHE;
spin_unlock(&de->d_lock);
}
- inode->i_state |= I_DONTCACHE;
+ inode_state_set(inode, I_DONTCACHE);
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_mark_dontcache);
@@ -860,6 +870,24 @@ locked:
return false;
}
+static void finish_dput(struct dentry *dentry)
+ __releases(dentry->d_lock)
+ __releases(RCU)
+{
+ while (lock_for_kill(dentry)) {
+ rcu_read_unlock();
+ dentry = __dentry_kill(dentry);
+ if (!dentry)
+ return;
+ if (retain_dentry(dentry, true)) {
+ spin_unlock(&dentry->d_lock);
+ return;
+ }
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+ spin_unlock(&dentry->d_lock);
+}
/*
* This is dput
@@ -897,22 +925,21 @@ void dput(struct dentry *dentry)
rcu_read_unlock();
return;
}
- while (lock_for_kill(dentry)) {
- rcu_read_unlock();
- dentry = __dentry_kill(dentry);
- if (!dentry)
- return;
- if (retain_dentry(dentry, true)) {
- spin_unlock(&dentry->d_lock);
- return;
- }
- rcu_read_lock();
- }
- rcu_read_unlock();
- spin_unlock(&dentry->d_lock);
+ finish_dput(dentry);
}
EXPORT_SYMBOL(dput);
+void d_make_discardable(struct dentry *dentry)
+{
+ spin_lock(&dentry->d_lock);
+ WARN_ON(!(dentry->d_flags & DCACHE_PERSISTENT));
+ dentry->d_flags &= ~DCACHE_PERSISTENT;
+ dentry->d_lockref.count--;
+ rcu_read_lock();
+ finish_dput(dentry);
+}
+EXPORT_SYMBOL(d_make_discardable);
+
static void to_shrink_list(struct dentry *dentry, struct list_head *list)
__must_hold(&dentry->d_lock)
{
@@ -1064,7 +1091,7 @@ struct dentry *d_find_alias_rcu(struct inode *inode)
spin_lock(&inode->i_lock);
// ->i_dentry and ->i_rcu are colocated, but the latter won't be
// used without having I_FREEING set, which means no aliases left
- if (likely(!(inode->i_state & I_FREEING) && !hlist_empty(l))) {
+ if (likely(!(inode_state_read(inode) & I_FREEING) && !hlist_empty(l))) {
if (S_ISDIR(inode->i_mode)) {
de = hlist_entry(l->first, struct dentry, d_u.d_alias);
} else {
@@ -1077,6 +1104,15 @@ struct dentry *d_find_alias_rcu(struct inode *inode)
return de;
}
+void d_dispose_if_unused(struct dentry *dentry, struct list_head *dispose)
+{
+ spin_lock(&dentry->d_lock);
+ if (!dentry->d_lockref.count)
+ to_shrink_list(dentry, dispose);
+ spin_unlock(&dentry->d_lock);
+}
+EXPORT_SYMBOL(d_dispose_if_unused);
+
/*
* Try to kill dentries associated with this inode.
* WARNING: you must own a reference to inode.
@@ -1087,12 +1123,8 @@ void d_prune_aliases(struct inode *inode)
struct dentry *dentry;
spin_lock(&inode->i_lock);
- hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
- spin_lock(&dentry->d_lock);
- if (!dentry->d_lockref.count)
- to_shrink_list(dentry, &dispose);
- spin_unlock(&dentry->d_lock);
- }
+ hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias)
+ d_dispose_if_unused(dentry, &dispose);
spin_unlock(&inode->i_lock);
shrink_dentry_list(&dispose);
}
@@ -1132,6 +1164,7 @@ void shrink_dentry_list(struct list_head *list)
shrink_kill(dentry);
}
}
+EXPORT_SYMBOL(shrink_dentry_list);
static enum lru_status dentry_lru_isolate(struct list_head *item,
struct list_lru_one *lru, void *arg)
@@ -1381,6 +1414,7 @@ struct check_mount {
unsigned int mounted;
};
+/* locks: mount_locked_reader && dentry->d_lock */
static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry)
{
struct check_mount *info = data;
@@ -1407,9 +1441,8 @@ int path_has_submounts(const struct path *parent)
{
struct check_mount data = { .mnt = parent->mnt, .mounted = 0 };
- read_seqlock_excl(&mount_lock);
+ guard(mount_locked_reader)();
d_walk(parent->dentry, &data, path_check_mount);
- read_sequnlock_excl(&mount_lock);
return data.mounted;
}
@@ -1427,7 +1460,7 @@ int d_set_mounted(struct dentry *dentry)
{
struct dentry *p;
int ret = -ENOENT;
- write_seqlock(&rename_lock);
+ read_seqlock_excl(&rename_lock);
for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
/* Need exclusion wrt. d_invalidate() */
spin_lock(&p->d_lock);
@@ -1447,7 +1480,7 @@ int d_set_mounted(struct dentry *dentry)
}
spin_unlock(&dentry->d_lock);
out:
- write_sequnlock(&rename_lock);
+ read_sequnlock_excl(&rename_lock);
return ret;
}
@@ -1502,6 +1535,15 @@ out:
return ret;
}
+static enum d_walk_ret select_collect_umount(void *_data, struct dentry *dentry)
+{
+ if (dentry->d_flags & DCACHE_PERSISTENT) {
+ dentry->d_flags &= ~DCACHE_PERSISTENT;
+ dentry->d_lockref.count--;
+ }
+ return select_collect(_data, dentry);
+}
+
static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry)
{
struct select_data *data = _data;
@@ -1530,18 +1572,20 @@ out:
}
/**
- * shrink_dcache_parent - prune dcache
+ * shrink_dcache_tree - prune dcache
* @parent: parent of entries to prune
+ * @for_umount: true if we want to unpin the persistent ones
*
* Prune the dcache to remove unused children of the parent dentry.
*/
-void shrink_dcache_parent(struct dentry *parent)
+static void shrink_dcache_tree(struct dentry *parent, bool for_umount)
{
for (;;) {
struct select_data data = {.start = parent};
INIT_LIST_HEAD(&data.dispose);
- d_walk(parent, &data, select_collect);
+ d_walk(parent, &data,
+ for_umount ? select_collect_umount : select_collect);
if (!list_empty(&data.dispose)) {
shrink_dentry_list(&data.dispose);
@@ -1566,6 +1610,11 @@ void shrink_dcache_parent(struct dentry *parent)
shrink_dentry_list(&data.dispose);
}
}
+
+void shrink_dcache_parent(struct dentry *parent)
+{
+ shrink_dcache_tree(parent, false);
+}
EXPORT_SYMBOL(shrink_dcache_parent);
static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
@@ -1592,7 +1641,7 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
static void do_one_tree(struct dentry *dentry)
{
- shrink_dcache_parent(dentry);
+ shrink_dcache_tree(dentry, true);
d_walk(dentry, dentry, umount_check);
d_drop(dentry);
dput(dentry);
@@ -1708,13 +1757,13 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
dname = dentry->d_shortname.string;
}
- dentry->d_name.len = name->len;
- dentry->d_name.hash = name->hash;
+ dentry->__d_name.len = name->len;
+ dentry->__d_name.hash = name->hash;
memcpy(dname, name->name, name->len);
dname[name->len] = 0;
/* Make sure we always see the terminating NUL character */
- smp_store_release(&dentry->d_name.name, dname); /* ^^^ */
+ smp_store_release(&dentry->__d_name.name, dname); /* ^^^ */
dentry->d_flags = 0;
lockref_init(&dentry->d_lockref);
@@ -1722,14 +1771,14 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
dentry->d_inode = NULL;
dentry->d_parent = dentry;
dentry->d_sb = sb;
- dentry->d_op = NULL;
+ dentry->d_op = sb->__s_d_op;
+ dentry->d_flags = sb->s_d_flags;
dentry->d_fsdata = NULL;
INIT_HLIST_BL_NODE(&dentry->d_hash);
INIT_LIST_HEAD(&dentry->d_lru);
INIT_HLIST_HEAD(&dentry->d_children);
INIT_HLIST_NODE(&dentry->d_u.d_alias);
INIT_HLIST_NODE(&dentry->d_sib);
- d_set_d_op(dentry, dentry->d_sb->s_d_op);
if (dentry->d_op && dentry->d_op->d_init) {
err = dentry->d_op->d_init(dentry);
@@ -1812,8 +1861,9 @@ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
struct dentry *dentry = __d_alloc(sb, name);
if (likely(dentry)) {
dentry->d_flags |= DCACHE_NORCU;
- if (!sb->s_d_op)
- d_set_d_op(dentry, &anon_ops);
+ /* d_op_flags(&anon_ops) is 0 */
+ if (!dentry->d_op)
+ dentry->d_op = &anon_ops;
}
return dentry;
}
@@ -1828,35 +1878,50 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
}
EXPORT_SYMBOL(d_alloc_name);
-void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
+#define DCACHE_OP_FLAGS \
+ (DCACHE_OP_HASH | DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | \
+ DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_DELETE | DCACHE_OP_PRUNE | \
+ DCACHE_OP_REAL)
+
+static unsigned int d_op_flags(const struct dentry_operations *op)
{
+ unsigned int flags = 0;
+ if (op) {
+ if (op->d_hash)
+ flags |= DCACHE_OP_HASH;
+ if (op->d_compare)
+ flags |= DCACHE_OP_COMPARE;
+ if (op->d_revalidate)
+ flags |= DCACHE_OP_REVALIDATE;
+ if (op->d_weak_revalidate)
+ flags |= DCACHE_OP_WEAK_REVALIDATE;
+ if (op->d_delete)
+ flags |= DCACHE_OP_DELETE;
+ if (op->d_prune)
+ flags |= DCACHE_OP_PRUNE;
+ if (op->d_real)
+ flags |= DCACHE_OP_REAL;
+ }
+ return flags;
+}
+
+static void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
+{
+ unsigned int flags = d_op_flags(op);
WARN_ON_ONCE(dentry->d_op);
- WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH |
- DCACHE_OP_COMPARE |
- DCACHE_OP_REVALIDATE |
- DCACHE_OP_WEAK_REVALIDATE |
- DCACHE_OP_DELETE |
- DCACHE_OP_REAL));
+ WARN_ON_ONCE(dentry->d_flags & DCACHE_OP_FLAGS);
dentry->d_op = op;
- if (!op)
- return;
- if (op->d_hash)
- dentry->d_flags |= DCACHE_OP_HASH;
- if (op->d_compare)
- dentry->d_flags |= DCACHE_OP_COMPARE;
- if (op->d_revalidate)
- dentry->d_flags |= DCACHE_OP_REVALIDATE;
- if (op->d_weak_revalidate)
- dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE;
- if (op->d_delete)
- dentry->d_flags |= DCACHE_OP_DELETE;
- if (op->d_prune)
- dentry->d_flags |= DCACHE_OP_PRUNE;
- if (op->d_real)
- dentry->d_flags |= DCACHE_OP_REAL;
-
-}
-EXPORT_SYMBOL(d_set_d_op);
+ if (flags)
+ dentry->d_flags |= flags;
+}
+
+void set_default_d_op(struct super_block *s, const struct dentry_operations *ops)
+{
+ unsigned int flags = d_op_flags(ops);
+ s->__s_d_op = ops;
+ s->s_d_flags = (s->s_d_flags & ~DCACHE_OP_FLAGS) | flags;
+}
+EXPORT_SYMBOL(set_default_d_op);
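A usage sketch for the new helper (the filesystem names are hypothetical):
a fill_super that used to assign sb->s_d_op directly now installs the
default operations once, and every dentry allocated for that superblock
inherits both the ops pointer and the precomputed DCACHE_OP_* flags:

	static int myfs_fill_super(struct super_block *sb, struct fs_context *fc)
	{
		sb->s_op = &myfs_super_operations;
		/* replaces the old "sb->s_d_op = &myfs_dentry_ops;" */
		set_default_d_op(sb, &myfs_dentry_ops);
		return 0;
	}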
static unsigned d_flags_for_inode(struct inode *inode)
{
@@ -1898,7 +1963,6 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
unsigned add_flags = d_flags_for_inode(inode);
WARN_ON(d_in_lookup(dentry));
- spin_lock(&dentry->d_lock);
/*
* The negative counter only tracks dentries on the LRU. Don't dec if
* d_lru is on another list.
@@ -1911,7 +1975,6 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
__d_set_inode_and_type(dentry, inode, add_flags);
raw_write_seqcount_end(&dentry->d_seq);
fsnotify_update_flags(dentry);
- spin_unlock(&dentry->d_lock);
}
/**
@@ -1935,7 +1998,9 @@ void d_instantiate(struct dentry *entry, struct inode * inode)
if (inode) {
security_d_instantiate(entry, inode);
spin_lock(&inode->i_lock);
+ spin_lock(&entry->d_lock);
__d_instantiate(entry, inode);
+ spin_unlock(&entry->d_lock);
spin_unlock(&inode->i_lock);
}
}
@@ -1954,15 +2019,11 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode)
lockdep_annotate_inode_mutex_key(inode);
security_d_instantiate(entry, inode);
spin_lock(&inode->i_lock);
+ spin_lock(&entry->d_lock);
__d_instantiate(entry, inode);
- WARN_ON(!(inode->i_state & I_NEW));
- inode->i_state &= ~I_NEW & ~I_CREATING;
- /*
- * Pairs with the barrier in prepare_to_wait_event() to make sure
- * ___wait_var_event() either sees the bit cleared or
- * waitqueue_active() check in wake_up_var() sees the waiter.
- */
- smp_mb();
+ spin_unlock(&entry->d_lock);
+ WARN_ON(!(inode_state_read(inode) & I_NEW));
+ inode_state_clear(inode, I_NEW | I_CREATING);
inode_wake_up_bit(inode, __I_NEW);
spin_unlock(&inode->i_lock);
}
@@ -2281,11 +2342,20 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
seq = raw_seqcount_begin(&dentry->d_seq);
if (dentry->d_parent != parent)
continue;
- if (d_unhashed(dentry))
- continue;
if (dentry->d_name.hash_len != hashlen)
continue;
- if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
+ if (unlikely(dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0))
+ continue;
+ /*
+ * Check for the dentry being unhashed.
+ *
+ * As tempting as it is, we *can't* skip it because of a race window
+ * between us finding the dentry before it gets unhashed and loading
+ * the sequence counter after unhashing is finished.
+ *
+	 * We can at least hint the branch predictor that it is unlikely.
+ */
+ if (unlikely(d_unhashed(dentry)))
continue;
*seqp = seq;
return dentry;
@@ -2412,7 +2482,6 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
}
return d_lookup(dir, name);
}
-EXPORT_SYMBOL(d_hash_and_lookup);
/*
* When a file is deleted, we have two options:
@@ -2485,8 +2554,8 @@ static inline unsigned start_dir_add(struct inode *dir)
{
preempt_disable_nested();
for (;;) {
- unsigned n = dir->i_dir_seq;
- if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
+ unsigned n = READ_ONCE(dir->i_dir_seq);
+ if (!(n & 1) && try_cmpxchg(&dir->i_dir_seq, &n, n + 1))
return n;
cpu_relax();
}
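The try_cmpxchg() form is equivalent to the old cmpxchg() loop but avoids
an explicit re-read: on failure it stores the current value back through
its second argument. An open-coded illustration of that contract (a
sketch, not kernel source):

	static inline bool my_try_cmpxchg(unsigned int *ptr,
					  unsigned int *old, unsigned int new)
	{
		unsigned int cur = cmpxchg(ptr, *old, new);

		if (cur == *old)
			return true;
		*old = cur;	/* caller's copy now holds the fresh value */
		return false;
	}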
@@ -2522,13 +2591,21 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
unsigned int hash = name->hash;
struct hlist_bl_head *b = in_lookup_hash(parent, hash);
struct hlist_bl_node *node;
- struct dentry *new = d_alloc(parent, name);
+ struct dentry *new = __d_alloc(parent->d_sb, name);
struct dentry *dentry;
unsigned seq, r_seq, d_seq;
if (unlikely(!new))
return ERR_PTR(-ENOMEM);
+ new->d_flags |= DCACHE_PAR_LOOKUP;
+ spin_lock(&parent->d_lock);
+ new->d_parent = dget_dlock(parent);
+ hlist_add_head(&new->d_sib, &parent->d_children);
+ if (parent->d_flags & DCACHE_DISCONNECTED)
+ new->d_flags |= DCACHE_DISCONNECTED;
+ spin_unlock(&parent->d_lock);
+
retry:
rcu_read_lock();
seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
@@ -2612,8 +2689,6 @@ retry:
return dentry;
}
rcu_read_unlock();
- /* we can't take ->d_lock here; it's OK, though. */
- new->d_flags |= DCACHE_PAR_LOOKUP;
new->d_wait = wq;
hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b);
hlist_bl_unlock(b);
@@ -2659,7 +2734,8 @@ EXPORT_SYMBOL(__d_lookup_unhash_wake);
/* inode->i_lock held if inode is non-NULL */
-static inline void __d_add(struct dentry *dentry, struct inode *inode)
+static inline void __d_add(struct dentry *dentry, struct inode *inode,
+ const struct dentry_operations *ops)
{
wait_queue_head_t *d_wait;
struct inode *dir = NULL;
@@ -2670,6 +2746,8 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode)
n = start_dir_add(dir);
d_wait = __d_lookup_unhash(dentry);
}
+ if (unlikely(ops))
+ d_set_d_op(dentry, ops);
if (inode) {
unsigned add_flags = d_flags_for_inode(inode);
hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
@@ -2701,10 +2779,28 @@ void d_add(struct dentry *entry, struct inode *inode)
security_d_instantiate(entry, inode);
spin_lock(&inode->i_lock);
}
- __d_add(entry, inode);
+ __d_add(entry, inode, NULL);
}
EXPORT_SYMBOL(d_add);
+struct dentry *d_make_persistent(struct dentry *dentry, struct inode *inode)
+{
+ WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias));
+ WARN_ON(!inode);
+ security_d_instantiate(dentry, inode);
+ spin_lock(&inode->i_lock);
+ spin_lock(&dentry->d_lock);
+ __d_instantiate(dentry, inode);
+ dentry->d_flags |= DCACHE_PERSISTENT;
+ dget_dlock(dentry);
+ if (d_unhashed(dentry))
+ __d_rehash(dentry);
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&inode->i_lock);
+ return dentry;
+}
+EXPORT_SYMBOL(d_make_persistent);
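Lifetime sketch for the persistent-dentry API introduced here (the call
sites are illustrative): a pseudo-filesystem pins an internal dentry when
it creates the object and unpins it at teardown, with umount collecting
any leftovers via select_collect_umount():

	d_make_persistent(dentry, inode);	/* +1 ref, DCACHE_PERSISTENT */
	/* ... dentry survives memory pressure while the object lives ... */
	d_make_discardable(dentry);		/* drop the pin; may free it */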
+
static void swap_names(struct dentry *dentry, struct dentry *target)
{
if (unlikely(dname_external(target))) {
@@ -2712,15 +2808,15 @@ static void swap_names(struct dentry *dentry, struct dentry *target)
/*
* Both external: swap the pointers
*/
- swap(target->d_name.name, dentry->d_name.name);
+ swap(target->__d_name.name, dentry->__d_name.name);
} else {
/*
* dentry:internal, target:external. Steal target's
* storage and make target internal.
*/
- dentry->d_name.name = target->d_name.name;
+ dentry->__d_name.name = target->__d_name.name;
target->d_shortname = dentry->d_shortname;
- target->d_name.name = target->d_shortname.string;
+ target->__d_name.name = target->d_shortname.string;
}
} else {
if (unlikely(dname_external(dentry))) {
@@ -2728,9 +2824,9 @@ static void swap_names(struct dentry *dentry, struct dentry *target)
* dentry:external, target:internal. Give dentry's
* storage to target and make dentry internal
*/
- target->d_name.name = dentry->d_name.name;
+ target->__d_name.name = dentry->__d_name.name;
dentry->d_shortname = target->d_shortname;
- dentry->d_name.name = dentry->d_shortname.string;
+ dentry->__d_name.name = dentry->d_shortname.string;
} else {
/*
* Both are internal.
@@ -2740,7 +2836,7 @@ static void swap_names(struct dentry *dentry, struct dentry *target)
target->d_shortname.words[i]);
}
}
- swap(dentry->d_name.hash_len, target->d_name.hash_len);
+ swap(dentry->__d_name.hash_len, target->__d_name.hash_len);
}
static void copy_name(struct dentry *dentry, struct dentry *target)
@@ -2750,11 +2846,11 @@ static void copy_name(struct dentry *dentry, struct dentry *target)
old_name = external_name(dentry);
if (unlikely(dname_external(target))) {
atomic_inc(&external_name(target)->count);
- dentry->d_name = target->d_name;
+ dentry->__d_name = target->__d_name;
} else {
dentry->d_shortname = target->d_shortname;
- dentry->d_name.name = dentry->d_shortname.string;
- dentry->d_name.hash_len = target->d_name.hash_len;
+ dentry->__d_name.name = dentry->d_shortname.string;
+ dentry->__d_name.hash_len = target->__d_name.hash_len;
}
if (old_name && likely(atomic_dec_and_test(&old_name->count)))
kfree_rcu(old_name, head);
@@ -2766,10 +2862,10 @@ static void copy_name(struct dentry *dentry, struct dentry *target)
* @target: new dentry
* @exchange: exchange the two dentries
*
- * Update the dcache to reflect the move of a file name. Negative
- * dcache entries should not be moved in this way. Caller must hold
- * rename_lock, the i_mutex of the source and target directories,
- * and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
+ * Update the dcache to reflect the move of a file name. Negative dcache
+ * entries should not be moved in this way. Caller must hold rename_lock,
+ * the i_rwsem of the source and target directories (exclusively), and the
+ * sb->s_vfs_rename_mutex if they differ. See lock_rename().
*/
static void __d_move(struct dentry *dentry, struct dentry *target,
bool exchange)
@@ -2891,6 +2987,7 @@ void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
write_sequnlock(&rename_lock);
}
+EXPORT_SYMBOL(d_exchange);
/**
* d_ancestor - search for an ancestor
@@ -2915,7 +3012,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
* This helper attempts to cope with remotely renamed directories
*
* It assumes that the caller is already holding
- * dentry->d_parent->d_inode->i_mutex, and rename_lock
+ * dentry->d_parent->d_inode->i_rwsem, and rename_lock
*
* Note: If ever the locking in lock_rename() changes, then please
* remember to update this too...
@@ -2953,30 +3050,8 @@ out_err:
return ret;
}
-/**
- * d_splice_alias - splice a disconnected dentry into the tree if one exists
- * @inode: the inode which may have a disconnected dentry
- * @dentry: a negative dentry which we want to point to the inode.
- *
- * If inode is a directory and has an IS_ROOT alias, then d_move that in
- * place of the given dentry and return it, else simply d_add the inode
- * to the dentry and return NULL.
- *
- * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
- * we should error out: directories can't have multiple aliases.
- *
- * This is needed in the lookup routine of any filesystem that is exportable
- * (via knfsd) so that we can build dcache paths to directories effectively.
- *
- * If a dentry was found and moved, then it is returned. Otherwise NULL
- * is returned. This matches the expected return value of ->lookup.
- *
- * Cluster filesystems may call this function with a negative, hashed dentry.
- * In that case, we know that the inode will be a regular file, and also this
- * will only occur during atomic_open. So we need to check for the dentry
- * being already hashed only in the final case.
- */
-struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
+struct dentry *d_splice_alias_ops(struct inode *inode, struct dentry *dentry,
+ const struct dentry_operations *ops)
{
if (IS_ERR(inode))
return ERR_CAST(inode);
@@ -3022,9 +3097,37 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
}
}
out:
- __d_add(dentry, inode);
+ __d_add(dentry, inode, ops);
return NULL;
}
+
+/**
+ * d_splice_alias - splice a disconnected dentry into the tree if one exists
+ * @inode: the inode which may have a disconnected dentry
+ * @dentry: a negative dentry which we want to point to the inode.
+ *
+ * If inode is a directory and has an IS_ROOT alias, then d_move that in
+ * place of the given dentry and return it, else simply d_add the inode
+ * to the dentry and return NULL.
+ *
+ * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
+ * we should error out: directories can't have multiple aliases.
+ *
+ * This is needed in the lookup routine of any filesystem that is exportable
+ * (via knfsd) so that we can build dcache paths to directories effectively.
+ *
+ * If a dentry was found and moved, then it is returned. Otherwise NULL
+ * is returned. This matches the expected return value of ->lookup.
+ *
+ * Cluster filesystems may call this function with a negative, hashed dentry.
+ * In that case, we know that the inode will be a regular file, and also this
+ * will only occur during atomic_open. So we need to check for the dentry
+ * being already hashed only in the final case.
+ */
+struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
+{
+ return d_splice_alias_ops(inode, dentry, NULL);
+}
EXPORT_SYMBOL(d_splice_alias);
/*
@@ -3067,26 +3170,6 @@ bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
}
EXPORT_SYMBOL(is_subdir);
-static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
-{
- struct dentry *root = data;
- if (dentry != root) {
- if (d_unhashed(dentry) || !dentry->d_inode)
- return D_WALK_SKIP;
-
- if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
- dentry->d_flags |= DCACHE_GENOCIDE;
- dentry->d_lockref.count--;
- }
- }
- return D_WALK_CONTINUE;
-}
-
-void d_genocide(struct dentry *parent)
-{
- d_walk(parent, parent, d_genocide_kill);
-}
-
void d_mark_tmpfile(struct file *file, struct inode *inode)
{
struct dentry *dentry = file->f_path.dentry;
@@ -3096,7 +3179,7 @@ void d_mark_tmpfile(struct file *file, struct inode *inode)
!d_unlinked(dentry));
spin_lock(&dentry->d_parent->d_lock);
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- dentry->d_name.len = sprintf(dentry->d_shortname.string, "#%llu",
+ dentry->__d_name.len = sprintf(dentry->d_shortname.string, "#%llu",
(unsigned long long)inode->i_ino);
spin_unlock(&dentry->d_lock);
spin_unlock(&dentry->d_parent->d_lock);
@@ -3182,9 +3265,10 @@ static void __init dcache_init(void)
* but it is probably not worth it because of the cache nature
* of the dcache.
*/
- dentry_cache = KMEM_CACHE_USERCOPY(dentry,
+ __dentry_cache = KMEM_CACHE_USERCOPY(dentry,
SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT,
d_shortname.string);
+ runtime_const_init(ptr, __dentry_cache);
/* Hash may have been set up in dcache_init_early */
if (!hashdist)
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 69e9ddcb113d..3ec3324c2060 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -47,29 +47,12 @@ const struct file_operations debugfs_noop_file_operations = {
#define F_DENTRY(filp) ((filp)->f_path.dentry)
-const void *debugfs_get_aux(const struct file *file)
+void *debugfs_get_aux(const struct file *file)
{
return DEBUGFS_I(file_inode(file))->aux;
}
EXPORT_SYMBOL_GPL(debugfs_get_aux);
-const struct file_operations *debugfs_real_fops(const struct file *filp)
-{
- struct debugfs_fsdata *fsd = F_DENTRY(filp)->d_fsdata;
-
- if (!fsd) {
- /*
- * Urgh, we've been called w/o a protecting
- * debugfs_file_get().
- */
- WARN_ON(1);
- return NULL;
- }
-
- return fsd->real_fops;
-}
-EXPORT_SYMBOL_GPL(debugfs_real_fops);
-
enum dbgfs_get_mode {
DBGFS_GET_ALREADY,
DBGFS_GET_REGULAR,
@@ -302,15 +285,13 @@ static int debugfs_locked_down(struct inode *inode,
static int open_proxy_open(struct inode *inode, struct file *filp)
{
struct dentry *dentry = F_DENTRY(filp);
- const struct file_operations *real_fops = NULL;
+ const struct file_operations *real_fops = DEBUGFS_I(inode)->real_fops;
int r;
r = __debugfs_file_get(dentry, DBGFS_GET_REGULAR);
if (r)
return r == -EIO ? -ENOENT : r;
- real_fops = debugfs_real_fops(filp);
-
r = debugfs_locked_down(inode, filp, real_fops);
if (r)
goto out;
@@ -352,7 +333,6 @@ static ret_type full_proxy_ ## name(proto) \
{ \
struct dentry *dentry = F_DENTRY(filp); \
struct debugfs_fsdata *fsd = dentry->d_fsdata; \
- const struct file_operations *real_fops; \
ret_type r; \
\
if (!(fsd->methods & bit)) \
@@ -360,14 +340,13 @@ static ret_type full_proxy_ ## name(proto) \
r = debugfs_file_get(dentry); \
if (unlikely(r)) \
return r; \
- real_fops = debugfs_real_fops(filp); \
- r = real_fops->name(args); \
+ r = fsd->real_fops->name(args); \
debugfs_file_put(dentry); \
return r; \
}
-#define FULL_PROXY_FUNC_BOTH(name, ret_type, filp, proto, args, bit, ret) \
-static ret_type full_proxy_ ## name(proto) \
+#define SHORT_PROXY_FUNC(name, ret_type, filp, proto, args, bit, ret) \
+static ret_type short_proxy_ ## name(proto) \
{ \
struct dentry *dentry = F_DENTRY(filp); \
struct debugfs_fsdata *fsd = dentry->d_fsdata; \
@@ -378,27 +357,38 @@ static ret_type full_proxy_ ## name(proto) \
r = debugfs_file_get(dentry); \
if (unlikely(r)) \
return r; \
- if (fsd->real_fops) \
- r = fsd->real_fops->name(args); \
- else \
- r = fsd->short_fops->name(args); \
+ r = fsd->short_fops->name(args); \
debugfs_file_put(dentry); \
return r; \
}
-FULL_PROXY_FUNC_BOTH(llseek, loff_t, filp,
- PROTO(struct file *filp, loff_t offset, int whence),
- ARGS(filp, offset, whence), HAS_LSEEK, -ESPIPE);
+SHORT_PROXY_FUNC(llseek, loff_t, filp,
+ PROTO(struct file *filp, loff_t offset, int whence),
+ ARGS(filp, offset, whence), HAS_LSEEK, -ESPIPE);
-FULL_PROXY_FUNC_BOTH(read, ssize_t, filp,
- PROTO(struct file *filp, char __user *buf, size_t size,
- loff_t *ppos),
- ARGS(filp, buf, size, ppos), HAS_READ, -EINVAL);
+FULL_PROXY_FUNC(llseek, loff_t, filp,
+ PROTO(struct file *filp, loff_t offset, int whence),
+ ARGS(filp, offset, whence), HAS_LSEEK, -ESPIPE);
-FULL_PROXY_FUNC_BOTH(write, ssize_t, filp,
- PROTO(struct file *filp, const char __user *buf,
- size_t size, loff_t *ppos),
- ARGS(filp, buf, size, ppos), HAS_WRITE, -EINVAL);
+SHORT_PROXY_FUNC(read, ssize_t, filp,
+ PROTO(struct file *filp, char __user *buf, size_t size,
+ loff_t *ppos),
+ ARGS(filp, buf, size, ppos), HAS_READ, -EINVAL);
+
+FULL_PROXY_FUNC(read, ssize_t, filp,
+ PROTO(struct file *filp, char __user *buf, size_t size,
+ loff_t *ppos),
+ ARGS(filp, buf, size, ppos), HAS_READ, -EINVAL);
+
+SHORT_PROXY_FUNC(write, ssize_t, filp,
+ PROTO(struct file *filp, const char __user *buf,
+ size_t size, loff_t *ppos),
+ ARGS(filp, buf, size, ppos), HAS_WRITE, -EINVAL);
+
+FULL_PROXY_FUNC(write, ssize_t, filp,
+ PROTO(struct file *filp, const char __user *buf,
+ size_t size, loff_t *ppos),
+ ARGS(filp, buf, size, ppos), HAS_WRITE, -EINVAL);
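For reference, SHORT_PROXY_FUNC(read, ...) expands to roughly the
following (reconstructed from the macro body shown above, whitespace
aside):

	static ssize_t short_proxy_read(struct file *filp, char __user *buf,
					size_t size, loff_t *ppos)
	{
		struct dentry *dentry = F_DENTRY(filp);
		struct debugfs_fsdata *fsd = dentry->d_fsdata;
		ssize_t r;

		if (!(fsd->methods & HAS_READ))
			return -EINVAL;
		r = debugfs_file_get(dentry);
		if (unlikely(r))
			return r;
		r = fsd->short_fops->read(filp, buf, size, ppos);
		debugfs_file_put(dentry);
		return r;
	}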
FULL_PROXY_FUNC(unlocked_ioctl, long, filp,
PROTO(struct file *filp, unsigned int cmd, unsigned long arg),
@@ -410,22 +400,21 @@ static __poll_t full_proxy_poll(struct file *filp,
struct dentry *dentry = F_DENTRY(filp);
struct debugfs_fsdata *fsd = dentry->d_fsdata;
__poll_t r = 0;
- const struct file_operations *real_fops;
if (!(fsd->methods & HAS_POLL))
return DEFAULT_POLLMASK;
if (debugfs_file_get(dentry))
return EPOLLHUP;
- real_fops = debugfs_real_fops(filp);
- r = real_fops->poll(filp, wait);
+ r = fsd->real_fops->poll(filp, wait);
debugfs_file_put(dentry);
return r;
}
-static int full_proxy_release(struct inode *inode, struct file *filp)
+static int full_proxy_release(struct inode *inode, struct file *file)
{
- const struct file_operations *real_fops = debugfs_real_fops(filp);
+ struct debugfs_fsdata *fsd = F_DENTRY(file)->d_fsdata;
+ const struct file_operations *real_fops = fsd->real_fops;
int r = 0;
/*
@@ -435,7 +424,7 @@ static int full_proxy_release(struct inode *inode, struct file *filp)
* ->i_private is still being meaningful here.
*/
if (real_fops->release)
- r = real_fops->release(inode, filp);
+ r = real_fops->release(inode, file);
fops_put(real_fops);
return r;
@@ -517,9 +506,9 @@ static int full_proxy_open_short(struct inode *inode, struct file *filp)
const struct file_operations debugfs_full_short_proxy_file_operations = {
.open = full_proxy_open_short,
- .llseek = full_proxy_llseek,
- .read = full_proxy_read,
- .write = full_proxy_write,
+ .llseek = short_proxy_llseek,
+ .read = short_proxy_read,
+ .write = short_proxy_write,
};
ssize_t debugfs_attr_read(struct file *file, char __user *buf,
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 75715d8877ee..4b263c328ed2 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -35,7 +35,7 @@
static struct vfsmount *debugfs_mount;
static int debugfs_mount_count;
static bool debugfs_registered;
-static unsigned int debugfs_allow __ro_after_init = DEFAULT_DEBUGFS_ALLOW_BITS;
+static bool debugfs_enabled __ro_after_init = IS_ENABLED(CONFIG_DEBUG_FS_ALLOW_ALL);
/*
* Don't allow access attributes to be changed whilst the kernel is locked down
@@ -183,6 +183,9 @@ static int debugfs_reconfigure(struct fs_context *fc)
struct debugfs_fs_info *sb_opts = sb->s_fs_info;
struct debugfs_fs_info *new_opts = fc->s_fs_info;
+ if (!new_opts)
+ return 0;
+
sync_filesystem(sb);
/* structure copy of new mount options to sb */
@@ -258,7 +261,6 @@ static struct vfsmount *debugfs_automount(struct path *path)
}
static const struct dentry_operations debugfs_dops = {
- .d_delete = always_delete_dentry,
.d_release = debugfs_release_dentry,
.d_automount = debugfs_automount,
};
@@ -273,7 +275,8 @@ static int debugfs_fill_super(struct super_block *sb, struct fs_context *fc)
return err;
sb->s_op = &debugfs_super_operations;
- sb->s_d_op = &debugfs_dops;
+ set_default_d_op(sb, &debugfs_dops);
+ sb->s_d_flags |= DCACHE_DONTCACHE;
debugfs_apply_options(sb);
@@ -282,10 +285,13 @@ static int debugfs_fill_super(struct super_block *sb, struct fs_context *fc)
static int debugfs_get_tree(struct fs_context *fc)
{
- if (!(debugfs_allow & DEBUGFS_ALLOW_API))
- return -EPERM;
+ int err;
+
+ err = get_tree_single(fc, debugfs_fill_super);
+ if (err)
+ return err;
- return get_tree_single(fc, debugfs_fill_super);
+ return debugfs_reconfigure(fc);
}
static void debugfs_free_fc(struct fs_context *fc)
@@ -320,7 +326,7 @@ static struct file_system_type debug_fs_type = {
.name = "debugfs",
.init_fs_context = debugfs_init_fs_context,
.parameters = debugfs_param_specs,
- .kill_sb = kill_litter_super,
+ .kill_sb = kill_anon_super,
};
MODULE_ALIAS_FS("debugfs");
@@ -346,19 +352,20 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent)
if (!parent)
parent = debugfs_mount->mnt_root;
- dentry = lookup_positive_unlocked(name, parent, strlen(name));
+ dentry = lookup_noperm_positive_unlocked(&QSTR(name), parent);
if (IS_ERR(dentry))
return NULL;
return dentry;
}
EXPORT_SYMBOL_GPL(debugfs_lookup);
-static struct dentry *start_creating(const char *name, struct dentry *parent)
+static struct dentry *debugfs_start_creating(const char *name,
+ struct dentry *parent)
{
struct dentry *dentry;
int error;
- if (!(debugfs_allow & DEBUGFS_ALLOW_API))
+ if (!debugfs_enabled)
return ERR_PTR(-EPERM);
if (!debugfs_initialized())
@@ -384,42 +391,26 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
if (!parent)
parent = debugfs_mount->mnt_root;
- inode_lock(d_inode(parent));
- if (unlikely(IS_DEADDIR(d_inode(parent))))
- dentry = ERR_PTR(-ENOENT);
- else
- dentry = lookup_one_len(name, parent, strlen(name));
- if (!IS_ERR(dentry) && d_really_is_positive(dentry)) {
- if (d_is_dir(dentry))
- pr_err("Directory '%s' with parent '%s' already present!\n",
- name, parent->d_name.name);
- else
- pr_err("File '%s' in directory '%s' already present!\n",
- name, parent->d_name.name);
- dput(dentry);
- dentry = ERR_PTR(-EEXIST);
- }
-
+ dentry = simple_start_creating(parent, name);
if (IS_ERR(dentry)) {
- inode_unlock(d_inode(parent));
+ if (dentry == ERR_PTR(-EEXIST))
+ pr_err("'%s' already exists in '%pd'\n", name, parent);
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
-
return dentry;
}
-static struct dentry *failed_creating(struct dentry *dentry)
+static struct dentry *debugfs_failed_creating(struct dentry *dentry)
{
- inode_unlock(d_inode(dentry->d_parent));
- dput(dentry);
+ simple_done_creating(dentry);
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
return ERR_PTR(-ENOMEM);
}
-static struct dentry *end_creating(struct dentry *dentry)
+static struct dentry *debugfs_end_creating(struct dentry *dentry)
{
- inode_unlock(d_inode(dentry->d_parent));
- return dentry;
+ simple_done_creating(dentry);
+ return dentry; // borrowed
}
static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
@@ -434,21 +425,16 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
if (!(mode & S_IFMT))
mode |= S_IFREG;
BUG_ON(!S_ISREG(mode));
- dentry = start_creating(name, parent);
+ dentry = debugfs_start_creating(name, parent);
if (IS_ERR(dentry))
return dentry;
- if (!(debugfs_allow & DEBUGFS_ALLOW_API)) {
- failed_creating(dentry);
- return ERR_PTR(-EPERM);
- }
-
inode = debugfs_get_inode(dentry->d_sb);
if (unlikely(!inode)) {
pr_err("out of free dentries, can not create file '%s'\n",
name);
- return failed_creating(dentry);
+ return debugfs_failed_creating(dentry);
}
inode->i_mode = mode;
@@ -459,11 +445,11 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
proxy_fops = &debugfs_noop_file_operations;
inode->i_fop = proxy_fops;
DEBUGFS_I(inode)->raw = real_fops;
- DEBUGFS_I(inode)->aux = aux;
+ DEBUGFS_I(inode)->aux = (void *)aux;
- d_instantiate(dentry, inode);
+ d_make_persistent(dentry, inode);
fsnotify_create(d_inode(dentry->d_parent), dentry);
- return end_creating(dentry);
+ return debugfs_end_creating(dentry);
}
struct dentry *debugfs_create_file_full(const char *name, umode_t mode,
@@ -583,22 +569,17 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_size);
*/
struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
{
- struct dentry *dentry = start_creating(name, parent);
+ struct dentry *dentry = debugfs_start_creating(name, parent);
struct inode *inode;
if (IS_ERR(dentry))
return dentry;
- if (!(debugfs_allow & DEBUGFS_ALLOW_API)) {
- failed_creating(dentry);
- return ERR_PTR(-EPERM);
- }
-
inode = debugfs_get_inode(dentry->d_sb);
if (unlikely(!inode)) {
pr_err("out of free dentries, can not create directory '%s'\n",
name);
- return failed_creating(dentry);
+ return debugfs_failed_creating(dentry);
}
inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
@@ -607,10 +588,10 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
- d_instantiate(dentry, inode);
+ d_make_persistent(dentry, inode);
inc_nlink(d_inode(dentry->d_parent));
fsnotify_mkdir(d_inode(dentry->d_parent), dentry);
- return end_creating(dentry);
+ return debugfs_end_creating(dentry);
}
EXPORT_SYMBOL_GPL(debugfs_create_dir);
@@ -630,22 +611,17 @@ struct dentry *debugfs_create_automount(const char *name,
debugfs_automount_t f,
void *data)
{
- struct dentry *dentry = start_creating(name, parent);
+ struct dentry *dentry = debugfs_start_creating(name, parent);
struct inode *inode;
if (IS_ERR(dentry))
return dentry;
- if (!(debugfs_allow & DEBUGFS_ALLOW_API)) {
- failed_creating(dentry);
- return ERR_PTR(-EPERM);
- }
-
inode = debugfs_get_inode(dentry->d_sb);
if (unlikely(!inode)) {
pr_err("out of free dentries, can not create automount '%s'\n",
name);
- return failed_creating(dentry);
+ return debugfs_failed_creating(dentry);
}
make_empty_dir_inode(inode);
@@ -654,10 +630,10 @@ struct dentry *debugfs_create_automount(const char *name,
DEBUGFS_I(inode)->automount = f;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
- d_instantiate(dentry, inode);
+ d_make_persistent(dentry, inode);
inc_nlink(d_inode(dentry->d_parent));
fsnotify_mkdir(d_inode(dentry->d_parent), dentry);
- return end_creating(dentry);
+ return debugfs_end_creating(dentry);
}
EXPORT_SYMBOL(debugfs_create_automount);
@@ -693,7 +669,7 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
if (!link)
return ERR_PTR(-ENOMEM);
- dentry = start_creating(name, parent);
+ dentry = debugfs_start_creating(name, parent);
if (IS_ERR(dentry)) {
kfree(link);
return dentry;
@@ -704,13 +680,13 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
pr_err("out of free dentries, can not create symlink '%s'\n",
name);
kfree(link);
- return failed_creating(dentry);
+ return debugfs_failed_creating(dentry);
}
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_op = &debugfs_symlink_inode_operations;
inode->i_link = link;
- d_instantiate(dentry, inode);
- return end_creating(dentry);
+ d_make_persistent(dentry, inode);
+ return debugfs_end_creating(dentry);
}
EXPORT_SYMBOL_GPL(debugfs_create_symlink);
@@ -847,7 +823,8 @@ int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, .
int error = 0;
const char *new_name;
struct name_snapshot old_name;
- struct dentry *parent, *target;
+ struct dentry *target;
+ struct renamedata rd = {};
struct inode *dir;
va_list ap;
@@ -860,36 +837,31 @@ int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, .
if (!new_name)
return -ENOMEM;
- parent = dget_parent(dentry);
- dir = d_inode(parent);
- inode_lock(dir);
+ rd.old_parent = dget_parent(dentry);
+ rd.new_parent = rd.old_parent;
+ rd.flags = RENAME_NOREPLACE;
+	target = lookup_noperm_unlocked(&QSTR(new_name), rd.new_parent);
+	if (IS_ERR(target)) {
+		error = PTR_ERR(target);
+		target = NULL;	/* dput(NULL) at out: is a no-op */
+		goto out;
+	}
- take_dentry_name_snapshot(&old_name, dentry);
-
- if (WARN_ON_ONCE(dentry->d_parent != parent)) {
- error = -EINVAL;
- goto out;
- }
- if (strcmp(old_name.name.name, new_name) == 0)
- goto out;
- target = lookup_one_len(new_name, parent, strlen(new_name));
- if (IS_ERR(target)) {
- error = PTR_ERR(target);
- goto out;
- }
- if (d_really_is_positive(target)) {
- dput(target);
- error = -EINVAL;
+ error = start_renaming_two_dentries(&rd, dentry, target);
+ if (error) {
+ if (error == -EEXIST && target == dentry)
+ /* it isn't an error to rename a thing to itself */
+ error = 0;
goto out;
}
- simple_rename_timestamp(dir, dentry, dir, target);
- d_move(dentry, target);
- dput(target);
+
+ dir = d_inode(rd.old_parent);
+ take_dentry_name_snapshot(&old_name, dentry);
+ simple_rename_timestamp(dir, dentry, dir, rd.new_dentry);
+ d_move(dentry, rd.new_dentry);
fsnotify_move(dir, dir, &old_name.name, d_is_dir(dentry), NULL, dentry);
-out:
release_dentry_name_snapshot(&old_name);
- inode_unlock(dir);
- dput(parent);
+ end_renaming(&rd);
+out:
+ dput(rd.old_parent);
+ dput(target);
kfree_const(new_name);
return error;
}
@@ -908,21 +880,25 @@ static int __init debugfs_kernel(char *str)
{
if (str) {
if (!strcmp(str, "on"))
- debugfs_allow = DEBUGFS_ALLOW_API | DEBUGFS_ALLOW_MOUNT;
- else if (!strcmp(str, "no-mount"))
- debugfs_allow = DEBUGFS_ALLOW_API;
+ debugfs_enabled = true;
else if (!strcmp(str, "off"))
- debugfs_allow = 0;
+ debugfs_enabled = false;
+ else if (!strcmp(str, "no-mount")) {
+		pr_notice("debugfs=no-mount is a deprecated alias for debugfs=off\n");
+ debugfs_enabled = false;
+ }
}
return 0;
}
early_param("debugfs", debugfs_kernel);
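With the simplified boolean, the boot parameter behaves as follows (the
default comes from CONFIG_DEBUG_FS_ALLOW_ALL):

	debugfs=on		# force-enable the filesystem and its API
	debugfs=off		# disable both
	debugfs=no-mount	# deprecated; now the same as debugfs=off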
+
static int __init debugfs_init(void)
{
int retval;
- if (!(debugfs_allow & DEBUGFS_ALLOW_MOUNT))
+ if (!debugfs_enabled)
return -EPERM;
retval = sysfs_create_mount_point(kernel_kobj, "debug");
diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h
index 93483fe84425..c95699b27a56 100644
--- a/fs/debugfs/internal.h
+++ b/fs/debugfs/internal.h
@@ -19,7 +19,7 @@ struct debugfs_inode_info {
const struct debugfs_short_fops *short_fops;
debugfs_automount_t automount;
};
- const void *aux;
+ void *aux;
};
static inline struct debugfs_inode_info *DEBUGFS_I(struct inode *inode)
@@ -55,17 +55,4 @@ enum {
HAS_IOCTL = 16
};
-#define DEBUGFS_ALLOW_API BIT(0)
-#define DEBUGFS_ALLOW_MOUNT BIT(1)
-
-#ifdef CONFIG_DEBUG_FS_ALLOW_ALL
-#define DEFAULT_DEBUGFS_ALLOW_BITS (DEBUGFS_ALLOW_MOUNT | DEBUGFS_ALLOW_API)
-#endif
-#ifdef CONFIG_DEBUG_FS_DISALLOW_MOUNT
-#define DEFAULT_DEBUGFS_ALLOW_BITS (DEBUGFS_ALLOW_API)
-#endif
-#ifdef CONFIG_DEBUG_FS_ALLOW_NONE
-#define DEFAULT_DEBUGFS_ALLOW_BITS (0)
-#endif
-
#endif /* _DEBUGFS_INTERNAL_H_ */
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 42e4d6eeb29f..9f3de528c358 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -89,12 +89,12 @@ enum {
};
static const struct fs_parameter_spec devpts_param_specs[] = {
- fsparam_u32 ("gid", Opt_gid),
+ fsparam_gid ("gid", Opt_gid),
fsparam_s32 ("max", Opt_max),
fsparam_u32oct ("mode", Opt_mode),
fsparam_flag ("newinstance", Opt_newinstance),
fsparam_u32oct ("ptmxmode", Opt_ptmxmode),
- fsparam_u32 ("uid", Opt_uid),
+ fsparam_uid ("uid", Opt_uid),
{}
};
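
fsparam_gid()/fsparam_uid() make fs_parse() hand back an already-mapped kgid_t/kuid_t instead of a raw u32. A sketch of the consuming side, which this hunk does not show (assumed to follow the usual fs_parse_result convention):

        case Opt_uid:
                opts->uid = result.uid; /* kuid_t, mapping already validated */
                break;
        case Opt_gid:
                opts->gid = result.gid; /* kgid_t */
                break;
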
@@ -102,7 +102,7 @@ struct pts_fs_info {
struct ida allocated_ptys;
struct pts_mount_opts mount_opts;
struct super_block *sb;
- struct dentry *ptmx_dentry;
+ struct inode *ptmx_inode; // borrowed
};
static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
@@ -259,7 +259,6 @@ static int devpts_parse_param(struct fs_context *fc, struct fs_parameter *param)
static int mknod_ptmx(struct super_block *sb, struct fs_context *fc)
{
int mode;
- int rc = -ENOMEM;
struct dentry *dentry;
struct inode *inode;
struct dentry *root = sb->s_root;
@@ -268,18 +267,10 @@ static int mknod_ptmx(struct super_block *sb, struct fs_context *fc)
kuid_t ptmx_uid = current_fsuid();
kgid_t ptmx_gid = current_fsgid();
- inode_lock(d_inode(root));
-
- /* If we have already created ptmx node, return */
- if (fsi->ptmx_dentry) {
- rc = 0;
- goto out;
- }
-
- dentry = d_alloc_name(root, "ptmx");
- if (!dentry) {
+ dentry = simple_start_creating(root, "ptmx");
+ if (IS_ERR(dentry)) {
pr_err("Unable to alloc dentry for ptmx node\n");
- goto out;
+ return PTR_ERR(dentry);
}
/*
@@ -287,9 +278,9 @@ static int mknod_ptmx(struct super_block *sb, struct fs_context *fc)
*/
inode = new_inode(sb);
if (!inode) {
+ simple_done_creating(dentry);
pr_err("Unable to alloc inode for ptmx node\n");
- dput(dentry);
- goto out;
+ return -ENOMEM;
}
inode->i_ino = 2;
@@ -299,23 +290,18 @@ static int mknod_ptmx(struct super_block *sb, struct fs_context *fc)
init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
inode->i_uid = ptmx_uid;
inode->i_gid = ptmx_gid;
+ fsi->ptmx_inode = inode;
- d_add(dentry, inode);
+ d_make_persistent(dentry, inode);
- fsi->ptmx_dentry = dentry;
- rc = 0;
-out:
- inode_unlock(d_inode(root));
- return rc;
+ simple_done_creating(dentry);
+
+ return 0;
}
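
The shape of the new helper pairing, as inferred from this conversion (not a definitive statement of the API): simple_start_creating() replaces the manual inode_lock + d_alloc_name dance, simple_done_creating() is called on every exit path, and d_make_persistent() binds the inode in between:

        dentry = simple_start_creating(parent, "name"); /* locks parent */
        if (IS_ERR(dentry))
                return PTR_ERR(dentry);
        inode = new_inode(sb);
        if (!inode) {
                simple_done_creating(dentry);           /* unlock + release */
                return -ENOMEM;
        }
        d_make_persistent(dentry, inode);               /* instantiate and pin */
        simple_done_creating(dentry);
        return 0;
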
static void update_ptmx_mode(struct pts_fs_info *fsi)
{
- struct inode *inode;
- if (fsi->ptmx_dentry) {
- inode = d_inode(fsi->ptmx_dentry);
- inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode;
- }
+ fsi->ptmx_inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode;
}
static int devpts_reconfigure(struct fs_context *fc)
@@ -381,7 +367,7 @@ static int devpts_fill_super(struct super_block *s, struct fs_context *fc)
s->s_blocksize_bits = 10;
s->s_magic = DEVPTS_SUPER_MAGIC;
s->s_op = &devpts_sops;
- s->s_d_op = &simple_dentry_operations;
+ s->s_d_flags = DCACHE_DONTCACHE;
s->s_time_gran = 1;
fsi->sb = s;
@@ -461,7 +447,7 @@ static void devpts_kill_sb(struct super_block *sb)
if (fsi)
ida_destroy(&fsi->allocated_ptys);
kfree(fsi);
- kill_litter_super(sb);
+ kill_anon_super(sb);
}
static struct file_system_type devpts_fs_type = {
@@ -534,16 +520,15 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv)
sprintf(s, "%d", index);
dentry = d_alloc_name(root, s);
- if (dentry) {
- dentry->d_fsdata = priv;
- d_add(dentry, inode);
- fsnotify_create(d_inode(root), dentry);
- } else {
+ if (!dentry) {
iput(inode);
- dentry = ERR_PTR(-ENOMEM);
+ return ERR_PTR(-ENOMEM);
}
-
- return dentry;
+ dentry->d_fsdata = priv;
+ d_make_persistent(dentry, inode);
+ fsnotify_create(d_inode(root), dentry);
+ dput(dentry);
+ return dentry; // borrowed
}
/**
@@ -573,7 +558,7 @@ void devpts_pty_kill(struct dentry *dentry)
drop_nlink(dentry->d_inode);
d_drop(dentry);
fsnotify_unlink(d_inode(dentry->d_parent), dentry);
- dput(dentry); /* d_alloc_name() in devpts_pty_new() */
+ d_make_discardable(dentry);
}
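
The dentry lifecycle implied by the two hunks above: d_make_persistent() takes over the reference from d_alloc_name() and pins the dentry, so devpts_pty_new() can dput() and still hand back a usable (borrowed) pointer; devpts_pty_kill() later drops that pin with d_make_discardable() in place of the old dput().
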
static int __init init_devpts_fs(void)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index bbd05f1a2145..2267f5ae7f77 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -996,7 +996,7 @@ do_holes:
dio_unpin_page(dio, page);
goto out;
}
- zero_user(page, from, 1 << blkbits);
+ memzero_page(page, from, 1 << blkbits);
sdio->block_in_file++;
from += 1 << blkbits;
dio->result += 1 << blkbits;
@@ -1083,8 +1083,8 @@ static inline int drop_refcount(struct dio *dio)
* The locking rules are governed by the flags parameter:
* - if the flags value contains DIO_LOCKING we use a fancy locking
* scheme for dumb filesystems.
- * For writes this function is called under i_mutex and returns with
- * i_mutex held, for reads, i_mutex is not held on entry, but it is
+ * For writes this function is called under i_rwsem and returns with
+ * i_rwsem held; for reads, i_rwsem is not held on entry, but it is
* taken and dropped again before returning.
* - if the flags value does NOT contain DIO_LOCKING we don't use any
* internal locking but rather rely on the filesystem to synchronize
@@ -1094,7 +1094,7 @@ static inline int drop_refcount(struct dio *dio)
* counter before starting direct I/O, and decrement it once we are done.
* Truncate can wait for it to reach zero to provide exclusion. It is
* expected that filesystem provide exclusion between new direct I/O
- * and truncates. For DIO_LOCKING filesystems this is done by i_mutex,
+ * and truncates. For DIO_LOCKING filesystems this is done by i_rwsem,
* but other filesystems need to take care of this on their own.
*
* NOTE: if you pass "sdio" to anything by pointer make sure that function
@@ -1279,7 +1279,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
/*
* All block lookups have been performed. For READ requests
- * we can let i_mutex go now that its achieved its purpose
+ * we can let i_rwsem go now that it's achieved its purpose
* of protecting us from looking up uninitialized blocks.
*/
if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING))
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index f82a4952769d..b46165df5a91 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -3,7 +3,6 @@ menuconfig DLM
tristate "Distributed Lock Manager (DLM)"
depends on INET
depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
- select IP_SCTP
help
A general purpose distributed lock manager for kernel or userspace
applications.
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index cf9ba6fd7a28..a0d75b5c83c6 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -26,6 +26,7 @@
/*
* /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid (refers to <node>)
* /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/release_recover
* /config/dlm/<cluster>/comms/<comm>/nodeid (refers to <comm>)
* /config/dlm/<cluster>/comms/<comm>/local
* /config/dlm/<cluster>/comms/<comm>/addr (write only)
@@ -197,6 +198,9 @@ static int dlm_check_protocol_and_dlm_running(unsigned int x)
break;
case 1:
/* SCTP */
+ if (!IS_ENABLED(CONFIG_IP_SCTP))
+ return -EOPNOTSUPP;
+
break;
default:
return -EINVAL;
@@ -264,6 +268,7 @@ enum {
enum {
NODE_ATTR_NODEID = 0,
NODE_ATTR_WEIGHT,
+ NODE_ATTR_RELEASE_RECOVER,
};
struct dlm_clusters {
@@ -277,6 +282,8 @@ struct dlm_spaces {
struct dlm_space {
struct config_group group;
struct list_head members;
+ struct list_head members_gone;
+ int members_gone_count;
struct mutex members_lock;
int members_count;
struct dlm_nodes *nds;
@@ -307,6 +314,14 @@ struct dlm_node {
int weight;
int new;
int comm_seq; /* copy of cm->seq when nd->nodeid is set */
+ unsigned int release_recover;
+};
+
+struct dlm_member_gone {
+ int nodeid;
+ unsigned int release_recover;
+
+ struct list_head list; /* space->members_gone */
};
static struct configfs_group_operations clusters_ops = {
@@ -477,6 +492,7 @@ static struct config_group *make_space(struct config_group *g, const char *name)
configfs_add_default_group(&nds->ns_group, &sp->group);
INIT_LIST_HEAD(&sp->members);
+ INIT_LIST_HEAD(&sp->members_gone);
mutex_init(&sp->members_lock);
sp->members_count = 0;
sp->nds = nds;
@@ -584,10 +600,20 @@ static void drop_node(struct config_group *g, struct config_item *i)
{
struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
struct dlm_node *nd = config_item_to_node(i);
+ struct dlm_member_gone *mb_gone;
+
+ mb_gone = kzalloc(sizeof(*mb_gone), GFP_KERNEL);
+ if (!mb_gone)
+ return;
mutex_lock(&sp->members_lock);
list_del(&nd->list);
sp->members_count--;
+
+ mb_gone->nodeid = nd->nodeid;
+ mb_gone->release_recover = nd->release_recover;
+ list_add(&mb_gone->list, &sp->members_gone);
+ sp->members_gone_count++;
mutex_unlock(&sp->members_lock);
config_item_put(i);
@@ -812,12 +838,34 @@ static ssize_t node_weight_store(struct config_item *item, const char *buf,
return len;
}
+static ssize_t node_release_recover_show(struct config_item *item, char *buf)
+{
+ struct dlm_node *n = config_item_to_node(item);
+
+ return sprintf(buf, "%u\n", n->release_recover);
+}
+
+static ssize_t node_release_recover_store(struct config_item *item,
+ const char *buf, size_t len)
+{
+ struct dlm_node *n = config_item_to_node(item);
+ int rc;
+
+ rc = kstrtouint(buf, 0, &n->release_recover);
+ if (rc)
+ return rc;
+
+ return len;
+}
+
CONFIGFS_ATTR(node_, nodeid);
CONFIGFS_ATTR(node_, weight);
+CONFIGFS_ATTR(node_, release_recover);
static struct configfs_attribute *node_attrs[] = {
[NODE_ATTR_NODEID] = &node_attr_nodeid,
[NODE_ATTR_WEIGHT] = &node_attr_weight,
+ [NODE_ATTR_RELEASE_RECOVER] = &node_attr_release_recover,
NULL,
};
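
Userspace can thus mark a departing node for recovery handling before removing its configfs directory; following the path layout documented at the top of config.c:

        echo 1 > /config/dlm/<cluster>/spaces/<space>/nodes/<node>/release_recover

drop_node() then preserves the flag on the members_gone list until dlm_config_nodes() folds it into the next recovery pass.
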
@@ -879,9 +927,10 @@ static void put_comm(struct dlm_comm *cm)
int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
int *count_out)
{
+ struct dlm_member_gone *mb_gone, *mb_safe;
+ struct dlm_config_node *nodes, *node;
struct dlm_space *sp;
struct dlm_node *nd;
- struct dlm_config_node *nodes, *node;
int rv, count;
sp = get_space(lsname);
@@ -895,7 +944,7 @@ int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
goto out;
}
- count = sp->members_count;
+ count = sp->members_count + sp->members_gone_count;
nodes = kcalloc(count, sizeof(struct dlm_config_node), GFP_NOFS);
if (!nodes) {
@@ -914,6 +963,20 @@ int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
nd->new = 0;
}
+ /* we delay removing nodes until here, as configfs does
+ * not support additional attributes for rmdir().
+ */
+ list_for_each_entry_safe(mb_gone, mb_safe, &sp->members_gone, list) {
+ node->nodeid = mb_gone->nodeid;
+ node->release_recover = mb_gone->release_recover;
+ node->gone = true;
+ node++;
+
+ list_del(&mb_gone->list);
+ sp->members_gone_count--;
+ kfree(mb_gone);
+ }
+
*count_out = count;
*nodes_out = nodes;
rv = 0;
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 13a3d0b26194..4ebd45f75276 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -17,8 +17,10 @@
struct dlm_config_node {
int nodeid;
int weight;
+ bool gone;
int new;
uint32_t comm_seq;
+ unsigned int release_recover;
};
extern const struct rhashtable_params dlm_rhash_rsb_params;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index e01d5f29f4d2..be938fdf17d9 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -509,7 +509,7 @@ static void add_scan(struct dlm_ls *ls, struct dlm_rsb *r)
void dlm_rsb_scan(struct timer_list *timer)
{
- struct dlm_ls *ls = from_timer(ls, timer, ls_scan_timer);
+ struct dlm_ls *ls = timer_container_of(ls, timer, ls_scan_timer);
int our_nodeid = dlm_our_nodeid();
struct dlm_rsb *r;
int rv;
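
timer_container_of() is the renamed from_timer() helper; the expansion is unchanged:

        /* equivalent to: */
        struct dlm_ls *ls = container_of(timer, struct dlm_ls, ls_scan_timer);
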
@@ -5576,7 +5576,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
if (rl->rl_status == DLM_LKSTS_CONVERT && middle_conversion(lkb)) {
/* We may need to adjust grmode depending on other granted locks. */
- log_limit(ls, "%s %x middle convert gr %d rq %d remote %d %x",
+ log_rinfo(ls, "%s %x middle convert gr %d rq %d remote %d %x",
__func__, lkb->lkb_id, lkb->lkb_grmode,
lkb->lkb_rqmode, lkb->lkb_nodeid, lkb->lkb_remid);
rsb_set_flag(r, RSB_RECOVER_CONVERT);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 1929327ffbe1..ddaa76558706 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -186,12 +186,17 @@ static struct kobj_type dlm_ktype = {
static struct kset *dlm_kset;
-static int do_uevent(struct dlm_ls *ls, int in)
+static int do_uevent(struct dlm_ls *ls, int in, unsigned int release_recover)
{
- if (in)
+ char message[512] = {};
+ char *envp[] = { message, NULL };
+
+ if (in) {
kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE);
- else
- kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
+ } else {
+ snprintf(message, sizeof(message), "RELEASE_RECOVER=%u", release_recover);
+ kobject_uevent_env(&ls->ls_kobj, KOBJ_OFFLINE, envp);
+ }
log_rinfo(ls, "%s the lockspace group...", in ? "joining" : "leaving");
@@ -575,7 +580,7 @@ static int new_lockspace(const char *name, const char *cluster,
current lockspace members are (via configfs) and then tells the
lockspace to start running (via sysfs) in dlm_ls_start(). */
- error = do_uevent(ls, 1);
+ error = do_uevent(ls, 1, 0);
if (error < 0)
goto out_recoverd;
@@ -592,7 +597,7 @@ static int new_lockspace(const char *name, const char *cluster,
return 0;
out_members:
- do_uevent(ls, 0);
+ do_uevent(ls, 0, 0);
dlm_clear_members(ls);
kfree(ls->ls_node_array);
out_recoverd:
@@ -671,19 +676,20 @@ int dlm_new_user_lockspace(const char *name, const char *cluster,
This is because there may be LKBs queued as ASTs that have been unlinked
from their RSBs and are pending deletion once the AST has been delivered */
-static int lockspace_busy(struct dlm_ls *ls, int force)
+static int lockspace_busy(struct dlm_ls *ls, unsigned int release_option)
{
struct dlm_lkb *lkb;
unsigned long id;
int rv = 0;
read_lock_bh(&ls->ls_lkbxa_lock);
- if (force == 0) {
+ if (release_option == DLM_RELEASE_NO_LOCKS) {
xa_for_each(&ls->ls_lkbxa, id, lkb) {
rv = 1;
break;
}
- } else if (force == 1) {
+ } else if (release_option == DLM_RELEASE_UNUSED) {
+ /* TODO: handle this UNUSED option as NO_LOCKS in a later patch */
xa_for_each(&ls->ls_lkbxa, id, lkb) {
if (lkb->lkb_nodeid == 0 &&
lkb->lkb_grmode != DLM_LOCK_IV) {
@@ -698,11 +704,11 @@ static int lockspace_busy(struct dlm_ls *ls, int force)
return rv;
}
-static int release_lockspace(struct dlm_ls *ls, int force)
+static int release_lockspace(struct dlm_ls *ls, unsigned int release_option)
{
int busy, rv;
- busy = lockspace_busy(ls, force);
+ busy = lockspace_busy(ls, release_option);
spin_lock_bh(&lslist_lock);
if (ls->ls_create_count == 1) {
@@ -730,8 +736,9 @@ static int release_lockspace(struct dlm_ls *ls, int force)
dlm_device_deregister(ls);
- if (force < 3 && dlm_user_daemon_available())
- do_uevent(ls, 0);
+ if (release_option != DLM_RELEASE_NO_EVENT &&
+ dlm_user_daemon_available())
+ do_uevent(ls, 0, (release_option == DLM_RELEASE_RECOVER));
dlm_recoverd_stop(ls);
@@ -782,25 +789,24 @@ static int release_lockspace(struct dlm_ls *ls, int force)
* lockspace must continue to function as usual, participating in recoveries,
* until this returns.
*
- * Force has 4 possible values:
- * 0 - don't destroy lockspace if it has any LKBs
- * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
- * 2 - destroy lockspace regardless of LKBs
- * 3 - destroy lockspace as part of a forced shutdown
+ * See DLM_RELEASE defines for release_option values and their meaning.
*/
-int dlm_release_lockspace(void *lockspace, int force)
+int dlm_release_lockspace(void *lockspace, unsigned int release_option)
{
struct dlm_ls *ls;
int error;
+ if (release_option > __DLM_RELEASE_MAX)
+ return -EINVAL;
+
ls = dlm_find_lockspace_local(lockspace);
if (!ls)
return -EINVAL;
dlm_put_lockspace(ls);
mutex_lock(&ls_lock);
- error = release_lockspace(ls, force);
+ error = release_lockspace(ls, release_option);
if (!error)
ls_count--;
if (!ls_count)
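
The DLM_RELEASE_* constants this hunk relies on are defined outside the quoted files. A hedged reconstruction from the old 'force' semantics removed above — the names are real (they appear in the hunks), but the values and comments are inferred:

        #define DLM_RELEASE_NO_LOCKS  0 /* refuse if any LKBs exist */
        #define DLM_RELEASE_UNUSED    1 /* allow remote LKBs, not local */
        #define DLM_RELEASE_NORMAL    2 /* release regardless of LKBs */
        #define DLM_RELEASE_NO_EVENT  3 /* forced shutdown, no offline uevent */
        #define DLM_RELEASE_RECOVER   4 /* offline uevent with RELEASE_RECOVER=1 */
        #define __DLM_RELEASE_MAX     DLM_RELEASE_RECOVER
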
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 70abd4da17a6..b3958008ba3f 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -160,6 +160,7 @@ struct dlm_proto_ops {
bool try_new_addr;
const char *name;
int proto;
+ int how;
void (*sockopts)(struct socket *sock);
int (*bind)(struct socket *sock);
@@ -533,7 +534,7 @@ static void lowcomms_state_change(struct sock *sk)
/* SCTP layer is not calling sk_data_ready when the connection
* is done, so we catch the signal through here.
*/
- if (sk->sk_shutdown == RCV_SHUTDOWN)
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
lowcomms_data_ready(sk);
}
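
The operator change matters because sk_shutdown is a bit mask: once both directions are shut down the field holds RCV_SHUTDOWN | SEND_SHUTDOWN, and the old equality test against RCV_SHUTDOWN silently missed that state. Testing the bit catches both cases:

        /* true for RCV_SHUTDOWN alone and for RCV_SHUTDOWN | SEND_SHUTDOWN */
        if (sk->sk_shutdown & RCV_SHUTDOWN)
                lowcomms_data_ready(sk);
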
@@ -810,7 +811,7 @@ static void shutdown_connection(struct connection *con, bool and_other)
return;
}
- ret = kernel_sock_shutdown(con->sock, SHUT_WR);
+ ret = kernel_sock_shutdown(con->sock, dlm_proto_ops->how);
up_read(&con->sock_lock);
if (ret) {
log_print("Connection %p failed to shutdown: %d will force close",
@@ -1125,7 +1126,7 @@ static void writequeue_entry_complete(struct writequeue_entry *e, int completed)
static int sctp_bind_addrs(struct socket *sock, __be16 port)
{
struct sockaddr_storage localaddr;
- struct sockaddr *addr = (struct sockaddr *)&localaddr;
+ struct sockaddr_unsized *addr = (struct sockaddr_unsized *)&localaddr;
int i, addr_len, result = 0;
for (i = 0; i < dlm_local_count; i++) {
@@ -1598,7 +1599,7 @@ static int dlm_connect(struct connection *con)
log_print_ratelimited("connecting to %d", con->nodeid);
make_sockaddr(&addr, dlm_config.ci_tcp_port, &addr_len);
- result = kernel_connect(sock, (struct sockaddr *)&addr, addr_len, 0);
+ result = kernel_connect(sock, (struct sockaddr_unsized *)&addr, addr_len, 0);
switch (result) {
case -EINPROGRESS:
/* not an error */
@@ -1702,7 +1703,7 @@ static int work_start(void)
return -ENOMEM;
}
- process_workqueue = alloc_workqueue("dlm_process", WQ_HIGHPRI | WQ_BH, 0);
+ process_workqueue = alloc_workqueue("dlm_process", WQ_HIGHPRI | WQ_BH | WQ_PERCPU, 0);
if (!process_workqueue) {
log_print("can't start dlm_process");
destroy_workqueue(io_workqueue);
@@ -1812,7 +1813,7 @@ static int dlm_tcp_bind(struct socket *sock)
memcpy(&src_addr, &dlm_local_addr[0], sizeof(src_addr));
make_sockaddr(&src_addr, 0, &addr_len);
- result = kernel_bind(sock, (struct sockaddr *)&src_addr,
+ result = kernel_bind(sock, (struct sockaddr_unsized *)&src_addr,
addr_len);
if (result < 0) {
/* This *may* not indicate a critical error */
@@ -1851,13 +1852,14 @@ static int dlm_tcp_listen_bind(struct socket *sock)
/* Bind to our port */
make_sockaddr(&dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len);
- return kernel_bind(sock, (struct sockaddr *)&dlm_local_addr[0],
+ return kernel_bind(sock, (struct sockaddr_unsized *)&dlm_local_addr[0],
addr_len);
}
static const struct dlm_proto_ops dlm_tcp_ops = {
.name = "TCP",
.proto = IPPROTO_TCP,
+ .how = SHUT_WR,
.sockopts = dlm_tcp_sockopts,
.bind = dlm_tcp_bind,
.listen_validate = dlm_tcp_listen_validate,
@@ -1896,6 +1898,7 @@ static void dlm_sctp_sockopts(struct socket *sock)
static const struct dlm_proto_ops dlm_sctp_ops = {
.name = "SCTP",
.proto = IPPROTO_SCTP,
+ .how = SHUT_RDWR,
.try_new_addr = true,
.sockopts = dlm_sctp_sockopts,
.bind = dlm_sctp_bind,
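
The per-transport 'how' value, as far as the rationale can be inferred: TCP keeps SHUT_WR so the connection is only half-closed and queued replies can still be read, while SCTP has no half-closed state — shutdown() tears down the whole association — so SHUT_RDWR is the accurate value there.
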
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index 4887c8a05318..a44d16da7187 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -52,7 +52,7 @@ static int __init init_dlm(void)
if (error)
goto out_user;
- dlm_wq = alloc_workqueue("dlm_wq", 0, 0);
+ dlm_wq = alloc_workqueue("dlm_wq", WQ_PERCPU, 0);
if (!dlm_wq) {
error = -ENOMEM;
goto out_plock;
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index b0864c93230f..c0f557a80a75 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -478,7 +478,8 @@ static void dlm_lsop_recover_prep(struct dlm_ls *ls)
ls->ls_ops->recover_prep(ls->ls_ops_arg);
}
-static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb)
+static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb,
+ unsigned int release_recover)
{
struct dlm_slot slot;
uint32_t seq;
@@ -495,7 +496,7 @@ static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb)
error = dlm_comm_seq(memb->nodeid, &seq, false);
- if (!error && seq == memb->comm_seq)
+ if (!release_recover && !error && seq == memb->comm_seq)
return;
slot.nodeid = memb->nodeid;
@@ -552,6 +553,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
struct dlm_member *memb, *safe;
struct dlm_config_node *node;
int i, error, neg = 0, low = -1;
+ unsigned int release_recover;
/* previously removed members that we've not finished removing need to
* count as a negative change so the "neg" recovery steps will happen
@@ -569,11 +571,21 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
node = find_config_node(rv, memb->nodeid);
- if (node && !node->new)
+ if (!node) {
+ log_error(ls, "remove member %d invalid",
+ memb->nodeid);
+ return -EFAULT;
+ }
+
+ if (!node->new && !node->gone)
continue;
- if (!node) {
- log_rinfo(ls, "remove member %d", memb->nodeid);
+ release_recover = 0;
+
+ if (node->gone) {
+ release_recover = node->release_recover;
+ log_rinfo(ls, "remove member %d%s", memb->nodeid,
+ release_recover ? " (release_recover)" : "");
} else {
/* removed and re-added */
log_rinfo(ls, "remove member %d comm_seq %u %u",
@@ -584,13 +596,16 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
list_move(&memb->list, &ls->ls_nodes_gone);
remove_remote_member(memb->nodeid);
ls->ls_num_nodes--;
- dlm_lsop_recover_slot(ls, memb);
+ dlm_lsop_recover_slot(ls, memb, release_recover);
}
/* add new members to ls_nodes */
for (i = 0; i < rv->nodes_count; i++) {
node = &rv->nodes[i];
+ if (node->gone)
+ continue;
+
if (dlm_is_member(ls, node->nodeid))
continue;
error = dlm_add_member(ls, node);
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index be4240f09abd..3ac020fb8139 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -842,7 +842,7 @@ static void recover_conversion(struct dlm_rsb *r)
*/
if (((lkb->lkb_grmode == DLM_LOCK_PR) && (other_grmode == DLM_LOCK_CW)) ||
((lkb->lkb_grmode == DLM_LOCK_CW) && (other_grmode == DLM_LOCK_PR))) {
- log_limit(ls, "%s %x gr %d rq %d, remote %d %x, other_lkid %u, other gr %d, set gr=NL",
+ log_rinfo(ls, "%s %x gr %d rq %d, remote %d %x, other_lkid %u, other gr %d, set gr=NL",
__func__, lkb->lkb_id, lkb->lkb_grmode,
lkb->lkb_rqmode, lkb->lkb_nodeid,
lkb->lkb_remid, other_lkid, other_grmode);
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 5cb3896be826..51daf4acbe31 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -425,7 +425,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params)
dlm_put_lockspace(ls);
if (error)
- dlm_release_lockspace(lockspace, 0);
+ dlm_release_lockspace(lockspace, DLM_RELEASE_NO_LOCKS);
else
error = ls->ls_device.minor;
@@ -436,7 +436,7 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
{
dlm_lockspace_t *lockspace;
struct dlm_ls *ls;
- int error, force = 0;
+ int error, force = DLM_RELEASE_NO_LOCKS;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -446,7 +446,7 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
return -ENOENT;
if (params->flags & DLM_USER_LSFLG_FORCEFREE)
- force = 2;
+ force = DLM_RELEASE_NORMAL;
lockspace = ls;
dlm_put_lockspace(ls);
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 019a8b4eaaf9..49f56a598ecb 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -28,7 +28,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
* inodes without pages but we deliberately won't in case
* we need to reschedule to avoid softlockups.
*/
- if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+ if ((inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) ||
(mapping_empty(inode->i_mapping) && !need_resched())) {
spin_unlock(&inode->i_lock);
continue;
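
Direct inode->i_state reads are being converted to accessors throughout this series. A sketch of the assumed contract (not shown in these hunks): inode_state_read() expects inode->i_lock to be held, as it is at this call site, while the inode_state_read_once() variant used in the ecryptfs hunks below is the lockless READ_ONCE-style read.
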
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
index 1bdeaa6d5790..c2f4fb41b4e6 100644
--- a/fs/ecryptfs/Kconfig
+++ b/fs/ecryptfs/Kconfig
@@ -4,7 +4,7 @@ config ECRYPT_FS
depends on KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)
select CRYPTO_ECB
select CRYPTO_CBC
- select CRYPTO_MD5
+ select CRYPTO_LIB_MD5
help
Encrypted filesystem that operates on the VFS layer. See
<file:Documentation/filesystems/ecryptfs.rst> to learn more about
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 69536cacdea8..260f8a4938b0 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -9,7 +9,6 @@
* Michael C. Thompson <mcthomps@us.ibm.com>
*/
-#include <crypto/hash.h>
#include <crypto/skcipher.h>
#include <linux/fs.h>
#include <linux/mount.h>
@@ -48,32 +47,6 @@ void ecryptfs_from_hex(char *dst, char *src, int dst_size)
}
}
-/**
- * ecryptfs_calculate_md5 - calculates the md5 of @src
- * @dst: Pointer to 16 bytes of allocated memory
- * @crypt_stat: Pointer to crypt_stat struct for the current inode
- * @src: Data to be md5'd
- * @len: Length of @src
- *
- * Uses the allocated crypto context that crypt_stat references to
- * generate the MD5 sum of the contents of src.
- */
-static int ecryptfs_calculate_md5(char *dst,
- struct ecryptfs_crypt_stat *crypt_stat,
- char *src, int len)
-{
- int rc = crypto_shash_tfm_digest(crypt_stat->hash_tfm, src, len, dst);
-
- if (rc) {
- printk(KERN_ERR
- "%s: Error computing crypto hash; rc = [%d]\n",
- __func__, rc);
- goto out;
- }
-out:
- return rc;
-}
-
static int ecryptfs_crypto_api_algify_cipher_name(char **algified_name,
char *cipher_name,
char *chaining_modifier)
@@ -104,13 +77,10 @@ out:
*
* Generate the initialization vector from the given root IV and page
* offset.
- *
- * Returns zero on success; non-zero on error.
*/
-int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
- loff_t offset)
+void ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
+ loff_t offset)
{
- int rc = 0;
char dst[MD5_DIGEST_SIZE];
char src[ECRYPTFS_MAX_IV_BYTES + 16];
@@ -129,20 +99,12 @@ int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
ecryptfs_printk(KERN_DEBUG, "source:\n");
ecryptfs_dump_hex(src, (crypt_stat->iv_bytes + 16));
}
- rc = ecryptfs_calculate_md5(dst, crypt_stat, src,
- (crypt_stat->iv_bytes + 16));
- if (rc) {
- ecryptfs_printk(KERN_WARNING, "Error attempting to compute "
- "MD5 while generating IV for a page\n");
- goto out;
- }
+ md5(src, crypt_stat->iv_bytes + 16, dst);
memcpy(iv, dst, crypt_stat->iv_bytes);
if (unlikely(ecryptfs_verbosity > 0)) {
ecryptfs_printk(KERN_DEBUG, "derived iv:\n");
ecryptfs_dump_hex(iv, crypt_stat->iv_bytes);
}
-out:
- return rc;
}
/**
@@ -151,29 +113,14 @@ out:
*
* Initialize the crypt_stat structure.
*/
-int ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
+void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
{
- struct crypto_shash *tfm;
- int rc;
-
- tfm = crypto_alloc_shash(ECRYPTFS_DEFAULT_HASH, 0, 0);
- if (IS_ERR(tfm)) {
- rc = PTR_ERR(tfm);
- ecryptfs_printk(KERN_ERR, "Error attempting to "
- "allocate crypto context; rc = [%d]\n",
- rc);
- return rc;
- }
-
memset((void *)crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat));
INIT_LIST_HEAD(&crypt_stat->keysig_list);
mutex_init(&crypt_stat->keysig_list_mutex);
mutex_init(&crypt_stat->cs_mutex);
mutex_init(&crypt_stat->cs_tfm_mutex);
- crypt_stat->hash_tfm = tfm;
crypt_stat->flags |= ECRYPTFS_STRUCT_INITIALIZED;
-
- return 0;
}
/**
@@ -187,7 +134,6 @@ void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
struct ecryptfs_key_sig *key_sig, *key_sig_tmp;
crypto_free_skcipher(crypt_stat->tfm);
- crypto_free_shash(crypt_stat->hash_tfm);
list_for_each_entry_safe(key_sig, key_sig_tmp,
&crypt_stat->keysig_list, crypt_stat_list) {
list_del(&key_sig->crypt_stat_list);
@@ -361,14 +307,7 @@ static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat,
int rc;
extent_base = (((loff_t)page_index) * (PAGE_SIZE / extent_size));
- rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
- (extent_base + extent_offset));
- if (rc) {
- ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
- "extent [0x%.16llx]; rc = [%d]\n",
- (unsigned long long)(extent_base + extent_offset), rc);
- goto out;
- }
+ ecryptfs_derive_iv(extent_iv, crypt_stat, extent_base + extent_offset);
sg_init_table(&src_sg, 1);
sg_init_table(&dst_sg, 1);
@@ -609,31 +548,20 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
*/
int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat)
{
- int rc = 0;
char dst[MD5_DIGEST_SIZE];
BUG_ON(crypt_stat->iv_bytes > MD5_DIGEST_SIZE);
BUG_ON(crypt_stat->iv_bytes <= 0);
if (!(crypt_stat->flags & ECRYPTFS_KEY_VALID)) {
- rc = -EINVAL;
ecryptfs_printk(KERN_WARNING, "Session key not valid; "
"cannot generate root IV\n");
- goto out;
- }
- rc = ecryptfs_calculate_md5(dst, crypt_stat, crypt_stat->key,
- crypt_stat->key_size);
- if (rc) {
- ecryptfs_printk(KERN_WARNING, "Error attempting to compute "
- "MD5 while generating root IV\n");
- goto out;
- }
- memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes);
-out:
- if (rc) {
memset(crypt_stat->root_iv, 0, crypt_stat->iv_bytes);
crypt_stat->flags |= ECRYPTFS_SECURITY_WARNING;
+ return -EINVAL;
}
- return rc;
+ md5(crypt_stat->key, crypt_stat->key_size, dst);
+ memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes);
+ return 0;
}
static void ecryptfs_generate_new_key(struct ecryptfs_crypt_stat *crypt_stat)
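
The conversion above relies on the MD5 library interface (CRYPTO_LIB_MD5, <crypto/md5.h>) rather than a crypto_shash tfm; the one-shot helper neither allocates nor fails, which is what lets ecryptfs_derive_iv() and ecryptfs_init_crypt_stat() become void. A minimal sketch of the call as used here:

        #include <crypto/md5.h>

        u8 digest[MD5_DIGEST_SIZE];

        md5(data, data_len, digest);    /* one-shot; no tfm, no error path */
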
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 1dfd5b81d831..6648a924e31a 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -59,14 +59,6 @@ static int ecryptfs_d_revalidate(struct inode *dir, const struct qstr *name,
return rc;
}
-struct kmem_cache *ecryptfs_dentry_info_cache;
-
-static void ecryptfs_dentry_free_rcu(struct rcu_head *head)
-{
- kmem_cache_free(ecryptfs_dentry_info_cache,
- container_of(head, struct ecryptfs_dentry_info, rcu));
-}
-
/**
* ecryptfs_d_release
* @dentry: The ecryptfs dentry
@@ -75,11 +67,7 @@ static void ecryptfs_dentry_free_rcu(struct rcu_head *head)
*/
static void ecryptfs_d_release(struct dentry *dentry)
{
- struct ecryptfs_dentry_info *p = dentry->d_fsdata;
- if (p) {
- path_put(&p->lower_path);
- call_rcu(&p->rcu, ecryptfs_dentry_free_rcu);
- }
+ dput(dentry->d_fsdata);
}
const struct dentry_operations ecryptfs_dops = {
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 1f562e75d0e4..62a2ea7f59ed 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -14,6 +14,7 @@
#ifndef ECRYPTFS_KERNEL_H
#define ECRYPTFS_KERNEL_H
+#include <crypto/md5.h>
#include <crypto/skcipher.h>
#include <keys/user-type.h>
#include <keys/encrypted-type.h>
@@ -137,8 +138,6 @@ ecryptfs_get_key_payload_data(struct key *key)
+ MAGIC_ECRYPTFS_MARKER_SIZE_BYTES)
#define ECRYPTFS_DEFAULT_CIPHER "aes"
#define ECRYPTFS_DEFAULT_KEY_BYTES 16
-#define ECRYPTFS_DEFAULT_HASH "md5"
-#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH
#define ECRYPTFS_TAG_1_PACKET_TYPE 0x01
#define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C
#define ECRYPTFS_TAG_11_PACKET_TYPE 0xED
@@ -163,8 +162,6 @@ ecryptfs_get_key_payload_data(struct key *key)
* ECRYPTFS_MAX_IV_BYTES */
#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16
#define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */
-#define MD5_DIGEST_SIZE 16
-#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE
#define ECRYPTFS_TAG_70_MIN_METADATA_SIZE (1 + ECRYPTFS_MIN_PKT_LEN_SIZE \
+ ECRYPTFS_SIG_SIZE + 1 + 1)
#define ECRYPTFS_TAG_70_MAX_METADATA_SIZE (1 + ECRYPTFS_MAX_PKT_LEN_SIZE \
@@ -237,8 +234,6 @@ struct ecryptfs_crypt_stat {
unsigned int extent_mask;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
struct crypto_skcipher *tfm;
- struct crypto_shash *hash_tfm; /* Crypto context for generating
- * the initialization vectors */
unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
unsigned char key[ECRYPTFS_MAX_KEY_BYTES];
unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES];
@@ -258,13 +253,6 @@ struct ecryptfs_inode_info {
struct ecryptfs_crypt_stat crypt_stat;
};
-/* dentry private data. Each dentry must keep track of a lower
- * vfsmount too. */
-struct ecryptfs_dentry_info {
- struct path lower_path;
- struct rcu_head rcu;
-};
-
/**
* ecryptfs_global_auth_tok - A key used to encrypt all new files under the mountpoint
* @flags: Status flags
@@ -348,6 +336,7 @@ struct ecryptfs_mount_crypt_stat {
/* superblock private data. */
struct ecryptfs_sb_info {
struct super_block *wsi_sb;
+ struct vfsmount *lower_mnt;
struct ecryptfs_mount_crypt_stat mount_crypt_stat;
};
@@ -494,22 +483,25 @@ ecryptfs_set_superblock_lower(struct super_block *sb,
}
static inline void
-ecryptfs_set_dentry_private(struct dentry *dentry,
- struct ecryptfs_dentry_info *dentry_info)
+ecryptfs_set_dentry_lower(struct dentry *dentry,
+ struct dentry *lower_dentry)
{
- dentry->d_fsdata = dentry_info;
+ dentry->d_fsdata = lower_dentry;
}
static inline struct dentry *
ecryptfs_dentry_to_lower(struct dentry *dentry)
{
- return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry;
+ return dentry->d_fsdata;
}
-static inline const struct path *
-ecryptfs_dentry_to_lower_path(struct dentry *dentry)
+static inline struct path
+ecryptfs_lower_path(struct dentry *dentry)
{
- return &((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path;
+ return (struct path){
+ .mnt = ecryptfs_superblock_to_private(dentry->d_sb)->lower_mnt,
+ .dentry = ecryptfs_dentry_to_lower(dentry)
+ };
}
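
Since the lower vfsmount is now per-superblock, the struct path is assembled on demand and returned by value, holding only borrowed references. Usage, as in the file.c hunks below:

        struct path lower = ecryptfs_lower_path(dentry);

        touch_atime(&lower);    /* .mnt/.dentry are borrowed; no path_put() */
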
#define ecryptfs_printk(type, fmt, arg...) \
@@ -532,7 +524,6 @@ extern unsigned int ecryptfs_number_of_users;
extern struct kmem_cache *ecryptfs_auth_tok_list_item_cache;
extern struct kmem_cache *ecryptfs_file_info_cache;
-extern struct kmem_cache *ecryptfs_dentry_info_cache;
extern struct kmem_cache *ecryptfs_inode_info_cache;
extern struct kmem_cache *ecryptfs_sb_info_cache;
extern struct kmem_cache *ecryptfs_header_cache;
@@ -557,13 +548,12 @@ int ecryptfs_encrypt_and_encode_filename(
size_t *encoded_name_size,
struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
const char *name, size_t name_size);
-struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
void ecryptfs_dump_hex(char *data, int bytes);
int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
int sg_size);
int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat);
void ecryptfs_rotate_iv(unsigned char *iv);
-int ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
+void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
void ecryptfs_destroy_mount_crypt_stat(
struct ecryptfs_mount_crypt_stat *mount_crypt_stat);
@@ -698,8 +688,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
char *data, size_t max_packet_size);
int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
struct ecryptfs_mount_crypt_stat *mount_crypt_stat);
-int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
- loff_t offset);
+void ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
+ loff_t offset);
extern const struct xattr_handler * const ecryptfs_xattr_handlers[];
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index ce0a3c5ed0ca..7929411837cf 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -33,13 +33,12 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
struct iov_iter *to)
{
ssize_t rc;
- const struct path *path;
struct file *file = iocb->ki_filp;
rc = generic_file_read_iter(iocb, to);
if (rc >= 0) {
- path = ecryptfs_dentry_to_lower_path(file->f_path.dentry);
- touch_atime(path);
+ struct path path = ecryptfs_lower_path(file->f_path.dentry);
+ touch_atime(&path);
}
return rc;
}
@@ -59,12 +58,11 @@ static ssize_t ecryptfs_splice_read_update_atime(struct file *in, loff_t *ppos,
size_t len, unsigned int flags)
{
ssize_t rc;
- const struct path *path;
rc = filemap_splice_read(in, ppos, pipe, len, flags);
if (rc >= 0) {
- path = ecryptfs_dentry_to_lower_path(in->f_path.dentry);
- touch_atime(path);
+ struct path path = ecryptfs_lower_path(in->f_path.dentry);
+ touch_atime(&path);
}
return rc;
}
@@ -193,7 +191,7 @@ static int ecryptfs_mmap(struct file *file, struct vm_area_struct *vma)
* natively. If FILESYSTEM_MAX_STACK_DEPTH > 2 or ecryptfs
* allows recursive mounting, this will need to be extended.
*/
- if (!lower_file->f_op->mmap)
+ if (!can_mmap_file(lower_file))
return -ENODEV;
return generic_file_mmap(file, vma);
}
@@ -283,6 +281,7 @@ static int ecryptfs_dir_open(struct inode *inode, struct file *file)
* ecryptfs_lookup() */
struct ecryptfs_file_info *file_info;
struct file *lower_file;
+ struct path path;
/* Released in ecryptfs_release or end of function if failure */
file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL);
@@ -292,8 +291,8 @@ static int ecryptfs_dir_open(struct inode *inode, struct file *file)
"Error attempting to allocate memory\n");
return -ENOMEM;
}
- lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry),
- file->f_flags, current_cred());
+ path = ecryptfs_lower_path(ecryptfs_dentry);
+ lower_file = dentry_open(&path, file->f_flags, current_cred());
if (IS_ERR(lower_file)) {
printk(KERN_ERR "%s: Error attempting to initialize "
"the lower file for the dentry with name "
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 51a5c54eb740..3978248247dc 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -24,18 +24,26 @@
#include <linux/unaligned.h>
#include "ecryptfs_kernel.h"
-static int lock_parent(struct dentry *dentry,
- struct dentry **lower_dentry,
- struct inode **lower_dir)
+static struct dentry *ecryptfs_start_creating_dentry(struct dentry *dentry)
{
- struct dentry *lower_dir_dentry;
+ struct dentry *parent = dget_parent(dentry);
+ struct dentry *ret;
- lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
- *lower_dir = d_inode(lower_dir_dentry);
- *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ ret = start_creating_dentry(ecryptfs_dentry_to_lower(parent),
+ ecryptfs_dentry_to_lower(dentry));
+ dput(parent);
+ return ret;
+}
- inode_lock_nested(*lower_dir, I_MUTEX_PARENT);
- return (*lower_dentry)->d_parent == lower_dir_dentry ? 0 : -EINVAL;
+static struct dentry *ecryptfs_start_removing_dentry(struct dentry *dentry)
+{
+ struct dentry *parent = dget_parent(dentry);
+ struct dentry *ret;
+
+ ret = start_removing_dentry(ecryptfs_dentry_to_lower(parent),
+ ecryptfs_dentry_to_lower(dentry));
+ dput(parent);
+ return ret;
}
static int ecryptfs_inode_test(struct inode *inode, void *lower_inode)
@@ -95,7 +103,7 @@ static struct inode *__ecryptfs_get_inode(struct inode *lower_inode,
iput(lower_inode);
return ERR_PTR(-EACCES);
}
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
iput(lower_inode);
return inode;
@@ -106,7 +114,7 @@ struct inode *ecryptfs_get_inode(struct inode *lower_inode,
{
struct inode *inode = __ecryptfs_get_inode(lower_inode, sb);
- if (!IS_ERR(inode) && (inode->i_state & I_NEW))
+ if (!IS_ERR(inode) && (inode_state_read_once(inode) & I_NEW))
unlock_new_inode(inode);
return inode;
@@ -141,15 +149,12 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
struct inode *lower_dir;
int rc;
- rc = lock_parent(dentry, &lower_dentry, &lower_dir);
- dget(lower_dentry); // don't even try to make the lower negative
- if (!rc) {
- if (d_unhashed(lower_dentry))
- rc = -EINVAL;
- else
- rc = vfs_unlink(&nop_mnt_idmap, lower_dir, lower_dentry,
- NULL);
- }
+ lower_dentry = ecryptfs_start_removing_dentry(dentry);
+ if (IS_ERR(lower_dentry))
+ return PTR_ERR(lower_dentry);
+
+ lower_dir = lower_dentry->d_parent->d_inode;
+ rc = vfs_unlink(&nop_mnt_idmap, lower_dir, lower_dentry, NULL);
if (rc) {
printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
goto out_unlock;
@@ -158,8 +163,7 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink);
inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
out_unlock:
- dput(lower_dentry);
- inode_unlock(lower_dir);
+ end_removing(lower_dentry);
if (!rc)
d_drop(dentry);
return rc;
@@ -186,10 +190,11 @@ ecryptfs_do_create(struct inode *directory_inode,
struct inode *lower_dir;
struct inode *inode;
- rc = lock_parent(ecryptfs_dentry, &lower_dentry, &lower_dir);
- if (!rc)
- rc = vfs_create(&nop_mnt_idmap, lower_dir,
- lower_dentry, mode, true);
+ lower_dentry = ecryptfs_start_creating_dentry(ecryptfs_dentry);
+ if (IS_ERR(lower_dentry))
+ return ERR_CAST(lower_dentry);
+ lower_dir = lower_dentry->d_parent->d_inode;
+ rc = vfs_create(&nop_mnt_idmap, lower_dentry, mode, NULL);
if (rc) {
printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
"rc = [%d]\n", __func__, rc);
@@ -205,7 +210,7 @@ ecryptfs_do_create(struct inode *directory_inode,
fsstack_copy_attr_times(directory_inode, lower_dir);
fsstack_copy_inode_size(directory_inode, lower_dir);
out_lock:
- inode_unlock(lower_dir);
+ end_creating(lower_dentry);
return inode;
}
@@ -327,24 +332,15 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
struct dentry *lower_dentry)
{
- const struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent);
+ struct dentry *lower_parent = ecryptfs_dentry_to_lower(dentry->d_parent);
struct inode *inode, *lower_inode;
- struct ecryptfs_dentry_info *dentry_info;
int rc = 0;
- dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
- if (!dentry_info) {
- dput(lower_dentry);
- return ERR_PTR(-ENOMEM);
- }
-
fsstack_copy_attr_atime(d_inode(dentry->d_parent),
- d_inode(path->dentry));
+ d_inode(lower_parent));
BUG_ON(!d_count(lower_dentry));
- ecryptfs_set_dentry_private(dentry, dentry_info);
- dentry_info->lower_path.mnt = mntget(path->mnt);
- dentry_info->lower_path.dentry = lower_dentry;
+ ecryptfs_set_dentry_lower(dentry, lower_dentry);
/*
* negative dentry can go positive under us here - its parent is not
@@ -373,7 +369,7 @@ static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
}
}
- if (inode->i_state & I_NEW)
+ if (inode_state_read_once(inode) & I_NEW)
unlock_new_inode(inode);
return d_splice_alias(inode, dentry);
}
@@ -394,8 +390,8 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
char *encrypted_and_encoded_name = NULL;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
struct dentry *lower_dir_dentry, *lower_dentry;
- const char *name = ecryptfs_dentry->d_name.name;
- size_t len = ecryptfs_dentry->d_name.len;
+ struct qstr qname = QSTR_INIT(ecryptfs_dentry->d_name.name,
+ ecryptfs_dentry->d_name.len);
struct dentry *res;
int rc = 0;
@@ -404,23 +400,25 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
mount_crypt_stat = &ecryptfs_superblock_to_private(
ecryptfs_dentry->d_sb)->mount_crypt_stat;
if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
+ size_t len = qname.len;
rc = ecryptfs_encrypt_and_encode_filename(
&encrypted_and_encoded_name, &len,
- mount_crypt_stat, name, len);
+ mount_crypt_stat, qname.name, len);
if (rc) {
printk(KERN_ERR "%s: Error attempting to encrypt and encode "
"filename; rc = [%d]\n", __func__, rc);
return ERR_PTR(rc);
}
- name = encrypted_and_encoded_name;
+ qname.name = encrypted_and_encoded_name;
+ qname.len = len;
}
- lower_dentry = lookup_one_len_unlocked(name, lower_dir_dentry, len);
+ lower_dentry = lookup_noperm_unlocked(&qname, lower_dir_dentry);
if (IS_ERR(lower_dentry)) {
- ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
+ ecryptfs_printk(KERN_DEBUG, "%s: lookup_noperm_unlocked() returned "
"[%ld] on lower_dentry = [%s]\n", __func__,
PTR_ERR(lower_dentry),
- name);
+ qname.name);
res = ERR_CAST(lower_dentry);
} else {
res = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry);
@@ -440,10 +438,12 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
file_size_save = i_size_read(d_inode(old_dentry));
lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
- rc = lock_parent(new_dentry, &lower_new_dentry, &lower_dir);
- if (!rc)
- rc = vfs_link(lower_old_dentry, &nop_mnt_idmap, lower_dir,
- lower_new_dentry, NULL);
+ lower_new_dentry = ecryptfs_start_creating_dentry(new_dentry);
+ if (IS_ERR(lower_new_dentry))
+ return PTR_ERR(lower_new_dentry);
+ lower_dir = lower_new_dentry->d_parent->d_inode;
+ rc = vfs_link(lower_old_dentry, &nop_mnt_idmap, lower_dir,
+ lower_new_dentry, NULL);
if (rc || d_really_is_negative(lower_new_dentry))
goto out_lock;
rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
@@ -455,7 +455,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
ecryptfs_inode_to_lower(d_inode(old_dentry))->i_nlink);
i_size_write(d_inode(new_dentry), file_size_save);
out_lock:
- inode_unlock(lower_dir);
+ end_creating(lower_new_dentry);
return rc;
}
@@ -475,9 +475,11 @@ static int ecryptfs_symlink(struct mnt_idmap *idmap,
size_t encoded_symlen;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
- rc = lock_parent(dentry, &lower_dentry, &lower_dir);
- if (rc)
- goto out_lock;
+ lower_dentry = ecryptfs_start_creating_dentry(dentry);
+ if (IS_ERR(lower_dentry))
+ return PTR_ERR(lower_dentry);
+ lower_dir = lower_dentry->d_parent->d_inode;
+
mount_crypt_stat = &ecryptfs_superblock_to_private(
dir->i_sb)->mount_crypt_stat;
rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
@@ -487,7 +489,7 @@ static int ecryptfs_symlink(struct mnt_idmap *idmap,
if (rc)
goto out_lock;
rc = vfs_symlink(&nop_mnt_idmap, lower_dir, lower_dentry,
- encoded_symname);
+ encoded_symname, NULL);
kfree(encoded_symname);
if (rc || d_really_is_negative(lower_dentry))
goto out_lock;
@@ -497,7 +499,7 @@ static int ecryptfs_symlink(struct mnt_idmap *idmap,
fsstack_copy_attr_times(dir, lower_dir);
fsstack_copy_inode_size(dir, lower_dir);
out_lock:
- inode_unlock(lower_dir);
+ end_creating(lower_dentry);
if (d_really_is_negative(dentry))
d_drop(dentry);
return rc;
@@ -508,14 +510,16 @@ static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
{
int rc;
struct dentry *lower_dentry;
+ struct dentry *lower_dir_dentry;
struct inode *lower_dir;
- rc = lock_parent(dentry, &lower_dentry, &lower_dir);
- if (rc)
- goto out;
-
+ lower_dentry = ecryptfs_start_creating_dentry(dentry);
+ if (IS_ERR(lower_dentry))
+ return lower_dentry;
+ lower_dir_dentry = dget(lower_dentry->d_parent);
+ lower_dir = lower_dir_dentry->d_inode;
lower_dentry = vfs_mkdir(&nop_mnt_idmap, lower_dir,
- lower_dentry, mode);
+ lower_dentry, mode, NULL);
rc = PTR_ERR(lower_dentry);
if (IS_ERR(lower_dentry))
goto out;
@@ -529,7 +533,7 @@ static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
fsstack_copy_inode_size(dir, lower_dir);
set_nlink(dir, lower_dir->i_nlink);
out:
- inode_unlock(lower_dir);
+ end_creating(lower_dentry);
if (d_really_is_negative(dentry))
d_drop(dentry);
return ERR_PTR(rc);
@@ -541,21 +545,18 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
struct inode *lower_dir;
int rc;
- rc = lock_parent(dentry, &lower_dentry, &lower_dir);
- dget(lower_dentry); // don't even try to make the lower negative
- if (!rc) {
- if (d_unhashed(lower_dentry))
- rc = -EINVAL;
- else
- rc = vfs_rmdir(&nop_mnt_idmap, lower_dir, lower_dentry);
- }
+ lower_dentry = ecryptfs_start_removing_dentry(dentry);
+ if (IS_ERR(lower_dentry))
+ return PTR_ERR(lower_dentry);
+ lower_dir = lower_dentry->d_parent->d_inode;
+
+ rc = vfs_rmdir(&nop_mnt_idmap, lower_dir, lower_dentry, NULL);
if (!rc) {
clear_nlink(d_inode(dentry));
fsstack_copy_attr_times(dir, lower_dir);
set_nlink(dir, lower_dir->i_nlink);
}
- dput(lower_dentry);
- inode_unlock(lower_dir);
+ end_removing(lower_dentry);
if (!rc)
d_drop(dentry);
return rc;
@@ -569,10 +570,12 @@ ecryptfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *lower_dentry;
struct inode *lower_dir;
- rc = lock_parent(dentry, &lower_dentry, &lower_dir);
- if (!rc)
- rc = vfs_mknod(&nop_mnt_idmap, lower_dir,
- lower_dentry, mode, dev);
+ lower_dentry = ecryptfs_start_creating_dentry(dentry);
+ if (IS_ERR(lower_dentry))
+ return PTR_ERR(lower_dentry);
+ lower_dir = lower_dentry->d_parent->d_inode;
+
+ rc = vfs_mknod(&nop_mnt_idmap, lower_dir, lower_dentry, mode, dev, NULL);
if (rc || d_really_is_negative(lower_dentry))
goto out;
rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
@@ -581,7 +584,7 @@ ecryptfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
fsstack_copy_attr_times(dir, lower_dir);
fsstack_copy_inode_size(dir, lower_dir);
out:
- inode_unlock(lower_dir);
+ end_creating(lower_dentry);
if (d_really_is_negative(dentry))
d_drop(dentry);
return rc;
@@ -597,7 +600,6 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct dentry *lower_new_dentry;
struct dentry *lower_old_dir_dentry;
struct dentry *lower_new_dir_dentry;
- struct dentry *trap;
struct inode *target_inode;
struct renamedata rd = {};
@@ -612,32 +614,13 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
target_inode = d_inode(new_dentry);
- trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
- if (IS_ERR(trap))
- return PTR_ERR(trap);
- dget(lower_new_dentry);
- rc = -EINVAL;
- if (lower_old_dentry->d_parent != lower_old_dir_dentry)
- goto out_lock;
- if (lower_new_dentry->d_parent != lower_new_dir_dentry)
- goto out_lock;
- if (d_unhashed(lower_old_dentry) || d_unhashed(lower_new_dentry))
- goto out_lock;
- /* source should not be ancestor of target */
- if (trap == lower_old_dentry)
- goto out_lock;
- /* target should not be ancestor of source */
- if (trap == lower_new_dentry) {
- rc = -ENOTEMPTY;
- goto out_lock;
- }
+ rd.mnt_idmap = &nop_mnt_idmap;
+ rd.old_parent = lower_old_dir_dentry;
+ rd.new_parent = lower_new_dir_dentry;
+ rc = start_renaming_two_dentries(&rd, lower_old_dentry, lower_new_dentry);
+ if (rc)
+ return rc;
- rd.old_mnt_idmap = &nop_mnt_idmap;
- rd.old_dir = d_inode(lower_old_dir_dentry);
- rd.old_dentry = lower_old_dentry;
- rd.new_mnt_idmap = &nop_mnt_idmap;
- rd.new_dir = d_inode(lower_new_dir_dentry);
- rd.new_dentry = lower_new_dentry;
rc = vfs_rename(&rd);
if (rc)
goto out_lock;
@@ -648,8 +631,7 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (new_dir != old_dir)
fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry));
out_lock:
- dput(lower_new_dentry);
- unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+ end_renaming(&rd);
return rc;
}
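
Both this hunk and the debugfs one above replace the open-coded lock_rename()/trap checks with the renamedata helpers; the pattern, as used in this series:

        struct renamedata rd = {
                .mnt_idmap  = &nop_mnt_idmap,
                .old_parent = lower_old_dir_dentry,
                .new_parent = lower_new_dir_dentry,
        };

        rc = start_renaming_two_dentries(&rd, lower_old_dentry, lower_new_dentry);
        if (rc)
                return rc;      /* locking + sanity checks failed atomically */
        rc = vfs_rename(&rd);
        end_renaming(&rd);      /* unlock and drop what start_renaming took */
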
@@ -911,11 +893,8 @@ static int ecryptfs_setattr(struct mnt_idmap *idmap,
struct ecryptfs_crypt_stat *crypt_stat;
crypt_stat = &ecryptfs_inode_to_private(d_inode(dentry))->crypt_stat;
- if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) {
- rc = ecryptfs_init_crypt_stat(crypt_stat);
- if (rc)
- return rc;
- }
+ if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED))
+ ecryptfs_init_crypt_stat(crypt_stat);
inode = d_inode(dentry);
lower_inode = ecryptfs_inode_to_lower(inode);
lower_dentry = ecryptfs_dentry_to_lower(dentry);
@@ -1020,10 +999,10 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap,
{
struct dentry *dentry = path->dentry;
struct kstat lower_stat;
+ struct path lower_path = ecryptfs_lower_path(dentry);
int rc;
- rc = vfs_getattr_nosec(ecryptfs_dentry_to_lower_path(dentry),
- &lower_stat, request_mask, flags);
+ rc = vfs_getattr_nosec(&lower_path, &lower_stat, request_mask, flags);
if (!rc) {
fsstack_copy_attr_all(d_inode(dentry),
ecryptfs_inode_to_lower(d_inode(dentry)));
@@ -1122,13 +1101,13 @@ out:
return rc;
}
-static int ecryptfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+static int ecryptfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
return vfs_fileattr_get(ecryptfs_dentry_to_lower(dentry), fa);
}
static int ecryptfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
int rc;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 7f9f68c00ef6..bbf8603242fa 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -11,7 +11,6 @@
* Trevor S. Highland <trevor.highland@gmail.com>
*/
-#include <crypto/hash.h>
#include <crypto/skcipher.h>
#include <linux/string.h>
#include <linux/pagemap.h>
@@ -601,10 +600,7 @@ struct ecryptfs_write_tag_70_packet_silly_stack {
struct crypto_skcipher *skcipher_tfm;
struct skcipher_request *skcipher_req;
char iv[ECRYPTFS_MAX_IV_BYTES];
- char hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
- char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
- struct crypto_shash *hash_tfm;
- struct shash_desc *hash_desc;
+ char hash[MD5_DIGEST_SIZE];
};
/*
@@ -741,51 +737,15 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
"password tokens\n", __func__);
goto out_free_unlock;
}
- s->hash_tfm = crypto_alloc_shash(ECRYPTFS_TAG_70_DIGEST, 0, 0);
- if (IS_ERR(s->hash_tfm)) {
- rc = PTR_ERR(s->hash_tfm);
- printk(KERN_ERR "%s: Error attempting to "
- "allocate hash crypto context; rc = [%d]\n",
- __func__, rc);
- goto out_free_unlock;
- }
-
- s->hash_desc = kmalloc(sizeof(*s->hash_desc) +
- crypto_shash_descsize(s->hash_tfm), GFP_KERNEL);
- if (!s->hash_desc) {
- rc = -ENOMEM;
- goto out_release_free_unlock;
- }
- s->hash_desc->tfm = s->hash_tfm;
-
- rc = crypto_shash_digest(s->hash_desc,
- (u8 *)s->auth_tok->token.password.session_key_encryption_key,
- s->auth_tok->token.password.session_key_encryption_key_bytes,
- s->hash);
- if (rc) {
- printk(KERN_ERR
- "%s: Error computing crypto hash; rc = [%d]\n",
- __func__, rc);
- goto out_release_free_unlock;
- }
+ md5(s->auth_tok->token.password.session_key_encryption_key,
+ s->auth_tok->token.password.session_key_encryption_key_bytes,
+ s->hash);
for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) {
s->block_aligned_filename[s->j] =
- s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)];
- if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)
- == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) {
- rc = crypto_shash_digest(s->hash_desc, (u8 *)s->hash,
- ECRYPTFS_TAG_70_DIGEST_SIZE,
- s->tmp_hash);
- if (rc) {
- printk(KERN_ERR
- "%s: Error computing crypto hash; "
- "rc = [%d]\n", __func__, rc);
- goto out_release_free_unlock;
- }
- memcpy(s->hash, s->tmp_hash,
- ECRYPTFS_TAG_70_DIGEST_SIZE);
- }
+ s->hash[s->j % MD5_DIGEST_SIZE];
+ if ((s->j % MD5_DIGEST_SIZE) == (MD5_DIGEST_SIZE - 1))
+ md5(s->hash, MD5_DIGEST_SIZE, s->hash);
if (s->block_aligned_filename[s->j] == '\0')
s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL;
}
@@ -798,7 +758,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
"convert filename memory to scatterlist; rc = [%d]. "
"block_aligned_filename_size = [%zd]\n", __func__, rc,
s->block_aligned_filename_size);
- goto out_release_free_unlock;
+ goto out_free_unlock;
}
rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size,
s->dst_sg, 2);
@@ -807,7 +767,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
"convert encrypted filename memory to scatterlist; "
"rc = [%d]. block_aligned_filename_size = [%zd]\n",
__func__, rc, s->block_aligned_filename_size);
- goto out_release_free_unlock;
+ goto out_free_unlock;
}
/* The characters in the first block effectively do the job
* of the IV here, so we just use 0's for the IV. Note the
@@ -825,7 +785,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
rc,
s->auth_tok->token.password.session_key_encryption_key,
mount_crypt_stat->global_default_fn_cipher_key_bytes);
- goto out_release_free_unlock;
+ goto out_free_unlock;
}
skcipher_request_set_crypt(s->skcipher_req, s->src_sg, s->dst_sg,
s->block_aligned_filename_size, s->iv);
@@ -833,13 +793,11 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
if (rc) {
printk(KERN_ERR "%s: Error attempting to encrypt filename; "
"rc = [%d]\n", __func__, rc);
- goto out_release_free_unlock;
+ goto out_free_unlock;
}
s->i += s->block_aligned_filename_size;
(*packet_size) = s->i;
(*remaining_bytes) -= (*packet_size);
-out_release_free_unlock:
- crypto_free_shash(s->hash_tfm);
out_free_unlock:
kfree_sensitive(s->block_aligned_filename);
out_unlock:
@@ -850,7 +808,6 @@ out:
key_put(auth_tok_key);
}
skcipher_request_free(s->skcipher_req);
- kfree_sensitive(s->hash_desc);
kfree(s);
return rc;
}
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 8dd1d7189c3b..c12dc680f8fe 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -12,6 +12,7 @@
#include <linux/dcache.h>
#include <linux/file.h>
+#include <linux/fips.h>
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/skbuff.h>
@@ -20,6 +21,7 @@
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/fs_stack.h>
+#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/magic.h>
#include "ecryptfs_kernel.h"
@@ -105,15 +107,14 @@ static int ecryptfs_init_lower_file(struct dentry *dentry,
struct file **lower_file)
{
const struct cred *cred = current_cred();
- const struct path *path = ecryptfs_dentry_to_lower_path(dentry);
+ struct path path = ecryptfs_lower_path(dentry);
int rc;
- rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt,
- cred);
+ rc = ecryptfs_privileged_open(lower_file, path.dentry, path.mnt, cred);
if (rc) {
printk(KERN_ERR "Error opening lower file "
"for lower_dentry [0x%p] and lower_mnt [0x%p]; "
- "rc = [%d]\n", path->dentry, path->mnt, rc);
+ "rc = [%d]\n", path.dentry, path.mnt, rc);
(*lower_file) = NULL;
}
return rc;
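The hunk above switches from a pointer into per-dentry private data to a struct path returned by value. The new ecryptfs_lower_path() helper is not shown in this diff; given the per-superblock lower_mnt introduced further below, it plausibly assembles the path along these lines (a sketch under that assumption, not the actual helper):

static inline struct path ecryptfs_lower_path(struct dentry *dentry)
{
	return (struct path) {
		.mnt = ecryptfs_superblock_to_private(dentry->d_sb)->lower_mnt,
		.dentry = ecryptfs_dentry_to_lower(dentry),
	};
}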
@@ -436,7 +437,6 @@ static int ecryptfs_get_tree(struct fs_context *fc)
struct ecryptfs_fs_context *ctx = fc->fs_private;
struct ecryptfs_sb_info *sbi = fc->s_fs_info;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
- struct ecryptfs_dentry_info *root_info;
const char *err = "Getting sb failed";
struct inode *inode;
struct path path;
@@ -455,6 +455,12 @@ static int ecryptfs_get_tree(struct fs_context *fc)
goto out;
}
+ if (fips_enabled) {
+ rc = -EINVAL;
+ err = "eCryptfs support is disabled due to FIPS";
+ goto out;
+ }
+
s = sget_fc(fc, NULL, set_anon_super_fc);
if (IS_ERR(s)) {
rc = PTR_ERR(s);
@@ -471,7 +477,7 @@ static int ecryptfs_get_tree(struct fs_context *fc)
sbi = NULL;
s->s_op = &ecryptfs_sops;
s->s_xattr = ecryptfs_xattr_handlers;
- s->s_d_op = &ecryptfs_dops;
+ set_default_d_op(s, &ecryptfs_dops);
err = "Reading sb failed";
rc = kern_path(fc->source, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
@@ -542,14 +548,8 @@ static int ecryptfs_get_tree(struct fs_context *fc)
goto out_free;
}
- rc = -ENOMEM;
- root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
- if (!root_info)
- goto out_free;
-
- /* ->kill_sb() will take care of root_info */
- ecryptfs_set_dentry_private(s->s_root, root_info);
- root_info->lower_path = path;
+ ecryptfs_set_dentry_lower(s->s_root, path.dentry);
+ ecryptfs_superblock_to_private(s)->lower_mnt = path.mnt;
s->s_flags |= SB_ACTIVE;
fc->root = dget(s->s_root);
@@ -579,6 +579,7 @@ static void ecryptfs_kill_block_super(struct super_block *sb)
kill_anon_super(sb);
if (!sb_info)
return;
+ mntput(sb_info->lower_mnt);
ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
}
@@ -667,11 +668,6 @@ static struct ecryptfs_cache_info {
.size = sizeof(struct ecryptfs_file_info),
},
{
- .cache = &ecryptfs_dentry_info_cache,
- .name = "ecryptfs_dentry_info_cache",
- .size = sizeof(struct ecryptfs_dentry_info),
- },
- {
.cache = &ecryptfs_inode_info_cache,
.name = "ecryptfs_inode_cache",
.size = sizeof(struct ecryptfs_inode_info),
@@ -764,7 +760,7 @@ static struct kobject *ecryptfs_kobj;
static ssize_t version_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buff)
{
- return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK);
+ return sysfs_emit(buff, "%d\n", ECRYPTFS_VERSIONING_MASK);
}
static struct kobj_attribute version_attr = __ATTR_RO(version);
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 60f0ac8744b5..2c2b12fedeae 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -228,7 +228,7 @@ out:
/**
* ecryptfs_write_begin
- * @file: The eCryptfs file
+ * @iocb: I/O control block for the eCryptfs file
* @mapping: The eCryptfs object
* @pos: The file offset at which to start writing
* @len: Length of the write
@@ -239,7 +239,7 @@ out:
*
* Returns zero on success; non-zero otherwise
*/
-static int ecryptfs_write_begin(struct file *file,
+static int ecryptfs_write_begin(const struct kiocb *iocb,
struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
@@ -322,7 +322,7 @@ static int ecryptfs_write_begin(struct file *file,
* Note, this will increase i_size. */
if (index != 0) {
if (prev_page_end_size > i_size_read(mapping->host)) {
- rc = ecryptfs_truncate(file->f_path.dentry,
+ rc = ecryptfs_truncate(iocb->ki_filp->f_path.dentry,
prev_page_end_size);
if (rc) {
printk(KERN_ERR "%s: Error on attempt to "
@@ -429,7 +429,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
/**
* ecryptfs_write_end
- * @file: The eCryptfs file object
+ * @iocb: I/O control block for the eCryptfs file
* @mapping: The eCryptfs object
* @pos: The file position
* @len: The length of the data (unused)
@@ -437,7 +437,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
* @folio: The eCryptfs folio
* @fsdata: The fsdata (unused)
*/
-static int ecryptfs_write_end(struct file *file,
+static int ecryptfs_write_end(const struct kiocb *iocb,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index e7b7f426fecf..3bc21d677564 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -41,10 +41,7 @@ static struct inode *ecryptfs_alloc_inode(struct super_block *sb)
inode_info = alloc_inode_sb(sb, ecryptfs_inode_info_cache, GFP_KERNEL);
if (unlikely(!inode_info))
goto out;
- if (ecryptfs_init_crypt_stat(&inode_info->crypt_stat)) {
- kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
- goto out;
- }
+ ecryptfs_init_crypt_stat(&inode_info->crypt_stat);
mutex_init(&inode_info->lower_file_mutex);
atomic_set(&inode_info->lower_file_count, 0);
inode_info->lower_file = NULL;
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index c294a8fc566d..cb1b6d0c3454 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -57,11 +57,10 @@ static ssize_t efivarfs_file_write(struct file *file,
if (bytes == -ENOENT) {
/*
- * FIXME: temporary workaround for fwupdate, signal
- * failed write with a 1 to keep created but not
- * written files
+	 * a zero size signals to ->release() that the write
+	 * deleted the variable
*/
- i_size_write(inode, 1);
+ i_size_write(inode, 0);
} else {
i_size_write(inode, datasize + sizeof(attributes));
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
@@ -125,8 +124,7 @@ static int efivarfs_file_release(struct inode *inode, struct file *file)
struct efivar_entry *var = inode->i_private;
inode_lock(inode);
- /* FIXME: temporary work around for fwupdate */
- var->removed = (--var->open_count == 0 && i_size_read(inode) == 1);
+ var->removed = (--var->open_count == 0 && i_size_read(inode) == 0);
inode_unlock(inode);
if (var->removed)
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 98a7299a9ee9..95dcad83da11 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -113,8 +113,7 @@ static int efivarfs_create(struct mnt_idmap *idmap, struct inode *dir,
inode->i_private = var;
- d_instantiate(dentry, inode);
- dget(dentry);
+ d_make_persistent(dentry, inode);
return 0;
}
@@ -126,9 +125,7 @@ static int efivarfs_unlink(struct inode *dir, struct dentry *dentry)
if (efivar_entry_delete(var))
return -EINVAL;
- drop_nlink(d_inode(dentry));
- dput(dentry);
- return 0;
+ return simple_unlink(dir, dentry);
};
const struct inode_operations efivarfs_dir_inode_operations = {
@@ -138,7 +135,7 @@ const struct inode_operations efivarfs_dir_inode_operations = {
};
static int
-efivarfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+efivarfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
unsigned int i_flags;
unsigned int flags = 0;
@@ -154,7 +151,7 @@ efivarfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
static int
efivarfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
unsigned int i_flags = 0;
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
index ac6a1dd0a6a5..f913b6824289 100644
--- a/fs/efivarfs/internal.h
+++ b/fs/efivarfs/internal.h
@@ -17,7 +17,6 @@ struct efivarfs_fs_info {
struct efivarfs_mount_opts mount_opts;
struct super_block *sb;
struct notifier_block nb;
- struct notifier_block pm_nb;
};
struct efi_variable {
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 0486e9b68bc6..9da992925920 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -18,8 +18,10 @@
#include <linux/statfs.h>
#include <linux/notifier.h>
#include <linux/printk.h>
+#include <linux/namei.h>
#include "internal.h"
+#include "../internal.h"
static int efivarfs_ops_notifier(struct notifier_block *nb, unsigned long event,
void *data)
@@ -119,12 +121,18 @@ static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
+
+static int efivarfs_freeze_fs(struct super_block *sb);
+static int efivarfs_unfreeze_fs(struct super_block *sb);
+
static const struct super_operations efivarfs_ops = {
.statfs = efivarfs_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.alloc_inode = efivarfs_alloc_inode,
.free_inode = efivarfs_free_inode,
.show_options = efivarfs_show_options,
+ .freeze_fs = efivarfs_freeze_fs,
+ .unfreeze_fs = efivarfs_unfreeze_fs,
};
/*
@@ -144,6 +152,10 @@ static int efivarfs_d_compare(const struct dentry *dentry,
{
int guid = len - EFI_VARIABLE_GUID_LEN;
+	/* Parallel lookups may produce a temporarily invalid filename */
+ if (guid <= 0)
+ return 1;
+
if (name->len != len)
return 1;
@@ -175,7 +187,6 @@ static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr)
static const struct dentry_operations efivarfs_d_ops = {
.d_compare = efivarfs_d_compare,
.d_hash = efivarfs_d_hash,
- .d_delete = always_delete_dentry,
};
static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name)
@@ -204,7 +215,6 @@ bool efivarfs_variable_is_present(efi_char16_t *variable_name,
char *name = efivar_get_utf8name(variable_name, vendor);
struct super_block *sb = data;
struct dentry *dentry;
- struct qstr qstr;
if (!name)
/*
@@ -217,9 +227,7 @@ bool efivarfs_variable_is_present(efi_char16_t *variable_name,
*/
return true;
- qstr.name = name;
- qstr.len = strlen(name);
- dentry = d_hash_and_lookup(sb->s_root, &qstr);
+ dentry = try_lookup_noperm(&QSTR(name), sb->s_root);
kfree(name);
if (!IS_ERR_OR_NULL(dentry))
dput(dentry);
@@ -270,7 +278,8 @@ static int efivarfs_create_dentry(struct super_block *sb, efi_char16_t *name16,
inode->i_private = entry;
i_size_write(inode, size + sizeof(__u32)); /* attributes + data */
inode_unlock(inode);
- d_add(dentry, inode);
+ d_make_persistent(dentry, inode);
+ dput(dentry);
return 0;
@@ -345,7 +354,8 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = EFIVARFS_MAGIC;
sb->s_op = &efivarfs_ops;
- sb->s_d_op = &efivarfs_d_ops;
+ set_default_d_op(sb, &efivarfs_d_ops);
+ sb->s_d_flags |= DCACHE_DONTCACHE;
sb->s_time_gran = 1;
if (!efivar_supports_writes())
@@ -367,8 +377,6 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
if (err)
return err;
- register_pm_notifier(&sfi->pm_nb);
-
return efivar_init(efivarfs_callback, sb, true);
}
@@ -387,61 +395,24 @@ static int efivarfs_reconfigure(struct fs_context *fc)
return 0;
}
+static void efivarfs_free(struct fs_context *fc)
+{
+ kfree(fc->s_fs_info);
+}
+
static const struct fs_context_operations efivarfs_context_ops = {
.get_tree = efivarfs_get_tree,
.parse_param = efivarfs_parse_param,
.reconfigure = efivarfs_reconfigure,
+ .free = efivarfs_free,
};
-struct efivarfs_ctx {
- struct dir_context ctx;
- struct super_block *sb;
- struct dentry *dentry;
-};
-
-static bool efivarfs_actor(struct dir_context *ctx, const char *name, int len,
- loff_t offset, u64 ino, unsigned mode)
-{
- unsigned long size;
- struct efivarfs_ctx *ectx = container_of(ctx, struct efivarfs_ctx, ctx);
- struct qstr qstr = { .name = name, .len = len };
- struct dentry *dentry = d_hash_and_lookup(ectx->sb->s_root, &qstr);
- struct inode *inode;
- struct efivar_entry *entry;
- int err;
-
- if (IS_ERR_OR_NULL(dentry))
- return true;
-
- inode = d_inode(dentry);
- entry = efivar_entry(inode);
-
- err = efivar_entry_size(entry, &size);
- size += sizeof(__u32); /* attributes */
- if (err)
- size = 0;
-
- inode_lock_nested(inode, I_MUTEX_CHILD);
- i_size_write(inode, size);
- inode_unlock(inode);
-
- if (!size) {
- ectx->dentry = dentry;
- return false;
- }
-
- dput(dentry);
-
- return true;
-}
-
static int efivarfs_check_missing(efi_char16_t *name16, efi_guid_t vendor,
unsigned long name_size, void *data)
{
char *name;
struct super_block *sb = data;
struct dentry *dentry;
- struct qstr qstr;
int err;
if (guid_equal(&vendor, &LINUX_EFI_RANDOM_SEED_TABLE_GUID))
@@ -451,9 +422,7 @@ static int efivarfs_check_missing(efi_char16_t *name16, efi_guid_t vendor,
if (!name)
return -ENOMEM;
- qstr.name = name;
- qstr.len = strlen(name);
- dentry = d_hash_and_lookup(sb->s_root, &qstr);
+ dentry = try_lookup_noperm(&QSTR(name), sb->s_root);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
goto out;
@@ -474,111 +443,59 @@ static int efivarfs_check_missing(efi_char16_t *name16, efi_guid_t vendor,
return err;
}
-static void efivarfs_deactivate_super_work(struct work_struct *work)
-{
- struct super_block *s = container_of(work, struct super_block,
- destroy_work);
- /*
- * note: here s->destroy_work is free for reuse (which
- * will happen in deactivate_super)
- */
- deactivate_super(s);
-}
-
static struct file_system_type efivarfs_type;
-static int efivarfs_pm_notify(struct notifier_block *nb, unsigned long action,
- void *ptr)
+static int efivarfs_freeze_fs(struct super_block *sb)
{
- struct efivarfs_fs_info *sfi = container_of(nb, struct efivarfs_fs_info,
- pm_nb);
- struct path path;
- struct efivarfs_ctx ectx = {
- .ctx = {
- .actor = efivarfs_actor,
- },
- .sb = sfi->sb,
- };
- struct file *file;
- struct super_block *s = sfi->sb;
- static bool rescan_done = true;
-
- if (action == PM_HIBERNATION_PREPARE) {
- rescan_done = false;
- return NOTIFY_OK;
- } else if (action != PM_POST_HIBERNATION) {
- return NOTIFY_DONE;
- }
-
- if (rescan_done)
- return NOTIFY_DONE;
-
- /* ensure single superblock is alive and pin it */
- if (!atomic_inc_not_zero(&s->s_active))
- return NOTIFY_DONE;
-
- pr_info("efivarfs: resyncing variable state\n");
-
- path.dentry = sfi->sb->s_root;
-
- /*
- * do not add SB_KERNMOUNT which a single superblock could
- * expose to userspace and which also causes MNT_INTERNAL, see
- * below
- */
- path.mnt = vfs_kern_mount(&efivarfs_type, 0,
- efivarfs_type.name, NULL);
- if (IS_ERR(path.mnt)) {
- pr_err("efivarfs: internal mount failed\n");
- /*
- * We may be the last pinner of the superblock but
- * calling efivarfs_kill_sb from within the notifier
- * here would deadlock trying to unregister it
- */
- INIT_WORK(&s->destroy_work, efivarfs_deactivate_super_work);
- schedule_work(&s->destroy_work);
- return PTR_ERR(path.mnt);
- }
-
- /* path.mnt now has pin on superblock, so this must be above one */
- atomic_dec(&s->s_active);
-
- file = kernel_file_open(&path, O_RDONLY | O_DIRECTORY | O_NOATIME,
- current_cred());
- /*
- * safe even if last put because no MNT_INTERNAL means this
- * will do delayed deactivate_super and not deadlock
- */
- mntput(path.mnt);
- if (IS_ERR(file))
- return NOTIFY_DONE;
+ /* Nothing for us to do. */
+ return 0;
+}
- rescan_done = true;
+static int efivarfs_unfreeze_fs(struct super_block *sb)
+{
+ struct dentry *child = NULL;
/*
- * First loop over the directory and verify each entry exists,
- * removing it if it doesn't
+ * Unconditionally resync the variable state on a thaw request.
+	 * efivarfs is small enough that simply iterating through all
+	 * of the entries and resyncing them is fine: freeze/thaw
+	 * requests are rare enough for that to not matter, and the
+	 * number of entries is low.
*/
- file->f_pos = 2; /* skip . and .. */
- do {
- ectx.dentry = NULL;
- iterate_dir(file, &ectx.ctx);
- if (ectx.dentry) {
- pr_info("efivarfs: removing variable %pd\n",
- ectx.dentry);
- simple_recursive_removal(ectx.dentry, NULL);
- dput(ectx.dentry);
+ pr_info("efivarfs: resyncing variable state\n");
+ for (;;) {
+ int err;
+ unsigned long size = 0;
+ struct inode *inode;
+ struct efivar_entry *entry;
+
+ child = find_next_child(sb->s_root, child);
+ if (!child)
+ break;
+
+ inode = d_inode(child);
+ entry = efivar_entry(inode);
+
+ err = efivar_entry_size(entry, &size);
+ if (err)
+ size = 0;
+ else
+ size += sizeof(__u32);
+
+ inode_lock(inode);
+ i_size_write(inode, size);
+ inode_unlock(inode);
+
+ /* The variable doesn't exist anymore, delete it. */
+ if (!size) {
+ pr_info("efivarfs: removing variable %pd\n", child);
+ simple_recursive_removal(child, NULL);
}
- } while (ectx.dentry);
- fput(file);
-
- /*
- * then loop over variables, creating them if there's no matching
- * dentry
- */
- efivar_init(efivarfs_check_missing, sfi->sb, false);
+ }
- return NOTIFY_OK;
+ efivar_init(efivarfs_check_missing, sb, false);
+ pr_info("efivarfs: finished resyncing variable state\n");
+ return 0;
}
static int efivarfs_init_fs_context(struct fs_context *fc)
@@ -598,9 +515,6 @@ static int efivarfs_init_fs_context(struct fs_context *fc)
fc->s_fs_info = sfi;
fc->ops = &efivarfs_context_ops;
- sfi->pm_nb.notifier_call = efivarfs_pm_notify;
- sfi->pm_nb.priority = 0;
-
return 0;
}
@@ -609,8 +523,7 @@ static void efivarfs_kill_sb(struct super_block *sb)
struct efivarfs_fs_info *sfi = sb->s_fs_info;
blocking_notifier_chain_unregister(&efivar_ops_nh, &sfi->nb);
- kill_litter_super(sb);
- unregister_pm_notifier(&sfi->pm_nb);
+ kill_anon_super(sb);
kfree(sfi);
}
@@ -621,6 +534,7 @@ static struct file_system_type efivarfs_type = {
.init_fs_context = efivarfs_init_fs_context,
.kill_sb = efivarfs_kill_sb,
.parameters = efivarfs_parameters,
+ .fs_flags = FS_POWER_FREEZE,
};
static __init int efivarfs_init(void)
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 462619e59766..28407578f83a 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -62,7 +62,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
inode = iget_locked(super, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
in = INODE_INFO(inode);
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 331e49cd1b8d..d81f3318417d 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -3,8 +3,18 @@
config EROFS_FS
tristate "EROFS filesystem support"
depends on BLOCK
+ select CACHEFILES if EROFS_FS_ONDEMAND
+ select CRC32
+ select CRYPTO if EROFS_FS_ZIP_ACCEL
+ select CRYPTO_DEFLATE if EROFS_FS_ZIP_ACCEL
select FS_IOMAP
- select LIBCRC32C
+ select LZ4_DECOMPRESS if EROFS_FS_ZIP
+ select NETFS_SUPPORT if EROFS_FS_ONDEMAND
+ select XXHASH if EROFS_FS_XATTR
+ select XZ_DEC if EROFS_FS_ZIP_LZMA
+ select XZ_DEC_MICROLZMA if EROFS_FS_ZIP_LZMA
+ select ZLIB_INFLATE if EROFS_FS_ZIP_DEFLATE
+ select ZSTD_DECOMPRESS if EROFS_FS_ZIP_ZSTD
help
EROFS (Enhanced Read-Only File System) is a lightweight read-only
file system with modern designs (e.g. no buffer heads, inline
@@ -38,7 +48,6 @@ config EROFS_FS_DEBUG
config EROFS_FS_XATTR
bool "EROFS extended attributes"
depends on EROFS_FS
- select XXHASH
default y
help
Extended attributes are name:value pairs associated with inodes by
@@ -94,7 +103,6 @@ config EROFS_FS_BACKED_BY_FILE
config EROFS_FS_ZIP
bool "EROFS Data Compression Support"
depends on EROFS_FS
- select LZ4_DECOMPRESS
default y
help
Enable transparent compression support for EROFS file systems.
@@ -104,8 +112,6 @@ config EROFS_FS_ZIP
config EROFS_FS_ZIP_LZMA
bool "EROFS LZMA compressed data support"
depends on EROFS_FS_ZIP
- select XZ_DEC
- select XZ_DEC_MICROLZMA
help
Saying Y here includes support for reading EROFS file systems
containing LZMA compressed data, specifically called microLZMA. It
@@ -117,7 +123,6 @@ config EROFS_FS_ZIP_LZMA
config EROFS_FS_ZIP_DEFLATE
bool "EROFS DEFLATE compressed data support"
depends on EROFS_FS_ZIP
- select ZLIB_INFLATE
help
Saying Y here includes support for reading EROFS file systems
containing DEFLATE compressed data. It gives better compression
@@ -132,7 +137,6 @@ config EROFS_FS_ZIP_DEFLATE
config EROFS_FS_ZIP_ZSTD
bool "EROFS Zstandard compressed data support"
depends on EROFS_FS_ZIP
- select ZSTD_DECOMPRESS
help
Saying Y here includes support for reading EROFS file systems
containing Zstandard compressed data. It gives better compression
@@ -144,12 +148,24 @@ config EROFS_FS_ZIP_ZSTD
If unsure, say N.
+config EROFS_FS_ZIP_ACCEL
+ bool "EROFS hardware decompression support"
+ depends on EROFS_FS_ZIP
+ help
+	  Saying Y here includes hardware accelerator support for reading
+	  EROFS file systems containing compressed data. It gives better
+	  decompression speed than software decompression and incurs lower
+	  CPU overhead.
+
+ Hardware accelerator support is an experimental feature for now and
+ file systems are still readable without selecting this option.
+
+ If unsure, say N.
+
config EROFS_FS_ONDEMAND
bool "EROFS fscache-based on-demand read support (deprecated)"
depends on EROFS_FS
- select NETFS_SUPPORT
select FSCACHE
- select CACHEFILES
select CACHEFILES_ONDEMAND
help
This permits EROFS to use fscache-backed data blobs with on-demand
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 4331d53c7109..549abc424763 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -7,5 +7,6 @@ erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o zutil.o
erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o
+erofs-$(CONFIG_EROFS_FS_ZIP_ACCEL) += decompressor_crypto.o
erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o
erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 2704d7a592a5..84c8e52581f4 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -23,8 +23,8 @@ struct z_erofs_decompress_req {
struct z_erofs_decompressor {
int (*config)(struct super_block *sb, struct erofs_super_block *dsb,
void *data, int size);
- int (*decompress)(struct z_erofs_decompress_req *rq,
- struct page **pagepool);
+ const char *(*decompress)(struct z_erofs_decompress_req *rq,
+ struct page **pagepool);
int (*init)(void);
void (*exit)(void);
char *name;
@@ -70,10 +70,20 @@ struct z_erofs_stream_dctx {
bool bounced; /* is the bounce buffer used now? */
};
-int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst,
- void **src, struct page **pgpl);
-int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
- unsigned int padbufsize);
+const char *z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx,
+ void **dst, void **src, struct page **pgpl);
+const char *z_erofs_fixup_insize(struct z_erofs_decompress_req *rq,
+ const char *padbuf, unsigned int padbufsize);
int __init z_erofs_init_decompressor(void);
void z_erofs_exit_decompressor(void);
+int z_erofs_crypto_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl);
+int z_erofs_crypto_enable_engine(const char *name, int len);
+#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
+void z_erofs_crypto_disable_all_engines(void);
+int z_erofs_crypto_show_engines(char *buf, int size, char sep);
+#else
+static inline void z_erofs_crypto_disable_all_engines(void) {}
+static inline int z_erofs_crypto_show_engines(char *buf, int size, char sep) { return 0; }
+#endif
#endif
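The signature changes above encode the decompressor result as a const char *: NULL on success, an ERR_PTR()-style encoded errno for resource failures, or a literal string naming the corruption (as the z_erofs_lz4_decompress_mem() caller below demonstrates). A userspace sketch of how a caller can fold such a value back into an errno plus log message, with stand-ins for the kernel's IS_ERR()/PTR_ERR() and with EUCLEAN playing the role of EFSCORRUPTED:

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO	4095UL

static inline const char *err_ptr(long err)
{
	return (const char *)err;	/* ERR_PTR()-style encoding */
}

static inline int is_err(const char *r)
{
	return (unsigned long)r >= (unsigned long)-MAX_ERRNO;
}

static int handle_decompress_result(const char *reason)
{
	if (!reason)
		return 0;				/* success */
	if (is_err(reason))
		return (int)(long)reason;		/* encoded errno */
	fprintf(stderr, "erofs: %s\n", reason);		/* corruption string */
	return -EUCLEAN;	/* EFSCORRUPTED in the kernel */
}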
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 2409d2ab0c28..bb13c4cb8455 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -27,7 +27,7 @@ void erofs_put_metabuf(struct erofs_buf *buf)
void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap)
{
- pgoff_t index = offset >> PAGE_SHIFT;
+ pgoff_t index = (buf->off + offset) >> PAGE_SHIFT;
struct folio *folio = NULL;
if (buf->page) {
@@ -49,11 +49,19 @@ void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap)
return buf->base + (offset & ~PAGE_MASK);
}
-void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
+int erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb,
+ bool in_metabox)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
buf->file = NULL;
+ if (in_metabox) {
+ if (unlikely(!sbi->metabox_inode))
+ return -EFSCORRUPTED;
+ buf->mapping = sbi->metabox_inode->i_mapping;
+ return 0;
+ }
+ buf->off = sbi->dif0.fsoff;
if (erofs_is_fileio_mode(sbi)) {
buf->file = sbi->dif0.file; /* some fs like FUSE needs it */
buf->mapping = buf->file->f_mapping;
@@ -61,13 +69,18 @@ void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
buf->mapping = sbi->dif0.fscache->inode->i_mapping;
else
buf->mapping = sb->s_bdev->bd_mapping;
+ return 0;
}
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
- erofs_off_t offset, bool need_kmap)
+ erofs_off_t offset, bool in_metabox)
{
- erofs_init_metabuf(buf, sb);
- return erofs_bread(buf, offset, need_kmap);
+ int err;
+
+ err = erofs_init_metabuf(buf, sb, in_metabox);
+ if (err)
+ return ERR_PTR(err);
+ return erofs_bread(buf, offset, true);
}
int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
@@ -117,7 +130,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
pos = ALIGN(erofs_iloc(inode) + vi->inode_isize +
vi->xattr_isize, unit) + unit * chunknr;
- idx = erofs_read_metabuf(&buf, sb, pos, true);
+ idx = erofs_read_metabuf(&buf, sb, pos, erofs_inode_in_metabox(inode));
if (IS_ERR(idx)) {
err = PTR_ERR(idx);
goto out;
@@ -213,9 +226,11 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
/*
* bit 30: I/O error occurred on this folio
+ * bit 29: CPU has dirty data in D-cache (needs aliasing handling);
* bit 0 - 29: remaining parts to complete this folio
*/
-#define EROFS_ONLINEFOLIO_EIO (1 << 30)
+#define EROFS_ONLINEFOLIO_EIO 30
+#define EROFS_ONLINEFOLIO_DIRTY 29
void erofs_onlinefolio_init(struct folio *folio)
{
@@ -232,19 +247,23 @@ void erofs_onlinefolio_split(struct folio *folio)
atomic_inc((atomic_t *)&folio->private);
}
-void erofs_onlinefolio_end(struct folio *folio, int err)
+void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty)
{
int orig, v;
do {
orig = atomic_read((atomic_t *)&folio->private);
- v = (orig - 1) | (err ? EROFS_ONLINEFOLIO_EIO : 0);
+ DBG_BUGON(orig <= 0);
+ v = dirty << EROFS_ONLINEFOLIO_DIRTY;
+ v |= (orig - 1) | (!!err << EROFS_ONLINEFOLIO_EIO);
} while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig);
- if (v & ~EROFS_ONLINEFOLIO_EIO)
+ if (v & (BIT(EROFS_ONLINEFOLIO_DIRTY) - 1))
return;
folio->private = 0;
- folio_end_read(folio, !(v & EROFS_ONLINEFOLIO_EIO));
+ if (v & BIT(EROFS_ONLINEFOLIO_DIRTY))
+ flush_dcache_folio(folio);
+ folio_end_read(folio, !(v & BIT(EROFS_ONLINEFOLIO_EIO)));
}
static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
@@ -257,51 +276,51 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
map.m_la = offset;
map.m_llen = length;
-
ret = erofs_map_blocks(inode, &map);
if (ret < 0)
return ret;
- mdev = (struct erofs_map_dev) {
- .m_deviceid = map.m_deviceid,
- .m_pa = map.m_pa,
- };
- ret = erofs_map_dev(sb, &mdev);
- if (ret)
- return ret;
-
iomap->offset = map.m_la;
- if (flags & IOMAP_DAX)
- iomap->dax_dev = mdev.m_dif->dax_dev;
- else
- iomap->bdev = mdev.m_bdev;
iomap->length = map.m_llen;
iomap->flags = 0;
iomap->private = NULL;
-
+ iomap->addr = IOMAP_NULL_ADDR;
if (!(map.m_flags & EROFS_MAP_MAPPED)) {
iomap->type = IOMAP_HOLE;
- iomap->addr = IOMAP_NULL_ADDR;
- if (!iomap->length)
- iomap->length = length;
return 0;
}
+ if (!(map.m_flags & EROFS_MAP_META) || !erofs_inode_in_metabox(inode)) {
+ mdev = (struct erofs_map_dev) {
+ .m_deviceid = map.m_deviceid,
+ .m_pa = map.m_pa,
+ };
+ ret = erofs_map_dev(sb, &mdev);
+ if (ret)
+ return ret;
+
+ if (flags & IOMAP_DAX)
+ iomap->dax_dev = mdev.m_dif->dax_dev;
+ else
+ iomap->bdev = mdev.m_bdev;
+ iomap->addr = mdev.m_dif->fsoff + mdev.m_pa;
+ if (flags & IOMAP_DAX)
+ iomap->addr += mdev.m_dif->dax_part_off;
+ }
+
if (map.m_flags & EROFS_MAP_META) {
void *ptr;
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
iomap->type = IOMAP_INLINE;
- ptr = erofs_read_metabuf(&buf, sb, mdev.m_pa, true);
+ ptr = erofs_read_metabuf(&buf, sb, map.m_pa,
+ erofs_inode_in_metabox(inode));
if (IS_ERR(ptr))
return PTR_ERR(ptr);
iomap->inline_data = ptr;
iomap->private = buf.base;
} else {
iomap->type = IOMAP_MAPPED;
- iomap->addr = mdev.m_pa;
- if (flags & IOMAP_DAX)
- iomap->addr += mdev.m_dif->dax_part_off;
}
return 0;
}
@@ -350,12 +369,18 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
*/
static int erofs_read_folio(struct file *file, struct folio *folio)
{
- return iomap_read_folio(folio, &erofs_iomap_ops);
+ trace_erofs_read_folio(folio, true);
+
+ iomap_bio_read_folio(folio, &erofs_iomap_ops);
+ return 0;
}
static void erofs_readahead(struct readahead_control *rac)
{
- return iomap_readahead(rac, &erofs_iomap_ops);
+ trace_erofs_readahead(rac->mapping->host, readahead_index(rac),
+ readahead_count(rac), true);
+
+ iomap_bio_readahead(rac, &erofs_iomap_ops);
}
static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
@@ -408,20 +433,20 @@ static const struct vm_operations_struct erofs_dax_vm_ops = {
.huge_fault = erofs_dax_huge_fault,
};
-static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int erofs_file_mmap_prepare(struct vm_area_desc *desc)
{
- if (!IS_DAX(file_inode(file)))
- return generic_file_readonly_mmap(file, vma);
+ if (!IS_DAX(file_inode(desc->file)))
+ return generic_file_readonly_mmap_prepare(desc);
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+ if ((desc->vm_flags & VM_SHARED) && (desc->vm_flags & VM_MAYWRITE))
return -EINVAL;
- vma->vm_ops = &erofs_dax_vm_ops;
- vm_flags_set(vma, VM_HUGEPAGE);
+ desc->vm_ops = &erofs_dax_vm_ops;
+ desc->vm_flags |= VM_HUGEPAGE;
return 0;
}
#else
-#define erofs_file_mmap generic_file_readonly_mmap
+#define erofs_file_mmap_prepare generic_file_readonly_mmap_prepare
#endif
static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence)
@@ -451,7 +476,11 @@ static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence)
const struct file_operations erofs_file_fops = {
.llseek = erofs_file_llseek,
.read_iter = erofs_file_read_iter,
- .mmap = erofs_file_mmap,
+ .unlocked_ioctl = erofs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = erofs_compat_ioctl,
+#endif
+ .mmap_prepare = erofs_file_mmap_prepare,
.get_unmapped_area = thp_get_unmapped_area,
.splice_read = filemap_splice_read,
};
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index bf62e2836b60..d5d090276391 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -105,44 +105,58 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq,
return kaddr ? 1 : 0;
}
-static void *z_erofs_lz4_handle_overlap(struct z_erofs_decompress_req *rq,
+static void *z_erofs_lz4_handle_overlap(const struct z_erofs_decompress_req *rq,
void *inpage, void *out, unsigned int *inputmargin,
int *maptype, bool may_inplace)
{
- unsigned int oend, omargin, total, i;
+ unsigned int oend, omargin, cnt, i;
struct page **in;
- void *src, *tmp;
-
- if (rq->inplace_io) {
- oend = rq->pageofs_out + rq->outputsize;
- omargin = PAGE_ALIGN(oend) - oend;
- if (rq->partial_decoding || !may_inplace ||
- omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize))
- goto docopy;
+ void *src;
+ /*
+	 * If in-place I/O isn't used (for example, when the bounce compressed
+	 * cache holds data for incomplete read requests), just map the
+	 * compressed buffer as well and decompress directly.
+ */
+ if (!rq->inplace_io) {
+ if (rq->inpages <= 1) {
+ *maptype = 0;
+ return inpage;
+ }
+ kunmap_local(inpage);
+ src = erofs_vm_map_ram(rq->in, rq->inpages);
+ if (!src)
+ return ERR_PTR(-ENOMEM);
+ *maptype = 1;
+ return src;
+ }
+ /*
+	 * Then, deal with in-place I/Os. In-place I/O is useful because:
+	 * (1) it minimizes the memory footprint during I/O submission, which
+	 * helps slow storage (including network devices and low-end
+	 * HDDs/eMMCs) with many in-flight I/Os; (2) if in-place decompression
+	 * can also be applied, it reuses the same buffer, so no extra CPU
+	 * D-cache is polluted with temporary compressed data, for extreme
+	 * performance.
+ */
+ oend = rq->pageofs_out + rq->outputsize;
+ omargin = PAGE_ALIGN(oend) - oend;
+ if (!rq->partial_decoding && may_inplace &&
+ omargin >= LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize)) {
for (i = 0; i < rq->inpages; ++i)
if (rq->out[rq->outpages - rq->inpages + i] !=
rq->in[i])
- goto docopy;
- kunmap_local(inpage);
- *maptype = 3;
- return out + ((rq->outpages - rq->inpages) << PAGE_SHIFT);
- }
-
- if (rq->inpages <= 1) {
- *maptype = 0;
- return inpage;
+ break;
+ if (i >= rq->inpages) {
+ kunmap_local(inpage);
+ *maptype = 3;
+ return out + ((rq->outpages - rq->inpages) << PAGE_SHIFT);
+ }
}
- kunmap_local(inpage);
- src = erofs_vm_map_ram(rq->in, rq->inpages);
- if (!src)
- return ERR_PTR(-ENOMEM);
- *maptype = 1;
- return src;
-
-docopy:
- /* Or copy compressed data which can be overlapped to per-CPU buffer */
- in = rq->in;
+ /*
+	 * If in-place decompression can't be applied, copy compressed data
+	 * that may overlap during decompression to a per-CPU buffer.
+ */
src = z_erofs_get_gbuf(rq->inpages);
if (!src) {
DBG_BUGON(1);
@@ -150,20 +164,13 @@ docopy:
return ERR_PTR(-EFAULT);
}
- tmp = src;
- total = rq->inputsize;
- while (total) {
- unsigned int page_copycnt =
- min_t(unsigned int, total, PAGE_SIZE - *inputmargin);
-
+ for (i = 0, in = rq->in; i < rq->inputsize; i += cnt, ++in) {
+ cnt = min_t(u32, rq->inputsize - i, PAGE_SIZE - *inputmargin);
if (!inpage)
inpage = kmap_local_page(*in);
- memcpy(tmp, inpage + *inputmargin, page_copycnt);
+ memcpy(src + i, inpage + *inputmargin, cnt);
kunmap_local(inpage);
inpage = NULL;
- tmp += page_copycnt;
- total -= page_copycnt;
- ++in;
*inputmargin = 0;
}
*maptype = 2;
@@ -171,21 +178,21 @@ docopy:
}
/*
- * Get the exact inputsize with zero_padding feature.
- * - For LZ4, it should work if zero_padding feature is on (5.3+);
- * - For MicroLZMA, it'd be enabled all the time.
+ * Get the exact on-disk size of the compressed data:
+ * - For LZ4, it should apply if the zero_padding feature is on (5.3+);
+ * - For others, zero_padding is enabled all the time.
*/
-int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
- unsigned int padbufsize)
+const char *z_erofs_fixup_insize(struct z_erofs_decompress_req *rq,
+ const char *padbuf, unsigned int padbufsize)
{
const char *padend;
padend = memchr_inv(padbuf, 0, padbufsize);
if (!padend)
- return -EFSCORRUPTED;
+ return "compressed data start not found";
rq->inputsize -= padend - padbuf;
rq->pageofs_in += padend - padbuf;
- return 0;
+ return NULL;
}
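Since zero_padding left-pads the compressed data with NUL bytes inside its block, scanning for the first non-zero byte recovers the exact on-disk size. A userspace sketch of the fixup above, with a trivial loop standing in for the kernel's memchr_inv():

#include <stddef.h>

static const unsigned char *first_nonzero(const unsigned char *p, size_t n)
{
	while (n--) {
		if (*p)
			return p;
		p++;
	}
	return NULL;	/* all zeroes: corrupted data */
}

static int fixup_insize(const unsigned char *padbuf, size_t padbufsize,
			size_t *inputsize, size_t *pageofs_in)
{
	const unsigned char *padend = first_nonzero(padbuf, padbufsize);

	if (!padend)
		return -1;	/* "compressed data start not found" */
	*inputsize -= padend - padbuf;
	*pageofs_in += padend - padbuf;
	return 0;
}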
static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, u8 *dst)
@@ -193,6 +200,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, u8 *dst
bool support_0padding = false, may_inplace = false;
unsigned int inputmargin;
u8 *out, *headpage, *src;
+ const char *reason;
int ret, maptype;
DBG_BUGON(*rq->in == NULL);
@@ -201,12 +209,12 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, u8 *dst
/* LZ4 decompression inplace is only safe if zero_padding is enabled */
if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) {
support_0padding = true;
- ret = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in,
+ reason = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in,
min_t(unsigned int, rq->inputsize,
rq->sb->s_blocksize - rq->pageofs_in));
- if (ret) {
+ if (reason) {
kunmap_local(headpage);
- return ret;
+ return IS_ERR(reason) ? PTR_ERR(reason) : -EFSCORRUPTED;
}
may_inplace = !((rq->pageofs_in + rq->inputsize) &
(rq->sb->s_blocksize - 1));
@@ -228,8 +236,6 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, u8 *dst
rq->inputsize, rq->outputsize);
if (ret != rq->outputsize) {
- erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
- ret, rq->inputsize, inputmargin, rq->outputsize);
if (ret >= 0)
memset(out + ret, 0, rq->outputsize - ret);
ret = -EFSCORRUPTED;
@@ -250,8 +256,8 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, u8 *dst
return ret;
}
-static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool)
+static const char *z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
{
unsigned int dst_maptype;
void *dst;
@@ -266,14 +272,14 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
/* general decoding path which can be used for all cases */
ret = z_erofs_lz4_prepare_dstpages(rq, pagepool);
if (ret < 0)
- return ret;
+ return ERR_PTR(ret);
if (ret > 0) {
dst = page_address(*rq->out);
dst_maptype = 1;
} else {
dst = erofs_vm_map_ram(rq->out, rq->outpages);
if (!dst)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
dst_maptype = 2;
}
}
@@ -282,11 +288,11 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
kunmap_local(dst);
else if (dst_maptype == 2)
vm_unmap_ram(dst, rq->outpages);
- return ret;
+ return ERR_PTR(ret);
}
-static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
- struct page **pagepool)
+static const char *z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
{
const unsigned int nrpages_in = rq->inpages, nrpages_out = rq->outpages;
const unsigned int bs = rq->sb->s_blocksize;
@@ -294,20 +300,18 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
u8 *kin;
if (rq->outputsize > rq->inputsize)
- return -EOPNOTSUPP;
+ return ERR_PTR(-EOPNOTSUPP);
if (rq->alg == Z_EROFS_COMPRESSION_INTERLACED) {
cur = bs - (rq->pageofs_out & (bs - 1));
pi = (rq->pageofs_in + rq->inputsize - cur) & ~PAGE_MASK;
cur = min(cur, rq->outputsize);
if (cur && rq->out[0]) {
kin = kmap_local_page(rq->in[nrpages_in - 1]);
- if (rq->out[0] == rq->in[nrpages_in - 1]) {
+ if (rq->out[0] == rq->in[nrpages_in - 1])
memmove(kin + rq->pageofs_out, kin + pi, cur);
- flush_dcache_page(rq->out[0]);
- } else {
+ else
memcpy_to_page(rq->out[0], rq->pageofs_out,
kin + pi, cur);
- }
kunmap_local(kin);
}
rq->outputsize -= cur;
@@ -325,35 +329,30 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
po = (rq->pageofs_out + cur + pi) & ~PAGE_MASK;
DBG_BUGON(no >= nrpages_out);
cnt = min(insz - pi, PAGE_SIZE - po);
- if (rq->out[no] == rq->in[ni]) {
+ if (rq->out[no] == rq->in[ni])
memmove(kin + po,
kin + rq->pageofs_in + pi, cnt);
- flush_dcache_page(rq->out[no]);
- } else if (rq->out[no]) {
+ else if (rq->out[no])
memcpy_to_page(rq->out[no], po,
kin + rq->pageofs_in + pi, cnt);
- }
pi += cnt;
} while (pi < insz);
kunmap_local(kin);
}
DBG_BUGON(ni > nrpages_in);
- return 0;
+ return NULL;
}
-int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst,
- void **src, struct page **pgpl)
+const char *z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx,
+ void **dst, void **src, struct page **pgpl)
{
struct z_erofs_decompress_req *rq = dctx->rq;
- struct super_block *sb = rq->sb;
struct page **pgo, *tmppage;
unsigned int j;
if (!dctx->avail_out) {
- if (++dctx->no >= rq->outpages || !rq->outputsize) {
- erofs_err(sb, "insufficient space for decompressed data");
- return -EFSCORRUPTED;
- }
+ if (++dctx->no >= rq->outpages || !rq->outputsize)
+ return "insufficient space for decompressed data";
if (dctx->kout)
kunmap_local(dctx->kout);
@@ -364,7 +363,7 @@ int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst,
*pgo = erofs_allocpage(pgpl, rq->gfp);
if (!*pgo) {
dctx->kout = NULL;
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
}
set_page_private(*pgo, Z_EROFS_SHORTLIVED_PAGE);
}
@@ -378,10 +377,8 @@ int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst,
}
if (dctx->inbuf_pos == dctx->inbuf_sz && rq->inputsize) {
- if (++dctx->ni >= rq->inpages) {
- erofs_err(sb, "invalid compressed data");
- return -EFSCORRUPTED;
- }
+ if (++dctx->ni >= rq->inpages)
+ return "invalid compressed data";
if (dctx->kout) /* unlike kmap(), take care of the orders */
kunmap_local(dctx->kout);
kunmap_local(dctx->kin);
@@ -416,12 +413,12 @@ int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst,
continue;
tmppage = erofs_allocpage(pgpl, rq->gfp);
if (!tmppage)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
copy_highpage(tmppage, rq->in[j]);
rq->in[j] = tmppage;
}
- return 0;
+ return NULL;
}
const struct z_erofs_decompressor *z_erofs_decomp[] = {
@@ -471,7 +468,7 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb)
return -EOPNOTSUPP;
}
- erofs_init_metabuf(&buf, sb);
+ (void)erofs_init_metabuf(&buf, sb, false);
offset = EROFS_SUPER_OFFSET + sbi->sb_size;
alg = 0;
for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) {
diff --git a/fs/erofs/decompressor_crypto.c b/fs/erofs/decompressor_crypto.c
new file mode 100644
index 000000000000..5ef6f71d3b7f
--- /dev/null
+++ b/fs/erofs/decompressor_crypto.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/scatterlist.h>
+#include <crypto/acompress.h>
+#include "compress.h"
+
+static int __z_erofs_crypto_decompress(struct z_erofs_decompress_req *rq,
+ struct crypto_acomp *tfm)
+{
+ struct sg_table st_src, st_dst;
+ struct acomp_req *req;
+ struct crypto_wait wait;
+ const char *reason;
+ u8 *headpage;
+ int ret;
+
+ headpage = kmap_local_page(*rq->in);
+ reason = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in,
+ min_t(unsigned int, rq->inputsize,
+ rq->sb->s_blocksize - rq->pageofs_in));
+ kunmap_local(headpage);
+ if (reason)
+ return IS_ERR(reason) ? PTR_ERR(reason) : -EFSCORRUPTED;
+
+ req = acomp_request_alloc(tfm);
+ if (!req)
+ return -ENOMEM;
+
+ ret = sg_alloc_table_from_pages_segment(&st_src, rq->in, rq->inpages,
+ rq->pageofs_in, rq->inputsize, UINT_MAX, GFP_KERNEL);
+ if (ret < 0)
+ goto failed_src_alloc;
+
+ ret = sg_alloc_table_from_pages_segment(&st_dst, rq->out, rq->outpages,
+ rq->pageofs_out, rq->outputsize, UINT_MAX, GFP_KERNEL);
+ if (ret < 0)
+ goto failed_dst_alloc;
+
+ acomp_request_set_params(req, st_src.sgl,
+ st_dst.sgl, rq->inputsize, rq->outputsize);
+
+ crypto_init_wait(&wait);
+ acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &wait);
+
+ ret = crypto_wait_req(crypto_acomp_decompress(req), &wait);
+ if (ret) {
+ erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
+ ret, rq->inputsize, rq->pageofs_in, rq->outputsize);
+ ret = -EIO;
+ }
+
+ sg_free_table(&st_dst);
+failed_dst_alloc:
+ sg_free_table(&st_src);
+failed_src_alloc:
+ acomp_request_free(req);
+ return ret;
+}
+
+struct z_erofs_crypto_engine {
+ char *crypto_name;
+ struct crypto_acomp *tfm;
+};
+
+struct z_erofs_crypto_engine *z_erofs_crypto[Z_EROFS_COMPRESSION_MAX] = {
+ [Z_EROFS_COMPRESSION_LZ4] = (struct z_erofs_crypto_engine[]) {
+ {},
+ },
+ [Z_EROFS_COMPRESSION_LZMA] = (struct z_erofs_crypto_engine[]) {
+ {},
+ },
+ [Z_EROFS_COMPRESSION_DEFLATE] = (struct z_erofs_crypto_engine[]) {
+ { .crypto_name = "qat_deflate", },
+ {},
+ },
+ [Z_EROFS_COMPRESSION_ZSTD] = (struct z_erofs_crypto_engine[]) {
+ {},
+ },
+};
+static DECLARE_RWSEM(z_erofs_crypto_rwsem);
+
+static struct crypto_acomp *z_erofs_crypto_get_engine(int alg)
+{
+ struct z_erofs_crypto_engine *e;
+
+ for (e = z_erofs_crypto[alg]; e->crypto_name; ++e)
+ if (e->tfm)
+ return e->tfm;
+ return NULL;
+}
+
+int z_erofs_crypto_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
+{
+ struct crypto_acomp *tfm;
+ int i, err;
+
+ down_read(&z_erofs_crypto_rwsem);
+ tfm = z_erofs_crypto_get_engine(rq->alg);
+ if (!tfm) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ for (i = 0; i < rq->outpages; i++) {
+ struct page *const page = rq->out[i];
+ struct page *victim;
+
+ if (!page) {
+ victim = __erofs_allocpage(pgpl, rq->gfp, true);
+ if (!victim) {
+ err = -ENOMEM;
+ goto out;
+ }
+ set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE);
+ rq->out[i] = victim;
+ }
+ }
+ err = __z_erofs_crypto_decompress(rq, tfm);
+out:
+ up_read(&z_erofs_crypto_rwsem);
+ return err;
+}
+
+int z_erofs_crypto_enable_engine(const char *name, int len)
+{
+ struct z_erofs_crypto_engine *e;
+ struct crypto_acomp *tfm;
+ int alg;
+
+ down_write(&z_erofs_crypto_rwsem);
+ for (alg = 0; alg < Z_EROFS_COMPRESSION_MAX; ++alg) {
+ for (e = z_erofs_crypto[alg]; e->crypto_name; ++e) {
+ if (!strncmp(name, e->crypto_name, len)) {
+ if (e->tfm)
+ break;
+ tfm = crypto_alloc_acomp(e->crypto_name, 0, 0);
+ if (IS_ERR(tfm)) {
+ up_write(&z_erofs_crypto_rwsem);
+ return -EOPNOTSUPP;
+ }
+ e->tfm = tfm;
+ break;
+ }
+ }
+ }
+ up_write(&z_erofs_crypto_rwsem);
+ return 0;
+}
+
+void z_erofs_crypto_disable_all_engines(void)
+{
+ struct z_erofs_crypto_engine *e;
+ int alg;
+
+ down_write(&z_erofs_crypto_rwsem);
+ for (alg = 0; alg < Z_EROFS_COMPRESSION_MAX; ++alg) {
+ for (e = z_erofs_crypto[alg]; e->crypto_name; ++e) {
+ if (!e->tfm)
+ continue;
+ crypto_free_acomp(e->tfm);
+ e->tfm = NULL;
+ }
+ }
+ up_write(&z_erofs_crypto_rwsem);
+}
+
+int z_erofs_crypto_show_engines(char *buf, int size, char sep)
+{
+ struct z_erofs_crypto_engine *e;
+ int alg, len = 0;
+
+ for (alg = 0; alg < Z_EROFS_COMPRESSION_MAX; ++alg) {
+ for (e = z_erofs_crypto[alg]; e->crypto_name; ++e) {
+ if (!e->tfm)
+ continue;
+ len += scnprintf(buf + len, size - len, "%s%c",
+ e->crypto_name, sep);
+ }
+ }
+ return len;
+}
diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c
index c6908a487054..3fb73000ed27 100644
--- a/fs/erofs/decompressor_deflate.c
+++ b/fs/erofs/decompressor_deflate.c
@@ -97,21 +97,22 @@ failed:
return -ENOMEM;
}
-static int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
- struct page **pgpl)
+static const char *__z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
{
struct super_block *sb = rq->sb;
struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 };
struct z_erofs_deflate *strm;
- int zerr, err;
+ const char *reason;
+ int zerr;
/* 1. get the exact DEFLATE compressed size */
dctx.kin = kmap_local_page(*rq->in);
- err = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in,
+ reason = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in,
min(rq->inputsize, sb->s_blocksize - rq->pageofs_in));
- if (err) {
+ if (reason) {
kunmap_local(dctx.kin);
- return err;
+ return reason;
}
/* 2. get an available DEFLATE context */
@@ -129,7 +130,7 @@ again:
/* 3. multi-call decompress */
zerr = zlib_inflateInit2(&strm->z, -MAX_WBITS);
if (zerr != Z_OK) {
- err = -EIO;
+ reason = ERR_PTR(-EINVAL);
goto failed_zinit;
}
@@ -143,10 +144,10 @@ again:
while (1) {
dctx.avail_out = strm->z.avail_out;
dctx.inbuf_sz = strm->z.avail_in;
- err = z_erofs_stream_switch_bufs(&dctx,
+ reason = z_erofs_stream_switch_bufs(&dctx,
(void **)&strm->z.next_out,
(void **)&strm->z.next_in, pgpl);
- if (err)
+ if (reason)
break;
strm->z.avail_out = dctx.avail_out;
strm->z.avail_in = dctx.inbuf_sz;
@@ -157,14 +158,14 @@ again:
break;
if (zerr == Z_STREAM_END && !rq->outputsize)
break;
- erofs_err(sb, "failed to decompress %d in[%u] out[%u]",
- zerr, rq->inputsize, rq->outputsize);
- err = -EFSCORRUPTED;
+ reason = (zerr == Z_DATA_ERROR ?
+ "corrupted compressed data" :
+ "unexpected end of stream");
break;
}
}
- if (zlib_inflateEnd(&strm->z) != Z_OK && !err)
- err = -EIO;
+ if (zlib_inflateEnd(&strm->z) != Z_OK && !reason)
+ reason = ERR_PTR(-EIO);
if (dctx.kout)
kunmap_local(dctx.kout);
failed_zinit:
@@ -175,7 +176,23 @@ failed_zinit:
z_erofs_deflate_head = strm;
spin_unlock(&z_erofs_deflate_lock);
wake_up(&z_erofs_deflate_wq);
- return err;
+ return reason;
+}
+
+static const char *z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
+{
+#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
+ int err;
+
+ if (!rq->partial_decoding) {
+ err = z_erofs_crypto_decompress(rq, pgpl);
+ if (err != -EOPNOTSUPP)
+ return ERR_PTR(err);
+
+ }
+#endif
+ return __z_erofs_deflate_decompress(rq, pgpl);
}
const struct z_erofs_decompressor z_erofs_deflate_decomp = {
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 832cffb83a66..b4ea6978faae 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -146,23 +146,23 @@ again:
return err;
}
-static int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
- struct page **pgpl)
+static const char *z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
{
struct super_block *sb = rq->sb;
struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 };
struct xz_buf buf = {};
struct z_erofs_lzma *strm;
enum xz_ret xz_err;
- int err;
+ const char *reason;
/* 1. get the exact LZMA compressed size */
dctx.kin = kmap_local_page(*rq->in);
- err = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in,
+ reason = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in,
min(rq->inputsize, sb->s_blocksize - rq->pageofs_in));
- if (err) {
+ if (reason) {
kunmap_local(dctx.kin);
- return err;
+ return reason;
}
/* 2. get an available lzma context */
@@ -188,9 +188,9 @@ again:
dctx.avail_out = buf.out_size - buf.out_pos;
dctx.inbuf_sz = buf.in_size;
dctx.inbuf_pos = buf.in_pos;
- err = z_erofs_stream_switch_bufs(&dctx, (void **)&buf.out,
- (void **)&buf.in, pgpl);
- if (err)
+ reason = z_erofs_stream_switch_bufs(&dctx, (void **)&buf.out,
+ (void **)&buf.in, pgpl);
+ if (reason)
break;
if (buf.out_size == buf.out_pos) {
@@ -207,9 +207,9 @@ again:
if (xz_err != XZ_OK) {
if (xz_err == XZ_STREAM_END && !rq->outputsize)
break;
- erofs_err(sb, "failed to decompress %d in[%u] out[%u]",
- xz_err, rq->inputsize, rq->outputsize);
- err = -EFSCORRUPTED;
+ reason = (xz_err == XZ_DATA_ERROR ?
+ "corrupted compressed data" :
+ "unexpected end of stream");
break;
}
} while (1);
@@ -223,7 +223,7 @@ again:
z_erofs_lzma_head = strm;
spin_unlock(&z_erofs_lzma_lock);
wake_up(&z_erofs_lzma_wq);
- return err;
+ return reason;
}
const struct z_erofs_decompressor z_erofs_lzma_decomp = {
diff --git a/fs/erofs/decompressor_zstd.c b/fs/erofs/decompressor_zstd.c
index b4bfe14229f9..beae49165c69 100644
--- a/fs/erofs/decompressor_zstd.c
+++ b/fs/erofs/decompressor_zstd.c
@@ -135,8 +135,8 @@ static int z_erofs_load_zstd_config(struct super_block *sb,
return strm ? -ENOMEM : 0;
}
-static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq,
- struct page **pgpl)
+static const char *z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
{
struct super_block *sb = rq->sb;
struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 };
@@ -144,15 +144,16 @@ static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq,
zstd_out_buffer out_buf = { NULL, 0, 0 };
struct z_erofs_zstd *strm;
zstd_dstream *stream;
- int zerr, err;
+ const char *reason;
+ int zerr;
/* 1. get the exact compressed size */
dctx.kin = kmap_local_page(*rq->in);
- err = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in,
+ reason = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in,
min(rq->inputsize, sb->s_blocksize - rq->pageofs_in));
- if (err) {
+ if (reason) {
kunmap_local(dctx.kin);
- return err;
+ return reason;
}
/* 2. get an available ZSTD context */
@@ -161,7 +162,7 @@ static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq,
/* 3. multi-call decompress */
stream = zstd_init_dstream(z_erofs_zstd_max_dictsize, strm->wksp, strm->wkspsz);
if (!stream) {
- err = -EIO;
+ reason = ERR_PTR(-ENOMEM);
goto failed_zinit;
}
@@ -172,12 +173,11 @@ static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq,
dctx.bounce = strm->bounce;
do {
- dctx.avail_out = out_buf.size - out_buf.pos;
dctx.inbuf_sz = in_buf.size;
dctx.inbuf_pos = in_buf.pos;
- err = z_erofs_stream_switch_bufs(&dctx, &out_buf.dst,
+ reason = z_erofs_stream_switch_bufs(&dctx, &out_buf.dst,
(void **)&in_buf.src, pgpl);
- if (err)
+ if (reason)
break;
if (out_buf.size == out_buf.pos) {
@@ -188,14 +188,15 @@ static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq,
in_buf.pos = dctx.inbuf_pos;
zerr = zstd_decompress_stream(stream, &out_buf, &in_buf);
- if (zstd_is_error(zerr) || (!zerr && rq->outputsize)) {
- erofs_err(sb, "failed to decompress in[%u] out[%u]: %s",
- rq->inputsize, rq->outputsize,
- zerr ? zstd_get_error_name(zerr) : "unexpected end of stream");
- err = -EFSCORRUPTED;
+ dctx.avail_out = out_buf.size - out_buf.pos;
+ if (zstd_is_error(zerr) ||
+ ((rq->outputsize + dctx.avail_out) && (!zerr || (zerr > 0 &&
+ !(rq->inputsize + in_buf.size - in_buf.pos))))) {
+ reason = zstd_is_error(zerr) ? zstd_get_error_name(zerr) :
+ "unexpected end of stream";
break;
}
- } while (rq->outputsize || out_buf.pos < out_buf.size);
+ } while (rq->outputsize + dctx.avail_out);
if (dctx.kout)
kunmap_local(dctx.kout);
@@ -207,7 +208,7 @@ failed_zinit:
z_erofs_zstd_head = strm;
spin_unlock(&z_erofs_zstd_lock);
wake_up(&z_erofs_zstd_wq);
- return err;
+ return reason;
}
const struct z_erofs_decompressor z_erofs_zstd_decomp = {
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index 2fae209d0274..32b4f5aa60c9 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -34,7 +34,8 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
}
if (!dir_emit(ctx, de_name, de_namelen,
- le64_to_cpu(de->nid), d_type))
+ erofs_nid_to_ino64(EROFS_SB(dir->i_sb),
+ le64_to_cpu(de->nid)), d_type))
return 1;
++de;
ctx->pos += sizeof(struct erofs_dirent);
@@ -47,8 +48,12 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
struct inode *dir = file_inode(f);
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct super_block *sb = dir->i_sb;
+ struct file_ra_state *ra = &f->f_ra;
unsigned long bsz = sb->s_blocksize;
unsigned int ofs = erofs_blkoff(sb, ctx->pos);
+ pgoff_t ra_pages = DIV_ROUND_UP_POW2(
+ EROFS_I_SB(dir)->dir_ra_bytes, PAGE_SIZE);
+ pgoff_t nr_pages = DIV_ROUND_UP_POW2(dir->i_size, PAGE_SIZE);
int err = 0;
bool initial = true;
@@ -58,6 +63,21 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
struct erofs_dirent *de;
unsigned int nameoff, maxsize;
+ if (fatal_signal_pending(current)) {
+ err = -ERESTARTSYS;
+ break;
+ }
+
+ /* readahead blocks to enhance performance for large directories */
+ if (ra_pages) {
+ pgoff_t idx = DIV_ROUND_UP_POW2(ctx->pos, PAGE_SIZE);
+ pgoff_t pages = min(nr_pages - idx, ra_pages);
+
+ if (pages > 1 && !ra_has_index(ra, idx))
+ page_cache_sync_readahead(dir->i_mapping, ra,
+ f, idx, pages);
+ }
+
de = erofs_bread(&buf, dbstart, true);
if (IS_ERR(de)) {
erofs_err(sb, "failed to readdir of logical block %llu of nid %llu",
@@ -88,6 +108,7 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
break;
ctx->pos = dbstart + maxsize;
ofs = 0;
+ cond_resched();
}
erofs_put_metabuf(&buf);
if (EROFS_I(dir)->dot_omitted && ctx->pos == dir->i_size) {
@@ -102,4 +123,8 @@ const struct file_operations erofs_dir_fops = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.iterate_shared = erofs_readdir,
+ .unlocked_ioctl = erofs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = erofs_compat_ioctl,
+#endif
};
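The readahead window in erofs_readdir() above is capped both by the mount-wide dir_ra_bytes budget and by the directory's remaining pages. A quick userspace check of that arithmetic, assuming 4 KiB pages and a hypothetical 16 KiB dir_ra_bytes (DIV_ROUND_UP_POW2 is re-defined here as plain round-up division, a stand-in for the kernel macro):

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define DIV_ROUND_UP_POW2(x, y)	(((x) + (y) - 1) / (y))

int main(void)
{
	unsigned long dir_ra_bytes = 16384;	/* assumption, not the default */
	unsigned long i_size = 1024 * 1024, pos = 0;
	unsigned long ra_pages = DIV_ROUND_UP_POW2(dir_ra_bytes, PAGE_SIZE);
	unsigned long nr_pages = DIV_ROUND_UP_POW2(i_size, PAGE_SIZE);
	unsigned long idx = DIV_ROUND_UP_POW2(pos, PAGE_SIZE);
	unsigned long pages = nr_pages - idx < ra_pages ?
			      nr_pages - idx : ra_pages;

	printf("readahead %lu page(s) at index %lu\n", pages, idx);
	return 0;
}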
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 9581e9bf8192..e24268acdd62 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0-only OR Apache-2.0 */
+/* SPDX-License-Identifier: MIT */
/*
* EROFS (Enhanced ROM File System) on-disk format definition
*
@@ -12,9 +12,12 @@
/* to allow for x86 boot sectors and other oddities. */
#define EROFS_SUPER_OFFSET 1024
-#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
-#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
-#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004
+#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
+#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
+#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004
+#define EROFS_FEATURE_COMPAT_SHARED_EA_IN_METABOX 0x00000008
+#define EROFS_FEATURE_COMPAT_PLAIN_XATTR_PFX 0x00000010
+
/*
* Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
@@ -31,8 +34,9 @@
#define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020
#define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES 0x00000040
#define EROFS_FEATURE_INCOMPAT_48BIT 0x00000080
+#define EROFS_FEATURE_INCOMPAT_METABOX 0x00000100
#define EROFS_ALL_FEATURE_INCOMPAT \
- ((EROFS_FEATURE_INCOMPAT_48BIT << 1) - 1)
+ ((EROFS_FEATURE_INCOMPAT_METABOX << 1) - 1)
#define EROFS_SB_EXTSLOT_SIZE 16
@@ -46,7 +50,7 @@ struct erofs_deviceslot {
};
#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot)
-/* erofs on-disk super block (currently 128 bytes) */
+/* erofs on-disk super block (currently 144 bytes at maximum) */
struct erofs_super_block {
__le32 magic; /* file system magic number */
__le32 checksum; /* crc32c to avoid unexpected on-disk overlap */
@@ -56,7 +60,7 @@ struct erofs_super_block {
union {
__le16 rootnid_2b; /* nid of root directory */
__le16 blocks_hi; /* (48BIT on) blocks count MSB */
- } rb;
+ } __packed rb;
__le64 inos; /* total valid ino # (== f_files - f_favail) */
__le64 epoch; /* base seconds used for compact inodes */
__le32 fixed_nsec; /* fixed nanoseconds for compact inodes */
@@ -82,7 +86,9 @@ struct erofs_super_block {
__u8 reserved[3];
__le32 build_time; /* seconds added to epoch for mkfs time */
__le64 rootnid_8b; /* (48BIT on) nid of root directory */
- __u8 reserved2[8];
+ __le64 reserved2;
+ __le64 metabox_nid; /* (METABOX on) nid of the metabox inode */
+ __le64 reserved3; /* [align to extslot 1] */
};
/*
@@ -148,7 +154,7 @@ union erofs_inode_i_nb {
__le16 nlink; /* if EROFS_I_NLINK_1_BIT is unset */
__le16 blocks_hi; /* total blocks count MSB */
__le16 startblk_hi; /* starting block number MSB */
-};
+} __packed;
/* 32-byte reduced form of an ondisk inode */
struct erofs_inode_compact {
@@ -267,6 +273,9 @@ struct erofs_inode_chunk_index {
__le32 startblk_lo; /* starting block number of this chunk */
};
+#define EROFS_DIRENT_NID_METABOX_BIT 63
+#define EROFS_DIRENT_NID_MASK (BIT_ULL(EROFS_DIRENT_NID_METABOX_BIT) - 1)
+
/* dirent sorts in alphabet order, thus we can do binary search */
struct erofs_dirent {
__le64 nid; /* node number */
@@ -369,9 +378,9 @@ struct z_erofs_map_header {
* bit 7 : pack the whole file into packed inode
*/
__u8 h_clusterbits;
- };
+ } __packed;
__le16 h_extents_hi; /* extent count MSB */
- };
+ } __packed;
};
enum {
@@ -434,7 +443,7 @@ static inline void erofs_check_ondisk_layout_definitions(void)
.h_clusterbits = 1 << Z_EROFS_FRAGMENT_INODE_BIT
};
- BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128);
+ BUILD_BUG_ON(sizeof(struct erofs_super_block) != 144);
BUILD_BUG_ON(sizeof(struct erofs_inode_compact) != 32);
BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64);
BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12);
diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c
index bec4b56b3826..932e8b353ba1 100644
--- a/fs/erofs/fileio.c
+++ b/fs/erofs/fileio.c
@@ -32,13 +32,15 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret)
ret = 0;
}
if (rq->bio.bi_end_io) {
- rq->bio.bi_end_io(&rq->bio);
+ if (ret < 0 && !rq->bio.bi_status)
+ rq->bio.bi_status = errno_to_blk_status(ret);
} else {
bio_for_each_folio_all(fi, &rq->bio) {
DBG_BUGON(folio_test_uptodate(fi.folio));
- erofs_onlinefolio_end(fi.folio, ret);
+ erofs_onlinefolio_end(fi.folio, ret, false);
}
}
+ bio_endio(&rq->bio);
bio_uninit(&rq->bio);
kfree(rq);
}
@@ -58,7 +60,8 @@ static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq)
rq->iocb.ki_flags = IOCB_DIRECT;
iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt,
rq->bio.bi_iter.bi_size);
- ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter);
+ scoped_with_creds(rq->iocb.ki_filp->f_cred)
+ ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter);
if (ret != -EIOCBQUEUED)
erofs_fileio_ki_complete(&rq->iocb, ret);
}
@@ -91,8 +94,6 @@ static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio)
struct erofs_map_blocks *map = &io->map;
unsigned int cur = 0, end = folio_size(folio), len, attached = 0;
loff_t pos = folio_pos(folio), ofs;
- struct iov_iter iter;
- struct bio_vec bv;
int err = 0;
erofs_onlinefolio_init(folio);
@@ -112,18 +113,12 @@ static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio)
void *src;
src = erofs_read_metabuf(&buf, inode->i_sb,
- map->m_pa + ofs, true);
+ map->m_pa + ofs, erofs_inode_in_metabox(inode));
if (IS_ERR(src)) {
err = PTR_ERR(src);
break;
}
- bvec_set_folio(&bv, folio, len, cur);
- iov_iter_bvec(&iter, ITER_DEST, &bv, 1, len);
- if (copy_to_iter(src, len, &iter) != len) {
- erofs_put_metabuf(&buf);
- err = -EIO;
- break;
- }
+ memcpy_to_folio(folio, cur, src, len);
erofs_put_metabuf(&buf);
} else if (!(map->m_flags & EROFS_MAP_MAPPED)) {
folio_zero_segment(folio, cur, cur + len);
@@ -145,18 +140,19 @@ io_retry:
if (err)
break;
io->rq = erofs_fileio_rq_alloc(&io->dev);
- io->rq->bio.bi_iter.bi_sector = io->dev.m_pa >> 9;
+ io->rq->bio.bi_iter.bi_sector =
+ (io->dev.m_dif->fsoff + io->dev.m_pa) >> 9;
attached = 0;
}
- if (!attached++)
- erofs_onlinefolio_split(folio);
if (!bio_add_folio(&io->rq->bio, folio, len, cur))
goto io_retry;
+ if (!attached++)
+ erofs_onlinefolio_split(folio);
io->dev.m_pa += len;
}
cur += len;
}
- erofs_onlinefolio_end(folio, err);
+ erofs_onlinefolio_end(folio, err, false);
return err;
}
@@ -178,7 +174,7 @@ static void erofs_fileio_readahead(struct readahead_control *rac)
struct folio *folio;
int err;
- trace_erofs_readpages(inode, readahead_index(rac),
+ trace_erofs_readahead(inode, readahead_index(rac),
readahead_count(rac), true);
while ((folio = readahead_folio(rac))) {
err = erofs_fileio_scan_folio(&io, folio);
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index 9c9129bca346..7a346e20f7b7 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -102,8 +102,7 @@ static void erofs_fscache_req_io_put(struct erofs_fscache_io *io)
erofs_fscache_req_put(req);
}
-static void erofs_fscache_req_end_io(void *priv,
- ssize_t transferred_or_error, bool was_async)
+static void erofs_fscache_req_end_io(void *priv, ssize_t transferred_or_error)
{
struct erofs_fscache_io *io = priv;
struct erofs_fscache_rq *req = io->private;
@@ -180,14 +179,13 @@ struct erofs_fscache_bio {
struct bio_vec bvecs[BIO_MAX_VECS];
};
-static void erofs_fscache_bio_endio(void *priv,
- ssize_t transferred_or_error, bool was_async)
+static void erofs_fscache_bio_endio(void *priv, ssize_t transferred_or_error)
{
struct erofs_fscache_bio *io = priv;
if (IS_ERR_VALUE(transferred_or_error))
io->bio.bi_status = errno_to_blk_status(transferred_or_error);
- io->bio.bi_end_io(&io->bio);
+ bio_endio(&io->bio);
BUILD_BUG_ON(offsetof(struct erofs_fscache_bio, io) != 0);
erofs_fscache_io_put(&io->io);
}
@@ -218,7 +216,7 @@ void erofs_fscache_submit_bio(struct bio *bio)
if (!ret)
return;
bio->bi_status = errno_to_blk_status(ret);
- bio->bi_end_io(bio);
+ bio_endio(bio);
}
static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
@@ -276,7 +274,8 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req)
size_t size = map.m_llen;
void *src;
- src = erofs_read_metabuf(&buf, sb, map.m_pa, true);
+ src = erofs_read_metabuf(&buf, sb, map.m_pa,
+ erofs_inode_in_metabox(inode));
if (IS_ERR(src))
return PTR_ERR(src);
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index a0ae0b4f7b01..bce98c845a18 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -5,6 +5,7 @@
* Copyright (C) 2021, Alibaba Cloud
*/
#include "xattr.h"
+#include <linux/compat.h>
#include <trace/events/erofs.h>
static int erofs_fill_symlink(struct inode *inode, void *kaddr,
@@ -29,6 +30,7 @@ static int erofs_read_inode(struct inode *inode)
struct super_block *sb = inode->i_sb;
erofs_blk_t blkaddr = erofs_blknr(sb, erofs_iloc(inode));
unsigned int ofs = erofs_blkoff(sb, erofs_iloc(inode));
+ bool in_mbox = erofs_inode_in_metabox(inode);
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_sb_info *sbi = EROFS_SB(sb);
erofs_blk_t addrmask = BIT_ULL(48) - 1;
@@ -39,10 +41,10 @@ static int erofs_read_inode(struct inode *inode)
void *ptr;
int err = 0;
- ptr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), true);
+ ptr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), in_mbox);
if (IS_ERR(ptr)) {
err = PTR_ERR(ptr);
- erofs_err(sb, "failed to get inode (nid: %llu) page, err %d",
+ erofs_err(sb, "failed to read inode meta block (nid: %llu): %d",
vi->nid, err);
goto err_out;
}
@@ -78,10 +80,10 @@ static int erofs_read_inode(struct inode *inode)
memcpy(&copied, dic, gotten);
ptr = erofs_read_metabuf(&buf, sb,
- erofs_pos(sb, blkaddr + 1), true);
+ erofs_pos(sb, blkaddr + 1), in_mbox);
if (IS_ERR(ptr)) {
err = PTR_ERR(ptr);
- erofs_err(sb, "failed to get inode payload block (nid: %llu), err %d",
+ erofs_err(sb, "failed to read inode payload block (nid: %llu): %d",
vi->nid, err);
goto err_out;
}
@@ -212,10 +214,7 @@ static int erofs_fill_inode(struct inode *inode)
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_op = &erofs_generic_iops;
- if (erofs_inode_is_data_compressed(vi->datalayout))
- inode->i_fop = &generic_ro_fops;
- else
- inode->i_fop = &erofs_file_fops;
+ inode->i_fop = &erofs_file_fops;
break;
case S_IFDIR:
inode->i_op = &erofs_dir_iops;
@@ -264,13 +263,13 @@ static int erofs_fill_inode(struct inode *inode)
* ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down
* so that it will fit.
*/
-static ino_t erofs_squash_ino(erofs_nid_t nid)
+static ino_t erofs_squash_ino(struct super_block *sb, erofs_nid_t nid)
{
- ino_t ino = (ino_t)nid;
+ u64 ino64 = erofs_nid_to_ino64(EROFS_SB(sb), nid);
if (sizeof(ino_t) < sizeof(erofs_nid_t))
- ino ^= nid >> (sizeof(erofs_nid_t) - sizeof(ino_t)) * 8;
- return ino;
+ ino64 ^= ino64 >> (sizeof(erofs_nid_t) - sizeof(ino_t)) * 8;
+ return (ino_t)ino64;
}
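/*
 * A minimal sketch of the squashing above: when ino_t is four bytes,
 * the 64-bit value is XOR-folded so both halves still contribute to
 * the icache key. Userspace stand-in with explicit widths.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t squash_ino(uint64_t ino64)
{
	ino64 ^= ino64 >> 32;		/* fold the high half down */
	return (uint32_t)ino64;
}

int main(void)
{
	uint64_t ino64 = 0x80001234deadbeefULL;	/* hypothetical ino64 */

	/* prints: ino32=0x5eadacdb (0x80001234 ^ 0xdeadbeef) */
	printf("ino32=%#" PRIx32 "\n", squash_ino(ino64));
	return 0;
}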
static int erofs_iget5_eq(struct inode *inode, void *opaque)
@@ -282,7 +281,7 @@ static int erofs_iget5_set(struct inode *inode, void *opaque)
{
const erofs_nid_t nid = *(erofs_nid_t *)opaque;
- inode->i_ino = erofs_squash_ino(nid);
+ inode->i_ino = erofs_squash_ino(inode->i_sb, nid);
EROFS_I(inode)->nid = nid;
return 0;
}
@@ -291,12 +290,12 @@ struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid)
{
struct inode *inode;
- inode = iget5_locked(sb, erofs_squash_ino(nid), erofs_iget5_eq,
+ inode = iget5_locked(sb, erofs_squash_ino(sb, nid), erofs_iget5_eq,
erofs_iget5_set, &nid);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
int err = erofs_fill_inode(inode);
if (err) {
@@ -340,6 +339,40 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
return 0;
}
+static int erofs_ioctl_get_volume_label(struct inode *inode, void __user *arg)
+{
+ struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+ int ret;
+
+ if (!sbi->volume_name)
+ ret = clear_user(arg, 1);
+ else
+ ret = copy_to_user(arg, sbi->volume_name,
+ strlen(sbi->volume_name));
+ return ret ? -EFAULT : 0;
+}
+
+long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ void __user *argp = (void __user *)arg;
+
+ switch (cmd) {
+ case FS_IOC_GETFSLABEL:
+ return erofs_ioctl_get_volume_label(inode, argp);
+ default:
+ return -ENOTTY;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+long erofs_compat_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ return erofs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
const struct inode_operations erofs_generic_iops = {
.getattr = erofs_getattr,
.listxattr = erofs_listxattr,
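/*
 * A minimal userspace sketch exercising the new ioctl above, assuming a
 * hypothetical mountpoint path; FS_IOC_GETFSLABEL and FSLABEL_MAX come
 * from <linux/fs.h>.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>

int main(void)
{
	char label[FSLABEL_MAX];
	int fd = open("/mnt/erofs", O_RDONLY);	/* hypothetical mountpoint */

	if (fd < 0)
		return 1;
	memset(label, 0, sizeof(label));	/* defensive NUL padding */
	if (ioctl(fd, FS_IOC_GETFSLABEL, label) == 0)
		printf("volume label: %s\n", label);
	close(fd);
	return 0;
}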
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 4ac188d5d894..f7f622836198 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -44,7 +44,7 @@ struct erofs_device_info {
struct erofs_fscache *fscache;
struct file *file;
struct dax_device *dax_dev;
- u64 dax_part_off;
+ u64 fsoff, dax_part_off;
erofs_blk_t blocks;
erofs_blk_t uniaddr;
@@ -125,6 +125,7 @@ struct erofs_sb_info {
struct erofs_sb_lz4_info lz4;
#endif /* CONFIG_EROFS_FS_ZIP */
struct inode *packed_inode;
+ struct inode *metabox_inode;
struct erofs_dev_context *devs;
u64 total_blocks;
@@ -148,15 +149,18 @@ struct erofs_sb_info {
/* what we really care is nid, rather than ino.. */
erofs_nid_t root_nid;
erofs_nid_t packed_nid;
+ erofs_nid_t metabox_nid;
/* used for statfs, f_files - f_favail */
u64 inos;
+ char *volume_name;
u32 feature_compat;
u32 feature_incompat;
/* sysfs support */
struct kobject s_kobj; /* /sys/fs/erofs/<devname> */
struct completion s_kobj_unregister;
+ erofs_off_t dir_ra_bytes;
/* fscache support */
struct fscache_volume *volume;
@@ -199,6 +203,7 @@ enum {
struct erofs_buf {
struct address_space *mapping;
struct file *file;
+ u64 off;
struct page *page;
void *base;
};
@@ -226,8 +231,28 @@ EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS)
EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE)
EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES)
EROFS_FEATURE_FUNCS(48bit, incompat, INCOMPAT_48BIT)
+EROFS_FEATURE_FUNCS(metabox, incompat, INCOMPAT_METABOX)
EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER)
+EROFS_FEATURE_FUNCS(shared_ea_in_metabox, compat, COMPAT_SHARED_EA_IN_METABOX)
+EROFS_FEATURE_FUNCS(plain_xattr_pfx, compat, COMPAT_PLAIN_XATTR_PFX)
+
+static inline u64 erofs_nid_to_ino64(struct erofs_sb_info *sbi, erofs_nid_t nid)
+{
+ if (!erofs_sb_has_metabox(sbi))
+ return nid;
+
+ /*
+ * When metadata compression is enabled, avoid generating excessively
+	 * large inode numbers for metadata-compressed inodes: shift NID
+	 * bits 31-62 left by one and move the metabox flag (bit 63) down
+	 * to bit 31.
+	 *
+	 * Note: on-disk NIDs remain unchanged; only the inode numbers
+	 * reported to userspace differ, which mainly matters for non-LFS
+	 * 32-bit applications.
+ */
+ return ((nid << 1) & GENMASK_ULL(63, 32)) | (nid & GENMASK(30, 0)) |
+ ((nid >> EROFS_DIRENT_NID_METABOX_BIT) << 31);
+}
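/*
 * A minimal sketch re-deriving the bit shuffle above outside the
 * kernel: the metabox flag moves from bit 63 down to bit 31, and NID
 * bits 31-62 shift up by one. MASK() is a local stand-in for
 * GENMASK_ULL(); this is illustrative, not kernel code.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define METABOX_BIT	63
#define MASK(h, l)	((~0ULL >> (63 - (h))) & ~((1ULL << (l)) - 1))

static uint64_t nid_to_ino64(uint64_t nid)
{
	return ((nid << 1) & MASK(63, 32)) | (nid & MASK(30, 0)) |
	       ((nid >> METABOX_BIT) << 31);
}

int main(void)
{
	/* a metabox NID: flag bit 63 set, low bits 0x1234 */
	uint64_t nid = (1ULL << METABOX_BIT) | 0x1234;

	/* prints: ino64=0x80001234 (flag relocated to bit 31) */
	printf("ino64=%#" PRIx64 "\n", nid_to_ino64(nid));
	return 0;
}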
/* atomic flag definitions */
#define EROFS_I_EA_INITED_BIT 0
@@ -237,6 +262,9 @@ EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER)
#define EROFS_I_BL_XATTR_BIT (BITS_PER_LONG - 1)
#define EROFS_I_BL_Z_BIT (BITS_PER_LONG - 2)
+/* default readahead size of directories */
+#define EROFS_DIR_RA_BYTES 16384
+
struct erofs_inode {
erofs_nid_t nid;
@@ -278,12 +306,20 @@ struct erofs_inode {
#define EROFS_I(ptr) container_of(ptr, struct erofs_inode, vfs_inode)
+static inline bool erofs_inode_in_metabox(struct inode *inode)
+{
+ return EROFS_I(inode)->nid & BIT_ULL(EROFS_DIRENT_NID_METABOX_BIT);
+}
+
static inline erofs_off_t erofs_iloc(struct inode *inode)
{
struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+ erofs_nid_t nid_lo = EROFS_I(inode)->nid & EROFS_DIRENT_NID_MASK;
+ if (erofs_inode_in_metabox(inode))
+ return nid_lo << sbi->islotbits;
return erofs_pos(inode->i_sb, sbi->meta_blkaddr) +
- (EROFS_I(inode)->nid << sbi->islotbits);
+ (nid_lo << sbi->islotbits);
}
static inline unsigned int erofs_inode_version(unsigned int ifmt)
@@ -314,10 +350,12 @@ static inline struct folio *erofs_grab_folio_nowait(struct address_space *as,
/* The length of extent is full */
#define EROFS_MAP_FULL_MAPPED 0x0008
/* Located in the special packed inode */
-#define EROFS_MAP_FRAGMENT 0x0010
+#define __EROFS_MAP_FRAGMENT 0x0010
/* The extent refers to partial decompressed data */
#define EROFS_MAP_PARTIAL_REF 0x0020
+#define EROFS_MAP_FRAGMENT (EROFS_MAP_MAPPED | __EROFS_MAP_FRAGMENT)
+
struct erofs_map_blocks {
struct erofs_buf buf;
@@ -380,16 +418,17 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
void erofs_unmap_metabuf(struct erofs_buf *buf);
void erofs_put_metabuf(struct erofs_buf *buf);
void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap);
-void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb);
+int erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb,
+ bool in_metabox);
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
- erofs_off_t offset, bool need_kmap);
+ erofs_off_t offset, bool in_metabox);
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map);
void erofs_onlinefolio_init(struct folio *folio);
void erofs_onlinefolio_split(struct folio *folio);
-void erofs_onlinefolio_end(struct folio *folio, int err);
+void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty);
struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid);
int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
@@ -498,6 +537,10 @@ static inline struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) {
static inline void erofs_fscache_submit_bio(struct bio *bio) {}
#endif
+long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+long erofs_compat_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg);
+
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#endif /* __EROFS_INTERNAL_H */
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index cadec6b1b554..937a215f626c 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -141,7 +141,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
struct erofs_deviceslot *dis;
struct file *file;
- dis = erofs_read_metabuf(buf, sb, *pos, true);
+ dis = erofs_read_metabuf(buf, sb, *pos, false);
if (IS_ERR(dis))
return PTR_ERR(dis);
@@ -165,8 +165,11 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) :
bdev_file_open_by_path(dif->path,
BLK_OPEN_READ, sb->s_type, NULL);
- if (IS_ERR(file))
+ if (IS_ERR(file)) {
+ if (file == ERR_PTR(-ENOTBLK))
+ return -EINVAL;
return PTR_ERR(file);
+ }
if (!erofs_is_fileio_mode(sbi)) {
dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file),
@@ -175,6 +178,11 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
fput(file);
return -EINVAL;
}
+ if (!dif->dax_dev && test_opt(&sbi->opt, DAX_ALWAYS)) {
+ erofs_info(sb, "DAX unsupported by %s. Turning off DAX.",
+ dif->path);
+ clear_opt(&sbi->opt, DAX_ALWAYS);
+ }
dif->file = file;
}
@@ -207,6 +215,11 @@ static int erofs_scan_devices(struct super_block *sb,
ondisk_extradevs, sbi->devs->extra_devices);
return -EINVAL;
}
+
+ if (test_opt(&sbi->opt, DAX_ALWAYS) && !sbi->dif0.dax_dev) {
+ erofs_info(sb, "DAX unsupported by block device. Turning off DAX.");
+ clear_opt(&sbi->opt, DAX_ALWAYS);
+ }
if (!ondisk_extradevs)
return 0;
@@ -255,7 +268,7 @@ static int erofs_read_superblock(struct super_block *sb)
void *data;
int ret;
- data = erofs_read_metabuf(&buf, sb, 0, true);
+ data = erofs_read_metabuf(&buf, sb, 0, false);
if (IS_ERR(data)) {
erofs_err(sb, "cannot read erofs superblock");
return PTR_ERR(data);
@@ -310,28 +323,44 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
if (erofs_sb_has_48bit(sbi) && dsb->rootnid_8b) {
sbi->root_nid = le64_to_cpu(dsb->rootnid_8b);
- sbi->dif0.blocks = (sbi->dif0.blocks << 32) |
- le16_to_cpu(dsb->rb.blocks_hi);
+ sbi->dif0.blocks = sbi->dif0.blocks |
+ ((u64)le16_to_cpu(dsb->rb.blocks_hi) << 32);
} else {
sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b);
}
sbi->packed_nid = le64_to_cpu(dsb->packed_nid);
+ if (erofs_sb_has_metabox(sbi)) {
+ if (sbi->sb_size <= offsetof(struct erofs_super_block,
+ metabox_nid))
+ return -EFSCORRUPTED;
+ sbi->metabox_nid = le64_to_cpu(dsb->metabox_nid);
+ if (sbi->metabox_nid & BIT_ULL(EROFS_DIRENT_NID_METABOX_BIT))
+ return -EFSCORRUPTED; /* self-loop detection */
+ }
sbi->inos = le64_to_cpu(dsb->inos);
sbi->epoch = (s64)le64_to_cpu(dsb->epoch);
sbi->fixed_nsec = le32_to_cpu(dsb->fixed_nsec);
super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid));
+ if (dsb->volume_name[0]) {
+ sbi->volume_name = kstrndup(dsb->volume_name,
+ sizeof(dsb->volume_name), GFP_KERNEL);
+ if (!sbi->volume_name)
+ return -ENOMEM;
+ }
+
/* parse on-disk compression configurations */
ret = z_erofs_parse_cfgs(sb, dsb);
if (ret < 0)
goto out;
- /* handle multiple devices */
ret = erofs_scan_devices(sb, dsb);
if (erofs_sb_has_48bit(sbi))
erofs_info(sb, "EXPERIMENTAL 48-bit layout support in use. Use at your own risk!");
+ if (erofs_sb_has_metabox(sbi))
+ erofs_info(sb, "EXPERIMENTAL metadata compression support in use. Use at your own risk!");
if (erofs_is_fscache_mode(sb))
erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use. Use at your own risk!");
out:
@@ -356,8 +385,7 @@ static void erofs_default_options(struct erofs_sb_info *sbi)
enum {
Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum,
- Opt_device, Opt_fsid, Opt_domain_id, Opt_directio,
- Opt_err
+ Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, Opt_fsoffset,
};
static const struct constant_table erofs_param_cache_strategy[] = {
@@ -384,6 +412,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
fsparam_string("fsid", Opt_fsid),
fsparam_string("domain_id", Opt_domain_id),
fsparam_flag_no("directio", Opt_directio),
+ fsparam_u64("fsoffset", Opt_fsoffset),
{}
};
@@ -507,28 +536,59 @@ static int erofs_fc_parse_param(struct fs_context *fc,
errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name);
#endif
break;
+ case Opt_fsoffset:
+ sbi->dif0.fsoff = result.uint_64;
+ break;
}
return 0;
}
-static struct inode *erofs_nfs_get_inode(struct super_block *sb,
- u64 ino, u32 generation)
+static int erofs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+ struct inode *parent)
{
- return erofs_iget(sb, ino);
+ erofs_nid_t nid = EROFS_I(inode)->nid;
+ int len = parent ? 6 : 3;
+
+ if (*max_len < len) {
+ *max_len = len;
+ return FILEID_INVALID;
+ }
+
+ fh[0] = (u32)(nid >> 32);
+ fh[1] = (u32)(nid & 0xffffffff);
+ fh[2] = inode->i_generation;
+
+ if (parent) {
+ nid = EROFS_I(parent)->nid;
+
+ fh[3] = (u32)(nid >> 32);
+ fh[4] = (u32)(nid & 0xffffffff);
+ fh[5] = parent->i_generation;
+ }
+
+ *max_len = len;
+ return parent ? FILEID_INO64_GEN_PARENT : FILEID_INO64_GEN;
}
static struct dentry *erofs_fh_to_dentry(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type)
{
- return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
- erofs_nfs_get_inode);
+ if ((fh_type != FILEID_INO64_GEN &&
+ fh_type != FILEID_INO64_GEN_PARENT) || fh_len < 3)
+ return NULL;
+
+ return d_obtain_alias(erofs_iget(sb,
+ ((u64)fid->raw[0] << 32) | fid->raw[1]));
}
static struct dentry *erofs_fh_to_parent(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type)
{
- return generic_fh_to_parent(sb, fid, fh_len, fh_type,
- erofs_nfs_get_inode);
+ if (fh_type != FILEID_INO64_GEN_PARENT || fh_len < 6)
+ return NULL;
+
+ return d_obtain_alias(erofs_iget(sb,
+ ((u64)fid->raw[3] << 32) | fid->raw[4]));
}
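/*
 * A minimal sketch of the file handle layout above: the 64-bit NID is
 * split into two 32-bit words plus the generation, so decoding is just
 * the inverse shift-or. Plain C stand-ins, not the kernel structures.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t nid = 0x0000000123456789ULL;	/* hypothetical NID */
	uint32_t fh[3];

	fh[0] = (uint32_t)(nid >> 32);		/* MSB word */
	fh[1] = (uint32_t)(nid & 0xffffffff);	/* LSB word */
	fh[2] = 42;				/* i_generation stand-in */

	/* decode exactly as erofs_fh_to_dentry() does above */
	uint64_t decoded = ((uint64_t)fh[0] << 32) | fh[1];

	printf("round-trip %s\n", decoded == nid ? "ok" : "broken");
	return 0;
}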
static struct dentry *erofs_get_parent(struct dentry *child)
@@ -544,7 +604,7 @@ static struct dentry *erofs_get_parent(struct dentry *child)
}
static const struct export_operations erofs_export_ops = {
- .encode_fh = generic_encode_ino32_fh,
+ .encode_fh = erofs_encode_fh,
.fh_to_dentry = erofs_fh_to_dentry,
.fh_to_parent = erofs_fh_to_parent,
.get_parent = erofs_get_parent,
@@ -579,6 +639,22 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
sbi->blkszbits = PAGE_SHIFT;
if (!sb->s_bdev) {
+ /*
+		 * (File-backed mounts) EROFS claims it's safe to nest other
+		 * fs contexts (including its own): accesses are read-only
+		 * and self-contained, with no side-effect changes that would
+		 * need a context save & restore, so the current thread
+		 * context can be reused. However, it still needs to bump
+		 * `s_stack_depth` to avoid kernel stack overflow from nested
+		 * filesystems.
+ */
+ if (erofs_is_fileio_mode(sbi)) {
+ sb->s_stack_depth =
+ file_inode(sbi->dif0.file)->i_sb->s_stack_depth + 1;
+ if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
+ erofs_err(sb, "maximum fs stacking depth exceeded");
+ return -ENOTBLK;
+ }
+ }
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
@@ -619,14 +695,17 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
}
}
- if (test_opt(&sbi->opt, DAX_ALWAYS)) {
- if (!sbi->dif0.dax_dev) {
- errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
- clear_opt(&sbi->opt, DAX_ALWAYS);
- } else if (sbi->blkszbits != PAGE_SHIFT) {
- errorfc(fc, "unsupported blocksize for DAX");
- clear_opt(&sbi->opt, DAX_ALWAYS);
- }
+ if (sbi->dif0.fsoff) {
+ if (sbi->dif0.fsoff & (sb->s_blocksize - 1))
+ return invalfc(fc, "fsoffset %llu is not aligned to block size %lu",
+ sbi->dif0.fsoff, sb->s_blocksize);
+ if (erofs_is_fscache_mode(sb))
+ return invalfc(fc, "cannot use fsoffset in fscache mode");
+ }
+
+ if (test_opt(&sbi->opt, DAX_ALWAYS) && sbi->blkszbits != PAGE_SHIFT) {
+ erofs_info(sb, "unsupported blocksize for DAX");
+ clear_opt(&sbi->opt, DAX_ALWAYS);
}
sb->s_time_gran = 1;
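/*
 * A minimal sketch of the fsoffset math above: the option must be
 * block-aligned (power-of-two mask test), and device reads later add
 * it before converting byte addresses to 512-byte sectors, as the
 * bi_sector computations in fileio.c and zdata.c do.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long fsoff = 1048576;	/* hypothetical fsoffset= */
	unsigned long blocksize = 4096;
	unsigned long long m_pa = 8192;		/* on-image byte address */

	if (fsoff & (blocksize - 1)) {
		fprintf(stderr, "fsoffset %llu is not aligned to %lu\n",
			fsoff, blocksize);
		return 1;
	}
	/* prints: bi_sector=2064, i.e. (1048576 + 8192) / 512 */
	printf("bi_sector=%llu\n", (fsoff + m_pa) >> 9);
	return 0;
}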
@@ -648,6 +727,12 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
return PTR_ERR(inode);
sbi->packed_inode = inode;
}
+ if (erofs_sb_has_metabox(sbi)) {
+ inode = erofs_iget(sb, sbi->metabox_nid);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ sbi->metabox_inode = inode;
+ }
inode = erofs_iget(sb, sbi->root_nid);
if (IS_ERR(inode))
@@ -673,6 +758,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
if (err)
return err;
+ sbi->dir_ra_bytes = EROFS_DIR_RA_BYTES;
erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid);
return 0;
}
@@ -759,6 +845,7 @@ static void erofs_sb_free(struct erofs_sb_info *sbi)
kfree(sbi->domain_id);
if (sbi->dif0.file)
fput(sbi->dif0.file);
+ kfree(sbi->volume_name);
kfree(sbi);
}
@@ -803,6 +890,8 @@ static void erofs_drop_internal_inodes(struct erofs_sb_info *sbi)
{
iput(sbi->packed_inode);
sbi->packed_inode = NULL;
+ iput(sbi->metabox_inode);
+ sbi->metabox_inode = NULL;
#ifdef CONFIG_EROFS_FS_ZIP
iput(sbi->managed_cache);
sbi->managed_cache = NULL;
@@ -948,13 +1037,27 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
if (sbi->domain_id)
seq_printf(seq, ",domain_id=%s", sbi->domain_id);
#endif
+ if (sbi->dif0.fsoff)
+ seq_printf(seq, ",fsoffset=%llu", sbi->dif0.fsoff);
return 0;
}
+static void erofs_evict_inode(struct inode *inode)
+{
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(inode))
+ dax_break_layout_final(inode);
+#endif
+
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+}
+
const struct super_operations erofs_sops = {
.put_super = erofs_put_super,
.alloc_inode = erofs_alloc_inode,
.free_inode = erofs_free_inode,
+ .evict_inode = erofs_evict_inode,
.statfs = erofs_statfs,
.show_options = erofs_show_options,
};
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index dad4e6c6c155..1e0658a1d95b 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -7,12 +7,14 @@
#include <linux/kobject.h>
#include "internal.h"
+#include "compress.h"
enum {
attr_feature,
attr_drop_caches,
attr_pointer_ui,
attr_pointer_bool,
+ attr_accel,
};
enum {
@@ -60,12 +62,25 @@ static struct erofs_attr erofs_attr_##_name = { \
EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts);
EROFS_ATTR_FUNC(drop_caches, 0200);
#endif
+#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
+EROFS_ATTR_FUNC(accel, 0644);
+#endif
+EROFS_ATTR_RW_UI(dir_ra_bytes, erofs_sb_info);
-static struct attribute *erofs_attrs[] = {
+static struct attribute *erofs_sb_attrs[] = {
#ifdef CONFIG_EROFS_FS_ZIP
ATTR_LIST(sync_decompress),
ATTR_LIST(drop_caches),
#endif
+ ATTR_LIST(dir_ra_bytes),
+ NULL,
+};
+ATTRIBUTE_GROUPS(erofs_sb);
+
+static struct attribute *erofs_attrs[] = {
+#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
+ ATTR_LIST(accel),
+#endif
NULL,
};
ATTRIBUTE_GROUPS(erofs);
@@ -82,6 +97,7 @@ EROFS_ATTR_FEATURE(ztailpacking);
EROFS_ATTR_FEATURE(fragments);
EROFS_ATTR_FEATURE(dedupe);
EROFS_ATTR_FEATURE(48bit);
+EROFS_ATTR_FEATURE(metabox);
static struct attribute *erofs_feat_attrs[] = {
ATTR_LIST(zero_padding),
@@ -95,6 +111,7 @@ static struct attribute *erofs_feat_attrs[] = {
ATTR_LIST(fragments),
ATTR_LIST(dedupe),
ATTR_LIST(48bit),
+ ATTR_LIST(metabox),
NULL,
};
ATTRIBUTE_GROUPS(erofs_feat);
@@ -128,12 +145,14 @@ static ssize_t erofs_attr_show(struct kobject *kobj,
if (!ptr)
return 0;
return sysfs_emit(buf, "%d\n", *(bool *)ptr);
+ case attr_accel:
+ return z_erofs_crypto_show_engines(buf, PAGE_SIZE, '\n');
}
return 0;
}
static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr,
- const char *buf, size_t len)
+ const char *buf, size_t len)
{
struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info,
s_kobj);
@@ -182,6 +201,19 @@ static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr,
invalidate_mapping_pages(MNGD_MAPPING(sbi), 0, -1);
return len;
#endif
+#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
+ case attr_accel:
+ buf = skip_spaces(buf);
+ z_erofs_crypto_disable_all_engines();
+ while (*buf) {
+ t = strcspn(buf, "\n");
+ ret = z_erofs_crypto_enable_engine(buf, t);
+ if (ret < 0)
+ return ret;
+ buf += buf[t] != '\0' ? t + 1 : t;
+ }
+ return len;
+#endif
}
return 0;
}
@@ -199,12 +231,13 @@ static const struct sysfs_ops erofs_attr_ops = {
};
static const struct kobj_type erofs_sb_ktype = {
- .default_groups = erofs_groups,
+ .default_groups = erofs_sb_groups,
.sysfs_ops = &erofs_attr_ops,
.release = erofs_sb_release,
};
static const struct kobj_type erofs_ktype = {
+ .default_groups = erofs_groups,
.sysfs_ops = &erofs_attr_ops,
};
@@ -248,6 +281,12 @@ void erofs_unregister_sysfs(struct super_block *sb)
}
}
+void erofs_exit_sysfs(void)
+{
+ kobject_put(&erofs_feat);
+ kset_unregister(&erofs_root);
+}
+
int __init erofs_init_sysfs(void)
{
int ret;
@@ -255,24 +294,12 @@ int __init erofs_init_sysfs(void)
kobject_set_name(&erofs_root.kobj, "erofs");
erofs_root.kobj.parent = fs_kobj;
ret = kset_register(&erofs_root);
- if (ret)
- goto root_err;
-
- ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype,
- NULL, "features");
- if (ret)
- goto feat_err;
- return ret;
-
-feat_err:
- kobject_put(&erofs_feat);
- kset_unregister(&erofs_root);
-root_err:
+ if (!ret) {
+ ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype,
+ NULL, "features");
+ if (!ret)
+ return 0;
+ erofs_exit_sysfs();
+ }
return ret;
}
-
-void erofs_exit_sysfs(void)
-{
- kobject_put(&erofs_feat);
- kset_unregister(&erofs_root);
-}
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 9cf84717a92e..396536d9a862 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -72,12 +72,14 @@ static int erofs_init_inode_xattrs(struct inode *inode)
ret = -EFSCORRUPTED;
goto out_unlock; /* xattr ondisk layout error */
}
- ret = -ENOATTR;
+ ret = -ENODATA;
goto out_unlock;
}
it.buf = __EROFS_BUF_INITIALIZER;
- erofs_init_metabuf(&it.buf, sb);
+ ret = erofs_init_metabuf(&it.buf, sb, erofs_inode_in_metabox(inode));
+ if (ret)
+ goto out_unlock;
it.pos = erofs_iloc(inode) + vi->inode_isize;
/* read in shared xattr array (non-atomic, see kmalloc below) */
@@ -266,20 +268,20 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
(entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK);
if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count)
- return -ENOATTR;
+ return -ENODATA;
if (it->index != pf->prefix->base_index ||
it->name.len != entry.e_name_len + pf->infix_len)
- return -ENOATTR;
+ return -ENODATA;
if (memcmp(it->name.name, pf->prefix->infix, pf->infix_len))
- return -ENOATTR;
+ return -ENODATA;
it->infix_len = pf->infix_len;
} else {
if (it->index != entry.e_name_index ||
it->name.len != entry.e_name_len)
- return -ENOATTR;
+ return -ENODATA;
it->infix_len = 0;
}
@@ -295,7 +297,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
entry.e_name_len - processed);
if (memcmp(it->name.name + it->infix_len + processed,
it->kaddr, slice))
- return -ENOATTR;
+ return -ENODATA;
it->pos += slice;
}
@@ -323,9 +325,12 @@ static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it,
sizeof(u32) * vi->xattr_shared_count;
if (xattr_header_sz >= vi->xattr_isize) {
DBG_BUGON(xattr_header_sz > vi->xattr_isize);
- return -ENOATTR;
+ return -ENODATA;
}
+ ret = erofs_init_metabuf(&it->buf, it->sb, erofs_inode_in_metabox(inode));
+ if (ret)
+ return ret;
remaining = vi->xattr_isize - xattr_header_sz;
it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz;
@@ -347,7 +352,7 @@ static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it,
ret = erofs_getxattr_foreach(it);
else
ret = erofs_listxattr_foreach(it);
- if ((getxattr && ret != -ENOATTR) || (!getxattr && ret))
+ if ((getxattr && ret != -ENODATA) || (!getxattr && ret))
break;
it->pos = next_pos;
@@ -361,12 +366,17 @@ static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it,
struct erofs_inode *const vi = EROFS_I(inode);
struct super_block *const sb = it->sb;
struct erofs_sb_info *sbi = EROFS_SB(sb);
- unsigned int i;
- int ret = -ENOATTR;
+ unsigned int i = 0;
+ int ret;
- for (i = 0; i < vi->xattr_shared_count; ++i) {
+ ret = erofs_init_metabuf(&it->buf, sb,
+ erofs_sb_has_shared_ea_in_metabox(sbi));
+ if (ret)
+ return ret;
+
+ while (i < vi->xattr_shared_count) {
it->pos = erofs_pos(sb, sbi->xattr_blkaddr) +
- vi->xattr_shared_xattrs[i] * sizeof(__le32);
+ vi->xattr_shared_xattrs[i++] * sizeof(__le32);
it->kaddr = erofs_bread(&it->buf, it->pos, true);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
@@ -375,10 +385,10 @@ static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it,
ret = erofs_getxattr_foreach(it);
else
ret = erofs_listxattr_foreach(it);
- if ((getxattr && ret != -ENOATTR) || (!getxattr && ret))
+ if ((getxattr && ret != -ENODATA) || (!getxattr && ret))
break;
}
- return ret;
+ return i ? ret : -ENODATA;
}
int erofs_getxattr(struct inode *inode, int index, const char *name,
@@ -403,7 +413,7 @@ int erofs_getxattr(struct inode *inode, int index, const char *name,
EROFS_XATTR_FILTER_SEED + index);
hashbit &= EROFS_XATTR_FILTER_BITS - 1;
if (vi->xattr_name_filter & (1U << hashbit))
- return -ENOATTR;
+ return -ENODATA;
}
it.index = index;
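/*
 * A minimal sketch of the name filter consulted above: a 32-bit
 * per-inode bitmap acts as a tiny Bloom-style negative cache. A clear
 * bit proves the xattr is absent; a set bit only means "maybe, do the
 * full lookup". The toy hash is a stand-in, not the kernel's
 * xxh32-based computation.
 */
#include <stdint.h>
#include <stdio.h>

#define FILTER_BITS	32

static uint32_t toy_hash(const char *name)	/* FNV-1a stand-in */
{
	uint32_t h = 2166136261u;

	while (*name)
		h = (h ^ (uint8_t)*name++) * 16777619u;
	return h;
}

int main(void)
{
	uint32_t filter = 0;
	const char *q = "user.comment";

	/* mkfs side: record every xattr name present on the inode */
	filter |= 1U << (toy_hash("user.mime_type") & (FILTER_BITS - 1));

	/* lookup side: a clear bit short-circuits to -ENODATA */
	if (filter & (1U << (toy_hash(q) & (FILTER_BITS - 1))))
		printf("%s: maybe present, do full lookup\n", q);
	else
		printf("%s: definitely absent\n", q);
	return 0;
}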
@@ -413,13 +423,12 @@ int erofs_getxattr(struct inode *inode, int index, const char *name,
it.sb = inode->i_sb;
it.buf = __EROFS_BUF_INITIALIZER;
- erofs_init_metabuf(&it.buf, it.sb);
it.buffer = buffer;
it.buffer_size = buffer_size;
it.buffer_ofs = 0;
ret = erofs_xattr_iter_inline(&it, inode, true);
- if (ret == -ENOATTR)
+ if (ret == -ENODATA)
ret = erofs_xattr_iter_shared(&it, inode, true);
erofs_put_metabuf(&it.buf);
return ret ? ret : it.buffer_ofs;
@@ -432,23 +441,22 @@ ssize_t erofs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
struct inode *inode = d_inode(dentry);
ret = erofs_init_inode_xattrs(inode);
- if (ret == -ENOATTR)
+ if (ret == -ENODATA)
return 0;
if (ret)
return ret;
it.sb = dentry->d_sb;
it.buf = __EROFS_BUF_INITIALIZER;
- erofs_init_metabuf(&it.buf, it.sb);
it.dentry = dentry;
it.buffer = buffer;
it.buffer_size = buffer_size;
it.buffer_ofs = 0;
ret = erofs_xattr_iter_inline(&it, inode, false);
- if (!ret || ret == -ENOATTR)
+ if (!ret || ret == -ENODATA)
ret = erofs_xattr_iter_shared(&it, inode, false);
- if (ret == -ENOATTR)
+ if (ret == -ENODATA)
ret = 0;
erofs_put_metabuf(&it.buf);
return ret ? ret : it.buffer_ofs;
@@ -474,6 +482,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb)
erofs_off_t pos = (erofs_off_t)sbi->xattr_prefix_start << 2;
struct erofs_xattr_prefix_item *pfs;
int ret = 0, i, len;
+ bool plain = erofs_sb_has_plain_xattr_pfx(sbi);
if (!sbi->xattr_prefix_count)
return 0;
@@ -482,10 +491,16 @@ int erofs_xattr_prefixes_init(struct super_block *sb)
if (!pfs)
return -ENOMEM;
- if (sbi->packed_inode)
- buf.mapping = sbi->packed_inode->i_mapping;
- else
- erofs_init_metabuf(&buf, sb);
+ if (!plain) {
+ if (erofs_sb_has_metabox(sbi))
+ (void)erofs_init_metabuf(&buf, sb, true);
+ else if (sbi->packed_inode)
+ buf.mapping = sbi->packed_inode->i_mapping;
+ else
+ plain = true;
+ }
+ if (plain)
+ (void)erofs_init_metabuf(&buf, sb, false);
for (i = 0; i < sbi->xattr_prefix_count; i++) {
void *ptr = erofs_read_metadata(sb, &buf, &pos, &len);
@@ -539,7 +554,7 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu)
rc = erofs_getxattr(inode, prefix, "", value, rc);
}
- if (rc == -ENOATTR)
+ if (rc == -ENODATA)
acl = NULL;
else if (rc < 0)
acl = ERR_PTR(rc);
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
index b246cd0e135e..6317caa8413e 100644
--- a/fs/erofs/xattr.h
+++ b/fs/erofs/xattr.h
@@ -10,9 +10,6 @@
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>
-/* Attribute not found */
-#define ENOATTR ENODATA
-
#ifdef CONFIG_EROFS_FS_XATTR
extern const struct xattr_handler erofs_xattr_user_handler;
extern const struct xattr_handler erofs_xattr_trusted_handler;
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 0671184d9cf1..65da21504632 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -79,9 +79,6 @@ struct z_erofs_pcluster {
/* L: whether partial decompression or not */
bool partial;
- /* L: indicate several pageofs_outs or not */
- bool multibases;
-
/* L: whether extra buffer allocations are best-effort */
bool besteffort;
@@ -291,6 +288,7 @@ static struct workqueue_struct *z_erofs_workqueue __read_mostly;
#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
static struct kthread_worker __rcu **z_erofs_pcpu_workers;
+static atomic_t erofs_percpu_workers_initialized = ATOMIC_INIT(0);
static void erofs_destroy_percpu_workers(void)
{
@@ -336,12 +334,8 @@ static int erofs_init_percpu_workers(void)
}
return 0;
}
-#else
-static inline void erofs_destroy_percpu_workers(void) {}
-static inline int erofs_init_percpu_workers(void) { return 0; }
-#endif
-#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD)
+#ifdef CONFIG_HOTPLUG_CPU
static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock);
static enum cpuhp_state erofs_cpuhp_state;
@@ -398,17 +392,56 @@ static void erofs_cpu_hotplug_destroy(void)
if (erofs_cpuhp_state)
cpuhp_remove_state_nocalls(erofs_cpuhp_state);
}
-#else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */
+#else /* !CONFIG_HOTPLUG_CPU */
static inline int erofs_cpu_hotplug_init(void) { return 0; }
static inline void erofs_cpu_hotplug_destroy(void) {}
-#endif
+#endif /* CONFIG_HOTPLUG_CPU */
+static int z_erofs_init_pcpu_workers(struct super_block *sb)
+{
+ int err;
-void z_erofs_exit_subsystem(void)
+ if (atomic_xchg(&erofs_percpu_workers_initialized, 1))
+ return 0;
+
+ err = erofs_init_percpu_workers();
+ if (err) {
+ erofs_err(sb, "per-cpu workers: failed to allocate.");
+ goto err_init_percpu_workers;
+ }
+
+ err = erofs_cpu_hotplug_init();
+ if (err < 0) {
+ erofs_err(sb, "per-cpu workers: failed CPU hotplug init.");
+ goto err_cpuhp_init;
+ }
+ erofs_info(sb, "initialized per-cpu workers successfully.");
+ return err;
+
+err_cpuhp_init:
+ erofs_destroy_percpu_workers();
+err_init_percpu_workers:
+ atomic_set(&erofs_percpu_workers_initialized, 0);
+ return err;
+}
+
+static void z_erofs_destroy_pcpu_workers(void)
{
+ if (!atomic_xchg(&erofs_percpu_workers_initialized, 0))
+ return;
erofs_cpu_hotplug_destroy();
erofs_destroy_percpu_workers();
+}
+#else /* !CONFIG_EROFS_FS_PCPU_KTHREAD */
+static inline int z_erofs_init_pcpu_workers(struct super_block *sb) { return 0; }
+static inline void z_erofs_destroy_pcpu_workers(void) {}
+#endif /* CONFIG_EROFS_FS_PCPU_KTHREAD */
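/*
 * A minimal C11 sketch of the atomic_xchg() once-guard used above: the
 * first caller of init wins and performs setup, later callers observe
 * the old value 1 and return; destroy mirrors it so teardown runs at
 * most once. Userspace stand-in, not kernel atomics.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int initialized;

static int init_once(void)
{
	if (atomic_exchange(&initialized, 1))
		return 0;	/* someone else already did it */
	puts("one-time setup");
	return 0;
}

static void destroy_once(void)
{
	if (!atomic_exchange(&initialized, 0))
		return;		/* never set up, or already torn down */
	puts("one-time teardown");
}

int main(void)
{
	init_once();
	init_once();		/* no-op */
	destroy_once();
	destroy_once();		/* no-op */
	return 0;
}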
+
+void z_erofs_exit_subsystem(void)
+{
+ z_erofs_destroy_pcpu_workers();
destroy_workqueue(z_erofs_workqueue);
z_erofs_destroy_pcluster_pool();
+ z_erofs_crypto_disable_all_engines();
z_erofs_exit_decompressor();
}
@@ -430,19 +463,8 @@ int __init z_erofs_init_subsystem(void)
goto err_workqueue_init;
}
- err = erofs_init_percpu_workers();
- if (err)
- goto err_pcpu_worker;
-
- err = erofs_cpu_hotplug_init();
- if (err < 0)
- goto err_cpuhp_init;
return err;
-err_cpuhp_init:
- erofs_destroy_percpu_workers();
-err_pcpu_worker:
- destroy_workqueue(z_erofs_workqueue);
err_workqueue_init:
z_erofs_destroy_pcluster_pool();
err_pcluster_pool:
@@ -540,7 +562,7 @@ static void z_erofs_bind_cache(struct z_erofs_frontend *fe)
* Allocate a managed folio for cached I/O, or it may be
* then filled with a file-backed folio for in-place I/O
*/
- newfolio = filemap_alloc_folio(gfp, 0);
+ newfolio = filemap_alloc_folio(gfp, 0, NULL);
if (!newfolio)
continue;
newfolio->private = Z_EROFS_PREALLOCATED_FOLIO;
@@ -644,8 +666,14 @@ static const struct address_space_operations z_erofs_cache_aops = {
int z_erofs_init_super(struct super_block *sb)
{
- struct inode *const inode = new_inode(sb);
+ struct inode *inode;
+ int err;
+
+ err = z_erofs_init_pcpu_workers(sb);
+ if (err)
+ return err;
+ inode = new_inode(sb);
if (!inode)
return -ENOMEM;
set_nlink(inode, 1);
@@ -725,7 +753,6 @@ static int z_erofs_register_pcluster(struct z_erofs_frontend *fe)
lockref_init(&pcl->lockref); /* one ref for this request */
pcl->algorithmformat = map->m_algorithmformat;
pcl->pclustersize = map->m_plen;
- pcl->pageofs_in = pageofs_in;
pcl->length = 0;
pcl->partial = true;
pcl->next = fe->head;
@@ -778,6 +805,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
struct erofs_map_blocks *map = &fe->map;
struct super_block *sb = fe->inode->i_sb;
struct z_erofs_pcluster *pcl = NULL;
+ void *ptr;
int ret;
DBG_BUGON(fe->pcl);
@@ -795,9 +823,6 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
}
rcu_read_unlock();
}
- } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
}
if (pcl) {
@@ -827,15 +852,17 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
/* bind cache first when cached decompression is preferred */
z_erofs_bind_cache(fe);
} else {
- void *mptr;
-
- mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, false);
- if (IS_ERR(mptr)) {
- ret = PTR_ERR(mptr);
- erofs_err(sb, "failed to get inline data %d", ret);
+ ret = erofs_init_metabuf(&map->buf, sb,
+ erofs_inode_in_metabox(fe->inode));
+ if (ret)
+ return ret;
+ ptr = erofs_bread(&map->buf, map->m_pa, false);
+ if (IS_ERR(ptr)) {
+ ret = PTR_ERR(ptr);
+ erofs_err(sb, "failed to get inline folio %d", ret);
return ret;
}
- get_page(map->buf.page);
+ folio_get(page_folio(map->buf.page));
WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page);
fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK;
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
@@ -1007,7 +1034,7 @@ static int z_erofs_scan_folio(struct z_erofs_frontend *f,
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
folio_zero_segment(folio, cur, end);
tight = false;
- } else if (map->m_flags & EROFS_MAP_FRAGMENT) {
+ } else if (map->m_flags & __EROFS_MAP_FRAGMENT) {
erofs_off_t fpos = offset + cur - map->m_la;
err = z_erofs_read_fragment(inode->i_sb, folio, cur,
@@ -1047,8 +1074,6 @@ static int z_erofs_scan_folio(struct z_erofs_frontend *f,
break;
erofs_onlinefolio_split(folio);
- if (f->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
- f->pcl->multibases = true;
if (f->pcl->length < offset + end - map->m_la) {
f->pcl->length = offset + end - map->m_la;
f->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
@@ -1066,7 +1091,7 @@ static int z_erofs_scan_folio(struct z_erofs_frontend *f,
tight = (bs == PAGE_SIZE);
}
} while ((end = cur) > 0);
- erofs_onlinefolio_end(folio, err);
+ erofs_onlinefolio_end(folio, err, false);
return err;
}
@@ -1094,7 +1119,6 @@ struct z_erofs_backend {
struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES];
struct super_block *sb;
struct z_erofs_pcluster *pcl;
-
/* pages with the longest decompressed length for deduplication */
struct page **decompressed_pages;
/* pages to keep the compressed data */
@@ -1103,6 +1127,8 @@ struct z_erofs_backend {
struct list_head decompressed_secondary_bvecs;
struct page **pagepool;
unsigned int onstack_used, nr_pages;
+ /* indicate if temporary copies should be preserved for later use */
+ bool keepxcpy;
};
struct z_erofs_bvec_item {
@@ -1113,18 +1139,20 @@ struct z_erofs_bvec_item {
static void z_erofs_do_decompressed_bvec(struct z_erofs_backend *be,
struct z_erofs_bvec *bvec)
{
+ int poff = bvec->offset + be->pcl->pageofs_out;
struct z_erofs_bvec_item *item;
- unsigned int pgnr;
-
- if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK) &&
- (bvec->end == PAGE_SIZE ||
- bvec->offset + bvec->end == be->pcl->length)) {
- pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT;
- DBG_BUGON(pgnr >= be->nr_pages);
- if (!be->decompressed_pages[pgnr]) {
- be->decompressed_pages[pgnr] = bvec->page;
+ struct page **page;
+
+ if (!(poff & ~PAGE_MASK) && (bvec->end == PAGE_SIZE ||
+ bvec->offset + bvec->end == be->pcl->length)) {
+ DBG_BUGON((poff >> PAGE_SHIFT) >= be->nr_pages);
+ page = be->decompressed_pages + (poff >> PAGE_SHIFT);
+ if (!*page) {
+ *page = bvec->page;
return;
}
+ } else {
+ be->keepxcpy = true;
}
/* (cold path) one pcluster is requested multiple times */
@@ -1168,7 +1196,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_backend *be, int err)
cur += len;
}
kunmap_local(dst);
- erofs_onlinefolio_end(page_folio(bvi->bvec.page), err);
+ erofs_onlinefolio_end(page_folio(bvi->bvec.page), err, true);
list_del(p);
kfree(bvi);
}
@@ -1239,12 +1267,13 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
struct erofs_sb_info *const sbi = EROFS_SB(be->sb);
struct z_erofs_pcluster *pcl = be->pcl;
unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
- const struct z_erofs_decompressor *decomp =
+ const struct z_erofs_decompressor *alg =
z_erofs_decomp[pcl->algorithmformat];
+ bool try_free = true;
int i, j, jtop, err2;
struct page *page;
bool overlapped;
- bool try_free = true;
+ const char *reason;
mutex_lock(&pcl->lock);
be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;
@@ -1276,8 +1305,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
err2 = z_erofs_parse_in_bvecs(be, &overlapped);
if (err2)
err = err2;
- if (!err)
- err = decomp->decompress(&(struct z_erofs_decompress_req) {
+ if (!err) {
+ reason = alg->decompress(&(struct z_erofs_decompress_req) {
.sb = be->sb,
.in = be->compressed_pages,
.out = be->decompressed_pages,
@@ -1290,16 +1319,27 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
.alg = pcl->algorithmformat,
.inplace_io = overlapped,
.partial_decoding = pcl->partial,
- .fillgaps = pcl->multibases,
+ .fillgaps = be->keepxcpy,
.gfp = pcl->besteffort ? GFP_KERNEL :
GFP_NOWAIT | __GFP_NORETRY
}, be->pagepool);
+ if (IS_ERR(reason)) {
+ erofs_err(be->sb, "failed to decompress (%s) %ld @ pa %llu size %u => %u",
+ alg->name, PTR_ERR(reason), pcl->pos,
+ pcl->pclustersize, pcl->length);
+ err = PTR_ERR(reason);
+ } else if (unlikely(reason)) {
+ erofs_err(be->sb, "failed to decompress (%s) %s @ pa %llu size %u => %u",
+ alg->name, reason, pcl->pos,
+ pcl->pclustersize, pcl->length);
+ err = -EFSCORRUPTED;
+ }
+ }
/* must handle all compressed pages before actual file pages */
if (pcl->from_meta) {
- page = pcl->compressed_bvecs[0].page;
+ folio_put(page_folio(pcl->compressed_bvecs[0].page));
WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
- put_page(page);
} else {
/* managed folios are still left in compressed_bvecs[] */
for (i = 0; i < pclusterpages; ++i) {
@@ -1327,7 +1367,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
DBG_BUGON(z_erofs_page_is_invalidated(page));
if (!z_erofs_is_shortlived_page(page)) {
- erofs_onlinefolio_end(page_folio(page), err);
+ erofs_onlinefolio_end(page_folio(page), err, true);
continue;
}
if (pcl->algorithmformat != Z_EROFS_COMPRESSION_LZ4) {
@@ -1347,7 +1387,6 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
pcl->length = 0;
pcl->partial = true;
- pcl->multibases = false;
pcl->besteffort = false;
pcl->bvset.nextpage = NULL;
pcl->vcnt = 0;
@@ -1403,6 +1442,16 @@ static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work)
}
#endif
+/* Use (kthread_)work in atomic contexts to minimize scheduling overhead */
+static inline bool z_erofs_in_atomic(void)
+{
+ if (IS_ENABLED(CONFIG_PREEMPTION) && rcu_preempt_depth())
+ return true;
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return true;
+ return !preemptible();
+}
+
static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
int bios)
{
@@ -1417,8 +1466,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
if (atomic_add_return(bios, &io->pending_bios))
return;
- /* Use (kthread_)work and sync decompression for atomic contexts only */
- if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) {
+ if (z_erofs_in_atomic()) {
#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
struct kthread_worker *worker;
@@ -1711,7 +1759,8 @@ drain_io:
bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
REQ_OP_READ, GFP_NOIO);
bio->bi_end_io = z_erofs_endio;
- bio->bi_iter.bi_sector = cur >> 9;
+ bio->bi_iter.bi_sector =
+ (mdev.m_dif->fsoff + cur) >> 9;
bio->bi_private = q[JQ_SUBMIT];
if (readahead)
bio->bi_opf |= REQ_RAHEAD;
@@ -1796,7 +1845,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f,
map->m_la = end;
err = z_erofs_map_blocks_iter(inode, map,
EROFS_GET_BLOCKS_READMORE);
- if (err)
+ if (err || !(map->m_flags & EROFS_MAP_ENCODED))
return;
/* expand ra for the trailing edge if readahead */
@@ -1808,7 +1857,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f,
end = round_up(end, PAGE_SIZE);
} else {
end = round_up(map->m_la, PAGE_SIZE);
- if (!map->m_llen)
+ if (!(map->m_flags & EROFS_MAP_ENCODED) || !map->m_llen)
return;
}
@@ -1859,13 +1908,12 @@ static void z_erofs_readahead(struct readahead_control *rac)
{
struct inode *const inode = rac->mapping->host;
Z_EROFS_DEFINE_FRONTEND(f, inode, readahead_pos(rac));
- struct folio *head = NULL, *folio;
unsigned int nrpages = readahead_count(rac);
+ struct folio *head = NULL, *folio;
int err;
+ trace_erofs_readahead(inode, readahead_index(rac), nrpages, false);
z_erofs_pcluster_readmore(&f, rac, true);
- nrpages = readahead_count(rac);
- trace_erofs_readpages(inode, readahead_index(rac), nrpages, false);
while ((folio = readahead_folio(rac))) {
folio->private = head;
head = folio;
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 8de50df05dfe..c8d8e129eb4b 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -17,7 +17,7 @@ struct z_erofs_maprecorder {
u16 delta[2];
erofs_blk_t pblk, compressedblks;
erofs_off_t nextpackoff;
- bool partialref;
+ bool partialref, in_mbox;
};
static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
@@ -31,7 +31,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
struct z_erofs_lcluster_index *di;
unsigned int advise;
- di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, true);
+ di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, m->in_mbox);
if (IS_ERR(di))
return PTR_ERR(di);
m->lcn = lcn;
@@ -55,10 +55,6 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
} else {
m->partialref = !!(advise & Z_EROFS_LI_PARTIAL_REF);
m->clusterofs = le16_to_cpu(di->di_clusterofs);
- if (m->clusterofs >= 1 << vi->z_lclusterbits) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
m->pblk = le32_to_cpu(di->di_u.blkaddr);
}
return 0;
@@ -146,7 +142,7 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
else
return -EOPNOTSUPP;
- in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, true);
+ in = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, m->in_mbox);
if (IS_ERR(in))
return PTR_ERR(in);
@@ -240,14 +236,29 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m,
unsigned int lcn, bool lookahead)
{
- switch (EROFS_I(m->inode)->datalayout) {
- case EROFS_INODE_COMPRESSED_FULL:
- return z_erofs_load_full_lcluster(m, lcn);
- case EROFS_INODE_COMPRESSED_COMPACT:
- return z_erofs_load_compact_lcluster(m, lcn, lookahead);
- default:
- return -EINVAL;
+ struct erofs_inode *vi = EROFS_I(m->inode);
+ int err;
+
+ if (vi->datalayout == EROFS_INODE_COMPRESSED_COMPACT) {
+ err = z_erofs_load_compact_lcluster(m, lcn, lookahead);
+ } else {
+ DBG_BUGON(vi->datalayout != EROFS_INODE_COMPRESSED_FULL);
+ err = z_erofs_load_full_lcluster(m, lcn);
}
+ if (err)
+ return err;
+
+ if (m->type >= Z_EROFS_LCLUSTER_TYPE_MAX) {
+ erofs_err(m->inode->i_sb, "unknown type %u @ lcn %u of nid %llu",
+ m->type, lcn, EROFS_I(m->inode)->nid);
+ DBG_BUGON(1);
+ return -EOPNOTSUPP;
+ } else if (m->type != Z_EROFS_LCLUSTER_TYPE_NONHEAD &&
+ m->clusterofs >= (1 << vi->z_lclusterbits)) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ return 0;
}
static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
@@ -261,25 +272,19 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
unsigned long lcn = m->lcn - lookback_distance;
int err;
+ if (!lookback_distance)
+ break;
+
err = z_erofs_load_lcluster_from_disk(m, lcn, false);
if (err)
return err;
-
- if (m->type >= Z_EROFS_LCLUSTER_TYPE_MAX) {
- erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu",
- m->type, lcn, vi->nid);
- DBG_BUGON(1);
- return -EOPNOTSUPP;
- } else if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
+ if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
lookback_distance = m->delta[0];
- if (!lookback_distance)
- break;
continue;
- } else {
- m->headtype = m->type;
- m->map->m_la = (lcn << lclusterbits) | m->clusterofs;
- return 0;
}
+ m->headtype = m->type;
+ m->map->m_la = (lcn << lclusterbits) | m->clusterofs;
+ return 0;
}
erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu",
lookback_distance, m->lcn, vi->nid);
@@ -325,25 +330,18 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
DBG_BUGON(lcn == initial_lcn &&
m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD);
- if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
- if (m->delta[0] != 1) {
- erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
- if (m->compressedblks)
- goto out;
- } else if (m->type < Z_EROFS_LCLUSTER_TYPE_MAX) {
- /*
- * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
- * rather than CBLKCNT, it's a 1 block-sized pcluster.
- */
- m->compressedblks = 1;
- goto out;
+ if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD && m->delta[0] != 1) {
+ erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
}
- erofs_err(sb, "cannot found CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
+
+ /*
+ * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type rather
+ * than CBLKCNT, it's a 1 block-sized pcluster.
+ */
+ if (m->type != Z_EROFS_LCLUSTER_TYPE_NONHEAD || !m->compressedblks)
+ m->compressedblks = 1;
out:
m->map->m_plen = erofs_pos(sb, m->compressedblks);
return 0;
@@ -379,11 +377,6 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
if (lcn != headlcn)
break; /* ends at the next HEAD lcluster */
m->delta[1] = 1;
- } else {
- erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu",
- m->type, lcn, vi->nid);
- DBG_BUGON(1);
- return -EOPNOTSUPP;
}
lcn += m->delta[1];
}
@@ -402,19 +395,19 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
struct z_erofs_maprecorder m = {
.inode = inode,
.map = map,
+ .in_mbox = erofs_inode_in_metabox(inode),
};
- int err = 0;
- unsigned int endoff, afmt;
+ unsigned int endoff;
unsigned long initial_lcn;
unsigned long long ofs, end;
+ int err;
ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la;
if (fragment && !(flags & EROFS_GET_BLOCKS_FINDTAIL) &&
!vi->z_tailextent_headlcn) {
map->m_la = 0;
map->m_llen = inode->i_size;
- map->m_flags = EROFS_MAP_MAPPED |
- EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT;
+ map->m_flags = EROFS_MAP_FRAGMENT;
return 0;
}
initial_lcn = ofs >> lclusterbits;
@@ -429,44 +422,26 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED;
end = (m.lcn + 1ULL) << lclusterbits;
- switch (m.type) {
- case Z_EROFS_LCLUSTER_TYPE_PLAIN:
- case Z_EROFS_LCLUSTER_TYPE_HEAD1:
- case Z_EROFS_LCLUSTER_TYPE_HEAD2:
- if (endoff >= m.clusterofs) {
- m.headtype = m.type;
- map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
- /*
- * For ztailpacking files, in order to inline data more
- * effectively, special EOF lclusters are now supported
- * which can have three parts at most.
- */
- if (ztailpacking && end > inode->i_size)
- end = inode->i_size;
- break;
- }
- /* m.lcn should be >= 1 if endoff < m.clusterofs */
- if (!m.lcn) {
- erofs_err(sb, "invalid logical cluster 0 at nid %llu",
- vi->nid);
- err = -EFSCORRUPTED;
- goto unmap_out;
+ if (m.type != Z_EROFS_LCLUSTER_TYPE_NONHEAD && endoff >= m.clusterofs) {
+ m.headtype = m.type;
+ map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
+ /*
+ * For ztailpacking files, in order to inline data more
+ * effectively, special EOF lclusters are now supported
+ * which can have three parts at most.
+ */
+ if (ztailpacking && end > inode->i_size)
+ end = inode->i_size;
+ } else {
+ if (m.type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
+ end = (m.lcn << lclusterbits) | m.clusterofs;
+ map->m_flags |= EROFS_MAP_FULL_MAPPED;
+ m.delta[0] = 1;
}
- end = (m.lcn << lclusterbits) | m.clusterofs;
- map->m_flags |= EROFS_MAP_FULL_MAPPED;
- m.delta[0] = 1;
- fallthrough;
- case Z_EROFS_LCLUSTER_TYPE_NONHEAD:
/* get the corresponding first chunk */
err = z_erofs_extent_lookback(&m, m.delta[0]);
if (err)
goto unmap_out;
- break;
- default:
- erofs_err(sb, "unknown type %u @ offset %llu of nid %llu",
- m.type, ofs, vi->nid);
- err = -EOPNOTSUPP;
- goto unmap_out;
}
if (m.partialref)
map->m_flags |= EROFS_MAP_PARTIAL_REF;
@@ -483,13 +458,13 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
map->m_pa = vi->z_fragmentoff;
map->m_plen = vi->z_idata_size;
if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) {
- erofs_err(sb, "invalid tail-packing pclustersize %llu",
- map->m_plen);
+ erofs_err(sb, "ztailpacking inline data across blocks @ nid %llu",
+ vi->nid);
err = -EFSCORRUPTED;
goto unmap_out;
}
} else if (fragment && m.lcn == vi->z_tailextent_headlcn) {
- map->m_flags |= EROFS_MAP_FRAGMENT;
+ map->m_flags = EROFS_MAP_FRAGMENT;
} else {
map->m_pa = erofs_pos(sb, m.pblk);
err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
@@ -503,20 +478,15 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
err = -EFSCORRUPTED;
goto unmap_out;
}
- afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ?
- Z_EROFS_COMPRESSION_INTERLACED :
- Z_EROFS_COMPRESSION_SHIFTED;
+ if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER)
+ map->m_algorithmformat = Z_EROFS_COMPRESSION_INTERLACED;
+ else
+ map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
+ } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
+ map->m_algorithmformat = vi->z_algorithmtype[1];
} else {
- afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ?
- vi->z_algorithmtype[1] : vi->z_algorithmtype[0];
- if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) {
- erofs_err(sb, "inconsistent algorithmtype %u for nid %llu",
- afmt, vi->nid);
- err = -EFSCORRUPTED;
- goto unmap_out;
- }
+ map->m_algorithmformat = vi->z_algorithmtype[0];
}
- map->m_algorithmformat = afmt;
if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
((flags & EROFS_GET_BLOCKS_READMORE) &&
@@ -543,6 +513,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode,
unsigned int recsz = z_erofs_extent_recsize(vi->z_advise);
erofs_off_t pos = round_up(Z_EROFS_MAP_HEADER_END(erofs_iloc(inode) +
vi->inode_isize + vi->xattr_isize), recsz);
+ bool in_mbox = erofs_inode_in_metabox(inode);
erofs_off_t lend = inode->i_size;
erofs_off_t l, r, mid, pa, la, lstart;
struct z_erofs_extent *ext;
@@ -552,19 +523,20 @@ static int z_erofs_map_blocks_ext(struct inode *inode,
map->m_flags = 0;
if (recsz <= offsetof(struct z_erofs_extent, pstart_hi)) {
if (recsz <= offsetof(struct z_erofs_extent, pstart_lo)) {
- ext = erofs_read_metabuf(&map->buf, sb, pos, true);
+ ext = erofs_read_metabuf(&map->buf, sb, pos, in_mbox);
if (IS_ERR(ext))
return PTR_ERR(ext);
pa = le64_to_cpu(*(__le64 *)ext);
pos += sizeof(__le64);
lstart = 0;
} else {
- lstart = map->m_la >> vi->z_lclusterbits;
+ lstart = round_down(map->m_la, 1 << vi->z_lclusterbits);
+ pos += (lstart >> vi->z_lclusterbits) * recsz;
pa = EROFS_NULL_ADDR;
}
for (; lstart <= map->m_la; lstart += 1 << vi->z_lclusterbits) {
- ext = erofs_read_metabuf(&map->buf, sb, pos, true);
+ ext = erofs_read_metabuf(&map->buf, sb, pos, in_mbox);
if (IS_ERR(ext))
return PTR_ERR(ext);
map->m_plen = le32_to_cpu(ext->plen);
@@ -584,7 +556,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode,
for (l = 0, r = vi->z_extents; l < r; ) {
mid = l + (r - l) / 2;
ext = erofs_read_metabuf(&map->buf, sb,
- pos + mid * recsz, true);
+ pos + mid * recsz, in_mbox);
if (IS_ERR(ext))
return PTR_ERR(ext);
@@ -596,6 +568,10 @@ static int z_erofs_map_blocks_ext(struct inode *inode,
if (la > map->m_la) {
r = mid;
+ if (la > lend) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
lend = la;
} else {
l = mid + 1;
@@ -612,11 +588,11 @@ static int z_erofs_map_blocks_ext(struct inode *inode,
if (lstart < lend) {
map->m_la = lstart;
if (last && (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) {
- map->m_flags |= EROFS_MAP_MAPPED | EROFS_MAP_FRAGMENT;
+ map->m_flags = EROFS_MAP_FRAGMENT;
vi->z_fragmentoff = map->m_plen;
- if (recsz >= offsetof(struct z_erofs_extent, pstart_lo))
+ if (recsz > offsetof(struct z_erofs_extent, pstart_lo))
vi->z_fragmentoff |= map->m_pa << 32;
- } else if (map->m_plen) {
+ } else if (map->m_plen & Z_EROFS_EXTENT_PLEN_MASK) {
map->m_flags |= EROFS_MAP_MAPPED |
EROFS_MAP_FULL_MAPPED | EROFS_MAP_ENCODED;
fmt = map->m_plen >> Z_EROFS_EXTENT_PLEN_FMT_BIT;
@@ -634,23 +610,16 @@ static int z_erofs_map_blocks_ext(struct inode *inode,
}
}
map->m_llen = lend - map->m_la;
- if (!last && map->m_llen < sb->s_blocksize) {
- erofs_err(sb, "extent too small %llu @ offset %llu of nid %llu",
- map->m_llen, map->m_la, vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
return 0;
}
-static int z_erofs_fill_inode_lazy(struct inode *inode)
+static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map)
{
struct erofs_inode *const vi = EROFS_I(inode);
struct super_block *const sb = inode->i_sb;
- int err, headnr;
- erofs_off_t pos;
- struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct z_erofs_map_header *h;
+ erofs_off_t pos;
+ int err = 0;
if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) {
/*
@@ -664,12 +633,11 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE))
return -ERESTARTSYS;
- err = 0;
if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
goto out_unlock;
pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
- h = erofs_read_metabuf(&buf, sb, pos, true);
+ h = erofs_read_metabuf(&map->buf, sb, pos, erofs_inode_in_metabox(inode));
if (IS_ERR(h)) {
err = PTR_ERR(h);
goto out_unlock;
@@ -701,22 +669,13 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
else if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER)
vi->z_idata_size = le16_to_cpu(h->h_idata_size);
- headnr = 0;
- if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
- vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) {
- erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
- headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
- err = -EOPNOTSUPP;
- goto out_put_metabuf;
- }
-
if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu",
vi->nid);
err = -EFSCORRUPTED;
- goto out_put_metabuf;
+ goto out_unlock;
}
if (vi->datalayout == EROFS_INODE_COMPRESSED_COMPACT &&
!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^
@@ -724,32 +683,59 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu",
vi->nid);
err = -EFSCORRUPTED;
- goto out_put_metabuf;
+ goto out_unlock;
}
if (vi->z_idata_size ||
(vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) {
- struct erofs_map_blocks map = {
+ struct erofs_map_blocks tm = {
.buf = __EROFS_BUF_INITIALIZER
};
- err = z_erofs_map_blocks_fo(inode, &map,
+ err = z_erofs_map_blocks_fo(inode, &tm,
EROFS_GET_BLOCKS_FINDTAIL);
- erofs_put_metabuf(&map.buf);
+ erofs_put_metabuf(&tm.buf);
if (err < 0)
- goto out_put_metabuf;
+ goto out_unlock;
}
done:
/* paired with smp_mb() at the beginning of the function */
smp_mb();
set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
-out_put_metabuf:
- erofs_put_metabuf(&buf);
out_unlock:
clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags);
return err;
}
+static int z_erofs_map_sanity_check(struct inode *inode,
+ struct erofs_map_blocks *map)
+{
+ struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+ u64 pend;
+
+ if (!(map->m_flags & EROFS_MAP_ENCODED))
+ return 0;
+ if (unlikely(map->m_algorithmformat >= Z_EROFS_COMPRESSION_RUNTIME_MAX)) {
+ erofs_err(inode->i_sb, "unknown algorithm %d @ pos %llu for nid %llu, please upgrade kernel",
+ map->m_algorithmformat, map->m_la, EROFS_I(inode)->nid);
+ return -EOPNOTSUPP;
+ }
+ if (unlikely(map->m_algorithmformat < Z_EROFS_COMPRESSION_MAX &&
+ !(sbi->available_compr_algs & (1 << map->m_algorithmformat)))) {
+ erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu",
+ map->m_algorithmformat, EROFS_I(inode)->nid);
+ return -EFSCORRUPTED;
+ }
+ if (unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
+ map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE))
+ return -EOPNOTSUPP;
+ /* Filesystems beyond 48-bit physical block addresses are invalid */
+ if (unlikely(check_add_overflow(map->m_pa, map->m_plen, &pend) ||
+ (pend >> sbi->blkszbits) >= BIT_ULL(48)))
+ return -EFSCORRUPTED;
+ return 0;
+}
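
The new z_erofs_map_sanity_check() centralizes validation that used to be scattered across callers: unknown or unavailable algorithms, oversized pclusters, and physical extents that would overflow the 48-bit block address space. A minimal standalone sketch of the overflow-guarded bound check, assuming the GCC/Clang __builtin_add_overflow() builtin that check_add_overflow() wraps; blkszbits here is a hypothetical parameter:

/* Hedged sketch: bound-check a (pa, plen) physical byte range against a
 * 48-bit block address space, mirroring the check above. */
#include <stdbool.h>
#include <stdint.h>

static bool pa_range_valid(uint64_t pa, uint64_t plen, unsigned int blkszbits)
{
	uint64_t pend;

	if (__builtin_add_overflow(pa, plen, &pend))
		return false;	/* wrapped around 64 bits */
	return (pend >> blkszbits) < (1ULL << 48);
}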
+
int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
int flags)
{
@@ -762,7 +748,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
map->m_la = inode->i_size;
map->m_flags = 0;
} else {
- err = z_erofs_fill_inode_lazy(inode);
+ err = z_erofs_fill_inode(inode, map);
if (!err) {
if (vi->datalayout == EROFS_INODE_COMPRESSED_FULL &&
(vi->z_advise & Z_EROFS_ADVISE_EXTENTS))
@@ -770,10 +756,8 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
else
err = z_erofs_map_blocks_fo(inode, map, flags);
}
- if (!err && (map->m_flags & EROFS_MAP_ENCODED) &&
- unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
- map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE))
- err = -EOPNOTSUPP;
+ if (!err)
+ err = z_erofs_map_sanity_check(inode, map);
if (err)
map->m_llen = 0;
}
@@ -798,7 +782,7 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset,
iomap->length = map.m_llen;
if (map.m_flags & EROFS_MAP_MAPPED) {
iomap->type = IOMAP_MAPPED;
- iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ?
+ iomap->addr = map.m_flags & __EROFS_MAP_FRAGMENT ?
IOMAP_NULL_ADDR : map.m_pa;
} else {
iomap->type = IOMAP_HOLE;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index af42b2c7d235..3219e0d596fe 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -378,9 +378,7 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
static int do_eventfd(unsigned int count, int flags)
{
- struct eventfd_ctx *ctx;
- struct file *file;
- int fd;
+ struct eventfd_ctx *ctx __free(kfree) = NULL;
/* Check the EFD_* constants for consistency. */
BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
@@ -398,26 +396,19 @@ static int do_eventfd(unsigned int count, int flags)
init_waitqueue_head(&ctx->wqh);
ctx->count = count;
ctx->flags = flags;
- ctx->id = ida_alloc(&eventfd_ida, GFP_KERNEL);
flags &= EFD_SHARED_FCNTL_FLAGS;
flags |= O_RDWR;
- fd = get_unused_fd_flags(flags);
- if (fd < 0)
- goto err;
-
- file = anon_inode_getfile_fmode("[eventfd]", &eventfd_fops,
- ctx, flags, FMODE_NOWAIT);
- if (IS_ERR(file)) {
- put_unused_fd(fd);
- fd = PTR_ERR(file);
- goto err;
- }
- fd_install(fd, file);
- return fd;
-err:
- eventfd_free_ctx(ctx);
- return fd;
+
+ FD_PREPARE(fdf, flags,
+ anon_inode_getfile_fmode("[eventfd]", &eventfd_fops, ctx,
+ flags, FMODE_NOWAIT));
+ if (fdf.err)
+ return fdf.err;
+
+ ctx->id = ida_alloc(&eventfd_ida, GFP_KERNEL);
+ retain_and_null_ptr(ctx);
+ return fd_publish(fdf);
}
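
The rewrite routes eventfd descriptor setup through the FD_PREPARE()/fd_publish() helpers without changing what userspace observes. For context, a minimal userspace sketch of the eventfd(2) counter semantics the resulting descriptor provides (this uses only the long-standing public API, not the new kernel helpers):

/* Userspace sketch of eventfd(2) semantics: writes add to a 64-bit
 * counter, reads return and reset it (without EFD_SEMAPHORE). */
#include <stdint.h>
#include <stdio.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	uint64_t val = 3, out;
	int fd = eventfd(0, EFD_CLOEXEC);

	if (fd < 0)
		return 1;
	write(fd, &val, sizeof(val));	/* counter += 3 */
	read(fd, &out, sizeof(out));	/* out == 3, counter resets to 0 */
	printf("read back %llu\n", (unsigned long long)out);
	close(fd);
	return 0;
}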
SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 100376863a44..6c36d9dc6926 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -46,10 +46,10 @@
*
* 1) epnested_mutex (mutex)
* 2) ep->mtx (mutex)
- * 3) ep->lock (rwlock)
+ * 3) ep->lock (spinlock)
*
* The acquire order is the one listed above, from 1 to 3.
- * We need a rwlock (ep->lock) because we manipulate objects
+ * We need a spinlock (ep->lock) because we manipulate objects
* from inside the poll callback, that might be triggered from
* a wake_up() that in turn might be called from IRQ context.
* So we can't sleep inside the poll callback and hence we need
@@ -195,7 +195,7 @@ struct eventpoll {
struct list_head rdllist;
/* Lock which protects rdllist and ovflist */
- rwlock_t lock;
+ spinlock_t lock;
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
@@ -218,6 +218,7 @@ struct eventpoll {
/* used to optimize loop detection check */
u64 gen;
struct hlist_head refs;
+ u8 loop_check_depth;
/*
* usage count, used together with epitem->dying to
@@ -740,10 +741,10 @@ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
* in a lockless way.
*/
lockdep_assert_irqs_enabled();
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
list_splice_init(&ep->rdllist, txlist);
WRITE_ONCE(ep->ovflist, NULL);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
static void ep_done_scan(struct eventpoll *ep,
@@ -751,7 +752,7 @@ static void ep_done_scan(struct eventpoll *ep,
{
struct epitem *epi, *nepi;
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
@@ -792,7 +793,7 @@ static void ep_done_scan(struct eventpoll *ep,
wake_up(&ep->wq);
}
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
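
With ep->lock demoted from an rwlock to a plain spinlock, the ready list is mutated only under the lock, so the lockless cmpxchg()/xchg() insertion helpers removed further below become unnecessary. A hedged sketch of the locked, IRQ-safe pattern that replaces them; the names are illustrative, not the kernel's:

#include <linux/list.h>
#include <linux/spinlock.h>

/* Hedged sketch of the locked ready-list add replacing the old
 * list_add_tail_lockless(); illustrative names only. */
static void ready_list_add(spinlock_t *lock, struct list_head *item,
			   struct list_head *rdllist)
{
	unsigned long flags;

	spin_lock_irqsave(lock, flags);		/* callable from IRQ context */
	if (list_empty(item))			/* not queued yet */
		list_add_tail(item, rdllist);
	spin_unlock_irqrestore(lock, flags);
}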
static void ep_get(struct eventpoll *ep)
@@ -867,10 +868,10 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
rb_erase_cached(&epi->rbn, &ep->rbr);
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
/*
@@ -883,7 +884,7 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
kfree_rcu(epi, rcu);
percpu_counter_dec(&ep->user->epoll_watches);
- return ep_refcount_dec_and_test(ep);
+ return true;
}
/*
@@ -891,14 +892,14 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
*/
static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
{
- WARN_ON_ONCE(__ep_remove(ep, epi, false));
+ if (__ep_remove(ep, epi, false))
+ WARN_ON_ONCE(ep_refcount_dec_and_test(ep));
}
static void ep_clear_and_put(struct eventpoll *ep)
{
struct rb_node *rbp, *next;
struct epitem *epi;
- bool dispose;
/* We need to release all tasks waiting for these file */
if (waitqueue_active(&ep->poll_wait))
@@ -931,10 +932,8 @@ static void ep_clear_and_put(struct eventpoll *ep)
cond_resched();
}
- dispose = ep_refcount_dec_and_test(ep);
mutex_unlock(&ep->mtx);
-
- if (dispose)
+ if (ep_refcount_dec_and_test(ep))
ep_free(ep);
}
@@ -1137,7 +1136,7 @@ again:
dispose = __ep_remove(ep, epi, true);
mutex_unlock(&ep->mtx);
- if (dispose)
+ if (dispose && ep_refcount_dec_and_test(ep))
ep_free(ep);
goto again;
}
@@ -1153,7 +1152,7 @@ static int ep_alloc(struct eventpoll **pep)
return -ENOMEM;
mutex_init(&ep->mtx);
- rwlock_init(&ep->lock);
+ spin_lock_init(&ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
@@ -1241,99 +1240,9 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
#endif /* CONFIG_KCMP */
/*
- * Adds a new entry to the tail of the list in a lockless way, i.e.
- * multiple CPUs are allowed to call this function concurrently.
- *
- * Beware: it is necessary to prevent any other modifications of the
- * existing list until all changes are completed, in other words
- * concurrent list_add_tail_lockless() calls should be protected
- * with a read lock, where write lock acts as a barrier which
- * makes sure all list_add_tail_lockless() calls are fully
- * completed.
- *
- * Also an element can be locklessly added to the list only in one
- * direction i.e. either to the tail or to the head, otherwise
- * concurrent access will corrupt the list.
- *
- * Return: %false if element has been already added to the list, %true
- * otherwise.
- */
-static inline bool list_add_tail_lockless(struct list_head *new,
- struct list_head *head)
-{
- struct list_head *prev;
-
- /*
- * This is simple 'new->next = head' operation, but cmpxchg()
- * is used in order to detect that same element has been just
- * added to the list from another CPU: the winner observes
- * new->next == new.
- */
- if (!try_cmpxchg(&new->next, &new, head))
- return false;
-
- /*
- * Initially ->next of a new element must be updated with the head
- * (we are inserting to the tail) and only then pointers are atomically
- * exchanged. XCHG guarantees memory ordering, thus ->next should be
- * updated before pointers are actually swapped and pointers are
- * swapped before prev->next is updated.
- */
-
- prev = xchg(&head->prev, new);
-
- /*
- * It is safe to modify prev->next and new->prev, because a new element
- * is added only to the tail and new->next is updated before XCHG.
- */
-
- prev->next = new;
- new->prev = prev;
-
- return true;
-}
-
-/*
- * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
- * i.e. multiple CPUs are allowed to call this function concurrently.
- *
- * Return: %false if epi element has been already chained, %true otherwise.
- */
-static inline bool chain_epi_lockless(struct epitem *epi)
-{
- struct eventpoll *ep = epi->ep;
-
- /* Fast preliminary check */
- if (epi->next != EP_UNACTIVE_PTR)
- return false;
-
- /* Check that the same epi has not been just chained from another CPU */
- if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
- return false;
-
- /* Atomically exchange tail */
- epi->next = xchg(&ep->ovflist, epi);
-
- return true;
-}
-
-/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
- *
- * This callback takes a read lock in order not to contend with concurrent
- * events from another file descriptor, thus all modifications to ->rdllist
- * or ->ovflist are lockless. Read lock is paired with the write lock from
- * ep_start/done_scan(), which stops all list modifications and guarantees
- * that lists state is seen correctly.
- *
- * Another thing worth to mention is that ep_poll_callback() can be called
- * concurrently for the same @epi from different CPUs if poll table was inited
- * with several wait queues entries. Plural wakeup from different CPUs of a
- * single wait queue is serialized by wq.lock, but the case when multiple wait
- * queues are used should be detected accordingly. This is detected using
- * cmpxchg() operation.
*/
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
@@ -1344,7 +1253,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
unsigned long flags;
int ewake = 0;
- read_lock_irqsave(&ep->lock, flags);
+ spin_lock_irqsave(&ep->lock, flags);
ep_set_busy_poll_napi_id(epi);
@@ -1373,12 +1282,15 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* chained in ep->ovflist and requeued later on.
*/
if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
- if (chain_epi_lockless(epi))
+ if (epi->next == EP_UNACTIVE_PTR) {
+ epi->next = READ_ONCE(ep->ovflist);
+ WRITE_ONCE(ep->ovflist, epi);
ep_pm_stay_awake_rcu(epi);
+ }
} else if (!ep_is_linked(epi)) {
/* In the usual case, add event to ready list. */
- if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
- ep_pm_stay_awake_rcu(epi);
+ list_add_tail(&epi->rdllink, &ep->rdllist);
+ ep_pm_stay_awake_rcu(epi);
}
/*
@@ -1411,7 +1323,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
pwake++;
out_unlock:
- read_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
@@ -1746,7 +1658,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
}
/* We have to drop the new item inside our item list to keep track of it */
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
@@ -1763,7 +1675,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
pwake++;
}
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
/* We have to call this outside the lock */
if (pwake)
@@ -1827,7 +1739,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* list, push it inside.
*/
if (ep_item_poll(epi, &pt, 1)) {
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
@@ -1838,7 +1750,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
/* We have to call this outside the lock */
@@ -1996,6 +1908,14 @@ static int ep_try_send_events(struct eventpoll *ep,
return res;
}
+static int ep_schedule_timeout(ktime_t *to)
+{
+ if (to)
+ return ktime_after(*to, ktime_get());
+ else
+ return 1;
+}
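
ep_schedule_timeout() guards the sleep below it: with an absolute deadline that has already passed, the caller records an immediate timeout instead of entering schedule_hrtimeout_range(). The userspace-visible contract is simply that a timed epoll_wait() returns 0 once the timeout elapses; a minimal sketch:

/* Userspace sketch: epoll_wait() with a timeout must return 0 (no
 * events) once the timeout elapses, even under wakeup races. */
#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
	struct epoll_event ev;
	int ep = epoll_create1(0);
	int n;

	if (ep < 0)
		return 1;
	n = epoll_wait(ep, &ev, 1, 10);		/* 10 ms, no fds registered */
	printf("epoll_wait returned %d\n", n);	/* expect 0: timed out */
	close(ep);
	return 0;
}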
+
/**
* ep_poll - Retrieves ready events, and delivers them to the caller-supplied
* event buffer.
@@ -2082,7 +2002,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
init_wait(&wait);
wait.func = ep_autoremove_wake_function;
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/*
* Barrierless variant, waitqueue_active() is called under
* the same lock on wakeup ep_poll_callback() side, so it
@@ -2101,11 +2021,12 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
if (!eavail)
__add_wait_queue_exclusive(&ep->wq, &wait);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
if (!eavail)
- timed_out = !schedule_hrtimeout_range(to, slack,
- HRTIMER_MODE_ABS);
+ timed_out = !ep_schedule_timeout(to) ||
+ !schedule_hrtimeout_range(to, slack,
+ HRTIMER_MODE_ABS);
__set_current_state(TASK_RUNNING);
/*
@@ -2116,7 +2037,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
eavail = 1;
if (!list_empty_careful(&wait.entry)) {
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/*
* If the thread timed out and is not on the wait queue,
* it means that the thread was woken up after its
@@ -2127,29 +2048,30 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
if (timed_out)
eavail = list_empty(&wait.entry);
__remove_wait_queue(&ep->wq, &wait);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
}
}
/**
- * ep_loop_check_proc - verify that adding an epoll file inside another
- * epoll structure does not violate the constraints, in
- * terms of closed loops, or too deep chains (which can
- * result in excessive stack usage).
+ * ep_loop_check_proc - verify that adding an epoll file @ep inside another
+ * epoll file does not create closed loops, and
+ * determine the depth of the subtree starting at @ep
*
* @ep: the &struct eventpoll to be currently checked.
* @depth: Current depth of the path being checked.
*
- * Return: %zero if adding the epoll @file inside current epoll
- * structure @ep does not violate the constraints, or %-1 otherwise.
+ * Return: depth of the subtree, or INT_MAX if we found a loop or went too deep.
*/
static int ep_loop_check_proc(struct eventpoll *ep, int depth)
{
- int error = 0;
+ int result = 0;
struct rb_node *rbp;
struct epitem *epi;
+ if (ep->gen == loop_check_gen)
+ return ep->loop_check_depth;
+
mutex_lock_nested(&ep->mtx, depth + 1);
ep->gen = loop_check_gen;
for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
@@ -2157,13 +2079,11 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
if (unlikely(is_file_epoll(epi->ffd.file))) {
struct eventpoll *ep_tovisit;
ep_tovisit = epi->ffd.file->private_data;
- if (ep_tovisit->gen == loop_check_gen)
- continue;
if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
- error = -1;
+ result = INT_MAX;
else
- error = ep_loop_check_proc(ep_tovisit, depth + 1);
- if (error != 0)
+ result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1);
+ if (result > EP_MAX_NESTS)
break;
} else {
/*
@@ -2177,9 +2097,25 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
list_file(epi->ffd.file);
}
}
+ ep->loop_check_depth = result;
mutex_unlock(&ep->mtx);
- return error;
+ return result;
+}
+
+/* ep_get_upwards_depth_proc - determine depth of @ep when traversed upwards */
+static int ep_get_upwards_depth_proc(struct eventpoll *ep, int depth)
+{
+ int result = 0;
+ struct epitem *epi;
+
+ if (ep->gen == loop_check_gen)
+ return ep->loop_check_depth;
+ hlist_for_each_entry_rcu(epi, &ep->refs, fllink)
+ result = max(result, ep_get_upwards_depth_proc(epi->ep, depth + 1) + 1);
+ ep->gen = loop_check_gen;
+ ep->loop_check_depth = result;
+ return result;
}
/**
@@ -2195,8 +2131,22 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
*/
static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
{
+ int depth, upwards_depth;
+
inserting_into = ep;
- return ep_loop_check_proc(to, 0);
+ /*
+ * Check how deep down we can get from @to, and whether it is possible
+ * to loop up to @ep.
+ */
+ depth = ep_loop_check_proc(to, 0);
+ if (depth > EP_MAX_NESTS)
+ return -1;
+ /* Check how far up we can go from @ep. */
+ rcu_read_lock();
+ upwards_depth = ep_get_upwards_depth_proc(ep, 0);
+ rcu_read_unlock();
+
+	return (depth + 1 + upwards_depth > EP_MAX_NESTS) ? -1 : 0;
}
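
ep_loop_check() now combines the downward depth from the insertion target with the upward depth above the inserted epoll, caching per-generation results in loop_check_depth. From userspace, a rejected insertion (a cycle, or depth beyond EP_MAX_NESTS) surfaces as EPOLL_CTL_ADD failing with ELOOP; a minimal sketch:

/* Userspace sketch: nesting two epoll fds into each other trips the
 * loop check and the second EPOLL_CTL_ADD fails with ELOOP. */
#include <errno.h>
#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
	int a = epoll_create1(0), b = epoll_create1(0);
	struct epoll_event ev = { .events = EPOLLIN };

	epoll_ctl(a, EPOLL_CTL_ADD, b, &ev);	/* a watches b: allowed */
	if (epoll_ctl(b, EPOLL_CTL_ADD, a, &ev) < 0 && errno == ELOOP)
		printf("cycle rejected with ELOOP\n");	/* b->a->b loops */
	close(a);
	close(b);
	return 0;
}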
static void clear_tfile_check_list(void)
@@ -2215,9 +2165,8 @@ static void clear_tfile_check_list(void)
*/
static int do_epoll_create(int flags)
{
- int error, fd;
- struct eventpoll *ep = NULL;
- struct file *file;
+ int error;
+ struct eventpoll *ep;
/* Check the EPOLL_* constant for consistency. */
BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
@@ -2234,26 +2183,15 @@ static int do_epoll_create(int flags)
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure and a free file descriptor.
*/
- fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
- if (fd < 0) {
- error = fd;
- goto out_free_ep;
- }
- file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
- O_RDWR | (flags & O_CLOEXEC));
- if (IS_ERR(file)) {
- error = PTR_ERR(file);
- goto out_free_fd;
+ FD_PREPARE(fdf, O_RDWR | (flags & O_CLOEXEC),
+ anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
+ O_RDWR | (flags & O_CLOEXEC)));
+ if (fdf.err) {
+ ep_clear_and_put(ep);
+ return fdf.err;
}
- ep->file = file;
- fd_install(fd, file);
- return fd;
-
-out_free_fd:
- put_unused_fd(fd);
-out_free_ep:
- ep_clear_and_put(ep);
- return error;
+ ep->file = fd_prepare_file(fdf);
+ return fd_publish(fdf);
}
SYSCALL_DEFINE1(epoll_create1, int, flags)
diff --git a/fs/exec.c b/fs/exec.c
index f45859ad13ac..9d5ebc9d15b0 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -78,6 +78,9 @@
#include <trace/events/sched.h>
+/* For vma exec functions. */
+#include "../mm/internal.h"
+
static int bprm_creds_from_file(struct linux_binprm *bprm);
int suid_dumpable = 0;
@@ -111,70 +114,13 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
bool path_noexec(const struct path *path)
{
+	/* If it's an anonymous inode, make sure that we catch any shenanigans. */
+ VFS_WARN_ON_ONCE(IS_ANON_FILE(d_inode(path->dentry)) &&
+ !(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC));
return (path->mnt->mnt_flags & MNT_NOEXEC) ||
(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
}
-#ifdef CONFIG_USELIB
-/*
- * Note that a shared library must be both readable and executable due to
- * security reasons.
- *
- * Also note that we take the address to load from the file itself.
- */
-SYSCALL_DEFINE1(uselib, const char __user *, library)
-{
- struct linux_binfmt *fmt;
- struct file *file;
- struct filename *tmp = getname(library);
- int error = PTR_ERR(tmp);
- static const struct open_flags uselib_flags = {
- .open_flag = O_LARGEFILE | O_RDONLY,
- .acc_mode = MAY_READ | MAY_EXEC,
- .intent = LOOKUP_OPEN,
- .lookup_flags = LOOKUP_FOLLOW,
- };
-
- if (IS_ERR(tmp))
- goto out;
-
- file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
- putname(tmp);
- error = PTR_ERR(file);
- if (IS_ERR(file))
- goto out;
-
- /*
- * Check do_open_execat() for an explanation.
- */
- error = -EACCES;
- if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) ||
- path_noexec(&file->f_path))
- goto exit;
-
- error = -ENOEXEC;
-
- read_lock(&binfmt_lock);
- list_for_each_entry(fmt, &formats, lh) {
- if (!fmt->load_shlib)
- continue;
- if (!try_module_get(fmt->module))
- continue;
- read_unlock(&binfmt_lock);
- error = fmt->load_shlib(file);
- read_lock(&binfmt_lock);
- put_binfmt(fmt);
- if (error != -ENOEXEC)
- break;
- }
- read_unlock(&binfmt_lock);
-exit:
- fput(file);
-out:
- return error;
-}
-#endif /* #ifdef CONFIG_USELIB */
-
#ifdef CONFIG_MMU
/*
* The nascent bprm->mm is not visible until exec_mmap() but it can
@@ -242,60 +188,6 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
flush_cache_page(bprm->vma, pos, page_to_pfn(page));
}
-static int __bprm_mm_init(struct linux_binprm *bprm)
-{
- int err;
- struct vm_area_struct *vma = NULL;
- struct mm_struct *mm = bprm->mm;
-
- bprm->vma = vma = vm_area_alloc(mm);
- if (!vma)
- return -ENOMEM;
- vma_set_anonymous(vma);
-
- if (mmap_write_lock_killable(mm)) {
- err = -EINTR;
- goto err_free;
- }
-
- /*
- * Need to be called with mmap write lock
- * held, to avoid race with ksmd.
- */
- err = ksm_execve(mm);
- if (err)
- goto err_ksm;
-
- /*
- * Place the stack at the largest stack address the architecture
- * supports. Later, we'll move this to an appropriate place. We don't
- * use STACK_TOP because that can depend on attributes which aren't
- * configured yet.
- */
- BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
- vma->vm_end = STACK_TOP_MAX;
- vma->vm_start = vma->vm_end - PAGE_SIZE;
- vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
- vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
-
- err = insert_vm_struct(mm, vma);
- if (err)
- goto err;
-
- mm->stack_vm = mm->total_vm = 1;
- mmap_write_unlock(mm);
- bprm->p = vma->vm_end - sizeof(void *);
- return 0;
-err:
- ksm_exit(mm);
-err_ksm:
- mmap_write_unlock(mm);
-err_free:
- bprm->vma = NULL;
- vm_area_free(vma);
- return err;
-}
-
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
return len <= MAX_ARG_STRLEN;
@@ -348,12 +240,6 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
{
}
-static int __bprm_mm_init(struct linux_binprm *bprm)
-{
- bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
- return 0;
-}
-
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
return len <= bprm->p;
@@ -382,9 +268,13 @@ static int bprm_mm_init(struct linux_binprm *bprm)
bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
task_unlock(current->group_leader);
- err = __bprm_mm_init(bprm);
+#ifndef CONFIG_MMU
+ bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
+#else
+ err = create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p);
if (err)
goto err;
+#endif
return 0;
@@ -709,12 +599,12 @@ int setup_arg_pages(struct linux_binprm *bprm,
unsigned long stack_top,
int executable_stack)
{
- unsigned long ret;
+ int ret;
unsigned long stack_shift;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = bprm->vma;
struct vm_area_struct *prev = NULL;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
unsigned long stack_base;
unsigned long stack_size;
unsigned long stack_expand;
@@ -894,13 +784,15 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
if (IS_ERR(file))
return file;
+ if (path_noexec(&file->f_path))
+ return ERR_PTR(-EACCES);
+
/*
* In the past the regular type check was here. It moved to may_open() in
* 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is
* an invariant that all non-regular files error out before we get here.
*/
- if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) ||
- path_noexec(&file->f_path))
+ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)))
return ERR_PTR(-EACCES);
err = exe_file_deny_write_access(file);
@@ -1227,13 +1119,12 @@ int begin_new_exec(struct linux_binprm * bprm)
*/
bprm->point_of_no_return = true;
- /*
- * Make this the only thread in the thread group.
- */
+ /* Make this the only thread in the thread group */
retval = de_thread(me);
if (retval)
goto out;
-
+ /* see the comment in check_unsafe_exec() */
+ current->fs->in_exec = 0;
/*
* Cancel any io_uring activity across execve
*/
@@ -1389,10 +1280,9 @@ int begin_new_exec(struct linux_binprm * bprm)
/* Pass the opened binary to the interpreter. */
if (bprm->have_execfd) {
- retval = get_unused_fd_flags(0);
+ retval = FD_ADD(0, bprm->executable);
if (retval < 0)
goto out_unlock;
- fd_install(retval, bprm->executable);
bprm->executable = NULL;
bprm->execfd = retval;
}
@@ -1495,6 +1385,8 @@ static void free_bprm(struct linux_binprm *bprm)
}
free_arg_pages(bprm);
if (bprm->cred) {
+ /* in case exec fails before de_thread() succeeds */
+ current->fs->in_exec = 0;
mutex_unlock(&current->signal->cred_guard_mutex);
abort_creds(bprm->cred);
}
@@ -1616,9 +1508,13 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
* suid exec because the differently privileged task
* will be able to manipulate the current directory, etc.
* It would be nice to force an unshare instead...
+ *
+ * Otherwise we set fs->in_exec = 1 to deny clone(CLONE_FS)
+	 * from another sub-thread until de_thread() succeeds; this
+	 * state is protected by the cred_guard_mutex we hold.
*/
n_fs = 1;
- spin_lock(&p->fs->lock);
+ read_seqlock_excl(&p->fs->seq);
rcu_read_lock();
for_other_threads(p, t) {
if (t->fs == p->fs)
@@ -1631,7 +1527,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
bprm->unsafe |= LSM_UNSAFE_SHARE;
else
p->fs->in_exec = 1;
- spin_unlock(&p->fs->lock);
+ read_sequnlock_excl(&p->fs->seq);
}
static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
@@ -1859,10 +1755,9 @@ static int bprm_execve(struct linux_binprm *bprm)
goto out;
sched_mm_cid_after_execve(current);
+ rseq_execve(current);
/* execve succeeded */
- current->fs->in_exec = 0;
current->in_execve = 0;
- rseq_execve(current);
user_events_execve(current);
acct_update_integrals(current);
task_numa_free(current, false);
@@ -1879,7 +1774,7 @@ out:
force_fatal_sig(SIGSEGV);
sched_mm_cid_after_execve(current);
- current->fs->in_exec = 0;
+ rseq_force_update();
current->in_execve = 0;
return retval;
@@ -2103,7 +1998,7 @@ void set_dumpable(struct mm_struct *mm, int value)
if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
return;
- set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
+ __mm_flags_set_mask_dumpable(mm, value);
}
SYSCALL_DEFINE3(execve,
@@ -2152,7 +2047,7 @@ static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int writ
{
int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!error)
+ if (!error && write)
validate_coredump_safety();
return error;
}
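
The coredump sysctl handler now re-runs validate_coredump_safety() only after a successful write, since a read cannot change the value. A hedged sketch of the general pattern, with revalidate_settings() as a hypothetical stand-in for the side effect:

/* Hedged sketch of the handler pattern above: trigger side effects only
 * on a successful write; reads leave state untouched. */
static void revalidate_settings(void);	/* hypothetical */

static int my_dointvec(const struct ctl_table *table, int write,
		       void *buffer, size_t *lenp, loff_t *ppos)
{
	int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (!error && write)
		revalidate_settings();
	return error;
}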
diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c
index 9ff825f1502d..5429041c7eaf 100644
--- a/fs/exfat/balloc.c
+++ b/fs/exfat/balloc.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/bitmap.h>
#include <linux/buffer_head.h>
+#include <linux/backing-dev.h>
#include "exfat_raw.h"
#include "exfat_fs.h"
@@ -26,13 +27,58 @@
/*
* Allocation Bitmap Management Functions
*/
+static bool exfat_test_bitmap_range(struct super_block *sb, unsigned int clu,
+ unsigned int count)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ unsigned int start = clu;
+ unsigned int end = clu + count;
+ unsigned int ent_idx, i, b;
+ unsigned int bit_offset, bits_to_check;
+ __le_long *bitmap_le;
+ unsigned long mask, word;
+
+ if (!is_valid_cluster(sbi, start) || !is_valid_cluster(sbi, end - 1))
+ return false;
+
+ while (start < end) {
+ ent_idx = CLUSTER_TO_BITMAP_ENT(start);
+ i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx);
+ b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx);
+
+ bitmap_le = (__le_long *)sbi->vol_amap[i]->b_data;
+
+ /* Calculate how many bits we can check in the current word */
+ bit_offset = b % BITS_PER_LONG;
+ bits_to_check = min(end - start,
+ (unsigned int)(BITS_PER_LONG - bit_offset));
+
+ /* Create a bitmask for the range of bits to check */
+ if (bits_to_check >= BITS_PER_LONG)
+ mask = ~0UL;
+ else
+ mask = ((1UL << bits_to_check) - 1) << bit_offset;
+ word = lel_to_cpu(bitmap_le[b / BITS_PER_LONG]);
+
+ /* Check if all bits in the mask are set */
+ if ((word & mask) != mask)
+ return false;
+
+ start += bits_to_check;
+ }
+
+ return true;
+}
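
exfat_test_bitmap_range() verifies that a whole run of clusters is marked allocated by testing up to one bitmap word per iteration with a computed mask, instead of going bit by bit. A standalone sketch of the same masking idea over a flat in-memory bitmap (without the per-sector buffer_head indirection):

/* Hedged sketch: check that bits [start, start+count) are all set in a
 * flat bitmap, consuming up to one word per iteration, as above. */
#include <limits.h>
#include <stdbool.h>

#define WORD_BITS (sizeof(unsigned long) * CHAR_BIT)

static bool bits_all_set(const unsigned long *map, unsigned int start,
			 unsigned int count)
{
	while (count) {
		unsigned int off = start % WORD_BITS;
		unsigned int n = count < WORD_BITS - off ?
				 count : WORD_BITS - off;
		unsigned long mask = (n >= WORD_BITS) ? ~0UL :
				     ((1UL << n) - 1) << off;

		if ((map[start / WORD_BITS] & mask) != mask)
			return false;	/* at least one bit clear */
		start += n;
		count -= n;
	}
	return true;
}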
+
static int exfat_allocate_bitmap(struct super_block *sb,
struct exfat_dentry *ep)
{
struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct blk_plug plug;
long long map_size;
- unsigned int i, need_map_size;
+ unsigned int i, j, need_map_size;
sector_t sector;
+ unsigned int max_ra_count;
sbi->map_clu = le32_to_cpu(ep->dentry.bitmap.start_clu);
map_size = le64_to_cpu(ep->dentry.bitmap.size);
@@ -56,22 +102,37 @@ static int exfat_allocate_bitmap(struct super_block *sb,
return -ENOMEM;
sector = exfat_cluster_to_sector(sbi, sbi->map_clu);
+ max_ra_count = min(sb->s_bdi->ra_pages, sb->s_bdi->io_pages) <<
+ (PAGE_SHIFT - sb->s_blocksize_bits);
for (i = 0; i < sbi->map_sectors; i++) {
- sbi->vol_amap[i] = sb_bread(sb, sector + i);
- if (!sbi->vol_amap[i]) {
- /* release all buffers and free vol_amap */
- int j = 0;
-
- while (j < i)
- brelse(sbi->vol_amap[j++]);
-
- kvfree(sbi->vol_amap);
- sbi->vol_amap = NULL;
- return -EIO;
+ /* Trigger the next readahead in advance. */
+		if (max_ra_count && (i % max_ra_count) == 0) {
+ blk_start_plug(&plug);
+ for (j = i; j < min(max_ra_count, sbi->map_sectors - i) + i; j++)
+ sb_breadahead(sb, sector + j);
+ blk_finish_plug(&plug);
}
+
+ sbi->vol_amap[i] = sb_bread(sb, sector + i);
+ if (!sbi->vol_amap[i])
+ goto err_out;
}
+	if (!exfat_test_bitmap_range(sb, sbi->map_clu,
+				     EXFAT_B_TO_CLU_ROUND_UP(map_size, sbi)))
+ goto err_out;
+
return 0;
+
+err_out:
+ j = 0;
+ /* release all buffers and free vol_amap */
+ while (j < i)
+ brelse(sbi->vol_amap[j++]);
+
+ kvfree(sbi->vol_amap);
+ sbi->vol_amap = NULL;
+ return -EIO;
}
int exfat_load_bitmap(struct super_block *sb)
@@ -122,11 +183,10 @@ void exfat_free_bitmap(struct exfat_sb_info *sbi)
kvfree(sbi->vol_amap);
}
-int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync)
+int exfat_set_bitmap(struct super_block *sb, unsigned int clu, bool sync)
{
int i, b;
unsigned int ent_idx;
- struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
if (!is_valid_cluster(sbi, clu))
@@ -141,13 +201,11 @@ int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync)
return 0;
}
-int exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync)
+int exfat_clear_bitmap(struct super_block *sb, unsigned int clu, bool sync)
{
int i, b;
unsigned int ent_idx;
- struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- struct exfat_mount_options *opts = &sbi->options;
if (!is_valid_cluster(sbi, clu))
return -EIO;
@@ -163,20 +221,29 @@ int exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync)
exfat_update_bh(sbi->vol_amap[i], sync);
- if (opts->discard) {
- int ret_discard;
+ return 0;
+}
- ret_discard = sb_issue_discard(sb,
- exfat_cluster_to_sector(sbi, clu),
- (1 << sbi->sect_per_clus_bits), GFP_NOFS, 0);
+bool exfat_test_bitmap(struct super_block *sb, unsigned int clu)
+{
+ int i, b;
+ unsigned int ent_idx;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
- if (ret_discard == -EOPNOTSUPP) {
- exfat_err(sb, "discard not supported by device, disabling");
- opts->discard = 0;
- }
- }
+ if (!sbi->vol_amap)
+ return true;
- return 0;
+ if (!is_valid_cluster(sbi, clu))
+ return false;
+
+ ent_idx = CLUSTER_TO_BITMAP_ENT(clu);
+ i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx);
+ b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx);
+
+ if (!test_bit_le(b, sbi->vol_amap[i]->b_data))
+ return false;
+
+ return true;
}
/*
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index 3103b932b674..3045a58e124a 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -604,6 +604,11 @@ static int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir
if (ret)
return ret;
+ if (!exfat_test_bitmap(sb, clu)) {
+ exfat_err(sb, "failed to test cluster bit(%u)", clu);
+ return -EIO;
+ }
+
/* byte offset in cluster */
off = EXFAT_CLU_OFFSET(off, sbi);
@@ -996,6 +1001,7 @@ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei,
struct exfat_hint_femp candi_empty;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
int num_entries = exfat_calc_num_entries(p_uniname);
+ unsigned int clu_count = 0;
if (num_entries < 0)
return num_entries;
@@ -1133,6 +1139,10 @@ rewind:
} else {
if (exfat_get_next_cluster(sb, &clu.dir))
return -EIO;
+
+ /* break if the cluster chain includes a loop */
+ if (unlikely(++clu_count > EXFAT_DATA_CLUSTER_COUNT(sbi)))
+ goto not_found;
}
}
@@ -1195,6 +1205,7 @@ int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir)
int i, count = 0;
int dentries_per_clu;
unsigned int entry_type;
+ unsigned int clu_count = 0;
struct exfat_chain clu;
struct exfat_dentry *ep;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
@@ -1227,8 +1238,174 @@ int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir)
} else {
if (exfat_get_next_cluster(sb, &(clu.dir)))
return -EIO;
+
+ if (unlikely(++clu_count > sbi->used_clusters)) {
+ exfat_fs_error(sb, "FAT or bitmap is corrupted");
+ return -EIO;
+ }
+
}
}
return count;
}
+
+static int exfat_get_volume_label_dentry(struct super_block *sb,
+ struct exfat_entry_set_cache *es)
+{
+ int i;
+ int dentry = 0;
+ unsigned int type;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_hint_femp hint_femp;
+ struct exfat_inode_info *ei = EXFAT_I(sb->s_root->d_inode);
+ struct exfat_chain clu;
+ struct exfat_dentry *ep;
+ struct buffer_head *bh;
+
+ hint_femp.eidx = EXFAT_HINT_NONE;
+ exfat_chain_set(&clu, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
+
+ while (clu.dir != EXFAT_EOF_CLUSTER) {
+ for (i = 0; i < sbi->dentries_per_clu; i++, dentry++) {
+ ep = exfat_get_dentry(sb, &clu, i, &bh);
+ if (!ep)
+ return -EIO;
+
+ type = exfat_get_entry_type(ep);
+ if (hint_femp.eidx == EXFAT_HINT_NONE) {
+ if (type == TYPE_DELETED || type == TYPE_UNUSED) {
+ hint_femp.cur = clu;
+ hint_femp.eidx = dentry;
+ hint_femp.count = 1;
+ }
+ }
+
+ if (type == TYPE_UNUSED) {
+ brelse(bh);
+ goto not_found;
+ }
+
+ if (type != TYPE_VOLUME) {
+ brelse(bh);
+ continue;
+ }
+
+ memset(es, 0, sizeof(*es));
+ es->sb = sb;
+ es->bh = es->__bh;
+ es->bh[0] = bh;
+ es->num_bh = 1;
+ es->start_off = EXFAT_DEN_TO_B(i) % sb->s_blocksize;
+
+ return 0;
+ }
+
+ if (exfat_get_next_cluster(sb, &(clu.dir)))
+ return -EIO;
+ }
+
+not_found:
+ if (hint_femp.eidx == EXFAT_HINT_NONE) {
+ hint_femp.cur.dir = EXFAT_EOF_CLUSTER;
+ hint_femp.eidx = dentry;
+ hint_femp.count = 0;
+ }
+
+ ei->hint_femp = hint_femp;
+
+ return -ENOENT;
+}
+
+int exfat_read_volume_label(struct super_block *sb, struct exfat_uni_name *label_out)
+{
+ int ret, i;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_entry_set_cache es;
+ struct exfat_dentry *ep;
+
+ mutex_lock(&sbi->s_lock);
+
+ memset(label_out, 0, sizeof(*label_out));
+ ret = exfat_get_volume_label_dentry(sb, &es);
+ if (ret < 0) {
+ /*
+		 * ENOENT signifies that a volume label dentry doesn't exist.
+ * We will treat this as an empty volume label and not fail.
+ */
+ if (ret == -ENOENT)
+ ret = 0;
+
+ goto unlock;
+ }
+
+ ep = exfat_get_dentry_cached(&es, 0);
+ label_out->name_len = ep->dentry.volume_label.char_count;
+ if (label_out->name_len > EXFAT_VOLUME_LABEL_LEN) {
+ ret = -EIO;
+ exfat_put_dentry_set(&es, false);
+ goto unlock;
+ }
+
+ for (i = 0; i < label_out->name_len; i++)
+ label_out->name[i] = le16_to_cpu(ep->dentry.volume_label.volume_label[i]);
+
+ exfat_put_dentry_set(&es, false);
+unlock:
+ mutex_unlock(&sbi->s_lock);
+ return ret;
+}
+
+int exfat_write_volume_label(struct super_block *sb,
+ struct exfat_uni_name *label)
+{
+ int ret, i;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct inode *root_inode = sb->s_root->d_inode;
+ struct exfat_entry_set_cache es;
+ struct exfat_chain clu;
+ struct exfat_dentry *ep;
+
+ if (label->name_len > EXFAT_VOLUME_LABEL_LEN)
+ return -EINVAL;
+
+ mutex_lock(&sbi->s_lock);
+
+ ret = exfat_get_volume_label_dentry(sb, &es);
+ if (ret == -ENOENT) {
+ if (label->name_len == 0) {
+ /* No volume label dentry, no need to clear */
+ ret = 0;
+ goto unlock;
+ }
+
+ ret = exfat_find_empty_entry(root_inode, &clu, 1, &es);
+ }
+
+ if (ret < 0)
+ goto unlock;
+
+ ep = exfat_get_dentry_cached(&es, 0);
+
+ if (label->name_len == 0 && ep->dentry.volume_label.char_count == 0) {
+		/* the volume label has already been cleared */
+ exfat_put_dentry_set(&es, 0);
+ goto unlock;
+ }
+
+ memset(ep, 0, sizeof(*ep));
+ ep->type = EXFAT_VOLUME;
+
+ for (i = 0; i < label->name_len; i++)
+ ep->dentry.volume_label.volume_label[i] =
+ cpu_to_le16(label->name[i]);
+
+ ep->dentry.volume_label.char_count = label->name_len;
+ es.modified = true;
+
+ ret = exfat_put_dentry_set(&es, IS_DIRSYNC(root_inode));
+
+unlock:
+ mutex_unlock(&sbi->s_lock);
+ return ret;
+}
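
exfat_write_volume_label() either reuses the existing volume-label dentry or allocates an empty slot in the root directory, then rewrites the entry in place. A hedged sketch of packing an ASCII label into the on-disk layout declared in exfat_raw.h; the 0x83 type byte follows the published exFAT specification, and the struct here is an illustrative mirror, not the kernel's:

/* Hedged sketch: pack an ASCII label into the on-disk volume-label
 * dentry layout (UTF-16LE, at most 11 code units). */
#include <stdint.h>
#include <string.h>

struct volume_label_dentry {	/* illustrative mirror of the real entry */
	uint8_t type;		/* 0x83 when a volume label is set */
	uint8_t char_count;
	uint16_t label[11];	/* UTF-16LE on disk */
	uint8_t reserved[8];
};

static int pack_label(struct volume_label_dentry *d, const char *ascii)
{
	size_t i, len = strlen(ascii);

	if (len > 11)
		return -1;
	memset(d, 0, sizeof(*d));
	d->type = 0x83;
	d->char_count = (uint8_t)len;
	for (i = 0; i < len; i++)	/* ASCII maps 1:1 to UTF-16 units */
		d->label[i] = (uint16_t)ascii[i];	/* cpu_to_le16 in-kernel */
	return 0;
}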
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index d30ce18a88b7..176fef62574c 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -14,8 +14,6 @@
#define EXFAT_ROOT_INO 1
-#define EXFAT_CLUSTERS_UNTRACKED (~0u)
-
/*
* exfat error flags
*/
@@ -31,7 +29,6 @@ enum exfat_error_mode {
enum {
NLS_NAME_NO_LOSSY = 0, /* no lossy */
NLS_NAME_LOSSY = 1 << 0, /* just detected incorrect filename(s) */
- NLS_NAME_OVERLEN = 1 << 1, /* the length is over than its limit */
};
#define EXFAT_HASH_BITS 8
@@ -455,8 +452,9 @@ int exfat_count_num_clusters(struct super_block *sb,
/* balloc.c */
int exfat_load_bitmap(struct super_block *sb);
void exfat_free_bitmap(struct exfat_sb_info *sbi);
-int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync);
-int exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync);
+int exfat_set_bitmap(struct super_block *sb, unsigned int clu, bool sync);
+int exfat_clear_bitmap(struct super_block *sb, unsigned int clu, bool sync);
+bool exfat_test_bitmap(struct super_block *sb, unsigned int clu);
unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu);
int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count);
int exfat_trim_fs(struct inode *inode, struct fstrim_range *range);
@@ -479,6 +477,9 @@ int exfat_force_shutdown(struct super_block *sb, u32 flags);
/* namei.c */
extern const struct dentry_operations exfat_dentry_ops;
extern const struct dentry_operations exfat_utf8_dentry_ops;
+int exfat_find_empty_entry(struct inode *inode,
+ struct exfat_chain *p_dir, int num_entries,
+ struct exfat_entry_set_cache *es);
/* cache.c */
int exfat_cache_init(void);
@@ -519,6 +520,10 @@ int exfat_get_empty_dentry_set(struct exfat_entry_set_cache *es,
unsigned int num_entries);
int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync);
int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir);
+int exfat_read_volume_label(struct super_block *sb,
+ struct exfat_uni_name *label_out);
+int exfat_write_volume_label(struct super_block *sb,
+ struct exfat_uni_name *label);
/* inode.c */
extern const struct inode_operations exfat_file_inode_operations;
diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h
index 971a1ccd0e89..4082fa7b8c14 100644
--- a/fs/exfat/exfat_raw.h
+++ b/fs/exfat/exfat_raw.h
@@ -80,6 +80,7 @@
#define BOOTSEC_OLDBPB_LEN 53
#define EXFAT_FILE_NAME_LEN 15
+#define EXFAT_VOLUME_LABEL_LEN 11
#define EXFAT_MIN_SECT_SIZE_BITS 9
#define EXFAT_MAX_SECT_SIZE_BITS 12
@@ -160,6 +161,11 @@ struct exfat_dentry {
__le64 size;
} __packed upcase; /* up-case table directory entry */
struct {
+ __u8 char_count;
+ __le16 volume_label[EXFAT_VOLUME_LABEL_LEN];
+ __u8 reserved[8];
+ } __packed volume_label; /* volume label directory entry */
+ struct {
__u8 flags;
__u8 vendor_guid[16];
__u8 vendor_defined[14];
diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
index 6f3651c6ca91..c9c5f2e3a05e 100644
--- a/fs/exfat/fatent.c
+++ b/fs/exfat/fatent.c
@@ -89,35 +89,36 @@ int exfat_ent_get(struct super_block *sb, unsigned int loc,
int err;
if (!is_valid_cluster(sbi, loc)) {
- exfat_fs_error(sb, "invalid access to FAT (entry 0x%08x)",
+ exfat_fs_error_ratelimit(sb,
+ "invalid access to FAT (entry 0x%08x)",
loc);
return -EIO;
}
err = __exfat_ent_get(sb, loc, content);
if (err) {
- exfat_fs_error(sb,
+ exfat_fs_error_ratelimit(sb,
"failed to access to FAT (entry 0x%08x, err:%d)",
loc, err);
return err;
}
if (*content == EXFAT_FREE_CLUSTER) {
- exfat_fs_error(sb,
+ exfat_fs_error_ratelimit(sb,
"invalid access to FAT free cluster (entry 0x%08x)",
loc);
return -EIO;
}
if (*content == EXFAT_BAD_CLUSTER) {
- exfat_fs_error(sb,
+ exfat_fs_error_ratelimit(sb,
"invalid access to FAT bad cluster (entry 0x%08x)",
loc);
return -EIO;
}
if (*content != EXFAT_EOF_CLUSTER && !is_valid_cluster(sbi, *content)) {
- exfat_fs_error(sb,
+ exfat_fs_error_ratelimit(sb,
"invalid access to FAT (entry 0x%08x) bogus content (0x%08x)",
loc, *content);
return -EIO;
@@ -144,6 +145,20 @@ int exfat_chain_cont_cluster(struct super_block *sb, unsigned int chain,
return 0;
}
+static inline void exfat_discard_cluster(struct super_block *sb,
+ unsigned int clu, unsigned int num_clusters)
+{
+ int ret;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+ ret = sb_issue_discard(sb, exfat_cluster_to_sector(sbi, clu),
+ sbi->sect_per_clus * num_clusters, GFP_NOFS, 0);
+ if (ret == -EOPNOTSUPP) {
+ exfat_err(sb, "discard not supported by device, disabling");
+ sbi->options.discard = 0;
+ }
+}
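
With discard handling moved out of exfat_clear_bitmap(), the free path below coalesces physically contiguous clusters and issues one discard per run instead of one per cluster. A hedged sketch of that run-length batching over an arbitrary cluster sequence; discard_run() is a hypothetical stand-in for exfat_discard_cluster():

/* Hedged sketch of run-length batching: coalesce consecutive cluster
 * numbers and issue one discard per contiguous run. */
static void discard_chain(const unsigned int *clus, unsigned int n,
			  void (*discard_run)(unsigned int start,
					      unsigned int cnt))
{
	unsigned int start, cnt, i;

	if (!n)
		return;
	start = clus[0];
	cnt = 1;
	for (i = 1; i < n; i++) {
		if (clus[i] == start + cnt) {
			cnt++;			/* extends the current run */
			continue;
		}
		discard_run(start, cnt);	/* flush the finished run */
		start = clus[i];
		cnt = 1;
	}
	discard_run(start, cnt);		/* final run */
}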
+
/* This function must be called with bitmap_lock held */
static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain)
{
@@ -190,13 +205,18 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain
cur_cmap_i = next_cmap_i;
}
- err = exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode)));
+ err = exfat_clear_bitmap(sb, clu, (sync && IS_DIRSYNC(inode)));
if (err)
break;
clu++;
num_clusters++;
} while (num_clusters < p_chain->size);
+
+ if (sbi->options.discard)
+ exfat_discard_cluster(sb, p_chain->dir, p_chain->size);
} else {
+ unsigned int nr_clu = 1;
+
do {
bool sync = false;
unsigned int n_clu = clu;
@@ -213,8 +233,18 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain
cur_cmap_i = next_cmap_i;
}
- if (exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode))))
+ if (exfat_clear_bitmap(sb, clu, (sync && IS_DIRSYNC(inode))))
break;
+
+ if (sbi->options.discard) {
+ if (n_clu == clu + 1)
+ nr_clu++;
+ else {
+ exfat_discard_cluster(sb, clu - nr_clu + 1, nr_clu);
+ nr_clu = 1;
+ }
+ }
+
clu = n_clu;
num_clusters++;
@@ -265,7 +295,7 @@ int exfat_find_last_cluster(struct super_block *sb, struct exfat_chain *p_chain,
clu = next;
if (exfat_ent_get(sb, clu, &next))
return -EIO;
- } while (next != EXFAT_EOF_CLUSTER);
+ } while (next != EXFAT_EOF_CLUSTER && count <= p_chain->size);
if (p_chain->size != count) {
exfat_fs_error(sb,
@@ -379,7 +409,7 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc,
}
/* update allocation bitmap */
- if (exfat_set_bitmap(inode, new_clu, sync_bmap)) {
+ if (exfat_set_bitmap(sb, new_clu, sync_bmap)) {
ret = -EIO;
goto free_cluster;
}
@@ -461,5 +491,15 @@ int exfat_count_num_clusters(struct super_block *sb,
}
*ret_count = count;
+
+ /*
+	 * Since exfat_count_used_clusters() is not called, sbi->used_clusters
+ * cannot be used here.
+ */
+ if (unlikely(i == sbi->num_clusters && clu != EXFAT_EOF_CLUSTER)) {
+ exfat_fs_error(sb, "The cluster chain has a loop");
+ return -EIO;
+ }
+
return 0;
}
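
Both chain walkers above now bound their iteration count, so a corrupted FAT that forms a cycle fails with -EIO instead of spinning forever: taking more steps than clusters exist proves a loop. A minimal sketch of the counter-bounded walk; next_cluster() and EOF_CLUSTER are hypothetical stand-ins:

/* Hedged sketch: walk a cluster chain, treating more steps than the
 * total cluster count as proof of a FAT loop. */
#define EOF_CLUSTER 0xffffffffU		/* illustrative end marker */

static int chain_length(unsigned int clu, unsigned int num_clusters,
			unsigned int (*next_cluster)(unsigned int))
{
	unsigned int steps = 0;

	while (clu != EOF_CLUSTER) {
		if (++steps > num_clusters)
			return -1;	/* cycle: corrupted FAT */
		clu = next_cluster(clu);
	}
	return steps;
}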
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 807349d8ea05..536c8078f0c1 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -25,6 +25,8 @@ static int exfat_cont_expand(struct inode *inode, loff_t size)
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_chain clu;
+ truncate_pagecache(inode, i_size_read(inode));
+
ret = inode_newsize_ok(inode, size);
if (ret)
return ret;
@@ -486,6 +488,55 @@ static int exfat_ioctl_shutdown(struct super_block *sb, unsigned long arg)
return exfat_force_shutdown(sb, flags);
}
+static int exfat_ioctl_get_volume_label(struct super_block *sb, unsigned long arg)
+{
+ int ret;
+ char label[FSLABEL_MAX] = {0};
+ struct exfat_uni_name uniname;
+
+ ret = exfat_read_volume_label(sb, &uniname);
+ if (ret < 0)
+ return ret;
+
+ ret = exfat_utf16_to_nls(sb, &uniname, label, uniname.name_len);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user((char __user *)arg, label, ret + 1))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int exfat_ioctl_set_volume_label(struct super_block *sb,
+ unsigned long arg)
+{
+ int ret = 0, lossy, label_len;
+ char label[FSLABEL_MAX] = {0};
+ struct exfat_uni_name uniname;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(label, (char __user *)arg, FSLABEL_MAX))
+ return -EFAULT;
+
+ memset(&uniname, 0, sizeof(uniname));
+ label_len = strnlen(label, FSLABEL_MAX - 1);
+ if (label[0]) {
+ ret = exfat_nls_to_utf16(sb, label, label_len,
+ &uniname, &lossy);
+ if (ret < 0)
+ return ret;
+ else if (lossy & NLS_NAME_LOSSY)
+ return -EINVAL;
+ }
+
+ uniname.name_len = ret;
+
+ return exfat_write_volume_label(sb, &uniname);
+}
+
long exfat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -500,6 +551,10 @@ long exfat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return exfat_ioctl_shutdown(inode->i_sb, arg);
case FITRIM:
return exfat_ioctl_fitrim(inode, arg);
+ case FS_IOC_GETFSLABEL:
+ return exfat_ioctl_get_volume_label(inode->i_sb, arg);
+ case FS_IOC_SETFSLABEL:
+ return exfat_ioctl_set_volume_label(inode->i_sb, arg);
default:
return -ENOTTY;
}
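
With the two new cases wired up, exfat answers the generic filesystem-label ioctls from linux/fs.h, which operate on a FSLABEL_MAX-sized, NUL-terminated buffer. A minimal userspace sketch (the mount point is a placeholder):

/* Userspace sketch: read and set the exFAT volume label through the
 * generic ioctls wired up above. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <unistd.h>

int main(void)
{
	char label[FSLABEL_MAX] = "";
	int fd = open("/mnt/exfat", O_RDONLY);	/* any file on the fs works */

	if (fd < 0)
		return 1;
	if (!ioctl(fd, FS_IOC_GETFSLABEL, label))
		printf("label: %s\n", label);
	strncpy(label, "BACKUP", sizeof(label) - 1);
	if (ioctl(fd, FS_IOC_SETFSLABEL, label))	/* needs CAP_SYS_ADMIN */
		perror("FS_IOC_SETFSLABEL");
	close(fd);
	return 0;
}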
@@ -532,11 +587,10 @@ int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
return blkdev_issue_flush(inode->i_sb->s_bdev);
}
-static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size)
+static int exfat_extend_valid_size(struct inode *inode, loff_t new_valid_size)
{
int err;
loff_t pos;
- struct inode *inode = file_inode(file);
struct exfat_inode_info *ei = EXFAT_I(inode);
struct address_space *mapping = inode->i_mapping;
const struct address_space_operations *ops = mapping->a_ops;
@@ -551,14 +605,14 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size)
if (pos + len > new_valid_size)
len = new_valid_size - pos;
- err = ops->write_begin(file, mapping, pos, len, &folio, NULL);
+ err = ops->write_begin(NULL, mapping, pos, len, &folio, NULL);
if (err)
goto out;
off = offset_in_folio(folio, pos);
folio_zero_new_buffers(folio, off, off + len);
- err = ops->write_end(file, mapping, pos, len, len, folio, NULL);
+ err = ops->write_end(NULL, mapping, pos, len, len, folio, NULL);
if (err < 0)
goto out;
pos += len;
@@ -582,8 +636,14 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
loff_t pos = iocb->ki_pos;
loff_t valid_size;
+ if (unlikely(exfat_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
inode_lock(inode);
+ if (pos > i_size_read(inode))
+ truncate_pagecache(inode, i_size_read(inode));
+
valid_size = ei->valid_size;
ret = generic_write_checks(iocb, iter);
@@ -601,7 +661,7 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
}
if (pos > valid_size) {
- ret = exfat_extend_valid_size(file, pos);
+ ret = exfat_extend_valid_size(inode, pos);
if (ret < 0 && ret != -ENOSPC) {
exfat_err(inode->i_sb,
"write: fail to zero from %llu to %llu(%zd)",
@@ -620,9 +680,8 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
if (pos > valid_size)
pos = valid_size;
- if (iocb_is_dsync(iocb) && iocb->ki_pos > pos) {
- ssize_t err = vfs_fsync_range(file, pos, iocb->ki_pos - 1,
- iocb->ki_flags & IOCB_SYNC);
+ if (iocb->ki_pos > pos) {
+ ssize_t err = generic_write_sync(iocb, iocb->ki_pos - pos);
if (err < 0)
return err;
}
@@ -635,6 +694,16 @@ unlock:
return ret;
}
+static ssize_t exfat_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (unlikely(exfat_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
+ return generic_file_read_iter(iocb, iter);
+}
+
static vm_fault_t exfat_page_mkwrite(struct vm_fault *vmf)
{
int err;
@@ -652,7 +721,7 @@ static vm_fault_t exfat_page_mkwrite(struct vm_fault *vmf)
start + vma->vm_end - vma->vm_start);
if (ei->valid_size < end) {
- err = exfat_extend_valid_size(file, end);
+ err = exfat_extend_valid_size(inode, end);
if (err < 0) {
inode_unlock(inode);
return vmf_fs_error(err);
@@ -670,24 +739,38 @@ static const struct vm_operations_struct exfat_file_vm_ops = {
.page_mkwrite = exfat_page_mkwrite,
};
-static int exfat_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int exfat_file_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
+
+ if (unlikely(exfat_forced_shutdown(file_inode(desc->file)->i_sb)))
+ return -EIO;
+
file_accessed(file);
- vma->vm_ops = &exfat_file_vm_ops;
+ desc->vm_ops = &exfat_file_vm_ops;
return 0;
}
+static ssize_t exfat_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len, unsigned int flags)
+{
+ if (unlikely(exfat_forced_shutdown(file_inode(in)->i_sb)))
+ return -EIO;
+
+ return filemap_splice_read(in, ppos, pipe, len, flags);
+}
+
const struct file_operations exfat_file_operations = {
.llseek = generic_file_llseek,
- .read_iter = generic_file_read_iter,
+ .read_iter = exfat_file_read_iter,
.write_iter = exfat_file_write_iter,
.unlocked_ioctl = exfat_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = exfat_compat_ioctl,
#endif
- .mmap = exfat_file_mmap,
+ .mmap_prepare = exfat_file_mmap_prepare,
.fsync = exfat_file_fsync,
- .splice_read = filemap_splice_read,
+ .splice_read = exfat_splice_read,
.splice_write = iter_file_splice_write,
};
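Each exfat file operation touched above now follows the same gating pattern: check the superblock's forced-shutdown flag before doing any work and fail with -EIO if it is set. A minimal userspace sketch of that pattern, assuming stand-in names (fs_forced_shutdown, struct super and do_read are illustrative, not exfat APIs):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct super { bool shutdown; };

/* stand-in for exfat_forced_shutdown(sb) */
static bool fs_forced_shutdown(const struct super *sb)
{
	return sb->shutdown;
}

/* every data-path entry point checks the flag first, then delegates */
static int do_read(struct super *sb, char *buf, int len)
{
	(void)buf; /* the real path would fill buf via the generic read code */
	if (fs_forced_shutdown(sb))
		return -EIO;
	return len;
}

int main(void)
{
	struct super sb = { .shutdown = true };

	printf("read -> %d (expect %d)\n", do_read(&sb, 0, 16), -EIO);
	return 0;
}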
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 96952d4acb50..f9501c3a3666 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -25,7 +25,7 @@ int __exfat_write_inode(struct inode *inode, int sync)
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
- bool is_dir = (ei->type == TYPE_DIR) ? true : false;
+ bool is_dir = (ei->type == TYPE_DIR);
struct timespec64 ts;
if (inode->i_ino == EXFAT_ROOT_INO)
@@ -274,9 +274,11 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
sector_t last_block;
sector_t phys = 0;
sector_t valid_blks;
+ loff_t i_size;
mutex_lock(&sbi->s_lock);
- last_block = EXFAT_B_TO_BLK_ROUND_UP(i_size_read(inode), sb);
+ i_size = i_size_read(inode);
+ last_block = EXFAT_B_TO_BLK_ROUND_UP(i_size, sb);
if (iblock >= last_block && !create)
goto done;
@@ -305,77 +307,99 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
if (buffer_delay(bh_result))
clear_buffer_delay(bh_result);
- if (create) {
+ /*
+ * In most cases, we just need to set bh_result to mapped, unmapped
+ * or new status as follows:
+ * 1. i_size == valid_size
+ * 2. write case (create == 1)
+ * 3. direct_read (!bh_result->b_folio)
+ * -> the unwritten part will be zeroed in exfat_direct_IO()
+ *
+ * Otherwise, in the case of buffered read, it is necessary to take
+ * care of the last partial block if valid_size is not equal to i_size.
+ */
+ if (i_size == ei->valid_size || create || !bh_result->b_folio)
valid_blks = EXFAT_B_TO_BLK_ROUND_UP(ei->valid_size, sb);
+ else
+ valid_blks = EXFAT_B_TO_BLK(ei->valid_size, sb);
- if (iblock + max_blocks < valid_blks) {
- /* The range has been written, map it */
- goto done;
- } else if (iblock < valid_blks) {
- /*
- * The range has been partially written,
- * map the written part.
- */
- max_blocks = valid_blks - iblock;
- goto done;
- }
+ /* The range has been fully written, map it */
+ if (iblock + max_blocks < valid_blks)
+ goto done;
- /* The area has not been written, map and mark as new. */
- set_buffer_new(bh_result);
+ /* The range has been partially written, map the written part */
+ if (iblock < valid_blks) {
+ max_blocks = valid_blks - iblock;
+ goto done;
+ }
+ /* The area has not been written, map and mark as new for create case */
+ if (create) {
+ set_buffer_new(bh_result);
ei->valid_size = EXFAT_BLK_TO_B(iblock + max_blocks, sb);
mark_inode_dirty(inode);
- } else {
- valid_blks = EXFAT_B_TO_BLK(ei->valid_size, sb);
+ goto done;
+ }
- if (iblock + max_blocks < valid_blks) {
- /* The range has been written, map it */
+ /*
+ * The area has just one block partially written.
+ * In that case, we should read and fill the unwritten part of
+ * a block with zero.
+ */
+ if (bh_result->b_folio && iblock == valid_blks &&
+ (ei->valid_size & (sb->s_blocksize - 1))) {
+ loff_t size, pos;
+ void *addr;
+
+ max_blocks = 1;
+
+ /*
+ * No buffer_head is allocated.
+ * (1) bmap: It's enough to set blocknr without I/O.
+ * (2) read: The unwritten part should be filled with zero.
+ * If a folio does not have any buffers,
+ * let's return -EAGAIN to fall back to
+ * block_read_full_folio() for per-bh IO.
+ */
+ if (!folio_buffers(bh_result->b_folio)) {
+ err = -EAGAIN;
goto done;
- } else if (iblock < valid_blks) {
- /*
- * The area has been partially written,
- * map the written part.
- */
- max_blocks = valid_blks - iblock;
+ }
+
+ pos = EXFAT_BLK_TO_B(iblock, sb);
+ size = ei->valid_size - pos;
+ addr = folio_address(bh_result->b_folio) +
+ offset_in_folio(bh_result->b_folio, pos);
+
+ /* Check if bh->b_data points to proper addr in folio */
+ if (bh_result->b_data != addr) {
+ exfat_fs_error_ratelimit(sb,
+ "b_data(%p) != folio_addr(%p)",
+ bh_result->b_data, addr);
+ err = -EINVAL;
goto done;
- } else if (iblock == valid_blks &&
- (ei->valid_size & (sb->s_blocksize - 1))) {
- /*
- * The block has been partially written,
- * zero the unwritten part and map the block.
- */
- loff_t size, off, pos;
-
- max_blocks = 1;
-
- /*
- * For direct read, the unwritten part will be zeroed in
- * exfat_direct_IO()
- */
- if (!bh_result->b_folio)
- goto done;
-
- pos = EXFAT_BLK_TO_B(iblock, sb);
- size = ei->valid_size - pos;
- off = pos & (PAGE_SIZE - 1);
-
- folio_set_bh(bh_result, bh_result->b_folio, off);
- err = bh_read(bh_result, 0);
- if (err < 0)
- goto unlock_ret;
-
- folio_zero_segment(bh_result->b_folio, off + size,
- off + sb->s_blocksize);
- } else {
- /*
- * The range has not been written, clear the mapped flag
- * to only zero the cache and do not read from disk.
- */
- clear_buffer_mapped(bh_result);
}
+
+ /* Read a block */
+ err = bh_read(bh_result, 0);
+ if (err < 0)
+ goto done;
+
+ /* Zero unwritten part of a block */
+ memset(bh_result->b_data + size, 0, bh_result->b_size - size);
+ err = 0;
+ goto done;
}
+
+ /*
+ * The area has not been written; clear the mapped flag for the
+ * read/bmap cases so the buffer is filled with zeros without
+ * reading from disk.
+ */
+ clear_buffer_mapped(bh_result);
done:
bh_result->b_size = EXFAT_BLK_TO_B(max_blocks, sb);
+ if (err < 0)
+ clear_buffer_mapped(bh_result);
unlock_ret:
mutex_unlock(&sbi->s_lock);
return err;
@@ -422,9 +446,10 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int exfat_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned int len,
- struct folio **foliop, void **fsdata)
+static int exfat_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned int len,
+ struct folio **foliop, void **fsdata)
{
int ret;
@@ -439,15 +464,16 @@ static int exfat_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
-static int exfat_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned int len, unsigned int copied,
- struct folio *folio, void *fsdata)
+static int exfat_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned int len, unsigned int copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
struct exfat_inode_info *ei = EXFAT_I(inode);
int err;
- err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
if (err < len)
exfat_write_failed(mapping, pos+len);
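The rounding choice in exfat_get_block() above is the crux of the valid_size handling: EXFAT_B_TO_BLK_ROUND_UP() treats a partially written tail block as fully valid (fine for writes and direct reads, which zero the tail elsewhere), while EXFAT_B_TO_BLK() excludes it so the buffered-read path can zero the unwritten tail itself. A standalone sketch of the two conversions, assuming 512-byte blocks and modeled (not kernel) macros:

#include <stdio.h>

#define BLK_BITS 9 /* 512-byte blocks, for illustration only */

static unsigned long b_to_blk(unsigned long long bytes)      /* round down */
{
	return bytes >> BLK_BITS;
}

static unsigned long b_to_blk_round_up(unsigned long long bytes)
{
	return (bytes + (1 << BLK_BITS) - 1) >> BLK_BITS;
}

int main(void)
{
	unsigned long long valid_size = 1300; /* ends inside block 2 */

	/* write/direct-read case: block 2 counted as valid (3 blocks) */
	printf("round up:   %lu blocks\n", b_to_blk_round_up(valid_size));
	/* buffered read: block 2 excluded (2 blocks) so its tail gets zeroed */
	printf("round down: %lu blocks\n", b_to_blk(valid_size));
	return 0;
}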
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index fede0283d6e2..dfe957493d49 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -300,7 +300,7 @@ static int exfat_check_max_dentries(struct inode *inode)
* the directory entry index in p_dir is returned on success
* a negative error code is returned on failure
*/
-static int exfat_find_empty_entry(struct inode *inode,
+int exfat_find_empty_entry(struct inode *inode,
struct exfat_chain *p_dir, int num_entries,
struct exfat_entry_set_cache *es)
{
@@ -442,7 +442,7 @@ static int __exfat_resolve_path(struct inode *inode, const unsigned char *path,
return namelen; /* return error value */
if ((lossy && !lookup) || !namelen)
- return (lossy & NLS_NAME_OVERLEN) ? -ENAMETOOLONG : -EINVAL;
+ return -EINVAL;
return 0;
}
@@ -587,7 +587,7 @@ unlock:
}
/* lookup a file */
-static int exfat_find(struct inode *dir, struct qstr *qname,
+static int exfat_find(struct inode *dir, const struct qstr *qname,
struct exfat_dir_entry *info)
{
int ret, dentry, count;
@@ -642,15 +642,9 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
info->type = exfat_get_entry_type(ep);
info->attr = le16_to_cpu(ep->dentry.file.attr);
- info->size = le64_to_cpu(ep2->dentry.stream.valid_size);
info->valid_size = le64_to_cpu(ep2->dentry.stream.valid_size);
info->size = le64_to_cpu(ep2->dentry.stream.size);
- if (unlikely(EXFAT_B_TO_CLU_ROUND_UP(info->size, sbi) > sbi->used_clusters)) {
- exfat_fs_error(sb, "data size is invalid(%lld)", info->size);
- return -EIO;
- }
-
info->start_clu = le32_to_cpu(ep2->dentry.stream.start_clu);
if (!is_valid_cluster(sbi, info->start_clu) && info->size) {
exfat_warn(sb, "start_clu is invalid cluster(0x%x)",
@@ -688,6 +682,16 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
0);
exfat_put_dentry_set(&es, false);
+ if (info->valid_size < 0) {
+ exfat_fs_error(sb, "data valid size is invalid(%lld)", info->valid_size);
+ return -EIO;
+ }
+
+ if (unlikely(EXFAT_B_TO_CLU_ROUND_UP(info->size, sbi) > sbi->used_clusters)) {
+ exfat_fs_error(sb, "data size is invalid(%lld)", info->size);
+ return -EIO;
+ }
+
if (ei->start_clu == EXFAT_FREE_CLUSTER) {
exfat_fs_error(sb,
"non-zero size file starts with zero cluster (size : %llu, p_dir : %u, entry : 0x%08x)",
@@ -890,6 +894,7 @@ static int exfat_check_dir_empty(struct super_block *sb,
{
int i, dentries_per_clu;
unsigned int type;
+ unsigned int clu_count = 0;
struct exfat_chain clu;
struct exfat_dentry *ep;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
@@ -926,6 +931,10 @@ static int exfat_check_dir_empty(struct super_block *sb,
} else {
if (exfat_get_next_cluster(sb, &(clu.dir)))
return -EIO;
+
+ /* break if the cluster chain includes a loop */
+ if (unlikely(++clu_count > EXFAT_DATA_CLUSTER_COUNT(sbi)))
+ break;
}
}
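The clu_count check added to exfat_check_dir_empty() bounds the chain walk by the volume's total data cluster count; any chain longer than that necessarily contains a cycle. A sketch of the same bound on a toy next-pointer chain (EOF_CLU and the table layout are illustrative, not exfat's on-disk FAT format):

#include <stdio.h>

#define EOF_CLU 0xFFFFFFFFu

/* returns chain length, or -1 if the walk exceeds the total cluster count */
static int walk_chain(const unsigned *next, unsigned nclusters, unsigned clu)
{
	unsigned count = 0;

	while (clu != EOF_CLU) {
		if (++count > nclusters)
			return -1; /* longer than the volume: must be a loop */
		clu = next[clu];
	}
	return count;
}

int main(void)
{
	unsigned next[4] = { 1, 2, 0, EOF_CLU }; /* 0 -> 1 -> 2 -> 0: a cycle */

	printf("result: %d (expect -1)\n", walk_chain(next, 4, 0));
	return 0;
}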
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c
index d47896a89596..57db08a5271c 100644
--- a/fs/exfat/nls.c
+++ b/fs/exfat/nls.c
@@ -616,9 +616,6 @@ static int exfat_nls_to_ucs2(struct super_block *sb,
unilen++;
}
- if (p_cstring[i] != '\0')
- lossy |= NLS_NAME_OVERLEN;
-
*uniname = '\0';
p_uniname->name_len = unilen;
p_uniname->name_hash = exfat_calc_chksum16(upname, unilen << 1, 0,
@@ -789,7 +786,7 @@ int exfat_create_upcase_table(struct super_block *sb)
return ret;
}
- if (exfat_get_next_cluster(sb, &(clu.dir)))
+ if (exfat_get_next_cluster(sb, &clu.dir))
return -EIO;
}
@@ -801,4 +798,5 @@ load_default:
void exfat_free_upcase_table(struct exfat_sb_info *sbi)
{
kvfree(sbi->vol_utbl);
+ sbi->vol_utbl = NULL;
}
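Nulling sbi->vol_utbl after kvfree() makes the teardown idempotent: if the free path runs twice, the second call frees NULL, which is a no-op, instead of double-freeing. The same discipline in plain C:

#include <stdlib.h>

struct sb_info { void *vol_utbl; };

static void free_upcase_table(struct sb_info *sbi)
{
	free(sbi->vol_utbl);   /* free(NULL) is a no-op, like kvfree(NULL) */
	sbi->vol_utbl = NULL;  /* make repeated teardown safe */
}

int main(void)
{
	struct sb_info sbi = { .vol_utbl = malloc(32) };

	free_upcase_table(&sbi);
	free_upcase_table(&sbi); /* safe: pointer was cleared */
	return 0;
}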
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index bd57844414aa..10e872a99663 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -31,34 +31,25 @@ static void exfat_free_iocharset(struct exfat_sb_info *sbi)
kfree(sbi->options.iocharset);
}
-static void exfat_put_super(struct super_block *sb)
+static void exfat_set_iocharset(struct exfat_mount_options *opts,
+ char *iocharset)
{
- struct exfat_sb_info *sbi = EXFAT_SB(sb);
-
- mutex_lock(&sbi->s_lock);
- exfat_free_bitmap(sbi);
- brelse(sbi->boot_bh);
- mutex_unlock(&sbi->s_lock);
+ opts->iocharset = iocharset;
+ if (!strcmp(opts->iocharset, "utf8"))
+ opts->utf8 = 1;
+ else
+ opts->utf8 = 0;
}
-static int exfat_sync_fs(struct super_block *sb, int wait)
+static void exfat_put_super(struct super_block *sb)
{
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- int err = 0;
-
- if (unlikely(exfat_forced_shutdown(sb)))
- return 0;
-
- if (!wait)
- return 0;
- /* If there are some dirty buffers in the bdev inode */
mutex_lock(&sbi->s_lock);
- sync_blockdev(sb->s_bdev);
- if (exfat_clear_volume_dirty(sb))
- err = -EIO;
+ exfat_clear_volume_dirty(sb);
+ exfat_free_bitmap(sbi);
+ brelse(sbi->boot_bh);
mutex_unlock(&sbi->s_lock);
- return err;
}
static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -67,15 +58,6 @@ static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf)
struct exfat_sb_info *sbi = EXFAT_SB(sb);
unsigned long long id = huge_encode_dev(sb->s_bdev->bd_dev);
- if (sbi->used_clusters == EXFAT_CLUSTERS_UNTRACKED) {
- mutex_lock(&sbi->s_lock);
- if (exfat_count_used_clusters(sb, &sbi->used_clusters)) {
- mutex_unlock(&sbi->s_lock);
- return -EIO;
- }
- mutex_unlock(&sbi->s_lock);
- }
-
buf->f_type = sb->s_magic;
buf->f_bsize = sbi->cluster_size;
buf->f_blocks = sbi->num_clusters - 2; /* clu 0 & 1 */
@@ -228,7 +210,6 @@ static const struct super_operations exfat_sops = {
.write_inode = exfat_write_inode,
.evict_inode = exfat_evict_inode,
.put_super = exfat_put_super,
- .sync_fs = exfat_sync_fs,
.statfs = exfat_statfs,
.show_options = exfat_show_options,
.shutdown = exfat_shutdown,
@@ -272,11 +253,11 @@ static const struct fs_parameter_spec exfat_parameters[] = {
fsparam_u32oct("allow_utime", Opt_allow_utime),
fsparam_string("iocharset", Opt_charset),
fsparam_enum("errors", Opt_errors, exfat_param_enums),
- fsparam_flag("discard", Opt_discard),
+ fsparam_flag_no("discard", Opt_discard),
fsparam_flag("keep_last_dots", Opt_keep_last_dots),
fsparam_flag("sys_tz", Opt_sys_tz),
fsparam_s32("time_offset", Opt_time_offset),
- fsparam_flag("zero_size_dir", Opt_zero_size_dir),
+ fsparam_flag_no("zero_size_dir", Opt_zero_size_dir),
__fsparam(NULL, "utf8", Opt_utf8, fs_param_deprecated,
NULL),
__fsparam(NULL, "debug", Opt_debug, fs_param_deprecated,
@@ -321,14 +302,14 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_charset:
exfat_free_iocharset(sbi);
- opts->iocharset = param->string;
+ exfat_set_iocharset(opts, param->string);
param->string = NULL;
break;
case Opt_errors:
opts->errors = result.uint_32;
break;
case Opt_discard:
- opts->discard = 1;
+ opts->discard = !result.negated;
break;
case Opt_keep_last_dots:
opts->keep_last_dots = 1;
@@ -346,7 +327,7 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param)
opts->time_offset = result.int_32;
break;
case Opt_zero_size_dir:
- opts->zero_size_dir = true;
+ opts->zero_size_dir = !result.negated;
break;
case Opt_utf8:
case Opt_debug:
@@ -370,13 +351,12 @@ static void exfat_hash_init(struct super_block *sb)
INIT_HLIST_HEAD(&sbi->inode_hashtable[i]);
}
-static int exfat_read_root(struct inode *inode)
+static int exfat_read_root(struct inode *inode, struct exfat_chain *root_clu)
{
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
- struct exfat_chain cdir;
- int num_subdirs, num_clu = 0;
+ int num_subdirs;
exfat_chain_set(&ei->dir, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
ei->entry = -1;
@@ -389,12 +369,9 @@ static int exfat_read_root(struct inode *inode)
ei->hint_stat.clu = sbi->root_dir;
ei->hint_femp.eidx = EXFAT_HINT_NONE;
- exfat_chain_set(&cdir, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
- if (exfat_count_num_clusters(sb, &cdir, &num_clu))
- return -EIO;
- i_size_write(inode, num_clu << sbi->cluster_size_bits);
+ i_size_write(inode, EXFAT_CLU_TO_B(root_clu->size, sbi));
- num_subdirs = exfat_count_dir_entries(sb, &cdir);
+ num_subdirs = exfat_count_dir_entries(sb, root_clu);
if (num_subdirs < 0)
return -EIO;
set_nlink(inode, num_subdirs + EXFAT_MIN_SUBDIR);
@@ -456,7 +433,10 @@ static int exfat_read_boot_sector(struct super_block *sb)
struct exfat_sb_info *sbi = EXFAT_SB(sb);
/* set block size to read super block */
- sb_min_blocksize(sb, 512);
+ if (!sb_min_blocksize(sb, 512)) {
+ exfat_err(sb, "unable to set blocksize");
+ return -EINVAL;
+ }
/* read boot sector */
sbi->boot_bh = sb_bread(sb, 0);
@@ -531,7 +511,6 @@ static int exfat_read_boot_sector(struct super_block *sb)
sbi->vol_flags = le16_to_cpu(p_boot->vol_flags);
sbi->vol_flags_persistent = sbi->vol_flags & (VOLUME_DIRTY | MEDIA_FAILURE);
sbi->clu_srch_ptr = EXFAT_FIRST_CLUSTER;
- sbi->used_clusters = EXFAT_CLUSTERS_UNTRACKED;
/* check consistencies */
if ((u64)sbi->num_FAT_sectors << p_boot->sect_size_bits <
@@ -608,7 +587,8 @@ static int exfat_verify_boot_region(struct super_block *sb)
}
/* mount the file system volume */
-static int __exfat_fill_super(struct super_block *sb)
+static int __exfat_fill_super(struct super_block *sb,
+ struct exfat_chain *root_clu)
{
int ret;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
@@ -625,6 +605,18 @@ static int __exfat_fill_super(struct super_block *sb)
goto free_bh;
}
+ /*
+ * Call exfat_count_num_clusters() before searching for the up-case and
+ * bitmap directory entries to avoid an infinite loop if they are missing
+ * and the cluster chain includes a loop.
+ */
+ exfat_chain_set(root_clu, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
+ ret = exfat_count_num_clusters(sb, root_clu, &root_clu->size);
+ if (ret) {
+ exfat_err(sb, "failed to count the number of clusters in root");
+ goto free_bh;
+ }
+
ret = exfat_create_upcase_table(sb);
if (ret) {
exfat_err(sb, "failed to load upcase table");
@@ -637,6 +629,17 @@ static int __exfat_fill_super(struct super_block *sb)
goto free_bh;
}
+ if (!exfat_test_bitmap(sb, sbi->root_dir)) {
+ exfat_warn(sb, "failed to test first cluster bit of root dir(%u)",
+ sbi->root_dir);
+ /*
+ * The first cluster bit of the root directory should never
+ * be unset except when storage is corrupted. This bit is
+ * set to allow operations after mount.
+ */
+ exfat_set_bitmap(sb, sbi->root_dir, false);
+ }
+
ret = exfat_count_used_clusters(sb, &sbi->used_clusters);
if (ret) {
exfat_err(sb, "failed to scan clusters");
@@ -657,6 +660,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
struct exfat_sb_info *sbi = sb->s_fs_info;
struct exfat_mount_options *opts = &sbi->options;
struct inode *root_inode;
+ struct exfat_chain root_clu;
int err;
if (opts->allow_utime == (unsigned short)-1)
@@ -675,7 +679,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_time_min = EXFAT_MIN_TIMESTAMP_SECS;
sb->s_time_max = EXFAT_MAX_TIMESTAMP_SECS;
- err = __exfat_fill_super(sb);
+ err = __exfat_fill_super(sb, &root_clu);
if (err) {
exfat_err(sb, "failed to recognize exfat type");
goto check_nls_io;
@@ -684,8 +688,8 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
/* set up enough so that it can read an inode */
exfat_hash_init(sb);
- if (!strcmp(sbi->options.iocharset, "utf8"))
- opts->utf8 = 1;
+ if (sbi->options.utf8)
+ set_default_d_op(sb, &exfat_utf8_dentry_ops);
else {
sbi->nls_io = load_nls(sbi->options.iocharset);
if (!sbi->nls_io) {
@@ -694,13 +698,9 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
err = -EINVAL;
goto free_table;
}
+ set_default_d_op(sb, &exfat_dentry_ops);
}
- if (sbi->options.utf8)
- sb->s_d_op = &exfat_utf8_dentry_ops;
- else
- sb->s_d_op = &exfat_dentry_ops;
-
root_inode = new_inode(sb);
if (!root_inode) {
exfat_err(sb, "failed to allocate root inode");
@@ -710,7 +710,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
root_inode->i_ino = EXFAT_ROOT_INO;
inode_set_iversion(root_inode, 1);
- err = exfat_read_root(root_inode);
+ err = exfat_read_root(root_inode, &root_clu);
if (err) {
exfat_err(sb, "failed to initialize root inode");
goto put_inode;
@@ -761,10 +761,46 @@ static void exfat_free(struct fs_context *fc)
static int exfat_reconfigure(struct fs_context *fc)
{
+ struct super_block *sb = fc->root->d_sb;
+ struct exfat_sb_info *remount_sbi = fc->s_fs_info;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_mount_options *new_opts = &remount_sbi->options;
+ struct exfat_mount_options *cur_opts = &sbi->options;
+
fc->sb_flags |= SB_NODIRATIME;
- /* volume flag will be updated in exfat_sync_fs */
- sync_filesystem(fc->root->d_sb);
+ sync_filesystem(sb);
+ mutex_lock(&sbi->s_lock);
+ exfat_clear_volume_dirty(sb);
+ mutex_unlock(&sbi->s_lock);
+
+ if (new_opts->allow_utime == (unsigned short)-1)
+ new_opts->allow_utime = ~new_opts->fs_dmask & 0022;
+
+ /*
+ * Since the old settings of these mount options are cached in
+ * inodes or dentries, they cannot be modified dynamically.
+ */
+ if (strcmp(new_opts->iocharset, cur_opts->iocharset) ||
+ new_opts->keep_last_dots != cur_opts->keep_last_dots ||
+ new_opts->sys_tz != cur_opts->sys_tz ||
+ new_opts->time_offset != cur_opts->time_offset ||
+ !uid_eq(new_opts->fs_uid, cur_opts->fs_uid) ||
+ !gid_eq(new_opts->fs_gid, cur_opts->fs_gid) ||
+ new_opts->fs_fmask != cur_opts->fs_fmask ||
+ new_opts->fs_dmask != cur_opts->fs_dmask ||
+ new_opts->allow_utime != cur_opts->allow_utime)
+ return -EINVAL;
+
+ if (new_opts->discard != cur_opts->discard &&
+ new_opts->discard &&
+ !bdev_max_discard_sectors(sb->s_bdev)) {
+ exfat_warn(sb, "remounting with \"discard\" option, but the device does not support discard");
+ return -EINVAL;
+ }
+
+ swap(*cur_opts, *new_opts);
+
return 0;
}
@@ -788,13 +824,24 @@ static int exfat_init_fs_context(struct fs_context *fc)
ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- sbi->options.fs_uid = current_uid();
- sbi->options.fs_gid = current_gid();
- sbi->options.fs_fmask = current->fs->umask;
- sbi->options.fs_dmask = current->fs->umask;
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && fc->root) {
+ struct super_block *sb = fc->root->d_sb;
+ struct exfat_mount_options *cur_opts = &EXFAT_SB(sb)->options;
+
+ sbi->options.fs_uid = cur_opts->fs_uid;
+ sbi->options.fs_gid = cur_opts->fs_gid;
+ sbi->options.fs_fmask = cur_opts->fs_fmask;
+ sbi->options.fs_dmask = cur_opts->fs_dmask;
+ } else {
+ sbi->options.fs_uid = current_uid();
+ sbi->options.fs_gid = current_gid();
+ sbi->options.fs_fmask = current->fs->umask;
+ sbi->options.fs_dmask = current->fs->umask;
+ }
+
sbi->options.allow_utime = -1;
- sbi->options.iocharset = exfat_default_iocharset;
sbi->options.errors = EXFAT_ERRORS_RO;
+ exfat_set_iocharset(&sbi->options, exfat_default_iocharset);
fc->s_fs_info = sbi;
fc->ops = &exfat_context_ops;
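The new exfat_reconfigure() divides mount options into two classes: those whose old values are already cached in inodes or dentries are rejected if a remount tries to change them, and everything else is committed by swapping the whole option block. A reduced model of that reject-then-swap flow (the option fields and helper names are illustrative):

#include <errno.h>
#include <stdio.h>

struct opts {
	int fmask, dmask;   /* cached in inode modes: immutable on remount */
	int discard;        /* purely behavioural: may change */
};

static void swap_opts(struct opts *a, struct opts *b)
{
	struct opts t = *a; *a = *b; *b = t;
}

static int reconfigure(struct opts *cur, struct opts *new)
{
	/* options cached in inodes/dentries cannot change dynamically */
	if (new->fmask != cur->fmask || new->dmask != cur->dmask)
		return -EINVAL;

	swap_opts(cur, new); /* commit everything else in one go */
	return 0;
}

int main(void)
{
	struct opts cur = { 0022, 0022, 0 };
	struct opts new = { 0022, 0022, 1 }; /* toggling discard: allowed */
	struct opts bad = { 0077, 0022, 0 }; /* changing fmask: rejected */

	printf("toggle discard -> %d\n", reconfigure(&cur, &new));
	printf("change fmask   -> %d\n", reconfigure(&cur, &bad));
	return 0;
}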
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index b5845c4846b8..d3e55de4a2a2 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -143,7 +143,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
if (err)
goto out_err;
dprintk("%s: found name: %s\n", __func__, nbuf);
- tmp = lookup_one_unlocked(mnt_idmap(mnt), nbuf, parent, strlen(nbuf));
+ tmp = lookup_one_unlocked(mnt_idmap(mnt), &QSTR(nbuf), parent);
if (IS_ERR(tmp)) {
dprintk("lookup failed: %ld\n", PTR_ERR(tmp));
err = PTR_ERR(tmp);
@@ -284,6 +284,7 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
};
struct getdents_callback buffer = {
.ctx.actor = filldir_one,
+ .ctx.count = INT_MAX,
.name = name,
};
@@ -548,16 +549,13 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
goto err_result;
}
- inode_lock(target_dir->d_inode);
- nresult = lookup_one(mnt_idmap(mnt), nbuf,
- target_dir, strlen(nbuf));
+ nresult = lookup_one_unlocked(mnt_idmap(mnt), &QSTR(nbuf), target_dir);
if (!IS_ERR(nresult)) {
if (unlikely(nresult->d_inode != result->d_inode)) {
dput(nresult);
nresult = ERR_PTR(-ESTALE);
}
}
- inode_unlock(target_dir->d_inode);
/*
* At this point we are done with the parent, but it's pinned
* by the child dentry anyway.
@@ -608,4 +606,5 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
}
EXPORT_SYMBOL_GPL(exportfs_decode_fh);
+MODULE_DESCRIPTION("Code mapping from inodes to file handles");
MODULE_LICENSE("GPL");
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 402fecf90a44..b07b3b369710 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -87,7 +87,7 @@ static void ext2_commit_chunk(struct folio *folio, loff_t pos, unsigned len)
struct inode *dir = mapping->host;
inode_inc_iversion(dir);
- block_write_end(NULL, mapping, pos, len, len, folio, NULL);
+ block_write_end(pos, len, len, folio);
if (pos+len > dir->i_size) {
i_size_write(dir, pos+len);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index f38bdd46e4f7..cf97b76e9fd3 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -368,6 +368,7 @@ struct ext2_inode {
#define EXT2_MOUNT_ERRORS_CONT 0x000010 /* Continue on errors */
#define EXT2_MOUNT_ERRORS_RO 0x000020 /* Remount fs ro on errors */
#define EXT2_MOUNT_ERRORS_PANIC 0x000040 /* Panic on errors */
+#define EXT2_MOUNT_ERRORS_MASK 0x000070
#define EXT2_MOUNT_MINIX_DF 0x000080 /* Mimics the Minix statfs */
#define EXT2_MOUNT_NOBH 0x000100 /* No buffer_heads */
#define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */
@@ -749,9 +750,9 @@ extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
/* ioctl.c */
-extern int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+extern int ext2_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
extern int ext2_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
extern long ext2_ioctl(struct file *, unsigned int, unsigned long);
extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 10b061ac5bc0..76bddce462fc 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -122,17 +122,19 @@ static const struct vm_operations_struct ext2_dax_vm_ops = {
.pfn_mkwrite = ext2_dax_fault,
};
-static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int ext2_file_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
+
if (!IS_DAX(file_inode(file)))
- return generic_file_mmap(file, vma);
+ return generic_file_mmap_prepare(desc);
file_accessed(file);
- vma->vm_ops = &ext2_dax_vm_ops;
+ desc->vm_ops = &ext2_dax_vm_ops;
return 0;
}
#else
-#define ext2_file_mmap generic_file_mmap
+#define ext2_file_mmap_prepare generic_file_mmap_prepare
#endif
/*
@@ -316,7 +318,7 @@ const struct file_operations ext2_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ext2_compat_ioctl,
#endif
- .mmap = ext2_file_mmap,
+ .mmap_prepare = ext2_file_mmap_prepare,
.open = ext2_file_open,
.release = ext2_release_file,
.fsync = ext2_fsync,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 30f8201c155f..dbfe9098a124 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -895,9 +895,19 @@ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
int ret;
+ loff_t i_size;
inode_lock(inode);
- len = min_t(u64, len, i_size_read(inode));
+ i_size = i_size_read(inode);
+ /*
+ * iomap_fiemap() returns EINVAL for 0 length. Make sure we don't trim
+ * length to 0 but still trim the range as much as possible since
+ * ext2_get_blocks() iterates unmapped space block by block which is
+ * slow.
+ */
+ if (i_size == 0)
+ i_size = 1;
+ len = min_t(u64, len, i_size);
ret = iomap_fiemap(inode, fieinfo, start, len, &ext2_iomap_ops);
inode_unlock(inode);
@@ -915,7 +925,7 @@ static void ext2_readahead(struct readahead_control *rac)
}
static int
-ext2_write_begin(struct file *file, struct address_space *mapping,
+ext2_write_begin(const struct kiocb *iocb, struct address_space *mapping,
loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
{
int ret;
@@ -926,13 +936,14 @@ ext2_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
-static int ext2_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+static int ext2_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
int ret;
- ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
if (ret < len)
ext2_write_failed(mapping, pos + len);
return ret;
@@ -1387,7 +1398,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
ei = EXT2_I(inode);
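The clamp added to ext2_fiemap() above avoids a corner case: for an empty file, trimming len to i_size would hand iomap_fiemap() a zero length, which it rejects with -EINVAL, so the trim floor is held at one byte. The arithmetic in isolation (a hedged sketch, not the kernel helper):

#include <stdio.h>

static unsigned long long fiemap_len(unsigned long long len,
				     unsigned long long i_size)
{
	if (i_size == 0)
		i_size = 1;             /* never trim the request to zero */
	return len < i_size ? len : i_size;
}

int main(void)
{
	/* empty file: the request survives as 1 byte instead of 0 (-EINVAL) */
	printf("%llu\n", fiemap_len(4096, 0));
	/* normal file: the request is trimmed to i_size as before */
	printf("%llu\n", fiemap_len(1 << 20, 8192));
	return 0;
}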
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 44e04484e570..c3fea55b8efa 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -18,7 +18,7 @@
#include <linux/uaccess.h>
#include <linux/fileattr.h>
-int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int ext2_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct ext2_inode_info *ei = EXT2_I(d_inode(dentry));
@@ -28,7 +28,7 @@ int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
int ext2_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct ext2_inode_info *ei = EXT2_I(inode);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 37f7ce56adce..121e634c792a 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -23,7 +23,8 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/blkdev.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/random.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
@@ -40,7 +41,6 @@
#include "acl.h"
static void ext2_write_super(struct super_block *sb);
-static int ext2_remount (struct super_block * sb, int * flags, char * data);
static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
static int ext2_sync_fs(struct super_block *sb, int wait);
static int ext2_freeze(struct super_block *sb);
@@ -81,6 +81,33 @@ void ext2_error(struct super_block *sb, const char *function,
}
}
+static void ext2_msg_fc(struct fs_context *fc, const char *prefix,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ const char *s_id;
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ s_id = fc->root->d_sb->s_id;
+ } else {
+ /* get last path component of source */
+ s_id = strrchr(fc->source, '/');
+ if (s_id)
+ s_id++;
+ else
+ s_id = fc->source;
+ }
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk("%sEXT2-fs (%s): %pV\n", prefix, s_id, &vaf);
+
+ va_end(args);
+}
+
void ext2_msg(struct super_block *sb, const char *prefix,
const char *fmt, ...)
{
@@ -346,7 +373,6 @@ static const struct super_operations ext2_sops = {
.freeze_fs = ext2_freeze,
.unfreeze_fs = ext2_unfreeze,
.statfs = ext2_statfs,
- .remount_fs = ext2_remount,
.show_options = ext2_show_options,
#ifdef CONFIG_QUOTA
.quota_read = ext2_quota_read,
@@ -402,230 +428,218 @@ static const struct export_operations ext2_export_ops = {
.get_parent = ext2_get_parent,
};
-static unsigned long get_sb_block(void **data)
-{
- unsigned long sb_block;
- char *options = (char *) *data;
-
- if (!options || strncmp(options, "sb=", 3) != 0)
- return 1; /* Default location */
- options += 3;
- sb_block = simple_strtoul(options, &options, 0);
- if (*options && *options != ',') {
- printk("EXT2-fs: Invalid sb specification: %s\n",
- (char *) *data);
- return 1;
- }
- if (*options == ',')
- options++;
- *data = (void *) options;
- return sb_block;
-}
-
enum {
- Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
- Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic,
- Opt_err_ro, Opt_nouid32, Opt_debug,
- Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr,
- Opt_acl, Opt_noacl, Opt_xip, Opt_dax, Opt_ignore, Opt_err, Opt_quota,
- Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation
+ Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid,
+ Opt_sb, Opt_errors, Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
+ Opt_nobh, Opt_user_xattr, Opt_acl, Opt_xip, Opt_dax, Opt_ignore,
+ Opt_quota, Opt_usrquota, Opt_grpquota, Opt_reservation,
};
-static const match_table_t tokens = {
- {Opt_bsd_df, "bsddf"},
- {Opt_minix_df, "minixdf"},
- {Opt_grpid, "grpid"},
- {Opt_grpid, "bsdgroups"},
- {Opt_nogrpid, "nogrpid"},
- {Opt_nogrpid, "sysvgroups"},
- {Opt_resgid, "resgid=%u"},
- {Opt_resuid, "resuid=%u"},
- {Opt_sb, "sb=%u"},
- {Opt_err_cont, "errors=continue"},
- {Opt_err_panic, "errors=panic"},
- {Opt_err_ro, "errors=remount-ro"},
- {Opt_nouid32, "nouid32"},
- {Opt_debug, "debug"},
- {Opt_oldalloc, "oldalloc"},
- {Opt_orlov, "orlov"},
- {Opt_nobh, "nobh"},
- {Opt_user_xattr, "user_xattr"},
- {Opt_nouser_xattr, "nouser_xattr"},
- {Opt_acl, "acl"},
- {Opt_noacl, "noacl"},
- {Opt_xip, "xip"},
- {Opt_dax, "dax"},
- {Opt_grpquota, "grpquota"},
- {Opt_ignore, "noquota"},
- {Opt_quota, "quota"},
- {Opt_usrquota, "usrquota"},
- {Opt_reservation, "reservation"},
- {Opt_noreservation, "noreservation"},
- {Opt_err, NULL}
+static const struct constant_table ext2_param_errors[] = {
+ {"continue", EXT2_MOUNT_ERRORS_CONT},
+ {"panic", EXT2_MOUNT_ERRORS_PANIC},
+ {"remount-ro", EXT2_MOUNT_ERRORS_RO},
+ {}
+};
+
+static const struct fs_parameter_spec ext2_param_spec[] = {
+ fsparam_flag ("bsddf", Opt_bsd_df),
+ fsparam_flag ("minixdf", Opt_minix_df),
+ fsparam_flag ("grpid", Opt_grpid),
+ fsparam_flag ("bsdgroups", Opt_grpid),
+ fsparam_flag ("nogrpid", Opt_nogrpid),
+ fsparam_flag ("sysvgroups", Opt_nogrpid),
+ fsparam_gid ("resgid", Opt_resgid),
+ fsparam_uid ("resuid", Opt_resuid),
+ fsparam_u32 ("sb", Opt_sb),
+ fsparam_enum ("errors", Opt_errors, ext2_param_errors),
+ fsparam_flag ("nouid32", Opt_nouid32),
+ fsparam_flag ("debug", Opt_debug),
+ fsparam_flag ("oldalloc", Opt_oldalloc),
+ fsparam_flag ("orlov", Opt_orlov),
+ fsparam_flag ("nobh", Opt_nobh),
+ fsparam_flag_no ("user_xattr", Opt_user_xattr),
+ fsparam_flag_no ("acl", Opt_acl),
+ fsparam_flag ("xip", Opt_xip),
+ fsparam_flag ("dax", Opt_dax),
+ fsparam_flag ("grpquota", Opt_grpquota),
+ fsparam_flag ("noquota", Opt_ignore),
+ fsparam_flag ("quota", Opt_quota),
+ fsparam_flag ("usrquota", Opt_usrquota),
+ fsparam_flag_no ("reservation", Opt_reservation),
+ {}
+};
+
+#define EXT2_SPEC_s_resuid (1 << 0)
+#define EXT2_SPEC_s_resgid (1 << 1)
+
+struct ext2_fs_context {
+ unsigned long vals_s_flags; /* Bits to set in s_flags */
+ unsigned long mask_s_flags; /* Bits changed in s_flags */
+ unsigned int vals_s_mount_opt;
+ unsigned int mask_s_mount_opt;
+ kuid_t s_resuid;
+ kgid_t s_resgid;
+ unsigned long s_sb_block;
+ unsigned int spec;
};
-static int parse_options(char *options, struct super_block *sb,
- struct ext2_mount_options *opts)
+static inline void ctx_set_mount_opt(struct ext2_fs_context *ctx,
+ unsigned long flag)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
- kuid_t uid;
- kgid_t gid;
-
- if (!options)
- return 1;
-
- while ((p = strsep (&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_bsd_df:
- clear_opt (opts->s_mount_opt, MINIX_DF);
- break;
- case Opt_minix_df:
- set_opt (opts->s_mount_opt, MINIX_DF);
- break;
- case Opt_grpid:
- set_opt (opts->s_mount_opt, GRPID);
- break;
- case Opt_nogrpid:
- clear_opt (opts->s_mount_opt, GRPID);
- break;
- case Opt_resuid:
- if (match_int(&args[0], &option))
- return 0;
- uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(uid)) {
- ext2_msg(sb, KERN_ERR, "Invalid uid value %d", option);
- return 0;
-
- }
- opts->s_resuid = uid;
- break;
- case Opt_resgid:
- if (match_int(&args[0], &option))
- return 0;
- gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(gid)) {
- ext2_msg(sb, KERN_ERR, "Invalid gid value %d", option);
- return 0;
- }
- opts->s_resgid = gid;
- break;
- case Opt_sb:
- /* handled by get_sb_block() instead of here */
- /* *sb_block = match_int(&args[0]); */
- break;
- case Opt_err_panic:
- clear_opt (opts->s_mount_opt, ERRORS_CONT);
- clear_opt (opts->s_mount_opt, ERRORS_RO);
- set_opt (opts->s_mount_opt, ERRORS_PANIC);
- break;
- case Opt_err_ro:
- clear_opt (opts->s_mount_opt, ERRORS_CONT);
- clear_opt (opts->s_mount_opt, ERRORS_PANIC);
- set_opt (opts->s_mount_opt, ERRORS_RO);
- break;
- case Opt_err_cont:
- clear_opt (opts->s_mount_opt, ERRORS_RO);
- clear_opt (opts->s_mount_opt, ERRORS_PANIC);
- set_opt (opts->s_mount_opt, ERRORS_CONT);
- break;
- case Opt_nouid32:
- set_opt (opts->s_mount_opt, NO_UID32);
- break;
- case Opt_debug:
- set_opt (opts->s_mount_opt, DEBUG);
- break;
- case Opt_oldalloc:
- set_opt (opts->s_mount_opt, OLDALLOC);
- break;
- case Opt_orlov:
- clear_opt (opts->s_mount_opt, OLDALLOC);
- break;
- case Opt_nobh:
- ext2_msg(sb, KERN_INFO,
- "nobh option not supported");
- break;
+ ctx->mask_s_mount_opt |= flag;
+ ctx->vals_s_mount_opt |= flag;
+}
+
+static inline void ctx_clear_mount_opt(struct ext2_fs_context *ctx,
+ unsigned long flag)
+{
+ ctx->mask_s_mount_opt |= flag;
+ ctx->vals_s_mount_opt &= ~flag;
+}
+
+static inline unsigned long
+ctx_test_mount_opt(struct ext2_fs_context *ctx, unsigned long flag)
+{
+ return (ctx->vals_s_mount_opt & flag);
+}
+
+static inline bool
+ctx_parsed_mount_opt(struct ext2_fs_context *ctx, unsigned long flag)
+{
+ return (ctx->mask_s_mount_opt & flag);
+}
+
+static void ext2_free_fc(struct fs_context *fc)
+{
+ kfree(fc->fs_private);
+}
+
+static int ext2_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct ext2_fs_context *ctx = fc->fs_private;
+ int opt;
+ struct fs_parse_result result;
+
+ opt = fs_parse(fc, ext2_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_bsd_df:
+ ctx_clear_mount_opt(ctx, EXT2_MOUNT_MINIX_DF);
+ break;
+ case Opt_minix_df:
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_MINIX_DF);
+ break;
+ case Opt_grpid:
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_GRPID);
+ break;
+ case Opt_nogrpid:
+ ctx_clear_mount_opt(ctx, EXT2_MOUNT_GRPID);
+ break;
+ case Opt_resuid:
+ ctx->s_resuid = result.uid;
+ ctx->spec |= EXT2_SPEC_s_resuid;
+ break;
+ case Opt_resgid:
+ ctx->s_resgid = result.gid;
+ ctx->spec |= EXT2_SPEC_s_resgid;
+ break;
+ case Opt_sb:
+ /* Note that this is silently ignored on remount */
+ ctx->s_sb_block = result.uint_32;
+ break;
+ case Opt_errors:
+ ctx_clear_mount_opt(ctx, EXT2_MOUNT_ERRORS_MASK);
+ ctx_set_mount_opt(ctx, result.uint_32);
+ break;
+ case Opt_nouid32:
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_NO_UID32);
+ break;
+ case Opt_debug:
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_DEBUG);
+ break;
+ case Opt_oldalloc:
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_OLDALLOC);
+ break;
+ case Opt_orlov:
+ ctx_clear_mount_opt(ctx, EXT2_MOUNT_OLDALLOC);
+ break;
+ case Opt_nobh:
+ ext2_msg_fc(fc, KERN_INFO, "nobh option not supported\n");
+ break;
#ifdef CONFIG_EXT2_FS_XATTR
- case Opt_user_xattr:
- set_opt (opts->s_mount_opt, XATTR_USER);
- break;
- case Opt_nouser_xattr:
- clear_opt (opts->s_mount_opt, XATTR_USER);
- break;
+ case Opt_user_xattr:
+ if (!result.negated)
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_XATTR_USER);
+ else
+ ctx_clear_mount_opt(ctx, EXT2_MOUNT_XATTR_USER);
+ break;
#else
- case Opt_user_xattr:
- case Opt_nouser_xattr:
- ext2_msg(sb, KERN_INFO, "(no)user_xattr options"
- "not supported");
- break;
+ case Opt_user_xattr:
+ ext2_msg_fc(fc, KERN_INFO, "(no)user_xattr options not supported");
+ break;
#endif
#ifdef CONFIG_EXT2_FS_POSIX_ACL
- case Opt_acl:
- set_opt(opts->s_mount_opt, POSIX_ACL);
- break;
- case Opt_noacl:
- clear_opt(opts->s_mount_opt, POSIX_ACL);
- break;
+ case Opt_acl:
+ if (!result.negated)
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_POSIX_ACL);
+ else
+ ctx_clear_mount_opt(ctx, EXT2_MOUNT_POSIX_ACL);
+ break;
#else
- case Opt_acl:
- case Opt_noacl:
- ext2_msg(sb, KERN_INFO,
- "(no)acl options not supported");
- break;
+ case Opt_acl:
+ ext2_msg_fc(fc, KERN_INFO, "(no)acl options not supported");
+ break;
#endif
- case Opt_xip:
- ext2_msg(sb, KERN_INFO, "use dax instead of xip");
- set_opt(opts->s_mount_opt, XIP);
- fallthrough;
- case Opt_dax:
+ case Opt_xip:
+ ext2_msg_fc(fc, KERN_INFO, "use dax instead of xip");
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_XIP);
+ fallthrough;
+ case Opt_dax:
#ifdef CONFIG_FS_DAX
- ext2_msg(sb, KERN_WARNING,
- "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- set_opt(opts->s_mount_opt, DAX);
+ ext2_msg_fc(fc, KERN_WARNING,
+ "DAX enabled. Warning: DAX support in ext2 driver is deprecated"
+ " and will be removed at the end of 2025. Please use ext4 driver instead.");
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_DAX);
#else
- ext2_msg(sb, KERN_INFO, "dax option not supported");
+ ext2_msg_fc(fc, KERN_INFO, "dax option not supported");
#endif
- break;
+ break;
#if defined(CONFIG_QUOTA)
- case Opt_quota:
- case Opt_usrquota:
- set_opt(opts->s_mount_opt, USRQUOTA);
- break;
-
- case Opt_grpquota:
- set_opt(opts->s_mount_opt, GRPQUOTA);
- break;
+ case Opt_quota:
+ case Opt_usrquota:
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_USRQUOTA);
+ break;
+
+ case Opt_grpquota:
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_GRPQUOTA);
+ break;
#else
- case Opt_quota:
- case Opt_usrquota:
- case Opt_grpquota:
- ext2_msg(sb, KERN_INFO,
- "quota operations not supported");
- break;
+ case Opt_quota:
+ case Opt_usrquota:
+ case Opt_grpquota:
+ ext2_msg_fc(fc, KERN_INFO, "quota operations not supported");
+ break;
#endif
-
- case Opt_reservation:
- set_opt(opts->s_mount_opt, RESERVATION);
- ext2_msg(sb, KERN_INFO, "reservations ON");
- break;
- case Opt_noreservation:
- clear_opt(opts->s_mount_opt, RESERVATION);
- ext2_msg(sb, KERN_INFO, "reservations OFF");
- break;
- case Opt_ignore:
- break;
- default:
- return 0;
+ case Opt_reservation:
+ if (!result.negated) {
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_RESERVATION);
+ ext2_msg_fc(fc, KERN_INFO, "reservations ON");
+ } else {
+ ctx_clear_mount_opt(ctx, EXT2_MOUNT_RESERVATION);
+ ext2_msg_fc(fc, KERN_INFO, "reservations OFF");
}
+ break;
+ case Opt_ignore:
+ break;
+ default:
+ return -EINVAL;
}
- return 1;
+ return 0;
}
static int ext2_setup_super (struct super_block * sb,
@@ -801,24 +815,83 @@ static unsigned long descriptor_loc(struct super_block *sb,
return ext2_group_first_block_no(sb, bg) + ext2_bg_has_super(sb, bg);
}
-static int ext2_fill_super(struct super_block *sb, void *data, int silent)
+/*
+ * Set all mount options either from defaults on disk, or from parsed
+ * options. Parsed/specified options override on-disk defaults.
+ */
+static void ext2_set_options(struct fs_context *fc, struct ext2_sb_info *sbi)
+{
+ struct ext2_fs_context *ctx = fc->fs_private;
+ struct ext2_super_block *es = sbi->s_es;
+ unsigned long def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+
+ /* Copy parsed mount options to sbi */
+ sbi->s_mount_opt = ctx->vals_s_mount_opt;
+
+ /* Use in-superblock defaults only if not specified during parsing */
+ if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_DEBUG) &&
+ def_mount_opts & EXT2_DEFM_DEBUG)
+ set_opt(sbi->s_mount_opt, DEBUG);
+
+ if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_GRPID) &&
+ def_mount_opts & EXT2_DEFM_BSDGROUPS)
+ set_opt(sbi->s_mount_opt, GRPID);
+
+ if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_NO_UID32) &&
+ def_mount_opts & EXT2_DEFM_UID16)
+ set_opt(sbi->s_mount_opt, NO_UID32);
+
+#ifdef CONFIG_EXT2_FS_XATTR
+ if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_XATTR_USER) &&
+ def_mount_opts & EXT2_DEFM_XATTR_USER)
+ set_opt(sbi->s_mount_opt, XATTR_USER);
+#endif
+#ifdef CONFIG_EXT2_FS_POSIX_ACL
+ if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_POSIX_ACL) &&
+ def_mount_opts & EXT2_DEFM_ACL)
+ set_opt(sbi->s_mount_opt, POSIX_ACL);
+#endif
+
+ if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_ERRORS_MASK)) {
+ if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_PANIC)
+ set_opt(sbi->s_mount_opt, ERRORS_PANIC);
+ else if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_CONTINUE)
+ set_opt(sbi->s_mount_opt, ERRORS_CONT);
+ else
+ set_opt(sbi->s_mount_opt, ERRORS_RO);
+ }
+
+ if (ctx->spec & EXT2_SPEC_s_resuid)
+ sbi->s_resuid = ctx->s_resuid;
+ else
+ sbi->s_resuid = make_kuid(&init_user_ns,
+ le16_to_cpu(es->s_def_resuid));
+
+ if (ctx->spec & EXT2_SPEC_s_resgid)
+ sbi->s_resgid = ctx->s_resgid;
+ else
+ sbi->s_resgid = make_kgid(&init_user_ns,
+ le16_to_cpu(es->s_def_resgid));
+}
+
+static int ext2_fill_super(struct super_block *sb, struct fs_context *fc)
{
+ struct ext2_fs_context *ctx = fc->fs_private;
+ int silent = fc->sb_flags & SB_SILENT;
struct buffer_head * bh;
struct ext2_sb_info * sbi;
struct ext2_super_block * es;
struct inode *root;
unsigned long block;
- unsigned long sb_block = get_sb_block(&data);
+ unsigned long sb_block = ctx->s_sb_block;
unsigned long logic_sb_block;
unsigned long offset = 0;
- unsigned long def_mount_opts;
long ret = -ENOMEM;
int blocksize = BLOCK_SIZE;
int db_count;
int i, j;
__le32 features;
int err;
- struct ext2_mount_options opts;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
@@ -877,42 +950,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
if (sb->s_magic != EXT2_SUPER_MAGIC)
goto cantfind_ext2;
- opts.s_mount_opt = 0;
- /* Set defaults before we parse the mount options */
- def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
- if (def_mount_opts & EXT2_DEFM_DEBUG)
- set_opt(opts.s_mount_opt, DEBUG);
- if (def_mount_opts & EXT2_DEFM_BSDGROUPS)
- set_opt(opts.s_mount_opt, GRPID);
- if (def_mount_opts & EXT2_DEFM_UID16)
- set_opt(opts.s_mount_opt, NO_UID32);
-#ifdef CONFIG_EXT2_FS_XATTR
- if (def_mount_opts & EXT2_DEFM_XATTR_USER)
- set_opt(opts.s_mount_opt, XATTR_USER);
-#endif
-#ifdef CONFIG_EXT2_FS_POSIX_ACL
- if (def_mount_opts & EXT2_DEFM_ACL)
- set_opt(opts.s_mount_opt, POSIX_ACL);
-#endif
-
- if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_PANIC)
- set_opt(opts.s_mount_opt, ERRORS_PANIC);
- else if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_CONTINUE)
- set_opt(opts.s_mount_opt, ERRORS_CONT);
- else
- set_opt(opts.s_mount_opt, ERRORS_RO);
-
- opts.s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
- opts.s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
-
- set_opt(opts.s_mount_opt, RESERVATION);
-
- if (!parse_options((char *) data, sb, &opts))
- goto failed_mount;
-
- sbi->s_mount_opt = opts.s_mount_opt;
- sbi->s_resuid = opts.s_resuid;
- sbi->s_resgid = opts.s_resgid;
+ ext2_set_options(fc, sbi);
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
@@ -1324,23 +1362,21 @@ static void ext2_write_super(struct super_block *sb)
ext2_sync_fs(sb, 1);
}
-static int ext2_remount (struct super_block * sb, int * flags, char * data)
+static int ext2_reconfigure(struct fs_context *fc)
{
+ struct ext2_fs_context *ctx = fc->fs_private;
+ struct super_block *sb = fc->root->d_sb;
struct ext2_sb_info * sbi = EXT2_SB(sb);
struct ext2_super_block * es;
struct ext2_mount_options new_opts;
+ int flags = fc->sb_flags;
int err;
sync_filesystem(sb);
- spin_lock(&sbi->s_lock);
- new_opts.s_mount_opt = sbi->s_mount_opt;
- new_opts.s_resuid = sbi->s_resuid;
- new_opts.s_resgid = sbi->s_resgid;
- spin_unlock(&sbi->s_lock);
-
- if (!parse_options(data, sb, &new_opts))
- return -EINVAL;
+ new_opts.s_mount_opt = ctx->vals_s_mount_opt;
+ new_opts.s_resuid = ctx->s_resuid;
+ new_opts.s_resgid = ctx->s_resgid;
spin_lock(&sbi->s_lock);
es = sbi->s_es;
@@ -1349,9 +1385,9 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
"dax flag with busy inodes while remounting");
new_opts.s_mount_opt ^= EXT2_MOUNT_DAX;
}
- if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
+ if ((bool)(flags & SB_RDONLY) == sb_rdonly(sb))
goto out_set;
- if (*flags & SB_RDONLY) {
+ if (flags & SB_RDONLY) {
if (le16_to_cpu(es->s_state) & EXT2_VALID_FS ||
!(sbi->s_mount_state & EXT2_VALID_FS))
goto out_set;
@@ -1470,10 +1506,9 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
return 0;
}
-static struct dentry *ext2_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int ext2_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
+ return get_tree_bdev(fc, ext2_fill_super);
}
#ifdef CONFIG_QUOTA
@@ -1556,7 +1591,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
}
lock_buffer(bh);
memcpy(bh->b_data+offset, data, tocopy);
- flush_dcache_page(bh->b_page);
+ flush_dcache_folio(bh->b_folio);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
unlock_buffer(bh);
@@ -1624,12 +1659,49 @@ out:
#endif
+static const struct fs_context_operations ext2_context_ops = {
+ .parse_param = ext2_parse_param,
+ .get_tree = ext2_get_tree,
+ .reconfigure = ext2_reconfigure,
+ .free = ext2_free_fc,
+};
+
+static int ext2_init_fs_context(struct fs_context *fc)
+{
+ struct ext2_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ struct super_block *sb = fc->root->d_sb;
+ struct ext2_sb_info *sbi = EXT2_SB(sb);
+
+ spin_lock(&sbi->s_lock);
+ ctx->vals_s_mount_opt = sbi->s_mount_opt;
+ ctx->vals_s_flags = sb->s_flags;
+ ctx->s_resuid = sbi->s_resuid;
+ ctx->s_resgid = sbi->s_resgid;
+ spin_unlock(&sbi->s_lock);
+ } else {
+ ctx->s_sb_block = 1;
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_RESERVATION);
+ }
+
+ fc->fs_private = ctx;
+ fc->ops = &ext2_context_ops;
+
+ return 0;
+}
+
static struct file_system_type ext2_fs_type = {
.owner = THIS_MODULE,
.name = "ext2",
- .mount = ext2_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = ext2_init_fs_context,
+ .parameters = ext2_param_spec,
};
MODULE_ALIAS_FS("ext2");
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index c9ca41d91a6c..01873c2a34ad 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -1,31 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
-# Ext3 configs are here for backward compatibility with old configs which may
-# have EXT3_FS set but not EXT4_FS set and thus would result in non-bootable
-# kernels after the removal of ext3 driver.
-config EXT3_FS
- tristate "The Extended 3 (ext3) filesystem"
- select EXT4_FS
- help
- This config option is here only for backward compatibility. ext3
- filesystem is now handled by the ext4 driver.
-
-config EXT3_FS_POSIX_ACL
- bool "Ext3 POSIX Access Control Lists"
- depends on EXT3_FS
- select EXT4_FS_POSIX_ACL
- select FS_POSIX_ACL
- help
- This config option is here only for backward compatibility. ext3
- filesystem is now handled by the ext4 driver.
-
-config EXT3_FS_SECURITY
- bool "Ext3 Security Labels"
- depends on EXT3_FS
- select EXT4_FS_SECURITY
- help
- This config option is here only for backward compatibility. ext3
- filesystem is now handled by the ext4 driver.
-
config EXT4_FS
tristate "The Extended 4 (ext4) filesystem"
select BUFFER_HEAD
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index c48fd36b2d74..8040c731b3e4 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -703,7 +703,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
* possible we just missed a transaction commit that did so
*/
smp_mb();
- if (sbi->s_mb_free_pending == 0) {
+ if (atomic_read(&sbi->s_mb_free_pending) == 0) {
if (test_opt(sb, DISCARD)) {
atomic_inc(&sbi->s_retry_alloc_pending);
flush_work(&sbi->s_discard_work);
@@ -752,7 +752,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
*count = ar.len;
/*
* Account for the allocated meta blocks. We will never
- * fail EDQUOT for metdata, but we do account for it.
+ * fail EDQUOT for metadata, but we do account for it.
*/
if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) {
dquot_alloc_block_nofail(inode,
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index a4dbaccee6e7..87760fabdd2e 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -30,7 +30,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb,
sz = EXT4_INODES_PER_GROUP(sb) >> 3;
provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
- calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) {
hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi);
provided |= (hi << 16);
@@ -52,7 +52,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb,
return;
sz = EXT4_INODES_PER_GROUP(sb) >> 3;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END)
gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16);
@@ -71,7 +71,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb,
return 1;
provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
- calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) {
hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi);
provided |= (hi << 16);
@@ -92,7 +92,7 @@ void ext4_block_bitmap_csum_set(struct super_block *sb,
if (!ext4_has_feature_metadata_csum(sb))
return;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END)
gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16);
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 87ee3a17bd29..e8c5525afc67 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -351,10 +351,9 @@ int ext4_check_blockref(const char *function, unsigned int line,
{
__le32 *bref = p;
unsigned int blk;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- if (ext4_has_feature_journal(inode->i_sb) &&
- (inode->i_ino ==
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+ if (journal && inode == journal->j_inode)
return 0;
while (bref < p+max) {
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 0a056d97e640..cf0a0970c095 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -227,6 +227,8 @@ static bool ext4_has_stable_inodes(struct super_block *sb)
}
const struct fscrypt_operations ext4_cryptops = {
+ .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_crypt_info) -
+ (int)offsetof(struct ext4_inode_info, vfs_inode),
.needs_bounce_pages = 1,
.has_32bit_inodes = 1,
.supports_subblock_data_units = 1,
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index d4164c507a90..256fe2c1d4c1 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -192,13 +192,13 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
continue;
}
if (err > 0) {
- pgoff_t index = map.m_pblk >>
- (PAGE_SHIFT - inode->i_blkbits);
+ pgoff_t index = map.m_pblk << inode->i_blkbits >>
+ PAGE_SHIFT;
if (!ra_has_index(&file->f_ra, index))
page_cache_sync_readahead(
sb->s_bdev->bd_mapping,
- &file->f_ra, file,
- index, 1);
+ &file->f_ra, file, index,
+ 1 << EXT4_SB(sb)->s_min_folio_order);
file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
bh = ext4_bread(NULL, inode, map.m_lblk, 0);
if (IS_ERR(bh)) {
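The reordered shift in ext4_readdir() matters once a block can be larger than a page: the old pblk >> (PAGE_SHIFT - blkbits) has a negative shift count (undefined behaviour) whenever blkbits exceeds PAGE_SHIFT, while pblk << blkbits >> PAGE_SHIFT goes through a byte offset and stays well defined either way; this appears to line up with the s_min_folio_order readahead count also added here. Worked in isolation (shift values are illustrative):

#include <stdio.h>

#define PAGE_SHIFT 12

/* block number -> page index, safe for any block size */
static unsigned long long blk_to_page(unsigned long long pblk,
				      unsigned blkbits)
{
	return pblk << blkbits >> PAGE_SHIFT;
}

int main(void)
{
	/* 4K blocks on 4K pages: identity */
	printf("%llu\n", blk_to_page(7, 12));
	/* 16K blocks: one block spans 4 pages, block 7 starts at page 28;
	 * the old (PAGE_SHIFT - blkbits) form would shift by -2 here */
	printf("%llu\n", blk_to_page(7, 14));
	return 0;
}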
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5a20e9cd7184..56112f201cac 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -157,7 +157,7 @@ enum criteria {
/*
* Reads each block group sequentially, performing disk IO if
- * necessary, to find find_suitable block group. Tries to
+ * necessary, to find a suitable block group. Tries to
* allocate goal length but might trim the request if nothing
* is found after enough tries.
*/
@@ -185,14 +185,8 @@ enum criteria {
/* prefer goal again. length */
#define EXT4_MB_HINT_MERGE 0x0001
-/* blocks already reserved */
-#define EXT4_MB_HINT_RESERVED 0x0002
-/* metadata is being allocated */
-#define EXT4_MB_HINT_METADATA 0x0004
/* first blocks in the file */
#define EXT4_MB_HINT_FIRST 0x0008
-/* search for the best chunk */
-#define EXT4_MB_HINT_BEST 0x0010
/* data is being allocated */
#define EXT4_MB_HINT_DATA 0x0020
/* don't preallocate (for tails) */
@@ -213,15 +207,6 @@ enum criteria {
#define EXT4_MB_USE_RESERVED 0x2000
/* Do strict check for free blocks while retrying block allocation */
#define EXT4_MB_STRICT_CHECK 0x4000
-/* Large fragment size list lookup succeeded at least once for
- * CR_POWER2_ALIGNED */
-#define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000
-/* Avg fragment size rb tree lookup succeeded at least once for
- * CR_GOAL_LEN_FAST */
-#define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000
-/* Avg fragment size rb tree lookup succeeded at least once for
- * CR_BEST_AVAIL_LEN */
-#define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000
struct ext4_allocation_request {
/* target inode for block we're allocating */
@@ -256,15 +241,26 @@ struct ext4_allocation_request {
#define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten)
#define EXT4_MAP_BOUNDARY BIT(BH_Boundary)
#define EXT4_MAP_DELAYED BIT(BH_Delay)
+/*
+ * This is for use in ext4_map_query_blocks() for a special case where we can
+ * have a physically and logically contiguous blocks split across two leaf
+ * nodes instead of a single extent. This is required in case of atomic writes
+ * to know whether the returned extent is last in leaf. If yes, then lookup for
+ * next in leaf block in ext4_map_query_blocks_next_in_leaf().
+ * - This is never going to be added to any buffer head state.
+ * - We use the next available bit after BH_BITMAP_UPTODATE.
+ */
+#define EXT4_MAP_QUERY_LAST_IN_LEAF BIT(BH_BITMAP_UPTODATE + 1)
#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
- EXT4_MAP_DELAYED)
+ EXT4_MAP_DELAYED | EXT4_MAP_QUERY_LAST_IN_LEAF)
struct ext4_map_blocks {
ext4_fsblk_t m_pblk;
ext4_lblk_t m_lblk;
unsigned int m_len;
unsigned int m_flags;
+ u64 m_seq;
};
/*
@@ -372,7 +368,14 @@ struct ext4_io_submit {
blkbits))
#define EXT4_B_TO_LBLK(inode, offset) \
(round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits)
-
+#define EXT4_LBLK_TO_B(inode, lblk) ((loff_t)(lblk) << (inode)->i_blkbits)
+
+/* Translate a block number to a page index */
+#define EXT4_LBLK_TO_PG(inode, lblk) (EXT4_LBLK_TO_B((inode), (lblk)) >> \
+ PAGE_SHIFT)
+/* Translate a page index to a block number */
+#define EXT4_PG_TO_LBLK(inode, pnum) (((loff_t)(pnum) << PAGE_SHIFT) >> \
+ (inode)->i_blkbits)
/* Translate a block number to a cluster number */
#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits)
/* Translate a cluster number to a block number */
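A quick worked example of the new translation macros (assuming i_blkbits == 10, i.e. 1 KiB blocks, and PAGE_SHIFT == 12):

	/*
	 * EXT4_LBLK_TO_B(inode, 8)  -> 8 << 10         = 8192 bytes
	 * EXT4_LBLK_TO_PG(inode, 8) -> 8192 >> 12      = page index 2
	 * EXT4_PG_TO_LBLK(inode, 2) -> (2 << 12) >> 10 = block 8
	 */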
@@ -699,16 +702,22 @@ enum {
/* Caller is from the delayed allocation writeout path
* finally doing the actual allocation of delayed blocks */
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
- /* caller is from the direct IO path, request to creation of an
- unwritten extents if not allocated, split the unwritten
- extent if blocks has been preallocated already*/
-#define EXT4_GET_BLOCKS_PRE_IO 0x0008
-#define EXT4_GET_BLOCKS_CONVERT 0x0010
-#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\
- EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
- /* Convert extent to initialized after IO complete */
-#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
+ /*
+ * This means that we cannot merge newly allocated extents, and if we
+ * found an unwritten extent, we need to split it.
+ */
+#define EXT4_GET_BLOCKS_SPLIT_NOMERGE 0x0008
+ /*
+ * Caller is from dio or dioread_nolock buffered IO; request to
+ * create an unwritten extent if it does not exist, or split a
+ * found unwritten extent. Also do not merge the newly created
+ * unwritten extent; the io end will convert it to written and
+ * then try to merge the written extent.
+ */
+#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_SPLIT_NOMERGE|\
EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
+ /* Convert unwritten extent to initialized. */
+#define EXT4_GET_BLOCKS_CONVERT 0x0010
/* Eventual metadata allocation (due to growing extent tree)
* should not fail, so try to use reserved blocks for that.*/
#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020
@@ -720,11 +729,23 @@ enum {
#define EXT4_GET_BLOCKS_ZERO 0x0200
#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\
EXT4_GET_BLOCKS_ZERO)
- /* Caller will submit data before dropping transaction handle. This
- * allows jbd2 to avoid submitting data before commit. */
+ /* Caller is in the context of data submission, such as writeback,
+ * fsync, etc. In particular, in the generic writeback path, the caller will
+ * submit data before dropping transaction handle. This allows jbd2
+ * to avoid submitting data before commit. */
#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400
+ /* Convert extent to initialized after IO complete */
+#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT |\
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |\
+ EXT4_GET_BLOCKS_IO_SUBMIT)
 /* Caller is in the atomic context, find extent if it has been cached */
#define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800
+/*
+ * The atomic write caller needs this to query in the slow path of the mixed
+ * mapping case, when a contiguous extent may be split across two adjacent
+ * leaf nodes. See EXT4_MAP_QUERY_LAST_IN_LEAF.
+ */
+#define EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF 0x1000
/*
* The bit position of these flags must not overlap with any of the
@@ -738,6 +759,13 @@ enum {
#define EXT4_EX_NOCACHE 0x40000000
#define EXT4_EX_FORCE_CACHE 0x20000000
#define EXT4_EX_NOFAIL 0x10000000
+/*
+ * ext4_map_query_blocks() uses this filter mask to select the flags that need
+ * to be passed on while looking up or querying the on-disk extent tree.
+ */
+#define EXT4_EX_QUERY_FILTER (EXT4_EX_NOCACHE | EXT4_EX_FORCE_CACHE |\
+ EXT4_EX_NOFAIL |\
+ EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF)
/*
* Flags used by ext4_free_blocks
@@ -1061,16 +1089,16 @@ struct ext4_inode_info {
/* End of lblk range that needs to be committed in this fast commit */
ext4_lblk_t i_fc_lblk_len;
- /* Number of ongoing updates on this inode */
- atomic_t i_fc_updates;
-
spinlock_t i_raw_lock; /* protects updates to the raw inode */
/* Fast commit wait queue for this inode */
wait_queue_head_t i_fc_wait;
- /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */
- struct mutex i_fc_lock;
+ /*
+ * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len
+ * and inode's EXT4_FC_STATE_COMMITTING state bit.
+ */
+ spinlock_t i_fc_lock;
/*
* i_disksize keeps track of what the inode size is ON DISK, not
@@ -1127,6 +1155,8 @@ struct ext4_inode_info {
ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for
extents to shrink. Protected by
i_es_lock */
+ u64 i_es_seq; /* Change counter for extents.
+ Protected by i_es_lock */
/* ialloc */
ext4_group_t i_last_alloc_group;
@@ -1171,6 +1201,14 @@ struct ext4_inode_info {
__u32 i_csum_seed;
kprojid_t i_projid;
+
+#ifdef CONFIG_FS_ENCRYPTION
+ struct fscrypt_inode_info *i_crypt_info;
+#endif
+
+#ifdef CONFIG_FS_VERITY
+ struct fsverity_info *i_verity_info;
+#endif
};
/*
@@ -1431,7 +1469,9 @@ struct ext4_super_block {
__le16 s_encoding; /* Filename charset encoding */
__le16 s_encoding_flags; /* Filename charset encoding flags */
__le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */
- __le32 s_reserved[94]; /* Padding to the end of the block */
+ __le16 s_def_resuid_hi;
+ __le16 s_def_resgid_hi;
+ __le32 s_reserved[93]; /* Padding to the end of the block */
__le32 s_checksum; /* crc32c(superblock) */
};
@@ -1582,16 +1622,14 @@ struct ext4_sb_info {
unsigned short *s_mb_offsets;
unsigned int *s_mb_maxs;
unsigned int s_group_info_size;
- unsigned int s_mb_free_pending;
+ atomic_t s_mb_free_pending;
struct list_head s_freed_data_list[2]; /* List of blocks to be freed
after commit completed */
struct list_head s_discard_list;
struct work_struct s_discard_work;
atomic_t s_retry_alloc_pending;
- struct list_head *s_mb_avg_fragment_size;
- rwlock_t *s_mb_avg_fragment_size_locks;
- struct list_head *s_mb_largest_free_orders;
- rwlock_t *s_mb_largest_free_orders_locks;
+ struct xarray *s_mb_avg_fragment_size;
+ struct xarray *s_mb_largest_free_orders;
/* tunables */
unsigned long s_stripe;
@@ -1603,15 +1641,16 @@ struct ext4_sb_info {
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
unsigned int s_max_dir_size_kb;
- /* where last allocation was done - for stream allocation */
- unsigned long s_mb_last_group;
- unsigned long s_mb_last_start;
unsigned int s_mb_prefetch;
unsigned int s_mb_prefetch_limit;
unsigned int s_mb_best_avail_max_trim_order;
unsigned int s_sb_update_sec;
unsigned int s_sb_update_kb;
+ /* where last allocation was done - for stream allocation */
+ ext4_group_t *s_mb_last_groups;
+ unsigned int s_mb_nr_global_goals;
+
/* stats for buddy allocator */
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
atomic_t s_bal_success; /* we found long enough chunks */
@@ -1620,12 +1659,10 @@ struct ext4_sb_info {
atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */
atomic_t s_bal_groups_scanned; /* number of groups scanned */
atomic_t s_bal_goals; /* goal hits */
+ atomic_t s_bal_stream_goals; /* stream allocation global goal hits */
atomic_t s_bal_len_goals; /* len goal hits */
atomic_t s_bal_breaks; /* too long searches */
atomic_t s_bal_2orders; /* 2^order hits */
- atomic_t s_bal_p2_aligned_bad_suggestions;
- atomic_t s_bal_goal_fast_bad_suggestions;
- atomic_t s_bal_best_avail_bad_suggestions;
atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS];
atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS];
atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */
@@ -1667,6 +1704,11 @@ struct ext4_sb_info {
/* record the last minlen when FITRIM is called. */
unsigned long s_last_trim_minblks;
+ /* minimum folio order of a page cache allocation */
+ u16 s_min_folio_order;
+ /* supported maximum folio order, 0 means not supported */
+ u16 s_max_folio_order;
+
/* Precomputed FS UUID checksum for seeding other checksums */
__u32 s_csum_seed;
@@ -1754,7 +1796,7 @@ struct ext4_sb_info {
* following fields:
* ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh.
*/
- spinlock_t s_fc_lock;
+ struct mutex s_fc_lock;
struct buffer_head *s_fc_bh;
struct ext4_fc_stats s_fc_stats;
tid_t s_fc_ineligible_tid;
@@ -1804,6 +1846,18 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
}
+static inline int ext4_get_resuid(struct ext4_super_block *es)
+{
+ return le16_to_cpu(es->s_def_resuid) |
+ le16_to_cpu(es->s_def_resuid_hi) << 16;
+}
+
+static inline int ext4_get_resgid(struct ext4_super_block *es)
+{
+ return le16_to_cpu(es->s_def_resgid) |
+ le16_to_cpu(es->s_def_resgid_hi) << 16;
+}
+
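The hi/lo split extends the reserved uid/gid to 32 bits while keeping the legacy 16-bit fields in place. A sketch of the store direction, with a hypothetical helper name that is not part of the patch:

	/* Hypothetical inverse of ext4_get_resuid(): split a 32-bit uid
	 * across the legacy lo field and the new hi field. */
	static inline void ext4_set_resuid(struct ext4_super_block *es, u32 uid)
	{
		es->s_def_resuid    = cpu_to_le16(uid & 0xffff);
		es->s_def_resuid_hi = cpu_to_le16(uid >> 16);
	}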
/*
* Returns: sbi->field[index]
* Used to access an array element from the following sbi fields which require
@@ -1913,6 +1967,7 @@ enum {
EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */
EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */
+ EXT4_STATE_FC_FLUSHING_DATA, /* Fast commit flushing data */
EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */
};
@@ -1973,6 +2028,16 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
/*
+ * Check whether the inode is tracked as orphan (either in orphan file or
+ * orphan list).
+ */
+static inline bool ext4_inode_orphan_tracked(struct inode *inode)
+{
+ return ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) ||
+ !list_empty(&EXT4_I(inode)->i_orphan);
+}
+
+/*
* Codes for operating systems
*/
#define EXT4_OS_LINUX 0
@@ -2295,10 +2360,12 @@ static inline int ext4_emergency_state(struct super_block *sb)
#define EXT4_DEFM_NODELALLOC 0x0800
/*
- * Default journal batch times
+ * Default journal batch times and ioprio.
*/
#define EXT4_DEF_MIN_BATCH_TIME 0
#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
+#define EXT4_DEF_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+
/*
* Default values for superblock update
@@ -2429,28 +2496,19 @@ static inline unsigned int ext4_dir_rec_len(__u8 name_len,
return (rec_len & ~EXT4_DIR_ROUND);
}
-/*
- * If we ever get support for fs block sizes > page_size, we'll need
- * to remove the #if statements in the next two functions...
- */
static inline unsigned int
ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
{
unsigned len = le16_to_cpu(dlen);
-#if (PAGE_SIZE >= 65536)
if (len == EXT4_MAX_REC_LEN || len == 0)
return blocksize;
return (len & 65532) | ((len & 3) << 16);
-#else
- return len;
-#endif
}
static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
{
BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3));
-#if (PAGE_SIZE >= 65536)
if (len < 65536)
return cpu_to_le16(len);
if (len == blocksize) {
@@ -2460,9 +2518,6 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
return cpu_to_le16(0);
}
return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
-#else
- return cpu_to_le16(len);
-#endif
}
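To make the encoding concrete, a worked round trip under the stated limits (rec_len is always a multiple of 4, so the low two bits are free to carry the high-order length bits; assuming a 128 KiB block size):

	/*
	 * encode 70000: (70000 & 65532) | ((70000 >> 16) & 3) = 4464 | 1 = 4465
	 * decode 4465:  (4465 & 65532) | ((4465 & 3) << 16)   = 4464 + 65536
	 *                                                      = 70000
	 */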
/*
@@ -2487,8 +2542,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
#define DX_HASH_SIPHASH 6
#define DX_HASH_LAST DX_HASH_SIPHASH
-static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
- const void *address, unsigned int length)
+static inline u32 ext4_chksum(u32 crc, const void *address, unsigned int length)
{
return crc32c(crc, address, length);
}
@@ -2922,8 +2976,6 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle);
-void ext4_fc_start_update(struct inode *inode);
-void ext4_fc_stop_update(struct inode *inode);
void ext4_fc_del(struct inode *inode);
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block);
void ext4_fc_replay_cleanup(struct super_block *sb);
@@ -2973,6 +3025,7 @@ static inline bool ext4_mb_cr_expensive(enum criteria cr)
void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei);
int ext4_inode_is_fast_symlink(struct inode *inode);
+void ext4_check_map_extents_env(struct inode *inode);
struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
@@ -2993,6 +3046,7 @@ int ext4_walk_page_buffers(handle_t *handle,
struct buffer_head *bh));
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
+void ext4_set_inode_mapping_order(struct inode *inode);
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA 2
@@ -3036,9 +3090,11 @@ extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
-extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
+extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks);
+extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
@@ -3050,6 +3106,17 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
ext4_fsblk_t pblk, ext4_lblk_t len);
+static inline bool is_special_ino(struct super_block *sb, unsigned long ino)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ return (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
+ ino == le32_to_cpu(es->s_usr_quota_inum) ||
+ ino == le32_to_cpu(es->s_grp_quota_inum) ||
+ ino == le32_to_cpu(es->s_prj_quota_inum) ||
+ ino == le32_to_cpu(es->s_orphan_file_inum);
+}
+
/* indirect.c */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
@@ -3062,8 +3129,8 @@ extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
int ext4_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
-int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
+int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
extern void ext4_reset_inode_seed(struct inode *inode);
int ext4_update_overhead(struct super_block *sb, bool force);
int ext4_force_shutdown(struct super_block *sb, u32 flags);
@@ -3111,6 +3178,8 @@ extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
sector_t block, blk_opf_t op_flags);
extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
sector_t block);
+extern struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb,
+ sector_t block);
extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
bh_end_io_t *end_io, bool simu_fail);
extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
@@ -3119,8 +3188,7 @@ extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wa
extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block);
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
-extern __le32 ext4_superblock_csum(struct super_block *sb,
- struct ext4_super_block *es);
+extern __le32 ext4_superblock_csum(struct ext4_super_block *es);
extern void ext4_superblock_csum_set(struct super_block *sb);
extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
@@ -3378,6 +3446,13 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
return 1 << sbi->s_log_groups_per_flex;
}
+static inline loff_t ext4_get_maxbytes(struct inode *inode)
+{
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return inode->i_sb->s_maxbytes;
+ return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+}
+
#define ext4_std_error(sb, errno) \
do { \
if ((errno)) \
@@ -3442,8 +3517,6 @@ struct ext4_group_info {
void *bb_bitmap;
#endif
struct rw_semaphore alloc_sem;
- struct list_head bb_avg_fragment_size_node;
- struct list_head bb_largest_free_order_node;
ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
* regions, index is order.
* bb_counters[3] = 5 means
@@ -3494,23 +3567,28 @@ static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
}
+static inline bool ext4_try_lock_group(struct super_block *sb, ext4_group_t group)
+{
+ if (!spin_trylock(ext4_group_lock_ptr(sb, group)))
+ return false;
+ /*
+ * We're able to grab the lock right away, so drop the lock
+ * contention counter.
+ */
+ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
+ return true;
+}
+
static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
{
- spinlock_t *lock = ext4_group_lock_ptr(sb, group);
- if (spin_trylock(lock))
- /*
- * We're able to grab the lock right away, so drop the
- * lock contention counter.
- */
- atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
- else {
+ if (!ext4_try_lock_group(sb, group)) {
/*
* The lock is busy, so bump the contention counter,
* and then wait on the spin lock.
*/
atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
EXT4_MAX_CONTENTION);
- spin_lock(lock);
+ spin_lock(ext4_group_lock_ptr(sb, group));
}
}
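A hedged sketch of how a caller might use the new try-lock helper to prefer uncontended groups instead of blocking; this loop is illustrative and not taken from the patch:

	/* Illustrative scan: skip contended groups on the first pass. */
	for (group = start; group < ngroups; group++) {
		if (!ext4_try_lock_group(sb, group))
			continue;	/* busy: try the next group */
		/* ... inspect this group's buddy state ... */
		ext4_unlock_group(sb, group);
	}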
@@ -3565,6 +3643,7 @@ extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
extern int ext4_get_max_inline_size(struct inode *inode);
extern int ext4_find_inline_data_nolock(struct inode *inode);
extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
+extern void ext4_update_final_de(void *de_buf, int old_size, int new_size);
int ext4_readpage_inline(struct inode *inode, struct folio *folio);
extern int ext4_try_to_write_inline_data(struct address_space *mapping,
@@ -3624,10 +3703,10 @@ static inline int ext4_has_inline_data(struct inode *inode)
extern const struct inode_operations ext4_dir_inode_operations;
extern const struct inode_operations ext4_special_inode_operations;
extern struct dentry *ext4_get_parent(struct dentry *child);
-extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
- struct ext4_dir_entry_2 *de,
- int blocksize, int csum_size,
- unsigned int parent_ino, int dotdot_real_len);
+extern int ext4_init_dirblock(handle_t *handle, struct inode *inode,
+ struct buffer_head *dir_block,
+ unsigned int parent_ino, void *inline_buf,
+ int inline_size);
extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
unsigned int blocksize);
extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
@@ -3710,6 +3789,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
loff_t offset, ssize_t len);
+extern int ext4_convert_unwritten_extents_atomic(handle_t *handle,
+ struct inode *inode, loff_t offset, ssize_t len);
extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
ext4_io_end_t *io_end);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
@@ -3847,7 +3928,9 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh)
static inline bool ext4_inode_can_atomic_write(struct inode *inode)
{
- return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0;
+ return S_ISREG(inode->i_mode) &&
+ ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+ EXT4_SB(inode->i_sb)->s_awu_min > 0;
}
extern int ext4_block_write_begin(handle_t *handle, struct folio *folio,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 26435f3a3094..c484125d963f 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -31,13 +31,6 @@
#define CHECK_BINSEARCH__
/*
- * If EXT_STATS is defined then stats numbers are collected.
- * These number will be displayed at umount time.
- */
-#define EXT_STATS_
-
-
-/*
* ext4_inode has i_block array (60 bytes total).
* The first 12 bytes store ext4_extent_header;
* the remainder stores an array of ext4_extent.
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 135e278c832e..05e5946ed9b3 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -279,9 +279,16 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
bh, is_metadata, inode->i_mode,
test_opt(inode->i_sb, DATA_FLAGS));
- /* In the no journal case, we can just do a bforget and return */
+ /*
+ * In the no journal case, we should wait for any ongoing buffer
+ * I/O to complete and then do a forget.
+ */
if (!ext4_handle_valid(handle)) {
- bforget(bh);
+ if (bh) {
+ clear_buffer_dirty(bh);
+ wait_on_buffer(bh);
+ __bforget(bh);
+ }
return 0;
}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 3221714d9901..63d17c5201b5 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -319,10 +319,10 @@ static inline int ext4_journal_ensure_credits(handle_t *handle, int credits,
revoke_creds, 0);
}
-static inline int ext4_journal_blocks_per_page(struct inode *inode)
+static inline int ext4_journal_blocks_per_folio(struct inode *inode)
{
if (EXT4_JOURNAL(inode) != NULL)
- return jbd2_journal_blocks_per_page(inode);
+ return jbd2_journal_blocks_per_folio(inode);
return 0;
}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c616a16a9f36..2cf5759ba689 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -50,10 +50,9 @@ static __le32 ext4_extent_block_csum(struct inode *inode,
struct ext4_extent_header *eh)
{
struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)eh,
EXT4_EXTENT_TAIL_OFFSET(eh));
return cpu_to_le32(csum);
}
@@ -334,7 +333,7 @@ ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
int nofail)
{
int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
- int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO;
+ int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_SPLIT_NOMERGE;
if (nofail)
flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;
@@ -611,6 +610,8 @@ int ext4_ext_precache(struct inode *inode)
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return 0; /* not an extent-mapped inode */
+ ext4_check_map_extents_env(inode);
+
down_read(&ei->i_data_sem);
depth = ext_depth(inode);
@@ -1530,7 +1531,7 @@ static int ext4_ext_search_left(struct inode *inode,
static int ext4_ext_search_right(struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t *logical, ext4_fsblk_t *phys,
- struct ext4_extent *ret_ex)
+ struct ext4_extent *ret_ex, int flags)
{
struct buffer_head *bh = NULL;
struct ext4_extent_header *eh;
@@ -1604,7 +1605,8 @@ got_index:
ix++;
while (++depth < path->p_depth) {
/* subtract from p_depth to get proper eh_depth */
- bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth,
+ flags);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
@@ -1612,7 +1614,7 @@ got_index:
put_bh(bh);
}
- bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, flags);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
@@ -2000,7 +2002,7 @@ ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
}
/* try to insert block into found extent and return */
- if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
+ if (ex && !(gb_flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) {
/*
* Try to see whether we should rather test the extent on
@@ -2179,7 +2181,7 @@ has_space:
merge:
/* try to merge extents */
- if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
+ if (!(gb_flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE))
ext4_ext_try_to_merge(handle, inode, path, nearex);
/* time to correct all indexes above */
@@ -2211,7 +2213,7 @@ static int ext4_fill_es_cache_info(struct inode *inode,
while (block <= end) {
next = 0;
flags = 0;
- if (!ext4_es_lookup_extent(inode, block, &next, &es))
+ if (!ext4_es_lookup_extent(inode, block, &next, &es, NULL))
break;
if (ext4_es_is_unwritten(&es))
flags |= FIEMAP_EXTENT_UNWRITTEN;
@@ -2396,18 +2398,20 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
{
int index;
- int depth;
/* If we are converting the inline data, only one is needed here. */
if (ext4_has_inline_data(inode))
return 1;
- depth = ext_depth(inode);
-
+ /*
+ * Extent tree can change between the time we estimate credits and
+ * the time we actually modify the tree. Assume the worst case.
+ */
if (extents <= 1)
- index = depth * 2;
+ index = (EXT4_MAX_EXTENT_DEPTH * 2) + extents;
else
- index = depth * 3;
+ index = (EXT4_MAX_EXTENT_DEPTH * 3) +
+ DIV_ROUND_UP(extents, ext4_ext_space_block(inode, 0));
return index;
}
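A worked instance of the new worst-case estimate, assuming EXT4_MAX_EXTENT_DEPTH == 5 and ext4_ext_space_block() == 340 (4 KiB blocks):

	/*
	 * extents == 1:   index = 5 * 2 + 1                     = 11
	 * extents == 500: index = 5 * 3 + DIV_ROUND_UP(500, 340)
	 *                       = 15 + 2                        = 17
	 */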
@@ -2821,6 +2825,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
struct partial_cluster partial;
handle_t *handle;
int i = 0, err = 0;
+ int flags = EXT4_EX_NOCACHE | EXT4_EX_NOFAIL;
partial.pclu = 0;
partial.lblk = 0;
@@ -2851,8 +2856,7 @@ again:
ext4_fsblk_t pblk;
/* find extent for or closest extent to this block */
- path = ext4_find_extent(inode, end, NULL,
- EXT4_EX_NOCACHE | EXT4_EX_NOFAIL);
+ path = ext4_find_extent(inode, end, NULL, flags);
if (IS_ERR(path)) {
ext4_journal_stop(handle);
return PTR_ERR(path);
@@ -2918,7 +2922,7 @@ again:
*/
lblk = ex_end + 1;
err = ext4_ext_search_right(inode, path, &lblk, &pblk,
- NULL);
+ NULL, flags);
if (err < 0)
goto out;
if (pblk) {
@@ -2994,8 +2998,7 @@ again:
i + 1, ext4_idx_pblock(path[i].p_idx));
memset(path + i + 1, 0, sizeof(*path));
bh = read_extent_tree_block(inode, path[i].p_idx,
- depth - i - 1,
- EXT4_EX_NOCACHE);
+ depth - i - 1, flags);
if (IS_ERR(bh)) {
/* should we reset i_size? */
err = PTR_ERR(bh);
@@ -3221,7 +3224,7 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
else
ext4_ext_mark_initialized(ex);
- if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
+ if (!(flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE))
ext4_ext_try_to_merge(handle, inode, path, ex);
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
@@ -3365,7 +3368,7 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
if (map->m_lblk + map->m_len < ee_block + ee_len) {
split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
- flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
+ flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE;
if (unwritten)
split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
EXT4_EXT_MARK_UNWRIT2;
@@ -3718,10 +3721,6 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle,
>> inode->i_sb->s_blocksize_bits;
if (eof_block < map->m_lblk + map->m_len)
eof_block = map->m_lblk + map->m_len;
- /*
- * It is safe to convert extent to initialized via explicit
- * zeroout only if extent is fully inside i_size or new_size.
- */
depth = ext_depth(inode);
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
@@ -3732,11 +3731,15 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle,
split_flag |= EXT4_EXT_DATA_VALID1;
/* Convert to initialized */
} else if (flags & EXT4_GET_BLOCKS_CONVERT) {
+ /*
+ * It is safe to convert extent to initialized via explicit
+ * zeroout only if extent is fully inside i_size or new_size.
+ */
split_flag |= ee_block + ee_len <= eof_block ?
EXT4_EXT_MAY_ZEROOUT : 0;
split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
}
- flags |= EXT4_GET_BLOCKS_PRE_IO;
+ flags |= EXT4_GET_BLOCKS_SPLIT_NOMERGE;
return ext4_split_extent(handle, inode, path, map, split_flag, flags,
allocated);
}
@@ -3908,7 +3911,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
*allocated, newblock);
/* get_block() before submitting IO, split the extent */
- if (flags & EXT4_GET_BLOCKS_PRE_IO) {
+ if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE) {
path = ext4_split_convert_extents(handle, inode, map, path,
flags | EXT4_GET_BLOCKS_CONVERT, allocated);
if (IS_ERR(path))
@@ -4202,7 +4205,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
/* find extent for this block */
- path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
+ path = ext4_find_extent(inode, map->m_lblk, NULL, flags);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
@@ -4314,7 +4317,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
if (err)
goto out;
ar.lright = map->m_lblk;
- err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
+ err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright,
+ &ex2, flags);
if (err < 0)
goto out;
@@ -4433,6 +4437,20 @@ got_allocated_blocks:
allocated = map->m_len;
ext4_ext_show_leaf(inode, path);
out:
+ /*
+ * We never use EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF with the CREATE flag.
+ * So we know that the depth used here is correct, since there was no
+ * block allocation done if EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF is set.
+ * If tomorrow we start using this QUERY flag with CREATE, then we will
+ * need to re-calculate the depth as it might have changed due to block
+ * allocation.
+ */
+ if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) {
+ WARN_ON_ONCE(flags & EXT4_GET_BLOCKS_CREATE);
+ if (!err && ex && (ex == EXT_LAST_EXTENT(path[depth].p_hdr)))
+ map->m_flags |= EXT4_MAP_QUERY_LAST_IN_LEAF;
+ }
+
ext4_free_ext_path(path);
trace_ext4_ext_map_blocks_exit(inode, flags, map,
@@ -4483,6 +4501,8 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
struct ext4_map_blocks map;
unsigned int credits;
loff_t epos, old_size = i_size_read(inode);
+ unsigned int blkbits = inode->i_blkbits;
+ bool alloc_zero = false;
BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
map.m_lblk = offset;
@@ -4496,6 +4516,17 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
/*
+ * Doing the actual zero-out during a running journal transaction
+ * costs a lot. First allocate an unwritten extent and then
+ * convert it to written after zeroing it out.
+ */
+ if (flags & EXT4_GET_BLOCKS_ZERO) {
+ flags &= ~EXT4_GET_BLOCKS_ZERO;
+ flags |= EXT4_GET_BLOCKS_UNWRIT_EXT;
+ alloc_zero = true;
+ }
+
+ /*
* credits to insert 1 extent into extent tree
*/
credits = ext4_chunk_trans_blocks(inode, len);
@@ -4531,9 +4562,7 @@ retry:
* allow a full retry cycle for any remaining allocations
*/
retries = 0;
- map.m_lblk += ret;
- map.m_len = len = len - ret;
- epos = (loff_t)map.m_lblk << inode->i_blkbits;
+ epos = EXT4_LBLK_TO_B(inode, map.m_lblk + ret);
inode_set_ctime_current(inode);
if (new_size) {
if (epos > new_size)
@@ -4553,6 +4582,21 @@ retry:
ret2 = ret3 ? ret3 : ret2;
if (unlikely(ret2))
break;
+
+ if (alloc_zero &&
+ (map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) {
+ ret2 = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk,
+ map.m_len);
+ if (likely(!ret2))
+ ret2 = ext4_convert_unwritten_extents(NULL,
+ inode, (loff_t)map.m_lblk << blkbits,
+ (loff_t)map.m_len << blkbits);
+ if (ret2)
+ break;
+ }
+
+ map.m_lblk += ret;
+ map.m_len = len = len - ret;
}
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
@@ -4618,7 +4662,11 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (end_lblk > start_lblk) {
ext4_lblk_t zero_blks = end_lblk - start_lblk;
- flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE);
+ if (mode & FALLOC_FL_WRITE_ZEROES)
+ flags = EXT4_GET_BLOCKS_CREATE_ZERO | EXT4_EX_NOCACHE;
+ else
+ flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
+ EXT4_EX_NOCACHE);
ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks,
new_size, flags);
if (ret)
@@ -4727,11 +4775,18 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (IS_ENCRYPTED(inode) &&
(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
return -EOPNOTSUPP;
+ /*
+ * Don't allow writing zeroes if the underlying device does not
+ * support the unmap write zeroes operation.
+ */
+ if ((mode & FALLOC_FL_WRITE_ZEROES) &&
+ !bdev_write_zeroes_unmap_sectors(inode->i_sb->s_bdev))
+ return -EOPNOTSUPP;
/* Return error if mode is not supported */
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
- FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
- FALLOC_FL_INSERT_RANGE))
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_COLLAPSE_RANGE |
+ FALLOC_FL_INSERT_RANGE | FALLOC_FL_WRITE_ZEROES))
return -EOPNOTSUPP;
inode_lock(inode);
@@ -4762,16 +4817,23 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (ret)
goto out_invalidate_lock;
- if (mode & FALLOC_FL_PUNCH_HOLE)
+ switch (mode & FALLOC_FL_MODE_MASK) {
+ case FALLOC_FL_PUNCH_HOLE:
ret = ext4_punch_hole(file, offset, len);
- else if (mode & FALLOC_FL_COLLAPSE_RANGE)
+ break;
+ case FALLOC_FL_COLLAPSE_RANGE:
ret = ext4_collapse_range(file, offset, len);
- else if (mode & FALLOC_FL_INSERT_RANGE)
+ break;
+ case FALLOC_FL_INSERT_RANGE:
ret = ext4_insert_range(file, offset, len);
- else if (mode & FALLOC_FL_ZERO_RANGE)
+ break;
+ case FALLOC_FL_ZERO_RANGE:
+ case FALLOC_FL_WRITE_ZEROES:
ret = ext4_zero_range(file, offset, len, mode);
- else
+ break;
+ default:
ret = -EOPNOTSUPP;
+ }
out_invalidate_lock:
filemap_invalidate_unlock(mapping);
@@ -4781,6 +4843,93 @@ out_inode_lock:
}
/*
+ * This function converts a range of blocks to written extents. The caller of
+ * this function will pass the start offset and the size. All unwritten extents
+ * within this range will be converted to written extents.
+ *
+ * This function is called from the direct IO end_io callback function for
+ * atomic writes, to convert the unwritten extents after IO is completed.
+ *
+ * Note that the requirement for atomic writes is that all conversion should
+ * happen atomically in a single fs journal transaction. We only allocate
+ * unwritten extents, either over a hole or over a pre-existing unwritten
+ * extent range, in ext4_map_blocks_atomic_write(). The only case where we can
+ * have multiple unwritten extents in a range [offset, offset+len) is when
+ * there is a split unwritten extent between two leaf nodes which was cached
+ * in the extent status cache at ext4_iomap_alloc() time. That allows
+ * ext4_map_blocks_atomic_write() to return the unwritten extent range w/o
+ * going into the slow path. It also means we might need a loop to convert
+ * this unwritten extent split across leaf blocks within a single journal
+ * transaction. An extent split across leaf nodes is a rare case, but let's
+ * still handle it to meet the requirements of multi-fsblock atomic writes.
+ *
+ * Returns 0 on success.
+ */
+int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode,
+ loff_t offset, ssize_t len)
+{
+ unsigned int max_blocks;
+ int ret = 0, ret2 = 0, ret3 = 0;
+ struct ext4_map_blocks map;
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned int credits = 0;
+ int flags = EXT4_GET_BLOCKS_IO_CONVERT_EXT | EXT4_EX_NOCACHE;
+
+ map.m_lblk = offset >> blkbits;
+ max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
+
+ if (!handle) {
+ /*
+ * TODO: An optimization can be added later by having an extent
+ * status flag, e.g. EXTENT_STATUS_SPLIT_LEAF. When queried, it
+ * could tell whether the extent in the cache is a split extent.
+ * But for now, always assume pextents is 2.
+ */
+ credits = ext4_meta_trans_blocks(inode, max_blocks, 2);
+ }
+
+ if (credits) {
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ return ret;
+ }
+ }
+
+ while (ret >= 0 && ret < max_blocks) {
+ map.m_lblk += ret;
+ map.m_len = (max_blocks -= ret);
+ ret = ext4_map_blocks(handle, inode, &map, flags);
+ if (ret != max_blocks)
+ ext4_msg(inode->i_sb, KERN_INFO,
+ "inode #%lu: block %u: len %u: "
+ "split block mapping found for atomic write, "
+ "ret = %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret);
+ if (ret <= 0)
+ break;
+ }
+
+ ret2 = ext4_mark_inode_dirty(handle, inode);
+
+ if (credits) {
+ ret3 = ext4_journal_stop(handle);
+ if (unlikely(ret3))
+ ret2 = ret3;
+ }
+
+ if (ret <= 0 || ret2)
+ ext4_warning(inode->i_sb,
+ "inode #%lu: block %u: len %u: "
+ "returned %d or %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret, ret2);
+
+ return ret > 0 ? ret2 : ret;
+}
+
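A hedged sketch of a caller shape for the helper above; the real end_io wiring for atomic writes lives elsewhere in the series, and this function name is hypothetical:

	/* Hypothetical completion step for an atomic write: passing a NULL
	 * handle lets the helper start its own transaction with worst-case
	 * credits and convert the whole range in one go. */
	static int atomic_write_end_io(struct inode *inode, loff_t pos,
				       ssize_t size)
	{
		return ext4_convert_unwritten_extents_atomic(NULL, inode,
							     pos, size);
	}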
+/*
 * This function converts a range of blocks to written extents.
 * The caller of this function will pass the start offset and the size.
 * All unwritten extents within this range will be converted to
@@ -4819,8 +4968,14 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
break;
}
}
+ /*
+ * Do not cache any unrelated extents, as the caller does not
+ * hold the i_rwsem or invalidate_lock, and caching them could
+ * corrupt the extent status tree.
+ */
ret = ext4_map_blocks(handle, inode, &map,
- EXT4_GET_BLOCKS_IO_CONVERT_EXT);
+ EXT4_GET_BLOCKS_IO_CONVERT_EXT |
+ EXT4_EX_NOCACHE);
if (ret <= 0)
ext4_warning(inode->i_sb,
"inode #%lu: block %u: len %u: "
@@ -4931,12 +5086,7 @@ static const struct iomap_ops ext4_iomap_xattr_ops = {
static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len)
{
- u64 maxbytes;
-
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- maxbytes = inode->i_sb->s_maxbytes;
- else
- maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+ u64 maxbytes = ext4_get_maxbytes(inode);
if (*len == 0)
return -EINVAL;
@@ -4956,10 +5106,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
{
int error = 0;
+ inode_lock_shared(inode);
if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
error = ext4_ext_precache(inode);
if (error)
- return error;
+ goto unlock;
fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
}
@@ -4970,15 +5121,19 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
*/
error = ext4_fiemap_check_ranges(inode, start, &len);
if (error)
- return error;
+ goto unlock;
if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
- return iomap_fiemap(inode, fieinfo, start, len,
- &ext4_iomap_xattr_ops);
+ error = iomap_fiemap(inode, fieinfo, start, len,
+ &ext4_iomap_xattr_ops);
+ } else {
+ error = iomap_fiemap(inode, fieinfo, start, len,
+ &ext4_iomap_report_ops);
}
-
- return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops);
+unlock:
+ inode_unlock_shared(inode);
+ return error;
}
int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -4999,7 +5154,9 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
+ inode_lock_shared(inode);
error = ext4_ext_precache(inode);
+ inode_unlock_shared(inode);
if (error)
return error;
fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
@@ -5058,7 +5215,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
credits = depth + 2;
}
- restart_credits = ext4_writepage_trans_blocks(inode);
+ restart_credits = ext4_chunk_trans_extent(inode, 0);
err = ext4_datasem_ensure_credits(handle, inode, credits,
restart_credits, 0);
if (err) {
@@ -5318,7 +5475,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
truncate_pagecache(inode, start);
- credits = ext4_writepage_trans_blocks(inode);
+ credits = ext4_chunk_trans_extent(inode, 0);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -5328,6 +5485,8 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
start_lblk = offset >> inode->i_blkbits;
end_lblk = (offset + len) >> inode->i_blkbits;
+ ext4_check_map_extents_env(inode);
+
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk);
@@ -5412,7 +5571,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
truncate_pagecache(inode, start);
- credits = ext4_writepage_trans_blocks(inode);
+ credits = ext4_chunk_trans_extent(inode, 0);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -5429,6 +5588,8 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
start_lblk = offset >> inode->i_blkbits;
len_lblk = len >> inode->i_blkbits;
+ ext4_check_map_extents_env(inode);
+
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
@@ -5457,7 +5618,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
path = ext4_split_extent_at(handle, inode, path,
start_lblk, split_flag,
EXT4_EX_NOCACHE |
- EXT4_GET_BLOCKS_PRE_IO |
+ EXT4_GET_BLOCKS_SPLIT_NOMERGE |
EXT4_GET_BLOCKS_METADATA_NOFAIL);
}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index d1401d4a5513..e04fbf10fe4f 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -120,9 +120,40 @@
* memory. Hence, we will reclaim written/unwritten/hole extents from
* the tree under a heavy memory pressure.
*
+ * ==========================================================================
+ * 3. Assurance of Ext4 extent status tree consistency
+ *
+ * When mapping blocks, Ext4 queries the extent status tree first and should
+ * always trust that the extent status tree is consistent and up to date.
+ * Therefore, it is important to adhere to the following rules when creating,
+ * modifying and removing extents.
+ *
+ * 1. Except during fast commit replay, when Ext4 creates or queries block mappings,
+ * the extent information should always be processed through the extent
+ * status tree instead of being organized manually through the on-disk
+ * extent tree.
+ *
+ * 2. When updating the extent tree, Ext4 should acquire the i_data_sem
+ * exclusively and update the extent status tree atomically. If the extents
+ * to be modified exceed the range that can be processed under a single
+ * i_data_sem hold (as ext4_datasem_ensure_credits() may drop
+ * i_data_sem to restart a transaction), it must (e.g. as ext4_punch_hole()
+ * does):
+ *
+ * a) Hold the i_rwsem and invalidate_lock exclusively. This ensures
+ * exclusion against page faults, as well as reads and writes that may
+ * concurrently modify the extent status tree.
+ * b) Evict all page cache in the affected range and recommend rebuilding
+ * or dropping the extent status tree after modifying the on-disk
+ * extent tree. This ensures exclusion against concurrent writebacks
+ * that do not hold those locks but only holds a folio lock.
+ *
+ * 3. Based on the rules above, when querying block mappings, Ext4 should
+ * hold at least the i_rwsem, the invalidate_lock, or the folio lock(s)
+ * covering the specified query range.
*
* ==========================================================================
- * 3. Performance analysis
+ * 4. Performance analysis
*
* -- overhead
* 1. There is a cache extent for write access, so if writes are
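As a rough illustration of rule 3 above (an assumed locking shape, not code from the patch), a reader querying mappings under the shared invalidate_lock:

	/* Sketch: hold invalidate_lock shared across the query so a
	 * concurrent truncate cannot mutate the extent status tree. */
	filemap_invalidate_lock_shared(inode->i_mapping);
	ret = ext4_map_blocks(NULL, inode, &map, 0);
	filemap_invalidate_unlock_shared(inode->i_mapping);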
@@ -134,7 +165,7 @@
*
*
* ==========================================================================
- * 4. TODO list
+ * 5. TODO list
*
* -- Refactor delayed space reservation
*
@@ -204,6 +235,13 @@ static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
return es->es_lblk + es->es_len - 1;
}
+static inline void ext4_es_inc_seq(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ WRITE_ONCE(ei->i_es_seq, ei->i_es_seq + 1);
+}
+
/*
 * search through the tree for a delayed extent with a given offset. If
* it can't be found, try to find next extent.
@@ -875,7 +913,6 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
newes.es_lblk = lblk;
newes.es_len = len;
ext4_es_store_pblock_status(&newes, pblk, status);
- trace_ext4_es_insert_extent(inode, &newes);
ext4_es_insert_extent_check(inode, &newes);
@@ -924,6 +961,11 @@ retry:
}
pending = err3;
}
+ /*
+ * TODO: For caching on-disk extents, there is no need to increment
+ * the sequence counter; this is left as a future optimization.
+ */
+ ext4_es_inc_seq(inode);
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
/*
@@ -950,6 +992,7 @@ error:
if (err1 || err2 || err3 < 0)
goto retry;
+ trace_ext4_es_insert_extent(inode, &newes);
ext4_es_print_tree(inode);
return;
}
@@ -996,8 +1039,8 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
* Return: 1 on found, 0 on not
*/
int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t *next_lblk,
- struct extent_status *es)
+ ext4_lblk_t *next_lblk, struct extent_status *es,
+ u64 *pseq)
{
struct ext4_es_tree *tree;
struct ext4_es_stats *stats;
@@ -1056,6 +1099,8 @@ out:
} else
*next_lblk = 0;
}
+ if (pseq)
+ *pseq = EXT4_I(inode)->i_es_seq;
} else {
percpu_counter_inc(&stats->es_stats_cache_misses);
}
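The sequence number returned here enables an optimistic check-then-revalidate pattern; a hedged sketch of a hypothetical caller:

	/* Illustrative only: trust the cached extent only if the tree has
	 * not changed since the lookup. */
	u64 seq;
	struct extent_status es;

	if (ext4_es_lookup_extent(inode, lblk, NULL, &es, &seq)) {
		/* ... lockless work based on the cached extent ... */
		if (READ_ONCE(EXT4_I(inode)->i_es_seq) != seq) {
			/* tree changed underneath us: redo the lookup */
		}
	}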
@@ -1519,7 +1564,6 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
- trace_ext4_es_remove_extent(inode, lblk, len);
es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
lblk, len, inode->i_ino);
@@ -1539,16 +1583,21 @@ retry:
*/
write_lock(&EXT4_I(inode)->i_es_lock);
err = __es_remove_extent(inode, lblk, end, &reserved, es);
+ if (err)
+ goto error;
/* Free preallocated extent if it didn't get used. */
if (es) {
if (!es->es_len)
__es_free_extent(es);
es = NULL;
}
+ ext4_es_inc_seq(inode);
+error:
write_unlock(&EXT4_I(inode)->i_es_lock);
if (err)
goto retry;
+ trace_ext4_es_remove_extent(inode, lblk, len);
ext4_es_print_tree(inode);
ext4_da_release_space(inode, reserved);
}
@@ -2109,8 +2158,6 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
newes.es_lblk = lblk;
newes.es_len = len;
ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
- trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
- end_allocated);
ext4_es_insert_extent_check(inode, &newes);
@@ -2165,11 +2212,14 @@ retry:
pr2 = NULL;
}
}
+ ext4_es_inc_seq(inode);
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
if (err1 || err2 || err3 < 0)
goto retry;
+ trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
+ end_allocated);
ext4_es_print_tree(inode);
ext4_print_pending_tree(inode);
return;
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 8f9c008d11e8..f3396cf32b44 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -148,7 +148,7 @@ extern void ext4_es_find_extent_range(struct inode *inode,
struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t *next_lblk,
- struct extent_status *es);
+ struct extent_status *es, u64 *pseq);
extern bool ext4_es_scan_range(struct inode *inode,
int (*matching_fn)(struct extent_status *es),
ext4_lblk_t lblk, ext4_lblk_t end);
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index da4263a14a20..fa66b08de999 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -12,6 +12,7 @@
#include "ext4_extents.h"
#include "mballoc.h"
+#include <linux/lockdep.h>
/*
* Ext4 Fast Commits
* -----------------
@@ -49,19 +50,27 @@
* that need to be committed during a fast commit in another in memory queue of
* inodes. During the commit operation, we commit in the following order:
*
- * [1] Lock inodes for any further data updates by setting COMMITTING state
- * [2] Submit data buffers of all the inodes
- * [3] Wait for [2] to complete
- * [4] Commit all the directory entry updates in the fast commit space
- * [5] Commit all the changed inode structures
- * [6] Write tail tag (this tag ensures the atomicity, please read the following
+ * [1] Prepare all the inodes to write out their data by setting
+ * "EXT4_STATE_FC_FLUSHING_DATA". This ensures that inode cannot be
+ * deleted while it is being flushed.
+ * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA"
+ * state.
+ * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that
+ * all the existing handles finish and no new handles can start.
+ * [4] Mark all the fast commit eligible inodes as undergoing fast commit
+ * by setting "EXT4_STATE_FC_COMMITTING" state.
+ * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows
+ * starting of new handles. If new handles try to start an update on
+ * any of the inodes that are being committed, ext4_fc_track_inode()
+ * will block until those inodes have finished the fast commit.
+ * [6] Commit all the directory entry updates in the fast commit space.
+ * [7] Commit all the changed inodes in the fast commit space and clear
+ * "EXT4_STATE_FC_COMMITTING" for these inodes.
+ * [8] Write tail tag (this tag ensures the atomicity, please read the following
* section for more details).
- * [7] Wait for [4], [5] and [6] to complete.
*
- * All the inode updates must call ext4_fc_start_update() before starting an
- * update. If such an ongoing update is present, fast commit waits for it to
- * complete. The completion of such an update is marked by
- * ext4_fc_stop_update().
+ * All the inode updates must be enclosed within jbd2_journal_start()
+ * and jbd2_journal_stop() similar to JBD2 journaling.
*
* Fast Commit Ineligibility
* -------------------------
@@ -142,6 +151,13 @@
* similarly. Thus, by converting a non-idempotent procedure into a series of
* idempotent outcomes, fast commits ensured idempotence during the replay.
*
+ * Locking
+ * -------
+ * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit
+ * dentry queue. ei->i_fc_lock protects the fast commit related info in a given
+ * inode. Most of the code avoids acquiring both the locks, but if one must do
+ * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock.
+ *
* TODOs
* -----
*
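A minimal sketch of the documented ordering when both locks are needed (illustrative only; note that s_fc_lock becomes a mutex and i_fc_lock a spinlock in this series):

	/* Per the rule above: sbi->s_fc_lock before ei->i_fc_lock. */
	mutex_lock(&sbi->s_fc_lock);
	spin_lock(&ei->i_fc_lock);
	/* ... update per-inode fast commit state on the fc queue ... */
	spin_unlock(&ei->i_fc_lock);
	mutex_unlock(&sbi->s_fc_lock);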
@@ -156,13 +172,12 @@
* fast commit recovery even if that area is invalidated by later full
* commits.
*
- * 1) Fast commit's commit path locks the entire file system during fast
- * commit. This has significant performance penalty. Instead of that, we
- * should use ext4_fc_start/stop_update functions to start inode level
- * updates from ext4_journal_start/stop. Once we do that we can drop file
- * system locking during commit path.
+ * 1) Handle more ineligible cases.
*
- * 2) Handle more ineligible cases.
+ * 2) Change ext4_fc_commit() to look up logical-to-physical mappings using
+ * the extent status tree. This would get rid of the need to call
+ * ext4_fc_track_inode() before acquiring i_data_sem. To do that we would
+ * need to ensure that modified extents from the extent status tree are
+ * not evicted from memory.
*/
#include <trace/events/ext4.h>
@@ -201,32 +216,6 @@ void ext4_fc_init_inode(struct inode *inode)
INIT_LIST_HEAD(&ei->i_fc_list);
INIT_LIST_HEAD(&ei->i_fc_dilist);
init_waitqueue_head(&ei->i_fc_wait);
- atomic_set(&ei->i_fc_updates, 0);
-}
-
-/* This function must be called with sbi->s_fc_lock held. */
-static void ext4_fc_wait_committing_inode(struct inode *inode)
-__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
-{
- wait_queue_head_t *wq;
- struct ext4_inode_info *ei = EXT4_I(inode);
-
-#if (BITS_PER_LONG < 64)
- DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
- EXT4_STATE_FC_COMMITTING);
- wq = bit_waitqueue(&ei->i_state_flags,
- EXT4_STATE_FC_COMMITTING);