Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/v9fs.c | 2
-rw-r--r--  fs/9p/vfs_file.c | 2
-rw-r--r--  fs/9p/vfs_inode.c | 3
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 3
-rw-r--r--  fs/Kconfig | 5
-rw-r--r--  fs/Makefile | 3
-rw-r--r--  fs/adfs/super.c | 2
-rw-r--r--  fs/affs/affs.h | 2
-rw-r--r--  fs/affs/amigaffs.c | 2
-rw-r--r--  fs/affs/inode.c | 2
-rw-r--r--  fs/affs/super.c | 8
-rw-r--r--  fs/affs/symlink.c | 4
-rw-r--r--  fs/aio.c | 27
-rw-r--r--  fs/autofs4/autofs_i.h | 5
-rw-r--r--  fs/befs/befs.h | 2
-rw-r--r--  fs/binfmt_elf.c | 4
-rw-r--r--  fs/block_dev.c | 33
-rw-r--r--  fs/btrfs/async-thread.c | 1
-rw-r--r--  fs/btrfs/async-thread.h | 2
-rw-r--r--  fs/btrfs/backref.c | 94
-rw-r--r--  fs/btrfs/btrfs_inode.h | 2
-rw-r--r--  fs/btrfs/check-integrity.c | 10
-rw-r--r--  fs/btrfs/compression.c | 29
-rw-r--r--  fs/btrfs/ctree.c | 20
-rw-r--r--  fs/btrfs/ctree.h | 47
-rw-r--r--  fs/btrfs/delayed-ref.c | 372
-rw-r--r--  fs/btrfs/delayed-ref.h | 29
-rw-r--r--  fs/btrfs/dev-replace.c | 7
-rw-r--r--  fs/btrfs/disk-io.c | 151
-rw-r--r--  fs/btrfs/extent-tree.c | 630
-rw-r--r--  fs/btrfs/extent-tree.h | 0
-rw-r--r--  fs/btrfs/extent_io.c | 73
-rw-r--r--  fs/btrfs/file.c | 11
-rw-r--r--  fs/btrfs/free-space-cache.c | 71
-rw-r--r--  fs/btrfs/inode-map.c | 17
-rw-r--r--  fs/btrfs/inode.c | 198
-rw-r--r--  fs/btrfs/ioctl.c | 349
-rw-r--r--  fs/btrfs/locking.c | 1
-rw-r--r--  fs/btrfs/ordered-data.c | 42
-rw-r--r--  fs/btrfs/ordered-data.h | 6
-rw-r--r--  fs/btrfs/qgroup.c | 1106
-rw-r--r--  fs/btrfs/qgroup.h | 61
-rw-r--r--  fs/btrfs/raid56.c | 149
-rw-r--r--  fs/btrfs/raid56.h | 10
-rw-r--r--  fs/btrfs/reada.c | 4
-rw-r--r--  fs/btrfs/relocation.c | 54
-rw-r--r--  fs/btrfs/scrub.c | 443
-rw-r--r--  fs/btrfs/send.c | 147
-rw-r--r--  fs/btrfs/super.c | 414
-rw-r--r--  fs/btrfs/sysfs.c | 148
-rw-r--r--  fs/btrfs/sysfs.h | 8
-rw-r--r--  fs/btrfs/tests/qgroup-tests.c | 109
-rw-r--r--  fs/btrfs/transaction.c | 101
-rw-r--r--  fs/btrfs/transaction.h | 26
-rw-r--r--  fs/btrfs/tree-defrag.c | 3
-rw-r--r--  fs/btrfs/tree-log.c | 576
-rw-r--r--  fs/btrfs/ulist.c | 47
-rw-r--r--  fs/btrfs/ulist.h | 1
-rw-r--r--  fs/btrfs/volumes.c | 408
-rw-r--r--  fs/btrfs/volumes.h | 12
-rw-r--r--  fs/buffer.c | 13
-rw-r--r--  fs/cachefiles/internal.h | 1
-rw-r--r--  fs/cachefiles/namei.c | 33
-rw-r--r--  fs/ceph/acl.c | 4
-rw-r--r--  fs/ceph/addr.c | 310
-rw-r--r--  fs/ceph/caps.c | 824
-rw-r--r--  fs/ceph/dir.c | 383
-rw-r--r--  fs/ceph/file.c | 63
-rw-r--r--  fs/ceph/inode.c | 155
-rw-r--r--  fs/ceph/locks.c | 2
-rw-r--r--  fs/ceph/mds_client.c | 425
-rw-r--r--  fs/ceph/mds_client.h | 23
-rw-r--r--  fs/ceph/snap.c | 173
-rw-r--r--  fs/ceph/super.c | 27
-rw-r--r--  fs/ceph/super.h | 124
-rw-r--r--  fs/ceph/xattr.c | 65
-rw-r--r--  fs/char_dev.c | 2
-rw-r--r--  fs/cifs/Kconfig | 9
-rw-r--r--  fs/cifs/cifs_ioctl.h | 42
-rw-r--r--  fs/cifs/cifsfs.c | 6
-rw-r--r--  fs/cifs/cifsfs.h | 2
-rw-r--r--  fs/cifs/cifsglob.h | 13
-rw-r--r--  fs/cifs/cifspdu.h | 26
-rw-r--r--  fs/cifs/cifssmb.c | 12
-rw-r--r--  fs/cifs/connect.c | 13
-rw-r--r--  fs/cifs/file.c | 2
-rw-r--r--  fs/cifs/ioctl.c | 73
-rw-r--r--  fs/cifs/smb2ops.c | 180
-rw-r--r--  fs/cifs/smb2pdu.c | 74
-rw-r--r--  fs/cifs/smb2pdu.h | 81
-rw-r--r--  fs/cifs/smbfsctl.h | 3
-rw-r--r--  fs/cifs/transport.c | 2
-rw-r--r--  fs/coda/coda_linux.h | 2
-rw-r--r--  fs/coda/upcall.c | 6
-rw-r--r--  fs/compat_ioctl.c | 1
-rw-r--r--  fs/configfs/inode.c | 2
-rw-r--r--  fs/configfs/item.c | 4
-rw-r--r--  fs/configfs/mount.c | 10
-rw-r--r--  fs/coredump.c | 48
-rw-r--r--  fs/dax.c | 311
-rw-r--r--  fs/dcache.c | 50
-rw-r--r--  fs/debugfs/file.c | 14
-rw-r--r--  fs/debugfs/inode.c | 22
-rw-r--r--  fs/devpts/inode.c | 31
-rw-r--r--  fs/direct-io.c | 18
-rw-r--r--  fs/dlm/lowcomms.c | 743
-rw-r--r--  fs/dlm/plock.c | 3
-rw-r--r--  fs/dlm/user.c | 16
-rw-r--r--  fs/drop_caches.c | 10
-rw-r--r--  fs/ecryptfs/crypto.c | 3
-rw-r--r--  fs/ecryptfs/dentry.c | 16
-rw-r--r--  fs/ecryptfs/file.c | 1
-rw-r--r--  fs/ecryptfs/mmap.c | 2
-rw-r--r--  fs/exec.c | 10
-rw-r--r--  fs/exofs/dir.c | 6
-rw-r--r--  fs/ext2/dir.c | 5
-rw-r--r--  fs/ext2/file.c | 14
-rw-r--r--  fs/ext2/ialloc.c | 5
-rw-r--r--  fs/ext2/inode.c | 8
-rw-r--r--  fs/ext2/namei.c | 46
-rw-r--r--  fs/ext3/Kconfig | 89
-rw-r--r--  fs/ext3/Makefile | 12
-rw-r--r--  fs/ext3/acl.c | 281
-rw-r--r--  fs/ext3/acl.h | 72
-rw-r--r--  fs/ext3/balloc.c | 2158
-rw-r--r--  fs/ext3/bitmap.c | 20
-rw-r--r--  fs/ext3/dir.c | 537
-rw-r--r--  fs/ext3/ext3.h | 1332
-rw-r--r--  fs/ext3/ext3_jbd.c | 59
-rw-r--r--  fs/ext3/file.c | 79
-rw-r--r--  fs/ext3/fsync.c | 109
-rw-r--r--  fs/ext3/hash.c | 206
-rw-r--r--  fs/ext3/ialloc.c | 706
-rw-r--r--  fs/ext3/inode.c | 3574
-rw-r--r--  fs/ext3/ioctl.c | 327
-rw-r--r--  fs/ext3/namei.c | 2586
-rw-r--r--  fs/ext3/namei.h | 27
-rw-r--r--  fs/ext3/resize.c | 1117
-rw-r--r--  fs/ext3/super.c | 3165
-rw-r--r--  fs/ext3/symlink.c | 46
-rw-r--r--  fs/ext3/xattr.c | 1330
-rw-r--r--  fs/ext3/xattr.h | 136
-rw-r--r--  fs/ext3/xattr_security.c | 78
-rw-r--r--  fs/ext3/xattr_trusted.c | 54
-rw-r--r--  fs/ext3/xattr_user.c | 58
-rw-r--r--  fs/ext4/Kconfig | 54
-rw-r--r--  fs/ext4/crypto_fname.c | 5
-rw-r--r--  fs/ext4/crypto_key.c | 4
-rw-r--r--  fs/ext4/crypto_policy.c | 17
-rw-r--r--  fs/ext4/ext4.h | 4
-rw-r--r--  fs/ext4/extents.c | 6
-rw-r--r--  fs/ext4/file.c | 78
-rw-r--r--  fs/ext4/ialloc.c | 6
-rw-r--r--  fs/ext4/indirect.c | 1
-rw-r--r--  fs/ext4/inode.c | 70
-rw-r--r--  fs/ext4/ioctl.c | 1
-rw-r--r--  fs/ext4/mballoc.c | 16
-rw-r--r--  fs/ext4/migrate.c | 17
-rw-r--r--  fs/ext4/mmp.c | 48
-rw-r--r--  fs/ext4/namei.c | 63
-rw-r--r--  fs/ext4/page-io.c | 26
-rw-r--r--  fs/ext4/readpage.c | 8
-rw-r--r--  fs/ext4/super.c | 70
-rw-r--r--  fs/f2fs/Kconfig | 2
-rw-r--r--  fs/f2fs/Makefile | 1
-rw-r--r--  fs/f2fs/checkpoint.c | 93
-rw-r--r--  fs/f2fs/crypto_key.c | 3
-rw-r--r--  fs/f2fs/data.c | 967
-rw-r--r--  fs/f2fs/debug.c | 30
-rw-r--r--  fs/f2fs/dir.c | 4
-rw-r--r--  fs/f2fs/extent_cache.c | 791
-rw-r--r--  fs/f2fs/f2fs.h | 134
-rw-r--r--  fs/f2fs/file.c | 190
-rw-r--r--  fs/f2fs/gc.c | 111
-rw-r--r--  fs/f2fs/gc.h | 6
-rw-r--r--  fs/f2fs/inline.c | 25
-rw-r--r--  fs/f2fs/inode.c | 97
-rw-r--r--  fs/f2fs/namei.c | 21
-rw-r--r--  fs/f2fs/node.c | 86
-rw-r--r--  fs/f2fs/recovery.c | 43
-rw-r--r--  fs/f2fs/segment.c | 79
-rw-r--r--  fs/f2fs/segment.h | 55
-rw-r--r--  fs/f2fs/shrinker.c | 139
-rw-r--r--  fs/f2fs/super.c | 65
-rw-r--r--  fs/f2fs/xattr.c | 5
-rw-r--r--  fs/file.c | 77
-rw-r--r--  fs/file_table.c | 25
-rw-r--r--  fs/freevxfs/vxfs_lookup.c | 9
-rw-r--r--  fs/fs-writeback.c | 234
-rw-r--r--  fs/fscache/cookie.c | 8
-rw-r--r--  fs/fscache/internal.h | 12
-rw-r--r--  fs/fscache/object.c | 69
-rw-r--r--  fs/fscache/operation.c | 254
-rw-r--r--  fs/fscache/page.c | 86
-rw-r--r--  fs/fscache/stats.c | 14
-rw-r--r--  fs/fuse/cuse.c | 15
-rw-r--r--  fs/fuse/dev.c | 833
-rw-r--r--  fs/fuse/file.c | 22
-rw-r--r--  fs/fuse/fuse_i.h | 167
-rw-r--r--  fs/fuse/inode.c | 95
-rw-r--r--  fs/gfs2/lops.c | 19
-rw-r--r--  fs/gfs2/ops_fstype.c | 6
-rw-r--r--  fs/gfs2/super.c | 6
-rw-r--r--  fs/hfs/bnode.c | 9
-rw-r--r--  fs/hfs/brec.c | 20
-rw-r--r--  fs/hfs/hfs_fs.h | 2
-rw-r--r--  fs/hfs/super.c | 4
-rw-r--r--  fs/hfsplus/bnode.c | 3
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 2
-rw-r--r--  fs/hfsplus/options.c | 4
-rw-r--r--  fs/hostfs/hostfs_kern.c | 2
-rw-r--r--  fs/hpfs/alloc.c | 95
-rw-r--r--  fs/hpfs/buffer.c | 39
-rw-r--r--  fs/hpfs/dir.c | 1
-rw-r--r--  fs/hpfs/file.c | 10
-rw-r--r--  fs/hpfs/hpfs_fn.h | 13
-rw-r--r--  fs/hpfs/map.c | 26
-rw-r--r--  fs/hpfs/namei.c | 25
-rw-r--r--  fs/hpfs/super.c | 62
-rw-r--r--  fs/hugetlbfs/inode.c | 304
-rw-r--r--  fs/inode.c | 116
-rw-r--r--  fs/internal.h | 4
-rw-r--r--  fs/jbd/Kconfig | 30
-rw-r--r--  fs/jbd/Makefile | 7
-rw-r--r--  fs/jbd/checkpoint.c | 782
-rw-r--r--  fs/jbd/commit.c | 1021
-rw-r--r--  fs/jbd/journal.c | 2145
-rw-r--r--  fs/jbd/recovery.c | 594
-rw-r--r--  fs/jbd/revoke.c | 733
-rw-r--r--  fs/jbd/transaction.c | 2237
-rw-r--r--  fs/jbd2/checkpoint.c | 39
-rw-r--r--  fs/jbd2/commit.c | 2
-rw-r--r--  fs/jbd2/journal.c | 13
-rw-r--r--  fs/jbd2/transaction.c | 74
-rw-r--r--  fs/jffs2/os-linux.h | 2
-rw-r--r--  fs/jfs/file.c | 9
-rw-r--r--  fs/jfs/inode.c | 4
-rw-r--r--  fs/jfs/ioctl.c | 3
-rw-r--r--  fs/jfs/jfs_incore.h | 2
-rw-r--r--  fs/jfs/jfs_inode.c | 4
-rw-r--r--  fs/jfs/jfs_logmgr.c | 22
-rw-r--r--  fs/jfs/jfs_metapage.c | 8
-rw-r--r--  fs/jfs/namei.c | 81
-rw-r--r--  fs/kernfs/dir.c | 61
-rw-r--r--  fs/kernfs/inode.c | 2
-rw-r--r--  fs/libfs.c | 100
-rw-r--r--  fs/lockd/svc.c | 8
-rw-r--r--  fs/locks.c | 39
-rw-r--r--  fs/logfs/dev_bdev.c | 16
-rw-r--r--  fs/minix/dir.c | 5
-rw-r--r--  fs/minix/minix.h | 2
-rw-r--r--  fs/mount.h | 3
-rw-r--r--  fs/mpage.c | 8
-rw-r--r--  fs/namei.c | 44
-rw-r--r--  fs/namespace.c | 104
-rw-r--r--  fs/ncpfs/dir.c | 2
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 14
-rw-r--r--  fs/nfs/blocklayout/blocklayout.h | 19
-rw-r--r--  fs/nfs/blocklayout/dev.c | 9
-rw-r--r--  fs/nfs/blocklayout/extent_tree.c | 19
-rw-r--r--  fs/nfs/callback.c | 16
-rw-r--r--  fs/nfs/callback_proc.c | 47
-rw-r--r--  fs/nfs/callback_xdr.c | 2
-rw-r--r--  fs/nfs/client.c | 155
-rw-r--r--  fs/nfs/delegation.c | 29
-rw-r--r--  fs/nfs/delegation.h | 3
-rw-r--r--  fs/nfs/dir.c | 25
-rw-r--r--  fs/nfs/file.c | 36
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c | 766
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.h | 36
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayoutdev.c | 89
-rw-r--r--  fs/nfs/inode.c | 88
-rw-r--r--  fs/nfs/internal.h | 41
-rw-r--r--  fs/nfs/nfs3xdr.c | 3
-rw-r--r--  fs/nfs/nfs42.h | 11
-rw-r--r--  fs/nfs/nfs42proc.c | 106
-rw-r--r--  fs/nfs/nfs42xdr.c | 105
-rw-r--r--  fs/nfs/nfs4_fs.h | 5
-rw-r--r--  fs/nfs/nfs4client.c | 6
-rw-r--r--  fs/nfs/nfs4file.c | 36
-rw-r--r--  fs/nfs/nfs4getroot.c | 7
-rw-r--r--  fs/nfs/nfs4idmap.c | 19
-rw-r--r--  fs/nfs/nfs4proc.c | 369
-rw-r--r--  fs/nfs/nfs4state.c | 45
-rw-r--r--  fs/nfs/nfs4trace.h | 61
-rw-r--r--  fs/nfs/nfs4xdr.c | 90
-rw-r--r--  fs/nfs/pagelist.c | 21
-rw-r--r--  fs/nfs/pnfs.c | 342
-rw-r--r--  fs/nfs/pnfs.h | 57
-rw-r--r--  fs/nfs/pnfs_nfs.c | 88
-rw-r--r--  fs/nfs/super.c | 9
-rw-r--r--  fs/nfs/write.c | 60
-rw-r--r--  fs/nfs_common/grace.c | 23
-rw-r--r--  fs/nfsd/blocklayoutxdr.c | 2
-rw-r--r--  fs/nfsd/blocklayoutxdr.h | 15
-rw-r--r--  fs/nfsd/export.c | 73
-rw-r--r--  fs/nfsd/export.h | 1
-rw-r--r--  fs/nfsd/idmap.h | 4
-rw-r--r--  fs/nfsd/netns.h | 1
-rw-r--r--  fs/nfsd/nfs2acl.c | 10
-rw-r--r--  fs/nfsd/nfs3acl.c | 4
-rw-r--r--  fs/nfsd/nfs4acl.c | 8
-rw-r--r--  fs/nfsd/nfs4callback.c | 122
-rw-r--r--  fs/nfsd/nfs4idmap.c | 3
-rw-r--r--  fs/nfsd/nfs4layouts.c | 1
-rw-r--r--  fs/nfsd/nfs4proc.c | 30
-rw-r--r--  fs/nfsd/nfs4recover.c | 18
-rw-r--r--  fs/nfsd/nfs4state.c | 192
-rw-r--r--  fs/nfsd/nfs4xdr.c | 169
-rw-r--r--  fs/nfsd/nfssvc.c | 17
-rw-r--r--  fs/nfsd/state.h | 2
-rw-r--r--  fs/nfsd/vfs.c | 6
-rw-r--r--  fs/nfsd/vfs.h | 6
-rw-r--r--  fs/nilfs2/dir.c | 5
-rw-r--r--  fs/nilfs2/inode.c | 22
-rw-r--r--  fs/nilfs2/ioctl.c | 1
-rw-r--r--  fs/nilfs2/segbuf.c | 7
-rw-r--r--  fs/notify/dnotify/dnotify.c | 14
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 8
-rw-r--r--  fs/notify/fdinfo.c | 3
-rw-r--r--  fs/notify/fsnotify.c | 11
-rw-r--r--  fs/notify/fsnotify.h | 21
-rw-r--r--  fs/notify/inode_mark.c | 40
-rw-r--r--  fs/notify/inotify/inotify_user.c | 4
-rw-r--r--  fs/notify/mark.c | 141
-rw-r--r--  fs/notify/vfsmount_mark.c | 19
-rw-r--r--  fs/nsfs.c | 10
-rw-r--r--  fs/ntfs/file.c | 2
-rw-r--r--  fs/ntfs/inode.h | 2
-rw-r--r--  fs/ntfs/super.c | 23
-rw-r--r--  fs/ocfs2/acl.c | 26
-rw-r--r--  fs/ocfs2/alloc.c | 148
-rw-r--r--  fs/ocfs2/aops.c | 58
-rw-r--r--  fs/ocfs2/buffer_head_io.c | 6
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 78
-rw-r--r--  fs/ocfs2/dir.c | 70
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 78
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 22
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 6
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 10
-rw-r--r--  fs/ocfs2/dlmglue.c | 12
-rw-r--r--  fs/ocfs2/extent_map.c | 22
-rw-r--r--  fs/ocfs2/file.c | 75
-rw-r--r--  fs/ocfs2/inode.c | 49
-rw-r--r--  fs/ocfs2/inode.h | 2
-rw-r--r--  fs/ocfs2/ioctl.c | 1
-rw-r--r--  fs/ocfs2/journal.c | 32
-rw-r--r--  fs/ocfs2/localalloc.c | 3
-rw-r--r--  fs/ocfs2/move_extents.c | 8
-rw-r--r--  fs/ocfs2/namei.c | 155
-rw-r--r--  fs/ocfs2/ocfs2.h | 2
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 4
-rw-r--r--  fs/ocfs2/quota_local.c | 7
-rw-r--r--  fs/ocfs2/refcounttree.c | 86
-rw-r--r--  fs/ocfs2/stack_user.c | 9
-rw-r--r--  fs/ocfs2/suballoc.c | 96
-rw-r--r--  fs/ocfs2/super.c | 73
-rw-r--r--  fs/ocfs2/super.h | 8
-rw-r--r--  fs/ocfs2/xattr.c | 51
-rw-r--r--  fs/open.c | 63
-rw-r--r--  fs/overlayfs/inode.c | 25
-rw-r--r--  fs/overlayfs/overlayfs.h | 1
-rw-r--r--  fs/overlayfs/readdir.c | 77
-rw-r--r--  fs/overlayfs/super.c | 120
-rw-r--r--  fs/pnode.h | 2
-rw-r--r--  fs/posix_acl.c | 46
-rw-r--r--  fs/proc/Kconfig | 6
-rw-r--r--  fs/proc/array.c | 5
-rw-r--r--  fs/proc/base.c | 129
-rw-r--r--  fs/proc/generic.c | 67
-rw-r--r--  fs/proc/inode.c | 4
-rw-r--r--  fs/proc/internal.h | 6
-rw-r--r--  fs/proc/kcore.c | 4
-rw-r--r--  fs/proc/nommu.c | 2
-rw-r--r--  fs/proc/page.c | 65
-rw-r--r--  fs/proc/proc_sysctl.c | 37
-rw-r--r--  fs/proc/root.c | 11
-rw-r--r--  fs/proc/task_mmu.c | 296
-rw-r--r--  fs/proc/task_nommu.c | 2
-rw-r--r--  fs/proc_namespace.c | 34
-rw-r--r--  fs/pstore/inode.c | 12
-rw-r--r--  fs/qnx6/dir.c | 5
-rw-r--r--  fs/quota/dquot.c | 104
-rw-r--r--  fs/quota/quota.c | 4
-rw-r--r--  fs/reiserfs/inode.c | 7
-rw-r--r--  fs/reiserfs/namei.c | 63
-rw-r--r--  fs/reiserfs/super.c | 8
-rw-r--r--  fs/seq_file.c | 76
-rw-r--r--  fs/signalfd.c | 5
-rw-r--r--  fs/squashfs/squashfs_fs_i.h | 2
-rw-r--r--  fs/super.c | 177
-rw-r--r--  fs/sysfs/dir.c | 34
-rw-r--r--  fs/sysfs/mount.c | 9
-rw-r--r--  fs/sysv/dir.c | 5
-rw-r--r--  fs/sysv/sysv.h | 2
-rw-r--r--  fs/tracefs/inode.c | 17
-rw-r--r--  fs/udf/inode.c | 19
-rw-r--r--  fs/udf/super.c | 7
-rw-r--r--  fs/udf/udf_i.h | 2
-rw-r--r--  fs/ufs/Makefile | 2
-rw-r--r--  fs/ufs/balloc.c | 42
-rw-r--r--  fs/ufs/dir.c | 19
-rw-r--r--  fs/ufs/ialloc.c | 16
-rw-r--r--  fs/ufs/inode.c | 945
-rw-r--r--  fs/ufs/namei.c | 79
-rw-r--r--  fs/ufs/super.c | 47
-rw-r--r--  fs/ufs/truncate.c | 523
-rw-r--r--  fs/ufs/ufs.h | 16
-rw-r--r--  fs/userfaultfd.c | 1330
-rw-r--r--  fs/xfs/Makefile | 2
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 285
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.h | 10
-rw-r--r--  fs/xfs/libxfs/xfs_alloc_btree.c | 4
-rw-r--r--  fs/xfs/libxfs/xfs_attr.c | 27
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c | 4
-rw-r--r--  fs/xfs/libxfs/xfs_attr_remote.c | 53
-rw-r--r--  fs/xfs/libxfs/xfs_bit.c (renamed from fs/xfs/xfs_bit.c) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 30
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.c | 5
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c | 10
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c | 32
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.h | 11
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.c | 36
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_block.c | 4
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_data.c | 7
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_leaf.c | 4
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_node.c | 17
-rw-r--r--  fs/xfs/libxfs/xfs_dquot_buf.c | 4
-rw-r--r--  fs/xfs/libxfs/xfs_format.h | 85
-rw-r--r--  fs/xfs/libxfs/xfs_fs.h | 1
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c | 549
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.h | 15
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.c | 95
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.h | 10
-rw-r--r--  fs/xfs/libxfs/xfs_inode_buf.c | 12
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c | 55
-rw-r--r--  fs/xfs/libxfs/xfs_shared.h | 6
-rw-r--r--  fs/xfs/libxfs/xfs_symlink_remote.c | 4
-rw-r--r--  fs/xfs/libxfs/xfs_trans_resv.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_trans_space.h | 2
-rw-r--r--  fs/xfs/xfs_aops.c | 173
-rw-r--r--  fs/xfs/xfs_aops.h | 7
-rw-r--r--  fs/xfs/xfs_attr_inactive.c | 16
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 164
-rw-r--r--  fs/xfs/xfs_buf.c | 22
-rw-r--r--  fs/xfs/xfs_buf.h | 3
-rw-r--r--  fs/xfs/xfs_buf_item.c | 26
-rw-r--r--  fs/xfs/xfs_buf_item.h | 2
-rw-r--r--  fs/xfs/xfs_dir2_readdir.c | 11
-rw-r--r--  fs/xfs/xfs_dquot.c | 18
-rw-r--r--  fs/xfs/xfs_error.c | 4
-rw-r--r--  fs/xfs/xfs_error.h | 4
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 107
-rw-r--r--  fs/xfs/xfs_extfree_item.h | 26
-rw-r--r--  fs/xfs/xfs_file.c | 269
-rw-r--r--  fs/xfs/xfs_filestream.c | 3
-rw-r--r--  fs/xfs/xfs_fsops.c | 16
-rw-r--r--  fs/xfs/xfs_icache.c | 2
-rw-r--r--  fs/xfs/xfs_inode.c | 345
-rw-r--r--  fs/xfs/xfs_inode.h | 85
-rw-r--r--  fs/xfs/xfs_inode_item.c | 11
-rw-r--r--  fs/xfs/xfs_ioctl.c | 14
-rw-r--r--  fs/xfs/xfs_iomap.c | 18
-rw-r--r--  fs/xfs/xfs_iops.c | 56
-rw-r--r--  fs/xfs/xfs_itable.c | 16
-rw-r--r--  fs/xfs/xfs_linux.h | 14
-rw-r--r--  fs/xfs/xfs_log.c | 138
-rw-r--r--  fs/xfs/xfs_log.h | 14
-rw-r--r--  fs/xfs/xfs_log_cil.c | 20
-rw-r--r--  fs/xfs/xfs_log_priv.h | 4
-rw-r--r--  fs/xfs/xfs_log_recover.c | 320
-rw-r--r--  fs/xfs/xfs_mount.c | 44
-rw-r--r--  fs/xfs/xfs_mount.h | 4
-rw-r--r--  fs/xfs/xfs_pnfs.c | 4
-rw-r--r--  fs/xfs/xfs_qm.c | 7
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c | 20
-rw-r--r--  fs/xfs/xfs_quota.h | 1
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 71
-rw-r--r--  fs/xfs/xfs_super.c | 45
-rw-r--r--  fs/xfs/xfs_symlink.c | 28
-rw-r--r--  fs/xfs/xfs_trace.h | 82
-rw-r--r--  fs/xfs/xfs_trans.c | 106
-rw-r--r--  fs/xfs/xfs_trans.h | 16
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 6
-rw-r--r--  fs/xfs/xfs_trans_dquot.c | 32
-rw-r--r--  fs/xfs/xfs_trans_extfree.c | 32
-rw-r--r--  fs/xfs/xfs_trans_priv.h | 17
487 files changed, 19919 insertions, 36724 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 8aa56bb6e861..6caca025019d 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -52,7 +52,7 @@ enum {
/* Options that take integer arguments */
Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid,
/* String options */
- Opt_uname, Opt_remotename, Opt_trans, Opt_cache, Opt_cachetag,
+ Opt_uname, Opt_remotename, Opt_cache, Opt_cachetag,
/* Options that take no arguments */
Opt_nodevmap,
/* Cache options */
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 1ef16bd8280b..3abc447783aa 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -381,7 +381,7 @@ static ssize_t
v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct p9_fid *fid = iocb->ki_filp->private_data;
- int ret, err;
+ int ret, err = 0;
p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n",
iov_iter_count(to), iocb->ki_pos);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 510040b04c96..b1dc51888048 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -540,8 +540,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
unlock_new_inode(inode);
return inode;
error:
- unlock_new_inode(inode);
- iput(inode);
+ iget_failed(inode);
return ERR_PTR(retval);
}
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 09e4433717b8..e8aa57dc8d6d 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -149,8 +149,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
unlock_new_inode(inode);
return inode;
error:
- unlock_new_inode(inode);
- iput(inode);
+ iget_failed(inode);
return ERR_PTR(retval);
}
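
Both 9p hunks above replace the open-coded unlock_new_inode() + iput() error path with iget_failed(), which marks the half-constructed inode bad, unlocks it, and drops the reference in one call. A minimal sketch of the surrounding iget5_locked() idiom, not part of the patch; demo_fill() and the test/set callbacks are purely hypothetical:

#include <linux/fs.h>
#include <linux/err.h>

static int demo_fill(struct inode *inode);	/* hypothetical fs-specific init */

static struct inode *demo_iget(struct super_block *sb, unsigned long hashval,
			       int (*test)(struct inode *, void *),
			       int (*set)(struct inode *, void *), void *data)
{
	struct inode *inode = iget5_locked(sb, hashval, test, set, data);

	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;		/* cache hit, already initialised */
	if (demo_fill(inode) < 0) {
		iget_failed(inode);	/* marks bad, unlocks, drops the ref */
		return ERR_PTR(-EIO);
	}
	unlock_new_inode(inode);
	return inode;
}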
diff --git a/fs/Kconfig b/fs/Kconfig
index 011f43365d7b..da3f32f1a4e4 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -11,18 +11,15 @@ config DCACHE_WORD_ACCESS
if BLOCK
source "fs/ext2/Kconfig"
-source "fs/ext3/Kconfig"
source "fs/ext4/Kconfig"
-source "fs/jbd/Kconfig"
source "fs/jbd2/Kconfig"
config FS_MBCACHE
# Meta block cache for Extended Attributes (ext2/ext3/ext4)
tristate
default y if EXT2_FS=y && EXT2_FS_XATTR
- default y if EXT3_FS=y && EXT3_FS_XATTR
default y if EXT4_FS=y
- default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS
+ default m if EXT2_FS_XATTR || EXT4_FS
source "fs/reiserfs/Kconfig"
source "fs/jfs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index cb20e4bf2303..f79cf4043e60 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o
obj-$(CONFIG_SIGNALFD) += signalfd.o
obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
+obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FILE_LOCKING) += locks.o
@@ -62,12 +63,10 @@ obj-$(CONFIG_DLM) += dlm/
# Do not add any filesystems before this line
obj-$(CONFIG_FSCACHE) += fscache/
obj-$(CONFIG_REISERFS_FS) += reiserfs/
-obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
obj-$(CONFIG_EXT2_FS) += ext2/
# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2
# unless explicitly requested by rootfstype
obj-$(CONFIG_EXT4_FS) += ext4/
-obj-$(CONFIG_JBD) += jbd/
obj-$(CONFIG_JBD2) += jbd2/
obj-$(CONFIG_CRAMFS) += cramfs/
obj-$(CONFIG_SQUASHFS) += squashfs/
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index a19c31d3f369..4d4a0df8344f 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -242,7 +242,7 @@ static struct kmem_cache *adfs_inode_cachep;
static struct inode *adfs_alloc_inode(struct super_block *sb)
{
struct adfs_inode_info *ei;
- ei = (struct adfs_inode_info *)kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL);
+ ei = kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index cffe8370fb44..c69a87eaf57d 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -64,7 +64,7 @@ struct affs_inode_info {
/* short cut to get to the affs specific inode data */
static inline struct affs_inode_info *AFFS_I(struct inode *inode)
{
- return list_entry(inode, struct affs_inode_info, vfs_inode);
+ return container_of(inode, struct affs_inode_info, vfs_inode);
}
/*
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index a8f463c028ce..5fa92bc790ef 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -140,7 +140,7 @@ affs_remove_link(struct dentry *dentry)
{
struct inode *dir, *inode = d_inode(dentry);
struct super_block *sb = inode->i_sb;
- struct buffer_head *bh = NULL, *link_bh = NULL;
+ struct buffer_head *bh, *link_bh = NULL;
u32 link_ino, ino;
int retval;
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index a022f4accd76..17349500592d 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -346,7 +346,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
{
struct super_block *sb = dir->i_sb;
struct buffer_head *inode_bh = NULL;
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
u32 block = 0;
int retval;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 3f89c9e05b40..5b50c4ca43a7 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -18,6 +18,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/writeback.h>
+#include <linux/blkdev.h>
#include "affs.h"
static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
@@ -352,18 +353,19 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
* blocks, we will have to change it.
*/
- size = sb->s_bdev->bd_inode->i_size >> 9;
+ size = i_size_read(sb->s_bdev->bd_inode) >> 9;
pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
affs_set_blocksize(sb, PAGE_SIZE);
/* Try to find root block. Its location depends on the block size. */
- i = 512;
- j = 4096;
+ i = bdev_logical_block_size(sb->s_bdev);
+ j = PAGE_SIZE;
if (blocksize > 0) {
i = j = blocksize;
size = size / (blocksize / 512);
}
+
for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) {
sbi->s_root_block = root_block;
if (root_block < 0)
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index f39b71c3981e..ea5b69a18ba9 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -16,14 +16,12 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
struct inode *inode = page->mapping->host;
char *link = kmap(page);
struct slink_front *lf;
- int err;
int i, j;
char c;
char lc;
pr_debug("follow_link(ino=%lu)\n", inode->i_ino);
- err = -EIO;
bh = affs_bread(inode->i_sb, inode->i_ino);
if (!bh)
goto fail;
@@ -66,7 +64,7 @@ fail:
SetPageError(page);
kunmap(page);
unlock_page(page);
- return err;
+ return -EIO;
}
const struct address_space_operations affs_symlink_aops = {
diff --git a/fs/aio.c b/fs/aio.c
index 480440f4701f..155f84253f33 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -308,15 +308,9 @@ static void aio_free_ring(struct kioctx *ctx)
}
}
-static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
-{
- vma->vm_flags |= VM_DONTEXPAND;
- vma->vm_ops = &generic_file_vm_ops;
- return 0;
-}
-
-static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
+static int aio_ring_mremap(struct vm_area_struct *vma)
{
+ struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
struct kioctx_table *table;
int i, res = -EINVAL;
@@ -342,9 +336,24 @@ static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
return res;
}
+static const struct vm_operations_struct aio_ring_vm_ops = {
+ .mremap = aio_ring_mremap,
+#if IS_ENABLED(CONFIG_MMU)
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = filemap_page_mkwrite,
+#endif
+};
+
+static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ vma->vm_flags |= VM_DONTEXPAND;
+ vma->vm_ops = &aio_ring_vm_ops;
+ return 0;
+}
+
static const struct file_operations aio_ring_fops = {
.mmap = aio_ring_mmap,
- .mremap = aio_ring_remap,
};
#if IS_ENABLED(CONFIG_MIGRATION)
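
For context: the aio hunks above reflect the mremap notification moving from file_operations to vm_operations_struct, so the hook is now installed per-VMA at mmap time. A hedged sketch of the same pattern for an arbitrary driver, not part of the patch; all demo_* names are invented:

#include <linux/fs.h>
#include <linux/mm.h>

static int demo_mremap(struct vm_area_struct *vma)
{
	/* called when userspace moves the mapping; vm_start is the new base */
	pr_debug("ring remapped to %lx\n", vma->vm_start);
	return 0;			/* non-zero fails the mremap() call */
}

static const struct vm_operations_struct demo_vm_ops = {
	.mremap		= demo_mremap,
#if IS_ENABLED(CONFIG_MMU)
	.fault		= filemap_fault,
#endif
};

static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_flags |= VM_DONTEXPAND;
	vma->vm_ops = &demo_vm_ops;	/* per-VMA hook replaces f_op->mremap */
	return 0;
}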
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 5b700ef1e59d..c37149b929be 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -238,11 +238,6 @@ static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi)
return d_inode(sbi->sb->s_root)->i_ino;
}
-static inline int simple_positive(struct dentry *dentry)
-{
- return d_really_is_positive(dentry) && !d_unhashed(dentry);
-}
-
static inline void __autofs4_add_expiring(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
diff --git a/fs/befs/befs.h b/fs/befs/befs.h
index 1fead8d56a98..35d19e8731e3 100644
--- a/fs/befs/befs.h
+++ b/fs/befs/befs.h
@@ -112,7 +112,7 @@ BEFS_SB(const struct super_block *super)
static inline struct befs_inode_info *
BEFS_I(const struct inode *inode)
{
- return list_entry(inode, struct befs_inode_info, vfs_inode);
+ return container_of(inode, struct befs_inode_info, vfs_inode);
}
static inline befs_blocknr_t
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index cd46e4158830..6b659967898e 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1530,7 +1530,7 @@ static int fill_files_note(struct memelfnote *note)
file = vma->vm_file;
if (!file)
continue;
- filename = d_path(&file->f_path, name_curpos, remaining);
+ filename = file_path(file, name_curpos, remaining);
if (IS_ERR(filename)) {
if (PTR_ERR(filename) == -ENAMETOOLONG) {
vfree(data);
@@ -1540,7 +1540,7 @@ static int fill_files_note(struct memelfnote *note)
continue;
}
- /* d_path() fills at the end, move name down */
+ /* file_path() fills at the end, move name down */
/* n = strlen(filename) + 1: */
n = (name_curpos + remaining) - filename;
remaining = filename - name_curpos;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index f04c873a7365..22ea424ee741 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -28,6 +28,7 @@
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
+#include <linux/dax.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -43,7 +44,7 @@ static inline struct bdev_inode *BDEV_I(struct inode *inode)
return container_of(inode, struct bdev_inode, vfs_inode);
}
-inline struct block_device *I_BDEV(struct inode *inode)
+struct block_device *I_BDEV(struct inode *inode)
{
return &BDEV_I(inode)->bdev;
}
@@ -152,6 +153,9 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
+ if (IS_DAX(inode))
+ return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
+ NULL, DIO_SKIP_DIO_COUNT);
return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset,
blkdev_get_block, NULL, NULL,
DIO_SKIP_DIO_COUNT);
@@ -377,7 +381,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
struct page *page)
{
const struct block_device_operations *ops = bdev->bd_disk->fops;
- if (!ops->rw_page)
+ if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
}
@@ -408,7 +412,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
int result;
int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
const struct block_device_operations *ops = bdev->bd_disk->fops;
- if (!ops->rw_page)
+ if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
set_page_writeback(page);
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
@@ -438,11 +442,17 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
* accessible at this address.
*/
long bdev_direct_access(struct block_device *bdev, sector_t sector,
- void **addr, unsigned long *pfn, long size)
+ void __pmem **addr, unsigned long *pfn, long size)
{
long avail;
const struct block_device_operations *ops = bdev->bd_disk->fops;
+ /*
+ * The device driver is allowed to sleep, in order to make the
+ * memory directly accessible.
+ */
+ might_sleep();
+
if (size < 0)
return size;
if (!ops->direct_access)
@@ -453,7 +463,7 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector,
sector += get_start_sect(bdev);
if (sector % (PAGE_SIZE / 512))
return -EINVAL;
- avail = ops->direct_access(bdev, sector, addr, pfn, size);
+ avail = ops->direct_access(bdev, sector, addr, pfn);
if (!avail)
return -ERANGE;
return min(avail, size);
@@ -1170,6 +1180,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
+ bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0;
if (!partno) {
ret = -ENXIO;
bdev->bd_part = disk_get_part(disk, partno);
@@ -1759,7 +1770,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
{
struct inode *inode, *old_inode = NULL;
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&blockdev_superblock->s_inode_list_lock);
list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
struct address_space *mapping = inode->i_mapping;
@@ -1771,13 +1782,13 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
}
__iget(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&blockdev_superblock->s_inode_list_lock);
/*
* We hold a reference to 'inode' so it couldn't have been
* removed from s_inodes list while we dropped the
- * inode_sb_list_lock. We cannot iput the inode now as we can
+ * s_inode_list_lock. We cannot iput the inode now as we can
* be holding the last reference and we cannot iput it under
- * inode_sb_list_lock. So we keep the reference and iput it
+ * s_inode_list_lock. So we keep the reference and iput it
* later.
*/
iput(old_inode);
@@ -1785,8 +1796,8 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
func(I_BDEV(inode), arg);
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&blockdev_superblock->s_inode_list_lock);
}
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&blockdev_superblock->s_inode_list_lock);
iput(old_inode);
}
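
The bdev_direct_access() hunks above change its contract: the returned address is now __pmem-annotated, the driver no longer receives the size argument, and the call may sleep. A hedged sketch of a caller under those assumptions; demo_peek() is invented for illustration:

#include <linux/blkdev.h>

static long demo_peek(struct block_device *bdev, sector_t sector)
{
	void __pmem *addr;
	unsigned long pfn;
	long len;

	/* sleepable context required: the driver may block in here */
	len = bdev_direct_access(bdev, sector, &addr, &pfn, PAGE_SIZE);
	if (len < 0)
		return len;	/* -EOPNOTSUPP, -EINVAL or -ERANGE */
	return len;		/* bytes directly addressable at addr */
}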
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index df9932b00d08..1ce06c849a86 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -85,6 +85,7 @@ BTRFS_WORK_HELPER(extent_refs_helper);
BTRFS_WORK_HELPER(scrub_helper);
BTRFS_WORK_HELPER(scrubwrc_helper);
BTRFS_WORK_HELPER(scrubnc_helper);
+BTRFS_WORK_HELPER(scrubparity_helper);
static struct __btrfs_workqueue *
__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index ec2ee477f8ba..b0b093b6afec 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -64,6 +64,8 @@ BTRFS_WORK_HELPER_PROTO(extent_refs_helper);
BTRFS_WORK_HELPER_PROTO(scrub_helper);
BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
+BTRFS_WORK_HELPER_PROTO(scrubparity_helper);
+
struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
unsigned int flags,
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 614aaa1969bd..ecbc63d3143e 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -206,10 +206,33 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
return -ENOMEM;
ref->root_id = root_id;
- if (key)
+ if (key) {
ref->key_for_search = *key;
- else
+ /*
+ * We can often find data backrefs with an offset that is too
+ * large (>= LLONG_MAX, maximum allowed file offset) due to
+ * underflows when subtracting a file's offset with the data
+ * offset of its corresponding extent data item. This can
+ * happen for example in the clone ioctl.
+ * So if we detect such case we set the search key's offset to
+ * zero to make sure we will find the matching file extent item
+ * at add_all_parents(), otherwise we will miss it because the
+ * offset taken from the backref is much larger than the offset
+ * of the file extent item. This can make us scan a very large
+ * number of file extent items, but at least it will not make
+ * us miss any.
+ * This is an ugly workaround for a behaviour that should have
+ * never existed, but it does and a fix for the clone ioctl
+ * would touch a lot of places, cause backwards incompatibility
+ * and would not fix the problem for extents cloned with older
+ * kernels.
+ */
+ if (ref->key_for_search.type == BTRFS_EXTENT_DATA_KEY &&
+ ref->key_for_search.offset >= LLONG_MAX)
+ ref->key_for_search.offset = 0;
+ } else {
memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
+ }
ref->inode_list = NULL;
ref->level = level;
@@ -250,8 +273,12 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
* the first item to check. But sometimes, we may enter it with
* slot==nritems. In that case, go to the next leaf before we continue.
*/
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
- ret = btrfs_next_old_leaf(root, path, time_seq);
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ if (time_seq == (u64)-1)
+ ret = btrfs_next_leaf(root, path);
+ else
+ ret = btrfs_next_old_leaf(root, path, time_seq);
+ }
while (!ret && count < total_refs) {
eb = path->nodes[0];
@@ -291,7 +318,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
eie = NULL;
}
next:
- ret = btrfs_next_old_item(root, path, time_seq);
+ if (time_seq == (u64)-1)
+ ret = btrfs_next_item(root, path);
+ else
+ ret = btrfs_next_old_item(root, path, time_seq);
}
if (ret > 0)
@@ -334,6 +364,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
if (path->search_commit_root)
root_level = btrfs_header_level(root->commit_root);
+ else if (time_seq == (u64)-1)
+ root_level = btrfs_header_level(root->node);
else
root_level = btrfs_old_root_level(root, time_seq);
@@ -343,7 +375,12 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
}
path->lowest_level = level;
- ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
+ if (time_seq == (u64)-1)
+ ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path,
+ 0, 0);
+ else
+ ret = btrfs_search_old_slot(root, &ref->key_for_search, path,
+ time_seq);
/* root node has been locked, we can release @subvol_srcu safely here */
srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -491,7 +528,9 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
BUG_ON(!ref->wanted_disk_byte);
eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
0);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ return PTR_ERR(eb);
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
}
@@ -507,7 +546,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
}
/*
- * merge two lists of backrefs and adjust counts accordingly
+ * merge backrefs and adjust counts accordingly
*
* mode = 1: merge identical keys, if key is set
* FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
@@ -535,9 +574,9 @@ static void __merge_refs(struct list_head *head, int mode)
ref2 = list_entry(pos2, struct __prelim_ref, list);
+ if (!ref_for_same_block(ref1, ref2))
+ continue;
if (mode == 1) {
- if (!ref_for_same_block(ref1, ref2))
- continue;
if (!ref1->parent && ref2->parent) {
xchg = ref1;
ref1 = ref2;
@@ -572,8 +611,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
struct list_head *prefs, u64 *total_refs,
u64 inum)
{
+ struct btrfs_delayed_ref_node *node;
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
- struct rb_node *n = &head->node.rb_node;
struct btrfs_key key;
struct btrfs_key op_key = {0};
int sgn;
@@ -583,12 +622,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
spin_lock(&head->lock);
- n = rb_first(&head->ref_root);
- while (n) {
- struct btrfs_delayed_ref_node *node;
- node = rb_entry(n, struct btrfs_delayed_ref_node,
- rb_node);
- n = rb_next(n);
+ list_for_each_entry(node, &head->ref_list, list) {
if (node->seq > seq)
continue;
@@ -621,7 +655,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
struct btrfs_delayed_tree_ref *ref;
ref = btrfs_delayed_node_to_tree_ref(node);
- ret = __add_prelim_ref(prefs, ref->root, NULL,
+ ret = __add_prelim_ref(prefs, 0, NULL,
ref->level + 1, ref->parent,
node->bytenr,
node->ref_mod * sgn, GFP_ATOMIC);
@@ -653,11 +687,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
struct btrfs_delayed_data_ref *ref;
ref = btrfs_delayed_node_to_data_ref(node);
-
- key.objectid = ref->objectid;
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = ref->offset;
- ret = __add_prelim_ref(prefs, ref->root, &key, 0,
+ ret = __add_prelim_ref(prefs, 0, NULL, 0,
ref->parent, node->bytenr,
node->ref_mod * sgn, GFP_ATOMIC);
break;
@@ -882,6 +912,11 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
*
* NOTE: This can return values > 0
*
+ * If time_seq is set to (u64)-1, it will not search delayed_refs, and behaves
+ * much like the trans == NULL case; the only difference is that it does not
+ * search the commit root.
+ * The special case is for qgroup to search roots in commit_transaction().
+ *
* FIXME some caching might speed things up
*/
static int find_parent_nodes(struct btrfs_trans_handle *trans,
@@ -920,6 +955,9 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
path->skip_locking = 1;
}
+ if (time_seq == (u64)-1)
+ path->skip_locking = 1;
+
/*
* grab both a lock on the path and a lock on the delayed ref head.
* We need both to get a consistent picture of how the refs look
@@ -934,9 +972,10 @@ again:
BUG_ON(ret == 0);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (trans && likely(trans->type != __TRANS_DUMMY)) {
+ if (trans && likely(trans->type != __TRANS_DUMMY) &&
+ time_seq != (u64)-1) {
#else
- if (trans) {
+ if (trans && time_seq != (u64)-1) {
#endif
/*
* look if there are updates for this ref queued and lock the
@@ -1034,7 +1073,10 @@ again:
eb = read_tree_block(fs_info->extent_root,
ref->parent, 0);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
ret = -EIO;
goto out;
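
Several hunks in this file (and in ctree.c below) follow from read_tree_block() now returning ERR_PTR() on failure instead of NULL. A sketch of the resulting check sequence, assuming it sits inside fs/btrfs where ctree.h and disk-io.h are visible; demo_read_node() is hypothetical:

#include "ctree.h"
#include "disk-io.h"

static int demo_read_node(struct btrfs_root *root, u64 bytenr, u64 gen,
			  struct extent_buffer **out)
{
	struct extent_buffer *eb = read_tree_block(root, bytenr, gen);

	if (IS_ERR(eb))
		return PTR_ERR(eb);	/* the read could not be issued */
	if (!extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);	/* buffer exists but I/O failed */
		return -EIO;
	}
	*out = eb;
	return 0;
}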
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 0ef5cc13fae2..81220b2203c6 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,6 +44,8 @@
#define BTRFS_INODE_IN_DELALLOC_LIST 9
#define BTRFS_INODE_READDIO_NEED_LOCK 10
#define BTRFS_INODE_HAS_PROPS 11
+/* DIO is ready to submit */
+#define BTRFS_INODE_DIO_READY 12
/*
* The following 3 bits are meant only for the btree inode.
* When any of them is set, it means an error happened while writing an
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index ce7dec88f4b8..541fbfaed276 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -343,7 +343,7 @@ static int btrfsic_process_written_superblock(
struct btrfsic_state *state,
struct btrfsic_block *const block,
struct btrfs_super_block *const super_hdr);
-static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
+static void btrfsic_bio_end_io(struct bio *bp);
static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
const struct btrfsic_block *block,
@@ -2207,7 +2207,7 @@ continue_loop:
goto again;
}
-static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
+static void btrfsic_bio_end_io(struct bio *bp)
{
struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
int iodone_w_error;
@@ -2215,7 +2215,7 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
/* mutex is not held! This is not safe if IO is not yet completed
* on umount */
iodone_w_error = 0;
- if (bio_error_status)
+ if (bp->bi_error)
iodone_w_error = 1;
BUG_ON(NULL == block);
@@ -2230,7 +2230,7 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
printk(KERN_INFO
"bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
- bio_error_status,
+ bp->bi_error,
btrfsic_get_block_type(dev_state->state, block),
block->logical_bytenr, dev_state->name,
block->dev_bytenr, block->mirror_num);
@@ -2252,7 +2252,7 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
block = next_block;
} while (NULL != block);
- bp->bi_end_io(bp, bio_error_status);
+ bp->bi_end_io(bp);
}
static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ce62324c78e7..57ee8ca29b06 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -97,10 +97,7 @@ static inline int compressed_bio_size(struct btrfs_root *root,
static struct bio *compressed_bio_alloc(struct block_device *bdev,
u64 first_byte, gfp_t gfp_flags)
{
- int nr_vecs;
-
- nr_vecs = bio_get_nr_vecs(bdev);
- return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags);
+ return btrfs_bio_alloc(bdev, first_byte >> 9, BIO_MAX_PAGES, gfp_flags);
}
static int check_compressed_csum(struct inode *inode,
@@ -152,7 +149,7 @@ fail:
* The compressed pages are freed here, and it must be run
* in process context
*/
-static void end_compressed_bio_read(struct bio *bio, int err)
+static void end_compressed_bio_read(struct bio *bio)
{
struct compressed_bio *cb = bio->bi_private;
struct inode *inode;
@@ -160,7 +157,7 @@ static void end_compressed_bio_read(struct bio *bio, int err)
unsigned long index;
int ret;
- if (err)
+ if (bio->bi_error)
cb->errors = 1;
/* if there are more bios still pending for this compressed
@@ -210,7 +207,7 @@ csum_failed:
bio_for_each_segment_all(bvec, cb->orig_bio, i)
SetPageChecked(bvec->bv_page);
- bio_endio(cb->orig_bio, 0);
+ bio_endio(cb->orig_bio);
}
/* finally free the cb struct */
@@ -266,7 +263,7 @@ static noinline void end_compressed_writeback(struct inode *inode,
* This also calls the writeback end hooks for the file pages so that
* metadata and checksums can be updated in the file.
*/
-static void end_compressed_bio_write(struct bio *bio, int err)
+static void end_compressed_bio_write(struct bio *bio)
{
struct extent_io_tree *tree;
struct compressed_bio *cb = bio->bi_private;
@@ -274,7 +271,7 @@ static void end_compressed_bio_write(struct bio *bio, int err)
struct page *page;
unsigned long index;
- if (err)
+ if (bio->bi_error)
cb->errors = 1;
/* if there are more bios still pending for this compressed
@@ -293,7 +290,7 @@ static void end_compressed_bio_write(struct bio *bio, int err)
cb->start,
cb->start + cb->len - 1,
NULL,
- err ? 0 : 1);
+ bio->bi_error ? 0 : 1);
cb->compressed_pages[0]->mapping = NULL;
end_compressed_writeback(inode, cb);
@@ -697,8 +694,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ret = btrfs_map_bio(root, READ, comp_bio,
mirror_num, 0);
- if (ret)
- bio_endio(comp_bio, ret);
+ if (ret) {
+ bio->bi_error = ret;
+ bio_endio(comp_bio);
+ }
bio_put(comp_bio);
@@ -724,8 +723,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
}
ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
- if (ret)
- bio_endio(comp_bio, ret);
+ if (ret) {
+ bio->bi_error = ret;
+ bio_endio(comp_bio);
+ }
bio_put(comp_bio);
return 0;
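
These compression hunks track the block-layer change in which bio_endio() lost its error argument and completion status travels in bio->bi_error instead. A minimal sketch of the new convention, not part of the patch; demo_* names are invented:

#include <linux/bio.h>

static void demo_end_io(struct bio *bio)
{
	if (bio->bi_error)		/* the handler reads the status itself */
		pr_err("I/O error %d\n", bio->bi_error);
	bio_put(bio);
}

static void demo_fail_bio(struct bio *bio, int err)
{
	bio->bi_error = err;		/* record the error on the bio ... */
	bio_endio(bio);			/* ... then complete it, no argument */
}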
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0f11ebc92f02..5f745eadf77d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1159,8 +1159,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
return ret;
+ }
}
if (buf == root->node) {
@@ -1439,8 +1441,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
old = read_tree_block(root, logical, 0);
- if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
- free_extent_buffer(old);
+ if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
+ if (!IS_ERR(old))
+ free_extent_buffer(old);
btrfs_warn(root->fs_info,
"failed to read tree block %llu from get_old_root", logical);
} else {
@@ -1685,7 +1688,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (!cur || !uptodate) {
if (!cur) {
cur = read_tree_block(root, blocknr, gen);
- if (!cur || !extent_buffer_uptodate(cur)) {
+ if (IS_ERR(cur)) {
+ return PTR_ERR(cur);
+ } else if (!extent_buffer_uptodate(cur)) {
free_extent_buffer(cur);
return -EIO;
}
@@ -1864,8 +1869,9 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
btrfs_node_ptr_generation(parent, slot));
- if (eb && !extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
+ if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) {
+ if (!IS_ERR(eb))
+ free_extent_buffer(eb);
eb = NULL;
}
@@ -2494,7 +2500,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
ret = -EAGAIN;
tmp = read_tree_block(root, blocknr, 0);
- if (tmp) {
+ if (!IS_ERR(tmp)) {
/*
* If the read above didn't mark this buffer up to date,
* it will never end up being up to date. Set ret to EIO now
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6f364e1d8d3d..938efe33be80 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -174,7 +174,7 @@ struct btrfs_ordered_sum;
/* csum types */
#define BTRFS_CSUM_TYPE_CRC32 0
-static int btrfs_csum_sizes[] = { 4, 0 };
+static int btrfs_csum_sizes[] = { 4 };
/* four bytes for CRC32 */
#define BTRFS_EMPTY_DIR_SIZE 0
@@ -1300,7 +1300,7 @@ struct btrfs_block_group_cache {
/* for raid56, this is a full stripe, without parity */
unsigned long full_stripe_len;
- unsigned int ro:1;
+ unsigned int ro;
unsigned int iref:1;
unsigned int has_caching_ctl:1;
unsigned int removed:1;
@@ -1518,12 +1518,6 @@ struct btrfs_fs_info {
*/
struct mutex ordered_operations_mutex;
- /*
- * Same as ordered_operations_mutex except this is for ordered extents
- * and not the operations.
- */
- struct mutex ordered_extent_flush_mutex;
-
struct rw_semaphore commit_root_sem;
struct rw_semaphore cleanup_work_sem;
@@ -1619,10 +1613,7 @@ struct btrfs_fs_info {
struct task_struct *cleaner_kthread;
int thread_pool_size;
- struct kobject super_kobj;
struct kobject *space_info_kobj;
- struct kobject *device_dir_kobj;
- struct completion kobj_unregister;
int do_barriers;
int closing;
int log_root_recovering;
@@ -1698,6 +1689,7 @@ struct btrfs_fs_info {
struct btrfs_workqueue *scrub_workers;
struct btrfs_workqueue *scrub_wr_completion_workers;
struct btrfs_workqueue *scrub_nocow_workers;
+ struct btrfs_workqueue *scrub_parity_workers;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
u32 check_integrity_print_mask;
@@ -1735,7 +1727,7 @@ struct btrfs_fs_info {
/* list of dirty qgroups to be written at next commit */
struct list_head dirty_qgroups;
- /* used by btrfs_qgroup_record_ref for an efficient tree traversal */
+ /* used by qgroup for an efficient tree traversal */
u64 qgroup_seq;
/* qgroup rescan items */
@@ -1780,6 +1772,7 @@ struct btrfs_fs_info {
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
struct mutex unused_bg_unpin_mutex;
+ struct mutex delete_unused_bgs_mutex;
/* For btrfs to record security options */
struct security_mnt_opts security_opts;
@@ -3438,6 +3431,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start,
struct extent_map *em);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
+void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
+void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
@@ -3458,6 +3453,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode);
void btrfs_orphan_release_metadata(struct inode *inode);
@@ -3495,9 +3491,9 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
void btrfs_block_rsv_release(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
-int btrfs_set_block_group_ro(struct btrfs_root *root,
+int btrfs_inc_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
-void btrfs_set_block_group_rw(struct btrfs_root *root,
+void btrfs_dec_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
@@ -3515,6 +3511,9 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
int __get_raid_index(u64 flags);
int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
+void check_system_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const u64 type);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -4050,6 +4049,7 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
#ifdef CONFIG_BTRFS_ASSERT
+__cold
static inline void assfail(char *expr, char *file, int line)
{
pr_err("BTRFS: assertion failed: %s, file: %s, line: %d",
@@ -4065,10 +4065,13 @@ static inline void assfail(char *expr, char *file, int line)
#define btrfs_assert()
__printf(5, 6)
+__cold
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
+const char *btrfs_decode_error(int errno);
+__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *function,
unsigned int line, int errno);
@@ -4111,11 +4114,17 @@ static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
* Call btrfs_abort_transaction as early as possible when an error condition is
* detected, that way the exact line number is reported.
*/
-
#define btrfs_abort_transaction(trans, root, errno) \
do { \
- __btrfs_abort_transaction(trans, root, __func__, \
- __LINE__, errno); \
+ /* Report first abort since mount */ \
+ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
+ &((root)->fs_info->fs_state))) { \
+ WARN(1, KERN_DEBUG \
+ "BTRFS: Transaction aborted (error %d)\n", \
+ (errno)); \
+ } \
+ __btrfs_abort_transaction((trans), (root), __func__, \
+ __LINE__, (errno)); \
} while (0)
#define btrfs_std_error(fs_info, errno) \
@@ -4132,6 +4141,7 @@ do { \
} while (0)
__printf(5, 6)
+__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
@@ -4172,8 +4182,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *cow);
-void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending,
+void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve);
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
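
The reworked btrfs_abort_transaction() macro above leans on the test_and_set_bit() report-once idiom: the bit is set atomically, and only the caller that saw it clear emits the warning. A stand-alone sketch of that idiom with a hypothetical flag word:

#include <linux/bitops.h>
#include <linux/kernel.h>

static unsigned long demo_state;	/* hypothetical state word */
#define DEMO_ABORTED	0		/* bit number, not a mask */

static void demo_abort(int errno)
{
	/* returns the old bit value: exactly one caller observes 0 */
	if (!test_and_set_bit(DEMO_ABORTED, &demo_state))
		WARN(1, "transaction aborted (error %d)\n", errno);
	/* subsequent aborts stay silent; the bit remains set */
}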
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 8f8ed7d20bac..ac3e81da6d4e 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -22,6 +22,7 @@
#include "ctree.h"
#include "delayed-ref.h"
#include "transaction.h"
+#include "qgroup.h"
struct kmem_cache *btrfs_delayed_ref_head_cachep;
struct kmem_cache *btrfs_delayed_tree_ref_cachep;
@@ -84,87 +85,6 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
return 0;
}
-/*
- * entries in the rb tree are ordered by the byte number of the extent,
- * type of the delayed backrefs and content of delayed backrefs.
- */
-static int comp_entry(struct btrfs_delayed_ref_node *ref2,
- struct btrfs_delayed_ref_node *ref1,
- bool compare_seq)
-{
- if (ref1->bytenr < ref2->bytenr)
- return -1;
- if (ref1->bytenr > ref2->bytenr)
- return 1;
- if (ref1->is_head && ref2->is_head)
- return 0;
- if (ref2->is_head)
- return -1;
- if (ref1->is_head)
- return 1;
- if (ref1->type < ref2->type)
- return -1;
- if (ref1->type > ref2->type)
- return 1;
- if (ref1->no_quota > ref2->no_quota)
- return 1;
- if (ref1->no_quota < ref2->no_quota)
- return -1;
- /* merging of sequenced refs is not allowed */
- if (compare_seq) {
- if (ref1->seq < ref2->seq)
- return -1;
- if (ref1->seq > ref2->seq)
- return 1;
- }
- if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
- ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
- return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
- btrfs_delayed_node_to_tree_ref(ref1),
- ref1->type);
- } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
- ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
- return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
- btrfs_delayed_node_to_data_ref(ref1));
- }
- BUG();
- return 0;
-}
-
-/*
- * insert a new ref into the rbtree. This returns any existing refs
- * for the same (bytenr,parent) tuple, or NULL if the new node was properly
- * inserted.
- */
-static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
- struct rb_node *node)
-{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent_node = NULL;
- struct btrfs_delayed_ref_node *entry;
- struct btrfs_delayed_ref_node *ins;
- int cmp;
-
- ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- while (*p) {
- parent_node = *p;
- entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
- rb_node);
-
- cmp = comp_entry(entry, ins, 1);
- if (cmp < 0)
- p = &(*p)->rb_left;
- else if (cmp > 0)
- p = &(*p)->rb_right;
- else
- return entry;
- }
-
- rb_link_node(node, parent_node, p);
- rb_insert_color(node, root);
- return NULL;
-}
-
/* insert a new ref to head ref rbtree */
static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
struct rb_node *node)
@@ -268,7 +188,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
rb_erase(&head->href_node, &delayed_refs->href_root);
} else {
assert_spin_locked(&head->lock);
- rb_erase(&ref->rb_node, &head->ref_root);
+ list_del(&ref->list);
}
ref->in_tree = 0;
btrfs_put_delayed_ref(ref);
@@ -277,99 +197,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
trans->delayed_ref_updates--;
}
-static int merge_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head,
- struct btrfs_delayed_ref_node *ref, u64 seq)
-{
- struct rb_node *node;
- int mod = 0;
- int done = 0;
-
- node = rb_next(&ref->rb_node);
- while (!done && node) {
- struct btrfs_delayed_ref_node *next;
-
- next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- node = rb_next(node);
- if (seq && next->seq >= seq)
- break;
- if (comp_entry(ref, next, 0))
- continue;
-
- if (ref->action == next->action) {
- mod = next->ref_mod;
- } else {
- if (ref->ref_mod < next->ref_mod) {
- struct btrfs_delayed_ref_node *tmp;
-
- tmp = ref;
- ref = next;
- next = tmp;
- done = 1;
- }
- mod = -next->ref_mod;
- }
-
- drop_delayed_ref(trans, delayed_refs, head, next);
- ref->ref_mod += mod;
- if (ref->ref_mod == 0) {
- drop_delayed_ref(trans, delayed_refs, head, ref);
- done = 1;
- } else {
- /*
- * You can't have multiples of the same ref on a tree
- * block.
- */
- WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
- ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
- }
- }
- return done;
-}
-
-void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head)
-{
- struct rb_node *node;
- u64 seq = 0;
-
- assert_spin_locked(&head->lock);
- /*
- * We don't have too much refs to merge in the case of delayed data
- * refs.
- */
- if (head->is_data)
- return;
-
- spin_lock(&fs_info->tree_mod_seq_lock);
- if (!list_empty(&fs_info->tree_mod_seq_list)) {
- struct seq_list *elem;
-
- elem = list_first_entry(&fs_info->tree_mod_seq_list,
- struct seq_list, list);
- seq = elem->seq;
- }
- spin_unlock(&fs_info->tree_mod_seq_lock);
-
- node = rb_first(&head->ref_root);
- while (node) {
- struct btrfs_delayed_ref_node *ref;
-
- ref = rb_entry(node, struct btrfs_delayed_ref_node,
- rb_node);
- /* We can't merge refs that are outside of our seq count */
- if (seq && ref->seq >= seq)
- break;
- if (merge_ref(trans, delayed_refs, head, ref, seq))
- node = rb_first(&head->ref_root);
- else
- node = rb_next(&ref->rb_node);
- }
-}
-
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
u64 seq)
@@ -443,45 +270,71 @@ again:
}
/*
- * helper function to update an extent delayed ref in the
- * rbtree. existing and update must both have the same
- * bytenr and parent
+ * Helper to insert the ref_node at the tail, or merge it with the tail.
*
- * This may free existing if the update cancels out whatever
- * operation it was doing.
+ * Return 0 for insert.
+ * Return >0 for merge.
*/
-static noinline void
-update_existing_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head,
- struct btrfs_delayed_ref_node *existing,
- struct btrfs_delayed_ref_node *update)
+static int
+add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *root,
+ struct btrfs_delayed_ref_head *href,
+ struct btrfs_delayed_ref_node *ref)
{
- if (update->action != existing->action) {
- /*
- * this is effectively undoing either an add or a
- * drop. We decrement the ref_mod, and if it goes
- * down to zero we just delete the entry without
- * every changing the extent allocation tree.
- */
- existing->ref_mod--;
- if (existing->ref_mod == 0)
- drop_delayed_ref(trans, delayed_refs, head, existing);
- else
- WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
- existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
+ struct btrfs_delayed_ref_node *exist;
+ int mod;
+ int ret = 0;
+
+ spin_lock(&href->lock);
+ /* Check whether we can merge the tail node with ref */
+ if (list_empty(&href->ref_list))
+ goto add_tail;
+ exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node,
+ list);
+	/* No need to compare bytenr or is_head */
+ if (exist->type != ref->type || exist->no_quota != ref->no_quota ||
+ exist->seq != ref->seq)
+ goto add_tail;
+
+ if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ exist->type == BTRFS_SHARED_BLOCK_REF_KEY) &&
+ comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist),
+ btrfs_delayed_node_to_tree_ref(ref),
+ ref->type))
+ goto add_tail;
+ if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY ||
+ exist->type == BTRFS_SHARED_DATA_REF_KEY) &&
+ comp_data_refs(btrfs_delayed_node_to_data_ref(exist),
+ btrfs_delayed_node_to_data_ref(ref)))
+ goto add_tail;
+
+ /* Now we are sure we can merge */
+ ret = 1;
+ if (exist->action == ref->action) {
+ mod = ref->ref_mod;
} else {
- WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
- existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
- /*
- * the action on the existing ref matches
- * the action on the ref we're trying to add.
- * Bump the ref_mod by one so the backref that
- * is eventually added/removed has the correct
- * reference count
- */
- existing->ref_mod += update->ref_mod;
+ /* Need to change action */
+ if (exist->ref_mod < ref->ref_mod) {
+ exist->action = ref->action;
+ mod = -exist->ref_mod;
+ exist->ref_mod = ref->ref_mod;
+ } else
+ mod = -ref->ref_mod;
}
+ exist->ref_mod += mod;
+
+ /* remove existing tail if its ref_mod is zero */
+ if (exist->ref_mod == 0)
+ drop_delayed_ref(trans, root, href, exist);
+ spin_unlock(&href->lock);
+ return ret;
+
+add_tail:
+ list_add_tail(&ref->list, &href->ref_list);
+ atomic_inc(&root->num_entries);
+ trans->delayed_ref_updates++;
+ spin_unlock(&href->lock);
+ return ret;
}
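
The tail-merge arithmetic above condenses to one rule: equal actions
accumulate, opposite actions cancel, and the larger count decides the
surviving action. A minimal standalone sketch of that rule (illustrative
only; toy_ref and toy_merge are hypothetical, not kernel structures):

/* Toy model of add_delayed_ref_tail_merge()'s ref_mod handling. */
struct toy_ref {
	int action;	/* BTRFS_ADD_DELAYED_REF or BTRFS_DROP_DELAYED_REF */
	int ref_mod;	/* always > 0 */
};

/* Merge 'ref' into 'exist'; a result of 0 means they cancel entirely. */
static int toy_merge(struct toy_ref *exist, const struct toy_ref *ref)
{
	int mod;

	if (exist->action == ref->action) {
		mod = ref->ref_mod;		/* same action: accumulate */
	} else if (exist->ref_mod < ref->ref_mod) {
		exist->action = ref->action;	/* opposite action wins */
		mod = -exist->ref_mod;
		exist->ref_mod = ref->ref_mod;
	} else {
		mod = -ref->ref_mod;		/* existing action wins */
	}
	exist->ref_mod += mod;
	return exist->ref_mod;
}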
/*
@@ -568,12 +421,14 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
static noinline struct btrfs_delayed_ref_head *
add_delayed_ref_head(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *ref, u64 bytenr,
- u64 num_bytes, int action, int is_data)
+ struct btrfs_delayed_ref_node *ref,
+ struct btrfs_qgroup_extent_record *qrecord,
+ u64 bytenr, u64 num_bytes, int action, int is_data)
{
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_head *head_ref = NULL;
struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_qgroup_extent_record *qexisting;
int count_mod = 1;
int must_insert_reserved = 0;
@@ -618,10 +473,22 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
head_ref = btrfs_delayed_node_to_head(ref);
head_ref->must_insert_reserved = must_insert_reserved;
head_ref->is_data = is_data;
- head_ref->ref_root = RB_ROOT;
+ INIT_LIST_HEAD(&head_ref->ref_list);
head_ref->processing = 0;
head_ref->total_ref_mod = count_mod;
+ /* Record qgroup extent info if provided */
+ if (qrecord) {
+ qrecord->bytenr = bytenr;
+ qrecord->num_bytes = num_bytes;
+ qrecord->old_roots = NULL;
+
+ qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs,
+ qrecord);
+ if (qexisting)
+ kfree(qrecord);
+ }
+
spin_lock_init(&head_ref->lock);
mutex_init(&head_ref->mutex);
@@ -659,10 +526,10 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
u64 num_bytes, u64 parent, u64 ref_root, int level,
int action, int no_quota)
{
- struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_tree_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
u64 seq = 0;
+ int ret;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
@@ -693,21 +560,14 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
trace_add_delayed_tree_ref(ref, full_ref, action);
- spin_lock(&head_ref->lock);
- existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
- if (existing) {
- update_existing_ref(trans, delayed_refs, head_ref, existing,
- ref);
- /*
- * we've updated the existing ref, free the newly
- * allocated ref
- */
+ ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref);
+
+ /*
+	 * XXX: memory should be freed at the same level it was allocated,
+	 * but this bad practice is everywhere in the existing code; follow
+	 * it for now and clean it up later.
+ */
+ if (ret > 0)
kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
- } else {
- atomic_inc(&delayed_refs->num_entries);
- trans->delayed_ref_updates++;
- }
- spin_unlock(&head_ref->lock);
}
/*
@@ -721,10 +581,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
u64 offset, int action, int no_quota)
{
- struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_data_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
u64 seq = 0;
+ int ret;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
@@ -758,21 +618,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
trace_add_delayed_data_ref(ref, full_ref, action);
- spin_lock(&head_ref->lock);
- existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
- if (existing) {
- update_existing_ref(trans, delayed_refs, head_ref, existing,
- ref);
- /*
- * we've updated the existing ref, free the newly
- * allocated ref
- */
+ ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref);
+
+ if (ret > 0)
kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
- } else {
- atomic_inc(&delayed_refs->num_entries);
- trans->delayed_ref_updates++;
- }
- spin_unlock(&head_ref->lock);
}
/*
@@ -790,6 +639,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_tree_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_qgroup_extent_record *record = NULL;
if (!is_fstree(ref_root) || !fs_info->quota_enabled)
no_quota = 0;
@@ -800,9 +650,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
return -ENOMEM;
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
- if (!head_ref) {
- kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
- return -ENOMEM;
+ if (!head_ref)
+ goto free_ref;
+
+ if (fs_info->quota_enabled && is_fstree(ref_root)) {
+ record = kmalloc(sizeof(*record), GFP_NOFS);
+ if (!record)
+ goto free_head_ref;
}
head_ref->extent_op = extent_op;
@@ -814,7 +668,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
+ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
bytenr, num_bytes, action, 0);
add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
@@ -823,6 +677,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
spin_unlock(&delayed_refs->lock);
return 0;
+
+free_head_ref:
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
+free_ref:
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+
+ return -ENOMEM;
}
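
The error path above follows the usual kernel goto-unwind idiom: each
failure jumps to a label that frees only what was successfully allocated
before it, in reverse order. A minimal sketch of the pattern, assuming
nothing beyond kmalloc/kfree (the function and its names are hypothetical):

static int alloc_pair(void **a, void **b)
{
	*a = kmalloc(16, GFP_NOFS);
	if (!*a)
		return -ENOMEM;
	*b = kmalloc(16, GFP_NOFS);
	if (!*b)
		goto free_a;
	return 0;

free_a:
	kfree(*a);
	*a = NULL;
	return -ENOMEM;
}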
/*
@@ -839,6 +700,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_data_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_qgroup_extent_record *record = NULL;
if (!is_fstree(ref_root) || !fs_info->quota_enabled)
no_quota = 0;
@@ -854,6 +716,16 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
return -ENOMEM;
}
+ if (fs_info->quota_enabled && is_fstree(ref_root)) {
+ record = kmalloc(sizeof(*record), GFP_NOFS);
+ if (!record) {
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+ kmem_cache_free(btrfs_delayed_ref_head_cachep,
+ head_ref);
+ return -ENOMEM;
+ }
+ }
+
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
@@ -863,7 +735,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
+ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
bytenr, num_bytes, action, 1);
add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
@@ -891,9 +763,9 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
- num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
- extent_op->is_data);
+ add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr,
+ num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
+ extent_op->is_data);
spin_unlock(&delayed_refs->lock);
return 0;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 5eb0892396d0..13fb5e6090fe 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -24,9 +24,25 @@
#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
+/*
+ * XXX: Qu: I really hate the design that ref_head and tree/data ref share
+ * the same ref_node structure.
+ * Ref_head is at a higher logical level than tree/data refs, and the
+ * duplicated bytenr/num_bytes in ref_node are really a waste of memory;
+ * they should be referenced from ref_head.
+ * This gets more disgusting after we use a list to store tree/data refs
+ * in ref_head. Must clean this mess up later.
+ */
struct btrfs_delayed_ref_node {
+ /*
+	 * ref_head uses an rb tree, stored in ref_root->href_root and
+	 * indexed by bytenr.
+ */
struct rb_node rb_node;
+	/* data/tree refs use a list, stored in ref_head->ref_list. */
+ struct list_head list;
+
/* the starting bytenr of the extent */
u64 bytenr;
@@ -83,7 +99,7 @@ struct btrfs_delayed_ref_head {
struct mutex mutex;
spinlock_t lock;
- struct rb_root ref_root;
+ struct list_head ref_list;
struct rb_node href_node;
@@ -132,6 +148,9 @@ struct btrfs_delayed_ref_root {
/* head ref rbtree */
struct rb_root href_root;
+ /* dirty extent records */
+ struct rb_root dirty_extent_root;
+
/* this spin lock protects the rbtree and the entries inside */
spinlock_t lock;
@@ -156,6 +175,14 @@ struct btrfs_delayed_ref_root {
int flushing;
u64 run_delayed_start;
+
+ /*
+	 * Used to make qgroup accounting skip a given root.
+	 * This is for snapshots, as btrfs_qgroup_inherit() manually
+	 * modifies the counters for a snapshot and its source, so we
+	 * should skip the snapshot in new_root/old_roots or it will be
+	 * counted twice.
+ */
+ u64 qgroup_to_skip;
};
extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
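
With ref_root replaced by ref_list, walking the refs of a head becomes a
plain list traversal instead of an rbtree walk. A sketch of the iteration
pattern the rest of this patch adopts (count_head_refs is a hypothetical
helper, shown with the same locking rule the real code follows):

static unsigned int count_head_refs(struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_ref_node *ref;
	unsigned int nr = 0;

	assert_spin_locked(&head->lock);	/* list protected by head->lock */
	list_for_each_entry(ref, &head->ref_list, list)
		nr++;
	return nr;
}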
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 0573848c7333..564a7de17d99 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -376,6 +376,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
WARN_ON(!tgt_device);
dev_replace->tgtdev = tgt_device;
+ ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device);
+ if (ret)
+ btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
+
printk_in_rcu(KERN_INFO
"BTRFS: dev_replace from %s (devid %llu) to %s started\n",
src_device->missing ? "<missing disk>" :
@@ -583,8 +587,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&uuid_mutex);
/* replace the sysfs entry */
- btrfs_kobj_rm_device(fs_info, src_device);
- btrfs_kobj_add_device(fs_info, tgt_device);
+ btrfs_kobj_rm_device(fs_info->fs_devices, src_device);
btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
/* write back the superblocks */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0bccf18dc1dc..9ebd34f1c677 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -703,7 +703,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
return -EIO; /* we fixed nothing */
}
-static void end_workqueue_bio(struct bio *bio, int err)
+static void end_workqueue_bio(struct bio *bio)
{
struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
struct btrfs_fs_info *fs_info;
@@ -711,7 +711,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
btrfs_work_func_t func;
fs_info = end_io_wq->info;
- end_io_wq->error = err;
+ end_io_wq->error = bio->bi_error;
if (bio->bi_rw & REQ_WRITE) {
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
@@ -808,7 +808,8 @@ static void run_one_async_done(struct btrfs_work *work)
/* If an error occurred we just want to clean up the bio and move on */
if (async->error) {
- bio_endio(async->bio, async->error);
+ async->bio->bi_error = async->error;
+ bio_endio(async->bio);
return;
}
@@ -908,8 +909,10 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
* submission context. Just jump into btrfs_map_bio
*/
ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
- if (ret)
- bio_endio(bio, ret);
+ if (ret) {
+ bio->bi_error = ret;
+ bio_endio(bio);
+ }
return ret;
}
@@ -960,10 +963,13 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
__btree_submit_bio_done);
}
- if (ret) {
+ if (ret)
+ goto out_w_error;
+ return 0;
+
out_w_error:
- bio_endio(bio, ret);
- }
+ bio->bi_error = ret;
+ bio_endio(bio);
return ret;
}
@@ -1149,12 +1155,12 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
buf = btrfs_find_create_tree_block(root, bytenr);
if (!buf)
- return NULL;
+ return ERR_PTR(-ENOMEM);
ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
if (ret) {
free_extent_buffer(buf);
- return NULL;
+ return ERR_PTR(ret);
}
return buf;
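
Since read_tree_block() now returns ERR_PTR() instead of NULL, every caller
has to distinguish a failed read from a buffer that was read but is not up
to date. A sketch of the resulting caller pattern, which the hunks below
open-code (read_block_checked is a hypothetical wrapper):

static struct extent_buffer *read_block_checked(struct btrfs_root *root,
						u64 bytenr, u64 gen)
{
	struct extent_buffer *eb;

	eb = read_tree_block(root, bytenr, gen);
	if (IS_ERR(eb))			/* -ENOMEM or read failure */
		return eb;
	if (!extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);	/* buffer exists but is stale/bad */
		return ERR_PTR(-EIO);
	}
	return eb;
}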
@@ -1509,20 +1515,19 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
generation = btrfs_root_generation(&root->root_item);
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
generation);
- if (!root->node) {
- ret = -ENOMEM;
+ if (IS_ERR(root->node)) {
+ ret = PTR_ERR(root->node);
goto find_fail;
} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
ret = -EIO;
- goto read_fail;
+ free_extent_buffer(root->node);
+ goto find_fail;
}
root->commit_root = btrfs_root_node(root);
out:
btrfs_free_path(path);
return root;
-read_fail:
- free_extent_buffer(root->node);
find_fail:
kfree(root);
alloc_fail:
@@ -1725,6 +1730,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
bdi->congested_fn = btrfs_congested_fn;
bdi->congested_data = info;
+ bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
return 0;
}
@@ -1736,22 +1742,22 @@ static void end_workqueue_fn(struct btrfs_work *work)
{
struct bio *bio;
struct btrfs_end_io_wq *end_io_wq;
- int error;
end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
bio = end_io_wq->bio;
- error = end_io_wq->error;
+ bio->bi_error = end_io_wq->error;
bio->bi_private = end_io_wq->private;
bio->bi_end_io = end_io_wq->end_io;
kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
- bio_endio(bio, error);
+ bio_endio(bio);
}
static int cleaner_kthread(void *arg)
{
struct btrfs_root *root = arg;
int again;
+ struct btrfs_trans_handle *trans;
do {
again = 0;
@@ -1773,7 +1779,6 @@ static int cleaner_kthread(void *arg)
}
btrfs_run_delayed_iputs(root);
- btrfs_delete_unused_bgs(root->fs_info);
again = btrfs_clean_one_deleted_snapshot(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
@@ -1782,6 +1787,16 @@ static int cleaner_kthread(void *arg)
* needn't do anything special here.
*/
btrfs_run_defrag_inodes(root->fs_info);
+
+ /*
+		 * Acquire fs_info->delete_unused_bgs_mutex to avoid racing
+		 * with relocation (btrfs_relocate_chunk): relocation acquires
+		 * fs_info->cleaner_mutex (btrfs_relocate_block_group) after
+		 * acquiring fs_info->delete_unused_bgs_mutex, so we neither
+		 * can nor need to hold fs_info->cleaner_mutex when deleting
+		 * unused block groups.
+ */
+ btrfs_delete_unused_bgs(root->fs_info);
sleep:
if (!try_to_freeze() && !again) {
set_current_state(TASK_INTERRUPTIBLE);
@@ -1790,6 +1805,34 @@ sleep:
__set_current_state(TASK_RUNNING);
}
} while (!kthread_should_stop());
+
+ /*
+ * Transaction kthread is stopped before us and wakes us up.
+ * However we might have started a new transaction and COWed some
+ * tree blocks when deleting unused block groups for example. So
+ * make sure we commit the transaction we started to have a clean
+ * shutdown when evicting the btree inode - if it has dirty pages
+ * when we do the final iput() on it, eviction will trigger a
+ * writeback for it which will fail with null pointer dereferences
+ * since work queues and other resources were already released and
+ * destroyed by the time the iput/eviction/writeback is made.
+ */
+ trans = btrfs_attach_transaction(root);
+ if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) != -ENOENT)
+ btrfs_err(root->fs_info,
+ "cleaner transaction attach returned %ld",
+ PTR_ERR(trans));
+ } else {
+ int ret;
+
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ btrfs_err(root->fs_info,
+ "cleaner open transaction commit returned %d",
+ ret);
+ }
+
return 0;
}
@@ -2320,8 +2363,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
log_tree_root->node = read_tree_block(tree_root, bytenr,
fs_info->generation + 1);
- if (!log_tree_root->node ||
- !extent_buffer_uptodate(log_tree_root->node)) {
+ if (IS_ERR(log_tree_root->node)) {
+ printk(KERN_ERR "BTRFS: failed to read log tree\n");
+ ret = PTR_ERR(log_tree_root->node);
+ kfree(log_tree_root);
+ return ret;
+ } else if (!extent_buffer_uptodate(log_tree_root->node)) {
printk(KERN_ERR "BTRFS: failed to read log tree\n");
free_extent_buffer(log_tree_root->node);
kfree(log_tree_root);
@@ -2489,12 +2536,12 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->unused_bgs_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
+ mutex_init(&fs_info->delete_unused_bgs_mutex);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
seqlock_init(&fs_info->profiles_lock);
init_rwsem(&fs_info->delayed_iput_sem);
- init_completion(&fs_info->kobj_unregister);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
@@ -2567,7 +2614,6 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->ordered_operations_mutex);
- mutex_init(&fs_info->ordered_extent_flush_mutex);
mutex_init(&fs_info->tree_log_mutex);
mutex_init(&fs_info->chunk_mutex);
mutex_init(&fs_info->transaction_kthread_mutex);
@@ -2797,10 +2843,11 @@ int open_ctree(struct super_block *sb,
chunk_root->node = read_tree_block(chunk_root,
btrfs_super_chunk_root(disk_super),
generation);
- if (!chunk_root->node ||
- !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+ if (IS_ERR(chunk_root->node) ||
+ !extent_buffer_uptodate(chunk_root->node)) {
printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
sb->s_id);
+ chunk_root->node = NULL;
goto fail_tree_roots;
}
btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
@@ -2834,11 +2881,11 @@ retry_root_backup:
tree_root->node = read_tree_block(tree_root,
btrfs_super_root(disk_super),
generation);
- if (!tree_root->node ||
- !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+ if (IS_ERR(tree_root->node) ||
+ !extent_buffer_uptodate(tree_root->node)) {
printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
sb->s_id);
-
+ tree_root->node = NULL;
goto recovery_tree_root;
}
@@ -2874,10 +2921,22 @@ retry_root_backup:
btrfs_close_extra_devices(fs_devices, 1);
+ ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
+ if (ret) {
+ pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret);
+ goto fail_block_groups;
+ }
+
+ ret = btrfs_sysfs_add_device(fs_devices);
+ if (ret) {
+ pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret);
+ goto fail_fsdev_sysfs;
+ }
+
ret = btrfs_sysfs_add_one(fs_info);
if (ret) {
pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
- goto fail_block_groups;
+ goto fail_fsdev_sysfs;
}
ret = btrfs_init_space_info(fs_info);
@@ -2896,8 +2955,9 @@ retry_root_backup:
if (fs_info->fs_devices->missing_devices >
fs_info->num_tolerated_disk_barrier_failures &&
!(sb->s_flags & MS_RDONLY)) {
- printk(KERN_WARNING "BTRFS: "
- "too many missing devices, writeable mount is not allowed\n");
+ pr_warn("BTRFS: missing devices(%llu) exceeds the limit(%d), writeable mount is not allowed\n",
+ fs_info->fs_devices->missing_devices,
+ fs_info->num_tolerated_disk_barrier_failures);
goto fail_sysfs;
}
@@ -3055,6 +3115,9 @@ fail_cleaner:
fail_sysfs:
btrfs_sysfs_remove_one(fs_info);
+fail_fsdev_sysfs:
+ btrfs_sysfs_remove_fsid(fs_info->fs_devices);
+
fail_block_groups:
btrfs_put_block_group_cache(fs_info);
btrfs_free_block_groups(fs_info);
@@ -3267,10 +3330,8 @@ static int write_dev_supers(struct btrfs_device *device,
* endio for the write_dev_flush, this will wake anyone waiting
* for the barrier when it is done
*/
-static void btrfs_end_empty_barrier(struct bio *bio, int err)
+static void btrfs_end_empty_barrier(struct bio *bio)
{
- if (err)
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
if (bio->bi_private)
complete(bio->bi_private);
bio_put(bio);
@@ -3298,8 +3359,8 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
wait_for_completion(&device->flush_wait);
- if (!bio_flagged(bio, BIO_UPTODATE)) {
- ret = -EIO;
+ if (bio->bi_error) {
+ ret = bio->bi_error;
btrfs_dev_stat_inc_and_print(device,
BTRFS_DEV_STAT_FLUSH_ERRS);
}
@@ -3703,6 +3764,15 @@ void close_ctree(struct btrfs_root *root)
cancel_work_sync(&fs_info->async_reclaim_work);
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
+ /*
+ * If the cleaner thread is stopped and there are
+ * block groups queued for removal, the deletion will be
+ * skipped when we quit the cleaner thread.
+ */
+ mutex_lock(&root->fs_info->cleaner_mutex);
+ btrfs_delete_unused_bgs(root->fs_info);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+
ret = btrfs_commit_super(root);
if (ret)
btrfs_err(fs_info, "commit super ret %d", ret);
@@ -3725,6 +3795,7 @@ void close_ctree(struct btrfs_root *root)
}
btrfs_sysfs_remove_one(fs_info);
+ btrfs_sysfs_remove_fsid(fs_info->fs_devices);
btrfs_free_fs_roots(fs_info);
@@ -4053,6 +4124,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_node *tmp;
bool pin_bytes = false;
head = rb_entry(node, struct btrfs_delayed_ref_head,
@@ -4068,11 +4140,10 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
continue;
}
spin_lock(&head->lock);
- while ((node = rb_first(&head->ref_root)) != NULL) {
- ref = rb_entry(node, struct btrfs_delayed_ref_node,
- rb_node);
+ list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list,
+ list) {
ref->in_tree = 0;
- rb_erase(&ref->rb_node, &head->ref_root);
+ list_del(&ref->list);
atomic_dec(&delayed_refs->num_entries);
btrfs_put_delayed_ref(ref);
}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0ec3acd14cbf..5411f0ab5683 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -79,11 +79,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent,
+ struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
u64 owner_offset, int refs_to_drop,
- struct btrfs_delayed_extent_op *extra_op,
- int no_quota);
+ struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
struct extent_buffer *leaf,
struct btrfs_extent_item *ei);
@@ -1317,8 +1316,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
return ret;
}
-static noinline u32 extent_data_ref_count(struct btrfs_root *root,
- struct btrfs_path *path,
+static noinline u32 extent_data_ref_count(struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref)
{
struct btrfs_key key;
@@ -1884,10 +1882,77 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
return ret;
}
-static int btrfs_issue_discard(struct block_device *bdev,
- u64 start, u64 len)
+#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
+static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
+ u64 *discarded_bytes)
{
- return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
+ int j, ret = 0;
+ u64 bytes_left, end;
+ u64 aligned_start = ALIGN(start, 1 << 9);
+
+ if (WARN_ON(start != aligned_start)) {
+ len -= aligned_start - start;
+ len = round_down(len, 1 << 9);
+ start = aligned_start;
+ }
+
+ *discarded_bytes = 0;
+
+ if (!len)
+ return 0;
+
+ end = start + len;
+ bytes_left = len;
+
+ /* Skip any superblocks on this device. */
+ for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
+ u64 sb_start = btrfs_sb_offset(j);
+ u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
+ u64 size = sb_start - start;
+
+ if (!in_range(sb_start, start, bytes_left) &&
+ !in_range(sb_end, start, bytes_left) &&
+ !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
+ continue;
+
+ /*
+ * Superblock spans beginning of range. Adjust start and
+ * try again.
+ */
+ if (sb_start <= start) {
+ start += sb_end - start;
+ if (start > end) {
+ bytes_left = 0;
+ break;
+ }
+ bytes_left = end - start;
+ continue;
+ }
+
+ if (size) {
+ ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
+ GFP_NOFS, 0);
+ if (!ret)
+ *discarded_bytes += size;
+ else if (ret != -EOPNOTSUPP)
+ return ret;
+ }
+
+ start = sb_end;
+ if (start > end) {
+ bytes_left = 0;
+ break;
+ }
+ bytes_left = end - start;
+ }
+
+ if (bytes_left) {
+ ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
+ GFP_NOFS, 0);
+ if (!ret)
+ *discarded_bytes += bytes_left;
+ }
+ return ret;
}
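
A worked example of the superblock skipping above, assuming the standard
mirror offsets (primary superblock at 64KiB, first mirror at 64MiB) and a
BTRFS_SUPER_INFO_SIZE of 4KiB:

/*
 * Discarding [64MiB - 1MiB, 64MiB + 1MiB) on a device splits into:
 *
 *   [64MiB - 1MiB, 64MiB)		discarded (the "size" branch)
 *   [64MiB, 64MiB + 4KiB)		skipped (superblock mirror)
 *   [64MiB + 4KiB, 64MiB + 1MiB)	discarded (the bytes_left tail)
 *
 * so *discarded_bytes reports 2MiB - 4KiB, not the full 2MiB.
 */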
int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -1908,14 +1973,16 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+ u64 bytes;
if (!stripe->dev->can_discard)
continue;
ret = btrfs_issue_discard(stripe->dev->bdev,
stripe->physical,
- stripe->length);
+ stripe->length,
+ &bytes);
if (!ret)
- discarded_bytes += stripe->length;
+ discarded_bytes += bytes;
else if (ret != -EOPNOTSUPP)
break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
@@ -1967,10 +2034,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes,
+ struct btrfs_delayed_ref_node *node,
u64 parent, u64 root_objectid,
u64 owner, u64 offset, int refs_to_add,
- int no_quota,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1978,9 +2044,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_extent_item *item;
struct btrfs_key key;
+ u64 bytenr = node->bytenr;
+ u64 num_bytes = node->num_bytes;
u64 refs;
int ret;
- enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;
+ int no_quota = node->no_quota;
path = btrfs_alloc_path();
if (!path)
@@ -1996,26 +2064,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
bytenr, num_bytes, parent,
root_objectid, owner, offset,
refs_to_add, extent_op);
- if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))
+ if ((ret < 0 && ret != -EAGAIN) || !ret)
goto out;
- /*
- * Ok we were able to insert an inline extent and it appears to be a new
- * reference, deal with the qgroup accounting.
- */
- if (!ret && !no_quota) {
- ASSERT(root->fs_info->quota_enabled);
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- item = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_extent_item);
- if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add)
- type = BTRFS_QGROUP_OPER_ADD_SHARED;
- btrfs_release_path(path);
-
- ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
- bytenr, num_bytes, type, 0);
- goto out;
- }
/*
* Ok we had -EAGAIN which means we didn't have space to insert and
@@ -2026,8 +2076,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, item);
- if (refs)
- type = BTRFS_QGROUP_OPER_ADD_SHARED;
btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, item);
@@ -2035,13 +2083,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- if (!no_quota) {
- ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
- bytenr, num_bytes, type, 0);
- if (ret)
- goto out;
- }
-
path->reada = 1;
path->leave_spinning = 1;
/* now insert the actual backref */
@@ -2087,17 +2128,15 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
ref->objectid, ref->offset,
&ins, node->ref_mod);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
- ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
- node->num_bytes, parent,
+ ret = __btrfs_inc_extent_ref(trans, root, node, parent,
ref_root, ref->objectid,
ref->offset, node->ref_mod,
- node->no_quota, extent_op);
+ extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, root, node->bytenr,
- node->num_bytes, parent,
+ ret = __btrfs_free_extent(trans, root, node, parent,
ref_root, ref->objectid,
ref->offset, node->ref_mod,
- extent_op, node->no_quota);
+ extent_op);
} else {
BUG();
}
@@ -2255,15 +2294,14 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
ref->level, &ins,
node->no_quota);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
- ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
- node->num_bytes, parent, ref_root,
- ref->level, 0, 1, node->no_quota,
+ ret = __btrfs_inc_extent_ref(trans, root, node,
+ parent, ref_root,
+ ref->level, 0, 1,
extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, root, node->bytenr,
- node->num_bytes, parent, ref_root,
- ref->level, 0, 1, extent_op,
- node->no_quota);
+ ret = __btrfs_free_extent(trans, root, node,
+ parent, ref_root,
+ ref->level, 0, 1, extent_op);
} else {
BUG();
}
@@ -2323,28 +2361,27 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
return ret;
}
-static noinline struct btrfs_delayed_ref_node *
+static inline struct btrfs_delayed_ref_node *
select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
- struct rb_node *node;
- struct btrfs_delayed_ref_node *ref, *last = NULL;;
+ struct btrfs_delayed_ref_node *ref;
+
+ if (list_empty(&head->ref_list))
+ return NULL;
/*
- * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
- * this prevents ref count from going down to zero when
- * there still are pending delayed ref.
+ * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
+ * This is to prevent a ref count from going down to zero, which deletes
+ * the extent item from the extent tree, when there still are references
+ * to add, which would fail because they would not find the extent item.
*/
- node = rb_first(&head->ref_root);
- while (node) {
- ref = rb_entry(node, struct btrfs_delayed_ref_node,
- rb_node);
+ list_for_each_entry(ref, &head->ref_list, list) {
if (ref->action == BTRFS_ADD_DELAYED_REF)
return ref;
- else if (last == NULL)
- last = ref;
- node = rb_next(node);
}
- return last;
+
+ return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
+ list);
}
/*
@@ -2396,16 +2433,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
}
}
- /*
- * We need to try and merge add/drops of the same ref since we
- * can run into issues with relocate dropping the implicit ref
- * and then it being added back again before the drop can
- * finish. If we merged anything we need to re-loop so we can
- * get a good ref.
- */
spin_lock(&locked_ref->lock);
- btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
- locked_ref);
/*
* locked_ref is the head node, so we have to go one
@@ -2482,7 +2510,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
spin_unlock(&locked_ref->lock);
spin_lock(&delayed_refs->lock);
spin_lock(&locked_ref->lock);
- if (rb_first(&locked_ref->ref_root) ||
+ if (!list_empty(&locked_ref->ref_list) ||
locked_ref->extent_op) {
spin_unlock(&locked_ref->lock);
spin_unlock(&delayed_refs->lock);
@@ -2496,7 +2524,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
} else {
actual_count++;
ref->in_tree = 0;
- rb_erase(&ref->rb_node, &locked_ref->ref_root);
+ list_del(&ref->list);
}
atomic_dec(&delayed_refs->num_entries);
@@ -2864,9 +2892,6 @@ again:
goto again;
}
out:
- ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info);
- if (ret)
- return ret;
assert_qgroups_uptodate(trans);
return 0;
}
@@ -2905,7 +2930,6 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_data_ref *data_ref;
struct btrfs_delayed_ref_root *delayed_refs;
- struct rb_node *node;
int ret = 0;
delayed_refs = &trans->transaction->delayed_refs;
@@ -2934,11 +2958,7 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
spin_unlock(&delayed_refs->lock);
spin_lock(&head->lock);
- node = rb_first(&head->ref_root);
- while (node) {
- ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- node = rb_next(node);
-
+ list_for_each_entry(ref, &head->ref_list, list) {
/* If it's a shared ref we know a cross reference exists */
if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
ret = 1;
@@ -3693,7 +3713,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->disk_total += total_bytes * factor;
found->bytes_used += bytes_used;
found->disk_used += bytes_used * factor;
- found->full = 0;
+ if (total_bytes > 0)
+ found->full = 0;
spin_unlock(&found->lock);
*space_info = found;
return 0;
@@ -3721,7 +3742,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->bytes_reserved = 0;
found->bytes_readonly = 0;
found->bytes_may_use = 0;
- found->full = 0;
+ if (total_bytes > 0)
+ found->full = 0;
+ else
+ found->full = 1;
found->force_alloc = CHUNK_ALLOC_NO_FORCE;
found->chunk_alloc = 0;
found->flush = 0;
@@ -3975,6 +3999,9 @@ commit_trans:
!atomic_read(&root->fs_info->open_ioctl_trans)) {
need_commit--;
+ if (need_commit > 0)
+ btrfs_wait_ordered_roots(fs_info, -1);
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -4088,7 +4115,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
return 1;
}
-static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
+static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
{
u64 num_dev;
@@ -4102,24 +4129,43 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
else
num_dev = 1; /* DUP or single */
- /* metadata for updaing devices and chunk tree */
- return btrfs_calc_trans_metadata_size(root, num_dev + 1);
+ return num_dev;
}
-static void check_system_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 type)
+/*
+ * If @is_allocation is true, reserve space in the system space info necessary
+ * for allocating a chunk, otherwise if it's false, reserve space necessary for
+ * removing a chunk.
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 type)
{
struct btrfs_space_info *info;
u64 left;
u64 thresh;
+ int ret = 0;
+ u64 num_devs;
+
+ /*
+ * Needed because we can end up allocating a system chunk and for an
+ * atomic and race free space reservation in the chunk block reserve.
+ */
+ ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
spin_lock(&info->lock);
left = info->total_bytes - info->bytes_used - info->bytes_pinned -
- info->bytes_reserved - info->bytes_readonly;
+ info->bytes_reserved - info->bytes_readonly -
+ info->bytes_may_use;
spin_unlock(&info->lock);
- thresh = get_system_chunk_thresh(root, type);
+ num_devs = get_profile_num_devs(root, type);
+
+ /* num_devs device items to update and 1 chunk item to add or remove */
+ thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
+ btrfs_calc_trans_metadata_size(root, 1);
+
if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
left, thresh, type);
@@ -4130,7 +4176,21 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
u64 flags;
flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
- btrfs_alloc_chunk(trans, root, flags);
+ /*
+ * Ignore failure to create system chunk. We might end up not
+ * needing it, as we might not need to COW all nodes/leafs from
+ * the paths we visit in the chunk tree (they were already COWed
+ * or created in the current transaction for example).
+ */
+ ret = btrfs_alloc_chunk(trans, root, flags);
+ }
+
+ if (!ret) {
+ ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
+ &root->fs_info->chunk_block_rsv,
+ thresh, BTRFS_RESERVE_NO_FLUSH);
+ if (!ret)
+ trans->chunk_bytes_reserved += thresh;
}
}
@@ -4235,6 +4295,24 @@ out:
space_info->chunk_alloc = 0;
spin_unlock(&space_info->lock);
mutex_unlock(&fs_info->chunk_mutex);
+ /*
+ * When we allocate a new chunk we reserve space in the chunk block
+ * reserve to make sure we can COW nodes/leafs in the chunk tree or
+ * add new nodes/leafs to it if we end up needing to do it when
+ * inserting the chunk item and updating device items as part of the
+ * second phase of chunk allocation, performed by
+ * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
+ * large number of new block groups to create in our transaction
+ * handle's new_bgs list to avoid exhausting the chunk block reserve
+ * in extreme cases - like having a single transaction create many new
+ * block groups when starting to write out the free space caches of all
+ * the block groups that were made dirty during the lifetime of the
+ * transaction.
+ */
+ if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
+ btrfs_create_pending_block_groups(trans, trans->root);
+ btrfs_trans_release_chunk_metadata(trans);
+ }
return ret;
}
@@ -5188,6 +5266,24 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
trans->bytes_reserved = 0;
}
+/*
+ * To be called after all the new block groups attached to the transaction
+ * handle have been created (btrfs_create_pending_block_groups()).
+ */
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->root->fs_info;
+
+ if (!trans->chunk_bytes_reserved)
+ return;
+
+ WARN_ON_ONCE(!list_empty(&trans->new_bgs));
+
+ block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
+ trans->chunk_bytes_reserved);
+ trans->chunk_bytes_reserved = 0;
+}
+
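
A sketch of the intended call order (flush_pending_bgs is a hypothetical
wrapper; the real call sites, such as the chunk allocation path above,
open-code the same two steps):

static void flush_pending_bgs(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root)
{
	/* Create the block groups first so new_bgs is drained... */
	btrfs_create_pending_block_groups(trans, root);
	/* ...then it is safe to drop the chunk block reserve. */
	btrfs_trans_release_chunk_metadata(trans);
}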
/* Can only return 0 or -ENOSPC */
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode)
@@ -6034,20 +6130,19 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_group_cache *block_group, *tmp;
+ struct list_head *deleted_bgs;
struct extent_io_tree *unpin;
u64 start;
u64 end;
int ret;
- if (trans->aborted)
- return 0;
-
if (fs_info->pinned_extents == &fs_info->freed_extents[0])
unpin = &fs_info->freed_extents[1];
else
unpin = &fs_info->freed_extents[0];
- while (1) {
+ while (!trans->aborted) {
mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY, NULL);
@@ -6066,6 +6161,34 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
cond_resched();
}
+ /*
+ * Transaction is finished. We don't need the lock anymore. We
+ * do need to clean up the block groups in case of a transaction
+ * abort.
+ */
+ deleted_bgs = &trans->transaction->deleted_bgs;
+ list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
+ u64 trimmed = 0;
+
+ ret = -EROFS;
+ if (!trans->aborted)
+ ret = btrfs_discard_extent(root,
+ block_group->key.objectid,
+ block_group->key.offset,
+ &trimmed);
+
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group_trimming(block_group);
+ btrfs_put_block_group(block_group);
+
+ if (ret) {
+ const char *errstr = btrfs_decode_error(ret);
+ btrfs_warn(fs_info,
+ "Discard failed while removing blockgroup: errno=%d %s\n",
+ ret, errstr);
+ }
+ }
+
return 0;
}
@@ -6092,11 +6215,10 @@ static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent,
+ struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
u64 owner_offset, int refs_to_drop,
- struct btrfs_delayed_extent_op *extent_op,
- int no_quota)
+ struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_key key;
struct btrfs_path *path;
@@ -6110,10 +6232,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
int extent_slot = 0;
int found_extent = 0;
int num_to_del = 1;
+ int no_quota = node->no_quota;
u32 item_size;
u64 refs;
+ u64 bytenr = node->bytenr;
+ u64 num_bytes = node->num_bytes;
int last_ref = 0;
- enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
@@ -6294,7 +6418,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
refs -= refs_to_drop;
if (refs > 0) {
- type = BTRFS_QGROUP_OPER_SUB_SHARED;
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, ei);
/*
@@ -6321,7 +6444,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
} else {
if (found_extent) {
BUG_ON(is_data && refs_to_drop !=
- extent_data_ref_count(root, path, iref));
+ extent_data_ref_count(path, iref));
if (iref) {
BUG_ON(path->slots[0] != extent_slot);
} else {
@@ -6356,18 +6479,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- /* Deal with the quota accounting */
- if (!ret && last_ref && !no_quota) {
- int mod_seq = 0;
-
- if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
- type == BTRFS_QGROUP_OPER_SUB_SHARED)
- mod_seq = 1;
-
- ret = btrfs_qgroup_record_ref(trans, info, root_objectid,
- bytenr, num_bytes, type,
- mod_seq);
- }
out:
btrfs_free_path(path);
return ret;
@@ -6393,7 +6504,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
goto out_delayed_unlock;
spin_lock(&head->lock);
- if (rb_first(&head->ref_root))
+ if (!list_empty(&head->ref_list))
goto out;
if (head->extent_op) {
@@ -7303,13 +7414,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- /* Always set parent to 0 here since its exclusive anyway. */
- ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
- ins->objectid, ins->offset,
- BTRFS_QGROUP_OPER_ADD_EXCL, 0);
- if (ret)
- return ret;
-
ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -7391,14 +7495,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- if (!no_quota) {
- ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
- ins->objectid, num_bytes,
- BTRFS_QGROUP_OPER_ADD_EXCL, 0);
- if (ret)
- return ret;
- }
-
ret = update_block_group(trans, root, ins->objectid, root->nodesize,
1);
if (ret) { /* -ENOENT, logic error */
@@ -7566,9 +7662,6 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
/*
* finds a free extent and does all the dirty work required for allocation
- * returns the key for the extent through ins, and a tree buffer for
- * the first block of the extent through buf.
- *
* returns the tree buffer or an ERR_PTR on error.
*/
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
@@ -7755,12 +7848,18 @@ reada:
wc->reada_slot = slot;
}
+/*
+ * TODO: Modify the related functions to add the affected nodes/leaves to
+ * dirty_extent_root for later qgroup accounting.
+ *
+ * Currently, this function does nothing.
+ */
static int account_leaf_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *eb)
{
int nr = btrfs_header_nritems(eb);
- int i, extent_type, ret;
+ int i, extent_type;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
u64 bytenr, num_bytes;
@@ -7783,13 +7882,6 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
continue;
num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
-
- ret = btrfs_qgroup_record_ref(trans, root->fs_info,
- root->objectid,
- bytenr, num_bytes,
- BTRFS_QGROUP_OPER_SUB_SUBTREE, 0);
- if (ret)
- return ret;
}
return 0;
}
@@ -7858,6 +7950,8 @@ static int adjust_slots_upwards(struct btrfs_root *root,
/*
* root_eb is the subtree root and is locked before this function is called.
+ * TODO: Modify this function to mark all nodes (including completely
+ * shared ones) in dirty_extent_root so they get accounted in qgroup.
*/
static int account_shared_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -7920,7 +8014,11 @@ walk_down:
child_gen = btrfs_node_ptr_generation(eb, parent_slot);
eb = read_tree_block(root, child_bytenr, child_gen);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ } else if (!extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
ret = -EIO;
goto out;
}
@@ -7931,16 +8029,6 @@ walk_down:
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
-
- ret = btrfs_qgroup_record_ref(trans, root->fs_info,
- root->objectid,
- child_bytenr,
- root->nodesize,
- BTRFS_QGROUP_OPER_SUB_SUBTREE,
- 0);
- if (ret)
- goto out;
-
}
if (level == 0) {
@@ -8151,7 +8239,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
next = read_tree_block(root, bytenr, generation);
- if (!next || !extent_buffer_uptodate(next)) {
+ if (IS_ERR(next)) {
+ return PTR_ERR(next);
+ } else if (!extent_buffer_uptodate(next)) {
free_extent_buffer(next);
return -EIO;
}
@@ -8533,24 +8623,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
goto out_end_trans;
}
- /*
- * Qgroup update accounting is run from
- * delayed ref handling. This usually works
- * out because delayed refs are normally the
- * only way qgroup updates are added. However,
- * we may have added updates during our tree
- * walk so run qgroups here to make sure we
- * don't lose any updates.
- */
- ret = btrfs_delayed_qgroup_accounting(trans,
- root->fs_info);
- if (ret)
- printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
- "running qgroup updates "
- "during snapshot delete. "
- "Quota is out of sync, "
- "rescan required.\n", ret);
-
btrfs_end_transaction_throttle(trans, tree_root);
if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
pr_debug("BTRFS: drop snapshot early exit\n");
@@ -8604,14 +8676,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
root_dropped = true;
out_end_trans:
- ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info);
- if (ret)
- printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
- "running qgroup updates "
- "during snapshot delete. "
- "Quota is out of sync, "
- "rescan required.\n", ret);
-
btrfs_end_transaction_throttle(trans, tree_root);
out_free:
kfree(wc);
@@ -8751,14 +8815,13 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
return flags;
}
-static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
+static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
u64 min_allocable_bytes;
int ret = -ENOSPC;
-
/*
* We need some metadata space and system metadata space for
* allocating chunks in some corner cases until we force to set
@@ -8775,6 +8838,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
spin_lock(&cache->lock);
if (cache->ro) {
+ cache->ro++;
ret = 0;
goto out;
}
@@ -8786,7 +8850,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
min_allocable_bytes <= sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
- cache->ro = 1;
+ cache->ro++;
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
ret = 0;
}
@@ -8796,7 +8860,7 @@ out:
return ret;
}
-int btrfs_set_block_group_ro(struct btrfs_root *root,
+int btrfs_inc_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
@@ -8804,8 +8868,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
u64 alloc_flags;
int ret;
- BUG_ON(cache->ro);
-
again:
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
@@ -8848,7 +8910,7 @@ again:
goto out;
}
- ret = set_block_group_ro(cache, 0);
+ ret = inc_block_group_ro(cache, 0);
if (!ret)
goto out;
alloc_flags = get_alloc_profile(root, cache->space_info->flags);
@@ -8856,7 +8918,7 @@ again:
CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
- ret = set_block_group_ro(cache, 0);
+ ret = inc_block_group_ro(cache, 0);
out:
if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
alloc_flags = update_block_group_flags(root, cache->flags);
@@ -8919,7 +8981,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
return free_bytes;
}
-void btrfs_set_block_group_rw(struct btrfs_root *root,
+void btrfs_dec_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
struct btrfs_space_info *sinfo = cache->space_info;
@@ -8929,11 +8991,13 @@ void btrfs_set_block_group_rw(struct btrfs_root *root,
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
- num_bytes = cache->key.offset - cache->reserved - cache->pinned -
- cache->bytes_super - btrfs_block_group_used(&cache->item);
- sinfo->bytes_readonly -= num_bytes;
- cache->ro = 0;
- list_del_init(&cache->ro_list);
+ if (!--cache->ro) {
+ num_bytes = cache->key.offset - cache->reserved -
+ cache->pinned - cache->bytes_super -
+ btrfs_block_group_used(&cache->item);
+ sinfo->bytes_readonly -= num_bytes;
+ list_del_init(&cache->ro_list);
+ }
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
}
@@ -9449,7 +9513,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
set_avail_alloc_bits(root->fs_info, cache->flags);
if (btrfs_chunk_readonly(root, cache->key.objectid)) {
- set_block_group_ro(cache, 1);
+ inc_block_group_ro(cache, 1);
} else if (btrfs_block_group_used(&cache->item) == 0) {
spin_lock(&info->unused_bgs_lock);
/* Should always be true but just in case. */
@@ -9477,11 +9541,11 @@ int btrfs_read_block_groups(struct btrfs_root *root)
list_for_each_entry(cache,
&space_info->block_groups[BTRFS_RAID_RAID0],
list)
- set_block_group_ro(cache, 1);
+ inc_block_group_ro(cache, 1);
list_for_each_entry(cache,
&space_info->block_groups[BTRFS_RAID_SINGLE],
list)
- set_block_group_ro(cache, 1);
+ inc_block_group_ro(cache, 1);
}
init_global_block_rsv(info);
@@ -9562,6 +9626,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
free_excluded_extents(root, cache);
+ /*
+ * Call to ensure the corresponding space_info object is created and
+ * assigned to our block group, but don't update its counters just yet.
+ * We want our bg to be added to the rbtree with its ->space_info set.
+ */
+ ret = update_space_info(root->fs_info, cache->flags, 0, 0,
+ &cache->space_info);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ btrfs_put_block_group(cache);
+ return ret;
+ }
+
ret = btrfs_add_block_group_cache(root->fs_info, cache);
if (ret) {
btrfs_remove_free_space_cache(cache);
@@ -9569,6 +9646,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
return ret;
}
+ /*
+ * Now that our block group has its ->space_info set and is inserted in
+ * the rbtree, update the space info's counters.
+ */
ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
&cache->space_info);
if (ret) {
@@ -9845,6 +9926,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* currently running transaction might finish and a new one start,
* allowing for new block groups to be created that can reuse the same
* physical device locations unless we take this special care.
+ *
+ * There may also be an implicit trim operation if the file system
+ * is mounted with -odiscard. The same protections must remain
+ * in place until the extents have been discarded completely when
+ * the transaction commit has completed.
*/
remove_em = (atomic_read(&block_group->trimming) == 0);
/*
@@ -9919,6 +10005,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->unused_bgs)) {
u64 start, end;
+ int trimming;
block_group = list_first_entry(&fs_info->unused_bgs,
struct btrfs_block_group_cache,
@@ -9931,6 +10018,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->unused_bgs_lock);
+ mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
spin_lock(&block_group->lock);
@@ -9950,7 +10039,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_unlock(&block_group->lock);
/* We don't want to force the issue, only flip if it's ok. */
- ret = set_block_group_ro(block_group, 0);
+ ret = inc_block_group_ro(block_group, 0);
up_write(&space_info->groups_sem);
if (ret < 0) {
ret = 0;
@@ -9964,7 +10053,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
/* 1 for btrfs_orphan_reserve_metadata() */
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
- btrfs_set_block_group_rw(root, block_group);
+ btrfs_dec_block_group_ro(root, block_group);
ret = PTR_ERR(trans);
goto next;
}
@@ -9991,14 +10080,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
EXTENT_DIRTY, GFP_NOFS);
if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- btrfs_set_block_group_rw(root, block_group);
+ btrfs_dec_block_group_ro(root, block_group);
goto end_trans;
}
ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
EXTENT_DIRTY, GFP_NOFS);
if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- btrfs_set_block_group_rw(root, block_group);
+ btrfs_dec_block_group_ro(root, block_group);
goto end_trans;
}
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
@@ -10016,15 +10105,43 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
+ /* DISCARD can flip during remount */
+ trimming = btrfs_test_opt(root, DISCARD);
+
+ /* Implicit trim during transaction commit. */
+ if (trimming)
+ btrfs_get_block_group_trimming(block_group);
+
/*
* Btrfs_remove_chunk will abort the transaction if things go
* horribly wrong.
*/
ret = btrfs_remove_chunk(trans, root,
block_group->key.objectid);
+
+ if (ret) {
+ if (trimming)
+ btrfs_put_block_group_trimming(block_group);
+ goto end_trans;
+ }
+
+ /*
+ * If we're not mounted with -odiscard, we can just forget
+ * about this block group. Otherwise we'll need to wait
+ * until transaction commit to do the actual discard.
+ */
+ if (trimming) {
+ WARN_ON(!list_empty(&block_group->bg_list));
+ spin_lock(&trans->transaction->deleted_bgs_lock);
+ list_move(&block_group->bg_list,
+ &trans->transaction->deleted_bgs);
+ spin_unlock(&trans->transaction->deleted_bgs_lock);
+ btrfs_get_block_group(block_group);
+ }
end_trans:
btrfs_end_transaction(trans, root);
next:
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
@@ -10074,10 +10191,99 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
return unpin_extent_range(root, start, end, false);
}
+/*
+ * It used to be that old block groups would be left around forever.
+ * Iterating over them would be enough to trim unused space. Since we
+ * now automatically remove them, we also need to iterate over unallocated
+ * space.
+ *
+ * We don't want a transaction for this since the discard may take a
+ * substantial amount of time. We don't require that a transaction be
+ * running, but we do need to take a running transaction into account
+ * to ensure that we're not discarding chunks that were released in
+ * the current transaction.
+ *
+ * Holding the chunks lock will prevent other threads from allocating
+ * or releasing chunks, but it won't prevent a running transaction
+ * from committing and releasing the memory that the pending chunks
+ * list head uses. For that, we need to take a reference to the
+ * transaction.
+ */
+static int btrfs_trim_free_extents(struct btrfs_device *device,
+ u64 minlen, u64 *trimmed)
+{
+ u64 start = 0, len = 0;
+ int ret;
+
+ *trimmed = 0;
+
+ /* Not writeable = nothing to do. */
+ if (!device->writeable)
+ return 0;
+
+ /* No free space = nothing to do. */
+ if (device->total_bytes <= device->bytes_used)
+ return 0;
+
+ ret = 0;
+
+ while (1) {
+ struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
+ struct btrfs_transaction *trans;
+ u64 bytes;
+
+ ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
+ if (ret)
+ return ret;
+
+ down_read(&fs_info->commit_root_sem);
+
+ spin_lock(&fs_info->trans_lock);
+ trans = fs_info->running_transaction;
+ if (trans)
+ atomic_inc(&trans->use_count);
+ spin_unlock(&fs_info->trans_lock);
+
+ ret = find_free_dev_extent_start(trans, device, minlen, start,
+ &start, &len);
+ if (trans)
+ btrfs_put_transaction(trans);
+
+ if (ret) {
+ up_read(&fs_info->commit_root_sem);
+ mutex_unlock(&fs_info->chunk_mutex);
+ if (ret == -ENOSPC)
+ ret = 0;
+ break;
+ }
+
+ ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
+ up_read(&fs_info->commit_root_sem);
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ if (ret)
+ break;
+
+ start += len;
+ *trimmed += bytes;
+
+ if (fatal_signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ cond_resched();
+ }
+
+ return ret;
+}
+
int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_device *device;
+ struct list_head *devices;
u64 group_trimmed;
u64 start;
u64 end;
@@ -10132,6 +10338,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
cache = next_block_group(fs_info->tree_root, cache);
}
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ devices = &root->fs_info->fs_devices->alloc_list;
+ list_for_each_entry(device, devices, dev_alloc_list) {
+ ret = btrfs_trim_free_extents(device, range->minlen,
+ &group_trimmed);
+ if (ret)
+ break;
+
+ trimmed += group_trimmed;
+ }
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
range->len = trimmed;
return ret;
}
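Editorial note: btrfs_trim_fs() services the generic FITRIM ioctl, so with the per-device loop added above an fstrim run now also discards unallocated device space, and range->len reports the total trimmed. A minimal userspace sketch of driving it (standard VFS interface, not part of this patch; the mount point path is hypothetical):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(void)
{
	struct fstrim_range range = {
		.start = 0,
		.len = (__u64)-1,	/* whole filesystem */
		.minlen = 0,
	};
	int fd = open("/mnt/btrfs", O_RDONLY);	/* hypothetical mount point */

	if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		return 1;
	}
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	return 0;
}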
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/fs/btrfs/extent-tree.h
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c374e1e71e5f..f1018cfbfefa 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1277,7 +1277,12 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, gfp_t mask)
{
- return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
+ int wake = 0;
+
+ if (bits & EXTENT_LOCKED)
+ wake = 1;
+
+ return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
}
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
@@ -2481,7 +2486,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_writepage(struct bio *bio, int err)
+static void end_bio_extent_writepage(struct bio *bio)
{
struct bio_vec *bvec;
u64 start;
@@ -2511,7 +2516,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
start = page_offset(page);
end = start + bvec->bv_offset + bvec->bv_len - 1;
- if (end_extent_writepage(page, err, start, end))
+ if (end_extent_writepage(page, bio->bi_error, start, end))
continue;
end_page_writeback(page);
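A note on the mechanical conversion in this and the following hunks: the block layer dropped the error argument from bio_endio() and from completion callbacks, which now read the error out of the bio itself (bio->bi_error). The converted pattern looks roughly like this — a sketch of the convention, not code from this patch; handle_failure/handle_success are hypothetical placeholders:

/* Completion handlers lose the 'int err' parameter: */
static void my_endio(struct bio *bio)
{
	int err = bio->bi_error;	/* error now carried in the bio */

	if (err)
		handle_failure(bio, err);	/* hypothetical helper */
	else
		handle_success(bio);		/* hypothetical helper */
	bio_put(bio);
}

/* Submitters report an early failure by setting bi_error first: */
static void my_submit_failed(struct bio *bio, int err)
{
	bio->bi_error = err;
	bio_endio(bio);			/* was: bio_endio(bio, err) */
}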
@@ -2543,10 +2548,10 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_readpage(struct bio *bio, int err)
+static void end_bio_extent_readpage(struct bio *bio)
{
struct bio_vec *bvec;
- int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ int uptodate = !bio->bi_error;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
struct extent_io_tree *tree;
u64 offset = 0;
@@ -2559,16 +2564,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
int ret;
int i;
- if (err)
- uptodate = 0;
-
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
- "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err,
- io_bio->mirror_num);
+ "mirror=%u\n", (u64)bio->bi_iter.bi_sector,
+ bio->bi_error, io_bio->mirror_num);
tree = &BTRFS_I(inode)->io_tree;
/* We always issue full-page reads, but if some block
@@ -2609,8 +2611,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
if (tree->ops && tree->ops->readpage_io_failed_hook) {
ret = tree->ops->readpage_io_failed_hook(page, mirror);
- if (!ret && !err &&
- test_bit(BIO_UPTODATE, &bio->bi_flags))
+ if (!ret && !bio->bi_error)
uptodate = 1;
} else {
/*
@@ -2626,10 +2627,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
ret = bio_readpage_error(bio, offset, page, start, end,
mirror);
if (ret == 0) {
- uptodate =
- test_bit(BIO_UPTODATE, &bio->bi_flags);
- if (err)
- uptodate = 0;
+ uptodate = !bio->bi_error;
offset += len;
continue;
}
@@ -2679,7 +2677,7 @@ readpage_ok:
endio_readpage_release_extent(tree, extent_start, extent_len,
uptodate);
if (io_bio->end_io)
- io_bio->end_io(io_bio, err);
+ io_bio->end_io(io_bio, bio->bi_error);
bio_put(bio);
}
@@ -2725,6 +2723,12 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
btrfs_bio->csum = NULL;
btrfs_bio->csum_allocated = NULL;
btrfs_bio->end_io = NULL;
+
+#ifdef CONFIG_BLK_CGROUP
+ /* FIXME, put this into bio_clone_bioset */
+ if (bio->bi_css)
+ bio_associate_blkcg(new, bio->bi_css);
+#endif
}
return new;
}
@@ -2785,6 +2789,7 @@ static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
}
static int submit_extent_page(int rw, struct extent_io_tree *tree,
+ struct writeback_control *wbc,
struct page *page, sector_t sector,
size_t size, unsigned long offset,
struct block_device *bdev,
@@ -2797,9 +2802,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
{
int ret = 0;
struct bio *bio;
- int nr;
int contig = 0;
- int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
@@ -2821,21 +2824,24 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
}
bio = NULL;
} else {
+ if (wbc)
+ wbc_account_io(wbc, page, page_size);
return 0;
}
}
- if (this_compressed)
- nr = BIO_MAX_PAGES;
- else
- nr = bio_get_nr_vecs(bdev);
- bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
+ bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES,
+ GFP_NOFS | __GFP_HIGH);
if (!bio)
return -ENOMEM;
bio_add_page(bio, page, page_size, offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
+ if (wbc) {
+ wbc_init_bio(wbc, bio);
+ wbc_account_io(wbc, page, page_size);
+ }
if (bio_ret)
*bio_ret = bio;
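Threading the writeback_control into submit_extent_page() is for cgroup-aware writeback: each new bio is associated with the blkcg being written back, and submitted bytes are charged to the cgroup that owns the page. In isolation, the pattern the hunks above follow is roughly (a sketch assuming CONFIG_CGROUP_WRITEBACK):

static void my_prep_wb_bio(struct writeback_control *wbc, struct bio *bio,
			   struct page *page, size_t len)
{
	if (wbc) {
		wbc_init_bio(wbc, bio);		/* bind bio to wbc's blkcg */
		wbc_account_io(wbc, page, len);	/* charge bytes to page owner */
	}
}

When a page merges into an existing bio, only the wbc_account_io() charge is needed, which is why the merge path above calls it without wbc_init_bio().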
@@ -3046,7 +3052,7 @@ static int __do_readpage(struct extent_io_tree *tree,
}
pnr -= page->index;
- ret = submit_extent_page(rw, tree, page,
+ ret = submit_extent_page(rw, tree, NULL, page,
sector, disk_io_size, pg_offset,
bdev, bio, pnr,
end_bio_extent_readpage, mirror_num,
@@ -3441,7 +3447,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
page->index, cur, end);
}
- ret = submit_extent_page(write_flags, tree, page,
+ ret = submit_extent_page(write_flags, tree, wbc, page,
sector, iosize, pg_offset,
bdev, &epd->bio, max_nr,
end_bio_extent_writepage,
@@ -3691,7 +3697,7 @@ static void set_btree_ioerr(struct page *page)
}
}
-static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
+static void end_bio_extent_buffer_writepage(struct bio *bio)
{
struct bio_vec *bvec;
struct extent_buffer *eb;
@@ -3704,7 +3710,8 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
BUG_ON(!eb);
done = atomic_dec_and_test(&eb->io_pages);
- if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
+ if (bio->bi_error ||
+ test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
ClearPageUptodate(page);
set_btree_ioerr(page);
}
@@ -3744,7 +3751,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
clear_page_dirty_for_io(p);
set_page_writeback(p);
- ret = submit_extent_page(rw, tree, p, offset >> 9,
+ ret = submit_extent_page(rw, tree, wbc, p, offset >> 9,
PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
-1, end_bio_extent_buffer_writepage,
0, epd->bio_flags, bio_flags);
@@ -4490,6 +4497,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
flags |= FIEMAP_EXTENT_ENCODED;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ flags |= FIEMAP_EXTENT_UNWRITTEN;
free_extent_map(em);
em = NULL;
@@ -4607,9 +4616,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
{
struct extent_buffer *eb = NULL;
- eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS);
- if (eb == NULL)
- return NULL;
+ eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
eb->start = start;
eb->len = len;
eb->fs_info = fs_info;
@@ -4867,7 +4874,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
return NULL;
for (i = 0; i < num_pages; i++, index++) {
- p = find_or_create_page(mapping, index, GFP_NOFS);
+ p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
if (!p)
goto free_eb;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b072e17479aa..b823fac91c92 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1748,7 +1748,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
}
current->backing_dev_info = inode_to_bdi(inode);
- err = file_remove_suid(file);
+ err = file_remove_privs(file);
if (err) {
mutex_unlock(&inode->i_mutex);
goto out;
@@ -1868,6 +1868,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct btrfs_log_ctx ctx;
int ret = 0;
bool full_sync = 0;
+ const u64 len = end - start + 1;
trace_btrfs_sync_file(file, datasync);
@@ -1896,7 +1897,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* all extents are persisted and the respective file extent
* items are in the fs/subvol btree.
*/
- ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
+ ret = btrfs_wait_ordered_range(inode, start, len);
} else {
/*
* Start any new ordered operations before starting to log the
@@ -1968,8 +1969,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
smp_mb();
if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
- (full_sync && BTRFS_I(inode)->last_trans <=
- root->fs_info->last_trans_committed)) {
+ (BTRFS_I(inode)->last_trans <=
+ root->fs_info->last_trans_committed &&
+ (full_sync ||
+ !btrfs_have_ordered_extents_in_range(inode, start, len)))) {
/*
* We've had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 9dbe5b548fa6..abe3a66bd3ba 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -231,6 +231,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
{
int ret = 0;
struct btrfs_path *path = btrfs_alloc_path();
+ bool locked = false;
if (!path) {
ret = -ENOMEM;
@@ -238,6 +239,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
}
if (block_group) {
+ locked = true;
mutex_lock(&trans->transaction->cache_write_mutex);
if (!list_empty(&block_group->io_list)) {
list_del_init(&block_group->io_list);
@@ -269,18 +271,14 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
*/
ret = btrfs_truncate_inode_items(trans, root, inode,
0, BTRFS_EXTENT_DATA_KEY);
- if (ret) {
- mutex_unlock(&trans->transaction->cache_write_mutex);
- btrfs_abort_transaction(trans, root, ret);
- return ret;
- }
+ if (ret)
+ goto fail;
ret = btrfs_update_inode(trans, root, inode);
- if (block_group)
- mutex_unlock(&trans->transaction->cache_write_mutex);
-
fail:
+ if (locked)
+ mutex_unlock(&trans->transaction->cache_write_mutex);
if (ret)
btrfs_abort_transaction(trans, root, ret);
@@ -3274,35 +3272,23 @@ next:
return ret;
}
-int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
- u64 *trimmed, u64 start, u64 end, u64 minlen)
+void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache)
{
- int ret;
+ atomic_inc(&cache->trimming);
+}
- *trimmed = 0;
+void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ bool cleanup;
spin_lock(&block_group->lock);
- if (block_group->removed) {
- spin_unlock(&block_group->lock);
- return 0;
- }
- atomic_inc(&block_group->trimming);
+ cleanup = (atomic_dec_and_test(&block_group->trimming) &&
+ block_group->removed);
spin_unlock(&block_group->lock);
- ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
- if (ret)
- goto out;
-
- ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
-out:
- spin_lock(&block_group->lock);
- if (atomic_dec_and_test(&block_group->trimming) &&
- block_group->removed) {
- struct extent_map_tree *em_tree;
- struct extent_map *em;
-
- spin_unlock(&block_group->lock);
-
+ if (cleanup) {
lock_chunks(block_group->fs_info->chunk_root);
em_tree = &block_group->fs_info->mapping_tree.map_tree;
write_lock(&em_tree->lock);
@@ -3326,10 +3312,31 @@ out:
* this block group have left 1 entry each one. Free them.
*/
__btrfs_remove_free_space_cache(block_group->free_space_ctl);
- } else {
+ }
+}
+
+int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen)
+{
+ int ret;
+
+ *trimmed = 0;
+
+ spin_lock(&block_group->lock);
+ if (block_group->removed) {
spin_unlock(&block_group->lock);
+ return 0;
}
+ btrfs_get_block_group_trimming(block_group);
+ spin_unlock(&block_group->lock);
+ ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
+ if (ret)
+ goto out;
+
+ ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
+out:
+ btrfs_put_block_group_trimming(block_group);
return ret;
}
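The refactor above extracts the trimming reference into an explicit get/put pair so that btrfs_delete_unused_bgs() can hold a trim reference across block group removal; the last put of an already-removed group performs the deferred cleanup. Stripped of the btrfs specifics, the idiom is the usual atomic_dec_and_test() pattern (generic kernel-style sketch, hypothetical names):

struct obj {
	atomic_t refs;		/* trimming-style reference count */
	spinlock_t lock;
	bool removed;		/* set by the removal path */
};

static void obj_get_trimming(struct obj *o)
{
	atomic_inc(&o->refs);
}

static void obj_put_trimming(struct obj *o)
{
	bool cleanup;

	spin_lock(&o->lock);
	/* only the final put of an already-removed object cleans up */
	cleanup = atomic_dec_and_test(&o->refs) && o->removed;
	spin_unlock(&o->lock);

	if (cleanup)
		do_deferred_cleanup(o);	/* hypothetical cleanup hook */
}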
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index f6a596d5a637..d4a582ac3f73 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -246,6 +246,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
{
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset;
+ spinlock_t *rbroot_lock = &root->free_ino_pinned->tree_lock;
struct btrfs_free_space *info;
struct rb_node *n;
u64 count;
@@ -254,24 +255,30 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
return;
while (1) {
+ bool add_to_ctl = true;
+
+ spin_lock(rbroot_lock);
n = rb_first(rbroot);
- if (!n)
+ if (!n) {
+ spin_unlock(rbroot_lock);
break;
+ }
info = rb_entry(n, struct btrfs_free_space, offset_index);
BUG_ON(info->bitmap); /* Logic error */
if (info->offset > root->ino_cache_progress)
- goto free;
+ add_to_ctl = false;
else if (info->offset + info->bytes > root->ino_cache_progress)
count = root->ino_cache_progress - info->offset + 1;
else
count = info->bytes;
- __btrfs_add_free_space(ctl, info->offset, count);
-free:
rb_erase(&info->offset_index, rbroot);
- kfree(info);
+ spin_unlock(rbroot_lock);
+ if (add_to_ctl)
+ __btrfs_add_free_space(ctl, info->offset, count);
+ kmem_cache_free(btrfs_free_space_cachep, info);
}
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8bb013672aee..237da012f7d0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1845,8 +1845,10 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
int ret;
ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
- if (ret)
- bio_endio(bio, ret);
+ if (ret) {
+ bio->bi_error = ret;
+ bio_endio(bio);
+ }
return ret;
}
@@ -1906,8 +1908,10 @@ mapit:
ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
out:
- if (ret < 0)
- bio_endio(bio, ret);
+ if (ret < 0) {
+ bio->bi_error = ret;
+ bio_endio(bio);
+ }
return ret;
}
@@ -3654,6 +3658,35 @@ cache_index:
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
+ /*
+ * We don't persist the id of the transaction where an unlink operation
+ * against the inode was last made. So here we assume the inode might
+ * have been evicted, and therefore the exact value of last_unlink_trans
+ * was lost, and set it to last_trans to avoid metadata inconsistencies
+ * between the inode and its parent if the inode is fsync'ed and the log
+ * replayed. For example, in the scenario:
+ *
+ * touch mydir/foo
+ * ln mydir/foo mydir/bar
+ * sync
+ * unlink mydir/bar
+ * echo 2 > /proc/sys/vm/drop_caches # evicts inode
+ * xfs_io -c fsync mydir/foo
+ * <power failure>
+ * mount fs, triggers fsync log replay
+ *
+ * We must make sure that when we fsync our inode foo we also log its
+ * parent inode, otherwise after log replay the parent still has the
+ * dentry with the "bar" name but our inode foo has a link count of 1
+ * and doesn't have an inode ref with the name "bar" anymore.
+ *
+ * Setting last_unlink_trans to last_trans is a pessimistic approach,
+ * but it guarantees correctness at the expense of occasional full
+ * transaction commits on fsync if our inode is a directory, or if our
+ * inode is not a directory, logging its parent unnecessarily.
+ */
+ BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+
path->slots[0]++;
if (inode->i_nlink != 1 ||
path->slots[0] >= btrfs_header_nritems(leaf))
@@ -4209,7 +4242,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u64 extent_num_bytes = 0;
u64 extent_offset = 0;
u64 item_end = 0;
- u64 last_size = (u64)-1;
+ u64 last_size = new_size;
u32 found_type = (u8)-1;
int found_extent;
int del_item;
@@ -4493,8 +4526,7 @@ out:
btrfs_abort_transaction(trans, root, ret);
}
error:
- if (last_size != (u64)-1 &&
- root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
btrfs_ordered_update_i_size(inode, last_size, NULL);
btrfs_free_path(path);
@@ -4986,24 +5018,41 @@ static void evict_inode_truncate_pages(struct inode *inode)
}
write_unlock(&map_tree->lock);
+ /*
+ * Keep looping until we have no more ranges in the io tree.
+ * We can have ongoing bios started by readpages (called from readahead)
+ * that have their endio callback (extent_io.c:end_bio_extent_readpage)
+ * still in progress (they unlocked the pages in the bio but have not
+ * yet unlocked the ranges in the io tree). This means some
+ * ranges can still be locked and eviction started because before
+ * submitting those bios, which are executed by a separate task (work
+ * queue kthread), inode references (inode->i_count) were not taken
+ * (which would be dropped in the end io callback of each bio).
+ * Therefore here we effectively end up waiting for those bios and
+ * anyone else holding locked ranges without having bumped the inode's
+ * reference count - if we don't do it, when they access the inode's
+ * io_tree to unlock a range it may be too late, leading to an
+ * use-after-free issue.
+ */
spin_lock(&io_tree->lock);
while (!RB_EMPTY_ROOT(&io_tree->state)) {
struct extent_state *state;
struct extent_state *cached_state = NULL;
+ u64 start;
+ u64 end;
node = rb_first(&io_tree->state);
state = rb_entry(node, struct extent_state, rb_node);
- atomic_inc(&state->refs);
+ start = state->start;
+ end = state->end;
spin_unlock(&io_tree->lock);
- lock_extent_bits(io_tree, state->start, state->end,
- 0, &cached_state);
- clear_extent_bit(io_tree, state->start, state->end,
+ lock_extent_bits(io_tree, start, end, 0, &cached_state);
+ clear_extent_bit(io_tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY |
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 1, 1,
&cached_state, GFP_NOFS);
- free_extent_state(state);
cond_resched();
spin_lock(&io_tree->lock);
@@ -7530,6 +7579,7 @@ unlock:
current->journal_info = outstanding_extents;
btrfs_free_reserved_data_space(inode, len);
+ set_bit(BTRFS_INODE_DIO_READY, &BTRFS_I(inode)->runtime_flags);
}
/*
@@ -7671,13 +7721,13 @@ struct btrfs_retry_complete {
int uptodate;
};
-static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
+static void btrfs_retry_endio_nocsum(struct bio *bio)
{
struct btrfs_retry_complete *done = bio->bi_private;
struct bio_vec *bvec;
int i;
- if (err)
+ if (bio->bi_error)
goto end;
done->uptodate = 1;
@@ -7726,7 +7776,7 @@ try_again:
return 0;
}
-static void btrfs_retry_endio(struct bio *bio, int err)
+static void btrfs_retry_endio(struct bio *bio)
{
struct btrfs_retry_complete *done = bio->bi_private;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
@@ -7735,7 +7785,7 @@ static void btrfs_retry_endio(struct bio *bio, int err)
int ret;
int i;
- if (err)
+ if (bio->bi_error)
goto end;
uptodate = 1;
@@ -7818,12 +7868,13 @@ static int btrfs_subio_endio_read(struct inode *inode,
}
}
-static void btrfs_endio_direct_read(struct bio *bio, int err)
+static void btrfs_endio_direct_read(struct bio *bio)
{
struct btrfs_dio_private *dip = bio->bi_private;
struct inode *inode = dip->inode;
struct bio *dio_bio;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ int err = bio->bi_error;
if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
err = btrfs_subio_endio_read(inode, io_bio, err);
@@ -7834,17 +7885,14 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
kfree(dip);
- /* If we had a csum failure make sure to clear the uptodate flag */
- if (err)
- clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
- dio_end_io(dio_bio, err);
+ dio_end_io(dio_bio, bio->bi_error);
if (io_bio->end_io)
io_bio->end_io(io_bio, err);
bio_put(bio);
}
-static void btrfs_endio_direct_write(struct bio *bio, int err)
+static void btrfs_endio_direct_write(struct bio *bio)
{
struct btrfs_dio_private *dip = bio->bi_private;
struct inode *inode = dip->inode;
@@ -7855,12 +7903,11 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
struct bio *dio_bio;
int ret;
- if (err)
- goto out_done;
again:
ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
&ordered_offset,
- ordered_bytes, !err);
+ ordered_bytes,
+ !bio->bi_error);
if (!ret)
goto out_test;
@@ -7879,15 +7926,11 @@ out_test:
ordered = NULL;
goto again;
}
-out_done:
dio_bio = dip->dio_bio;
kfree(dip);
- /* If we had an error make sure to clear the uptodate flag */
- if (err)
- clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
- dio_end_io(dio_bio, err);
+ dio_end_io(dio_bio, bio->bi_error);
bio_put(bio);
}
@@ -7902,9 +7945,10 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
return 0;
}
-static void btrfs_end_dio_bio(struct bio *bio, int err)
+static void btrfs_end_dio_bio(struct bio *bio)
{
struct btrfs_dio_private *dip = bio->bi_private;
+ int err = bio->bi_error;
if (err)
btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
@@ -7933,8 +7977,8 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
if (dip->errors) {
bio_io_error(dip->orig_bio);
} else {
- set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
- bio_endio(dip->orig_bio, 0);
+ dip->dio_bio->bi_error = 0;
+ bio_endio(dip->orig_bio);
}
out:
bio_put(bio);
@@ -7943,8 +7987,11 @@ out:
static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
u64 first_sector, gfp_t gfp_flags)
{
- int nr_vecs = bio_get_nr_vecs(bdev);
- return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
+ struct bio *bio;
+ bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags);
+ if (bio)
+ bio_associate_current(bio);
+ return bio;
}
static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
@@ -8147,9 +8194,8 @@ out_err:
static void btrfs_submit_direct(int rw, struct bio *dio_bio,
struct inode *inode, loff_t file_offset)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_dio_private *dip;
- struct bio *io_bio;
+ struct btrfs_dio_private *dip = NULL;
+ struct bio *io_bio = NULL;
struct btrfs_io_bio *btrfs_bio;
int skip_sum;
int write = rw & REQ_WRITE;
@@ -8166,7 +8212,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
dip = kzalloc(sizeof(*dip), GFP_NOFS);
if (!dip) {
ret = -ENOMEM;
- goto free_io_bio;
+ goto free_ordered;
}
dip->private = dio_bio->bi_private;
@@ -8194,25 +8240,56 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
if (btrfs_bio->end_io)
btrfs_bio->end_io(btrfs_bio, ret);
-free_io_bio:
- bio_put(io_bio);
free_ordered:
/*
- * If this is a write, we need to clean up the reserved space and kill
- * the ordered extent.
+ * If we arrived here, it means we either failed to submit the dip,
+ * failed to clone the dio_bio, or failed to allocate the dip. If we
+ * cloned the dio_bio and allocated the dip, we can just
+ * call bio_endio against our io_bio so that we get proper resource
+ * cleanup if we fail to submit the dip, otherwise, we must do the
+ * same as btrfs_endio_direct_[write|read] because we can't call these
+ * callbacks - they require an allocated dip and a clone of dio_bio.
*/
- if (write) {
- struct btrfs_ordered_extent *ordered;
- ordered = btrfs_lookup_ordered_extent(inode, file_offset);
- if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
- !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
- btrfs_free_reserved_extent(root, ordered->start,
- ordered->disk_len, 1);
- btrfs_put_ordered_extent(ordered);
- btrfs_put_ordered_extent(ordered);
+ if (io_bio && dip) {
+ io_bio->bi_error = -EIO;
+ bio_endio(io_bio);
+ /*
+ * The end io callbacks free our dip, do the final put on io_bio
+ * and all the cleanup and final put for dio_bio (through
+ * dio_end_io()).
+ */
+ dip = NULL;
+ io_bio = NULL;
+ } else {
+ if (write) {
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = btrfs_lookup_ordered_extent(inode,
+ file_offset);
+ set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
+ /*
+ * Decrements our ref on the ordered extent and removes
+ * the ordered extent from the inode's ordered tree,
+ * doing all the proper resource cleanup such as for the
+ * reserved space and waking up any waiters for this
+ * ordered extent (through btrfs_remove_ordered_extent).
+ */
+ btrfs_finish_ordered_io(ordered);
+ } else {
+ unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
+ file_offset + dio_bio->bi_iter.bi_size - 1);
+ }
+ dio_bio->bi_error = -EIO;
+ /*
+ * Releases and cleans up our dio_bio, no need to bio_put()
+ * nor bio_endio()/bio_io_error() against dio_bio.
+ */
+ dio_end_io(dio_bio, ret);
}
- bio_endio(dio_bio, ret);
+ if (io_bio)
+ bio_put(io_bio);
+ kfree(dip);
}
static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb,
@@ -8314,9 +8391,18 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
btrfs_submit_direct, flags);
if (iov_iter_rw(iter) == WRITE) {
current->journal_info = NULL;
- if (ret < 0 && ret != -EIOCBQUEUED)
- btrfs_delalloc_release_space(inode, count);
- else if (ret >= 0 && (size_t)ret < count)
+ if (ret < 0 && ret != -EIOCBQUEUED) {
+ /*
+ * If the error comes from the submitting stage,
+ * btrfs_get_blocks_direct() has already freed the
+ * data space, and the metadata space will be handled
+ * by finish_ordered_fn; don't free it again, to make
+ * sure bytes_may_use stays correct.
+ */
+ if (!test_and_clear_bit(BTRFS_INODE_DIO_READY,
+ &BTRFS_I(inode)->runtime_flags))
+ btrfs_delalloc_release_space(inode, count);
+ } else if (ret >= 0 && (size_t)ret < count)
btrfs_delalloc_release_space(inode,
count - (size_t)ret);
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 1c22c6518504..0adf5422fce9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -87,7 +87,8 @@ struct btrfs_ioctl_received_subvol_args_32 {
static int btrfs_clone(struct inode *src, struct inode *inode,
- u64 off, u64 olen, u64 olen_aligned, u64 destoff);
+ u64 off, u64 olen, u64 olen_aligned, u64 destoff,
+ int no_time_update);
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -553,8 +554,8 @@ static noinline int create_subvol(struct inode *dir,
key.offset = (u64)-1;
new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
if (IS_ERR(new_root)) {
- btrfs_abort_transaction(trans, root, PTR_ERR(new_root));
ret = PTR_ERR(new_root);
+ btrfs_abort_transaction(trans, root, ret);
goto fail;
}
@@ -1029,6 +1030,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
struct extent_map *em;
int ret = 1;
bool next_mergeable = true;
+ bool prev_mergeable = true;
/*
* make sure that once we start defragging an extent, we keep on
@@ -1049,13 +1051,16 @@ static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
goto out;
}
+ if (!*defrag_end)
+ prev_mergeable = false;
+
next_mergeable = defrag_check_next_extent(inode, em);
/*
* we hit a real extent, if it is big or the next extent is not a
* real extent, don't bother defragging it
*/
if (!compress && (*last_len == 0 || *last_len >= thresh) &&
- (em->len >= thresh || !next_mergeable))
+ (em->len >= thresh || (!next_mergeable && !prev_mergeable)))
ret = 0;
out:
/*
@@ -1318,7 +1323,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
i = range->start >> PAGE_CACHE_SHIFT;
}
if (!max_to_defrag)
- max_to_defrag = last_index + 1;
+ max_to_defrag = last_index - i + 1;
/*
* make writeback starts from i, so the defrag range can be
@@ -1368,7 +1373,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
ra_index = max(i, ra_index);
btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
cluster);
- ra_index += max_cluster;
+ ra_index += cluster;
}
mutex_lock(&inode->i_mutex);
@@ -1932,6 +1937,7 @@ static noinline int copy_to_sk(struct btrfs_root *root,
u64 found_transid;
struct extent_buffer *leaf;
struct btrfs_ioctl_search_header sh;
+ struct btrfs_key test;
unsigned long item_off;
unsigned long item_len;
int nritems;
@@ -2015,12 +2021,17 @@ static noinline int copy_to_sk(struct btrfs_root *root,
}
advance_key:
ret = 0;
- if (key->offset < (u64)-1 && key->offset < sk->max_offset)
+ test.objectid = sk->max_objectid;
+ test.type = sk->max_type;
+ test.offset = sk->max_offset;
+ if (btrfs_comp_cpu_keys(key, &test) >= 0)
+ ret = 1;
+ else if (key->offset < (u64)-1)
key->offset++;
- else if (key->type < (u8)-1 && key->type < sk->max_type) {
+ else if (key->type < (u8)-1) {
key->offset = 0;
key->type++;
- } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
+ } else if (key->objectid < (u64)-1) {
key->offset = 0;
key->type = 0;
key->objectid++;
@@ -2271,10 +2282,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
{
struct btrfs_ioctl_ino_lookup_args *args;
struct inode *inode;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ int ret = 0;
args = memdup_user(argp, sizeof(*args));
if (IS_ERR(args))
@@ -2282,13 +2290,28 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
inode = file_inode(file);
+ /*
+ * Unprivileged query to obtain the containing subvolume root id. The
+ * path is reset so it's consistent with btrfs_search_path_in_tree.
+ */
if (args->treeid == 0)
args->treeid = BTRFS_I(inode)->root->root_key.objectid;
+ if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
+ args->name[0] = 0;
+ goto out;
+ }
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out;
+ }
+
ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
args->treeid, args->objectid,
args->name);
+out:
if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
ret = -EFAULT;
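The reworked permission check above makes one form of the query unprivileged: passing objectid == BTRFS_FIRST_FREE_OBJECTID returns only the containing subvolume's tree id with an empty path, while full path lookups still require CAP_SYS_ADMIN. A userspace sketch of the unprivileged query (assuming the uapi btrfs header; BTRFS_FIRST_FREE_OBJECTID is defined locally since it normally lives in ctree.h):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>	/* BTRFS_IOC_INO_LOOKUP */

#ifndef BTRFS_FIRST_FREE_OBJECTID
#define BTRFS_FIRST_FREE_OBJECTID 256ULL
#endif

/* Return the id of the subvolume containing fd, or 0 on error. */
static __u64 subvol_id_of(int fd)
{
	struct btrfs_ioctl_ino_lookup_args args;

	memset(&args, 0, sizeof(args));
	args.treeid = 0;			   /* 0 = use fd's subvolume */
	args.objectid = BTRFS_FIRST_FREE_OBJECTID; /* unprivileged form */

	if (ioctl(fd, BTRFS_IOC_INO_LOOKUP, &args) < 0)
		return 0;
	return args.treeid;
}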
@@ -2413,8 +2436,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out_unlock_inode;
}
- d_invalidate(dentry);
-
down_write(&root->fs_info->subvol_sem);
err = may_destroy_subvol(dest);
@@ -2508,7 +2529,7 @@ out_up_write:
out_unlock_inode:
mutex_unlock(&inode->i_mutex);
if (!err) {
- shrink_dcache_sb(root->fs_info->sb);
+ d_invalidate(dentry);
btrfs_invalidate_inodes(dest);
d_delete(dentry);
ASSERT(dest->send_in_progress == 0);
@@ -2755,14 +2776,11 @@ out:
return ret;
}
-static struct page *extent_same_get_page(struct inode *inode, u64 off)
+static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
{
struct page *page;
- pgoff_t index;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
- index = off >> PAGE_CACHE_SHIFT;
-
page = grab_cache_page(inode->i_mapping, index);
if (!page)
return NULL;
@@ -2783,6 +2801,20 @@ static struct page *extent_same_get_page(struct inode *inode, u64 off)
return page;
}
+static int gather_extent_pages(struct inode *inode, struct page **pages,
+ int num_pages, u64 off)
+{
+ int i;
+ pgoff_t index = off >> PAGE_CACHE_SHIFT;
+
+ for (i = 0; i < num_pages; i++) {
+ pages[i] = extent_same_get_page(inode, index + i);
+ if (!pages[i])
+ return -ENOMEM;
+ }
+ return 0;
+}
+
static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
{
/* do any pending delalloc/csum calc on src, one way or
@@ -2808,52 +2840,118 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
}
}
-static void btrfs_double_unlock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
+static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
{
- unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
- unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
-
mutex_unlock(&inode1->i_mutex);
mutex_unlock(&inode2->i_mutex);
}
-static void btrfs_double_lock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
+static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
+{
+ if (inode1 < inode2)
+ swap(inode1, inode2);
+
+ mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+}
+
+static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
+{
+ unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+ unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+}
+
+static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
{
if (inode1 < inode2) {
swap(inode1, inode2);
swap(loff1, loff2);
}
-
- mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
lock_extent_range(inode1, loff1, len);
- if (inode1 != inode2) {
- mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
- lock_extent_range(inode2, loff2, len);
+ lock_extent_range(inode2, loff2, len);
+}
+
+struct cmp_pages {
+ int num_pages;
+ struct page **src_pages;
+ struct page **dst_pages;
+};
+
+static void btrfs_cmp_data_free(struct cmp_pages *cmp)
+{
+ int i;
+ struct page *pg;
+
+ for (i = 0; i < cmp->num_pages; i++) {
+ pg = cmp->src_pages[i];
+ if (pg)
+ page_cache_release(pg);
+ pg = cmp->dst_pages[i];
+ if (pg)
+ page_cache_release(pg);
+ }
+ kfree(cmp->src_pages);
+ kfree(cmp->dst_pages);
+}
+
+static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
+ struct inode *dst, u64 dst_loff,
+ u64 len, struct cmp_pages *cmp)
+{
+ int ret;
+ int num_pages = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
+ struct page **src_pgarr, **dst_pgarr;
+
+ /*
+ * We must gather up all the pages before we initiate our
+ * extent locking. We use an array for the page pointers. Size
+ * of the array is bounded by len, which is in turn bounded by
+ * BTRFS_MAX_DEDUPE_LEN.
+ */
+ src_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS);
+ dst_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS);
+ if (!src_pgarr || !dst_pgarr) {
+ kfree(src_pgarr);
+ kfree(dst_pgarr);
+ return -ENOMEM;
}
+ cmp->num_pages = num_pages;
+ cmp->src_pages = src_pgarr;
+ cmp->dst_pages = dst_pgarr;
+
+ ret = gather_extent_pages(src, cmp->src_pages, cmp->num_pages, loff);
+ if (ret)
+ goto out;
+
+ ret = gather_extent_pages(dst, cmp->dst_pages, cmp->num_pages, dst_loff);
+
+out:
+ if (ret)
+ btrfs_cmp_data_free(cmp);
+	return ret;
}
static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
- u64 dst_loff, u64 len)
+ u64 dst_loff, u64 len, struct cmp_pages *cmp)
{
int ret = 0;
+ int i;
struct page *src_page, *dst_page;
unsigned int cmp_len = PAGE_CACHE_SIZE;
void *addr, *dst_addr;
+ i = 0;
while (len) {
if (len < PAGE_CACHE_SIZE)
cmp_len = len;
- src_page = extent_same_get_page(src, loff);
- if (!src_page)
- return -EINVAL;
- dst_page = extent_same_get_page(dst, dst_loff);
- if (!dst_page) {
- page_cache_release(src_page);
- return -EINVAL;
- }
+ BUG_ON(i >= cmp->num_pages);
+
+ src_page = cmp->src_pages[i];
+ dst_page = cmp->dst_pages[i];
+
addr = kmap_atomic(src_page);
dst_addr = kmap_atomic(dst_page);
@@ -2865,26 +2963,30 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
kunmap_atomic(addr);
kunmap_atomic(dst_addr);
- page_cache_release(src_page);
- page_cache_release(dst_page);
if (ret)
break;
- loff += cmp_len;
- dst_loff += cmp_len;
len -= cmp_len;
+ i++;
}
return ret;
}
-static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
+static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen,
+ u64 olen)
{
+ u64 len = *plen;
u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
- if (off + len > inode->i_size || off + len < off)
+ if (off + olen > inode->i_size || off + olen < off)
return -EINVAL;
+
+ /* if we extend to eof, continue to block boundary */
+ if (off + len == inode->i_size)
+ *plen = len = ALIGN(inode->i_size, bs) - off;
+
/* Check that we are block aligned - btrfs_clone() requires this */
if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
return -EINVAL;
@@ -2892,31 +2994,67 @@ static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
return 0;
}
-static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
struct inode *dst, u64 dst_loff)
{
int ret;
+ u64 len = olen;
+ struct cmp_pages cmp;
+ int same_inode = 0;
+ u64 same_lock_start = 0;
+ u64 same_lock_len = 0;
- /*
- * btrfs_clone() can't handle extents in the same file
- * yet. Once that works, we can drop this check and replace it
- * with a check for the same inode, but overlapping extents.
- */
if (src == dst)
- return -EINVAL;
+ same_inode = 1;
if (len == 0)
return 0;
- btrfs_double_lock(src, loff, dst, dst_loff, len);
+ if (same_inode) {
+ mutex_lock(&src->i_mutex);
+
+ ret = extent_same_check_offsets(src, loff, &len, olen);
+ if (ret)
+ goto out_unlock;
- ret = extent_same_check_offsets(src, loff, len);
- if (ret)
- goto out_unlock;
+ /*
+ * Single inode case wants the same checks, except we
+ * don't want our length pushed out past i_size as
+ * comparing that data range makes no sense.
+ *
+ * extent_same_check_offsets() will do this for an
+ * unaligned length at i_size, so catch it here and
+ * reject the request.
+ *
+ * This effectively means we require aligned extents
+ * for the single-inode case, whereas the other cases
+ * allow an unaligned length so long as it ends at
+ * i_size.
+ */
+ if (len != olen) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
- ret = extent_same_check_offsets(dst, dst_loff, len);
- if (ret)
- goto out_unlock;
+ /* Check for overlapping ranges */
+ if (dst_loff + len > loff && dst_loff < loff + len) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ same_lock_start = min_t(u64, loff, dst_loff);
+ same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start;
+ } else {
+ btrfs_double_inode_lock(src, dst);
+
+ ret = extent_same_check_offsets(src, loff, &len, olen);
+ if (ret)
+ goto out_unlock;
+
+ ret = extent_same_check_offsets(dst, dst_loff, &len, olen);
+ if (ret)
+ goto out_unlock;
+ }
/* don't make the dst file partly checksummed */
if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
@@ -2925,12 +3063,32 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
goto out_unlock;
}
- ret = btrfs_cmp_data(src, loff, dst, dst_loff, len);
+ ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp);
+ if (ret)
+ goto out_unlock;
+
+ if (same_inode)
+ lock_extent_range(src, same_lock_start, same_lock_len);
+ else
+ btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+
+ /* pass original length for comparison so we stay within i_size */
+ ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
if (ret == 0)
- ret = btrfs_clone(src, dst, loff, len, len, dst_loff);
+ ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
+
+ if (same_inode)
+ unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
+ same_lock_start + same_lock_len - 1);
+ else
+ btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
+ btrfs_cmp_data_free(&cmp);
out_unlock:
- btrfs_double_unlock(src, loff, dst, dst_loff, len);
+ if (same_inode)
+ mutex_unlock(&src->i_mutex);
+ else
+ btrfs_double_inode_unlock(src, dst);
return ret;
}
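For orientation, btrfs_extent_same() is reached from userspace via the BTRFS_IOC_FILE_EXTENT_SAME ioctl. Under the rules established above (block-aligned ranges, or unaligned only when ending at i_size; non-overlapping ranges within one inode), a caller deduplicating a range between two files looks roughly like this — a sketch assuming the uapi btrfs header, not code from this patch:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>	/* BTRFS_IOC_FILE_EXTENT_SAME */

static int dedupe_range(int src_fd, __u64 src_off,
			int dst_fd, __u64 dst_off, __u64 len)
{
	struct btrfs_ioctl_same_args *args;
	int ret;

	/* one destination entry follows the fixed header */
	args = calloc(1, sizeof(*args) + sizeof(args->info[0]));
	if (!args)
		return -1;
	args->logical_offset = src_off;
	args->length = len;		/* block aligned, or ending at EOF */
	args->dest_count = 1;
	args->info[0].fd = dst_fd;
	args->info[0].logical_offset = dst_off;

	ret = ioctl(src_fd, BTRFS_IOC_FILE_EXTENT_SAME, args);
	if (ret == 0 && args->info[0].status != 0)
		ret = -1;		/* data differed or per-range error */
	free(args);
	return ret;
}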
@@ -2940,7 +3098,7 @@ out_unlock:
static long btrfs_ioctl_file_extent_same(struct file *file,
struct btrfs_ioctl_same_args __user *argp)
{
- struct btrfs_ioctl_same_args *same;
+ struct btrfs_ioctl_same_args *same = NULL;
struct btrfs_ioctl_same_extent_info *info;
struct inode *src = file_inode(file);
u64 off;
@@ -2970,6 +3128,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
if (IS_ERR(same)) {
ret = PTR_ERR(same);
+ same = NULL;
goto out;
}
@@ -3040,6 +3199,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
out:
mnt_drop_write_file(file);
+ kfree(same);
return ret;
}
@@ -3082,13 +3242,15 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
struct inode *inode,
u64 endoff,
const u64 destoff,
- const u64 olen)
+ const u64 olen,
+ int no_time_update)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ if (!no_time_update)
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
/*
* We round up to the block size at eof when determining which
* extents to clone above, but shouldn't round up the file size.
@@ -3173,13 +3335,13 @@ static void clone_update_extent_map(struct inode *inode,
* @inode: Inode to clone to
* @off: Offset within source to start clone from
* @olen: Original length, passed by user, of range to clone
- * @olen_aligned: Block-aligned value of olen, extent_same uses
- * identical values here
+ * @olen_aligned: Block-aligned value of olen
* @destoff: Offset within @inode to start clone
+ * @no_time_update: Whether to update mtime/ctime on the target inode
*/
static int btrfs_clone(struct inode *src, struct inode *inode,
const u64 off, const u64 olen, const u64 olen_aligned,
- const u64 destoff)
+ const u64 destoff, int no_time_update)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path = NULL;
@@ -3434,6 +3596,20 @@ process_slot:
u64 trim = 0;
u64 aligned_end = 0;
+ /*
+ * Don't copy an inline extent into an offset
+ * greater than zero. Having an inline extent
+ * at such an offset results in chaos as btrfs
+ * isn't prepared for such cases. Just skip
+ * this case for the same reasons as commented
+ * at btrfs_ioctl_clone().
+ */
+ if (last_dest_end > 0) {
+ ret = -EOPNOTSUPP;
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+
if (off > key.offset) {
skip = off - key.offset;
new_key.offset += skip;
@@ -3503,7 +3679,8 @@ process_slot:
root->sectorsize);
ret = clone_finish_inode_update(trans, inode,
last_dest_end,
- destoff, olen);
+ destoff, olen,
+ no_time_update);
if (ret)
goto out;
if (new_key.offset + datal >= destoff + len)
@@ -3541,7 +3718,7 @@ process_slot:
clone_update_extent_map(inode, trans, NULL, last_dest_end,
destoff + len - last_dest_end);
ret = clone_finish_inode_update(trans, inode, destoff + len,
- destoff, olen);
+ destoff, olen, no_time_update);
}
out:
@@ -3618,13 +3795,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
goto out_fput;
if (!same_inode) {
- if (inode < src) {
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
- } else {
- mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
- }
+ btrfs_double_inode_lock(src, inode);
} else {
mutex_lock(&src->i_mutex);
}
@@ -3674,11 +3845,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
lock_extent_range(src, lock_start, lock_len);
} else {
- lock_extent_range(src, off, len);
- lock_extent_range(inode, destoff, len);
+ btrfs_double_extent_lock(src, off, inode, destoff, len);
}
- ret = btrfs_clone(src, inode, off, olen, len, destoff);
+ ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
if (same_inode) {
u64 lock_start = min_t(u64, off, destoff);
@@ -3686,9 +3856,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
} else {
- unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
- unlock_extent(&BTRFS_I(inode)->io_tree, destoff,
- destoff + len - 1);
+ btrfs_double_extent_unlock(src, off, inode, destoff, len);
}
/*
* Truncate page cache pages so that future reads will see the cloned
@@ -3697,17 +3865,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
truncate_inode_pages_range(&inode->i_data, destoff,
PAGE_CACHE_ALIGN(destoff + len) - 1);
out_unlock:
- if (!same_inode) {
- if (inode < src) {
- mutex_unlock(&src->i_mutex);
- mutex_unlock(&inode->i_mutex);
- } else {
- mutex_unlock(&inode->i_mutex);
- mutex_unlock(&src->i_mutex);
- }
- } else {
+ if (!same_inode)
+ btrfs_double_inode_unlock(src, inode);
+ else
mutex_unlock(&src->i_mutex);
- }
out_fput:
fdput(src_file);
out_drop_write:
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index f8229ef1b46d..d7e6baf1b205 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -241,6 +241,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
*/
void btrfs_tree_lock(struct extent_buffer *eb)
{
+ WARN_ON(eb->lock_owner == current->pid);
again:
wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 760c4a5e096b..52170cf1757e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -198,9 +198,6 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
entry->file_offset = file_offset;
entry->start = start;
entry->len = len;
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
- !(type == BTRFS_ORDERED_NOCOW))
- entry->csum_bytes_left = disk_len;
entry->disk_len = disk_len;
entry->bytes_left = len;
entry->inode = igrab(inode);
@@ -286,10 +283,6 @@ void btrfs_add_ordered_sum(struct inode *inode,
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irq(&tree->lock);
list_add_tail(&sum->list, &entry->list);
- WARN_ON(entry->csum_bytes_left < sum->len);
- entry->csum_bytes_left -= sum->len;
- if (entry->csum_bytes_left == 0)
- wake_up(&entry->wait);
spin_unlock_irq(&tree->lock);
}
@@ -509,7 +502,21 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
&ordered->flags));
- list_add_tail(&ordered->trans_list, &trans->ordered);
+ /*
+ * If our ordered extent completed it means it updated the
+ * fs/subvol and csum trees already, so no need to make the
+ * current transaction's commit wait for it, as we end up
+ * holding memory unnecessarily and delaying the inode's iput
+ * until the transaction commit (we schedule an iput for the
+ * inode when the ordered extent's refcount drops to 0), which
+ * prevents it from being evictable until the transaction
+ * commits.
+ */
+ if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags))
+ btrfs_put_ordered_extent(ordered);
+ else
+ list_add_tail(&ordered->trans_list, &trans->ordered);
+
spin_lock_irq(&log->log_extents_lock[index]);
}
spin_unlock_irq(&log->log_extents_lock[index]);
@@ -545,6 +552,10 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
trace_btrfs_ordered_extent_put(entry->inode, entry);
if (atomic_dec_and_test(&entry->refs)) {
+ ASSERT(list_empty(&entry->log_list));
+ ASSERT(list_empty(&entry->trans_list));
+ ASSERT(list_empty(&entry->root_extent_list));
+ ASSERT(RB_EMPTY_NODE(&entry->rb_node));
if (entry->inode)
btrfs_add_delayed_iput(entry->inode);
while (!list_empty(&entry->list)) {
@@ -572,6 +583,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
spin_lock_irq(&tree->lock);
node = &entry->rb_node;
rb_erase(node, &tree->tree);
+ RB_CLEAR_NODE(node);
if (tree->last == node)
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
@@ -844,6 +856,20 @@ out:
return entry;
}
+bool btrfs_have_ordered_extents_in_range(struct inode *inode,
+ u64 file_offset,
+ u64 len)
+{
+ struct btrfs_ordered_extent *oe;
+
+ oe = btrfs_lookup_ordered_range(inode, file_offset, len);
+ if (oe) {
+ btrfs_put_ordered_extent(oe);
+ return true;
+ }
+ return false;
+}
+
/*
* lookup and return any extent before 'file_offset'. NULL is returned
* if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e96cd4ccd805..7176cc0fe43f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -89,9 +89,6 @@ struct btrfs_ordered_extent {
/* number of bytes that still need writing */
u64 bytes_left;
- /* number of bytes that still need csumming */
- u64 csum_bytes_left;
-
/*
* the end of the ordered extent which is behind it but
* didn't update disk_i_size. Please see the comment of
@@ -191,6 +188,9 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
u64 file_offset,
u64 len);
+bool btrfs_have_ordered_extents_in_range(struct inode *inode,
+ u64 file_offset,
+ u64 len);
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 3d6546581bb9..d904ee1c5349 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -34,6 +34,7 @@
#include "extent_io.h"
#include "qgroup.h"
+
/* TODO XXX FIXME
* - subvol delete -> delete when ref goes to 0? delete limits also?
* - reorganize keys
@@ -84,11 +85,42 @@ struct btrfs_qgroup {
/*
* temp variables for accounting operations
+	 * Refer to qgroup_shared_accounting() for details.
*/
u64 old_refcnt;
u64 new_refcnt;
};
+static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
+ int mod)
+{
+ if (qg->old_refcnt < seq)
+ qg->old_refcnt = seq;
+ qg->old_refcnt += mod;
+}
+
+static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
+ int mod)
+{
+ if (qg->new_refcnt < seq)
+ qg->new_refcnt = seq;
+ qg->new_refcnt += mod;
+}
+
+static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
+{
+ if (qg->old_refcnt < seq)
+ return 0;
+ return qg->old_refcnt - seq;
+}
+
+static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
+{
+ if (qg->new_refcnt < seq)
+ return 0;
+ return qg->new_refcnt - seq;
+}
+
/*
* glue structure to represent the relations between qgroups.
*/
@@ -344,7 +376,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
qgroup = find_qgroup_rb(fs_info, found_key.offset);
if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
(!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
- btrfs_err(fs_info, "inconsitent qgroup config");
+ btrfs_err(fs_info, "inconsistent qgroup config");
flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
}
if (!qgroup) {
@@ -1115,14 +1147,14 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
struct ulist *tmp;
int ret = 0;
- tmp = ulist_alloc(GFP_NOFS);
- if (!tmp)
- return -ENOMEM;
-
/* Check the level of src and dst first */
if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
return -EINVAL;
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
quota_root = fs_info->quota_root;
if (!quota_root) {
@@ -1317,6 +1349,11 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
int ret = 0;
+	/* Sometimes we want to clear the limit on this qgroup.
+	 * To meet this requirement, we treat -1 as a special value
+	 * which tells the kernel to clear the limit on this qgroup.
+ */
+ const u64 CLEAR_VALUE = -1;
mutex_lock(&fs_info->qgroup_ioctl_lock);
quota_root = fs_info->quota_root;
@@ -1332,14 +1369,42 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
}
spin_lock(&fs_info->qgroup_lock);
- if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER)
- qgroup->max_rfer = limit->max_rfer;
- if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
- qgroup->max_excl = limit->max_excl;
- if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER)
- qgroup->rsv_rfer = limit->rsv_rfer;
- if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL)
- qgroup->rsv_excl = limit->rsv_excl;
+ if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
+ if (limit->max_rfer == CLEAR_VALUE) {
+ qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
+ limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
+ qgroup->max_rfer = 0;
+ } else {
+ qgroup->max_rfer = limit->max_rfer;
+ }
+ }
+ if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
+ if (limit->max_excl == CLEAR_VALUE) {
+ qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
+ limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
+ qgroup->max_excl = 0;
+ } else {
+ qgroup->max_excl = limit->max_excl;
+ }
+ }
+ if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
+ if (limit->rsv_rfer == CLEAR_VALUE) {
+ qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
+ limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
+ qgroup->rsv_rfer = 0;
+ } else {
+ qgroup->rsv_rfer = limit->rsv_rfer;
+ }
+ }
+ if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
+ if (limit->rsv_excl == CLEAR_VALUE) {
+ qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
+ limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
+ qgroup->rsv_excl = 0;
+ } else {
+ qgroup->rsv_excl = limit->rsv_excl;
+ }
+ }
qgroup->lim_flags |= limit->flags;
spin_unlock(&fs_info->qgroup_lock);
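With the CLEAR_VALUE handling above, passing -1 in a limit field now clears that limit instead of setting it to 2^64 - 1. A userspace sketch of clearing the referenced-bytes limit via BTRFS_IOC_QGROUP_LIMIT (assuming the uapi btrfs header; the qgroup id is a placeholder supplied by the caller):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>	/* BTRFS_IOC_QGROUP_LIMIT */

static int clear_qgroup_rfer_limit(int fs_fd, __u64 qgroupid)
{
	struct btrfs_ioctl_qgroup_limit_args args;

	memset(&args, 0, sizeof(args));
	args.qgroupid = qgroupid;
	args.lim.flags = BTRFS_QGROUP_LIMIT_MAX_RFER;
	args.lim.max_rfer = (__u64)-1;	/* CLEAR_VALUE: drop the limit */

	return ioctl(fs_fd, BTRFS_IOC_QGROUP_LIMIT, &args);
}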
@@ -1356,239 +1421,86 @@ out:
return ret;
}
-static int comp_oper_exist(struct btrfs_qgroup_operation *oper1,
- struct btrfs_qgroup_operation *oper2)
+int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
{
- /*
- * Ignore seq and type here, we're looking for any operation
- * at all related to this extent on that root.
- */
- if (oper1->bytenr < oper2->bytenr)
- return -1;
- if (oper1->bytenr > oper2->bytenr)
- return 1;
- if (oper1->ref_root < oper2->ref_root)
- return -1;
- if (oper1->ref_root > oper2->ref_root)
- return 1;
- return 0;
-}
+ struct btrfs_qgroup_extent_record *record;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct rb_node *node;
+ u64 qgroup_to_skip;
+ int ret = 0;
-static int qgroup_oper_exists(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
-{
- struct rb_node *n;
- struct btrfs_qgroup_operation *cur;
- int cmp;
+ delayed_refs = &trans->transaction->delayed_refs;
+ qgroup_to_skip = delayed_refs->qgroup_to_skip;
- spin_lock(&fs_info->qgroup_op_lock);
- n = fs_info->qgroup_op_tree.rb_node;
- while (n) {
- cur = rb_entry(n, struct btrfs_qgroup_operation, n);
- cmp = comp_oper_exist(cur, oper);
- if (cmp < 0) {
- n = n->rb_right;
- } else if (cmp) {
- n = n->rb_left;
- } else {
- spin_unlock(&fs_info->qgroup_op_lock);
- return -EEXIST;
- }
+ /*
+	 * No need to take any lock, since this function will only be
+	 * called in btrfs_commit_transaction().
+ */
+ node = rb_first(&delayed_refs->dirty_extent_root);
+ while (node) {
+ record = rb_entry(node, struct btrfs_qgroup_extent_record,
+ node);
+ ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0,
+ &record->old_roots);
+ if (ret < 0)
+ break;
+ if (qgroup_to_skip)
+ ulist_del(record->old_roots, qgroup_to_skip, 0);
+ node = rb_next(node);
}
- spin_unlock(&fs_info->qgroup_op_lock);
- return 0;
-}
-
-static int comp_oper(struct btrfs_qgroup_operation *oper1,
- struct btrfs_qgroup_operation *oper2)
-{
- if (oper1->bytenr < oper2->bytenr)
- return -1;
- if (oper1->bytenr > oper2->bytenr)
- return 1;
- if (oper1->ref_root < oper2->ref_root)
- return -1;
- if (oper1->ref_root > oper2->ref_root)
- return 1;
- if (oper1->seq < oper2->seq)
- return -1;
- if (oper1->seq > oper2->seq)
- return 1;
- if (oper1->type < oper2->type)
- return -1;
- if (oper1->type > oper2->type)
- return 1;
- return 0;
+ return ret;
}
-static int insert_qgroup_oper(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
+struct btrfs_qgroup_extent_record
+*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_qgroup_extent_record *record)
{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct btrfs_qgroup_operation *cur;
- int cmp;
+ struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
+ struct rb_node *parent_node = NULL;
+ struct btrfs_qgroup_extent_record *entry;
+ u64 bytenr = record->bytenr;
- spin_lock(&fs_info->qgroup_op_lock);
- p = &fs_info->qgroup_op_tree.rb_node;
while (*p) {
- parent = *p;
- cur = rb_entry(parent, struct btrfs_qgroup_operation, n);
- cmp = comp_oper(cur, oper);
- if (cmp < 0) {
- p = &(*p)->rb_right;
- } else if (cmp) {
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
+ node);
+ if (bytenr < entry->bytenr)
p = &(*p)->rb_left;
- } else {
- spin_unlock(&fs_info->qgroup_op_lock);
- return -EEXIST;
- }
- }
- rb_link_node(&oper->n, parent, p);
- rb_insert_color(&oper->n, &fs_info->qgroup_op_tree);
- spin_unlock(&fs_info->qgroup_op_lock);
- return 0;
-}
-
-/*
- * Record a quota operation for processing later on.
- * @trans: the transaction we are adding the delayed op to.
- * @fs_info: the fs_info for this fs.
- * @ref_root: the root of the reference we are acting on,
- * @bytenr: the bytenr we are acting on.
- * @num_bytes: the number of bytes in the reference.
- * @type: the type of operation this is.
- * @mod_seq: do we need to get a sequence number for looking up roots.
- *
- * We just add it to our trans qgroup_ref_list and carry on and process these
- * operations in order at some later point. If the reference root isn't a fs
- * root then we don't bother with doing anything.
- *
- * MUST BE HOLDING THE REF LOCK.
- */
-int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 ref_root,
- u64 bytenr, u64 num_bytes,
- enum btrfs_qgroup_operation_type type, int mod_seq)
-{
- struct btrfs_qgroup_operation *oper;
- int ret;
-
- if (!is_fstree(ref_root) || !fs_info->quota_enabled)
- return 0;
-
- oper = kmalloc(sizeof(*oper), GFP_NOFS);
- if (!oper)
- return -ENOMEM;
-
- oper->ref_root = ref_root;
- oper->bytenr = bytenr;
- oper->num_bytes = num_bytes;
- oper->type = type;
- oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
- INIT_LIST_HEAD(&oper->elem.list);
- oper->elem.seq = 0;
-
- trace_btrfs_qgroup_record_ref(oper);
-
- if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
- /*
- * If any operation for this bytenr/ref_root combo
- * exists, then we know it's not exclusively owned and
- * shouldn't be queued up.
- *
- * This also catches the case where we have a cloned
- * extent that gets queued up multiple times during
- * drop snapshot.
- */
- if (qgroup_oper_exists(fs_info, oper)) {
- kfree(oper);
- return 0;
- }
- }
-
- ret = insert_qgroup_oper(fs_info, oper);
- if (ret) {
- /* Shouldn't happen so have an assert for developers */
- ASSERT(0);
- kfree(oper);
- return ret;
+ else if (bytenr > entry->bytenr)
+ p = &(*p)->rb_right;
+ else
+ return entry;
}
- list_add_tail(&oper->list, &trans->qgroup_ref_list);
-
- if (mod_seq)
- btrfs_get_tree_mod_seq(fs_info, &oper->elem);
-
- return 0;
-}
-
-static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
-{
- struct ulist *tmp;
- int sign = 0;
- int ret = 0;
- tmp = ulist_alloc(GFP_NOFS);
- if (!tmp)
- return -ENOMEM;
-
- spin_lock(&fs_info->qgroup_lock);
- if (!fs_info->quota_root)
- goto out;
-
- switch (oper->type) {
- case BTRFS_QGROUP_OPER_ADD_EXCL:
- sign = 1;
- break;
- case BTRFS_QGROUP_OPER_SUB_EXCL:
- sign = -1;
- break;
- default:
- ASSERT(0);
- }
- ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root,
- oper->num_bytes, sign);
-out:
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(tmp);
- return ret;
+ rb_link_node(&record->node, parent_node, p);
+ rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
+ return NULL;
}
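A sketch of the intended call pattern for the insert helper, assuming the caller already holds delayed_refs->lock as the delayed-ref code does; the wrapper name and GFP flags are illustrative:

	/* Queue one extent for qgroup accounting at commit time. If the
	 * bytenr is already queued, the existing record wins and the new
	 * allocation is simply dropped. */
	static int record_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
				       u64 bytenr, u64 num_bytes)
	{
		struct btrfs_qgroup_extent_record *record;

		record = kmalloc(sizeof(*record), GFP_NOFS);
		if (!record)
			return -ENOMEM;

		record->bytenr = bytenr;
		record->num_bytes = num_bytes;
		record->old_roots = NULL; /* filled by prepare_account_extents() */

		if (btrfs_qgroup_insert_dirty_extent(delayed_refs, record))
			kfree(record); /* duplicate, already tracked */
		return 0;
	}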
+#define UPDATE_NEW 0
+#define UPDATE_OLD 1
/*
- * Walk all of the roots that pointed to our bytenr and adjust their refcnts as
- * properly.
+ * Walk all of the roots that point to the bytenr and adjust their refcnts.
*/
-static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
- u64 root_to_skip, struct ulist *tmp,
- struct ulist *roots, struct ulist *qgroups,
- u64 seq, int *old_roots, int rescan)
+static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
+ struct ulist *roots, struct ulist *tmp,
+ struct ulist *qgroups, u64 seq, int update_old)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
struct ulist_node *tmp_unode;
struct ulist_iterator tmp_uiter;
struct btrfs_qgroup *qg;
- int ret;
+ int ret = 0;
+ if (!roots)
+ return 0;
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(roots, &uiter))) {
- /* We don't count our current root here */
- if (unode->val == root_to_skip)
- continue;
qg = find_qgroup_rb(fs_info, unode->val);
if (!qg)
continue;
- /*
- * We could have a pending removal of this same ref so we may
- * not have actually found our ref root when doing
- * btrfs_find_all_roots, so we need to keep track of how many
- * old roots we find in case we removed ours and added a
- * different one at the same time. I don't think this could
- * happen in practice but that sort of thinking leads to pain
- * and suffering and to the dark side.
- */
- (*old_roots)++;
ulist_reinit(tmp);
ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
@@ -1603,29 +1515,10 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup_list *glist;
qg = u64_to_ptr(tmp_unode->aux);
- /*
- * We use this sequence number to keep from having to
- * run the whole list and 0 out the refcnt every time.
- * We basically use sequnce as the known 0 count and
- * then add 1 everytime we see a qgroup. This is how we
- * get how many of the roots actually point up to the
- * upper level qgroups in order to determine exclusive
- * counts.
- *
- * For rescan we want to set old_refcnt to seq so our
- * exclusive calculations end up correct.
- */
- if (rescan)
- qg->old_refcnt = seq;
- else if (qg->old_refcnt < seq)
- qg->old_refcnt = seq + 1;
+ if (update_old)
+ btrfs_qgroup_update_old_refcnt(qg, seq, 1);
else
- qg->old_refcnt++;
-
- if (qg->new_refcnt < seq)
- qg->new_refcnt = seq + 1;
- else
- qg->new_refcnt++;
+ btrfs_qgroup_update_new_refcnt(qg, seq, 1);
list_for_each_entry(glist, &qg->groups, next_group) {
ret = ulist_add(qgroups, glist->group->qgroupid,
ptr_to_u64(glist->group),
@@ -1644,161 +1537,46 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
}
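These hunks call btrfs_qgroup_update_old/new_refcnt() and btrfs_qgroup_get_old/new_refcnt() without showing them; presumably they carry over the seq-relative trick the removed inline code used, roughly along these lines (a sketch, not the patch's actual definition):

	/* seq acts as the implicit zero: any refcnt at or below seq counts
	 * as 0, so nothing needs resetting between extents. */
	static inline void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg,
							  u64 seq, int mod)
	{
		if (qg->old_refcnt < seq)
			qg->old_refcnt = seq + mod;
		else
			qg->old_refcnt += mod;
	}

	static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg,
						      u64 seq)
	{
		if (qg->old_refcnt < seq)
			return 0;
		return qg->old_refcnt - seq;
	}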
/*
- * We need to walk forward in our operation tree and account for any roots that
- * were deleted after we made this operation.
- */
-static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper,
- struct ulist *tmp,
- struct ulist *qgroups, u64 seq,
- int *old_roots)
-{
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- struct btrfs_qgroup *qg;
- struct btrfs_qgroup_operation *tmp_oper;
- struct rb_node *n;
- int ret;
-
- ulist_reinit(tmp);
-
- /*
- * We only walk forward in the tree since we're only interested in
- * removals that happened _after_ our operation.
- */
- spin_lock(&fs_info->qgroup_op_lock);
- n = rb_next(&oper->n);
- spin_unlock(&fs_info->qgroup_op_lock);
- if (!n)
- return 0;
- tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
- while (tmp_oper->bytenr == oper->bytenr) {
- /*
- * If it's not a removal we don't care, additions work out
- * properly with our refcnt tracking.
- */
- if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED &&
- tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL)
- goto next;
- qg = find_qgroup_rb(fs_info, tmp_oper->ref_root);
- if (!qg)
- goto next;
- ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
- GFP_ATOMIC);
- if (ret) {
- if (ret < 0)
- return ret;
- /*
- * We only want to increase old_roots if this qgroup is
- * not already in the list of qgroups. If it is already
- * there then that means it must have been re-added or
- * the delete will be discarded because we had an
- * existing ref that we haven't looked up yet. In this
- * case we don't want to increase old_roots. So if ret
- * == 1 then we know that this is the first time we've
- * seen this qgroup and we can bump the old_roots.
- */
- (*old_roots)++;
- ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- }
-next:
- spin_lock(&fs_info->qgroup_op_lock);
- n = rb_next(&tmp_oper->n);
- spin_unlock(&fs_info->qgroup_op_lock);
- if (!n)
- break;
- tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
- }
-
- /* Ok now process the qgroups we found */
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(tmp, &uiter))) {
- struct btrfs_qgroup_list *glist;
-
- qg = u64_to_ptr(unode->aux);
- if (qg->old_refcnt < seq)
- qg->old_refcnt = seq + 1;
- else
- qg->old_refcnt++;
- if (qg->new_refcnt < seq)
- qg->new_refcnt = seq + 1;
- else
- qg->new_refcnt++;
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(qgroups, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ret = ulist_add(tmp, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (ret < 0)
- return ret;
- }
- }
- return 0;
-}
-
-/* Add refcnt for the newly added reference. */
-static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper,
- struct btrfs_qgroup *qgroup,
- struct ulist *tmp, struct ulist *qgroups,
- u64 seq)
-{
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- struct btrfs_qgroup *qg;
- int ret;
-
- ulist_reinit(tmp);
- ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(tmp, &uiter))) {
- struct btrfs_qgroup_list *glist;
-
- qg = u64_to_ptr(unode->aux);
- if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
- if (qg->new_refcnt < seq)
- qg->new_refcnt = seq + 1;
- else
- qg->new_refcnt++;
- } else {
- if (qg->old_refcnt < seq)
- qg->old_refcnt = seq + 1;
- else
- qg->old_refcnt++;
- }
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ret = ulist_add(qgroups, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (ret < 0)
- return ret;
- }
- }
- return 0;
-}
-
-/*
- * This adjusts the counters for all referenced qgroups if need be.
+ * Update qgroup rfer/excl counters.
+ * Rfer update is easy, the code explains itself.
+ *
+ * Excl update is tricky, the update is split into 2 parts.
+ * Part 1: Possible exclusive <-> sharing detect:
+ * | A | !A |
+ * -------------------------------------
+ * B | * | - |
+ * -------------------------------------
+ * !B | + | ** |
+ * -------------------------------------
+ *
+ * Conditions:
+ * A: cur_old_roots < nr_old_roots (not exclusive before)
+ * !A: cur_old_roots == nr_old_roots (possible exclusive before)
+ * B: cur_new_roots < nr_new_roots (not exclusive now)
+ * !B: cur_new_roots == nr_new_roots (possible exclusive now)
+ *
+ * Results:
+ * +: Possible sharing -> exclusive
+ * -: Possible exclusive -> sharing
+ * *: Definitely not changed. **: Possible unchanged.
+ *
+ * For the !A and !B conditions, the exception is the cur_old/new_roots == 0 case.
+ *
+ * To make the logic clear, we first use conditions A and B to split the
+ * combinations into 4 results.
+ *
+ * Then, for the "+" and "-" results, check the old/new_roots == 0 case,
+ * as there only one of the two can be 0.
+ *
+ * Lastly, check the ** result; since there both variables may be 0, split
+ * it again (2x2). This time nothing else needs to be considered, so the
+ * code and logic are easy to follow.
*/
-static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
- u64 root_to_skip, u64 num_bytes,
- struct ulist *qgroups, u64 seq,
- int old_roots, int new_roots, int rescan)
+static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
+ struct ulist *qgroups,
+ u64 nr_old_roots,
+ u64 nr_new_roots,
+ u64 num_bytes, u64 seq)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -1810,423 +1588,196 @@ static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
bool dirty = false;
qg = u64_to_ptr(unode->aux);
- /*
- * Wasn't referenced before but is now, add to the reference
- * counters.
- */
- if (qg->old_refcnt <= seq && qg->new_refcnt > seq) {
+ cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
+ cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
+
+ /* Rfer update part */
+ if (cur_old_count == 0 && cur_new_count > 0) {
qg->rfer += num_bytes;
qg->rfer_cmpr += num_bytes;
dirty = true;
}
-
- /*
- * Was referenced before but isn't now, subtract from the
- * reference counters.
- */
- if (qg->old_refcnt > seq && qg->new_refcnt <= seq) {
+ if (cur_old_count > 0 && cur_new_count == 0) {
qg->rfer -= num_bytes;
qg->rfer_cmpr -= num_bytes;
dirty = true;
}
- if (qg->old_refcnt < seq)
- cur_old_count = 0;
- else
- cur_old_count = qg->old_refcnt - seq;
- if (qg->new_refcnt < seq)
- cur_new_count = 0;
- else
- cur_new_count = qg->new_refcnt - seq;
+ /* Excl update part */
+ /* Exclusive/none -> shared case */
+ if (cur_old_count == nr_old_roots &&
+ cur_new_count < nr_new_roots) {
+ /* Exclusive -> shared */
+ if (cur_old_count != 0) {
+ qg->excl -= num_bytes;
+ qg->excl_cmpr -= num_bytes;
+ dirty = true;
+ }
+ }
- /*
- * If our refcount was the same as the roots previously but our
- * new count isn't the same as the number of roots now then we
- * went from having a exclusive reference on this range to not.
- */
- if (old_roots && cur_old_count == old_roots &&
- (cur_new_count != new_roots || new_roots == 0)) {
- WARN_ON(cur_new_count != new_roots && new_roots == 0);
- qg->excl -= num_bytes;
- qg->excl_cmpr -= num_bytes;
- dirty = true;
+ /* Shared -> exclusive/none case */
+ if (cur_old_count < nr_old_roots &&
+ cur_new_count == nr_new_roots) {
+ /* Shared->exclusive */
+ if (cur_new_count != 0) {
+ qg->excl += num_bytes;
+ qg->excl_cmpr += num_bytes;
+ dirty = true;
+ }
}
- /*
- * If we didn't reference all the roots before but now we do we
- * have an exclusive reference to this range.
- */
- if ((!old_roots || (old_roots && cur_old_count != old_roots))
- && cur_new_count == new_roots) {
- qg->excl += num_bytes;
- qg->excl_cmpr += num_bytes;
- dirty = true;
+ /* Exclusive/none -> exclusive/none case */
+ if (cur_old_count == nr_old_roots &&
+ cur_new_count == nr_new_roots) {
+ if (cur_old_count == 0) {
+ /* None -> exclusive/none */
+
+ if (cur_new_count != 0) {
+ /* None -> exclusive */
+ qg->excl += num_bytes;
+ qg->excl_cmpr += num_bytes;
+ dirty = true;
+ }
+ /* None -> none, nothing changed */
+ } else {
+ /* Exclusive -> exclusive/none */
+
+ if (cur_new_count == 0) {
+ /* Exclusive -> none */
+ qg->excl -= num_bytes;
+ qg->excl_cmpr -= num_bytes;
+ dirty = true;
+ }
+ /* Exclusive -> exclusive, nothing changed */
+ }
}
+ /* For exclusive extent, free its reserved bytes too */
+ if (nr_old_roots == 0 && nr_new_roots == 1 &&
+ cur_new_count == nr_new_roots)
+ qg->reserved -= num_bytes;
if (dirty)
qgroup_dirty(fs_info, qg);
}
return 0;
}
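A concrete pass through the table above: take an extent of num_bytes owned only by subvolume A that is then reflinked into subvolume B, so nr_old_roots == 1 and nr_new_roots == 2. A's qgroup sees cur_old_count == nr_old_roots and cur_new_count (1) < nr_new_roots, the "-" cell; since cur_old_count != 0, its excl and excl_cmpr drop by num_bytes while rfer is untouched. B's qgroup sees cur_old_count (0) < nr_old_roots and cur_new_count (1) < nr_new_roots, the "*" cell, so excl is definitely unchanged; only the rfer branch fires (cur_old_count == 0 && cur_new_count > 0) and adds num_bytes.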
-/*
- * If we removed a data extent and there were other references for that bytenr
- * then we need to lookup all referenced roots to make sure we still don't
- * reference this bytenr. If we do then we can just discard this operation.
- */
-static int check_existing_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
-{
- struct ulist *roots = NULL;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- int ret = 0;
-
- ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
- oper->elem.seq, &roots);
- if (ret < 0)
- return ret;
- ret = 0;
-
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(roots, &uiter))) {
- if (unode->val == oper->ref_root) {
- ret = 1;
- break;
- }
- }
- ulist_free(roots);
- btrfs_put_tree_mod_seq(fs_info, &oper->elem);
-
- return ret;
-}
-
-/*
- * If we share a reference across multiple roots then we may need to adjust
- * various qgroups referenced and exclusive counters. The basic premise is this
- *
- * 1) We have seq to represent a 0 count. Instead of looping through all of the
- * qgroups and resetting their refcount to 0 we just constantly bump this
- * sequence number to act as the base reference count. This means that if
- * anybody is equal to or below this sequence they were never referenced. We
- * jack this sequence up by the number of roots we found each time in order to
- * make sure we don't have any overlap.
- *
- * 2) We first search all the roots that reference the area _except_ the root
- * we're acting on currently. This makes up the old_refcnt of all the qgroups
- * before.
- *
- * 3) We walk all of the qgroups referenced by the root we are currently acting
- * on, and will either adjust old_refcnt in the case of a removal or the
- * new_refcnt in the case of an addition.
- *
- * 4) Finally we walk all the qgroups that are referenced by this range
- * including the root we are acting on currently. We will adjust the counters
- * based on the number of roots we had and will have after this operation.
- *
- * Take this example as an illustration
- *
- * [qgroup 1/0]
- * / | \
- * [qg 0/0] [qg 0/1] [qg 0/2]
- * \ | /
- * [ extent ]
- *
- * Say we are adding a reference that is covered by qg 0/0. The first step
- * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with
- * old_roots being 2. Because it is adding new_roots will be 1. We then go
- * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's
- * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we
- * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a
- * reference and thus must add the size to the referenced bytes. Everything
- * else is the same so nothing else changes.
- */
-static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
+int
+btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 bytenr, u64 num_bytes,
+ struct ulist *old_roots, struct ulist *new_roots)
{
- struct ulist *roots = NULL;
- struct ulist *qgroups, *tmp;
- struct btrfs_qgroup *qgroup;
- struct seq_list elem = SEQ_LIST_INIT(elem);
+ struct ulist *qgroups = NULL;
+ struct ulist *tmp = NULL;
u64 seq;
- int old_roots = 0;
- int new_roots = 0;
+ u64 nr_new_roots = 0;
+ u64 nr_old_roots = 0;
int ret = 0;
- if (oper->elem.seq) {
- ret = check_existing_refs(trans, fs_info, oper);
- if (ret < 0)
- return ret;
- if (ret)
- return 0;
- }
+ if (new_roots)
+ nr_new_roots = new_roots->nnodes;
+ if (old_roots)
+ nr_old_roots = old_roots->nnodes;
- qgroups = ulist_alloc(GFP_NOFS);
- if (!qgroups)
- return -ENOMEM;
+ if (!fs_info->quota_enabled)
+ goto out_free;
+ BUG_ON(!fs_info->quota_root);
+ qgroups = ulist_alloc(GFP_NOFS);
+ if (!qgroups) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
tmp = ulist_alloc(GFP_NOFS);
if (!tmp) {
- ulist_free(qgroups);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_free;
}
- btrfs_get_tree_mod_seq(fs_info, &elem);
- ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
- &roots);
- btrfs_put_tree_mod_seq(fs_info, &elem);
- if (ret < 0) {
- ulist_free(qgroups);
- ulist_free(tmp);
- return ret;
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ ret = 0;
+ goto out_free;
+ }
}
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
spin_lock(&fs_info->qgroup_lock);
- qgroup = find_qgroup_rb(fs_info, oper->ref_root);
- if (!qgroup)
- goto out;
seq = fs_info->qgroup_seq;
- /*
- * So roots is the list of all the roots currently pointing at the
- * bytenr, including the ref we are adding if we are adding, or not if
- * we are removing a ref. So we pass in the ref_root to skip that root
- * in our calculations. We set old_refnct and new_refcnt cause who the
- * hell knows what everything looked like before, and it doesn't matter
- * except...
- */
- ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups,
- seq, &old_roots, 0);
- if (ret < 0)
- goto out;
-
- /*
- * Now adjust the refcounts of the qgroups that care about this
- * reference, either the old_count in the case of removal or new_count
- * in the case of an addition.
- */
- ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups,
- seq);
+ /* Update old refcnts using old_roots */
+ ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
+ UPDATE_OLD);
if (ret < 0)
goto out;
- /*
- * ...in the case of removals. If we had a removal before we got around
- * to processing this operation then we need to find that guy and count
- * his references as if they really existed so we don't end up screwing
- * up the exclusive counts. Then whenever we go to process the delete
- * everything will be grand and we can account for whatever exclusive
- * changes need to be made there. We also have to pass in old_roots so
- * we have an accurate count of the roots as it pertains to this
- * operations view of the world.
- */
- ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq,
- &old_roots);
+ /* Update new refcnts using new_roots */
+ ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
+ UPDATE_NEW);
if (ret < 0)
goto out;
- /*
- * We are adding our root, need to adjust up the number of roots,
- * otherwise old_roots is the number of roots we want.
- */
- if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
- new_roots = old_roots + 1;
- } else {
- new_roots = old_roots;
- old_roots++;
- }
- fs_info->qgroup_seq += old_roots + 1;
-
+ qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
+ num_bytes, seq);
/*
- * And now the magic happens, bless Arne for having a pretty elegant
- * solution for this.
+ * Bump qgroup_seq to avoid seq overlap
*/
- qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes,
- qgroups, seq, old_roots, new_roots, 0);
+ fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
out:
spin_unlock(&fs_info->qgroup_lock);
- ulist_free(qgroups);
- ulist_free(roots);
+out_free:
ulist_free(tmp);
+ ulist_free(qgroups);
+ ulist_free(old_roots);
+ ulist_free(new_roots);
return ret;
}
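Note that btrfs_qgroup_account_extent() consumes both ulists on every path, including the quota-disabled early exit, so callers hand over ownership. A minimal caller sketch, with the root resolution elided:

	struct ulist *old_roots = NULL, *new_roots = NULL;
	int ret;

	/* ... fill old_roots/new_roots via btrfs_find_all_roots() ... */

	ret = btrfs_qgroup_account_extent(trans, fs_info, bytenr, num_bytes,
					  old_roots, new_roots);
	/* No ulist_free() here: the callee freed both lists, which is why
	 * btrfs_qgroup_account_extents() below NULLs its own copies right
	 * after this call. */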
-/*
- * Process a reference to a shared subtree. This type of operation is
- * queued during snapshot removal when we encounter extents which are
- * shared between more than one root.
- */
-static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
-{
- struct ulist *roots = NULL;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- struct btrfs_qgroup_list *glist;
- struct ulist *parents;
- int ret = 0;
- int err;
- struct btrfs_qgroup *qg;
- u64 root_obj = 0;
- struct seq_list elem = SEQ_LIST_INIT(elem);
-
- parents = ulist_alloc(GFP_NOFS);
- if (!parents)
- return -ENOMEM;
-
- btrfs_get_tree_mod_seq(fs_info, &elem);
- ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
- elem.seq, &roots);
- btrfs_put_tree_mod_seq(fs_info, &elem);
- if (ret < 0)
- goto out;
-
- if (roots->nnodes != 1)
- goto out;
-
- ULIST_ITER_INIT(&uiter);
- unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */
- /*
- * If we find our ref root then that means all refs
- * this extent has to the root have not yet been
- * deleted. In that case, we do nothing and let the
- * last ref for this bytenr drive our update.
- *
- * This can happen for example if an extent is
- * referenced multiple times in a snapshot (clone,
- * etc). If we are in the middle of snapshot removal,
- * queued updates for such an extent will find the
- * root if we have not yet finished removing the
- * snapshot.
- */
- if (unode->val == oper->ref_root)
- goto out;
-
- root_obj = unode->val;
- BUG_ON(!root_obj);
-
- spin_lock(&fs_info->qgroup_lock);
- qg = find_qgroup_rb(fs_info, root_obj);
- if (!qg)
- goto out_unlock;
-
- qg->excl += oper->num_bytes;
- qg->excl_cmpr += oper->num_bytes;
- qgroup_dirty(fs_info, qg);
-
- /*
- * Adjust counts for parent groups. First we find all
- * parents, then in the 2nd loop we do the adjustment
- * while adding parents of the parents to our ulist.
- */
- list_for_each_entry(glist, &qg->groups, next_group) {
- err = ulist_add(parents, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (err < 0) {
- ret = err;
- goto out_unlock;
- }
- }
-
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(parents, &uiter))) {
- qg = u64_to_ptr(unode->aux);
- qg->excl += oper->num_bytes;
- qg->excl_cmpr += oper->num_bytes;
- qgroup_dirty(fs_info, qg);
-
- /* Add any parents of the parents */
- list_for_each_entry(glist, &qg->groups, next_group) {
- err = ulist_add(parents, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (err < 0) {
- ret = err;
- goto out_unlock;
- }
- }
- }
-
-out_unlock:
- spin_unlock(&fs_info->qgroup_lock);
-
-out:
- ulist_free(roots);
- ulist_free(parents);
- return ret;
-}
-
-/*
- * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
- * from the fs. First, all roots referencing the extent are searched, and
- * then the space is accounted accordingly to the different roots. The
- * accounting algorithm works in 3 steps documented inline.
- */
-static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
+int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
{
+ struct btrfs_qgroup_extent_record *record;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct ulist *new_roots = NULL;
+ struct rb_node *node;
+ u64 qgroup_to_skip;
int ret = 0;
- if (!fs_info->quota_enabled)
- return 0;
-
- BUG_ON(!fs_info->quota_root);
+ delayed_refs = &trans->transaction->delayed_refs;
+ qgroup_to_skip = delayed_refs->qgroup_to_skip;
+ while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
+ record = rb_entry(node, struct btrfs_qgroup_extent_record,
+ node);
- mutex_lock(&fs_info->qgroup_rescan_lock);
- if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
- if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) {
- mutex_unlock(&fs_info->qgroup_rescan_lock);
- return 0;
+ if (!ret) {
+ /*
+ * Use (u64)-1 as time_seq to do a special search, which
+ * doesn't lock the tree or delayed_refs and searches the
+ * current root. It's safe inside commit_transaction().
+ */
+ ret = btrfs_find_all_roots(trans, fs_info,
+ record->bytenr, (u64)-1, &new_roots);
+ if (ret < 0)
+ goto cleanup;
+ if (qgroup_to_skip)
+ ulist_del(new_roots, qgroup_to_skip, 0);
+ ret = btrfs_qgroup_account_extent(trans, fs_info,
+ record->bytenr, record->num_bytes,
+ record->old_roots, new_roots);
+ record->old_roots = NULL;
+ new_roots = NULL;
}
- }
- mutex_unlock(&fs_info->qgroup_rescan_lock);
-
- ASSERT(is_fstree(oper->ref_root));
-
- trace_btrfs_qgroup_account(oper);
-
- switch (oper->type) {
- case BTRFS_QGROUP_OPER_ADD_EXCL:
- case BTRFS_QGROUP_OPER_SUB_EXCL:
- ret = qgroup_excl_accounting(fs_info, oper);
- break;
- case BTRFS_QGROUP_OPER_ADD_SHARED:
- case BTRFS_QGROUP_OPER_SUB_SHARED:
- ret = qgroup_shared_accounting(trans, fs_info, oper);
- break;
- case BTRFS_QGROUP_OPER_SUB_SUBTREE:
- ret = qgroup_subtree_accounting(trans, fs_info, oper);
- break;
- default:
- ASSERT(0);
- }
- return ret;
-}
-
-/*
- * Needs to be called everytime we run delayed refs, even if there is an error
- * in order to cleanup outstanding operations.
- */
-int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
-{
- struct btrfs_qgroup_operation *oper;
- int ret = 0;
+cleanup:
+ ulist_free(record->old_roots);
+ ulist_free(new_roots);
+ new_roots = NULL;
+ rb_erase(node, &delayed_refs->dirty_extent_root);
+ kfree(record);
- while (!list_empty(&trans->qgroup_ref_list)) {
- oper = list_first_entry(&trans->qgroup_ref_list,
- struct btrfs_qgroup_operation, list);
- list_del_init(&oper->list);
- if (!ret || !trans->aborted)
- ret = btrfs_qgroup_account(trans, fs_info, oper);
- spin_lock(&fs_info->qgroup_op_lock);
- rb_erase(&oper->n, &fs_info->qgroup_op_tree);
- spin_unlock(&fs_info->qgroup_op_lock);
- btrfs_put_tree_mod_seq(fs_info, &oper->elem);
- kfree(oper);
}
return ret;
}
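Pairing the two entry points: prepare resolves each record's old_roots against the last committed state (note the NULL trans handle and seq 0 above), while account resolves new_roots against the current state via the special (u64)-1 seq. A minimal sketch of how the commit path is expected to chain them, with everything in between elided:

	ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
	if (ret < 0)
		goto out;

	/* ... remaining commit work that settles the delayed refs ... */

	ret = btrfs_qgroup_account_extents(trans, fs_info);
	if (ret < 0)
		goto out;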
@@ -2637,15 +2188,13 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
*/
static int
qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
- struct btrfs_trans_handle *trans, struct ulist *qgroups,
- struct ulist *tmp, struct extent_buffer *scratch_leaf)
+ struct btrfs_trans_handle *trans,
+ struct extent_buffer *scratch_leaf)
{
struct btrfs_key found;
struct ulist *roots = NULL;
struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
u64 num_bytes;
- u64 seq;
- int new_roots;
int slot;
int ret;
@@ -2695,33 +2244,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
else
num_bytes = found.offset;
- ulist_reinit(qgroups);
ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
&roots);
if (ret < 0)
goto out;
- spin_lock(&fs_info->qgroup_lock);
- seq = fs_info->qgroup_seq;
- fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
-
- new_roots = 0;
- ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups,
- seq, &new_roots, 1);
- if (ret < 0) {
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(roots);
- goto out;
- }
-
- ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups,
- seq, 0, new_roots, 1);
- if (ret < 0) {
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(roots);
+ /* For rescan, just pass old_roots as NULL */
+ ret = btrfs_qgroup_account_extent(trans, fs_info,
+ found.objectid, num_bytes, NULL, roots);
+ if (ret < 0)
goto out;
- }
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(roots);
}
out:
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
@@ -2735,7 +2266,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
qgroup_rescan_work);
struct btrfs_path *path;
struct btrfs_trans_handle *trans = NULL;
- struct ulist *tmp = NULL, *qgroups = NULL;
struct extent_buffer *scratch_leaf = NULL;
int err = -ENOMEM;
int ret = 0;
@@ -2743,12 +2273,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
path = btrfs_alloc_path();
if (!path)
goto out;
- qgroups = ulist_alloc(GFP_NOFS);
- if (!qgroups)
- goto out;
- tmp = ulist_alloc(GFP_NOFS);
- if (!tmp)
- goto out;
scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS);
if (!scratch_leaf)
goto out;
@@ -2764,7 +2288,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
err = -EINTR;
} else {
err = qgroup_rescan_leaf(fs_info, path, trans,
- qgroups, tmp, scratch_leaf);
+ scratch_leaf);
}
if (err > 0)
btrfs_commit_transaction(trans, fs_info->fs_root);
@@ -2774,8 +2298,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
out:
kfree(scratch_leaf);
- ulist_free(qgroups);
- ulist_free(tmp);
btrfs_free_path(path);
mutex_lock(&fs_info->qgroup_rescan_lock);
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index c5242aa9a4b2..6387dcfa354c 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -19,43 +19,18 @@
#ifndef __BTRFS_QGROUP__
#define __BTRFS_QGROUP__
+#include "ulist.h"
+#include "delayed-ref.h"
+
/*
- * A description of the operations, all of these operations only happen when we
- * are adding the 1st reference for that subvolume in the case of adding space
- * or on the last reference delete in the case of subtraction. The only
- * exception is the last one, which is added for confusion.
- *
- * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only
- * one pointing at the bytes we are adding. This is called on the first
- * allocation.
- *
- * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be
- * shared between subvols. This is called on the creation of a ref that already
- * has refs from a different subvolume, so basically reflink.
- *
- * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only
- * one referencing the range.
- *
- * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with
- * refs with other subvolumes.
+ * Record a dirty extent, so qgroup accounting can update the quota on it.
+ * TODO: Use kmem cache to alloc it.
*/
-enum btrfs_qgroup_operation_type {
- BTRFS_QGROUP_OPER_ADD_EXCL,
- BTRFS_QGROUP_OPER_ADD_SHARED,
- BTRFS_QGROUP_OPER_SUB_EXCL,
- BTRFS_QGROUP_OPER_SUB_SHARED,
- BTRFS_QGROUP_OPER_SUB_SUBTREE,
-};
-
-struct btrfs_qgroup_operation {
- u64 ref_root;
+struct btrfs_qgroup_extent_record {
+ struct rb_node node;
u64 bytenr;
u64 num_bytes;
- u64 seq;
- enum btrfs_qgroup_operation_type type;
- struct seq_list elem;
- struct rb_node n;
- struct list_head list;
+ struct ulist *old_roots;
};
int btrfs_quota_enable(struct btrfs_trans_handle *trans,
@@ -79,16 +54,18 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
struct btrfs_delayed_extent_op;
-int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 ref_root,
+int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+struct btrfs_qgroup_extent_record
+*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_qgroup_extent_record *record);
+int
+btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes,
- enum btrfs_qgroup_operation_type type,
- int mod_seq);
-int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper);
+ struct ulist *old_roots, struct ulist *new_roots);
+int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index fa72068bd256..fcf7265ca46f 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -61,9 +61,10 @@
#define RBIO_CACHE_SIZE 1024
enum btrfs_rbio_ops {
- BTRFS_RBIO_WRITE = 0,
- BTRFS_RBIO_READ_REBUILD = 1,
- BTRFS_RBIO_PARITY_SCRUB = 2,
+ BTRFS_RBIO_WRITE,
+ BTRFS_RBIO_READ_REBUILD,
+ BTRFS_RBIO_PARITY_SCRUB,
+ BTRFS_RBIO_REBUILD_MISSING,
};
struct btrfs_raid_bio {
@@ -602,6 +603,10 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
cur->operation == BTRFS_RBIO_PARITY_SCRUB)
return 0;
+ if (last->operation == BTRFS_RBIO_REBUILD_MISSING ||
+ cur->operation == BTRFS_RBIO_REBUILD_MISSING)
+ return 0;
+
return 1;
}
@@ -793,7 +798,10 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
if (next->operation == BTRFS_RBIO_READ_REBUILD)
async_read_rebuild(next);
- else if (next->operation == BTRFS_RBIO_WRITE) {
+ else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
+ steal_rbio(rbio, next);
+ async_read_rebuild(next);
+ } else if (next->operation == BTRFS_RBIO_WRITE) {
steal_rbio(rbio, next);
async_rmw_stripe(next);
} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
@@ -851,7 +859,7 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio)
* this frees the rbio and runs through all the bios in the
* bio_list and calls end_io on them
*/
-static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
+static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err)
{
struct bio *cur = bio_list_get(&rbio->bio_list);
struct bio *next;
@@ -864,9 +872,8 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
while (cur) {
next = cur->bi_next;
cur->bi_next = NULL;
- if (uptodate)
- set_bit(BIO_UPTODATE, &cur->bi_flags);
- bio_endio(cur, err);
+ cur->bi_error = err;
+ bio_endio(cur);
cur = next;
}
}
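Throughout this file the end_io callbacks lose their err argument because the block layer now reports completion status in bio->bi_error. A minimal callback under the new convention; the name is illustrative and the raid56 helpers are borrowed from this file:

	static void example_end_io(struct bio *bio)
	{
		struct btrfs_raid_bio *rbio = bio->bi_private;

		/* Status is read from the bio itself; there is no err
		 * argument and no BIO_UPTODATE flag to maintain. */
		if (bio->bi_error)
			fail_bio_stripe(rbio, bio);
		else
			set_bio_pages_uptodate(bio);

		bio_put(bio);
	}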
@@ -875,9 +882,10 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
* end io function used by finish_rmw. When we finally
* get here, we've written a full stripe
*/
-static void raid_write_end_io(struct bio *bio, int err)
+static void raid_write_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
+ int err = bio->bi_error;
if (err)
fail_bio_stripe(rbio, bio);
@@ -893,7 +901,7 @@ static void raid_write_end_io(struct bio *bio, int err)
if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
err = -EIO;
- rbio_orig_end_io(rbio, err, 0);
+ rbio_orig_end_io(rbio, err);
return;
}
@@ -1071,7 +1079,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
* devices or if they are not contiguous
*/
if (last_end == disk_start && stripe->dev->bdev &&
- test_bit(BIO_UPTODATE, &last->bi_flags) &&
+ !last->bi_error &&
last->bi_bdev == stripe->dev->bdev) {
ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
if (ret == PAGE_CACHE_SIZE)
@@ -1087,7 +1095,6 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
bio->bi_iter.bi_size = 0;
bio->bi_bdev = stripe->dev->bdev;
bio->bi_iter.bi_sector = disk_start >> 9;
- set_bit(BIO_UPTODATE, &bio->bi_flags);
bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
bio_list_add(bio_list, bio);
@@ -1312,13 +1319,12 @@ write_data:
bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
- BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
submit_bio(WRITE, bio);
}
return;
cleanup:
- rbio_orig_end_io(rbio, -EIO, 0);
+ rbio_orig_end_io(rbio, -EIO);
}
/*
@@ -1441,11 +1447,11 @@ static void set_bio_pages_uptodate(struct bio *bio)
* This will usually kick off finish_rmw once all the bios are read in, but it
* may trigger parity reconstruction if we had any errors along the way
*/
-static void raid_rmw_end_io(struct bio *bio, int err)
+static void raid_rmw_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
- if (err)
+ if (bio->bi_error)
fail_bio_stripe(rbio, bio);
else
set_bio_pages_uptodate(bio);
@@ -1455,7 +1461,6 @@ static void raid_rmw_end_io(struct bio *bio, int err)
if (!atomic_dec_and_test(&rbio->stripes_pending))
return;
- err = 0;
if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
goto cleanup;
@@ -1469,7 +1474,7 @@ static void raid_rmw_end_io(struct bio *bio, int err)
cleanup:
- rbio_orig_end_io(rbio, -EIO, 0);
+ rbio_orig_end_io(rbio, -EIO);
}
static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
@@ -1572,14 +1577,13 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
btrfs_bio_wq_end_io(rbio->fs_info, bio,
BTRFS_WQ_ENDIO_RAID56);
- BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
submit_bio(READ, bio);
}
/* the actual write will happen once the reads are done */
return 0;
cleanup:
- rbio_orig_end_io(rbio, -EIO, 0);
+ rbio_orig_end_io(rbio, -EIO);
return -EIO;
finish:
@@ -1809,7 +1813,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
faila = rbio->faila;
failb = rbio->failb;
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
spin_lock_irq(&rbio->bio_list_lock);
set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
spin_unlock_irq(&rbio->bio_list_lock);
@@ -1834,7 +1839,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
* if we're rebuilding a read, we have to use
* pages from the bio list
*/
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
+ if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
(stripe == faila || stripe == failb)) {
page = page_in_rbio(rbio, stripe, pagenr, 0);
} else {
@@ -1943,7 +1949,8 @@ pstripe:
* if we're rebuilding a read, we have to use
* pages from the bio list
*/
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
+ if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
(stripe == faila || stripe == failb)) {
page = page_in_rbio(rbio, stripe, pagenr, 0);
} else {
@@ -1964,7 +1971,9 @@ cleanup_io:
else
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
- rbio_orig_end_io(rbio, err, err == 0);
+ rbio_orig_end_io(rbio, err);
+ } else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
+ rbio_orig_end_io(rbio, err);
} else if (err == 0) {
rbio->faila = -1;
rbio->failb = -1;
@@ -1976,7 +1985,7 @@ cleanup_io:
else
BUG();
} else {
- rbio_orig_end_io(rbio, err, 0);
+ rbio_orig_end_io(rbio, err);
}
}
@@ -1984,7 +1993,7 @@ cleanup_io:
* This is called only for stripes we've read from disk to
* reconstruct the parity.
*/
-static void raid_recover_end_io(struct bio *bio, int err)
+static void raid_recover_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
@@ -1992,7 +2001,7 @@ static void raid_recover_end_io(struct bio *bio, int err)
* we only read stripe pages off the disk, set them
* up to date if there were no errors
*/
- if (err)
+ if (bio->bi_error)
fail_bio_stripe(rbio, bio);
else
set_bio_pages_uptodate(bio);
@@ -2002,7 +2011,7 @@ static void raid_recover_end_io(struct bio *bio, int err)
return;
if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
- rbio_orig_end_io(rbio, -EIO, 0);
+ rbio_orig_end_io(rbio, -EIO);
else
__raid_recover_end_io(rbio);
}
@@ -2094,15 +2103,15 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
btrfs_bio_wq_end_io(rbio->fs_info, bio,
BTRFS_WQ_ENDIO_RAID56);
- BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
submit_bio(READ, bio);
}
out:
return 0;
cleanup:
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
- rbio_orig_end_io(rbio, -EIO, 0);
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
+ rbio_orig_end_io(rbio, -EIO);
return -EIO;
}
@@ -2232,8 +2241,9 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
return rbio;
}
-void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
- struct page *page, u64 logical)
+/* Used for both parity scrub and rebuilding a missing device. */
+void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
+ u64 logical)
{
int stripe_offset;
int index;
@@ -2277,11 +2287,12 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
* end io function used by finish_rmw. When we finally
* get here, we've written a full stripe
*/
-static void raid_write_parity_end_io(struct bio *bio, int err)
+static void raid_write_parity_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
+ int err = bio->bi_error;
- if (err)
+ if (bio->bi_error)
fail_bio_stripe(rbio, bio);
bio_put(bio);
@@ -2294,7 +2305,7 @@ static void raid_write_parity_end_io(struct bio *bio, int err)
if (atomic_read(&rbio->error))
err = -EIO;
- rbio_orig_end_io(rbio, err, 0);
+ rbio_orig_end_io(rbio, err);
}
static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
@@ -2437,7 +2448,7 @@ submit_write:
nr_data = bio_list_size(&bio_list);
if (!nr_data) {
/* Every parity is right */
- rbio_orig_end_io(rbio, 0, 0);
+ rbio_orig_end_io(rbio, 0);
return;
}
@@ -2450,13 +2461,12 @@ submit_write:
bio->bi_private = rbio;
bio->bi_end_io = raid_write_parity_end_io;
- BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
submit_bio(WRITE, bio);
}
return;
cleanup:
- rbio_orig_end_io(rbio, -EIO, 0);
+ rbio_orig_end_io(rbio, -EIO);
}
static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
@@ -2524,7 +2534,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
return;
cleanup:
- rbio_orig_end_io(rbio, -EIO, 0);
+ rbio_orig_end_io(rbio, -EIO);
}
/*
@@ -2535,11 +2545,11 @@ cleanup:
* This will usually kick off finish_rmw once all the bios are read in, but it
* may trigger parity reconstruction if we had any errors along the way
*/
-static void raid56_parity_scrub_end_io(struct bio *bio, int err)
+static void raid56_parity_scrub_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
- if (err)
+ if (bio->bi_error)
fail_bio_stripe(rbio, bio);
else
set_bio_pages_uptodate(bio);
@@ -2632,14 +2642,13 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
btrfs_bio_wq_end_io(rbio->fs_info, bio,
BTRFS_WQ_ENDIO_RAID56);
- BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
submit_bio(READ, bio);
}
/* the actual write will happen once the reads are done */
return;
cleanup:
- rbio_orig_end_io(rbio, -EIO, 0);
+ rbio_orig_end_io(rbio, -EIO);
return;
finish:
@@ -2668,3 +2677,55 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
if (!lock_stripe_add(rbio))
async_scrub_parity(rbio);
}
+
+/* The following code is used for dev replace of a missing RAID 5/6 device. */
+
+struct btrfs_raid_bio *
+raid56_alloc_missing_rbio(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 length)
+{
+ struct btrfs_raid_bio *rbio;
+
+ rbio = alloc_rbio(root, bbio, length);
+ if (IS_ERR(rbio))
+ return NULL;
+
+ rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
+ bio_list_add(&rbio->bio_list, bio);
+ /*
+ * This is a special bio which is used to hold the completion handler
+ * and make the rbio look similar to the other rbio types
+ */
+ ASSERT(!bio->bi_iter.bi_size);
+
+ rbio->faila = find_logical_bio_stripe(rbio, bio);
+ if (rbio->faila == -1) {
+ BUG();
+ kfree(rbio);
+ return NULL;
+ }
+
+ return rbio;
+}
+
+static void missing_raid56_work(struct btrfs_work *work)
+{
+ struct btrfs_raid_bio *rbio;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+ __raid56_parity_recover(rbio);
+}
+
+static void async_missing_raid56(struct btrfs_raid_bio *rbio)
+{
+ btrfs_init_work(&rbio->work, btrfs_rmw_helper,
+ missing_raid56_work, NULL, NULL);
+
+ btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
+}
+
+void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
+{
+ if (!lock_stripe_add(rbio))
+ async_missing_raid56(rbio);
+}
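A sketch of how scrub is expected to drive these two entry points when rebuilding a stripe of a missing RAID5/6 device (compare the truncated scrub_missing_raid56_pages() later in this diff); names are taken from that context and the error paths are simplified:

	rbio = raid56_alloc_missing_rbio(root, bio, bbio, length);
	if (!rbio)
		goto out;

	/* Hand over every page of the block to rebuild; the logical
	 * address places each page within the full stripe. */
	for (i = 0; i < sblock->page_count; i++)
		raid56_add_scrub_pages(rbio, sblock->pagev[i]->page,
				       sblock->pagev[i]->logical);

	/* Kick off recovery; bio's end_io fires when the rebuild finishes. */
	raid56_submit_missing_rbio(rbio);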
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 2b5d7977d83b..8b694699d502 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -48,15 +48,21 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
struct btrfs_bio *bbio, u64 stripe_len);
+void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
+ u64 logical);
+
struct btrfs_raid_bio *
raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
struct btrfs_bio *bbio, u64 stripe_len,
struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors);
-void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
- struct page *page, u64 logical);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
+struct btrfs_raid_bio *
+raid56_alloc_missing_rbio(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 length);
+void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
+
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
#endif
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 0e7beea92b4c..4645cd16d5ba 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -328,6 +328,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
struct btrfs_device *prev_dev;
u32 blocksize;
u64 length;
+ int real_stripes;
int nzones = 0;
int i;
unsigned long index = logical >> PAGE_CACHE_SHIFT;
@@ -369,7 +370,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
goto error;
}
- for (nzones = 0; nzones < bbio->num_stripes; ++nzones) {
+ real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+ for (nzones = 0; nzones < real_stripes; ++nzones) {
struct reada_zone *zone;
dev = bbio->stripes[nzones].dev;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 74b24b01d574..303babeef505 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1847,8 +1847,10 @@ again:
}
eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
- if (!eb || !extent_buffer_uptodate(eb)) {
- ret = (!eb) ? -ENOMEM : -EIO;
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ } else if (!extent_buffer_uptodate(eb)) {
+ ret = -EIO;
free_extent_buffer(eb);
break;
}
@@ -2002,7 +2004,9 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
bytenr = btrfs_node_blockptr(eb, path->slots[i]);
eb = read_tree_block(root, bytenr, ptr_gen);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ return PTR_ERR(eb);
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
}
@@ -2519,8 +2523,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
* counted. return -ENOENT if the block is root of reloc tree.
*/
static noinline_for_stack
-struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
- struct backref_node *node)
+struct btrfs_root *select_one_root(struct backref_node *node)
{
struct backref_node *next;
struct btrfs_root *root;
@@ -2710,7 +2713,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
blocksize = root->nodesize;
generation = btrfs_node_ptr_generation(upper->eb, slot);
eb = read_tree_block(root, bytenr, generation);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ err = PTR_ERR(eb);
+ goto next;
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
err = -EIO;
goto next;
@@ -2873,7 +2879,9 @@ static int get_tree_block_key(struct reloc_control *rc,
BUG_ON(block->key_ready);
eb = read_tree_block(rc->extent_root, block->bytenr,
block->key.offset);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ return PTR_ERR(eb);
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
}
@@ -2903,7 +2911,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
return 0;
BUG_ON(node->processed);
- root = select_one_root(trans, node);
+ root = select_one_root(node);
if (root == ERR_PTR(-ENOENT)) {
update_processed_blocks(rc, node);
goto out;
@@ -3746,8 +3754,7 @@ out:
* helper to find next unprocessed extent
*/
static noinline_for_stack
-int find_next_extent(struct btrfs_trans_handle *trans,
- struct reloc_control *rc, struct btrfs_path *path,
+int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
struct btrfs_key *extent_key)
{
struct btrfs_key key;
@@ -3942,7 +3949,7 @@ restart:
continue;
}
- ret = find_next_extent(trans, rc, path, &key);
+ ret = find_next_extent(rc, path, &key);
if (ret < 0)
err = ret;
if (ret != 0)
@@ -3967,6 +3974,10 @@ restart:
sizeof(struct btrfs_extent_item_v0));
ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
&path_change);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
else
@@ -4040,7 +4051,7 @@ restart:
if (trans && progress && err == -ENOSPC) {
ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
rc->block_group->flags);
- if (ret == 0) {
+ if (ret == 1) {
err = 0;
progress = 0;
goto restart;
@@ -4131,7 +4142,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans;
struct btrfs_root *root;
struct btrfs_key key;
- u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
+ u64 objectid;
int err = 0;
root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
@@ -4206,14 +4217,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
BUG_ON(!rc->block_group);
- if (!rc->block_group->ro) {
- ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
- if (ret) {
- err = ret;
- goto out;
- }
- rw = 1;
+ ret = btrfs_inc_block_group_ro(extent_root, rc->block_group);
+ if (ret) {
+ err = ret;
+ goto out;
}
+ rw = 1;
path = btrfs_alloc_path();
if (!path) {
@@ -4285,7 +4294,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
out:
if (err && rw)
- btrfs_set_block_group_rw(extent_root, rc->block_group);
+ btrfs_dec_block_group_ro(extent_root, rc->block_group);
iput(rc->data_inode);
btrfs_put_block_group(rc->block_group);
kfree(rc);
@@ -4585,8 +4594,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
 * called before creating a snapshot. it calculates the metadata reservation
 * required for relocating tree blocks in the snapshot
*/
-void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending,
+void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve)
{
struct btrfs_root *root;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ab5811545a98..9a11db0c47ee 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -125,6 +125,7 @@ struct scrub_block {
/* It is for the data with checksum */
unsigned int data_corrected:1;
};
+ struct btrfs_work work;
};
/* Used for the chunks with parity stripe such RAID5/6 */
@@ -278,7 +279,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u8 *csum, int force,
u64 physical_for_dev_replace);
-static void scrub_bio_end_io(struct bio *bio, int err);
+static void scrub_bio_end_io(struct bio *bio);
static void scrub_bio_end_io_worker(struct btrfs_work *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
@@ -295,7 +296,7 @@ static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_page *spage);
static void scrub_wr_submit(struct scrub_ctx *sctx);
-static void scrub_wr_bio_end_io(struct bio *bio, int err);
+static void scrub_wr_bio_end_io(struct bio *bio);
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
static int write_page_nocow(struct scrub_ctx *sctx,
u64 physical_for_dev_replace, struct page *page);
@@ -332,11 +333,14 @@ static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
}
}
-static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+static void scrub_pause_on(struct btrfs_fs_info *fs_info)
{
atomic_inc(&fs_info->scrubs_paused);
wake_up(&fs_info->scrub_pause_wait);
+}
+static void scrub_pause_off(struct btrfs_fs_info *fs_info)
+{
mutex_lock(&fs_info->scrub_lock);
__scrub_blocked_if_needed(fs_info);
atomic_dec(&fs_info->scrubs_paused);
@@ -345,6 +349,12 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
wake_up(&fs_info->scrub_pause_wait);
}
+static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+{
+ scrub_pause_on(fs_info);
+ scrub_pause_off(fs_info);
+}
+
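Splitting the pause helper lets a caller stay counted as paused across an entire blocking section instead of a single check point. A hedged sketch of the intended pattern; do_blocking_work() is a hypothetical placeholder:

	scrub_pause_on(fs_info);

	/* Work during which a transaction commit must not wait on
	 * this scrub thread (committing, waiting for bios, ...). */
	do_blocking_work();

	/* Re-runs the blocked-if-needed check and drops the count. */
	scrub_pause_off(fs_info);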
/*
* used for workers that require transaction commits (i.e., for the
* NOCOW case)
@@ -454,27 +464,14 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
struct scrub_ctx *sctx;
int i;
struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
- int pages_per_rd_bio;
int ret;
- /*
- * the setting of pages_per_rd_bio is correct for scrub but might
- * be wrong for the dev_replace code where we might read from
- * different devices in the initial huge bios. However, that
- * code is able to correctly handle the case when adding a page
- * to a bio fails.
- */
- if (dev->bdev)
- pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
- bio_get_nr_vecs(dev->bdev));
- else
- pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
if (!sctx)
goto nomem;
atomic_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
- sctx->pages_per_rd_bio = pages_per_rd_bio;
+ sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
sctx->curr = -1;
sctx->dev_root = dev->dev_root;
for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
@@ -1429,11 +1426,11 @@ struct scrub_bio_ret {
int error;
};
-static void scrub_bio_wait_endio(struct bio *bio, int error)
+static void scrub_bio_wait_endio(struct bio *bio)
{
struct scrub_bio_ret *ret = bio->bi_private;
- ret->error = error;
+ ret->error = bio->bi_error;
complete(&ret->event);
}
@@ -1790,12 +1787,12 @@ static void scrub_wr_submit(struct scrub_ctx *sctx)
btrfsic_submit_bio(WRITE, sbio->bio);
}
-static void scrub_wr_bio_end_io(struct bio *bio, int err)
+static void scrub_wr_bio_end_io(struct bio *bio)
{
struct scrub_bio *sbio = bio->bi_private;
struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
- sbio->err = err;
+ sbio->err = bio->bi_error;
sbio->bio = bio;
btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
@@ -2087,21 +2084,7 @@ static void scrub_submit(struct scrub_ctx *sctx)
sbio = sctx->bios[sctx->curr];
sctx->curr = -1;
scrub_pending_bio_inc(sctx);
-
- if (!sbio->bio->bi_bdev) {
- /*
- * this case should not happen. If btrfs_map_block() is
- * wrong, it could happen for dev-replace operations on
- * missing devices when no mirrors are available, but in
- * this case it should already fail the mount.
- * This case is handled correctly (but _very_ slowly).
- */
- printk_ratelimited(KERN_WARNING
- "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
- bio_endio(sbio->bio, -EIO);
- } else {
- btrfsic_submit_bio(READ, sbio->bio);
- }
+ btrfsic_submit_bio(READ, sbio->bio);
}
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
@@ -2178,6 +2161,134 @@ again:
return 0;
}
+static void scrub_missing_raid56_end_io(struct bio *bio)
+{
+ struct scrub_block *sblock = bio->bi_private;
+ struct btrfs_fs_info *fs_info = sblock->sctx->dev_root->fs_info;
+
+ if (bio->bi_error)
+ sblock->no_io_error_seen = 0;
+
+ btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
+}
+
+static void scrub_missing_raid56_worker(struct btrfs_work *work)
+{
+ struct scrub_block *sblock = container_of(work, struct scrub_block, work);
+ struct scrub_ctx *sctx = sblock->sctx;
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ unsigned int is_metadata;
+ unsigned int have_csum;
+ u8 *csum;
+ u64 generation;
+ u64 logical;
+ struct btrfs_device *dev;
+
+ is_metadata = !(sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA);
+ have_csum = sblock->pagev[0]->have_csum;
+ csum = sblock->pagev[0]->csum;
+ generation = sblock->pagev[0]->generation;
+ logical = sblock->pagev[0]->logical;
+ dev = sblock->pagev[0]->dev;
+
+ if (sblock->no_io_error_seen) {
+ scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
+ have_csum, csum, generation,
+ sctx->csum_size);
+ }
+
+ if (!sblock->no_io_error_seen) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.read_errors++;
+ spin_unlock(&sctx->stat_lock);
+ printk_ratelimited_in_rcu(KERN_ERR
+ "BTRFS: I/O error rebulding logical %llu for dev %s\n",
+ logical, rcu_str_deref(dev->name));
+ } else if (sblock->header_error || sblock->checksum_error) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ printk_ratelimited_in_rcu(KERN_ERR
+ "BTRFS: failed to rebuild valid logical %llu for dev %s\n",
+ logical, rcu_str_deref(dev->name));
+ } else {
+ scrub_write_block_to_dev_replace(sblock);
+ }
+
+ scrub_block_put(sblock);
+
+ if (sctx->is_dev_replace &&
+ atomic_read(&sctx->wr_ctx.flush_all_writes)) {
+ mutex_lock(&sctx->wr_ctx.wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_ctx.wr_lock);
+ }
+
+ scrub_pending_bio_dec(sctx);
+}
+
+static void scrub_missing_raid56_pages(struct scrub_block *sblock)
+{
+ struct scrub_ctx *sctx = sblock->sctx;
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ u64 length = sblock->page_count * PAGE_SIZE;
+ u64 logical = sblock->pagev[0]->logical;
+ struct btrfs_bio *bbio;
+ struct bio *bio;
+ struct btrfs_raid_bio *rbio;
+ int ret;
+ int i;
+
+ ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
+ &bbio, 0, 1);
+ if (ret || !bbio || !bbio->raid_map)
+ goto bbio_out;
+
+ if (WARN_ON(!sctx->is_dev_replace ||
+ !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
+ /*
+ * We shouldn't be scrubbing a missing device. Even for dev
+ * replace, we should only get here for RAID 5/6. We either
+ * managed to mount something with no mirrors remaining or
+ * there's a bug in scrub_remap_extent()/btrfs_map_block().
+ */
+ goto bbio_out;
+ }
+
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
+ if (!bio)
+ goto bbio_out;
+
+ bio->bi_iter.bi_sector = logical >> 9;
+ bio->bi_private = sblock;
+ bio->bi_end_io = scrub_missing_raid56_end_io;
+
+ rbio = raid56_alloc_missing_rbio(sctx->dev_root, bio, bbio, length);
+ if (!rbio)
+ goto rbio_out;
+
+ for (i = 0; i < sblock->page_count; i++) {
+ struct scrub_page *spage = sblock->pagev[i];
+
+ raid56_add_scrub_pages(rbio, spage->page, spage->logical);
+ }
+
+ btrfs_init_work(&sblock->work, btrfs_scrub_helper,
+ scrub_missing_raid56_worker, NULL, NULL);
+ scrub_block_get(sblock);
+ scrub_pending_bio_inc(sctx);
+ raid56_submit_missing_rbio(rbio);
+ return;
+
+rbio_out:
+ bio_put(bio);
+bbio_out:
+ btrfs_put_bbio(bbio);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+}
+
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u8 *csum, int force,
@@ -2241,31 +2352,39 @@ leave_nomem:
}
WARN_ON(sblock->page_count == 0);
- for (index = 0; index < sblock->page_count; index++) {
- struct scrub_page *spage = sblock->pagev[index];
- int ret;
+ if (dev->missing) {
+ /*
+ * This case should only be hit for RAID 5/6 device replace. See
+ * the comment in scrub_missing_raid56_pages() for details.
+ */
+ scrub_missing_raid56_pages(sblock);
+ } else {
+ for (index = 0; index < sblock->page_count; index++) {
+ struct scrub_page *spage = sblock->pagev[index];
+ int ret;
- ret = scrub_add_page_to_rd_bio(sctx, spage);
- if (ret) {
- scrub_block_put(sblock);
- return ret;
+ ret = scrub_add_page_to_rd_bio(sctx, spage);
+ if (ret) {
+ scrub_block_put(sblock);
+ return ret;
+ }
}
- }
- if (force)
- scrub_submit(sctx);
+ if (force)
+ scrub_submit(sctx);
+ }
/* last one frees, either here or in bio completion for last page */
scrub_block_put(sblock);
return 0;
}
-static void scrub_bio_end_io(struct bio *bio, int err)
+static void scrub_bio_end_io(struct bio *bio)
{
struct scrub_bio *sbio = bio->bi_private;
struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
- sbio->err = err;
+ sbio->err = bio->bi_error;
sbio->bio = bio;
btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
@@ -2564,6 +2683,11 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
u8 csum[BTRFS_CSUM_SIZE];
u32 blocksize;
+ if (dev->missing) {
+ scrub_parity_mark_sectors_error(sparity, logical, len);
+ return 0;
+ }
+
if (flags & BTRFS_EXTENT_FLAG_DATA) {
blocksize = sctx->sectorsize;
} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
@@ -2662,18 +2786,30 @@ static void scrub_free_parity(struct scrub_parity *sparity)
kfree(sparity);
}
-static void scrub_parity_bio_endio(struct bio *bio, int error)
+static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
{
- struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
+ struct scrub_parity *sparity = container_of(work, struct scrub_parity,
+ work);
struct scrub_ctx *sctx = sparity->sctx;
- if (error)
+ scrub_free_parity(sparity);
+ scrub_pending_bio_dec(sctx);
+}
+
+static void scrub_parity_bio_endio(struct bio *bio)
+{
+ struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
+
+ if (bio->bi_error)
bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
sparity->nsectors);
- scrub_free_parity(sparity);
- scrub_pending_bio_dec(sctx);
bio_put(bio);
+
+ btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
+ scrub_parity_bio_endio_worker, NULL, NULL);
+ btrfs_queue_work(sparity->sctx->dev_root->fs_info->scrub_parity_workers,
+ &sparity->work);
}
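The reworked parity end_io above only records the error, drops the bio, and queues a work item; freeing the parity context and decrementing the pending-bio count now happen in process context, because end_io callbacks may run in interrupt context where sleeping is not allowed. A generic sketch of that handoff, with hypothetical names around the btrfs_init_work()/btrfs_queue_work() calls used above:

/* Sketch: keep the end_io path lightweight, defer cleanup to a worker. */
static void my_endio(struct bio *bio)
{
	struct my_obj *obj = bio->bi_private;

	if (bio->bi_error)
		my_record_error(obj);		/* bookkeeping only, no sleeping */
	bio_put(bio);
	btrfs_init_work(&obj->work, btrfs_scrubparity_helper,
			my_cleanup_worker, NULL, NULL);
	btrfs_queue_work(my_workers, &obj->work); /* sleepable cleanup here */
}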
static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
@@ -2690,7 +2826,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
sparity->nsectors))
goto out;
- length = sparity->logic_end - sparity->logic_start + 1;
+ length = sparity->logic_end - sparity->logic_start;
ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
sparity->logic_start,
&length, &bbio, 0, 1);
@@ -2713,8 +2849,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
goto rbio_out;
list_for_each_entry(spage, &sparity->spages, list)
- raid56_parity_add_scrub_pages(rbio, spage->page,
- spage->logical);
+ raid56_add_scrub_pages(rbio, spage->page, spage->logical);
scrub_pending_bio_inc(sctx);
raid56_parity_submit_scrub_rbio(rbio);
@@ -2762,6 +2897,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_root *csum_root = fs_info->csum_root;
struct btrfs_extent_item *extent;
+ struct btrfs_bio *bbio = NULL;
u64 flags;
int ret;
int slot;
@@ -2771,6 +2907,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
u64 extent_logical;
u64 extent_physical;
u64 extent_len;
+ u64 mapped_length;
struct btrfs_device *extent_dev;
struct scrub_parity *sparity;
int nsectors;
@@ -2844,6 +2981,10 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
}
btrfs_item_key_to_cpu(l, &key, slot);
+ if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+ key.type != BTRFS_METADATA_ITEM_KEY)
+ goto next;
+
if (key.type == BTRFS_METADATA_ITEM_KEY)
bytes = root->nodesize;
else
@@ -2852,11 +2993,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
if (key.objectid + bytes <= logic_start)
goto next;
- if (key.type != BTRFS_EXTENT_ITEM_KEY &&
- key.type != BTRFS_METADATA_ITEM_KEY)
- goto next;
-
- if (key.objectid > logic_end) {
+ if (key.objectid >= logic_end) {
stop_loop = 1;
break;
}
@@ -2869,11 +3006,12 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
flags = btrfs_extent_flags(l, extent);
generation = btrfs_extent_generation(l, extent);
- if (key.objectid < logic_start &&
- (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
- btrfs_err(fs_info,
- "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
- key.objectid, logic_start);
+ if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
+ (key.objectid < logic_start ||
+ key.objectid + bytes >
+ logic_start + map->stripe_len)) {
+ btrfs_err(fs_info, "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
+ key.objectid, logic_start);
goto next;
}
again:
@@ -2893,10 +3031,21 @@ again:
scrub_parity_mark_sectors_data(sparity, extent_logical,
extent_len);
- scrub_remap_extent(fs_info, extent_logical,
- extent_len, &extent_physical,
- &extent_dev,
- &extent_mirror_num);
+ mapped_length = extent_len;
+ ret = btrfs_map_block(fs_info, READ, extent_logical,
+ &mapped_length, &bbio, 0);
+ if (!ret) {
+ if (!bbio || mapped_length < extent_len)
+ ret = -EIO;
+ }
+ if (ret) {
+ btrfs_put_bbio(bbio);
+ goto out;
+ }
+ extent_physical = bbio->stripes[0].physical;
+ extent_mirror_num = bbio->mirror_num;
+ extent_dev = bbio->stripes[0].dev;
+ btrfs_put_bbio(bbio);
ret = btrfs_lookup_csums_range(csum_root,
extent_logical,
@@ -2911,10 +3060,12 @@ again:
extent_dev, flags,
generation,
extent_mirror_num);
+
+ scrub_free_csums(sctx);
+
if (ret)
goto out;
- scrub_free_csums(sctx);
if (extent_logical + extent_len <
key.objectid + bytes) {
logic_start += map->stripe_len;
@@ -2943,7 +3094,7 @@ next:
out:
if (ret < 0)
scrub_parity_mark_sectors_error(sparity, logic_start,
- logic_end - logic_start + 1);
+ logic_end - logic_start);
scrub_parity_put(sparity);
scrub_submit(sctx);
mutex_lock(&sctx->wr_ctx.wr_lock);
@@ -3092,22 +3243,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
*/
ret = 0;
while (physical < physical_end) {
- /* for raid56, we skip parity stripe */
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- ret = get_raid56_logic_offset(physical, num,
- map, &logical, &stripe_logical);
- logical += base;
- if (ret) {
- stripe_logical += base;
- stripe_end = stripe_logical + increment - 1;
- ret = scrub_raid56_parity(sctx, map, scrub_dev,
- ppath, stripe_logical,
- stripe_end);
- if (ret)
- goto out;
- goto skip;
- }
- }
/*
* canceled?
*/
@@ -3132,6 +3267,24 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
scrub_blocked_if_needed(fs_info);
}
+ /* for raid56, we skip parity stripe */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ret = get_raid56_logic_offset(physical, num, map,
+ &logical,
+ &stripe_logical);
+ logical += base;
+ if (ret) {
+ stripe_logical += base;
+ stripe_end = stripe_logical + increment;
+ ret = scrub_raid56_parity(sctx, map, scrub_dev,
+ ppath, stripe_logical,
+ stripe_end);
+ if (ret)
+ goto out;
+ goto skip;
+ }
+ }
+
if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
key.type = BTRFS_METADATA_ITEM_KEY;
else
@@ -3176,6 +3329,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
}
btrfs_item_key_to_cpu(l, &key, slot);
+ if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+ key.type != BTRFS_METADATA_ITEM_KEY)
+ goto next;
+
if (key.type == BTRFS_METADATA_ITEM_KEY)
bytes = root->nodesize;
else
@@ -3184,10 +3341,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
if (key.objectid + bytes <= logical)
goto next;
- if (key.type != BTRFS_EXTENT_ITEM_KEY &&
- key.type != BTRFS_METADATA_ITEM_KEY)
- goto next;
-
if (key.objectid >= logical + map->stripe_len) {
/* out of this device extent */
if (key.objectid >= logic_end)
@@ -3200,8 +3353,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
flags = btrfs_extent_flags(l, extent);
generation = btrfs_extent_generation(l, extent);
- if (key.objectid < logical &&
- (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
+ if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
+ (key.objectid < logical ||
+ key.objectid + bytes >
+ logical + map->stripe_len)) {
btrfs_err(fs_info,
"scrub: tree block %llu spanning "
"stripes, ignored. logical=%llu",
@@ -3235,9 +3390,11 @@ again:
&extent_dev,
&extent_mirror_num);
- ret = btrfs_lookup_csums_range(csum_root, logical,
- logical + map->stripe_len - 1,
- &sctx->csum_list, 1);
+ ret = btrfs_lookup_csums_range(csum_root,
+ extent_logical,
+ extent_logical +
+ extent_len - 1,
+ &sctx->csum_list, 1);
if (ret)
goto out;
@@ -3245,10 +3402,12 @@ again:
extent_physical, extent_dev, flags,
generation, extent_mirror_num,
extent_logical - logical + physical);
+
+ scrub_free_csums(sctx);
+
if (ret)
goto out;
- scrub_free_csums(sctx);
if (extent_logical + extent_len <
key.objectid + bytes) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
@@ -3266,7 +3425,7 @@ loop:
if (ret && physical < physical_end) {
stripe_logical += base;
stripe_end = stripe_logical +
- increment - 1;
+ increment;
ret = scrub_raid56_parity(sctx,
map, scrub_dev, ppath,
stripe_logical,
@@ -3375,7 +3534,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
u64 chunk_tree;
u64 chunk_objectid;
u64 chunk_offset;
- int ret;
+ int ret = 0;
int slot;
struct extent_buffer *l;
struct btrfs_key key;
@@ -3403,8 +3562,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (path->slots[0] >=
btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
- if (ret)
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ ret = 0;
break;
+ }
+ } else {
+ ret = 0;
}
}
@@ -3446,6 +3611,22 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!cache)
goto skip;
+ /*
+ * We need to call btrfs_inc_block_group_ro() with scrubs_paused raised,
+ * to avoid a deadlock caused by:
+ * btrfs_inc_block_group_ro()
+ * -> btrfs_wait_for_commit()
+ * -> btrfs_commit_transaction()
+ * -> btrfs_scrub_pause()
+ */
+ scrub_pause_on(fs_info);
+ ret = btrfs_inc_block_group_ro(root, cache);
+ scrub_pause_off(fs_info);
+ if (ret) {
+ btrfs_put_block_group(cache);
+ break;
+ }
+
dev_replace->cursor_right = found_key.offset + length;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
@@ -3471,8 +3652,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
- atomic_inc(&fs_info->scrubs_paused);
- wake_up(&fs_info->scrub_pause_wait);
+
+ scrub_pause_on(fs_info);
/*
* must be called before we decrease @scrub_paused.
@@ -3483,11 +3664,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
atomic_read(&sctx->workers_pending) == 0);
atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
- mutex_lock(&fs_info->scrub_lock);
- __scrub_blocked_if_needed(fs_info);
- atomic_dec(&fs_info->scrubs_paused);
- mutex_unlock(&fs_info->scrub_lock);
- wake_up(&fs_info->scrub_pause_wait);
+ scrub_pause_off(fs_info);
+
+ btrfs_dec_block_group_ro(root, cache);
btrfs_put_block_group(cache);
if (ret)
@@ -3511,11 +3690,7 @@ skip:
btrfs_free_path(path);
- /*
- * ret can still be 1 from search_slot or next_leaf,
- * that's not an error
- */
- return ret < 0 ? ret : 0;
+ return ret;
}
static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
@@ -3559,7 +3734,6 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
int is_dev_replace)
{
- int ret = 0;
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
@@ -3572,27 +3746,36 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
fs_info->scrub_workers =
btrfs_alloc_workqueue("btrfs-scrub", flags,
max_active, 4);
- if (!fs_info->scrub_workers) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!fs_info->scrub_workers)
+ goto fail_scrub_workers;
+
fs_info->scrub_wr_completion_workers =
btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
max_active, 2);
- if (!fs_info->scrub_wr_completion_workers) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!fs_info->scrub_wr_completion_workers)
+ goto fail_scrub_wr_completion_workers;
+
fs_info->scrub_nocow_workers =
btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
- if (!fs_info->scrub_nocow_workers) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!fs_info->scrub_nocow_workers)
+ goto fail_scrub_nocow_workers;
+ fs_info->scrub_parity_workers =
+ btrfs_alloc_workqueue("btrfs-scrubparity", flags,
+ max_active, 2);
+ if (!fs_info->scrub_parity_workers)
+ goto fail_scrub_parity_workers;
}
++fs_info->scrub_workers_refcnt;
-out:
- return ret;
+ return 0;
+
+fail_scrub_parity_workers:
+ btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
+fail_scrub_nocow_workers:
+ btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+fail_scrub_wr_completion_workers:
+ btrfs_destroy_workqueue(fs_info->scrub_workers);
+fail_scrub_workers:
+ return -ENOMEM;
}
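The rewritten scrub_workers_get() above uses the standard kernel unwind idiom: each failure label destroys exactly what was allocated before the failing step, in reverse order, so nothing leaks on a partial setup. A generic sketch of the idiom with hypothetical names:

static int setup_resources(struct my_ctx *ctx)
{
	ctx->a = alloc_a();
	if (!ctx->a)
		goto fail_a;

	ctx->b = alloc_b();
	if (!ctx->b)
		goto fail_b;		/* 'a' is released below */

	return 0;

fail_b:
	free_a(ctx->a);
fail_a:
	return -ENOMEM;
}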
static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
@@ -3601,6 +3784,7 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->scrub_workers);
btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
+ btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
}
WARN_ON(fs_info->scrub_workers_refcnt < 0);
}
@@ -3875,8 +4059,7 @@ static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
return 0;
WARN_ON(!dev->bdev);
- wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
- bio_get_nr_vecs(dev->bdev));
+ wr_ctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
wr_ctx->tgtdev = dev;
atomic_set(&wr_ctx->flush_all_writes, 0);
return 0;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index a1216f9b4917..aa72bfd28f7d 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -243,6 +243,7 @@ struct waiting_dir_move {
* after this directory is moved, we can try to rmdir the ino rmdir_ino.
*/
u64 rmdir_ino;
+ bool orphanized;
};
struct orphan_dir_info {
@@ -1158,6 +1159,9 @@ struct backref_ctx {
/* may be truncated in case it's the last extent in a file */
u64 extent_len;
+ /* data offset in the file extent item */
+ u64 data_offset;
+
/* Just to check for bugs in backref resolving */
int found_itself;
};
@@ -1221,7 +1225,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
if (ret < 0)
return ret;
- if (offset + bctx->extent_len > i_size)
+ if (offset + bctx->data_offset + bctx->extent_len > i_size)
return 0;
/*
@@ -1363,6 +1367,19 @@ static int find_extent_clone(struct send_ctx *sctx,
backref_ctx->cur_offset = data_offset;
backref_ctx->found_itself = 0;
backref_ctx->extent_len = num_bytes;
+ /*
+ * For non-compressed extents iterate_extent_inodes() gives us extent
+ * offsets that already take into account the data offset, but not for
+ * compressed extents, since the offset is logical and not relative to
+ * the physical extent locations. We must take this into account to
+ * avoid sending clone offsets that go beyond the source file's size,
+ * which would result in the clone ioctl failing with -EINVAL on the
+ * receiving end.
+ */
+ if (compressed == BTRFS_COMPRESS_NONE)
+ backref_ctx->data_offset = 0;
+ else
+ backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi);
/*
* The last extent of a file may be too large due to page alignment.
@@ -1900,8 +1917,13 @@ static int did_overwrite_ref(struct send_ctx *sctx,
goto out;
}
- /* we know that it is or will be overwritten. check this now */
- if (ow_inode < sctx->send_progress)
+ /*
+ * We know that it is or will be overwritten. Check this now.
+ * The current inode being processed might have been the one that caused
+ * inode 'ino' to be orphanized, therefore ow_inode can actually be the
+ * same as sctx->send_progress.
+ */
+ if (ow_inode <= sctx->send_progress)
ret = 1;
else
ret = 0;
@@ -2223,6 +2245,8 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
fs_path_reset(dest);
while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
+ struct waiting_dir_move *wdm;
+
fs_path_reset(name);
if (is_waiting_for_rm(sctx, ino)) {
@@ -2233,7 +2257,11 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
break;
}
- if (is_waiting_for_move(sctx, ino)) {
+ wdm = get_waiting_dir_move(sctx, ino);
+ if (wdm && wdm->orphanized) {
+ ret = gen_unique_name(sctx, ino, gen, name);
+ stop = 1;
+ } else if (wdm) {
ret = get_first_ref(sctx->parent_root, ino,
&parent_inode, &parent_gen, name);
} else {
@@ -2328,8 +2356,12 @@ static int send_subvol_begin(struct send_ctx *sctx)
TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
le64_to_cpu(sctx->send_root->root_item.ctransid));
if (parent_root) {
- TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
- sctx->parent_root->root_item.uuid);
+ if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid))
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ parent_root->root_item.received_uuid);
+ else
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ parent_root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
le64_to_cpu(sctx->parent_root->root_item.ctransid));
}
@@ -2923,7 +2955,7 @@ static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
return entry != NULL;
}
-static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
{
struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
struct rb_node *parent = NULL;
@@ -2934,6 +2966,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
return -ENOMEM;
dm->ino = ino;
dm->rmdir_ino = 0;
+ dm->orphanized = orphanized;
while (*p) {
parent = *p;
@@ -3030,7 +3063,7 @@ static int add_pending_dir_move(struct send_ctx *sctx,
goto out;
}
- ret = add_waiting_dir_move(sctx, pm->ino);
+ ret = add_waiting_dir_move(sctx, pm->ino, is_orphan);
if (ret)
goto out;
@@ -3353,8 +3386,40 @@ out:
return ret;
}
+/*
+ * Check if inode ino1 is an ancestor of inode ino2 in the given root.
+ * Return 1 if true, 0 if false and < 0 on error.
+ */
+static int is_ancestor(struct btrfs_root *root,
+ const u64 ino1,
+ const u64 ino1_gen,
+ const u64 ino2,
+ struct fs_path *fs_path)
+{
+ u64 ino = ino2;
+
+ while (ino > BTRFS_FIRST_FREE_OBJECTID) {
+ int ret;
+ u64 parent;
+ u64 parent_gen;
+
+ fs_path_reset(fs_path);
+ ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path);
+ if (ret < 0) {
+ if (ret == -ENOENT && ino == ino2)
+ ret = 0;
+ return ret;
+ }
+ if (parent == ino1)
+ return parent_gen == ino1_gen ? 1 : 0;
+ ino = parent;
+ }
+ return 0;
+}
+
static int wait_for_parent_move(struct send_ctx *sctx,
- struct recorded_ref *parent_ref)
+ struct recorded_ref *parent_ref,
+ const bool is_orphan)
{
int ret = 0;
u64 ino = parent_ref->dir;
@@ -3374,11 +3439,24 @@ static int wait_for_parent_move(struct send_ctx *sctx,
* Our current directory inode may not yet be renamed/moved because some
* ancestor (immediate or not) has to be renamed/moved first. So find if
* such ancestor exists and make sure our own rename/move happens after
- * that ancestor is processed.
+ * that ancestor is processed to avoid path build infinite loops (done
+ * at get_cur_path()).
*/
while (ino > BTRFS_FIRST_FREE_OBJECTID) {
if (is_waiting_for_move(sctx, ino)) {
- ret = 1;
+ /*
+ * If the current inode is an ancestor of ino in the
+ * parent root, we need to delay the rename of the
+ * current inode, otherwise don't delayed the rename
+ * because we can end up with a circular dependency
+ * of renames, resulting in some directories never
+ * getting the respective rename operations issued in
+ * the send stream or getting into infinite path build
+ * loops.
+ */
+ ret = is_ancestor(sctx->parent_root,
+ sctx->cur_ino, sctx->cur_inode_gen,
+ ino, path_before);
break;
}
@@ -3420,7 +3498,7 @@ out:
ino,
&sctx->new_refs,
&sctx->deleted_refs,
- false);
+ is_orphan);
if (!ret)
ret = 1;
}
@@ -3589,6 +3667,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
}
}
+ if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root &&
+ can_rename) {
+ ret = wait_for_parent_move(sctx, cur, is_orphan);
+ if (ret < 0)
+ goto out;
+ if (ret == 1) {
+ can_rename = false;
+ *pending_move = 1;
+ }
+ }
+
/*
* link/move the ref to the new place. If we have an orphan
* inode, move it and update valid_path. If not, link or move
@@ -3609,18 +3698,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
* dirs, we always have one new and one deleted
* ref. The deleted ref is ignored later.
*/
- ret = wait_for_parent_move(sctx, cur);
- if (ret < 0)
- goto out;
- if (ret) {
- *pending_move = 1;
- } else {
- ret = send_rename(sctx, valid_path,
- cur->full_path);
- if (!ret)
- ret = fs_path_copy(valid_path,
- cur->full_path);
- }
+ ret = send_rename(sctx, valid_path,
+ cur->full_path);
+ if (!ret)
+ ret = fs_path_copy(valid_path,
+ cur->full_path);
if (ret < 0)
goto out;
} else {
@@ -4508,8 +4590,21 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
if (ret < 0)
goto out;
- TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
- clone_root->root->root_item.uuid);
+ /*
+ * If the parent we're using has a received_uuid set then use that as
+ * our clone source as that is what we will look for when doing a
+ * receive.
+ *
+ * This covers the case that we create a snapshot off of a received
+ * subvolume and then use that as the parent and try to receive on a
+ * different host.
+ */
+ if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid))
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ clone_root->root->root_item.received_uuid);
+ else
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ clone_root->root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
le64_to_cpu(clone_root->root->root_item.ctransid));
TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9e66f5e724db..2b07b3581781 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -69,7 +69,7 @@ static struct file_system_type btrfs_fs_type;
static int btrfs_remount(struct super_block *sb, int *flags, char *data);
-static const char *btrfs_decode_error(int errno)
+const char *btrfs_decode_error(int errno)
{
char *errstr = "unknown";
@@ -135,6 +135,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
* __btrfs_std_error decodes expected errors from the caller and
* invokes the appropriate error response.
*/
+__cold
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
@@ -247,18 +248,11 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
* We'll complete the cleanup in btrfs_end_transaction and
* btrfs_commit_transaction.
*/
+__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *function,
unsigned int line, int errno)
{
- /*
- * Report first abort since mount
- */
- if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,
- &root->fs_info->fs_state)) {
- WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n",
- errno);
- }
trans->aborted = errno;
/* Nothing used. The other threads that have joined this
* transaction may be able to continue. */
@@ -281,6 +275,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
* __btrfs_panic decodes unexpected, fatal errors from the caller,
* issues an alert, and either panics or BUGs, depending on mount options.
*/
+__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
@@ -841,33 +836,153 @@ out:
return error;
}
-static struct dentry *get_default_root(struct super_block *sb,
- u64 subvol_objectid)
+static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
+ u64 subvol_objectid)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root = fs_info->tree_root;
- struct btrfs_root *new_root;
- struct btrfs_dir_item *di;
- struct btrfs_path *path;
- struct btrfs_key location;
- struct inode *inode;
- u64 dir_id;
- int new = 0;
+ struct btrfs_root *fs_root;
+ struct btrfs_root_ref *root_ref;
+ struct btrfs_inode_ref *inode_ref;
+ struct btrfs_key key;
+ struct btrfs_path *path = NULL;
+ char *name = NULL, *ptr;
+ u64 dirid;
+ int len;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ path->leave_spinning = 1;
+
+ name = kmalloc(PATH_MAX, GFP_NOFS);
+ if (!name) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ ptr = name + PATH_MAX - 1;
+ ptr[0] = '\0';
/*
- * We have a specific subvol we want to mount, just setup location and
- * go look up the root.
+ * Walk up the subvolume trees in the tree of tree roots by root
+ * backrefs until we hit the top-level subvolume.
*/
- if (subvol_objectid) {
- location.objectid = subvol_objectid;
- location.type = BTRFS_ROOT_ITEM_KEY;
- location.offset = (u64)-1;
- goto find_root;
+ while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
+ key.objectid = subvol_objectid;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto err;
+ } else if (ret > 0) {
+ ret = btrfs_previous_item(root, path, subvol_objectid,
+ BTRFS_ROOT_BACKREF_KEY);
+ if (ret < 0) {
+ goto err;
+ } else if (ret > 0) {
+ ret = -ENOENT;
+ goto err;
+ }
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ subvol_objectid = key.offset;
+
+ root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_root_ref);
+ len = btrfs_root_ref_name_len(path->nodes[0], root_ref);
+ ptr -= len + 1;
+ if (ptr < name) {
+ ret = -ENAMETOOLONG;
+ goto err;
+ }
+ read_extent_buffer(path->nodes[0], ptr + 1,
+ (unsigned long)(root_ref + 1), len);
+ ptr[0] = '/';
+ dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref);
+ btrfs_release_path(path);
+
+ key.objectid = subvol_objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ fs_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(fs_root)) {
+ ret = PTR_ERR(fs_root);
+ goto err;
+ }
+
+ /*
+ * Walk up the filesystem tree by inode refs until we hit the
+ * root directory.
+ */
+ while (dirid != BTRFS_FIRST_FREE_OBJECTID) {
+ key.objectid = dirid;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto err;
+ } else if (ret > 0) {
+ ret = btrfs_previous_item(fs_root, path, dirid,
+ BTRFS_INODE_REF_KEY);
+ if (ret < 0) {
+ goto err;
+ } else if (ret > 0) {
+ ret = -ENOENT;
+ goto err;
+ }
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ dirid = key.offset;
+
+ inode_ref = btrfs_item_ptr(path->nodes[0],
+ path->slots[0],
+ struct btrfs_inode_ref);
+ len = btrfs_inode_ref_name_len(path->nodes[0],
+ inode_ref);
+ ptr -= len + 1;
+ if (ptr < name) {
+ ret = -ENAMETOOLONG;
+ goto err;
+ }
+ read_extent_buffer(path->nodes[0], ptr + 1,
+ (unsigned long)(inode_ref + 1), len);
+ ptr[0] = '/';
+ btrfs_release_path(path);
+ }
+ }
+
+ btrfs_free_path(path);
+ if (ptr == name + PATH_MAX - 1) {
+ name[0] = '/';
+ name[1] = '\0';
+ } else {
+ memmove(name, ptr, name + PATH_MAX - ptr);
}
+ return name;
+
+err:
+ btrfs_free_path(path);
+ kfree(name);
+ return ERR_PTR(ret);
+}
+
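A worked example of the right-to-left construction in get_subvol_name_from_objectid(), assuming a hypothetical subvolume "daily" whose root backref names the directory "snaps" in the top-level subvolume:

/*
 * ptr starts at the last byte of the PATH_MAX buffer, and every
 * component is prepended, so nothing already written ever moves:
 *
 *   initially:            [...................\0]
 *   after root backref:   [............/daily\0]
 *   after inode ref walk: [....../snaps/daily\0]
 *   final memmove:        [/snaps/daily\0.......]
 */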
+static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid)
+{
+ struct btrfs_root *root = fs_info->tree_root;
+ struct btrfs_dir_item *di;
+ struct btrfs_path *path;
+ struct btrfs_key location;
+ u64 dir_id;
path = btrfs_alloc_path();
if (!path)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
path->leave_spinning = 1;
/*
@@ -879,58 +994,23 @@ static struct dentry *get_default_root(struct super_block *sb,
di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
if (IS_ERR(di)) {
btrfs_free_path(path);
- return ERR_CAST(di);
+ return PTR_ERR(di);
}
if (!di) {
/*
* Ok the default dir item isn't there. This is weird since
* it's always been there, but don't freak out, just try and
- * mount to root most subvolume.
+ * mount the top-level subvolume.
*/
btrfs_free_path(path);
- dir_id = BTRFS_FIRST_FREE_OBJECTID;
- new_root = fs_info->fs_root;
- goto setup_root;
+ *objectid = BTRFS_FS_TREE_OBJECTID;
+ return 0;
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
btrfs_free_path(path);
-
-find_root:
- new_root = btrfs_read_fs_root_no_name(fs_info, &location);
- if (IS_ERR(new_root))
- return ERR_CAST(new_root);
-
- if (!(sb->s_flags & MS_RDONLY)) {
- int ret;
- down_read(&fs_info->cleanup_work_sem);
- ret = btrfs_orphan_cleanup(new_root);
- up_read(&fs_info->cleanup_work_sem);
- if (ret)
- return ERR_PTR(ret);
- }
-
- dir_id = btrfs_root_dirid(&new_root->root_item);
-setup_root:
- location.objectid = dir_id;
- location.type = BTRFS_INODE_ITEM_KEY;
- location.offset = 0;
-
- inode = btrfs_iget(sb, &location, new_root, &new);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
-
- /*
- * If we're just mounting the root most subvol put the inode and return
- * a reference to the dentry. We will have already gotten a reference
- * to the inode in btrfs_fill_super so we're good to go.
- */
- if (!new && d_inode(sb->s_root) == inode) {
- iput(inode);
- return dget(sb->s_root);
- }
-
- return d_obtain_root(inode);
+ *objectid = location.objectid;
+ return 0;
}
static int btrfs_fill_super(struct super_block *sb,
@@ -953,6 +1033,7 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_flags |= MS_POSIXACL;
#endif
sb->s_flags |= MS_I_VERSION;
+ sb->s_iflags |= SB_I_CGROUPWB;
err = open_ctree(sb, fs_devices, (char *)data);
if (err) {
printk(KERN_ERR "BTRFS: open_ctree failed\n");
@@ -1108,6 +1189,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",fatal_errors=panic");
if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
seq_printf(seq, ",commit=%d", info->commit_interval);
+ seq_printf(seq, ",subvolid=%llu",
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ seq_puts(seq, ",subvol=");
+ seq_dentry(seq, dentry, " \t\n\\");
return 0;
}
@@ -1138,107 +1223,139 @@ static inline int is_subvolume_inode(struct inode *inode)
}
/*
- * This will strip out the subvol=%s argument for an argument string and add
- * subvolid=0 to make sure we get the actual tree root for path walking to the
- * subvol we want.
+ * This will add subvolid=0 to the argument string while removing any subvol=
+ * and subvolid= arguments to make sure we get the top-level root for path
+ * walking to the subvol we want.
*/
static char *setup_root_args(char *args)
{
- unsigned len = strlen(args) + 2 + 1;
- char *src, *dst, *buf;
-
- /*
- * We need the same args as before, but with this substitution:
- * s!subvol=[^,]+!subvolid=0!
- *
- * Since the replacement string is up to 2 bytes longer than the
- * original, allocate strlen(args) + 2 + 1 bytes.
- */
+ char *buf, *dst, *sep;
- src = strstr(args, "subvol=");
- /* This shouldn't happen, but just in case.. */
- if (!src)
- return NULL;
+ if (!args)
+ return kstrdup("subvolid=0", GFP_NOFS);
- buf = dst = kmalloc(len, GFP_NOFS);
+ /* The worst case is that we add ",subvolid=0" to the end. */
+ buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, GFP_NOFS);
if (!buf)
return NULL;
- /*
- * If the subvol= arg is not at the start of the string,
- * copy whatever precedes it into buf.
- */
- if (src != args) {
- *src++ = '\0';
- strcpy(buf, args);
- dst += strlen(args);
+ while (1) {
+ sep = strchrnul(args, ',');
+ if (!strstarts(args, "subvol=") &&
+ !strstarts(args, "subvolid=")) {
+ memcpy(dst, args, sep - args);
+ dst += sep - args;
+ *dst++ = ',';
+ }
+ if (*sep)
+ args = sep + 1;
+ else
+ break;
}
-
strcpy(dst, "subvolid=0");
- dst += strlen("subvolid=0");
-
- /*
- * If there is a "," after the original subvol=... string,
- * copy that suffix into our buffer. Otherwise, we're done.
- */
- src = strchr(src, ',');
- if (src)
- strcpy(dst, src);
return buf;
}
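A hand-traced example of the rewritten option filtering above (the inputs are hypothetical):

/*
 * setup_root_args("subvol=foo,ro,subvolid=5")
 *   -> "subvol=foo" skipped, "ro," copied, "subvolid=5" skipped
 *   -> returns "ro,subvolid=0"
 *
 * setup_root_args(NULL)
 *   -> returns "subvolid=0"
 */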
-static struct dentry *mount_subvol(const char *subvol_name, int flags,
- const char *device_name, char *data)
+static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
+ int flags, const char *device_name,
+ char *data)
{
struct dentry *root;
- struct vfsmount *mnt;
+ struct vfsmount *mnt = NULL;
char *newargs;
+ int ret;
newargs = setup_root_args(data);
- if (!newargs)
- return ERR_PTR(-ENOMEM);
- mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
- newargs);
+ if (!newargs) {
+ root = ERR_PTR(-ENOMEM);
+ goto out;
+ }
- if (PTR_RET(mnt) == -EBUSY) {
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs);
+ if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) {
if (flags & MS_RDONLY) {
- mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name,
- newargs);
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY,
+ device_name, newargs);
} else {
- int r;
- mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name,
- newargs);
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY,
+ device_name, newargs);
if (IS_ERR(mnt)) {
- kfree(newargs);
- return ERR_CAST(mnt);
+ root = ERR_CAST(mnt);
+ mnt = NULL;
+ goto out;
}
- r = btrfs_remount(mnt->mnt_sb, &flags, NULL);
- if (r < 0) {
- /* FIXME: release vfsmount mnt ??*/
- kfree(newargs);
- return ERR_PTR(r);
+ down_write(&mnt->mnt_sb->s_umount);
+ ret = btrfs_remount(mnt->mnt_sb, &flags, NULL);
+ up_write(&mnt->mnt_sb->s_umount);
+ if (ret < 0) {
+ root = ERR_PTR(ret);
+ goto out;
}
}
}
+ if (IS_ERR(mnt)) {
+ root = ERR_CAST(mnt);
+ mnt = NULL;
+ goto out;
+ }
- kfree(newargs);
+ if (!subvol_name) {
+ if (!subvol_objectid) {
+ ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb),
+ &subvol_objectid);
+ if (ret) {
+ root = ERR_PTR(ret);
+ goto out;
+ }
+ }
+ subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb),
+ subvol_objectid);
+ if (IS_ERR(subvol_name)) {
+ root = ERR_CAST(subvol_name);
+ subvol_name = NULL;
+ goto out;
+ }
- if (IS_ERR(mnt))
- return ERR_CAST(mnt);
+ }
root = mount_subtree(mnt, subvol_name);
+ /* mount_subtree() drops our reference on the vfsmount. */
+ mnt = NULL;
- if (!IS_ERR(root) && !is_subvolume_inode(d_inode(root))) {
+ if (!IS_ERR(root)) {
struct super_block *s = root->d_sb;
- dput(root);
- root = ERR_PTR(-EINVAL);
- deactivate_locked_super(s);
- printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n",
- subvol_name);
+ struct inode *root_inode = d_inode(root);
+ u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid;
+
+ ret = 0;
+ if (!is_subvolume_inode(root_inode)) {
+ pr_err("BTRFS: '%s' is not a valid subvolume\n",
+ subvol_name);
+ ret = -EINVAL;
+ }
+ if (subvol_objectid && root_objectid != subvol_objectid) {
+ /*
+ * This will also catch a race condition where a
+ * subvolume which was passed by ID is renamed and
+ * another subvolume is renamed over the old location.
+ */
+ pr_err("BTRFS: subvol '%s' does not match subvolid %llu\n",
+ subvol_name, subvol_objectid);
+ ret = -EINVAL;
+ }
+ if (ret) {
+ dput(root);
+ root = ERR_PTR(ret);
+ deactivate_locked_super(s);
+ }
}
+out:
+ mntput(mnt);
+ kfree(newargs);
+ kfree(subvol_name);
return root;
}
@@ -1303,7 +1420,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
{
struct block_device *bdev = NULL;
struct super_block *s;
- struct dentry *root;
struct btrfs_fs_devices *fs_devices = NULL;
struct btrfs_fs_info *fs_info = NULL;
struct security_mnt_opts new_sec_opts;
@@ -1323,10 +1439,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
return ERR_PTR(error);
}
- if (subvol_name) {
- root = mount_subvol(subvol_name, flags, device_name, data);
- kfree(subvol_name);
- return root;
+ if (subvol_name || subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
+ /* mount_subvol() will free subvol_name. */
+ return mount_subvol(subvol_name, subvol_objectid, flags,
+ device_name, data);
}
security_init_mnt_opts(&new_sec_opts);
@@ -1392,23 +1508,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
error = btrfs_fill_super(s, fs_devices, data,
flags & MS_SILENT ? 1 : 0);
}
-
- root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
- if (IS_ERR(root)) {
+ if (error) {
deactivate_locked_super(s);
- error = PTR_ERR(root);
goto error_sec_opts;
}
fs_info = btrfs_sb(s);
error = setup_security_options(fs_info, s, &new_sec_opts);
if (error) {
- dput(root);
deactivate_locked_super(s);
goto error_sec_opts;
}
- return root;
+ return dget(s->s_root);
error_close_devices:
btrfs_close_devices(fs_devices);
@@ -1539,6 +1651,17 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
sb->s_flags |= MS_RDONLY;
+ /*
+ * Setting MS_RDONLY will put the cleaner thread to
+ * sleep at the next loop if it's already active.
+ * If it's already asleep, we'll leave unused block
+ * groups on disk until we're mounted read-write again
+ * unless we clean them up here.
+ */
+ mutex_lock(&root->fs_info->cleaner_mutex);
+ btrfs_delete_unused_bgs(fs_info);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+
btrfs_dev_replace_suspend_for_unmount(fs_info);
btrfs_scrub_cancel(fs_info);
btrfs_pause_balance(fs_info);
@@ -2052,8 +2175,7 @@ static int btrfs_interface_init(void)
static void btrfs_interface_exit(void)
{
- if (misc_deregister(&btrfs_misc) < 0)
- printk(KERN_INFO "BTRFS: misc_deregister failed for control device\n");
+ misc_deregister(&btrfs_misc);
}
static void btrfs_print_info(void)
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index e8a4c86d274d..603b0cc2b9bb 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -33,6 +33,7 @@
#include "volumes.h"
static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
+static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj);
static u64 get_features(struct btrfs_fs_info *fs_info,
enum btrfs_feature_set set)
@@ -428,7 +429,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
-static struct attribute *btrfs_attrs[] = {
+static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(label),
BTRFS_ATTR_PTR(nodesize),
BTRFS_ATTR_PTR(sectorsize),
@@ -438,21 +439,29 @@ static struct attribute *btrfs_attrs[] = {
static void btrfs_release_super_kobj(struct kobject *kobj)
{
- struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- complete(&fs_info->kobj_unregister);
+ struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj);
+
+ memset(&fs_devs->super_kobj, 0, sizeof(struct kobject));
+ complete(&fs_devs->kobj_unregister);
}
static struct kobj_type btrfs_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
.release = btrfs_release_super_kobj,
- .default_attrs = btrfs_attrs,
};
+static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj)
+{
+ if (kobj->ktype != &btrfs_ktype)
+ return NULL;
+ return container_of(kobj, struct btrfs_fs_devices, super_kobj);
+}
+
static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
{
if (kobj->ktype != &btrfs_ktype)
return NULL;
- return container_of(kobj, struct btrfs_fs_info, super_kobj);
+ return to_fs_devs(kobj)->fs_info;
}
#define NUM_FEATURE_BITS 64
@@ -493,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
attrs[0] = &fa->kobj_attr.attr;
if (add) {
int ret;
- ret = sysfs_merge_group(&fs_info->super_kobj,
+ ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj,
&agroup);
if (ret)
return ret;
} else
- sysfs_unmerge_group(&fs_info->super_kobj,
+ sysfs_unmerge_group(&fs_info->fs_devices->super_kobj,
&agroup);
}
@@ -506,25 +515,49 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
return 0;
}
-static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
+static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
+{
+ if (fs_devs->device_dir_kobj) {
+ kobject_del(fs_devs->device_dir_kobj);
+ kobject_put(fs_devs->device_dir_kobj);
+ fs_devs->device_dir_kobj = NULL;
+ }
+
+ if (fs_devs->super_kobj.state_initialized) {
+ kobject_del(&fs_devs->super_kobj);
+ kobject_put(&fs_devs->super_kobj);
+ wait_for_completion(&fs_devs->kobj_unregister);
+ }
+}
+
+/* when fs_devs is NULL it will remove all fsid kobjects */
+void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
{
- kobject_del(&fs_info->super_kobj);
- kobject_put(&fs_info->super_kobj);
- wait_for_completion(&fs_info->kobj_unregister);
+ struct list_head *fs_uuids = btrfs_get_fs_uuids();
+
+ if (fs_devs) {
+ __btrfs_sysfs_remove_fsid(fs_devs);
+ return;
+ }
+
+ list_for_each_entry(fs_devs, fs_uuids, list) {
+ __btrfs_sysfs_remove_fsid(fs_devs);
+ }
}
void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
{
+ btrfs_reset_fs_info_ptr(fs_info);
+
if (fs_info->space_info_kobj) {
sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs);
kobject_del(fs_info->space_info_kobj);
kobject_put(fs_info->space_info_kobj);
}
- kobject_del(fs_info->device_dir_kobj);
- kobject_put(fs_info->device_dir_kobj);
addrm_unknown_feature_attrs(fs_info, false);
- sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group);
- __btrfs_sysfs_remove_one(fs_info);
+ sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group);
+ sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs);
+ btrfs_kobj_rm_device(fs_info->fs_devices, NULL);
}
const char * const btrfs_feature_set_names[3] = {
@@ -602,40 +635,60 @@ static void init_feature_attrs(void)
}
}
-int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+/* when one_device is NULL, it removes all device links */
+
+int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device)
{
struct hd_struct *disk;
struct kobject *disk_kobj;
- if (!fs_info->device_dir_kobj)
+ if (!fs_devices->device_dir_kobj)
return -EINVAL;
if (one_device && one_device->bdev) {
disk = one_device->bdev->bd_part;
disk_kobj = &part_to_dev(disk)->kobj;
- sysfs_remove_link(fs_info->device_dir_kobj,
+ sysfs_remove_link(fs_devices->device_dir_kobj,
+ disk_kobj->name);
+ }
+
+ if (one_device)
+ return 0;
+
+ list_for_each_entry(one_device,
+ &fs_devices->devices, dev_list) {
+ if (!one_device->bdev)
+ continue;
+ disk = one_device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
+
+ sysfs_remove_link(fs_devices->device_dir_kobj,
disk_kobj->name);
}
return 0;
}
-int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
- struct btrfs_device *one_device)
+int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
{
- int error = 0;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- struct btrfs_device *dev;
-
- if (!fs_info->device_dir_kobj)
- fs_info->device_dir_kobj = kobject_create_and_add("devices",
- &fs_info->super_kobj);
+ if (!fs_devs->device_dir_kobj)
+ fs_devs->device_dir_kobj = kobject_create_and_add("devices",
+ &fs_devs->super_kobj);
- if (!fs_info->device_dir_kobj)
+ if (!fs_devs->device_dir_kobj)
return -ENOMEM;
+ return 0;
+}
+
+int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
+ struct btrfs_device *one_device)
+{
+ int error = 0;
+ struct btrfs_device *dev;
+
list_for_each_entry(dev, &fs_devices->devices, dev_list) {
struct hd_struct *disk;
struct kobject *disk_kobj;
@@ -649,7 +702,7 @@ int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
disk = dev->bdev->bd_part;
disk_kobj = &part_to_dev(disk)->kobj;
- error = sysfs_create_link(fs_info->device_dir_kobj,
+ error = sysfs_create_link(fs_devices->device_dir_kobj,
disk_kobj, disk_kobj->name);
if (error)
break;
@@ -667,34 +720,51 @@ static struct dentry *btrfs_debugfs_root_dentry;
/* Debugging tunables and exported data */
u64 btrfs_debugfs_test;
+/*
+ * Can be called by the device discovery thread.
+ * A parent kobject can be specified for seed devices.
+ */
+int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
+ struct kobject *parent)
+{
+ int error;
+
+ init_completion(&fs_devs->kobj_unregister);
+ fs_devs->super_kobj.kset = btrfs_kset;
+ error = kobject_init_and_add(&fs_devs->super_kobj,
+ &btrfs_ktype, parent, "%pU", fs_devs->fsid);
+ return error;
+}
+
int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
{
int error;
+ struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
+ struct kobject *super_kobj = &fs_devs->super_kobj;
+
+ btrfs_set_fs_info_ptr(fs_info);
- init_completion(&fs_info->kobj_unregister);
- fs_info->super_kobj.kset = btrfs_kset;
- error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL,
- "%pU", fs_info->fsid);
+ error = btrfs_kobj_add_device(fs_devs, NULL);
if (error)
return error;
- error = sysfs_create_group(&fs_info->super_kobj,
- &btrfs_feature_attr_group);
+ error = sysfs_create_files(super_kobj, btrfs_attrs);
if (error) {
- __btrfs_sysfs_remove_one(fs_info);
+ btrfs_kobj_rm_device(fs_devs, NULL);
return error;
}
- error = addrm_unknown_feature_attrs(fs_info, true);
+ error = sysfs_create_group(super_kobj,
+ &btrfs_feature_attr_group);
if (error)
goto failure;
- error = btrfs_kobj_add_device(fs_info, NULL);
+ error = addrm_unknown_feature_attrs(fs_info, true);
if (error)
goto failure;
fs_info->space_info_kobj = kobject_create_and_add("allocation",
- &fs_info->super_kobj);
+ super_kobj);
if (!fs_info->space_info_kobj) {
error = -ENOMEM;
goto failure;
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 3a4bbed723fd..6392527bcc15 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -82,8 +82,12 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
extern const char * const btrfs_feature_set_names[3];
extern struct kobj_type space_info_ktype;
extern struct kobj_type btrfs_raid_ktype;
-int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
-int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
+int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
+ struct kobject *parent);
+int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs);
+void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
#endif /* _BTRFS_SYSFS_H_ */
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index c32a7ba76bca..846d277b1901 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -21,6 +21,7 @@
#include "../transaction.h"
#include "../disk-io.h"
#include "../qgroup.h"
+#include "../backref.h"
static void init_dummy_trans(struct btrfs_trans_handle *trans)
{
@@ -227,6 +228,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
{
struct btrfs_trans_handle trans;
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct ulist *old_roots = NULL;
+ struct ulist *new_roots = NULL;
int ret;
init_dummy_trans(&trans);
@@ -238,10 +241,15 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
return ret;
}
- ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
- BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ /*
+ * Since the test trans doesn't have the complicated delayed refs,
+ * we can only call btrfs_qgroup_account_extent() directly to test
+ * quota.
+ */
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
if (ret) {
- test_msg("Couldn't add space to a qgroup %d\n", ret);
+ ulist_free(old_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
@@ -249,9 +257,18 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
if (ret)
return ret;
- ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
+ if (ret) {
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+ old_roots, new_roots);
if (ret) {
- test_msg("Delayed qgroup accounting failed %d\n", ret);
+ test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
@@ -259,21 +276,32 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
test_msg("Qgroup counts didn't match expected values\n");
return -EINVAL;
}
+ old_roots = NULL;
+ new_roots = NULL;
+
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ if (ret) {
+ ulist_free(old_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
+ }
ret = remove_extent_item(root, 4096, 4096);
if (ret)
return -EINVAL;
- ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
- BTRFS_QGROUP_OPER_SUB_EXCL, 0);
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
if (ret) {
- test_msg("Couldn't remove space from the qgroup %d\n", ret);
- return -EINVAL;
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
}
- ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+ old_roots, new_roots);
if (ret) {
- test_msg("Qgroup accounting failed %d\n", ret);
+ test_msg("Couldn't account space for a qgroup %d\n", ret);
return -EINVAL;
}
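Each converted test case follows the same bracketing pattern: look up the set of referencing roots before the extent-tree change, apply the change, look the roots up again, and feed both sets to the accounting call. A condensed sketch (bytenr/num_bytes stand in for the 4096 constants; the tests free the ulists themselves only on the error paths, so a successful btrfs_qgroup_account_extent() call appears to take ownership of both lists):

	ret = btrfs_find_all_roots(&trans, fs_info, bytenr, 0, &old_roots);
	if (ret)
		goto out_free;	/* error path: free old_roots ourselves */

	/* ... mutate the extent tree: insert or remove a reference ... */

	ret = btrfs_find_all_roots(&trans, fs_info, bytenr, 0, &new_roots);
	if (ret)
		goto out_free;	/* error path: free both ulists ourselves */

	ret = btrfs_qgroup_account_extent(&trans, fs_info, bytenr, num_bytes,
					  old_roots, new_roots);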
@@ -294,6 +322,8 @@ static int test_multiple_refs(struct btrfs_root *root)
{
struct btrfs_trans_handle trans;
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct ulist *old_roots = NULL;
+ struct ulist *new_roots = NULL;
int ret;
init_dummy_trans(&trans);
@@ -307,20 +337,29 @@ static int test_multiple_refs(struct btrfs_root *root)
return ret;
}
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ if (ret) {
+ ulist_free(old_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
+ }
+
ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
if (ret)
return ret;
- ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
- BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
if (ret) {
- test_msg("Couldn't add space to a qgroup %d\n", ret);
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
- ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+ old_roots, new_roots);
if (ret) {
- test_msg("Delayed qgroup accounting failed %d\n", ret);
+ test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
@@ -329,20 +368,29 @@ static int test_multiple_refs(struct btrfs_root *root)
return -EINVAL;
}
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ if (ret) {
+ ulist_free(old_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
+ }
+
ret = add_tree_ref(root, 4096, 4096, 0, 256);
if (ret)
return ret;
- ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
- BTRFS_QGROUP_OPER_ADD_SHARED, 0);
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
if (ret) {
- test_msg("Qgroup record ref failed %d\n", ret);
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
- ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+ old_roots, new_roots);
if (ret) {
- test_msg("Qgroup accounting failed %d\n", ret);
+ test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
@@ -356,20 +404,29 @@ static int test_multiple_refs(struct btrfs_root *root)
return -EINVAL;
}
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ if (ret) {
+ ulist_free(old_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
+ }
+
ret = remove_extent_ref(root, 4096, 4096, 0, 256);
if (ret)
return ret;
- ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
- BTRFS_QGROUP_OPER_SUB_SHARED, 0);
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
if (ret) {
- test_msg("Qgroup record ref failed %d\n", ret);
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
- ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+ old_roots, new_roots);
if (ret) {
- test_msg("Qgroup accounting failed %d\n", ret);
+ test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
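
Each of the reworked test hunks above follows the same accounting pattern: capture the set of roots referencing an extent before the change, apply the change, capture the set again, then hand both sets to btrfs_qgroup_account_extent(). A minimal sketch of that pattern using the signatures shown in this patch (bytenr/num_bytes and the error handling are illustrative; on success the accounting call takes ownership of both ulists, so they are only freed explicitly on the find_all_roots error paths, as the tests do):

	struct ulist *old_roots = NULL;
	struct ulist *new_roots = NULL;
	int ret;

	/* roots referencing the extent before the modification */
	ret = btrfs_find_all_roots(&trans, fs_info, bytenr, 0, &old_roots);
	if (ret) {
		ulist_free(old_roots);
		return ret;
	}

	/* ... add or remove extent references here ... */

	/* roots referencing the extent after the modification */
	ret = btrfs_find_all_roots(&trans, fs_info, bytenr, 0, &new_roots);
	if (ret) {
		ulist_free(old_roots);
		ulist_free(new_roots);
		return ret;
	}

	/* account the difference between the two root sets */
	ret = btrfs_qgroup_account_extent(&trans, fs_info, bytenr,
					  num_bytes, old_roots, new_roots);
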
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5628e25250c0..8f259b3a66b3 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -225,12 +225,14 @@ loop:
cur_trans->dirty_bg_run = 0;
cur_trans->delayed_refs.href_root = RB_ROOT;
+ cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
atomic_set(&cur_trans->delayed_refs.num_entries, 0);
cur_trans->delayed_refs.num_heads_ready = 0;
cur_trans->delayed_refs.pending_csums = 0;
cur_trans->delayed_refs.num_heads = 0;
cur_trans->delayed_refs.flushing = 0;
cur_trans->delayed_refs.run_delayed_start = 0;
+ cur_trans->delayed_refs.qgroup_to_skip = 0;
/*
* although the tree mod log is per file system and not per transaction,
@@ -256,6 +258,8 @@ loop:
mutex_init(&cur_trans->cache_write_mutex);
cur_trans->num_dirty_bgs = 0;
spin_lock_init(&cur_trans->dirty_bgs_lock);
+ INIT_LIST_HEAD(&cur_trans->deleted_bgs);
+ spin_lock_init(&cur_trans->deleted_bgs_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
fs_info->btree_inode->i_mapping);
@@ -509,6 +513,7 @@ again:
h->transaction = cur_trans;
h->blocks_used = 0;
h->bytes_reserved = 0;
+ h->chunk_bytes_reserved = 0;
h->root = root;
h->delayed_ref_updates = 0;
h->use_count = 1;
@@ -758,7 +763,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->ordered)) {
spin_lock(&info->trans_lock);
- list_splice(&trans->ordered, &cur_trans->pending_ordered);
+ list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
spin_unlock(&info->trans_lock);
}
@@ -792,6 +797,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
+ btrfs_trans_release_chunk_metadata(trans);
+
if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
should_end_transaction(trans, root) &&
ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
@@ -1290,7 +1297,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
if (pending->error)
goto no_free_objectid;
- btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
+ /*
+ * Make the qgroup code skip the new snapshot's qgroupid, as it is
+ * accounted for by the later btrfs_qgroup_inherit() call.
+ */
+ btrfs_set_skip_qgroup(trans, objectid);
+
+ btrfs_reloc_pre_snapshot(pending, &to_reserve);
if (to_reserve > 0) {
pending->error = btrfs_block_rsv_add(root,
@@ -1298,7 +1311,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
to_reserve,
BTRFS_RESERVE_NO_FLUSH);
if (pending->error)
- goto no_free_objectid;
+ goto clear_skip_qgroup;
}
key.objectid = objectid;
@@ -1396,25 +1409,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, root, ret);
goto fail;
}
-
- /*
- * We need to flush delayed refs in order to make sure all of our quota
- * operations have been done before we call btrfs_qgroup_inherit.
- */
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto fail;
- }
-
- ret = btrfs_qgroup_inherit(trans, fs_info,
- root->root_key.objectid,
- objectid, pending->inherit);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto fail;
- }
-
/* see comments in should_cow_block() */
set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
smp_wmb();
@@ -1497,11 +1491,37 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
}
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ /*
+ * Account qgroup counters before calling btrfs_qgroup_inherit().
+ */
+ ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
+ if (ret)
+ goto fail;
+ ret = btrfs_qgroup_account_extents(trans, fs_info);
+ if (ret)
+ goto fail;
+ ret = btrfs_qgroup_inherit(trans, fs_info,
+ root->root_key.objectid,
+ objectid, pending->inherit);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
fail:
pending->error = ret;
dir_item_existed:
trans->block_rsv = rsv;
trans->bytes_reserved = 0;
+clear_skip_qgroup:
+ btrfs_clear_skip_qgroup(trans);
no_free_objectid:
kfree(new_root_item);
root_item_alloc_fail:
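
The net effect of the create_pending_snapshot() changes above is a reordering: qgroup accounting is flushed before btrfs_qgroup_inherit() runs, and the new snapshot's qgroupid is masked for the duration. A condensed sketch of the resulting call order, with error handling elided (all calls as they appear in this patch):

	btrfs_set_skip_qgroup(trans, objectid);  /* mask the new snapshot */

	/* ... create the snapshot root and its dir item ... */

	btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	btrfs_qgroup_prepare_account_extents(trans, fs_info);
	btrfs_qgroup_account_extents(trans, fs_info);
	btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid,
			     objectid, pending->inherit);

	btrfs_clear_skip_qgroup(trans);          /* on every exit path */
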
@@ -1620,9 +1640,7 @@ static void do_async_commit(struct work_struct *work)
* Tell lockdep about it.
*/
if (ac->newtrans->type & __TRANS_FREEZABLE)
- rwsem_acquire_read(
- &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
- 0, 1, _THIS_IP_);
+ __sb_writers_acquired(ac->root->fs_info->sb, SB_FREEZE_FS);
current->journal_info = ac->newtrans;
@@ -1661,9 +1679,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
* async commit thread will be the one to unlock it.
*/
if (ac->newtrans->type & __TRANS_FREEZABLE)
- rwsem_release(
- &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
- 1, _THIS_IP_);
+ __sb_writers_release(root->fs_info->sb, SB_FREEZE_FS);
schedule_work(&ac->work);
@@ -1848,7 +1864,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
}
spin_lock(&root->fs_info->trans_lock);
- list_splice(&trans->ordered, &cur_trans->pending_ordered);
+ list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
spin_unlock(&root->fs_info->trans_lock);
atomic_inc(&cur_trans->use_count);
@@ -1875,8 +1891,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
spin_unlock(&root->fs_info->trans_lock);
wait_for_commit(root, prev_trans);
+ ret = prev_trans->aborted;
btrfs_put_transaction(prev_trans);
+ if (ret)
+ goto cleanup_transaction;
} else {
spin_unlock(&root->fs_info->trans_lock);
}
@@ -1963,6 +1982,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
goto scrub_continue;
}
+ /* Record old roots for later qgroup accounting */
+ ret = btrfs_qgroup_prepare_account_extents(trans, root->fs_info);
+ if (ret) {
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
/*
* make sure none of the code above managed to slip in a
* delayed item
@@ -2004,6 +2030,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
*/
btrfs_free_log_root_tree(trans, root->fs_info);
+ /*
+ * Since the fs roots have all been committed, we can get an accurate
+ * new_roots, so do the quota accounting now.
+ */
+ ret = btrfs_qgroup_account_extents(trans, root->fs_info);
+ if (ret < 0) {
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
ret = commit_cowonly_roots(trans, root);
if (ret) {
mutex_unlock(&root->fs_info->tree_log_mutex);
@@ -2054,6 +2091,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+ btrfs_trans_release_chunk_metadata(trans);
+
spin_lock(&root->fs_info->trans_lock);
cur_trans->state = TRANS_STATE_UNBLOCKED;
root->fs_info->running_transaction = NULL;
@@ -2114,7 +2153,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_trans_handle_cachep, trans);
- if (current != root->fs_info->transaction_kthread)
+ if (current != root->fs_info->transaction_kthread &&
+ current != root->fs_info->cleaner_kthread)
btrfs_run_delayed_iputs(root);
return ret;
@@ -2123,6 +2163,7 @@ scrub_continue:
btrfs_scrub_continue(root);
cleanup_transaction:
btrfs_trans_release_metadata(trans, root);
+ btrfs_trans_release_chunk_metadata(trans);
trans->block_rsv = NULL;
if (trans->qgroup_reserved) {
btrfs_qgroup_free(root, trans->qgroup_reserved);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 0b24755596ba..edc2fbc262d7 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -74,6 +74,8 @@ struct btrfs_transaction {
*/
struct mutex cache_write_mutex;
spinlock_t dirty_bgs_lock;
+ struct list_head deleted_bgs;
+ spinlock_t deleted_bgs_lock;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
int dirty_bg_run;
@@ -102,6 +104,7 @@ struct btrfs_transaction {
struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
+ u64 chunk_bytes_reserved;
u64 qgroup_reserved;
unsigned long use_count;
unsigned long blocks_reserved;
@@ -153,6 +156,29 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
spin_unlock(&BTRFS_I(inode)->lock);
}
+/*
+ * Make the qgroup code skip the given qgroupid, meaning the old/new_roots
+ * computed for qgroup accounting won't contain that qgroupid.
+ */
+static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans,
+ u64 qgroupid)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ WARN_ON(delayed_refs->qgroup_to_skip);
+ delayed_refs->qgroup_to_skip = qgroupid;
+}
+
+static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ WARN_ON(!delayed_refs->qgroup_to_skip);
+ delayed_refs->qgroup_to_skip = 0;
+}
+
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
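
The two helpers above form a strictly paired, non-nesting bracket: the WARN_ON()s fire if a skip is set while another is pending, or cleared when none is set. A minimal usage sketch (the qgroupid value is illustrative):

	btrfs_set_skip_qgroup(trans, objectid);
	/*
	 * Work whose delayed refs must not attribute the extents to
	 * the qgroup with id 'objectid' (e.g. snapshot creation).
	 */
	btrfs_clear_skip_qgroup(trans);
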
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index a63719cc9578..a4b9c8b2d35a 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -52,9 +52,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
goto out;
- if (btrfs_test_opt(root, SSD))
- goto out;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d04968374e9d..1bbaace73383 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -140,55 +140,46 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_log_ctx *ctx)
{
- int index;
- int ret;
+ int ret = 0;
mutex_lock(&root->log_mutex);
+
if (root->log_root) {
if (btrfs_need_log_full_commit(root->fs_info, trans)) {
ret = -EAGAIN;
goto out;
}
+
if (!root->log_start_pid) {
- root->log_start_pid = current->pid;
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+ root->log_start_pid = current->pid;
} else if (root->log_start_pid != current->pid) {
set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
}
+ } else {
+ mutex_lock(&root->fs_info->tree_log_mutex);
+ if (!root->fs_info->log_root_tree)
+ ret = btrfs_init_log_root_tree(trans, root->fs_info);
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ if (ret)
+ goto out;
- atomic_inc(&root->log_batch);
- atomic_inc(&root->log_writers);
- if (ctx) {
- index = root->log_transid % 2;
- list_add_tail(&ctx->list, &root->log_ctxs[index]);
- ctx->log_transid = root->log_transid;
- }
- mutex_unlock(&root->log_mutex);
- return 0;
- }
-
- ret = 0;
- mutex_lock(&root->fs_info->tree_log_mutex);
- if (!root->fs_info->log_root_tree)
- ret = btrfs_init_log_root_tree(trans, root->fs_info);
- mutex_unlock(&root->fs_info->tree_log_mutex);
- if (ret)
- goto out;
-
- if (!root->log_root) {
ret = btrfs_add_log_tree(trans, root);
if (ret)
goto out;
+
+ clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+ root->log_start_pid = current->pid;
}
- clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
- root->log_start_pid = current->pid;
+
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
if (ctx) {
- index = root->log_transid % 2;
+ int index = root->log_transid % 2;
list_add_tail(&ctx->list, &root->log_ctxs[index]);
ctx->log_transid = root->log_transid;
}
+
out:
mutex_unlock(&root->log_mutex);
return ret;
@@ -731,12 +722,66 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
&ordered_sums, 0);
if (ret)
goto out;
+ /*
+ * Now delete all existing csums in the csum root that
+ * cover our range. We do this because we can have an
+ * extent that is completely referenced by one file
+ * extent item and partially referenced by another
+ * file extent item (like after using the clone or
+ * extent_same ioctls). In this case if we end up doing
+ * the replay of the one that partially references the
+ * extent first, and we do not do the csum deletion
+ * below, we can get 2 csum items in the csum tree that
+ * overlap each other. For example, imagine our log has
+ * the two following file extent items:
+ *
+ * key (257 EXTENT_DATA 409600)
+ * extent data disk byte 12845056 nr 102400
+ * extent data offset 20480 nr 20480 ram 102400
+ *
+ * key (257 EXTENT_DATA 819200)
+ * extent data disk byte 12845056 nr 102400
+ * extent data offset 0 nr 102400 ram 102400
+ *
+ * Where the second one fully references the 100K extent
+ * that starts at disk byte 12845056, and the log tree
+ * has a single csum item that covers the entire range
+ * of the extent:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+ *
+ * After the first file extent item is replayed, the
+ * csum tree gets the following csum item:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+ *
+ * Which covers the 20K sub-range starting at offset 20K
+ * of our extent. Now when we replay the second file
+ * extent item, if we do not delete existing csum items
+ * that cover any of its blocks, we end up getting two
+ * csum items in our csum tree that overlap each other:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+ * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+ *
+ * Which is a problem, because after this anyone trying
+ * to look up the checksum of any block of our
+ * extent starting at an offset of 40K or higher will
+ * end up looking at the second csum item only, which
+ * does not contain the checksum for any block starting
+ * at offset 40K or higher of our extent.
+ */
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums;
sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
list);
if (!ret)
+ ret = btrfs_del_csums(trans,
+ root->fs_info->csum_root,
+ sums->bytenr,
+ sums->len);
+ if (!ret)
ret = btrfs_csum_file_blocks(trans,
root->fs_info->csum_root,
sums);
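
The fix described by the long comment above reduces to a delete-then-insert pair per ordered sum. A condensed sketch of the replay loop after this change (the trailing list_del()/kfree() pair is the loop's pre-existing cleanup, which the hunk truncates):

	while (!list_empty(&ordered_sums)) {
		struct btrfs_ordered_sum *sums;

		sums = list_entry(ordered_sums.next,
				  struct btrfs_ordered_sum, list);
		/* drop any csums overlapping [bytenr, bytenr + len) first */
		if (!ret)
			ret = btrfs_del_csums(trans, root->fs_info->csum_root,
					      sums->bytenr, sums->len);
		/* then insert the csums recorded in the log tree */
		if (!ret)
			ret = btrfs_csum_file_blocks(trans,
						     root->fs_info->csum_root,
						     sums);
		list_del(&sums->list);
		kfree(sums);
	}
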
@@ -1549,9 +1594,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
*/
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct btrfs_path *path,
u64 dirid, u64 index,
- char *name, int name_len, u8 type,
+ char *name, int name_len,
struct btrfs_key *location)
{
struct inode *inode;
@@ -1613,6 +1657,9 @@ static bool name_in_log_ref(struct btrfs_root *log_root,
* not exist in the FS, it is skipped. fsyncs on directories
* do not force down inodes inside that directory, just changes to the
* names or unlinks in a directory.
+ *
+ * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
+ * non-existing inode) and 1 if the name was replayed.
*/
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -1631,6 +1678,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
int exists;
int ret = 0;
bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
+ bool name_added = false;
dir = read_one_inode(root, key->objectid);
if (!dir)
@@ -1708,6 +1756,8 @@ out:
}
kfree(name);
iput(dir);
+ if (!ret && name_added)
+ ret = 1;
return ret;
insert:
@@ -1719,10 +1769,12 @@ insert:
goto out;
}
btrfs_release_path(path);
- ret = insert_one_name(trans, root, path, key->objectid, key->offset,
- name, name_len, log_type, &log_key);
+ ret = insert_one_name(trans, root, key->objectid, key->offset,
+ name, name_len, &log_key);
if (ret && ret != -ENOENT && ret != -EEXIST)
goto out;
+ if (!ret)
+ name_added = true;
update_size = false;
ret = 0;
goto out;
@@ -1740,12 +1792,13 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
- int ret;
+ int ret = 0;
u32 item_size = btrfs_item_size_nr(eb, slot);
struct btrfs_dir_item *di;
int name_len;
unsigned long ptr;
unsigned long ptr_end;
+ struct btrfs_path *fixup_path = NULL;
ptr = btrfs_item_ptr_offset(eb, slot);
ptr_end = ptr + item_size;
@@ -1755,12 +1808,59 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
return -EIO;
name_len = btrfs_dir_name_len(eb, di);
ret = replay_one_name(trans, root, path, eb, di, key);
- if (ret)
- return ret;
+ if (ret < 0)
+ break;
ptr = (unsigned long)(di + 1);
ptr += name_len;
+
+ /*
+ * If this entry refers to a non-directory (directories cannot
+ * have a link count > 1) and it was added in a transaction
+ * that was not committed, make sure we fix up the link count
+ * of the inode the entry points to. Otherwise something like
+ * the following would result in a directory pointing to an
+ * inode with a wrong link count that does not account for this
+ * dir entry:
+ *
+ * mkdir testdir
+ * touch testdir/foo
+ * touch testdir/bar
+ * sync
+ *
+ * ln testdir/bar testdir/bar_link
+ * ln testdir/foo testdir/foo_link
+ * xfs_io -c "fsync" testdir/bar
+ *
+ * <power failure>
+ *
+ * mount fs, log replay happens
+ *
+ * File foo would remain with a link count of 1 when it has two
+ * entries pointing to it in the directory testdir. This would
+ * make it impossible to ever delete the parent directory, as
+ * it would result in stale dentries that can never be deleted.
+ */
+ if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
+ struct btrfs_key di_key;
+
+ if (!fixup_path) {
+ fixup_path = btrfs_alloc_path();
+ if (!fixup_path) {
+ ret = -ENOMEM;
+ break;
+ }
+ }
+
+ btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+ ret = link_to_fixup_dir(trans, root, fixup_path,
+ di_key.objectid);
+ if (ret)
+ break;
+ }
+ ret = 0;
}
- return 0;
+ btrfs_free_path(fixup_path);
+ return ret;
}
/*
@@ -2535,8 +2635,7 @@ static int update_log_root(struct btrfs_trans_handle *trans,
return ret;
}
-static void wait_log_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int transid)
+static void wait_log_commit(struct btrfs_root *root, int transid)
{
DEFINE_WAIT(wait);
int index = transid % 2;
@@ -2561,8 +2660,7 @@ static void wait_log_commit(struct btrfs_trans_handle *trans,
atomic_read(&root->log_commit[index]));
}
-static void wait_for_writer(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static void wait_for_writer(struct btrfs_root *root)
{
DEFINE_WAIT(wait);
@@ -2642,7 +2740,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
index1 = log_transid % 2;
if (atomic_read(&root->log_commit[index1])) {
- wait_log_commit(trans, root, log_transid);
+ wait_log_commit(root, log_transid);
mutex_unlock(&root->log_mutex);
return ctx->log_ret;
}
@@ -2651,7 +2749,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* wait for previous tree log sync to complete */
if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
- wait_log_commit(trans, root, log_transid - 1);
+ wait_log_commit(root, log_transid - 1);
while (1) {
int batch = atomic_read(&root->log_batch);
@@ -2662,7 +2760,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
schedule_timeout_uninterruptible(1);
mutex_lock(&root->log_mutex);
}
- wait_for_writer(trans, root);
+ wait_for_writer(root);
if (batch == atomic_read(&root->log_batch))
break;
}
@@ -2759,7 +2857,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
mark);
btrfs_wait_logged_extents(trans, log, log_transid);
- wait_log_commit(trans, log_root_tree,
+ wait_log_commit(log_root_tree,
root_log_ctx.log_transid);
mutex_unlock(&log_root_tree->log_mutex);
if (!ret)
@@ -2770,11 +2868,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
atomic_set(&log_root_tree->log_commit[index2], 1);
if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
- wait_log_commit(trans, log_root_tree,
+ wait_log_commit(log_root_tree,
root_log_ctx.log_transid - 1);
}
- wait_for_writer(trans, log_root_tree);
+ wait_for_writer(log_root_tree);
/*
* now that we've moved on to the tree of log tree roots,
@@ -3881,12 +3979,6 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
&ordered->flags))
continue;
- if (ordered->csum_bytes_left) {
- btrfs_start_ordered_extent(inode, ordered, 0);
- wait_event(ordered->wait,
- ordered->csum_bytes_left == 0);
- }
-
list_for_each_entry(sum, &ordered->list, list) {
ret = btrfs_csum_file_blocks(trans, log, sum);
if (ret)
@@ -4123,6 +4215,187 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
return 0;
}
+/*
+ * At the moment we always log all xattrs. This is to figure out at log replay
+ * time which xattrs must have their deletion replayed. If an xattr is missing
+ * in the log tree and exists in the fs/subvol tree, we delete it. This is
+ * because if an xattr is deleted, the inode is fsynced, and then a power
+ * failure happens, the log is replayed the next time the fs is mounted, and
+ * we want the xattr to not exist anymore (the same behaviour as other
+ * filesystems with a journal: ext3/4, xfs, f2fs, etc).
+ */
+static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path)
+{
+ int ret;
+ struct btrfs_key key;
+ const u64 ino = btrfs_ino(inode);
+ int ins_nr = 0;
+ int start_slot = 0;
+
+ key.objectid = ino;
+ key.type = BTRFS_XATTR_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ while (true) {
+ int slot = path->slots[0];
+ struct extent_buffer *leaf = path->nodes[0];
+ int nritems = btrfs_header_nritems(leaf);
+
+ if (slot >= nritems) {
+ if (ins_nr > 0) {
+ u64 last_extent = 0;
+
+ ret = copy_items(trans, inode, dst_path, path,
+ &last_extent, start_slot,
+ ins_nr, 1, 0);
+ /* can't be 1, extent items aren't processed */
+ ASSERT(ret <= 0);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+ }
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
+ break;
+
+ if (ins_nr == 0)
+ start_slot = slot;
+ ins_nr++;
+ path->slots[0]++;
+ cond_resched();
+ }
+ if (ins_nr > 0) {
+ u64 last_extent = 0;
+
+ ret = copy_items(trans, inode, dst_path, path,
+ &last_extent, start_slot,
+ ins_nr, 1, 0);
+ /* can't be 1, extent items aren't processed */
+ ASSERT(ret <= 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * If the no holes feature is enabled we need to make sure any hole between the
+ * last extent and the i_size of our inode is explicitly marked in the log. This
+ * is to make sure that doing something like:
+ *
+ * 1) create file with 128Kb of data
+ * 2) truncate file to 64Kb
+ * 3) truncate file to 256Kb
+ * 4) fsync file
+ * 5) <crash/power failure>
+ * 6) mount fs and trigger log replay
+ *
+ * Will give us a file with a size of 256Kb, the first 64Kb of data match what
+ * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
+ * file correspond to a hole. The presence of explicit holes in a log tree is
+ * what guarantees that log replay will remove/adjust file extent items in the
+ * fs/subvol tree.
+ *
+ * Here we do not need to care about holes between extents, as that is already
+ * done by copy_items(). We also only need to do this in the full sync path,
+ * where we look up extents from the fs/subvol tree only. In the fast path
+ * case, we walk the list of modified extent maps and, if any represents a
+ * hole, we insert a corresponding extent representing a hole in the log tree.
+ */
+static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_path *path)
+{
+ int ret;
+ struct btrfs_key key;
+ u64 hole_start;
+ u64 hole_size;
+ struct extent_buffer *leaf;
+ struct btrfs_root *log = root->log_root;
+ const u64 ino = btrfs_ino(inode);
+ const u64 i_size = i_size_read(inode);
+
+ if (!btrfs_fs_incompat(root->fs_info, NO_HOLES))
+ return 0;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ASSERT(ret != 0);
+ if (ret < 0)
+ return ret;
+
+ ASSERT(path->slots[0] > 0);
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+ /* inode does not have any extents */
+ hole_start = 0;
+ hole_size = i_size;
+ } else {
+ struct btrfs_file_extent_item *extent;
+ u64 len;
+
+ /*
+ * If there's an extent beyond i_size, an explicit hole was
+ * already inserted by copy_items().
+ */
+ if (key.offset >= i_size)
+ return 0;
+
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, extent) ==
+ BTRFS_FILE_EXTENT_INLINE) {
+ len = btrfs_file_extent_inline_len(leaf,
+ path->slots[0],
+ extent);
+ ASSERT(len == i_size);
+ return 0;
+ }
+
+ len = btrfs_file_extent_num_bytes(leaf, extent);
+ /* Last extent goes beyond i_size, no need to log a hole. */
+ if (key.offset + len > i_size)
+ return 0;
+ hole_start = key.offset + len;
+ hole_size = i_size - hole_start;
+ }
+ btrfs_release_path(path);
+
+ /* Last extent ends at i_size. */
+ if (hole_size == 0)
+ return 0;
+
+ hole_size = ALIGN(hole_size, root->sectorsize);
+ ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
+ hole_size, 0, hole_size, 0, 0, 0);
+ return ret;
+}
+
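
Plugging the truncate scenario from the comment into btrfs_log_trailing_hole()'s arithmetic: with i_size = 256K and a last extent of 64K starting at file offset 0, the logged hole covers the final 192K of the file. A quick check of the numbers (values taken from the example above):

	const u64 i_size = 256 * 1024;      /* size after the second truncate */
	const u64 key_offset = 0;           /* last extent starts at offset 0 */
	const u64 len = 64 * 1024;          /* 64K of real data remain */
	u64 hole_start, hole_size;

	/* key_offset + len <= i_size, so a trailing hole must be logged */
	hole_start = key_offset + len;      /* 64K */
	hole_size = i_size - hole_start;    /* 192K */
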
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -4161,6 +4434,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
u64 ino = btrfs_ino(inode);
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
u64 logged_isize = 0;
+ bool need_log_inode_item = true;
path = btrfs_alloc_path();
if (!path)
@@ -4269,11 +4543,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
} else {
if (inode_only == LOG_INODE_ALL)
fast_search = true;
- ret = log_inode_item(trans, log, dst_path, inode);
- if (ret) {
- err = ret;
- goto out_unlock;
- }
goto log_extents;
}
@@ -4296,6 +4565,28 @@ again:
if (min_key.type > max_key.type)
break;
+ if (min_key.type == BTRFS_INODE_ITEM_KEY)
+ need_log_inode_item = false;
+
+ /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
+ if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
+ if (ins_nr == 0)
+ goto next_slot;
+ ret = copy_items(trans, inode, dst_path, path,
+ &last_extent, ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+ if (ret < 0) {
+ err = ret;
+ goto out_unlock;
+ }
+ ins_nr = 0;
+ if (ret) {
+ btrfs_release_path(path);
+ continue;
+ }
+ goto next_slot;
+ }
+
src = path->nodes[0];
if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
ins_nr++;
@@ -4363,9 +4654,26 @@ next_slot:
ins_nr = 0;
}
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
+ err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
+ if (err)
+ goto out_unlock;
+ if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
+ err = btrfs_log_trailing_hole(trans, root, inode, path);
+ if (err)
+ goto out_unlock;
+ }
log_extents:
btrfs_release_path(path);
btrfs_release_path(dst_path);
+ if (need_log_inode_item) {
+ err = log_inode_item(trans, log, dst_path, inode);
+ if (err)
+ goto out_unlock;
+ }
if (fast_search) {
/*
* Some ordered extents started by fsync might have completed
@@ -4694,6 +5002,94 @@ next_dir_inode:
return ret;
}
+static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_log_ctx *ctx)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ const u64 ino = btrfs_ino(inode);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ u32 cur_offset = 0;
+ u32 item_size;
+ unsigned long ptr;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
+ if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
+ break;
+
+ item_size = btrfs_item_size_nr(leaf, slot);
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+ while (cur_offset < item_size) {
+ struct btrfs_key inode_key;
+ struct inode *dir_inode;
+
+ inode_key.type = BTRFS_INODE_ITEM_KEY;
+ inode_key.offset = 0;
+
+ if (key.type == BTRFS_INODE_EXTREF_KEY) {
+ struct btrfs_inode_extref *extref;
+
+ extref = (struct btrfs_inode_extref *)
+ (ptr + cur_offset);
+ inode_key.objectid = btrfs_inode_extref_parent(
+ leaf, extref);
+ cur_offset += sizeof(*extref);
+ cur_offset += btrfs_inode_extref_name_len(leaf,
+ extref);
+ } else {
+ inode_key.objectid = key.offset;
+ cur_offset = item_size;
+ }
+
+ dir_inode = btrfs_iget(root->fs_info->sb, &inode_key,
+ root, NULL);
+ /* If parent inode was deleted, skip it. */
+ if (IS_ERR(dir_inode))
+ continue;
+
+ ret = btrfs_log_inode(trans, root, dir_inode,
+ LOG_INODE_ALL, 0, LLONG_MAX, ctx);
+ iput(dir_inode);
+ if (ret)
+ goto out;
+ }
+ path->slots[0]++;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
/*
* helper function around btrfs_log_inode to make sure newly created
* parent directories also end up in the log. A minimal inode and backref
@@ -4713,9 +5109,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct dentry *old_parent = NULL;
int ret = 0;
u64 last_committed = root->fs_info->last_trans_committed;
- const struct dentry * const first_parent = parent;
- const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
- last_committed);
bool log_dentries = false;
struct inode *orig_inode = inode;
@@ -4776,6 +5169,53 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
log_dentries = true;
+ /*
+ * On unlink we must make sure all our current and old parent directory
+ * inodes are fully logged. This is to prevent leaving dangling
+ * directory index entries in directories that were our parents but are
+ * not anymore. Not doing this results in the old parent directory being
+ * impossible to delete after log replay (rmdir will always fail with
+ * error -ENOTEMPTY).
+ *
+ * Example 1:
+ *
+ * mkdir testdir
+ * touch testdir/foo
+ * ln testdir/foo testdir/bar
+ * sync
+ * unlink testdir/bar
+ * xfs_io -c fsync testdir/foo
+ * <power failure>
+ * mount fs, triggers log replay
+ *
+ * If we don't log the parent directory (testdir), after log replay the
+ * directory still has an entry pointing to the file inode using the bar
+ * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
+ * the file inode has a link count of 1.
+ *
+ * Example 2:
+ *
+ * mkdir testdir
+ * touch foo
+ * ln foo testdir/foo2
+ * ln foo testdir/foo3
+ * sync
+ * unlink testdir/foo3
+ * xfs_io -c fsync foo
+ * <power failure>
+ * mount fs, triggers log replay
+ *
+ * Similar to the first example, after log replay the parent directory
+ * testdir still has an entry pointing to the file inode using the name foo3,
+ * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
+ * and has a link count of 2.
+ */
+ if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
+ ret = btrfs_log_all_parents(trans, orig_inode, ctx);
+ if (ret)
+ goto end_trans;
+ }
+
while (1) {
if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
break;
@@ -4784,23 +5224,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (root != BTRFS_I(inode)->root)
break;
- /*
- * On unlink we must make sure our immediate parent directory
- * inode is fully logged. This is to prevent leaving dangling
- * directory index entries and a wrong directory inode's i_size.
- * Not doing so can result in a directory being impossible to
- * delete after log replay (rmdir will always fail with error
- * -ENOTEMPTY).
- */
- if (did_unlink && parent == first_parent)
- inode_only = LOG_INODE_ALL;
- else
- inode_only = LOG_INODE_EXISTS;
-
- if (BTRFS_I(inode)->generation >
- root->fs_info->last_trans_committed ||
- inode_only == LOG_INODE_ALL) {
- ret = btrfs_log_inode(trans, root, inode, inode_only,
+ if (BTRFS_I(inode)->generation > last_committed) {
+ ret = btrfs_log_inode(trans, root, inode,
+ LOG_INODE_EXISTS,
0, LLONG_MAX, ctx);
if (ret)
goto end_trans;
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 840a38b2778a..91feb2bdefee 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -132,6 +132,15 @@ static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val)
return NULL;
}
+static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node)
+{
+ rb_erase(&node->rb_node, &ulist->root);
+ list_del(&node->list);
+ kfree(node);
+ BUG_ON(ulist->nnodes == 0);
+ ulist->nnodes--;
+}
+
static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins)
{
struct rb_node **p = &ulist->root.rb_node;
@@ -197,9 +206,6 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
node->val = val;
node->aux = aux;
-#ifdef CONFIG_BTRFS_DEBUG
- node->seqnum = ulist->nnodes;
-#endif
ret = ulist_rbtree_insert(ulist, node);
ASSERT(!ret);
@@ -209,6 +215,33 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
return 1;
}
+/*
+ * ulist_del - delete one node from ulist
+ * @ulist: ulist to remove node from
+ * @val: value to delete
+ * @aux: aux to delete
+ *
+ * The deletion will only be done when *BOTH* val and aux matches.
+ * Return 0 for successful delete.
+ * Return > 0 for not found.
+ */
+int ulist_del(struct ulist *ulist, u64 val, u64 aux)
+{
+ struct ulist_node *node;
+
+ node = ulist_rbtree_search(ulist, val);
+ /* Not found */
+ if (!node)
+ return 1;
+
+ if (node->aux != aux)
+ return 1;
+
+ /* Found and delete */
+ ulist_rbtree_erase(ulist, node);
+ return 0;
+}
+
/**
* ulist_next - iterate ulist
* @ulist: ulist to iterate
@@ -237,15 +270,7 @@ struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
uiter->cur_list = uiter->cur_list->next;
} else {
uiter->cur_list = ulist->nodes.next;
-#ifdef CONFIG_BTRFS_DEBUG
- uiter->i = 0;
-#endif
}
node = list_entry(uiter->cur_list, struct ulist_node, list);
-#ifdef CONFIG_BTRFS_DEBUG
- ASSERT(node->seqnum == uiter->i);
- ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes);
- uiter->i++;
-#endif
return node;
}
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 4c29db604bbe..a01a2c45825f 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -57,6 +57,7 @@ void ulist_free(struct ulist *ulist);
int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
u64 *old_aux, gfp_t gfp_mask);
+int ulist_del(struct ulist *ulist, u64 val, u64 aux);
/* just like ulist_add_merge() but take a pointer for the aux data */
static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux,
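
A small usage sketch for the new ulist_del(), paired with the existing ulist_add() (the values and GFP flag are illustrative):

	struct ulist *ul = ulist_alloc(GFP_NOFS);
	int ret;

	if (!ul)
		return -ENOMEM;

	ret = ulist_add(ul, 4096, 0, GFP_NOFS);  /* ret == 1: newly added */
	ret = ulist_del(ul, 4096, 1);            /* ret == 1: aux mismatch */
	ret = ulist_del(ul, 4096, 0);            /* ret == 0: deleted */
	ulist_free(ul);
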
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 53af23f2c087..76201d6f6ce4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -52,6 +52,10 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
+struct list_head *btrfs_get_fs_uuids(void)
+{
+ return &fs_uuids;
+}
static struct btrfs_fs_devices *__alloc_fs_devices(void)
{
@@ -441,6 +445,61 @@ static void pending_bios_fn(struct btrfs_work *work)
run_scheduled_bios(device);
}
+
+void btrfs_free_stale_device(struct btrfs_device *cur_dev)
+{
+ struct btrfs_fs_devices *fs_devs;
+ struct btrfs_device *dev;
+
+ if (!cur_dev->name)
+ return;
+
+ list_for_each_entry(fs_devs, &fs_uuids, list) {
+ int del = 1;
+
+ if (fs_devs->opened)
+ continue;
+ if (fs_devs->seeding)
+ continue;
+
+ list_for_each_entry(dev, &fs_devs->devices, dev_list) {
+
+ if (dev == cur_dev)
+ continue;
+ if (!dev->name)
+ continue;
+
+ /*
+ * Todo: This won't be enough. What if the same device
+ * comes back (with a new uuid) under its mapper path?
+ * But for now, this does help as mostly an admin will
+ * either use a mapper or a non-mapper path throughout.
+ */
+ rcu_read_lock();
+ del = strcmp(rcu_str_deref(dev->name),
+ rcu_str_deref(cur_dev->name));
+ rcu_read_unlock();
+ if (!del)
+ break;
+ }
+
+ if (!del) {
+ /* delete the stale device */
+ if (fs_devs->num_devices == 1) {
+ btrfs_sysfs_remove_fsid(fs_devs);
+ list_del(&fs_devs->list);
+ free_fs_devices(fs_devs);
+ } else {
+ fs_devs->num_devices--;
+ list_del(&dev->dev_list);
+ rcu_string_free(dev->name);
+ kfree(dev);
+ }
+ break;
+ }
+ }
+}
+
/*
* Add new device to list of registered devices
*
@@ -556,6 +615,12 @@ static noinline int device_list_add(const char *path,
if (!fs_devices->opened)
device->generation = found_transid;
+ /*
+ * if there is a new btrfs on an already registered device,
+ * then remove the stale device entry.
+ */
+ btrfs_free_stale_device(device);
+
*fs_devices_ret = fs_devices;
return ret;
@@ -693,13 +758,13 @@ static void free_device(struct rcu_head *head)
static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
- struct btrfs_device *device;
+ struct btrfs_device *device, *tmp;
if (--fs_devices->opened > 0)
return 0;
mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
struct btrfs_device *new_device;
struct rcu_string *name;
@@ -1051,15 +1116,18 @@ out:
return ret;
}
-static int contains_pending_extent(struct btrfs_trans_handle *trans,
+static int contains_pending_extent(struct btrfs_transaction *transaction,
struct btrfs_device *device,
u64 *start, u64 len)
{
+ struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
struct extent_map *em;
- struct list_head *search_list = &trans->transaction->pending_chunks;
+ struct list_head *search_list = &fs_info->pinned_chunks;
int ret = 0;
u64 physical_start = *start;
+ if (transaction)
+ search_list = &transaction->pending_chunks;
again:
list_for_each_entry(em, search_list, list) {
struct map_lookup *map;
@@ -1067,19 +1135,35 @@ again:
map = (struct map_lookup *)em->bdev;
for (i = 0; i < map->num_stripes; i++) {
+ u64 end;
+
if (map->stripes[i].dev != device)
continue;
if (map->stripes[i].physical >= physical_start + len ||
map->stripes[i].physical + em->orig_block_len <=
physical_start)
continue;
- *start = map->stripes[i].physical +
- em->orig_block_len;
- ret = 1;
+ /*
+ * Make sure that while processing the pinned list we do
+ * not overwrite our *start with a lower value, because
+ * we can have pinned chunks that fall within this
+ * device hole and that have lower physical addresses
+ * than the pending chunks we processed before. If we
+ * do not take this special care we can end up getting
+ * 2 pending chunks that start at the same physical
+ * device offsets because the end offset of a pinned
+ * chunk can be equal to the start offset of some
+ * pending chunk.
+ */
+ end = map->stripes[i].physical + em->orig_block_len;
+ if (end > *start) {
+ *start = end;
+ ret = 1;
+ }
}
}
- if (search_list == &trans->transaction->pending_chunks) {
- search_list = &trans->root->fs_info->pinned_chunks;
+ if (search_list != &fs_info->pinned_chunks) {
+ search_list = &fs_info->pinned_chunks;
goto again;
}
@@ -1088,12 +1172,13 @@ again:
/*
- * find_free_dev_extent - find free space in the specified device
- * @device: the device which we search the free space in
- * @num_bytes: the size of the free space that we need
- * @start: store the start of the free space.
- * @len: the size of the free space. that we find, or the size of the max
- * free space if we don't find suitable free space
+ * find_free_dev_extent_start - find free space in the specified device
+ * @device: the device which we search the free space in
+ * @num_bytes: the size of the free space that we need
+ * @search_start: the position from which to begin the search
+ * @start: store the start of the free space.
+ * @len: the size of the free space. that we find, or the size
+ * of the max free space if we don't find suitable free space
*
* this uses a pretty simple search, the expectation is that it is
* called very infrequently and that a given device has a small number
@@ -1107,9 +1192,9 @@ again:
* But if we don't find suitable free space, it is used to store the size of
* the max free space.
*/
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device, u64 num_bytes,
- u64 *start, u64 *len)
+int find_free_dev_extent_start(struct btrfs_transaction *transaction,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 search_start, u64 *start, u64 *len)
{
struct btrfs_key key;
struct btrfs_root *root = device->dev_root;
@@ -1119,19 +1204,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
u64 max_hole_start;
u64 max_hole_size;
u64 extent_end;
- u64 search_start;
u64 search_end = device->total_bytes;
int ret;
int slot;
struct extent_buffer *l;
- /* FIXME use last free of some kind */
-
- /* we don't want to overwrite the superblock on the drive,
- * so we make sure to start at an offset of at least 1MB
- */
- search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -1192,7 +1269,7 @@ again:
* Have to check before we set max_hole_start, otherwise
* we could end up sending back this offset anyway.
*/
- if (contains_pending_extent(trans, device,
+ if (contains_pending_extent(transaction, device,
&search_start,
hole_size)) {
if (key.offset >= search_start) {
@@ -1241,7 +1318,7 @@ next:
if (search_end > search_start) {
hole_size = search_end - search_start;
- if (contains_pending_extent(trans, device, &search_start,
+ if (contains_pending_extent(transaction, device, &search_start,
hole_size)) {
btrfs_release_path(path);
goto again;
@@ -1267,6 +1344,24 @@ out:
return ret;
}
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 *start, u64 *len)
+{
+ struct btrfs_root *root = device->dev_root;
+ u64 search_start;
+
+ /* FIXME use last free of some kind */
+
+ /*
+ * we don't want to overwrite the superblock on the drive,
+ * so we make sure to start at an offset of at least 1MB
+ */
+ search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+ return find_free_dev_extent_start(trans->transaction, device,
+ num_bytes, search_start, start, len);
+}
+
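
Note the new split: find_free_dev_extent_start() takes a struct btrfs_transaction directly and may be passed NULL, in which case contains_pending_extent() consults only fs_info->pinned_chunks. A hedged sketch of a caller searching from a custom offset (the sizes are illustrative):

	u64 start, len;
	int ret;

	/* NULL transaction: only pinned chunks are treated as busy */
	ret = find_free_dev_extent_start(NULL, device, 1024ull * 1024,
					 1024ull * 1024, &start, &len);
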
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
u64 start, u64 *dev_extent_len)
@@ -1706,7 +1801,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (device->bdev) {
device->fs_devices->open_devices--;
/* remove sysfs entry */
- btrfs_kobj_rm_device(root->fs_info, device);
+ btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
}
call_rcu(&device->rcu, free_device);
@@ -1875,6 +1970,9 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
mutex_lock(&uuid_mutex);
WARN_ON(!tgtdev);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
+
+ btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev);
+
if (tgtdev->bdev) {
btrfs_scratch_superblock(tgtdev);
fs_info->fs_devices->open_devices--;
@@ -2211,7 +2309,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
tmp + 1);
/* add sysfs device entry */
- btrfs_kobj_add_device(root->fs_info, device);
+ btrfs_kobj_add_device(root->fs_info->fs_devices, device);
/*
* we've got more storage, clear any full flags on the space
@@ -2252,8 +2350,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
*/
snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
root->fs_info->fsid);
- if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
- goto error_trans;
+ if (kobject_rename(&root->fs_info->fs_devices->super_kobj,
+ fsid_buf))
+ pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n");
}
root->fs_info->num_tolerated_disk_barrier_failures =
@@ -2289,7 +2388,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
error_trans:
btrfs_end_transaction(trans, root);
rcu_string_free(device->name);
- btrfs_kobj_rm_device(root->fs_info, device);
+ btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
kfree(device);
error:
blkdev_put(bdev, FMODE_EXCL);
@@ -2609,6 +2708,9 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
return -EINVAL;
}
map = (struct map_lookup *)em->bdev;
+ lock_chunks(root->fs_info->chunk_root);
+ check_system_chunk(trans, extent_root, map->type);
+ unlock_chunks(root->fs_info->chunk_root);
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *device = map->stripes[i].dev;
@@ -2667,9 +2769,7 @@ out:
return ret;
}
-static int btrfs_relocate_chunk(struct btrfs_root *root,
- u64 chunk_objectid,
- u64 chunk_offset)
+static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
{
struct btrfs_root *extent_root;
struct btrfs_trans_handle *trans;
@@ -2678,12 +2778,28 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
root = root->fs_info->chunk_root;
extent_root = root->fs_info->extent_root;
+ /*
+ * Prevent races with automatic removal of unused block groups.
+ * After we relocate and before we remove the chunk with offset
+ * chunk_offset, automatic removal of the block group can kick in,
+ * resulting in a failure when calling btrfs_remove_chunk() below.
+ *
+ * Make sure to acquire this mutex before doing a tree search (dev
+ * or chunk trees) to find chunks. Otherwise the cleaner kthread might
+ * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
+ * we release the path used to search the chunk/dev tree and before
+ * the current task acquires this mutex and calls us.
+ */
+ ASSERT(mutex_is_locked(&root->fs_info->delete_unused_bgs_mutex));
+
ret = btrfs_can_relocate(extent_root, chunk_offset);
if (ret)
return -ENOSPC;
/* step one, relocate all the extents inside this chunk */
+ btrfs_scrub_pause(root);
ret = btrfs_relocate_block_group(extent_root, chunk_offset);
+ btrfs_scrub_continue(root);
if (ret)
return ret;
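
The locking rule stated in the comment above shapes every call site in the hunks that follow. A condensed sketch of the recurring pattern (tree-search details elided):

	mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
	ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
	if (ret < 0) {
		mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
		goto error;
	}
	/* ... decide whether this chunk should be relocated ... */
	ret = btrfs_relocate_chunk(chunk_root, found_key.offset);
	mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
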
@@ -2726,13 +2842,18 @@ again:
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
+ mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
- if (ret < 0)
+ if (ret < 0) {
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
goto error;
+ }
BUG_ON(ret == 0); /* Corruption */
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
+ if (ret)
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
if (ret < 0)
goto error;
if (ret > 0)
@@ -2748,13 +2869,13 @@ again:
if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_relocate_chunk(chunk_root,
- found_key.objectid,
found_key.offset);
if (ret == -ENOSPC)
failed++;
else
BUG_ON(ret);
}
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
if (found_key.offset == 0)
break;
@@ -3211,9 +3332,12 @@ again:
goto error;
}
+ mutex_lock(&fs_info->delete_unused_bgs_mutex);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
- if (ret < 0)
+ if (ret < 0) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto error;
+ }
/*
* this shouldn't happen, it means the last relocate
@@ -3225,6 +3349,7 @@ again:
ret = btrfs_previous_item(chunk_root, path, 0,
BTRFS_CHUNK_ITEM_KEY);
if (ret) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
ret = 0;
break;
}
@@ -3233,8 +3358,10 @@ again:
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &found_key, slot);
- if (found_key.objectid != key.objectid)
+ if (found_key.objectid != key.objectid) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
break;
+ }
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
@@ -3247,10 +3374,13 @@ again:
ret = should_balance_chunk(chunk_root, leaf, chunk,
found_key.offset);
btrfs_release_path(path);
- if (!ret)
+ if (!ret) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto loop;
+ }
if (counting) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
spin_lock(&fs_info->balance_lock);
bctl->stat.expected++;
spin_unlock(&fs_info->balance_lock);
@@ -3258,8 +3388,8 @@ again:
}
ret = btrfs_relocate_chunk(chunk_root,
- found_key.objectid,
found_key.offset);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
if (ret && ret != -ENOSPC)
goto error;
if (ret == -ENOSPC) {
@@ -3908,9 +4038,9 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
uuid_root = btrfs_create_tree(trans, fs_info,
BTRFS_UUID_TREE_OBJECTID);
if (IS_ERR(uuid_root)) {
- btrfs_abort_transaction(trans, tree_root,
- PTR_ERR(uuid_root));
- return PTR_ERR(uuid_root);
+ ret = PTR_ERR(uuid_root);
+ btrfs_abort_transaction(trans, tree_root, ret);
+ return ret;
}
fs_info->uuid_root = uuid_root;
@@ -3959,12 +4089,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_path *path;
u64 length;
- u64 chunk_objectid;
u64 chunk_offset;
int ret;
int slot;
int failed = 0;
bool retried = false;
+ bool checked_pending_chunks = false;
struct extent_buffer *l;
struct btrfs_key key;
struct btrfs_super_block *super_copy = root->fs_info->super_copy;
@@ -3998,11 +4128,16 @@ again:
key.type = BTRFS_DEV_EXTENT_KEY;
do {
+ mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
+ if (ret < 0) {
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
goto done;
+ }
ret = btrfs_previous_item(root, path, 0, key.type);
+ if (ret)
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
if (ret < 0)
goto done;
if (ret) {
@@ -4016,6 +4151,7 @@ again:
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
if (key.objectid != device->devid) {
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
btrfs_release_path(path);
break;
}
@@ -4024,15 +4160,16 @@ again:
length = btrfs_dev_extent_length(l, dev_extent);
if (key.offset + length <= new_size) {
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
btrfs_release_path(path);
break;
}
- chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
btrfs_release_path(path);
- ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset);
+ ret = btrfs_relocate_chunk(root, chunk_offset);
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
if (ret && ret != -ENOSPC)
goto done;
if (ret == -ENOSPC)
@@ -4045,15 +4182,6 @@ again:
goto again;
} else if (failed && retried) {
ret = -ENOSPC;
- lock_chunks(root);
-
- btrfs_device_set_total_bytes(device, old_size);
- if (device->writeable)
- device->fs_devices->total_rw_bytes += diff;
- spin_lock(&root->fs_info->free_chunk_lock);
- root->fs_info->free_chunk_space += diff;
- spin_unlock(&root->fs_info->free_chunk_lock);
- unlock_chunks(root);
goto done;
}
@@ -4065,6 +4193,36 @@ again:
}
lock_chunks(root);
+
+ /*
+ * We checked in the above loop all device extents that were already in
+ * the device tree. However, before we updated the device's
+ * total_bytes to the new size, we might have had chunk allocations that
+ * had not completed yet (new block groups attached to transaction
+ * handles), and therefore their device extents were not yet in the
+ * device tree and we missed them in the loop above. So if we have any
+ * pending chunk using a device extent that overlaps the device range
+ * that we cannot use anymore, commit the current transaction and
+ * repeat the search on the device tree - this way we guarantee we will
+ * not have chunks using device extents that end beyond 'new_size'.
+ */
+ if (!checked_pending_chunks) {
+ u64 start = new_size;
+ u64 len = old_size - new_size;
+
+ if (contains_pending_extent(trans->transaction, device,
+ &start, len)) {
+ unlock_chunks(root);
+ checked_pending_chunks = true;
+ failed = 0;
+ retried = false;
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ goto done;
+ goto again;
+ }
+ }
+
btrfs_device_set_disk_total_bytes(device, new_size);
if (list_empty(&device->resized_list))
list_add_tail(&device->resized_list,
@@ -4079,6 +4237,16 @@ again:
btrfs_end_transaction(trans, root);
done:
btrfs_free_path(path);
+ if (ret) {
+ lock_chunks(root);
+ btrfs_device_set_total_bytes(device, old_size);
+ if (device->writeable)
+ device->fs_devices->total_rw_bytes += diff;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += diff;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ unlock_chunks(root);
+ }
return ret;
}
@@ -4914,9 +5082,7 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
* and the stripes
*/
sizeof(u64) * (total_stripes),
- GFP_NOFS);
- if (!bbio)
- return NULL;
+ GFP_NOFS|__GFP_NOFAIL);
atomic_set(&bbio->error, 0);
atomic_set(&bbio->refs, 1);
@@ -5584,26 +5750,26 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
return 0;
}
-static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
+static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
{
bio->bi_private = bbio->private;
bio->bi_end_io = bbio->end_io;
- bio_endio(bio, err);
+ bio_endio(bio);
btrfs_put_bbio(bbio);
}
-static void btrfs_end_bio(struct bio *bio, int err)
+static void btrfs_end_bio(struct bio *bio)
{
struct btrfs_bio *bbio = bio->bi_private;
- struct btrfs_device *dev = bbio->stripes[0].dev;
int is_orig_bio = 0;
- if (err) {
+ if (bio->bi_error) {
atomic_inc(&bbio->error);
- if (err == -EIO || err == -EREMOTEIO) {
+ if (bio->bi_error == -EIO || bio->bi_error == -EREMOTEIO) {
unsigned int stripe_index =
btrfs_io_bio(bio)->stripe_index;
+ struct btrfs_device *dev;
BUG_ON(stripe_index >= bbio->num_stripes);
dev = bbio->stripes[stripe_index].dev;
@@ -5638,17 +5804,16 @@ static void btrfs_end_bio(struct bio *bio, int err)
* beyond the tolerance of the btrfs bio
*/
if (atomic_read(&bbio->error) > bbio->max_errors) {
- err = -EIO;
+ bio->bi_error = -EIO;
} else {
/*
* this bio is actually up to date, we didn't
* go over the max number of errors
*/
- set_bit(BIO_UPTODATE, &bio->bi_flags);
- err = 0;
+ bio->bi_error = 0;
}
- btrfs_end_bbio(bbio, bio, err);
+ btrfs_end_bbio(bbio, bio);
} else if (!is_orig_bio) {
bio_put(bio);
}
@@ -5669,7 +5834,7 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
struct btrfs_pending_bios *pending_bios;
if (device->missing || !device->bdev) {
- bio_endio(bio, -EIO);
+ bio_io_error(bio);
return;
}
@@ -5714,34 +5879,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
&device->work);
}
-static int bio_size_ok(struct block_device *bdev, struct bio *bio,
- sector_t sector)
-{
- struct bio_vec *prev;
- struct request_queue *q = bdev_get_queue(bdev);
- unsigned int max_sectors = queue_max_sectors(q);
- struct bvec_merge_data bvm = {
- .bi_bdev = bdev,
- .bi_sector = sector,
- .bi_rw = bio->bi_rw,
- };
-
- if (WARN_ON(bio->bi_vcnt == 0))
- return 1;
-
- prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
- if (bio_sectors(bio) > max_sectors)
- return 0;
-
- if (!q->merge_bvec_fn)
- return 1;
-
- bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len;
- if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
- return 0;
- return 1;
-}
-
static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
struct bio *bio, u64 physical, int dev_nr,
int rw, int async)
@@ -5775,38 +5912,6 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
btrfsic_submit_bio(rw, bio);
}
-static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
- struct bio *first_bio, struct btrfs_device *dev,
- int dev_nr, int rw, int async)
-{
- struct bio_vec *bvec = first_bio->bi_io_vec;
- struct bio *bio;
- int nr_vecs = bio_get_nr_vecs(dev->bdev);
- u64 physical = bbio->stripes[dev_nr].physical;
-
-again:
- bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
- if (!bio)
- return -ENOMEM;
-
- while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
- if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
- bvec->bv_offset) < bvec->bv_len) {
- u64 len = bio->bi_iter.bi_size;
-
- atomic_inc(&bbio->stripes_pending);
- submit_stripe_bio(root, bbio, bio, physical, dev_nr,
- rw, async);
- physical += len;
- goto again;
- }
- bvec++;
- }
-
- submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
- return 0;
-}
-
static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
{
atomic_inc(&bbio->error);
@@ -5816,8 +5921,8 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
bio->bi_iter.bi_sector = logical >> 9;
-
- btrfs_end_bbio(bbio, bio, -EIO);
+ bio->bi_error = -EIO;
+ btrfs_end_bbio(bbio, bio);
}
}
@@ -5879,18 +5984,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
continue;
}
- /*
- * Check and see if we're ok with this bio based on it's size
- * and offset with the given device.
- */
- if (!bio_size_ok(dev->bdev, first_bio,
- bbio->stripes[dev_nr].physical >> 9)) {
- ret = breakup_stripe_bio(root, bbio, first_bio, dev,
- dev_nr, rw, async_submit);
- BUG_ON(ret);
- continue;
- }
-
if (dev_nr < total_devs - 1) {
bio = btrfs_bio_clone(first_bio, GFP_NOFS);
BUG_ON(!bio); /* -ENOMEM */
@@ -6072,6 +6165,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
free_extent_map(em);
return -EIO;
}
+ btrfs_warn(root->fs_info, "devid %llu uuid %pU is missing",
+ devid, uuid);
}
map->stripes[i].dev->in_fs_metadata = 1;
}
@@ -6191,10 +6286,11 @@ static int read_one_dev(struct btrfs_root *root,
if (!btrfs_test_opt(root, DEGRADED))
return -EIO;
- btrfs_warn(root->fs_info, "devid %llu missing", devid);
device = add_missing_dev(root, fs_devices, devid, dev_uuid);
if (!device)
return -ENOMEM;
+ btrfs_warn(root->fs_info, "devid %llu uuid %pU missing",
+ devid, dev_uuid);
} else {
if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
return -EIO;
@@ -6722,3 +6818,21 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
}
unlock_chunks(root);
}
+
+void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ while (fs_devices) {
+ fs_devices->fs_info = fs_info;
+ fs_devices = fs_devices->seed;
+ }
+}
+
+void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ while (fs_devices) {
+ fs_devices->fs_info = NULL;
+ fs_devices = fs_devices->seed;
+ }
+}
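
These two helpers walk the seed-device chain so every fs_devices in it can reach the mounting fs_info through the new back-pointer added to volumes.h below; presumably they bracket the lifetime of the per-fs sysfs entries. A hypothetical call pattern (the actual call sites are not in this hunk):

	btrfs_set_fs_info_ptr(fs_info);	/* before registering per-fs sysfs kobjects */
	/* ... filesystem is mounted ... */
	btrfs_reset_fs_info_ptr(fs_info); /* before tearing the sysfs entries down */
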
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index cedae0356558..2ca784a14e84 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -253,6 +253,12 @@ struct btrfs_fs_devices {
* nonrot flag set
*/
int rotating;
+
+ struct btrfs_fs_info *fs_info;
+ /* sysfs kobjects */
+ struct kobject super_kobj;
+ struct kobject *device_dir_kobj;
+ struct completion kobj_unregister;
};
#define BTRFS_BIO_INLINE_CSUM_SIZE 64
@@ -447,6 +453,9 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
+int find_free_dev_extent_start(struct btrfs_transaction *transaction,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 search_start, u64 *start, u64 *max_avail);
int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
@@ -535,5 +544,8 @@ static inline void unlock_chunks(struct btrfs_root *root)
mutex_unlock(&root->fs_info->chunk_mutex);
}
+struct list_head *btrfs_get_fs_uuids(void);
+void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
+void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
#endif
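
Given the parallel signatures, find_free_dev_extent() presumably becomes a thin wrapper that starts the search at offset 0. A sketch of the assumed relationship (not shown in this hunk):

int find_free_dev_extent(struct btrfs_trans_handle *trans,
			 struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *max_avail)
{
	/* assumed: delegate with search_start == 0 */
	return find_free_dev_extent_start(trans->transaction, device,
					  num_bytes, 0, start, max_avail);
}
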
diff --git a/fs/buffer.c b/fs/buffer.c
index 1cf7a53a0277..82283abb2795 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2957,14 +2957,14 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
}
EXPORT_SYMBOL(generic_block_bmap);
-static void end_bio_bh_io_sync(struct bio *bio, int err)
+static void end_bio_bh_io_sync(struct bio *bio)
{
struct buffer_head *bh = bio->bi_private;
- if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
+ if (unlikely(bio_flagged(bio, BIO_QUIET)))
set_bit(BH_Quiet, &bh->b_state);
- bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
+ bh->b_end_io(bh, !bio->bi_error);
bio_put(bio);
}
@@ -3046,12 +3046,9 @@ static int submit_bh_wbc(int rw, struct buffer_head *bh,
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
- bio->bi_io_vec[0].bv_page = bh->b_page;
- bio->bi_io_vec[0].bv_len = bh->b_size;
- bio->bi_io_vec[0].bv_offset = bh_offset(bh);
- bio->bi_vcnt = 1;
- bio->bi_iter.bi_size = bh->b_size;
+ bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+ BUG_ON(bio->bi_iter.bi_size != bh->b_size);
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
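
Replacing the open-coded bio_vec setup with bio_add_page() keeps bi_vcnt and bi_iter.bi_size consistent automatically; on a freshly allocated one-vec bio the call cannot come up short for a single in-page segment, which the BUG_ON documents. The same pattern in isolation (a sketch, not the patched function):

	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
	if (bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)) != bh->b_size)
		BUG();	/* cannot happen: fresh bio, single in-page segment */
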
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 8c52472d2efa..aecd0859eacb 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -43,7 +43,6 @@ struct cachefiles_object {
loff_t i_size; /* object size */
unsigned long flags;
#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */
-#define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */
atomic_t usage; /* object usage count */
uint8_t type; /* object type */
uint8_t new; /* T if object new */
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index ab857ab9f40d..fc1056f5c96a 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -97,7 +97,8 @@ static noinline void cachefiles_printk_object(struct cachefiles_object *object,
* call vfs_unlink(), vfs_rmdir() or vfs_rename()
*/
static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
- struct dentry *dentry)
+ struct dentry *dentry,
+ enum fscache_why_object_killed why)
{
struct cachefiles_object *object;
struct rb_node *p;
@@ -132,8 +133,9 @@ found_dentry:
pr_err("\n");
pr_err("Error: Can't preemptively bury live object\n");
cachefiles_printk_object(object, NULL);
- } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
- pr_err("Error: Object already preemptively buried\n");
+ } else {
+ if (why != FSCACHE_OBJECT_IS_STALE)
+ fscache_object_mark_killed(&object->fscache, why);
}
write_unlock(&cache->active_lock);
@@ -265,7 +267,8 @@ requeue:
static int cachefiles_bury_object(struct cachefiles_cache *cache,
struct dentry *dir,
struct dentry *rep,
- bool preemptive)
+ bool preemptive,
+ enum fscache_why_object_killed why)
{
struct dentry *grave, *trap;
struct path path, path_to_graveyard;
@@ -289,7 +292,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
ret = vfs_unlink(d_inode(dir), rep, NULL);
if (preemptive)
- cachefiles_mark_object_buried(cache, rep);
+ cachefiles_mark_object_buried(cache, rep, why);
}
mutex_unlock(&d_inode(dir)->i_mutex);
@@ -394,7 +397,7 @@ try_again:
"Rename failed with error %d", ret);
if (preemptive)
- cachefiles_mark_object_buried(cache, rep);
+ cachefiles_mark_object_buried(cache, rep, why);
}
unlock_rename(cache->graveyard, dir);
@@ -422,7 +425,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
- if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
+ if (test_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->fscache.flags)) {
/* object allocation for the same key preemptively deleted this
* object's file so that it could create its own file */
_debug("object preemptively buried");
@@ -433,7 +436,8 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
* may have been renamed */
if (dir == object->dentry->d_parent) {
ret = cachefiles_bury_object(cache, dir,
- object->dentry, false);
+ object->dentry, false,
+ FSCACHE_OBJECT_WAS_RETIRED);
} else {
/* it got moved, presumably by cachefilesd culling it,
* so it's no longer in the key path and we can ignore
@@ -522,7 +526,7 @@ lookup_again:
if (d_is_negative(next)) {
ret = cachefiles_has_space(cache, 1, 0);
if (ret < 0)
- goto create_error;
+ goto no_space_error;
path.dentry = dir;
ret = security_path_mkdir(&path, next, 0);
@@ -551,7 +555,7 @@ lookup_again:
if (d_is_negative(next)) {
ret = cachefiles_has_space(cache, 1, 0);
if (ret < 0)
- goto create_error;
+ goto no_space_error;
path.dentry = dir;
ret = security_path_mknod(&path, next, S_IFREG, 0);
@@ -602,7 +606,8 @@ lookup_again:
* mutex) */
object->dentry = NULL;
- ret = cachefiles_bury_object(cache, dir, next, true);
+ ret = cachefiles_bury_object(cache, dir, next, true,
+ FSCACHE_OBJECT_IS_STALE);
dput(next);
next = NULL;
@@ -610,6 +615,7 @@ lookup_again:
goto delete_error;
_debug("redo lookup");
+ fscache_object_retrying_stale(&object->fscache);
goto lookup_again;
}
}
@@ -662,6 +668,8 @@ lookup_again:
_leave(" = 0 [%lu]", d_backing_inode(object->dentry)->i_ino);
return 0;
+no_space_error:
+ fscache_object_mark_killed(&object->fscache, FSCACHE_OBJECT_NO_SPACE);
create_error:
_debug("create error %d", ret);
if (ret == -EIO)
@@ -927,7 +935,8 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
/* actually remove the victim (drops the dir mutex) */
_debug("bury");
- ret = cachefiles_bury_object(cache, dir, victim, false);
+ ret = cachefiles_bury_object(cache, dir, victim, false,
+ FSCACHE_OBJECT_WAS_CULLED);
if (ret < 0)
goto error;
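
The boolean CACHEFILES_OBJECT_BURIED flag is replaced by a reason code reported to fscache. From the call sites above, the enum presumably carries at least the following values (a reconstruction for illustration; the authoritative definition lives in the fscache headers):

/* assumed shape of the reason codes used above */
enum fscache_why_object_killed {
	FSCACHE_OBJECT_IS_STALE,	/* stale object replaced on lookup */
	FSCACHE_OBJECT_NO_SPACE,	/* cache ran out of space */
	FSCACHE_OBJECT_WAS_RETIRED,	/* object retired on relinquish */
	FSCACHE_OBJECT_WAS_CULLED,	/* object culled by cachefilesd */
};
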
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 64fa248343f6..8f84646f10e9 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -187,10 +187,10 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
val_size2 = posix_acl_xattr_size(default_acl->a_count);
err = -ENOMEM;
- tmp_buf = kmalloc(max(val_size1, val_size2), GFP_NOFS);
+ tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL);
if (!tmp_buf)
goto out_err;
- pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_NOFS);
+ pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL);
if (!pagelist)
goto out_err;
ceph_pagelist_init(pagelist);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index e162bcd105ee..a268abfe60ac 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -87,17 +87,21 @@ static int ceph_set_page_dirty(struct page *page)
inode = mapping->host;
ci = ceph_inode(inode);
- /*
- * Note that we're grabbing a snapc ref here without holding
- * any locks!
- */
- snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
-
/* dirty the head */
spin_lock(&ci->i_ceph_lock);
- if (ci->i_head_snapc == NULL)
- ci->i_head_snapc = ceph_get_snap_context(snapc);
- ++ci->i_wrbuffer_ref_head;
+ BUG_ON(ci->i_wr_ref == 0); /* caller should hold Fw reference */

+ if (__ceph_have_pending_cap_snap(ci)) {
+ struct ceph_cap_snap *capsnap =
+ list_last_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap,
+ ci_item);
+ snapc = ceph_get_snap_context(capsnap->context);
+ capsnap->dirty_pages++;
+ } else {
+ BUG_ON(!ci->i_head_snapc);
+ snapc = ceph_get_snap_context(ci->i_head_snapc);
+ ++ci->i_wrbuffer_ref_head;
+ }
if (ci->i_wrbuffer_ref == 0)
ihold(inode);
++ci->i_wrbuffer_ref;
@@ -346,7 +350,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
/* build page vector */
nr_pages = calc_pages_for(0, len);
- pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
+ pages = kmalloc(sizeof(*pages) * nr_pages, GFP_KERNEL);
ret = -ENOMEM;
if (!pages)
goto out;
@@ -358,7 +362,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
dout("start_read %p adding %p idx %lu\n", inode, page,
page->index);
if (add_to_page_cache_lru(page, &inode->i_data, page->index,
- GFP_NOFS)) {
+ GFP_KERNEL)) {
ceph_fscache_uncache_page(inode, page);
page_cache_release(page);
dout("start_read %p add_to_page_cache failed %p\n",
@@ -436,7 +440,7 @@ out:
* only snap context we are allowed to write back.
*/
static struct ceph_snap_context *get_oldest_context(struct inode *inode,
- u64 *snap_size)
+ loff_t *snap_size)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_snap_context *snapc = NULL;
@@ -476,8 +480,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
struct ceph_osd_client *osdc;
struct ceph_snap_context *snapc, *oldest;
loff_t page_off = page_offset(page);
+ loff_t snap_size = -1;
long writeback_stat;
- u64 truncate_size, snap_size = 0;
+ u64 truncate_size;
u32 truncate_seq;
int err = 0, len = PAGE_CACHE_SIZE;
@@ -512,7 +517,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
spin_lock(&ci->i_ceph_lock);
truncate_seq = ci->i_truncate_seq;
truncate_size = ci->i_truncate_size;
- if (!snap_size)
+ if (snap_size == -1)
snap_size = i_size_read(inode);
spin_unlock(&ci->i_ceph_lock);
@@ -695,7 +700,8 @@ static int ceph_writepages_start(struct address_space *mapping,
unsigned wsize = 1 << inode->i_blkbits;
struct ceph_osd_request *req = NULL;
int do_sync = 0;
- u64 truncate_size, snap_size;
+ loff_t snap_size, i_size;
+ u64 truncate_size;
u32 truncate_seq;
/*
@@ -741,7 +747,7 @@ static int ceph_writepages_start(struct address_space *mapping,
retry:
/* find oldest snap context with dirty data */
ceph_put_snap_context(snapc);
- snap_size = 0;
+ snap_size = -1;
snapc = get_oldest_context(inode, &snap_size);
if (!snapc) {
/* hmm, why does writepages get called when there
@@ -749,16 +755,13 @@ retry:
dout(" no snap context with dirty data?\n");
goto out;
}
- if (snap_size == 0)
- snap_size = i_size_read(inode);
dout(" oldest snapc is %p seq %lld (%d snaps)\n",
snapc, snapc->seq, snapc->num_snaps);
spin_lock(&ci->i_ceph_lock);
truncate_seq = ci->i_truncate_seq;
truncate_size = ci->i_truncate_size;
- if (!snap_size)
- snap_size = i_size_read(inode);
+ i_size = i_size_read(inode);
spin_unlock(&ci->i_ceph_lock);
if (last_snapc && snapc != last_snapc) {
@@ -828,8 +831,10 @@ get_more_pages:
dout("waiting on writeback %p\n", page);
wait_on_page_writeback(page);
}
- if (page_offset(page) >= snap_size) {
- dout("%p page eof %llu\n", page, snap_size);
+ if (page_offset(page) >=
+ (snap_size == -1 ? i_size : snap_size)) {
+ dout("%p page eof %llu\n", page,
+ (snap_size == -1 ? i_size : snap_size));
done = 1;
unlock_page(page);
break;
@@ -884,7 +889,8 @@ get_more_pages:
}
if (do_sync)
- osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
+ osd_req_op_init(req, 1,
+ CEPH_OSD_OP_STARTSYNC, 0);
req->r_callback = writepages_finish;
req->r_inode = inode;
@@ -944,10 +950,18 @@ get_more_pages:
}
/* Format the osd request message and submit the write */
-
offset = page_offset(pages[0]);
- len = min(snap_size - offset,
- (u64)locked_pages << PAGE_CACHE_SHIFT);
+ len = (u64)locked_pages << PAGE_CACHE_SHIFT;
+ if (snap_size == -1) {
+ len = min(len, (u64)i_size_read(inode) - offset);
+ /* writepages_finish() clears writeback pages
+ * according to the data length, so make sure
+ * data length covers all locked pages */
+ len = max(len, 1 +
+ ((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT));
+ } else {
+ len = min(len, snap_size - offset);
+ }
dout("writepages got %d pages at %llu~%llu\n",
locked_pages, offset, len);
@@ -1032,7 +1046,6 @@ static int ceph_update_writeable_page(struct file *file,
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
loff_t page_off = pos & PAGE_CACHE_MASK;
int pos_in_page = pos & ~PAGE_CACHE_MASK;
int end_in_page = pos_in_page + len;
@@ -1044,10 +1057,6 @@ retry_locked:
/* writepages currently holds page lock, but if we change that later, */
wait_on_page_writeback(page);
- /* check snap context */
- BUG_ON(!ci->i_snap_realm);
- down_read(&mdsc->snap_rwsem);
- BUG_ON(!ci->i_snap_realm->cached_context);
snapc = page_snap_context(page);
if (snapc && snapc != ci->i_head_snapc) {
/*
@@ -1055,7 +1064,6 @@ retry_locked:
* context! is it writeable now?
*/
oldest = get_oldest_context(inode, NULL);
- up_read(&mdsc->snap_rwsem);
if (snapc->seq > oldest->seq) {
ceph_put_snap_context(oldest);
@@ -1112,7 +1120,6 @@ retry_locked:
}
/* we need to read it. */
- up_read(&mdsc->snap_rwsem);
r = readpage_nounlock(file, page);
if (r < 0)
goto fail_nosnap;
@@ -1157,16 +1164,13 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
/*
* we don't do anything in here that simple_write_end doesn't do
- * except adjust dirty page accounting and drop read lock on
- * mdsc->snap_rwsem.
+ * except adjust dirty page accounting
*/
static int ceph_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
struct inode *inode = file_inode(file);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_mds_client *mdsc = fsc->mdsc;
unsigned from = pos & (PAGE_CACHE_SIZE - 1);
int check_cap = 0;
@@ -1188,7 +1192,6 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
set_page_dirty(page);
unlock_page(page);
- up_read(&mdsc->snap_rwsem);
page_cache_release(page);
if (check_cap)
@@ -1314,13 +1317,17 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
struct inode *inode = file_inode(vma->vm_file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_info *fi = vma->vm_file->private_data;
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_cap_flush *prealloc_cf;
struct page *page = vmf->page;
loff_t off = page_offset(page);
loff_t size = i_size_read(inode);
size_t len;
int want, got, ret;
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return VM_FAULT_SIGBUS;
+
if (ci->i_inline_version != CEPH_INLINE_NONE) {
struct page *locked_page = NULL;
if (off == 0) {
@@ -1330,8 +1337,10 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = ceph_uninline_data(vma->vm_file, locked_page);
if (locked_page)
unlock_page(locked_page);
- if (ret < 0)
- return VM_FAULT_SIGBUS;
+ if (ret < 0) {
+ ret = VM_FAULT_SIGBUS;
+ goto out_free;
+ }
}
if (off + PAGE_CACHE_SIZE <= size)
@@ -1353,7 +1362,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
break;
if (ret != -ERESTARTSYS) {
WARN_ON(1);
- return VM_FAULT_SIGBUS;
+ ret = VM_FAULT_SIGBUS;
+ goto out_free;
}
}
dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
@@ -1373,7 +1383,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (ret == 0) {
/* success. we'll keep the page locked. */
set_page_dirty(page);
- up_read(&mdsc->snap_rwsem);
ret = VM_FAULT_LOCKED;
} else {
if (ret == -ENOMEM)
@@ -1389,7 +1398,8 @@ out:
int dirty;
spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE;
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+ &prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
@@ -1398,6 +1408,8 @@ out:
dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n",
inode, off, len, ceph_cap_string(got), ret);
ceph_put_cap_refs(ci, got);
+out_free:
+ ceph_free_cap_flush(prealloc_cf);
return ret;
}
@@ -1509,8 +1521,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ceph_vino(inode), 0, &len, 0, 1,
CEPH_OSD_OP_CREATE,
CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
- ci->i_snap_realm->cached_context,
- 0, 0, false);
+ ceph_empty_snapc, 0, 0, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto out;
@@ -1528,7 +1539,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ceph_vino(inode), 0, &len, 1, 3,
CEPH_OSD_OP_WRITE,
CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
- ci->i_snap_realm->cached_context,
+ ceph_empty_snapc,
ci->i_truncate_seq, ci->i_truncate_size,
false);
if (IS_ERR(req)) {
@@ -1582,7 +1593,7 @@ out:
return err;
}
-static struct vm_operations_struct ceph_vmops = {
+static const struct vm_operations_struct ceph_vmops = {
.fault = ceph_filemap_fault,
.page_mkwrite = ceph_page_mkwrite,
};
@@ -1597,3 +1608,206 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma)
vma->vm_ops = &ceph_vmops;
return 0;
}
+
+enum {
+ POOL_READ = 1,
+ POOL_WRITE = 2,
+};
+
+static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
+{
+ struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_osd_request *rd_req = NULL, *wr_req = NULL;
+ struct rb_node **p, *parent;
+ struct ceph_pool_perm *perm;
+ struct page **pages;
+ int err = 0, err2 = 0, have = 0;
+
+ down_read(&mdsc->pool_perm_rwsem);
+ p = &mdsc->pool_perm_tree.rb_node;
+ while (*p) {
+ perm = rb_entry(*p, struct ceph_pool_perm, node);
+ if (pool < perm->pool)
+ p = &(*p)->rb_left;
+ else if (pool > perm->pool)
+ p = &(*p)->rb_right;
+ else {
+ have = perm->perm;
+ break;
+ }
+ }
+ up_read(&mdsc->pool_perm_rwsem);
+ if (*p)
+ goto out;
+
+ dout("__ceph_pool_perm_get pool %u no perm cached\n", pool);
+
+ down_write(&mdsc->pool_perm_rwsem);
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ perm = rb_entry(parent, struct ceph_pool_perm, node);
+ if (pool < perm->pool)
+ p = &(*p)->rb_left;
+ else if (pool > perm->pool)
+ p = &(*p)->rb_right;
+ else {
+ have = perm->perm;
+ break;
+ }
+ }
+ if (*p) {
+ up_write(&mdsc->pool_perm_rwsem);
+ goto out;
+ }
+
+ rd_req = ceph_osdc_alloc_request(&fsc->client->osdc,
+ ceph_empty_snapc,
+ 1, false, GFP_NOFS);
+ if (!rd_req) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ rd_req->r_flags = CEPH_OSD_FLAG_READ;
+ osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
+ rd_req->r_base_oloc.pool = pool;
+ snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name),
+ "%llx.00000000", ci->i_vino.ino);
+ rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
+
+ wr_req = ceph_osdc_alloc_request(&fsc->client->osdc,
+ ceph_empty_snapc,
+ 1, false, GFP_NOFS);
+ if (!wr_req) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ wr_req->r_flags = CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+ osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
+ wr_req->r_base_oloc.pool = pool;
+ wr_req->r_base_oid = rd_req->r_base_oid;
+
+ /* one page should be large enough for STAT data */
+ pages = ceph_alloc_page_vector(1, GFP_KERNEL);
+ if (IS_ERR(pages)) {
+ err = PTR_ERR(pages);
+ goto out_unlock;
+ }
+
+ osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
+ 0, false, true);
+ ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP,
+ &ci->vfs_inode.i_mtime);
+ err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
+
+ ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP,
+ &ci->vfs_inode.i_mtime);
+ err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
+
+ if (!err)
+ err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
+ if (!err2)
+ err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);
+
+ if (err >= 0 || err == -ENOENT)
+ have |= POOL_READ;
+ else if (err != -EPERM)
+ goto out_unlock;
+
+ if (err2 == 0 || err2 == -EEXIST)
+ have |= POOL_WRITE;
+ else if (err2 != -EPERM) {
+ err = err2;
+ goto out_unlock;
+ }
+
+ perm = kmalloc(sizeof(*perm), GFP_NOFS);
+ if (!perm) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ perm->pool = pool;
+ perm->perm = have;
+ rb_link_node(&perm->node, parent, p);
+ rb_insert_color(&perm->node, &mdsc->pool_perm_tree);
+ err = 0;
+out_unlock:
+ up_write(&mdsc->pool_perm_rwsem);
+
+ if (rd_req)
+ ceph_osdc_put_request(rd_req);
+ if (wr_req)
+ ceph_osdc_put_request(wr_req);
+out:
+ if (!err)
+ err = have;
+ dout("__ceph_pool_perm_get pool %u result = %d\n", pool, err);
+ return err;
+}
+
+int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
+{
+ u32 pool;
+ int ret, flags;
+
+ if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
+ NOPOOLPERM))
+ return 0;
+
+ spin_lock(&ci->i_ceph_lock);
+ flags = ci->i_ceph_flags;
+ pool = ceph_file_layout_pg_pool(ci->i_layout);
+ spin_unlock(&ci->i_ceph_lock);
+check:
+ if (flags & CEPH_I_POOL_PERM) {
+ if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
+ dout("ceph_pool_perm_check pool %u no read perm\n",
+ pool);
+ return -EPERM;
+ }
+ if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
+ dout("ceph_pool_perm_check pool %u no write perm\n",
+ pool);
+ return -EPERM;
+ }
+ return 0;
+ }
+
+ ret = __ceph_pool_perm_get(ci, pool);
+ if (ret < 0)
+ return ret;
+
+ flags = CEPH_I_POOL_PERM;
+ if (ret & POOL_READ)
+ flags |= CEPH_I_POOL_RD;
+ if (ret & POOL_WRITE)
+ flags |= CEPH_I_POOL_WR;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (pool == ceph_file_layout_pg_pool(ci->i_layout)) {
+ ci->i_ceph_flags = flags;
+ } else {
+ pool = ceph_file_layout_pg_pool(ci->i_layout);
+ flags = ci->i_ceph_flags;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ goto check;
+}
+
+void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc)
+{
+ struct ceph_pool_perm *perm;
+ struct rb_node *n;
+
+ while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) {
+ n = rb_first(&mdsc->pool_perm_tree);
+ perm = rb_entry(n, struct ceph_pool_perm, node);
+ rb_erase(n, &mdsc->pool_perm_tree);
+ kfree(perm);
+ }
+}
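
The new pool-permission cache probes each data pool once — a read (STAT) request and an exclusive-create write — and memoizes the outcome in an rb-tree under pool_perm_rwsem, so later opens answer from memory: -ENOENT counts as readable, -EEXIST as writable, and only -EPERM marks access denied. The actual call site is visible in the caps.c hunk further down (ceph_get_caps() calls ceph_pool_perm_check(ci, need) before trying to take cap references); in isolation the check is used like this (sketch):

	int err = ceph_pool_perm_check(ci, CEPH_CAP_FILE_WR);
	if (err < 0)
		return err;	/* -EPERM if the layout's pool forbids writing */
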
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index be5ea6af8366..ddd5e9471290 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -833,7 +833,9 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
used |= CEPH_CAP_PIN;
if (ci->i_rd_ref)
used |= CEPH_CAP_FILE_RD;
- if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages)
+ if (ci->i_rdcache_ref ||
+ (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */
+ ci->vfs_inode.i_data.nrpages))
used |= CEPH_CAP_FILE_CACHE;
if (ci->i_wr_ref)
used |= CEPH_CAP_FILE_WR;
@@ -926,16 +928,6 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
/* remove from session list */
spin_lock(&session->s_cap_lock);
- /*
- * s_cap_reconnect is protected by s_cap_lock. no one changes
- * s_cap_gen while session is in the reconnect state.
- */
- if (queue_release &&
- (!session->s_cap_reconnect ||
- cap->cap_gen == session->s_cap_gen))
- __queue_cap_release(session, ci->i_vino.ino, cap->cap_id,
- cap->mseq, cap->issue_seq);
-
if (session->s_cap_iterator == cap) {
/* not yet, we are iterating over this very cap */
dout("__ceph_remove_cap delaying %p removal from session %p\n",
@@ -948,6 +940,25 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
}
/* protect backpointer with s_cap_lock: see iterate_session_caps */
cap->ci = NULL;
+
+ /*
+ * s_cap_reconnect is protected by s_cap_lock. no one changes
+ * s_cap_gen while session is in the reconnect state.
+ */
+ if (queue_release &&
+ (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) {
+ cap->queue_release = 1;
+ if (removed) {
+ list_add_tail(&cap->session_caps,
+ &session->s_cap_releases);
+ session->s_num_cap_releases++;
+ removed = 0;
+ }
+ } else {
+ cap->queue_release = 0;
+ }
+ cap->cap_ino = ci->i_vino.ino;
+
spin_unlock(&session->s_cap_lock);
/* remove from inode list */
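
Instead of encoding a release message inline (see the __queue_cap_release() removal further down), a dead cap is now parked on session->s_cap_releases, reusing its session_caps linkage, with cap_ino/cap_id preserved for later encoding. The consumer presumably drains the list along these lines (a sketch assuming a list walk under s_cap_lock; not code from this patch):

	struct ceph_cap *cap, *tmp;

	spin_lock(&session->s_cap_lock);
	list_for_each_entry_safe(cap, tmp, &session->s_cap_releases, session_caps) {
		list_del(&cap->session_caps);
		session->s_num_cap_releases--;
		/* encode cap->cap_ino, cap->cap_id, cap->mseq, cap->issue_seq
		 * into a CEPH_MSG_CLIENT_CAPRELEASE message, then free the cap */
	}
	spin_unlock(&session->s_cap_lock);
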
@@ -977,8 +988,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
static int send_cap_msg(struct ceph_mds_session *session,
u64 ino, u64 cid, int op,
int caps, int wanted, int dirty,
- u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
- u64 size, u64 max_size,
+ u32 seq, u64 flush_tid, u64 oldest_flush_tid,
+ u32 issue_seq, u32 mseq, u64 size, u64 max_size,
struct timespec *mtime, struct timespec *atime,
u64 time_warp_seq,
kuid_t uid, kgid_t gid, umode_t mode,
@@ -992,20 +1003,23 @@ static int send_cap_msg(struct ceph_mds_session *session,
size_t extra_len;
dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
- " seq %u/%u mseq %u follows %lld size %llu/%llu"
+ " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
" xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
ceph_cap_string(dirty),
- seq, issue_seq, mseq, follows, size, max_size,
+ seq, issue_seq, flush_tid, oldest_flush_tid,
+ mseq, follows, size, max_size,
xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
- /* flock buffer size + inline version + inline data size */
- extra_len = 4 + 8 + 4;
+ /* flock buffer size + inline version + inline data size +
+ * osd_epoch_barrier + oldest_flush_tid */
+ extra_len = 4 + 8 + 4 + 4 + 8;
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
GFP_NOFS, false);
if (!msg)
return -ENOMEM;
+ msg->hdr.version = cpu_to_le16(6);
msg->hdr.tid = cpu_to_le64(flush_tid);
fc = msg->front.iov_base;
@@ -1041,6 +1055,10 @@ static int send_cap_msg(struct ceph_mds_session *session,
ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE);
/* inline data size */
ceph_encode_32(&p, 0);
+ /* osd_epoch_barrier */
+ ceph_encode_32(&p, 0);
+ /* oldest_flush_tid */
+ ceph_encode_64(&p, oldest_flush_tid);
fc->xattr_version = cpu_to_le64(xattr_version);
if (xattrs_buf) {
@@ -1053,44 +1071,6 @@ static int send_cap_msg(struct ceph_mds_session *session,
return 0;
}
-void __queue_cap_release(struct ceph_mds_session *session,
- u64 ino, u64 cap_id, u32 migrate_seq,
- u32 issue_seq)
-{
- struct ceph_msg *msg;
- struct ceph_mds_cap_release *head;
- struct ceph_mds_cap_item *item;
-
- BUG_ON(!session->s_num_cap_releases);
- msg = list_first_entry(&session->s_cap_releases,
- struct ceph_msg, list_head);
-
- dout(" adding %llx release to mds%d msg %p (%d left)\n",
- ino, session->s_mds, msg, session->s_num_cap_releases);
-
- BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
- head = msg->front.iov_base;
- le32_add_cpu(&head->num, 1);
- item = msg->front.iov_base + msg->front.iov_len;
- item->ino = cpu_to_le64(ino);
- item->cap_id = cpu_to_le64(cap_id);
- item->migrate_seq = cpu_to_le32(migrate_seq);
- item->seq = cpu_to_le32(issue_seq);
-
- session->s_num_cap_releases--;
-
- msg->front.iov_len += sizeof(*item);
- if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
- dout(" release msg %p full\n", msg);
- list_move_tail(&msg->list_head, &session->s_cap_releases_done);
- } else {
- dout(" release msg %p at %d/%d (%d)\n", msg,
- (int)le32_to_cpu(head->num),
- (int)CEPH_CAPS_PER_RELEASE,
- (int)msg->front.iov_len);
- }
-}
-
/*
* Queue cap releases when an inode is dropped from our cache. Since
* inode is about to be destroyed, there is no need for i_ceph_lock.
@@ -1127,7 +1107,7 @@ void ceph_queue_caps_release(struct inode *inode)
*/
static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
int op, int used, int want, int retain, int flushing,
- unsigned *pflush_tid)
+ u64 flush_tid, u64 oldest_flush_tid)
__releases(cap->ci->i_ceph_lock)
{
struct ceph_inode_info *ci = cap->ci;
@@ -1145,8 +1125,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
u64 xattr_version = 0;
struct ceph_buffer *xattr_blob = NULL;
int delayed = 0;
- u64 flush_tid = 0;
- int i;
int ret;
bool inline_data;
@@ -1190,26 +1168,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
cap->implemented &= cap->issued | used;
cap->mds_wanted = want;
- if (flushing) {
- /*
- * assign a tid for flush operations so we can avoid
- * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
- * clean type races. track latest tid for every bit
- * so we can handle flush AxFw, flush Fw, and have the
- * first ack clean Ax.
- */
- flush_tid = ++ci->i_cap_flush_last_tid;
- if (pflush_tid)
- *pflush_tid = flush_tid;
- dout(" cap_flush_tid %d\n", (int)flush_tid);
- for (i = 0; i < CEPH_CAP_BITS; i++)
- if (flushing & (1 << i))
- ci->i_cap_flush_tid[i] = flush_tid;
-
- follows = ci->i_head_snapc->seq;
- } else {
- follows = 0;
- }
+ follows = flushing ? ci->i_head_snapc->seq : 0;
keep = cap->implemented;
seq = cap->seq;
@@ -1237,7 +1196,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
spin_unlock(&ci->i_ceph_lock);
ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
- op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
+ op, keep, want, flushing, seq,
+ flush_tid, oldest_flush_tid, issue_seq, mseq,
size, max_size, &mtime, &atime, time_warp_seq,
uid, gid, mode, xattr_version, xattr_blob,
follows, inline_data);
@@ -1259,14 +1219,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
* asynchronously back to the MDS once sync writes complete and dirty
* data is written out.
*
- * Unless @again is true, skip cap_snaps that were already sent to
+ * Unless @kick is true, skip cap_snaps that were already sent to
* the MDS (i.e., during this session).
*
* Called under i_ceph_lock. Takes s_mutex as needed.
*/
void __ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession,
- int again)
+ int kick)
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
@@ -1297,11 +1257,8 @@ retry:
if (capsnap->dirty_pages || capsnap->writing)
break;
- /*
- * if cap writeback already occurred, we should have dropped
- * the capsnap in ceph_put_wrbuffer_cap_refs.
- */
- BUG_ON(capsnap->dirty == 0);
+ /* should be removed by ceph_try_drop_cap_snap() */
+ BUG_ON(!capsnap->need_flush);
/* pick mds, take s_mutex */
if (ci->i_auth_cap == NULL) {
@@ -1310,7 +1267,7 @@ retry:
}
/* only flush each capsnap once */
- if (!again && !list_empty(&capsnap->flushing_item)) {
+ if (!kick && !list_empty(&capsnap->flushing_item)) {
dout("already flushed %p, skipping\n", capsnap);
continue;
}
@@ -1320,6 +1277,9 @@ retry:
if (session && session->s_mds != mds) {
dout("oops, wrong session %p mutex\n", session);
+ if (kick)
+ goto out;
+
mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
session = NULL;
@@ -1343,20 +1303,22 @@ retry:
goto retry;
}
- capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
+ spin_lock(&mdsc->cap_dirty_lock);
+ capsnap->flush_tid = ++mdsc->last_cap_flush_tid;
+ spin_unlock(&mdsc->cap_dirty_lock);
+
atomic_inc(&capsnap->nref);
- if (!list_empty(&capsnap->flushing_item))
- list_del_init(&capsnap->flushing_item);
- list_add_tail(&capsnap->flushing_item,
- &session->s_cap_snaps_flushing);
+ if (list_empty(&capsnap->flushing_item))
+ list_add_tail(&capsnap->flushing_item,
+ &session->s_cap_snaps_flushing);
spin_unlock(&ci->i_ceph_lock);
dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
inode, capsnap, capsnap->follows, capsnap->flush_tid);
send_cap_msg(session, ceph_vino(inode).ino, 0,
CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
- capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
- capsnap->size, 0,
+ capsnap->dirty, 0, capsnap->flush_tid, 0,
+ 0, mseq, capsnap->size, 0,
&capsnap->mtime, &capsnap->atime,
capsnap->time_warp_seq,
capsnap->uid, capsnap->gid, capsnap->mode,
@@ -1396,7 +1358,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
* Caller is then responsible for calling __mark_inode_dirty with the
* returned flags value.
*/
-int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
+int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
+ struct ceph_cap_flush **pcf)
{
struct ceph_mds_client *mdsc =
ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
@@ -1416,9 +1379,14 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
ceph_cap_string(was | mask));
ci->i_dirty_caps |= mask;
if (was == 0) {
- if (!ci->i_head_snapc)
+ WARN_ON_ONCE(ci->i_prealloc_cap_flush);
+ swap(ci->i_prealloc_cap_flush, *pcf);
+
+ if (!ci->i_head_snapc) {
+ WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem));
ci->i_head_snapc = ceph_get_snap_context(
ci->i_snap_realm->cached_context);
+ }
dout(" inode %p now dirty snapc %p auth cap %p\n",
&ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
BUG_ON(!list_empty(&ci->i_dirty_item));
@@ -1429,6 +1397,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
ihold(inode);
dirty |= I_DIRTY_SYNC;
}
+ } else {
+ WARN_ON_ONCE(!ci->i_prealloc_cap_flush);
}
BUG_ON(list_empty(&ci->i_dirty_item));
if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
@@ -1438,6 +1408,74 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
return dirty;
}
+static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci,
+ struct ceph_cap_flush *cf)
+{
+ struct rb_node **p = &ci->i_cap_flush_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_cap_flush *other = NULL;
+
+ while (*p) {
+ parent = *p;
+ other = rb_entry(parent, struct ceph_cap_flush, i_node);
+
+ if (cf->tid < other->tid)
+ p = &(*p)->rb_left;
+ else if (cf->tid > other->tid)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&cf->i_node, parent, p);
+ rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree);
+}
+
+static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc,
+ struct ceph_cap_flush *cf)
+{
+ struct rb_node **p = &mdsc->cap_flush_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_cap_flush *other = NULL;
+
+ while (*p) {
+ parent = *p;
+ other = rb_entry(parent, struct ceph_cap_flush, g_node);
+
+ if (cf->tid < other->tid)
+ p = &(*p)->rb_left;
+ else if (cf->tid > other->tid)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&cf->g_node, parent, p);
+ rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree);
+}
+
+struct ceph_cap_flush *ceph_alloc_cap_flush(void)
+{
+ return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
+}
+
+void ceph_free_cap_flush(struct ceph_cap_flush *cf)
+{
+ if (cf)
+ kmem_cache_free(ceph_cap_flush_cachep, cf);
+}
+
+static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
+{
+ struct rb_node *n = rb_first(&mdsc->cap_flush_tree);
+ if (n) {
+ struct ceph_cap_flush *cf =
+ rb_entry(n, struct ceph_cap_flush, g_node);
+ return cf->tid;
+ }
+ return 0;
+}
+
/*
* Add dirty inode to the flushing list. Assigned a seq number so we
* can wait for caps to flush without starving.
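
ceph_alloc_cap_flush()/ceph_free_cap_flush() exist so callers can allocate outside i_ceph_lock and hand the buffer over under the lock via the i_prealloc_cap_flush swap in __ceph_mark_dirty_caps() above. The pattern from a caller's perspective (a sketch; ceph_page_mkwrite() in the addr.c hunk earlier does exactly this):

	struct ceph_cap_flush *prealloc_cf = ceph_alloc_cap_flush();
	int dirty;

	if (!prealloc_cf)
		return -ENOMEM;		/* allocate before taking any spinlock */

	spin_lock(&ci->i_ceph_lock);
	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf);
	spin_unlock(&ci->i_ceph_lock);

	ceph_free_cap_flush(prealloc_cf);	/* NULL here if it was consumed */
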
@@ -1445,14 +1483,17 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
* Called under i_ceph_lock.
*/
static int __mark_caps_flushing(struct inode *inode,
- struct ceph_mds_session *session)
+ struct ceph_mds_session *session,
+ u64 *flush_tid, u64 *oldest_flush_tid)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap_flush *cf = NULL;
int flushing;
BUG_ON(ci->i_dirty_caps == 0);
BUG_ON(list_empty(&ci->i_dirty_item));
+ BUG_ON(!ci->i_prealloc_cap_flush);
flushing = ci->i_dirty_caps;
dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
@@ -1463,22 +1504,30 @@ static int __mark_caps_flushing(struct inode *inode,
ci->i_dirty_caps = 0;
dout(" inode %p now !dirty\n", inode);
+ swap(cf, ci->i_prealloc_cap_flush);
+ cf->caps = flushing;
+
spin_lock(&mdsc->cap_dirty_lock);
list_del_init(&ci->i_dirty_item);
+ cf->tid = ++mdsc->last_cap_flush_tid;
+ __add_cap_flushing_to_mdsc(mdsc, cf);
+ *oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+
if (list_empty(&ci->i_flushing_item)) {
- ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
mdsc->num_cap_flushing++;
- dout(" inode %p now flushing seq %lld\n", inode,
- ci->i_cap_flush_seq);
+ dout(" inode %p now flushing tid %llu\n", inode, cf->tid);
} else {
list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
- dout(" inode %p now flushing (more) seq %lld\n", inode,
- ci->i_cap_flush_seq);
+ dout(" inode %p now flushing (more) tid %llu\n",
+ inode, cf->tid);
}
spin_unlock(&mdsc->cap_dirty_lock);
+ __add_cap_flushing_to_inode(ci, cf);
+
+ *flush_tid = cf->tid;
return flushing;
}
@@ -1524,6 +1573,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
+ u64 flush_tid, oldest_flush_tid;
int file_wanted, used, cap_used;
int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
int issued, implemented, want, retain, revoking, flushing = 0;
@@ -1553,13 +1603,13 @@ retry:
retry_locked:
file_wanted = __ceph_caps_file_wanted(ci);
used = __ceph_caps_used(ci);
- want = file_wanted | used;
issued = __ceph_caps_issued(ci, &implemented);
revoking = implemented & ~issued;
- retain = want | CEPH_CAP_PIN;
+ want = file_wanted;
+ retain = file_wanted | used | CEPH_CAP_PIN;
if (!mdsc->stopping && inode->i_nlink > 0) {
- if (want) {
+ if (file_wanted) {
retain |= CEPH_CAP_ANY; /* be greedy */
} else if (S_ISDIR(inode->i_mode) &&
(issued & CEPH_CAP_FILE_SHARED) &&
@@ -1602,9 +1652,10 @@ retry_locked:
* If we fail, it's because pages are locked.... try again later.
*/
if ((!is_delayed || mdsc->stopping) &&
- ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
- inode->i_data.nrpages && /* have cached pages */
- (file_wanted == 0 || /* no open files */
+ !S_ISDIR(inode->i_mode) && /* ignore readdir cache */
+ ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
+ inode->i_data.nrpages && /* have cached pages */
+ (file_wanted == 0 || /* no open files */
(revoking & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
!tried_invalidate) {
@@ -1742,17 +1793,25 @@ ack:
took_snap_rwsem = 1;
}
- if (cap == ci->i_auth_cap && ci->i_dirty_caps)
- flushing = __mark_caps_flushing(inode, session);
- else
+ if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
+ flushing = __mark_caps_flushing(inode, session,
+ &flush_tid,
+ &oldest_flush_tid);
+ } else {
flushing = 0;
+ flush_tid = 0;
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+ }
mds = cap->mds; /* remember mds, so we don't repeat */
sent++;
/* __send_cap drops i_ceph_lock */
delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
- want, retain, flushing, NULL);
+ want, retain, flushing,
+ flush_tid, oldest_flush_tid);
goto retry; /* retake i_ceph_lock and restart our cap scan. */
}
@@ -1781,12 +1840,13 @@ ack:
/*
* Try to flush dirty caps back to the auth mds.
*/
-static int try_flush_caps(struct inode *inode, unsigned *flush_tid)
+static int try_flush_caps(struct inode *inode, u64 *ptid)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
- int flushing = 0;
struct ceph_mds_session *session = NULL;
+ int flushing = 0;
+ u64 flush_tid = 0, oldest_flush_tid = 0;
retry:
spin_lock(&ci->i_ceph_lock);
@@ -1811,42 +1871,54 @@ retry:
if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
goto out;
- flushing = __mark_caps_flushing(inode, session);
+ flushing = __mark_caps_flushing(inode, session, &flush_tid,
+ &oldest_flush_tid);
/* __send_cap drops i_ceph_lock */
delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
- cap->issued | cap->implemented, flushing,
- flush_tid);
- if (!delayed)
- goto out_unlocked;
+ (cap->issued | cap->implemented),
+ flushing, flush_tid, oldest_flush_tid);
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
+ if (delayed) {
+ spin_lock(&ci->i_ceph_lock);
+ __cap_delay_requeue(mdsc, ci);
+ spin_unlock(&ci->i_ceph_lock);
+ }
+ } else {
+ struct rb_node *n = rb_last(&ci->i_cap_flush_tree);
+ if (n) {
+ struct ceph_cap_flush *cf =
+ rb_entry(n, struct ceph_cap_flush, i_node);
+ flush_tid = cf->tid;
+ }
+ flushing = ci->i_flushing_caps;
+ spin_unlock(&ci->i_ceph_lock);
}
out:
- spin_unlock(&ci->i_ceph_lock);
-out_unlocked:
if (session)
mutex_unlock(&session->s_mutex);
+
+ *ptid = flush_tid;
return flushing;
}
/*
* Return true if we've flushed caps through the given flush_tid.
*/
-static int caps_are_flushed(struct inode *inode, unsigned tid)
+static int caps_are_flushed(struct inode *inode, u64 flush_tid)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- int i, ret = 1;
+ struct ceph_cap_flush *cf;
+ struct rb_node *n;
+ int ret = 1;
spin_lock(&ci->i_ceph_lock);
- for (i = 0; i < CEPH_CAP_BITS; i++)
- if ((ci->i_flushing_caps & (1 << i)) &&
- ci->i_cap_flush_tid[i] <= tid) {
- /* still flushing this bit */
+ n = rb_first(&ci->i_cap_flush_tree);
+ if (n) {
+ cf = rb_entry(n, struct ceph_cap_flush, i_node);
+ if (cf->tid <= flush_tid)
ret = 0;
- break;
- }
+ }
spin_unlock(&ci->i_ceph_lock);
return ret;
}
@@ -1864,13 +1936,16 @@ static void sync_write_wait(struct inode *inode)
struct ceph_osd_request *req;
u64 last_tid;
+ if (!S_ISREG(inode->i_mode))
+ return;
+
spin_lock(&ci->i_unsafe_lock);
if (list_empty(head))
goto out;
/* set upper bound as _last_ entry in chain */
- req = list_entry(head->prev, struct ceph_osd_request,
- r_unsafe_item);
+ req = list_last_entry(head, struct ceph_osd_request,
+ r_unsafe_item);
last_tid = req->r_tid;
do {
@@ -1888,18 +1963,64 @@ static void sync_write_wait(struct inode *inode)
*/
if (list_empty(head))
break;
- req = list_entry(head->next, struct ceph_osd_request,
- r_unsafe_item);
+ req = list_first_entry(head, struct ceph_osd_request,
+ r_unsafe_item);
} while (req->r_tid < last_tid);
out:
spin_unlock(&ci->i_unsafe_lock);
}
+/*
+ * wait for any uncommitted directory operations to commit.
+ */
+static int unsafe_dirop_wait(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct list_head *head = &ci->i_unsafe_dirops;
+ struct ceph_mds_request *req;
+ u64 last_tid;
+ int ret = 0;
+
+ if (!S_ISDIR(inode->i_mode))
+ return 0;
+
+ spin_lock(&ci->i_unsafe_lock);
+ if (list_empty(head))
+ goto out;
+
+ req = list_last_entry(head, struct ceph_mds_request,
+ r_unsafe_dir_item);
+ last_tid = req->r_tid;
+
+ do {
+ ceph_mdsc_get_request(req);
+ spin_unlock(&ci->i_unsafe_lock);
+
+ dout("unsafe_dirop_wait %p wait on tid %llu (until %llu)\n",
+ inode, req->r_tid, last_tid);
+ ret = !wait_for_completion_timeout(&req->r_safe_completion,
+ ceph_timeout_jiffies(req->r_timeout));
+ if (ret)
+ ret = -EIO; /* timed out */
+
+ ceph_mdsc_put_request(req);
+
+ spin_lock(&ci->i_unsafe_lock);
+ if (ret || list_empty(head))
+ break;
+ req = list_first_entry(head, struct ceph_mds_request,
+ r_unsafe_dir_item);
+ } while (req->r_tid < last_tid);
+out:
+ spin_unlock(&ci->i_unsafe_lock);
+ return ret;
+}
+
int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
- unsigned flush_tid;
+ u64 flush_tid;
int ret;
int dirty;
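
unsafe_dirop_wait() gives fsync on a directory something to wait for: MDS requests that have so far received only an unsafe (unjournaled) reply. The registration side is not in this hunk; presumably the MDS reply handler links the request like this (hypothetical placement, field names taken from above):

	/* hypothetical registration on receiving an unsafe reply for a dir op */
	spin_lock(&ci->i_unsafe_lock);
	list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
	spin_unlock(&ci->i_unsafe_lock);
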
@@ -1908,25 +2029,30 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret < 0)
- return ret;
+ goto out;
+
+ if (datasync)
+ goto out;
+
mutex_lock(&inode->i_mutex);
dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
+ ret = unsafe_dirop_wait(inode);
+
/*
* only wait on non-file metadata writeback (the mds
* can recover size and mtime, so we don't need to
* wait for that)
*/
- if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
- dout("fsync waiting for flush_tid %u\n", flush_tid);
+ if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
ret = wait_event_interruptible(ci->i_cap_wq,
- caps_are_flushed(inode, flush_tid));
+ caps_are_flushed(inode, flush_tid));
}
-
- dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
mutex_unlock(&inode->i_mutex);
+out:
+ dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
return ret;
}
@@ -1939,7 +2065,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- unsigned flush_tid;
+ u64 flush_tid;
int err = 0;
int dirty;
int wait = wbc->sync_mode == WB_SYNC_ALL;
@@ -1994,6 +2120,93 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
}
}
+static int __kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_inode_info *ci)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap *cap;
+ struct ceph_cap_flush *cf;
+ struct rb_node *n;
+ int delayed = 0;
+ u64 first_tid = 0;
+ u64 oldest_flush_tid;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ while (true) {
+ spin_lock(&ci->i_ceph_lock);
+ cap = ci->i_auth_cap;
+ if (!(cap && cap->session == session)) {
+ pr_err("%p auth cap %p not mds%d ???\n", inode,
+ cap, session->s_mds);
+ spin_unlock(&ci->i_ceph_lock);
+ break;
+ }
+
+ for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) {
+ cf = rb_entry(n, struct ceph_cap_flush, i_node);
+ if (cf->tid >= first_tid)
+ break;
+ }
+ if (!n) {
+ spin_unlock(&ci->i_ceph_lock);
+ break;
+ }
+
+ cf = rb_entry(n, struct ceph_cap_flush, i_node);
+
+ first_tid = cf->tid + 1;
+
+ dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode,
+ cap, cf->tid, ceph_cap_string(cf->caps));
+ delayed |= __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+ __ceph_caps_used(ci),
+ __ceph_caps_wanted(ci),
+ cap->issued | cap->implemented,
+ cf->caps, cf->tid, oldest_flush_tid);
+ }
+ return delayed;
+}
+
+void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_inode_info *ci;
+ struct ceph_cap *cap;
+
+ dout("early_kick_flushing_caps mds%d\n", session->s_mds);
+ list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+ spin_lock(&ci->i_ceph_lock);
+ cap = ci->i_auth_cap;
+ if (!(cap && cap->session == session)) {
+ pr_err("%p auth cap %p not mds%d ???\n",
+ &ci->vfs_inode, cap, session->s_mds);
+ spin_unlock(&ci->i_ceph_lock);
+ continue;
+ }
+
+ /*
+ * if flushing caps were revoked, we re-send the cap flush
+ * in the client reconnect stage. This guarantees the MDS
+ * processes the cap flush message before issuing the
+ * flushing caps to other clients.
+ */
+ if ((cap->issued & ci->i_flushing_caps) !=
+ ci->i_flushing_caps) {
+ spin_unlock(&ci->i_ceph_lock);
+ if (!__kick_flushing_caps(mdsc, session, ci))
+ continue;
+ spin_lock(&ci->i_ceph_lock);
+ }
+
+ spin_unlock(&ci->i_ceph_lock);
+ }
+}
+
void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
@@ -2003,28 +2216,10 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
dout("kick_flushing_caps mds%d\n", session->s_mds);
list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap *cap;
- int delayed = 0;
-
- spin_lock(&ci->i_ceph_lock);
- cap = ci->i_auth_cap;
- if (cap && cap->session == session) {
- dout("kick_flushing_caps %p cap %p %s\n", inode,
- cap, ceph_cap_string(ci->i_flushing_caps));
- delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
- __ceph_caps_used(ci),
- __ceph_caps_wanted(ci),
- cap->issued | cap->implemented,
- ci->i_flushing_caps, NULL);
- if (delayed) {
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
- spin_unlock(&ci->i_ceph_lock);
- }
- } else {
- pr_err("%p auth cap %p not mds%d ???\n", inode,
- cap, session->s_mds);
+ int delayed = __kick_flushing_caps(mdsc, session, ci);
+ if (delayed) {
+ spin_lock(&ci->i_ceph_lock);
+ __cap_delay_requeue(mdsc, ci);
spin_unlock(&ci->i_ceph_lock);
}
}
@@ -2036,26 +2231,25 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_cap *cap;
- int delayed = 0;
spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
- dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
- ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+ dout("kick_flushing_inode_caps %p flushing %s\n", inode,
+ ceph_cap_string(ci->i_flushing_caps));
__ceph_flush_snaps(ci, &session, 1);
if (ci->i_flushing_caps) {
+ int delayed;
+
spin_lock(&mdsc->cap_dirty_lock);
list_move_tail(&ci->i_flushing_item,
&cap->session->s_cap_flushing);
spin_unlock(&mdsc->cap_dirty_lock);
- delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
- __ceph_caps_used(ci),
- __ceph_caps_wanted(ci),
- cap->issued | cap->implemented,
- ci->i_flushing_caps, NULL);
+ spin_unlock(&ci->i_ceph_lock);
+
+ delayed = __kick_flushing_caps(mdsc, session, ci);
if (delayed) {
spin_lock(&ci->i_ceph_lock);
__cap_delay_requeue(mdsc, ci);
@@ -2073,7 +2267,8 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
*
* Protected by i_ceph_lock.
*/
-static void __take_cap_refs(struct ceph_inode_info *ci, int got)
+static void __take_cap_refs(struct ceph_inode_info *ci, int got,
+ bool snap_rwsem_locked)
{
if (got & CEPH_CAP_PIN)
ci->i_pin_ref++;
@@ -2081,8 +2276,14 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got)
ci->i_rd_ref++;
if (got & CEPH_CAP_FILE_CACHE)
ci->i_rdcache_ref++;
- if (got & CEPH_CAP_FILE_WR)
+ if (got & CEPH_CAP_FILE_WR) {
+ if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
+ BUG_ON(!snap_rwsem_locked);
+ ci->i_head_snapc = ceph_get_snap_context(
+ ci->i_snap_realm->cached_context);
+ }
ci->i_wr_ref++;
+ }
if (got & CEPH_CAP_FILE_BUFFER) {
if (ci->i_wb_ref == 0)
ihold(&ci->vfs_inode);
@@ -2100,16 +2301,19 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got)
* requested from the MDS.
*/
static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
- loff_t endoff, int *got, int *check_max, int *err)
+ loff_t endoff, bool nonblock, int *got, int *err)
{
struct inode *inode = &ci->vfs_inode;
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
int ret = 0;
int have, implemented;
int file_wanted;
+ bool snap_rwsem_locked = false;
dout("get_cap_refs %p need %s want %s\n", inode,
ceph_cap_string(need), ceph_cap_string(want));
+again:
spin_lock(&ci->i_ceph_lock);
/* make sure file is actually open */
@@ -2125,6 +2329,10 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
/* finish pending truncate */
while (ci->i_truncate_pending) {
spin_unlock(&ci->i_ceph_lock);
+ if (snap_rwsem_locked) {
+ up_read(&mdsc->snap_rwsem);
+ snap_rwsem_locked = false;
+ }
__ceph_do_pending_vmtruncate(inode);
spin_lock(&ci->i_ceph_lock);
}
@@ -2136,7 +2344,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
inode, endoff, ci->i_max_size);
if (endoff > ci->i_requested_max_size) {
- *check_max = 1;
+ *err = -EAGAIN;
ret = 1;
}
goto out_unlock;
@@ -2164,8 +2372,29 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
inode, ceph_cap_string(have), ceph_cap_string(not),
ceph_cap_string(revoking));
if ((revoking & not) == 0) {
+ if (!snap_rwsem_locked &&
+ !ci->i_head_snapc &&
+ (need & CEPH_CAP_FILE_WR)) {
+ if (!down_read_trylock(&mdsc->snap_rwsem)) {
+ /*
+ * we cannot call down_read() when the
+ * task isn't in TASK_RUNNING state
+ */
+ if (nonblock) {
+ *err = -EAGAIN;
+ ret = 1;
+ goto out_unlock;
+ }
+
+ spin_unlock(&ci->i_ceph_lock);
+ down_read(&mdsc->snap_rwsem);
+ snap_rwsem_locked = true;
+ goto again;
+ }
+ snap_rwsem_locked = true;
+ }
*got = need | (have & want);
- __take_cap_refs(ci, *got);
+ __take_cap_refs(ci, *got, true);
ret = 1;
}
} else {
@@ -2189,6 +2418,8 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
}
out_unlock:
spin_unlock(&ci->i_ceph_lock);
+ if (snap_rwsem_locked)
+ up_read(&mdsc->snap_rwsem);
dout("get_cap_refs %p ret %d got %s\n", inode,
ret, ceph_cap_string(*got));
@@ -2231,50 +2462,70 @@ static void check_max_size(struct inode *inode, loff_t endoff)
int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
loff_t endoff, int *got, struct page **pinned_page)
{
- int _got, check_max, ret, err = 0;
+ int _got, ret, err = 0;
-retry:
- if (endoff > 0)
- check_max_size(&ci->vfs_inode, endoff);
- _got = 0;
- check_max = 0;
- ret = wait_event_interruptible(ci->i_cap_wq,
- try_get_cap_refs(ci, need, want, endoff,
- &_got, &check_max, &err));
- if (err)
- ret = err;
+ ret = ceph_pool_perm_check(ci, need);
if (ret < 0)
return ret;
- if (check_max)
- goto retry;
+ while (true) {
+ if (endoff > 0)
+ check_max_size(&ci->vfs_inode, endoff);
- if (ci->i_inline_version != CEPH_INLINE_NONE &&
- (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
- i_size_read(&ci->vfs_inode) > 0) {
- struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0);
- if (page) {
- if (PageUptodate(page)) {
- *pinned_page = page;
- goto out;
- }
- page_cache_release(page);
- }
- /*
- * drop cap refs first because getattr while holding
- * caps refs can cause deadlock.
- */
- ceph_put_cap_refs(ci, _got);
+ err = 0;
_got = 0;
+ ret = try_get_cap_refs(ci, need, want, endoff,
+ false, &_got, &err);
+ if (ret) {
+ if (err == -EAGAIN)
+ continue;
+ if (err < 0)
+ return err;
+ } else {
+ ret = wait_event_interruptible(ci->i_cap_wq,
+ try_get_cap_refs(ci, need, want, endoff,
+ true, &_got, &err));
+ if (err == -EAGAIN)
+ continue;
+ if (err < 0)
+ ret = err;
+ if (ret < 0)
+ return ret;
+ }
- /* getattr request will bring inline data into page cache */
- ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
- CEPH_STAT_CAP_INLINE_DATA, true);
- if (ret < 0)
- return ret;
- goto retry;
+ if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+ i_size_read(&ci->vfs_inode) > 0) {
+ struct page *page =
+ find_get_page(ci->vfs_inode.i_mapping, 0);
+ if (page) {
+ if (PageUptodate(page)) {
+ *pinned_page = page;
+ break;
+ }
+ page_cache_release(page);
+ }
+ /*
+ * drop cap refs first because getattr while
+ * holding caps refs can cause deadlock.
+ */
+ ceph_put_cap_refs(ci, _got);
+ _got = 0;
+
+ /*
+ * getattr request will bring inline data into
+ * page cache
+ */
+ ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
+ CEPH_STAT_CAP_INLINE_DATA,
+ true);
+ if (ret < 0)
+ return ret;
+ continue;
+ }
+ break;
}
-out:
+
*got = _got;
return 0;
}
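
With the retry logic folded into a single loop, -EAGAIN from try_get_cap_refs() (max-size growth, or a contended snap_rwsem in nonblocking mode) simply re-enters the loop instead of taking the old goto path. A typical caller still sees the same contract (a sketch; the write path is the presumed caller):

	int got = 0;
	struct page *pinned_page = NULL;

	err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
			    endoff, &got, &pinned_page);
	if (err < 0)
		return err;
	/* ... do the buffered write with 'got' caps held ... */
	if (pinned_page)
		page_cache_release(pinned_page);
	ceph_put_cap_refs(ci, got);
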
@@ -2286,10 +2537,31 @@ out:
void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
{
spin_lock(&ci->i_ceph_lock);
- __take_cap_refs(ci, caps);
+ __take_cap_refs(ci, caps, false);
spin_unlock(&ci->i_ceph_lock);
}
+
+/*
+ * drop a cap_snap that is not associated with any snapshot.
+ * we don't need to send a FLUSHSNAP message for it.
+ */
+static int ceph_try_drop_cap_snap(struct ceph_cap_snap *capsnap)
+{
+ if (!capsnap->need_flush &&
+ !capsnap->writing && !capsnap->dirty_pages) {
+
+ dout("dropping cap_snap %p follows %llu\n",
+ capsnap, capsnap->follows);
+ ceph_put_snap_context(capsnap->context);
+ list_del(&capsnap->ci_item);
+ list_del(&capsnap->flushing_item);
+ ceph_put_cap_snap(capsnap);
+ return 1;
+ }
+ return 0;
+}
+
/*
* Release cap refs.
*
@@ -2303,7 +2575,6 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
{
struct inode *inode = &ci->vfs_inode;
int last = 0, put = 0, flushsnaps = 0, wake = 0;
- struct ceph_cap_snap *capsnap;
spin_lock(&ci->i_ceph_lock);
if (had & CEPH_CAP_PIN)
@@ -2325,17 +2596,24 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
if (had & CEPH_CAP_FILE_WR)
if (--ci->i_wr_ref == 0) {
last++;
- if (!list_empty(&ci->i_cap_snaps)) {
- capsnap = list_first_entry(&ci->i_cap_snaps,
- struct ceph_cap_snap,
- ci_item);
- if (capsnap->writing) {
- capsnap->writing = 0;
- flushsnaps =
- __ceph_finish_cap_snap(ci,
- capsnap);
- wake = 1;
- }
+ if (__ceph_have_pending_cap_snap(ci)) {
+ struct ceph_cap_snap *capsnap =
+ list_last_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap,
+ ci_item);
+ capsnap->writing = 0;
+ if (ceph_try_drop_cap_snap(capsnap))
+ put++;
+ else if (__ceph_finish_cap_snap(ci, capsnap))
+ flushsnaps = 1;
+ wake = 1;
+ }
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
+ BUG_ON(!ci->i_head_snapc);
+ ceph_put_snap_context(ci->i_head_snapc);
+ ci->i_head_snapc = NULL;
}
/* see comment in __ceph_remove_cap() */
if (!__ceph_is_any_caps(ci) && ci->i_snap_realm)
@@ -2352,7 +2630,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
ceph_flush_snaps(ci);
if (wake)
wake_up_all(&ci->i_cap_wq);
- if (put)
+ while (put-- > 0)
iput(inode);
}
@@ -2380,7 +2658,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
if (ci->i_head_snapc == snapc) {
ci->i_wrbuffer_ref_head -= nr;
if (ci->i_wrbuffer_ref_head == 0 &&
- ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
+ ci->i_wr_ref == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
BUG_ON(!ci->i_head_snapc);
ceph_put_snap_context(ci->i_head_snapc);
ci->i_head_snapc = NULL;
@@ -2401,25 +2681,15 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
capsnap->dirty_pages -= nr;
if (capsnap->dirty_pages == 0) {
complete_capsnap = 1;
- if (capsnap->dirty == 0)
- /* cap writeback completed before we created
- * the cap_snap; no FLUSHSNAP is needed */
- drop_capsnap = 1;
+ drop_capsnap = ceph_try_drop_cap_snap(capsnap);
}
dout("put_wrbuffer_cap_refs on %p cap_snap %p "
- " snap %lld %d/%d -> %d/%d %s%s%s\n",
+ " snap %lld %d/%d -> %d/%d %s%s\n",
inode, capsnap, capsnap->context->seq,
ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
ci->i_wrbuffer_ref, capsnap->dirty_pages,
last ? " (wrbuffer last)" : "",
- complete_capsnap ? " (complete capsnap)" : "",
- drop_capsnap ? " (drop capsnap)" : "");
- if (drop_capsnap) {
- ceph_put_snap_context(capsnap->context);
- list_del(&capsnap->ci_item);
- list_del(&capsnap->flushing_item);
- ceph_put_cap_snap(capsnap);
- }
+ complete_capsnap ? " (complete capsnap)" : "");
}
spin_unlock(&ci->i_ceph_lock);
@@ -2526,7 +2796,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
* try to invalidate (once). (If there are dirty buffers, we
* will invalidate _after_ writeback.)
*/
- if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
+ if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
+ ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
(newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
!ci->i_wrbuffer_ref) {
if (try_nonblocking_invalidate(inode)) {
@@ -2732,16 +3003,29 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_cap_flush *cf;
+ struct rb_node *n;
+ LIST_HEAD(to_remove);
unsigned seq = le32_to_cpu(m->seq);
int dirty = le32_to_cpu(m->dirty);
int cleaned = 0;
int drop = 0;
- int i;
- for (i = 0; i < CEPH_CAP_BITS; i++)
- if ((dirty & (1 << i)) &&
- (u16)flush_tid == ci->i_cap_flush_tid[i])
- cleaned |= 1 << i;
+ n = rb_first(&ci->i_cap_flush_tree);
+ while (n) {
+ cf = rb_entry(n, struct ceph_cap_flush, i_node);
+ n = rb_next(&cf->i_node);
+ if (cf->tid == flush_tid)
+ cleaned = cf->caps;
+ if (cf->tid <= flush_tid) {
+ rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
+ list_add_tail(&cf->list, &to_remove);
+ } else {
+ cleaned &= ~cf->caps;
+ if (!cleaned)
+ break;
+ }
+ }
dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
" flushing %s -> %s\n",
@@ -2749,12 +3033,23 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
ceph_cap_string(ci->i_flushing_caps & ~cleaned));
- if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
+ if (list_empty(&to_remove) && !cleaned)
goto out;
ci->i_flushing_caps &= ~cleaned;
spin_lock(&mdsc->cap_dirty_lock);
+
+ if (!list_empty(&to_remove)) {
+ list_for_each_entry(cf, &to_remove, list)
+ rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
+
+ n = rb_first(&mdsc->cap_flush_tree);
+ cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
+ if (!cf || cf->tid > flush_tid)
+ wake_up_all(&mdsc->cap_flushing_wq);
+ }
+
if (ci->i_flushing_caps == 0) {
list_del_init(&ci->i_flushing_item);
if (!list_empty(&session->s_cap_flushing))
@@ -2764,14 +3059,14 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
struct ceph_inode_info,
i_flushing_item)->vfs_inode);
mdsc->num_cap_flushing--;
- wake_up_all(&mdsc->cap_flushing_wq);
dout(" inode %p now !flushing\n", inode);
if (ci->i_dirty_caps == 0) {
dout(" inode %p now clean\n", inode);
BUG_ON(!list_empty(&ci->i_dirty_item));
drop = 1;
- if (ci->i_wrbuffer_ref_head == 0) {
+ if (ci->i_wr_ref == 0 &&
+ ci->i_wrbuffer_ref_head == 0) {
BUG_ON(!ci->i_head_snapc);
ceph_put_snap_context(ci->i_head_snapc);
ci->i_head_snapc = NULL;
@@ -2785,6 +3080,13 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
out:
spin_unlock(&ci->i_ceph_lock);
+
+ while (!list_empty(&to_remove)) {
+ cf = list_first_entry(&to_remove,
+ struct ceph_cap_flush, list);
+ list_del(&cf->list);
+ ceph_free_cap_flush(cf);
+ }
if (drop)
iput(inode);
}
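
With per-inode flushes now tracked in a tid-ordered rb-tree, the ack handler
retires every entry whose tid is <= the acked flush_tid and masks out of
"cleaned" any cap bits that a later, still-pending flush also carries. The same
bookkeeping over a plain sorted array, for illustration (no locking, and without
the kernel's early exit):

    #include <stdio.h>

    struct flush { unsigned long long tid; unsigned caps; };

    /* returns the caps that are truly clean after an ack for flush_tid;
     * entries with tid <= flush_tid are consumed (*n shrinks) */
    static unsigned ack_flushes(struct flush *f, int *n, unsigned long long flush_tid)
    {
            unsigned cleaned = 0;
            int i, kept = 0;

            for (i = 0; i < *n; i++) {
                    if (f[i].tid == flush_tid)
                            cleaned = f[i].caps;    /* candidate clean set */
                    if (f[i].tid <= flush_tid)
                            continue;               /* retired by this ack */
                    cleaned &= ~f[i].caps;          /* still flushing later */
                    f[kept++] = f[i];
            }
            *n = kept;
            return cleaned;
    }

    int main(void)
    {
            struct flush f[] = { {1, 0x5}, {2, 0x3}, {3, 0x1} };
            int n = 3;
            /* ack tid 2: caps 0x3 were acked, but 0x1 is still pending in tid 3 */
            printf("cleaned %#x, remaining %d\n", ack_flushes(f, &n, 2), n);
            return 0;
    }
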
@@ -2800,6 +3102,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
struct ceph_mds_session *session)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
u64 follows = le64_to_cpu(m->snap_follows);
struct ceph_cap_snap *capsnap;
int drop = 0;
@@ -2823,6 +3126,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
list_del(&capsnap->ci_item);
list_del(&capsnap->flushing_item);
ceph_put_cap_snap(capsnap);
+ wake_up_all(&mdsc->cap_flushing_wq);
drop = 1;
break;
} else {
@@ -2971,7 +3275,6 @@ retry:
mutex_lock_nested(&session->s_mutex,
SINGLE_DEPTH_NESTING);
}
- ceph_add_cap_releases(mdsc, tsession);
new_cap = ceph_get_cap(mdsc, NULL);
} else {
WARN_ON(1);
@@ -3167,16 +3470,20 @@ void ceph_handle_caps(struct ceph_mds_session *session,
dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
(unsigned)seq);
- if (op == CEPH_CAP_OP_IMPORT)
- ceph_add_cap_releases(mdsc, session);
-
if (!inode) {
dout(" i don't have ino %llx\n", vino.ino);
if (op == CEPH_CAP_OP_IMPORT) {
+ cap = ceph_get_cap(mdsc, NULL);
+ cap->cap_ino = vino.ino;
+ cap->queue_release = 1;
+ cap->cap_id = cap_id;
+ cap->mseq = mseq;
+ cap->seq = seq;
spin_lock(&session->s_cap_lock);
- __queue_cap_release(session, vino.ino, cap_id,
- mseq, seq);
+ list_add_tail(&cap->session_caps,
+ &session->s_cap_releases);
+ session->s_num_cap_releases++;
spin_unlock(&session->s_cap_lock);
}
goto flush_cap_releases;
@@ -3252,11 +3559,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
flush_cap_releases:
/*
- * send any full release message to try to move things
+ * send any cap release message to try to move things
* along for the mds (who clearly thinks we still have this
* cap).
*/
- ceph_add_cap_releases(mdsc, session);
ceph_send_cap_releases(mdsc, session);
done:
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 4248307fea90..9314b4ea2375 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -38,7 +38,7 @@ int ceph_init_dentry(struct dentry *dentry)
if (dentry->d_fsdata)
return 0;
- di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
+ di = kmem_cache_alloc(ceph_dentry_cachep, GFP_KERNEL | __GFP_ZERO);
if (!di)
return -ENOMEM; /* oh well */
@@ -107,6 +107,27 @@ static int fpos_cmp(loff_t l, loff_t r)
}
/*
+ * make note of the last dentry we read, so we can
+ * continue at the same lexicographical point,
+ * regardless of what dir changes take place on the
+ * server.
+ */
+static int note_last_dentry(struct ceph_file_info *fi, const char *name,
+ int len, unsigned next_offset)
+{
+ char *buf = kmalloc(len+1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ kfree(fi->last_name);
+ fi->last_name = buf;
+ memcpy(fi->last_name, name, len);
+ fi->last_name[len] = 0;
+ fi->next_offset = next_offset;
+ dout("note_last_dentry '%s'\n", fi->last_name);
+ return 0;
+}
+
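
One detail worth noting in the moved note_last_dentry(): the new buffer is now
allocated before the old one is freed, so an allocation failure leaves
fi->last_name intact (the removed version kfree'd first and could lose the name
on ENOMEM). The pattern in isolation, as a sketch:

    #include <stdlib.h>
    #include <string.h>

    /* replace *slot with a copy of name[0..len); on failure the old
     * value is left untouched (allocate-before-free) */
    static int set_name(char **slot, const char *name, size_t len)
    {
            char *buf = malloc(len + 1);
            if (!buf)
                    return -1;      /* *slot is still valid */
            memcpy(buf, name, len);
            buf[len] = '\0';
            free(*slot);
            *slot = buf;
            return 0;
    }

    int main(void)
    {
            char *last = NULL;
            set_name(&last, "alpha", 5);
            set_name(&last, "beta", 4);     /* "alpha" survives if this fails */
            free(last);
            return 0;
    }
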
+/*
* When possible, we try to satisfy a readdir by peeking at the
* dcache. We make this work by carefully ordering dentries on
* d_child when we initially get results back from the MDS, and
@@ -123,123 +144,113 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
struct ceph_file_info *fi = file->private_data;
struct dentry *parent = file->f_path.dentry;
struct inode *dir = d_inode(parent);
- struct list_head *p;
- struct dentry *dentry, *last;
+ struct dentry *dentry, *last = NULL;
struct ceph_dentry_info *di;
+ unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry *);
int err = 0;
+ loff_t ptr_pos = 0;
+ struct ceph_readdir_cache_control cache_ctl = {};
- /* claim ref on last dentry we returned */
- last = fi->dentry;
- fi->dentry = NULL;
-
- dout("__dcache_readdir %p v%u at %llu (last %p)\n",
- dir, shared_gen, ctx->pos, last);
+ dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos);
- spin_lock(&parent->d_lock);
-
- /* start at beginning? */
- if (ctx->pos == 2 || last == NULL ||
- fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) {
- if (list_empty(&parent->d_subdirs))
- goto out_unlock;
- p = parent->d_subdirs.prev;
- dout(" initial p %p/%p\n", p->prev, p->next);
- } else {
- p = last->d_child.prev;
+ /* we can calculate cache index for the first dirfrag */
+ if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) {
+ cache_ctl.index = fpos_off(ctx->pos) - 2;
+ BUG_ON(cache_ctl.index < 0);
+ ptr_pos = cache_ctl.index * sizeof(struct dentry *);
}
-more:
- dentry = list_entry(p, struct dentry, d_child);
- di = ceph_dentry(dentry);
- while (1) {
- dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
- d_unhashed(dentry) ? "!hashed" : "hashed",
- parent->d_subdirs.prev, parent->d_subdirs.next);
- if (p == &parent->d_subdirs) {
+ while (true) {
+ pgoff_t pgoff;
+ bool emit_dentry;
+
+ if (ptr_pos >= i_size_read(dir)) {
fi->flags |= CEPH_F_ATEND;
- goto out_unlock;
+ err = 0;
+ break;
+ }
+
+ err = -EAGAIN;
+ pgoff = ptr_pos >> PAGE_CACHE_SHIFT;
+ if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) {
+ ceph_readdir_cache_release(&cache_ctl);
+ cache_ctl.page = find_lock_page(&dir->i_data, pgoff);
+ if (!cache_ctl.page) {
+ dout(" page %lu not found\n", pgoff);
+ break;
+ }
+		/* reading/filling the cache is serialized by
+		 * i_mutex, no need to take the page lock */
+ unlock_page(cache_ctl.page);
+ cache_ctl.dentries = kmap(cache_ctl.page);
}
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+
+ rcu_read_lock();
+ spin_lock(&parent->d_lock);
+		/* check i_size again here, because an empty directory
+		 * can be marked as complete while not holding the i_mutex. */
+ if (ceph_dir_is_complete_ordered(dir) &&
+ ptr_pos < i_size_read(dir))
+ dentry = cache_ctl.dentries[cache_ctl.index % nsize];
+ else
+ dentry = NULL;
+ spin_unlock(&parent->d_lock);
+ if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
+ dentry = NULL;
+ rcu_read_unlock();
+ if (!dentry)
+ break;
+
+ emit_dentry = false;
+ di = ceph_dentry(dentry);
+ spin_lock(&dentry->d_lock);
if (di->lease_shared_gen == shared_gen &&
- !d_unhashed(dentry) && d_really_is_positive(dentry) &&
+ d_really_is_positive(dentry) &&
ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR &&
ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH &&
- fpos_cmp(ctx->pos, di->offset) <= 0)
- break;
- dout(" skipping %p %pd at %llu (%llu)%s%s\n", dentry,
- dentry, di->offset,
- ctx->pos, d_unhashed(dentry) ? " unhashed" : "",
- !d_inode(dentry) ? " null" : "");
+ fpos_cmp(ctx->pos, di->offset) <= 0) {
+ emit_dentry = true;
+ }
spin_unlock(&dentry->d_lock);
- p = p->prev;
- dentry = list_entry(p, struct dentry, d_child);
- di = ceph_dentry(dentry);
- }
-
- dget_dlock(dentry);
- spin_unlock(&dentry->d_lock);
- spin_unlock(&parent->d_lock);
- /* make sure a dentry wasn't dropped while we didn't have parent lock */
- if (!ceph_dir_is_complete_ordered(dir)) {
- dout(" lost dir complete on %p; falling back to mds\n", dir);
- dput(dentry);
- err = -EAGAIN;
- goto out;
- }
+ if (emit_dentry) {
+ dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos,
+ dentry, dentry, d_inode(dentry));
+ ctx->pos = di->offset;
+ if (!dir_emit(ctx, dentry->d_name.name,
+ dentry->d_name.len,
+ ceph_translate_ino(dentry->d_sb,
+ d_inode(dentry)->i_ino),
+ d_inode(dentry)->i_mode >> 12)) {
+ dput(dentry);
+ err = 0;
+ break;
+ }
+ ctx->pos++;
- dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos,
- dentry, dentry, d_inode(dentry));
- if (!dir_emit(ctx, dentry->d_name.name,
- dentry->d_name.len,
- ceph_translate_ino(dentry->d_sb, d_inode(dentry)->i_ino),
- d_inode(dentry)->i_mode >> 12)) {
- if (last) {
- /* remember our position */
- fi->dentry = last;
- fi->next_offset = fpos_off(di->offset);
+ if (last)
+ dput(last);
+ last = dentry;
+ } else {
+ dput(dentry);
}
- dput(dentry);
- return 0;
- }
-
- ctx->pos = di->offset + 1;
- if (last)
- dput(last);
- last = dentry;
-
- spin_lock(&parent->d_lock);
- p = p->prev; /* advance to next dentry */
- goto more;
-
-out_unlock:
- spin_unlock(&parent->d_lock);
-out:
- if (last)
+ cache_ctl.index++;
+ ptr_pos += sizeof(struct dentry *);
+ }
+ ceph_readdir_cache_release(&cache_ctl);
+ if (last) {
+ int ret;
+ di = ceph_dentry(last);
+ ret = note_last_dentry(fi, last->d_name.name, last->d_name.len,
+ fpos_off(di->offset) + 1);
+ if (ret < 0)
+ err = ret;
dput(last);
+ }
return err;
}
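
The new __dcache_readdir() reads dentry pointers straight out of the directory's
page cache, treated as a flat array: a linear cache index maps to a page number
and an in-page slot by dividing by the number of pointers per page, and i_size
records how many slots are valid. The index arithmetic on its own (the macro and
function names are stand-ins):

    #include <stdio.h>

    #define PAGE_SZ 4096UL
    #define PTRS_PER_PAGE (PAGE_SZ / sizeof(void *))

    /* map a linear readdir-cache index to (page, slot-within-page) */
    static void cache_pos(unsigned long index,
                          unsigned long *pgoff, unsigned long *slot)
    {
            *pgoff = index / PTRS_PER_PAGE;
            *slot  = index % PTRS_PER_PAGE;
    }

    int main(void)
    {
            unsigned long pg, slot;
            cache_pos(1000, &pg, &slot);    /* 8-byte pointers: page 1, slot 488 */
            printf("index 1000 -> page %lu slot %lu\n", pg, slot);
            return 0;
    }
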
-/*
- * make note of the last dentry we read, so we can
- * continue at the same lexicographical point,
- * regardless of what dir changes take place on the
- * server.
- */
-static int note_last_dentry(struct ceph_file_info *fi, const char *name,
- int len)
-{
- kfree(fi->last_name);
- fi->last_name = kmalloc(len+1, GFP_NOFS);
- if (!fi->last_name)
- return -ENOMEM;
- memcpy(fi->last_name, name, len);
- fi->last_name[len] = 0;
- dout("note_last_dentry '%s'\n", fi->last_name);
- return 0;
-}
-
static int ceph_readdir(struct file *file, struct dir_context *ctx)
{
struct ceph_file_info *fi = file->private_data;
@@ -280,8 +291,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
/* can we use the dcache? */
spin_lock(&ci->i_ceph_lock);
- if ((ctx->pos == 2 || fi->dentry) &&
- ceph_test_mount_opt(fsc, DCACHE) &&
+ if (ceph_test_mount_opt(fsc, DCACHE) &&
!ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
ceph_snap(inode) != CEPH_SNAPDIR &&
__ceph_dir_is_complete_ordered(ci) &&
@@ -296,24 +306,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
} else {
spin_unlock(&ci->i_ceph_lock);
}
- if (fi->dentry) {
- err = note_last_dentry(fi, fi->dentry->d_name.name,
- fi->dentry->d_name.len);
- if (err)
- return err;
- dput(fi->dentry);
- fi->dentry = NULL;
- }
/* proceed with a normal readdir */
-
- if (ctx->pos == 2) {
- /* note dir version at start of readdir so we can tell
- * if any dentries get dropped */
- fi->dir_release_count = atomic_read(&ci->i_release_count);
- fi->dir_ordered_count = ci->i_ordered_count;
- }
-
more:
/* do we have the correct frag content buffered? */
if (fi->frag != frag || fi->last_readdir == NULL) {
@@ -342,12 +336,15 @@ more:
req->r_direct_hash = ceph_frag_value(frag);
req->r_direct_is_hash = true;
if (fi->last_name) {
- req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
+ req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL);
if (!req->r_path2) {
ceph_mdsc_put_request(req);
return -ENOMEM;
}
}
+ req->r_dir_release_cnt = fi->dir_release_count;
+ req->r_dir_ordered_cnt = fi->dir_ordered_count;
+ req->r_readdir_cache_idx = fi->readdir_cache_idx;
req->r_readdir_offset = fi->next_offset;
req->r_args.readdir.frag = cpu_to_le32(frag);
@@ -364,26 +361,38 @@ more:
(int)req->r_reply_info.dir_end,
(int)req->r_reply_info.dir_complete);
- if (!req->r_did_prepopulate) {
- dout("readdir !did_prepopulate");
- /* preclude from marking dir complete */
- fi->dir_release_count--;
- }
/* note next offset and last dentry name */
rinfo = &req->r_reply_info;
if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
frag = le32_to_cpu(rinfo->dir_dir->frag);
- if (ceph_frag_is_leftmost(frag))
- fi->next_offset = 2;
- else
- fi->next_offset = 0;
- off = fi->next_offset;
+ off = req->r_readdir_offset;
+ fi->next_offset = off;
}
+
fi->frag = frag;
fi->offset = fi->next_offset;
fi->last_readdir = req;
+ if (req->r_did_prepopulate) {
+ fi->readdir_cache_idx = req->r_readdir_cache_idx;
+ if (fi->readdir_cache_idx < 0) {
+ /* preclude from marking dir ordered */
+ fi->dir_ordered_count = 0;
+ } else if (ceph_frag_is_leftmost(frag) && off == 2) {
+ /* note dir version at start of readdir so
+ * we can tell if any dentries get dropped */
+ fi->dir_release_count = req->r_dir_release_cnt;
+ fi->dir_ordered_count = req->r_dir_ordered_cnt;
+ }
+ } else {
+ dout("readdir !did_prepopulate");
+ /* disable readdir cache */
+ fi->readdir_cache_idx = -1;
+ /* preclude from marking dir complete */
+ fi->dir_release_count = 0;
+ }
+
if (req->r_reply_info.dir_end) {
kfree(fi->last_name);
fi->last_name = NULL;
@@ -394,10 +403,10 @@ more:
} else {
err = note_last_dentry(fi,
rinfo->dir_dname[rinfo->dir_nr-1],
- rinfo->dir_dname_len[rinfo->dir_nr-1]);
+ rinfo->dir_dname_len[rinfo->dir_nr-1],
+ fi->next_offset + rinfo->dir_nr);
if (err)
return err;
- fi->next_offset += rinfo->dir_nr;
}
}
@@ -453,16 +462,22 @@ more:
* were released during the whole readdir, and we should have
* the complete dir contents in our cache.
*/
- spin_lock(&ci->i_ceph_lock);
- if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
- if (ci->i_ordered_count == fi->dir_ordered_count)
+ if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) {
+ spin_lock(&ci->i_ceph_lock);
+ if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) {
dout(" marking %p complete and ordered\n", inode);
- else
+ /* use i_size to track number of entries in
+ * readdir cache */
+ BUG_ON(fi->readdir_cache_idx < 0);
+ i_size_write(inode, fi->readdir_cache_idx *
+ sizeof(struct dentry*));
+ } else {
dout(" marking %p complete\n", inode);
+ }
__ceph_dir_set_complete(ci, fi->dir_release_count,
fi->dir_ordered_count);
+ spin_unlock(&ci->i_ceph_lock);
}
- spin_unlock(&ci->i_ceph_lock);
dout("readdir %p file %p done.\n", inode, file);
return 0;
@@ -476,14 +491,12 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
}
kfree(fi->last_name);
fi->last_name = NULL;
+ fi->dir_release_count = 0;
+ fi->readdir_cache_idx = -1;
if (ceph_frag_is_leftmost(frag))
fi->next_offset = 2; /* compensate for . and .. */
else
fi->next_offset = 0;
- if (fi->dentry) {
- dput(fi->dentry);
- fi->dentry = NULL;
- }
fi->flags &= ~CEPH_F_ATEND;
}
@@ -497,13 +510,12 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
mutex_lock(&inode->i_mutex);
retval = -EINVAL;
switch (whence) {
- case SEEK_END:
- offset += inode->i_size + 2; /* FIXME */
- break;
case SEEK_CUR:
offset += file->f_pos;
case SEEK_SET:
break;
+ case SEEK_END:
+ retval = -EOPNOTSUPP;
default:
goto out;
}
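
For context on why SEEK_END is now rejected: ceph directory offsets pack the
fragment into the high 32 bits and the within-fragment offset into the low
32 bits (the fpos_frag()/fpos_off() helpers), so a fragmented directory has no
single meaningful end offset. A sketch of the packing, reimplemented for
illustration rather than copied from the kernel headers:

    #include <stdio.h>
    #include <stdint.h>

    static int64_t make_fpos(uint32_t frag, uint32_t off)
    {
            return ((int64_t)frag << 32) | off;
    }
    static uint32_t fpos_frag(int64_t p) { return (uint32_t)(p >> 32); }
    static uint32_t fpos_off(int64_t p)  { return (uint32_t)p; }

    int main(void)
    {
            int64_t pos = make_fpos(0xabcd0000u, 42);
            printf("frag %#x off %u\n", (unsigned)fpos_frag(pos),
                   (unsigned)fpos_off(pos));
            return 0;
    }
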
@@ -516,20 +528,18 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
}
retval = offset;
- /*
- * discard buffered readdir content on seekdir(0), or
- * seek to new frag, or seek prior to current chunk.
- */
if (offset == 0 ||
fpos_frag(offset) != fi->frag ||
fpos_off(offset) < fi->offset) {
+ /* discard buffered readdir content on seekdir(0), or
+ * seek to new frag, or seek prior to current chunk */
dout("dir_llseek dropping %p content\n", file);
reset_readdir(fi, fpos_frag(offset));
+ } else if (fpos_cmp(offset, old_offset) > 0) {
+ /* reset dir_release_count if we did a forward seek */
+ fi->dir_release_count = 0;
+ fi->readdir_cache_idx = -1;
}
-
- /* bump dir_release_count if we did a forward seek */
- if (fpos_cmp(offset, old_offset) > 0)
- fi->dir_release_count--;
}
out:
mutex_unlock(&inode->i_mutex);
@@ -764,7 +774,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
err = PTR_ERR(req);
goto out;
}
- req->r_path2 = kstrdup(dest, GFP_NOFS);
+ req->r_path2 = kstrdup(dest, GFP_KERNEL);
if (!req->r_path2) {
err = -ENOMEM;
ceph_mdsc_put_request(req);
@@ -985,16 +995,15 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
* to do it here.
*/
+ /* d_move screws up sibling dentries' offsets */
+ ceph_dir_clear_complete(old_dir);
+ ceph_dir_clear_complete(new_dir);
+
d_move(old_dentry, new_dentry);
/* ensure target dentry is invalidated, despite
rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(new_dentry);
-
- /* d_move screws up sibling dentries' offsets */
- ceph_dir_clear_complete(old_dir);
- ceph_dir_clear_complete(new_dir);
-
}
ceph_mdsc_put_request(req);
return err;
@@ -1189,7 +1198,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
return -EISDIR;
if (!cf->dir_info) {
- cf->dir_info = kmalloc(bufsize, GFP_NOFS);
+ cf->dir_info = kmalloc(bufsize, GFP_KERNEL);
if (!cf->dir_info)
return -ENOMEM;
cf->dir_info_len =
@@ -1224,66 +1233,6 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
}
/*
- * an fsync() on a dir will wait for any uncommitted directory
- * operations to commit.
- */
-static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
- int datasync)
-{
- struct inode *inode = file_inode(file);
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct list_head *head = &ci->i_unsafe_dirops;
- struct ceph_mds_request *req;
- u64 last_tid;
- int ret = 0;
-
- dout("dir_fsync %p\n", inode);
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret)
- return ret;
- mutex_lock(&inode->i_mutex);
-
- spin_lock(&ci->i_unsafe_lock);
- if (list_empty(head))
- goto out;
-
- req = list_entry(head->prev,
- struct ceph_mds_request, r_unsafe_dir_item);
- last_tid = req->r_tid;
-
- do {
- ceph_mdsc_get_request(req);
- spin_unlock(&ci->i_unsafe_lock);
-
- dout("dir_fsync %p wait on tid %llu (until %llu)\n",
- inode, req->r_tid, last_tid);
- if (req->r_timeout) {
- unsigned long time_left = wait_for_completion_timeout(
- &req->r_safe_completion,
- req->r_timeout);
- if (time_left > 0)
- ret = 0;
- else
- ret = -EIO; /* timed out */
- } else {
- wait_for_completion(&req->r_safe_completion);
- }
- ceph_mdsc_put_request(req);
-
- spin_lock(&ci->i_unsafe_lock);
- if (ret || list_empty(head))
- break;
- req = list_entry(head->next,
- struct ceph_mds_request, r_unsafe_dir_item);
- } while (req->r_tid < last_tid);
-out:
- spin_unlock(&ci->i_unsafe_lock);
- mutex_unlock(&inode->i_mutex);
-
- return ret;
-}
-
-/*
* We maintain a private dentry LRU.
*
* FIXME: this needs to be changed to a per-mds lru to be useful.
@@ -1353,7 +1302,7 @@ const struct file_operations ceph_dir_fops = {
.open = ceph_open,
.release = ceph_release,
.unlocked_ioctl = ceph_ioctl,
- .fsync = ceph_dir_fsync,
+ .fsync = ceph_fsync,
};
const struct file_operations ceph_snapdir_fops = {
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3b6b522b4b31..8b79d87eaf46 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -89,13 +89,14 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
case S_IFDIR:
dout("init_file %p %p 0%o (regular)\n", inode, file,
inode->i_mode);
- cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
+ cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO);
if (cf == NULL) {
ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
return -ENOMEM;
}
cf->fmode = fmode;
cf->next_offset = 2;
+ cf->readdir_cache_idx = -1;
file->private_data = cf;
BUG_ON(inode->i_fop->release != ceph_release);
break;
@@ -324,7 +325,6 @@ int ceph_release(struct inode *inode, struct file *file)
ceph_mdsc_put_request(cf->last_readdir);
kfree(cf->last_name);
kfree(cf->dir_info);
- dput(cf->dentry);
kmem_cache_free(ceph_file_cachep, cf);
/* wake up anyone waiting for caps on this inode */
@@ -483,7 +483,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
}
} else {
num_pages = calc_pages_for(off, len);
- pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages))
return PTR_ERR(pages);
ret = striped_read(inode, off, len, pages,
@@ -557,13 +557,13 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
* objects, rollback on failure, etc.)
*/
static ssize_t
-ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
+ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
+ struct ceph_snap_context *snapc)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_snap_context *snapc;
struct ceph_vino vino;
struct ceph_osd_request *req;
struct page **pages;
@@ -600,7 +600,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
size_t start;
ssize_t n;
- snapc = ci->i_snap_realm->cached_context;
vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
vino, pos, &len, 0,
@@ -614,7 +613,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
break;
}
- osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
+ osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
n = iov_iter_get_pages_alloc(from, &pages, len, &start);
if (unlikely(n < 0)) {
@@ -674,13 +673,13 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
* objects, rollback on failure, etc.)
*/
static ssize_t
-ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
+ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
+ struct ceph_snap_context *snapc)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_snap_context *snapc;
struct ceph_vino vino;
struct ceph_osd_request *req;
struct page **pages;
@@ -717,7 +716,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
size_t left;
int n;
- snapc = ci->i_snap_realm->cached_context;
vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
vino, pos, &len, 0, 1,
@@ -736,7 +734,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
*/
num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages)) {
ret = PTR_ERR(pages);
goto out;
@@ -860,7 +858,7 @@ again:
struct page *page = NULL;
loff_t i_size;
if (retry_op == READ_INLINE) {
- page = __page_cache_alloc(GFP_NOFS);
+ page = __page_cache_alloc(GFP_KERNEL);
if (!page)
return -ENOMEM;
}
@@ -941,6 +939,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc;
+ struct ceph_cap_flush *prealloc_cf;
ssize_t count, written = 0;
int err, want, got;
loff_t pos;
@@ -948,6 +947,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
mutex_lock(&inode->i_mutex);
/* We can write back this queue in page reclaim */
@@ -959,7 +962,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
pos = iocb->ki_pos;
count = iov_iter_count(from);
- err = file_remove_suid(file);
+ err = file_remove_privs(file);
if (err)
goto out;
@@ -996,14 +999,30 @@ retry_snap:
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
(iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
+ struct ceph_snap_context *snapc;
struct iov_iter data;
mutex_unlock(&inode->i_mutex);
+
+ spin_lock(&ci->i_ceph_lock);
+ if (__ceph_have_pending_cap_snap(ci)) {
+ struct ceph_cap_snap *capsnap =
+ list_last_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap,
+ ci_item);
+ snapc = ceph_get_snap_context(capsnap->context);
+ } else {
+ BUG_ON(!ci->i_head_snapc);
+ snapc = ceph_get_snap_context(ci->i_head_snapc);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
/* we might need to revert back to that point */
data = *from;
if (iocb->ki_flags & IOCB_DIRECT)
- written = ceph_sync_direct_write(iocb, &data, pos);
+ written = ceph_sync_direct_write(iocb, &data, pos,
+ snapc);
else
- written = ceph_sync_write(iocb, &data, pos);
+ written = ceph_sync_write(iocb, &data, pos, snapc);
if (written == -EOLDSNAPC) {
dout("aio_write %p %llx.%llx %llu~%u"
"got EOLDSNAPC, retrying\n",
@@ -1014,6 +1033,7 @@ retry_snap:
}
if (written > 0)
iov_iter_advance(from, written);
+ ceph_put_snap_context(snapc);
} else {
loff_t old_size = inode->i_size;
/*
@@ -1035,7 +1055,8 @@ retry_snap:
int dirty;
spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE;
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+ &prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
@@ -1059,6 +1080,7 @@ retry_snap:
out:
mutex_unlock(&inode->i_mutex);
out_unlocked:
+ ceph_free_cap_flush(prealloc_cf);
current->backing_dev_info = NULL;
return written ? written : err;
}
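
The write-path hunks above thread an explicit snap context into ceph_sync_write()
and ceph_sync_direct_write() instead of having them read
ci->i_snap_realm->cached_context themselves: if a cap_snap is pending (a snapshot
was taken while writes were in flight), its context tags the write; otherwise the
live i_head_snapc does. The decision on its own, with types invented for the
sketch:

    #include <stdio.h>

    struct snapc { unsigned long long seq; };

    /* pick which snap context a synchronous write is tagged with */
    static const struct snapc *pick_write_snapc(const struct snapc *pending_capsnap,
                                                const struct snapc *head)
    {
            /* a pending cap_snap means a snapshot raced with this write;
             * its context wins, otherwise use the live head context */
            return pending_capsnap ? pending_capsnap : head;
    }

    int main(void)
    {
            struct snapc head = { 7 }, snap = { 5 };
            printf("no snap -> seq %llu\n", pick_write_snapc(NULL, &head)->seq);
            printf("pending -> seq %llu\n", pick_write_snapc(&snap, &head)->seq);
            return 0;
    }
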
@@ -1255,6 +1277,7 @@ static long ceph_fallocate(struct file *file, int mode,
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_inode_to_client(inode)->client->osdc;
+ struct ceph_cap_flush *prealloc_cf;
int want, got = 0;
int dirty;
int ret = 0;
@@ -1267,6 +1290,10 @@ static long ceph_fallocate(struct file *file, int mode,
if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP;
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
mutex_lock(&inode->i_mutex);
if (ceph_snap(inode) != CEPH_NOSNAP) {
@@ -1313,7 +1340,8 @@ static long ceph_fallocate(struct file *file, int mode,
if (!ret) {
spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE;
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+ &prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
@@ -1322,6 +1350,7 @@ static long ceph_fallocate(struct file *file, int mode,
ceph_put_cap_refs(ci, got);
unlock:
mutex_unlock(&inode->i_mutex);
+ ceph_free_cap_flush(prealloc_cf);
return ret;
}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 571acd88606c..96d2bd829902 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -389,9 +389,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_inline_version = 0;
ci->i_time_warp_seq = 0;
ci->i_ceph_flags = 0;
- ci->i_ordered_count = 0;
- atomic_set(&ci->i_release_count, 1);
- atomic_set(&ci->i_complete_count, 0);
+ atomic64_set(&ci->i_ordered_count, 1);
+ atomic64_set(&ci->i_release_count, 1);
+ atomic64_set(&ci->i_complete_seq[0], 0);
+ atomic64_set(&ci->i_complete_seq[1], 0);
ci->i_symlink = NULL;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
@@ -415,9 +416,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_flushing_caps = 0;
INIT_LIST_HEAD(&ci->i_dirty_item);
INIT_LIST_HEAD(&ci->i_flushing_item);
- ci->i_cap_flush_seq = 0;
- ci->i_cap_flush_last_tid = 0;
- memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
+ ci->i_prealloc_cap_flush = NULL;
+ ci->i_cap_flush_tree = RB_ROOT;
init_waitqueue_head(&ci->i_cap_wq);
ci->i_hold_caps_min = 0;
ci->i_hold_caps_max = 0;
@@ -752,7 +752,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (new_version ||
(new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+ if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool)
+ ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
ci->i_layout = info->layout;
+
queue_trunc = ceph_fill_file_size(inode, issued,
le32_to_cpu(info->truncate_seq),
le64_to_cpu(info->truncate_size),
@@ -858,9 +861,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
(issued & CEPH_CAP_FILE_EXCL) == 0 &&
!__ceph_dir_is_complete(ci)) {
dout(" marking %p complete (empty)\n", inode);
+ i_size_write(inode, 0);
__ceph_dir_set_complete(ci,
- atomic_read(&ci->i_release_count),
- ci->i_ordered_count);
+ atomic64_read(&ci->i_release_count),
+ atomic64_read(&ci->i_ordered_count));
}
wake = true;
@@ -1212,6 +1216,10 @@ retry_lookup:
dout("fill_trace doing d_move %p -> %p\n",
req->r_old_dentry, dn);
+ /* d_move screws up sibling dentries' offsets */
+ ceph_dir_clear_ordered(dir);
+ ceph_dir_clear_ordered(olddir);
+
d_move(req->r_old_dentry, dn);
dout(" src %p '%pd' dst %p '%pd'\n",
req->r_old_dentry,
@@ -1222,10 +1230,6 @@ retry_lookup:
rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(dn);
- /* d_move screws up sibling dentries' offsets */
- ceph_dir_clear_ordered(dir);
- ceph_dir_clear_ordered(olddir);
-
dout("dn %p gets new offset %lld\n", req->r_old_dentry,
ceph_dentry(req->r_old_dentry)->offset);
@@ -1333,6 +1337,49 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
return err;
}
+void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
+{
+ if (ctl->page) {
+ kunmap(ctl->page);
+ page_cache_release(ctl->page);
+ ctl->page = NULL;
+ }
+}
+
+static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
+ struct ceph_readdir_cache_control *ctl,
+ struct ceph_mds_request *req)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry*);
+ unsigned idx = ctl->index % nsize;
+ pgoff_t pgoff = ctl->index / nsize;
+
+ if (!ctl->page || pgoff != page_index(ctl->page)) {
+ ceph_readdir_cache_release(ctl);
+ ctl->page = grab_cache_page(&dir->i_data, pgoff);
+ if (!ctl->page) {
+ ctl->index = -1;
+ return -ENOMEM;
+ }
+	/* reading/filling the cache is serialized by
+	 * i_mutex, no need to take the page lock */
+ unlock_page(ctl->page);
+ ctl->dentries = kmap(ctl->page);
+ }
+
+ if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
+ req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) {
+ dout("readdir cache dn %p idx %d\n", dn, ctl->index);
+ ctl->dentries[idx] = dn;
+ ctl->index++;
+ } else {
+ dout("disable readdir cache\n");
+ ctl->index = -1;
+ }
+ return 0;
+}
+
int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session)
{
@@ -1345,8 +1392,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct inode *snapdir = NULL;
struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
struct ceph_dentry_info *di;
- u64 r_readdir_offset = req->r_readdir_offset;
u32 frag = le32_to_cpu(rhead->args.readdir.frag);
+ struct ceph_readdir_cache_control cache_ctl = {};
+
+ if (req->r_aborted)
+ return readdir_prepopulate_inodes_only(req, session);
if (rinfo->dir_dir &&
le32_to_cpu(rinfo->dir_dir->frag) != frag) {
@@ -1354,14 +1404,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
frag, le32_to_cpu(rinfo->dir_dir->frag));
frag = le32_to_cpu(rinfo->dir_dir->frag);
if (ceph_frag_is_leftmost(frag))
- r_readdir_offset = 2;
+ req->r_readdir_offset = 2;
else
- r_readdir_offset = 0;
+ req->r_readdir_offset = 0;
}
- if (req->r_aborted)
- return readdir_prepopulate_inodes_only(req, session);
-
if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
snapdir = ceph_get_snapdir(d_inode(parent));
parent = d_find_alias(snapdir);
@@ -1374,6 +1421,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
}
+ if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
+ /* note dir version at start of readdir so we can tell
+ * if any dentries get dropped */
+ struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
+ req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
+ req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
+ req->r_readdir_cache_idx = 0;
+ }
+
+ cache_ctl.index = req->r_readdir_cache_idx;
+
/* FIXME: release caps/leases if error occurs */
for (i = 0; i < rinfo->dir_nr; i++) {
struct ceph_vino vino;
@@ -1413,13 +1471,6 @@ retry_lookup:
d_delete(dn);
dput(dn);
goto retry_lookup;
- } else {
- /* reorder parent's d_subdirs */
- spin_lock(&parent->d_lock);
- spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
- list_move(&dn->d_child, &parent->d_subdirs);
- spin_unlock(&dn->d_lock);
- spin_unlock(&parent->d_lock);
}
/* inode */
@@ -1436,13 +1487,15 @@ retry_lookup:
}
}
- if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
- req->r_request_started, -1,
- &req->r_caps_reservation) < 0) {
+ ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
+ req->r_request_started, -1,
+ &req->r_caps_reservation);
+ if (ret < 0) {
pr_err("fill_inode badness on %p\n", in);
if (d_really_is_negative(dn))
iput(in);
d_drop(dn);
+ err = ret;
goto next_item;
}
@@ -1458,19 +1511,28 @@ retry_lookup:
}
di = dn->d_fsdata;
- di->offset = ceph_make_fpos(frag, i + r_readdir_offset);
+ di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
update_dentry_lease(dn, rinfo->dir_dlease[i],
req->r_session,
req->r_request_started);
+
+ if (err == 0 && cache_ctl.index >= 0) {
+ ret = fill_readdir_cache(d_inode(parent), dn,
+ &cache_ctl, req);
+ if (ret < 0)
+ err = ret;
+ }
next_item:
if (dn)
dput(dn);
}
- if (err == 0)
- req->r_did_prepopulate = true;
-
out:
+ if (err == 0) {
+ req->r_did_prepopulate = true;
+ req->r_readdir_cache_idx = cache_ctl.index;
+ }
+ ceph_readdir_cache_release(&cache_ctl);
if (snapdir) {
iput(snapdir);
dput(parent);
@@ -1712,11 +1774,13 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
const unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
+ struct ceph_cap_flush *prealloc_cf;
int issued;
int release = 0, dirtied = 0;
int mask = 0;
int err = 0;
int inode_dirty_flags = 0;
+ bool lock_snap_rwsem = false;
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
@@ -1725,13 +1789,31 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (err != 0)
return err;
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
USE_AUTH_MDS);
- if (IS_ERR(req))
+ if (IS_ERR(req)) {
+ ceph_free_cap_flush(prealloc_cf);
return PTR_ERR(req);
+ }
spin_lock(&ci->i_ceph_lock);
issued = __ceph_caps_issued(ci, NULL);
+
+ if (!ci->i_head_snapc &&
+ (issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) {
+ lock_snap_rwsem = true;
+ if (!down_read_trylock(&mdsc->snap_rwsem)) {
+ spin_unlock(&ci->i_ceph_lock);
+ down_read(&mdsc->snap_rwsem);
+ spin_lock(&ci->i_ceph_lock);
+ issued = __ceph_caps_issued(ci, NULL);
+ }
+ }
+
dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
if (ia_valid & ATTR_UID) {
@@ -1874,12 +1956,15 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
dout("setattr %p ATTR_FILE ... hrm!\n", inode);
if (dirtied) {
- inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied);
+ inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
+ &prealloc_cf);
inode->i_ctime = CURRENT_TIME;
}
release &= issued;
spin_unlock(&ci->i_ceph_lock);
+ if (lock_snap_rwsem)
+ up_read(&mdsc->snap_rwsem);
if (inode_dirty_flags)
__mark_inode_dirty(inode, inode_dirty_flags);
@@ -1904,9 +1989,11 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
ceph_mdsc_put_request(req);
if (mask & CEPH_SETATTR_SIZE)
__ceph_do_pending_vmtruncate(inode);
+ ceph_free_cap_flush(prealloc_cf);
return err;
out_put:
ceph_mdsc_put_request(req);
+ ceph_free_cap_flush(prealloc_cf);
return err;
}
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 4347039ecc18..6706bde9ad1b 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -287,7 +287,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
return 0;
spin_lock(&ctx->flc_lock);
- list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
+ list_for_each_entry(lock, &ctx->flc_posix, fl_list) {
++seen_fcntl;
if (seen_fcntl > num_fcntl_locks) {
err = -ENOSPC;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 84f37f34f9aa..6aa07af67603 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -8,6 +8,7 @@
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/utsname.h>
+#include <linux/ratelimit.h>
#include "super.h"
#include "mds_client.h"
@@ -458,7 +459,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_cap_reconnect = 0;
s->s_cap_iterator = NULL;
INIT_LIST_HEAD(&s->s_cap_releases);
- INIT_LIST_HEAD(&s->s_cap_releases_done);
INIT_LIST_HEAD(&s->s_cap_flushing);
INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
@@ -629,6 +629,9 @@ static void __register_request(struct ceph_mds_client *mdsc,
req->r_uid = current_fsuid();
req->r_gid = current_fsgid();
+ if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK)
+ mdsc->oldest_tid = req->r_tid;
+
if (dir) {
struct ceph_inode_info *ci = ceph_inode(dir);
@@ -644,6 +647,21 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+
+ if (req->r_tid == mdsc->oldest_tid) {
+ struct rb_node *p = rb_next(&req->r_node);
+ mdsc->oldest_tid = 0;
+ while (p) {
+ struct ceph_mds_request *next_req =
+ rb_entry(p, struct ceph_mds_request, r_node);
+ if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) {
+ mdsc->oldest_tid = next_req->r_tid;
+ break;
+ }
+ p = rb_next(p);
+ }
+ }
+
rb_erase(&req->r_node, &mdsc->request_tree);
RB_CLEAR_NODE(&req->r_node);
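
These two hunks maintain mdsc->oldest_tid as the tid of the oldest
non-SETFILELOCK request; when that request is unregistered, the tree is walked
forward (rb_next) until the next qualifying request supplies the new minimum. The
recomputation over a tid-sorted array, for illustration:

    #include <stdio.h>
    #include <stdbool.h>

    struct req { unsigned long long tid; bool is_filelock; };

    /* after removing entry 'gone' from a tid-sorted array, find the new
     * oldest tid, skipping filelock requests (0 means none pending) */
    static unsigned long long recompute_oldest(const struct req *r, int n, int gone)
    {
            for (int i = gone + 1; i < n; i++)
                    if (!r[i].is_filelock)
                            return r[i].tid;
            return 0;
    }

    int main(void)
    {
            struct req r[] = { {10, false}, {11, true}, {12, false} };
            /* removing tid 10: tid 11 is a filelock op, so the oldest becomes 12 */
            printf("new oldest tid: %llu\n", recompute_oldest(r, 3, 0));
            return 0;
    }
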
@@ -998,27 +1016,25 @@ void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
* session caps
*/
-/*
- * Free preallocated cap messages assigned to this session
- */
-static void cleanup_cap_releases(struct ceph_mds_session *session)
+/* caller holds s_cap_lock, we drop it */
+static void cleanup_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+ __releases(session->s_cap_lock)
{
- struct ceph_msg *msg;
+ LIST_HEAD(tmp_list);
+ list_splice_init(&session->s_cap_releases, &tmp_list);
+ session->s_num_cap_releases = 0;
+ spin_unlock(&session->s_cap_lock);
- spin_lock(&session->s_cap_lock);
- while (!list_empty(&session->s_cap_releases)) {
- msg = list_first_entry(&session->s_cap_releases,
- struct ceph_msg, list_head);
- list_del_init(&msg->list_head);
- ceph_msg_put(msg);
- }
- while (!list_empty(&session->s_cap_releases_done)) {
- msg = list_first_entry(&session->s_cap_releases_done,
- struct ceph_msg, list_head);
- list_del_init(&msg->list_head);
- ceph_msg_put(msg);
+ dout("cleanup_cap_releases mds%d\n", session->s_mds);
+ while (!list_empty(&tmp_list)) {
+ struct ceph_cap *cap;
+		/* put each cap that was queued for release */
+ cap = list_first_entry(&tmp_list,
+ struct ceph_cap, session_caps);
+ list_del(&cap->session_caps);
+ ceph_put_cap(mdsc, cap);
}
- spin_unlock(&session->s_cap_lock);
}
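
The reworked cleanup_cap_releases() is the classic splice-then-process idiom: the
whole s_cap_releases queue is moved onto a private list while s_cap_lock is held,
the lock is dropped, and only then are the caps put, so no heavyweight work
happens under the spinlock. A pthread-based sketch of the idiom (names invented):

    #include <pthread.h>
    #include <stdlib.h>

    struct node { struct node *next; };

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static struct node *queue;      /* protected by 'lock' */

    /* splice the queue out under the lock, then free it lock-free */
    static void cleanup(void)
    {
            pthread_mutex_lock(&lock);
            struct node *tmp = queue;       /* ~ list_splice_init() */
            queue = NULL;
            pthread_mutex_unlock(&lock);

            while (tmp) {                   /* long work, no lock held */
                    struct node *next = tmp->next;
                    free(tmp);
                    tmp = next;
            }
    }

    int main(void)
    {
            for (int i = 0; i < 3; i++) {
                    struct node *n = malloc(sizeof(*n));
                    n->next = queue;
                    queue = n;
            }
            cleanup();
            return 0;
    }
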
static void cleanup_session_requests(struct ceph_mds_client *mdsc,
@@ -1033,7 +1049,8 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
req = list_first_entry(&session->s_unsafe,
struct ceph_mds_request, r_unsafe_item);
list_del_init(&req->r_unsafe_item);
- pr_info(" dropping unsafe request %llu\n", req->r_tid);
+ pr_warn_ratelimited(" dropping unsafe request %llu\n",
+ req->r_tid);
__unregister_request(mdsc, req);
}
/* zero r_attempts, so kick_requests() will re-send requests */
@@ -1095,10 +1112,16 @@ static int iterate_session_caps(struct ceph_mds_session *session,
dout("iterate_session_caps finishing cap %p removal\n",
cap);
BUG_ON(cap->session != session);
+ cap->session = NULL;
list_del_init(&cap->session_caps);
session->s_nr_caps--;
- cap->session = NULL;
- old_cap = cap; /* put_cap it w/o locks held */
+ if (cap->queue_release) {
+ list_add_tail(&cap->session_caps,
+ &session->s_cap_releases);
+ session->s_num_cap_releases++;
+ } else {
+ old_cap = cap; /* put_cap it w/o locks held */
+ }
}
if (ret < 0)
goto out;
@@ -1119,6 +1142,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
void *arg)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ LIST_HEAD(to_remove);
int drop = 0;
dout("removing cap %p, ci is %p, inode is %p\n",
@@ -1126,12 +1150,27 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
spin_lock(&ci->i_ceph_lock);
__ceph_remove_cap(cap, false);
if (!ci->i_auth_cap) {
+ struct ceph_cap_flush *cf;
struct ceph_mds_client *mdsc =
ceph_sb_to_client(inode->i_sb)->mdsc;
+ while (true) {
+ struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
+ if (!n)
+ break;
+ cf = rb_entry(n, struct ceph_cap_flush, i_node);
+ rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
+ list_add(&cf->list, &to_remove);
+ }
+
spin_lock(&mdsc->cap_dirty_lock);
+
+ list_for_each_entry(cf, &to_remove, list)
+ rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
+
if (!list_empty(&ci->i_dirty_item)) {
- pr_info(" dropping dirty %s state for %p %lld\n",
+ pr_warn_ratelimited(
+ " dropping dirty %s state for %p %lld\n",
ceph_cap_string(ci->i_dirty_caps),
inode, ceph_ino(inode));
ci->i_dirty_caps = 0;
@@ -1139,7 +1178,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
drop = 1;
}
if (!list_empty(&ci->i_flushing_item)) {
- pr_info(" dropping dirty+flushing %s state for %p %lld\n",
+ pr_warn_ratelimited(
+ " dropping dirty+flushing %s state for %p %lld\n",
ceph_cap_string(ci->i_flushing_caps),
inode, ceph_ino(inode));
ci->i_flushing_caps = 0;
@@ -1148,8 +1188,20 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
drop = 1;
}
spin_unlock(&mdsc->cap_dirty_lock);
+
+ if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
+ list_add(&ci->i_prealloc_cap_flush->list, &to_remove);
+ ci->i_prealloc_cap_flush = NULL;
+ }
}
spin_unlock(&ci->i_ceph_lock);
+ while (!list_empty(&to_remove)) {
+ struct ceph_cap_flush *cf;
+ cf = list_first_entry(&to_remove,
+ struct ceph_cap_flush, list);
+ list_del(&cf->list);
+ ceph_free_cap_flush(cf);
+ }
while (drop--)
iput(inode);
return 0;
@@ -1191,11 +1243,12 @@ static void remove_session_caps(struct ceph_mds_session *session)
spin_lock(&session->s_cap_lock);
}
}
- spin_unlock(&session->s_cap_lock);
+
+	/* drop queued cap releases and unlock s_cap_lock */
+ cleanup_cap_releases(session->s_mdsc, session);
BUG_ON(session->s_nr_caps > 0);
BUG_ON(!list_empty(&session->s_cap_flushing));
- cleanup_cap_releases(session);
}
/*
@@ -1371,7 +1424,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
ceph_cap_string(used), ceph_cap_string(wanted));
if (cap == ci->i_auth_cap) {
- if (ci->i_dirty_caps | ci->i_flushing_caps)
+ if (ci->i_dirty_caps || ci->i_flushing_caps ||
+ !list_empty(&ci->i_cap_snaps))
goto out;
if ((used | wanted) & CEPH_CAP_ANY_WR)
goto out;
@@ -1417,121 +1471,80 @@ static int trim_caps(struct ceph_mds_client *mdsc,
session->s_trim_caps = 0;
}
- ceph_add_cap_releases(mdsc, session);
ceph_send_cap_releases(mdsc, session);
return 0;
}
-/*
- * Allocate cap_release messages. If there is a partially full message
- * in the queue, try to allocate enough to cover it's remainder, so that
- * we can send it immediately.
- *
- * Called under s_mutex.
- */
-int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
+static int check_capsnap_flush(struct ceph_inode_info *ci,
+ u64 want_snap_seq)
{
- struct ceph_msg *msg, *partial = NULL;
- struct ceph_mds_cap_release *head;
- int err = -ENOMEM;
- int extra = mdsc->fsc->mount_options->cap_release_safety;
- int num;
-
- dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
- extra);
-
- spin_lock(&session->s_cap_lock);
-
- if (!list_empty(&session->s_cap_releases)) {
- msg = list_first_entry(&session->s_cap_releases,
- struct ceph_msg,
- list_head);
- head = msg->front.iov_base;
- num = le32_to_cpu(head->num);
- if (num) {
- dout(" partial %p with (%d/%d)\n", msg, num,
- (int)CEPH_CAPS_PER_RELEASE);
- extra += CEPH_CAPS_PER_RELEASE - num;
- partial = msg;
- }
- }
- while (session->s_num_cap_releases < session->s_nr_caps + extra) {
- spin_unlock(&session->s_cap_lock);
- msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
- GFP_NOFS, false);
- if (!msg)
- goto out_unlocked;
- dout("add_cap_releases %p msg %p now %d\n", session, msg,
- (int)msg->front.iov_len);
- head = msg->front.iov_base;
- head->num = cpu_to_le32(0);
- msg->front.iov_len = sizeof(*head);
- spin_lock(&session->s_cap_lock);
- list_add(&msg->list_head, &session->s_cap_releases);
- session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
- }
-
- if (partial) {
- head = partial->front.iov_base;
- num = le32_to_cpu(head->num);
- dout(" queueing partial %p with %d/%d\n", partial, num,
- (int)CEPH_CAPS_PER_RELEASE);
- list_move_tail(&partial->list_head,
- &session->s_cap_releases_done);
- session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num;
+ int ret = 1;
+ spin_lock(&ci->i_ceph_lock);
+ if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) {
+ struct ceph_cap_snap *capsnap =
+ list_first_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap, ci_item);
+ ret = capsnap->follows >= want_snap_seq;
}
- err = 0;
- spin_unlock(&session->s_cap_lock);
-out_unlocked:
- return err;
+ spin_unlock(&ci->i_ceph_lock);
+ return ret;
}
-static int check_cap_flush(struct inode *inode, u64 want_flush_seq)
+static int check_caps_flush(struct ceph_mds_client *mdsc,
+ u64 want_flush_tid)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int ret;
- spin_lock(&ci->i_ceph_lock);
- if (ci->i_flushing_caps)
- ret = ci->i_cap_flush_seq >= want_flush_seq;
- else
- ret = 1;
- spin_unlock(&ci->i_ceph_lock);
+ struct rb_node *n;
+ struct ceph_cap_flush *cf;
+ int ret = 1;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ n = rb_first(&mdsc->cap_flush_tree);
+ cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
+ if (cf && cf->tid <= want_flush_tid) {
+ dout("check_caps_flush still flushing tid %llu <= %llu\n",
+ cf->tid, want_flush_tid);
+ ret = 0;
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
return ret;
}
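
Because pending flushes now live in a global tid-ordered tree, "has everything
through want_flush_tid completed?" reduces to inspecting the smallest pending
tid, which is exactly what check_caps_flush() does above. The predicate over a
sorted array, for illustration:

    #include <stdio.h>
    #include <stdbool.h>

    /* tids[] sorted ascending; mirrors the check_caps_flush() test */
    static bool flushed_through(const unsigned long long *tids, int n,
                                unsigned long long want)
    {
            return n == 0 || tids[0] > want;    /* smallest pending tid decides */
    }

    int main(void)
    {
            unsigned long long pending[] = { 8, 9, 12 };
            printf("%d\n", flushed_through(pending, 3, 7));  /* 1: done thru 7 */
            printf("%d\n", flushed_through(pending, 3, 9));  /* 0: 8 and 9 pending */
            return 0;
    }
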
/*
* flush all dirty inode data to disk.
*
- * returns true if we've flushed through want_flush_seq
+ * returns true if we've flushed through want_flush_tid
*/
-static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
+static void wait_caps_flush(struct ceph_mds_client *mdsc,
+ u64 want_flush_tid, u64 want_snap_seq)
{
int mds;
- dout("check_cap_flush want %lld\n", want_flush_seq);
+ dout("check_caps_flush want %llu snap want %llu\n",
+ want_flush_tid, want_snap_seq);
mutex_lock(&mdsc->mutex);
- for (mds = 0; mds < mdsc->max_sessions; mds++) {
+ for (mds = 0; mds < mdsc->max_sessions; ) {
struct ceph_mds_session *session = mdsc->sessions[mds];
struct inode *inode = NULL;
- if (!session)
+ if (!session) {
+ mds++;
continue;
+ }
get_session(session);
mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex);
- if (!list_empty(&session->s_cap_flushing)) {
- struct ceph_inode_info *ci =
- list_entry(session->s_cap_flushing.next,
- struct ceph_inode_info,
- i_flushing_item);
-
- if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) {
- dout("check_cap_flush still flushing %p "
- "seq %lld <= %lld to mds%d\n",
- &ci->vfs_inode, ci->i_cap_flush_seq,
- want_flush_seq, session->s_mds);
+ if (!list_empty(&session->s_cap_snaps_flushing)) {
+ struct ceph_cap_snap *capsnap =
+ list_first_entry(&session->s_cap_snaps_flushing,
+ struct ceph_cap_snap,
+ flushing_item);
+ struct ceph_inode_info *ci = capsnap->ci;
+ if (!check_capsnap_flush(ci, want_snap_seq)) {
+ dout("check_cap_flush still flushing snap %p "
+ "follows %lld <= %lld to mds%d\n",
+ &ci->vfs_inode, capsnap->follows,
+ want_snap_seq, mds);
inode = igrab(&ci->vfs_inode);
}
}
@@ -1540,15 +1553,21 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
if (inode) {
wait_event(mdsc->cap_flushing_wq,
- check_cap_flush(inode, want_flush_seq));
+ check_capsnap_flush(ceph_inode(inode),
+ want_snap_seq));
iput(inode);
+ } else {
+ mds++;
}
mutex_lock(&mdsc->mutex);
}
-
mutex_unlock(&mdsc->mutex);
- dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
+
+ wait_event(mdsc->cap_flushing_wq,
+ check_caps_flush(mdsc, want_flush_tid));
+
+ dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid);
}
/*
@@ -1557,60 +1576,74 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
- struct ceph_msg *msg;
+ struct ceph_msg *msg = NULL;
+ struct ceph_mds_cap_release *head;
+ struct ceph_mds_cap_item *item;
+ struct ceph_cap *cap;
+ LIST_HEAD(tmp_list);
+ int num_cap_releases;
- dout("send_cap_releases mds%d\n", session->s_mds);
spin_lock(&session->s_cap_lock);
- while (!list_empty(&session->s_cap_releases_done)) {
- msg = list_first_entry(&session->s_cap_releases_done,
- struct ceph_msg, list_head);
- list_del_init(&msg->list_head);
- spin_unlock(&session->s_cap_lock);
- msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
- dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
- ceph_con_send(&session->s_con, msg);
- spin_lock(&session->s_cap_lock);
- }
+again:
+ list_splice_init(&session->s_cap_releases, &tmp_list);
+ num_cap_releases = session->s_num_cap_releases;
+ session->s_num_cap_releases = 0;
spin_unlock(&session->s_cap_lock);
-}
-static void discard_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
-{
- struct ceph_msg *msg;
- struct ceph_mds_cap_release *head;
- unsigned n